# Enzymatic Constraints Enhancement of AGORA models

### Context



### Goals:

- 

In [1]:
from tqdm.auto import tqdm
from reframed.io.sbml import load_cbmodel,save_cbmodel
from cobra.io import read_sbml_model, write_sbml_model
from mewpy.simulation import get_simulator
from mewpy.simulation.environment import Environment as Environment
from mewpy.util.request import retreive_gene,retreive_protein,get_smiles,brenda_query
from mewpy.cobra.util import convert_gpr_to_dnf
import pandas as pd
import numpy as np
from urllib.request import urlopen
from functools import reduce
import json
from ast import literal_eval
from copy import deepcopy

In [2]:
filepath = "../models/non-ec/agora/Bacteroides_thetaiotaomicron_VPI_5482.xml"
model = read_sbml_model(filepath)
model2 = load_cbmodel(filepath)
organism = 'Bacteroides sp'

Set parameter Username
Academic license - for non-commercial use only - expires 2024-03-01


In [3]:
model.id

'M_Bacteroides_thetaiotaomicron_VPI_5482'

In [4]:
model_id = str(model.id.strip('M_'))
print(model_id)

Bacteroides_thetaiotaomicron_VPI_5482


In [5]:
sim = get_simulator(model)
sim.set_objective("biomass")

sim2 = get_simulator(model2)
sim2.set_objective("R_biomass")

In [6]:
sim.simulate()

objective: 73.25259606170322
Status: OPTIMAL
Method:SimulationMethod.FBA

In [6]:
sim.id

'M_Bacteroides_thetaiotaomicron_VPI_5482'

In [7]:
sim2.simulate()

objective: 322.9452607782981
Status: OPTIMAL
Method:SimulationMethod.FBA

## Annotation scraping

In [8]:
ls_ge = []

for ge in sim.genes:
    i = sim.genes.index(ge)
    gene = sim.genes[i]
    rxns = sim.get_gene(ge).reactions
    for rx in rxns:
        anno = sim.get_reaction(rx)['annotations']
        seed_id = anno.get('seed.reactions')
        rxn_name = sim.get_reaction(rx).name
        ecnumber = anno.get('ec-code')
        metanetx = anno.get('metanetx.reaction')
        kegg = anno.get('kegg.reaction')
        res = [gene,rx,rxn_name,seed_id,metanetx,kegg,ecnumber]
        ls_ge.append(res)

df_ge = pd.DataFrame(ls_ge,columns=[['Gene','Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','ecNumber']])

df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,1.1.1.86
1,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,,"1.1.1.86, 5.4.99.3"
2,6666666.58896.peg.1313,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,3.1.4.16
3,6666666.58896.peg.1313,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,3.1.4.16
4,6666666.58896.peg.1313,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,3.1.4.16
...,...,...,...,...,...,...,...
1686,6666666.58896.peg.2922,rtranscription,RNA transcription c0,rxn13784,,,
1687,6666666.58896.peg.3450,rtranscription,RNA transcription c0,rxn13784,,,
1688,6666666.58896.peg.3588,rtranscription,RNA transcription c0,rxn13784,,,
1689,6666666.58896.peg.3637,rtranscription,RNA transcription c0,rxn13784,,,


In [9]:
mask = df_ge['ecNumber'].values!=None
ec_nona = df_ge[mask]
ec_nona

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,1.1.1.86
1,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,,"1.1.1.86, 5.4.99.3"
2,6666666.58896.peg.1313,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,3.1.4.16
3,6666666.58896.peg.1313,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,3.1.4.16
4,6666666.58896.peg.1313,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,3.1.4.16
...,...,...,...,...,...,...,...
1677,6666666.58896.peg.4188,r0777,"GTP 7, 8-8, 9-Dihydrolase",rxn03421,MNXR74388,,3.5.4.16
1678,226186.12.2464_16.peg,r0127,L-Asparagine Amidohydrolase / Cyanoamino Acid ...,rxn00342,MNXR81754,,3.5.1.38
1679,226186.12.2805_15.peg,r0127,L-Asparagine Amidohydrolase / Cyanoamino Acid ...,rxn00342,MNXR81754,,3.5.1.38
1680,226186.12.526_16.peg,r0127,L-Asparagine Amidohydrolase / Cyanoamino Acid ...,rxn00342,MNXR81754,,3.5.1.38


In [10]:
seed_id = df_ge['ModelSEED_id'].values.tolist()

seed_id = [reduce(lambda x: x, inner_list) for inner_list in seed_id]

metanetx_id = df_ge['MetaNetX'].values.tolist()

metanetx_id = [reduce(lambda x: x, inner_list) for inner_list in metanetx_id]

kegg_id = df_ge['KEGG_id'].values.tolist()

kegg_id = [reduce(lambda x: x, inner_list) for inner_list in kegg_id]

 ## ModelSEED query

In [11]:
SOLR_URL='https://modelseed.org'
ls_name = []
ls_kegg = []
ls_bigg = []

for mseed_id in tqdm(seed_id):
    i = seed_id.index(mseed_id)
    if mseed_id == None:
        ls_name.append('None')
        ls_kegg.append('None')
        ls_bigg.append('None')
    else:
        try:
            connection = urlopen(SOLR_URL+f'/solr/reactions/select?wt=json&q=id:{mseed_id}&fl=name,id,formula,charge,aliases')
            response = json.load(connection)
            for document in response['response']['docs']:  
                ms_name = document.get('name')
                ls_alias = document.get('aliases')
                ms_bigg = list(filter(lambda a: 'BiGG:' in a, document.get('aliases')))
                ms_kegg = list(filter(lambda a: 'KEGG:' in a, document.get('aliases')))
                if len(ms_bigg)== 0 and len(ms_kegg)== 0:
                    ms_bigg = 'None'
                    ms_kegg = 'None'
                elif len(ms_bigg)== 0 and len(ms_kegg)!= 0:
                    ms_bigg = 'None'
                    ms_kegg = list(ms_kegg)[0]
                    ms_kegg = ms_kegg.replace('KEGG: ','')
                elif len(ms_bigg)!= 0 and len(ms_kegg)== 0:
                    ms_kegg = 'None'
                    ms_bigg = list(ms_bigg)[0]
                    ms_bigg = ms_bigg.replace('BiGG: ','')
                else:
                    ms_kegg = list(ms_kegg)[0]
                    ms_kegg = ms_kegg.replace('KEGG: ','')
                    ms_bigg = list(ms_bigg)[0]
                    ms_bigg = ms_bigg.replace('BiGG: ','')    
                ls_name.append(ms_name)
                ls_bigg.append(ms_bigg)
                ls_kegg.append(ms_kegg)           
        except:
            ls_name.append('None')
            ls_kegg.append('None')
            ls_bigg.append('None')

  0%|          | 0/1691 [00:00<?, ?it/s]

In [12]:
new_kegg = [next(filter(None, i)) for i in zip(ls_kegg, kegg_id)]

In [13]:
df_ge['BIGG_id'] = ls_bigg
df_ge['KEGG_id'] = new_kegg

## BiGG query

In [14]:
import requests

ls_bigg = df_ge['BIGG_id'].values.tolist()
bigg_ls = []

for bigg in tqdm(ls_bigg):
    for bigg_n in bigg:
        bigg_n = str(bigg_n)
        bigg_n = bigg_n.split(';')
        bigg_n = [x.strip(' ') for x in bigg_n]
        sub_bigg_ls = []
        for bi in bigg_n:
            if bi == 'None':
                pass
            else:
                url =f'http://bigg.ucsd.edu/api/v2/universal/reactions/{bi}'
                with requests.request("GET", url) as resp:
                    try:
                        resp.raise_for_status()  # raises exception when not a 2xx response
                        if resp.status_code != 204:
                            data = dict(resp.json())
                            ec_l = data['database_links']
                            if ec_l == None:
                                sub_bigg_ls.append(None)
                            else:
                                ec = [i['id'] for i in ec_l['EC Number']]
                                if ec == None:
                                    sub_bigg_ls.append(None)
                                sub_bigg_ls.append(ec)
                        else: 
                            sub_bigg_ls.append(None)
                    except:
                        sub_bigg_ls.append(None)
    bigg_ls.append(sub_bigg_ls)

  0%|          | 0/1691 [00:00<?, ?it/s]

In [15]:
ec_l = df_ge['ecNumber'].values.tolist()
new_l = [next(filter(None, i)) for i in zip(bigg_ls, ec_l)]
df_ge['ecNumber'] = new_l

In [16]:
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,R05068,[1.1.1.86],
1,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,R05069,"[1.1.1.86, 5.4.99.3]",
2,6666666.58896.peg.1313,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,R03538,"[[3.1.4.16], [3.1.4.16]]",23CN2P2; 23PDE2pp
3,6666666.58896.peg.1313,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,R03929,"[[3.1.4.16], [3.1.4.16]]",23CN2P3; 23PDE4pp
4,6666666.58896.peg.1313,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,R03537,"[[3.1.4.16], [3.1.4.16]]",23CN2P1; 23PDE7pp
...,...,...,...,...,...,...,...,...
1686,6666666.58896.peg.2922,rtranscription,RNA transcription c0,rxn13784,,,[None],
1687,6666666.58896.peg.3450,rtranscription,RNA transcription c0,rxn13784,,,[None],
1688,6666666.58896.peg.3588,rtranscription,RNA transcription c0,rxn13784,,,[None],
1689,6666666.58896.peg.3637,rtranscription,RNA transcription c0,rxn13784,,,[None],


## Substrates

In [17]:
rx_l = df_ge['Reaction'].values.tolist()
ls_sub = []

for rxn in rx_l:
    for rx in rxn:
        sub = list(sim.get_substrates(rx).keys())
        ls_sub.append(sub)
    
df_ge["Substrates"] = ls_sub   

df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id,Substrates
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,R05068,[1.1.1.86],,"[23dhmp[c], nadp[c]]"
1,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,R05069,"[1.1.1.86, 5.4.99.3]",,[2ahbut[c]]
2,6666666.58896.peg.1313,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,R03538,"[[3.1.4.16], [3.1.4.16]]",23CN2P2; 23PDE2pp,"[23cump[c], h2o[c]]"
3,6666666.58896.peg.1313,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,R03929,"[[3.1.4.16], [3.1.4.16]]",23CN2P3; 23PDE4pp,"[23ccmp[c], h2o[c]]"
4,6666666.58896.peg.1313,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,R03537,"[[3.1.4.16], [3.1.4.16]]",23CN2P1; 23PDE7pp,"[23camp[c], h2o[c]]"
...,...,...,...,...,...,...,...,...,...
1686,6666666.58896.peg.2922,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
1687,6666666.58896.peg.3450,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
1688,6666666.58896.peg.3588,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
1689,6666666.58896.peg.3637,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]


In [18]:
sub_na = df_ge['Substrates'].values.tolist()

ls_sub = []
ls_smile =[]



for sub_l in tqdm(sub_na):
    sub_ls_sub = []
    sub_ls_smile = []
    for sub_s in sub_l:
        for sub in sub_s:
            sub_name = sim.get_metabolite(sub).get('name')
            if "2,3-C" in sub_name:
                sub_name = sub_name.replace("2,3","2',3'")
            elif "2,3-c" in sub_name:
                sub_name = sub_name.replace("2,3","2',3'")
            elif "3-triphosphate" in sub_name:
                sub_name = sub_name.replace("3","3'")
            else:
                pass
            smile = get_smiles(sub_name)
            sub_ls_smile.append(smile)
            sub_ls_sub.append(sub_name)
        ls_sub.append(sub_ls_sub)
        ls_smile.append(sub_ls_smile)

df_ge['Substrate Name'] = ls_sub
df_ge['Substrate SMILES'] = ls_smile
df_ge.to_csv(f'../data/ec_data/ge_data_{model_id}.csv',na_rep='None')

  0%|          | 0/1691 [00:00<?, ?it/s]

In [19]:
df_ge = pd.read_csv(f'../data/ec_data/ge_data_{model_id}.csv')
df_ge = df_ge.loc[:,['Gene','Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','BIGG_id','Substrates','Substrate Name','Substrate SMILES','ecNumber']]
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,BIGG_id,Substrates,Substrate Name,Substrate SMILES,ecNumber
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",['1.1.1.86']
1,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,R05069,,['2ahbut[c]'],['(S)-2-Aceto-2-hydroxybutanoate'],['CCC(C(=O)C)(C(=O)O)O'],"['1.1.1.86, 5.4.99.3']"
2,6666666.58896.peg.1313,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","[""2',3'-cyclic UMP(1-)"", 'Water']",['C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-...,"[['3.1.4.16'], ['3.1.4.16']]"
3,6666666.58896.peg.1313,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","[""2',3'-Cyclic CMP"", 'Water']",['C1=CN(C(=O)N=C1N)C2C3C(C(O2)CO)OP(=O)(O3)[O-...,"[['3.1.4.16'], ['3.1.4.16']]"
4,6666666.58896.peg.1313,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","[""2',3'-Cyclic AMP"", 'Water']",['C1=NC(=C2C(=N1)N(C=N2)C3C4C(C(O3)CO)OP(=O)(O...,"[['3.1.4.16'], ['3.1.4.16']]"
...,...,...,...,...,...,...,...,...,...,...,...
1686,6666666.58896.peg.2922,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
1687,6666666.58896.peg.3450,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
1688,6666666.58896.peg.3588,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
1689,6666666.58896.peg.3637,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]


In [20]:
ec_l = df_ge['ecNumber'].values.tolist()
ec_nl = []
ei = []

for es in ec_l:
    es = str(es)
    es = es.split(',')
    ei = []
    for sublist in es:
        sublist = str(sublist)
        sublist = sublist.strip("[[")
        sublist = sublist.strip(" ")
        sublist = sublist.strip("'")
        sublist = sublist.strip("[")
        sublist = sublist.strip("[")
        sublist = sublist.strip("'")
        sublist = sublist.strip("'")
        sublist = sublist.strip("]")
        sublist = sublist.strip("]]")  
        sublist = sublist.strip("'")
        if sublist not in ei:
            if sublist == 'None' and len(ei) > 0:
                pass
            elif '-' in sublist:
                pass
            else:
                ei.append(sublist)
    ec_nl.append(ei)

df_ge['ecNumber'] = ec_nl

## BRENDA query

### Kcat extraction

In [21]:
from brendapyrser import BRENDA
from brendapyrser import EnzymePropertyDict


dataFile = '../../brenda_2023_1.txt'

In [22]:
brenda = BRENDA(dataFile)
brenda

0,1
Number of Enzymes,7832
BRENDA copyright,"Copyrighted by Dietmar Schomburg, Techn. University  Braunschweig, GERMANY. Distributed under the License as stated  at http:/www.brenda-enzymes.org"
Parser version,0.0.1
Author,"Semidán Robaina Estévez, 2020"


In [23]:
kcat_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()

for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_kcat_ls = []
    #print(ec)
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip(' ')
        if "-" in ec_n:
            pass
        elif ec_n == None:
            sub_kcat_ls.append(None)
            continue
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.get_values()
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
            except:
                sub_kcat_ls.append(None)
    kcat_ls.append(sub_kcat_ls)

df_ge['Avg Kcat (by ec)'] = kcat_ls

  0%|          | 0/1691 [00:00<?, ?it/s]

In [24]:
kcat_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_kcat_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        if "-" in ec_n:
            pass
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.filter_by_organism(organism).get_values()
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
            except:
                sub_kcat_ls.append(None)
    kcat_ls.append(sub_kcat_ls)

df_ge['Avg Kcat (by ec and species)'] = kcat_ls

  0%|          | 0/1691 [00:00<?, ?it/s]

### Sequence extraction

In [25]:
seq_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_seq_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        elif ec_n == None:
            sub_mw_ls.append(None)
            continue
        else:
            try:
                from zeep import Client
                import hashlib

                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ( "alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","sequence*", "noOfAminoAcids*", "firstAccessionCode*", "source*", "id*", f"organism*{organism}")
                resultString = client.service.getSequence(*parameters) 
                sub_seq_ls.append(resultString[0]['sequence'])
            except:
                sub_seq_ls.append(None)                  
    seq_ls.append(sub_seq_ls)
    
df_ge['Protein Sequence'] = seq_ls

  0%|          | 0/1691 [00:00<?, ?it/s]

### Molecular Weight extraction

In [26]:
mw_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_mw_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        elif ec_n == None:
            sub_mw_ls.append(None)
            continue
        else:
            try:
                from zeep import Client
                import hashlib
                
                res_mw = 0
                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ("alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","molecularWeight*","molecularWeightMaximum*","commentary*",f"organism*","literature*" )
                resultString = client.service.getMolecularWeight(*parameters)
                for i in range(len(resultString)):
                    res_mw = res_mw + int(resultString[i]['molecularWeight'])
                res_mw = res_mw/len(resultString)
                sub_mw_ls.append(res_mw)
            except:
                sub_mw_ls.append(None)                  
    mw_ls.append(sub_mw_ls)

df_ge['Molecular Weight'] = mw_ls

  0%|          | 0/1691 [00:00<?, ?it/s]

In [27]:
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,BIGG_id,Substrates,Substrate Name,Substrate SMILES,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),Protein Sequence,Molecular Weight
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",[1.1.1.86],[2.0091262401360535],[None],[MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVII...,[138656.35714285713]
1,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,R05069,,['2ahbut[c]'],['(S)-2-Aceto-2-hydroxybutanoate'],['CCC(C(=O)C)(C(=O)O)O'],"[1.1.1.86, 5.4.99.3]","[2.0091262401360535, None]","[None, None]",[MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVII...,"[None, None]"
2,6666666.58896.peg.1313,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","[""2',3'-cyclic UMP(1-)"", 'Water']",['C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-...,[3.1.4.16],[8.665329411764708],[None],[None],[77195.8]
3,6666666.58896.peg.1313,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","[""2',3'-Cyclic CMP"", 'Water']",['C1=CN(C(=O)N=C1N)C2C3C(C(O2)CO)OP(=O)(O3)[O-...,[3.1.4.16],[8.665329411764708],[None],[None],[None]
4,6666666.58896.peg.1313,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","[""2',3'-Cyclic AMP"", 'Water']",['C1=NC(=C2C(=N1)N(C=N2)C3C4C(C(O3)CO)OP(=O)(O...,[3.1.4.16],[8.665329411764708],[None],[None],[77195.8]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1686,6666666.58896.peg.2922,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]
1687,6666666.58896.peg.3450,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]
1688,6666666.58896.peg.3588,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]
1689,6666666.58896.peg.3637,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]


## DLKcat - Kcat prediction

In [28]:
dk_prep = df_ge.drop(columns=['Substrates','ModelSEED_id','MetaNetX','KEGG_id','BIGG_id','ecNumber'])
dk_prep

Unnamed: 0,Gene,Reaction,Name,Substrate Name,Substrate SMILES,Avg Kcat (by ec),Avg Kcat (by ec and species),Protein Sequence,Molecular Weight
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",[2.0091262401360535],[None],[MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVII...,[138656.35714285713]
1,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,['(S)-2-Aceto-2-hydroxybutanoate'],['CCC(C(=O)C)(C(=O)O)O'],"[2.0091262401360535, None]","[None, None]",[MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVII...,"[None, None]"
2,6666666.58896.peg.1313,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase","[""2',3'-cyclic UMP(1-)"", 'Water']",['C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-...,[8.665329411764708],[None],[None],[77195.8]
3,6666666.58896.peg.1313,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase","[""2',3'-Cyclic CMP"", 'Water']",['C1=CN(C(=O)N=C1N)C2C3C(C(O2)CO)OP(=O)(O3)[O-...,[8.665329411764708],[None],[None],[None]
4,6666666.58896.peg.1313,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase","[""2',3'-Cyclic AMP"", 'Water']",['C1=NC(=C2C(=N1)N(C=N2)C3C4C(C(O3)CO)OP(=O)(O...,[8.665329411764708],[None],[None],[77195.8]
...,...,...,...,...,...,...,...,...,...
1686,6666666.58896.peg.2922,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]
1687,6666666.58896.peg.3450,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]
1688,6666666.58896.peg.3588,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]
1689,6666666.58896.peg.3637,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]


In [29]:
dk_prep['Substrate Name'] = dk_prep['Substrate Name'].apply(literal_eval) #convert to list type
dk_prep['Substrate SMILES'] = dk_prep['Substrate SMILES'].apply(literal_eval) #convert to list type
dk_prep = dk_prep.explode(['Substrate Name','Substrate SMILES']).reset_index(drop=True)

In [30]:
#dk_prep['Protein Sequence'] = dk_prep['Protein Sequence'].apply(literal_eval) #convert to list type
#dk_prep['Molecular Weight'] = dk_prep['Molecular Weight'].apply(literal_eval) #convert to list type
dk_prep = dk_prep.explode(['Protein Sequence','Molecular Weight','Avg Kcat (by ec)','Avg Kcat (by ec and species)']).reset_index(drop=True)

In [31]:
dk_inp = dk_prep.drop(columns=['Gene','Reaction', 'Name', 'Avg Kcat (by ec)', 'Avg Kcat (by ec and species)','Molecular Weight'])
dk_inp.to_csv(f'../../DLKcat/DeeplearningApproach/Code/example/dk_input_{model_id}.tsv',sep="\t",na_rep='None',index= False)

In [32]:
dk_inp

Unnamed: 0,Substrate Name,Substrate SMILES,Protein Sequence
0,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...
1,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...
2,(S)-2-Aceto-2-hydroxybutanoate,CCC(C(=O)C)(C(=O)O)O,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...
3,(S)-2-Aceto-2-hydroxybutanoate,CCC(C(=O)C)(C(=O)O)O,
4,"2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],
...,...,...,...
4970,,,
4971,,,
4972,,,
4973,,,


Run DLKcat

In [34]:
dk_out = pd.read_csv(f'../../DLKcat/DeeplearningApproach/Code/example/output.tsv', sep="\t")
dk_out['Gene'] = dk_prep['Gene']
dk_out['Reaction'] = dk_prep['Reaction']
dk_out['Molecular Weight'] = dk_prep['Molecular Weight']
dk_out['Avg Kcat (by ec)'] = dk_prep['Avg Kcat (by ec)']
dk_out = dk_out.loc[:,['Gene','Reaction','Substrate Name','Substrate SMILES','Protein Sequence','Molecular Weight','Kcat value (1/s)','Avg Kcat (by ec)']]  
dk_out

Unnamed: 0,Gene,Reaction,Substrate Name,Substrate SMILES,Protein Sequence,Molecular Weight,Kcat value (1/s),Avg Kcat (by ec)
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...,138656.357143,2.1624,2.009126
1,6666666.58896.peg.2230,23DHMPO,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...,138656.357143,2.3207,2.009126
2,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate,CCC(C(=O)C)(C(=O)O)O,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...,,1.4796,2.009126
3,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate,CCC(C(=O)C)(C(=O)O)O,,,4.4442,
4,6666666.58896.peg.1313,23PDE2,"2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],,77195.8,2.0576,8.665329
...,...,...,...,...,...,...,...,...
4970,6666666.58896.peg.2922,rtranscription,,,,,,
4971,6666666.58896.peg.3450,rtranscription,,,,,,
4972,6666666.58896.peg.3588,rtranscription,,,,,,
4973,6666666.58896.peg.3637,rtranscription,,,,,,


### Joining Kcat values

In [35]:
kcat_brenda = dk_out['Avg Kcat (by ec)'].values.tolist()
kcat_dl = dk_out['Kcat value (1/s)'].values.tolist()
new_kcat = []

for i in range(len(kcat_brenda)):
    if kcat_brenda[i] != None and kcat_dl[i] != 'None':
        kcat = (float(kcat_brenda[i]) + float(kcat_dl[i]))/2
    elif kcat_brenda[i] == None and kcat_dl[i] != 'None':
        kcat = kcat_dl[i]
    elif kcat_brenda[i] != None and kcat_dl[i] == 'None':
        kcat = kcat_brenda[i]
    elif kcat_brenda[i] == None and kcat_dl[i] == 'None':
        kcat = 0
    new_kcat.append(kcat)
    
dk_out['New Kcat'] = new_kcat
dk_out

Unnamed: 0,Gene,Reaction,Substrate Name,Substrate SMILES,Protein Sequence,Molecular Weight,Kcat value (1/s),Avg Kcat (by ec),New Kcat
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...,138656.357143,2.1624,2.009126,2.085763
1,6666666.58896.peg.2230,23DHMPO,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...,138656.357143,2.3207,2.009126,2.164913
2,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate,CCC(C(=O)C)(C(=O)O)O,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...,,1.4796,2.009126,1.744363
3,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate,CCC(C(=O)C)(C(=O)O)O,,,4.4442,,4.4442
4,6666666.58896.peg.1313,23PDE2,"2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],,77195.8,2.0576,8.665329,5.361465
...,...,...,...,...,...,...,...,...,...
4970,6666666.58896.peg.2922,rtranscription,,,,,,,0
4971,6666666.58896.peg.3450,rtranscription,,,,,,,0
4972,6666666.58896.peg.3588,rtranscription,,,,,,,0
4973,6666666.58896.peg.3637,rtranscription,,,,,,,0


In [36]:
dk_out['Molecular Weight'] = dk_out['Molecular Weight'].fillna(0)

dk_out

Unnamed: 0,Gene,Reaction,Substrate Name,Substrate SMILES,Protein Sequence,Molecular Weight,Kcat value (1/s),Avg Kcat (by ec),New Kcat
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...,138656.357143,2.1624,2.009126,2.085763
1,6666666.58896.peg.2230,23DHMPO,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...,138656.357143,2.3207,2.009126,2.164913
2,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate,CCC(C(=O)C)(C(=O)O)O,MAKIYYQQDCNLSMLDGKTIAIIGYGSQGHAHALNAKESGCHVIIG...,0.000000,1.4796,2.009126,1.744363
3,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate,CCC(C(=O)C)(C(=O)O)O,,0.000000,4.4442,,4.4442
4,6666666.58896.peg.1313,23PDE2,"2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],,77195.800000,2.0576,8.665329,5.361465
...,...,...,...,...,...,...,...,...,...
4970,6666666.58896.peg.2922,rtranscription,,,,0.000000,,,0
4971,6666666.58896.peg.3450,rtranscription,,,,0.000000,,,0
4972,6666666.58896.peg.3588,rtranscription,,,,0.000000,,,0
4973,6666666.58896.peg.3637,rtranscription,,,,0.000000,,,0


## Adding ezymatic constraints

In [37]:
genes = dk_out['Gene'].values.tolist()
mweights = dk_out['Molecular Weight'].values.tolist()
kcats = dk_out['New Kcat'].values.tolist()

ec_data = dict()
for gene in genes:
    ge = sim.get_gene(gene).id
    i = genes.index(gene)
    mw = mweights[i]
    kcat = kcats[i]
    if kcat == 0:
        kcat = 1
    ec_data[ge]={'protein':ge[len(sim._g_prefix):],'mw':mw,'kcat':kcat}
    
print(ec_data)

{'6666666.58896.peg.2230': {'protein': '6666666.58896.peg.2230', 'mw': 138656.35714285713, 'kcat': 2.0857631200680267}, '6666666.58896.peg.1313': {'protein': '6666666.58896.peg.1313', 'mw': 77195.8, 'kcat': 5.361464705882353}, '6666666.58896.peg.4707': {'protein': '6666666.58896.peg.4707', 'mw': 77195.8, 'kcat': 5.361464705882353}, '6666666.58896.peg.573': {'protein': '6666666.58896.peg.573', 'mw': 62275.0, 'kcat': 9.904871428571429}, '226186.12.1885.peg': {'protein': '226186.12.1885.peg', 'mw': 0.0, 'kcat': '1.1192'}, '226186.12.660.peg': {'protein': '226186.12.660.peg', 'mw': 0.0, 'kcat': '1.1192'}, '6666666.58896.peg.833': {'protein': '6666666.58896.peg.833', 'mw': 44816.291666666664, 'kcat': 6.808541666666667}, '6666666.58896.peg.4486': {'protein': '6666666.58896.peg.4486', 'mw': 229477.23076923078, 'kcat': 6.6908521875}, '226186.12.1912.peg': {'protein': '226186.12.1912.peg', 'mw': 0.0, 'kcat': 20.44836515151515}, '6666666.58896.peg.1024': {'protein': '6666666.58896.peg.1024', 'mw

In [38]:
with open(f"../data/ec_data/{model_id}.json", "w") as f:
        json.dump(ec_data,f)

In [39]:
with open(f"../data/ec_data/{model_id}.json") as json_file:
    ec_data = json.load(json_file)

In [40]:
print("Genes with missing data:")
for k,v in ec_data.items():
    if v['mw']==0:
        print(k)

Genes with missing data:
226186.12.1885.peg
226186.12.660.peg
226186.12.1912.peg
6666666.58896.peg.1024
226186.12.3387.peg
226186.12.2861.peg
226186.12.907.peg
6666666.58896.peg.2568
6666666.58896.peg.34
6666666.58896.peg.3955
6666666.58896.peg.2694
226186.12.peg.1963
6666666.58896.peg.1304
6666666.58896.peg.2315
6666666.58896.peg.2973
6666666.58896.peg.3789
6666666.58896.peg.720
6666666.58896.peg.1542
6666666.58896.peg.2049
6666666.58896.peg.2050
6666666.58896.peg.458
6666666.58896.peg.1426
6666666.58896.peg.3084
6666666.58896.peg.417
6666666.58896.peg.62
6666666.58896.peg.3964
226186.12.3820.peg
226186.12.3821.peg
6666666.58896.peg.1445
6666666.58896.peg.327
226186.12.531_21.peg
226186.12.784_22.peg
6666666.58896.peg.4124
226186.12.1893.peg
6666666.58896.peg.1830
6666666.58896.peg.921
6666666.58896.peg.2988
226186.12.3876_12.peg
226186.12.518_12.peg
6666666.58896.peg.3960
6666666.58896.peg.1722
6666666.58896.peg.2894
6666666.58896.peg.4314
6666666.58896.peg.672
226186.12.2135.peg
226

In [41]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from tmcom.util import add_enzyme_constraints

ec_sim = add_enzyme_constraints(sim, ec_data)

Read LP format model from file /tmp/tmpds509v42.lp
Reading time = 0.01 seconds
: 2041 rows, 4876 columns, 15788 nonzeros


Converting to irreversible: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2438/2438 [00:08<00:00, 287.15it/s]
Adding gene species: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 823/823 [00:06<00:00, 125.57it/s]
  warn("need to pass in a list")
Adding proteins usage to reactions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

### Protein pool usage

In [42]:
res = ec_sim.simulate()
print(res)

objective: 73.25259606170324
Status: OPTIMAL
Method:SimulationMethod.FBA


In [43]:
res.find('pool')

Unnamed: 0_level_0,Flux rate
Reaction ID,Unnamed: 1_level_1
prot_pool_exchange,764054.67335


### Save the model

In [44]:
write_sbml_model(ec_sim.model,f"../models/ec/ec_{model_id}.xml")

In [5]:
ec_model = read_sbml_model(f"../models/ec/ec_{model_id}.xml")
ec_sim = get_simulator(ec_model)

In [7]:
env = Environment.complete(ec_sim, max_uptake=1000.0, inplace=False)

env['prot_pool_exchange']=(0,0.8)

ec_sim.simulate(constraints=env)

objective: 0.00029578387788254117
Status: OPTIMAL
Method:SimulationMethod.FBA