# Enzymatic Constraints Enhancement of AGORA models

### Context



### Goals:

- 

In [1]:
from tqdm.auto import tqdm
from reframed.io.sbml import load_cbmodel,save_cbmodel
from cobra.io import read_sbml_model, write_sbml_model
from mewpy.simulation import get_simulator
from mewpy.simulation.environment import Environment as Environment
from mewpy.util.request import retreive_gene,retreive_protein,get_smiles,brenda_query
from mewpy.cobra.util import convert_gpr_to_dnf
import pandas as pd
import numpy as np
from urllib.request import urlopen
from functools import reduce
import json
from ast import literal_eval
from copy import deepcopy

In [2]:
filepath = "../models/non-ec/agora/Bacteroides_thetaiotaomicron_VPI_5482.xml"
model = read_sbml_model(filepath)
model2 = load_cbmodel(filepath)
organism = 'Bacteroides sp'

Set parameter Username
Academic license - for non-commercial use only - expires 2024-11-11


In [3]:
model.id

'M_Bacteroides_thetaiotaomicron_VPI_5482'

In [4]:
model_id = str(model.id.strip('M_'))
print(model_id)

Bacteroides_thetaiotaomicron_VPI_5482


In [5]:
sim = get_simulator(model)
sim.set_objective("biomass")

sim2 = get_simulator(model2)
sim2.set_objective("R_biomass")

In [6]:
sim.simulate()

objective: 73.25259606170322
Status: OPTIMAL
Method:SimulationMethod.FBA

In [7]:
sim2.simulate()

objective: 322.9452607782981
Status: OPTIMAL
Method:SimulationMethod.FBA

## Annotation scraping

In [8]:
ls_ge = []

for ge in sim.genes:
    i = sim.genes.index(ge)
    gene = sim.genes[i]
    rxns = sim.get_gene(ge).reactions
    for rx in rxns:
        anno = sim.get_reaction(rx)['annotations']
        seed_id = anno.get('seed.reactions')
        rxn_name = sim.get_reaction(rx).name
        ecnumber = anno.get('ec-code')
        metanetx = anno.get('metanetx.reaction')
        kegg = anno.get('kegg.reaction')
        res = [gene,rx,rxn_name,seed_id,metanetx,kegg,ecnumber]
        ls_ge.append(res)

df_ge = pd.DataFrame(ls_ge,columns=[['Gene','Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','ecNumber']])

df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber
0,6666666.58896.peg.2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,1.1.1.86
1,6666666.58896.peg.2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,,"1.1.1.86, 5.4.99.3"
2,6666666.58896.peg.1313,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,3.1.4.16
3,6666666.58896.peg.1313,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,3.1.4.16
4,6666666.58896.peg.1313,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,3.1.4.16
...,...,...,...,...,...,...,...
1686,6666666.58896.peg.2922,rtranscription,RNA transcription c0,rxn13784,,,
1687,6666666.58896.peg.3450,rtranscription,RNA transcription c0,rxn13784,,,
1688,6666666.58896.peg.3588,rtranscription,RNA transcription c0,rxn13784,,,
1689,6666666.58896.peg.3637,rtranscription,RNA transcription c0,rxn13784,,,


In [9]:
mask = df_ge['ecNumber'].values!=None
ec_nona = df_ge[mask]
ec_nona

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber
1,HMPREF0833_RS09600,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,1.1.1.86
2,HMPREF0833_RS09600,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,,"1.1.1.86, 5.4.99.3"
3,HMPREF0833_RS09600,DPRr,2 dehydropantoate 2 reductase,rxn01790,MNXR1788,,1.1.1.169
4,WP_003010246.1,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,1.1.1.86
5,WP_003010246.1,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,,"1.1.1.86, 5.4.99.3"
...,...,...,...,...,...,...,...
2190,HMPREF0833_RS08355,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,3.5.4.16
2191,HMPREF0833_RS08355,r0777,"GTP 7, 8-8, 9-Dihydrolase",rxn03421,MNXR74388,,3.5.4.16
2192,760570.3.202_15.peg,r0127,L-Asparagine Amidohydrolase / Cyanoamino Acid ...,rxn00342,MNXR81754,,3.5.1.38
2193,WP_013903434.1,r0345,ATP:AMP Phosphotransferase Ec:2.7.4.11,rxn01127,MNXR68634,,2.7.4.11


In [10]:
seed_id = df_ge['ModelSEED_id'].values.tolist()

seed_id = [reduce(lambda x: x, inner_list) for inner_list in seed_id]

metanetx_id = df_ge['MetaNetX'].values.tolist()

metanetx_id = [reduce(lambda x: x, inner_list) for inner_list in metanetx_id]

kegg_id = df_ge['KEGG_id'].values.tolist()

kegg_id = [reduce(lambda x: x, inner_list) for inner_list in kegg_id]

 ## ModelSEED query

In [11]:
SOLR_URL='https://modelseed.org'
ls_name = []
ls_kegg = []
ls_bigg = []

for mseed_id in tqdm(seed_id):
    i = seed_id.index(mseed_id)
    if mseed_id == None:
        ls_name.append('None')
        ls_kegg.append('None')
        ls_bigg.append('None')
    else:
        try:
            connection = urlopen(SOLR_URL+f'/solr/reactions/select?wt=json&q=id:{mseed_id}&fl=name,id,formula,charge,aliases')
            response = json.load(connection)
            for document in response['response']['docs']:  
                ms_name = document.get('name')
                ls_alias = document.get('aliases')
                ms_bigg = list(filter(lambda a: 'BiGG:' in a, document.get('aliases')))
                ms_kegg = list(filter(lambda a: 'KEGG:' in a, document.get('aliases')))
                if len(ms_bigg)== 0 and len(ms_kegg)== 0:
                    ms_bigg = 'None'
                    ms_kegg = 'None'
                elif len(ms_bigg)== 0 and len(ms_kegg)!= 0:
                    ms_bigg = 'None'
                    ms_kegg = list(ms_kegg)[0]
                    ms_kegg = ms_kegg.replace('KEGG: ','')
                elif len(ms_bigg)!= 0 and len(ms_kegg)== 0:
                    ms_kegg = 'None'
                    ms_bigg = list(ms_bigg)[0]
                    ms_bigg = ms_bigg.replace('BiGG: ','')
                else:
                    ms_kegg = list(ms_kegg)[0]
                    ms_kegg = ms_kegg.replace('KEGG: ','')
                    ms_bigg = list(ms_bigg)[0]
                    ms_bigg = ms_bigg.replace('BiGG: ','')    
                ls_name.append(ms_name)
                ls_bigg.append(ms_bigg)
                ls_kegg.append(ms_kegg)           
        except:
            ls_name.append('None')
            ls_kegg.append('None')
            ls_bigg.append('None')

  0%|          | 0/2213 [00:00<?, ?it/s]

In [12]:
new_kegg = [next(filter(None, i)) for i in zip(ls_kegg, kegg_id)]

In [13]:
df_ge['BIGG_id'] = ls_bigg
df_ge['KEGG_id'] = new_kegg

## BiGG query

In [14]:
import requests

ls_bigg = df_ge['BIGG_id'].values.tolist()
bigg_ls = []

for bigg in tqdm(ls_bigg):
    for bigg_n in bigg:
        bigg_n = str(bigg_n)
        bigg_n = bigg_n.split(';')
        bigg_n = [x.strip(' ') for x in bigg_n]
        sub_bigg_ls = []
        for bi in bigg_n:
            if bi == 'None':
                pass
            else:
                url =f'http://bigg.ucsd.edu/api/v2/universal/reactions/{bi}'
                with requests.request("GET", url) as resp:
                    try:
                        resp.raise_for_status()  # raises exception when not a 2xx response
                        if resp.status_code != 204:
                            data = dict(resp.json())
                            ec_l = data['database_links']
                            if ec_l == None:
                                sub_bigg_ls.append(None)
                            else:
                                ec = [i['id'] for i in ec_l['EC Number']]
                                if ec == None:
                                    sub_bigg_ls.append(None)
                                sub_bigg_ls.append(ec)
                        else: 
                            sub_bigg_ls.append(None)
                    except:
                        sub_bigg_ls.append(None)
    bigg_ls.append(sub_bigg_ls)

  0%|          | 0/2213 [00:00<?, ?it/s]

In [15]:
ec_l = df_ge['ecNumber'].values.tolist()
new_l = [next(filter(None, i)) for i in zip(bigg_ls, ec_l)]
df_ge['ecNumber'] = new_l

In [16]:
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id
0,WP_013904118.1,22IDPOR,"2,2-iminodipropanoate:NAD+ oxidoreductase (L-a...",rxn00280,,R00398,[None],
1,HMPREF0833_RS09600,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,R05068,[1.1.1.86],
2,HMPREF0833_RS09600,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,R05069,"[1.1.1.86, 5.4.99.3]",
3,HMPREF0833_RS09600,DPRr,2 dehydropantoate 2 reductase,rxn01790,MNXR1788,R02472,"[[1.1.1.169], [1.1.1.169], None, None]",DPR; DPRm; ILV5_3; PAN5
4,WP_003010246.1,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,R05068,[1.1.1.86],
...,...,...,...,...,...,...,...,...
2208,HMPREF0833_RS08265,rtranscription,RNA transcription c0,rxn13784,,,[None],
2209,HMPREF0833_RS09900,rtranscription,RNA transcription c0,rxn13784,,,[None],
2210,WP_003002960.1,rtranscription,RNA transcription c0,rxn13784,,,[None],
2211,WP_013903347.1,rtranscription,RNA transcription c0,rxn13784,,,[None],


## Substrates

In [17]:
rx_l = df_ge['Reaction'].values.tolist()
ls_sub = []

for rxn in rx_l:
    for rx in rxn:
        sub = list(sim.get_substrates(rx).keys())
        ls_sub.append(sub)
    
df_ge["Substrates"] = ls_sub   

df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id,Substrates
0,WP_013904118.1,22IDPOR,"2,2-iminodipropanoate:NAD+ oxidoreductase (L-a...",rxn00280,,R00398,[None],,"[alnpn[c], h2o[c], nad[c]]"
1,HMPREF0833_RS09600,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,R05068,[1.1.1.86],,"[23dhmp[c], nadp[c]]"
2,HMPREF0833_RS09600,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,R05069,"[1.1.1.86, 5.4.99.3]",,[2ahbut[c]]
3,HMPREF0833_RS09600,DPRr,2 dehydropantoate 2 reductase,rxn01790,MNXR1788,R02472,"[[1.1.1.169], [1.1.1.169], None, None]",DPR; DPRm; ILV5_3; PAN5,"[2dhp[c], h[c], nadph[c]]"
4,WP_003010246.1,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,R05068,[1.1.1.86],,"[23dhmp[c], nadp[c]]"
...,...,...,...,...,...,...,...,...,...
2208,HMPREF0833_RS08265,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
2209,HMPREF0833_RS09900,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
2210,WP_003002960.1,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
2211,WP_013903347.1,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]


In [None]:
sub_na = df_ge['Substrates'].values.tolist()

ls_sub = []
ls_smile =[]



for sub_l in tqdm(sub_na):
    sub_ls_sub = []
    sub_ls_smile = []
    for sub_s in sub_l:
        for sub in sub_s:
            sub_name = sim.get_metabolite(sub).get('name')
            if "2,3-C" in sub_name:
                sub_name = sub_name.replace("2,3","2',3'")
            elif "2,3-c" in sub_name:
                sub_name = sub_name.replace("2,3","2',3'")
            elif "3-triphosphate" in sub_name:
                sub_name = sub_name.replace("3","3'")
            else:
                pass
            smile = get_smiles(sub_name)
            sub_ls_smile.append(smile)
            sub_ls_sub.append(sub_name)
        ls_sub.append(sub_ls_sub)
        ls_smile.append(sub_ls_smile)

df_ge['Substrate Name'] = ls_sub
df_ge['Substrate SMILES'] = ls_smile
df_ge.to_csv(f'../data/ec_data/ge_data_{model_id}.csv',na_rep='None')

  0%|          | 0/2213 [00:00<?, ?it/s]

In [10]:
df_ge = pd.read_csv(f'../data/ec_data/ge_data_{model_id}.csv')
df_ge = df_ge.loc[:,['Gene','Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','BIGG_id','Substrates','Substrate Name','Substrate SMILES','ecNumber']]
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,BIGG_id,Substrates,Substrate Name,Substrate SMILES,ecNumber
0,G_6666666__46__58896__46__peg__46__2230,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",['1.1.1.86']
1,G_6666666__46__58896__46__peg__46__2230,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,R05069,,['2ahbut[c]'],['(S)-2-Aceto-2-hydroxybutanoate'],['CCC(C(=O)C)(C(=O)O)O'],"['1.1.1.86, 5.4.99.3']"
2,G_6666666__46__58896__46__peg__46__1313,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","[""2',3'-cyclic UMP(1-)"", 'Water']",['C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-...,"[['3.1.4.16'], ['3.1.4.16']]"
3,G_6666666__46__58896__46__peg__46__1313,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","[""2',3'-Cyclic CMP"", 'Water']",['C1=CN(C(=O)N=C1N)C2C3C(C(O2)CO)OP(=O)(O3)[O-...,"[['3.1.4.16'], ['3.1.4.16']]"
4,G_6666666__46__58896__46__peg__46__1313,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","[""2',3'-Cyclic AMP"", 'Water']",['C1=NC(=C2C(=N1)N(C=N2)C3C4C(C(O3)CO)OP(=O)(O...,"[['3.1.4.16'], ['3.1.4.16']]"
...,...,...,...,...,...,...,...,...,...,...,...
1686,G_6666666__46__58896__46__peg__46__2922,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
1687,G_6666666__46__58896__46__peg__46__3450,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
1688,G_6666666__46__58896__46__peg__46__3588,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
1689,G_6666666__46__58896__46__peg__46__3637,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]


In [11]:
ec_l = df_ge['ecNumber'].values.tolist()
ec_nl = []
ei = []

for es in ec_l:
    es = str(es)
    es = es.split(',')
    ei = []
    for sublist in es:
        sublist = str(sublist)
        sublist = sublist.strip("[[")
        sublist = sublist.strip(" ")
        sublist = sublist.strip("'")
        sublist = sublist.strip("[")
        sublist = sublist.strip("[")
        sublist = sublist.strip("'")
        sublist = sublist.strip("'")
        sublist = sublist.strip("]")
        sublist = sublist.strip("]]")  
        sublist = sublist.strip("'")
        if sublist not in ei:
            if sublist == 'None' and len(ei) > 0:
                pass
            elif '-' in sublist:
                pass
            else:
                ei.append(sublist)
    ec_nl.append(ei)

df_ge['ecNumber'] = ec_nl

## BRENDA query

### Kcat extraction

In [12]:
from brendapyrser import BRENDA
from brendapyrser import EnzymePropertyDict


dataFile = '../../brenda_2023_1.txt'

In [13]:
brenda = BRENDA(dataFile)
brenda

0,1
Number of Enzymes,7832
BRENDA copyright,"Copyrighted by Dietmar Schomburg, Techn. University  Braunschweig, GERMANY. Distributed under the License as stated  at http:/www.brenda-enzymes.org"
Parser version,0.0.1
Author,"Semidán Robaina Estévez, 2020"


In [None]:
kcat_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()

for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_kcat_ls = []
    #print(ec)
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip(' ')
        if "-" in ec_n:
            pass
        elif ec_n == None:
            sub_kcat_ls.append(None)
            continue
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.get_values()
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
            except:
                sub_kcat_ls.append(None)
    kcat_ls.append(sub_kcat_ls)

df_ge['Avg Kcat (by ec)'] = kcat_ls

  0%|          | 0/1691 [00:00<?, ?it/s]

In [None]:
kcat_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_kcat_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        if "-" in ec_n:
            pass
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.filter_by_organism(organism).get_values()
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
            except:
                sub_kcat_ls.append(None)
    kcat_ls.append(sub_kcat_ls)

df_ge['Avg Kcat (by ec and species)'] = kcat_ls

### Sequence extraction

In [None]:
seq_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_seq_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        elif ec_n == None:
            sub_mw_ls.append(None)
            continue
        else:
            try:
                from zeep import Client
                import hashlib

                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ( "alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","sequence*", "noOfAminoAcids*", "firstAccessionCode*", "source*", "id*", f"organism*{organism}")
                resultString = client.service.getSequence(*parameters) 
                sub_seq_ls.append(resultString[0]['sequence'])
            except:
                sub_seq_ls.append(None)                  
    seq_ls.append(sub_seq_ls)
    
df_ge['Protein Sequence'] = seq_ls

### Molecular Weight extraction

In [None]:
mw_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_mw_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        elif ec_n == None:
            sub_mw_ls.append(None)
            continue
        else:
            try:
                from zeep import Client
                import hashlib
                
                res_mw = 0
                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ("alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","molecularWeight*","molecularWeightMaximum*","commentary*",f"organism*","literature*" )
                resultString = client.service.getMolecularWeight(*parameters)
                for i in range(len(resultString)):
                    res_mw = res_mw + int(resultString[i]['molecularWeight'])
                res_mw = res_mw/len(resultString)
                sub_mw_ls.append(res_mw)
            except:
                sub_mw_ls.append(None)                  
    mw_ls.append(sub_mw_ls)

df_ge['Molecular Weight'] = mw_ls

In [None]:
df_ge

## DLKcat - Kcat prediction

In [None]:
dk_prep = df_ge.drop(columns=['Substrates','ModelSEED_id','MetaNetX','KEGG_id','BIGG_id','ecNumber'])
dk_prep

In [None]:
dk_prep['Substrate Name'] = dk_prep['Substrate Name'].apply(literal_eval) #convert to list type
dk_prep['Substrate SMILES'] = dk_prep['Substrate SMILES'].apply(literal_eval) #convert to list type
dk_prep = dk_prep.explode(['Substrate Name','Substrate SMILES']).reset_index(drop=True)

In [None]:
#dk_prep['Protein Sequence'] = dk_prep['Protein Sequence'].apply(literal_eval) #convert to list type
#dk_prep['Molecular Weight'] = dk_prep['Molecular Weight'].apply(literal_eval) #convert to list type
dk_prep = dk_prep.explode(['Protein Sequence','Molecular Weight','Avg Kcat (by ec)','Avg Kcat (by ec and species)']).reset_index(drop=True)

In [None]:
dk_inp = dk_prep.drop(columns=['Gene','Reaction', 'Name', 'Avg Kcat (by ec)', 'Avg Kcat (by ec and species)','Molecular Weight'])
dk_inp.to_csv(f'../../DLKcat/DeeplearningApproach/Code/example/dk_input_{model_id}.tsv',sep="\t",na_rep='None',index= False)

In [None]:
dk_inp

Run DLKcat

In [22]:
dk_out = pd.read_csv(f'../../DLKcat/DeeplearningApproach/Code/example/output.tsv', sep="\t")
dk_out['Gene'] = dk_prep['Gene']
dk_out['Reaction'] = dk_prep['Reaction']
dk_out['Molecular Weight'] = dk_prep['Molecular Weight']
dk_out['Avg Kcat (by ec)'] = dk_prep['Avg Kcat (by ec)']
dk_out = dk_out.loc[:,['Gene','Reaction','Substrate Name','Substrate SMILES','Protein Sequence','Molecular Weight','Kcat value (1/s)','Avg Kcat (by ec)']]  
dk_out

Unnamed: 0,Gene,Reaction,Substrate Name,Substrate SMILES,Protein Sequence,Molecular Weight,Kcat value (1/s),Avg Kcat (by ec)
0,WP_013904118.1,22IDPOR,Alanopine,CC(C(=O)O)NC(C)C(=O)O,,,6.5328,
1,WP_013904118.1,22IDPOR,Water,O,,,0.8537,
2,WP_013904118.1,22IDPOR,Nicotinamide adenine dinucleotide,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,,,6.6892,
3,HMPREF0833_RS09600,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,MAVQMEYKKDVKVPALDGKKIAVIGYGSQGHAHAQNLRDSGHDVII...,138656.357143,5.7286,2.009126
4,HMPREF0833_RS09600,23DHMPO,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,MAVQMEYKKDVKVPALDGKKIAVIGYGSQGHAHAQNLRDSGHDVII...,138656.357143,9.3451,2.009126
...,...,...,...,...,...,...,...,...
6141,HMPREF0833_RS08265,rtranscription,,,,,,
6142,HMPREF0833_RS09900,rtranscription,,,,,,
6143,WP_003002960.1,rtranscription,,,,,,
6144,WP_013903347.1,rtranscription,,,,,,


### Joining Kcat values

In [23]:
kcat_brenda = dk_out['Avg Kcat (by ec)'].values.tolist()
kcat_dl = dk_out['Kcat value (1/s)'].values.tolist()
new_kcat = []

for i in range(len(kcat_brenda)):
    if kcat_brenda[i] != None and kcat_dl[i] != 'None':
        kcat = (float(kcat_brenda[i]) + float(kcat_dl[i]))/2
    elif kcat_brenda[i] == None and kcat_dl[i] != 'None':
        kcat = kcat_dl[i]
    elif kcat_brenda[i] != None and kcat_dl[i] == 'None':
        kcat = kcat_brenda[i]
    elif kcat_brenda[i] == None and kcat_dl[i] == 'None':
        kcat = 0
    new_kcat.append(kcat)
    
dk_out['New Kcat'] = new_kcat
dk_out

Unnamed: 0,Gene,Reaction,Substrate Name,Substrate SMILES,Protein Sequence,Molecular Weight,Kcat value (1/s),Avg Kcat (by ec),New Kcat
0,WP_013904118.1,22IDPOR,Alanopine,CC(C(=O)O)NC(C)C(=O)O,,,6.5328,,6.5328
1,WP_013904118.1,22IDPOR,Water,O,,,0.8537,,0.8537
2,WP_013904118.1,22IDPOR,Nicotinamide adenine dinucleotide,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,,,6.6892,,6.6892
3,HMPREF0833_RS09600,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,MAVQMEYKKDVKVPALDGKKIAVIGYGSQGHAHAQNLRDSGHDVII...,138656.357143,5.7286,2.009126,3.868863
4,HMPREF0833_RS09600,23DHMPO,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,MAVQMEYKKDVKVPALDGKKIAVIGYGSQGHAHAQNLRDSGHDVII...,138656.357143,9.3451,2.009126,5.677113
...,...,...,...,...,...,...,...,...,...
6141,HMPREF0833_RS08265,rtranscription,,,,,,,0
6142,HMPREF0833_RS09900,rtranscription,,,,,,,0
6143,WP_003002960.1,rtranscription,,,,,,,0
6144,WP_013903347.1,rtranscription,,,,,,,0


In [24]:
dk_out['Molecular Weight'] = dk_out['Molecular Weight'].fillna(0)

dk_out

Unnamed: 0,Gene,Reaction,Substrate Name,Substrate SMILES,Protein Sequence,Molecular Weight,Kcat value (1/s),Avg Kcat (by ec),New Kcat
0,WP_013904118.1,22IDPOR,Alanopine,CC(C(=O)O)NC(C)C(=O)O,,0.000000,6.5328,,6.5328
1,WP_013904118.1,22IDPOR,Water,O,,0.000000,0.8537,,0.8537
2,WP_013904118.1,22IDPOR,Nicotinamide adenine dinucleotide,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,,0.000000,6.6892,,6.6892
3,HMPREF0833_RS09600,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,MAVQMEYKKDVKVPALDGKKIAVIGYGSQGHAHAQNLRDSGHDVII...,138656.357143,5.7286,2.009126,3.868863
4,HMPREF0833_RS09600,23DHMPO,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,MAVQMEYKKDVKVPALDGKKIAVIGYGSQGHAHAQNLRDSGHDVII...,138656.357143,9.3451,2.009126,5.677113
...,...,...,...,...,...,...,...,...,...
6141,HMPREF0833_RS08265,rtranscription,,,,0.000000,,,0
6142,HMPREF0833_RS09900,rtranscription,,,,0.000000,,,0
6143,WP_003002960.1,rtranscription,,,,0.000000,,,0
6144,WP_013903347.1,rtranscription,,,,0.000000,,,0


## Adding ezymatic constraints

In [25]:
genes = dk_out['Gene'].values.tolist()
mweights = dk_out['Molecular Weight'].values.tolist()
kcats = dk_out['New Kcat'].values.tolist()

ec_data = dict()
for gene in genes:
    ge = sim.get_gene(gene).id
    i = genes.index(gene)
    mw = mweights[i]
    kcat = kcats[i]
    if kcat == 0:
        kcat = 1
    ec_data[ge]={'protein':ge[len(sim._g_prefix):],'mw':mw,'kcat':kcat}
    
print(ec_data)

{'WP_013904118.1': {'protein': 'WP_013904118.1', 'mw': 0.0, 'kcat': '6.5328'}, 'HMPREF0833_RS09600': {'protein': 'HMPREF0833_RS09600', 'mw': 138656.35714285713, 'kcat': 3.868863120068027}, 'WP_003010246.1': {'protein': 'WP_003010246.1', 'mw': 138656.35714285713, 'kcat': 3.868863120068027}, '760570.3.1875.peg': {'protein': '760570.3.1875.peg', 'mw': 0.0, 'kcat': '0.3540'}, 'HMPREF0833_RS07505': {'protein': 'HMPREF0833_RS07505', 'mw': 0.0, 'kcat': '27.2297'}, 'HMPREF0833_RS00645': {'protein': 'HMPREF0833_RS00645', 'mw': 107345.45454545454, 'kcat': 247.0160364408603}, 'HMPREF0833_RS02245': {'protein': 'HMPREF0833_RS02245', 'mw': 107345.45454545454, 'kcat': 247.0160364408603}, 'WP_003014794.1': {'protein': 'WP_003014794.1', 'mw': 107345.45454545454, 'kcat': 247.0160364408603}, 'WP_013903259.1': {'protein': 'WP_013903259.1', 'mw': 107345.45454545454, 'kcat': 247.0160364408603}, 'HMPREF0833_RS02475': {'protein': 'HMPREF0833_RS02475', 'mw': 9558.2, 'kcat': 5140.637736363636}, 'HMPREF0833_RS08

In [26]:
with open(f"../data/ec_data/{model_id}.json", "w") as f:
        json.dump(ec_data,f)

In [27]:
with open(f"../data/ec_data/{model_id}.json") as json_file:
    ec_data = json.load(json_file)

In [28]:
print("Genes with missing data:")
for k,v in ec_data.items():
    if v['mw']==0:
        print(k)

Genes with missing data:
WP_013904118.1
760570.3.1875.peg
HMPREF0833_RS07505
HMPREF0833_RS08345
WP_013904493.1
HMPREF0833_RS05860
WP_013904151.1
WP_041818454.1
760570.3.915.peg
760570.3.761.peg
760570.3.816.peg
HMPREF0833_RS07800
WP_013904439.1
HMPREF0833_RS07520
HMPREF0833_RS07525
HMPREF0833_RS02145
WP_013903504.1
HMPREF0833_RS05690
WP_003015562.1
HMPREF0833_RS10095
WP_003018099.1
WP_013904154.1
HMPREF0833_RS07875
HMPREF0833_RS04135
WP_003018264.1
760570.3.1876.peg
HMPREF0833_RS09645
HMPREF0833_RS01055
Unknown
HMPREF0833_RS03965
WP_013903816.1
HMPREF0833_RS01935
HMPREF0833_RS06240
WP_013904211.1
HMPREF0833_RS03910
WP_013903805.1
760570.3.1105.peg
HMPREF0833_RS05255
HMPREF0833_RS07540
WP_041818395.1
760570.3.1865.peg
HMPREF0833_RS08680
HMPREF0833_RS09990
WP_013904745.1
HMPREF0833_RS02770
WP_013903608.1
HMPREF0833_RS00190
WP_013903188.1
WP_013904072.1
HMPREF0833_RS02890
WP_003016299.1
760570.3.553_12.peg
HMPREF0833_RS01610
HMPREF0833_RS05530
WP_013903404.1
HMPREF0833_RS00700
HMPREF0833_

In [29]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from tmcom.util import add_enzyme_constraints

ec_sim = add_enzyme_constraints(sim, ec_data)

Read LP format model from file /tmp/tmp50jdphlu.lp
Reading time = 0.01 seconds
: 1231 rows, 2516 columns, 10874 nonzeros


Converting to irreversible: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1258/1258 [00:02<00:00, 586.54it/s]
Adding gene species: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 899/899 [00:04<00:00, 201.76it/s]
  warn("need to pass in a list")
Adding proteins usage to reactions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

### Protein pool usage

In [30]:
res = ec_sim.simulate()
print(res)

objective: 39.14478450300015
Status: OPTIMAL
Method:SimulationMethod.FBA


In [31]:
res.find('pool')

Unnamed: 0_level_0,Flux rate
Reaction ID,Unnamed: 1_level_1
prot_pool_exchange,647294.963356


### Save the model

In [32]:
write_sbml_model(ec_sim.model,f"../models/ec/ec_{model_id}.xml")

In [5]:
ec_model = read_sbml_model(f"../models/ec/ec_{model_id}.xml")
ec_sim = get_simulator(ec_model)

In [7]:
env = Environment.complete(ec_sim, max_uptake=1000.0, inplace=False)

env['prot_pool_exchange']=(0,1.0)

ec_sim.simulate(constraints=env)

objective: 0.0014375934065246336
Status: OPTIMAL
Method:SimulationMethod.FBA