# Enzymatic Constraints Enhancement of AGORA models

### Context



### Goals:

- 

In [36]:
from tqdm.auto import tqdm
from reframed.io.sbml import load_cbmodel,save_cbmodel
from cobra.io import read_sbml_model, write_sbml_model
from mewpy.cobra.util import add_enzyme_constraints,convert_gpr_to_dnf,split_isozymes
from mewpy.simulation import get_simulator
from mewpy.simulation.environment import Environment as Environment
from mewpy.util.request import retreive_gene,retreive_protein,get_smiles,brenda_query
import pandas as pd
import numpy as np
from urllib.request import urlopen
from functools import reduce
import json
from ast import literal_eval

In [37]:
filepath = "../models/non-ec/agora/Fusobacterium_nucleatum_subsp_nucleatum_ATCC_25586.xml"
model = read_sbml_model(filepath)
model2 = load_cbmodel(filepath)
organism = 'Fusobacterium nucleatum'

In [38]:
model.id

'M_Fusobacterium_nucleatum_subsp_nucleatum_ATCC_25586'

In [39]:
model_id = str(model.id.strip('M_'))
print(model_id)

Fusobacterium_nucleatum_subsp_nucleatum_ATCC_25586


In [40]:
sim = get_simulator(model)
sim.set_objective("biomass")

sim2 = get_simulator(model2)
sim2.set_objective("R_biomass")

In [6]:
sim.simulate()

objective: 186.53129283071732
Status: OPTIMAL
Method:SimulationMethod.FBA

In [7]:
sim2.simulate()

objective: None
Status: INF_OR_UNB
Method:SimulationMethod.FBA

## Annotation scraping

In [12]:
ls_ge = []

for ge in sim.genes:
    i = sim.genes.index(ge)
    gene = sim.genes[i]
    rxns = sim.get_gene(ge).reactions
    for rx in rxns:
        anno = sim.get_reaction(rx)['annotations']
        seed_id = anno.get('seed.reactions')
        rxn_name = sim.get_reaction(rx).name
        ecnumber = anno.get('ec-code')
        metanetx = anno.get('metanetx.reaction')
        kegg = anno.get('kegg.reaction')
        res = [gene,rx,rxn_name,seed_id,metanetx,kegg,ecnumber]
        ls_ge.append(res)

df_ge = pd.DataFrame(ls_ge,columns=[['Gene','Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','ecNumber']])

df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber
0,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,
1,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,
2,585397.9.1830_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,
3,585397.9.1830_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,
4,585397.9.1830_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,
...,...,...,...,...,...,...,...
3645,ECED1_RS23680,rtranscription,RNA transcription c0,rxn13784,,,
3646,ECED1_RS23685,rtranscription,RNA transcription c0,rxn13784,,,
3647,WP_000239730.1,rtranscription,RNA transcription c0,rxn13784,,,
3648,WP_000437380.1,rtranscription,RNA transcription c0,rxn13784,,,


In [13]:
mask = df_ge['ecNumber'].values!=None
ec_nona = df_ge[mask]
ec_nona

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber
170,ECED1_RS02175,1P4H2CBXLAH,1-Pyrroline-4-hydroxy-2-carboxylate aminohydro...,rxn01635,MNXR93752,,3.5.4.22
171,ECED1_RS02175,DHPPDA,diaminohydroxyphosphoribosylaminopyrimidine de...,rxn02475,MNXR82287,,3.5.4.26
172,ECED1_RS22475,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,1.1.1.86
173,ECED1_RS22475,2AHBUTI,(S)-2-Aceto-2-hydroxybutanoate isomerase,rxn03436,MNXR76597,,"1.1.1.86, 5.4.99.3"
174,WP_000024951.1,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,1.1.1.86
...,...,...,...,...,...,...,...
3632,WP_000139091.1,r0480,S-Adenosyl-L-Methionine:Ethanolamine-Phosphate...,rxn01478,MNXR105345,R02037,2.1.1.103
3633,WP_000139091.1,r0788,S-Adenosyl-L-Methionine:Methylethanolamine Pho...,rxn04681,MNXR105385,R06868,2.1.1.103
3634,WP_000139091.1,r0789,S-Adenosyl-L-Methionine:Phosphodimethylethanol...,rxn04682,MNXR105386,R06869,2.1.1.103
3635,WP_000513796.1,r0641,(5-Glutamyl)-Peptide:Amino-Acid 5-Glutamyltran...,rxn02727,MNXR100134,,3.4.19.14


In [14]:
seed_id = df_ge['ModelSEED_id'].values.tolist()

seed_id = [reduce(lambda x: x, inner_list) for inner_list in seed_id]

metanetx_id = df_ge['MetaNetX'].values.tolist()

metanetx_id = [reduce(lambda x: x, inner_list) for inner_list in metanetx_id]

kegg_id = df_ge['KEGG_id'].values.tolist()

kegg_id = [reduce(lambda x: x, inner_list) for inner_list in kegg_id]

 ## ModelSEED query

In [15]:
SOLR_URL='https://modelseed.org'
ls_name = []
ls_kegg = []
ls_bigg = []

for mseed_id in tqdm(seed_id):
    i = seed_id.index(mseed_id)
    if mseed_id == None:
        ls_name.append('None')
        ls_kegg.append('None')
        ls_bigg.append('None')
    else:
        try:
            connection = urlopen(SOLR_URL+f'/solr/reactions/select?wt=json&q=id:{mseed_id}&fl=name,id,formula,charge,aliases')
            response = json.load(connection)
            for document in response['response']['docs']:  
                ms_name = document.get('name')
                ls_alias = document.get('aliases')
                ms_bigg = list(filter(lambda a: 'BiGG:' in a, document.get('aliases')))
                ms_kegg = list(filter(lambda a: 'KEGG:' in a, document.get('aliases')))
                if len(ms_bigg)== 0 and len(ms_kegg)== 0:
                    ms_bigg = 'None'
                    ms_kegg = 'None'
                elif len(ms_bigg)== 0 and len(ms_kegg)!= 0:
                    ms_bigg = 'None'
                    ms_kegg = list(ms_kegg)[0]
                    ms_kegg = ms_kegg.replace('KEGG: ','')
                elif len(ms_bigg)!= 0 and len(ms_kegg)== 0:
                    ms_kegg = 'None'
                    ms_bigg = list(ms_bigg)[0]
                    ms_bigg = ms_bigg.replace('BiGG: ','')
                else:
                    ms_kegg = list(ms_kegg)[0]
                    ms_kegg = ms_kegg.replace('KEGG: ','')
                    ms_bigg = list(ms_bigg)[0]
                    ms_bigg = ms_bigg.replace('BiGG: ','')    
                ls_name.append(ms_name)
                ls_bigg.append(ms_bigg)
                ls_kegg.append(ms_kegg)           
        except:
            ls_name.append('None')
            ls_kegg.append('None')
            ls_bigg.append('None')

  0%|          | 0/3650 [00:00<?, ?it/s]

In [16]:
new_kegg = [next(filter(None, i)) for i in zip(ls_kegg, kegg_id)]

In [17]:
df_ge['BIGG_id'] = ls_bigg
df_ge['KEGG_id'] = new_kegg

## BiGG query

In [18]:
import requests

ls_bigg = df_ge['BIGG_id'].values.tolist()
bigg_ls = []

for bigg in tqdm(ls_bigg):
    for bigg_n in bigg:
        bigg_n = str(bigg_n)
        bigg_n = bigg_n.split(';')
        bigg_n = [x.strip(' ') for x in bigg_n]
        sub_bigg_ls = []
        for bi in bigg_n:
            if bi == 'None':
                pass
            else:
                url =f'http://bigg.ucsd.edu/api/v2/universal/reactions/{bi}'
                with requests.request("GET", url) as resp:
                    try:
                        resp.raise_for_status()  # raises exception when not a 2xx response
                        if resp.status_code != 204:
                            data = dict(resp.json())
                            ec_l = data['database_links']
                            if ec_l == None:
                                sub_bigg_ls.append(None)
                            else:
                                ec = [i['id'] for i in ec_l['EC Number']]
                                if ec == None:
                                    sub_bigg_ls.append(None)
                                sub_bigg_ls.append(ec)
                        else: 
                            sub_bigg_ls.append(None)
                    except:
                        sub_bigg_ls.append(None)
    bigg_ls.append(sub_bigg_ls)

  0%|          | 0/3650 [00:00<?, ?it/s]

In [19]:
ec_l = df_ge['ecNumber'].values.tolist()
new_l = [next(filter(None, i)) for i in zip(bigg_ls, ec_l)]
df_ge['ecNumber'] = new_l

In [20]:
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id
0,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,[None],
1,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,[None],
2,585397.9.1830_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,[None],
3,585397.9.1830_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,[None],
4,585397.9.1830_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,[None],
...,...,...,...,...,...,...,...,...
3645,ECED1_RS23680,rtranscription,RNA transcription c0,rxn13784,,,[None],
3646,ECED1_RS23685,rtranscription,RNA transcription c0,rxn13784,,,[None],
3647,WP_000239730.1,rtranscription,RNA transcription c0,rxn13784,,,[None],
3648,WP_000437380.1,rtranscription,RNA transcription c0,rxn13784,,,[None],


## Substrates

In [21]:
rx_l = df_ge['Reaction'].values.tolist()
ls_sub = []

for rxn in rx_l:
    for rx in rxn:
        sub = list(sim.get_substrates(rx).keys())
        ls_sub.append(sub)
    
df_ge["Substrates"] = ls_sub   

df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id,Substrates
0,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,[None],,"[1hibupglu_S[c], h2o[c]]"
1,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,[None],,"[1hmdgluc[c], h2o[c]]"
2,585397.9.1830_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,[None],,"[2hatvacidgluc[c], h2o[c]]"
3,585397.9.1830_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,[None],,"[2hatvlacgluc[c], h2o[c]]"
4,585397.9.1830_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,[None],,"[2hibupglu_S[c], h2o[c]]"
...,...,...,...,...,...,...,...,...,...
3645,ECED1_RS23680,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
3646,ECED1_RS23685,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
3647,WP_000239730.1,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
3648,WP_000437380.1,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]


In [22]:
sub_na = df_ge['Substrates'].values.tolist()

ls_sub = []
ls_smile =[]



for sub_l in tqdm(sub_na):
    sub_ls_sub = []
    sub_ls_smile = []
    for sub_s in sub_l:
        for sub in sub_s:
            sub_name = sim.get_metabolite(sub).get('name')
            if "2,3-C" in sub_name:
                sub_name = sub_name.replace("2,3","2',3'")
            elif "2,3-c" in sub_name:
                sub_name = sub_name.replace("2,3","2',3'")
            elif "3-triphosphate" in sub_name:
                sub_name = sub_name.replace("3","3'")
            else:
                pass
            smile = get_smiles(sub_name)
            sub_ls_smile.append(smile)
            sub_ls_sub.append(sub_name)
        ls_sub.append(sub_ls_sub)
        ls_smile.append(sub_ls_smile)

df_ge['Substrate Name'] = ls_sub
df_ge['Substrate SMILES'] = ls_smile
df_ge.to_csv(f'../data/ec_data/ge_data_{model_id}.csv',na_rep='None')

  0%|          | 0/3650 [00:00<?, ?it/s]

In [8]:
df_ge = pd.read_csv(f'../data/ec_data/ge_data_{model_id}.csv')
df_ge = df_ge.loc[:,['Gene','Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','BIGG_id','Substrates','Substrate Name','Substrate SMILES','ecNumber']]
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,BIGG_id,Substrates,Substrate Name,Substrate SMILES,ecNumber
0,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,,"['1hibupglu_S[c]', 'h2o[c]']","['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None]
1,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,,"['1hmdgluc[c]', 'h2o[c]']","['1-OH-midazolam-glucuronide', 'Water']","[None, 'O']",[None]
2,585397.9.1830_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,,"['2hatvacidgluc[c]', 'h2o[c]']","['2-hydroxy-atorvastatin-acyl-glucuronide', 'W...","[None, 'O']",[None]
3,585397.9.1830_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,,"['2hatvlacgluc[c]', 'h2o[c]']","['2-hydroxy-atorvastatin-lactone-glucuronide',...","[None, 'O']",[None]
4,585397.9.1830_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,,"['2hibupglu_S[c]', 'h2o[c]']","['2-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None]
...,...,...,...,...,...,...,...,...,...,...,...
3645,ECED1_RS23680,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
3646,ECED1_RS23685,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
3647,WP_000239730.1,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
3648,WP_000437380.1,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]


In [9]:
ec_l = df_ge['ecNumber'].values.tolist()
ec_nl = []
ei = []

for es in ec_l:
    es = str(es)
    es = es.split(',')
    ei = []
    for sublist in es:
        sublist = str(sublist)
        sublist = sublist.strip("[[")
        sublist = sublist.strip(" ")
        sublist = sublist.strip("'")
        sublist = sublist.strip("[")
        sublist = sublist.strip("[")
        sublist = sublist.strip("'")
        sublist = sublist.strip("'")
        sublist = sublist.strip("]")
        sublist = sublist.strip("]]")  
        sublist = sublist.strip("'")
        if sublist not in ei:
            if sublist == 'None' and len(ei) > 0:
                pass
            elif '-' in sublist:
                pass
            else:
                ei.append(sublist)
    ec_nl.append(ei)

df_ge['ecNumber'] = ec_nl

## BRENDA query

### Kcat extraction

In [10]:
from brendapyrser import BRENDA
from brendapyrser import EnzymePropertyDict


dataFile = '../../brenda_2023_1.txt'

In [11]:
brenda = BRENDA(dataFile)
brenda

0,1
Number of Enzymes,7832
BRENDA copyright,"Copyrighted by Dietmar Schomburg, Techn. University  Braunschweig, GERMANY. Distributed under the License as stated  at http:/www.brenda-enzymes.org"
Parser version,0.0.1
Author,"Semidán Robaina Estévez, 2020"


In [12]:
kcat_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()

for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_kcat_ls = []
    #print(ec)
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip(' ')
        if "-" in ec_n:
            pass
        elif ec_n == None:
            sub_kcat_ls.append(None)
            continue
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.get_values()
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
            except:
                sub_kcat_ls.append(None)
    kcat_ls.append(sub_kcat_ls)

df_ge['Avg Kcat (by ec)'] = kcat_ls

  0%|          | 0/3650 [00:00<?, ?it/s]

In [13]:
kcat_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_kcat_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        if "-" in ec_n:
            pass
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.filter_by_organism(organism).get_values()
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
            except:
                sub_kcat_ls.append(None)
    kcat_ls.append(sub_kcat_ls)

df_ge['Avg Kcat (by ec and species)'] = kcat_ls

  0%|          | 0/3650 [00:00<?, ?it/s]

### Sequence extraction

In [14]:
seq_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_seq_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        elif ec_n == None:
            sub_mw_ls.append(None)
            continue
        else:
            try:
                from zeep import Client
                import hashlib

                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ( "alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","sequence*", "noOfAminoAcids*", "firstAccessionCode*", "source*", "id*", f"organism*{organism}")
                resultString = client.service.getSequence(*parameters) 
                sub_seq_ls.append(resultString[0]['sequence'])
            except:
                sub_seq_ls.append(None)                  
    seq_ls.append(sub_seq_ls)
    
df_ge['Protein Sequence'] = seq_ls

  0%|          | 0/3650 [00:00<?, ?it/s]

### Molecular Weight extraction

In [15]:
mw_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec=str(ec)
    ec = ec.split(',')
    sub_mw_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        elif ec_n == None:
            sub_mw_ls.append(None)
            continue
        else:
            try:
                from zeep import Client
                import hashlib
                
                res_mw = 0
                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ("alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","molecularWeight*","molecularWeightMaximum*","commentary*",f"organism*{organism}","literature*" )
                resultString = client.service.getMolecularWeight(*parameters)
                for i in range(len(resultString)):
                    res_mw = res_mw + int(resultString[i]['molecularWeight'])
                res_mw = res_mw/len(resultString)
                sub_mw_ls.append(res_mw)
            except:
                sub_mw_ls.append(None)                  
    mw_ls.append(sub_mw_ls)

df_ge['Molecular Weight'] = mw_ls

  0%|          | 0/3650 [00:00<?, ?it/s]

In [16]:
from zeep import Client
import hashlib

res_mw = 0

wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
client = Client(wsdl)
parameters = ("alexandreareias1718@gmail.com",password,"ecNumber*1.1.1.86", "molecularWeight*", "molecularWeightMaximum*", "commentary*", f"organism*{organism}", "literature*")
resultString = client.service.getMolecularWeight(*parameters) 
for i in range(len(resultString)):
        res_mw = res_mw + int(resultString[i]['molecularWeight'])
res_mw = res_mw/len(resultString)
print(res_mw)

141357.0


In [17]:
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,BIGG_id,Substrates,Substrate Name,Substrate SMILES,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),Protein Sequence,Molecular Weight
0,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,,"['1hibupglu_S[c]', 'h2o[c]']","['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None],[None]
1,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,,"['1hmdgluc[c]', 'h2o[c]']","['1-OH-midazolam-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None],[None]
2,585397.9.1830_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,,"['2hatvacidgluc[c]', 'h2o[c]']","['2-hydroxy-atorvastatin-acyl-glucuronide', 'W...","[None, 'O']",[None],[None],[None],[None],[None]
3,585397.9.1830_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,,"['2hatvlacgluc[c]', 'h2o[c]']","['2-hydroxy-atorvastatin-lactone-glucuronide',...","[None, 'O']",[None],[None],[None],[None],[None]
4,585397.9.1830_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,,"['2hibupglu_S[c]', 'h2o[c]']","['2-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None],[None]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3645,ECED1_RS23680,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]
3646,ECED1_RS23685,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]
3647,WP_000239730.1,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]
3648,WP_000437380.1,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]


## DLKcat - Kcat prediction

In [18]:
dk_prep = df_ge.drop(columns=['Substrates','ModelSEED_id','MetaNetX','KEGG_id','BIGG_id','ecNumber'])
dk_prep

Unnamed: 0,Gene,Reaction,Name,Substrate Name,Substrate SMILES,Avg Kcat (by ec),Avg Kcat (by ec and species),Protein Sequence,Molecular Weight
0,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,"['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None]
1,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,"['1-OH-midazolam-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None]
2,585397.9.1830_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,"['2-hydroxy-atorvastatin-acyl-glucuronide', 'W...","[None, 'O']",[None],[None],[None],[None]
3,585397.9.1830_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,"['2-hydroxy-atorvastatin-lactone-glucuronide',...","[None, 'O']",[None],[None],[None],[None]
4,585397.9.1830_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,"['2-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None]
...,...,...,...,...,...,...,...,...,...
3645,ECED1_RS23680,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]
3646,ECED1_RS23685,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]
3647,WP_000239730.1,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]
3648,WP_000437380.1,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]


In [19]:
dk_prep['Substrate Name'] = dk_prep['Substrate Name'].apply(literal_eval) #convert to list type
dk_prep['Substrate SMILES'] = dk_prep['Substrate SMILES'].apply(literal_eval) #convert to list type
dk_prep = dk_prep.explode(['Substrate Name','Substrate SMILES']).reset_index(drop=True)

In [20]:
#dk_prep['Protein Sequence'] = dk_prep['Protein Sequence'].apply(literal_eval) #convert to list type
#dk_prep['Molecular Weight'] = dk_prep['Molecular Weight'].apply(literal_eval) #convert to list type
dk_prep = dk_prep.explode(['Protein Sequence','Molecular Weight','Avg Kcat (by ec)','Avg Kcat (by ec and species)']).reset_index(drop=True)

In [21]:
dk_inp = dk_prep.drop(columns=['Gene','Reaction', 'Name', 'Avg Kcat (by ec)', 'Avg Kcat (by ec and species)','Molecular Weight'])
dk_inp.to_csv(f'../../DLKcat/DeeplearningApproach/Code/example/dk_input_{model_id}.tsv',sep="\t",na_rep='None',index= False)

In [22]:
dk_inp

Unnamed: 0,Substrate Name,Substrate SMILES,Protein Sequence
0,1-hydroxy S-ibuprofen-glucuronide,,
1,Water,O,
2,1-OH-midazolam-glucuronide,,
3,Water,O,
4,2-hydroxy-atorvastatin-acyl-glucuronide,,
...,...,...,...
9833,,,
9834,,,
9835,,,
9836,,,


Run DLKcat

In [23]:
dk_out = pd.read_csv(f'../../DLKcat/DeeplearningApproach/Code/example/output.tsv', sep="\t")
dk_out['Gene'] = dk_prep['Gene']
dk_out['Reaction'] = dk_prep['Reaction']
dk_out['Molecular Weight'] = dk_prep['Molecular Weight']
dk_out['Avg Kcat (by ec)'] = dk_prep['Avg Kcat (by ec)']
dk_out = dk_out.loc[:,['Gene','Reaction','Substrate Name','Substrate SMILES','Protein Sequence','Molecular Weight','Kcat value (1/s)','Avg Kcat (by ec)']]  
dk_out

Unnamed: 0,Gene,Reaction,Substrate Name,Substrate SMILES,Protein Sequence,Molecular Weight,Kcat value (1/s),Avg Kcat (by ec)
0,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen-glucuronide,,,,,
1,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,Water,O,,,0.8537,
2,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide,,,,,
3,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,Water,O,,,0.8537,
4,585397.9.1830_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide,,,,,
...,...,...,...,...,...,...,...,...
9833,ECED1_RS23680,rtranscription,,,,,,
9834,ECED1_RS23685,rtranscription,,,,,,
9835,WP_000239730.1,rtranscription,,,,,,
9836,WP_000437380.1,rtranscription,,,,,,


### Joining Kcat values

In [24]:
kcat_brenda = dk_out['Avg Kcat (by ec)'].values.tolist()
kcat_dl = dk_out['Kcat value (1/s)'].values.tolist()
new_kcat = []

for i in range(len(kcat_brenda)):
    if kcat_brenda[i] != None and kcat_dl[i] != 'None':
        kcat = (float(kcat_brenda[i]) + float(kcat_dl[i]))/2
    elif kcat_brenda[i] == None and kcat_dl[i] != 'None':
        kcat = kcat_dl[i]
    elif kcat_brenda[i] != None and kcat_dl[i] == 'None':
        kcat = kcat_brenda[i]
    elif kcat_brenda[i] == None and kcat_dl[i] == 'None':
        kcat = 0
    new_kcat.append(kcat)
    
dk_out['New Kcat'] = new_kcat
dk_out

Unnamed: 0,Gene,Reaction,Substrate Name,Substrate SMILES,Protein Sequence,Molecular Weight,Kcat value (1/s),Avg Kcat (by ec),New Kcat
0,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen-glucuronide,,,,,,0
1,585397.9.1830_56.peg,1HIBUP_S_GLCAASE,Water,O,,,0.8537,,0.8537
2,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide,,,,,,0
3,585397.9.1830_56.peg,1HMDGLUC_GLCAASE,Water,O,,,0.8537,,0.8537
4,585397.9.1830_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide,,,,,,0
...,...,...,...,...,...,...,...,...,...
9833,ECED1_RS23680,rtranscription,,,,,,,0
9834,ECED1_RS23685,rtranscription,,,,,,,0
9835,WP_000239730.1,rtranscription,,,,,,,0
9836,WP_000437380.1,rtranscription,,,,,,,0


## Adding ezymatic constraints

In [25]:
genes = dk_out['Gene'].values.tolist()
mweights = dk_out['Molecular Weight'].values.tolist()
kcats = dk_out['New Kcat'].values.tolist()

ec_data = dict()
for gene in genes:
    ge = sim.get_gene(gene).id
    i = genes.index(gene)
    mw = mweights[i]
    if mw == None:
        mw = 0
    kcat = kcats[i]
    if kcat == None:
        kcat = 1
    ec_data[ge]={'protein':ge[len(sim._g_prefix):],'mw':mw,'kcat':kcat}
    
print(ec_data)
with open(f"../data/ec_data/{model_id}.json", "w") as f:
    json.dump(ec_data,f)

{'585397.9.1830_56.peg': {'protein': '585397.9.1830_56.peg', 'mw': 0, 'kcat': 0}, 'ECED1_RS02175': {'protein': 'ECED1_RS02175', 'mw': 0, 'kcat': 32.4219}, 'ECED1_RS22475': {'protein': 'ECED1_RS22475', 'mw': 141357.0, 'kcat': 3.222513120068027}, 'WP_000024951.1': {'protein': 'WP_000024951.1', 'mw': 141357.0, 'kcat': 3.222513120068027}, 'ECED1_RS25540': {'protein': 'ECED1_RS25540', 'mw': 0, 'kcat': 5.361464705882353}, 'WP_001116472.1': {'protein': 'WP_001116472.1', 'mw': 0, 'kcat': '5.4670'}, 'WP_001198717.1': {'protein': 'WP_001198717.1', 'mw': 0, 'kcat': '5.4670'}, 'ECED1_RS12160': {'protein': 'ECED1_RS12160', 'mw': 0, 'kcat': '5.4670'}, 'WP_000048190.1': {'protein': 'WP_000048190.1', 'mw': 0, 'kcat': '5.4670'}, 'ECED1_RS19110': {'protein': 'ECED1_RS19110', 'mw': 0, 'kcat': 60.425050000000006}, 'ECED1_RS23775': {'protein': 'ECED1_RS23775', 'mw': 0, 'kcat': '27.2297'}, 'WP_000963849.1': {'protein': 'WP_000963849.1', 'mw': 0, 'kcat': 0}, 'ECED1_RS13850': {'protein': 'ECED1_RS13850', 'mw'

In [41]:
with open(f"../data/ec_data/{model_id}.json") as json_file:
    ec_data = json.load(json_file)

In [42]:
print("Genes with missing data:")
for k,v in ec_data.items():
    if v['mw']==0:
        print(k)

Genes with missing data:
g.191.peg.13
190304.8.242.peg
g.191.peg.1069
g.191.peg.177
g.191.peg.1752
g.191.peg.561
g.191.peg.1073
g.191.peg.1079
g.191.peg.688
g.191.peg.846
g.191.peg.70
g.191.peg.1104
g.191.peg.1715
g.191.peg.704
g.191.peg.1644
g.191.peg.1851
g.191.peg.418
g.191.peg.1730
g.191.peg.546
190304.8.218.peg
190304.8.219_22.peg
190304.8.1540.peg
190304.8.220_21.peg
190304.8.1542.peg
190304.8.532.peg
190304.8.1863_17.peg
g.191.peg.186
g.191.peg.1889
g.191.peg.552
190304.8.1475_37.peg
g.191.peg.1396
190304.8.97.peg
g.191.peg.1899
190304.8.1547_12.peg
g.191.peg.1003
g.191.peg.1224
g.191.peg.1335
g.191.peg.342
g.191.peg.390
g.191.peg.1297
g.191.peg.397
g.191.peg.1445
g.191.peg.1168
190304.8.1567.peg
190304.8.46.peg
190304.8.247_16.peg
190304.8.799_2.peg
g.191.peg.1424
190304.8.11_2_5.peg
g.191.peg.112
g.191.peg.728
190304.8.1339.peg
190304.8.996_5.peg
190304.8.1979.peg
190304.8.912.peg
190304.8.1786.peg
190304.8.356.peg
g.191.peg.1792
g.191.peg.1004
g.191.peg.247
190304.8.637_10.pe

In [43]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from tmcom.util import add_enzyme_constraints

ec_sim = add_enzyme_constraints(sim, ec_data)

Read LP format model from file /tmp/tmpxpcdyf40.lp
Reading time = 0.01 seconds
: 1135 rows, 2406 columns, 9502 nonzeros


Converting to irreversible: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1203/1203 [00:06<00:00, 198.33it/s]
Adding gene species: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 503/503 [00:02<00:00, 223.52it/s]
  warn("need to pass in a list")
Adding proteins usage to reactions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

### Protein pool usage

In [44]:
res = ec_sim.simulate()
print(res)

objective: 60.78973722691819
Status: OPTIMAL
Method:SimulationMethod.FBA


In [45]:
res.find('pool')

Unnamed: 0_level_0,Flux rate
Reaction ID,Unnamed: 1_level_1
prot_pool_exchange,134016.314386


### Save the model

In [46]:
write_sbml_model(ec_sim.model,f"../models/ec/ec_{model_id}.xml")

In [47]:
env = Environment.complete(ec_sim, max_uptake=1000.0, inplace=False)

env['prot_pool_exchange']=(0,0.8)

ec_sim.simulate(constraints=env)

objective: 0.001558568530208206
Status: OPTIMAL
Method:SimulationMethod.FBA