# Enzymatic Constraints Enhancement of AGORA models

### Context



### Goals:

- 

In [1]:
from tqdm.auto import tqdm
from reframed.io.sbml import load_cbmodel,save_cbmodel
from cobra.io import read_sbml_model, write_sbml_model
from mewpy.cobra.util import add_enzyme_constraints,convert_gpr_to_dnf,split_isozymes
from mewpy.simulation import get_simulator
from mewpy.simulation.environment import Environment as Environment
from mewpy.util.request import retreive_gene,retreive_protein,get_smiles,brenda_query
import pandas as pd
import numpy as np
from urllib.request import urlopen
from functools import reduce
import json
from ast import literal_eval

In [2]:
# SBML file of the AGORA reconstruction that has no enzymatic constraints yet.
filepath = "../models/non-ec/agora/Bacteroides_uniformis_ATCC_8492.xml"
# The same file is loaded twice, once with cobra and once with reframed, so a
# simulator can be built on each of them further down.
model = read_sbml_model(filepath)
model2 = load_cbmodel(filepath)
# Organism name used later to filter BRENDA kcat values by species.
# NOTE(review): the model file is B. uniformis while this names B. thetaiotaomicron;
# presumably a deliberate proxy species - confirm.
organism = 'Bacteroides thetaiotaomicron'

Set parameter Username


In [3]:
model.id

'M_Bacteroides_uniformis_ATCC_8492'

In [4]:
# Simulator over the cobra-loaded model; the objective reaction id is "biomass".
sim = get_simulator(model)
sim.set_objective("biomass")

# Simulator over the reframed-loaded model; here the objective id carries an "R_" prefix.
sim2 = get_simulator(model2)
sim2.set_objective("R_biomass")

In [5]:
sim.simulate()

objective: 71.40464067328286
Status: OPTIMAL
Method:SimulationMethod.FBA

In [6]:
sim2.simulate()

objective: None
Status: INF_OR_UNB
Method:SimulationMethod.FBA

## Annotation scraping

In [7]:
ls_ge = []

# One row per (gene, reaction) pair with the cross-reference ids stored in the
# reaction's annotations; annotation keys that are absent come back as None.
for gene in sim.genes:
    for rx in sim.get_gene(gene).reactions:
        reaction = sim.get_reaction(rx)  # fetched once and reused for name + annotations
        anno = reaction['annotations']
        ls_ge.append([
            gene,
            rx,
            reaction.name,
            anno.get('seed.reactions'),
            anno.get('metanetx.reaction'),
            anno.get('kegg.reaction'),
            anno.get('ec-code'),
        ])

# NOTE: the nested list makes pandas build MultiIndex columns, so df_ge['<col>']
# yields a one-column frame and `.values.tolist()` yields one-element rows. The
# following cells unwrap those rows, so do not flatten this without updating them.
df_ge = pd.DataFrame(ls_ge,columns=[['Gene','Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','ecNumber']])

df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber
0,411479.1.3062_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,
1,411479.1.3062_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,
2,411479.1.3062_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,
3,411479.1.3062_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,
4,411479.1.3062_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,
...,...,...,...,...,...,...,...
1717,411479.1.peg.2102,rtranscription,RNA transcription c0,rxn13784,,,
1718,411479.1.peg.3030,rtranscription,RNA transcription c0,rxn13784,,,
1719,411479.1.peg.3645,rtranscription,RNA transcription c0,rxn13784,,,
1720,411479.1.peg.3749,rtranscription,RNA transcription c0,rxn13784,,,


In [8]:
def _column_as_list(frame, column):
    """Return the values of ``column`` as a flat Python list.

    ``df_ge`` is built with nested column labels, so selecting a column returns
    a one-column DataFrame rather than a Series. In that case the single
    underlying column is taken explicitly; a plain Series is passed through.
    """
    values = frame[column]
    if isinstance(values, pd.DataFrame):
        values = values.iloc[:, 0]
    return values.tolist()


# Flat id lists, one entry per table row (None where the model has no annotation).
seed_id = _column_as_list(df_ge, 'ModelSEED_id')

metanetx_id = _column_as_list(df_ge, 'MetaNetX')

kegg_id = _column_as_list(df_ge, 'KEGG_id')

 ## ModelSEED query

In [9]:
SOLR_URL='https://modelseed.org'
ls_name = []
ls_kegg = []
ls_bigg = []

_modelseed_cache = {}    # query URL -> (name, bigg, kegg); successful lookups only
_modelseed_failed = []   # ids whose request or parsing raised; summarised at the end


def _first_alias(aliases, source):
    """Return the first alias entry mentioning ``source`` ('BiGG' or 'KEGG').

    The '<source>: ' prefix is removed from the entry. The string 'None' is
    returned when no entry matches, which is the sentinel the rest of the
    notebook uses for a missing id.
    """
    for alias in aliases:
        if f'{source}:' in alias:
            return alias.replace(f'{source}: ', '')
    return 'None'


def _query_modelseed(mseed_id):
    """Look up one ModelSEED reaction id and return ``(name, bigg, kegg)``.

    Exactly one tuple is returned per id so the result lists stay aligned with
    the table rows: an empty result set or a failed request yields
    ``('None', 'None', 'None')``, and only the first document is used when
    several are returned.
    """
    url = SOLR_URL+f'/solr/reactions/select?wt=json&q=id:{mseed_id}&fl=name,id,formula,charge,aliases'
    if url in _modelseed_cache:
        return _modelseed_cache[url]
    try:
        # The context manager closes the connection; the timeout keeps one stalled
        # request from hanging the whole loop.
        with urlopen(url, timeout=60) as connection:
            docs = json.load(connection)['response']['docs']
    except Exception:
        # Best effort: a failed lookup is recorded as missing, not fatal.
        _modelseed_failed.append(mseed_id)
        return ('None', 'None', 'None')
    if docs:
        aliases = docs[0].get('aliases') or []
        result = (docs[0].get('name'), _first_alias(aliases, 'BiGG'), _first_alias(aliases, 'KEGG'))
    else:
        result = ('None', 'None', 'None')
    _modelseed_cache[url] = result
    return result


for mseed_id in tqdm(seed_id):
    if mseed_id is None:
        ms_name, ms_bigg, ms_kegg = 'None', 'None', 'None'
    else:
        ms_name, ms_bigg, ms_kegg = _query_modelseed(mseed_id)
    ls_name.append(ms_name)
    ls_bigg.append(ms_bigg)
    ls_kegg.append(ms_kegg)

if _modelseed_failed:
    print(f'{len(_modelseed_failed)} ModelSEED lookups failed and were recorded as None')

  0%|          | 0/1722 [00:00<?, ?it/s]

In [10]:
# Prefer the KEGG id reported by ModelSEED and fall back to the model's own
# annotation. The ModelSEED list uses the *string* 'None' for "missing", which is
# truthy, so the sentinel has to be tested explicitly for the fallback to apply.
new_kegg = []
for ms_kegg, model_kegg in zip(ls_kegg, kegg_id):
    if ms_kegg is not None and ms_kegg != 'None':
        new_kegg.append(ms_kegg)
    elif model_kegg is not None and model_kegg != 'None':
        new_kegg.append(model_kegg)
    else:
        new_kegg.append('None')

In [11]:
# Add the BiGG aliases collected from ModelSEED as a new column and overwrite the
# KEGG column with the merged ids computed in the previous cell.
df_ge['BIGG_id'] = ls_bigg
df_ge['KEGG_id'] = new_kegg

## BiGG query

In [12]:
import requests

ls_bigg = df_ge['BIGG_id'].values.tolist()
bigg_ls = []
_bigg_cache = {}   # BiGG reaction id -> list of EC numbers, or None when BiGG lists none


def _bigg_ec_numbers(bigg_id):
    """Return the EC numbers BiGG lists for one universal reaction id, or None.

    None is returned when the request fails, the response is not a 2xx, the
    body is empty (204), or the entry has no 'EC Number' links. Definitive
    answers are cached so an id that appears on many rows is fetched once;
    failed requests are not cached and will be retried.
    """
    if bigg_id in _bigg_cache:
        return _bigg_cache[bigg_id]
    url =f'http://bigg.ucsd.edu/api/v2/universal/reactions/{bigg_id}'
    try:
        # The request itself is inside the try so a connection error or timeout is
        # treated like any other failed lookup instead of aborting the loop.
        with requests.request("GET", url, timeout=30) as resp:
            resp.raise_for_status()  # raises exception when not a 2xx response
            if resp.status_code == 204:
                ec = None
            else:
                links = dict(resp.json())['database_links']
                entries = None if links is None else links.get('EC Number')
                ec = None if entries is None else [entry['id'] for entry in entries]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        return None
    _bigg_cache[bigg_id] = ec
    return ec


for bigg in tqdm(ls_bigg):
    # Rows arrive as one-element lists because of the nested column labels;
    # a bare value is accepted as well.
    cells = bigg if isinstance(bigg, (list, tuple)) else [bigg]
    sub_bigg_ls = []
    for cell in cells:
        # One cell may hold several ';'-separated BiGG ids.
        for bi in (part.strip(' ') for part in str(cell).split(';')):
            if bi == 'None':
                continue
            ec = _bigg_ec_numbers(bi)
            # Copy so rows never share the cached list object.
            sub_bigg_ls.append(None if ec is None else list(ec))
    bigg_ls.append(sub_bigg_ls)

  0%|          | 0/1722 [00:00<?, ?it/s]

In [13]:
ec_l = df_ge['ecNumber'].values.tolist()
# Use the EC numbers found through BiGG when there are any, otherwise keep the
# model's own annotation. A BiGG result such as [None] (lookup attempted, nothing
# found) is a non-empty list and therefore truthy, so it must be checked by
# content or it would override a valid model annotation.
new_l = []
for bigg_ec, model_ec in zip(bigg_ls, ec_l):
    has_bigg_ec = any(entry is not None for entry in bigg_ec)
    new_l.append(bigg_ec if has_bigg_ec else model_ec)
df_ge['ecNumber'] = new_l

In [14]:
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id
0,411479.1.3062_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,[None],
1,411479.1.3062_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,[None],
2,411479.1.3062_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,[None],
3,411479.1.3062_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,[None],
4,411479.1.3062_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,[None],
...,...,...,...,...,...,...,...,...
1717,411479.1.peg.2102,rtranscription,RNA transcription c0,rxn13784,,,[None],
1718,411479.1.peg.3030,rtranscription,RNA transcription c0,rxn13784,,,[None],
1719,411479.1.peg.3645,rtranscription,RNA transcription c0,rxn13784,,,[None],
1720,411479.1.peg.3749,rtranscription,RNA transcription c0,rxn13784,,,[None],


## Substrates

In [15]:
# Substrate metabolite ids for the reaction on every table row. Each row of rx_l
# is iterated because the column values arrive wrapped in a list.
rx_l = df_ge['Reaction'].values.tolist()
ls_sub = [
    list(sim.get_substrates(reaction_id).keys())
    for row in rx_l
    for reaction_id in row
]

df_ge["Substrates"] = ls_sub

df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id,Substrates
0,411479.1.3062_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,[None],,"[1hibupglu_S[c], h2o[c]]"
1,411479.1.3062_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,[None],,"[1hmdgluc[c], h2o[c]]"
2,411479.1.3062_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,[None],,"[2hatvacidgluc[c], h2o[c]]"
3,411479.1.3062_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,[None],,"[2hatvlacgluc[c], h2o[c]]"
4,411479.1.3062_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,[None],,"[2hibupglu_S[c], h2o[c]]"
...,...,...,...,...,...,...,...,...,...
1717,411479.1.peg.2102,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
1718,411479.1.peg.3030,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
1719,411479.1.peg.3645,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]
1720,411479.1.peg.3749,rtranscription,RNA transcription c0,rxn13784,,,[None],,[]


In [16]:
sub_na = df_ge['Substrates'].values.tolist()

ls_sub = []
ls_smile =[]
_smiles_by_name = {}   # queried name -> SMILES; the same metabolite recurs on many rows


def _smiles_query_name(name):
    """Return the metabolite name adjusted for the SMILES lookup.

    Primes are inserted into the locants of names containing '2,3-C'/'2,3-c'
    or '3-triphosphate'; every other name is returned unchanged.
    """
    if "2,3-C" in name or "2,3-c" in name:
        return name.replace("2,3","2',3'")
    if "3-triphosphate" in name:
        # NOTE(review): this primes every '3' in the name, not only the one in
        # '3-triphosphate' - confirm that is intended.
        return name.replace("3","3'")
    return name


for sub_l in tqdm(sub_na):
    sub_ls_sub = []
    sub_ls_smile = []
    for sub_s in sub_l:
        for sub in sub_s:
            sub_name = _smiles_query_name(sim.get_metabolite(sub).get('name'))
            # Look each distinct name up once; later rows reuse the stored result.
            if sub_name not in _smiles_by_name:
                _smiles_by_name[sub_name] = get_smiles(sub_name)
            sub_ls_smile.append(_smiles_by_name[sub_name])
            sub_ls_sub.append(sub_name)
        ls_sub.append(sub_ls_sub)
        ls_smile.append(sub_ls_smile)

df_ge['Substrate Name'] = ls_sub
df_ge['Substrate SMILES'] = ls_smile
# Checkpoint the table; missing values are written as the text 'None'.
df_ge.to_csv(f'../data/ec_data/ge_data_{model.id}.csv',na_rep='None')

  0%|          | 0/1722 [00:00<?, ?it/s]

In [17]:
# Reload the checkpointed table and keep the working columns in a fixed order,
# with the EC numbers last. List-valued cells come back from the CSV as text.
ge_columns = [
    'Gene', 'Reaction', 'Name',
    'ModelSEED_id', 'MetaNetX', 'KEGG_id', 'BIGG_id',
    'Substrates', 'Substrate Name', 'Substrate SMILES',
    'ecNumber',
]
df_ge = pd.read_csv(f'../data/ec_data/ge_data_{model.id}.csv')[ge_columns]
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,BIGG_id,Substrates,Substrate Name,Substrate SMILES,ecNumber
0,411479.1.3062_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,,"['1hibupglu_S[c]', 'h2o[c]']","['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None]
1,411479.1.3062_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,,"['1hmdgluc[c]', 'h2o[c]']","['1-OH-midazolam-glucuronide', 'Water']","[None, 'O']",[None]
2,411479.1.3062_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,,"['2hatvacidgluc[c]', 'h2o[c]']","['2-hydroxy-atorvastatin-acyl-glucuronide', 'W...","[None, 'O']",[None]
3,411479.1.3062_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,,"['2hatvlacgluc[c]', 'h2o[c]']","['2-hydroxy-atorvastatin-lactone-glucuronide',...","[None, 'O']",[None]
4,411479.1.3062_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,,"['2hibupglu_S[c]', 'h2o[c]']","['2-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None]
...,...,...,...,...,...,...,...,...,...,...,...
1717,411479.1.peg.2102,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
1718,411479.1.peg.3030,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
1719,411479.1.peg.3645,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]
1720,411479.1.peg.3749,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None]


In [18]:
ec_l = df_ge['ecNumber'].values.tolist()
ec_nl = []
ei = []

# Turn each stringified EC entry into a clean list of EC numbers: incomplete
# classes (containing '-') and duplicates are dropped, and 'None' is kept only
# when it is the first token seen for the row.
for raw_entry in ec_l:
    ei = []
    for token in str(raw_entry).split(','):
        # Peel off the bracket/quote/space residue of the list repr, in this order.
        for residue in ("[", " ", "'", "[", "'", "]", "'"):
            token = token.strip(residue)
        if token in ei or '-' in token:
            continue
        if token == 'None' and len(ei) > 0:
            continue
        ei.append(token)
    ec_nl.append(ei)

df_ge['ecNumber'] = ec_nl

## BRENDA query

### Kcat extraction

In [19]:
# brendapyrser provides the BRENDA class used in the next cell to parse the dump.
from brendapyrser import BRENDA
from brendapyrser import EnzymePropertyDict


# Path of the BRENDA text download handed to the parser in the next cell.
dataFile = '../../brenda_2023_1.txt'

In [20]:
# Parse the BRENDA file; the bare name on the last line displays the summary table.
brenda = BRENDA(dataFile)
brenda

0,1
Number of Enzymes,7832
BRENDA copyright,"Copyrighted by Dietmar Schomburg, Techn. University  Braunschweig, GERMANY. Distributed under the License as stated  at http:/www.brenda-enzymes.org"
Parser version,0.0.1
Author,"Semidán Robaina Estévez, 2020"


In [21]:
kcat_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()

# For every row, average all BRENDA kcat values of each EC number. Exactly one
# entry (a float or None) is produced per EC token that has no '-', so this
# column can later be exploded together with the other per-EC columns.
for ec in tqdm(ec_ls):
    sub_kcat_ls = []
    # The cell may hold a list or its text form, so tokens are taken from the
    # string and stripped of list/quote punctuation.
    for ec_n in str(ec).split(','):
        ec_n = ec_n.strip(" []',")
        if "-" in ec_n:
            continue
        avg_kcat = None
        # After stripping, a missing EC is the *string* 'None' (or empty), never the
        # None object; skip the lookup for those.
        if ec_n not in ('', 'None'):
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.get_values()
                if len(kcat_va) > 0:
                    avg_kcat = sum(kcat_va)/len(kcat_va)
            except Exception:
                # Best effort: an EC number the parser cannot resolve yields None.
                avg_kcat = None
        sub_kcat_ls.append(avg_kcat)
    kcat_ls.append(sub_kcat_ls)

df_ge['Avg Kcat (by ec)'] = kcat_ls

  0%|          | 0/1722 [00:00<?, ?it/s]

In [22]:
kcat_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()


# Same as the per-EC average, restricted to kcat values BRENDA reports for
# `organism`. One entry (a float or None) is produced per EC token without '-'.
for ec in tqdm(ec_ls):
    sub_kcat_ls = []
    # The cell may hold a list or its text form, so tokens are taken from the
    # string and stripped of list/quote punctuation.
    for ec_n in str(ec).split(','):
        ec_n = ec_n.strip(" []',")
        if "-" in ec_n:
            continue
        avg_kcat = None
        # A missing EC is the string 'None' (or empty) at this point; skip the lookup.
        if ec_n not in ('', 'None'):
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.filter_by_organism(organism).get_values()
                if len(kcat_va) > 0:
                    avg_kcat = sum(kcat_va)/len(kcat_va)
            except Exception:
                # Best effort: an EC number the parser cannot resolve yields None.
                avg_kcat = None
        sub_kcat_ls.append(avg_kcat)
    kcat_ls.append(sub_kcat_ls)

df_ge['Avg Kcat (by ec and species)'] = kcat_ls

  0%|          | 0/1722 [00:00<?, ?it/s]

### Sequence extraction

In [23]:
import hashlib
import os

from zeep import Client

seq_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()

# BRENDA SOAP credentials are read from the environment (BRENDA_EMAIL and
# BRENDA_PASSWORD) so they are never stored in the notebook. The service
# expects the SHA-256 hex digest of the password.
email = os.environ["BRENDA_EMAIL"]
password = hashlib.sha256(os.environ["BRENDA_PASSWORD"].encode("utf-8")).hexdigest()
# The client is created once and reused for every EC number.
wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
client = Client(wsdl)

# One entry (a sequence string or None) per EC token without '-', so this column
# can later be exploded together with the other per-EC columns.
for ec in tqdm(ec_ls):
    sub_seq_ls = []
    # The cell may hold a list or its text form, so tokens are taken from the
    # string and stripped of list/quote punctuation.
    for ec_n in str(ec).split(','):
        ec_n = ec_n.strip(" []',")
        if "-" in ec_n:
            continue
        sequence = None
        # A missing EC is the string 'None' (or empty) here; no request is sent for it.
        if ec_n not in ('', 'None'):
            parameters = ( email,password,f"ecNumber*{ec_n}","sequence*", "noOfAminoAcids*", "firstAccessionCode*", "source*", "id*", "organism*Bacteroides sp")
            try:
                resultString = client.service.getSequence(*parameters)
                if resultString:
                    # Only the first returned sequence is kept.
                    sequence = resultString[0]['sequence']
            except Exception:
                # Best effort: a failed request leaves the sequence missing.
                sequence = None
        sub_seq_ls.append(sequence)
    seq_ls.append(sub_seq_ls)

df_ge['Protein Sequence'] = seq_ls

  0%|          | 0/1722 [00:00<?, ?it/s]

### Molecular Weight extraction

In [24]:
import hashlib
import os

from zeep import Client

mw_ls = []
ec_ls = df_ge['ecNumber'].values.tolist()

# BRENDA SOAP credentials are read from the environment (BRENDA_EMAIL and
# BRENDA_PASSWORD) so they are never stored in the notebook. The service
# expects the SHA-256 hex digest of the password.
email = os.environ["BRENDA_EMAIL"]
password = hashlib.sha256(os.environ["BRENDA_PASSWORD"].encode("utf-8")).hexdigest()
# The client is created once and reused for every EC number.
wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
client = Client(wsdl)

# One entry (mean molecular weight or None) per EC token without '-', so this
# column can later be exploded together with the other per-EC columns.
for ec in tqdm(ec_ls):
    sub_mw_ls = []
    # The cell may hold a list or its text form, so tokens are taken from the
    # string and stripped of list/quote punctuation.
    for ec_n in str(ec).split(','):
        ec_n = ec_n.strip(" []',")
        if "-" in ec_n:
            continue
        res_mw = None
        # A missing EC is the string 'None' (or empty) here; no request is sent for it.
        if ec_n not in ('', 'None'):
            parameters = (email,password,f"ecNumber*{ec_n}","molecularWeight*","molecularWeightMaximum*","commentary*","organism*","literature*" )
            try:
                resultString = client.service.getMolecularWeight(*parameters)
                if len(resultString) > 0:
                    # Mean over every molecular-weight record returned for this EC.
                    total_mw = sum(int(entry['molecularWeight']) for entry in resultString)
                    res_mw = total_mw/len(resultString)
            except Exception:
                # Best effort: a failed request or unparsable weight leaves it missing.
                res_mw = None
        sub_mw_ls.append(res_mw)
    mw_ls.append(sub_mw_ls)

df_ge['Molecular Weight'] = mw_ls

  0%|          | 0/1722 [00:00<?, ?it/s]

In [25]:
from zeep import Client
import hashlib
import os

# Spot check: mean BRENDA molecular weight for a single EC number (1.1.1.86).
# Credentials come from the environment (BRENDA_EMAIL / BRENDA_PASSWORD) so they
# are never stored in the notebook; the service expects the SHA-256 hex digest.
wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
password = hashlib.sha256(os.environ["BRENDA_PASSWORD"].encode("utf-8")).hexdigest()
client = Client(wsdl)
parameters = (os.environ["BRENDA_EMAIL"],password,"ecNumber*1.1.1.86", "molecularWeight*", "molecularWeightMaximum*", "commentary*", "organism*", "literature*")
resultString = client.service.getMolecularWeight(*parameters)
if len(resultString) > 0:
    res_mw = sum(int(entry['molecularWeight']) for entry in resultString)/len(resultString)
else:
    # No records returned: report a missing value instead of dividing by zero.
    res_mw = None
print(res_mw)

138656.35714285713


In [26]:
df_ge

Unnamed: 0,Gene,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,BIGG_id,Substrates,Substrate Name,Substrate SMILES,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),Protein Sequence,Molecular Weight
0,411479.1.3062_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,,,,,"['1hibupglu_S[c]', 'h2o[c]']","['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None],[None]
1,411479.1.3062_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,,,,,"['1hmdgluc[c]', 'h2o[c]']","['1-OH-midazolam-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None],[None]
2,411479.1.3062_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,,,,,"['2hatvacidgluc[c]', 'h2o[c]']","['2-hydroxy-atorvastatin-acyl-glucuronide', 'W...","[None, 'O']",[None],[None],[None],[None],[None]
3,411479.1.3062_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,,,,,"['2hatvlacgluc[c]', 'h2o[c]']","['2-hydroxy-atorvastatin-lactone-glucuronide',...","[None, 'O']",[None],[None],[None],[None],[None]
4,411479.1.3062_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,,,,,"['2hibupglu_S[c]', 'h2o[c]']","['2-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None],[None]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1717,411479.1.peg.2102,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]
1718,411479.1.peg.3030,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]
1719,411479.1.peg.3645,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]
1720,411479.1.peg.3749,rtranscription,RNA transcription c0,rxn13784,,,,[],[],[],[None],[None],[None],[None],[None]


## DLKcat - Kcat prediction

In [27]:
# Working copy for the DLKcat step: the substrate-id, database-id and EC-number
# columns are dropped; df_ge itself is left untouched.
dk_prep = df_ge.drop(columns=['Substrates','ModelSEED_id','MetaNetX','KEGG_id','BIGG_id','ecNumber'])
dk_prep

Unnamed: 0,Gene,Reaction,Name,Substrate Name,Substrate SMILES,Avg Kcat (by ec),Avg Kcat (by ec and species),Protein Sequence,Molecular Weight
0,411479.1.3062_56.peg,1HIBUP_S_GLCAASE,1-hydroxy S-ibuprofen beta-glucuronidase,"['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None]
1,411479.1.3062_56.peg,1HMDGLUC_GLCAASE,1-OH-midazolam-glucuronide beta-glucuronidase,"['1-OH-midazolam-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None]
2,411479.1.3062_56.peg,2HATVACIDGLUC_GLCAASE,2-hydroxy-atorvastatin-acyl-glucuronide beta-g...,"['2-hydroxy-atorvastatin-acyl-glucuronide', 'W...","[None, 'O']",[None],[None],[None],[None]
3,411479.1.3062_56.peg,2HATVLACGLUC_GLCAASE,2-hydroxy-atorvastatin-lactone-glucuronide bet...,"['2-hydroxy-atorvastatin-lactone-glucuronide',...","[None, 'O']",[None],[None],[None],[None]
4,411479.1.3062_56.peg,2HIBUP_S_GLCAASE,2-hydroxy-S-ibuprofen beta-glucuronidase,"['2-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None],[None],[None],[None]
...,...,...,...,...,...,...,...,...,...
1717,411479.1.peg.2102,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]
1718,411479.1.peg.3030,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]
1719,411479.1.peg.3645,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]
1720,411479.1.peg.3749,rtranscription,RNA transcription c0,[],[],[None],[None],[None],[None]


In [28]:
# The two substrate columns hold list literals as text, so they are parsed into
# real lists first; exploding both together then gives each
# (substrate name, SMILES) pair its own row.
substrate_cols = ['Substrate Name','Substrate SMILES']
dk_prep = (
    dk_prep
    .assign(**{col: dk_prep[col].apply(literal_eval) for col in substrate_cols})
    .explode(substrate_cols)
    .reset_index(drop=True)
)

In [29]:
# The per-EC columns were assigned as Python lists in the BRENDA cells, so the
# literal_eval conversion below stays disabled; it would only apply if these
# columns held list literals as text.
#dk_prep['Protein Sequence'] = dk_prep['Protein Sequence'].apply(literal_eval) #convert to list type
#dk_prep['Molecular Weight'] = dk_prep['Molecular Weight'].apply(literal_eval) #convert to list type
# Explode the four per-EC columns together so every row carries one EC's values.
dk_prep = dk_prep.explode(['Protein Sequence','Molecular Weight','Avg Kcat (by ec)','Avg Kcat (by ec and species)']).reset_index(drop=True)

In [30]:
# DLKcat input table: only substrate name, SMILES and protein sequence remain.
dk_inp = dk_prep.drop(columns=['Gene','Reaction', 'Name', 'Avg Kcat (by ec)', 'Avg Kcat (by ec and species)','Molecular Weight'])
# Written tab-separated without the index; missing values become the text 'None'.
dk_inp.to_csv(f'../../DLKcat/DeeplearningApproach/Code/example/dk_input_{model.id}.tsv',sep="\t",na_rep='None',index= False)

In [31]:
dk_inp

Unnamed: 0,Substrate Name,Substrate SMILES,Protein Sequence
0,1-hydroxy S-ibuprofen-glucuronide,,
1,Water,O,
2,1-OH-midazolam-glucuronide,,
3,Water,O,
4,2-hydroxy-atorvastatin-acyl-glucuronide,,
...,...,...,...
4727,,,
4728,,,
4729,,,
4730,,,


Run DLKcat

In [10]:
dk_out = pd.read_csv('../../DLKcat/DeeplearningApproach/Code/example/output.tsv', sep="\t")
# The columns below are copied from dk_prep by row position, which is only valid
# when the DLKcat output has exactly one row per submitted input row. Fail
# loudly instead of silently filling misaligned or missing rows.
assert len(dk_out) == len(dk_prep), (
    f"DLKcat output has {len(dk_out)} rows but {len(dk_prep)} rows were submitted"
)
for column in ['Gene','Reaction','Molecular Weight','Avg Kcat (by ec)']:
    dk_out[column] = dk_prep[column]
dk_out = dk_out.loc[:,['Gene','Reaction','Substrate Name','Substrate SMILES','Protein Sequence','Molecular Weight','Kcat value (1/s)','Avg Kcat (by ec)']]
dk_out

NameError: name 'dk_prep' is not defined

### Joining Kcat values

In [8]:
def _to_float_or_none(value):
    """Return ``value`` as a float, or None when it stands for "missing".

    Missing covers the None object, the text 'None', NaN (which is how a
    missing cell can arrive after a CSV round trip) and anything that cannot
    be parsed as a number.
    """
    if value is None or value == 'None':
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return None if pd.isna(number) else number


kcat_brenda = dk_out['Avg Kcat (by ec)'].values.tolist()
kcat_dl = dk_out['Kcat value (1/s)'].values.tolist()
new_kcat = []

# Combine the two kcat sources row by row: the mean when both exist, the one
# available value otherwise, and 0 when neither source has a value.
for brenda_raw, dl_raw in zip(kcat_brenda, kcat_dl):
    brenda_kcat = _to_float_or_none(brenda_raw)
    dl_kcat = _to_float_or_none(dl_raw)
    if brenda_kcat is not None and dl_kcat is not None:
        kcat = (brenda_kcat + dl_kcat)/2
    elif dl_kcat is not None:
        kcat = dl_kcat
    elif brenda_kcat is not None:
        kcat = brenda_kcat
    else:
        kcat = 0
    new_kcat.append(kcat)

dk_out['New Kcat'] = new_kcat
dk_out

KeyError: 'Avg Kcat (by ec)'

## Adding enzymatic constraints

In [9]:
genes = dk_out['Gene'].values.tolist()
mweights = dk_out['Molecular Weight'].values.tolist()
kcats = dk_out['New Kcat'].values.tolist()

# Per-gene enzyme data: {gene id: {'protein', 'mw', 'kcat'}}. A gene appears on
# many rows; the values of its FIRST row are used.
ec_data = dict()
seen_genes = set()
for gene, mw, kcat in zip(genes, mweights, kcats):
    if gene in seen_genes:
        continue
    seen_genes.add(gene)
    ge = sim.get_gene(gene).id
    # Missing values may arrive as None or NaN; both fall back to the defaults.
    if mw is None or pd.isna(mw):
        mw = 0
    if kcat is None or pd.isna(kcat):
        kcat = 1
    # The protein name is the gene id without the simulator's gene prefix.
    ec_data[ge]={'protein':ge[len(sim._g_prefix):],'mw':mw,'kcat':kcat}

print(ec_data)

KeyError: 'Gene'

In [39]:
# List every gene whose molecular weight is the 0 placeholder.
print("Genes with missing data:")
for gene_id in (key for key, entry in ec_data.items() if entry['mw']==0):
    print(gene_id)

Genes with missing data:
411479.1.3062_56.peg
411479.1.821.peg
411479.1.958.peg
411479.1.1231.peg
411479.1.2100.peg
411479.1.peg.2438
411479.1.peg.3415
411479.1.peg.1956
411479.1.peg.2370
411479.1.peg.2854
411479.1.peg.2347
411479.1.peg.2583
411479.1.peg.2744
411479.1.peg.2891
411479.10.peg.739
411479.1.2801_33.peg
411479.1.307_22.peg
411479.1.peg.2795
411479.10.peg.2963
411479.1.1321.peg
411479.1.peg.4050
411479.1.peg.2123
411479.1.peg.957
411479.1.2466_12.peg
411479.1.248_12.peg
411479.1.peg.1029
411479.1.peg.1435
411479.1.peg.2385
411479.1.peg.242
411479.1.1814.peg
411479.1.peg.3485
411479.1.peg.1610
411479.1.74.peg
411479.10.peg.957
411479.1.957_16.peg
411479.1.1938_11.peg
411479.1.2016_11.peg
411479.1.3734_11.peg
411479.1.3735_11.peg
411479.1.1617.peg
411479.1.2440.peg
411479.1.3812.peg
411479.1.526.peg
411479.1.3155.peg
411479.1.505.peg
411479.1.506.peg
411479.1.541.peg
411479.1.peg.1126
411479.1.peg.1131
411479.1.peg.1240
411479.1.peg.135
411479.1.peg.1645
411479.1.peg.2997
4114

In [None]:
# Apply mewpy's add_enzyme_constraints using the per-gene mw/kcat table.
# NOTE(review): the ec_data keys were built from `sim` (cobra-loaded model) while
# this call passes `sim2` (reframed-loaded model, whose recorded FBA status is
# INF_OR_UNB) - confirm which simulator is intended and that the gene ids match.
ec_sim = add_enzyme_constraints(sim2, ec_data)

### Protein pool usage

### Save the model