# Enzymatic Constraints Enhancement of AGORA models

### Context



### Goals:

- 

In [1]:
from tqdm.auto import tqdm
from reframed.io.sbml import load_cbmodel
from cobra.io import read_sbml_model, write_sbml_model
from mewpy.cobra.util import add_enzyme_constraints,convert_gpr_to_dnf,split_isozymes
from mewpy.simulation import get_simulator
from mewpy.simulation.environment import Environment as Environment
from mewpy.util.request import retreive_gene,retreive_protein,get_smiles,brenda_query
import pandas as pd
import numpy as np
from urllib.request import urlopen
from functools import reduce
import json
from ast import literal_eval

In [28]:
model = read_sbml_model("../models/non-ec/Bacteroides_thetaiotaomicron_VPI_5482.xml")

In [29]:
model.id

'M_Bacteroides_thetaiotaomicron_VPI_5482'

In [31]:
sim = get_simulator(model)
sim.set_objective("biomass")

In [32]:
sim.simulate()

objective: 73.25259606170327
Status: OPTIMAL
Method:SimulationMethod.FBA

In [33]:
print(len(sim.reactions))
print(len(sim.genes))

2438
823


## Annotation scraping

In [36]:
ls_rxn = []

for rxn in sim.reactions:
    anno = sim.get_reaction(rxn).annotations
    seed_id = anno.get('seed.reactions')
    rxn_name = sim.get_reaction(rxn).name
    ecnumber = anno.get('ec-code')
    metanetx = anno.get('metanetx.reaction')
    kegg = anno.get('kegg.reaction')
    res = [rxn,rxn_name,seed_id,metanetx,kegg,ecnumber]
    ls_rxn.append(res)
        
df_rxn = pd.DataFrame(ls_rxn,columns=[['Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','ecNumber']])

df_rxn

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber
0,12PPD_Stex,"(S)-propane-1,2-diol diffusion extracellular t...",,,,
1,12PPDtpp,"S-Propane-1,2-diol facilitated transport, peri...",,,,
2,1HIBUPGLU_Stex,1-hydroxy S-ibuprofen-glucuronide diffusion ex...,,,,
3,1HIBUP_S_GLCAASEepp,1-hydroxy S-ibuprofen beta-glucuronidase extr...,,,,
4,1HIBUP_Stex,1-hydroxy S-ibuprofen diffusion extracellular ...,,,,
...,...,...,...,...,...,...
2433,tCAPpp,"Chloramphenicol transport, reversible, hypothe...",,,,
2434,tCZPpp,"Clonazepam transport, reversible, hypothetical...",,,,
2435,tNCAPpp,"Nitrosochloramphenicol transport, reversible, ...",,,,
2436,tNZPpp,"Nitrazepam transport, reversible, hypothetical...",,,,


In [37]:
seed_id = df_rxn['ModelSEED_id'].values.tolist()

seed_id = [reduce(lambda x: x, inner_list) for inner_list in seed_id]

metanetx_id = df_rxn['MetaNetX'].values.tolist()

metanetx_id = [reduce(lambda x: x, inner_list) for inner_list in metanetx_id]

kegg_id = df_rxn['KEGG_id'].values.tolist()

kegg_id = [reduce(lambda x: x, inner_list) for inner_list in kegg_id]

In [39]:
print(seed_id)

[None, None, None, None, None, None, None, None, 'rxn03435', 'rxn02522', 'rxn02762', 'rxn02521', 'rxn03483', 'rxn07441', 'rxn03436', None, None, None, None, None, None, None, None, None, 'rxn42709', 'rxn05358', None, None, None, None, None, None, None, None, None, None, None, 'rxn05333', 'rxn05423', 'rxn05373', 'rxn05398', 'rxn05331', 'rxn05427', 'rxn05377', 'rxn05402', 'rxn05335', 'rxn05431', 'rxn05381', 'rxn05406', 'rxn05332', 'rxn05462', 'rxn05329', 'rxn05411', 'rxn05361', 'rxn05386', 'rxn05330', 'rxn05415', 'rxn05365', 'rxn05390', 'rxn05334', 'rxn05369', 'rxn05419', 'rxn05394', None, None, None, None, 'rxn00902', None, None, 'rxn05338', 'rxn05422', 'rxn05372', 'rxn05397', 'rxn05340', 'rxn05426', 'rxn05376', 'rxn05401', 'rxn05342', 'rxn05430', 'rxn05380', 'rxn05405', 'rxn05336', 'rxn05461', 'rxn05339', 'rxn05410', 'rxn05360', 'rxn05385', 'rxn05337', 'rxn05414', 'rxn05364', 'rxn05389', 'rxn05341', 'rxn05368', 'rxn05418', 'rxn05393', 'rxn05343', 'rxn05425', 'rxn05375', 'rxn05400', 'rx

 ## ModelSEED query

In [17]:
SOLR_URL='https://modelseed.org'
ls_name = []
ls_kegg = []
ls_bigg = []

for mseed_id in tqdm(seed_id):
    try:
        connection = urlopen(SOLR_URL+f'/solr/reactions/select?wt=json&q=id:{mseed_id}&fl=name,id,formula,charge,aliases')
        response = json.load(connection)
        for document in response['response']['docs']:
            #print(document.get('name'),document.get('id'),document.get('formula'),document.get('charge'),document.get('aliases'))  
            ms_name = document.get('name')
            ls_alias = document.get('aliases')
            ms_bigg = list(filter(lambda a: 'BiGG:' in a, document.get('aliases')))
            ms_kegg = list(filter(lambda a: 'KEGG:' in a, document.get('aliases')))
            if len(ms_bigg)== 0 and len(ms_kegg)== 0:
                ms_bigg = None
                ms_kegg = None
            elif len(ms_bigg)== 0 and len(ms_kegg)!= 0:
                ms_bigg = None
                ms_kegg = list(ms_kegg)[0]
                ms_kegg = ms_kegg.replace('KEGG: ','')
            elif len(ms_bigg)!= 0 and len(ms_kegg)== 0:
                ms_kegg = None
                ms_bigg = list(ms_bigg)[0]
                ms_bigg = ms_bigg.replace('BiGG: ','')
            else:
                ms_kegg = list(ms_kegg)[0]
                ms_kegg = ms_kegg.replace('KEGG: ','')
                ms_bigg = list(ms_bigg)[0]
                ms_bigg = ms_bigg.replace('BiGG: ','')    
            ls_name.append(ms_name)
            ls_bigg.append(ms_bigg)
            ls_kegg.append(ms_kegg)           
    except:
        ls_name.append(None)
        ls_kegg.append(None)
        ls_bigg.append(None)

  0%|          | 0/2438 [00:00<?, ?it/s]

In [18]:
['None' if v is None else v for v in ls_bigg]
len(ls_bigg)

0

In [19]:
new_kegg = [next(filter(None, i)) for i in zip(ls_kegg, kegg_id)]


#new_kegg = [next(filter(lambda x: x is not None, i)) for i in zip(ls_kegg, kegg_id)]

In [20]:
df_rxn['BIGG_id'] = ls_bigg
df_rxn['KEGG_id2'] = ls_kegg

ValueError: Length of values (0) does not match length of index (2438)

## MetaNetX query

## BiGG query

In [11]:
#ls_bigg = df_rxn['BIGG_id'].values.tolist()
print(ls_bigg)

NameError: name 'ls_bigg' is not defined

In [13]:
import requests

ls_bigg = df_rxn['BIGG_id'].values.tolist()
bigg_ls = []

for bigg in tqdm(ls_bigg):
    for bigg_n in bigg:
        bigg_n = str(bigg_n)
        bigg_n = bigg_n.split(';')
        bigg_n = [x.strip(' ') for x in bigg_n]
        sub_bigg_ls = []
        for bi in bigg_n:
            if bi == 'None':
                pass
            else:
                url =f'http://bigg.ucsd.edu/api/v2/universal/reactions/{bi}'
                #print(url)
                with requests.request("GET", url) as resp:
                    try:
                        resp.raise_for_status()  # raises exception when not a 2xx response
                        if resp.status_code != 204:
                            data = dict(resp.json())
                            ec_l = data['database_links']
                            if ec_l == None:
                                sub_bigg_ls.append(None)
                            else:
                                ec = [i['id'] for i in ec_l['EC Number']]
                                if ec == None:
                                    sub_bigg_ls.append(None)
                                print(ec)
                                sub_bigg_ls.append(ec)
                        else: 
                            sub_bigg_ls.append(None)
                            #print(sub_bigg_ls)
                    except:
                        sub_bigg_ls.append(None)
                        #print(sub_bigg_ls)
    bigg_ls.append(sub_bigg_ls)

  0%|          | 0/2438 [00:00<?, ?it/s]

['3.1.4.16']
['3.1.4.16']
['3.1.4.16']
['3.1.4.16']
['3.1.4.16']
['3.1.4.16']
['3.1.4.16']
['3.1.4.16']
['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61']
['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61']
['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61']
['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61']
['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59']
['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.61']
['2.3.3.13']
['2.3.3.13']
['1.1.1.100', '2.3.1.-', '2.3.1.85', '2.3.1.86']
['1.1.1.100', '2.3.1.-', '2.3.1.85', '2.3.1.86']
['1.1.1.100', '2.3.1.-', '2.3.1.85', '2.3.1.86']
['1.1.1.100', '2.3.1.-', '2.3.1.85', '2.3.1.86']
['1.1.1.100', '2.3.1.-', '2.3.1.85', '2.3.1.86']
['1.1.1.100', '2.3.1.-', '2.3.1.85', '2.3.1.86']
['2.3.1.86']
['2.3.1.86']
['2.3.1.86']
['2.3.1.86']
['2.3.1.86']
['4.2.3', '4.2.3.1']
['4.1.1.21']
['5.3.1.13']
['2.7.7.62']
['6.4.1.2']
['6.4.1.2']
['6.4.1

['4.2.1.1']
['4.2.1.1']
['2.1.1', '2.1.1.10']
['2.4']
['2']
['2.4.1.56']
['2.7.1.1', '2.7.1.2']
['2.7.1.1', '2.7.1.7']
['2.7.1.1', '2.7.1.4']
['2.5.1.-']
['4.3.1.3']
['4.3.1.3']
['4.3.1.3']
['1.1.1.23']
['3.1.3.15']
['2.5.1.61', '4.3.1.8']
['2.7.1.49']
['1.1.1.3']
['1.1.1.3']
['1.1.1.3']
['1.1.1.3']
['2.3.1.31']
['2.3.1.31']
['2.7.1.39']
['2.6.1.9']
['2.6.1.9']
['2.4.2.8']
['2.4.2.8']
['5.4.4.2']
['5.4.4.2']
['2.4.2.-', '4.1.3.-', '4.3.2.M2']
['4.2.1.19']
['2.6.1.42']
['2.6.1.42']
['2.1.2.3', '3.5.4.10']
['2.1.2.3', '3.5.4.10']
['1.1.1.205']
['3.5.1.4']
['1.1.1.18']
['3.2.2.1', '3.2.2.2', '3.2.2.8']
['3.2.2.1', '3.2.2.2', '3.2.2.8']
['5.3.3.2']
['5.3.3.2']
['1.17.1.2']
['2.5.1', '2.5.1.31', '2.5.1.M1']
['1.1.1.85']
['1.1.1.85']
['4.2.1.33']
['4.2.1.33']
['3.5.2', '3.5.2.7']
['1.1.1.86']
['1.1.1.86']
['1.1.1.86']
['2.3.1.-', '2.3.1.179', '2.3.1.180', '2.3.1.41', '2.3.1.85', '2.3.1.86']
['2.7.7.38']
['2.7.7.38']
['3.1.3.45']
['2.5.1.55']
['3.2.1.108', '3.2.1.23']
['3.2.1.108', '3.2.1.23'

In [15]:
ec_l = df_rxn['ecNumber'].values.tolist()
new_l = [next(filter(None, i)) for i in zip(bigg_ls, ec_l)]
df_rxn['ecNumber'] = new_l

## KEGG query

## Substrates

In [16]:
ls_sub = []

for rxn in sim.reactions:
    sub = list(sim.get_substrates(rxn).keys())
    ls_sub.append(sub)
    
df_rxn["Substrates"] = ls_sub   

df_rxn

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id,KEGG_id2,Substrates
0,12PPD_Stex,"(S)-propane-1,2-diol diffusion extracellular t...",,,,[None],,,[12ppd_S[e]]
1,12PPDtpp,"S-Propane-1,2-diol facilitated transport, peri...",,,,[None],,,[12ppd_S[p]]
2,1HIBUPGLU_Stex,1-hydroxy S-ibuprofen-glucuronide diffusion ex...,,,,[None],,,[1hibupglu_S[e]]
3,1HIBUP_S_GLCAASEepp,1-hydroxy S-ibuprofen beta-glucuronidase extr...,,,,[None],,,"[1hibupglu_S[p], h2o[p]]"
4,1HIBUP_Stex,1-hydroxy S-ibuprofen diffusion extracellular ...,,,,[None],,,[1hibup_S[e]]
...,...,...,...,...,...,...,...,...,...
2433,tCAPpp,"Chloramphenicol transport, reversible, hypothe...",,,,[None],,,[chlphncl[p]]
2434,tCZPpp,"Clonazepam transport, reversible, hypothetical...",,,,[None],,,[czp[p]]
2435,tNCAPpp,"Nitrosochloramphenicol transport, reversible, ...",,,,[None],,,[nchlphncl[p]]
2436,tNZPpp,"Nitrazepam transport, reversible, hypothetical...",,,,[None],,,[nzp[p]]


In [17]:
sub_na = df_rxn['Substrates'].values.tolist()

ls_sub = []
ls_smile =[]



for sub_l in tqdm(sub_na):
    sub_ls_sub = []
    sub_ls_smile = []
    for sub_s in sub_l:
        for sub in sub_s:
            sub_name = sim.get_metabolite(sub).get('name')
            smile = get_smiles(sub_name)
            sub_ls_smile.append(smile)
            sub_ls_sub.append(sub_name)
        ls_sub.append(sub_ls_sub)
        ls_smile.append(sub_ls_smile)

df_rxn['Substrate Name'] = ls_sub
df_rxn['Substrate SMILE'] = ls_smile
df_rxn.to_csv(f'../data/rxn_data_{model.id}.csv',na_rep='None')

  0%|          | 0/2438 [00:00<?, ?it/s]

In [11]:
df_rxn = pd.read_csv(f'../data/rxn_data_{model.id}.csv')
df_rxn = df_rxn.iloc[:,1:]
df_rxn

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber
0,12PPD_Stex,"(S)-propane-1,2-diol diffusion extracellular t...",,,,,,['12ppd_S[e]'],"['(S)-propane-1,2-diol']",['CC(CO)O'],[None]
1,12PPDtpp,"S-Propane-1,2-diol facilitated transport, peri...",,,,,,['12ppd_S[p]'],"['(S)-propane-1,2-diol']",['CC(CO)O'],[None]
2,1HIBUPGLU_Stex,1-hydroxy S-ibuprofen-glucuronide diffusion ex...,,,,,,['1hibupglu_S[e]'],['1-hydroxy S-ibuprofen-glucuronide'],[None],[None]
3,1HIBUP_S_GLCAASEepp,1-hydroxy S-ibuprofen beta-glucuronidase extr...,,,,,,"['1hibupglu_S[p]', 'h2o[p]']","['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None]
4,1HIBUP_Stex,1-hydroxy S-ibuprofen diffusion extracellular ...,,,,,,['1hibup_S[e]'],['1-hydroxy S-ibuprofen'],[None],[None]
...,...,...,...,...,...,...,...,...,...,...,...
2433,tCAPpp,"Chloramphenicol transport, reversible, hypothe...",,,,,,['chlphncl[p]'],['Chloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[...,[None]
2434,tCZPpp,"Clonazepam transport, reversible, hypothetical...",,,,,,['czp[p]'],['Clonazepam'],['C1C(=O)NC2=C(C=C(C=C2)[N+](=O)[O-])C(=N1)C3=...,[None]
2435,tNCAPpp,"Nitrosochloramphenicol transport, reversible, ...",,,,,,['nchlphncl[p]'],['Nitrosochloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)N=O'],[None]
2436,tNZPpp,"Nitrazepam transport, reversible, hypothetical...",,,,,,['nzp[p]'],"['Nitrazepam, Benzalin, Neozepam']",[None],[None]


In [12]:
df_rxn = df_rxn.loc[:,['Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','KEGG_id2','BIGG_id','Substrates','Substrate Name','Substrate SMILE','ecNumber']]

### Reactions with ecNumber

In [13]:
mask = df_rxn['ecNumber'].values!='[None]'
ec_nona = df_rxn[mask]
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber
8,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",['1.1.1.86']
9,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]"
10,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]"
11,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]"
12,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"['23cgmp[c]', 'h2o[c]']","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]"
...,...,...,...,...,...,...,...,...,...,...,...
2421,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"['atp[c]', 'ptth[c]']","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]"
2422,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"['4ppan[c]', 'ctp[c]', 'cys_L[c]']","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]"
2423,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"['ahdt[c]', 'h2o[c]']","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",['3.5.4.16']
2424,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"['HC01651[c]', 'h2o[c]']",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,['3.5.4.16']


In [14]:
ec_nona = ec_nona.copy()

### Reactions without ecNumber

In [15]:
mask2 = df_rxn['ecNumber'].values=='[None]'
ec_na = df_rxn[mask2]
ec_na

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber
0,12PPD_Stex,"(S)-propane-1,2-diol diffusion extracellular t...",,,,,,['12ppd_S[e]'],"['(S)-propane-1,2-diol']",['CC(CO)O'],[None]
1,12PPDtpp,"S-Propane-1,2-diol facilitated transport, peri...",,,,,,['12ppd_S[p]'],"['(S)-propane-1,2-diol']",['CC(CO)O'],[None]
2,1HIBUPGLU_Stex,1-hydroxy S-ibuprofen-glucuronide diffusion ex...,,,,,,['1hibupglu_S[e]'],['1-hydroxy S-ibuprofen-glucuronide'],[None],[None]
3,1HIBUP_S_GLCAASEepp,1-hydroxy S-ibuprofen beta-glucuronidase extr...,,,,,,"['1hibupglu_S[p]', 'h2o[p]']","['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",[None]
4,1HIBUP_Stex,1-hydroxy S-ibuprofen diffusion extracellular ...,,,,,,['1hibup_S[e]'],['1-hydroxy S-ibuprofen'],[None],[None]
...,...,...,...,...,...,...,...,...,...,...,...
2433,tCAPpp,"Chloramphenicol transport, reversible, hypothe...",,,,,,['chlphncl[p]'],['Chloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[...,[None]
2434,tCZPpp,"Clonazepam transport, reversible, hypothetical...",,,,,,['czp[p]'],['Clonazepam'],['C1C(=O)NC2=C(C=C(C=C2)[N+](=O)[O-])C(=N1)C3=...,[None]
2435,tNCAPpp,"Nitrosochloramphenicol transport, reversible, ...",,,,,,['nchlphncl[p]'],['Nitrosochloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)N=O'],[None]
2436,tNZPpp,"Nitrazepam transport, reversible, hypothetical...",,,,,,['nzp[p]'],"['Nitrazepam, Benzalin, Neozepam']",[None],[None]


## BRENDA query

### Kcat extraction

In [16]:
from brendapyrser import BRENDA


dataFile = '../../brenda_2023_1.txt'

In [17]:
brenda = BRENDA(dataFile)
brenda

0,1
Number of Enzymes,7832
BRENDA copyright,"Copyrighted by Dietmar Schomburg, Techn. University  Braunschweig, GERMANY. Distributed under the License as stated  at http:/www.brenda-enzymes.org"
Parser version,0.0.1
Author,"Semidán Robaina Estévez, 2020"


In [18]:
r = brenda.reactions.get_by_id('1.1.1.86')

r.proteins

{'1': {'name': 'Salmonella enterica subsp. enterica serovar Typhimurium',
  'proteinID': '',
  'refs': ['2', '5', '6']},
 '2': {'name': 'Vigna radiata var. radiata', 'proteinID': '', 'refs': ['3']},
 '3': {'name': 'Escherichia coli',
  'proteinID': '',
  'refs': ['7', '9', '10', '15', '19', '21', '22', '30', '35', '45', '47']},
 '4': {'name': 'Saccharomyces cerevisiae',
  'proteinID': '',
  'refs': ['1', '32']},
 '5': {'name': 'Triticum aestivum', 'proteinID': '', 'refs': ['19']},
 '6': {'name': 'Neurospora crassa', 'proteinID': '', 'refs': ['4']},
 '7': {'name': 'Hordeum vulgare', 'proteinID': '', 'refs': ['12']},
 '8': {'name': 'Spinacia oleracea',
  'proteinID': '',
  'refs': ['8', '11', '13', '14', '16', '17', '18', '33', '35', '36']},
 '9': {'name': 'Pisum sativum', 'proteinID': '', 'refs': ['27']},
 '10': {'name': 'Pseudomonas aeruginosa',
  'proteinID': '',
  'refs': ['20', '24']},
 '11': {'name': 'Corynebacterium glutamicum',
  'proteinID': '',
  'refs': ['23', '25', '47']},
 '

In [19]:
kcat_ls = []
ec_ls = ec_nona['ecNumber'].values.tolist()

for ec in tqdm(ec_ls):
    ec = ec.split(',')
    sub_kcat_ls = []
    #print(ec)
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip(' ')
        #print(ec_n)
        if "-" in ec_n:
            pass
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.get_values()
            except ValueError:
                sub_kcat_ls.append(None)
            if len(kcat_va) == 0:
                sub_kcat_ls.append(None)
            else:
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
    kcat_ls.append(sub_kcat_ls)

ec_nona['Avg Kcat (by ec)'] = kcat_ls

  0%|          | 0/838 [00:00<?, ?it/s]

In [20]:
kcat_ls = []
ec_ls = ec_nona['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec = ec.split(',')
    sub_kcat_ls = []
    #print(ec)
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        #print(ec_n)
        if "-" in ec_n:
            pass
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.filter_by_organism('Bacteroides thetaiotaomicron').get_values()
            except ValueError:
                sub_kcat_ls.append(None)
            if len(kcat_va) == 0:
                sub_kcat_ls.append(None)
            else:
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
    kcat_ls.append(sub_kcat_ls)

ec_nona['Avg Kcat (by ec and species)'] = kcat_ls

  0%|          | 0/838 [00:00<?, ?it/s]

In [21]:
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species)
8,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",['1.1.1.86'],[2.0091262401360535],[None]
9,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]"
10,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]"
11,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]"
12,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"['23cgmp[c]', 'h2o[c]']","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2421,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"['atp[c]', 'ptth[c]']","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]","[0.8912105263157896, None]","[None, None]"
2422,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"['4ppan[c]', 'ctp[c]', 'cys_L[c]']","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]","[1.1400000000000001, None, 1.1400000000000001]","[None, None, None]"
2423,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"['ahdt[c]', 'h2o[c]']","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",['3.5.4.16'],[0.009113],[None]
2424,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"['HC01651[c]', 'h2o[c]']",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,['3.5.4.16'],[0.009113],[None]


### Sequence extraction

In [22]:
seq_ls = []
ec_ls = ec_nona['ecNumber'].values.tolist()


for ec in tqdm(ec_ls):
    ec = ec.split(',')
    sub_seq_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        else:
            try:
                from zeep import Client
                import hashlib

                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ( "alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","organism*Escherichia coli","sequence*",
                              "noOfAminoAcids*","firstAccessionCode*","source*",'id*' )
                resultString = client.service.getSequence(*parameters) 
                sub_seq_ls.append(resultString[0]['sequence'])
            except:
                sub_seq_ls.append(None)                  
    seq_ls.append(sub_seq_ls)

  0%|          | 0/838 [00:00<?, ?it/s]

In [23]:
print(seq_ls)

[['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGCGAQGLNQGLNMRDSGLDISYALRKEAIAEKRASWRKATENGFKVGTYEELIPQADLVVNLTPDKQHSDVVRTVQPLMKDGAALGYSHGFNIVEVGEQIRKDITVVMVAPKCPGTEVREEYKRGFGVPTLIAVHPENDPKGEGMAIAKAWAAATGGHRAGVLESSFVAEVKSDLMGEQTILCGMLQAGSLLCFDKLVEEGTDPAYAEKLIQFGWETITEALKQGGITLMMDRLSNPAKLRAYALSEQLKEIMAPLFQKHMDDIISGEFSSGMMADWANDDKKLLTWREETGKTAFETAPQYEGKIGEQEYFDKGVLMIAMVKAGVELAFETMVDSGIIEESAYYESLHELPLIANTIARKRLYEMNVVISDTAEYGNYLFSYACVPLLKPFMAELQPGDLGKAIPEGAVDNAQLRDVNEAIRSHAIEQVGKKLRGYMTDMKRIAVAG'], ['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDTATEKFGLVRTASLINDARNEVKNSVLVDNGDLIQGSPLADYISAKGLKAGDVHPVYKALNTLDYTVGTLGNHEFNYGLDYLKNALAGAKFPYVNANVIDTRTKQPMFTPYLIKDTEVVDKDGKKQTLKIGYIGVVPPQIMGWDKANLSGKVTVNDITETVRKYVPEMREKGADVVVVLAHSGLSADPYKVMAENSVYYLSEIPGVNAIMFGHAHAVFPGKDFADIEGADIAKGTLNGVPAVMPGMWGDHLGVVDLQLSNNSGKWQVTQAKAEARPIYDIANKKSLAAEDSKLVETLKADHDATRQFVSKPIGKSADNMYSYLALVQDDPTVQVVNNAQKAYVEHYIQGDPDLAKLPVLSAAAPFKVGGRKNDPASYVEVEKGQLTFRNAADLYLYPNTLIVVKASGKEVKEWLECSAGQFNQIDPDNTKPQSLINWDGFRTYNFDVIDGVNYQID

In [24]:
ec_nona['AA Sequence'] = seq_ls
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),AA Sequence
8,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",['1.1.1.86'],[2.0091262401360535],[None],[MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGC...
9,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...
10,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...
11,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...
12,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"['23cgmp[c]', 'h2o[c]']","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2421,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"['atp[c]', 'ptth[c]']","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]","[0.8912105263157896, None]","[None, None]",[MKIELTVNGLNVQAQYHDEEIERVHKPLLRMLAALQTVNPQRRTV...
2422,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"['4ppan[c]', 'ctp[c]', 'cys_L[c]']","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]","[1.1400000000000001, None, 1.1400000000000001]","[None, None, None]",[MSLAGKKIVLGVSGGIAAYKTPELVRRLRDRGADVRVAMTEAAKA...
2423,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"['ahdt[c]', 'h2o[c]']","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",['3.5.4.16'],[0.009113],[None],[VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMR...
2424,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"['HC01651[c]', 'h2o[c]']",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,['3.5.4.16'],[0.009113],[None],[VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMR...


### Molecular Weight extraction

In [25]:
mw_ls = []
ec_ls = ec_nona['ecNumber'].values.tolist()
ec_ls = [x.strip(' ') for x in ec_ls]


for ec in tqdm(ec_ls):
    ec = ec.split(',')
    sub_mw_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        else:
            try:
                from zeep import Client
                import hashlib
                
                res_mw = 0
                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ( "alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","organism*Escherichia coli","molecularWeight*",
              "molecularWeightMaximum*","commentary*","literature*" )
                resultString = client.service.getMolecularWeight(*parameters)
                for i in range(len(resultString)):
                    res_mw = res_mw + int(resultString[i]['molecularWeight'])
                res_mw = res_mw/len(resultString)
                sub_mw_ls.append(res_mw)
            except:
                sub_mw_ls.append(None)                  
    mw_ls.append(sub_mw_ls)

  0%|          | 0/838 [00:00<?, ?it/s]

In [26]:
ec_nona['Molecular Weight'] = mw_ls
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),AA Sequence,Molecular Weight
8,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",['1.1.1.86'],[2.0091262401360535],[None],[MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGC...,[141357.0]
9,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...,"[None, None]"
10,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...,"[None, None]"
11,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...,"[None, None]"
12,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"['23cgmp[c]', 'h2o[c]']","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...,"[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2421,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"['atp[c]', 'ptth[c]']","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]","[0.8912105263157896, None]","[None, None]",[MKIELTVNGLNVQAQYHDEEIERVHKPLLRMLAALQTVNPQRRTV...,"[36100.0, None]"
2422,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"['4ppan[c]', 'ctp[c]', 'cys_L[c]']","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]","[1.1400000000000001, None, 1.1400000000000001]","[None, None, None]",[MSLAGKKIVLGVSGGIAAYKTPELVRRLRDRGADVRVAMTEAAKA...,"[37700.0, None]"
2423,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"['ahdt[c]', 'h2o[c]']","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",['3.5.4.16'],[0.009113],[None],[VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMR...,[125625.125]
2424,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"['HC01651[c]', 'h2o[c]']",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,['3.5.4.16'],[0.009113],[None],[VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMR...,[125625.125]


In [27]:
ec_nona.to_csv(f'../data/ec_nona_{model.id}.csv',na_rep='None')

In [10]:
ec_nona = pd.read_csv(f'../data/ec_nona_{model.id}.csv')
ec_nona = ec_nona.iloc[:,1:]
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),AA Sequence,Molecular Weight
0,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",['1.1.1.86'],[2.0091262401360535],[None],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
1,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
2,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
3,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
4,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"['23cgmp[c]', 'h2o[c]']","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"['atp[c]', 'ptth[c]']","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]","[0.8912105263157896, None]","[None, None]",['MKIELTVNGLNVQAQYHDEEIERVHKPLLRMLAALQTVNPQRRT...,"[36100.0, None]"
834,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"['4ppan[c]', 'ctp[c]', 'cys_L[c]']","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]","[1.1400000000000001, None, 1.1400000000000001]","[None, None, None]",['MSLAGKKIVLGVSGGIAAYKTPELVRRLRDRGADVRVAMTEAAK...,"[37700.0, None]"
835,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"['ahdt[c]', 'h2o[c]']","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",['3.5.4.16'],[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
836,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"['HC01651[c]', 'h2o[c]']",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,['3.5.4.16'],[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]


In [11]:
#ec_nona['Substrates'] = literal_eval(ec_nona['Substrates']) #convert to list type
ec_nona['Substrate Name'] = ec_nona['Substrate Name'].apply(literal_eval) #convert to list type
ec_nona['Substrate SMILE'] = ec_nona['Substrate SMILE'].apply(literal_eval) #convert to list type
ec_nona = ec_nona.explode(['Substrate Name','Substrate SMILE']).reset_index(drop=True)

In [12]:
ec_nona = ec_nona.drop(columns=['Substrates'])
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),AA Sequence,Molecular Weight
0,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,['1.1.1.86'],[2.0091262401360535],[None],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
1,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,['1.1.1.86'],[2.0091262401360535],[None],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
2,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"2,3-cyclic UMP(1-)",,"[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
3,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,Water,O,"[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
4,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"2,3-Cyclic CMP",,"[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1786,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"7,8-dihydroneopterin 3-triphosphate(4-)",,['3.5.4.16'],[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1787,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,Water,O,['3.5.4.16'],[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1788,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,Formamidopyrimidine nucleoside triphosphate,C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP(=...,['3.5.4.16'],[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1789,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,Water,O,['3.5.4.16'],[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]


In [13]:
print(get_smiles("7,8-dihydroneopterin 3'-triphosphate(4-)"))

C1C(=NC2=C(N1)N=C(NC2=O)N)C(C(COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])O)O


In [14]:
subs_name = ec_nona["Substrate Name"].values.tolist()
subs_smile = ec_nona["Substrate SMILE"].values.tolist()
subs_name2 = subs_name.copy()
subs_smile2 = subs_smile.copy()
subs_name3 = []

for i in range(len(subs_name2)):
    sn = subs_name2[i]
    sn = str(sn)
    if "2,3-C" in sn:
        #print(sn)
        sn = sn.replace("2,3","2',3'")
        #print(sn)
        smile = get_smiles(sn)
        subs_smile2[i] = smile
        #print(smile)
    elif "2,3-c" in sn:
        #print(sn)
        sn = sn.replace("2,3","2',3'")
        #print(sn)
        smile = get_smiles(sn)
        subs_smile2[i] = smile
        #print(smile)
    elif "3-triphosphate" in sn:
        #print(sn)
        sn = sn.replace("3","3'")
        #print(sn)
        smile = get_smiles(sn)
        subs_smile2[i] = smile
        #print(smile)
    else:
        pass
    subs_name3.append(sn)
    
ec_nona["Substrate Name"] = subs_name3
ec_nona["Substrate SMILE"] = subs_smile2

In [15]:
print(f'Before review: {int(len(subs_smile)-len([x for x in subs_smile if x is not None]))} is None out of {len(subs_smile)}')
print(f'After review: {int(len(subs_smile)-len([x for x in subs_smile2 if x is not None]))} is None out of {len(subs_smile)}')

Before review: 352 is None out of 1791
After review: 345 is None out of 1791


In [16]:
for i in range(len(subs_name2)):
    if subs_smile2[i] == None:
        print(subs_name2[i])

acyl carrier protein
(R)-3-Hydroxydecanoyl-[acyl-carrier protein]
10-methyl-3-hydroxy-undecanoyl-ACP
10-methyl-3-hydroxy-dodecanoyl-ACP
11-methyl-3-hydroxy-dodecanoyl-ACP
(R)-3-Hydroxydodecanoyl-[acyl-carrier protein]
12-methyl-3-hydroxy-tridecanoyl-ACP
12-methyl-3-hydroxy-tetra-decanoyl-ACP
13-methyl-3-hydroxy-tetra-decanoyl-ACP
(R)-3-Hydroxytetradecanoyl-[acyl-carrier protein]
14-methyl-3-hydroxy-pentadecanoyl-ACP
14-methyl-3-hydroxy-hexa-decanoyl-ACP
15-methyl-3-hydroxy-hexa-decanoyl-ACP
R-3-hydroxypalmitoyl-[acyl-carrier protein]
(R)-3-Hydroxyoctadecanoyl-[acyl-carrier protein]
4-methyl-3-hydroxy-pentanoyl-ACP
4-methyl-3-hydroxy-hexanoyl-ACP
5-methyl-3-hydroxy-hexanoyl-ACP
(R)-3-Hydroxyhexanoyl-[acyl-carrier protein]
6-methyl-3-hydroxy-heptanoyl-ACP
6-methyl-3-hydroxy-octanoyl-ACP
7-methyl-3-hydroxy-octanoyl-ACP
(R)-3-Hydroxyoctanoyl-[acyl-carrier protein]
8-methyl-3-hydroxy-decanoyl-ACP
8-methyl-3-hydroxy-nonanoyl-ACP
9-methyl-3-hydroxy-decanoyl-ACP
3-Oxodecanoyl-[acyl-carrier pro

## DLKcat - Kcat prediction

In [17]:
dk_prep = ec_nona.drop(columns=['ModelSEED_id','MetaNetX','KEGG_id','KEGG_id2','BIGG_id','Avg Kcat (by ec)','Avg Kcat (by ec and species)'])
dk_prep

Unnamed: 0,Reaction,Name,Substrate Name,Substrate SMILE,ecNumber,AA Sequence,Molecular Weight
0,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...","(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,['1.1.1.86'],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
1,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,['1.1.1.86'],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
2,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase","2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],"[['3.1.4.16'], ['3.1.4.16']]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
3,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",Water,O,"[['3.1.4.16'], ['3.1.4.16']]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
4,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase","2',3'-Cyclic CMP",C1=CN(C(=O)N=C1N)C2C3C(C(O2)CO)OP(=O)(O3)[O-],"[['3.1.4.16'], ['3.1.4.16']]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
...,...,...,...,...,...,...,...
1786,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...","7,8-dihydroneopterin 3'-triphosphate(4-)",C1C(=NC2=C(N1)N=C(NC2=O)N)C(C(COP(=O)([O-])OP(...,['3.5.4.16'],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1787,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",Water,O,['3.5.4.16'],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1788,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",Formamidopyrimidine nucleoside triphosphate,C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP(=...,['3.5.4.16'],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1789,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",Water,O,['3.5.4.16'],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]


In [18]:
ec_l = dk_prep['ecNumber'].values.tolist()
ec_nl = []
mw_nl = []
seq_l = dk_prep['AA Sequence'].values.tolist()
mw_l = dk_prep['Molecular Weight'].values.tolist()
ei = []
mi = []

for es in ec_l:
    es = str(es)
    es = es.split(',')
    ei = []
    for sublist in es:
        sublist = str(sublist)
        sublist = sublist.strip("[[")
        sublist = sublist.strip(" ")
        sublist = sublist.strip("'")
        sublist = sublist.strip("[")
        sublist = sublist.strip("[")
        sublist = sublist.strip("'")
        sublist = sublist.strip("'")
        sublist = sublist.strip("]")
        sublist = sublist.strip("]]")  
        sublist = sublist.strip("'")
        if sublist not in ei:
            if sublist == 'None' and len(ei) > 0:
                pass
            elif '-' in sublist:
                pass
            else:
                ei.append(sublist)
    ec_nl.append(ei)

print(ec_nl)

[['1.1.1.86'], ['1.1.1.86'], ['3.1.4.16'], ['3.1.4.16'], ['3.1.4.16'], ['3.1.4.16'], ['3.1.4.16'], ['3.1.4.16'], ['3.1.4.16'], ['3.1.4.16'], ['2.6.1.83'], ['2.6.1.83'], ['1.1.1.86', '5.4.99.3'], ['2.3.1.180'], ['2.3.1.180'], ['2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61'], ['4.2.1.59'], ['4.2.1.59'], ['4.2.1.59'], ['2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61'], ['4.2.1.59'], ['4.2.1.59'], ['4.2.1.59'], ['2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61'], ['4.2.1.59'], ['4.2.1.59'], ['4.2.1.59'], ['2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61'], ['4.2.1.59'], ['4.2.1.59'], ['4.2.1.59'], ['4.2.1.59'], ['2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59'], ['4.2.1.59'], ['4.2.1.59'], ['4.2.1.59'], ['2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.61'], ['4.2.1.59'], ['4.2.1.59'], ['4.2.1.59'], ['2.3.3.13'], ['2.3.3.13'], ['2.3.3.13'], ['1.1.1.100', '2.3.1.85', '2.3.1.86'], ['1.1.1.100', '2.3.1.85', '2.3.1.86'], ['1.1.1.100', '2.3.1.85', '2.

In [19]:
mw_l = dk_prep['Molecular Weight'].values.tolist()
mw_nl = []

for mw in mw_l:
    mw = str(mw)
    mw = mw.split(',')
    mi = []
    #print(mw)
    for subw in mw:
        subw = str(subw)
        subw = subw.strip("[[")
        subw = subw.strip(" ")
        subw = subw.strip("'")
        subw = subw.strip("[")
        subw = subw.strip("[")
        subw = subw.strip("'")
        subw = subw.strip("'")
        subw = subw.strip("]")
        subw = subw.strip("]]")  
        subw = subw.strip("'")
        #print(subw)
        if subw not in mi:
            if subw == 'None' and len(mi) > 0:
                pass
            else:
                mi.append(subw)
    mw_nl.append(mi)

print(mw_nl)

[['141357.0'], ['141357.0'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['141357.0'], ['41851.25'], ['41851.25'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'], ['None'],

In [20]:
print(len(ec_nl))
print(len(mw_nl))

1791
1791


In [21]:
dk_prep['ecNumber'] = ec_nl
dk_prep['Molecular Weight'] = mw_nl
dk_prep

Unnamed: 0,Reaction,Name,Substrate Name,Substrate SMILE,ecNumber,AA Sequence,Molecular Weight
0,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...","(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,[1.1.1.86],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
1,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,[1.1.1.86],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
2,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase","2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],[3.1.4.16],['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,[None]
3,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",Water,O,[3.1.4.16],['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,[None]
4,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase","2',3'-Cyclic CMP",C1=CN(C(=O)N=C1N)C2C3C(C(O2)CO)OP(=O)(O3)[O-],[3.1.4.16],['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,[None]
...,...,...,...,...,...,...,...
1786,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...","7,8-dihydroneopterin 3'-triphosphate(4-)",C1C(=NC2=C(N1)N=C(NC2=O)N)C(C(COP(=O)([O-])OP(...,[3.5.4.16],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1787,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",Water,O,[3.5.4.16],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1788,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",Formamidopyrimidine nucleoside triphosphate,C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP(=...,[3.5.4.16],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1789,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",Water,O,[3.5.4.16],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]


In [22]:
dk_prep['AA Sequence'] = dk_prep['AA Sequence'].apply(literal_eval) #convert to list type
dk_prep = dk_prep.explode(['AA Sequence']).reset_index(drop=True)


In [26]:
dk_inp = dk_prep.drop(columns=['Reaction', 'Name', 'ecNumber', 'Molecular Weight'])
dk_inp = dk_inp.rename(columns = {'AA Sequence':'Protein Sequence','Substrate SMILE':'Substrate SMILES'})
dk_inp.to_csv(f'../../DLKcat/DeeplearningApproach/Code/example/dk_input_{model.id}.tsv',sep="\t",na_rep='None',index= False)

In [27]:
dk_inp

Unnamed: 0,Substrate Name,Substrate SMILES,Protein Sequence
0,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGCG...
1,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGCG...
2,"2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDTA...
3,"2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDTA...
4,Water,O,MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDTA...
...,...,...,...
4341,"7,8-dihydroneopterin 3'-triphosphate(4-)",C1C(=NC2=C(N1)N=C(NC2=O)N)C(C(COP(=O)([O-])OP(...,VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMRG...
4342,Water,O,VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMRG...
4343,Formamidopyrimidine nucleoside triphosphate,C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP(=...,VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMRG...
4344,Water,O,VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMRG...


In [None]:
#!python ../../DLKcat/DeeplearningApproach/Code/example/prediction_for_input.py ../../DLKcat/DeeplearningApproach/Code/example/dk_input_{model.id}.tsv

In [28]:
dk_out = pd.read_csv(f'../../DLKcat/DeeplearningApproach/Code/example/output.tsv', sep="\t")
dk_out['Molecular Weight'] = dk_prep['Molecular Weight']
dk_out

Unnamed: 0,Substrate Name,Substrate SMILES,Protein Sequence,Kcat value (1/s),Molecular Weight
0,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGCG...,1.3982,[141357.0]
1,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGCG...,1.4882,[141357.0]
2,"2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDTA...,2.8881,[None]
3,"2',3'-cyclic UMP(1-)",C1=CN(C(=O)NC1=O)C2C3C(C(O2)CO)OP(=O)(O3)[O-],MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDTA...,2.8881,[None]
4,Water,O,MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDTA...,2.3430,[None]
...,...,...,...,...,...
308,proton,[H+],,,[None]
309,Malonyl-[acyl-carrier protein],,,,[None]
310,Malonyl-[acyl-carrier protein],,,,[None]
311,Malonyl-[acyl-carrier protein],,,,[None]


In [29]:
ec_sim = add_enzyme_constraints(sim, dk_out)

Converting to irreversible: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2438/2438 [00:00<00:00, 5703.47it/s]
Splitting isozymes: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3916/3916 [00:01<00:00, 2424.56it/s]
Adding gene species:   0%|                                                                                                                                                          | 0/823 [00:00<?, ?it/s]


KeyError: 'G_6666666__46__58896__46__peg__46__2230'