# Enzymatic Constraints Enhancement of AGORA models

### Context



### Goals:

- 

In [1]:
from tqdm.auto import tqdm
from reframed.io.sbml import load_cbmodel
from cobra.io import read_sbml_model, write_sbml_model
from mewpy.cobra.util import add_enzyme_constraints
from mewpy.simulation import get_simulator
from mewpy.simulation.environment import Environment as Environment
from mewpy.util.request import retreive_gene,retreive_protein,get_smiles,brenda_query
import pandas as pd
import numpy as np
from urllib.request import urlopen
from functools import reduce
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = read_sbml_model("../models/non-ec/Bacteroides_thetaiotaomicron_VPI_5482.xml")

In [3]:
model.name

'Bacteroides thetaiotaomicron VPI 5482'

In [4]:
sim = get_simulator(model)
sim.set_objective("biomass")

In [5]:
sim.simulate()

objective: 73.25259606170353
Status: OPTIMAL
Method:SimulationMethod.FBA

## Annotation scraping

In [39]:
ls_rxn = []

for rxn in sim.reactions:
    anno = sim.get_reaction(rxn).annotations
    seed_id = anno.get('seed.reactions')
    rxn_name = sim.get_reaction(rxn).name
    ecnumber = anno.get('ec-code')
    metanetx = anno.get('metanetx.reaction')
    kegg = anno.get('kegg.reaction')
    res = [rxn,rxn_name,seed_id,metanetx,kegg,ecnumber]
    ls_rxn.append(res)
        
df_rxn = pd.DataFrame(ls_rxn,columns=[['Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','ecNumber']])

df_rxn

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber
0,12PPD_Stex,"(S)-propane-1,2-diol diffusion extracellular t...",,,,
1,12PPDtpp,"S-Propane-1,2-diol facilitated transport, peri...",,,,
2,1HIBUPGLU_Stex,1-hydroxy S-ibuprofen-glucuronide diffusion ex...,,,,
3,1HIBUP_S_GLCAASEepp,1-hydroxy S-ibuprofen beta-glucuronidase extr...,,,,
4,1HIBUP_Stex,1-hydroxy S-ibuprofen diffusion extracellular ...,,,,
...,...,...,...,...,...,...
2433,tCAPpp,"Chloramphenicol transport, reversible, hypothe...",,,,
2434,tCZPpp,"Clonazepam transport, reversible, hypothetical...",,,,
2435,tNCAPpp,"Nitrosochloramphenicol transport, reversible, ...",,,,
2436,tNZPpp,"Nitrazepam transport, reversible, hypothetical...",,,,


In [None]:
mask = df_rxn['ecNumber'].values!=None
ec_nona = df_rxn[mask]
ec_nona

In [40]:
seed_id = df_rxn['ModelSEED_id'].values.tolist()

seed_id = [reduce(lambda x: x, inner_list) for inner_list in seed_id]

metanetx_id = df_rxn['MetaNetX'].values.tolist()

metanetx_id = [reduce(lambda x: x, inner_list) for inner_list in metanetx_id]

kegg_id = df_rxn['KEGG_id'].values.tolist()

kegg_id = [reduce(lambda x: x, inner_list) for inner_list in kegg_id]

In [45]:
print(metanetx_id)

[None, None, None, None, None, None, None, None, 'MNXR83171', 'MNXR34', 'MNXR35', 'MNXR36', 'MNXR37', 'MNXR33992', 'MNXR76597', None, None, None, None, None, None, None, None, None, None, 'MNXR53357', None, None, None, None, None, None, None, None, None, None, None, 'MNXR68167', 'MNXR27697', 'MNXR27647', 'MNXR27672', 'MNXR68168', 'MNXR27701', 'MNXR27651', 'MNXR27676', 'MNXR68170', 'MNXR27705', 'MNXR27655', 'MNXR27680', 'MNXR68172', 'MNXR27736', 'MNXR71008', 'MNXR27685', 'MNXR27635', 'MNXR27660', 'MNXR68177', 'MNXR27689', 'MNXR27639', 'MNXR27664', 'MNXR68178', 'MNXR27643', 'MNXR27693', 'MNXR27668', None, None, None, None, 'MNXR3135', None, None, 'MNXR68183', 'MNXR27696', 'MNXR27646', 'MNXR27671', 'MNXR68184', 'MNXR27700', 'MNXR27650', 'MNXR27675', 'MNXR68185', 'MNXR27704', 'MNXR27654', 'MNXR27679', 'MNXR68186', 'MNXR27735', 'MNXR74285', 'MNXR27684', 'MNXR27634', 'MNXR27659', 'MNXR68190', 'MNXR27688', 'MNXR27638', 'MNXR27663', 'MNXR68191', 'MNXR27642', 'MNXR27692', 'MNXR27667', 'MNXR8138

 ## ModelSEED query

In [None]:
SOLR_URL='https://modelseed.org'
ls_name = []
ls_kegg = []
ls_bigg = []

for mseed_id in tqdm(seed_id):
    if mseed_id=="rxn10067" or mseed_id=="rxn13783" or mseed_id=="rxn13782" or mseed_id=="rxn13784" or mseed_id==None:
        ls_name.append(None)
        ls_kegg.append(None)
        ls_bigg.append(None)
    else:
        connection = urlopen(SOLR_URL+f'/solr/reactions/select?wt=json&q=id:{mseed_id}&fl=name,id,formula,charge,aliases')
        response = json.load(connection)
        for document in response['response']['docs']:
            #print(document.get('name'),document.get('id'),document.get('formula'),document.get('charge'),document.get('aliases'))  
            ms_name = document.get('name')
            ls_alias = document.get('aliases')
            ms_bigg = list(filter(lambda a: 'BiGG:' in a, document.get('aliases')))
            ms_kegg = list(filter(lambda a: 'KEGG:' in a, document.get('aliases')))
            if len(ms_bigg)== 0 and len(ms_kegg)== 0:
                ms_bigg = None
                ms_kegg = None
            elif len(ms_bigg)== 0 and len(ms_kegg)!= 0:
                ms_bigg = None
                ms_kegg = list(ms_kegg)[0]
                ms_kegg = ms_kegg.replace('KEGG: ','')
            elif len(ms_bigg)!= 0 and len(ms_kegg)== 0:
                ms_kegg = None
                ms_bigg = list(ms_bigg)[0]
                ms_bigg = ms_bigg.replace('BiGG: ','')
            else:
                ms_kegg = list(ms_kegg)[0]
                ms_kegg = ms_kegg.replace('KEGG: ','')
                ms_bigg = list(ms_bigg)[0]
                ms_bigg = ms_bigg.replace('BiGG: ','')    
            ls_name.append(ms_name)
            ls_bigg.append(ms_bigg)
            ls_kegg.append(ms_kegg)

In [44]:
print(ls_name)

NameError: name 'ls_name' is not defined

In [None]:
print(ls_bigg)

In [None]:
print(ls_kegg)

In [None]:
['None' if v is None else v for v in ls_bigg]
len(ls_bigg)

In [None]:
df_rxn['BIGG_id'] = ls_bigg
df_rxn['KEGG_id2'] = ls_kegg

## MetaNetX query

## BiGG query

In [None]:
import requests

for bigg in df_rxn['BIGG_id'].values.to_list():
    if bigg == None:
        pass
    else:
        url =f'http://bigg.ucsd.edu/api/v2/universal/reactions/{bigg}'
        resp = requests.get(url)
        print(resp.content)

## KEGG query

## Substrates

In [None]:
ls_sub = []

for rxn in sim.reactions:
    sub = list(sim.get_substrates(rxn).keys())
    ls_sub.append(sub)
    
df_rxn["Substrates"] = ls_sub   

df_rxn

In [None]:
sub_na = df_rxn['Substrates'].values.tolist()

ls_sub = []
ls_smile =[]



for sub_l in tqdm(sub_na):
    sub_ls_sub = []
    sub_ls_smile = []
    for sub_s in sub_l:
        for sub in sub_s:
            sub_name = sim.get_metabolite(sub).get('name')
            smile = get_smiles(sub_name)
            sub_ls_smile.append(smile)
            sub_ls_sub.append(sub_name)
        ls_sub.append(sub_ls_sub)
        ls_smile.append(sub_ls_smile)

df_rxn['Substrate Name'] = ls_sub
df_rxn['Substrate SMILE'] = ls_smile
df_rxn.to_csv(f'../data/rxn_data_{model.id}.csv',na_rep='None')

In [52]:
df_rxn = pd.read_csv(f'../data/rxn_data_{model.id}.csv')
df_rxn = df_rxn.iloc[:,1:]
df_rxn

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,ecNumber,BIGG_id,KEGG_id2,Substrates,Substrate Name,Substrate SMILE
0,12PPD_Stex,"(S)-propane-1,2-diol diffusion extracellular t...",,,,,,,['12ppd_S[e]'],"['(S)-propane-1,2-diol']",['CC(CO)O']
1,12PPDtpp,"S-Propane-1,2-diol facilitated transport, peri...",,,,,,,['12ppd_S[p]'],"['(S)-propane-1,2-diol']",['CC(CO)O']
2,1HIBUPGLU_Stex,1-hydroxy S-ibuprofen-glucuronide diffusion ex...,,,,,,,['1hibupglu_S[e]'],['1-hydroxy S-ibuprofen-glucuronide'],[None]
3,1HIBUP_S_GLCAASEepp,1-hydroxy S-ibuprofen beta-glucuronidase extr...,,,,,,,"['1hibupglu_S[p]', 'h2o[p]']","['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']"
4,1HIBUP_Stex,1-hydroxy S-ibuprofen diffusion extracellular ...,,,,,,,['1hibup_S[e]'],['1-hydroxy S-ibuprofen'],[None]
...,...,...,...,...,...,...,...,...,...,...,...
2433,tCAPpp,"Chloramphenicol transport, reversible, hypothe...",,,,,,,['chlphncl[p]'],['Chloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[...
2434,tCZPpp,"Clonazepam transport, reversible, hypothetical...",,,,,,,['czp[p]'],['Clonazepam'],['C1C(=O)NC2=C(C=C(C=C2)[N+](=O)[O-])C(=N1)C3=...
2435,tNCAPpp,"Nitrosochloramphenicol transport, reversible, ...",,,,,,,['nchlphncl[p]'],['Nitrosochloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)N=O']
2436,tNZPpp,"Nitrazepam transport, reversible, hypothetical...",,,,,,,['nzp[p]'],"['Nitrazepam, Benzalin, Neozepam']",[None]


In [53]:
df_rxn = df_rxn.loc[:,['Reaction','Name','ModelSEED_id','MetaNetX','KEGG_id','KEGG_id2','BIGG_id','Substrates','Substrate Name','Substrate SMILE','ecNumber']]

In [64]:
import requests

ls_bigg = df_rxn['BIGG_id'].values.tolist()

ls_bigg = [x.strip(' ') for x in ls_bigg]

bigg_ls = []

for bigg in ls_bigg:
    bigg = bigg.split(';')
    bigg = [x.strip(' ') for x in bigg]
    sub_bigg_ls = []
    for bigg_n in bigg:
        #print(bigg_n)
        if bigg_n == 'None':
            pass
        else:
            url =f'http://bigg.ucsd.edu/api/v2/universal/reactions/{bigg_n}'
            with requests.request("GET", url) as resp:
                try:
                    resp.raise_for_status()  # raises exception when not a 2xx response
                    if resp.status_code != 204:
                        data = dict(resp.json())
                        ec_l = data['database_links']
                        if ec_l == None:
                            sub_bigg_ls.append(None)
                        else:
                            ec = [i['id'] for i in ec_l['EC Number']]
                            if ec == None:
                                sub_bigg_ls.append(None)
                            #print(ec)
                            sub_bigg_ls.append(ec)
                    else: 
                        sub_bigg_ls.append(None)
                except:
                    sub_bigg_ls.append(None)
    bigg_ls.append(sub_bigg_ls)

In [65]:
print(bigg_ls)

[[], [], [], [], [], [], [], [], [], [['3.1.4.16'], ['3.1.4.16']], [['3.1.4.16'], ['3.1.4.16']], [['3.1.4.16'], ['3.1.4.16']], [['3.1.4.16'], ['3.1.4.16']], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61'], None], [], [], [], [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61'], None], [], [], [], [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61']], [], [], [], [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61']], [], [None], [], [], [], [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59'], None], [], [], [], [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.61'], None], [], [], [], [], [], [], [], [['2.3.3.13'], ['2.3.3.13'], None], [], [], [['1.1.1.100', '2.3.1.-', '2.3.1.85', '2.3.1.86'], None, None], [], [], [], [['1.1.1.100', '2.3.1.-', '2.3.1.85', '2.3.1.86'], None], [], [],

In [66]:
ec_l = df_rxn['ecNumber'].values.tolist()
print(ec_l)

['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1.1.1.86', "[['3.1.4.16'], ['3.1.4.16']]", "[['3.1.4.16'], ['3.1.4.16']]", "[['3.1.4.16'], ['3.1.4.16']]", "[['3.1.4.16'], ['3.1.4.16']]", '2.6.1.83', '1.1.1.86, 5.4.99.3', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '2.3.1.180', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', "[['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61'], None]", '4.2.1.59', '4.2.1.59', '4.2.1.59', "[['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61'], None]", '4.2.1.59', '4.2.1.59', '4.2.1.59', "[['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61']]", '4.2.1.59', '4.2.1.59', '4.2.1.59', "[['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61']]", '4.2.1.59', '[None]', '4.2.1.59', '4.2.1.59', '4.2.1.59', "[['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59'], None]", '4.2.1.59', '4.2.1.59',

In [67]:
new_l = [next(filter(None, i)) for i in zip(bigg_ls, ec_l)]
print(new_l)

['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '1.1.1.86', [['3.1.4.16'], ['3.1.4.16']], [['3.1.4.16'], ['3.1.4.16']], [['3.1.4.16'], ['3.1.4.16']], [['3.1.4.16'], ['3.1.4.16']], '2.6.1.83', '1.1.1.86, 5.4.99.3', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', '2.3.1.180', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61'], None], '4.2.1.59', '4.2.1.59', '4.2.1.59', [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59', '4.2.1.60', '4.2.1.61'], None], '4.2.1.59', '4.2.1.59', '4.2.1.59', [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61']], '4.2.1.59', '4.2.1.59', '4.2.1.59', [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.59', '4.2.1.61']], '4.2.1.59', [None], '4.2.1.59', '4.2.1.59', '4.2.1.59', [['2.3.1.-', '2.3.1.85', '2.3.1.86', '4.2.1.58', '4.2.1.59'], None], '4.2.1.59', '4.2.1.59', '4.2.1.59', [['2.3.

In [69]:
df_rxn['ecNumber'] = new_l

In [70]:
df_rxn.to_csv(f'../data/rxn_data_{model.id}.csv',na_rep='None')

In [6]:
df_rxn = pd.read_csv(f'../data/rxn_data_{model.id}.csv')
df_rxn = df_rxn.iloc[:,1:]
df_rxn

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber
0,12PPD_Stex,"(S)-propane-1,2-diol diffusion extracellular t...",,,,,,['12ppd_S[e]'],"['(S)-propane-1,2-diol']",['CC(CO)O'],
1,12PPDtpp,"S-Propane-1,2-diol facilitated transport, peri...",,,,,,['12ppd_S[p]'],"['(S)-propane-1,2-diol']",['CC(CO)O'],
2,1HIBUPGLU_Stex,1-hydroxy S-ibuprofen-glucuronide diffusion ex...,,,,,,['1hibupglu_S[e]'],['1-hydroxy S-ibuprofen-glucuronide'],[None],
3,1HIBUP_S_GLCAASEepp,1-hydroxy S-ibuprofen beta-glucuronidase extr...,,,,,,"['1hibupglu_S[p]', 'h2o[p]']","['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",
4,1HIBUP_Stex,1-hydroxy S-ibuprofen diffusion extracellular ...,,,,,,['1hibup_S[e]'],['1-hydroxy S-ibuprofen'],[None],
...,...,...,...,...,...,...,...,...,...,...,...
2433,tCAPpp,"Chloramphenicol transport, reversible, hypothe...",,,,,,['chlphncl[p]'],['Chloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[...,
2434,tCZPpp,"Clonazepam transport, reversible, hypothetical...",,,,,,['czp[p]'],['Clonazepam'],['C1C(=O)NC2=C(C=C(C=C2)[N+](=O)[O-])C(=N1)C3=...,
2435,tNCAPpp,"Nitrosochloramphenicol transport, reversible, ...",,,,,,['nchlphncl[p]'],['Nitrosochloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)N=O'],
2436,tNZPpp,"Nitrazepam transport, reversible, hypothetical...",,,,,,['nzp[p]'],"['Nitrazepam, Benzalin, Neozepam']",[None],


### Reactions with ecNumber

In [7]:
mask = df_rxn['ecNumber'].values!='None'
ec_nona = df_rxn[mask]
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber
8,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",1.1.1.86
9,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]"
10,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]"
11,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]"
12,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"['23cgmp[c]', 'h2o[c]']","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]"
...,...,...,...,...,...,...,...,...,...,...,...
2421,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"['atp[c]', 'ptth[c]']","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]"
2422,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"['4ppan[c]', 'ctp[c]', 'cys_L[c]']","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]"
2423,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"['ahdt[c]', 'h2o[c]']","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",3.5.4.16
2424,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"['HC01651[c]', 'h2o[c]']",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,3.5.4.16


In [8]:
ec_nona = ec_nona.copy()

### Reactions without ecNumber

In [9]:
mask2 = df_rxn['ecNumber'].values=='None'
ec_na = df_rxn[mask2]
ec_na

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber
0,12PPD_Stex,"(S)-propane-1,2-diol diffusion extracellular t...",,,,,,['12ppd_S[e]'],"['(S)-propane-1,2-diol']",['CC(CO)O'],
1,12PPDtpp,"S-Propane-1,2-diol facilitated transport, peri...",,,,,,['12ppd_S[p]'],"['(S)-propane-1,2-diol']",['CC(CO)O'],
2,1HIBUPGLU_Stex,1-hydroxy S-ibuprofen-glucuronide diffusion ex...,,,,,,['1hibupglu_S[e]'],['1-hydroxy S-ibuprofen-glucuronide'],[None],
3,1HIBUP_S_GLCAASEepp,1-hydroxy S-ibuprofen beta-glucuronidase extr...,,,,,,"['1hibupglu_S[p]', 'h2o[p]']","['1-hydroxy S-ibuprofen-glucuronide', 'Water']","[None, 'O']",
4,1HIBUP_Stex,1-hydroxy S-ibuprofen diffusion extracellular ...,,,,,,['1hibup_S[e]'],['1-hydroxy S-ibuprofen'],[None],
...,...,...,...,...,...,...,...,...,...,...,...
2433,tCAPpp,"Chloramphenicol transport, reversible, hypothe...",,,,,,['chlphncl[p]'],['Chloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)[N+](=O)[...,
2434,tCZPpp,"Clonazepam transport, reversible, hypothetical...",,,,,,['czp[p]'],['Clonazepam'],['C1C(=O)NC2=C(C=C(C=C2)[N+](=O)[O-])C(=N1)C3=...,
2435,tNCAPpp,"Nitrosochloramphenicol transport, reversible, ...",,,,,,['nchlphncl[p]'],['Nitrosochloramphenicol'],['C1=CC(=CC=C1C(C(CO)NC(=O)C(Cl)Cl)O)N=O'],
2436,tNZPpp,"Nitrazepam transport, reversible, hypothetical...",,,,,,['nzp[p]'],"['Nitrazepam, Benzalin, Neozepam']",[None],


## BRENDA query

### Kcat extraction

In [10]:
from brendapyrser import BRENDA


dataFile = '../../brenda_2023_1.txt'

In [11]:
brenda = BRENDA(dataFile)
brenda

0,1
Number of Enzymes,7832
BRENDA copyright,"Copyrighted by Dietmar Schomburg, Techn. University  Braunschweig, GERMANY. Distributed under the License as stated  at http:/www.brenda-enzymes.org"
Parser version,0.0.1
Author,"Semidán Robaina Estévez, 2020"


In [12]:
r = brenda.reactions.get_by_id('1.1.1.86')

r.proteins

{'1': {'name': 'Salmonella enterica subsp. enterica serovar Typhimurium',
  'proteinID': '',
  'refs': ['2', '5', '6']},
 '2': {'name': 'Vigna radiata var. radiata', 'proteinID': '', 'refs': ['3']},
 '3': {'name': 'Escherichia coli',
  'proteinID': '',
  'refs': ['7', '9', '10', '15', '19', '21', '22', '30', '35', '45', '47']},
 '4': {'name': 'Saccharomyces cerevisiae',
  'proteinID': '',
  'refs': ['1', '32']},
 '5': {'name': 'Triticum aestivum', 'proteinID': '', 'refs': ['19']},
 '6': {'name': 'Neurospora crassa', 'proteinID': '', 'refs': ['4']},
 '7': {'name': 'Hordeum vulgare', 'proteinID': '', 'refs': ['12']},
 '8': {'name': 'Spinacia oleracea',
  'proteinID': '',
  'refs': ['8', '11', '13', '14', '16', '17', '18', '33', '35', '36']},
 '9': {'name': 'Pisum sativum', 'proteinID': '', 'refs': ['27']},
 '10': {'name': 'Pseudomonas aeruginosa',
  'proteinID': '',
  'refs': ['20', '24']},
 '11': {'name': 'Corynebacterium glutamicum',
  'proteinID': '',
  'refs': ['23', '25', '47']},
 '

In [13]:
kcat_ls = []
ec_ls = ec_nona['ecNumber'].values.tolist()

for ec in ec_ls:
    ec = ec.split(',')
    sub_kcat_ls = []
    #print(ec)
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip(' ')
        print(ec_n)
        if "-" in ec_n:
            pass
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.get_values()
            except ValueError:
                sub_kcat_ls.append(None)
            if len(kcat_va) == 0:
                sub_kcat_ls.append(None)
            else:
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
    kcat_ls.append(sub_kcat_ls)

ec_nona['Avg Kcat (by ec)'] = kcat_ls

1.1.1.86
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
2.6.1.83
1.1.1.86
5.4.99.3
2.3.1.180
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.58
4.2.1.59
4.2.1.60
4.2.1.61
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.58
4.2.1.59
4.2.1.60
4.2.1.61
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.59
4.2.1.61
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.59
4.2.1.61
4.2.1.59
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.58
4.2.1.59
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.58
4.2.1.59
4.2.1.61
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.3.13
2.3.3.13
None
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
None
1.1.1.100
1.1.1.100
1.1.1.100
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
1.1.1.100
1.1.1.100
1.1.1.100
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
1.1.1.100
1.1.1.100
1.1.1.100
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
1.1.1.100
None
1.1.1.100
1.1.1.100
1.1.1.100
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
1.1.1.100

4.1.1.47
4.1.1.47
None
None
None
2.3.1.29
2.3.1.37
None
2.3.1.29
2.3.1.37
None
1.1.1.26
None
None
None
3.4.11.2
None
3.4.11.2
None
3.4.11.23
2.7.7.70
None
3.1.3.82 3.1.3.83
6.3.5.2
None
1.1.1.351
1.1.1.44
None
None
1.1.1.351
1.1.1.44
1.1.1.351
1.1.1.44
3.2.2.1
3.2.2.8
None
3.1.4.2
3.1.4.46
3.1.4.2
3.1.4.46
3.1.4.2
3.1.4.46
3.1.4.2
3.1.4.46
3.1.4.46
3.1.4.46
2.5.1.10
3.5.4.16
3.5.4.25
None
2.7.6.5
2.4.2.22
2.4.2.7
2.4.2.8
2.4.2.22
2.4.2.7
2.4.2.8
None
5.3.1.12
5.3.1.12
2.1.2.5
4.3.1.4
None
4.2.1.1
4.2.1.1
4.2.1.1
2.1.1
2.1.1.10
None
None
None
None
2.4
2
None
2.4.1.56
None
None
2.7.1.1
2.7.1.2
None
None
2.7.1.1
2.7.1.8
None
2.7.1.1
2.7.1.7
None
None
2.7.1.1
2.7.1.4
None
None
2.5.1.-
None
None
None
4.3.1.3
None
None
4.3.1.3
4.3.1.3
None
1.1.1.23
None
3.1.3.15
None
2.5.1.61
4.3.1.8
2.7.1.49
None
None
None
None
None
1.1.1.3
1.1.1.3
None
1.1.1.3
1.1.1.3
2.3.1.31
2.3.1.31
None
2.7.1.39
None
None
2.6.1.9
2.6.1.9
None
2.4.2.8
2.4.2.8
1.12.7.2
None
2.3.1.180
1.1.1.42
None
5.4.4.2
5.4.4.2
2.4.2.-

In [14]:
kcat_ls = []
ec_ls = ec_nona['ecNumber'].values.tolist()


for ec in ec_ls:
    ec = ec.split(',')
    sub_kcat_ls = []
    #print(ec)
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        print(ec_n)
        if "-" in ec_n:
            pass
        else:
            try:
                r = brenda.reactions.get_by_id(ec_n)
                kcat_va = r.Kcatvalues.filter_by_organism('Bacteroides thetaiotaomicron').get_values()
            except ValueError:
                sub_kcat_ls.append(None)
            if len(kcat_va) == 0:
                sub_kcat_ls.append(None)
            else:
                avg_kcat = sum(kcat_va)/len(kcat_va)
                sub_kcat_ls.append(avg_kcat)
    kcat_ls.append(sub_kcat_ls)

ec_nona['Avg Kcat (by ec and species)'] = kcat_ls

1.1.1.86
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
3.1.4.16
2.6.1.83
1.1.1.86
5.4.99.3
2.3.1.180
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.58
4.2.1.59
4.2.1.60
4.2.1.61
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.58
4.2.1.59
4.2.1.60
4.2.1.61
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.59
4.2.1.61
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.59
4.2.1.61
4.2.1.59
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.58
4.2.1.59
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.1.-
2.3.1.85
2.3.1.86
4.2.1.58
4.2.1.59
4.2.1.61
None
4.2.1.59
4.2.1.59
4.2.1.59
2.3.3.13
2.3.3.13
None
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
None
1.1.1.100
1.1.1.100
1.1.1.100
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
1.1.1.100
1.1.1.100
1.1.1.100
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
1.1.1.100
1.1.1.100
1.1.1.100
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
1.1.1.100
None
1.1.1.100
1.1.1.100
1.1.1.100
1.1.1.100
2.3.1.-
2.3.1.85
2.3.1.86
None
1.1.1.100

1.1.1.351
1.1.1.44
None
None
1.1.1.351
1.1.1.44
1.1.1.351
1.1.1.44
3.2.2.1
3.2.2.8
None
3.1.4.2
3.1.4.46
3.1.4.2
3.1.4.46
3.1.4.2
3.1.4.46
3.1.4.2
3.1.4.46
3.1.4.46
3.1.4.46
2.5.1.10
3.5.4.16
3.5.4.25
None
2.7.6.5
2.4.2.22
2.4.2.7
2.4.2.8
2.4.2.22
2.4.2.7
2.4.2.8
None
5.3.1.12
5.3.1.12
2.1.2.5
4.3.1.4
None
4.2.1.1
4.2.1.1
4.2.1.1
2.1.1
2.1.1.10
None
None
None
None
2.4
2
None
2.4.1.56
None
None
2.7.1.1
2.7.1.2
None
None
2.7.1.1
2.7.1.8
None
2.7.1.1
2.7.1.7
None
None
2.7.1.1
2.7.1.4
None
None
2.5.1.-
None
None
None
4.3.1.3
None
None
4.3.1.3
4.3.1.3
None
1.1.1.23
None
3.1.3.15
None
2.5.1.61
4.3.1.8
2.7.1.49
None
None
None
None
None
1.1.1.3
1.1.1.3
None
1.1.1.3
1.1.1.3
2.3.1.31
2.3.1.31
None
2.7.1.39
None
None
2.6.1.9
2.6.1.9
None
2.4.2.8
2.4.2.8
1.12.7.2
None
2.3.1.180
1.1.1.42
None
5.4.4.2
5.4.4.2
2.4.2.-
4.1.3.-
4.3.2.M2
4.2.1.19
4.1.1.48
2.3.1.180
2.3.1.180
None
None
2.6.1.42
2.6.1.42
None
None
2.1.2.3
3.5.4.10
2.1.2.3
3.5.4.10
None
None
None
1.1.1.205
None
None
None
3.5.1.4
1.1.1.18
3

In [15]:
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species)
8,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",1.1.1.86,[2.0091262401360535],[None]
9,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]"
10,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]"
11,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]"
12,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"['23cgmp[c]', 'h2o[c]']","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2421,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"['atp[c]', 'ptth[c]']","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]","[0.8912105263157896, None]","[None, None]"
2422,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"['4ppan[c]', 'ctp[c]', 'cys_L[c]']","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]","[1.1400000000000001, None, 1.1400000000000001]","[None, None, None]"
2423,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"['ahdt[c]', 'h2o[c]']","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",3.5.4.16,[0.009113],[None]
2424,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"['HC01651[c]', 'h2o[c]']",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,3.5.4.16,[0.009113],[None]


### Sequence extraction

In [16]:
seq_ls = []
ec_ls = ec_nona['ecNumber'].values.tolist()
#ec_ls = [x.strip(' ') for x in ec_ls]


for ec in tqdm(ec_ls):
    ec = ec.split(',')
    sub_seq_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        else:
            try:
                from zeep import Client
                import hashlib

                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ( "alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","organism*Escherichia coli","sequence*",
                              "noOfAminoAcids*","firstAccessionCode*","source*",'id*' )
                resultString = client.service.getSequence(*parameters) 
                sub_seq_ls.append(resultString[0]['sequence'])
            except:
                sub_seq_ls.append(None)                  
    seq_ls.append(sub_seq_ls)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 923/923 [35:41<00:00,  2.32s/it]


In [17]:
print(seq_ls)

[['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGCGAQGLNQGLNMRDSGLDISYALRKEAIAEKRASWRKATENGFKVGTYEELIPQADLVVNLTPDKQHSDVVRTVQPLMKDGAALGYSHGFNIVEVGEQIRKDITVVMVAPKCPGTEVREEYKRGFGVPTLIAVHPENDPKGEGMAIAKAWAAATGGHRAGVLESSFVAEVKSDLMGEQTILCGMLQAGSLLCFDKLVEEGTDPAYAEKLIQFGWETITEALKQGGITLMMDRLSNPAKLRAYALSEQLKEIMAPLFQKHMDDIISGEFSSGMMADWANDDKKLLTWREETGKTAFETAPQYEGKIGEQEYFDKGVLMIAMVKAGVELAFETMVDSGIIEESAYYESLHELPLIANTIARKRLYEMNVVISDTAEYGNYLFSYACVPLLKPFMAELQPGDLGKAIPEGAVDNAQLRDVNEAIRSHAIEQVGKKLRGYMTDMKRIAVAG'], ['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDTATEKFGLVRTASLINDARNEVKNSVLVDNGDLIQGSPLADYISAKGLKAGDVHPVYKALNTLDYTVGTLGNHEFNYGLDYLKNALAGAKFPYVNANVIDTRTKQPMFTPYLIKDTEVVDKDGKKQTLKIGYIGVVPPQIMGWDKANLSGKVTVNDITETVRKYVPEMREKGADVVVVLAHSGLSADPYKVMAENSVYYLSEIPGVNAIMFGHAHAVFPGKDFADIEGADIAKGTLNGVPAVMPGMWGDHLGVVDLQLSNNSGKWQVTQAKAEARPIYDIANKKSLAAEDSKLVETLKADHDATRQFVSKPIGKSADNMYSYLALVQDDPTVQVVNNAQKAYVEHYIQGDPDLAKLPVLSAAAPFKVGGRKNDPASYVEVEKGQLTFRNAADLYLYPNTLIVVKASGKEVKEWLECSAGQFNQIDPDNTKPQSLINWDGFRTYNFDVIDGVNYQID

In [18]:
ec_nona['AA Sequence'] = seq_ls
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),AA Sequence
8,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",1.1.1.86,[2.0091262401360535],[None],[MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGC...
9,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...
10,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...
11,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...
12,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"['23cgmp[c]', 'h2o[c]']","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2421,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"['atp[c]', 'ptth[c]']","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]","[0.8912105263157896, None]","[None, None]",[MKIELTVNGLNVQAQYHDEEIERVHKPLLRMLAALQTVNPQRRTV...
2422,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"['4ppan[c]', 'ctp[c]', 'cys_L[c]']","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]","[1.1400000000000001, None, 1.1400000000000001]","[None, None, None]",[MSLAGKKIVLGVSGGIAAYKTPELVRRLRDRGADVRVAMTEAAKA...
2423,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"['ahdt[c]', 'h2o[c]']","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",3.5.4.16,[0.009113],[None],[VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMR...
2424,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"['HC01651[c]', 'h2o[c]']",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,3.5.4.16,[0.009113],[None],[VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMR...


### Molecular Weight extraction

In [None]:
mw_ls = []
ec_ls = ec_nona['ecNumber'].values.tolist()
ec_ls = [x.strip(' ') for x in ec_ls]


for ec in tqdm(ec_ls):
    ec = ec.split(',')
    sub_mw_ls = []
    for ec_n in ec:
        ec_n = ec_n.strip(',')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip('[[')
        ec_n = ec_n.strip(']')
        ec_n = ec_n.strip(']]')
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        ec_n = ec_n.strip("'")
        ec_n = ec_n.strip(' ')
        ec_n = ec_n.strip('[')
        if "-" in ec_n:
            pass
        else:
            try:
                from zeep import Client
                import hashlib
                
                res_mw = 0
                wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
                password = hashlib.sha256("07042000Alex!".encode("utf-8")).hexdigest()
                client = Client(wsdl)
                parameters = ( "alexandreareias1718@gmail.com",password,f"ecNumber*{ec_n}","organism*Escherichia coli","molecularWeight*",
              "molecularWeightMaximum*","commentary*","literature*" )
                resultString = client.service.getMolecularWeight(*parameters)
                for i in range(len(resultString)):
                    res_mw = res_mw + int(resultString[i]['molecularWeight'])
                res_mw = res_mw/len(resultString)
                sub_mw_ls.append(res_mw)
            except:
                sub_mw_ls.append(None)                  
    mw_ls.append(sub_mw_ls)

  0%|█▎                                                                                                                                                                                                                                                                                                                                    | 8/1951 [00:12<56:36,  1.75s/it]

In [20]:
print(mw_ls)

[[141357.0], [None, None], [None, None], [None, None], [None, None], [None], [141357.0, None], [41851.25], [None, None, None, None, None, None, None], [None], [None], [None], [None, None, None, None, None, None, None], [None], [None], [None], [None, None, None, None], [None], [None], [None], [None, None, None, None], [None], [None], [None], [None], [None], [None, None, None, None, None], [None], [None], [None], [None, None, None, None, None, None], [None], [None], [None], [None, None, None], [None, None, None, None, None], [None], [None], [None], [None, None, None, None], [None], [None], [None], [None, None, None, None], [None], [None], [None], [None, None, None, None], [None], [None], [None], [None], [None], [None, None, None, None], [None], [None], [None], [None, None, None, None], [None], [None], [None], [None, None, None, None], [41738.769230769234], [41738.769230769234], [41738.769230769234], [None, None, None, None], [41738.769230769234], [41738.769230769234], [41738.769230769234

In [34]:
ec_nona['Molecular Weight'] = mw_ls
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),AA Sequence,Molecular Weight
8,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"[23dhmp[c], nadp[c]]","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",1.1.1.86,[2.0091262401360535],[None],[MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVGC...,[141357.0]
9,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"[23cump[c], h2o[c]]","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...,"[None, None]"
10,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"[23ccmp[c], h2o[c]]","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...,"[None, None]"
11,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"[23camp[c], h2o[c]]","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...,"[None, None]"
12,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"[23cgmp[c], h2o[c]]","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",[MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKDT...,"[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2421,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"[atp[c], ptth[c]]","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]","[0.8912105263157896, None]","[None, None]",[MKIELTVNGLNVQAQYHDEEIERVHKPLLRMLAALQTVNPQRRTV...,"[36100.0, None]"
2422,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"[4ppan[c], ctp[c], cys_L[c]]","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]","[1.1400000000000001, None, 1.1400000000000001]","[None, None, None]",[MSLAGKKIVLGVSGGIAAYKTPELVRRLRDRGADVRVAMTEAAKA...,"[37700.0, None]"
2423,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"[ahdt[c], h2o[c]]","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",3.5.4.16,[0.009113],[None],[VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMR...,[125625.125]
2424,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"[HC01651[c], h2o[c]]",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,3.5.4.16,[0.009113],[None],[VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSMR...,[125625.125]


In [50]:
ec_nona.to_csv(f'../data/ec_nona_{model.id}.csv',na_rep='None')

In [6]:
ec_nona = pd.read_csv(f'../data/ec_nona_{model.id}.csv')
ec_nona = ec_nona.iloc[:,1:]
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrates,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),AA Sequence,Molecular Weight
0,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"['23dhmp[c]', 'nadp[c]']","['(R)-2,3-Dihydroxy-3-methylpentanoate', 'Nico...","['CCC(C)(C(C(=O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(...",1.1.1.86,[2.0091262401360535],[None],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
1,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"['23cump[c]', 'h2o[c]']","['2,3-cyclic UMP(1-)', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
2,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"['23ccmp[c]', 'h2o[c]']","['2,3-Cyclic CMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
3,23PDE7,"2,3-Cyclic AMP 3-nucleotidohydrolase",rxn02521,MNXR36,,R03537,23CN2P1; 23PDE7pp,"['23camp[c]', 'h2o[c]']","['2,3-Cyclic AMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
4,23PDE9,"2,3-Cyclic GMP 3-nucleotidohydrolase",rxn03483,MNXR37,,R05135,23CN2P4; 23PDE9pp,"['23cgmp[c]', 'h2o[c]']","['2,3-Cyclic GMP', 'Water']","[None, 'O']","[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,r0578,ATP:Pantothenate 4-Phosphotransferase,rxn02128,MNXR5843,,R02971,PTHKr,"['atp[c]', 'ptth[c]']","['ATP', 'pantetheine']",['C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O...,"[['2.7.1.33', '2.7.1.34']]","[0.8912105263157896, None]","[None, None]",['MKIELTVNGLNVQAQYHDEEIERVHKPLLRMLAALQTVNPQRRT...,"[36100.0, None]"
919,r0671,(R)-4-Phosphopantothenate:L-Cysteine Ligase Ec...,rxn09177,MNXR4400,,R04231,PPNCL2; U92,"['4ppan[c]', 'ctp[c]', 'cys_L[c]']","['D-4-Phosphopantothenate', 'CTP', 'L-cysteine']","[None, 'C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)...","[['6.3.2.5'], None]","[1.1400000000000001, None, 1.1400000000000001]","[None, None, None]",['MSLAGKKIVLGVSGGIAAYKTPELVRRLRDRGADVRVAMTEAAK...,"[37700.0, None]"
920,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"['ahdt[c]', 'h2o[c]']","['7,8-dihydroneopterin 3-triphosphate(4-)', 'W...","[None, 'O']",3.5.4.16,[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
921,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,"['HC01651[c]', 'h2o[c]']",['Formamidopyrimidine nucleoside triphosphate'...,['C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP...,3.5.4.16,[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]


In [7]:
from ast import literal_eval

#ec_nona['Substrates'] = literal_eval(ec_nona['Substrates']) #convert to list type
ec_nona['Substrate Name'] = ec_nona['Substrate Name'].apply(literal_eval) #convert to list type
ec_nona['Substrate SMILE'] = ec_nona['Substrate SMILE'].apply(literal_eval) #convert to list type
ec_nona = ec_nona.explode(['Substrate Name','Substrate SMILE']).reset_index(drop=True)

In [8]:
ec_nona = ec_nona.drop(columns=['Substrates'])
ec_nona

Unnamed: 0,Reaction,Name,ModelSEED_id,MetaNetX,KEGG_id,KEGG_id2,BIGG_id,Substrate Name,Substrate SMILE,ecNumber,Avg Kcat (by ec),Avg Kcat (by ec and species),AA Sequence,Molecular Weight
0,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,"(R)-2,3-Dihydroxy-3-methylpentanoate",CCC(C)(C(C(=O)O)O)O,1.1.1.86,[2.0091262401360535],[None],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
1,23DHMPO,"(R)-2,3-Dihydroxy-3-methylpentanoate:NADP+ oxi...",rxn03435,MNXR83171,,R05068,,Nicotinamide adenine dinucleotide phosphate,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O...,1.1.1.86,[2.0091262401360535],[None],['MANYFNTLNLRQQLAQLGKCRFMGRDEFADGASYLQGKKVVIVG...,[141357.0]
2,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,"2,3-cyclic UMP(1-)",,"[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
3,23PDE2,"2,3-Cyclic UMP 3-nucleotidohydrolase",rxn02522,MNXR34,,R03538,23CN2P2; 23PDE2pp,Water,O,"[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
4,23PDE4,"2,3-Cyclic CMP 3-nucleotidohydrolase",rxn02762,MNXR35,,R03929,23CN2P3; 23PDE4pp,"2,3-Cyclic CMP",,"[['3.1.4.16'], ['3.1.4.16']]","[8.665329411764708, 8.665329411764708]","[None, None]",['MIKFSATLLATLIAASVNAATVDLRIMETTDLHSNMMDFDYYKD...,"[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1946,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,"7,8-dihydroneopterin 3-triphosphate(4-)",,3.5.4.16,[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1947,r0708,"2-Amino-4-Hydroxy-6- (Erythro-1, 2, 3-Trihydro...",rxn03174,MNXR74304,,R04639,,Water,O,3.5.4.16,[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1948,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,Formamidopyrimidine nucleoside triphosphate,C(C1C(C(C(O1)NC2=C(C(=O)NC(=N2)N)NC=O)O)O)OP(=...,3.5.4.16,[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]
1949,r0775,"Formamidopyrimidine Nucleoside Triphosphate 7,...",rxn03419,MNXR85297,,R05046,,Water,O,3.5.4.16,[0.009113],[None],['VEVYARRPQVQERLTQQIADALVEYAGARGVIVVTECGHLCMSM...,[125625.125]


In [47]:
print(get_smiles("2',3'-Cyclic CMP"))

C1=CN(C(=O)N=C1N)C2C3C(C(O2)CO)OP(=O)(O3)[O-]


## DLKcat - Kcat prediction