# Imports: 

In [1]:
import pandas as pd
import rdflib
import os

In [2]:
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import FOAF, XSD

# Functions: 

In [3]:
def get_chebi_side_rxn(rxn_id_side):
    """Collect the compound identifiers found on one side of a Rhea reaction.

    Walks the module-level graph ``g``: reaction side -> participants
    (``contains_pred``) -> compounds (``compound_pred``) -> accessions
    (``accession_pred``).

    Parameters
    ----------
    rxn_id_side : rdflib.term.URIRef
        Node of the reaction side to inspect; callers in this notebook build
        it as ``URIRef(rh + RHEA_id + '_L')`` or ``... + '_R'``.

    Returns
    -------
    list of str
        For an accession whose prefix is not ``GENERIC`` (e.g. ``CHEBI:15377``
        or ``POLYMER:9859``), the accession with ``:`` replaced by ``_``.
        For a ``GENERIC`` accession, the last path segment of every ``chebi``
        URI attached to the compound's reactive parts (a warning is printed).
    """
    # Reaction side -> participant nodes -> compound nodes.
    participants = list(g.objects(rxn_id_side, contains_pred))
    compounds = [
        compound_node
        for participant in participants
        for compound_node in g.objects(participant, compound_pred)
    ]

    chebi_ids = []
    for compound in compounds:
        for accession in g.objects(compound, accession_pred):
            accession_text = str(accession)

            # Ordinary compound: the accession itself is the identifier.
            if accession_text.split(':')[0] != 'GENERIC':
                chebi_ids.append(accession_text.replace(':', '_'))
                continue

            # Generic compound: the identifiers live on its reactive parts.
            print('WARNING: generic compound')
            reactive_parts = list(g.objects(compound, reactive_part_pred))
            for reactive_part in reactive_parts:
                for chebi_uri in g.objects(reactive_part, chebi_pred):
                    chebi_ids.append(str(chebi_uri).split('/')[-1])

    return chebi_ids

In [4]:
def get_chebi_from_rhea_ID(RHEA_id):
    """Return the compound identifiers on the left and right side of a Rhea reaction.

    Parameters
    ----------
    RHEA_id : str or int
        Rhea reaction identifier, e.g. ``'21528'``. It is converted with
        ``str()`` so the integer IDs stored in the dataframes can be passed
        directly.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(left, right)`` identifier lists as produced by
        ``get_chebi_side_rxn`` for the ``<id>_L`` and ``<id>_R`` side nodes.
    """
    # Coerce once: concatenating `rh` with an int would raise TypeError.
    rhea_id_text = str(RHEA_id)

    # Side nodes are named '<base><id>_L' and '<base><id>_R'.
    rxn_id_L = URIRef(rh + rhea_id_text + '_L')
    rxn_id_R = URIRef(rh + rhea_id_text + '_R')

    list_chebi_L = get_chebi_side_rxn(rxn_id_L)
    list_chebi_R = get_chebi_side_rxn(rxn_id_R)

    return list_chebi_L, list_chebi_R

### Unit test: 

Testing out polymer issue: 

In [None]:
# Reaction used to probe the polymer case.
RHEA_id = '21528'
get_chebi_from_rhea_ID(RHEA_id)

In [None]:
# Build the left/right side nodes of the test reaction, then resolve the left side.
rxn_id_L, rxn_id_R = [URIRef(rh + RHEA_id + side_suffix) for side_suffix in ('_L', '_R')]
list_chebi_L = get_chebi_side_rxn(rxn_id_L)
list_chebi_L

In [74]:
# Participants attached to the left side of the test reaction.
list_part_comps = list(g.objects(rxn_id_L, contains_pred))
list_part_comps

[rdflib.term.URIRef('http://rdf.rhea-db.org/Participant_21528_compound_9859'),
 rdflib.term.URIRef('http://rdf.rhea-db.org/Participant_21528_compound_1283')]

In [78]:
# Compound node(s) behind each participant, in participant order.
list_comps = [
    compound_node
    for participant in list_part_comps
    for compound_node in g.objects(participant, compound_pred)
]
list_comps

[rdflib.term.URIRef('http://rdf.rhea-db.org/Compound_9859'),
 rdflib.term.URIRef('http://rdf.rhea-db.org/Compound_1283')]

In [80]:
# Accession(s) of the first compound on the left side.
comp = list_comps[0]
list_test = list(g.objects(comp, accession_pred))

In [81]:
list_test

[rdflib.term.Literal('POLYMER:9859')]

# Parameters: 

In [23]:
# Base IRI of the Rhea RDF vocabulary; every predicate below is `rh` + a local name.
rh = 'http://rdf.rhea-db.org/'

# predicates (the unpacking order matches the local-name tuple underneath):
(substrates_pred,
 contains_pred,
 contains1_pred,
 accession_pred,
 compound_pred,
 side_pred,
 directional_pred,
 reactive_part_pred,
 chebi_pred) = [
    URIRef(rh + local_name)
    for local_name in (
        'substrates',
        'contains',
        'contains1',
        'accession',
        'compound',
        'side',
        'directionalReaction',
        'reactivePart',
        'chebi',
    )
]

In [24]:
# Cofactor / currency-metabolite IDs, written in the underscore form used in the 'ChEBI' column.
# CHEBI_16526 = CO2; CHEBI_15378 = H+ (its SMILES is [H+] in the final table).
# The other four are presumably the NAD(H)/NADP(H) species named in the "OTHER"
# section heading - TODO confirm which ID is which against ChEBI.
cofactor_IDS = ['CHEBI_57945', 'CHEBI_15378', 'CHEBI_57540', 'CHEBI_58349', 'CHEBI_57783', 'CHEBI_16526']

# Read in files: 

#### Rhea database: 

In [9]:
# Directory holding the local RDF dumps ('rhea.rdf' here, 'chebi.owl' further down).
# Set the RHEA_DATA_DIR environment variable to run this on another machine; the
# original location stays the fallback, so existing runs behave exactly as before.
path = os.environ.get(
    'RHEA_DATA_DIR',
    '/home/ajinich/Dropbox/KyuRhee/unknown_function/unknown_redox/data/mohammed/rhea_test/',
)
file_in = os.path.join(path, 'rhea.rdf')

# create a Graph and load the Rhea dump into it (slow: the file holds >1M triples)
g = rdflib.Graph()
result = g.parse(file_in)

print(len(g))

1327346


In [11]:
print(len(g))

1327346


#### ChEBI-to-SMILES database: 

In [12]:
# The ChEBI ontology lives next to the Rhea dump, in the same `path` directory.
file_chebi = os.path.join(path, 'chebi.owl')
# create a Graph: kept separate from the Rhea graph `g`; used for the SMILES lookups
g_chebi = rdflib.Graph()
result_chebi = g_chebi.parse(file_chebi)

# Number of triples loaded.
len(g_chebi)

2340334

In [12]:
len(g_chebi)

2340334

#### UniProt-to-Rhea dataset:
This was generated in the notebook "EDA_rhea.ipynb".

This dataframe holds the UniProt IDs and the corresponding Rhea IDs for Mtb proteins.

In [16]:
# NOTE: `file_in` is reused here; it previously held the path to 'rhea.rdf'.
file_in = '../data/up_to_rhea.csv'
# UniProt entry -> Rhea reaction ID table (columns shown below: Entry, RheaID).
df_UP_rhea = pd.read_csv(file_in)
df_UP_rhea.head(2)

Unnamed: 0,Entry,RheaID
0,P9WJM5,14217
1,P9WJM5,19993


# Part I: Mapping Rhea-to-ChEBI: 

##### Get unique Rhea reaction IDs:

In [14]:
# Distinct Rhea reaction IDs, in order of first appearance.
list_rhea_IDs = df_UP_rhea['RheaID'].drop_duplicates().tolist()
len(list_rhea_IDs)

1217

##### Map Rhea reaction IDs to ChEBI substrate/product IDs:

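[QUESTION]: What does the "generic compound" WARNING mean? It is printed by `get_chebi_side_rxn` whenever a compound's accession starts with `GENERIC` (e.g. `GENERIC:14428`); in that case the ChEBI IDs are taken from the compound's reactive parts instead (see the "Dig a bit into this" section below).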

In [17]:
rh

'http://rdf.rhea-db.org/'

In [None]:
# Three parallel lists, one entry per (reaction, side, compound) row.
list_chebi_LR_ALL = []
list_rhea_IDs_ALL = []
list_type_ALL = []

for RHEA_id in list_rhea_IDs:
    # Left and right compounds of this Rhea reaction.
    raw_L, raw_R = get_chebi_from_rhea_ID(str(RHEA_id))

    # In this version, we don't discard the cofactors --> this might be important
    # information to "learn" from the protein sequences.
    list_chebi_L = [str(chebi).replace(':', '_') for chebi in raw_L]
    list_chebi_R = [str(chebi).replace(':', '_') for chebi in raw_R]

    # Compounds: left side first, then right side.
    list_chebi_LR_ALL.extend(list_chebi_L)
    list_chebi_LR_ALL.extend(list_chebi_R)

    # Reaction ID repeated once per compound.
    n_compounds = len(list_chebi_L) + len(list_chebi_R)
    list_rhea_IDs_ALL.extend([RHEA_id] * n_compounds)

    # Side labels in the same left-then-right order.
    list_type_ALL.extend(['LEFT'] * len(list_chebi_L))
    list_type_ALL.extend(['RIGHT'] * len(list_chebi_R))


Store in dataframe: 

In [19]:
# One row per (reaction, side, compound); columns come from the three parallel lists.
df_Rhea_ChEBI = pd.DataFrame({
    'RheaID': list_rhea_IDs_ALL,
    'type': list_type_ALL,
    'ChEBI': list_chebi_LR_ALL,
})

In [41]:
df_Rhea_ChEBI.shape

(4837, 3)

In [37]:
df_Rhea_ChEBI.head()

Unnamed: 0,RheaID,type,ChEBI
0,14217,LEFT,CHEBI_50058
1,14217,LEFT,CHEBI_15377
2,14217,LEFT,CHEBI_16044
3,14217,RIGHT,CHEBI_29950
4,14217,RIGHT,CHEBI_29950


In [91]:
df_Rhea_ChEBI[df_Rhea_ChEBI.ChEBI.str.contains('POLYMER')].ChEBI.unique()

array(['POLYMER_9859', 'POLYMER_14279', 'POLYMER_11133', 'POLYMER_11132',
       'POLYMER_11129', 'POLYMER_11128', 'POLYMER_14280', 'POLYMER_9527',
       'POLYMER_9517', 'POLYMER_9537', 'POLYMER_9539', 'POLYMER_9565',
       'POLYMER_9566', 'POLYMER_9602', 'POLYMER_9603', 'POLYMER_9584',
       'POLYMER_9586', 'POLYMER_11130', 'POLYMER_11131', 'POLYMER_12939',
       'POLYMER_14378', 'POLYMER_12940', 'POLYMER_13431', 'POLYMER_9587',
       'POLYMER_14738', 'POLYMER_14740', 'POLYMER_9564', 'POLYMER_9563',
       'POLYMER_10183', 'POLYMER_14708', 'POLYMER_14709'], dtype=object)

In [38]:
df_UP_rhea.head()

Unnamed: 0,Entry,RheaID
0,P9WJM5,14217
1,P9WJM5,19993
2,P9WHM9,17453
3,P9WGE9,20696
4,P9WG79,24840


#### Here we load the manually edited dataframe (which didn't really fix anything)

In [15]:
# NOTE: this rebinds `df_Rhea_ChEBI`, replacing the frame built from the RDF graph
# above with the manually edited spreadsheet; every later cell uses this version.
fn = '../data/tests/df_Rhea_CheBI_ManuallyFIxed.xlsx'
df_Rhea_ChEBI = pd.read_excel(fn)
df_Rhea_ChEBI.head()

Unnamed: 0,RheaID,type,ChEBI
0,14217,LEFT,CHEBI_50058
1,14217,LEFT,CHEBI_15377
2,14217,LEFT,CHEBI_16044
3,14217,RIGHT,CHEBI_29950
4,14217,RIGHT,CHEBI_29950


#### Merging the two dataframes? 
1. The UniProt to RheaID mappings ("df_UP_rhea"), with
2. The RheaID to ChEBI compound ID mappings: 

In [17]:
# Attach every (side, compound) row of a reaction to each UniProt entry catalysing it.
df_UP_rhea_chebi = pd.merge(df_UP_rhea, df_Rhea_ChEBI, how='inner', on='RheaID')
df_UP_rhea_chebi.shape

(7365, 4)

## Get smiles string for every ChEBI-ID

#### Quick test for a single ChEBI-ID

In [18]:
df_UP_rhea_chebi.RheaID.value_counts().head(5)

13065    90
57888    90
47348    85
10868    76
47356    70
Name: RheaID, dtype: int64

### ChEBI compound ID's to SMILES: 

#### Now do it for all of the ChEBI IDs:

In [25]:
# Base IRI prepended to a ChEBI ID (e.g. 'CHEBI_15377') to form the subject
# node that is looked up in `g_chebi`.
uri = 'http://purl.obolibrary.org/obo/'

# Predicate under which `g_chebi` stores the SMILES string of a compound.
smiles_pred = URIRef(uri+'chebi/smiles')

In [26]:
# Distinct compound IDs to look up, in order of first appearance.
list_chebi = df_UP_rhea_chebi['ChEBI'].drop_duplicates().tolist()
len(list_chebi)

1163

In [27]:
# Look up exactly ONE SMILES string per ChEBI ID so that `list_smiles_ALL` stays
# index-aligned with `list_chebi` (the two lists become dataframe columns below).
# Previously every SMILES of an ID was appended, so a single ID with several
# SMILES would shift all following entries and break the column assignment.
list_smiles_ALL = []
for chebi_id in list_chebi:

    chebi_id_uri = URIRef(uri + chebi_id)

    # Sorted so that the choice made for multi-SMILES IDs is reproducible
    # regardless of the order in which the graph yields its triples.
    list_smiles = sorted(str(obj) for obj in g_chebi.objects(chebi_id_uri, smiles_pred))

    if len(list_smiles) == 0:
        print(chebi_id, 'zero smiles')
        # Placeholder string kept as-is so downstream output is unchanged.
        list_smiles.append('NaN')
    if len(list_smiles) > 1:
        print(chebi_id, 'more than 1 smiles')

    # Keep a single entry per ChEBI ID.
    list_smiles_ALL.append(list_smiles[0])
        

CHEBI_16838 zero smiles
CHEBI_83401 zero smiles
CHEBI_83400 zero smiles
CHEBI_58211 zero smiles
CHEBI_57683 zero smiles
CHEBI_84139 zero smiles
CHEBI_16374 zero smiles
CHEBI_18151 zero smiles
CHEBI_16389 zero smiles
CHEBI_17976 zero smiles
CHEBI_78435 zero smiles
CHEBI_15444 zero smiles
CHEBI_83828 zero smiles
CHEBI_133980 zero smiles
CHEBI_139511 zero smiles
CHEBI_136960 zero smiles
CHEBI_141005 zero smiles
CHEBI_58914 zero smiles
CHEBI_55437 zero smiles
CHEBI_140774 zero smiles
CHEBI_140775 zero smiles


In [28]:
# ChEBI ID -> SMILES lookup table built from the two index-aligned lists.
df_chebi_smiles = pd.DataFrame({
    'ChEBI': list_chebi,
    'smiles': list_smiles_ALL,
})
df_chebi_smiles.head(2)

Unnamed: 0,ChEBI,smiles
0,CHEBI_50058,C([C@@H](N*)CSSC[C@@H](C(=O)*)N*)(=O)*
1,CHEBI_15377,[H]O[H]


In [30]:
df_chebi_smiles[df_chebi_smiles.ChEBI.str.contains('POLYMER')].shape

(0, 2)

#### One final merge: 

In [34]:
# Add the SMILES column to the UniProt -> Rhea -> ChEBI table.
df_UP_rhea_chebi_smiles = pd.merge(df_UP_rhea_chebi, df_chebi_smiles, how='inner', on='ChEBI')

In [35]:
# Order rows by protein, reaction and side, then renumber the index from 0.
df_UP_rhea_chebi_smiles = (
    df_UP_rhea_chebi_smiles
    .sort_values(by=['Entry', 'RheaID', 'type'])
    .reset_index(drop=True)
)

In [36]:
df_UP_rhea_chebi_smiles.shape

(7365, 5)

In [37]:
df_UP_rhea_chebi_smiles.head(3)

Unnamed: 0,Entry,RheaID,type,ChEBI,smiles
0,A0A089QRB9,50344,LEFT,CHEBI_15378,[H+]
1,A0A089QRB9,50344,LEFT,CHEBI_57327,C[C@@H](C([O-])=O)C(=O)SCCNC(=O)CCNC(=O)[C@H](...
2,A0A089QRB9,50344,LEFT,CHEBI_57783,NC(=O)C1=CN(C=CC1)[C@@H]1O[C@H](COP([O-])(=O)O...


In [38]:
# Persist the final UniProt -> Rhea -> ChEBI -> SMILES table.
# NOTE: `fn` is reused; it previously held the path of the manually edited spreadsheet.
fn = '../data/tests/df_UP_rhea_chebi_smiles.csv'
# index = False keeps the positional row index out of the CSV.
df_UP_rhea_chebi_smiles.to_csv(fn, index = False)

__________________
______________
______________
# OTHER

### Same database above, but discarding NAD(H), NADP(H), CO2, H+, (others?)

In [None]:
# Same six IDs as the `cofactor_IDS` list in the "Parameters" section, re-declared here.
cofactor_IDS = ['CHEBI_57945', 'CHEBI_15378', 'CHEBI_57540', 'CHEBI_58349', 'CHEBI_57783', 'CHEBI_16526']

In [473]:
# Rows whose compound is one of the cofactors are dropped; the index is renumbered from 0.
is_cofactor = df_45_l_rhea_chebi_smiles.ChEBI.isin(cofactor_IDS)
df_45_l_rhea_chebi_smiles_noCF = df_45_l_rhea_chebi_smiles[~is_cofactor].reset_index(drop=True)

In [482]:
len(df_45_l_rhea_chebi_smiles_noCF['ChEBI'].unique().tolist())

425

In [480]:
len(df_45_l_rhea_chebi_smiles_noCF['smiles'].unique().tolist())

421

In [475]:
df_45_l_rhea_chebi_smiles_noCF.head(10)

Unnamed: 0,Entry,RheaID,type,ChEBI,smiles
0,A0A024R7X6,25033,LEFT,CHEBI_17336,CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/CO
1,A0A024R7X6,25033,RIGHT,CHEBI_17898,[H]C(=O)\C=C(C)\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C
2,A0A075TRB3,62216,LEFT,CHEBI_145109,C1(C(=C[C@@H]([C@@H]2[C@H]1O2)O)CO)=O
3,A0A075TRB3,62216,RIGHT,CHEBI_145110,C1(C(=CC([C@@H]2[C@H]1O2)=O)CO)=O
4,A0A096NEU8,25033,LEFT,CHEBI_17336,CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/CO
5,A0A096NEU8,25033,RIGHT,CHEBI_17898,[H]C(=O)\C=C(C)\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C
6,A0A0D9RMB0,25033,LEFT,CHEBI_17336,CC(\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C)=C/CO
7,A0A0D9RMB0,25033,RIGHT,CHEBI_17898,[H]C(=O)\C=C(C)\C=C\C=C(C)\C=C\C1=C(C)CCCC1(C)C
8,A0A0L8VV04,48680,LEFT,CHEBI_85440,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...
9,A0A0L8VV04,48680,RIGHT,CHEBI_90725,[C@@H]1(N2C3=C(C(=NC=N3)N)N=C2)O[C@H](COP(OP(O...


## Exercise 1: NAD vs. NADP:

## Exercise 2: Reactions involving CO2

# Dig a bit into this: 

You might need to discard some of these (decide on a case-by-case basis):

In [310]:
df_Rhea_ChEBI.RheaID.value_counts().head(7)

19397    6
15805    5
38575    4
12044    4
12456    4
18833    3
14981    2
Name: RheaID, dtype: int64

In [311]:
df_45_l_rhea[df_45_l_rhea.RheaID.values == '12044']

Unnamed: 0,Entry,RheaID
241,Q09851,12044
1098,P40471,12044


What exactly do the `GENERIC` accessions shown below mean, and what are the consequences for the Rhea-to-ChEBI mapping?

In [315]:
df_Rhea_ChEBI[df_Rhea_ChEBI.ChEBI.str.contains('GENERIC')].head(5)

Unnamed: 0,RheaID,type,ChEBI
10,48488,LEFT,GENERIC_14428
11,48488,RIGHT,GENERIC_14430
187,17397,LEFT,GENERIC_9945
188,17397,RIGHT,GENERIC_9916
316,22564,LEFT,GENERIC_9926


In [325]:
RHEA_id = '48488'

In [326]:
# Compare as strings: `RHEA_id` is a str, while the RheaID column holds integers
# when it comes from the CSV/Excel files, and `int_array == 'str'` does not give
# a usable row mask.
df_Rhea_ChEBI[df_Rhea_ChEBI.RheaID.astype(str) == str(RHEA_id)]

Unnamed: 0,RheaID,type,ChEBI
10,48488,LEFT,GENERIC_14428
11,48488,RIGHT,GENERIC_14430


In [320]:
# `rid` was never defined in this notebook; the reaction under inspection is `RHEA_id`.
# The comparison is done on strings so it works whether RheaID is stored as int or str.
df_45_l_rhea[df_45_l_rhea.RheaID.astype(str) == str(RHEA_id)].head(2)

Unnamed: 0,Entry,RheaID
7,O54753,48488
37,O54909,48488


In [321]:
# Reaction node plus its left ('_L') and right ('_R') side nodes.
rxn_id, rxn_id_L, rxn_id_R = [
    URIRef(rh + RHEA_id + node_suffix) for node_suffix in ('', '_L', '_R')
]

In [368]:
# Same two predicates as in the "Parameters" section, re-declared here:
# compound -> reactive part, and reactive part -> ChEBI node.
reactive_part_pred = URIRef(rh+'reactivePart')
chebi_pred = URIRef(rh+'chebi')

In [385]:
rxn_id_side = rxn_id_R

In [386]:
# Step-by-step version of get_chebi_side_rxn for `rxn_id_side`, keeping the
# intermediate lists so the following cells can display them.
list_part_comps = list(g.objects(rxn_id_side, contains_pred))

list_comps = [
    compound_node
    for participant in list_part_comps
    for compound_node in g.objects(participant, compound_pred)
]

list_chebi = []
# `comp` is deliberately a top-level loop variable: later cells read it.
for comp in list_comps:
    for accession in g.objects(comp, accession_pred):
        accession_text = str(accession)

        # Ordinary compound: the accession itself is the identifier.
        if accession_text.split(':')[0] != 'GENERIC':
            list_chebi.append(accession_text.replace(':', '_'))
            continue

        # Generic compound: take the ChEBI IDs of its reactive parts instead.
        print('WARNING: generic compound')
        list_reactive = list(g.objects(comp, reactive_part_pred))
        for reactive in list_reactive:
            for chebi_uri in g.objects(reactive, chebi_pred):
                list_chebi.append(str(chebi_uri).split('/')[-1])



In [387]:
list_part_comps

[rdflib.term.URIRef('http://rdf.rhea-db.org/Participant_48488_compound_3249'),
 rdflib.term.URIRef('http://rdf.rhea-db.org/Participant_48488_compound_1650'),
 rdflib.term.URIRef('http://rdf.rhea-db.org/Participant_48488_compound_14430')]

In [388]:
list_comps

[rdflib.term.URIRef('http://rdf.rhea-db.org/Compound_3249'),
 rdflib.term.URIRef('http://rdf.rhea-db.org/Compound_1650'),
 rdflib.term.URIRef('http://rdf.rhea-db.org/Compound_14430')]

In [389]:
list_chebi

['CHEBI_15378', 'CHEBI_57945', 'CHEBI_83228', 'CHEBI_17898']

In [357]:
for pred_obj in g.predicate_objects(list_comps[1]):
    print(pred_obj)

(rdflib.term.URIRef('http://rdf.rhea-db.org/id'), rdflib.term.Literal('14428', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#long')))
(rdflib.term.URIRef('http://rdf.rhea-db.org/reactivePart'), rdflib.term.URIRef('http://rdf.rhea-db.org/Compound_14428_rp2'))
(rdflib.term.URIRef('http://rdf.rhea-db.org/htmlName'), rdflib.term.Literal('<i>all</i>-<i>trans</i>-retinol--[retinol-binding protein]'))
(rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'), rdflib.term.URIRef('http://rdf.rhea-db.org/GenericPolypeptide'))
(rdflib.term.URIRef('http://rdf.rhea-db.org/accession'), rdflib.term.Literal('GENERIC:14428'))
(rdflib.term.URIRef('http://rdf.rhea-db.org/reactivePart'), rdflib.term.URIRef('http://rdf.rhea-db.org/Compound_14428_rp1'))
(rdflib.term.URIRef('http://rdf.rhea-db.org/name'), rdflib.term.Literal('all-trans-retinol--[retinol-binding protein]'))


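These reactive parts are what you ultimately need to fetch when a compound's accession only returns a `GENERIC:...` ID; each reactive part carries the actual ChEBI ID (see its `chebi` predicate in the output further below):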

In [349]:
for obj in g.objects(comp, reactive_part_pred):
    print(obj)

http://rdf.rhea-db.org/Compound_14428_rp2
http://rdf.rhea-db.org/Compound_14428_rp1


In [363]:
cid = URIRef('http://rdf.rhea-db.org/Compound_14428_rp1')
for pred_obj in g.predicate_objects(cid):
    print(pred_obj)

(rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'), rdflib.term.URIRef('http://rdf.rhea-db.org/ReactivePart'))
(rdflib.term.URIRef('http://rdf.rhea-db.org/name'), rdflib.term.Literal('all-trans-retinol'))
(rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_17336'))
(rdflib.term.URIRef('http://rdf.rhea-db.org/formula'), rdflib.term.Literal('C20H30O'))
(rdflib.term.URIRef('http://rdf.rhea-db.org/htmlName'), rdflib.term.Literal('<i>all</i>-<i>trans</i>-retinol'))
(rdflib.term.URIRef('http://rdf.rhea-db.org/charge'), rdflib.term.Literal('0'))
(rdflib.term.URIRef('http://rdf.rhea-db.org/chebi'), rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_17336'))


# Misc: 

In [None]:
for pred_obj in g_chebi.predicate_objects(chebi_id_uri):
    print(pred_obj)