In [1]:
from bio2bel_sider.parser import get_indications_df, get_drug_names_df
from bio2bel_sider.utils import (
    convert_flat_stitch_id_to_pubchem_cid, 
    enrich_pubchem_synonyms, 
    get_chembl,
)
import pandas as pd
import os
from tqdm import tqdm_notebook as tqdm

In [2]:
# Load the two SIDER tables: drug indications and drug names.
indications_df = get_indications_df()
drug_names_df = get_drug_names_df()

# Attach the drug name to every indication row by joining on the shared
# STITCH flat identifier (default inner join).
df = indications_df.merge(drug_names_df, on='STITCH_FLAT_ID')
df.head()

Unnamed: 0,STITCH_FLAT_ID,UMLS CUI from Label,Method of Detection,Concept Name,MedDRA Concept Type,UMLS CUI from MedDRA,MedDRA Concept name,Drug Name
0,CID100000085,C0015544,text_mention,Failure to Thrive,LLT,C0015544,Failure to thrive,carnitine
1,CID100000085,C0015544,text_mention,Failure to Thrive,PT,C0015544,Failure to thrive,carnitine
2,CID100000085,C0020615,text_mention,Hypoglycemia,LLT,C0020615,Hypoglycaemia,carnitine
3,CID100000085,C0020615,text_mention,Hypoglycemia,PT,C0020615,Hypoglycaemia,carnitine
4,CID100000085,C0022661,NLP_indication,"Kidney Failure, Chronic",LLT,C0022661,Renal failure chronic,carnitine


In [3]:
# Derive a PubChem compound identifier for every row from its flat STITCH
# identifier and store it in the ``pubchem_cid`` column of ``df``.
stitch_flat_ids = df['STITCH_FLAT_ID']
df['pubchem_cid'] = stitch_flat_ids.map(convert_flat_stitch_id_to_pubchem_cid)
df.head()

Unnamed: 0,STITCH_FLAT_ID,UMLS CUI from Label,Method of Detection,Concept Name,MedDRA Concept Type,UMLS CUI from MedDRA,MedDRA Concept name,Drug Name,pubchem_cid
0,CID100000085,C0015544,text_mention,Failure to Thrive,LLT,C0015544,Failure to thrive,carnitine,85
1,CID100000085,C0015544,text_mention,Failure to Thrive,PT,C0015544,Failure to thrive,carnitine,85
2,CID100000085,C0020615,text_mention,Hypoglycemia,LLT,C0020615,Hypoglycaemia,carnitine,85
3,CID100000085,C0020615,text_mention,Hypoglycemia,PT,C0020615,Hypoglycaemia,carnitine,85
4,CID100000085,C0022661,NLP_indication,"Kidney Failure, Chronic",LLT,C0022661,Renal failure chronic,carnitine,85


In [5]:
# Build (and incrementally extend) a TSV cache that maps PubChem CIDs to
# ChEMBL identifiers. Each line of the file is "<pubchem_cid>\t<chembl_id>".
path = 'pubchem_cid_to_chembl.tsv'

# Load the mappings resolved by a previous run, if any. Keys and values are
# kept as the strings read from the file.
checked = {}
if os.path.exists(path):
    with open(path) as file:
        for line in file:
            # Strip the line terminator; otherwise it ends up inside the
            # ChEMBL ID and is written back as an extra blank line.
            line = line.rstrip('\r\n')
            cached_pubchem_cid, separator, cached_chembl_id = line.partition('\t')
            if not separator:
                # Blank or malformed line (no tab): skip instead of crashing.
                continue
            checked[cached_pubchem_cid] = cached_chembl_id

# CIDs for which the lookup succeeded but returned no ChEMBL identifier.
missing_pubchem_cids = set()
with open(path, 'w') as file:
    # The file is truncated on open, so re-emit everything already known.
    for pubchem_cid, chembl_id in checked.items():
        print(pubchem_cid, chembl_id, sep='\t', file=file)

    it = tqdm(df.pubchem_cid.unique())
    for pubchem_cid in it:
        # Cache keys are the string form that ``print`` wrote to the file,
        # so compare using ``str`` regardless of the column's dtype.
        if str(pubchem_cid) in checked:
            continue

        try:
            chembl_id = get_chembl(pubchem_cid)
        except Exception:
            # Best effort: report the failure and move on. The CID is not
            # cached, so it will be retried on the next run.
            it.write(f"exception for {pubchem_cid}")
            continue

        if chembl_id is None:
            it.write(f"can't find {pubchem_cid}")
            missing_pubchem_cids.add(pubchem_cid)
            continue

        print(pubchem_cid, chembl_id, sep='\t', file=file)
        checked[str(pubchem_cid)] = chembl_id

HBox(children=(IntProgress(value=0, max=1360), HTML(value='')))

can't find 143
can't find 158
can't find 159
can't find 206
can't find 214
can't find 232
can't find 271
can't find 699
can't find 772
can't find 813
can't find 888
can't find 923
can't find 937
can't find 1003
can't find 1071
can't find 1125
can't find 1206
can't find 1690
can't find 1798
can't find 1875
can't find 1971
can't find 2022
can't find 2085
can't find 2094
can't find 2133
can't find 2142
can't find 2156
can't find 2163
can't find 2168
can't find 2171
can't find 2173
can't find 2182
can't find 2232
can't find 2250
can't find 2269
can't find 2274
can't find 2308
can't find 2350
can't find 2443
can't find 2487
can't find 2522
can't find 2524
can't find 2559
can't find 2617
can't find 2622
can't find 2631
can't find 2637
can't find 2650
can't find 2654
can't find 2655
can't find 2656
can't find 2658
can't find 2675
can't find 2676
can't find 2713
can't find 2751
can't find 2767
can't find 2791
can't find 2792
can't find 2818
can't find 2881
can't find 2891
can't find 2909
can't

In [9]:
# Summarise how many of the distinct PubChem CIDs have no ChEMBL mapping.
n_missing = len(missing_pubchem_cids)
n_total = len(df.pubchem_cid.unique())
print(f'could not find {n_missing}/{n_total} ChEMBL mappings')

could not find 430/1360 ChEMBL mappings
