In [1]:
import pandas as pd
import requests 
from tqdm import tqdm
import os 
import torch 
import numpy as np 

from tkgdti.data.utils import get_smiles_inchikey

# Fix the RNG seeds (torch first, then numpy) so any stochastic step is repeatable.
# np.random.seed stays last: it returns None, so the cell displays no output.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# CTD: drug associates disease

NOTE: the CTD data used in this notebook is subject to the following terms of use:

```
# Use is subject to the terms set forth at http://ctdbase.org/about/legal.jsp
# These terms include:
#
#   1. All forms of publication (e.g., web sites, research papers, databases,
#      software applications, etc.) that use or rely on CTD data must cite CTD.
#      Citation guidelines: http://ctdbase.org/about/publications/#citing
#
#   2. All electronic or online applications must include hyperlinks from
#      contexts that use CTD data to the applicable CTD data pages.
#      Linking instructions: http://ctdbase.org/help/linking.jsp
#
#   3. You must notify CTD, and describe your use of our data:
#      http://ctdbase.org/help/contact.go
#
#   4. For quality control purposes, you must provide CTD with periodic
#      access to your publication of our data.
#
# More information: http://ctdbase.org/downloads/
```

In [2]:
# Load the CTD chemical-disease table.
# comment='#' drops everything from a '#' to the end of the line (presumably the
# '#'-prefixed license/header preamble of the CTD export - TODO confirm that no
# data field contains '#'), so the column names are supplied explicitly.
chemdrug = pd.read_csv(
    '../../data/tkg_raw/CTD_chemicals_diseases.csv',
    sep=',',
    comment='#',
    header=None,
    names=[
        'ChemicalName',
        'ChemicalID',
        'CasRN',
        'DiseaseName',
        'DiseaseID',
        'DirectEvidence',
        'InferenceGeneSymbol',
        'InferenceScore',
        'OmimIDs',
        'PubMedIDs',
    ],
    # Infer each column's dtype from the whole file instead of chunk by chunk;
    # chunked inference is what triggers pandas' mixed-type DtypeWarning.
    low_memory=False,
)
chemdrug.head()

  chemdrug = pd.read_csv('../../data/tkg_raw/CTD_chemicals_diseases.csv', sep=',', comment='#', header=None)


Unnamed: 0,ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs
0,06-Paris-LA-66 protocol,C046983,,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,therapeutic,,,,4519131
1,10074-G5,C534883,,Adenocarcinoma,MESH:D000230,,MYC,4.07,,26432044
2,10074-G5,C534883,,Adenocarcinoma of Lung,MESH:D000077192,,MYC,4.3,,26656844|27602772
3,10074-G5,C534883,,Alopecia,MESH:D000505,,AR,4.5,,15902657
4,10074-G5,C534883,,Androgen-Insensitivity Syndrome,MESH:D013734,,AR,6.87,300068|312300,1303262|8281139


In [3]:

# Build (or load from cache) the table of drugs present in both CTD and Targetome,
# matched on InChIKey. Result columns: Targetome_inhibitor, inchikey, CTD_ChemicalName.
overlap_path = '../../extdata/meta/ctd_targetome_drug_overlap.csv'

if os.path.exists(overlap_path):
    # Reuse the cached table; rebuilding it takes ~2 hours.
    ctd_overlap = pd.read_csv(overlap_path)
else:
    # takes ~2 hours: one get_smiles_inchikey call per unique CTD chemical name
    drug_names = chemdrug.ChemicalName.unique()
    records = []
    for drug in tqdm(drug_names):
        # The helper's result is unpacked as (can_smiles, iso_smiles, inchikey);
        # the middle value is not used here.
        can_smiles, _, inchikey = get_smiles_inchikey(drug)
        records.append({'drug': drug, 'can_smiles': can_smiles, 'inchikey': inchikey})
    results = pd.DataFrame(records, columns=['drug', 'can_smiles', 'inchikey'])

    # pandas merge matches null keys against each other, so rows without an
    # InChIKey must be removed on BOTH sides; otherwise every unresolved CTD
    # chemical would be paired with every Targetome drug lacking an InChIKey.
    # (Presumably get_smiles_inchikey yields a null InChIKey when a lookup
    # fails - TODO confirm against tkgdti.data.utils.)
    results = results.dropna(subset=['inchikey'])

    druginfo = pd.read_csv('../../extdata/meta/targetome__drug_targets_gene.csv')
    targetome_drugs = (
        druginfo[['inhibitor', 'inchikey']]
        .dropna(subset=['inchikey'])
        .drop_duplicates()
    )

    ctd_overlap = (
        targetome_drugs
        .merge(results, on='inchikey', how='inner')
        .rename(columns={'drug': 'CTD_ChemicalName', 'inhibitor': 'Targetome_inhibitor'})
        .drop(columns='can_smiles')
    )
    ctd_overlap.to_csv(overlap_path, index=False)

print('num matching drugs (targetome, ctd):', ctd_overlap.shape[0])

100%|██████████| 17564/17564 [2:02:02<00:00,  2.40it/s]  


num matching drugs (targetome, ctd): 65


In [4]:
# Keep only CTD rows whose chemical name was matched to a Targetome drug; the
# inner join also attaches the ctd_overlap columns (e.g. inchikey) to each row.
# NOTE: this rebinds `chemdrug`, so re-running this cell on its own merges the
# overlap columns a second time - re-run the load cell first.
chemdrug = chemdrug.merge(
    ctd_overlap,
    how='inner',
    left_on='ChemicalName',
    right_on='CTD_ChemicalName',
)
print('# of unique drugs:', chemdrug['ChemicalName'].nunique())
print('# of unique diseases:', chemdrug['DiseaseName'].nunique())

# of unique drugs: 65
# of unique diseases: 4095


In [5]:
# Disease/drug identifier columns shared by the metadata file and the relation tables.
meta_columns = ['DiseaseName', 'DiseaseID', 'ChemicalName', 'inchikey']
chemdis_meta = chemdrug[meta_columns]

# Only the written file is de-duplicated; chemdis_meta itself keeps every row.
chemdis_meta_unique = chemdis_meta.drop_duplicates()
chemdis_meta_unique.to_csv('../../extdata/meta/CTD___drug_associates_disease.csv', index=False)

In [6]:
# Forward edges: src = drug (inchikey), dst = disease (DiseaseID).
# Duplicate (drug, disease) pairs are collapsed before writing.
chemdis_rel_fwd = (
    chemdis_meta[['DiseaseID', 'inchikey']]
    .rename(columns={'DiseaseID': 'dst', 'inchikey': 'src'})
    .assign(src_type='drug', dst_type='disease', relation='associates_fwd')
    .drop_duplicates()
    .loc[:, ['src', 'dst', 'src_type', 'dst_type', 'relation']]
)
chemdis_rel_fwd.to_csv(
    '../../extdata/relations/ctd__drug_disease_association_fwd.csv',
    index=False,
)
chemdis_rel_fwd.head()

Unnamed: 0,src,dst,src_type,dst_type,relation
0,GFMMXOIFOQCCGU-UHFFFAOYSA-N,MESH:D058739,drug,disease,associates_fwd
1,GFMMXOIFOQCCGU-UHFFFAOYSA-N,MESH:D000015,drug,disease,associates_fwd
2,GFMMXOIFOQCCGU-UHFFFAOYSA-N,MESH:D000022,drug,disease,associates_fwd
4,GFMMXOIFOQCCGU-UHFFFAOYSA-N,MESH:D000052,drug,disease,associates_fwd
5,GFMMXOIFOQCCGU-UHFFFAOYSA-N,MESH:D000138,drug,disease,associates_fwd


In [7]:
# Reverse edges: src = disease (DiseaseID), dst = drug (inchikey).
# Duplicate (disease, drug) pairs are collapsed before writing.
chemdis_rel_rev = (
    chemdis_meta[['DiseaseID', 'inchikey']]
    .rename(columns={'DiseaseID': 'src', 'inchikey': 'dst'})
    .assign(dst_type='drug', src_type='disease', relation='associates_rev')
    .drop_duplicates()
    .loc[:, ['src', 'dst', 'src_type', 'dst_type', 'relation']]
)
chemdis_rel_rev.to_csv(
    '../../extdata/relations/ctd__drug_disease_association_rev.csv',
    index=False,
)
chemdis_rel_rev.head()

Unnamed: 0,src,dst,src_type,dst_type,relation
0,MESH:D058739,GFMMXOIFOQCCGU-UHFFFAOYSA-N,disease,drug,associates_rev
1,MESH:D000015,GFMMXOIFOQCCGU-UHFFFAOYSA-N,disease,drug,associates_rev
2,MESH:D000022,GFMMXOIFOQCCGU-UHFFFAOYSA-N,disease,drug,associates_rev
4,MESH:D000052,GFMMXOIFOQCCGU-UHFFFAOYSA-N,disease,drug,associates_rev
5,MESH:D000138,GFMMXOIFOQCCGU-UHFFFAOYSA-N,disease,drug,associates_rev


In [9]:
# Write the distinct disease IDs found in chemdis_meta as plain text, one ID per line.
diseasespace = chemdis_meta['DiseaseID'].unique()
np.savetxt(
    '../../extdata/meta/disease_space.txt',
    diseasespace,
    fmt='%s',
)