In [1]:
import pandas as pd
import requests
from tqdm import tqdm
import torch 
import numpy as np 
from Bio import Entrez
from io import StringIO
from Bio import SeqIO

from tkgdti.data.utils import get_smiles_inchikey, get_protein_sequence_uniprot

# Fix the global torch and NumPy RNG seeds from a single shared value.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Targetome Data; Drug -> Interacts -> Protein 


In [2]:
# When True, the extended targetome interactions (`tge`) are concatenated onto
# the BeatAML drug-target table further down in the notebook.
INCLUDE_TARGETOME_EXPANDED = True

# Upper bound applied to `assay_value` when filtering the extended targetome.
KD_THRESHOLD = 1e3 # nM
# Backward-compatible alias for the original (misspelled) name, which later
# cells still reference.
KD_THESHOLD = KD_THRESHOLD

In [3]:
# Read the 'drug_gene' sheet, keep the four columns used downstream, and label
# rows that have no tier with the placeholder 'TIER_5*'.
drug_info = (
    pd.read_excel('../../data/tkg_raw/beataml_drug_families.xlsx', sheet_name='drug_gene')
    .loc[:, ['inhibitor', 'Symbol', 'GeneID', 'targetome_adj_tier']]
    .assign(targetome_adj_tier=lambda frame: frame['targetome_adj_tier'].fillna('TIER_5*'))
)
drug_info.head()

Unnamed: 0,inhibitor,Symbol,GeneID,targetome_adj_tier
0,ABT-737,BAD,572,TIER_1
1,Palbociclib,CCND1,595,TIER_1
2,Flavopiridol,CCNT1,904,TIER_1
3,NF-kB Activation Inhibitor,NFKB1,4790,TIER_5*
4,Roscovitine (CYC-202),NFKB1,4790,TIER_5*


In [4]:
# NOTE(review): the three input paths below are absolute and user-specific;
# consider deriving them from a configurable data directory.

# Drug metadata: unique (inchi_key, name) pairs, with the name column renamed
# to 'inhibitor' to match `drug_info`.
tge_meta = (
    pd.read_csv('/home/teddy/local/data/targetome_extended_drugs-01-23-25.csv', low_memory=False)
    .loc[:, ['inchi_key', 'drug_name']]
    .rename(columns={'drug_name': 'inhibitor'})
    .drop_duplicates()
)

# Lookup from the 'From' column (mapped against uniprot_id below) to 'FirstGene'.
uni2symb = (
    pd.read_csv('/home/teddy/local/TKG-DTI/extdata/meta/omnipath_uniprot2symbol.csv')
    .set_index('From')['FirstGene']
    .to_dict()
)

tge = (
    pd.read_csv('/home/teddy/local/data/targetome_extended-01-23-25.csv')
    # every extended-targetome row is labelled TIER_1
    .assign(targetome_adj_tier='TIER_1')
    # keep exact measurements only (assay relation '=')
    .loc[lambda rows: rows['assay_relation'] == '=']
    # keep assay values strictly below KD_THESHOLD (set in the config cell, in nM)
    .loc[lambda rows: rows['assay_value'] < KD_THESHOLD]
    # add drug name
    .merge(tge_meta, on='inchi_key', how='inner')
    # add gene symbol; uniprot ids missing from the lookup become NaN
    .assign(Symbol=lambda rows: rows['uniprot_id'].map(uni2symb))
    .rename(columns={'inchi_key': 'inchikey'})
)

tge.head()

Unnamed: 0,pubchem_cid,inchikey,uniprot_id,pubmed_id,database,assay_type,assay_relation,assay_value,targetome_adj_tier,inhibitor,Symbol
0,51,KPGXRSRHYNQIFN-UHFFFAOYSA-N,Q9GZT9,23234607.0,pubchem_bioassay,Kd,=,900.0,TIER_1,Alpha-ketoglutaric Acid,EGLN1
1,72,YQUVCSBJEUQKSH-UHFFFAOYSA-N,P00918,21282059.0,pubchem_bioassay,Ki,=,470.0,TIER_1,Protocatechuic Acid,CA2
2,72,YQUVCSBJEUQKSH-UHFFFAOYSA-N,Q9ULX7,22668600.0,pubchem_bioassay,Ki,=,690.0,TIER_1,Protocatechuic Acid,
3,119,BTCSSZJGUNDROE-UHFFFAOYSA-N,A8MPY1,11093776.0,bindingdb,Ki,=,580.0,TIER_1,Gamma-aminobutyric Acid,
4,119,BTCSSZJGUNDROE-UHFFFAOYSA-N,O00591,1331456.0,pubchem_bioassay,IC50,=,737.5,TIER_1,Gamma-aminobutyric Acid,


In [5]:
# Non-null counts per column for each evidence tier.
tier_counts = drug_info.groupby('targetome_adj_tier').count()
tier_counts

Unnamed: 0_level_0,inhibitor,Symbol,GeneID
targetome_adj_tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TIER_1,476,476,476
TIER_2,37,37,37
TIER_3,8,8,8
TIER_4,4,4,4
TIER_5*,126,126,126


In [6]:
# Restrict the drug-target table to TIER_1 rows.
is_tier_1 = drug_info['targetome_adj_tier'] == 'TIER_1'
drug_info = drug_info.loc[is_tier_1]
drug_info.shape

(476, 4)

In [7]:
# Distinct drug names and gene symbols remaining after the tier filter.
n_unique_inhibitors = drug_info['inhibitor'].nunique()
n_unique_symbols = drug_info['Symbol'].nunique()
print(f'number of unique drugs: {n_unique_inhibitors}')
print(f'number of unique genes: {n_unique_symbols}')

number of unique drugs: 97
number of unique genes: 222


In [8]:
# Resolve every distinct drug name to (canonical SMILES, isomeric SMILES,
# InChIKey) via get_smiles_inchikey, one call per name.
drug_names = drug_info.inhibitor.unique()

lookup_rows = []
for drug in tqdm(drug_names):
    can_smiles, iso_smiles, inchikey = get_smiles_inchikey(drug)
    lookup_rows.append((drug, can_smiles, inchikey, iso_smiles))

# One row per drug name; column order matches the tuple layout above.
results = pd.DataFrame(lookup_rows, columns=['drug', 'can_smiles', 'inchikey', 'iso_smiles'])
results.head()

100%|██████████| 97/97 [00:37<00:00,  2.56it/s]


Unnamed: 0,drug,can_smiles,inchikey,iso_smiles
0,ABT-737,CN(C)CCC(CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O)(=O...,HPLNQCPCUACXLM-PGUFJCEWSA-N,CN(C)CC[C@H](CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O...
1,Palbociclib,CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCN...,AHJRHEGDXFFMBM-UHFFFAOYSA-N,CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCN...
2,Flavopiridol,CN1CCC(C(C1)O)C2=C(C=C(C3=C2OC(=CC3=O)C4=CC=CC...,BIIVYFLTOXDAOV-YVEFUNNKSA-N,CN1CC[C@@H]([C@@H](C1)O)C2=C(C=C(C3=C2OC(=CC3=...
3,Bortezomib (Velcade),B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,GXJABQQUPOEUTA-RDJZCZTQSA-N,B([C@H](CC(C)C)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)...
4,AST-487,CCN1CCN(CC1)CC2=C(C=C(C=C2)NC(=O)NC3=CC=C(C=C3...,ODPGGGTTYSGTGO-UHFFFAOYSA-N,CCN1CCN(CC1)CC2=C(C=C(C=C2)NC(=O)NC3=CC=C(C=C3...


In [9]:
# How many drug names did not resolve to an InChIKey / canonical SMILES.
n_missing_inchikey = results['inchikey'].isna().sum()
n_missing_smiles = results['can_smiles'].isna().sum()
print('# of unknown inchikeys:', n_missing_inchikey)
print('# of unknown smiles:', n_missing_smiles)

# of unknown inchikeys: 8
# of unknown smiles: 8


In [10]:
# Attach the structure lookups to each drug-target row by drug name, then
# discard rows whose drug has no canonical SMILES or InChIKey.
drug_info = (
    drug_info
    .merge(results, left_on='inhibitor', right_on='drug', how='left')
    .drop(columns='drug')
    .dropna(subset=['can_smiles', 'inchikey'])
)
drug_info.shape

(422, 7)

In [11]:

# Keep only extended-targetome rows whose InChIKey belongs to one of the
# looked-up drugs, and attach that drug's canonical SMILES.
#
# pandas matches null merge keys to each other, and `results` contains rows
# with a null inchikey (names whose lookup failed), so those rows are removed
# before joining. Only the columns needed downstream are merged.
smiles_by_inchikey = (
    results[['inchikey', 'can_smiles']]
    .dropna(subset=['inchikey'])
    .drop_duplicates()
)

tge = (
    tge
    # a previous run of this cell leaves `can_smiles` behind; dropping it keeps
    # the merge from producing can_smiles_x / can_smiles_y on re-execution
    .drop(columns='can_smiles', errors='ignore')
    .merge(smiles_by_inchikey, on='inchikey', how='inner')
    .loc[:, ['inhibitor', 'Symbol', 'can_smiles', 'inchikey']]
    .drop_duplicates()
)
tge.head()

Unnamed: 0,inhibitor,Symbol,can_smiles,inchikey
0,Go-6976,PLK4,CN1C2=CC=CC=C2C3=C4C(=C5C6=CC=CC=C6N(C5=C31)CC...,VWVYILCFSYNJHF-UHFFFAOYSA-N
1,Go-6976,PRKCG,CN1C2=CC=CC=C2C3=C4C(=C5C6=CC=CC=C6N(C5=C31)CC...,VWVYILCFSYNJHF-UHFFFAOYSA-N
2,Go-6976,PRKCB,CN1C2=CC=CC=C2C3=C4C(=C5C6=CC=CC=C6N(C5=C31)CC...,VWVYILCFSYNJHF-UHFFFAOYSA-N
4,Go-6976,PRKCA,CN1C2=CC=CC=C2C3=C4C(=C5C6=CC=CC=C6N(C5=C31)CC...,VWVYILCFSYNJHF-UHFFFAOYSA-N
8,Go-6976,FLT3,CN1C2=CC=CC=C2C3=C4C(=C5C6=CC=CC=C6N(C5=C31)CC...,VWVYILCFSYNJHF-UHFFFAOYSA-N


In [12]:
# Reduce drug_info to the same four columns that `tge` carries.
dti_columns = ['inhibitor', 'Symbol', 'can_smiles', 'inchikey']
drug_info = drug_info.loc[:, dti_columns]
drug_info.head()

Unnamed: 0,inhibitor,Symbol,can_smiles,inchikey
0,ABT-737,BAD,CN(C)CCC(CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O)(=O...,HPLNQCPCUACXLM-PGUFJCEWSA-N
1,Palbociclib,CCND1,CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCN...,AHJRHEGDXFFMBM-UHFFFAOYSA-N
2,Flavopiridol,CCNT1,CN1CCC(C(C1)O)C2=C(C=C(C3=C2OC(=CC3=O)C4=CC=CC...,BIIVYFLTOXDAOV-YVEFUNNKSA-N
3,Bortezomib (Velcade),PSMA1,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,GXJABQQUPOEUTA-RDJZCZTQSA-N
4,Bortezomib (Velcade),PSMA2,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,GXJABQQUPOEUTA-RDJZCZTQSA-N


In [13]:
# Optionally stack the extended-targetome interactions under the BeatAML ones,
# then remove exact duplicate rows and any row with a missing value.
if INCLUDE_TARGETOME_EXPANDED:
    drug_info = (
        pd.concat([drug_info, tge], axis=0)
        .drop_duplicates()
        .dropna()
    )

In [14]:
# ensure all inchikeys have the same name
# BUG fix: previously, the drug name -> inchikey mapping was not unique, so we were ending up with redundant dtis
#
# The existing names are discarded and re-derived from `results`. The lookup is
# reduced to a single name per InChIKey (first occurrence wins) so the merge
# cannot multiply rows when several names resolve to the same InChIKey. Rows of
# `results` with a null InChIKey are dropped so they can never act as join keys.
name_by_inchikey = (
    results[['inchikey', 'drug']]
    .dropna(subset=['inchikey'])
    .drop_duplicates(subset='inchikey', keep='first')
    .rename(columns={'drug': 'inhibitor'})
)

drug_info = (
    drug_info
    .drop(columns=['inhibitor'])
    .drop_duplicates()
    .merge(name_by_inchikey, on='inchikey', how='inner')
    .drop_duplicates()
)

In [15]:
# Summary of the final drug-target table.
n_inhibitor_names = drug_info['inhibitor'].nunique()  # multiple names map to same smiles
n_gene_symbols = drug_info['Symbol'].nunique()
n_inchikeys = drug_info['inchikey'].nunique()
n_can_smiles = drug_info['can_smiles'].nunique()
n_dtis = drug_info.shape[0]

print(f'number of unique drugs: {n_inhibitor_names}')
print(f'number of unique genes: {n_gene_symbols}')
print(f'number of unique inchikeys: {n_inchikeys}')
print(f'number of unique smiles: {n_can_smiles}')
print(f'number of DTIs: {n_dtis}')

number of unique drugs: 89
number of unique genes: 529
number of unique inchikeys: 89
number of unique smiles: 89
number of DTIs: 3232


In [16]:
# Persist the final drug-target table (without the index column).
# NOTE(review): the file name contains a double underscore, unlike the
# relations file written below — confirm downstream readers expect this.
dti_table_path = '../../extdata/meta/targetome__drug_targets_gene.csv'
drug_info.to_csv(dti_table_path, index=False)

In [17]:
# Express each DTI as a typed edge: drug (InChIKey) --targets--> gene (symbol).
rel_fwd = (
    drug_info[['inchikey', 'Symbol']]
    .rename(columns={'inchikey': 'src', 'Symbol': 'dst'})
    .assign(src_type='drug', dst_type='gene', relation='targets')
)

rel_fwd.to_csv('../../extdata/relations/targetome_drug_targets_gene.csv', index=False)

In [18]:
# Write the distinct InChIKeys of the final table, one per line, for reuse
# elsewhere.
drugspace = drug_info['inchikey'].unique()
n_drugspace = len(drugspace)
print(f'number of unique drugs: {n_drugspace}')

np.savetxt('../../extdata/meta/drugspace.txt', drugspace, fmt='%s')

number of unique drugs: 89
