In [26]:
import pandas as pd
import requests
from tqdm import tqdm
import torch 
import numpy as np 
from Bio import Entrez
from io import StringIO
from Bio import SeqIO

from tkgdti.data.utils import get_smiles_inchikey, get_protein_sequence_uniprot

# Seed the torch and NumPy global random number generators with the same
# fixed value so that any stochastic step in this notebook is repeatable.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Targetome Data; Drug -> Interacts -> Protein 


In [27]:
# Load the drug -> gene table from the 'drug_gene' sheet, keep only the four
# columns used downstream, and label rows that have no tier as 'TIER_5*'.
#
# The fill is done with .assign() inside a single chain rather than by
# attribute assignment on a column subset: assigning into the result of
# `df[[...]]` is the pattern that raises SettingWithCopyWarning in pandas.
drug_info = (
    pd.read_excel('../../data/tkg_raw/beataml_drug_families.xlsx', sheet_name='drug_gene')
    .loc[:, ['inhibitor', 'Symbol', 'GeneID', 'targetome_adj_tier']]
    .assign(targetome_adj_tier=lambda df: df['targetome_adj_tier'].fillna('TIER_5*'))
)
drug_info

Unnamed: 0,inhibitor,Symbol,GeneID,targetome_adj_tier
0,ABT-737,BAD,572,TIER_1
1,Palbociclib,CCND1,595,TIER_1
2,Flavopiridol,CCNT1,904,TIER_1
3,NF-kB Activation Inhibitor,NFKB1,4790,TIER_5*
4,Roscovitine (CYC-202),NFKB1,4790,TIER_5*
...,...,...,...,...
646,Selumetinib (AZD6244),MAP2K2,5605,TIER_1
647,AZD1152-HQPA (AZD2811),MAP2K5,5607,TIER_5*
648,PLX-4720,MAP2K5,5607,TIER_1
649,PP242,MAP2K5,5607,TIER_5*


In [28]:
# Number of non-null entries per column within each tier label.
drug_info.groupby(by='targetome_adj_tier').count()

Unnamed: 0_level_0,inhibitor,Symbol,GeneID
targetome_adj_tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TIER_1,476,476,476
TIER_2,37,37,37
TIER_3,8,8,8
TIER_4,4,4,4
TIER_5*,126,126,126


In [29]:
# Keep only the rows labelled TIER_1; every other tier is discarded here.
is_tier_1 = drug_info['targetome_adj_tier'] == 'TIER_1'
drug_info = drug_info.loc[is_tier_1]
drug_info.shape

(476, 4)

In [30]:
# Report how many distinct inhibitor names and gene symbols remain.
n_unique_drugs = drug_info['inhibitor'].nunique()
n_unique_genes = drug_info['Symbol'].nunique()
print(f'number of unique drugs: {n_unique_drugs}')
print(f'number of unique genes: {n_unique_genes}')

number of unique drugs: 97
number of unique genes: 222


In [31]:
# Look up structure identifiers once per distinct inhibitor name.
# get_smiles_inchikey's result is unpacked as
# (canonical SMILES, isomeric SMILES, InChIKey); one record is kept per name,
# including names for which the lookup yields missing values.
drug_names = drug_info.inhibitor.unique()
lookup_rows = []
for name in tqdm(drug_names):
    canonical, isomeric, key = get_smiles_inchikey(name)
    lookup_rows.append(
        {'drug': name, 'can_smiles': canonical, 'inchikey': key, 'iso_smiles': isomeric}
    )

# The explicit column list fixes the column order and keeps the four columns
# present even when there are no names to look up.
results = pd.DataFrame(lookup_rows, columns=['drug', 'can_smiles', 'inchikey', 'iso_smiles'])
results.head()

100%|██████████| 97/97 [00:42<00:00,  2.31it/s]


Unnamed: 0,drug,can_smiles,inchikey,iso_smiles
0,ABT-737,CN(C)CCC(CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O)(=O...,HPLNQCPCUACXLM-PGUFJCEWSA-N,CN(C)CC[C@H](CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O...
1,Palbociclib,CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCN...,AHJRHEGDXFFMBM-UHFFFAOYSA-N,CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCN...
2,Flavopiridol,CN1CCC(C(C1)O)C2=C(C=C(C3=C2OC(=CC3=O)C4=CC=CC...,BIIVYFLTOXDAOV-YVEFUNNKSA-N,CN1CC[C@@H]([C@@H](C1)O)C2=C(C=C(C3=C2OC(=CC3=...
3,Bortezomib (Velcade),B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,GXJABQQUPOEUTA-RDJZCZTQSA-N,B([C@H](CC(C)C)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)...
4,AST-487,CCN1CCN(CC1)CC2=C(C=C(C=C2)NC(=O)NC3=CC=C(C=C3...,ODPGGGTTYSGTGO-UHFFFAOYSA-N,CCN1CCN(CC1)CC2=C(C=C(C=C2)NC(=O)NC3=CC=C(C=C3...


In [32]:
# Count the drugs whose identifier lookup came back empty.
n_missing_inchikey = results['inchikey'].isna().sum()
n_missing_smiles = results['can_smiles'].isna().sum()
print('# of unknown inchikeys:', n_missing_inchikey)
print('# of unknown smiles:', n_missing_smiles)

# of unknown inchikeys: 8
# of unknown smiles: 8


In [33]:
# Attach the looked-up identifiers to every drug-gene row (left join on the
# inhibitor name), drop the redundant join key, and discard rows that lack
# either a canonical SMILES or an InChIKey.
drug_info = (
    drug_info
    .merge(results, how='left', left_on='inhibitor', right_on='drug')
    .drop(columns='drug')
    .dropna(subset=['can_smiles', 'inchikey'])
)
drug_info.shape

(422, 7)

In [34]:
# Preview the first five rows of the merged table.
drug_info.head(n=5)

Unnamed: 0,inhibitor,Symbol,GeneID,targetome_adj_tier,can_smiles,inchikey,iso_smiles
0,ABT-737,BAD,572,TIER_1,CN(C)CCC(CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O)(=O...,HPLNQCPCUACXLM-PGUFJCEWSA-N,CN(C)CC[C@H](CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O...
1,Palbociclib,CCND1,595,TIER_1,CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCN...,AHJRHEGDXFFMBM-UHFFFAOYSA-N,CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCN...
2,Flavopiridol,CCNT1,904,TIER_1,CN1CCC(C(C1)O)C2=C(C=C(C3=C2OC(=CC3=O)C4=CC=CC...,BIIVYFLTOXDAOV-YVEFUNNKSA-N,CN1CC[C@@H]([C@@H](C1)O)C2=C(C=C(C3=C2OC(=CC3=...
3,Bortezomib (Velcade),PSMA1,5682,TIER_1,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,GXJABQQUPOEUTA-RDJZCZTQSA-N,B([C@H](CC(C)C)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)...
4,Bortezomib (Velcade),PSMA2,5683,TIER_1,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,GXJABQQUPOEUTA-RDJZCZTQSA-N,B([C@H](CC(C)C)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)...


In [35]:
# Persist the filtered drug-gene table (with identifiers) without the index.
drug_info.to_csv(path_or_buf='../../extdata/meta/targetome__drug_targets_gene.csv', index=False)

In [36]:
# Build the forward edge list: one row per (drug InChIKey -> gene symbol)
# pair, tagged with its node types and the 'targets' relation.
rel_fwd = (
    drug_info[['inchikey', 'Symbol']]
    .rename(columns={'inchikey': 'src', 'Symbol': 'dst'})
    .assign(src_type='drug', dst_type='gene', relation='targets')
)

rel_fwd.to_csv('../../extdata/relations/targetome_drug_targets_gene.csv', index=False)

In [37]:
# Save the drug space for convenience: the distinct InChIKeys that remain
# after filtering, in order of first appearance, written as text.
drugspace = drug_info['inchikey'].drop_duplicates().to_numpy()
print(f'number of unique drugs: {len(drugspace)}')

np.savetxt('../../extdata/meta/drugspace.txt', drugspace, fmt='%s')

number of unique drugs: 89
