In [None]:
import warnings
warnings.simplefilter(action='ignore')

In [None]:
import os
import json
import umap
from collections import OrderedDict
import numpy as np
import pandas as pd
from tqdm import tnrange, tqdm_notebook
import deepchem as dc
from scipy.spatial import distance
from sklearn.metrics import roc_auc_score, average_precision_score

In [None]:
# Change the working directory to the project root so that the relative paths
# used below ('./experiments/...', './data/...') and the project-local imports
# resolve. The location can be overridden with the DRUG_EMBEDDING_DIR
# environment variable; the original hard-coded path remains the default.
PROJECT_DIR = os.environ.get('DRUG_EMBEDDING_DIR', '/home/yuke/PythonProject/DrugEmbedding/')
os.chdir(PROJECT_DIR)

In [None]:
from hvae import *
from evae import *
from drugdata import *
from metrics import *
from lorentz import *

In [None]:
from decode import *
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import rdBase
rdBase.DisableLog('rdApp.error') #disable RDKit warning messages
from rdkit.Chem.Draw import IPythonConsole #Needed to show molecules
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols

In [None]:
def euc_dist(z1, z2):
    """Return the norm (``np.linalg.norm``) of the difference ``z1 - z2``.

    For 1-D inputs this is the ordinary Euclidean distance.
    """
    delta = z1 - z2
    return np.linalg.norm(delta)

In [None]:
def angular_dist(z1, z2):
    """Return the angular distance between two vectors, scaled to [0, 1].

    The angle between ``z1`` and ``z2`` is computed from their cosine
    similarity and divided by pi, so parallel vectors give 0, orthogonal
    vectors 0.5 and anti-parallel vectors 1.

    If either vector has zero norm the cosine is undefined and the result
    is NaN.
    """
    cos_sim = np.dot(z1, z2) / (np.linalg.norm(z1) * np.linalg.norm(z2))
    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # for (anti-)parallel vectors, which would make arccos return NaN.
    cos_sim = np.clip(cos_sim, -1.0, 1.0)
    return np.arccos(cos_sim) / np.pi

In [None]:
def lor_dist(z1, z2):
    """Return the Lorentzian distance ``arccosh(-<z1, z2>_L)`` between two vectors.

    ``<z1, z2>_L`` is the Lorentzian inner product, i.e. the sum of the
    element-wise products of components 1.. minus the product of component 0.
    Arguments of arccosh below 1 are replaced by ``1 + 1e-6`` to keep the
    square root real.
    (Presumably the inputs are points on the hyperboloid model; this function
    does not check that.)
    """
    prod = z1 * z2
    neg_inner = -(prod[1:].sum() - prod[0])
    # clamp the arccosh argument into its domain
    clamped = np.where(neg_inner < 1.0, 1.0 + 1e-6, neg_inner)
    return np.log(clamped + np.sqrt(clamped**2 - 1))

In [None]:
def lor_pairwise_dist(x):
    """Return the matrix of pairwise Lorentzian distances between the rows of ``x``.

    ``x`` is indexed as a 2-D array whose column 0 is treated as the
    time-like coordinate and the remaining columns as space-like ones.
    The diagonal of the inner-product matrix is forced to ``-1 - 1e-12`` so
    self-distances come out as a tiny positive number; off-diagonal arccosh
    arguments below 1 are replaced by ``1 + 1e-6``.
    """
    time_part = x[:, :1]
    space_part = x[:, 1:]
    # Lorentzian Gram matrix: space-like dot products minus time-like products
    inner = space_part @ space_part.T - time_part @ time_part.T
    np.fill_diagonal(inner, -1 - 1e-12)
    neg_inner = -inner
    neg_inner = np.where(neg_inner < 1.0, 1.0 + 1e-6, neg_inner)
    dist = np.log(neg_inner + np.sqrt(neg_inner**2 - 1))
    # replace NaN / infinite entries with finite numbers
    return np.nan_to_num(dist)

In [None]:
def get_durg_atc(ATC_LVL, drugs=None, atc_table=None):
    """Map every drug position to the unique values of one ATC column.

    Parameters
    ----------
    ATC_LVL : str
        Name of the column of the ATC table whose unique values are collected.
    drugs : sequence of str, optional
        Drug names to look up. Defaults to the notebook-level ``drug_lst``.
    atc_table : pandas.DataFrame, optional
        Table with an ``'ATC_LVL5'`` column (matched against the drug names)
        and the ``ATC_LVL`` column. Defaults to the notebook-level ``df_atc``,
        which must have been defined before calling without this argument.

    Returns
    -------
    dict
        ``{position: {drug_name: array_of_unique_values}}``; the array is
        empty when the drug does not occur in the table.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if drugs is None:
        drugs = drug_lst
    if atc_table is None:
        atc_table = df_atc
    name_column = atc_table['ATC_LVL5']  # hoisted: invariant across drugs
    return {
        pos: {name: atc_table.loc[name_column == name, ATC_LVL].unique()}
        for pos, name in enumerate(drugs)
    }

In [None]:
def query_drug_repurpose(drug_x, ATC_LVL, dm, drug_lst):
    """Summarise the distances from ``drug_x`` to all other drugs per ATC group.

    Parameters
    ----------
    drug_x : str
        Query drug; must be an element of ``drug_lst``.
    ATC_LVL : str
        ATC column name forwarded to ``get_durg_atc``.
    dm : 2-D array
        Drug-by-drug distance matrix, indexed in the order of ``drug_lst``.
    drug_lst : list of str
        Drug names.

    Returns
    -------
    pandas.DataFrame
        One row per ATC value with the mean, standard deviation and count of
        the distances, sorted by ascending mean distance.

    Note: ``get_durg_atc`` is called with the ATC level only, so it builds its
    lookup from the notebook-level drug list rather than from the ``drug_lst``
    argument; the two are expected to be the same list.
    """
    atc_by_position = get_durg_atc(ATC_LVL)
    # distances from the query drug to every drug
    dist_row = dm[drug_lst.index(drug_x), :]
    # one (atc value, distance) record per ATC value of every other drug
    records = [
        (atc_value, dist_row[pos])
        for pos, name in enumerate(drug_lst)
        if name != drug_x
        for atc_value in atc_by_position[pos][name]
    ]
    pairs = pd.DataFrame.from_records(records, columns=[ATC_LVL, 'Drug2Drug_Dist'])
    summary = pairs.groupby([ATC_LVL]).agg(['mean', 'std', 'count'])
    # flatten the (column, statistic) MultiIndex into 'column_statistic' names
    summary.columns = ['_'.join(col).strip() for col in summary.columns.values]
    return summary.reset_index().sort_values('Drug2Drug_Dist_mean')

In [None]:
def np_lorentz_product(z1, z2):
    """Return the Lorentzian inner product of two vectors.

    Computed as the sum of the element-wise products of components 1..
    minus the product of component 0.
    """
    elementwise = z1 * z2
    time_term = elementwise[0]
    space_term = elementwise[1:].sum()
    return space_term - time_term

In [None]:
def np_arccoh(x):
    """Return ``arccosh(x)`` computed as ``log(x + sqrt(x**2 - 1))``.

    Values below 1 (outside the domain of arccosh) are replaced by
    ``1 + 1e-6`` before the formula is applied.
    """
    clamped = np.where(x < 1.0, 1.0 + 1e-6, x)
    root = np.sqrt(clamped**2 - 1)
    return np.log(clamped + root)

In [None]:
def np_inv_exp_map(z, mu):
    """Return ``arccosh(alpha) / sqrt(alpha**2 - 1) * (z - alpha * mu)``.

    ``alpha`` is ``-np_lorentz_product(mu, z)``. (Presumably this is the
    inverse exponential / logarithm map at ``mu`` on the hyperboloid; the
    inputs are not validated here.)

    The scale factor is evaluated at ``max(alpha, 1 + 1e-6)``. Previously the
    denominator used the raw ``alpha``, so ``z == mu`` (alpha == 1) or an
    alpha rounded slightly below 1 produced 0/0 or sqrt of a negative number
    and the result was NaN. The direction term still uses the raw ``alpha``,
    so ``z == mu`` now maps to (numerically) the zero vector.
    """
    alpha = -np_lorentz_product(mu, z)
    # keep the scale factor inside the domain of arccosh and away from 0/0
    safe_alpha = np.maximum(alpha, 1.0 + 1e-6)
    scale = np_arccoh(safe_alpha) / np.sqrt(safe_alpha**2 - 1)
    return scale * (z - alpha * mu)

# Load Model

In [None]:
exp_dir = './experiments/KDD/kdd_010'
checkpoint = 'checkpoint_epoch110.model'
config_path = os.path.join(exp_dir, 'configs.json')
checkpoint_path = os.path.join(exp_dir, checkpoint)

In [None]:
# Read the experiment configuration. The with-block closes the file on exit,
# so no explicit fp.close() is needed afterwards.
with open(config_path, 'r') as fp:
    configs = json.load(fp)

In [None]:
configs['checkpoint'] = checkpoint
configs

In [None]:
model = load_model(configs)
print(model)

# Load SMILES Data

In [625]:
# Build one drugdata dataset per split; every split shares the same
# configuration and differs only in the SMILES file it reads.
experiment_dir = os.path.join(configs['checkpoint_dir'], configs['experiment_name'])
splits = ['train', 'valid', 'test']
datasets = OrderedDict()
for split in splits:
    split_smi_file = 'smiles_' + split + '.smi'
    datasets[split] = drugdata(
        task=configs['task'],
        fda_drugs_dir=configs['data_dir'],
        fda_smiles_file=configs['fda_file'],
        fda_vocab_file=configs['vocab_file'],
        fda_drugs_sp_file=configs['atc_sim_file'],
        experiment_dir=experiment_dir,
        smi_file=split_smi_file,
        max_sequence_length=configs['max_sequence_length'],
        nneg=configs['nneg'],
    )

In [626]:
# Rebuild the SMILES string of every non-ZINC entry from its token sequence,
# visiting the splits in train / valid / test order.
fda_drug_lst = []
fda_smiles_lst = []

for split in ['train', 'valid', 'test']:
    for name, record in datasets[split].smiles.items():
        # entries whose key starts with 'zinc' are skipped
        if name[:4] == 'zinc':
            continue
        pieces = []
        # drop the leading <sos> token and stop at the first <eos>
        for token in record['words'][1:]:
            if token == '<eos>':
                break
            pieces.append(token)
        fda_drug_lst.append(name)
        fda_smiles_lst.append(''.join(pieces))

In [627]:
drug_lst_train, mean_lst_train, logv_lst_train = fda_drug_rep(configs, datasets['train'], model, all_drugs=False)

100%|██████████| 10/10 [03:31<00:00, 21.18s/it]


In [628]:
drug_lst_valid, mean_lst_valid, logv_lst_valid = fda_drug_rep(configs, datasets['valid'], model, all_drugs=False)

100%|██████████| 1/1 [00:11<00:00, 11.24s/it]


In [629]:
drug_lst_test, mean_lst_test, logv_lst_test = fda_drug_rep(configs, datasets['test'], model, all_drugs=False)

100%|██████████| 1/1 [00:14<00:00, 14.74s/it]


In [630]:
drug_lst = drug_lst_train + drug_lst_valid + drug_lst_test
assert drug_lst == fda_drug_lst, 'FDA drug names do not match!'
mean_lst = mean_lst_train + mean_lst_valid + mean_lst_test
logv_lst = logv_lst_train + logv_lst_valid + logv_lst_test

In [631]:
# convert list to numpy array
for i in range(len(drug_lst)):
    mean_lst[i] = np.array(mean_lst[i])
    logv_lst[i] = np.array(logv_lst[i])

In [632]:
# create SMILES dataframe
df_mdl = pd.DataFrame({'ATC_LVL5': drug_lst, 'smiles': fda_smiles_lst, 'mu': mean_lst, 'logv': logv_lst})

In [633]:
## remove drugs with duplicated entries
#idx = df_mdl.groupby('ATC_LVL5').size() > 1
#drug_lst_clean = list(df_mdl.groupby('ATC_LVL5').size()[-idx].reset_index()['ATC_LVL5'])
#df_smiles = df_mdl[df_mdl['ATC_LVL5'].isin(drug_lst_clean)]

In [634]:
# load df_fda_year
df_fda_year = pd.read_csv('./experiments/EXP_TASK/exp_task_011/all_drugs_approval_year.csv')

In [635]:
df_smiles = df_mdl.merge(df_fda_year, how='left', left_on='ATC_LVL5', right_on='drug_name')

In [636]:
df_smiles.head()

Unnamed: 0,ATC_LVL5,smiles,mu,logv,drug_name,Approval
0,novobiocin,CO[C@@H]1[C@@H](OC(N)=O)[C@@H](O)[C@H](Oc2ccc3...,"[13.137248039245605, 1.3716269731521606, 0.025...","[-6.649742126464844, -5.984184741973877, -9.82...",novobiocin,1964
1,vildagliptin,N#C[C@@H]1CCCN1C(=O)CNC12CC3CC(CC(O)(C3)C1)C2,"[102.30171203613281, 2.743731737136841, -1.416...","[-4.914798259735107, -4.87358283996582, -7.162...",vildagliptin,2007
2,benazepril,CCOC(=O)[C@H](CCc1ccccc1)N[C@H]1CCc2ccccc2N(CC...,"[146.45880126953125, 1.0793601274490356, 2.399...","[-4.702251434326172, -4.654637813568115, -6.57...",benazepril,1991
3,testosterone,CCC(=O)O[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]...,"[121.25065612792969, 1.7730571031570435, 0.622...","[-4.358664035797119, -4.222125053405762, -6.42...",testosterone,1953
4,physostigmine,CNC(=O)Oc1ccc2c(c1)[C@]1(C)CCN(C)[C@@H]1N2C,"[35.032623291015625, 0.7125436663627625, -1.19...","[-5.117040634155273, -4.733522415161133, -8.03...",physostigmine,1875


In [637]:
df_smiles.shape

(1368, 6)

# Morgan Fingerprints (Count)

In [638]:
# Morgan fingerprints (radius 2) via GetMorganFingerprint, one per row of df_smiles.
fps_c = []
for smi in tqdm_notebook(df_smiles['smiles']):
    mol = Chem.MolFromSmiles(smi)
    fps_c.append(AllChem.GetMorganFingerprint(mol, 2))

HBox(children=(IntProgress(value=0, max=1368), HTML(value='')))




In [639]:
# append fingerprints
df_smiles['fp_c'] = fps_c

# Morgan Fingerprints (Bit)

In [640]:
# Morgan fingerprints (radius 2) as bit vectors, one per row of df_smiles.
fps_b = []
for smi in tqdm_notebook(df_smiles['smiles']):
    mol = Chem.MolFromSmiles(smi)
    fps_b.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2))

HBox(children=(IntProgress(value=0, max=1368), HTML(value='')))




In [641]:
# append fingerprints
df_smiles['fp_b'] = fps_b

# Euclidean Drug Embeddings

In [642]:
exp_dir = './experiments/KDD/kdd_009'
checkpoint = 'checkpoint_epoch110.model'
config_path = os.path.join(exp_dir, 'configs.json')
checkpoint_path = os.path.join(exp_dir, checkpoint)

In [643]:
# Read the experiment configuration. The with-block closes the file on exit,
# so no explicit fp.close() is needed afterwards.
with open(config_path, 'r') as fp:
    configs = json.load(fp)

In [644]:
configs['checkpoint'] = checkpoint
configs

{'data_dir': './data/fda_drugs',
 'data_file': 'smiles_set_clean.smi',
 'fda_file': 'all_drugs.smi',
 'vocab_file': 'char_set_clean.pkl',
 'atc_sim_file': 'drugs_sp_all.csv',
 'checkpoint_dir': './experiments/KDD',
 'experiment_name': 'kdd_009',
 'task': 'vae + atc',
 'limit': 0,
 'batch_size': 128,
 'epochs': 100,
 'max_sequence_length': 120,
 'learning_rate': 0.0003,
 'max_norm': 1000000000000.0,
 'wd': 0.0,
 'manifold_type': 'Euclidean',
 'prior_type': 'Standard',
 'num_centroids': 0,
 'bidirectional': False,
 'num_layers': 1,
 'hidden_size': 512,
 'latent_size': 64,
 'word_dropout_rate': 0.2,
 'anneal_function': 'logistic',
 'k': 0.51,
 'x0': 29.0,
 'C': 1.0,
 'num_workers': 4,
 'logging_steps': 1,
 'save_per_epochs': 10,
 'new_training': False,
 'new_annealing': False,
 'checkpoint': 'checkpoint_epoch110.model',
 'trained_epochs': 110,
 'alpha': 0.0,
 'beta': 0.015625,
 'gamma': 0.0,
 'delta': 11.0,
 'nneg': 11,
 'fda_prop': 0.2}

In [645]:
model = load_model(configs)
print(model)

EVAE(
  (encoder_rnn): GRU(49, 512, batch_first=True)
  (decoder_rnn): GRU(49, 512, batch_first=True)
  (hidden2mean): Linear(in_features=512, out_features=64, bias=True)
  (hidden2logv): Linear(in_features=512, out_features=64, bias=True)
  (latent2hidden): Linear(in_features=64, out_features=512, bias=True)
  (outputs2vocab): Linear(in_features=512, out_features=49, bias=True)
  (RECON): NLLLoss()
)


In [646]:
# Encode every SMILES with the currently loaded model and keep the first
# value returned by smiles2mean as a squeezed NumPy array.
mu_e_lst = []
for smi in tqdm_notebook(df_smiles['smiles']):
    mu_e, _ = smiles2mean(configs, smi, model)
    mu_e_lst.append(mu_e.cpu().detach().numpy().squeeze())

HBox(children=(IntProgress(value=0, max=1368), HTML(value='')))




In [647]:
df_smiles['mu_e'] = mu_e_lst

# Lorentz Drug Embeddings (VAE only)

In [648]:
exp_dir = './experiments/EXP_TASK/exp_task_010'
checkpoint = 'checkpoint_epoch110.model'
config_path = os.path.join(exp_dir, 'configs.json')
checkpoint_path = os.path.join(exp_dir, checkpoint)

In [649]:
# Read the experiment configuration. The with-block closes the file on exit,
# so no explicit fp.close() is needed afterwards.
with open(config_path, 'r') as fp:
    configs = json.load(fp)

In [650]:
configs['checkpoint'] = checkpoint
configs

{'data_dir': './data/fda_drugs',
 'data_file': 'smiles_set_clean.smi',
 'fda_file': 'all_drugs.smi',
 'vocab_file': 'char_set_clean.pkl',
 'atc_sim_file': 'drugs_sp_all.csv',
 'checkpoint_dir': './experiments/EXP_TASK',
 'experiment_name': 'exp_task_010',
 'task': 'vae',
 'limit': 0,
 'batch_size': 128,
 'epochs': 200,
 'max_sequence_length': 120,
 'learning_rate': 0.0003,
 'max_norm': 1000000000000.0,
 'wd': 0.0,
 'manifold_type': 'Lorentz',
 'prior_type': 'Standard',
 'num_centroids': 0,
 'bidirectional': False,
 'num_layers': 1,
 'hidden_size': 512,
 'latent_size': 64,
 'word_dropout_rate': 0.2,
 'anneal_function': 'logistic',
 'k': 0.51,
 'x0': 29.0,
 'C': 1.0,
 'num_workers': 4,
 'logging_steps': 1,
 'save_per_epochs': 5,
 'new_training': True,
 'new_annealing': True,
 'checkpoint': 'checkpoint_epoch110.model',
 'trained_epochs': 0,
 'alpha': 0.0,
 'beta': 0.015625,
 'gamma': 0.0,
 'delta': 11.0,
 'nneg': 11,
 'fda_prop': 0.2}

In [651]:
model = load_model(configs)
print(model)

HVAE(
  (encoder_rnn): GRU(49, 512, batch_first=True)
  (decoder_rnn): GRU(49, 512, batch_first=True)
  (hidden2mean): Linear(in_features=512, out_features=64, bias=True)
  (hidden2logv): Linear(in_features=512, out_features=64, bias=True)
  (latent2hidden): Linear(in_features=65, out_features=512, bias=True)
  (outputs2vocab): Linear(in_features=512, out_features=49, bias=True)
  (RECON): NLLLoss()
)


In [652]:
# Encode every SMILES with the currently loaded (VAE-only) model and keep the
# first value returned by smiles2mean as a squeezed NumPy array.
mu_vae_lst = []
for smi in tqdm_notebook(df_smiles['smiles']):
    mu_vae, _ = smiles2mean(configs, smi, model)
    mu_vae_lst.append(mu_vae.cpu().detach().numpy().squeeze())

HBox(children=(IntProgress(value=0, max=1368), HTML(value='')))




In [653]:
df_smiles['mu_vae'] = mu_vae_lst

# RDKit Descriptors

In [654]:
rdkit_desc = dc.feat.RDKitDescriptors()

In [655]:
# Featurize every molecule with the DeepChem RDKitDescriptors featurizer
# created above and store each result as a NumPy array.
rdkit_desc_lst = []
for smi in tqdm_notebook(df_smiles['smiles']):
    mol = Chem.MolFromSmiles(smi)
    rdkit_desc_lst.append(np.array(rdkit_desc._featurize(mol)))

HBox(children=(IntProgress(value=0, max=1368), HTML(value='')))




In [656]:
df_smiles['rdkit_desc'] = rdkit_desc_lst

# Evaluation on RepoDB

## Load RepoDB

In [657]:
repoDB = pd.read_csv('./data/drug_repositioning/repoDB.csv')

In [658]:
def str_lower(s):
    """Return ``s.lower()``; used with ``Series.map`` to normalise drug names."""
    lowered = s.lower()
    return lowered

In [659]:
repoDB['ATC_LVL5'] = repoDB['drug_name'].map(str_lower)

In [660]:
repoDB.head()

Unnamed: 0,drug_name,drug_id,ind_name,ind_id,NCT,status,phase,DetailedStatus,ATC_LVL5
0,Lepirudin,DB00001,Heparin-induced thrombocytopenia with thrombosis,C0272275,,Approved,,,lepirudin
1,Cetuximab,DB00002,Squamous cell carcinoma of mouth,C0585362,,Approved,,,cetuximab
2,Cetuximab,DB00002,Squamous cell carcinoma of nose,C3163899,,Approved,,,cetuximab
3,Cetuximab,DB00002,Squamous cell carcinoma of pharynx,C1319317,,Approved,,,cetuximab
4,Cetuximab,DB00002,Laryngeal Squamous Cell Carcinoma,C0280324,,Approved,,,cetuximab


In [661]:
repoDB_merge = repoDB.merge(df_smiles, how='inner', on='ATC_LVL5')

In [662]:
len(repoDB_merge['ATC_LVL5'].unique())

985

In [663]:
repoDB_merge.groupby('status').count()

Unnamed: 0_level_0,drug_name_x,drug_id,ind_name,ind_id,NCT,phase,DetailedStatus,ATC_LVL5,smiles,mu,logv,drug_name_y,Approval,fp_c,fp_b,mu_e,mu_vae,rdkit_desc
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Approved,4738,4738,4738,4738,0,0,0,4738,4738,4738,4738,4738,4738,4738,4738,4738,4738,4738
Suspended,268,268,268,268,268,268,257,268,268,268,268,268,268,268,268,268,268,268
Terminated,1859,1859,1859,1859,1859,1859,1387,1859,1859,1859,1859,1859,1859,1859,1859,1859,1859,1859
Withdrawn,449,449,449,449,449,449,351,449,449,449,449,449,449,449,449,449,449,449


## Split RepoDB dataset

In [664]:
# Temporal split: rows whose approval year is before the cutoff form the
# training set, all remaining rows form the test set. Rows with a missing
# 'Approval' compare as False and therefore end up in the test set.
CUTOFF_YEAR = 2000
cutoff_ind = repoDB_merge['Approval'].lt(CUTOFF_YEAR)
repoDB_merge_train = repoDB_merge.loc[cutoff_ind]
repoDB_merge_test = repoDB_merge.loc[~cutoff_ind]

In [665]:
repoDB_merge_train.shape

(6201, 19)

In [666]:
len(repoDB_merge_train['ATC_LVL5'].unique())

787

In [667]:
repoDB_merge_train.groupby('status').count()

Unnamed: 0_level_0,drug_name_x,drug_id,ind_name,ind_id,NCT,phase,DetailedStatus,ATC_LVL5,smiles,mu,logv,drug_name_y,Approval,fp_c,fp_b,mu_e,mu_vae,rdkit_desc
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Approved,4320,4320,4320,4320,0,0,0,4320,4320,4320,4320,4320,4320,4320,4320,4320,4320,4320
Suspended,220,220,220,220,220,220,210,220,220,220,220,220,220,220,220,220,220,220
Terminated,1287,1287,1287,1287,1287,1287,1036,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287
Withdrawn,374,374,374,374,374,374,288,374,374,374,374,374,374,374,374,374,374,374


In [668]:
repoDB_merge_test.shape

(1113, 19)

In [669]:
len(repoDB_merge_test['ATC_LVL5'].unique())

198

In [670]:
repoDB_merge_test.groupby('status').count()

Unnamed: 0_level_0,drug_name_x,drug_id,ind_name,ind_id,NCT,phase,DetailedStatus,ATC_LVL5,smiles,mu,logv,drug_name_y,Approval,fp_c,fp_b,mu_e,mu_vae,rdkit_desc
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Approved,418,418,418,418,0,0,0,418,418,418,418,418,418,418,418,418,418,418
Suspended,48,48,48,48,48,48,47,48,48,48,48,48,48,48,48,48,48,48
Terminated,572,572,572,572,572,572,351,572,572,572,572,572,572,572,572,572,572,572
Withdrawn,75,75,75,75,75,75,63,75,75,75,75,75,75,75,75,75,75,75


# KNN

## training set

In [671]:
# Every drug occurs on several rows (one per indication) with identical
# features, so keep the first row of each drug once instead of re-filtering
# the whole frame per drug and per feature. The order is the order of first
# appearance, which is the same order Series.unique() returns.
train_first_rows = repoDB_merge_train.drop_duplicates(subset='ATC_LVL5', keep='first')

# drug name
drug_name_train = list(train_first_rows['ATC_LVL5'])

# Lorentz drug embeddings
drug_mu_train = list(train_first_rows['mu'])

# Morgan FP (count)
drug_fp_c_train = list(train_first_rows['fp_c'])

# Morgan FP (bit)
drug_fp_b_train = list(train_first_rows['fp_b'])

# Euclidean drug embeddings
drug_mu_e_train = list(train_first_rows['mu_e'])

# Lorentz drug embeddings (VAE)
drug_mu_vae_train = list(train_first_rows['mu_vae'])

# RDKit descriptors
drug_rdkit_desc_train = list(train_first_rows['rdkit_desc'])

## testing set

In [672]:
# Every drug occurs on several rows (one per indication) with identical
# features, so keep the first row of each drug once instead of re-filtering
# the whole frame per drug and per feature. The order is the order of first
# appearance, which is the same order Series.unique() returns.
test_first_rows = repoDB_merge_test.drop_duplicates(subset='ATC_LVL5', keep='first')

# drug name
drug_name_test = list(test_first_rows['ATC_LVL5'])

# Lorentz drug embeddings
drug_mu_test = list(test_first_rows['mu'])

# Morgan FP (count)
drug_fp_c_test = list(test_first_rows['fp_c'])

# Morgan FP (bit)
drug_fp_b_test = list(test_first_rows['fp_b'])

# Euclidean drug embeddings
drug_mu_e_test = list(test_first_rows['mu_e'])

# Lorentz drug embeddings (VAE)
drug_mu_vae_test = list(test_first_rows['mu_vae'])

# RDKit descriptors
drug_rdkit_desc_test = list(test_first_rows['rdkit_desc'])

## Pairwise distances between training and testing

In [None]:
# One record per (training drug, test drug) pair holding the distance under
# each representation:
#   mu_dist         Lorentz distance between the 'mu' embeddings
#   fp_c_dist       1 - Tanimoto similarity of the count Morgan fingerprints
#   fp_b_dist       1 - Tanimoto similarity of the bit Morgan fingerprints
#   mu_e_dist       Euclidean distance between the 'mu_e' embeddings
#   mu_vae_dist     Lorentz distance between the 'mu_vae' embeddings
#   rdkit_desc_dist angular distance between the RDKit descriptor vectors
drug_pair_dist = []

# bundle the per-drug test features once so the inner loop can unpack them
test_features = list(zip(drug_name_test, drug_mu_test, drug_fp_c_test, drug_fp_b_test,
                         drug_mu_e_test, drug_mu_vae_test, drug_rdkit_desc_test))

for i in tnrange(len(drug_name_train)):
    d_i = drug_name_train[i]
    mu_i = drug_mu_train[i]
    fp_c_i = drug_fp_c_train[i]
    fp_b_i = drug_fp_b_train[i]
    mu_e_i = drug_mu_e_train[i]
    mu_vae_i = drug_mu_vae_train[i]
    rdkit_desc_i = drug_rdkit_desc_train[i]
    for d_j, mu_j, fp_c_j, fp_b_j, mu_e_j, mu_vae_j, rdkit_desc_j in test_features:
        drug_pair_dist.append((
            d_i,
            d_j,
            lor_dist(mu_i, mu_j),
            1.0 - DataStructs.TanimotoSimilarity(fp_c_i, fp_c_j),
            1.0 - DataStructs.TanimotoSimilarity(fp_b_i, fp_b_j),
            euc_dist(mu_e_i, mu_e_j),
            lor_dist(mu_vae_i, mu_vae_j),
            angular_dist(rdkit_desc_i, rdkit_desc_j),
        ))

HBox(children=(IntProgress(value=0, max=787), HTML(value='')))

In [None]:
df_drug_dist = pd.DataFrame(drug_pair_dist, columns=['drug_train', 'drug_test', 'mu_dist', 'fp_c_dist', \
                                                     'fp_b_dist', 'mu_e_dist', 'mu_vae_dist', 'rdkit_desc_dist'])

## KNN evaluation

In [None]:
def knn_drug_train(df_drug_dist, drug_test, dist_type, k):
    """Return the ``k`` training drugs closest to ``drug_test``.

    Parameters
    ----------
    df_drug_dist : pandas.DataFrame
        Pairwise table with ``'drug_train'`` and ``'drug_test'`` columns and
        one column per distance type.
    drug_test : str
        Test drug whose neighbours are requested.
    dist_type : str
        Name of the distance column to rank by (ascending).
    k : int
        Number of neighbours to keep.

    Returns
    -------
    list
        ``'drug_train'`` values of the ``k`` smallest distances (fewer if the
        test drug has fewer than ``k`` rows).
    """
    is_query = df_drug_dist['drug_test'] == drug_test
    ranked = df_drug_dist.loc[is_query].sort_values(dist_type)
    return ranked['drug_train'].iloc[:k].tolist()

In [None]:
# Build the kNN vote table: one row per (test drug, indication) pair.
def knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test, dist_type, k):
    """Count how the k nearest training drugs relate to each test indication.

    For every drug in ``drug_name_test`` the ``k`` nearest training drugs are
    looked up with ``knn_drug_train`` in the notebook-level ``df_drug_dist``
    table (it is not a parameter of this function). For every row of that
    drug in ``repoDB_merge_test`` the neighbours are counted as:

    * ``knn_approved``   - distinct neighbours with the same ``ind_id`` and
      status ``'Approved'``
    * ``knn_terminated`` - distinct neighbours with the same ``ind_id`` and any
      other status
    * ``knn_unknown``    - ``k`` minus the two counts above

    A neighbour holding both kinds of rows for one indication is counted in
    both of the first two columns.

    Returns a DataFrame with columns ``drug_name``, ``ind_name``, ``status``,
    ``knn_approved``, ``knn_terminated`` and ``knn_unknown``.
    """
    out_columns = ['drug_name', 'ind_name', 'status','knn_approved', 'knn_terminated', 'knn_unknown']
    train_names = repoDB_merge_train['ATC_LVL5']
    test_names = repoDB_merge_test['ATC_LVL5']
    records = []
    for pos in tnrange(len(drug_name_test)):
        query = drug_name_test[pos]
        # all (drug, indication) rows of the query drug
        query_rows = repoDB_merge_test[test_names == query]
        # all rows of its k nearest training drugs
        neighbours = knn_drug_train(df_drug_dist, query, dist_type, k)
        neighbour_rows = repoDB_merge_train[train_names.isin(neighbours)]
        approved_mask = neighbour_rows['status'] == 'Approved'
        row_fields = zip(query_rows['ATC_LVL5'], query_rows['ind_name'],
                         query_rows['ind_id'], query_rows['status'])
        for drug_test, indication_name, indication_id, status in row_fields:
            same_indication = neighbour_rows['ind_id'] == indication_id
            n_approved = len(neighbour_rows[same_indication & approved_mask]['ATC_LVL5'].unique())
            n_terminated = len(neighbour_rows[same_indication & ~approved_mask]['ATC_LVL5'].unique())
            n_unknown = k - (n_approved + n_terminated)
            records.append((drug_test, indication_name, status, n_approved, n_terminated, n_unknown))
    return pd.DataFrame(records, columns=out_columns)

In [None]:
# Score the kNN vote table with AUROC and AUPRC.
def knn_durg_evaluation(df_knn_outputs, k):
    """Turn kNN vote counts into soft scores and evaluate them.

    For each row the score is ``(knn_approved + 0.5 * knn_unknown) / k`` and
    the label is 1.0 when ``status`` is ``'Approved'``, otherwise 0.0.

    Side effect: the columns ``'prob'`` and ``'label'`` are added to (or
    overwritten in) the DataFrame that was passed in.

    Returns
    -------
    tuple
        ``(roc, prc)`` from ``roc_auc_score`` and ``average_precision_score``.
    """
    votes = df_knn_outputs
    # soft vote: unknown neighbours count as half an approval
    soft_vote = (votes['knn_approved'] + (0.5 * votes['knn_unknown'])) / k
    is_approved = (votes['status'] == 'Approved').astype(float)

    votes['prob'] = soft_vote.tolist()
    votes['label'] = is_approved.tolist()

    y_true = np.array(votes['label'])
    y_score = np.array(votes['prob'])
    roc = roc_auc_score(y_true, y_score)
    prc = average_precision_score(y_true, y_score)
    return roc, prc

### Lorentz Drug Embedding (ATC + ZINC)

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_dist', 3)
roc, prc = knn_durg_evaluation(df_knn_outputs, 3)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_dist', 5)
roc, prc = knn_durg_evaluation(df_knn_outputs, 5)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_dist', 7)
roc, prc = knn_durg_evaluation(df_knn_outputs, 7)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_dist', 9)
roc, prc = knn_durg_evaluation(df_knn_outputs, 9)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_dist', 11)
roc, prc = knn_durg_evaluation(df_knn_outputs, 11)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

### Morgan FP (Count)

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_c_dist', 3)
roc, prc = knn_durg_evaluation(df_knn_outputs, 3)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_c_dist', 5)
roc, prc = knn_durg_evaluation(df_knn_outputs, 5)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_c_dist', 7)
roc, prc = knn_durg_evaluation(df_knn_outputs, 7)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_c_dist', 9)
roc, prc = knn_durg_evaluation(df_knn_outputs, 9)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_c_dist', 11)
roc, prc = knn_durg_evaluation(df_knn_outputs, 11)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

### Morgan FP (Bit)

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_b_dist', 3)
roc, prc = knn_durg_evaluation(df_knn_outputs, 3)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_b_dist', 5)
roc, prc = knn_durg_evaluation(df_knn_outputs, 5)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_b_dist', 7)
roc, prc = knn_durg_evaluation(df_knn_outputs, 7)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_b_dist', 9)
roc, prc = knn_durg_evaluation(df_knn_outputs, 9)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'fp_b_dist', 11)
roc, prc = knn_durg_evaluation(df_knn_outputs, 11)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

### Euclidean Drug Embeddings

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_e_dist', 3)
roc, prc = knn_durg_evaluation(df_knn_outputs, 3)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_e_dist', 5)
roc, prc = knn_durg_evaluation(df_knn_outputs, 5)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_e_dist', 7)
roc, prc = knn_durg_evaluation(df_knn_outputs, 7)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_e_dist', 9)
roc, prc = knn_durg_evaluation(df_knn_outputs, 9)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_e_dist', 11)
roc, prc = knn_durg_evaluation(df_knn_outputs, 11)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

### Lorentz Drug Embeddings (VAE only)

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_vae_dist', 3)
roc, prc = knn_durg_evaluation(df_knn_outputs, 3)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_vae_dist', 5)
roc, prc = knn_durg_evaluation(df_knn_outputs, 5)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_vae_dist', 7)
roc, prc = knn_durg_evaluation(df_knn_outputs, 7)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_vae_dist', 9)
roc, prc = knn_durg_evaluation(df_knn_outputs, 9)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'mu_vae_dist', 11)
roc, prc = knn_durg_evaluation(df_knn_outputs, 11)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

### RDKit Descriptors

In [None]:
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'rdkit_desc_dist', 3)
roc, prc = knn_durg_evaluation(df_knn_outputs, 3)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
# The same k must be used for the neighbour search and for normalising the
# soft votes; this cell previously evaluated with k=3 after searching with k=5.
# (AUROC/AUPRC depend only on the ranking, so the printed scores are not
# affected, but the stored 'prob' column was scaled by the wrong k.)
k = 5
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'rdkit_desc_dist', k)
roc, prc = knn_durg_evaluation(df_knn_outputs, k)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
# The same k must be used for the neighbour search and for normalising the
# soft votes; this cell previously evaluated with k=3 after searching with k=7.
# (AUROC/AUPRC depend only on the ranking, so the printed scores are not
# affected, but the stored 'prob' column was scaled by the wrong k.)
k = 7
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'rdkit_desc_dist', k)
roc, prc = knn_durg_evaluation(df_knn_outputs, k)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
# The same k must be used for the neighbour search and for normalising the
# soft votes; this cell previously evaluated with k=3 after searching with k=9.
# (AUROC/AUPRC depend only on the ranking, so the printed scores are not
# affected, but the stored 'prob' column was scaled by the wrong k.)
k = 9
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'rdkit_desc_dist', k)
roc, prc = knn_durg_evaluation(df_knn_outputs, k)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))

In [None]:
# The same k must be used for the neighbour search and for normalising the
# soft votes; this cell previously evaluated with k=3 after searching with k=11.
# (AUROC/AUPRC depend only on the ranking, so the printed scores are not
# affected, but the stored 'prob' column was scaled by the wrong k.)
k = 11
df_knn_outputs = knn_drug_outputs(drug_name_test, repoDB_merge_train, repoDB_merge_test,'rdkit_desc_dist', k)
roc, prc = knn_durg_evaluation(df_knn_outputs, k)
print ('ROC = %0.3f, PRC = %0.3f' %(roc, prc))