## Imports

In [None]:
# Mount Google Drive at /content/gdrive so the project folder can be read and written.
# `google.colab` is a Colab-specific module.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import os
from tqdm import tqdm
import re
import numpy as np

# Map drug names to DrugBank entries and CUIs

In [None]:
def default(map, key, default):
  """Look up `key` in `map`, falling back to `default` when the key is absent.

  The parameter names shadow the built-in `map` (and this function's own name
  inside the body); they are kept unchanged for existing callers.
  """
  return map[key] if key in map else default

def for_all(list, f):
  """Return True when `f(e)` is truthy for every element `e` of `list`.

  Evaluation stops at the first element for which `f` is falsy.
  An empty input yields True.
  """
  return all(f(e) for e in list)

In [None]:
# Root of the project folder on the mounted Google Drive.
# Most data paths read and written below are built by appending to this string.
deep_learning_dir = '/content/gdrive/My Drive/BMI 707 Project' 

In [None]:
# Load the pre-formatted training / testing / validation splits.
# NOTE: unpickling can execute arbitrary code, so these files must come from a trusted source.
train, test, val = (
  pd.read_pickle(deep_learning_dir + '/data_formatting/' + split_name + '_data.pickle')
  for split_name in ('training', 'testing', 'validation')
)

In [None]:
# Read the DrugBank mapping file (per the original note, it contains all drugs in DrugBank).
# Columns referenced in this notebook: drug_name, id, smiles, chembl, binding_db
mapping = pd.read_csv(deep_learning_dir + '/data/drugbank/drugbank_mappings.csv')
# Lower-case every name; this raises if a drug_name entry is not a string (e.g. a missing value)
mapping['drug_name'] = mapping['drug_name'].map(lambda name : name.lower())

In [None]:
# Drug-name normalisation, used so that names coming from different sources can be matched

# Salt / counter-ion words (e.g. the "hydrochloride" in "metformin hydrochloride").
# They are dropped from a name unless they are its first word. Stored lower-cased.
words_to_remove = {
  salt_word.lower()
  for salt_word in (
    "HYDROCHLORIDE", "BISULFATE", "ALUMINUM", "ARGININE", "BENZATHINE", "CALCIUM", "CHLOROPROCAINE",
    "CHOLINE", "DIETHANOLAMINE", "ETHANOLAMINE", "ETHYLENEDIAMINE", "LYSINE", "MAGNESIUM", "HISTIDINE",
    "LITHIUM", "MEGLUMINE", "POTASSIUM", "PROCAINE", "SODIUM", "TRIETHYLAMINE", "ZINC",
    "ACETATE", "ASPARTATE", "BENZENESULFONATE", "BENZOATE", "BESYLATE", "BICARBONATE", "BITARTRATE",
    "BROMIDE", "CAMSYLATE", "CARBONATE", "CHLORIDE", "CITRATE", "DECANOATE", "EDETATE",
    "ESYLATE", "FUMARATE", "GLUCEPTATE", "GLUCONATE", "GLUTAMATE", "GLYCOLATE", "HEXANOATE",
    "HYDROXYNAPHTHOATE", "IODIDE", "ISETHIONATE", "LACTATE", "LACTOBIONATE", "MALATE", "MALEATE",
    "MANDELATE", "MESYLATE", "METHYLSULFATE", "MUCATE", "NAPSYLATE", "NITRATE", "OCTANOATE",
    "OLEATE", "PAMOATE", "PANTOTHENATE", "PHOSPHATE", "POLYGALACTURONATE", "PROPIONATE", "SALICYLATE",
    "STEARATE", "ACETATE", "SUCCINATE", "SULFATE", "TARTRATE", "TEOCLATE", "TOSYLATE",
  )
}

non_alphanum_patt = re.compile(r"[^A-Za-z0-9]")
multi_space_patt = re.compile(r" +")

def clean_name(drug):
  """Normalise a drug name for matching.

  The name is lower-cased, every character outside A-Z / a-z / 0-9 is treated
  as a separator, and every word after the first that appears in
  `words_to_remove` is dropped. The surviving words are joined with single
  spaces. A name with no alphanumeric characters yields the empty string.
  """
  tokens = non_alphanum_patt.sub(' ', drug.lower()).split()
  if not tokens:
    return ''
  # The first word is always kept, even when it is itself a salt word
  kept = tokens[:1] + [token for token in tokens[1:] if token not in words_to_remove]
  return ' '.join(kept)

In [None]:
# Every distinct raw drug name appearing in any row of the three splits
unique_drug = {
  drug_name
  for trial_drugs in pd.concat([train, test, val])['drugs']
  for drug_name in trial_drugs
}
len(unique_drug)

4833

In [None]:
# Add drugbank data to the training, testing and validation sets

# Lookup table: cleaned DrugBank name -> (id, smiles, chembl, binding_db, cleaned name).
# When several mapping rows clean to the same name, the last one wins.
drugbank_dict = {}
for _row_label, record in mapping.iterrows():
  cleaned = clean_name(record['drug_name'].lower())
  drugbank_dict[cleaned] = (record['id'], record['smiles'], record['chembl'], record['binding_db'], cleaned)

# Debug tally: cleaned drug name that could not be matched -> number of occurrences
unmatched = {}

# Debug counters stored inside the lookup table itself:
# "DEBUG" counts matched drugs, "DEBUG2" counts rows with at least one match.
# clean_name lower-cases its result, so these upper-case keys do not collide with drug names.
drugbank_dict.update({"DEBUG": 0, "DEBUG2": 0})

# Cleaned drug names that must be rewritten before they can be found in drugbank
replacements = {
  "bi 10773": "empagliflozin",
  "rbv": "ribavirin",
  "bay59 7939": "rivaroxaban",
  "sof vel": "sofosbuvir",
  "insulin degludec insulin aspart": "insulin degludec",
  "cp 690 550": "tofacitinib",
  "ftc tdf": "tenofovir",
  "risedronate": "risedronic acid",
  "nktr 118": "naloxegol",
}

def map_drugbank_data(drugs):
  """Map each drug name of one row to a 5-tuple of drugbank information.

  For every name, in order:
    1. clean it with `clean_name` and apply `replacements`;
    2. if the cleaned name contains 'placebo', emit a tuple of five 'placebo';
    3. else use the `drugbank_dict` entry for the whole cleaned name;
    4. else use the entry of the first space-separated word found in `drugbank_dict`;
    5. else emit a tuple of five 'none' and tally the name in `unmatched`.

  Side effects: increments drugbank_dict["DEBUG"] once per matched drug
  (placebos included) and drugbank_dict["DEBUG2"] once if any drug matched.

  NOTE(review): step 2 also fires for names such as "rosiglitazone or placebo",
  so the active drug in such a name is recorded as placebo.

  Returns a list with one tuple per input name.
  """
  placebo_row = ('placebo', 'placebo', 'placebo', 'placebo', 'placebo')  # all columns will contain the value placebo
  none_row = ('none', 'none', 'none', 'none', 'none')

  any_hit = False
  rows = []
  for raw_name in drugs:
    name = clean_name(raw_name)
    name = replacements.get(name, name)

    # Placebos are a special case
    if 'placebo' in name:
      drugbank_dict["DEBUG"] += 1
      any_hit = True
      rows.append(placebo_row)
      continue

    # Exact match on the whole cleaned name
    if name in drugbank_dict:
      drugbank_dict["DEBUG"] += 1
      any_hit = True
      rows.append(drugbank_dict[name])
      continue

    # Fall back to the first individual word that is a known drug
    row = none_row
    for word in name.split(' '):
      if word in drugbank_dict:
        drugbank_dict["DEBUG"] += 1
        any_hit = True
        row = drugbank_dict[word]
        break
    rows.append(row)

    if row[0] == "none":
      unmatched[name] = unmatched.get(name, 0) + 1

  if any_hit:
    drugbank_dict["DEBUG2"] += 1
  return rows

def add_drugbank_data(df, name_of_df):
  """Add the drugbank columns to `df` in place and print the match rate.

  Adds the list-valued columns 'drugbank_id', 'smiles', 'chembl', 'binding_db'
  and 'clean_name', each aligned element by element with the 'drugs' column.
  `name_of_df` is only used as a label in the printed message.
  """
  # Reset the per-drug match counter so the printed percentage is for this frame only
  drugbank_dict["DEBUG"] = 0

  # One list of 5-tuples per row
  df['drugbank_data'] = df['drugs'].map(map_drugbank_data)

  # Split the tuples into one column per field for readability
  field_columns = ('drugbank_id', 'smiles', 'chembl', 'binding_db', 'clean_name')
  for position, column in enumerate(field_columns):
    df[column] = df['drugbank_data'].map(
      lambda rows, position=position : [row[position] for row in rows]
    )

  # The combined column is no longer needed once it has been split
  del df['drugbank_data']

  total_drugs = df['drugs'].map(len).sum()
  print(name_of_df + " % of drugs mapped: " + str(drugbank_dict["DEBUG"]/total_drugs*100))

# Annotate each split in place; the label only appears in the printed summary
for split_df, split_label in ((train, "training"), (test, "testing"), (val, "validation")):
  add_drugbank_data(split_df, split_label)

training % of drugs mapped: 83.38505301267132
testing % of drugs mapped: 78.1437125748503
validation % of drugs mapped: 86.17021276595744


In [None]:
# Count the number of unique smiles we have matched
smiles_set = set()

def add_to_smiles_set(row):
  """Add the real SMILES strings of one row to the global `smiles_set`.

  `row.smiles` is the list built by `add_drugbank_data`. Its placeholder
  entries 'none' (no SMILES available) and 'placebo' are not SMILES strings,
  so they are skipped; otherwise they would inflate the count of matched SMILES.
  """
  placeholders = ('none', 'placebo')
  for smile in row.smiles:
    if smile not in placeholders:
      smiles_set.add(smile)

# Feed every row of every split through the collector, then show the set size
for split_df in (train, test, val):
  split_df.apply(add_to_smiles_set, axis=1)

len(smiles_set)

881

In [None]:
# Debugging aid: the ten cleaned drug names that most often failed to match
unmatched_list = sorted(unmatched.items(), key=lambda item : int(item[1]), reverse=True)
unmatched_list[0:10]

[('epoetin alfa', 15),
 ('chemotherapy', 12),
 ('corticosteroids', 9),
 ('qva149', 9),
 ('aspirin', 9),
 ('ly2189265', 9),
 ('tak 438', 9),
 ('bay43 9006', 8),
 ('vi 0521', 8),
 ('nva237', 8)]

In [None]:
# Persist the three splits, which now carry the drugbank columns
for split_df, split_name in ((train, 'training'), (test, 'testing'), (val, 'validation')):
  split_df.to_pickle(deep_learning_dir + '/data_formatting/drug_drugbank/' + split_name + '_data_drugbank.pickle')

In [None]:
# Reload the drugbank-annotated splits written by the previous cell
train, test, val = (
  pd.read_pickle(deep_learning_dir + '/data_formatting/drug_drugbank/' + split_name + '_data_drugbank.pickle')
  for split_name in ('training', 'testing', 'validation')
)

# Term dictionary (pipe-separated; the STR and CUI columns are used below) and cui2vec vectors
dictionary = pd.read_csv(deep_learning_dir + '/data/mapping/dictionary.csv', sep='|')
cui2vec = pd.read_csv(deep_learning_dir + '/data/mapping/cui2vec_pretrained.csv')

In [None]:
# Map from cleaned dictionary term to CUI.
# Terms are stringified before cleaning; when several terms clean to the same
# key, the CUI of the last one is kept.
keys = [clean_name(str(term)) for term in dictionary['STR']]
cui_map = dict(zip(keys, dictionary['CUI']))

In [None]:
# Add CUIs to the datasets

mapped_cuis = set()    # every CUI that was assigned to at least one drug
unmapped_cuis = set()  # NOTE(review): never filled in the cells shown here
counter = [0]          # number of get_cuis calls in which no drug received a CUI

def get_cuis(drug_names):
  """Return one CUI (or the string 'none') per name in `drug_names`.

  Each name is cleaned with `clean_name` and passed through `replacements`.
  A name containing 'placebo' gets cui_map['placebo'] (KeyError if that entry
  is missing). Otherwise the whole cleaned name is looked up in `cui_map`,
  then its space-separated words in order; the first hit is used.

  Side effects: every assigned CUI is added to `mapped_cuis`, and `counter[0]`
  is incremented when all results are 'none' (an empty input counts too).

  NOTE(review): the callers below pass the 'clean_name' column, where
  unmatched drugs are the literal 'none'; if the dictionary contains that term
  it would receive a real CUI — confirm this is intended.
  """
  cuis = []
  for raw_name in drug_names:
    name = clean_name(raw_name)
    name = replacements.get(name, name)

    # Placebos are a special case
    if 'placebo' in name:
      placebo_cui = cui_map['placebo']
      cuis.append(placebo_cui)
      mapped_cuis.add(placebo_cui)
      continue

    cui = 'none'
    if name in cui_map:
      # Exact match on the whole cleaned name
      cui = cui_map[name]
      mapped_cuis.add(cui)
    else:
      # Fall back to the first individual word with a known CUI
      for word in name.split(' '):
        if word in cui_map:
          cui = cui_map[word]
          mapped_cuis.add(cui)
          break
    cuis.append(cui)

  if all(cui == 'none' for cui in cuis):
    counter[0] += 1
  return cuis

# One list of CUIs per row, derived from the already-cleaned drug names
for split_df in (train, test, val):
  split_df['cuis'] = split_df['clean_name'].map(get_cuis)

In [None]:
# Number of get_cuis calls (one per row, across the three splits) in which no drug received a CUI
counter[0]

333

In [None]:
# Number of distinct CUIs assigned across the three splits
len(mapped_cuis)

1043

In [None]:
# Save the assigned CUIs.
# NOTE: mapped_cuis is a set, so the row order of the CSV is not guaranteed to be the same between runs.
pd.DataFrame(mapped_cuis).to_csv(deep_learning_dir + '/data_formatting/drug_cui2vec_data/mapped_cuis.csv')

In [None]:
# Lookup table: CUI (first column of cui2vec) -> Series holding the remaining columns of that row.
# Positions are taken with .iloc: plain integer keys on a label-indexed Series
# (row[0]) are deprecated in pandas and are slated to become label lookups.
cui2vec_map = {}
for _row_label, cui2vec_row in cui2vec.iterrows():
  cui2vec_map[cui2vec_row.iloc[0]] = cui2vec_row.iloc[1:]

In [None]:
# For each row, one cui2vec vector per CUI; an empty list marks a CUI without a vector
for split_df in (train, test, val):
  split_df['cui_vectors'] = split_df['cuis'].map(
    lambda cuis : [default(cui2vec_map, cui, []) for cui in cuis]
  )

In [None]:
# Reset the shared counter.
# NOTE(review): nothing in the cells shown after this point increments or reads it.
counter[0] = 0
def pick_one_cui(cui_vectors):
  """Return the first non-empty vector in `cui_vectors`, or [] when there is none."""
  return next((vector for vector in cui_vectors if len(vector) != 0), [])

# Keep a single representative vector per row: the first drug that has one
for split_df in (train, test, val):
  split_df['primary_cui_vector'] = split_df['cui_vectors'].map(pick_one_cui)

In [None]:
# Persist the three splits, which now carry the CUI columns
for split_df, split_name in ((train, 'training'), (test, 'testing'), (val, 'validation')):
  split_df.to_pickle(deep_learning_dir + '/data_formatting/drug_cui2vec_data/' + split_name + '_data_cui.pickle')

In [None]:
# Percentage of rows in each split (train, test, val) left without any cui2vec vector
for split_df in (train, test, val):
  has_no_vector = split_df['primary_cui_vector'].str.len() == 0
  print(len(split_df[has_no_vector])/len(split_df)*100)

42.82482223658694
58.90052356020943
43.895348837209305


In [None]:
# Fraction of training rows in which every entry of the 'smiles' list is the literal "none"
train['smiles'].map(lambda smiles : for_all(smiles, lambda smile : smile == "none")).sum()/len(train)

0.11667744020685197

In [None]:
# Start again from an empty set of 'smiles' values
smiles_set = set()

def add_to_smiles_set(row):
  """Add the 'smiles' entry of every drug in `row` to the global `smiles_set`.

  Entries are read by position, one per element of `row.drugs`.
  NOTE(review): placeholder entries such as 'none' and 'placebo' are added
  as well, so the set is not limited to real SMILES strings.
  """
  for position, _drug in enumerate(row.drugs):
    smiles_set.add(row.smiles[position])

# Fill smiles_set from all three splits.
# add_to_smiles_set returns None, so the last call, being the final expression
# of the cell, displays a Series of None.
train.apply(add_to_smiles_set, axis=1)
test.apply(add_to_smiles_set, axis=1)
val.apply(add_to_smiles_set, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
339    None
340    None
341    None
342    None
343    None
Length: 344, dtype: object

In [None]:
# One-column frame of the collected 'smiles' values.
# NOTE: built from a set, so the row order is not guaranteed to be the same between runs.
unique_smiles = pd.DataFrame(smiles_set, columns=["smiles"])

In [None]:
# Display the collected 'smiles' values
unique_smiles

Unnamed: 0,smiles
0,CCCOC(C(=O)OC1CCN(C)CC1)(C1=CC=CC=C1)C1=CC=CC=C1
1,NCCNC1=CC=C(NCCN)C2=C1C(=O)C1=C(C=NC=C1)C2=O
2,CC1=C(OCC(F)(F)F)C=CN=C1CS(=O)C1=NC2=CC=CC=C2N1
3,CCCCC1=NC2(CCCC2)C(=O)N1CC1=CC=C(C=C1)C1=CC=CC...
4,[H][C@]12[C@@H](C)C(S[C@]3([H])CN[C@H](CNS(N)(...
...,...
876,COC1=C(C=C(Cl)C=C1)C(=O)NCCC1=CC=C(C=C1)S(=O)(...
877,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=...
878,[H][C@@]12OC3=C(O)C=CC4=C3[C@@]11CCN(C)[C@]([H...
879,CC1=CC2=C(C=C1C(=C)C1=CC=C(C=C1)C(O)=O)C(C)(C)...


In [None]:
# Inspect up to 50 training rows whose 'smiles' entries are all the literal "none"
train[train['smiles'].map(lambda smiles : for_all(smiles, lambda smile : smile == "none"))].head(50)

Unnamed: 0,nctid,n_participants,drugs,diseases,icdcodes,criteria,label,drugbank_id,smiles,chembl,binding_db,clean_name,cuis,cui_vectors,primary_cui_vector
1,NCT01626859,152.0,"[mp-214 low dose, mp-214 middle dose, mp-214 h...",[schizophrenia],"[F20.0, F20.1, F20.2, F20.3, F20.5, F20.89, F2...",\n Inclusion Criteria:\n\n - ...,1,"[none, none, none]","[none, none, none]","[none, none, none]","[none, none, none]","[none, none, none]","[none, none, none]","[[], [], []]",[]
7,NCT00605293,578.0,"[methoxy polyethylene glycol-epoetin beta, epo...",[anemia],"[D53.2, D64.9, D46.4, D53.0, D53.9, D61.3, D61.9]",\n Inclusion Criteria:\n\n - ...,1,"[DB09107, none]","[none, none]","[none, none]","[none, none]","[methoxy polyethylene glycol epoetin beta, none]","[C1328071, none]","[[], []]",[]
8,NCT00331864,4189.0,[ranibizumab],"[age related macular degeneration, choroidal n...","[H35.3130, H35.3230, H35.3110, H35.3120, H35.3...",\n Patients who participated in this st...,1,[DB01270],[none],[CHEMBL1201825],[none],[ranibizumab],[C1566537],"[[-0.0198971997276426, 0.0166968261538703, -8....",V1 -0.019897 V2 0.016697 V3 ...
11,NCT00124982,27188.0,"[abatacept, non-biologic disease modifying ant...",[rheumatoid arthritis],"[M06.9, M05.9, M06.08, M06.00, M06.011, M06.01...",\n Inclusion Criteria:\n\n - ...,1,"[DB01281, none, none]","[none, none, none]","[CHEMBL1201823, none, none]","[none, none, none]","[abatacept, none, none]","[C1619966, none, none]","[[-0.0140159579976783, 0.0032744963090471, -2....",V1 -0.014016 V2 0.003274 V3 ...
20,NCT00437112,,"[human insulin inhalation powder, insulin glar...","[diabetes mellitus, type 2]","[E11.65, E11.9, E11.21, E11.36, E11.41, E11.42...",\n Inclusion Criteria:\n\n - ...,0,"[none, DB00047]","[none, none]","[none, CHEMBL1201497]","[none, none]","[none, insulin glargine]","[none, C0907402]","[[], [-0.0088522591226082, 0.0038537407826502,...",V1 -0.008852 V2 0.003854 V3 ...
51,NCT00316719,928.0,"[lam group, adv group]",[chronic hepatitis b],"[B18.0, B18.1, B18.2, B18.8, B18.9]",\n Inclusion criteria:\n\n - ...,1,"[none, none]","[none, none]","[none, none]","[none, none]","[none, none]","[none, none]","[[], []]",[]
63,NCT00311402,12910.0,[aggrenox capsule],[cerebrovascular accident],"[A52.05, I67.81, I67.89, I67.9, I67.841, I67.8...",\n Inclusion Criteria:\n\n Patie...,0,[none],[none],[none],[none],[none],[none],[[]],[]
73,NCT00221845,,"[ace inhibition, intensified blood pressure co...","[children, chronic renal failure, hypertension...","[Y93.6A, Y92.110, Y92.111, Y92.112, Y92.113, Y...",\n Inclusion Criteria:\n\n - ...,1,"[none, none, none]","[none, none, none]","[none, none, none]","[none, none, none]","[none, none, none]","[none, none, none]","[[], [], []]",[]
75,NCT00206089,,[exanta],[thromboembolism],"[O88.22, O88.23, O88.211, O88.212, O88.213, O8...",\n Inclusion Criteria:\n\n - ...,0,[none],[none],[none],[none],[none],[none],[[]],[]
77,NCT00250965,,[intravenous magnesium],"[coronary artery disease, valvular heart disease]","[I25.10, I25.110, I25.119, I25.111, I25.118]",\n Inclusion Criteria:\n\n - ...,0,[none],[none],[none],[none],[none],[none],[[]],[]


In [None]:
# Debug code to figure out which unique tuples were matched.
# Stack the three splits into one frame; the original row labels are kept, so they may repeat.
all_data = pd.concat([train, test, val])

def extract_info(r):
  """Return one (drug, drugbank_id, smiles, chembl, binding_db) tuple per drug of row `r`.

  Raises AssertionError when `r.drugs` and `r.drugbank_id` differ in length.
  """
  n_drugs = len(r.drugs)
  assert n_drugs == len(r.drugbank_id), "%d not equal to %d" % (n_drugs, len(r.drugbank_id))
  return [
    (r.drugs[i], r.drugbank_id[i], r.smiles[i], r.chembl[i], r.binding_db[i])
    for i in range(n_drugs)
  ]

def flatten(lists):
  """Concatenate the elements of every iterable in `lists` into one new list."""
  return [item for sublist in lists for item in sublist]

# Collect the per-drug tuples of every row, de-duplicate them with a set, and pickle the result.
# NOTE: the set makes the row order of the saved frame not guaranteed to be the same between runs.
pd.DataFrame(set(flatten(all_data.apply(extract_info, axis=1))), columns=["drug", "drugbank_id", "smiles", "chembl", "binding_db"]).to_pickle(deep_learning_dir + '/AChander_Targets/unique_drug_tuple.pickle')

In [None]:
# Load the SMILES embedding lookup.
# NOTE(review): it is used below via `key in ...` and `...[key]`, so it is
# presumably a dict keyed by SMILES string — confirm.
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source.
smiles_embedding = pd.read_pickle(deep_learning_dir + '/embeddings/SMILES_embedding.pkl')

In [None]:
# For each row, one embedding per 'smiles' entry; an empty list marks an entry without an embedding
for split_df in (train, test, val):
  split_df['embeddings'] = split_df['smiles'].map(
    lambda smiles : [default(smiles_embedding, smile, []) for smile in smiles]
  )

In [None]:
def pick_one_embedding(embeddings):
  """Return the first non-empty embedding in `embeddings`, or [] when there is none."""
  return next((embedding for embedding in embeddings if len(embedding) != 0), [])

# Keep a single representative embedding per row: the first drug that has one
for split_df in (train, test, val):
  split_df['embedding'] = split_df['embeddings'].map(pick_one_embedding)

In [None]:
def pick_one_embedding(embeddings):
  """Return the position of the first non-empty embedding, or -1 when there is none.

  NOTE: this rebinds the name `pick_one_embedding`, which an earlier cell
  defines to return the embedding itself rather than its position.
  """
  return next(
    (position for position, embedding in enumerate(embeddings) if len(embedding) != 0),
    -1,
  )

# Position (within the row's drug list) of the drug whose embedding was picked, or -1
for split_df in (train, test, val):
  split_df['embedding_id'] = split_df['embeddings'].map(pick_one_embedding)

In [None]:
# List the columns accumulated on the training split so far
train.columns

Index(['nctid', 'n_participants', 'drugs', 'diseases', 'icdcodes', 'criteria',
       'label', 'drugbank_id', 'smiles', 'chembl', 'binding_db', 'clean_name',
       'cuis', 'cui_vectors', 'primary_cui_vector', 'embeddings', 'embedding',
       'embedding_id'],
      dtype='object')

In [None]:
ctid_set = set()       # nctids that already have an entry in ctid_embeddings
ctid_embeddings = []   # (nctid, embedding, drug name) tuples, one per distinct nctid

def add_to_ctid_embeddings(row):
  """Record one (nctid, embedding, drug) tuple for `row`, unless its nctid was already seen.

  When `row.embedding_id` is -1 the embedding is a zero vector of length 1024
  and the drug is the string "none"; otherwise both are read from the row at
  that position.
  NOTE(review): 1024 presumably matches the size of the SMILES embeddings — confirm.
  """
  trial_id = row.nctid
  if trial_id in ctid_set:
    return

  chosen = row.embedding_id
  if chosen == -1:
    record = (trial_id, np.zeros(1024), "none")
  else:
    record = (trial_id, row.embeddings[chosen], row.drugs[chosen])

  ctid_embeddings.append(record)
  ctid_set.add(trial_id)

# Visit every row of every split; the first row seen for an nctid decides its entry
for split_df in (train, test, val):
  split_df.apply(add_to_ctid_embeddings, axis=1)

# One row per distinct nctid
ctid_embeddings_df = pd.DataFrame(ctid_embeddings, columns=["nctid", "embedding", "drug"])

In [None]:
# Display the training split with all columns added so far
train

Unnamed: 0,nctid,n_participants,drugs,diseases,icdcodes,criteria,label,drugbank_id,smiles,chembl,binding_db,clean_name,cuis,cui_vectors,primary_cui_vector,embeddings,embedding,embedding_id
0,NCT00475085,944.0,"[aprepitant, dexamethasone, granisetron hydroc...",[nausea],"[R11.0, R11.11, R11.2]",\n Inclusion criteria:\n\n - ...,1,"[DB00673, DB14649, DB00889, DB00377, DB00433, ...",[C[C@@H](O[C@H]1OCCN(CC2=NNC(=O)N2)[C@H]1C1=CC...,"[CHEMBL1471, CHEMBL1530428, CHEMBL1290003, CHE...","[50220136, 50103620, 50443668, 50417287, 78434...","[aprepitant, dexamethasone, granisetron, palon...","[C1176306, C2930043, C0543476, C1310734, C0770...","[[-0.0133983809219361, 0.0038140331326222, -3....",V1 -0.013398 V2 0.003814 V3 ...,"[[13.561273574829102, -13.577717781066895, 0.9...","[13.561273574829102, -13.577717781066895, 0.95...",0
1,NCT01626859,152.0,"[mp-214 low dose, mp-214 middle dose, mp-214 h...",[schizophrenia],"[F20.0, F20.1, F20.2, F20.3, F20.5, F20.89, F2...",\n Inclusion Criteria:\n\n - ...,1,"[none, none, none]","[none, none, none]","[none, none, none]","[none, none, none]","[none, none, none]","[none, none, none]","[[], [], []]",[],"[[], [], []]",[],-1
2,NCT00203957,,"[istradefylline, istradefylline]",[parkinsons disease],[G20],\n Inclusion Criteria:\n\n - ...,1,"[DB11757, DB11757]",[[H]\C(=C(\[H])C1=CC(OC)=C(OC)C=C1)C1=NC2=C(N1...,"[CHEMBL431770, CHEMBL431770]","[50176050, 50176050]","[istradefylline, istradefylline]","[C0673470, C0673470]","[[], []]",[],"[[], []]",[],-1
3,NCT00169832,,[rosiglitazone or placebo],"[diabetes, coronary artery bypass grafting]","[E23.2, N25.1, P70.2, O24.92, Z83.3, Z86.32, E...",\n Inclusion Criteria:\n\n AT SC...,0,[placebo],[placebo],[placebo],[placebo],[placebo],[C1706408],[[]],[],[[]],[],-1
4,NCT01249352,,"[nimotuzumab, cisplatin, fluorouracil]","[esophageal cancer, adenocarcinoma]","[K22.2, K22.81, Q39.4, P78.83, I85.00, I85.01,...",\n Inclusion Criteria:\n\n 1. ...,1,"[DB06192, DB00515, DB00544]","[none, [H][N]([H])([H])[Pt](Cl)(Cl)[N]([H])([H...","[none, CHEMBL2068237, CHEMBL185]","[none, 50028111, 50340677]","[nimotuzumab, cisplatin, fluorouracil]","[C1570308, C0008838, C2711401]","[[], [-0.0160435887106513, 0.0074711445684327,...",V1 -0.016044 V2 0.007471 V3 ...,"[[], [], [4.89539098739624, 3.768472671508789,...","[4.89539098739624, 3.768472671508789, 4.447010...",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3089,NCT01015118,12294.0,"[placebo, paclitaxel, bibf 1120, carboplatin, ...","[ovarian neoplasms, peritoneal neoplasms]","[C05.2, C10.0, C16.0, C16.4, C17.0, C17.1, C17...",\n Inclusion criteria:\n\n - ...,1,"[placebo, DB01229, none, DB00958, DB01229, DB0...","[placebo, [H][C@]12[C@H](OC(=O)C3=CC=CC=C3)[C@...","[placebo, CHEMBL428647, none, CHEMBL1351, CHEM...","[placebo, 50001839, none, none, 50001839, none]","[placebo, paclitaxel, none, carboplatin, pacli...","[C1706408, C0144576, none, C0079083, C0144576,...","[[], [-0.0152721016686416, 0.0059331896906342,...",V1 -0.015272 V2 0.005933 V3 ...,"[[], [], [], [], [], []]",[],-1
3090,NCT01127217,,"[amlodipine/losartan, amlodipine]",[hypertension],"[I15.0, I97.3, K76.6, P29.2, G93.2, H40.053, I10]",\n Inclusion Criteria:\n\n - ...,1,"[DB00381, DB00381]",[CCOC(=O)C1=C(COCCN)NC(C)=C(C1C1=CC=CC=C1Cl)C(...,"[CHEMBL1491, CHEMBL1491]","[50088383, 50088383]","[amlodipine, amlodipine]","[C5195719, C5195719]","[[], []]",[],"[[5.37669563293457, -5.854226589202881, -4.580...","[5.37669563293457, -5.854226589202881, -4.5800...",0
3091,NCT01187953,1086.0,"[prograf (tacrolimus), lcp-tacro]",[renal failure],"[P96.0, O03.32, O04.82, O08.4, O03.82, O07.32,...",\n Inclusion Criteria:\n\n 1. ...,1,"[DB00864, none]",[CO[C@@H]1C[C@@H](CC[C@H]1O)\C=C(/C)[C@H]1OC(=...,"[CHEMBL269732, none]","[50030448, none]","[tacrolimus, none]","[C0519826, none]","[[], []]",[],"[[8.613880157470703, -9.339082717895508, 19.17...","[8.613880157470703, -9.339082717895508, 19.174...",0
3092,NCT01364649,1186.0,"[vortioxetine, escitalopram, placebo]",[treatment outcome],"[Z01.12, Z92.89, Z75.2, M27.59, Z53.9, Z91.19,...",\n Inclusion Criteria:\n\n 1. ...,1,"[DB09068, DB01175, placebo]","[CC1=CC=C(SC2=CC=CC=C2N2CCNCC2)C(C)=C1, CN(C)C...","[CHEMBL2104993, CHEMBL1508, placebo]","[50400902, 50302225, placebo]","[vortioxetine, escitalopram, placebo]","[C3661282, C1099456, C1706408]","[[], [-0.0147954572699932, 0.003364188566606, ...",V1 -0.014795 V2 0.003364 V3 ...,"[[-3.836270332336426, -12.949006080627441, 3.9...","[-3.836270332336426, -12.949006080627441, 3.97...",0


In [None]:
# Display the per-nctid embedding table
ctid_embeddings_df

Unnamed: 0,nctid,embedding,drug
0,NCT00475085,"[13.561273574829102, -13.577717781066895, 0.95...",aprepitant
1,NCT01626859,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",none
2,NCT00203957,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",none
3,NCT00169832,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",none
4,NCT01249352,"[4.89539098739624, 3.768472671508789, 4.447010...",fluorouracil
...,...,...,...
4579,NCT00679484,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",none
4580,NCT01057407,"[-3.7375659942626953, -3.272908926010132, 4.68...",sevelamer hydrochloride
4581,NCT01126580,"[9.013765335083008, -4.713770866394043, 15.139...",metformin
4582,NCT00423813,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",none


In [None]:
# Spot-check 50 random rows; no random_state is passed, so each run shows different rows
ctid_embeddings_df.sample(50)

Unnamed: 0,nctid,embedding,drug
761,NCT00174720,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",none
3383,NCT02220894,"[14.387839317321777, -10.28243350982666, 1.606...",pemetrexed
1451,NCT00113386,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",none
2678,NCT00002597,"[10.195808410644531, -9.866142272949219, 0.332...",flutamide
3475,NCT02292446,"[23.38435935974121, -0.7330405712127686, 11.95...",ruxolitinib
2898,NCT01768286,"[3.361865282058716, 0.7817785739898682, 10.766...",rbv
1653,NCT01049334,"[6.104618072509766, -5.057725429534912, -14.15...",flurbiprofen
3307,NCT02149121,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",none
3569,NCT02388906,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",none
1292,NCT01348243,"[-2.2266159057617188, -5.000643253326416, 5.43...",disodium clodronate 200 mg/4 ml with 1% lidocaine


In [None]:
# Persist the per-nctid table (columns: nctid, embedding, drug)
ctid_embeddings_df.to_pickle(deep_learning_dir + '/embeddings/drug_embeddings.pickle')

In [None]:
# Plain dict view of the table: nctid -> embedding
ctid_embeddings_dict = {
  nctid: embedding
  for nctid, embedding in zip(ctid_embeddings_df["nctid"], ctid_embeddings_df["embedding"])
}

In [None]:
import pickle

In [None]:
# Write the nctid -> embedding dict to the current working directory.
# NOTE(review): despite the file name, the values are embeddings, not drug names.
with open("nctid2drugs.pkl", 'wb') as handle:
    pickle.dump(ctid_embeddings_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Move the pickle from the working directory into the project's embeddings folder on Drive.
# NOTE: this destination is hard-coded rather than derived from deep_learning_dir.
!mv nctid2drugs.pkl "/content/gdrive/My Drive/BMI 707 Project/embeddings/"