In [None]:
# Install jellyfish, the string-comparison package whose Damerau-Levenshtein distance is used further down for fuzzy disease-name matching
!pip install jellyfish

Collecting jellyfish
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
[?25l[K     |██▌                             | 10 kB 24.0 MB/s eta 0:00:01[K     |█████                           | 20 kB 28.3 MB/s eta 0:00:01[K     |███████▍                        | 30 kB 11.7 MB/s eta 0:00:01[K     |█████████▉                      | 40 kB 9.0 MB/s eta 0:00:01[K     |████████████▍                   | 51 kB 6.5 MB/s eta 0:00:01[K     |██████████████▉                 | 61 kB 7.6 MB/s eta 0:00:01[K     |█████████████████▎              | 71 kB 7.8 MB/s eta 0:00:01[K     |███████████████████▊            | 81 kB 6.7 MB/s eta 0:00:01[K     |██████████████████████▎         | 92 kB 7.3 MB/s eta 0:00:01[K     |████████████████████████▊       | 102 kB 7.9 MB/s eta 0:00:01[K     |███████████████████████████▏    | 112 kB 7.9 MB/s eta 0:00:01[K     |█████████████████████████████▋  | 122 kB 7.9 MB/s eta 0:00:01[K     |████████████████████████████████| 132 kB 7.9 MB/s 
[?25hBuilding wheel

In [None]:
import ast
import pickle

import jellyfish
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/gdrive; every data path used below lives under this mount point
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# root of the project folder on the mounted Drive
deep_learning_dir = "/content/gdrive/My Drive/BMI 707 Project"

# sub-folder (appended to the project root) that holds the CUI mapping files
cui_path = "/data/mapping"

# cui_str is a pipe-separated table; the columns read below are "CUI" and "STR" (the concept as a string)
dictionary_file = deep_learning_dir + cui_path + "/dictionary.csv"
cui_str = pd.read_csv(dictionary_file, sep="|")

# concept string -> CUI; when a concept string appears on several rows the last row wins
str_to_cui = {concept: cui for concept, cui in zip(cui_str["STR"], cui_str["CUI"])}

In [None]:
# sub-folder (appended to the project root) that holds the formatted data splits
data_path = "/data_formatting"

# the three splits are tab-separated files read with identical settings
train, val, test = (
    pd.read_csv(deep_learning_dir + data_path + split_file, sep="\t")
    for split_file in ("/training_data.tsv", "/validation_data.tsv", "/testing_data.tsv")
)

In [None]:
def clean_column(col):
  """
  Turn a column of stringified lists back into real lists of strings.

  After reading in the files, the disease cells are strings instead of lists
  (e.g. "['asthma', 'copd']").

  Each cell is first parsed as a Python literal. This keeps names that contain
  an apostrophe or a comma intact and turns "[]" into an empty list. A cell
  that is not a well-formed list literal falls back to the previous behaviour:
  remove "[", "]" and "'" and split on ", ".

  Parameters
  ----------
  col : iterable of str
      The raw cells, one string per row.

  Returns
  -------
  list of list of str
      One list of disease names per input cell, in input order.
  """
  cleaned = []

  for cell in col:
    try:
      parsed = ast.literal_eval(cell)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
      # not a valid Python literal; handled by the legacy branch below
      parsed = None

    if isinstance(parsed, (list, tuple)):
      cleaned.append([str(item) for item in parsed])
    else:
      # legacy parsing for cells that are not a well-formed list literal
      stripped = cell.replace("[", '').replace("]", '').replace("'", '')
      cleaned.append(stripped.split(", "))

  return cleaned


def flatten_lol(l):
  """
  Concatenate the items of every sub-list of `l` into one flat list.

  Only one level of nesting is removed, and the order of the items is kept.
  """
  flat = []

  for sublist in l:
    flat.extend(sublist)

  return flat

In [None]:
# parse the "diseases" column of each split into one list of disease names per row
diseases_train, diseases_val, diseases_test = (
    clean_column(split["diseases"]) for split in (train, val, test)
)

In [None]:
# all diseases in the dataset (repeats included)
diseases = flatten_lol(diseases_train) + flatten_lol(diseases_val) + flatten_lol(diseases_test)

# hand-curation: entries that were picked by hand to be left out of the CUI matching
excluded_terms = {
    "type 2", "high-risk", "first-line", "oral", "major", "breast", "immediate", "nontuberculous",
    "partial", "waldenstroms", "painful", "inborn", "local", "chronic", "specific", "left", "depression",
    "high", "human", "sudden", "mixed", "perennial", "postherpetic", "functional", "prevention", "safety",
}

# de-duplicate and drop the excluded entries; sorting keeps the order (and therefore
# the order of the matching log further down) identical across kernel restarts,
# which the iteration order of a set of strings does not guarantee
diseases = sorted(set(diseases) - excluded_terms)

In [None]:
def d_to_cuis(disease):
  """
  Find the CUI(s) of a given disease.

  Lookup order:
    1. The whole disease string, if it is a key of `str_to_cui`.
    2. Otherwise the disease is split into words (on spaces, then on hyphens)
       and every word that is a key of `str_to_cui` contributes its CUI.
    3. If no word matches either, the concept string closest to the whole
       disease string is used, where distance is the Damerau-Levenshtein
       string distance. On a tie, the concept that comes first in
       `str_to_cui` is chosen.

  Steps 1 and 2 are silent; step 3 prints the unmatched disease and the
  concept string that was picked for it.

  Parameters
  ----------
  disease : str
      Disease name to resolve.

  Returns
  -------
  list
      The matched CUIs: a single entry for steps 1 and 3, one entry per
      matching word for step 2.
  """
  if disease in str_to_cui:
    return [str_to_cui[disease]]

  # when the disease is not found in the str_to_cui dictionary
  words = flatten_lol([chunk.split("-") for chunk in disease.split(" ")])
  cuis = [str_to_cui[w] for w in words if w in str_to_cui]

  if len(cuis) == 0:

    print(f"There was no CUI match for the words in disease {disease}.\n")

    # one pass over all concept strings, keeping the first one with the smallest distance;
    # str() is kept from the original code, presumably because some keys are not strings
    closest = min(
        str_to_cui,
        key=lambda concept: jellyfish.damerau_levenshtein_distance(disease, str(concept)),
    )

    print(f"The closest string we could find was {closest}. \n")

    cuis = [str_to_cui[closest]]

  return cuis

In [None]:
# match all diseases to a CUI: disease name -> list of CUIs (tqdm shows progress over the vocabulary)
diseases_to_cuis = {name: d_to_cuis(name) for name in tqdm(diseases)}

  0%|          | 0/1586 [00:00<?, ?it/s]

There was no CUI match for the words in disease chronic obstructive.



  1%|          | 10/1586 [00:24<1:05:09,  2.48s/it]

The closest string we could find was chronic obstruction. 

There was no CUI match for the words in disease rheumatoid.



  1%|          | 11/1586 [00:41<1:50:41,  4.22s/it]

The closest string we could find was rheumatism. 

There was no CUI match for the words in disease combined.



  2%|▏         | 36/1586 [00:54<30:56,  1.20s/it]  

The closest string we could find was combines. 

There was no CUI match for the words in disease psychological.



  5%|▌         | 86/1586 [01:12<15:21,  1.63it/s]

The closest string we could find was psychologic. 

There was no CUI match for the words in disease primary open angle.



 18%|█▊        | 293/1586 [01:37<04:38,  4.64it/s]

The closest string we could find was wide open angle. 

There was no CUI match for the words in disease obstructive.



 19%|█▉        | 308/1586 [01:53<06:01,  3.54it/s]

The closest string we could find was obstruction. 

There was no CUI match for the words in disease neurogenic.



 31%|███       | 489/1586 [02:08<03:05,  5.91it/s]

The closest string we could find was neurogenia. 

There was no CUI match for the words in disease adenosquamous.



 38%|███▊      | 598/1586 [02:26<02:46,  5.92it/s]

The closest string we could find was adenothamnus. 

There was no CUI match for the words in disease protocol specific.



 38%|███▊      | 603/1586 [02:49<04:15,  3.84it/s]

The closest string we could find was protocol specialist. 

There was no CUI match for the words in disease neurocognition.



 42%|████▏     | 660/1586 [03:08<04:19,  3.56it/s]

The closest string we could find was neurocognitive. 

There was no CUI match for the words in disease cystoid.



 48%|████▊     | 762/1586 [03:20<02:57,  4.64it/s]

The closest string we could find was cystoma. 

There was no CUI match for the words in disease hypertension,.



 72%|███████▏  | 1139/1586 [03:38<00:45,  9.74it/s]

The closest string we could find was hypertension. 

There was no CUI match for the words in disease stage ii/iii.



 73%|███████▎  | 1151/1586 [03:56<01:01,  7.02it/s]

The closest string we could find was stage iii. 

There was no CUI match for the words in disease open-angle.



 74%|███████▎  | 1168/1586 [04:11<01:17,  5.42it/s]

The closest string we could find was open angle. 

There was no CUI match for the words in disease psoriatic.



 78%|███████▊  | 1237/1586 [04:25<01:06,  5.28it/s]

The closest string we could find was psoriatec. 

There was no CUI match for the words in disease genetic.



 85%|████████▍ | 1347/1586 [04:37<00:38,  6.25it/s]

The closest string we could find was genetics. 

There was no CUI match for the words in disease sexual dysfunctions.



 89%|████████▉ | 1413/1586 [05:01<00:36,  4.76it/s]

The closest string we could find was sexual dysfunction. 

There was no CUI match for the words in disease gouty.



 92%|█████████▏| 1456/1586 [05:11<00:27,  4.67it/s]

The closest string we could find was gouts. 

There was no CUI match for the words in disease low-grade.



 97%|█████████▋| 1536/1586 [05:25<00:10,  4.96it/s]

The closest string we could find was low grade. 

There was no CUI match for the words in disease healthy.



 97%|█████████▋| 1537/1586 [05:37<00:13,  3.70it/s]

The closest string we could find was health. 

There was no CUI match for the words in disease ankylosing.



 98%|█████████▊| 1562/1586 [05:52<00:07,  3.06it/s]

The closest string we could find was ankylosis. 

There was no CUI match for the words in disease myelocytic.



100%|██████████| 1586/1586 [06:08<00:00,  4.31it/s]

The closest string we could find was myelocytes. 






In [None]:
# cui to embedding
# the first CSV column has no header, so pandas names it "Unnamed: 0"; it is used as the row index
cui_emb = pd.read_csv(deep_learning_dir + cui_path + "/cui2vec_pretrained.csv").set_index("Unnamed: 0")

# index value -> that row's values as a numpy vector.
# Built from the whole table at once instead of row by row with iterrows.
# The vectors are rows of one shared 2-D array, so treat them as read-only.
embeddings = dict(zip(cui_emb.index, cui_emb.to_numpy()))

In [None]:
# disease name -> mean of the embeddings of its CUIs
# Note: a disease can map to several CUIs; CUIs without an embedding are ignored,
# and a disease with no embedded CUI at all gets no entry

d_emb = {}

for disease_name, matched_cuis in diseases_to_cuis.items():

  known_vectors = [embeddings[c] for c in matched_cuis if c in embeddings]

  if not known_vectors:
    continue

  d_emb[disease_name] = np.mean(known_vectors, axis=0)

In [None]:
def embedding_row(diseases):
  """
  Embed a list of diseases, i.e. the diseases associated with a single clinical trial.

  Diseases without an entry in `d_emb` are skipped. The result is the
  element-wise mean of the remaining disease embeddings, or a zero vector of
  length 500 when none of the diseases has an embedding.
  """
  known = [d_emb[name] for name in diseases if name in d_emb]

  # nothing to average: fall back to an all-zero vector (500 for embedding)
  if not known:
    return np.zeros(500, dtype=float)

  # 500 for average disease embedding
  return np.mean(known, axis=0)

In [None]:
def _embed_split(split, split_diseases):
  """
  Build the per-trial lookups for one data split.

  Parameters
  ----------
  split : DataFrame with an "nctid" column
  split_diseases : list of disease lists, aligned row by row with `split`

  Returns
  -------
  tuple
      (list of trial embeddings,
       dict nctid -> trial embedding,
       dict nctid -> number of diseases listed for the trial)
  """
  vectors = [embedding_row(trial_diseases) for trial_diseases in split_diseases]
  counts = [len(trial_diseases) for trial_diseases in split_diseases]
  nctids = split["nctid"]

  return vectors, dict(zip(nctids, vectors)), dict(zip(nctids, counts))


# key: nctid of clinical trial, value: disease embedding (*_d_embeddings) or number of diseases (*_n_diseases)
train_embeddings, train_d_embeddings, train_n_diseases = _embed_split(train, diseases_train)
val_embeddings, val_d_embeddings, val_n_diseases = _embed_split(val, diseases_val)
test_embeddings, test_d_embeddings, test_n_diseases = _embed_split(test, diseases_test)

# combined dictionary; if an nctid occurs in several splits the later split (train < val < test) wins
final_d_embeddings = {**train_d_embeddings, **val_d_embeddings, **test_d_embeddings}
final_n_diseases = {**train_n_diseases, **val_n_diseases, **test_n_diseases}

In [None]:
# write both lookups to the current working directory, one pickle file each
for file_name, payload in (
    ("nctid2diseases.pkl", final_d_embeddings),
    ("nctid2n_diseases.pkl", final_n_diseases),
):
    with open(file_name, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Move both pickle files from the working directory into the project's embeddings folder on the mounted Drive
!mv nctid2diseases.pkl "/content/gdrive/My Drive/BMI 707 Project/embeddings/"
!mv nctid2n_diseases.pkl "/content/gdrive/My Drive/BMI 707 Project/embeddings/"