<a href="https://colab.research.google.com/github/amyth18/CS598-Deep-Learning-Final-Project/blob/main/Other_Baseline_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install gensim --upgrade

In [6]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [19]:
PROJECT_PATH = "/content/drive/My Drive/DLH Final Project"
DATASET_PATH = f"{PROJECT_PATH}/mimic3/df_dataset_full_text.csv"
DATASET_D2V_PATH = f"{PROJECT_PATH}/mimic3/df_dataset_full_text_d2v.csv"
DOC2VEC_PATH = f"{PROJECT_PATH}/models/doc2vec.model"
W2V_MODEL_PATH = f"{PROJECT_PATH}/models/word2vec.model"

TRAINING_BATCH_SIZE = 400
MAX_WORDS = 1000
W2V_EMB_SIZE = 128

In [14]:
! ls "/content/drive/My Drive/DLH Final Project/models"

doc2vec.model			main-model-27-04-2022-19-11-16
doc2vec.model.syn1neg.npy	tf-idf-27-04-2022-16-29-54
doc2vec.model.wv.vectors.npy	word2vec-27-04-2022-17-47-59
main-model-27-04-2022-15-40-59	word2vec.model


# Data Preprocessing

In [8]:
df_dataset = pd.read_csv(DATASET_PATH, converters={'INPUT_TEXT': eval, 
                                                   'ICD9_CODE': eval})

In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(doc, [i])
        for i, doc in enumerate(df_dataset.INPUT_TEXT)]

model = Doc2Vec(vector_size=128, window=2, 
                min_count=1, 
                workers=8, 
                epochs = 40)

model.build_vocab(docs)

model.train(docs, total_examples=model.corpus_count, 
            epochs=model.epochs)

In [13]:
model.save(DOC2VEC_PATH)

In [15]:
X_doc2vec = [model.infer_vector(df_dataset['INPUT_TEXT'][i]) 
              for i in range(0, len(df_dataset['INPUT_TEXT']))]

In [20]:
df_dataset['DOC2VEC'] = np.array(X_doc2vec).tolist()
df_dataset.to_csv(DATASET_D2V_PATH)

In [21]:
from gensim.models import Word2Vec

# load the model
model = Word2Vec.load(W2V_MODEL_PATH)

# now create a vector of word2vec embeddings for each discharge summary
X_word2vec = list()
for idx in range(len(df_dataset)):
  # ignore words in not vocabulary
  text = df_dataset["INPUT_TEXT"][idx]
  word_emb = [model.wv[w] for w in text if w in model.wv]
  X_word2vec.append(word_emb)

In [22]:
# top 50 unique ICD codes.
top_icd_codes = [codes for codes in df_dataset['ICD9_CODE']]
top_icd_codes = np.unique([code for codes in top_icd_codes for code in codes])

sorted_top_icd_codes = sorted(top_icd_codes)
icd_code_to_idx = dict((k, v) for v, k in enumerate(sorted_top_icd_codes))

multi_hot_ecoding_col = list()
for idx in range(len(df_dataset)):
  icd_codes = df_dataset.iloc[idx]['ICD9_CODE']
  encoding = [0] * 50
  for code in icd_codes:
    encoding[icd_code_to_idx[code]] = 1    
  multi_hot_ecoding_col.append(encoding)

# new add a new column with multi-hot encoding.
df_dataset['ICD9_CODE_ENCODED'] = multi_hot_ecoding_col

# multi-hot encoding for ICD codes diagnosed.
y = df_dataset['ICD9_CODE_ENCODED'].to_list()

In [23]:
print(len(X_word2vec))
print(len(X_doc2vec))
print(len(y))

55988
55988
55988


# Datasets and Dataloaders

In [24]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader

In [25]:
def pad_dataset(dataset, vec_size):
  seq_lengths = list()

  for idx in range(len(dataset)):
    seq_lengths.append(len(dataset[idx]))
  max_seq_length = max(seq_lengths)

  padded_dataset = torch.zeros([len(dataset), max_seq_length, vec_size], 
                               dtype=torch.float)
  for i in range(len(dataset)):
    for j in range(len(dataset[i])):
      padded_dataset[i][j] = torch.FloatTensor(dataset[i][j])
  
  return padded_dataset

In [62]:
def collate_fn(data):
  x_w2v, x_d2v, y_batch = zip(*data)
  x_w2v = pad_dataset(x_w2v, W2V_EMB_SIZE)
  x_d2v = torch.FloatTensor(x_d2v)
  y_batch = torch.FloatTensor(y_batch)
  # move to gpus
  x_w2v = x_w2v.cuda() if torch.cuda.is_available() else x_w2v
  x_d2v = x_d2v.cuda() if torch.cuda.is_available() else x_d2v
  y_batch = y_batch.cuda() if torch.cuda.is_available() else y_batch
  return (x_w2v, x_d2v), y_batch

In [65]:
class CustomDataset(Dataset):
  def __init__(self, X_w2v, X_d2v, y):              
    self.X_w2v = X_w2v
    self.X_d2v = X_d2v
    self.y = y
    
  def __len__(self):                
    return len(self.y)
    
  def __getitem__(self, index):
    return self.X_w2v[index], self.X_d2v[index], self.y[index]

dataset = CustomDataset(X_word2vec, X_doc2vec, y)
split = int(len(dataset)*0.8)
lengths = [split, len(dataset) - split]

train_dataset, test_dataset = random_split(dataset, lengths)

train_loader = DataLoader(train_dataset, shuffle=True, 
                          batch_size=TRAINING_BATCH_SIZE, 
                          collate_fn=collate_fn)

test_loader = DataLoader(test_dataset, shuffle=True, 
                         batch_size=TRAINING_BATCH_SIZE, 
                         collate_fn=collate_fn)

# Model Definition

In [70]:
class CNNModel(nn.Module):
  
  def __init__(self):
    super(CNNModel, self).__init__()
    self.conv1 = nn.Conv2d(1, 64, (5, 128), 1)
    self.max_pool = torch.nn.MaxPool2d(4)
    self.dropout = torch.nn.Dropout(0.75)
    self.relu = torch.nn.ReLU()
  
  def forward(self, X):
    out = self.conv1(X)
    # print(out.shape)
    out = self.relu(out)
    x_in = torch.squeeze(out, dim=3)
    # print(x_in.shape)
    out = self.max_pool(x_in)
    out = self.dropout(out)
    # print(out.shape)
    out = torch.flatten(out, 1)
    return out

class DeepLabeler(nn.Module):

  def __init__(self):
    super(DeepLabeler, self).__init__()
    self.cnn = CNNModel()    
    self.fc = nn.Linear(4112, 50)
    self.dropout = nn.Dropout(0.75)
    self.sigmoid = nn.Sigmoid()
  
  def forward(self, X_w2vec, X_d2vec):
    out1 = self.cnn(X_w2vec)
    print(out1.shape)
    print(X_d2vec.shape)
    X_concat = torch.cat((out1, X_d2vec), 1)
    out2 = self.sigmoid(self.fc(X_concat))
    print(out2.shape)
    return out2
  
  def get_name():
    return "deep-labeler"

In [68]:
X_sample = next(iter(train_loader))
X_sample[0][0].shape

torch.Size([400, 1000, 128])

In [67]:
X_sample[0][1].shape

torch.Size([400, 128])

In [72]:
m1 = DeepLabeler()
x1_in = torch.unsqueeze(X_sample[0][0], dim=1)
print(x1_in.shape)
x2_in = X_sample[0][1]
print(x2_in.shape)
out = m1(x1_in, x2_in)
out.shape
print(out)

torch.Size([400, 1, 1000, 128])
torch.Size([400, 128])
torch.Size([400, 3984])
torch.Size([400, 128])
torch.Size([400, 50])
tensor([[0.5598, 0.6411, 0.5223,  ..., 0.5178, 0.4665, 0.5179],
        [0.4518, 0.4751, 0.4671,  ..., 0.5421, 0.4803, 0.5803],
        [0.5321, 0.5355, 0.5016,  ..., 0.4799, 0.4733, 0.3935],
        ...,
        [0.4804, 0.4114, 0.4845,  ..., 0.5279, 0.4329, 0.5696],
        [0.4938, 0.5693, 0.5384,  ..., 0.4730, 0.3922, 0.4981],
        [0.4086, 0.5665, 0.4872,  ..., 0.3397, 0.4256, 0.5571]],
       grad_fn=<SigmoidBackward0>)


# Model Training

In [None]:
from datetime import datetime
import pytz

def get_model_file_name(modelname="model"):
  return "/content/drive/My Drive/DLH Final Project/models/" + modelname + "-" + \
                  datetime.now(pytz.timezone('Asia/Kolkata')).strftime(
                      "%d-%m-%Y-%H-%M-%S")

def get_stats_file_name(modelname="model"):
  return "/content/drive/My Drive/DLH Final Project/stats/" + modelname + "-" + \
                  datetime.now(pytz.timezone('Asia/Kolkata')).strftime(
                      "%d-%m-%Y-%H-%M-%S")

def get_results_file_name(modelname="model"):
  return "/content/drive/My Drive/DLH Final Project/results/" + modelname + \
                  "-" + datetime.now(pytz.timezone('Asia/Kolkata')).strftime(
                      "%d-%m-%Y-%H-%M-%S")

In [None]:
import psutil
import time
import pickle

no_of_epocs = 100

def train_model(model, loss, optimizer, train_loader):

  main_memory_usage = list()
  gpu_memory_usage = list()
  gpu_time = list()
  train_loss = list()

  for e in range(no_of_epocs):
    model.train()
    epoc_train_loss = 0
    main_memory_before = psutil.virtual_memory().used
    gpu_memory_before = torch.cuda.memory_allocated()
    start_time = time.time()

    # iterate over data in mini batches.
    for tup, y_batch in train_loader:    
      model.zero_grad()
      pred = model(tup)
      l = loss(pred, y_batch)
      l.backward()
      optimizer.step()    
      epoc_train_loss += l.item()
      
    # print epoc level training loss.
    print(f"epoc: {e}: Train Loss: {epoc_train_loss/len(train_loader)}")
    
    # collect cpu and memory stats.
    memory_used = psutil.virtual_memory().used
    gpu_memory_used = torch.cuda.memory_allocated()
    run_time = time.time() - start_time
    print(f"time: {run_time} memory_used: {memory_used} gpu_memory_used: {gpu_memory_used}")
    print("\n")

    train_loss.append(epoc_train_loss/len(train_loader))
    main_memory_usage.append(memory_used)
    gpu_memory_usage.append(gpu_memory_used)
    gpu_time.append(run_time)
    # end of one epoc

  # save the model
  torch.save(model.state_dict(), get_model_file_name(model.get_name()))
  # print and collect stats.
  print(psutil.virtual_memory())

  stats = {
      "gpu_mem": gpu_memory_usage,
      "main_mem": main_memory_usage,
      "gpu_time": gpu_time,
      "vmm_info": psutil.virtual_memory()
  }

  with open(get_stats_file_name(model.get_name()), "ab") as sfile:
    pickle.dump(stats, sfile)

In [None]:
model = DeepLabeler()
if torch.cuda.is_available():
  model.cuda()

loss_fn = nn.BCELoss()
optim = torch.optim.Adam(model.parameters(), lr=0.001)
print(f"No of parameters to train: \
        {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

train_model(model, loss_fn, optim, train_loader)

# Model Evaluation

In [None]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

def evaluate_model(model, test_loader):
  model.eval()
  y_pred_all = list()
  y_true_all = list()

  for tup, y_batch in test_loader:
    y_pred = model(tup)
    y_pred = y_pred > 0.20 # TODO: remove hard coding
    y_pred_all.extend(y_pred.detach().to('cpu').numpy())
    y_true_all.extend(y_batch.detach().to('cpu').numpy())

  y_true_all = np.array(y_true_all)
  y_pred_all = np.array(y_pred_all)

  # micro level metrics
  p1, r1, f1, s1 = precision_recall_fscore_support(y_true_all, y_pred_all, 
                                                  average="micro")
  micro_auc = roc_auc_score(y_true_all, y_pred_all, average="micro")
  print(f"Micro Averaging. Precision: {p1}, Recall: {r1}, F1 Score: {f1}, \
          AUC: {micro_auc}")

  # macro level metrics
  p2, r2, f2, s2 = precision_recall_fscore_support(y_true_all, y_pred_all, 
                                                  average="macro")
  macro_auc = roc_auc_score(y_true_all, y_pred_all, average="macro")
  print(f"Macro Averaging. Precision: {p2}, Recall: {r2}, F1 Score: {f2}, \
          AUC: {macro_auc}")

  results = {
      "micro": [p1, r1, f1],
      "macro": [p2, r2, f2]
  }

  with open(get_results_file_name(model.get_name()), "ab") as rfile:
    pickle.dump(results, rfile)
  
  for idx in range(50):
    p, r, f, _12 = precision_recall_fscore_support(y_true_all[:,idx], 
                                                 y_pred_all[:,idx], 
                                                 average='binary')
    print(f"p={p}, r={r}, f={f}")

In [None]:
if model is None:
  print("load from disk")
  model = DeepLabeler()
  if torch.cuda.is_available():
    model.cuda()
    model.load_state_dict(torch.load(f"{PROJECT_PATH}/models/"))
    evaluate_model(model, test_loader)
else:
  print("evaluating in-memory model")
  evaluate_model(model, train_loader)