# Definition of models

In [12]:

# import standard PyTorch modules
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [13]:
class Network(nn.Module):
    """Two-layer fully connected classifier head.

    The input is flattened to rows of ``dimension`` features, passed through a
    ReLU-activated hidden layer, and projected to ``out_features`` raw scores
    (no activation is applied to the output).

    Args:
        dimension: Number of features per input row.
        hidden_features: Width of the hidden layer (default 512).
        out_features: Number of output scores per row (default 376).
    """

    def __init__(self, dimension, hidden_features=512, out_features=376):
        super().__init__()
        self.d = dimension
        # define layers
        self.fc1 = nn.Linear(in_features=dimension, out_features=hidden_features)
        self.fc2 = nn.Linear(in_features=hidden_features, out_features=out_features)

    def forward(self, t):
        """Return unnormalised scores of shape ``(-1, out_features)``.

        Any leading dimensions of ``t`` are collapsed so that each row holds
        ``self.d`` features before the linear layers are applied.
        """
        t = t.reshape(-1, self.d)
        t = F.relu(self.fc1(t))
        # Raw scores are returned; the training code feeds them to cross_entropy.
        return self.fc2(t)

In [14]:
class Lstm(nn.Module):
    """Single-layer, batch-first LSTM with fixed random initial states.

    Args:
        inputSize: Feature size of each timestep.
        batchSize: Largest batch size the stored initial states cover.
        hiddenSize: Size of the LSTM hidden state.

    The initial hidden/cell states are drawn once at construction time
    (uniform in [0, 0.1)) and reused for every forward pass.
    """

    def __init__(self, inputSize, batchSize, hiddenSize):
        super().__init__()
        self.i = inputSize
        self.h = hiddenSize
        self.b = batchSize
        # define layers
        self.lstm = nn.LSTM(input_size=self.i, hidden_size=self.h, batch_first=True)
        # Registered as buffers so they follow the module across .to()/.cuda();
        # persistent=False keeps them out of state_dict, so its keys are unchanged.
        self.register_buffer("h0", torch.rand(1, self.b, self.h) / 10, persistent=False)
        self.register_buffer("c0", torch.rand(1, self.b, self.h) / 10, persistent=False)

    def forward(self, t):
        """Run the LSTM over ``t`` and return its per-timestep output.

        ``t`` may be a batch-first padded tensor or a ``PackedSequence``. The
        stored initial states are sliced to the incoming batch size so that a
        batch smaller than ``batchSize`` (e.g. the last one of an epoch) works.
        """
        if isinstance(t, nn.utils.rnn.PackedSequence):
            # batch_sizes[0] is the number of sequences present at the first timestep.
            batch = int(t.batch_sizes[0])
        else:
            batch = t.size(0)
        h0 = self.h0[:, :batch].contiguous()
        c0 = self.c0[:, :batch].contiguous()
        output, (hn, cn) = self.lstm(t, (h0, c0))
        return output


In [15]:
from torch.utils.data import Dataset
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(sample, label, length)`` triples.

    Args:
        data: Indexable collection of samples.
        labels: Indexable collection of labels; its length defines the dataset size.
        lengths: Indexable collection holding one length value per sample.

    Items are always looked up by position. Containers that expose ``.iloc``
    (e.g. a pandas Series) are indexed through it, so an index that is not
    0..n-1 cannot cause a label-based lookup to miss or return the wrong row.
    """

    def __init__(self, data, labels, lengths):
        """Store the three parallel containers without copying them."""
        self.labels = labels
        self.data = data
        self.lengths = lengths

    def __len__(self):
        """Return the total number of samples."""
        return len(self.labels)

    @staticmethod
    def _item_at(container, index):
        """Return the ``index``-th element of ``container`` by position."""
        positional = getattr(container, "iloc", None)
        if positional is not None:
            return positional[index]
        return container[index]

    def __getitem__(self, index):
        """Return the sample, its label and its length at position ``index``."""
        x = self._item_at(self.data, index)
        y = self._item_at(self.labels, index)
        z = self._item_at(self.lengths, index)
        return x, y, z

# Data preparation

In [16]:
# Libraries used by the data-preparation cells below.
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
# Download the NLTK 'punkt' package (presumably the tokenizer data that
# word_tokenize relies on — confirm against the NLTK version in use).
nltk.download('punkt')
# Colab-only: mount Google Drive at /content/gdrive. The relative
# "gdrive/My Drive/..." paths used later presumably rely on the working
# directory being /content — verify when running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [17]:

# Load the training split; index_col=0 uses the first CSV column as the row index.
train_df = pd.read_csv("gdrive/My Drive/Colab Notebooks/VirtualPatient/train_ctx_anon.csv", index_col=0)
# NOTE: the frame named test_df is read from the *valid* CSV file.
test_df = pd.read_csv("gdrive/My Drive/Colab Notebooks/VirtualPatient/valid_ctx_anon.csv", index_col=0)

# Quick look at the first rows of the training frame.
print(train_df.head())
def getGlove(path="gdrive/My Drive/Colab Notebooks/VirtualPatient/glove.6B.100d.txt"):
    """Load whitespace-separated word vectors from a text file.

    Each non-blank line is expected to be ``<word> <v1> <v2> ...``.

    Args:
        path: Location of the vector file. Defaults to the GloVe file on the
            mounted Drive that this notebook uses.

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.
    """
    glove = {}
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        for wv in f:
            splits = wv.split()
            if not splits:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            glove[splits[0]] = np.asarray(splits[1:], "float32")
    return glove
# Module-level word -> vector lookup; prepareData reads this global directly.
glove = getGlove()

   convo_num  ...  label
0          0  ...     31
1          0  ...    270
2          0  ...    256
3          0  ...    177
4          0  ...     84

[5 rows x 4 columns]


In [18]:
# TODO: revisit data preparation
def prepareData(input_size, batch_size, train, limit=0):
    """Build a shuffled DataLoader of embedded, zero-padded sentences.

    Reads the module-level ``train_df`` / ``test_df`` frames (columns
    ``'query'`` and ``'label'``) and the module-level ``glove`` lookup.

    Args:
        input_size: Embedding dimension. Used for the fallback vector and the
            padding rows; assumed to equal the length of the ``glove`` vectors
            (TODO confirm against the loaded embedding file).
        batch_size: Batch size of the returned loader. Padding is only applied
            when it is greater than 1.
        train: If true use ``train_df``, otherwise ``test_df``.
        limit: When non-zero and ``train`` is true, keep only the first
            ``limit`` training rows. Ignored for the test frame.

    Returns:
        A ``DataLoader`` over ``MyDataset`` yielding
        ``(embedded sentence, label, unpadded length)`` batches.
    """
    if train:
        if limit == 0:
            data = train_df['query']
            labels = train_df['label']
        else:
            data = train_df['query'][:limit]
            labels = train_df['label'][:limit]
    else:
        data = test_df['query']
        labels = test_df['label']

    embedded = []
    lengths = []
    for sentence in data:
        # Tokens without an embedding are dropped.
        vectors = [glove[word] for word in word_tokenize(sentence) if word in glove]
        if vectors:
            t = torch.tensor(np.array(vectors), dtype=torch.float32)
        else:
            # No known token at all: substitute one random vector so the
            # sequence has length 1. Kept float32 like every other sample.
            t = torch.tensor(np.random.uniform(size=(1, input_size)), dtype=torch.float32)
        embedded.append(t)
        lengths.append(t.shape[0])

    # Pad to the longest *embedded* sequence (number of vectors), not to the
    # character count of the raw sentence string.
    maxlength = max(lengths, default=0)

    padded = []
    for t, sentenceLength in zip(embedded, lengths):
        if batch_size > 1 and sentenceLength < maxlength:
            padding = torch.zeros(maxlength - sentenceLength, input_size)
            t = torch.cat((t, padding), dim=0)
        padded.append(t)

    dataset = MyDataset(padded, labels, lengths=lengths)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    return loader


In [20]:
# Mini-batch size shared by both loaders; also passed to Lstm in the training cell.
batch_size=20
# input size is the word embedding dimension.
inputSize = 100
# Training loader (train=True, built from train_df) and evaluation loader
# (train=False, built from test_df).
loader = prepareData(inputSize, batch_size,True)
test_loader = prepareData(inputSize, batch_size, False)

# Training

In [21]:

import tqdm.notebook as tq
def get_accuracy(model1, model2, dataloader):
    """Return the fraction of samples in ``dataloader`` classified correctly.

    Args:
        model1: Sequence model applied to the packed batch; its output is
            unpacked with ``pad_packed_sequence``.
        model2: Classifier applied to each sequence's last valid output.
        dataloader: Iterable of ``(data, labels, lengths)`` batches.

    Returns:
        ``correct / total`` as a float, or ``0.0`` if the loader yields no samples.

    Both models are switched to eval mode for the duration of the call and are
    put back into whatever mode they were in beforehand, even if an error occurs.
    """
    count = 0
    correct = 0

    # Remember the callers' modes so evaluation does not silently change them.
    was_training1 = model1.training
    was_training2 = model2.training
    model1.eval()
    model2.eval()
    try:
        with torch.no_grad():
            for batch in tq.tqdm_notebook(dataloader):
                data = batch[0]
                labels = batch[1]
                lengths = batch[2]
                packed = nn.utils.rnn.pack_padded_sequence(
                    data, lengths, batch_first=True, enforce_sorted=False)
                lstmOut = model1(packed.float())
                seq, seqLengths = nn.utils.rnn.pad_packed_sequence(lstmOut, batch_first=True)
                # For every sequence pick the output at its last unpadded timestep.
                lastOut = seq[range(len(seq)), seqLengths - 1, :]
                preds = model2(lastOut)
                correct += preds.argmax(dim=1).eq(labels).sum().item()
                count += len(data)
    finally:
        model1.train(was_training1)
        model2.train(was_training2)

    # Guard against an empty loader instead of raising ZeroDivisionError.
    if count == 0:
        return 0.0
    return correct / count

In [23]:
import tqdm.notebook as tq
# --- Hyperparameters -------------------------------------------------------
lr=0.001
# NOTE(review): `shuffle` is not read anywhere in the visible cells; the loader
# built in prepareData hard-codes shuffle=True.
shuffle=True
epochs=10
# hiddenSize is both the LSTM hidden size and the classifier's input dimension.
hiddenSize = 500
inputSize = 100
# --- Models and optimizers: one Adam optimizer per module ------------------
lstm = Lstm(inputSize, batch_size, hiddenSize)
lstmOptimizer = optim.Adam(lstm.parameters(), lr=lr)
network = Network(hiddenSize)
networkOptimizer = optim.Adam(network.parameters(), lr=lr)
# set the network to training mode
lstm.train()
network.train()
for epoch in range(epochs):
  for batch in tq.tqdm_notebook(loader):
    loss = 0
    # Each batch is (padded embeddings, labels, unpadded lengths).
    data = batch[0]
    labels = batch[1]
    lengths = batch[2]
    # Pack the padded batch using the per-sample lengths.
    packed = nn.utils.rnn.pack_padded_sequence(data, lengths, batch_first=True, enforce_sorted=False)
    # lstmOut = lstm(data.float())
    lstmOut = lstm(packed.float())
    # Unpack, then keep only the output at index (length - 1) of every sequence.
    seq, seqLenghts = nn.utils.rnn.pad_packed_sequence(lstmOut, batch_first=True)
    lstmOut = seq[range(len(seq)),seqLenghts-1, :]
    preds= network(lstmOut)
    loss = F.cross_entropy(preds, labels)
    # Clear old gradients on both optimizers, backpropagate once, then step both.
    lstmOptimizer.zero_grad()
    networkOptimizer.zero_grad()
    loss.backward()
    lstmOptimizer.step()
    networkOptimizer.step()
    
  # After every epoch: a full evaluation pass over the training loader.
  print('Epoch {0}: train set accuracy {1}'.format(epoch,get_accuracy(lstm,network,loader)))

# After training: a single evaluation on test_loader; `epoch` still holds the
# last value from the loop above.
print('Epoch {0}: test set accuracy {1}'.format(epoch,get_accuracy(lstm, network,test_loader)))


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 0: train set accuracy 0.5515952143569293


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 1: train set accuracy 0.7247424393486208


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 2: train set accuracy 0.8095712861415753


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 3: train set accuracy 0.8543536058491193


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 4: train set accuracy 0.8944001329345297


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 5: train set accuracy 0.9104353605849119


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 6: train set accuracy 0.9416749750747757


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 7: train set accuracy 0.9410933865071452


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 8: train set accuracy 0.9484878697241609


HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=602.0), HTML(value='')))


Epoch 9: train set accuracy 0.9563808574277168


HBox(children=(FloatProgress(value=0.0, max=135.0), HTML(value='')))


Epoch 9: test set accuracy 0.7586334942443371
