# Definition of models

In [None]:

# import standard PyTorch modules
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
class Network(nn.Module):
    """Two-layer fully connected head.

    The input is flattened to rows of ``dimension`` features, passed through
    a 512-unit linear layer followed by ReLU, and then projected to 376
    output scores. No activation is applied to the final layer.
    """

    def __init__(self, dimension):
        """Create the two linear layers.

        Args:
            dimension: number of features per input row.
        """
        super().__init__()
        self.d = dimension
        # Layers are created in this order so seeded initialisation is stable.
        self.fc1 = nn.Linear(dimension, 512)
        self.fc2 = nn.Linear(512, 376)

    def forward(self, t):
        """Return the 376 raw scores for every row of ``t``.

        Args:
            t: tensor whose element count is a multiple of ``self.d``; it is
                reshaped to ``(-1, self.d)`` before the first layer.
        """
        rows = t.reshape(-1, self.d)
        hidden = F.relu(self.fc1(rows))
        return self.fc2(hidden)

In [None]:
class Lstm(nn.Module):
    """Single-layer LSTM encoder that returns the output of the last time step.

    The initial hidden and cell states are small random tensors (uniform in
    ``[0, 0.1)``) drawn once at construction for ``batchSize`` rows. They are
    registered as non-persistent buffers so they follow the module across
    ``.to(device)`` calls without adding keys to ``state_dict()``.
    """

    def __init__(self, inputSize, batchSize, hiddenSize):
        """Build the LSTM and its fixed initial state.

        Args:
            inputSize: number of features per time step.
            batchSize: number of rows the initial state is drawn for.
            hiddenSize: width of the LSTM hidden state (and of the output).
        """
        super().__init__()
        self.i = inputSize
        self.h = hiddenSize
        self.b = batchSize
        # define layers
        self.lstm = nn.LSTM(input_size=self.i, hidden_size=self.h, batch_first=True)
        # Initial states have shape (num_layers=1, batch, hidden) even with
        # batch_first=True.
        self.register_buffer("h0", torch.rand(1, self.b, self.h) / 10, persistent=False)
        self.register_buffer("c0", torch.rand(1, self.b, self.h) / 10, persistent=False)

    def _initial_state(self, batch_rows):
        """Return (h0, c0) matched to ``batch_rows`` rows."""
        if batch_rows == self.b:
            return self.h0, self.c0
        # The last batch of a DataLoader is usually smaller than batchSize;
        # reuse the stored rows cyclically so any batch size is accepted.
        rows = torch.arange(batch_rows, device=self.h0.device) % self.b
        return self.h0[:, rows], self.c0[:, rows]

    def forward(self, t):
        """Encode ``t`` and return the LSTM output at the final time step.

        Args:
            t: tensor of shape ``(batch, seq_len, inputSize)``.

        Returns:
            Tensor of shape ``(batch, hiddenSize)``.
        """
        output, _ = self.lstm(t, self._initial_state(t.size(0)))
        return output[:, -1, :]


In [None]:
from torch.utils.data import Dataset
class MyDataset(torch.utils.data.Dataset):
    """Dataset over three parallel collections: samples, labels and lengths."""

    def __init__(self, data, labels, lengths):
        """Keep references to the three collections (nothing is copied)."""
        self.data, self.labels, self.lengths = data, labels, lengths

    def __len__(self):
        """Return the number of samples, measured on the label collection."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the ``(sample, label, length)`` triple stored under ``index``."""
        return self.data[index], self.labels[index], self.lengths[index]

# Data preparation

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
# Download the 'punkt' tokenizer data used by word_tokenize (no-op when it is
# already present, as the captured cell output shows).
nltk.download('punkt')
from google.colab import drive
# Mount Google Drive; later cells read the CSV and GloVe files from paths
# under "gdrive/My Drive/".
drive.mount('/content/gdrive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:

# Load the two data splits; the first CSV column is used as the row index.
# Note that the frame named test_df is read from the validation file.
train_df = pd.read_csv("gdrive/My Drive/Colab Notebooks/VirtualPatient/train_ctx_anon.csv", index_col=0)
test_df = pd.read_csv("gdrive/My Drive/Colab Notebooks/VirtualPatient/valid_ctx_anon.csv", index_col=0)

# Preview the first rows of the training frame.
print(train_df.head())
def getGlove(path="gdrive/My Drive/Colab Notebooks/VirtualPatient/glove.6B.100d.txt"):
    """Load a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to hold a word followed by its vector
    components, separated by whitespace.

    Args:
        path: location of the GloVe text file. Defaults to the 100-dimensional
            glove.6B file on the mounted drive.

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.
    """
    glove = {}
    # GloVe files are UTF-8; be explicit so the result does not depend on the
    # platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            glove[fields[0]] = np.asarray(fields[1:], dtype="float32")
    return glove
# Build the word -> vector table once; prepareData looks words up in this
# module-level dictionary.
glove = getGlove()

   convo_num  ...  label
0          0  ...     31
1          0  ...    270
2          0  ...    256
3          0  ...    177
4          0  ...     84

[5 rows x 4 columns]


In [None]:
# TODO: revisit data preparation
def prepareData(batch_size, train, limit=0):
    """Build a shuffled DataLoader of GloVe-embedded queries.

    Every query is tokenized with ``word_tokenize`` and each token found in
    the module-level ``glove`` table is replaced by its vector. A token that
    is missing as written is retried in lower case, because the glove.6B
    vocabulary is uncased; tokens still missing are dropped.

    When ``batch_size`` is greater than 1, every sequence is zero-padded at
    the end to the longest embedded sequence so the default collate function
    can stack them. With ``batch_size == 1`` sequences keep their own length.

    Args:
        batch_size: batch size passed to the DataLoader.
        train: when truthy, read ``train_df``; otherwise read ``test_df``.
        limit: when non-zero and ``train`` is truthy, use only the first
            ``limit`` rows. It has no effect on the test split.

    Returns:
        A ``torch.utils.data.DataLoader`` over ``MyDataset`` whose batches are
        ``(sequences, labels, lengths)``; ``lengths`` holds the unpadded
        number of embedded tokens per query.
    """
    if train:
        queries = train_df['query']
        labels = train_df['label']
        if limit != 0:
            queries = queries.iloc[:limit]
            labels = labels.iloc[:limit]
    else:
        queries = test_df['query']
        labels = test_df['label']

    # Take the vector width from the table instead of hard-coding it; fall
    # back to 100 (glove.6B.100d) when the table is empty.
    embedding_dim = len(next(iter(glove.values()))) if glove else 100

    sequences = []
    for sentence in queries:
        vectors = []
        for word in word_tokenize(sentence):
            vector = glove.get(word)
            if vector is None:
                vector = glove.get(word.lower())
            if vector is not None:
                vectors.append(vector)
        if vectors:
            matrix = np.asarray(vectors, dtype=np.float32)
        else:
            # Keep a well-formed (0, dim) matrix for queries with no known
            # token so padding and batching still work.
            matrix = np.zeros((0, embedding_dim), dtype=np.float32)
        sequences.append(torch.tensor(matrix))

    lengths = [len(sequence) for sequence in sequences]
    # Pad to the longest *embedded* sequence, not to the character count of
    # the raw sentence.
    maxlength = max(lengths, default=0)

    if batch_size > 1:
        padded = []
        for sequence in sequences:
            missing = maxlength - len(sequence)
            if missing > 0:
                filler = torch.zeros(missing, embedding_dim)
                sequence = torch.cat((sequence, filler), dim=0)
            padded.append(sequence)
        sequences = padded

    # MyDataset indexes by position; a pandas Series would be indexed by its
    # index labels, so hand over a plain array instead.
    dataset = MyDataset(sequences, labels.to_numpy(), lengths=lengths)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    return loader


In [None]:
batch_size=204
loader = prepareData(batch_size,True)

# Training

In [None]:
#Todo: is this correct
import tqdm.notebook as tq
def get_accuracy(model1, model2, dataloader):
    """Return the fraction of samples that the two-stage model classifies correctly.

    ``model1`` is applied to the float-cast inputs and ``model2`` to its
    output; the predicted class is the arg-max over dimension 1. Evaluation
    runs in eval mode without gradient tracking, and both models are put back
    into the mode they were in before the call, even if evaluation raises.

    Args:
        model1: first stage, called on ``batch[0].float()``.
        model2: second stage, called on the output of ``model1``.
        dataloader: iterable of batches whose first two items are the inputs
            and the integer class labels.

    Returns:
        Accuracy in ``[0, 1]``; ``0.0`` when the loader yields no samples.
    """
    was_training = (model1.training, model2.training)
    count = 0
    correct = 0

    model1.eval()
    model2.eval()
    try:
        with torch.no_grad():
            for batch in tq.tqdm_notebook(dataloader):
                data = batch[0]
                labels = batch[1]
                preds = model2(model1(data.float()))
                correct += preds.argmax(dim=1).eq(labels).sum().item()
                count += len(data)
    finally:
        # Restore the previous modes rather than forcing train mode.
        model1.train(was_training[0])
        model2.train(was_training[1])

    if count == 0:
        return 0.0
    return correct / count

In [None]:
import tqdm.notebook as tq
# Hyper-parameters for the training run below.
lr=0.001
shuffle=True  # not read anywhere in this cell
epochs=10
hiddenSize = 100  # LSTM hidden width, and the input width of the Network head

# Sentence encoder: 100 input features per time step (the padding width used
# in prepareData), with its initial state sized for batch_size rows.
lstm = Lstm(100, batch_size, hiddenSize)
lstmOptimizer = optim.Adam(lstm.parameters(), lr=lr)
# Classification head fed with the encoder's last-step output.
network = Network(hiddenSize)
networkOptimizer = optim.Adam(network.parameters(), lr=lr)
# set the network to training mode
lstm.train()
network.train()
for epoch in range(epochs):
  for batch in tq.tqdm_notebook(loader):
    loss = 0  # overwritten by the cross-entropy loss below
    data = batch[0]
    labels = batch[1]
    # Forward pass: encode the padded sequences, then score the classes.
    lstmOut = lstm(data.float())
    preds= network(lstmOut)
    loss = F.cross_entropy(preds, labels)
    # Clear old gradients on both optimizers before back-propagating, then
    # update the encoder and the head from the same loss.
    lstmOptimizer.zero_grad()
    networkOptimizer.zero_grad()
    loss.backward()
    lstmOptimizer.step()
    networkOptimizer.step()
    
  # Report accuracy on the training loader after every epoch.
  print('Epoch {0}: train set accuracy {1}'.format(epoch,get_accuracy(lstm,network,loader)))

# test_loader = torch.utils.data.DataLoader(test_set, batch_size = batch_size)
# print('Epoch {0}: test set accuracy {1}'.format(epoch,get_accuracy(network,test_loader)))
