In [1]:

import sys
import os
import io
import time
import string
import numpy as np
import pandas as pd
import pickle
import re
from collections import defaultdict
import random
import random
import csv
import sqlite3 as lite
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from torch.utils.data.sampler import SubsetRandomSampler
from collections import defaultdict
from IPython.display import clear_output
from tensorboardX import SummaryWriter
from sklearn.metrics import f1_score


In [2]:
# Device selection: use the GPU whenever PyTorch reports one as available.
# (Assign train_on_gpu = False here to force CPU execution.)
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

# Dataset splits and the pickled GloVe table, relative to the notebook.
train_set = "../res/train_set.tsv"
test_set = "../res/test_set.tsv"
valid_set = "../res/valid_set.tsv"
glove50_path = "../res/GloveDict50.pkl"

# Where the best checkpoint is written.
model_path = "../res/mod4.pth"

# Sequence lengths (in words), embedding width, padding token and batch size.
max_q_len, max_a_len = 10, 100
embedding_len = 50
pad_char = '_'
batch_size = 32

# Column names applied to the headerless TSV files.
header_names = ["question", "answer", "label"]

# Integer codes for the dataset split kinds.
e_unknown, e_train, e_valid, e_test = range(4)

# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans(dict.fromkeys(string.punctuation))

CUDA is available!  Training on GPU ...


In [3]:
def load_glove_vectors(fname):
    """Load and return the object pickled in the file at `fname`.

    Args:
        fname: path of a pickle file, opened in binary mode.

    Returns:
        Whatever object the file contains, exactly as `pickle.load` yields it.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError: if the content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code, so only point this at
    # files you produced yourself or otherwise trust.
    # The context manager closes the handle even when unpickling fails.
    with open(fname, "rb") as pickle_in:
        return pickle.load(pickle_in)

In [4]:
# Load the pickled GloVe table from glove50_path.
glove50 = load_glove_vectors(glove50_path)
# Snapshot of the table's keys as a plain list (note: `in` checks against a
# list are linear scans).
supported_words = list(glove50.keys())
#print(supported_words[:100])

In [5]:
# Fallback embedding: one zero vector of length embedding_len.
default_val = np.zeros(embedding_len)
# Copy glove50 into a defaultdict whose factory hands back that same array
# object for every missing key; as with any defaultdict, a missing key is
# stored in the dict the first time it is looked up with [].
glove = defaultdict(lambda: default_val,glove50)

In [6]:
class CustomDatasetFromCSV(Dataset):
    """Question/answer pairs read from a headerless tab-separated file.

    Each item is `([q_matrix, a_matrix], label)` where the matrices have shape
    `(max_q_len, embedding_len + 1)` and `(max_a_len, embedding_len + 1)`.
    A row holds the `glove` vector of one word followed by one extra column:
    0 for words found in `supported_words`, otherwise the word's position in
    the per-sample list of out-of-vocabulary words.
    """

    def __init__(self, csv_path, transform=None):
        """Read `csv_path` and keep its three columns.

        Args:
            csv_path: path of a tab-separated file without a header row; the
                columns are named after `header_names`.
            transform: accepted for interface compatibility; it is not used
                anywhere in this class.
        """
        _dataset = pd.read_csv(csv_path, sep='\t', header=None, names=header_names, encoding='utf8')
        self.labels = _dataset['label']
        self.questions = _dataset['question']
        self.answers = _dataset['answer']
        # `supported_words` is a list; testing membership against it for every
        # token of every sample is a linear scan of the whole vocabulary.
        # Build a hashed copy once so each lookup is O(1).
        self._vocab = frozenset(supported_words)

    @staticmethod
    def _normalise(text):
        """Lower-case `text` and delete the characters removed by `translator`."""
        return text.lower().translate(translator)

    @staticmethod
    def _fit_length(words, length):
        """Cut `words` to `length` items, or left-pad it with `pad_char`."""
        if len(words) > length:
            return words[:length]
        return [pad_char] * (length - len(words)) + words

    @staticmethod
    def _encode(words, oov_index):
        """Return one (embedding_len + 1)-wide row per word.

        The first `embedding_len` columns are `glove[word]`; the last column
        is `oov_index[word]`, or 0 when the word is not in `oov_index`.
        """
        rows = np.zeros((len(words), embedding_len + 1))
        for i, word in enumerate(words):
            rows[i, :embedding_len] = glove[word]
            rows[i, embedding_len] = oov_index.get(word, 0)
        return rows

    def __getitem__(self, index):
        """Return `([q_matrix, a_matrix], label)` for row `index`."""
        q = self._normalise(self.questions.iloc[index])
        a = self._normalise(self.answers.iloc[index])

        # Distinct words of this sample that are missing from the vocabulary.
        tokens = set((q + " " + a).split(" "))
        unique_vals = [x for x in tokens if x not in self._vocab]
        # NOTE(review): ids start at 0, so the first out-of-vocabulary word
        # gets the same marker value as an in-vocabulary word, and the id
        # order follows set iteration order. Confirm whether that is intended
        # before changing it, since it alters the features a saved model saw.
        oov_index = {word: i for i, word in enumerate(unique_vals)}

        q_words = self._fit_length(q.split(" "), max_q_len)
        a_words = self._fit_length(a.split(" "), max_a_len)

        x = [self._encode(q_words, oov_index), self._encode(a_words, oov_index)]
        y = self.labels.iloc[index]
        return x, y

    def __len__(self):
        """Number of rows read from the file."""
        return len(self.labels)

In [7]:

class W_RNN(nn.Module):
    """LSTM encoder that maps a word sequence to one (embedding_len + 1) vector.

    The input rows are (embedding_len + 1) wide. The LSTM output at the last
    time step goes through dropout and a linear layer back to that width.
    """

    def __init__(self, n_hidden=128, n_layers=3, drop_prob=0.2, lr=0.001):
        """Create the LSTM, dropout and projection layers.

        Args:
            n_hidden: size of the LSTM hidden state.
            n_layers: number of stacked LSTM layers.
            drop_prob: dropout probability, used between LSTM layers and on
                the LSTM output.
            lr: stored on the instance as `self.lr`; not used by this class.
        """
        super(W_RNN,self).__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        self.lstm = nn.LSTM((embedding_len+1), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, (embedding_len+1))
        # Not applied in forward(); kept so the module's attributes are unchanged.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Run the sequence through the LSTM and project the last time step.

        Args:
            x: batch-first input tensor; it is cast to float32 here.
            hidden: `(h, c)` tuple as produced by `init_hidden`.

        Returns:
            `(out, hidden)`: the projected last-step output and the LSTM's
            updated `(h, c)` state.
        """
        x = x.float()
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed `(h, c)` tensors of shape (n_layers, batch_size, n_hidden).

        The tensors take their dtype and device from this module's own
        parameters, so they always match wherever the module currently lives
        instead of depending on a global flag.
        """
        weight = next(self.parameters()).detach()
        shape = (self.n_layers, batch_size, self.n_hidden)
        return weight.new_zeros(shape), weight.new_zeros(shape)



In [8]:
class FC_OP(nn.Module):
    """Feedforward classifier head with a sigmoid output."""

    def __init__(self, input_size, output_size, hidden_layers, drop_p=0.5):
        ''' Builds a feedforward network with arbitrary hidden layers.

            Arguments
            ---------
            input_size: integer, size of the input layer
            output_size: integer, size of the output layer
            hidden_layers: list of integers, the sizes of the hidden layers;
                may be empty, in which case the input feeds the output layer
                directly
            drop_p: dropout probability applied after every hidden layer

        '''
        super(FC_OP, self).__init__()
        # Consecutive pairs of this list are the (in, out) sizes of the hidden
        # layers; with no hidden layers the list is just [input_size].
        sizes = [input_size, *hidden_layers]
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(sizes[:-1], sizes[1:])]
        )

        self.output = nn.Linear(sizes[-1], output_size)

        self.dropout = nn.Dropout(p=drop_p)

    def forward(self, x):
        ''' Forward pass through the network, returns sigmoid probabilities '''
        for each in self.hidden_layers:
            x = F.relu(each(x))
            x = self.dropout(x)
        x = self.output(x)
        # torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(x)
    

In [9]:
class Model(nn.Module):
    """Scores a (question, answer) pair.

    Two separate W_RNN encoders, one for questions and one for answers, each
    produce an (embedding_len + 1) vector. The two vectors are concatenated
    and passed to an FC_OP head with a single output.
    """

    def __init__(self):
        super().__init__()
        encoded_width = embedding_len + 1
        self.qNet = W_RNN()
        self.aNet = W_RNN()
        self.fcNet = FC_OP(2 * encoded_width, 1, [128, 128], drop_p=0.2)

    def forward(self, questions, answers, q_hidden, a_hidden):
        """Encode both inputs, join them feature-wise and score the pair.

        Returns `(output, q_hidden, a_hidden)`, where the hidden states are
        the ones returned by the two encoders.
        """
        q_vec, q_state = self.qNet(questions, q_hidden)
        a_vec, a_state = self.aNet(answers, a_hidden)
        joined = torch.cat([q_vec, a_vec], dim=1)
        return self.fcNet(joined), q_state, a_state

    def init_hidden(self, batch_size):
        """Return fresh `(q_hidden, a_hidden)` states from the two encoders."""
        return self.qNet.init_hidden(batch_size), self.aNet.init_hidden(batch_size)



In [10]:
# Model, loss and optimizer.
net = Model()
#print(net)

# Move the model to the GPU *before* building the optimizer: the PyTorch
# optimizer docs require parameters to be on their final device when the
# optimizer is constructed.
if train_on_gpu:
    net = net.cuda()

lr = 0.001
# Binary cross-entropy on the model's sigmoid output.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [21]:
# Sanity check of the loss: prediction and target tensors with identical
# contents are passed to `criterion`, and the resulting scalar is printed.
a = torch.tensor([[1.],[0.],[0.],[1.],[0.]])
b = torch.tensor([[1.],[0.],[0.],[1.],[0.]])
l = criterion(a,b)
print(l.item())

0.0


In [11]:
# One shuffled loader per split. The loaders use the shared `batch_size`
# constant because train() and test() compare each batch's length against
# that same constant; a separately hard-coded value could silently drift.
custom_data = CustomDatasetFromCSV(test_set)
test_loader = torch.utils.data.DataLoader(dataset=custom_data, batch_size=batch_size, shuffle=True)

custom_data = CustomDatasetFromCSV(train_set)
train_loader = torch.utils.data.DataLoader(dataset=custom_data, batch_size=batch_size, shuffle=True)

custom_data = CustomDatasetFromCSV(valid_set)
valid_loader = torch.utils.data.DataLoader(dataset=custom_data, batch_size=batch_size, shuffle=True)

In [12]:
def get_acc(yhat, y):
    """Threshold predictions at 0.5 and compare them with the labels.

    Both tensors are flattened first. If their flattened sizes differ, a
    message is printed and `(0, [], [])` is returned.

    Returns:
        `(accuracy, predictions, labels)`: the fraction of matching entries,
        the 0./1. float predictions, and the flattened labels.
    """
    scores = yhat.flatten()
    targets = y.flatten()
    if scores.size() != targets.size():
        print("dimension mismatch")
        return 0, [], []

    # Strictly greater than 0.5 counts as a positive prediction.
    predicted = torch.gt(scores, 0.5).float()
    n_correct = predicted.eq(targets.float()).sum().item()
    return n_correct / len(targets), predicted, targets
    

In [13]:
# TensorBoard writer used by the training and evaluation loops.
writer = SummaryWriter(comment='A')

# Scalar tag names for per-batch training metrics.
tag_train_loss = "train_loss"
tag_train_acc = "train_acc"

# Scalar tag names for per-batch validation metrics.
tag_valid_loss = "valid_loss"
tag_valid_acc = "valid_acc"

# Scalar tag names for the per-batch F1 scores.
tag_train_f1score = "train_f1score"
tag_valid_f1score = "valid_f1score"


In [14]:
# Use this for both test and validation set
def test(dataloader, iteration=None):
    """Evaluate the global `net` on every sample produced by `dataloader`.

    The model is switched to eval mode and left in that mode; callers that
    continue training must call `net.train()` afterwards.

    Args:
        dataloader: yields `([questions, answers], labels)` batches.
        iteration: step counter to continue TensorBoard logging from. Each
            batch is logged under the validation tags at the next step
            (the first batch at `iteration + 1`). With the default of None,
            nothing is written to TensorBoard, so the function can be used
            for a held-out test set without touching the validation curves.

    Returns:
        `(loss, accuracy)` averaged over all samples seen, or `(nan, nan)`
        when the loader yields no samples.
    """
    net.eval()
    step = iteration
    total_loss = 0.0
    total_acc = 0.0
    elements_seen = 0

    with torch.no_grad():
        for x, y in dataloader:
            q, a = x[0], x[1]
            if train_on_gpu:
                q, a, y = q.cuda(), a.cuda(), y.cuda()

            n = len(y)
            if n == 0:
                continue

            # Size the hidden state to this batch, so the final, shorter batch
            # is evaluated as well instead of being dropped.
            q_hidden, a_hidden = net.init_hidden(n)
            output, _, _ = net(questions=q, answers=a, q_hidden=q_hidden, a_hidden=a_hidden)
            output = output.flatten()
            loss = criterion(output, y.float())
            batch_acc, pred_val, actual_val = get_acc(output, y)

            if step is not None:
                step += 1
                # sklearn's signature is f1_score(y_true, y_pred).
                f1score = f1_score(actual_val.cpu().numpy(), pred_val.cpu().numpy())
                writer.add_scalar(tag_valid_loss, loss.item(), step)
                writer.add_scalar(tag_valid_acc, batch_acc, step)
                writer.add_scalar(tag_valid_f1score, f1score, step)

            # Weight by batch length so a short batch does not count as much
            # as a full one (assumes `criterion` returns a per-batch mean).
            total_loss += loss.item() * n
            total_acc += batch_acc * n
            elements_seen += n

    if elements_seen == 0:
        return float('nan'), float('nan')
    return total_loss / elements_seen, total_acc / elements_seen

In [15]:
# Module-level best-loss placeholder. train() assigns its own local variable
# with the same name, so it does not read or update this one.
lowest_loss = np.inf

In [16]:
def train(epoch=1, print_every=1, clip = 5):
    """Train the global `net` on `train_loader` and checkpoint the best model.

    Every batch is logged to TensorBoard under the training tags. Every
    `print_every` optimisation steps the function evaluates on `valid_loader`,
    prints the running training averages next to the validation metrics, and
    saves `net.state_dict()` to `model_path` whenever the validation loss is
    the lowest seen during this call.

    Args:
        epoch: number of passes over `train_loader`.
        print_every: number of optimisation steps between two reports.
        clip: maximum gradient norm passed to `clip_grad_norm_`.

    Returns:
        None.
    """
    net.train()
    count = 0
    interrupted = False
    lowest_loss = np.inf

    # Running sums for the current reporting window. They are reset together,
    # and only after a report, so a window that spans an epoch boundary still
    # divides the accumulated sums by the matching number of batches.
    train_loss = 0.0
    train_acc = 0.0
    batches_so_far = 0
    elements_seen = 0

    for e in range(epoch):
        for x, y in train_loader:
            q = x[0]
            a = x[1]
            if train_on_gpu:
                q = q.cuda()
                a = a.cuda()
                y = y.cuda()
            l = len(y)
            if interrupted:
                interrupted = False
                print("Continue as usual again")

            if l != batch_size:
                print("Abrupt batch size for training, l = ", l)
                print("epoch ", e)
                print("count ", count)
                print("batches_so_far ", batches_so_far)
                print("elements_seen ", elements_seen)
                interrupted = True
                # last incomplete batch, skip over it.
                continue

            elements_seen += l
            batches_so_far += 1
            count += 1

            net.zero_grad()
            q_hidden, a_hidden = net.init_hidden(batch_size)
            output, q_hidden, a_hidden = net(questions = q, answers = a, q_hidden = q_hidden, a_hidden = a_hidden)
            output = output.flatten()

            loss = criterion(output, y.float())
            loss.backward()
            # Bound the gradient norm before the update.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()

            train_acc_val, pred_val, actual_val = get_acc(output, y)
            # sklearn's signature is f1_score(y_true, y_pred).
            f1score = f1_score(actual_val.cpu().data.numpy(), pred_val.cpu().data.numpy())
            writer.add_scalar(tag_train_loss, loss.item(), count)
            writer.add_scalar(tag_train_acc, train_acc_val, count)
            writer.add_scalar(tag_train_f1score, f1score, count)

            train_loss += loss.item()
            train_acc += train_acc_val

            if count % print_every == 0:
                final_train_loss = train_loss / batches_so_far
                final_train_acc = train_acc / batches_so_far

                valid_loss, valid_acc = test(valid_loader, iteration=count)

                clear_output(wait=True)
                print("Count ", count, "\t Epoch ", e)
                print("Train Loss ", final_train_loss, "\tTrain Acc ", final_train_acc, "\nValid Loss", valid_loss, "\tValid Acc", valid_acc)

                if valid_loss < lowest_loss:
                    lowest_loss = valid_loss
                    torch.save(net.state_dict(), model_path)
                # test() leaves the model in eval mode; switch back.
                net.train()

                # Start a new reporting window.
                train_loss = 0.0
                train_acc = 0.0
                batches_so_far = 0
                elements_seen = 0

In [None]:
# Test loss here

In [None]:
# Run training for 10 epochs, reporting every 100 optimisation steps.
train(epoch = 10, print_every = 100)

In [None]:
# Load the saved state dict. map_location remaps the stored tensors onto the
# device chosen for this session, so a checkpoint written on a GPU machine
# also loads on a CPU-only host.
state_dict_1 = torch.load(model_path, map_location=torch.device('cuda' if train_on_gpu else 'cpu'))

# Copy the weights into the model.
net.load_state_dict(state_dict_1)
if train_on_gpu:
    net = net.cuda()

In [None]:
# Print every parameter tensor of the model.
# NOTE(review): this writes the full weight values; the output can be very large.
print(list(net.parameters()))

In [None]:
# Evaluate the model on the test loader and report the two values test() returns.
test_loss, test_accuracy = test(test_loader)

#clear_output(wait=True)
print("test_loss ", test_loss, "\ttest_accuracy ", test_accuracy)