<h2 id="tocheading">Table of Contents</h2>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
import pickle
import random
import random
import spacy
import csv
import string
import os
import torch
import numpy as np
import pandas as pd
import spacy
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

## Part 1: Data Upload & Preprocessing
The datasets provided are already tokenized. Thus, without running the data through a tokenizer, we use pretrained word embeddings (e.g. fast-Text) to embed the tokens. 

#### Word Vectors

The web page for recommended word vector sets can be found here: https://fasttext.cc/docs/en/english-vectors.html wiki-news-300d-1M.vec from Mikolov et al (2018, Advances in Pre-Training Distributed Word Representations) 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens) is used in this assignment. 

In [4]:
import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', 
                  newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        ## convert all maps to lists
        data[tokens[0]] = [*map(float, tokens[1:])]
    return data

In [5]:
## get the wiki word vectors
fname = "wiki-news-300d-1M.vec"
word_vectors = load_vectors(fname)

In [6]:
all_vocab_tokens = [*word_vectors.keys()]

In [7]:
print ("The number of unique tokens in the wiki news English vectors is " + str(len(all_vocab_tokens) ))

The number of unique tokens in the wiki news English vectors is 999994


#### Construct Table from Vocab Dict

In [8]:
word_vector_df = pd.DataFrame(word_vectors)

In [9]:
word_vector_df = word_vector_df.T

In [10]:
table_lookup = np.array(word_vector_df)

In [11]:
def index_vocab(table_df):
    
    token_array = np.array([*table_df.index])
    num_index_array = np.array([*range(table_df.shape[0])])
    
    token2id = {}
    id2token = {}
    for i in [*range(len(token_array))]:
        token2id[token_array[i]] = num_index_array[i]
        id2token[num_index_array[i]] = token_array[i]

    return token2id, id2token

In [12]:
token2id_wiki, id2token_wiki = index_vocab(word_vector_df)

__Check for table correctness!__

Do token2id and id2token match each other?

In [13]:
token2id_wiki["Alberto"]

93141

In [14]:
id2token_wiki[93141]

'Alberto'

Does the table fit the initial word vector vocab?

In [15]:
all(word_vectors["Alberto"] == table_lookup[93141])==True

True

### Part 1.1: SNLI Dataset

In [16]:
label_dict = {"entailment":0,
             "neutral":1,
             "contradiction":2}

In [17]:
snli_train = pd.read_table("data/snli_train.tsv")
snli_val = pd.read_table("data/snli_val.tsv")

In [18]:
import pandas as pd

## get tokenized training data
snli_train["sentence1"] = snli_train["sentence1"].apply(lambda x: x.split(" "))
snli_train["sentence2"] = snli_train["sentence2"].apply(lambda x: x.split(" "))

## get numeric labels
snli_train["label_num"] = snli_train["label"].apply(lambda x: label_dict[x])
snli_val["label_num"] = snli_val["label"].apply(lambda x: label_dict[x])

## get tokenized validation data
snli_val["sentence1"] = snli_val["sentence1"].apply(lambda x: x.split(" "))
snli_val["sentence2"] = snli_val["sentence2"].apply(lambda x: x.split(" "))

## write original indices
snli_train["original_index"] = snli_train.index.values
snli_val["original_index"] = snli_val.index.values

## label arrays
snli_train_labels = np.array(snli_train["label_num"])
snli_val_labels = np.array(snli_val["label_num"])

### Part 1.2: MultiNLI Dataset

In [19]:
mnli_train = pd.read_table("data/mnli_train.tsv")
mnli_val = pd.read_table("data/mnli_val.tsv")

In [20]:
mnli_train.head(3)

Unnamed: 0,sentence1,sentence2,label,genre
0,and now that was in fifty one that 's forty ye...,It was already a problem forty years ago but n...,neutral,telephone
1,Jon could smell baked bread on the air and his...,Jon smelt food in the air and was hungry .,neutral,fiction
2,it will be like Italian basketball with the uh...,This type of Italian basketball is nothing lik...,contradiction,telephone


In [21]:
## get tokenized training data
mnli_train["sentence1"] = mnli_train["sentence1"].apply(lambda x: x.split(" "))
mnli_train["sentence2"] = mnli_train["sentence2"].apply(lambda x: x.split(" "))

In [22]:
## get tokenized validation data
mnli_val["sentence1"] = mnli_val["sentence1"].apply(lambda x: x.split(" "))
mnli_val["sentence2"] = mnli_val["sentence2"].apply(lambda x: x.split(" "))

In [23]:
## get labels
mnli_train["label_num"] = mnli_train["label"].apply(lambda x: label_dict[x])
mnli_val["label_num"] = mnli_val["label"].apply(lambda x: label_dict[x])

Get train and val datasets for each __MNLI genre__. 

In [24]:
mnli_train["original_index"] = mnli_train.index.values
mnli_val["original_index"] = mnli_val.index.values

In [25]:
## telephone
mnli_train_telephone = mnli_train[mnli_train["genre"]=="telephone"]
mnli_val_telephone = mnli_val[mnli_val["genre"]=="telephone"]
## slate
mnli_train_slate = mnli_train[mnli_train["genre"]=="slate"]
mnli_val_slate = mnli_val[mnli_val["genre"]=="slate"]
## travel
mnli_train_travel = mnli_train[mnli_train["genre"]=="travel"]
mnli_val_travel = mnli_val[mnli_val["genre"]=="travel"]
## government
mnli_train_government = mnli_train[mnli_train["genre"]=="government"]
mnli_val_government = mnli_val[mnli_val["genre"]=="government"]
## fiction
mnli_train_fiction = mnli_train[mnli_train["genre"]=="fiction"]
mnli_val_fiction = mnli_val[mnli_val["genre"]=="fiction"]

#### Data Loaders

In [26]:
## idx = token2id_wiki

def token2index_dataset(tokens_data,
                        idx_dict=None):
    indices_data = []
    for tokens in tokens_data:
        ## get index list for each sentence.
        index_list = [idx_dict[token] if token in \
                      idx_dict else idx_dict["unk"] for token in tokens]
        indices_data.append(index_list)
    return indices_data

In [27]:
def init_embedding_weights(vectors, 
                           token2id, 
                           id2token, 
                           embedding_size):
    
    weights = np.zeros((len(token2id), 
                        embedding_size))

    for idx in range(2, len(id2token)):
        token = id2token[idx]
        weights[idx] = np.array(token2id_wiki[token])
        
    weights[1] = np.random.randn(embedding_size)
    
    return weights

In [28]:
def build_vocabulary(train_data, 
                     vocab_size, 
                     vocab_embeddings):
    all_tokens = []
    for row in (train_data['sentence1'] + train_data['sentence2']).iteritems():
        all_tokens += row[1]
        
    vocabulary, count = zip(*Counter(all_tokens).most_common(vocab_size))
    vectors = wiki_embed_table
    vocabulary = [word for word in vocabulary if word in token2id_wiki.keys()]

    id2token = list(vocabulary)
    token2id = dict(zip(vocabulary, range(2, 2+len(vocabulary))))
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    
    return token2id, id2token, vectors

In [29]:
def preprocess_data(data, dataset, vocab_size=50000):
#     data = prepare_data(data)
    if dataset == "train":
        token2id, id2token, vectors = build_vocabulary(data, 
                                                       vocab_size,
                                                       wiki_embed_table)
        return data, token2id, id2token, vectors
    return data

In [30]:
from collections import Counter

In [32]:
wiki_embed_table = torch.tensor(table_lookup)

In [33]:
snli_train, token2id, id2token, vectors = preprocess_data(snli_train,
                                                          "train", 
                                                          50000)

NameError: name 'PAD_IDX' is not defined

__Note:__ I am getting the indices for Sentence 1 and Sentence 2 separately (not concatenating them at first from the beginning) since, in hyperparameter search I want to try more than one ways of interacting the hidden representations of the two sentences. 

In [None]:

## SNLI
snli_train_sentence1_indices = token2index_dataset([*snli_train["sentence1"]],idx_dict=token2id_wiki)
snli_train_sentence2_indices = token2index_dataset([*snli_train["sentence2"]],idx_dict=token2id_wiki)
snli_val_sentence1_indices = token2index_dataset([*snli_val["sentence1"]],idx_dict=token2id_wiki)
snli_val_sentence2_indices = token2index_dataset([*snli_val["sentence2"]],idx_dict=token2id_wiki)

## MNLI
mnli_train_sentence1_indices = token2index_dataset([*mnli_train["sentence1"]],idx_dict=token2id_wiki)
mnli_train_sentence2_indices = token2index_dataset([*mnli_train["sentence2"]],idx_dict=token2id_wiki)
mnli_val_sentence1_indices = token2index_dataset([*mnli_val["sentence1"]],idx_dict=token2id_wiki)
mnli_val_sentence2_indices = token2index_dataset([*mnli_val["sentence2"]],idx_dict=token2id_wiki)



In [34]:
## GENRES

## telephone
mnli_train_s1_tel_ix = token2index_dataset([*mnli_train_telephone["sentence1"]],idx_dict=token2id_wiki)
mnli_train_s2_tel_ix = token2index_dataset([*mnli_train_telephone["sentence2"]],idx_dict=token2id_wiki)
mnli_val_s1_tel_ix = token2index_dataset([*mnli_val_telephone["sentence1"]],idx_dict=token2id_wiki)
mnli_val_s2_tel_ix = token2index_dataset([*mnli_val_telephone["sentence2"]],idx_dict=token2id_wiki)
## slate
mnli_train_s1_slate_ix = token2index_dataset([*mnli_train_slate["sentence1"]],idx_dict=token2id_wiki)
mnli_train_s2_slate_ix = token2index_dataset([*mnli_train_slate["sentence2"]],idx_dict=token2id_wiki)
mnli_val_s1_slate_ix = token2index_dataset([*mnli_val_slate["sentence1"]],idx_dict=token2id_wiki)
mnli_val_s2_slate_ix = token2index_dataset([*mnli_val_slate["sentence2"]],idx_dict=token2id_wiki)
## travel
mnli_train_s1_travel_ix = token2index_dataset([*mnli_train_travel["sentence1"]],idx_dict=token2id_wiki)
mnli_train_s2_travel_ix = token2index_dataset([*mnli_train_travel["sentence2"]],idx_dict=token2id_wiki)
mnli_val_s1_travel_ix = token2index_dataset([*mnli_val_travel["sentence1"]],idx_dict=token2id_wiki)
mnli_val_s2_travel_ix = token2index_dataset([*mnli_val_travel["sentence2"]],idx_dict=token2id_wiki)
## gov
mnli_train_s1_gov_ix = token2index_dataset([*mnli_train_government["sentence1"]],idx_dict=token2id_wiki)
mnli_train_s2_gov_ix = token2index_dataset([*mnli_train_government["sentence2"]],idx_dict=token2id_wiki)
mnli_val_s1_gov_ix = token2index_dataset([*mnli_val_government["sentence1"]],idx_dict=token2id_wiki)
mnli_val_s2_gov_ix = token2index_dataset([*mnli_val_government["sentence2"]],idx_dict=token2id_wiki)
## fiction
mnli_train_s1_fiction_ix = token2index_dataset([*mnli_train_fiction["sentence1"]],idx_dict=token2id_wiki)
mnli_train_s2_fiction_ix = token2index_dataset([*mnli_train_fiction["sentence2"]],idx_dict=token2id_wiki)
mnli_val_s1_fiction_ix = token2index_dataset([*mnli_val_fiction["sentence1"]],idx_dict=token2id_wiki)
mnli_val_s2_fiction_ix = token2index_dataset([*mnli_val_fiction["sentence2"]],idx_dict=token2id_wiki)

Getting training and validation set __labels__ (targets) for both datasets. 

In [35]:
## SNLI
snli_train_labels = np.array(snli_train["label_num"])
snli_val_labels = np.array(snli_val["label_num"])

## MNLI
mnli_train_labels = np.array(mnli_train["label_num"])
mnli_val_labels = np.array(mnli_val["label_num"])

## GENRES

## telephone
mnli_train_tel_labels = np.array(mnli_train_telephone["label_num"])
mnli_val_tel_labels = np.array(mnli_val_telephone["label_num"])
## slate
mnli_train_slate_labels = np.array(mnli_train_slate["label_num"])
mnli_val_slate_labels = np.array(mnli_val_slate["label_num"])
## travel
mnli_train_travel_labels = np.array(mnli_train_travel["label_num"])
mnli_val_travel_labels = np.array(mnli_val_travel["label_num"])
## gov
mnli_train_gov_labels = np.array(mnli_train_government["label_num"])
mnli_val_gov_labels = np.array(mnli_val_government["label_num"])
## fiction
mnli_train_fiction_labels = np.array(mnli_train_fiction["label_num"])
mnli_val_fiction_labels = np.array(mnli_val_fiction["label_num"])

Function to get pretrained word embeddings from the table

##### SNLI Data Loader

In [73]:
MAX_SENTENCE_LENGTH = 100
PAD_IDX = 0
UNK_IDX = 1

BATCH_SIZE = 64

class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data, token2id=token2id_wiki, max_sentence_length=MAX_SENTENCE_LENGTH):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.sentence1, self.sentence2, self.labels = data['sentence1'].values, \
                                                      data['sentence2'].values, data['label_num'].values
        self.max_sentence_length = max_sentence_length
        self.token2id = token2id_wiki

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, row):
        label = self.labels[row]
        sentence1_word_idx, sentence2_word_idx = [], []
        sentence1_mask, sentence2_mask = [], []
        
        for word in self.sentence1[row][:MAX_SENTENCE_LENGTH]:
            if word in self.token2id.keys():
                sentence1_word_idx.append(self.token2id[word])
                sentence1_mask.append(0)
            else:
                sentence1_word_idx.append(UNK_IDX)
                sentence1_mask.append(1)
                
        for word in self.sentence2[row][:MAX_SENTENCE_LENGTH]:
            if word in self.token2id.keys():
                sentence2_word_idx.append(self.token2id[word])
                sentence2_mask.append(0)
            else:
                sentence2_word_idx.append(UNK_IDX)
                sentence2_mask.append(1)
        
        sentence1_list = [sentence1_word_idx, sentence1_mask, len(sentence1_word_idx)]
        sentence2_list = [sentence2_word_idx, sentence2_mask, len(sentence2_word_idx)]
        
        return sentence1_list + sentence2_list + [label]


    
def snli_func(batch, max_sent_length):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    sentence1_data, sentence2_data = [], []
    sentence1_mask, sentence2_mask = [], []
    s1_lengths, s2_lengths = [], []
    labels = []

    for datum in batch:
        s1_lengths.append(datum[2])
        s2_lengths.append(datum[5])
        labels.append(datum[6])
        
        # PAD
        sentence1_data_padded = np.pad(np.array(datum[0]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        sentence1_data.append(sentence1_data_padded)
        
        sentence1_mask_padded = np.pad(np.array(datum[1]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        sentence1_mask.append(sentence1_mask_padded)
        
        sentence2_data_padded = np.pad(np.array(datum[3]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])),
                                mode="constant", constant_values=0)
        sentence2_data.append(sentence2_data_padded)
        
        sentence2_mask_padded = np.pad(np.array(datum[4]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])),
                               mode="constant", constant_values=0)
        sentence2_mask.append(sentence2_mask_padded)
        
    ind_dec_order = np.argsort(s1_lengths)[::-1]
    sentence1_data = np.array(sentence1_data)[ind_dec_order]
    sentence2_data = np.array(sentence2_data)[ind_dec_order]
    sentence1_mask = np.array(sentence1_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    sentence2_mask = np.array(sentence2_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    s1_lengths = np.array(s1_lengths)[ind_dec_order]
    s2_lengths = np.array(s2_lengths)[ind_dec_order]
    
    labels = np.array(labels)[ind_dec_order]
    
    s1_list = [torch.from_numpy(sentence1_data), 
               torch.from_numpy(sentence1_mask).float(), s1_lengths]
    s2_list = [torch.from_numpy(sentence2_data), 
               torch.from_numpy(sentence2_mask).float(), s2_lengths]
        
    return s1_list + s2_list + [torch.from_numpy(labels)]


snli_train_dataset = SNLIDataset(snli_train, 
                                 max_sentence_length=MAX_SENTENCE_LENGTH)
snli_train_loader = torch.utils.data.DataLoader(dataset=snli_train_dataset,
                               batch_size=BATCH_SIZE,
                               collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: snli_func(x, max_sentence_length),
                               shuffle=False)

snli_val_dataset = SNLIDataset(snli_val, 
                               max_sentence_length=MAX_SENTENCE_LENGTH)
snli_val_loader = torch.utils.data.DataLoader(dataset=snli_val_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: snli_func(x, max_sentence_length),
                             shuffle=False)

##### MNLI Data Loader

In [37]:
MAX_SENTENCE_LENGTH = 100
PAD_IDX = 0
UNK_IDX = 1

BATCH_SIZE = 64

class MNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data, token2id=token2id_wiki, max_sentence_length=MAX_SENTENCE_LENGTH):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.sentence1, self.sentence2, self.labels = data['sentence1'].values, \
                                                      data['sentence2'].values, data['label_num'].values
        self.max_sentence_length = max_sentence_length
        self.token2id = token2id_wiki

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, row):
        label = self.labels[row]
        sentence1_word_idx, sentence2_word_idx = [], []
        sentence1_mask, sentence2_mask = [], []
        
        for word in self.sentence1[row][:MAX_SENTENCE_LENGTH]:
            if word in self.token2id.keys():
                sentence1_word_idx.append(self.token2id[word])
                sentence1_mask.append(0)
            else:
                sentence1_word_idx.append(UNK_IDX)
                sentence1_mask.append(1)
                
        for word in self.sentence2[row][:MAX_SENTENCE_LENGTH]:
            if word in self.token2id.keys():
                sentence2_word_idx.append(self.token2id[word])
                sentence2_mask.append(0)
            else:
                sentence2_word_idx.append(UNK_IDX)
                sentence2_mask.append(1)
        
        sentence1_list = [sentence1_word_idx, sentence1_mask, len(sentence1_word_idx)]
        sentence2_list = [sentence2_word_idx, sentence2_mask, len(sentence2_word_idx)]
        
        return sentence1_list + sentence2_list + [label]


    
def mnli_func(batch, max_sent_length):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    sentence1_data, sentence2_data = [], []
    sentence1_mask, sentence2_mask = [], []
    s1_lengths, s2_lengths = [], []
    labels = []

    for datum in batch:
        s1_lengths.append(datum[2])
        s2_lengths.append(datum[5])
        labels.append(datum[6])
        
        # PAD
        sentence1_data_padded = np.pad(np.array(datum[0]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        sentence1_data.append(sentence1_data_padded)
        
        sentence1_mask_padded = np.pad(np.array(datum[1]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        sentence1_mask.append(sentence1_mask_padded)
        
        sentence2_data_padded = np.pad(np.array(datum[3]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])),
                                mode="constant", constant_values=0)
        sentence2_data.append(sentence2_data_padded)
        
        sentence2_mask_padded = np.pad(np.array(datum[4]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])),
                               mode="constant", constant_values=0)
        sentence2_mask.append(sentence2_mask_padded)
        
    ind_dec_order = np.argsort(s1_lengths)[::-1]
    sentence1_data = np.array(sentence1_data)[ind_dec_order]
    sentence2_data = np.array(sentence2_data)[ind_dec_order]
    sentence1_mask = np.array(sentence1_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    sentence2_mask = np.array(sentence2_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    s1_lengths = np.array(s1_lengths)[ind_dec_order]
    s2_lengths = np.array(s2_lengths)[ind_dec_order]
    
    labels = np.array(labels)[ind_dec_order]
    
    s1_list = [torch.from_numpy(sentence1_data), 
               torch.from_numpy(sentence1_mask).float(), s1_lengths]
    s2_list = [torch.from_numpy(sentence2_data), 
               torch.from_numpy(sentence2_mask).float(), s2_lengths]
        
    return s1_list + s2_list + [torch.from_numpy(labels)]


mnli_train_dataset = MNLIDataset(mnli_train, 
                                 max_sentence_length=MAX_SENTENCE_LENGTH)
mnli_train_loader = torch.utils.data.DataLoader(dataset=mnli_train_dataset,
                               batch_size=BATCH_SIZE,
                               collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: mnli_func(x, max_sentence_length),
                               shuffle=False)

mnli_val_dataset = SNLIDataset(snli_val, 
                               max_sentence_length=MAX_SENTENCE_LENGTH)
mnli_val_loader = torch.utils.data.DataLoader(dataset=mnli_val_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: mnli_func(x, max_sentence_length),
                             shuffle=False)

## Part 2: Model

The model is trained on SNLI training set. The best model is chosen using SNLI validation set, then the best model is evaluated on each genre in MultiNLI validation set. 

We will use an encoder (either a CNN or an RNN) to map each string of text (hypothesis and premise) to a fixed-dimension vector representation. 

- We will interact the two hidden representations and output a __3-class softmax__. 

- To keep things simple, we will simply __concatenate__ the two representations, and feed them through a network of __2 fully-connected layers__. 

- For the encoder, we want the following:

### Part 2.1: Bidirectional_GRU

In [39]:
# batch_size = 8
# batch_size = 16
# epochs = 20
no_cuda = False
# log_interval = 1

cuda = not no_cuda and torch.cuda.is_available()
# cuda = False

seed = 1
torch.manual_seed(seed)

# device = torch.device("cuda" if args.cuda else "cpu")
device = torch.device("cuda" if cuda else "cpu")

# kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}

In [40]:
wiki_embed_table = torch.tensor(table_lookup)

In [41]:
# max_train_length = max([snli_train_dataset_s1[x][1] for x in range(len(snli_train_dataset_s1))])

class biGRU(nn.Module):
    
    def __init__(self,
                 hidden_size,
                 embedding_weights,
                 percent_dropout,
                 vocab_size=wiki_embed_table.size(0),
                 interaction_type="concat",
                 num_layers=1,
                 input_size=300):

        super(biGRU, self).__init__()
        
        self.num_layers, self.hidden_size = num_layers, hidden_size
        
        ## use pretrained word embeddings
        wiki_embed_table = torch.FloatTensor(table_lookup)
        embedding = nn.Embedding.from_pretrained(wiki_embed_table)
        self.embedding = embedding
        
        self.interaction = interaction_type
        self.dropout = percent_dropout
        
        self.GRU = nn.GRU(300, hidden_size, num_layers, 
                          batch_first=True, bidirectional=True)
        
        self.drop_out = nn.Dropout(self.dropout)
        
    def init_hidden(self, batch_size):
        hidden = torch.randn(2*self.num_layers, ## 2 for bidirectional
                             batch_size, self.hidden_size).to(device)
        return hidden
    
    def forward(self, sentence, 
                mask, lengths):
        sort_original = sorted(range(len(lengths)), 
                             key=lambda sentence: -lengths[sentence])
        unsort_to_original = sorted(range(len(lengths)), 
                             key=lambda sentence: sort_original[sentence])
        
        sentence = sentence[sort_original]
        _mask = mask[sort_original]
        lengths = lengths[sort_original]
        
        batch_size, seq_len = sentence.size()
        
        # init hidden
        self.hidden = self.init_hidden(batch_size)
        
        embeds = self.embedding(sentence)
        embeds = mask*embeds + (1-_mask)*embeds.clone().detach()
        embeds = torch.nn.utils.rnn.pack_padded_sequence(embeds, lengths, 
                                                         batch_first=True)
        
        gru_out, self.hidden = self.GRU(embeds, self.hidden)
        
        # undo packing
        gru_out, _ = torch.nn.utils.rnn.pad_packed_sequence(gru_out, 
                                                            batch_first=True)
        
        gru_out = gru_out.view(batch_size, -1, 2, self.hidden_size)
        gru_out = torch.sum(gru_out, dim=1)
        gru_out = torch.cat([gru_out[:,i,:] for i in range(2)], dim=1)
        gru_out = gru_out[unsort_to_original] ## back to original indices
        
        return gru_out
    
    
class Linear_Layers(nn.Module):
    
    def __init__(self,
                 hidden_size,
                 hidden_size_2,
                 percent_dropout,
                 interaction_type="concat",
                 classes=3,
                 input_size=300):
        
        super(Linear_Layers, self).__init__()

        self.interaction = interaction_type
        self.num_classes = classes
        self.hidden_size = hidden_size
        self.hidden_size_2 = hidden_size_2
        self.percent_dropout = percent_dropout
        self.num_classes = classes

        if self.interaction == "concat":
            ## 2 x 2 for bidirectional and concatenation
            self.fc1 = nn.Linear(4*self.hidden_size, self.hidden_size_2)
        else:
            ## 2 for bidirectional for multiplication and subtraction
            self.fc1 = nn.Linear(2*self.hidden_size, self.hidden_size_2)
        
        self.fc2 = nn.Linear(self.hidden_size_2, self.num_classes)
        self.ReLU = nn.ReLU(inplace=True)
        self.drop_out = nn.Dropout(self.percent_dropout)

        self.init_weights()
        
    def init_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight)
                nn.init.uniform_(module.bias)

    def forward(self, gru_out_1, gru_out_2):
        
        if self.interaction == "concat":
            hidden = torch.cat([gru_out_1, gru_out_2], dim=1)
        elif self.interaction == "mul":
            ## elem-wise multiplication *
            hidden = gru_out_1*gru_out_2
        elif self.interaction == "subtract":
            ## subtraction
            hidden = gru_out_1-gru_out_2
        
        hidden = hidden.view(hidden.size(0),-1) 
        
        hidden_out = self.fc1(hidden)
        hidden_out = self.drop_out(self.ReLU(hidden_out))
        out = self.fc2(hidden_out)
        
        return out

In [42]:
len(snli_train)/64

1562.5

In [43]:
def train(RNN, 
          Linear_Classifier, 
          DataLoader, 
          criterion, 
          optimizer, 
          epoch):
    
    RNN.train()
    Linear_Classifier.train()
    total_loss = 0
    
    for batch_idx, (sentence1, s1_original, sentence1_lengths, 
                    sentence2, s2_original, sentence2_lengths, labels)\
    in enumerate(DataLoader):
            
        sentence1, s1_original = sentence1.to(device), s1_original.to(device),  
        sentence2, s2_original = sentence2.to(device), s2_original.to(device),
        labels = labels.to(device)
        
        RNN.train()
        Linear_Classifier.train()
        optimizer.zero_grad()
        
        output_s1 = RNN(sentence1, 
                              s1_original, 
                              sentence1_lengths)
        output_s2 = RNN(sentence2, 
                              s2_original, 
                              sentence2_lengths)
        
        out = Linear_Classifier(output_s1, output_s2)
        
        loss = criterion(out, labels)
        loss.cuda().backward()
        optimizer.step()

        total_loss += loss.item() * len(sentence1) / len(DataLoader.dataset)
        
        if (batch_idx+1) % (len(DataLoader.dataset)//(20*labels.shape[0])) == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, (batch_idx+1) * labels.shape[0], len(DataLoader.dataset),
                100. * (batch_idx+1) / len(DataLoader), loss.item()))

    optimizer.zero_grad()
    return total_loss


def test(RNN, 
         Linear_Classifier, 
         DataLoader, 
         criterion):

    RNN.eval()
    Linear_Classifier.eval()
    
    test_loss = 0
    label_list = []
    output_list = []
    
    with torch.no_grad():
        for batch_idx, (sentence1, s1_original, sentence1_lengths, 
                    sentence2, s2_original, sentence2_lengths, labels)\
                    in enumerate(DataLoader):

            sentence1, s1_original = sentence1.to(device), s1_original.to(device),  
            sentence2, s2_original = sentence2.to(device), s2_original.to(device),
            labels = labels.to(device)
            
            # Forward
            output_s1 = RNN(sentence1, 
                                  s1_original, 
                                  sentence1_lengths)
            # Reverse
            output_s2 = RNN(sentence2, 
                                  s2_original, 
                                  sentence2_lengths)
            
            out = Linear_Classifier(output_s1, output_s2)
        
            loss = criterion(out, labels)

            test_loss += loss.item()/len(DataLoader.dataset)

            output_list.append(out)
            label_list.append(labels)
            
            print ("outputs= "+str(torch.cat(output_list, dim=0)))
            print ("labels= "+str(torch.cat(label_list, dim=0)))
            
    return test_loss, torch.cat(output_list, dim=0), torch.cat(label_list, dim=0)

def accuracy(RNN, 
             Linear_Classifier, 
             DataLoader, 
             criterion):
    
    _, predicted, true_labels = test(RNN = RNN,
                              Linear_Classifier = Linear_Classifier,
                              DataLoader = DataLoader,
                              criterion = criterion)

    predicted = predicted.max(1)[1]
    return 100 * predicted.eq(true_labels.data.view_as(predicted)).float().mean().item()

In [44]:
vocab_size = 50000
num_classes = 3
num_layers = 1
bidirectional = True
gru_hidden_size = 256
classifier_hidden_size = 512

BATCH_SIZE = 32
lr = 3e-4
n_epochs = 10

#### Hyperparameter Search

The hyperparameters included in the hyperparameter search space are;

- The size of the hidden dimension of the CNN,
- The kernel size of the CNN,
- Experiment with different ways of interacting the two encoded sentences (concatenation, element-wise multiplication, outer multiplication etc)
- Regularization (e.g. weight decay, dropout).


In [45]:
import itertools

params = [[1e-3, 1e-2],    ## learning rate
          [512], ## hidden_size
          [64, 128],     ## hidden_size_2
          [0.1],         ## dropout
          ["mul","concat","subtract"]]  ## interaction type

params = [*itertools.product(*params)]

In [46]:
params

[(0.001, 512, 64, 0.1, 'mul'),
 (0.001, 512, 64, 0.1, 'concat'),
 (0.001, 512, 64, 0.1, 'subtract'),
 (0.001, 512, 128, 0.1, 'mul'),
 (0.001, 512, 128, 0.1, 'concat'),
 (0.001, 512, 128, 0.1, 'subtract'),
 (0.01, 512, 64, 0.1, 'mul'),
 (0.01, 512, 64, 0.1, 'concat'),
 (0.01, 512, 64, 0.1, 'subtract'),
 (0.01, 512, 128, 0.1, 'mul'),
 (0.01, 512, 128, 0.1, 'concat'),
 (0.01, 512, 128, 0.1, 'subtract')]

In [47]:
table_lookup.shape

(999994, 300)

In [49]:
param_losses = {}

for param_set in params:
    print ("Parameter Set: "+str(param_set))
    ## INITIALIZE VALIDATION ACCURACY LIST
    param_losses[param_set] = []
    vectors = table_lookup
    weights_init = init_embedding_weights(vectors, 
                                     token2id_wiki, 
                                     id2token_wiki,
                                     embedding_size = 300)

    RNN = biGRU(hidden_size=param_set[1],
                num_layers=1,
              percent_dropout = param_set[3],
              embedding_weights = weights_init,
              vocab_size=wiki_embed_table.size(0),
              interaction_type=param_set[4],
              input_size=300).to(device)
    
    linear_model = Linear_Layers(hidden_size = param_set[1],
                 hidden_size_2 = param_set[2],
                 percent_dropout = param_set[3],
                 interaction_type=param_set[4],
                 classes=3,
                 input_size=300).to(device)

    training_accuracy = []
    validation_accuracy = []
    num_epochs = 5

    for epoch in range(num_epochs):
        print ("epoch = "+str(epoch))

        loss_train = train(RNN, 
                           linear_model,
                           DataLoader = snli_train_loader,
                           criterion = nn.CrossEntropyLoss(),
                           optimizer = torch.optim.Adam(list(RNN.parameters()) + \
                                                       list(linear_model.parameters()), 
                                                       lr=lr), 
                          epoch = epoch)

        loss_val, val_preds, val_true = test(
            RNN, 
            linear_model,
            DataLoader = snli_val_loader,
            criterion = nn.CrossEntropyLoss(reduction='sum'))

        train_acc = accuracy(RNN, linear_model, snli_train_loader, 
                              nn.CrossEntropyLoss(reduction='sum'))
        val_acc = accuracy(RNN, linear_model, 
                            snli_val_loader, nn.CrossEntropyLoss(reduction='sum'))

        training_accuracy.append(train_acc)
        validation_accuracy.append(val_acc)

    # saving to pickle
    param_losses[param_set] = validation_accuracy
    pd.DataFrame(param_losses).to_csv("GRU_val_acc_2.csv")

Parameter Set: (0.001, 512, 64, 0.1, 'mul')
epoch = 0
epoch = 1
epoch = 2
epoch = 3
epoch = 4
Parameter Set: (0.001, 512, 64, 0.1, 'concat')
epoch = 0
epoch = 1
epoch = 2


epoch = 3
epoch = 4
Parameter Set: (0.001, 512, 64, 0.1, 'subtract')
epoch = 0
epoch = 1
epoch = 2
epoch = 3
epoch = 4
Parameter Set: (0.001, 512, 128, 0.1, 'mul')
epoch = 0


epoch = 1
epoch = 2
epoch = 3
epoch = 4
Parameter Set: (0.001, 512, 128, 0.1, 'concat')
epoch = 0
epoch = 1
epoch = 2
epoch = 3


epoch = 4
Parameter Set: (0.001, 512, 128, 0.1, 'subtract')
epoch = 0
epoch = 1
epoch = 2
epoch = 3
epoch = 4
Parameter Set: (0.01, 512, 64, 0.1, 'mul')
epoch = 0
epoch = 1


epoch = 2
epoch = 3
epoch = 4
Parameter Set: (0.01, 512, 64, 0.1, 'concat')
epoch = 0
epoch = 1
epoch = 2
epoch = 3
epoch = 4


Parameter Set: (0.01, 512, 64, 0.1, 'subtract')
epoch = 0
epoch = 1
epoch = 2
epoch = 3
epoch = 4
Parameter Set: (0.01, 512, 128, 0.1, 'mul')
epoch = 0
epoch = 1
epoch = 2


epoch = 3
epoch = 4
Parameter Set: (0.01, 512, 128, 0.1, 'concat')
epoch = 0
epoch = 1
epoch = 2
epoch = 3
epoch = 4
Parameter Set: (0.01, 512, 128, 0.1, 'subtract')
epoch = 0


epoch = 1
epoch = 2
epoch = 3
epoch = 4


### Part 3: Test Performance

In [54]:
## get the best performing model from validation
import numpy as np
import pandas as pd 

validation_gru = pd.DataFrame(pd.read_csv("GRU_val_acc_2.csv", header=None)).drop(0,1)
    
validation_gru = validation_gru.T

validation_gru.columns=["lr","hidden","hidden_2",
                                     "dropout","interaction"]+[*range(1,6)]

In [56]:
validation_gru

Unnamed: 0,lr,hidden,hidden_2,dropout,interaction,1,2,3,4,5
1,0.001,512,64,0.1,concat,48.90000224113464,57.40000009536743,58.90000462532044,61.900001764297485,66.40000343322754
2,0.001,512,64,0.1,mul,40.30000269412994,44.20000314712525,51.10000371932984,54.40000295639038,55.80000281333923
3,0.001,512,64,0.1,subtract,44.40000355243683,51.30000114440918,57.50000476837158,59.800004959106445,64.20000195503235
4,0.001,512,128,0.1,concat,49.2000013589859,58.40000510215759,60.10000109672546,62.5,63.60000371932984
5,0.001,512,128,0.1,mul,38.30000162124634,44.10000145435333,52.000004053115845,54.90000247955322,58.60000252723694
6,0.001,512,128,0.1,subtract,46.50000333786011,51.80000066757202,58.79999995231629,61.00000143051148,64.10000324249268
7,0.01,512,64,0.1,concat,50.40000081062317,56.40000104904175,61.10000014305115,60.80000400543213,64.30000066757202
8,0.01,512,64,0.1,mul,38.200002908706665,41.7000025510788,50.49999952316284,50.49999952316284,54.30000424385071
9,0.01,512,64,0.1,subtract,45.6000030040741,52.900004386901855,56.49999976158142,60.00000238418579,64.10000324249268
10,0.01,512,128,0.1,concat,48.50000143051148,56.2000036239624,58.79999995231629,62.10000514984131,64.20000195503235


In [58]:
validation_gru["max_val_acc"] = [max(validation_gru[[*range(1,6)]].\
                                     iloc[i]) for i in range(len(validation_gru))]

In [59]:
np.where(np.array(validation_gru["max_val_acc"])==max(validation_gru["max_val_acc"]))

(array([0]),)

In [60]:
validation_gru.iloc[0]

lr                          0.001
hidden                        512
hidden_2                       64
dropout                       0.1
interaction                concat
1              48.900002241134644
2               57.40000009536743
3              58.900004625320435
4              61.900001764297485
5               66.40000343322754
max_val_acc     66.40000343322754
Name: 1, dtype: object

In [64]:
best_series = validation_gru.iloc[0]

hid = int(best_series["hidden"])
hid2 = int(best_series["hidden_2"])
drop = float(best_series["dropout"])
inter = best_series["interaction"]

In [65]:
vectors = table_lookup
weights_init = init_embedding_weights(vectors, 
                                 token2id_wiki, 
                                 id2token_wiki,
                                 embedding_size = 300)

RNN = biGRU(hidden_size=hid,
            num_layers=1,
          percent_dropout = drop,
          embedding_weights = weights_init,
          vocab_size=wiki_embed_table.size(0),
          interaction_type=inter,
          input_size=300).to(device)

linear_model = Linear_Layers(hidden_size = hid,
             hidden_size_2 = hid2,
             percent_dropout = drop,
             interaction_type=inter,
             classes=3,
             input_size=300).to(device)

#### Government

In [79]:
BATCH_SIZE=32

mnli_val_dataset = MNLIDataset(mnli_val_government, 
                               max_sentence_length=MAX_SENTENCE_LENGTH)
mnli_val_loader = torch.utils.data.DataLoader(dataset=mnli_val_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: mnli_func(x, max_sentence_length),
                             shuffle=False)

loss_train = train(RNN,linear_model,
                   DataLoader = snli_train_loader, ### train on SNLI
                   criterion = nn.CrossEntropyLoss(),
                   optimizer = torch.optim.Adam(list(RNN.parameters()) + \
                                                       list(linear_model.parameters()), 
                                                       lr=lr), 
                          epoch = epoch)

loss_val, test_preds, test_true = test(
            RNN, 
            linear_model,
            DataLoader = mnli_val_loader,
            criterion = nn.CrossEntropyLoss(reduction='sum'))

print ("test_preds = "+str(test_preds))
print ("test_true = "+str(test_true))

test_acc = accuracy(RNN, linear_model, 
                            mnli_val_loader, nn.CrossEntropyLoss(reduction='sum'))

print ("Test accuracy = "+ str(test_acc))



KeyboardInterrupt: 

#### Telephone

In [None]:
BATCH_SIZE=len(mnli_val_telephone)

mnli_val_dataset = MNLIDataset(mnli_val_telephone, 
                               max_sentence_length=MAX_SENTENCE_LENGTH)
mnli_val_loader = torch.utils.data.DataLoader(dataset=mnli_val_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: mnli_func(x, max_sentence_length),
                             shuffle=False)

loss_train = train(RNN,linear_model,
                   DataLoader = snli_train_loader, ### train on SNLI
                   criterion = nn.CrossEntropyLoss(),
                   optimizer = torch.optim.Adam(list(RNN.parameters()) + \
                                                       list(linear_model.parameters()), 
                                                       lr=lr), 
                          epoch = epoch)

loss_val, test_preds, test_true = test(
            RNN, 
            linear_model,
            DataLoader = mnli_val_loader,
            criterion = nn.CrossEntropyLoss(reduction='sum'))

print ("test_preds = "+str(test_preds))
print ("test_true = "+str(test_true))

test_acc = accuracy(RNN, linear_model, 
                            mnli_val_loader, nn.CrossEntropyLoss(reduction='sum'))

print ("Test accuracy = "+ str(test_acc))



#### Slate

In [None]:
BATCH_SIZE=len(mnli_val_slate)

mnli_val_dataset = MNLIDataset(mnli_val_slate, 
                               max_sentence_length=MAX_SENTENCE_LENGTH)
mnli_val_loader = torch.utils.data.DataLoader(dataset=mnli_val_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: mnli_func(x, max_sentence_length),
                             shuffle=False)

loss_train = train(RNN,linear_model,
                   DataLoader = snli_train_loader, ### train on SNLI
                   criterion = nn.CrossEntropyLoss(),
                   optimizer = torch.optim.Adam(list(RNN.parameters()) + \
                                                       list(linear_model.parameters()), 
                                                       lr=lr), 
                          epoch = epoch)

loss_val, test_preds, test_true = test(
            RNN, 
            linear_model,
            DataLoader = mnli_val_loader,
            criterion = nn.CrossEntropyLoss(reduction='sum'))

print ("test_preds = "+str(test_preds))
print ("test_true = "+str(test_true))

test_acc = accuracy(RNN, linear_model, 
                            mnli_val_loader, nn.CrossEntropyLoss(reduction='sum'))

print ("Test accuracy = "+ str(test_acc))

#### Fiction

In [None]:
BATCH_SIZE=len(mnli_val_fiction)

mnli_val_dataset = MNLIDataset(mnli_val_fiction, 
                               max_sentence_length=MAX_SENTENCE_LENGTH)
mnli_val_loader = torch.utils.data.DataLoader(dataset=mnli_val_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: mnli_func(x, max_sentence_length),
                             shuffle=False)

loss_train = train(RNN,linear_model,
                   DataLoader = snli_train_loader, ### train on SNLI
                   criterion = nn.CrossEntropyLoss(),
                   optimizer = torch.optim.Adam(list(RNN.parameters()) + \
                                                       list(linear_model.parameters()), 
                                                       lr=lr), 
                          epoch = epoch)

loss_val, test_preds, test_true = test(
            RNN, 
            linear_model,
            DataLoader = mnli_val_loader,
            criterion = nn.CrossEntropyLoss(reduction='sum'))

print ("test_preds = "+str(test_preds))
print ("test_true = "+str(test_true))

test_acc = accuracy(RNN, linear_model, 
                            mnli_val_loader, nn.CrossEntropyLoss(reduction='sum'))

print ("Test accuracy = "+ str(test_acc))

#### Travel

In [None]:
BATCH_SIZE=len(mnli_val_travel)

mnli_val_dataset = MNLIDataset(mnli_val_travel, 
                               max_sentence_length=MAX_SENTENCE_LENGTH)
mnli_val_loader = torch.utils.data.DataLoader(dataset=mnli_val_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: mnli_func(x, max_sentence_length),
                             shuffle=False)

loss_train = train(RNN,linear_model,
                   DataLoader = snli_train_loader, ### train on SNLI
                   criterion = nn.CrossEntropyLoss(),
                   optimizer = torch.optim.Adam(list(RNN.parameters()) + \
                                                       list(linear_model.parameters()), 
                                                       lr=lr), 
                          epoch = epoch)

loss_val, test_preds, test_true = test(
            RNN, 
            linear_model,
            DataLoader = mnli_val_loader,
            criterion = nn.CrossEntropyLoss(reduction='sum'))

print ("test_preds = "+str(test_preds))
print ("test_true = "+str(test_true))

test_acc = accuracy(RNN, linear_model, 
                            mnli_val_loader, nn.CrossEntropyLoss(reduction='sum'))

print ("Test accuracy = "+ str(test_acc))