<h1 id="tocheading">MultiNLI Training</h1>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [2]:
import pickle
import random
import spacy
import errno
import glob
import string
import os
import jieba
import nltk
import functools
import numpy as np
import pandas as pd
from collections import Counter
from collections import defaultdict
from argparse import ArgumentParser
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset
import torch.nn.functional as F
from torch.autograd import Variable

## Imports

Besides the publicly available libraries above, we import our preprocessing functions, models (bidirectional LSTM and linear classifier), and trainer functions. 

Then we define the pre-set variables using some preprocessing functions:

    PAD_IDX: Padding index to use in the vocabulary (here 0)
    UNK_IDX: Unkknown index to use in the vocabulary (here 1)
    multinli_path: the directory where the MultiNLI dataset is located
    align_path: the directory where the aligned vectors are located
    multi_path: the directory where the standard multi-lingual vectors are located

The code that follows these in the second cell below will specify your device type. (whether you are running on a GPU or a CPU).

In [3]:
from preprocess import *
from models import *
from nli_trainer import *

In [4]:
PAD_IDX, UNK_IDX = define_indices()
label_dict = define_label_dict()
snli_path, align_path, multi_path = define_paths()
no_cuda = False
cuda = not no_cuda and torch.cuda.is_available()
seed = 1
device = torch.device("cuda" if cuda else "cpu")

In [5]:
config = SNLIconfig(corpus = "multinli", val_test_lang = "en", max_sent_len = 50, max_vocab_size = 75000,
             epochs = 15, batch_size = 256, embed_dim = 300, hidden_dim = 512, dropout = 0.1, lr = 1e-3)

## Read & Tokenize Datasets

We use the cell below to read train and dev sets from the MultiNLI corpus. Then we write a numeric label that our models can recognize. It corresponds to "entailment", "contradiction" or "neutral". 

The tokenizer function here uses standard nltk tokenizer. 

In [6]:
train, dev, test = read_nli(config.corpus)
train, dev, test = write_numeric_label(train, dev, test, nli_corpus=config.corpus)
train, all_train_tokens = tokenize_xnli(train, lang=config.val_test_lang)
dev, _ = tokenize_xnli(dev, lang=config.val_test_lang)
# test, _ = tokenize_xnli(test, lang=config.val_test_lang) # will test on XNLI later

### Load Pre-trained Word Embeddings

Here we load the pretrained fastText word embeddings using the preprocessing.load_vectors function.

Then we construct id2token list and token2id vocabulary. 

In [7]:
vecs = load_vectors("../data/vecs/cc.en.300.vec")

In [8]:
id2tok = [x+"."+"en" for x in [*vecs.keys()]][:config.max_vocab_size]
id2tok = ["<PAD>", "<UNK>"] + id2tok
tok2id = build_tok2id(id2tok)
vecs = update_single_vocab_keys(vecs)

In [9]:
weights_init = init_embedding_weights(vecs, tok2id, id2tok, 300)

### Specify Data Loaders

We specify training and dev data loaders using the NLIDataset class and nli_collate_func from preprocessing.py. We will later use XNLI English test set as the test data for this model. 

We use these loaders to pass data into training and test functions.

In [10]:
# train
train_dataset = NLIDataset(train, max_sentence_length=config.max_sent_len, token2id=tok2id, id2token=id2tok)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=config.batch_size,
                               collate_fn=lambda x, max_sentence_length=config.max_sent_len: nli_collate_func(x, config.max_sent_len),
                               shuffle=False)

# dev
dev_dataset = NLIDataset(dev, max_sentence_length=config.max_sent_len, token2id=tok2id, id2token=id2tok)
dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset, batch_size=config.batch_size,
                               collate_fn=lambda x, max_sentence_length=config.max_sent_len: nli_collate_func(x, config.max_sent_len),
                               shuffle=False)

## Train Models on MultiNLI

Now we specify our models and train them on MultiNLI training data. At the end of each epoch, we check development (dev) set accuracy using the MultiNLI matched dev dataset. Here is a brief definition of the models & functions we use:

    biLSTM: LSTM network, bidirectional by default. Imported from models.py. Takes as input a sentence (premise or hypothesis), encodes it into a fixed-length vector.
    
    Linear_Layers: linear classifier network from models.py. Takes as input the vector representations of both premise and hypothesis and generates log-likelihood scores for each entailment category ("entailment", "contradiction", "neutral")
    
    train_: Trainer function for NLI from nli_trainer.py.
    
    accuracy: Computes accuracy on dev or test set using trained LSTM and linear models. 
    
You can go into each .py file to learn more about the functions. 

In [11]:
LSTM = biLSTM(config.hidden_dim, weights_init, config.dropout, config.max_vocab_size,
              num_layers=1, input_size=300).to(device)

linear_ = Linear_Layers(hidden_size = 1024, hidden_size_2 = 128, percent_dropout = config.dropout,
                        classes=3, input_size=config.embed_dim).to(device)

print ("Encoder:\n", LSTM)
print ("Classifier:\n", linear_)

validation_accuracy = [0]
start_epoch = 0

for epoch in range(start_epoch, start_epoch + config.epochs):
    print ("\nepoch = "+str(epoch))
    loss_train = train_(LSTM, linear_, DataLoader = train_loader,
                       criterion = nn.NLLLoss(),
                       optimizer = torch.optim.Adam(list(LSTM.parameters()) + list(linear_.parameters()), 
                                                   lr=1e-3),
                       epoch = epoch)
    
    val_acc = accuracy(LSTM, linear_, dev_loader, nn.NLLLoss(reduction='sum'))
    if val_acc <= validation_accuracy[-1]:
        break
        
    print ("\n{} Validation Accuracy = {}".format(config.val_test_lang.upper(), val_acc))
    validation_accuracy.append(val_acc)
    torch.save(LSTM.state_dict(), "best_encoder_eng_snli_{}_{}".format(epoch, "EN"))
    torch.save(linear_.state_dict(), "best_linear_eng_snli_{}_{}".format(epoch, "EN"))

Encoder:
 biLSTM(
  (embedding): Embedding(75002, 300)
  (drop_out): Dropout(p=0.1)
  (LSTM): LSTM(300, 512, batch_first=True, bidirectional=True)
)
Classifier:
 Linear_Layers(
  (mlp): Sequential(
    (0): Linear(in_features=4096, out_features=128, bias=True)
    (1): ReLU(inplace)
    (2): Dropout(p=0.1)
    (3): Linear(in_features=128, out_features=128, bias=True)
    (4): ReLU(inplace)
    (5): Dropout(p=0.1)
    (6): Linear(in_features=128, out_features=3, bias=True)
  )
)

epoch = 0
EN Validation Accuracy = 58.634740114212036

epoch = 1
EN Validation Accuracy = 62.221091985702515

epoch = 2
EN Validation Accuracy = 63.9123797416687

epoch = 3
EN Validation Accuracy = 64.23841118812561

epoch = 4
EN Validation Accuracy = 65.05349278450012

epoch = 5
EN Validation Accuracy = 66.21497869491577

epoch = 6
EN Validation Accuracy = 66.29648804664612

epoch = 7
EN Validation Accuracy = 66.75496697425842

epoch = 8
EN Validation Accuracy = 67.12175011634827

epoch = 9