In [1]:
# train.py imports
import os
import sys
import errno
import glob
import random
import numpy as np
from argparse import ArgumentParser
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset
from corpora import MultiNLI, SciTail, StanfordNLI, AllNLI, BreakingNLI
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import io
import torch
import torch.nn as nn
from embeddings import SentenceEmbedding

import pickle
import random
import random
import spacy
import csv
import string
import re
import functools
import numpy as np
import pandas as pd
from setuptools import setup
from collections import Counter
from collections import defaultdict
import spacy
from torch.utils.data import Dataset

In [2]:
import preprocess_nli

In [3]:
PAD_IDX = 0
UNK_IDX = 1

label_dict = {"entailment":0,
             "neutral":1,
             "contradiction":2}

In [4]:
no_cuda = False
cuda = not no_cuda and torch.cuda.is_available()
seed = 1
device = torch.device("cuda" if cuda else "cpu")

In [6]:
fname = "vector_cache/glove.840B.300d.txt"
word_vectors = preprocess_nli.load_glove(fname)

In [12]:
len(dict(zip([*word_vectors.keys()][:5], range(5))))

5

In [23]:
def build_vocab(all_tokens):
    token2id = dict(zip([*word_vectors.keys()], range(2, 2+len(word_vectors))))
    token2id["<PAD>"] = 0
    token2id["<UNK>"] = 1
    id2token = dict(zip(range(2, 2+len(word_vectors)),[*word_vectors.keys()]))
    id2token[0] = "<PAD>"
    id2token[1] = "<UNK>"
    return token2id, id2token

In [14]:
word_vector_tensor = torch.from_numpy(np.array(pd.DataFrame(word_vectors).T)).float()

In [15]:
def read_enli(nli_corpus = "snli"):
	if nli_corpus == "snli":
		path_ = "data/snli/snli_1.0/snli_1.0"
	elif nli_corpus == "multinli":
		path_ = "data/multinli/multinli_1.0/"

	train = pd.read_json("{}_{}.jsonl".format(path_,"train"), lines=True)
	dev = pd.read_json("{}_{}.jsonl".format(path_,"dev"), lines=True)
	test = pd.read_json("{}_{}.jsonl".format(path_,"test"), lines=True)
    
	train = train[train["gold_label"] != "-"]
	dev = dev[dev["gold_label"] != "-"]
	test = test[test["gold_label"] != "-"]
	return train, dev, test

In [16]:
nli_train, nli_dev, nli_test = read_enli(nli_corpus = "snli")

In [17]:
nli_train["gold_label"].value_counts()

entailment       183416
contradiction    183187
neutral          182764
Name: gold_label, dtype: int64

In [18]:
def write_numeric_label(train, dev, test):
    for dataset in [train, dev, test]:
        dataset["gold_label"] = dataset["gold_label"].apply(lambda x: label_dict[x])
    return train, dev, test

In [19]:
nli_train, nli_dev, nli_test = write_numeric_label(nli_train, nli_dev, nli_test)

In [20]:
reg = re.compile("[%s]" % re.escape(string.punctuation))
def tokenize_enli(dataset, remove_punc=False):
    punc = string.punctuation
    all_s1_tokens = []
    all_s2_tokens = []
    for s in [1,2]:
        if remove_punc:
            dataset["sentence{}_tokenized".format(s)] = dataset["sentence{}".format(s)].\
            apply(lambda x: reg.sub("", x).lower().split(" "))
        else:
            dataset["sentence{}_tokenized".format(s)] = dataset["sentence{}".format(s)].\
            apply(lambda x: (reg.sub("", x) + " .").lower().split(" "))
    print ("Tokenizing data.")
    dataset["sentence1_tokenized"].apply(lambda x: all_s1_tokens.extend(x))
    dataset["sentence2_tokenized"].apply(lambda x: all_s2_tokens.extend(x))
    all_tokens = all_s1_tokens + all_s2_tokens
    return dataset, all_tokens

In [21]:
nli_train, all_train_tokens = tokenize_enli(nli_train, remove_punc=False)
nli_dev, _ = tokenize_enli(nli_dev, remove_punc=False)
nli_test, _ = tokenize_enli(nli_test, remove_punc=False)

Tokenizing data.
Tokenizing data.
Tokenizing data.


In [24]:
token2id, id2token = build_vocab(all_train_tokens)

In [25]:
token2id["the"]

4

In [26]:
id2token[4]

'the'

In [27]:
def make_dirs(name):
    try:
        os.makedirs(name)
    except OSError as ex:
        if ex.errno == errno.EEXIST and os.path.isdir(name):
            # ignore existing directory
            pass
        else:
            # a different error happened
            raise

In [28]:
class NLIDataset(Dataset):
    def __init__(self, tokenized_dataset, max_sentence_length, token2id, id2token):
        self.sentence1, self.sentence2, self.labels = [*tokenized_dataset["sentence1_tokenized"].values], \
                                                      [*tokenized_dataset["sentence2_tokenized"].values], \
                                                      [*tokenized_dataset["gold_label"].values]
        self.max_sentence_length = int(max_sentence_length)
        self.token2id, self.id2token = token2id, id2token
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, row):
        label = self.labels[row]
        sentence1_word_idx, sentence2_word_idx = [], []
        sentence1_mask, sentence2_mask = [], []
        for word in self.sentence1[row][:self.max_sentence_length]:
            if word in self.token2id.keys():
                sentence1_word_idx.append(self.token2id[word])
                sentence1_mask.append(0)
            else:
                sentence1_word_idx.append(UNK_IDX)
                sentence1_mask.append(1)
        for word in self.sentence2[row][:self.max_sentence_length]:
            if word in self.token2id.keys():
                sentence2_word_idx.append(self.token2id[word])
                sentence2_mask.append(0)
            else:
                sentence2_word_idx.append(UNK_IDX)
                sentence2_mask.append(1)
        sentence1_list = [sentence1_word_idx, sentence1_mask, len(sentence1_word_idx)]
        sentence2_list = [sentence2_word_idx, sentence2_mask, len(sentence2_word_idx)]
        
        return sentence1_list + sentence2_list + [label]

def nli_collate_func(batch, max_sent_length):
    sentence1_data, sentence2_data = [], []
    sentence1_mask, sentence2_mask = [], []
    s1_lengths, s2_lengths = [], []
    labels = []

    for datum in batch:
        s1_lengths.append(datum[2])
        s2_lengths.append(datum[5])
        labels.append(datum[6])
        sentence1_data_padded = np.pad(np.array(datum[0]), pad_width=((0, config.max_sentence_length-datum[2])), mode="constant", constant_values=0)
        sentence1_data.append(sentence1_data_padded)
        sentence1_mask_padded = np.pad(np.array(datum[1]), pad_width=((0, config.max_sentence_length-datum[2])), mode="constant", constant_values=0)
        sentence1_mask.append(sentence1_mask_padded)
        sentence2_data_padded = np.pad(np.array(datum[3]), pad_width=((0, config.max_sentence_length-datum[5])), mode="constant", constant_values=0)
        sentence2_data.append(sentence2_data_padded)
        sentence2_mask_padded = np.pad(np.array(datum[4]), pad_width=((0, config.max_sentence_length-datum[5])), mode="constant", constant_values=0)
        sentence2_mask.append(sentence2_mask_padded)
        
    ind_dec_order = np.argsort(s1_lengths)[::-1]
    sentence1_data = np.array(sentence1_data)[ind_dec_order]
    sentence2_data = np.array(sentence2_data)[ind_dec_order]
    sentence1_mask = np.array(sentence1_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    sentence2_mask = np.array(sentence2_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    s1_lengths = np.array(s1_lengths)[ind_dec_order]
    s2_lengths = np.array(s2_lengths)[ind_dec_order]
    labels = np.array(labels)[ind_dec_order]
    
    s1_list = [torch.from_numpy(sentence1_data), torch.from_numpy(sentence1_mask).float(), s1_lengths]
    s2_list = [torch.from_numpy(sentence2_data), torch.from_numpy(sentence2_mask).float(), s2_lengths]
        
    return [torch.from_numpy(sentence1_data), torch.from_numpy(sentence1_mask).float(), s1_lengths,
            torch.from_numpy(sentence2_data), torch.from_numpy(sentence2_mask).float(), s2_lengths,
            labels]

In [29]:
class config_class:
    def __init__(self, max_sentence_length, corpus, epochs, batch_size, encoder_type, 
                 activation, optimizer,
                 embed_dim, fc_dim, hidden_dim, layers, dropout, learning_rate,
                 lr_patience, lr_decay, lr_reduction_factor, weight_decay,
                 preserve_case, word_embedding, resume_snapshot, early_stopping_patience,
                 save_path, seed):
        
        self.max_sentence_length = max_sentence_length
        self.corpus = corpus
        self.epochs = epochs
        self.batch_size = batch_size
        self.encoder_type = encoder_type
        self.activation = activation
        self.optimizer = optimizer
        self.embed_dim = embed_dim
        self.fc_dim = fc_dim
        self.hidden_dim = hidden_dim
        self.layers = layers
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.lr_patience = lr_patience
        self.lr_decay = lr_decay
        self.lr_reduction_factor = lr_reduction_factor
        self.weight_decay = weight_decay
        self.preserve_case = preserve_case
        self.word_embedding = word_embedding
        self.resume_snapshot = resume_snapshot
        self.early_stopping_patience = early_stopping_patience
        self.save_path = save_path
        self.seed = seed
        self.lower = True
        self.vectors = word_vector_tensor
        self.embed_size = self.vectors.size(0)

In [30]:
config = config_class(30, "snli", 20, 16, "HBMP", "relu", "adam", 300, 300, 300, # default 600, 600 fc_dim, hidden_dim
                      1, 0, 5e-4, 1, 0.99, 0.2, 0,"store_false", 'glove.840B.300d',
                      "", 3, "results", 1234)

In [31]:
# train
nli_train_dataset = NLIDataset(nli_train, max_sentence_length=config.max_sentence_length, token2id=token2id, id2token=id2token)
nli_train_loader = torch.utils.data.DataLoader(dataset=nli_train_dataset, batch_size=config.batch_size,
                               collate_fn=lambda x, max_sentence_length=config.max_sentence_length: nli_collate_func(x, config.max_sentence_length),
                               shuffle=False)

# dev
nli_dev_dataset = NLIDataset(nli_dev, max_sentence_length=config.max_sentence_length, token2id=token2id, id2token=id2token)
nli_dev_loader = torch.utils.data.DataLoader(dataset=nli_dev_dataset, batch_size=config.batch_size,
                               collate_fn=lambda x, max_sentence_length=config.max_sentence_length: nli_collate_func(x, config.max_sentence_length),
                               shuffle=False)

# test
nli_test_dataset = NLIDataset(nli_test, max_sentence_length=config.max_sentence_length, token2id=token2id, id2token=id2token)
nli_test_loader = torch.utils.data.DataLoader(dataset=nli_test_dataset, batch_size=config.batch_size,
                               collate_fn=lambda x, max_sentence_length=config.max_sentence_length: nli_collate_func(x, config.max_sentence_length),
                               shuffle=False)

In [47]:
class biLSTM(nn.Module):
    
    def __init__(self,
                 hidden_size,
                 embedding_weights,
                 percent_dropout,
                 vocab_size=word_vector_tensor.size(0),
                 interaction_type="concat",
                 num_layers=1,
                 input_size=300):

        super(biLSTM, self).__init__()
        
        self.num_layers, self.hidden_size = num_layers, hidden_size
        embed_table = word_vector_tensor
        embedding = nn.Embedding.from_pretrained(embed_table)
        self.embedding = embedding
        self.interaction = interaction_type
        self.dropout = percent_dropout
        self.LSTM = nn.LSTM(300, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        if self.LSTM.bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        
        self.drop_out = nn.Dropout(self.dropout)
        
        
    def init_hidden(self, batch_size):
        hidden = torch.randn(self.num_directions*self.num_layers, batch_size, self.hidden_size).to(device)
        c_0 = torch.randn(self.num_directions*self.num_layers, batch_size, self.hidden_size).to(device)
        return hidden, c_0
    
    def forward(self, sentence, mask, lengths):
        sort_original = sorted(range(len(lengths)), key=lambda sentence: -lengths[sentence])
        unsort_to_original = sorted(range(len(lengths)), key=lambda sentence: sort_original[sentence])
        sentence = sentence[sort_original]
        _mask = mask[sort_original]
        lengths = lengths[sort_original]
        batch_size, seq_len = sentence.size()
        self.hidden, self.c_0 = self.init_hidden(batch_size)
        embeds = self.embedding(sentence)
        embeds = mask*embeds + (1-_mask)*embeds.clone().detach()
        embeds = torch.nn.utils.rnn.pack_padded_sequence(embeds, lengths, batch_first=True)
        lstm_out, (self.hidden, self.c_0) = self.LSTM(embeds, (self.hidden, self.c_0))
        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        lstm_out = lstm_out.view(batch_size, -1, 2, self.hidden_size)
        lstm_out = torch.sum(lstm_out, dim=1)
        lstm_out = torch.cat([lstm_out[:,i,:] for i in range(2)], dim=1)
        lstm_out = lstm_out[unsort_to_original]
        return lstm_out
    
    
class Linear_Layers(nn.Module):
    
    def __init__(self, hidden_size, hidden_size_2, percent_dropout,
                 interaction_type="concat", classes=3, input_size=300):
        
        super(Linear_Layers, self).__init__()
        self.interaction = interaction_type
        self.num_classes = classes
        self.hidden_size = hidden_size
        self.hidden_size_2 = hidden_size_2
        self.percent_dropout = percent_dropout
        self.num_classes = classes
        
        if self.interaction == "concat":
            self.mlp = nn.Sequential(
                nn.Dropout(p=self.percent_dropout),
                nn.Linear(4*self.hidden_size, self.hidden_size_2),
                nn.ReLU(inplace=True),
                nn.Dropout(p=self.percent_dropout),
                nn.Linear(self.hidden_size_2, self.num_classes))
        else:
            self.mlp = nn.Sequential(
                nn.Dropout(p=self.percent_dropout),
                nn.Linear(2*self.hidden_size, self.hidden_size_2),
                nn.ReLU(inplace=True),
                nn.Dropout(p=self.percent_dropout),
                nn.Linear(self.hidden_size_2, self.num_classes))

        self.init_weights()
        self.batch_norm = nn.BatchNorm1d(self.hidden_size * 4)
        
    def init_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight)
                nn.init.uniform_(module.bias)

    def forward(self, lstm_out_1, lstm_out_2):
        if self.interaction == "concat":
            hidden = torch.cat([lstm_out_1, lstm_out_2], dim=1)
        elif self.interaction == "mul":
            hidden = lstm_out_1*lstm_out_2
        elif self.interaction == "subtract":
            hidden = lstm_out_1-lstm_out_2
        hidden = self.batch_norm(hidden)
        hidden = hidden.view(hidden.size(0),-1) 
        out = self.mlp(hidden)
        return out

In [48]:
def train(RNN, Linear_Classifier, DataLoader, criterion, optimizer, epoch):
    
    RNN.train()
    Linear_Classifier.train()
    total_loss = 0
    for batch_idx, (sentence1, s1_original, sentence1_lengths, 
                    sentence2, s2_original, sentence2_lengths, labels)\
    in enumerate(DataLoader):
        sentence1, s1_original = sentence1.to(device), s1_original.to(device),  
        sentence2, s2_original = sentence2.to(device), s2_original.to(device),
        labels = torch.from_numpy(labels).to(device)
        RNN.train()
        Linear_Classifier.train()
        optimizer.zero_grad()
        output_s1 = RNN(sentence1, s1_original, sentence1_lengths)
        output_s2 = RNN(sentence2, s2_original, sentence2_lengths)
        out = Linear_Classifier(output_s1, output_s2)
        loss = criterion(out, labels)
        loss.cuda().backward()
        optimizer.step()
        total_loss += loss.item() * len(sentence1) / len(DataLoader.dataset)
        
        if (batch_idx+1) % (len(DataLoader.dataset)//(20*labels.shape[0])) == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, (batch_idx+1) * labels.shape[0], len(DataLoader.dataset),
                100. * (batch_idx+1) / len(DataLoader), loss.item()), end="\r")

    optimizer.zero_grad()
    return total_loss


def test(RNN, Linear_Classifier, DataLoader, criterion):

    RNN.eval()
    Linear_Classifier.eval()
    test_loss = 0
    label_list = []
    output_list = []
    with torch.no_grad():
        for batch_idx, (sentence1, s1_original, sentence1_lengths, 
                    sentence2, s2_original, sentence2_lengths, labels)\
                    in enumerate(DataLoader):

            sentence1, s1_original = sentence1.to(device), s1_original.to(device),  
            sentence2, s2_original = sentence2.to(device), s2_original.to(device),
            labels = torch.from_numpy(labels).to(device)
            output_s1 = RNN(sentence1, s1_original, sentence1_lengths)
            output_s2 = RNN(sentence2, s2_original, sentence2_lengths)
            out = Linear_Classifier(output_s1, output_s2)
            loss = criterion(out, labels)
            test_loss += loss.item()/len(DataLoader.dataset)
            output_list.append(out)
            label_list.append(labels)
            
    return test_loss, torch.cat(output_list, dim=0), torch.cat(label_list, dim=0)

def accuracy(RNN, Linear_Classifier, DataLoader, criterion):
    
    _, predicted, true_labels = test(RNN = RNN,  Linear_Classifier = Linear_Classifier,
                                     DataLoader = DataLoader, criterion = criterion)

    predicted = predicted.max(1)[1]
    return 100 * predicted.eq(true_labels.data.view_as(predicted)).float().mean().item()

In [49]:
vocab_size = 50000
num_classes = 3
num_layers = 1
bidirectional = True
lstm_hidden_size = 512
classifier_hidden_size = 512

BATCH_SIZE = 16
lr = 5e-4
n_epochs = 20

In [50]:
def init_embedding_weights(vectors, token2id, id2token, embedding_size):
    weights = np.zeros((len(token2id), embedding_size))
    for idx in range(2, len(id2token)):
        token = id2token[idx]
        weights[idx] = np.array(token2id[token])
    weights[1] = np.random.randn(embedding_size)
    return weights

In [None]:
weights_init = init_embedding_weights(word_vector_tensor, token2id, id2token, embedding_size = 300)

RNN = biLSTM(hidden_size=lstm_hidden_size, num_layers=1, percent_dropout = 0.1, 
             embedding_weights = weights_init, vocab_size=word_vector_tensor.size(0),
             interaction_type="concat", input_size=300).to(device)

linear_model = Linear_Layers(hidden_size = classifier_hidden_size, hidden_size_2 = 512,
                             percent_dropout = 0.1, interaction_type="concat", 
                             classes=3, input_size=300).to(device)

print ("RNN:\n", RNN)
print ("linear_model:\n", linear_model)

training_accuracy = []
validation_accuracy = []

for epoch in range(n_epochs):
    print ("epoch = "+str(epoch))

    loss_train = train(RNN, linear_model, DataLoader = nli_train_loader,
                       criterion = nn.CrossEntropyLoss(),
                       optimizer = torch.optim.Adam(list(RNN.parameters()) + \
                                                   list(linear_model.parameters()), 
                                                   lr=5e-4), 
                      epoch = epoch)

    loss_val, val_preds, val_true = test(RNN, linear_model, DataLoader = nli_dev_loader,
                                         criterion = nn.CrossEntropyLoss(reduction='sum'))
#     train_acc = accuracy(RNN, linear_model, nli_train_loader, nn.CrossEntropyLoss(reduction='sum'))
    val_acc = accuracy(RNN, linear_model, nli_dev_loader, nn.CrossEntropyLoss(reduction='sum'))
#     print ("Train Accuracy = {}".format(train_acc))
    print ("\nValidation Accuracy = {}".format(val_acc))

RNN:
 biLSTM(
  (embedding): Embedding(2196016, 300)
  (LSTM): LSTM(300, 512, batch_first=True, bidirectional=True)
  (drop_out): Dropout(p=0.1)
)
linear_model:
 Linear_Layers(
  (mlp): Sequential(
    (0): Dropout(p=0.1)
    (1): Linear(in_features=2048, out_features=512, bias=True)
    (2): ReLU(inplace)
    (3): Dropout(p=0.1)
    (4): Linear(in_features=512, out_features=3, bias=True)
  )
  (batch_norm): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
epoch = 0
Validation Accuracy = 68.51249933242798
epoch = 1
Validation Accuracy = 71.0221529006958
epoch = 2
Validation Accuracy = 71.65210247039795
epoch = 3
Validation Accuracy = 72.60719537734985
epoch = 4
Validation Accuracy = 72.71896004676819
epoch = 5
Validation Accuracy = 73.42003583908081
epoch = 6
Validation Accuracy = 72.93233275413513
epoch = 7

In [None]:
# TODO adaptive hierarchical bilstm