In [1]:
import csv
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

## Define the RNN/CNN model classes

In [2]:
class RNN(nn.Module):
    """
    Sentence-pair classifier built from two bidirectional GRU encoders.

    Each sentence is embedded with a shared pretrained embedding table, encoded
    by its own GRU, and the final hidden states of both encoders are
    concatenated, summed over the layer/direction axis and fed through a
    two-layer MLP that produces class logits.

    The embedding table is built from the module-level array
    `loaded_embeddings_ft`, which must exist when the class is instantiated.

    NOTE(review): the initial hidden state is drawn from `torch.randn` on every
    forward pass, so outputs are not deterministic even in eval mode unless the
    torch RNG is seeded.
    """

    def __init__(self, hidden_size1, hidden_size2, hidden_size3, num_layers, num_classes, emb_size=300):
        """
        @param hidden_size1: hidden size of the GRU encoding the first sentence
        @param hidden_size2: hidden size of the GRU encoding the second sentence
        @param hidden_size3: width of the hidden fully-connected layer
        @param num_layers: number of stacked layers in each GRU
        @param num_classes: number of output classes
        @param emb_size: dimensionality of the pretrained embeddings
        """
        super(RNN, self).__init__()

        self.num_layers = num_layers
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        self.hidden_size3 = hidden_size3

        pretrained_weights = torch.from_numpy(loaded_embeddings_ft)
        self.embedding = nn.Embedding.from_pretrained(pretrained_weights).float()
        self.rnn1 = nn.GRU(emb_size, hidden_size1, num_layers, batch_first=True, bidirectional=True)
        self.rnn2 = nn.GRU(emb_size, hidden_size2, num_layers, batch_first=True, bidirectional=True)
        self.linear1 = nn.Linear(hidden_size1 + hidden_size2, hidden_size3)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size3, num_classes)

    def init_hidden(self, batch_size, hidden_size):
        """Return a random initial hidden state of shape (num_layers*2, batch_size, hidden_size)."""
        return torch.randn(self.num_layers * 2, batch_size, hidden_size)

    @staticmethod
    def _length_permutations(lengths):
        """Return (sort_idx, unsort_idx) for ordering a batch by descending length."""
        sort_idx = torch.sort(lengths, dim=0, descending=True)[1]
        unsort_idx = torch.sort(sort_idx, dim=0)[1]
        return sort_idx, unsort_idx

    def _embed_and_pack(self, tokens, lengths, sort_idx):
        """Embed `tokens`, reorder the batch by `sort_idx` and pack it for the GRU."""
        embedded = self.embedding(tokens).index_select(0, sort_idx)
        sorted_lengths = lengths.index_select(0, sort_idx).numpy()
        return torch.nn.utils.rnn.pack_padded_sequence(embedded, sorted_lengths, batch_first=True)

    def forward(self, x1, lengths1, x2, lengths2):
        """
        @param x1: 2-D tensor of token ids for the first sentences
        @param lengths1: 1-D tensor with the unpadded length of each row of x1
        @param x2: 2-D tensor of token ids for the second sentences
        @param lengths2: 1-D tensor with the unpadded length of each row of x2
        @return: logits, one row per example and one column per class
        """
        batch_size1, _ = x1.size()
        batch_size2, _ = x2.size()

        # pack_padded_sequence is fed batches ordered by decreasing length;
        # the inverse permutations restore the original order afterwards.
        sort1, unsort1 = self._length_permutations(lengths1)
        sort2, unsort2 = self._length_permutations(lengths2)

        # fresh (random) hidden state for every forward pass
        self.hidden1 = self.init_hidden(batch_size1, self.hidden_size1)
        self.hidden2 = self.init_hidden(batch_size2, self.hidden_size2)

        packed1 = self._embed_and_pack(x1, lengths1, sort1)
        packed2 = self._embed_and_pack(x2, lengths2, sort2)

        # only the final hidden states are used; per-step outputs are discarded
        _, final1 = self.rnn1(packed1, self.hidden1)
        self.hidden1 = final1.index_select(1, unsort1)
        _, final2 = self.rnn2(packed2, self.hidden2)
        self.hidden2 = final2.index_select(1, unsort2)

        # concatenate along the feature axis, then sum over layers/directions
        pooled = torch.cat([self.hidden1, self.hidden2], dim=-1).sum(dim=0)

        return self.linear2(self.relu(self.linear1(pooled)))

class CNN(nn.Module):
    """
    Sentence-pair classifier built from two stacks of 1-D convolutions.

    Each sentence is embedded with a shared pretrained embedding table and
    passed through its own pair of Conv1d layers (conv1/conv2 for the first
    sentence, conv3/conv4 for the second), max-pooled, concatenated and fed
    through a two-layer MLP that produces class logits.

    Instantiation reads the module-level names `loaded_embeddings_ft`,
    `MAX_SENTENCE_LENGTH_X1` and `MAX_SENTENCE_LENGTH_X2`.
    """
    def __init__(self, kernel_pad, hidden_size1, hidden_size2, num_layers, num_classes, emb_size=300):
        """
        @param kernel_pad: pair (kernel_size, padding) shared by all four convolutions
        @param hidden_size1: number of output channels of every convolution
        @param hidden_size2: width of the hidden fully-connected layer
        @param num_layers: stored on the instance but not otherwise used in this class
        @param num_classes: number of output classes
        @param emb_size: dimensionality of the pretrained embeddings
        """

        super(CNN, self).__init__()

        self.num_layers, self.hidden_size1, self.hidden_size2 = \
        num_layers, hidden_size1, hidden_size2
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(loaded_embeddings_ft)).float()
    
        # conv1 -> conv2 encode the first sentence, conv3 -> conv4 the second
        self.conv1 = nn.Conv1d(emb_size, hidden_size1, kernel_size=kernel_pad[0], padding=kernel_pad[1])
        self.conv2 = nn.Conv1d(hidden_size1, hidden_size1, kernel_size=kernel_pad[0], padding=kernel_pad[1])
        self.conv3 = nn.Conv1d(emb_size, hidden_size1, kernel_size=kernel_pad[0], padding=kernel_pad[1])
        self.conv4 = nn.Conv1d(hidden_size1, hidden_size1, kernel_size=kernel_pad[0], padding=kernel_pad[1])
        
        # pooling windows are fixed to the maximum sentence lengths, so inputs
        # are expected to be padded to exactly those lengths
        self.maxpool1 = nn.MaxPool1d(MAX_SENTENCE_LENGTH_X1)
        self.maxpool2 = nn.MaxPool1d(MAX_SENTENCE_LENGTH_X2)

        self.linear1 = nn.Linear(2*hidden_size1, hidden_size2)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size2, num_classes)

    def forward(self, x1, lengths1, x2, lengths2):
        """
        @param x1: 2-D tensor (batch, seq_len) of token ids for the first sentences
        @param lengths1: accepted for interface parity with RNN.forward; not used here
        @param x2: 2-D tensor (batch, seq_len) of token ids for the second sentences
        @param lengths2: accepted for interface parity with RNN.forward; not used here
        @return: logits, one row per example and one column per class
        """
        batch_size1, seq_len1 = x1.size()
        batch_size2, seq_len2 = x2.size()

        embed1 = self.embedding(x1)
        embed2 = self.embedding(x2)
        
        # Conv1d wants (batch, channels, seq); transpose in and back out.
        # The .view(batch, seq_len, hidden) below only works when the
        # kernel/padding pair keeps the sequence length unchanged.
        hidden1 = self.conv1(embed1.transpose(1,2)).transpose(1,2)
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size1, seq_len1, hidden1.size(-1))
        hidden1 = self.conv2(hidden1.transpose(1,2)).transpose(1,2)
        # NOTE(review): the data here is laid out as (batch, seq_len, hidden), but the
        # final .view() relabels it as (batch, hidden, seq_len) without transposing, so
        # the max-pool below does not pool each channel over time. Presumably a
        # .transpose(1,2) was intended -- but the saved checkpoint was trained with this
        # exact layout, so changing it requires retraining. Confirm before touching.
        hidden1 = F.relu(hidden1.contiguous().view(-1, hidden1.size(-1))).view(batch_size1, hidden1.size(-1), seq_len1)
        hidden1 = self.maxpool1(hidden1)
        
        hidden2 = self.conv3(embed2.transpose(1,2)).transpose(1,2)
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, seq_len2, hidden2.size(-1))
        hidden2 = self.conv4(hidden2.transpose(1,2)).transpose(1,2)
        # NOTE(review): same view-instead-of-transpose pattern as for hidden1 above.
        hidden2 = F.relu(hidden2.contiguous().view(-1, hidden2.size(-1))).view(batch_size2, hidden2.size(-1), seq_len2)   
        hidden2 = self.maxpool2(hidden2)
        
        # concatenate both sentence encodings along the channel axis, then
        # collapse the remaining pooled axis by summation
        combined_vector = torch.cat([hidden1, hidden2],dim=1)
        cnn_out = torch.sum(combined_vector, dim=-1)
        
        fc_out = self.linear1(cnn_out)
        fc_out = self.relu(fc_out)
        logits = self.linear2(fc_out)
        return logits

## Load best trained RNN/CNN models

In [3]:
# Both files hold whole pickled model objects (they are called directly as
# modules later), so the RNN and CNN classes must be defined before this cell.
# NOTE(review): unpickling can execute arbitrary code -- only load checkpoints
# that come from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which presumably rejects full-model pickles like these; confirm against the
# installed version (weights_only=False may be needed there).
model_rnn = torch.load('best_snli_model_rnn.pth')
model_cnn = torch.load('best_snli_model_cnn.pth')

## Load pre-tokenized val set by genres

In [4]:
def load_data(filepath):
    """
    Read a tab-separated NLI file into parallel Python lists.

    Expected columns are sentence1, sentence2, label, genre. The header row,
    blank rows and rows whose label is not one of the three known classes are
    skipped, so the four returned lists always have the same length and stay
    aligned index-by-index.

    @param filepath: path of the TSV file to read
    @return: tuple (x1, x2, y, genres) where
             x1, x2  - lists of whitespace-split token lists,
             y       - list of float class ids
                       (contradiction=0.0, entailment=1.0, neutral=2.0),
             genres  - list of genre strings
    @raise IndexError: if a non-blank row has fewer than four columns
    """
    header = ['sentence1', 'sentence2', 'label', 'genre']
    label_ids = {'contradiction': 0.0, 'entailment': 1.0, 'neutral': 2.0}

    x1 = []
    x2 = []
    y = []
    genres = []
    # newline='' is what the csv module expects so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(filepath, newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            if not row or row == header:
                continue
            # Resolve the label first: appending the sentences before knowing
            # whether the label is usable would leave y shorter than x1/x2/genres.
            label = label_ids.get(row[2])
            if label is None:
                continue
            x1.append(row[0].split())
            x2.append(row[1].split())
            genres.append(row[3])
            y.append(label)

    return x1,x2,y,genres

In [5]:
# Tokenised sentence pairs, float class ids and genre tags of the MNLI validation split.
val_x1, val_x2, val_y, val_genre = load_data('hw2_data/mnli_val.tsv')

In [6]:
# Distinct genres present in the validation split (each is evaluated separately below).
set(val_genre)

{'fiction', 'government', 'slate', 'telephone', 'travel'}

## Evaluation

In [7]:
# Sentences are truncated/padded to these fixed lengths; the CNN's max-pool
# windows are built from the same two values.
MAX_SENTENCE_LENGTH_X1 = 34
MAX_SENTENCE_LENGTH_X2 = 19
# Number of lines read from the pretrained vector file.
words_to_load = 60000
# Reserved vocabulary ids; real tokens start at index 2.
PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32

In [8]:
# Build the vocabulary (token2id / id2token) and the embedding matrix from the
# first `words_to_load` lines of the pretrained vector file. Line i of the file
# becomes vocabulary index i+2; indices 0 and 1 are reserved for <pad>/<unk>.
# NOTE(review): the resulting indices presumably have to match the ones used when
# the saved checkpoints were trained -- do not reorder or skip lines without
# confirming that.
# NOTE(review): fastText .vec files usually start with a "<count> <dim>" header
# line; if this file does, that header is loaded as a token at index 2 -- verify.
with open('wiki-news-300d-1M.vec') as f:
    # row PAD_IDX stays all-zero
    loaded_embeddings_ft = np.zeros((words_to_load+2, 300))
    # seeded so the random <unk> vector is the same on every run
    np.random.seed(1)
    loaded_embeddings_ft[UNK_IDX] = np.random.rand(300)
    token2id = {'<pad>':PAD_IDX, '<unk>':UNK_IDX}
    id2token = {PAD_IDX:'<pad>', UNK_IDX:'<unk>'}
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        # first field is the token, the remaining fields are its vector
        s = line.split()
        loaded_embeddings_ft[i+2, :] = np.asarray(s[1:])
        token2id[s[0]] = i+2
        id2token[i+2] = s[0]

# convert token to id in the dataset
def token2index_dataset(tokens_data):
    """
    Map every token of every sentence to its vocabulary id.

    @param tokens_data: list of sentences, each a list of token strings
    @return: list of lists of ids, same nesting as the input; tokens missing
             from `token2id` are mapped to UNK_IDX
    """
    return [
        [token2id.get(token, UNK_IDX) for token in sentence]
        for sentence in tokens_data
    ]

In [9]:
class SNLIDataset(Dataset):
    """
    PyTorch Dataset over pairs of index-encoded sentences and their targets.
    Inherits from torch.utils.data.Dataset so it can be fed to a DataLoader.
    """

    def __init__(self, data_list1, data_list2, target_list):
        """
        @param data_list1: list of token-id lists for the first sentences
        @param data_list2: list of token-id lists for the second sentences
        @param target_list: list of targets, one per sentence pair
        """
        self.data_list1 = data_list1
        self.data_list2 = data_list2
        self.target_list = target_list
        # all three lists must describe the same examples
        n_targets = len(self.target_list)
        assert (len(self.data_list1) == n_targets)
        assert (len(self.data_list2) == n_targets)

    def __len__(self):
        return len(self.data_list1)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i].

        @return: [ids1, len(ids1), ids2, len(ids2), label], with both id lists
                 truncated to their respective maximum sentence length
        """
        first = self.data_list1[key][:MAX_SENTENCE_LENGTH_X1]
        second = self.data_list2[key][:MAX_SENTENCE_LENGTH_X2]
        return [first, len(first), second, len(second), self.target_list[key]]

def SNLI_collate_func(batch):
    """
    Collate function for DataLoader: pads every sentence of the batch to the
    fixed maximum length of its side (MAX_SENTENCE_LENGTH_X1 for the first
    sentence, MAX_SENTENCE_LENGTH_X2 for the second) and stacks the results.

    @param batch: list of items as returned by SNLIDataset.__getitem__, i.e.
                  [ids1, len1, ids2, len2, label]
    @return: list [data1, lengths1, data2, lengths2, labels] where data1/data2
             are int64 tensors of shape (batch, max_len) and the remaining
             entries are LongTensors with one value per example
    """
    data_list1 = []
    data_list2 = []
    label_list = []
    length_list1 = []
    length_list2 = []
    for ids1, len1, ids2, len2, label in batch:
        label_list.append(label)
        length_list1.append(len1)
        length_list2.append(len2)
        # Force int64: np.array([]) would otherwise be float64 (turning the whole
        # batch into a float tensor as soon as one sentence is empty), and the
        # default integer width is platform dependent. Embedding lookups need
        # integer indices.
        data_list1.append(np.pad(np.array(ids1, dtype=np.int64),
                                 pad_width=(0, MAX_SENTENCE_LENGTH_X1 - len1),
                                 mode="constant", constant_values=PAD_IDX))
        data_list2.append(np.pad(np.array(ids2, dtype=np.int64),
                                 pad_width=(0, MAX_SENTENCE_LENGTH_X2 - len2),
                                 mode="constant", constant_values=PAD_IDX))

    return [torch.from_numpy(np.array(data_list1, dtype=np.int64)), torch.LongTensor(length_list1),
            torch.from_numpy(np.array(data_list2, dtype=np.int64)), torch.LongTensor(length_list2),torch.LongTensor(label_list)]

In [10]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data1, lengths1, data2, lengths2, labels in loader:
        data_batch1, lengths_batch1, data_batch2, lengths_batch2, label_batch = \
        data1, lengths1, data2, lengths2, labels
        outputs = F.softmax(model(data_batch1, lengths_batch1, data_batch2, lengths_batch2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

### fiction

In [11]:
# Keep only the 'fiction' rows of the validation split, index-encode them and
# wrap them in a DataLoader.
val_x1_fict = [sent for sent, genre in zip(val_x1, val_genre) if genre == 'fiction']
val_x2_fict = [sent for sent, genre in zip(val_x2, val_genre) if genre == 'fiction']
val_y_fict = [label for label, genre in zip(val_y, val_genre) if genre == 'fiction']
val_x1_fict_indices = token2index_dataset(val_x1_fict)
val_x2_fict_indices = token2index_dataset(val_x2_fict)
val_dataset_fict = SNLIDataset(val_x1_fict_indices, val_x2_fict_indices, val_y_fict)
val_loader_fict = torch.utils.data.DataLoader(
    val_dataset_fict,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [12]:
# Accuracy of both SNLI-trained checkpoints on the fiction validation split.
acc_rnn_fict = test_model(val_loader_fict, model_rnn)
print("Val acc of best RNN on MNLI(fiction):{}".format(acc_rnn_fict))
acc_cnn_fict = test_model(val_loader_fict, model_cnn)
print("Val acc of best CNN on MNLI(fiction): {}".format(acc_cnn_fict))

Val acc of best RNN on MNLI(fiction):43.015075376884425
Val acc of best CNN on MNLI(fiction): 44.02010050251256


### government

In [13]:
# Keep only the 'government' rows of the validation split, index-encode them
# and wrap them in a DataLoader.
val_x1_gov = [sent for sent, genre in zip(val_x1, val_genre) if genre == 'government']
val_x2_gov = [sent for sent, genre in zip(val_x2, val_genre) if genre == 'government']
val_y_gov = [label for label, genre in zip(val_y, val_genre) if genre == 'government']
val_x1_gov_indices = token2index_dataset(val_x1_gov)
val_x2_gov_indices = token2index_dataset(val_x2_gov)
val_dataset_gov = SNLIDataset(val_x1_gov_indices, val_x2_gov_indices, val_y_gov)
val_loader_gov = torch.utils.data.DataLoader(
    val_dataset_gov,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [14]:
# Accuracy of both SNLI-trained checkpoints on the government validation split.
acc_rnn_gov = test_model(val_loader_gov, model_rnn)
print("Val acc of best RNN on MNLI(government):{}".format(acc_rnn_gov))
acc_cnn_gov = test_model(val_loader_gov, model_cnn)
print("Val acc of best CNN on MNLI(government): {}".format(acc_cnn_gov))

Val acc of best RNN on MNLI(government):45.17716535433071
Val acc of best CNN on MNLI(government): 41.338582677165356


### slate

In [15]:
# Keep only the 'slate' rows of the validation split, index-encode them and
# wrap them in a DataLoader.
val_x1_sla = [sent for sent, genre in zip(val_x1, val_genre) if genre == 'slate']
val_x2_sla = [sent for sent, genre in zip(val_x2, val_genre) if genre == 'slate']
val_y_sla = [label for label, genre in zip(val_y, val_genre) if genre == 'slate']
val_x1_sla_indices = token2index_dataset(val_x1_sla)
val_x2_sla_indices = token2index_dataset(val_x2_sla)
val_dataset_sla = SNLIDataset(val_x1_sla_indices, val_x2_sla_indices, val_y_sla)
val_loader_sla = torch.utils.data.DataLoader(
    val_dataset_sla,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [16]:
# Accuracy of both SNLI-trained checkpoints on the slate validation split.
acc_rnn_sla = test_model(val_loader_sla, model_rnn)
print("Val acc of best RNN on MNLI(slate):{}".format(acc_rnn_sla))
acc_cnn_sla = test_model(val_loader_sla, model_cnn)
print("Val acc of best CNN on MNLI(slate): {}".format(acc_cnn_sla))

Val acc of best RNN on MNLI(slate):41.417165668662676
Val acc of best CNN on MNLI(slate): 41.417165668662676


### telephone

In [17]:
# Keep only the 'telephone' rows of the validation split, index-encode them
# and wrap them in a DataLoader.
val_x1_tel = [sent for sent, genre in zip(val_x1, val_genre) if genre == 'telephone']
val_x2_tel = [sent for sent, genre in zip(val_x2, val_genre) if genre == 'telephone']
val_y_tel = [label for label, genre in zip(val_y, val_genre) if genre == 'telephone']
val_x1_tel_indices = token2index_dataset(val_x1_tel)
val_x2_tel_indices = token2index_dataset(val_x2_tel)
val_dataset_tel = SNLIDataset(val_x1_tel_indices, val_x2_tel_indices, val_y_tel)
val_loader_tel = torch.utils.data.DataLoader(
    val_dataset_tel,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [18]:
# Accuracy of both SNLI-trained checkpoints on the telephone validation split.
acc_rnn_tel = test_model(val_loader_tel, model_rnn)
print("Val acc of best RNN on MNLI(telephone):{}".format(acc_rnn_tel))
acc_cnn_tel = test_model(val_loader_tel, model_cnn)
print("Val acc of best CNN on MNLI(telephone): {}".format(acc_cnn_tel))

Val acc of best RNN on MNLI(telephone):47.36318407960199
Val acc of best CNN on MNLI(telephone): 43.681592039801


### travel

In [19]:
# Keep only the 'travel' rows of the validation split, index-encode them and
# wrap them in a DataLoader.
val_x1_tra = [sent for sent, genre in zip(val_x1, val_genre) if genre == 'travel']
val_x2_tra = [sent for sent, genre in zip(val_x2, val_genre) if genre == 'travel']
val_y_tra = [label for label, genre in zip(val_y, val_genre) if genre == 'travel']
val_x1_tra_indices = token2index_dataset(val_x1_tra)
val_x2_tra_indices = token2index_dataset(val_x2_tra)
val_dataset_tra = SNLIDataset(val_x1_tra_indices, val_x2_tra_indices, val_y_tra)
val_loader_tra = torch.utils.data.DataLoader(
    val_dataset_tra,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [20]:
# Accuracy of both SNLI-trained checkpoints on the travel validation split.
acc_rnn_tra = test_model(val_loader_tra, model_rnn)
print("Val acc of best RNN on MNLI(travel):{}".format(acc_rnn_tra))
acc_cnn_tra = test_model(val_loader_tra, model_cnn)
print("Val acc of best CNN on MNLI(travel): {}".format(acc_cnn_tra))

Val acc of best RNN on MNLI(travel):45.010183299389
Val acc of best CNN on MNLI(travel): 42.36252545824847


## Fine-tuning on MNLI

In [21]:
def fine_tune_model(model, train_loader, val_loader, learning_rate, num_epochs, adj, lr_decay=0.5):
    """
    Fine-tune `model` in place with Adam and cross-entropy loss, then print its
    accuracy on `val_loader` (computed with test_model).

    @param model: module called as model(data1, lengths1, data2, lengths2)
    @param train_loader: iterable of (data1, lengths1, data2, lengths2, labels) batches
    @param val_loader: loader passed to test_model after the last epoch
    @param learning_rate: initial Adam learning rate
    @param num_epochs: number of passes over train_loader
    @param adj: if truthy, multiply the learning rate by `lr_decay` after every epoch
    @param lr_decay: multiplicative decay factor used when `adj` is set
    @return: None; the trainable-parameter count and the final validation
             accuracy are printed
    """
    # numel() counts elements directly (np.prod would yield a float for a
    # zero-dimensional parameter)
    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Number of trainable parameters:{}".format(params))

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=lr_decay) if adj else None

    for epoch in range(num_epochs):
        # nothing inside the epoch switches to eval mode, so once per epoch suffices
        model.train()
        for data1, lengths1, data2, lengths2, labels in train_loader:
            optimizer.zero_grad()
            # Forward pass
            outputs = model(data1, lengths1, data2, lengths2)
            loss = criterion(outputs, labels)

            # Backward and optimize
            loss.backward()
            optimizer.step()
        # Step the scheduler only after the epoch's optimizer steps: stepping it
        # first (PyTorch >= 1.1) skips the initial learning rate and emits a warning.
        if scheduler is not None:
            scheduler.step()

    val_acc_last = test_model(val_loader, model)
    print("Val Accuracy:{}".format(val_acc_last))

    return

In [22]:
# Tokenised sentence pairs, float class ids and genre tags of the MNLI training split.
train_x1, train_x2, train_y, train_genre = load_data('hw2_data/mnli_train.tsv')

In [23]:
# Distinct genres present in the training split (one fine-tuning run per genre below).
set(train_genre)

{'fiction', 'government', 'slate', 'telephone', 'travel'}

### fiction

In [24]:
# Keep only the 'fiction' rows of the training split, index-encode them and
# wrap them in a DataLoader.
# NOTE(review): the training loader iterates in file order (shuffle=False);
# consider whether shuffling is wanted for fine-tuning.
train_x1_fict = [sent for sent, genre in zip(train_x1, train_genre) if genre == 'fiction']
train_x2_fict = [sent for sent, genre in zip(train_x2, train_genre) if genre == 'fiction']
train_y_fict = [label for label, genre in zip(train_y, train_genre) if genre == 'fiction']
train_x1_fict_indices = token2index_dataset(train_x1_fict)
train_x2_fict_indices = token2index_dataset(train_x2_fict)
train_dataset_fict = SNLIDataset(train_x1_fict_indices, train_x2_fict_indices, train_y_fict)
train_loader_fict = torch.utils.data.DataLoader(
    train_dataset_fict,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [25]:
# Reload the SNLI-trained RNN checkpoint, fine-tune it on the fiction training
# rows and report accuracy on the fiction validation rows.
model_rnn = torch.load('best_snli_model_rnn.pth')
fine_tune_model(
    model_rnn,
    train_loader_fict,
    val_loader_fict,
    learning_rate=3e-4,
    num_epochs=5,
    adj=False,
)

Number of trainable parameters:492603
Val Accuracy:44.321608040201006


### government

In [26]:
# Keep only the 'government' rows of the training split, index-encode them and
# wrap them in a DataLoader.
# NOTE(review): the training loader iterates in file order (shuffle=False);
# consider whether shuffling is wanted for fine-tuning.
train_x1_gov = [sent for sent, genre in zip(train_x1, train_genre) if genre == 'government']
train_x2_gov = [sent for sent, genre in zip(train_x2, train_genre) if genre == 'government']
train_y_gov = [label for label, genre in zip(train_y, train_genre) if genre == 'government']
train_x1_gov_indices = token2index_dataset(train_x1_gov)
train_x2_gov_indices = token2index_dataset(train_x2_gov)
train_dataset_gov = SNLIDataset(train_x1_gov_indices, train_x2_gov_indices, train_y_gov)
train_loader_gov = torch.utils.data.DataLoader(
    train_dataset_gov,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [27]:
# Reload the SNLI-trained RNN checkpoint, fine-tune it on the government
# training rows and report accuracy on the government validation rows.
model_rnn = torch.load('best_snli_model_rnn.pth')
fine_tune_model(
    model_rnn,
    train_loader_gov,
    val_loader_gov,
    learning_rate=3e-4,
    num_epochs=5,
    adj=False,
)

Number of trainable parameters:492603
Val Accuracy:48.91732283464567


### slate

In [28]:
# Keep only the 'slate' rows of the training split, index-encode them and wrap
# them in a DataLoader.
# NOTE(review): the training loader iterates in file order (shuffle=False);
# consider whether shuffling is wanted for fine-tuning.
train_x1_sla = [sent for sent, genre in zip(train_x1, train_genre) if genre == 'slate']
train_x2_sla = [sent for sent, genre in zip(train_x2, train_genre) if genre == 'slate']
train_y_sla = [label for label, genre in zip(train_y, train_genre) if genre == 'slate']
train_x1_sla_indices = token2index_dataset(train_x1_sla)
train_x2_sla_indices = token2index_dataset(train_x2_sla)
train_dataset_sla = SNLIDataset(train_x1_sla_indices, train_x2_sla_indices, train_y_sla)
train_loader_sla = torch.utils.data.DataLoader(
    train_dataset_sla,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [29]:
# Reload the SNLI-trained RNN checkpoint, fine-tune it on the slate training
# rows and report accuracy on the slate validation rows.
model_rnn = torch.load('best_snli_model_rnn.pth')
fine_tune_model(
    model_rnn,
    train_loader_sla,
    val_loader_sla,
    learning_rate=3e-4,
    num_epochs=5,
    adj=False,
)

Number of trainable parameters:492603
Val Accuracy:43.712574850299404


### telephone

In [30]:
# Keep only the 'telephone' rows of the training split, index-encode them and
# wrap them in a DataLoader.
# NOTE(review): the training loader iterates in file order (shuffle=False);
# consider whether shuffling is wanted for fine-tuning.
train_x1_tel = [sent for sent, genre in zip(train_x1, train_genre) if genre == 'telephone']
train_x2_tel = [sent for sent, genre in zip(train_x2, train_genre) if genre == 'telephone']
train_y_tel = [label for label, genre in zip(train_y, train_genre) if genre == 'telephone']
train_x1_tel_indices = token2index_dataset(train_x1_tel)
train_x2_tel_indices = token2index_dataset(train_x2_tel)
train_dataset_tel = SNLIDataset(train_x1_tel_indices, train_x2_tel_indices, train_y_tel)
train_loader_tel = torch.utils.data.DataLoader(
    train_dataset_tel,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [38]:
# Reload the SNLI-trained RNN checkpoint, fine-tune it on the telephone
# training rows and report accuracy on the telephone validation rows.
model_rnn = torch.load('best_snli_model_rnn.pth')
fine_tune_model(
    model_rnn,
    train_loader_tel,
    val_loader_tel,
    learning_rate=3e-4,
    num_epochs=5,
    adj=False,
)

Number of trainable parameters:492603
Val Accuracy:47.66169154228856


### travel

In [32]:
# Keep only the 'travel' rows of the training split, index-encode them and wrap
# them in a DataLoader.
# NOTE(review): the training loader iterates in file order (shuffle=False);
# consider whether shuffling is wanted for fine-tuning.
train_x1_tra = [sent for sent, genre in zip(train_x1, train_genre) if genre == 'travel']
train_x2_tra = [sent for sent, genre in zip(train_x2, train_genre) if genre == 'travel']
train_y_tra = [label for label, genre in zip(train_y, train_genre) if genre == 'travel']
train_x1_tra_indices = token2index_dataset(train_x1_tra)
train_x2_tra_indices = token2index_dataset(train_x2_tra)
train_dataset_tra = SNLIDataset(train_x1_tra_indices, train_x2_tra_indices, train_y_tra)
train_loader_tra = torch.utils.data.DataLoader(
    train_dataset_tra,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=SNLI_collate_func,
)

In [33]:
# Reload the SNLI-trained RNN checkpoint, fine-tune it on the travel training
# rows and report accuracy on the travel validation rows.
model_rnn = torch.load('best_snli_model_rnn.pth')
fine_tune_model(
    model_rnn,
    train_loader_tra,
    val_loader_tra,
    learning_rate=3e-4,
    num_epochs=5,
    adj=False,
)

Number of trainable parameters:492603
Val Accuracy:47.04684317718941
