# FastText Word Vectors and Embedding


Load in pre-trained word vectors. Using wiki-news-300d-1M from fasttext.

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline


UNK_IDX = 1
PAD_IDX = 0
BATCH_SIZE = 1000 # change later

# Set device. 
device = None
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
        
print(device)

# Loads pretrained embeddings from files, also creates token2id and id2token           
def create_embedding(fname, size):
    vecs = []
    tokens = []
    embedding = np.array
    with open(fname, 'r') as f:
        f.readline() # Skip first line
        for i in range(size):
            line = f.readline()
            if line: 
                line = line.rstrip().split(' ')
                token = line[0]
                vec = line[1:]
                tokens.append(token)
                vecs.append(list(map(float,vec)))    
            else:
                break # Reached end of file
    
    # Insert tokens
    tokens.insert(0, '<unk>')
    tokens.insert(0, '<pad>')
    
    token2id = {}
    id2token = {}
    
    # Initialize pad weights to all zeroes
    pad_weights = torch.zeros([1, 300], dtype=torch.float32)
    
    unk_weights = torch.zeros([1, 300], dtype=torch.float32)
    
    # Initialize unk weights from standard normal distribution 
    unk_weights.normal_() 
    # Scale weights
    unk_weights = unk_weights / 10
    
    for i in range(len(tokens)):
        token2id[tokens[i]] = i
        id2token[i] = tokens[i]
    
    # Create embedding matrix
    wiki_embed = torch.cat((pad_weights, unk_weights, torch.FloatTensor(vecs)), dim=0)
    return wiki_embed, token2id, id2token
                               
wiki_embed, token2id, id2token = create_embedding('wiki-news-300d-1M.vec', 100000)

cpu


In [8]:
# From lab, check token2id and id2token match
random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 5898 ; token appeals
Token appeals; token id 5898


# Tokenization and Max Sentence Length

Tokenize the data sets and compute MAX_SENTENCE_LENGTH

In [9]:
# From lab
def token2index_dataset(token2id, dataset):
    categories = {'neutral': 0, 'entailment': 1, 'contradiction': 2}
    indices = []
    for sent1, sent2, label in dataset:
        sent1_idx = [token2id[token] if token in token2id else 1 for token in sent1.split()]
        sent2_idx = [token2id[token] if token in token2id else 1 for token in sent2.split()]
        indices.append((sent1_idx, sent2_idx, categories[label]))
    return indices

# Load training set
train_df = pd.read_csv('./hw2_data/snli_train.tsv', header=0, sep='\t')
train_dataset = zip(train_df['sentence1'], train_df['sentence2'], train_df['label'])
train_dataset_idx = token2index_dataset(token2id, train_dataset)

# Load validation set
val_df = pd.read_csv('./hw2_data/snli_val.tsv', header=0, sep='\t')
val_dataset = zip(val_df['sentence1'], val_df['sentence2'], val_df['label'])
val_dataset_idx = token2index_dataset(token2id, val_dataset)

train_sent1, train_sent2, train_labels = zip(*(train_dataset_idx))
val_sent1, val_sent2, val_labels = zip(*(val_dataset_idx))

# Check to make sure number of sentences equals number of labels
assert(len(train_sent1) == len(train_sent2) == len(train_labels))
assert(len(val_sent1) == len(val_sent2) == len(val_labels))

# Get MAX_SENTENCE_LENGTH
def find_max_length(train_df, threshold):
    """
    Helper function that returns the sentence length
    greater than (threshold*100)% of the sentences in the
    dataframe.
    
    @param: train_df - dataframe containing the premise and hypothesis sentences
    @param: threshold - value between 0 and 1 that specifies what percentage of the
    sentences should the returned sentence length be longer than - REWORD
    """
    sent1 = train_df['sentence1'].tolist()
    sent2 = train_df['sentence2'].tolist()
    sents = sent1 + sent2
    sent_lens = list(map(lambda x: len(x.split()), sents))
    
    # Sentence threshold
    num = int(threshold * len(sent_lens))
    
    c = Counter(sent_lens)

    order = c.most_common()
    # Sort, so shortest sentences are first
    order.sort()
    
    # Running count of the number of sentences
    rc = 0
    # Index
    i = 0
    while rc < num:
        rc += order[i][1]
        i += 1
    
    return order[i][0]

MAX_SENTENCE_LENGTH = find_max_length(train_df, 0.995)
print("MAX_SENTENCE_LENGTH", MAX_SENTENCE_LENGTH)


MAX_SENTENCE_LENGTH 35


# Data Loaders

Create the data loaders for the training set and validation set

In [10]:
from torch.utils.data import Dataset

# Create dataset loader. 
# Starter code from lab 4.
class SNLIDataset(Dataset):
    
    def __init__(self, snli_instances):
        """
        @param snli_instances: list of tuples containing (sent1, sent2, label)
        @param token2id: mapping from token to index
        """
        
        self.data_list = [(sent1, sent2) for sent1, sent2, target in snli_instances]
        self.target_list = [target for sent1, sent2, target in snli_instances]
        assert (len(self.data_list) == len(self.target_list))
    
    def __len__(self):
        return len(self.data_list)
    
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        sent1 = self.data_list[key][0][:MAX_SENTENCE_LENGTH]
        sent2 = self.data_list[key][1][:MAX_SENTENCE_LENGTH]
        return [(sent1, sent2) , (len(sent1), len(sent2)), self.target_list[key]]
        
def snli_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so the data all
    have the same length
    """
    sent1_list = []
    sent2_list = []
    label_list = []
    sent1_len_list = []
    sent2_len_list = []
    
    for datum in batch:
        sent_pairs, len_pairs, label = datum
        
        label_list.append(label)
        sent1_len_list.append(len_pairs[0])
        sent2_len_list.append(len_pairs[1])
        
        # Pad first sentence
        sent1_padded = np.pad(np.array(sent_pairs[0]),
                             pad_width=((0,MAX_SENTENCE_LENGTH-len_pairs[0])),
                             mode="constant", constant_values=0)
        sent1_list.append(sent1_padded)
        # Pad second sentence
        sent2_padded = np.pad(np.array(sent_pairs[1]),
                             pad_width=((0,MAX_SENTENCE_LENGTH-len_pairs[1])),
                             mode="constant", constant_values=0)
        sent2_list.append(sent2_padded)
        
    # Returns premise sentences, hypothesis sentences, premise sentence lengths, hypothesis sentence lengths, targets
    return [torch.from_numpy(np.array(sent1_list)), torch.from_numpy(np.array(sent2_list)),
            torch.LongTensor(sent1_len_list), torch.LongTensor(sent2_len_list),
            torch.LongTensor(label_list)]
        
        
# Create training dataset loader
train_dataset = SNLIDataset(train_dataset_idx)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                          batch_size=BATCH_SIZE,
                                          collate_fn=snli_collate_func,
                                          num_workers=8,
                                          shuffle=True)

# Create validation dataset loader
val_dataset = SNLIDataset(val_dataset_idx)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                        batch_size=BATCH_SIZE,
                                        collate_fn=snli_collate_func,
                                        num_workers=8,
                                        shuffle=True)

print(len(train_loader), len(val_loader))

100 1


# RNN Architecture

In [None]:
# Create RNN Model
class RNN(nn.Module):
    def __init__(self, emb_dim, emb_matrix, hidden_size=10, num_layers=1, num_classes=3, scheme='cat', dropout=0):
        """
        @param: emb_dim - embedding dimension size
        @param: emb_matrix - embedding matrix
        @param: hidden_size - size of the hidden state
        @param: num_layers - number of layers in the RNN
        @param: num_classes - number of classes in classification problem
        @param: scheme - method for combining premise and hypothesis sentences
        @param: dropout - float between 0 and 1 specifying probability of dropout
        """
        super(RNN, self).__init__()
        
        self.num_layers, self.hidden_size, self.scheme = num_layers, hidden_size, scheme
        self.emb = nn.Embedding.from_pretrained(emb_matrix, freeze=True)
        self.rnn = nn.GRU(emb_dim, hidden_size, num_layers=self.num_layers, bidirectional=True, batch_first=True)
        
        # Adjust first linear layer based on combination scheme
        if scheme == 'cat':
            self.nn1 = nn.Linear(4*hidden_size, hidden_size)
        else:
            self.nn1 = nn.Linear(hidden_size, hidden_size)
        
        # Dropout layer
        self.dropout_val = dropout
        self.dropout = nn.Dropout(dropout)
        
        # Second linear layer
        self.nn2 = nn.Linear(hidden_size, num_classes)
        
    
    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        # Multiply by 2 since it is a bidirectional GRU and we have twice as many
        # sentences.
        hidden = torch.randn(2*self.num_layers, 2*batch_size, self.hidden_size)
        hidden = hidden.to(device)
        return hidden
    
    # Function used to create mask to only update unknown vector
    def create_mask(self, sent_tensor):
        mask = sent_tensor == 1
        diags = []
        i_diags = []
        for row in mask:
            diag = torch.diag(row)
            i_diag = torch.diag(1 - row)
            diags.append(diag.numpy())
            i_diags.append(i_diag.numpy())
        diags = np.array(diags)
        i_diags = np.array(i_diags)
        m = torch.from_numpy(diags)
        i_m = torch.from_numpy(i_diags)
    
        return m.float(), i_m.float()
    
    # Sort sentences in batch to be in descending order by length
    # for pack_padded_sequence.
    def sort_batch(self, sents, lengths):
        """
        @param: sents - premise and hypothesis sentences concatenated together
        @param: lengths - lengths of each sentence concatenated together
        
        returns the sentences in descending order, the lengths in descending order,
        and the order of the indices corresponding to descending order"""
        lengths = lengths.cpu()
        sents = sents.cpu()
        
        ind_dec_order = np.argsort(lengths.numpy())[::-1]
        lens_desc = lengths.numpy()[ind_dec_order]
        lens_desc = torch.from_numpy(lens_desc)
        sents_desc = sents.numpy()[ind_dec_order]
        sents_desc = torch.from_numpy(sents_desc)
        
        sents_desc = sents_desc.to(device)
        lens_desc = lens_desc.to(device)
        
        return sents_desc, lens_desc, ind_dec_order
        
    # Sort back into original order of the batch
    def unsort_batch(self, outputs, ordering):
        """
        @param: outputs - outputs from RNN encoding
        @param: ordering - ind_dec_order, the ordering of indices corresponding to sorting
        by decreasing length"""
        
        original_order = np.argsort(ordering)
        outputs_original = outputs[original_order]

        # Half by batch size
        size = int(outputs.shape[0] / 2)
        sent1_outputs, sent2_outputs = outputs_original[:size], outputs_original[size:]
        
        sent1_outputs, sent2_outputs = sent1_outputs.to(device), sent2_outputs.to(device)
        return sent1_outputs, sent2_outputs
    
    
    def forward(self, sent1, sent2, sent1_len, sent2_len):
        self.hidden = self.init_hidden(sent1.shape[0])
        # Sort batch sentences in descending order
        sents = torch.cat((sent1, sent2), 0)
        lens = torch.cat((sent1_len, sent2_len), 0)
        
        sents_desc, lens_desc, ind_dec_order = self.sort_batch(sents, lens)
        
        # Create mask to update only <UNK>
        # m, i_m = self.create_mask(sents_desc)
        
        # Pass into embedding
        res = self.emb(sents_desc)
        
        # Use mask
        # res = torch.matmul(m, res) + torch.matmul(i_m, res.clone().detach())
        
        # Pass into RNN encoder
        res = torch.nn.utils.rnn.pack_padded_sequence(res, lens_desc, batch_first=True)

        # Forward prop through RNN
        rnn_out, self.hidden = self.rnn(res, self.hidden)
        
        # Unsort for each direction
        sent1_forward_outputs, sent2_forward_outputs = self.unsort_batch(self.hidden[0], ind_dec_order)
        sent1_backward_outputs, sent2_backward_outputs = self.unsort_batch(self.hidden[1], ind_dec_order)
        
        
        # Combine sent1 outputs and sent2 outputs
        if self.scheme == 'cat':
            sent_outputs = torch.cat((sent1_forward_outputs, sent2_forward_outputs, sent1_backward_outputs, sent2_backward_outputs), dim=1)
        elif self.scheme == 'add':
            sent_outputs = sent1_forward_outputs + sent2_forward_outputs + sent1_backward_outputs + sent2_backward_outputs
        elif self.scheme == 'mult':
            sent_outputs = sent1_forward_outputs * sent2_forward_outputs * sent1_backward_outputs * sent2_backward_outputs
        else:
            # Throw error since scheme does not match existing scheme
            raise ValueError('Scheme ' + self.scheme + ' not valid.')
        
        # Pass into first layer
        out = self.nn1(sent_outputs)
        # Pass into relu
        out = F.relu(out)
        
        # Pass to dropout layer
        out = self.dropout(out)
        
        # Pass into second layer
        logits = self.nn2(out)
        
        return logits
    

# CNN Architecture

In [None]:

# Create CNN Model    
class CNN(nn.Module):
    def __init__(self, emb_dim, emb_matrix, hidden_size=10, num_layers=2, num_classes=3, scheme='cat', dropout=0):
        """
        @param: emb_dim - embedding dimension size
        @param: emb_matrix - embedding matrix
        @param: hidden_size - size of the hidden state
        @param: num_layers - number of layers in the RNN
        @param: num_classes - number of classes in classification problem
        @param: scheme - method for combining premise and hypothesis sentences
        @param: dropout - float between 0 and 1 specifying probability of dropout
        """
        super(CNN, self).__init__()
        self.num_layers, self.hidden_size, self.scheme = num_layers, hidden_size, scheme
        self.emb = nn.Embedding.from_pretrained(emb_matrix, freeze=True)
        
        self.conv1 = nn.Conv1d(emb_dim, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)
        
        # Dropout layer between convolutional layers
        self.dropout1 = nn.Dropout(dropout)
        
        # Dropout layer between linear layers
        self.dropout2 = nn.Dropout(dropout)
        
        # This should change based on how we choose to represent
        # the two sentences
        if self.scheme == 'cat':
            self.nn1 = nn.Linear(2*hidden_size, hidden_size)
        else:
            self.nn1 = nn.Linear(hidden_size, hidden_size)
            
        self.nn2 = nn.Linear(hidden_size, num_classes)
    

    # Function used to create mask to only update unknown vector
    def create_mask(self, sent_tensor):
        mask = sent_tensor == 1
        diags = []
        i_diags = []
        for row in mask:
            diag = torch.diag(row)
            i_diag = torch.diag(1 - row)
            diags.append(diag.numpy())
            i_diags.append(i_diag.numpy())
        diags = np.array(diags)
        i_diags = np.array(i_diags)
        m = torch.from_numpy(diags)
        i_m = torch.from_numpy(i_diags)
    
        return m.float(), i_m.float()
    
    
    def forward(self, sent1, sent2, sent1_len, sent2_len):
        
        batch_size = sent1.shape[0]
        # Sort batch sentences in descending order
        sents = torch.cat((sent1, sent2), 0)
        lens = torch.cat((sent1_len, sent2_len), 0)
 
        # m, i_m = self.create_mask(sents)
        
        # Pass into embedding
        res = self.emb(sents)
        
         
        # Use mask
        # res = torch.matmul(m, res) + torch.matmul(i_m, res.clone().detach())
        


        # Pass into CNN encoder

        # From lab 4
        hidden = self.conv1(res.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden)
        
        # Pass to dropout layer
        hidden = self.dropout1(hidden)
        
        hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden)
        
        # Max pool over time
        hidden, _ = torch.max(hidden, dim=1)
        
        # Split sentences
        sent1_output, sent2_output = hidden[:batch_size], hidden[batch_size:]
        
        # Combine premise and hypothesis outputs
        if self.scheme == 'cat':
            sent_outputs = torch.cat((sent1_output, sent2_output), dim=1)
        elif self.scheme == 'add':
            sent_outputs = sent1_output + sent2_output
        elif self.scheme == 'mult':
            sent_outputs = sent1_output * sent2_output
        else:
            # Throw exception
            raise ValueError('Scheme ' + self.scheme + ' not valid.')
        
        # Pass to first linear layer
        out = self.nn1(sent_outputs)
        out = F.relu(out)
        
        # Pass to second dropout layer
        out = self.dropout2(out)
        
        # Pass to second lienar layer
        logits = self.nn2(out)
        
        return logits



# Helper Functions for Hyperparameter Tuning

In [11]:


# Train model
# From lab
def test_model(loader, model, criterion):
    """
    Helper function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    @param: model - the model to test
    @param: criterion - the cost function used to compute loss
    
    returns accuracy, average_loss
    """
    cumulative_loss = 0
    correct = 0
    total = 0
    model = model.to(device)

    model.eval()
    for sent1, sent2, sent1_len, sent2_len, label in loader:
        sent1, sent2, sent1_len, sent2_len, label = sent1.to(device), sent2.to(device), sent1_len.to(device), sent2_len.to(device), label.to(device)
        outputs = model(sent1, sent2, sent1_len, sent2_len)
        loss = criterion(outputs, label)
        cumulative_loss += loss.item()
        
    
        probabilities = F.softmax(outputs, dim=1)
        predicted = probabilities.max(1, keepdim=True)[1]

        total += label.size(0)
        correct += predicted.eq(label.view_as(predicted)).sum().item()
    return (100 * correct / total, cumulative_loss)

def train_model(train_loader, val_loader, model, criterion, optimizer, num_epochs):
    """
    Helper function that trains a model
    @param: train_loader - training dataset loader
    @param: val_loader - validation dataset loader
    @param: model - model to train
    @param: criterion - cost function used to compute loss
    @param: optimizer - optimizer used to update parameters
    @param: epochs - the number of epochs to train 
    """
    x = []
    train_accs = []
    train_losses = []
    val_accs = []
    val_losses = []
    
    model = model.to(device)
    for epoch in range(num_epochs):
        for i, (sent1, sent2, sent1_len, sent2_len, labels) in enumerate(train_loader):
            sent1, sent2, sent1_len, sent2_len, labels = sent1.to(device), sent2.to(device), sent1_len.to(device), sent2_len.to(device), labels.to(device)
#             model = model.to(device)
#             print("IN TRAINING, MODEL", next(model.parameters()).is_cuda)
            model.train()
            optimizer.zero_grad()
            # Forward pass
            outputs = model(sent1, sent2, sent1_len, sent2_len)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            if i > 0:
                train_acc, train_loss = test_model(train_loader, model, criterion)
                val_acc, val_loss = test_model(val_loader, model, criterion)
                
                print('Epoch: [{}/{}], Step: [{}/{}], Train Acc: {}, Val Acc: {}'.format(
                epoch+1, num_epochs, i+1, len(train_loader), train_acc, val_acc))
                
                train_accs.append(train_acc)
                train_losses.append(train_loss)
                val_accs.append(val_acc)
                val_losses.append(val_loss)
                # I think this is right
                x.append(epoch + i / len(train_loader))
    return x, train_accs, train_losses, val_accs, val_losses


# Creates a figure and saves it to path
def create_figure(path, title, x, label_value_pairs, x_label, y_label):
    """
    Function to create a plot
    @param: path - file path to save figure to
    @param: title - plot title
    @param: x - list of x-values
    @param: label_value_pairs - list of y-values and their labels to plot
    @param: x_label - label for x-axis
    @param: y_label - label for y-axis
    """
    fig = plt.figure()
    ax = plt.subplot(111)
    for pair in label_value_pairs:
        ax.plot(x, pair[1], label=pair[0])
        
    # Set title, x_axis, y_axis
    fig.suptitle(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    
    # Add legend
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels)
    
    # Save figure to file path
    plt.savefig(path)
    
    # Close figure
    plt.close()
    
# Saves the state dictionary of a model to the file specified
# by path
def save_model(model, path):
    """
    @param: model - a PyTorch model whose state dictionary we want to save
    @param: path - the file path to save the dictionary to
    """
    model = model.cpu()
    torch.save(model.state_dict(), path)

# Loads the state dictionary from the file specified by path
# into the model and returns the model
def load_model(model, path):
    """
    @param: model - the PyTorch model we want to load the state dictionary into
    @param: path - the file containing the state dictionary
    """
    torch.manual_seed(10)
    model.load_state_dict(torch.load(path))
    return model

# Returns the best accuracy and hyperparameter
def get_best(results_dict, param, values):
    """
    @param: results_dict - dictionary of results
    @param: param - hyperparemeter changed
    @param: values - list of values considered for the hyperparameter
    """
    pairs = []
    for val in values:
        pairs.append((val, results_dict[param + '_' + str(val) + '_val_acc'][-1]))
    order = sorted(pairs, key = lambda pair: pair[1], reverse=True)
    return order[0]

# Writes the results to a csv file
def write_csv(results_dict, filename):
    """
    @param: results_dict - dictionary of results to write 
    @param: filename - file to save to
    """
    df = pd.DataFrame.from_dict(results_dict)
    df.to_csv(filename, index=False)

# Create figures for training accuracy, training loss, validation accuracy, and validation loss
# from an experiment
def create_figures(results_dict, param, values, path, mode):
    """
    @param: results_dict - dictionary of results to create figures from
    @param: param - hyperparameter changed
    @param: values - list of values considered for the hyperparameter
    @param: path - file to save figures to
    """
    titles = {'_train_acc': 'Training Accuracy', '_train_loss': 'Training Loss', 
             '_val_acc': 'Validation Accuracy', '_val_loss': 'Validation Loss'}
    y_labels = {'_train_acc': 'Accuracy', '_train_loss': 'Loss', '_val_acc': 'Accuracy', '_val_loss': 'Loss'}
    plots = ['_train_acc', '_train_loss', '_val_acc', '_val_loss']
    
    names = [param + '_' + str(val) for val in values]
    for plot in plots:
        label_value_pairs = []
        for name in names:
            label_value_pairs.append((name + plot, results_dict[name + plot]))
        create_figure(path + param + plot + '.png', mode + ' ' + param + ': ' + titles[plot], results_dict['epoch'], label_value_pairs, 'Epoch', y_labels[plot])

# Build, train, and evaluate RNN models with different hyperparameter values
def run_rnn_experiment(param, values, model_params, train_loader, val_loader):
    """
    @param: param - hyperparameter to change in each model
    @param: values - list of values considered for the hyperparameter
    @param: model_params - hyperparameter values for the model
    @param: train_loader - training dataset data loader
    @param: val_loader - validation dataset data loader
    """
    results = {}
    for value in values:
        model_params[param] = value
        print(model_params)
        
        model = RNN(300, wiki_embed, **model_params)
#         model = model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        criterion = torch.nn.CrossEntropyLoss()
        x, train_acc, train_loss, val_acc, val_loss = train_model(train_loader, val_loader, model, criterion, optimizer, NUM_EPOCHS)
        
        # Save model
        save_model(model, RNN_MODELS_DIR+param+'_'+str(value)+'.pt')
        
        # Store results
        results['epoch'] = x
        results[param+'_'+str(value)+'_train_acc'] = train_acc
        results[param+'_'+str(value)+'_train_loss'] = train_loss
        results[param+'_'+str(value)+'_val_acc'] = val_acc
        results[param+'_'+str(value)+'_val_loss'] = val_loss
    
    # Create figures
    create_figures(results, param, values, RNN_FIG_DIR, 'RNN')
    
    # Write results
    write_csv(results, RNN_RESULTS_DIR + param + '_results.csv')
    
    # Return best performing hyperparameter
    return get_best(results, param, values)

# Build, train, and evaluate CNN models with different hyperparameter values
def run_cnn_experiment(param, values, model_params, train_loader, val_loader):
    """
    @param: param - hyperparameter to change in each model
    @param: values - list of values considered for the hyperparameter
    @param: model_params - hyperparameter values for the model
    @param: train_loader - training dataset data loader
    @param: val_loader - validation dataset data loader
    """
    results = {}
    for value in values:
        model_params[param] = value
        print(model_params)
        
        model = CNN(300, wiki_embed, **model_params)
#         model = model.to(device)
#         print("CNN MODEL", next(model.parameters()).is_cuda)
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        criterion = torch.nn.CrossEntropyLoss()
        x, train_acc, train_loss, val_acc, val_loss = train_model(train_loader, val_loader, model, criterion, optimizer, NUM_EPOCHS)
        
        # Save model
        save_model(model, CNN_MODELS_DIR+param+'_'+str(value)+'.pt')
        
        # Store results
        results['epoch'] = x
        results[param+'_'+str(value)+'_train_acc'] = train_acc
        results[param+'_'+str(value)+'_train_loss'] = train_loss
        results[param+'_'+str(value)+'_val_acc'] = val_acc
        results[param+'_'+str(value)+'_val_loss'] = val_loss
    
    # Create figures
    create_figures(results, param, values, CNN_FIG_DIR, 'CNN')
    
    # Write results
    write_csv(results, CNN_RESULTS_DIR + param + '_results.csv')
    
    # Return best performing hyperparameter
    return get_best(results, param, values)



# Hyperparameter Tuning

Run experiments on both the RNN and CNN architectures. Vary the size of the hidden state and the type of interaction between premise and hypothesis sentences for the RNN. Vary the size of the hidden state, type of interaction between premise and hypothesis sentences, and dropout probability for the CNN.

In [None]:
# HYPERPARAMETER TUNING
# RNN FIRST
# THEN CNN

LEARNING_RATE = 0.001
NUM_EPOCHS = 5

# Set seed for reproducibility
torch.manual_seed(10)

RNN_FIG_DIR = 'rnn_figures/'
RNN_RESULTS_DIR = 'rnn_results/'
RNN_MODELS_DIR = 'rnn_models/'

CNN_FIG_DIR = 'cnn_figures/'
CNN_RESULTS_DIR = 'cnn_results/'
CNN_MODELS_DIR = 'cnn_models/'

# CNN_FIG_DIR = None
# CNN_RESULTS_DIR = None
# CNN_MODELS_DIR = None 

rnn_model_params = {'hidden_size': 10, 'num_layers': 1, 'num_classes':3, 'scheme': 'cat', 'dropout':0}

best = run_rnn_experiment('hidden_size', [50, 100, 250, 500, 1000], rnn_model_params, train_loader, val_loader)

print("BEST HIDDEN SIZE", best)
rnn_model_params['hidden_size'] = int(best[0])
print("NEW CONFIGURATION", rnn_model_params)
    
best = run_rnn_experiment('scheme', ['mult', 'add', 'cat'], rnn_model_params, train_loader, val_loader)

print("BEST COMBINATION SCHEME", best)
rnn_model_params['scheme'] = best[0]
print("NEW CONFIGURATION", rnn_model_params)

############# NOT RUN FOR TIME REASONS ###################
# best = run_rnn_experiment('dropout', [0, 0.1, 0.3, 0.5, 0.7, 0.9], rnn_model_params, train_loader, val_loader)

# print("BEST DROPOUT", best)
# rnn_model_params['dropout'] = float(best[0])
# print("NEW CONFIGURATION", rnn_model_params)

cnn_model_params = {'hidden_size': 10, 'num_layers': 2, 'num_classes':3, 'scheme': 'cat', 'dropout':0}


best = run_cnn_experiment('hidden_size', [50, 100, 250, 500, 1000], cnn_model_params, train_loader, val_loader)

print("BEST HIDDEN SIZE", best)
cnn_model_params['hidden_size'] = int(best[0])
print("NEW CONFIGURATION", cnn_model_params)

best = run_cnn_experiment('scheme', ['mult', 'add', 'cat'], cnn_model_params, train_loader, val_loader)

print("BEST COMBINATION SCHEME", best)
cnn_model_params['scheme'] = best[0]
print("NEW CONFIGURATION", cnn_model_params)


best = run_cnn_experiment('dropout', [0, 0.1, 0.3, 0.5, 0.7, 0.9], cnn_model_params, train_loader, val_loader)

print("BEST DROPOUT", best)
cnn_model_params['dropout'] = float(best[0])
print("NEW CONFIGURATION", cnn_model_params)


# MultiNLI Performance

In [62]:
# BEST MODELS PERFORMANCE ON MNLI
# Load in mnli_val dataset
mnli_df = pd.read_csv('./hw2_data/mnli_val.tsv', header=0, sep='\t')

# Load best CNN model
cnn_model = CNN(300, wiki_embed, hidden_size=250, num_layers=2, num_classes=3, scheme='cat', dropout=0.3)
cnn_model = load_model(cnn_model, './cnn_models/cnn_models/dropout_0.3.pt')

# Load best RNN model
rnn_model = RNN(300, wiki_embed, hidden_size=1000, num_layers=1, num_classes=3, scheme='mult', dropout=0)
rnn_model = load_model(rnn_model, './rnn_models/rnn_models/scheme_mult.pt')



def evaluate_mnli(mnli_dataset, model):
    genres = list(set(df['genre'].tolist())) # Get unique genres

    for genre in genres:
        genre_df = df[df['genre'] == genre]
        genre_df = genre_df.drop('genre', axis=1)

        # Get indices for dataset
        genre_dataset = zip(genre_df['sentence1'], genre_df['sentence2'], genre_df['label'])
        genre_indices = token2index_dataset(token2id, genre_dataset)

        # Create loader. # Do I need to change the BATCH_SIZE?
        genre_dataset = SNLIDataset(genre_indices)
        genre_loader = torch.utils.data.DataLoader(dataset=genre_dataset,
                                            batch_size=BATCH_SIZE,
                                            collate_fn=snli_collate_func,
                                            shuffle=True)
        
        criterion = torch.nn.CrossEntropyLoss()
        acc, loss = test_model(genre_loader, model, criterion)
        print("GENRE: ", genre, "VALIDATION ACCURACY: ", acc, "VALIDATION LOSS: ", loss)


print("MNLI PERFORMANCE WITH BEST CNN MODEL:")
evaluate_mnli(mnli_df, cnn_model)

print("\n MNLI PERFORMANCE WITH BEST RNN MODEL:")
evaluate_mnli(mnli_df, rnn_model)

HIDDEN SIZE 1000
SCHEME mult
DROPOUT 0
MNLI PERFORMANCE WITH BEST CNN MODEL:
GENRE:  travel VALIDATION ACCURACY:  44.70468431771894 VALIDATION LOSS:  1.0965607166290283
GENRE:  fiction VALIDATION ACCURACY:  44.72361809045226 VALIDATION LOSS:  1.1202881336212158
GENRE:  slate VALIDATION ACCURACY:  42.21556886227545 VALIDATION LOSS:  1.7067168951034546
GENRE:  telephone VALIDATION ACCURACY:  46.069651741293534 VALIDATION LOSS:  1.7242774963378906
GENRE:  government VALIDATION ACCURACY:  41.53543307086614 VALIDATION LOSS:  2.248517394065857

 MNLI PERFORMANCE WITH BEST RNN MODEL:
GENRE:  travel VALIDATION ACCURACY:  38.4928716904277 VALIDATION LOSS:  1.2185195684432983
GENRE:  fiction VALIDATION ACCURACY:  44.120603015075375 VALIDATION LOSS:  1.1094251871109009
GENRE:  slate VALIDATION ACCURACY:  40.21956087824351 VALIDATION LOSS:  2.015760123729706
GENRE:  telephone VALIDATION ACCURACY:  38.70646766169154 VALIDATION LOSS:  2.2981666326522827
GENRE:  government VALIDATION ACCURACY:  38.77

# Correctly and Incorrectly Classified Examples


Below is code used to find 3 correctly classified examples and 3 incorrectly classified examples

In [63]:
# Helper function to convert a tensor of indices back into a sentence
# Removes padding
def convert_to_words(sent, id2token):
    sentence = []
    list_of_indices = sent.numpy()[0].tolist()
    for idx in list_of_indices:
        # Skip padding
        if idx != 0: 
            sentence.append(id2token[idx])
    return sentence


# MISCLASSIFIED INSTANCES FOR RNN
model = RNN(300, wiki_embed, hidden_size=1000, num_layers=1, num_classes=3, scheme='mult', dropout=0)
rnn_model = load_model(model, './rnn_models/rnn_models/scheme_mult.pt')

# VAL LOADER WITH BATCH SIZE 1
val_loader_1 = torch.utils.data.DataLoader(dataset=val_dataset,
                                        batch_size=1,
                                        collate_fn=snli_collate_func,
                                        shuffle=True)

# Function to get 3 correctly classified instances and 3 incorrectly classified instances
def get_misclassified(model, val_loader, id2token):
    """
    @param: model - model to evaluate
    @param: val_loader - validation loader to get instances from
    @param: id2token - mapping from indices to tokens
    """
    correct_instances = []
    incorrect_instances = []
    
    model.eval()
    for sent1, sent2, sent1_len, sent2_len, label in val_loader:
        sent1, sent2, sent1_len, sent2_len, label = sent1.to(device), sent2.to(device), sent1_len.to(device), sent2_len.to(device), label.to(device)
        outputs = model(sent1, sent2, sent1_len, sent2_len)
        loss = criterion(outputs, label)
#         cumulative_loss += loss.item()
        
        # Convert indices back to tokens
        sentence1 = ' '.join(convert_to_words(sent1, id2token))
        sentence2 = ' '.join(convert_to_words(sent2, id2token))
    
        probabilities = F.softmax(outputs, dim=1)
        predicted = probabilities.max(1, keepdim=True)[1]
        
        correct = predicted.eq(label.view_as(predicted)).sum().item()
        if correct == 0 and len(incorrect_instances) < 3:
            incorrect_instances.append((sentence1, sentence2, label.item(), predicted.item()))
        elif correct == 1 and len(correct_instances) < 3:
            correct_instances.append((sentence1, sentence2, label.item(), predicted.item()))
            
        if len(correct_instances) == 3 and len(incorrect_instances) == 3:
            break
            
    return correct_instances, incorrect_instances

correct_instances, incorrect_instances = get_misclassified(rnn_model, val_loader_1, id2token)

print("CORRECT INSTANCES")
for inst in correct_instances:
    print("PREMISE: {} \n HYPOTHESIS: {} \n LABEL: {} PREDICTED: {} \n".format(inst[0], inst[1], inst[2], inst[3]))

print("\n")
print("INCORRECT INSTANCES")
for inst in incorrect_instances:
    print("PREMISE: {} \n HYPOTHESIS: {} \n LABEL: {}, PREDICTED: {} \n".format(inst[0], inst[1], inst[2], inst[3]))

HIDDEN SIZE 1000
SCHEME mult
DROPOUT 0
CORRECT INSTANCES
PREMISE: Boy jumps in desert while others watch and take photo . 
 HYPOTHESIS: A group of people without cameras are watching a boy . 
 LABEL: 2 PREDICTED: 2 

PREMISE: A child rests on her mother lap exhausted from a day of sun and fun at the beach . 
 HYPOTHESIS: The child played outside . 
 LABEL: 1 PREDICTED: 1 

PREMISE: An African American wearing a red backpack looks the photographer as he walks past a concrete wall covered in graffiti . 
 HYPOTHESIS: a person wears a backpack 
 LABEL: 1 PREDICTED: 1 



INCORRECT INSTANCES
PREMISE: A young woman and some friends at a party . 
 HYPOTHESIS: Friends are enjoying their time together . 
 LABEL: 0, PREDICTED: 1 

PREMISE: Three Oklahoma Sooners playing football against another team , one of the <unk> with the ball in their possession . 
 HYPOTHESIS: A group of bears are playing a football game . 
 LABEL: 2, PREDICTED: 1 

PREMISE: Two grownups are waiting for the bus to arrive 

# Number of Trained Parameters

Below is code used to find the number of trained parameters in each model.

In [64]:
# FIND NUMBER OF PARAMETERS FOR EACH MODEL

# Create RNN models
rnn_50 = RNN(300, wiki_embed, hidden_size=50, num_layers=1, num_classes=3, scheme='cat', dropout=0)
rnn_100 = RNN(300, wiki_embed, hidden_size=100, num_layers=1, num_classes=3, scheme='cat', dropout=0)
rnn_250 = RNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='cat', dropout=0)
rnn_500 = RNN(300, wiki_embed, hidden_size=500, num_layers=1, num_classes=3, scheme='cat', dropout=0)
rnn_1000 = RNN(300, wiki_embed, hidden_size=1000, num_layers=1, num_classes=3, scheme='cat', dropout=0)

rnn_cat = RNN(300, wiki_embed, hidden_size=1000, num_layers=1, num_classes=3, scheme='cat', dropout=0)
rnn_mult = RNN(300, wiki_embed, hidden_size=1000, num_layers=1, num_classes=3, scheme='mult', dropout=0)
rnn_add = RNN(300, wiki_embed, hidden_size=1000, num_layers=1, num_classes=3, scheme='add', dropout=0)

# Create CNN models
cnn_50 = CNN(300, wiki_embed, hidden_size=50, num_layers=1, num_classes=3, scheme='cat', dropout=0)
cnn_100 = CNN(300, wiki_embed, hidden_size=100, num_layers=1, num_classes=3, scheme='cat', dropout=0)
cnn_250 = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='cat', dropout=0)
cnn_500 = CNN(300, wiki_embed, hidden_size=500, num_layers=1, num_classes=3, scheme='cat', dropout=0)
cnn_1000 = CNN(300, wiki_embed, hidden_size=1000, num_layers=1, num_classes=3, scheme='cat', dropout=0)

cnn_cat = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='cat', dropout=0)
cnn_mult = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='mult', dropout=0)
cnn_add = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='add', dropout=0)

cnn_dp_0 = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='cat', dropout=0)
cnn_dp_01 = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='cat', dropout=0.1)
cnn_dp_03 = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='cat', dropout=0.3)
cnn_dp_05 = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='cat', dropout=0.5)
cnn_dp_07 = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='cat', dropout=0.7)
cnn_dp_09 = CNN(300, wiki_embed, hidden_size=250, num_layers=1, num_classes=3, scheme='cat', dropout=0.9)

names = ["RNN HIDDEN SIZE: 50", "RNN HIDDEN SIZE: 100", "RNN HIDDEN SIZE: 250", "RNN HIDDEN SIZE: 500", "RNN HIDDEN SIZE: 1000",
        "RNN SCHEME: CAT", "RNN SCHEME: MULT", "RNN SCHEME: ADD", "CNN HIDDEN SIZE: 50", "CNN HIDDEN SIZE: 100",
        "CNN HIDDEN SIZE: 250", "CNN HIDDEN SIZE: 500", "CNN HIDDEN SIZE: 1000", "CNN SCHEME: CAT", "CNN SCHEME: MULT",
        "CNN SCHEME: ADD", "CNN DROPOUT: 0.0", "CNN DROPOUT: 0.1", "CNN DROPOUT: 0.3", "CNN DROPOUT: 0.5",
        "CNN DROPOUT: 0.7", "CNN DROPOUT: 0.9"]

models = [rnn_50, rnn_100, rnn_250, rnn_500, rnn_1000, rnn_cat, rnn_mult, rnn_add, cnn_50, cnn_100, cnn_250, cnn_500,
         cnn_1000, cnn_cat, cnn_mult, cnn_add, cnn_dp_0, cnn_dp_01, cnn_dp_03, cnn_dp_05, cnn_dp_07, cnn_dp_09]

# A function that returns the number of parameters that are
# learned in a model.
def get_number_of_parameters(model):
    total_parameters = 0
    # Iterate over parameters
    for param in model.parameters():
        # Check to see if a learned parameter
        if param.requires_grad:
            dims = len(param.shape)
            num_of_params = 1
            # Multiply dimensions to get total number of parameters
            for i in range(dims):
                num_of_params *= param.shape[i]
            total_parameters += num_of_params
    return total_parameters

for i in range(len(models)):
    num_parameters = get_number_of_parameters(models[i])
    
    print(names[i], "    ", num_parameters)

HIDDEN SIZE 50
SCHEME cat
DROPOUT 0
HIDDEN SIZE 100
SCHEME cat
DROPOUT 0
HIDDEN SIZE 250
SCHEME cat
DROPOUT 0
HIDDEN SIZE 500
SCHEME cat
DROPOUT 0
HIDDEN SIZE 1000
SCHEME cat
DROPOUT 0
HIDDEN SIZE 1000
SCHEME cat
DROPOUT 0
HIDDEN SIZE 1000
SCHEME mult
DROPOUT 0
HIDDEN SIZE 1000
SCHEME add
DROPOUT 0
RNN HIDDEN SIZE: 50      115803
RNN HIDDEN SIZE: 100      281603
RNN HIDDEN SIZE: 250      1079003
RNN HIDDEN SIZE: 500      3408003
RNN HIDDEN SIZE: 1000      11816003
RNN SCHEME: CAT      11816003
RNN SCHEME: MULT      8816003
RNN SCHEME: ADD      8816003
CNN HIDDEN SIZE: 50      57803
CNN HIDDEN SIZE: 100      140603
CNN HIDDEN SIZE: 250      539003
CNN HIDDEN SIZE: 500      1703003
CNN HIDDEN SIZE: 1000      5906003
CNN SCHEME: CAT      539003
CNN SCHEME: MULT      476503
CNN SCHEME: ADD      476503
CNN DROPOUT: 0.0      539003
CNN DROPOUT: 0.1      539003
CNN DROPOUT: 0.3      539003
CNN DROPOUT: 0.5      539003
CNN DROPOUT: 0.7      539003
CNN DROPOUT: 0.9      539003
