In [1]:
import torch
import pickle as pk
import numpy as np

In [25]:
snli_train_id = pk.load(open("./hw2_data/snli_train_id.pk", "rb"))
snli_val_id = pk.load(open("./hw2_data/snli_val_id.pk", "rb"))
loaded_embeddings_ft = pk.load(open("./hw2_data/loaded_embeddings_ft.pk", "rb"))

In [3]:
from SNLI_DataLoader import SNLIDataset, snli_collate_func

train_dataset = SNLIDataset(snli_train_id)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=32,
                                           collate_fn=snli_collate_func,
                                           shuffle=True)
val_dataset = SNLIDataset(snli_val_id)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=32,
                                           collate_fn=snli_collate_func,
                                           shuffle=True)


In [88]:
import torch
class RNN(torch.nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, pre_trained_emb, device):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super(RNN, self).__init__()
        
        self.device = torch.device(device)
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = torch.nn.Embedding.from_pretrained(pre_trained_emb, freeze = False).to(self.device)
        self.bigru = torch.nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True).to(self.device)
        self.linear1 = torch.nn.Linear(4*hidden_size, hidden_size).to(self.device)
        self.relu =  torch.nn.ReLU().to(self.device)
        self.linear2 = torch.nn.Linear(hidden_size, num_classes).to(self.device)

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        hidden = torch.randn(2*self.num_layers, batch_size, self.hidden_size).to(self.device)

        return hidden

    def forward(self, prem, prem_len, hyp, hyp_len):
        
        
        batch_size, prem_seq_len = prem.size()
        _, hyp_seq_len = hyp.size()
        
        #sort
        prem_len_new, prem_perm_index = prem_len.sort(0, descending = True)
        hyp_len_new, hyp_perm_index = hyp_len.sort(0, descending = True)
        prem = prem[prem_perm_index]; hyp = hyp[hyp_perm_index]
        
        # get embedding of characters
        prem_embed = self.embedding(prem)
        hyp_embed = self.embedding(hyp)
        
        # pack padded sequence (pad shorter sequences, and pytorch want the sequence length in descending order. 
        # First element in batch should be the longest seqeunce.)
        
        packed_prem = torch.nn.utils.rnn.pack_padded_sequence(prem_embed, list(prem_len_new.data), batch_first=True)
        packed_hyp = torch.nn.utils.rnn.pack_padded_sequence(hyp_embed, list(hyp_len_new.data), batch_first=True)
        
        
        #init hidden layers for both premises and hypothese
        self.hidden_prem = self.init_hidden(batch_size)
        self.hidden_hyp = self.init_hidden(batch_size)
        
        #pass to bidirectional gru
        _, self.hidden_prem = self.bigru(packed_prem, self.hidden_prem)
        _, self.hidden_hyp = self.bigru(packed_hyp, self.hidden_hyp)
        
        #unsort
        _, prem_restore_index = prem_perm_index.sort(0)
        _, hyp_restore_index = hyp_perm_index.sort(0)
        
        #last hidden state
        prem_encode = torch.cat((self.hidden_prem[0], self.hidden_prem[1]), dim = 1)[prem_restore_index]
        hyp_encode = torch.cat((self.hidden_hyp[0], self.hidden_hyp[1]), dim = 1)[hyp_restore_index]
        
        #concat hypothesis and premises result
        out = torch.cat((prem_encode, hyp_encode), dim = 1)
        #size batch_size*(4*hidden_size)
        
        #pass to 2 FC layer
        out = self.linear1(out) #output size: batch_size*hidden_size
        out = self.relu(out) #output size: batch_size*hidden_size
        logits = self.linear2(out) #output batch_size * num_class

        return logits

In [None]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, lengths_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, lengths_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


In [79]:
emb_size = 300
hidden_size = 100
num_layers = 1
num_classes = 3
pre_trained_emb = loaded_embeddings_ft
device = 'cpu'

In [None]:
model = RNN(emb_size, hidden_size, num_layers, num_classes, pre_trained_emb, device)

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (prem, prem_len, hyp, hyp_len, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(prem, prem_len, hyp, hyp_len)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))


In [91]:
model = RNN(emb_size, hidden_size, num_layers, num_classes, pre_trained_emb, device)

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for i, (prem, prem_len, hyp, hyp_len, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(prem, prem_len, hyp, hyp_len)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        print(model.embedding.grad.data)

AttributeError: 'Embedding' object has no attribute 'grad'