# ULMFiT + Siamese Network for Sentence Vectors
## Part Two: Classifying

The first notebook created a new language model from the SNLI dataset.
This notebook will adapt that model to predicting the SNLI category for sentence pairs.


In [1]:
from fastai.text import *
import html

import json
import html
import re
import pickle
from collections import Counter
import random
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn
from sklearn import model_selection
from functools import partial
from collections import Counter, defaultdict

import numpy as np
import torch
import torch.nn as nn
import torch.utils 
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import dataset, dataloader
import torch.optim as optim
import torch.nn.functional as F

import time
import math
import sys
import data

snli_root = './data/SNLI/'

In [2]:
#load the tokens
itos = pickle.load(open(f'{snli_root}itos.pkl', 'rb'))
trn_lm = np.load(f'{snli_root}trn_lm.npy')
val_lm = np.load(f'{snli_root}val_lm.npy')

stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
vocab_size = len(itos)
vocab_size

8842

## Create a new dataloader to create sentence pairs

In [3]:
from enum import Enum

class Entail(Enum):
    entailment = 0
    contradiction = 1
    neutral = 2
       
class SiameseDataset(dataset.Dataset):
    def __init__(self, json_file):
        
        content = None
        with open(json_file) as fp:
            content = json.load(fp)

        self.items = []
        for item in content:
            s0 = item[0]
            s1 = item[1]
            label = Entail[item[2]].value
            self.items.append((s0, s1, label))
            
    def shuffle(self):
        random.shuffle(self.items)
        
    def __getitem__(self, index):
        return self.items[index]
       
    def __len__(self):
        return len(self.items)
    
class SiameseDataLoader():
    def __init__(self, dataset, stoi, pad_val, batch_size=32):
        self.dataset = dataset
        dataset.shuffle()
        self.batch_size = batch_size
        self.stoi = stoi
        self.index = 0
        self.pad_val = pad_val
      
    def __iter__(self):
        return self
    
    def fill_tensor(self, sentences, max_len):
        data = np.zeros((max_len, self.batch_size), dtype=np.long)
        data.fill(self.pad_val)
        
        for i, s in enumerate(sentences): 
            start_idx = max_len - len(s)
            for j, p in enumerate(s):
                data[:,i][start_idx+j] = stoi[p]
            
        return torch.LongTensor([data.tolist()]).cuda()
     
    def batch(self):
        return self.index//self.batch_size
    
    def __len__(self):
        return len(self.dataset)//self.batch_size
    
    def __next__(self):
        #how many examples to ananlyise for this round
        num = min(self.batch_size, len(self.dataset) - self.index)
        
        if num < 1:
            raise StopIteration  # signals "the end"
            
        #collect the sentences
        max_len = 0
        first = []
        second = []
        labels = np.zeros((self.batch_size), dtype=np.long)
        
        for i in range(self.index, self.index+num):
            a, b, l = self.dataset[i]
            
            if len(a) > max_len:
                max_len = len(a)
            
            if len(b) > max_len:
                max_len = len(b)
            
            first.append(a)
            second.append(b)
            labels[i - self.index] = l
            
        self.index += num
             
        return (self.fill_tensor(first, max_len),
                self.fill_tensor(second, max_len),
                torch.LongTensor([labels.tolist()]).cuda()
               )

In [4]:
siamese_dataset_dev = SiameseDataset(f'{snli_root}/snli_dev.json')
siamese_dataset_test = SiameseDataset(f'{snli_root}snli_test.json')

## Siamese network

In [50]:
class SiameseClassifier(nn.Module):
    
    def __init__(self, encoder, classifier):
        super().__init__()
        self.encoder = encoder
        self.classifier = classifier
    
    def pool(self, x, bs, is_max):
        f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d
        return f(x.permute(1,2,0), (1,)).view(bs,-1)

    def pool_outputs(self, output):
        sl, bs,_ = output.size()
        avgpool = self.pool(output, bs, False)
        maxpool = self.pool(output, bs, True)
        return torch.cat([output[-1], maxpool, avgpool], 1)
        
    def forward(self, input1, input2):

        raw_outputs1, outputs1 = self.encoder(input1)
        raw_outputs2, outputs2 = self.encoder(input2)
        
        out1 = self.pool_outputs(outputs1[-1])
        out2 = self.pool_outputs(outputs2[-1])
        
        out = torch.cat([out1, out2], 1)
        
        return self.classifier(out)
        
    def reset(self):
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()
                
    def freeze_encoder(self):
        for param in self.encoder.parameters():
            param.requires_grad = False
            
    def unfreeze_encoder(self):
        for param in self.encoder.parameters():
            param.requires_grad = True
                
class LinearClassifier(nn.Module):
    def __init__(self, input_size, hidden, num_categories, dropout):
        super().__init__()
        
        self.layers = nn.ModuleList([
            LinearBlock(input_size, hidden, dropout),
            LinearBlock(hidden, hidden, dropout),
            LinearBlock(hidden, num_categories, dropout)
        ])

    def forward(self, x):
        for l in self.layers:
            x = F.relu(l(x))
        return x

## Load our pretrained model then build the Siamese network from it

In [51]:
#these are the values used for the original LM
em_sz, nh = 400, 1150

SNLI_encoder = torch.load("SNLI_Encoder.pt")

#2 pooled vectors, of 3 times the embedding size
classifier = LinearClassifier(3*2*em_sz, nh,  3, dropout=0.05)
siamese_model = SiameseClassifier(SNLI_encoder, classifier).cuda()

## Train the new network

In [52]:
log_interval = 50
criterion = nn.CrossEntropyLoss()
def evaluate(model, data_loader):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    num_correct = 0
    total = 0 
    for a, b, l in data_loader:

        a, b, l = Variable(a), Variable(b), Variable(l)
        a.requires_grad = False
        b.requires_grad = False
        l.requires_grad = False
        out = model(a.squeeze(), b.squeeze())
        num_correct += np.sum(l.data.cpu().numpy() == np.argmax(out.data.cpu().numpy(), 1))
        total += out.shape[0]
        loss = criterion(out, l.squeeze())
        total_loss += out.shape[0] * loss.data.cpu()[0]

    return (total_loss / total, num_correct / total)

def train(model, data_loader, optimizer):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    
    num_correct = 0
    total = 0 
        
    for a, b, l in data_loader:
        
        optimizer.zero_grad()
        a, b, l = Variable(a), Variable(b), Variable(l)

        out = model(a.squeeze(), b.squeeze())
        loss = criterion(out, l.squeeze())
        total_loss += out.shape[0] * loss.data.cpu()[0]
        
        loss.backward()
        optimizer.step()
        
        num_correct += np.sum(l.data.cpu().numpy() == np.argmax(out.data.cpu().numpy(), 1))
        total += out.shape[0]

        batch = data_loader.batch()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / total
            elapsed = time.time() - start_time
            batches = len(data_loader)
            ms = elapsed * 1000 / log_interval
            print(f'| epoch {epoch:3d} | {batch:5d}/{batches:5d} batches', end=" ")
            print(f'| ms/batch {ms:5.2f} | loss {cur_loss:5.4f} acc {num_correct / total}')
            total_loss = 0
            total = 0
            num_correct = 0
            start_time = time.time()

In [54]:
lrs = [0.001, 0.0005, 0.0001, 0.0001, 0.0001, 0.00005, 0.00001]

for epoch, lr in enumerate(lrs):

    print(f'training with lr {lr}')
    optimizer = optim.Adam(siamese_model.parameters(), lr=lr)

    training_data = SiameseDataLoader(siamese_dataset_dev, stoi, stoi["_pad_"], batch_size=32)

    epoch_start_time = time.time()
    train(siamese_model, training_data, optimizer)

    validation_data = SiameseDataLoader(siamese_dataset_test , stoi, stoi["_pad_"], batch_size=32)
    val_loss, accuracy = evaluate(siamese_model, validation_data)

    delta_t = (time.time() - epoch_start_time)
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {delta_t:5.2f}s | valid loss {val_loss:5.2f} accuracy {accuracy} learning rate {lr}')
    print('-' * 89)


training with lr 0.001
| epoch   0 |    50/  307 batches | ms/batch 270.44 | loss 1.1969 acc 0.343125
| epoch   0 |   100/  307 batches | ms/batch 257.95 | loss 1.1029 acc 0.35
| epoch   0 |   150/  307 batches | ms/batch 231.96 | loss 1.0986 acc 0.34125
| epoch   0 |   200/  307 batches | ms/batch 251.36 | loss 1.0987 acc 0.34375


KeyboardInterrupt: 

In [None]:
with open(f'{data_root}siamese_model.pt', 'wb') as f:
    torch.save(siamese_model, f)