# ULMFiT + Siamese Network for Sentence Vectors
## Part Three: Classifying

The second notebook created a new language model from the SNLI dataset.
This notebook will adapt that model to predicting the SNLI category for sentence pairs.
The model will be used as a sentence encoder for a Siamese Network that builds sentence vectors that are feed into a classifier network.

In [1]:
from fastai.text import *
import html

import json
import html
import re
import pickle
from collections import Counter
import random
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn
from sklearn import model_selection
from functools import partial
from collections import Counter, defaultdict
import random

import numpy as np
import torch
import torch.nn as nn
import torch.utils 
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import dataset, dataloader
import torch.optim as optim
import torch.nn.functional as F

import time
import math
import sys
import data

snli_root = './data/SNLI/'

In [2]:
#load the tokens
itos = pickle.load(open(f'{snli_root}itos.pkl', 'rb'))

stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
vocab_size = len(itos)
vocab_size

34155

## Create a new dataloader to create sentence pairs

In [3]:
from enum import Enum

class Entail(Enum):
    entailment = 0
    contradiction = 1
    neutral = 2
       
class SiameseDataset(dataset.Dataset):
    def __init__(self, json_file):
        
        content = None
        with open(json_file) as fp:
            content = json.load(fp)

        self.items = []
        for item in content:
            s0 = item[0]
            s1 = item[1]
            average_len = (len(s0)+len(s1))/2
            try:
                label = Entail[item[2]].value
                self.items.append((s0, s1, label, average_len))
            except KeyError:
                pass
            
    def shuffle(self):
        self.items.sort(key=lambda x: x[3]+random.randint(-5, 5))
        
    def __getitem__(self, index):
        return self.items[index]
       
    def __len__(self):
        return len(self.items)
    
class SiameseDataLoader():
    def __init__(self, dataset, stoi, pad_val, batch_size=32):
        self.dataset = dataset
        self.batch_size = batch_size
        self.stoi = stoi
        self.index = 0
        self.pad_val = pad_val
      
    def __iter__(self):
        return self
    
    def fill_tensor(self, sentences, max_len):
        data = np.zeros((max_len, len(sentences)), dtype=np.long)
        data.fill(self.pad_val)
        
        for i, s in enumerate(sentences): 
            start_idx = max_len - len(s)
            for j, p in enumerate(s):
                data[:,i][start_idx+j] = stoi[p]
            
        return torch.LongTensor([data.tolist()]).cuda()
     
    def batch(self):
        return self.index//self.batch_size
    
    def __len__(self):
        return len(self.dataset)//self.batch_size
    
    def __next__(self):
        #how many examples to ananlyise for this round
        num = min(self.batch_size, len(self.dataset) - self.index)
        
        if num < 1:
            raise StopIteration  # signals "the end"
            
        #collect the sentences
        max_len = 0
        first = []
        second = []
        labels = np.zeros((num), dtype=np.long)
        
        for i in range(num):
            a, b, l, _ = self.dataset[self.index + i]
            
            if len(a) > max_len:
                max_len = len(a)
            
            if len(b) > max_len:
                max_len = len(b)
            
            first.append(a)
            second.append(b)
            labels[i] = l
            
        self.index += num
             
        return (self.fill_tensor(first, max_len).cuda(),
                self.fill_tensor(second, max_len).cuda(),
                torch.LongTensor([labels.tolist()]).cuda()
               )

In [4]:
siamese_dataset_train = SiameseDataset(f'{snli_root}/snli_train.json')
siamese_dataset_dev = SiameseDataset(f'{snli_root}snli_dev.json')
siamese_dataset_test = SiameseDataset(f'{snli_root}snli_test.json')

## Siamese network

In [5]:
class SiameseClassifier(nn.Module):
    
    def __init__(self, encoder, classifier):
        super().__init__()
        self.encoder = encoder
        self.classifier = classifier
    
    def pool(self, x, bs, is_max):
        f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d
        return f(x.permute(1,2,0), (1,)).view(bs,-1)

    def pool_outputs(self, output):
        sl, bs,_ = output.size()
        avgpool = self.pool(output, bs, False)
        maxpool = self.pool(output, bs, True)
        return torch.cat([output[-1], maxpool, avgpool], 1)
        
    def forward(self, input1, input2):

        raw_outputs1, outputs1 = self.encoder(input1)
        raw_outputs2, outputs2 = self.encoder(input2)
        
        out1 = self.pool_outputs(outputs1[-1])
        out2 = self.pool_outputs(outputs2[-1])
        
        out = torch.cat([out1, out2], 1)
        
        return self.classifier(out)
        
    def reset(self):
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()

class LinearClassifier(nn.Module):
    def __init__(self, layers, dropout):
        super().__init__()
        self.layers = nn.ModuleList([LinearBlock(layers[i], layers[i + 1], dropout) for i in range(len(layers) - 1)])
        
    def forward(self, input):
        x = input
        for l in self.layers:
            l_x = l(x)
            x = F.relu(l_x)
        return l_x

## Load our pretrained model then build the Siamese network from it

In [18]:
#these are the values used for the original LM
em_sz, nh, nl = 400,1150,3
bptt = 70
max_seq = bptt * 20
cats = 3

SNLI_LM = torch.load("SNLI_LM.pt")

dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.1
SNLI_encoder = MultiBatchRNN(bptt, max_seq, vocab_size, em_sz, nh, nl, stoi["_pad_"], dropouti=dps[0], wdrop=dps[2], dropoute=dps[3], dropouth=dps[4])

SNLI_encoder.load_state_dict(SNLI_LM[0].state_dict())

#2 pooled vectors, of 3 times the embedding size
siamese_model = SiameseClassifier(SNLI_encoder, LinearClassifier(layers=[2*em_sz*3, 50, cats], dropout=0.1)).cuda()

## Training loop
This should be converted over to the fast.ai learner but I'm not sure how to do that yet.

In [7]:
log_interval = 100
criterion = nn.CrossEntropyLoss()
def evaluate(model, data_loader):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    model.reset()
    total_loss = 0.
    num_correct = 0
    total = 0 
    for a, b, l in data_loader:

        a, b, l = Variable(a), Variable(b), Variable(l)
        a.requires_grad = False
        b.requires_grad = False
        l.requires_grad = False
        out = model(a.squeeze(), b.squeeze())
        num_correct += np.sum(l.data.cpu().numpy() == np.argmax(out.data.cpu().numpy(), 1))
        total += out.shape[0]
        loss = criterion(out, l.squeeze())
        total_loss += out.shape[0] * loss.data.cpu()[0]

    return (total_loss / total, num_correct / total)

def train(model, data_loader, optimizer):
    # Turn on training mode which enables dropout.
    model.train()
    model.reset()
    total_loss = 0.
    start_time = time.time()
    
    num_correct = 0
    total = 0 
        
    for a, b, l in data_loader:
        
        optimizer.zero_grad()
        a, b, l = Variable(a), Variable(b), Variable(l)

        out = model(a.squeeze(), b.squeeze())
        loss = criterion(out, l.squeeze())
        total_loss += out.shape[0] * loss.data.cpu()[0]
        
        loss.backward()
        optimizer.step()
        
        num_correct += np.sum(l.data.cpu().numpy() == np.argmax(out.data.cpu().numpy(), 1))
        total += out.shape[0]

        batch = data_loader.batch()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / total
            elapsed = time.time() - start_time
            batches = len(data_loader)
            ms = elapsed * 1000 / log_interval
            print(f'| {batch:5d}/{batches:5d} batches', end=" ")
            print(f'| ms/batch {ms:5.2f} | loss {cur_loss:5.4f} acc {num_correct / total}')
            total_loss = 0
            total = 0
            num_correct = 0
            start_time = time.time()

In [24]:
best_acc = 0
def training_loop(lrs, model):
    global best_acc
    for epoch, lr in enumerate(lrs):

        print(f'Start epoch {epoch:3d} training with lr {lr}')
        optimizer = optim.Adam(model.parameters(), lr=lr)
        siamese_dataset_train.shuffle()
        training_data = SiameseDataLoader(siamese_dataset_train, stoi, stoi["_pad_"], batch_size=32)

        epoch_start_time = time.time()
        train(siamese_model, training_data, optimizer)

        validation_data = SiameseDataLoader(siamese_dataset_test , stoi, stoi["_pad_"], batch_size=32)
        val_loss, accuracy = evaluate(siamese_model, validation_data)

        delta_t = (time.time() - epoch_start_time)
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {delta_t:5.2f}s | valid loss {val_loss:5.2f} accuracy {accuracy} learning rate {lr}')
        print('-' * 89)

        if accuracy > best_acc:
            best_acc = accuracy
            with open(f'./siamese_model.pt', 'wb') as f:
                torch.save(siamese_model, f)

In [20]:
for param in siamese_model.encoder.parameters():
    param.requires_grad = False
    
training_loop([3e-3], siamese_model.classifier)

Start epoch   0 training with lr 0.003
|   100/  307 batches | ms/batch 99.59 | loss 1.0719 acc 0.4121875
|   200/  307 batches | ms/batch 103.49 | loss 1.0639 acc 0.4246875
|   300/  307 batches | ms/batch 100.20 | loss 1.0571 acc 0.4425
-----------------------------------------------------------------------------------------
| end of epoch   0 | time: 59.11s | valid loss  1.05 accuracy 0.44421824104234525 learning rate 0.003
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [21]:
for param in siamese_model.encoder.parameters():
    param.requires_grad = True
    
training_loop([0.0002, 0.0004, 0.0008, 0.001, 0.0005, 0.0003, 0.0001, 0.00008, 0.00004], siamese_model)

Start epoch   0 training with lr 0.0002
|   100/  307 batches | ms/batch 238.94 | loss 1.0337 acc 0.4653125
|   200/  307 batches | ms/batch 248.22 | loss 1.0380 acc 0.4534375
|   300/  307 batches | ms/batch 240.44 | loss 1.0244 acc 0.479375
-----------------------------------------------------------------------------------------
| end of epoch   0 | time: 103.47s | valid loss  1.03 accuracy 0.4732288273615635 learning rate 0.0002
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Start epoch   1 training with lr 0.0004
|   100/  307 batches | ms/batch 239.21 | loss 1.0094 acc 0.4940625
|   200/  307 batches | ms/batch 248.07 | loss 1.0079 acc 0.49375
|   300/  307 batches | ms/batch 240.97 | loss 0.9967 acc 0.51625
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 103.52s | valid loss  1.01 accuracy 0.49297638436482083 learning rate 0.0004
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Start epoch   2 training with lr 0.0008
|   100/  307 batches | ms/batch 239.18 | loss 0.9849 acc 0.515
|   200/  307 batches | ms/batch 247.99 | loss 0.9876 acc 0.5028125
|   300/  307 batches | ms/batch 240.45 | loss 0.9702 acc 0.5115625
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 103.49s | valid loss  0.99 accuracy 0.5154723127035831 learning rate 0.0008
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Start epoch   3 training with lr 0.001
|   100/  307 batches | ms/batch 239.55 | loss 0.9362 acc 0.5584375
|   200/  307 batches | ms/batch 248.44 | loss 0.9364 acc 0.548125
|   300/  307 batches | ms/batch 240.76 | loss 0.9289 acc 0.56
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 103.63s | valid loss  0.99 accuracy 0.5261604234527687 learning rate 0.001
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Start epoch   4 training with lr 0.0005
|   100/  307 batches | ms/batch 239.76 | loss 0.8949 acc 0.5828125
|   200/  307 batches | ms/batch 248.34 | loss 0.8913 acc 0.5778125
|   300/  307 batches | ms/batch 240.89 | loss 0.8563 acc 0.6109375
-----------------------------------------------------------------------------------------
| end of epoch   4 | time: 103.64s | valid loss  0.98 accuracy 0.5414291530944625 learning rate 0.0005
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Start epoch   5 training with lr 0.0003
|   100/  307 batches | ms/batch 239.55 | loss 0.8431 acc 0.615
|   200/  307 batches | ms/batch 248.75 | loss 0.8435 acc 0.60625
|   300/  307 batches | ms/batch 240.81 | loss 0.8113 acc 0.6253125
-----------------------------------------------------------------------------------------
| end of epoch   5 | time: 103.65s | valid loss  0.99 accuracy 0.5453990228013029 learning rate 0.0003
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Start epoch   6 training with lr 0.0001
|   100/  307 batches | ms/batch 239.64 | loss 0.8161 acc 0.6284375
|   200/  307 batches | ms/batch 248.73 | loss 0.8198 acc 0.6228125
|   300/  307 batches | ms/batch 241.20 | loss 0.7870 acc 0.6559375
-----------------------------------------------------------------------------------------
| end of epoch   6 | time: 103.68s | valid loss  0.99 accuracy 0.5498778501628665 learning rate 0.0001
-----------------------------------------------------------------------------------------


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Start epoch   7 training with lr 8e-05
|   100/  307 batches | ms/batch 239.86 | loss 0.8062 acc 0.6378125
|   200/  307 batches | ms/batch 248.51 | loss 0.8072 acc 0.63875
|   300/  307 batches | ms/batch 241.09 | loss 0.7672 acc 0.6578125
-----------------------------------------------------------------------------------------
| end of epoch   7 | time: 103.72s | valid loss  0.99 accuracy 0.5495724755700325 learning rate 8e-05
-----------------------------------------------------------------------------------------
Start epoch   8 training with lr 4e-05
|   100/  307 batches | ms/batch 239.55 | loss 0.7913 acc 0.638125
|   200/  307 batches | ms/batch 248.19 | loss 0.7972 acc 0.6353125
|   300/  307 batches | ms/batch 240.70 | loss 0.7483 acc 0.6628125
-----------------------------------------------------------------------------------------
| end of epoch   8 | time: 103.58s | valid loss  0.98 accuracy 0.5501832247557004 learning rate 4e-05
-------------------------------------------

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [None]:
training_loop([0.0005, 0.0003, 0.0001, 0.00008], siamese_model)

Start epoch   0 training with lr 0.0005
|   100/17167 batches | ms/batch 71.78 | loss 0.9766 acc 0.56625
|   200/17167 batches | ms/batch 78.40 | loss 0.9085 acc 0.59125
|   300/17167 batches | ms/batch 80.94 | loss 0.9006 acc 0.58375
|   400/17167 batches | ms/batch 83.87 | loss 0.8837 acc 0.5928125
|   500/17167 batches | ms/batch 85.30 | loss 0.8903 acc 0.5859375
|   600/17167 batches | ms/batch 87.14 | loss 0.8986 acc 0.5859375
|   700/17167 batches | ms/batch 89.20 | loss 0.9035 acc 0.578125
|   800/17167 batches | ms/batch 90.20 | loss 0.9032 acc 0.57125
|   900/17167 batches | ms/batch 91.62 | loss 0.8920 acc 0.590625
|  1000/17167 batches | ms/batch 92.51 | loss 0.9004 acc 0.57375


In [18]:
with open(f'./siamese_model0.50.pt', 'wb') as f:
    torch.save(siamese_model, f)

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
