# ULMFiT + Siamese Network for Sentence Vectors
## Part Three: Classifying

The second notebook created a new language model from the SNLI dataset.
This notebook will adapt that model to predicting the SNLI category for sentence pairs.
The model will be used as a sentence encoder for a Siamese Network that builds sentence vectors that are feed into a classifier network.

In [1]:
from fastai.text import *
import html

import json
import html
import re
import pickle
from collections import Counter
import random
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn
from sklearn import model_selection
from functools import partial
from collections import Counter, defaultdict
import random

import numpy as np
import torch
import torch.nn as nn
import torch.utils 
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import dataset, dataloader
import torch.optim as optim
import torch.nn.functional as F

import time
import math
import sys
import data

snli_root = './data/snli_1.0/'
token_files = './data/tokens/'

In [2]:
#load the tokens
itos = pickle.load(open(f'{token_files}itos.pkl', 'rb'))

stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
vocab_size = len(itos)
vocab_size

34155

## Create a new dataloader to create sentence pairs

In [19]:
from enum import Enum

class Entail(Enum):
    entailment = 0
    contradiction = 1
    neutral = 2
       
class SiameseDataset(dataset.Dataset):
    def __init__(self, json_file):
        
        content = []
        with open(json_file) as fp:
            while True:
                line = fp.readline()
                if line:
                    content.append(json.loads(line))
                else:
                    break

        self.items = []
        for item in content:
            l = item['gold_label']
            s0 = item['sentence1']
            s1 = item['sentence2']
            
            average_len = (len(s0)+len(s1))/2
            try:
                label = Entail[l].value
                self.items.append((s0, s1, label, average_len))
            except KeyError:
                pass
            
    def shuffle(self):
        self.items.sort(key=lambda x: x[3]+random.randint(-5, 5))
        
    def __getitem__(self, index):
        return self.items[index]
       
    def __len__(self):
        return len(self.items)

    
class SiameseDataLoader():
    def __init__(self, dataset, stoi, pad_val, batch_size=32):
        self.dataset = dataset
        self.batch_size = batch_size
        self.stoi = stoi
        self.index = 0
        self.pad_val = pad_val
      
    def __iter__(self):
        return self
    
    def fill_tensor(self, sentences, max_len):
        data = np.zeros((max_len, len(sentences)), dtype=np.long)
        data.fill(self.pad_val)
        
        for i, s in enumerate(sentences): 
            start_idx = max_len - len(s)
            for j, p in enumerate(s):
                data[:,i][start_idx+j] = stoi[p]
            
        return torch.LongTensor([data.tolist()]).cuda()
     
    def batch(self):
        return self.index//self.batch_size
    
    def __len__(self):
        return len(self.dataset)//self.batch_size
    
    def __next__(self):
        #how many examples to ananlyise for this round
        num = min(self.batch_size, len(self.dataset) - self.index)
        
        if num < 1:
            raise StopIteration  # signals "the end"
            
        #collect the sentences
        max_len = 0
        first = []
        second = []
        labels = torch.LongTensor(num)
        
        for i in range(num):
            a, b, l, _ = self.dataset[self.index + i]
            
            if len(a) > max_len:
                max_len = len(a)
            
            if len(b) > max_len:
                max_len = len(b)
            
            first.append(a)
            second.append(b)
            labels[i] = l
            
        self.index += num
             
        return (self.fill_tensor(first, max_len).cuda(),
                self.fill_tensor(second, max_len).cuda(),
                labels.cuda()
               )

In [20]:
siamese_dataset_train = SiameseDataset(f'{snli_root}/snli_1.0_train.jsonl')
siamese_dataset_dev = SiameseDataset(f'{snli_root}snli_1.0_dev.jsonl')
siamese_dataset_test = SiameseDataset(f'{snli_root}snli_1.0_test.jsonl')

## Siamese network

In [21]:
class SiameseClassifier(nn.Module):
    
    def __init__(self, encoder, linear):
        super().__init__()
        self.encoder = encoder
        self.linear = linear
    
    def pool(self, x, bs, is_max):
        f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d
        return f(x.permute(1,2,0), (1,)).view(bs,-1)

    def pool_outputs(self, output):
        sl, bs,_ = output.size()
        avgpool = self.pool(output, bs, False)
        maxpool = self.pool(output, bs, True)
        return torch.cat([output[-1], maxpool, avgpool], 1)
        
    def forward_once(self, input):
        raw_outputs, outputs = self.encoder(input)
        out = self.pool_outputs(outputs[-1])
        return out
    
    def forward(self, in1, in2):
        u = self.forward_once(in1)
        v = self.forward_once(in2)
        features = torch.cat((u, v, torch.abs(u-v), u*v), 1)
        out = self.linear(features)
        return out 
        
    def reset(self):
        for c in self.children():
            if hasattr(c, 'reset'): c.reset()

class LinearClassifier(nn.Module):
    def __init__(self, layers, dropout):
        super().__init__()
        self.layers = nn.ModuleList([LinearBlock(layers[i], layers[i + 1], dropout) for i in range(len(layers) - 1)])
        
    def forward(self, input):
        x = input
        for l in self.layers:
            l_x = l(x)
            x = F.relu(l_x)
        return l_x


In [22]:
#these are the values used for the original LM
em_sz, nh, nl = 400, 1150, 3
bptt = 70
max_seq = bptt * 20
cats = 3

## Load our pretrained model then build the Siamese network from it

In [None]:
SNLI_LM = torch.load("SNLI_LM.pt")

dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.1
SNLI_encoder = MultiBatchRNN(bptt, max_seq, vocab_size, em_sz, nh, nl, stoi["_pad_"], dropouti=dps[0], wdrop=dps[2], dropoute=dps[3], dropouth=dps[4])

SNLI_encoder.load_state_dict(SNLI_LM[0].state_dict())

#2 pooled vectors, of 3 times the embedding size
siamese_model = SiameseClassifier(SNLI_encoder, LinearClassifier(layers=[em_sz*3*4, nh, em_sz], dropout=0.1)).cuda()

## Training loop
This should be converted over to the fast.ai learner but I'm not sure how to do that yet.

In [58]:
log_interval = 1000
criterion = nn.CrossEntropyLoss()
#criterion = nn.CosineEmbeddingLoss()

def evaluate(model, data_loader):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    
    total_loss = 0.
    num_correct = 0
    total = 0 
    
    for a, b, l in data_loader:
        
        model.reset()
        a, b, l = Variable(a), Variable(b), Variable(l)
        out = model(a.squeeze(), b.squeeze())
        loss = criterion(out, l.squeeze())
        total += l.size(0)
        total_loss += l.size(0) * loss.item()
        num_correct += np.sum(l.data.cpu().numpy() == np.argmax(out.data.cpu().numpy(), 1))
        
    return (total_loss / total, num_correct / total)

def train(model, data_loader, optimizer):
    # Turn on training mode which enables dropout.
    start_time = time.time()
    model.train() 
    
    total_loss = 0.
    num_correct = 0
    total = 0 
        
    for a, b, l in data_loader:
        optimizer.zero_grad()
        
        model.reset()
        a, b, l = Variable(a), Variable(b), Variable(l)
        out = model(a.squeeze(), b.squeeze())
        loss = criterion(out, l.squeeze())
        total += l.size(0)
        total_loss += l.size(0) * loss.item()
        num_correct += np.sum(l.data.cpu().numpy() == np.argmax(out.data.cpu().numpy(), 1))
        
        loss.backward()
        optimizer.step()

        batch = data_loader.batch()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / total
            elapsed = time.time() - start_time
            batches = len(data_loader)
            ms = elapsed * 1000 / log_interval
            print(f'| {batch:5d}/{batches:5d} batches', end=" ")
            print(f'| ms/batch {ms:5.2f} | loss {cur_loss:5.4f} acc {num_correct / total}')
            #print(f'| ms/batch {ms:5.2f} | loss {cur_loss:5.4f}')
            total_loss = 0
            total = 0
            num_correct = 0
            start_time = time.time()

In [68]:
best_accuracy = 0.72
def training_loop(lrs, model):
    global best_accuracy
    for epoch, lr in enumerate(lrs):

        print(f'Start epoch {epoch:3d} training with lr {lr}')
        optimizer = optim.SGD(model.parameters(), lr=lr)
        siamese_dataset_train.shuffle()
        training_data = SiameseDataLoader(siamese_dataset_train, stoi, stoi["_pad_"], batch_size=24)

        epoch_start_time = time.time()
        train(siamese_model, training_data, optimizer)

        validation_data = SiameseDataLoader(siamese_dataset_test , stoi, stoi["_pad_"], batch_size=24)
        val_loss, accuracy = evaluate(siamese_model, validation_data)

        delta_t = (time.time() - epoch_start_time)
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {delta_t:5.2f}s | valid loss {val_loss:5.2f} accuracy {accuracy} learning rate {lr}')
        print('-' * 89)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            with open(f'./siamese_model{best_accuracy:0.2f}.pt', 'wb') as f:
                torch.save(siamese_model, f)

In [None]:
for param in siamese_model.encoder.parameters():
    param.requires_grad = False

training_loop([0.05], siamese_model.linear)

In [70]:
for param in siamese_model.encoder.parameters():
    param.requires_grad = True
    
lrs = [x/200 for x in reversed(range(6, 17))]
training_loop(lrs, siamese_model)

Start epoch   0 training with lr 0.08
|  1000/22890 batches | ms/batch 68.93 | loss 0.5526 acc 0.77625
|  2000/22890 batches | ms/batch 78.50 | loss 0.5852 acc 0.7637083333333333
|  3000/22890 batches | ms/batch 84.69 | loss 0.6163 acc 0.7449583333333333
|  4000/22890 batches | ms/batch 89.33 | loss 0.6140 acc 0.74525
|  5000/22890 batches | ms/batch 93.95 | loss 0.6248 acc 0.7397916666666666
|  6000/22890 batches | ms/batch 97.44 | loss 0.6275 acc 0.7377083333333333
|  7000/22890 batches | ms/batch 104.01 | loss 0.6337 acc 0.7340833333333333
|  8000/22890 batches | ms/batch 109.99 | loss 0.6450 acc 0.7255416666666666
|  9000/22890 batches | ms/batch 115.36 | loss 0.6507 acc 0.726625
| 10000/22890 batches | ms/batch 121.93 | loss 0.6562 acc 0.723
| 11000/22890 batches | ms/batch 128.66 | loss 0.6601 acc 0.7204583333333333
| 12000/22890 batches | ms/batch 133.05 | loss 0.6695 acc 0.7187083333333333
| 13000/22890 batches | ms/batch 139.09 | loss 0.6720 acc 0.7139166666666666
| 14000/2289

In [None]:
torch.save(siamese_model, './siamese_model.pt')