# ULMFiT + Siamese Network for Sentence Vectors
## Part One: Pretraining
This notebook takes a language model from lesson 10 of the fast.ai deep learning course and adds a Siamese network on top of it to create sentence vectors. We will be using the SNLI dataset. The first task is to build a network that predicts entailment. We will then create sentence vectors and evaluate their suitability as a similarity metric.

### You must have the fastai library installed

In [2]:
from fastai.text import *
import html

import json
import html
import re
import pickle
from collections import Counter
import random
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn
from sklearn import model_selection
from functools import partial
from collections import Counter, defaultdict

import numpy as np
import torch
import torch.nn as nn
import torch.utils 
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import dataset, dataloader
import torch.optim as optim
import torch.nn.functional as F

import time
import math
import sys
import data

# Root directory for every data file this notebook reads or writes.
# The trailing slash matters: paths are built as f'{data_root}<file name>'.
data_root = './data/'

ModuleNotFoundError: No module named 'fastai.text'

## Load the tokens from the SNLI data

In [9]:
# Token arrays for the SNLI training and validation splits.
# NOTE(review): these appear to be flat 1-D arrays of token strings (they are
# concatenated and counted directly below) - confirm against the tokenizer step.
tok_trn = np.load(f'{data_root}tok_trn.npy')
tok_val = np.load(f'{data_root}tok_val.npy')
freq = Counter(np.concatenate([tok_trn, tok_val]))

# Vocabulary: taken from the `max_vocab` most frequent tokens, keeping only
# those that occur strictly more than `min_freq` times.
max_vocab = 60000
min_freq = 2
itos = [o for o, c in freq.most_common(max_vocab) if c>min_freq]
# After these two inserts '_unk_' is index 0 and '_pad_' is index 1.
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')
# Tokens missing from the vocabulary map to index 0 ('_unk_').
stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

# Numericalise both splits with the new vocabulary.
trn_lm = np.array([stoi[p] for p in tok_trn])
val_lm = np.array([stoi[p] for p in tok_val])

#save results
# A context manager guarantees the pickle file is flushed and closed before
# the next cell reads it back (the bare open() call leaked the handle).
with open(f'{data_root}itos.pkl', 'wb') as itos_file:
    pickle.dump(itos, itos_file)
np.save(f'{data_root}trn_lm.npy', trn_lm)
np.save(f'{data_root}val_lm.npy', val_lm)

In [10]:
#load the results so we can pick it up from here 
# NOTE: pickle.load can execute arbitrary code; only load a file that this
# notebook wrote itself. The context manager closes the handle after reading
# (the bare open() call leaked it).
with open(f'{data_root}itos.pkl', 'rb') as itos_file:
    itos = pickle.load(itos_file)
trn_lm = np.load(f'{data_root}trn_lm.npy')
val_lm = np.load(f'{data_root}val_lm.npy')

# Rebuild the token -> id lookup; unknown tokens map to index 0.
stoi = defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
vocab_size = len(itos)
vocab_size

21434

In [11]:
# check to make sure that the data looks ok
# Decode the first 100 token ids back into text; each token is printed
# followed by a single space instead of a newline.
for word in trn_lm[:100]:
    print(itos[word], end=" ")


 xsos a man on a phone and a woman are eating candy . 
 xsos a bicycler rides his bike on the road next to rocks with snow . 
 xsos the bench is outdoors 
 xsos a woman is with her kid 
 xsos two little boys play football on green grass . 
 xsos four elderly people sitting under a white tent playing musical instruments . 
 xsos there are people sking . 
 xsos some people following a ball . 
 xsos two dogs play together . 
 xsos a very young girl is holding food 

## Load the Wikitext LM and fix the weights

In [12]:
#download the wikitext LM
# ! wget -nH -r -np -P ./data/aclImdb/ http://files.fast.ai/models/wt103/

In [13]:
# Hyperparameters of the original pretrained LM; they are passed to
# RNN_Encoder further down as embedding size, n_hid and n_layers.
em_sz = 400
nh = 1150
nl = 3

PRE_PATH = data_root + 'aclImdb/models/wt103'
PRE_LM_PATH = f'{PRE_PATH}/fwd_wt103.h5'
# The map_location callback hands back every storage unchanged instead of
# moving it to the device recorded in the checkpoint.
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [15]:
#get the mean weight value for any new vocab
enc_wgts = to_np(wgts['0.encoder.weight'])
row_m = enc_wgts.mean(0)

# Vocabulary of the pretrained model. Tokens it does not contain map to -1.
# NOTE: this pickle comes from a downloaded file and pickle.load can execute
# arbitrary code - only use a copy obtained from a trusted source.
# The context manager closes the file handle (the bare .open() leaked it).
with Path(PRE_PATH+'/itos_wt103.pkl').open('rb') as itos2_file:
    itos2 = pickle.load(itos2_file)
stoi2 = defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

#fill in the missing values from the old vocab
# Row i of the new matrix belongs to token i of *this* notebook's vocabulary:
# copy the pretrained row when the token exists there, otherwise use the mean row.
new_w = np.zeros((vocab_size, em_sz), dtype=np.float32)
for i,w in enumerate(itos):
    r = stoi2[w]
    new_w[i] = enc_wgts[r] if r>=0 else row_m
    
#fix up the wgts with the new values
# Each of the three entries receives its own copy of the remapped matrix.
wgts['0.encoder.weight'] = T(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w))

In [1]:
max_seq = 20*70
wd = 1e-7
bptt = 70
batch_size = 52

# trn_lm / val_lm are already flat 1-D arrays of token ids (one id per token),
# so they are passed straight through: np.concatenate() on a 1-D array of
# scalars raises "zero-dimensional arrays cannot be concatenated".
trn_dl = LanguageModelLoader(trn_lm, batch_size, bptt)
val_dl = LanguageModelLoader(val_lm, batch_size, bptt)
# `PATH` and `vs` were never defined in this notebook; the equivalents that do
# exist are `data_root` and `vocab_size`. The literal 1 is the index of '_pad_'.
# NOTE(review): assumes LanguageModelData accepts a plain string path - confirm.
md = LanguageModelData(data_root, 1, vocab_size, trn_dl, val_dl, bs=batch_size, bptt=bptt)

NameError: name 'LanguageModelLoader' is not defined

### Create the language model and load the weights

In [49]:
# Base dropout rates, all scaled by the same 0.7 factor.
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

# Unpack the five scaled rates positionally into the names used below.
dropouti, dropout, wdrop, dropoute, dropouth = drops
rnn_enc = RNN_Encoder(
    vocab_size, em_sz,
    n_hid=nh, n_layers=nl, pad_token=stoi['_pad_'],
    dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop,
    qrnn=False,
)
enc = rnn_enc.encoder
# The decoder is handed the encoder's embedding module via tie_encoder.
language_model = SequentialRNN(
    rnn_enc,
    LinearDecoder(vocab_size, em_sz, dropout, tie_encoder=enc),
)

In [50]:
# Load the pretrained weights (embedding rows already remapped to this
# notebook's vocabulary) into the freshly built model.
language_model.load_state_dict(wgts)
# Move the model to the GPU; a CUDA device is required from here on.
language_model = language_model.cuda()
# Loss shared by train() and evaluate(), which both read it as a global.
criterion = nn.CrossEntropyLoss()

In [73]:
# Number of batches between two progress lines printed by train().
log_interval = 50

def evaluate(model, data_source, ntokens, batch_size, bptt):
    """Compute the average loss of `model` over `data_source`.

    The source is walked in strides of `bptt`. Each chunk's loss (from the
    global `criterion`) is weighted by the chunk's first dimension, and the
    weighted sum is divided by ``len(data_source)``.

    `ntokens` and `batch_size` are accepted but not used by this function.
    Relies on the globals `get_batch`, `Variable` and `criterion`.
    """
    # Evaluation mode disables dropout.
    model.eval()
    loss_sum = 0.
    #with torch.no_grad():
    for offset in range(0, data_source.size(0) - 1, bptt):
        inputs, labels = get_batch(data_source, offset, bptt)

        inputs = Variable(inputs)
        inputs.requires_grad = False
        labels = Variable(labels)
        labels.requires_grad = False

        # The model returns three values; only the first one is scored.
        logits, _, _ = model(inputs)
        batch_loss = criterion(logits, labels)

        loss_sum += inputs.shape[0] * batch_loss.data.cpu()[0]

    return loss_sum / len(data_source)

def train(model, data_source, ntokens, batch_size, bptt, optimizer, epoch=None):
    """Run one training pass of `model` over `data_source`.

    The source is walked in strides of `bptt`; every chunk gets one optimizer
    step. Every `log_interval` batches a progress line with the running
    average loss and perplexity is printed and the running totals are reset.

    Args:
        model: module to train; calling it must return three values, the
            first of which is fed to the global `criterion`.
        data_source: tensor that `get_batch` slices into (data, targets).
        ntokens: accepted for interface compatibility; not used here.
        batch_size: accepted for interface compatibility; not used here.
        bptt: stride used to walk through `data_source`.
        optimizer: optimizer whose `zero_grad()` / `step()` are called per batch.
        epoch: epoch number shown in the progress line. When omitted, the
            notebook-level `epoch` variable is used if it exists (this is what
            the function previously always relied on), otherwise 0.

    Relies on the globals `get_batch`, `Variable`, `criterion` and
    `log_interval`.
    """
    if epoch is None:
        # Backward compatibility: older callers do not pass `epoch` and expect
        # the value of the driver loop's global variable to be printed.
        epoch = globals().get('epoch', 0)

    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    total_items = 0.
    start_time = time.time()
    
    for batch, i in enumerate(range(0, data_source.size(0) - 1, bptt)):
        data, targets = get_batch(data_source, i, bptt)
        
        # Clear the gradients accumulated by the previous batch.
        # NOTE(review): nothing here detaches the recurrent hidden state;
        # presumably the model handles that internally - confirm.
        optimizer.zero_grad()
        
        result, raw_outputs, outputs = model(Variable(data))
        
        loss = criterion(result, Variable(targets))
        loss.backward()
        optimizer.step()
        # Weight each batch loss by the size of the batch's first dimension so
        # the logged value is an average over items, not over batches.
        total_items += data.shape[0]
        total_loss += data.shape[0] * loss.data.cpu()[0]

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / total_items
            elapsed = time.time() - start_time
            batches = len(data_source) // bptt
            ms = elapsed * 1000 / log_interval
            print(f'| epoch {epoch:3d} | {batch:5d}/{batches:5d} batches', end=" ")
            #lr {scheduler.get_lr()[0]:02.5f}
            print(f'| ms/batch {ms:5.2f} | loss {cur_loss:5.4f} | ppl {math.exp(cur_loss):8.2f}')
            # Restart the running averages for the next logging window.
            total_loss = 0
            total_items = 0
            start_time = time.time()

In [76]:
# Fine-tuning configuration: learning rate, number of epochs and the Adam
# optimizer over every parameter of the language model.
lr = 1e-4

epochs = 3
optimizer = optim.Adam(language_model.parameters(), lr=lr,  betas=(0.8, 0.99))

# NOTE(review): `training_set` and `validation_set` are not defined in any
# cell visible here - confirm where they are created before running this.
# The loop variable must stay named `epoch`: train() reads it as a global
# when it prints its progress line.
for epoch in range(1, epochs+1):
    epoch_start_time = time.time()
    
    train(language_model, training_set, vocab_size, batch_size, bptt, optimizer)
    val_loss = evaluate(language_model, validation_set, vocab_size, batch_size, bptt)

    # Wall-clock seconds for the whole epoch (training plus validation).
    delta_t = (time.time() - epoch_start_time)
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {delta_t:5.2f}s | valid loss {val_loss:5.2f} | valid ppl {math.exp(val_loss):8.2f}')
    print('-' * 89)


| epoch   1 |    50/ 3662 batches | ms/batch 119.78 | loss 2.7412 | ppl    15.51
| epoch   1 |   100/ 3662 batches | ms/batch 117.33 | loss 2.7254 | ppl    15.26
| epoch   1 |   150/ 3662 batches | ms/batch 117.52 | loss 2.7423 | ppl    15.52
| epoch   1 |   200/ 3662 batches | ms/batch 117.62 | loss 2.7327 | ppl    15.37
| epoch   1 |   250/ 3662 batches | ms/batch 117.54 | loss 2.7352 | ppl    15.41
| epoch   1 |   300/ 3662 batches | ms/batch 117.33 | loss 2.7364 | ppl    15.43
| epoch   1 |   350/ 3662 batches | ms/batch 117.55 | loss 2.7434 | ppl    15.54
| epoch   1 |   400/ 3662 batches | ms/batch 117.41 | loss 2.7359 | ppl    15.42
| epoch   1 |   450/ 3662 batches | ms/batch 117.77 | loss 2.7372 | ppl    15.44
| epoch   1 |   500/ 3662 batches | ms/batch 117.41 | loss 2.7425 | ppl    15.53
| epoch   1 |   550/ 3662 batches | ms/batch 117.69 | loss 2.7303 | ppl    15.34
| epoch   1 |   600/ 3662 batches | ms/batch 117.51 | loss 2.7462 | ppl    15.58
| epoch   1 |   650/ 3662 ba

In [77]:
# Persist the fine-tuned model. The module object itself is passed to
# torch.save (not just its state_dict).
# NOTE(review): presumably loading it back requires the same class
# definitions to be importable - confirm.
with open(f'{data_root}language_model.pt', 'wb') as f:
    torch.save(language_model, f)

In [78]:
def roll(tensor, shift, axis):
    """Cyclically shift `tensor` by `shift` positions along `axis`.

    A positive `shift` moves elements toward higher indices and a negative
    one toward lower indices; elements pushed past one end re-enter at the
    other. Shifts whose magnitude is at least the dimension size wrap around
    (they are reduced modulo the size) instead of raising.

    Args:
        tensor: tensor to shift.
        shift: number of positions to shift by; may be negative.
        axis: dimension to shift along; negative values count from the end.

    Returns:
        The input tensor itself (no copy) when the effective shift is zero or
        the dimension is empty; otherwise a new tensor of the same shape.
    """
    if shift == 0:
        return tensor

    if axis < 0:
        axis += tensor.dim()

    dim_size = tensor.size(axis)
    if dim_size == 0:
        # Nothing to rotate, and the modulo below would divide by zero.
        return tensor

    # Python's % always yields a value in [0, dim_size), so a negative shift
    # becomes the equivalent positive one and oversized shifts wrap around.
    shift %= dim_size
    if shift == 0:
        return tensor

    split = dim_size - shift
    head = tensor.narrow(axis, 0, split)
    tail = tensor.narrow(axis, split, shift)
    return torch.cat([tail, head], axis)

In [105]:
sm = nn.LogSoftmax(0)
test_batch_size = 1
# Prompt buffer: 30 time steps, batch of one, filled with index 1 ('_pad_');
# the last three positions hold the actual prompt "xsos a ball".
sentence = torch.LongTensor(np.ones((30,1))).cuda()
sentence[-3] = stoi["xsos"]
sentence[-2] = stoi["a"]
sentence[-1] = stoi["ball"]

sentence = Variable(sentence)

# Switch dropout off explicitly so generation does not depend on whichever
# mode an earlier cell happened to leave the model in.
language_model.eval()

# NOTE(review): this first forward pass looks redundant, but if the encoder
# keeps hidden state between calls, removing it would change the generated
# text - confirm before deleting.
result, raw_outputs, outputs = language_model(sentence)
for i in range(30):
    result, raw_outputs, outputs = language_model(sentence)
    # Greedy decoding: take the highest-scoring token from row 29 of the output
    # (presumably the last time step - TODO confirm the output layout).
    out = sm(result[29]).data.cpu().numpy()
    word = np.argmax(out)
    print(itos[word], end=" ")
    # Slide the window one step left and append the predicted token at the end.
    sentence = roll(sentence, -1, 0)
    sentence[29] = torch.LongTensor([word.tolist()]).cuda()

. 
 xsos a man is sitting on a bench . 
 xsos a man is sitting on a bench . 
 xsos a man is sitting on a bench 