# RNN Encoder-decoder vectorized

Use [fastai book chap 12](https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb) human numbers data to train translator from english like "two hundred seven" to sequence of digits like "207". Data looks like:

```
one 
two 
three 
...
two hundred seven 
two hundred eight 
...
```

This is vectorized version of [previous](encoder-decoder.ipynb).

Wow. much more accurate to use encoder h as context vector rather than init value for decoder!

In [1]:
from fastai2.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)
path

Path('/home/parrt/.fastai/data/human_numbers')

## Support

In [2]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [3]:
def get_text(filename:str):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().
    """
    f = codecs.open(filename, encoding='latin-1', mode='r')
    s = f.read()
    f.close()
    return s

In [4]:
def getvocab(strings):
    letters = [list(l) for l in strings]
    vocab = set([c for cl in letters for c in cl])
    vocab = sorted(list(vocab))
    ctoi = {c:i for i, c in enumerate(vocab)}
    return vocab, ctoi

In [5]:
def softmax(y):
    expy = torch.exp(y)
    if len(y.shape)==1: # 1D case can't use axis arg
        return expy / torch.sum(expy)
    return expy / torch.sum(expy, axis=1).reshape(-1,1)

In [6]:
def get_max_len(X):
    max_len = 0
    for x in X:
        max_len = max(max_len, len(x))
    return max_len

## Load

In [7]:
text = get_text(path/'train.txt').strip()
print(text[:28])
lines = text.lower().split('\n')
print(lines[:5])

one 
two 
three 
four 
five 
['one ', 'two ', 'three ', 'four ', 'five ']


In [8]:
lines = lines[0:2000] # testing

In [9]:
# get unique vocab but don't sort; keep order so 'one'=1 etc...
# use '#' to indicate padded (unused) char for embedding purposes
v = set('#')
X_vocab = ['#']  # position 0 means pad symbol
for t in text.split():
    if t not in v:
        X_vocab.append(t)
        v.add(t)
X_vocab[:10]

['#', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

In [10]:
X_tokens = [line.strip().split(' ') for line in lines]
X_tokens[18:23]

[['nineteen'],
 ['twenty'],
 ['twenty', 'one'],
 ['twenty', 'two'],
 ['twenty', 'three']]

In [11]:
len(X_tokens), len(lines)

(2000, 2000)

In [12]:
n = len(X_tokens)
batch_size = 64
nbatches = n // batch_size
n = nbatches * batch_size
X_tokens = X_tokens[:n]

X_vocab = {w:i for i,w in enumerate(X_vocab)}
X_idx = {i:w for i,w in enumerate(X_vocab)}
X_vocab['one'], X_vocab['eleven'], X_idx[1], X_idx[11]

(1, 11, 'one', 'eleven')

In [13]:
# numericalize and left pad
X_max_len = get_max_len(X_tokens)
X = torch.zeros(len(X_tokens), X_max_len, device=device, dtype=torch.long) # zero implies padding
print(X.shape)
for i in range(len(X_tokens)):
    x = X_tokens[i]
    pad = X_max_len - len(x)
    for j in range(len(x)):
        X[i,j+pad] = X_vocab[X_tokens[i][j]]
X[25:31]

torch.Size([1984, 6])


tensor([[ 0,  0,  0,  0, 20,  6],
        [ 0,  0,  0,  0, 20,  7],
        [ 0,  0,  0,  0, 20,  8],
        [ 0,  0,  0,  0, 20,  9],
        [ 0,  0,  0,  0,  0, 21],
        [ 0,  0,  0,  0, 21,  1]], device='cuda:0')

## Translation

### Define y sequence of digits

Let's use Y as list of lists like X; targets like `'one' -> '1'`, `['twenty', 'three'] -> ['2','3']`, etc...

Use '<' for start of sequence and '>' for end. So sequence `ab` is stored `<ab>`.



In [14]:
Y_vocab = {d:i for i,d in enumerate("0123456789<>")}
Y_idx = {i:w for i,w in enumerate("0123456789<>")}
Y_vocab

{'0': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9,
 '<': 10,
 '>': 11}

In [15]:
Ystr = [f"<{i+1}>" for i in range(0,len(X))]
Y_max_len = get_max_len(Ystr)
Ystr[:11]

['<1>', '<2>', '<3>', '<4>', '<5>', '<6>', '<7>', '<8>', '<9>', '<10>', '<11>']

In [16]:
Y = []
for i in range(0,len(X)):
    y = Ystr[i]
    pad = Y_max_len - len(y)
    Y.append([Y_vocab[d] for d in y]+[Y_vocab['>']]*pad)  # pad with "end of string" symbols '>'
Y = torch.tensor(Y)
Y[19:25]

tensor([[10,  2,  0, 11, 11, 11],
        [10,  2,  1, 11, 11, 11],
        [10,  2,  2, 11, 11, 11],
        [10,  2,  3, 11, 11, 11],
        [10,  2,  4, 11, 11, 11],
        [10,  2,  5, 11, 11, 11]])

In [17]:
Y[0:5]

tensor([[10,  1, 11, 11, 11, 11],
        [10,  2, 11, 11, 11, 11],
        [10,  3, 11, 11, 11, 11],
        [10,  4, 11, 11, 11, 11],
        [10,  5, 11, 11, 11, 11]])

In [18]:
Y[130:135]

tensor([[10,  1,  3,  1, 11, 11],
        [10,  1,  3,  2, 11, 11],
        [10,  1,  3,  3, 11, 11],
        [10,  1,  3,  4, 11, 11],
        [10,  1,  3,  5, 11, 11]])

In [19]:
embed_sz = 20
y_embed_sz = 8
nhidden = 512
nclasses = len(Y_vocab) # char output vocab

print(f"{n:,d} training records, {len(X_vocab)} X symbols, batch size {batch_size}, {nclasses} target classes, h state is {nhidden}-vector")

1,984 training records, 30 X symbols, batch size 64, 12 target classes, h state is 512-vector


### Train

In [33]:
def forward(batch_X, X_max_len, batch_Y, Y_max_len):
    # ENCODER
    H = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)
    for t in range(X_max_len):
        x_step_t = batch_X[:,t]
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        embedding_step_t = Ex[:,x_step_t]
        H = W@H + U@embedding_step_t + bx
        H = torch.tanh(H)
    C = H
    
    # DECODER
    loss = 0.0
    correct = 0
#     print("DECODE", batch_Y)
    H = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)
    for t in range(Y_max_len-1): # don't predict next char at final '>'
        embedding_step_t = Ey[:,batch_Y[:,t]]
#         print("H, W2, U2, Ey, embedding_step_t, By")
#         print(H.shape, W2.shape, U2.shape, Ey.shape, embedding_step_t.shape, By.shape)
        H = W2 @ H + Cx@C + U2 @ embedding_step_t + by
        H = torch.tanh(H)
        o = V @ H + bo
        o = o.T # reshape to be batch_size x nclasses
#         print("O",o.shape)
#         o = o.reshape(batch_size,nclasses)
        # From y we want to predict y[1:]. at y[t], predict y[t+1]
        loss += F.cross_entropy(o, torch.tensor(batch_Y[:,t+1], device=device))

        p = softmax(o)
#         print(torch.argmax(p, dim=1), torch.tensor(batch_Y[:,t+1], device=device))
        c = torch.sum(torch.argmax(p, dim=1)==torch.tensor(batch_Y[:,t+1], device=device))
#         print(c.item())
        correct += c.item()
    return loss, correct

In [34]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
Ex = torch.randn(embed_sz,     len(X_vocab),  device=device, dtype=torch.float64, requires_grad=True) # embedding
W = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,       embed_sz,      device=device, dtype=torch.float64, requires_grad=True) # input converter
bx = torch.zeros(nhidden,      1,             device=device, dtype=torch.float64, requires_grad=True)
by = torch.zeros(nhidden,      1,             device=device, dtype=torch.float64, requires_grad=True)
bo = torch.zeros(nclasses,     1,             device=device, dtype=torch.float64, requires_grad=True)

Ey = torch.randn(y_embed_sz,   len(Y_vocab),  device=device, dtype=torch.float64, requires_grad=True) # embedding
W2 = torch.eye(nhidden,        nhidden,       device=device, dtype=torch.float64, requires_grad=True)
Cx = torch.eye(nhidden,        nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U2 = torch.randn(nhidden,      y_embed_sz,    device=device, dtype=torch.float64, requires_grad=True) # input converter
V = torch.randn(nclasses,      nhidden,       device=device, dtype=torch.float64, requires_grad=True)

# optimizer = torch.optim.Adam([Ex,W,U,Ey,W2,U2,V], lr=0.001, weight_decay=0.0)
optimizer = torch.optim.Adam([Ex,W,U,Ey,W2,Cx,U2,V,bx,by,bo], lr=0.001, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              mode='triangular2',
                                              step_size_up=4,
                                              base_lr=0.0003, max_lr=0.001,
                                              cycle_momentum=False)
torch.autograd.set_detect_anomaly(True)

history = []
epochs = 50
for epoch in range(1, epochs+1):
#     print(f"EPOCH {epoch}")
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    total_compares = 0
    for p in range(0, n, batch_size):  # do one epoch
        loss = 0
        batch_X = X[p:p+batch_size]
        batch_Y = Y[p:p+batch_size]
        loss, correct = forward(batch_X, X_max_len, batch_Y, Y_max_len)
        epoch_training_accur += correct
        epoch_training_loss += loss.detach().item()
        total_compares += batch_size * (Y_max_len - 1)

        optimizer.zero_grad()
        loss.backward() # autograd computes U.grad, M.grad, ...
        optimizer.step()

        epoch_training_loss += loss.detach().item()

    scheduler.step()
    epoch_training_loss /= nbatches
    epoch_training_accur /= total_compares
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:7.4f} accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")



Epoch   1 training loss 144.2157 accur  0.3804   LR 0.000475
Epoch   2 training loss 59.9173 accur  0.5636   LR 0.000650
Epoch   3 training loss 43.4087 accur  0.6148   LR 0.000825
Epoch   4 training loss 35.6588 accur  0.6423   LR 0.001000
Epoch   5 training loss 26.6445 accur  0.6783   LR 0.000825
Epoch   6 training loss 15.1577 accur  0.7553   LR 0.000650
Epoch   7 training loss 14.5702 accur  0.7723   LR 0.000475
Epoch   8 training loss  6.5942 accur  0.8702   LR 0.000300
Epoch   9 training loss  3.0505 accur  0.9329   LR 0.000387
Epoch  10 training loss  1.1049 accur  0.9659   LR 0.000475
Epoch  11 training loss  0.4836 accur  0.9856   LR 0.000563
Epoch  12 training loss  0.3146 accur  0.9928   LR 0.000650
Epoch  13 training loss  0.1960 accur  0.9959   LR 0.000563
Epoch  14 training loss  0.1272 accur  0.9984   LR 0.000475
Epoch  15 training loss  0.0837 accur  0.9997   LR 0.000387
Epoch  16 training loss  0.0758 accur  0.9998   LR 0.000300
Epoch  17 training loss  0.0608 accur  

In [24]:
def sample(x):
    n = len(x)
    output = []
    with torch.no_grad():
        # ENCODER
        h = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        for t in range(len(x)):
            embedding_step_t = Ex[:,x[t]]
            embedding_step_t = embedding_step_t.reshape(embed_sz,1)
            h = W @ h + U @ embedding_step_t + bx
            h = torch.tanh(h)
        c = h
        
        # DECODER
        y = [Y_vocab['<']] # begin with "start of sequence" char
        loss = 0.0
        correct = 0
        h = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        while y!=Y_vocab['>']:
            embedding_step_t = Ey[:,y]
            embedding_step_t = embedding_step_t.reshape(y_embed_sz,1)
            h = W2 @ h + Cx@c + U2 @ embedding_step_t + by
            h = torch.tanh(h)
            o = V @ h + bo
            o = o.reshape(1,nclasses)
            p = softmax(o[0])
            y = torch.argmax(p).item()
            if y!=Y_vocab['>']:
                output.append(Y_idx[y])
    return output

In [25]:
x = [X_vocab[w] for w in "one".split()]
output = sample(x)
print([X_idx[n] for n in x],'=>', output)

['one'] => ['1', '1', '2', '9', '1']


In [26]:
x = [X_vocab[w] for w in "one hundred".split()]
output = sample(x)
print([X_idx[n] for n in x],'=>', output)

['one', 'hundred'] => ['1', '1', '1', '0', '0']


In [27]:
x = [X_vocab[w] for w in "one hundred ten".split()]
output = sample(x)
print([X_idx[n] for n in x],'=>', output)

['one', 'hundred', 'ten'] => ['1', '1', '1', '0']


In [28]:
x = [X_vocab[w] for w in "one hundred thirty two".split()]
output = sample(x)
print([X_idx[n] for n in x],'=>', output)

['one', 'hundred', 'thirty', 'two'] => ['1', '1', '3', '2']


In [29]:
x = [X_vocab[w] for w in "eleven".split()]
output = sample(x)
print([X_idx[n] for n in x],'=>', output)

['eleven'] => ['1', '1', '1']


In [30]:
x = [X_vocab[w] for w in "ninety nine".split()]
output = sample(x)
print([X_idx[n] for n in x],'=>', output)

['ninety', 'nine'] => ['1', '9', '9']


In [31]:
x = [X_vocab[w] for w in "fifty three".split()]
output = sample(x)
print([X_idx[n] for n in x],'=>', output)

['fifty', 'three'] => ['1', '5', '3']


In [32]:
x = [X_vocab[w] for w in "one thousand four hundred fifteen".split()]
output = sample(x)
print([X_idx[n] for n in x],'=>', output)

['one', 'thousand', 'four', 'hundred', 'fifteen'] => ['1', '4', '1', '5']
