# Generate obama speeches using truncated back propagation and add embedding layer instead of one-hot encoding going into RNN

Lessons:

* No nonlinearity on the embedding layer (I don't think people use a nonlinearity for this). Yep, adding an embedding of chars before the RNN helps: squeezing len(vocab) down to a small embedding (like 20) gives a more meaningful embedding than one of size len(vocab). After 20 epochs, the model was only 47% accurate before, with:
```
lr = 0.001
obama 100k text
nchunks = 100
nhidden = 512
bptt = 8
char embed size is 20
```
and is now 57% accurate! Got to 59% at 30 epochs.
* Rather than one-hotting entire 2D input matrix, much smaller on GPU with embeddings.
* With 1M char, 30 epochs same hyperparams gives 61% accur
* Bumping to 2M char seems to help
* With 1M char, making char embed size same as vocab len is converging slightly more slowly.  char embed size of 10 also less good (from 20). char embed size 30 seems about same.
* Back to default args above. Increase nhidden to 600 from 512. seems much slower per epoch and not converging as fast. Trying 400: seems about same as 512.
* Setting stddev to 0.01 for randn init seems to help. At epoch 6, (lr=0.001) we get 59% vs 56% accuracy (400 nhidden). 64% accurate at 30 epochs.
* bptt from 8 to 16 is slower to converge but catches up.
* nchunks 50 from 100 about same
* nchunks 200 from 100 slower to converge even when bumping lr
* 100 training records, chunk length 10000, vocab size 70, char_embed_sz 20, state is 400-vector; lr=0.001 dropping by .8 every 3 got me to 65% accurate.
* with 100 epochs, got to 67% accurate with `lr_scheduler.StepLR(optimizer, step_size=10, gamma=.9)`:
```
...
Epoch  99 training loss 10488.16   accur  0.6730   LR 0.000387
Epoch 100 training loss 10490.52   accur  0.6727   LR 0.000349
```
* Same LR plan and with 2M text:
```
Epoch  99 training loss 21393.74   accur  0.6671   LR 0.000387
Epoch 100 training loss 21388.15   accur  0.6674   LR 0.000349
```
vocab size seems to be increasing with increased text so should probably increase other hyperparameters
* with all 4M text doesn't help so must need more complex model
```
Epoch  99 training loss 45675.70   accur  0.6647   LR 0.000387
Epoch 100 training loss 45670.77   accur  0.6649   LR 0.000349
```
* Adding bias (matrix) seems to help a tiny bit (tanh activation), at least for first 10 epochs i watched.
* Ah! Mystery solved. relu for RNN only works with bias term, otherwise get NaN immediately.
* relu gets to 64% accur at 20 epochs. 
* Dang. relu still sometimes explodes. ah. must use F.cross_entropy() due to numerical instability.
* Added V2 final layer after V. Worse at same parameters. Could require different learning rate.

In [1]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [2]:
import codecs
def get_text(filename:str, encoding=None):
    """
    Load and return the entire contents of a text file as a string.

    filename: path of the file to read.
    encoding: name of the codec used to decode the file, e.g. 'latin-1'
              (what the BBC corpus uses) or 'utf-8'.  The default, None,
              decodes with the platform's default encoding, which is what
              this function always did: it never actually passed latin-1,
              despite what its old docstring said.
    """
    # codecs.open() with no encoding simply delegates to the builtin open()
    # in text mode, so the builtin is used directly here.
    with open(filename, mode='r', encoding=encoding) as f:
        return f.read()

In [3]:
def normal_transform(x, mean=0.0, std=0.01):
    "Scale x by std and then shift it by mean (e.g. turns N(0,1) samples into N(mean, std^2) samples)"
    scaled = x * std
    return scaled + mean

def randn(n1, n2,          
          mean=0.0, std=0.01, requires_grad=False,
          device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
          dtype=torch.float64):
    """
    Return an n1 x n2 tensor of normally distributed values rescaled by
    normal_transform() to the given mean and std, created on `device` with
    `dtype`, and with its requires_grad flag set as requested.
    """
    standard_normal = torch.randn(n1, n2, device=device, dtype=dtype)
    samples = normal_transform(standard_normal, mean=mean, std=std)
    # Set the flag only after the transform so the returned tensor is a leaf.
    samples.requires_grad_(requires_grad)
    return samples

In [4]:
def plot_history(history, yrange=(0.0, 5.00), figsize=(3.5,3)):
    """
    Plot training and validation loss curves.

    history: 2D array indexed as history[:,0] (training loss per epoch) and
             history[:,1] (validation loss per epoch).
    yrange:  (low, high) limits for the y axis.
    figsize: figure size handed to plt.figure().
    """
    curves = (history[:,0], history[:,1])
    labels = ('train_loss', 'val_loss')
    plt.figure(figsize=figsize)
    plt.ylabel("Sentiment log loss")
    plt.xlabel("Epochs")
    for curve, label in zip(curves, labels):
        plt.plot(curve, label=label)
    plt.ylim(*yrange)
    plt.legend()
    plt.show()

In [5]:
def getvocab(strings):
    """
    Collect the distinct characters appearing in `strings` (an iterable of
    strings, or a single string) and return (vocab, ctoi): vocab is the
    sorted list of characters and ctoi maps each character to its index
    in vocab.
    """
    seen = set()
    for s in strings:
        seen.update(s)
    vocab = sorted(seen)
    ctoi = dict(zip(vocab, range(len(vocab))))
    return vocab, ctoi

In [6]:
def softmax(y):
    """
    Numerically stable softmax.

    y: 1D tensor of scores, or 2D tensor with one row of scores per sample.
    Returns a tensor of the same shape; a 1D input sums to 1 and each row
    of a 2D input sums to 1.

    The maximum score is subtracted before exponentiating.  Softmax is
    invariant to that shift, but without it torch.exp() overflows to inf
    for large scores and the result becomes NaN.
    """
    if y.numel()==0: # nothing to normalize; max() of an empty tensor is an error
        return torch.exp(y)
    if len(y.shape)==1: # 1D case has no row dimension to reduce over
        expy = torch.exp(y - torch.max(y).detach())
        return expy / torch.sum(expy)
    row_max = torch.max(y, dim=1, keepdim=True).values.detach()
    expy = torch.exp(y - row_max)
    return expy / torch.sum(expy, dim=1, keepdim=True)

def cross_entropy(y_prob, y_true):
    """
    Mean negative log-likelihood of the true classes.

    y_prob: n x k tensor for n samples and k output classes; each entry
            must be the probability that the sample belongs to that class
            (often the softmax of the final layer).
    y_true: n integer class indices in [0,k-1]; entry j selects the column
            of row j in y_prob.

    When the probability of the true class is close to 1 the per-sample
    loss is -log(1)==0; when it is close to 0 the loss is
    -log(small value) = big value.

    Raises ValueError if y_prob contains NaN, if a selected probability is
    NaN or negative, or if the resulting mean is NaN.
    """
    if torch.isnan(y_prob).any():
        raise ValueError("cross_entropy: y_prob has NaN!",y_prob)
    nsamples = y_prob.shape[0]
    # Fancy indexing: for each row j pick the probability in column y_true[j]
    true_class_prob = y_prob[range(nsamples),y_true]
    detached_prob = true_class_prob.detach()  # graph-free copy for checks and messages
    if torch.isnan(true_class_prob).any():
        raise ValueError("cross_entropy: p has NaN! p=",detached_prob,"y_prob=",y_prob)
    if (detached_prob<0).any():
        raise ValueError("cross_entropy: y_prob has negative value!:",detached_prob)
    mean_loss = (-torch.log(true_class_prob)).mean()
    if torch.isnan(mean_loss):
        raise ValueError("cross_entropy: mean is NaN! p=",detached_prob)
    return mean_loss

## Load and split into chunks

The stochastic part of SGD is critical for training models. The idea is simply to use a small subset of the data when computing gradients to update the model parameters. Generally we take a small batch size of say 32 records, run that through the model, and then compute a loss. From that loss we compute the gradient and then update the model parameters and move onto the next batch.  Once all batches are complete, we have completed an epoch.  We should shuffle the batches and keep going.

We can also be stochastic by updating the gradient in the middle of long sequences, rather than waiting until after a complete batch of long sequences.  If the sequences are really long, waiting till the end of a batch reduces the stochastic nature. Instead I'm going to try breaking up the entire input into a small number of very long sequences. In this way the RNN can keep the hidden state going for the complete sequence. Of course the only problem is that we cannot compute back propagation that far, so at some sequence length I can update the gradient and wipe it out then continue. I think this is easier than modifying the data set stride so that a standard training loop for an RNN keeps the same hidden state across long sequences even if we have broken into chunks.

Let's say that we have a large text and we break it up into six chunks: A,B,C,D,E,F. Then six is our batch size and we will process each long sequence exactly once per epoch. However, to get the stochastic nature, we will update the gradient after only a small sequence of characters.  We pick the chunk size and then the batch size is computed, instead of having to specify both. I think the chunk size is more important: how much can you store in a single hidden state vector?

Come to think of it, all we need to specify is the number of chunks we want to break the text into.  There won't be any batch size because we have a single batch with `nchunks`  long records in it.

In [7]:
# Load the whole corpus and lowercase it; the second line displays its length in characters.
text = get_text("data/obama-speeches.txt").lower() # generated from obama-sentences.py
len(text)

4224143

In [8]:
# Hyperparameters, plus trimming of the text so it splits into nchunks equal-length chunks.
text = text[0:1_000_000] # testing
n = len(text)

bptt = 8                  # only look back this many time steps for gradients
nhidden = 400             # length of the RNN hidden state vector
char_embed_sz = 20        # there are 50+ chars, squeeze down into fewer dimensions for embedding prior to input into RNN 
nchunks = 100             # break up the input into a number of chunks (doesn't have to be small like batch size)
chunk_size = n // nchunks # the sequences will be very long
n = nchunks * chunk_size  # reset size so it's an even multiple of chunk size
text = text[0:n]          # drop the leftover characters that don't fill a whole chunk

In [9]:
# text is a single string, so getvocab iterates over its characters:
# vocab is the sorted list of distinct chars, ctoi maps char -> index.
vocab, ctoi = getvocab(text)

In [10]:
# Cut the text into nchunks consecutive pieces of chunk_size characters each.
chunks = [text[p:p+chunk_size] for p in range(0, n, chunk_size)]
# Row i of X holds the char indices of chunk i without its last char; row i of y
# holds the same chunk shifted by one, so y[i,t] is the char that follows X[i,t].
X = torch.empty(nchunks, chunk_size-1, device=device, dtype=torch.long) # int8 doesn't work as indices
y = torch.empty(nchunks, chunk_size-1, device=device, dtype=torch.long)
for i,chunk in enumerate(chunks):
    X[i,:] = torch.tensor([ctoi[c] for c in chunk[0:-1]], device=device)
    y[i,:] = torch.tensor([ctoi[c] for c in chunk[1:]],   device=device)
    
# X, y are now chunked and numericalized into big 2D matrices

In [11]:
# One output class per distinct character in the vocabulary.
nclasses = len(ctoi)
print(f"{nchunks:,d} training records, chunk length {chunk_size}, vocab size {len(ctoi)}, char_embed_sz {char_embed_sz}, state is {nhidden}-vector")

100 training records, chunk length 10000, vocab size 70, char_embed_sz 20, state is 400-vector


In [12]:
# Sanity check: X has one row per chunk and chunk_size-1 columns.
X.shape, nchunks

(torch.Size([100, 9999]), 100)

In [13]:
# A single time step across all chunks is a vector with nchunks char indices.
X[:,0].shape

torch.Size([100])

In [14]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
# Model parameters. Every activation is stored column-wise: one column per chunk.
E = torch.randn(char_embed_sz, len(ctoi),     device=device, dtype=torch.float64, requires_grad=True) # embedding
W = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,       char_embed_sz, device=device, dtype=torch.float64, requires_grad=True) # input converter
# NOTE(review): B has one bias column per chunk, so the learned bias depends on the chunk index.
B = torch.zeros(nhidden,       nchunks,       device=device, dtype=torch.float64, requires_grad=True)
V = torch.randn(nclasses,      nhidden,       device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target

# if using relu, b must be 0. W must be identity so don't mess with sd. others must have low stdev
# From [Le 2015] https://arxiv.org/abs/1504.00941
# "For IRNNs, in addition to the recurrent weights being initialized at identity, the non-recurrent
#  weights are initialized with a random matrix, whose entries are sampled from a
#  Gaussian distribution with mean of zero and standard deviation of 0.001."
sd = 0.01   # weight stddev init for tanh (use 0.001 for relu)
with torch.no_grad():
    E *= sd
    U *= sd
    V *= sd
    
# gradient clipping values 
gc = {1, 10, 100, 1000}

parameters = [E,W,U,B,V]
optimizer = torch.optim.Adam(parameters, lr=0.0005, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=1) # gamma=1 keeps LR constant
# scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
#                                               mode='triangular2',
#                                               step_size_up=5,
#                                               base_lr=0.0001, max_lr=0.005,
#                                               cycle_momentum=False)

history = []
epochs = 20
for epoch in range(1, epochs+1):
#     shuffled_idx = torch.randperm(nchunks) # shuffle each epoch (don't need actually)
    # Hidden state is reset once per epoch and then carried across the whole chunk.
    H = torch.zeros(nhidden, nchunks, device=device, dtype=torch.float64, requires_grad=False)
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    loss = 0
    for t in range(chunk_size-1):  # char t in chunk predicts t+1 so one less
        chars_step_t = X[:,t] # nchunks char indices, one per chunk
        # column E[:,i] is the embedding for char index i. same as multiplying E.mm(onehot(i))
        embedding_step_t = E[:,chars_step_t] # char_embed_sz x nchunks
        H = W.mm(H) + U.mm(embedding_step_t) + B
        H = torch.tanh(H)
#         H = torch.relu(H)
        o = V.mm(H)
        o = o.T # make it nchunks x nclasses
        with torch.no_grad():
            # argmax of the logits picks the same class as argmax of softmax(o),
            # without exponentiating (which can overflow to NaN) and without
            # building an autograd graph that is never used.
            correct = torch.argmax(o, dim=1)==y[:,t]
            epoch_training_accur += torch.sum(correct)
#         loss += cross_entropy(p, y[:,t])
        # F.cross_entropy takes raw logits and averages over the nchunks samples
        loss += F.cross_entropy(o, y[:,t])
        
        # Truncated BPTT: update the parameters every bptt steps, then cut the graph.
        # NOTE(review): the first window spans t=0..bptt (bptt+1 steps), and the steps
        # after the last multiple of bptt never reach backward() or the epoch loss.
        if t % bptt == 0 and t > 0:
            optimizer.zero_grad()
            loss.backward() # autograd computes U.grad, W.grad, ...
#             torch.nn.utils.clip_grad_value_(parameters, 10)  # gradient clipping when using relu
            optimizer.step()
            epoch_training_loss += loss.detach().item()
            loss = 0
            H = H.detach() # no longer consider previous computations

    epoch_training_accur /=  nchunks * (chunk_size-1)
    # NOTE(review): each step's loss is already a mean over chunks, so this divisor
    # does not yield a per-character average; kept so printed values stay comparable.
    epoch_training_loss /= bptt * nchunks
    scheduler.step()
    
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.2f}   accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")

KeyboardInterrupt: 

In [15]:
def sample(initial_chars, n, temperature=0.1):
    """
    Generate text one character at a time from the trained RNN.
    Derived from Karpathy: https://gist.github.com/karpathy/d4dee566867f8291f086

    initial_chars: sequence of seed characters (list or str); every one must
                   be a key of ctoi. It is copied, never modified.
    n:             total length of the result, seed included. If n is not
                   larger than the seed length, the seed is returned as is.
    temperature:   accepted for interface compatibility but currently unused.

    Returns a new list of characters: the seed followed by the sampled ones.
    Uses the global model parameters E, W, U, B, V and the vocab/ctoi tables.
    """
    chars = list(initial_chars)  # copy so the caller's seed is not mutated
    nsamples = n - len(chars)
    if nsamples <= 0:
        return chars

    with torch.no_grad():
        bias = B[:,0].reshape(-1,1) # pick any bias column from training

        def advance(h, c):
            "Feed one character through the recurrent step and return the new hidden state"
            embedding = E[:,ctoi[c]].reshape(char_embed_sz,1) # col is embedding for c; must be column
            return torch.tanh(W@h + U@embedding + bias)
#             return torch.relu(W@h + U@embedding + bias)

        # The hidden state after a prefix depends only on that prefix, so it is
        # carried forward instead of being recomputed from scratch for every
        # generated character (which made generation quadratic in n).
        h = torch.zeros(nhidden, 1, dtype=torch.float64, device=device, requires_grad=False)
        for c in chars:
            h = advance(h, c)

        for i in range(nsamples):
            o = V@h
            o = o.reshape(nclasses)
            p = softmax(o)
#             wi = torch.argmax(p) # this doesn't work (just repeats 'and' a million times)
            wi = np.random.choice(range(len(vocab)), p=p.cpu()) # don't always pick most likely; pick per distribution
            chars.append(vocab[wi])
            if i < nsamples-1: # the state after the final char is never needed
                h = advance(h, vocab[wi])
    return chars

In [16]:
# Generate 300 characters in total, seeded with 'the job', and join them into one string.
''.join( sample(list('the job'), 300) ) 

'the jobostllgaeoomo rt woti h r aiettaaea tesnhues0 e eluaheiasoa\n tenac bad thof lvontsaeddo  uesyts a tlh iuistch eakiek   nrragreaee tcacodavol    prt hrne nnluheoia-rea   intem e iaehoraweuaa t  oe o su hvilrelsak ho  digv t el faeta ,im    tahsueo ciieido a ppap rikyyhrotwril ll  nnl  ppp lah u'

## Extra hidden layer for output

Tried lr=0.001 and 0.0005 and 0.0001. not as good. hmm...

In [24]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
# Model parameters. Every activation is stored column-wise: one column per chunk.
E = torch.randn(char_embed_sz, len(ctoi),     device=device, dtype=torch.float64, requires_grad=True) # embedding
W = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,       char_embed_sz, device=device, dtype=torch.float64, requires_grad=True) # input converter
# NOTE(review): B has one bias column per chunk, so the learned bias depends on the chunk index.
B = torch.zeros(nhidden,       nchunks,       device=device, dtype=torch.float64, requires_grad=True)
V = torch.randn(nclasses,      nhidden,       device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) into the extra hidden layer
V2 = torch.randn(nclasses,     nclasses,      device=device, dtype=torch.float64, requires_grad=True) # take extra hidden layer output and predict target

# if using relu, b must be 0. W must be identity so don't mess with sd. others must have low stdev
# From [Le 2015] https://arxiv.org/abs/1504.00941
# "For IRNNs, in addition to the recurrent weights being initialized at identity, the non-recurrent
#  weights are initialized with a random matrix, whose entries are sampled from a
#  Gaussian distribution with mean of zero and standard deviation of 0.001."
sd = 0.01   # weight stddev init for tanh (use 0.001 for relu)
with torch.no_grad():
    E *= sd
    U *= sd
    V *= sd
    V2 *= sd
    
# gradient clipping values 
gc = {1, 10, 100, 1000}

parameters = [E,W,U,B,V,V2]
optimizer = torch.optim.Adam(parameters, lr=0.0001, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=1) # gamma=1 keeps LR constant

history = []
epochs = 20
for epoch in range(1, epochs+1):
    # Hidden state is reset once per epoch and then carried across the whole chunk.
    H = torch.zeros(nhidden, nchunks, device=device, dtype=torch.float64, requires_grad=False)
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    loss = 0
    for t in range(chunk_size-1):  # char t in chunk predicts t+1 so one less
        chars_step_t = X[:,t] # nchunks char indices, one per chunk
        # column E[:,i] is the embedding for char index i. same as multiplying E.mm(onehot(i))
        embedding_step_t = E[:,chars_step_t] # char_embed_sz x nchunks
        H = W.mm(H) + U.mm(embedding_step_t) + B
        H = torch.tanh(H)
        o = V.mm(H)  # o is nclasses x nchunks
        o = torch.relu(o) # add usual nonlinearity for this 2nd to last layer
        o = V2.mm(o) # o is nclasses x nchunks
        o = o.T # make it nchunks x nclasses
        with torch.no_grad():
            # argmax of the logits picks the same class as argmax of softmax(o),
            # without exponentiating (which can overflow to NaN) and without
            # building an autograd graph that is never used.
            correct = torch.argmax(o, dim=1)==y[:,t]
            epoch_training_accur += torch.sum(correct)
        # F.cross_entropy takes raw logits and averages over the nchunks samples
        loss += F.cross_entropy(o, y[:,t])
        
        # Truncated BPTT: update the parameters every bptt steps, then cut the graph.
        # NOTE(review): the first window spans t=0..bptt (bptt+1 steps), and the steps
        # after the last multiple of bptt never reach backward() or the epoch loss.
        if t % bptt == 0 and t > 0:
            optimizer.zero_grad()
            loss.backward() # autograd computes U.grad, W.grad, ...
            optimizer.step()
            epoch_training_loss += loss.detach().item()
            loss = 0
            H = H.detach() # no longer consider previous computations

    epoch_training_accur /=  nchunks * (chunk_size-1)
    # NOTE(review): each step's loss is already a mean over chunks, so this divisor
    # does not yield a per-character average; kept so printed values stay comparable.
    epoch_training_loss /= bptt * nchunks
    scheduler.step()
    
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.2f}   accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")

Epoch   1 training loss    37.58   accur  0.1747   LR 0.000100
Epoch   2 training loss    34.34   accur  0.2166   LR 0.000100
Epoch   3 training loss    31.90   accur  0.2785   LR 0.000100


KeyboardInterrupt: 