# Generate Obama speeches using stacked RNNs

Uses truncated backpropagation through time and a learned character-embedding layer, instead of one-hot encoding, as the input to the RNN.

Lessons:



In [1]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [2]:
import codecs
def get_text(filename:str, encoding=None):
    """
    Load and return the entire contents of a text file as one string.

    filename: path of the file to read.
    encoding: text encoding used to decode the file, e.g. 'latin-1' or
              'utf-8'.  The default, None, uses the platform's default
              encoding, which is what this function did before the
              parameter existed.

    Raises whatever open() raises (e.g. FileNotFoundError) and
    UnicodeDecodeError if the bytes are not valid in the chosen encoding.
    """
    # The built-in open() is what codecs.open() delegates to when no encoding
    # is given, so the default behavior is unchanged.
    with open(filename, mode='r', encoding=encoding) as f:
        return f.read()

In [3]:
def getvocab(strings):
    """
    Build the character vocabulary of `strings` (a string or an iterable of
    strings).

    Returns (vocab, ctoi): vocab is the sorted list of distinct characters
    and ctoi maps each character to its position in vocab.
    """
    distinct = {ch for piece in strings for ch in piece}
    vocab = sorted(distinct)
    ctoi = dict(zip(vocab, range(len(vocab))))
    return vocab, ctoi

In [4]:
def softmax(y):
    """
    Numerically stable softmax.

    y: 1D tensor of scores, or a 2D tensor with one row of scores per record.
    Returns a tensor of the same shape; a 1D input is normalized over the
    whole vector, a 2D input is normalized independently along each row.

    The maximum score is subtracted before exponentiating.  This leaves the
    result mathematically unchanged but keeps torch.exp() from overflowing
    to inf (and the ratio from becoming nan) when scores are large.
    """
    if y.numel() == 0:  # nothing to normalize; max() is undefined on empty input
        return torch.exp(y)
    if len(y.shape)==1: # 1D case can't use dim arg
        expy = torch.exp(y - torch.max(y))
        return expy / torch.sum(expy)
    row_max = torch.max(y, dim=1, keepdim=True).values
    expy = torch.exp(y - row_max)
    return expy / torch.sum(expy, dim=1, keepdim=True)

## Load and split into chunks

In [5]:
# Read the whole corpus (generated from obama-sentences.py) and lowercase it,
# presumably to keep the character vocabulary small.
text = get_text("data/obama-speeches.txt").lower()
len(text)  # displayed as the cell output: number of characters loaded

4224143

In [6]:
# --- hyperparameters ---
bptt = 8            # truncated backprop: gradients only flow back this many time steps
nhidden = 400       # size of each RNN layer's hidden state
char_embed_sz = 20  # 50+ distinct chars get squeezed into this many embedding dimensions before the RNN
nchunks = 100       # number of chunks the text is cut into (need not be small like a batch size)

# --- corpus size ---
text = text[:1_000_000]               # testing: only use a prefix of the corpus
chunk_size = len(text) // nchunks     # each chunk is one very long sequence
n = nchunks * chunk_size              # usable length: an exact multiple of chunk_size
text = text[:n]                       # drop the leftover characters

In [7]:
# vocab: sorted list of the distinct characters in text; ctoi: char -> index into vocab
vocab, ctoi = getvocab(text)

In [8]:
# Cut the text into nchunks equal-length pieces and turn every character into
# its vocabulary index in one go.
chunks = [text[start:start+chunk_size] for start in range(0, n, chunk_size)]
encoded = torch.tensor([[ctoi[c] for c in piece] for piece in chunks],
                       dtype=torch.long, device=device) # long, since int8 doesn't work as indices

# Inputs are every char but the last of a chunk; targets are the same chars
# shifted left by one, so X[i,t] is used to predict y[i,t].
X = encoded[:, :-1].contiguous()
y = encoded[:, 1:].contiguous()

# X, y are now chunked and numericalized into big 2D matrices of shape nchunks x (chunk_size-1)

In [9]:
# One output class per distinct character in the vocabulary.
nclasses = len(ctoi)
# Summarize the data layout and model sizes before training.
print(f"{nchunks:,d} training records, chunk length {chunk_size}, vocab size {len(ctoi)}, char_embed_sz {char_embed_sz}, state is {nhidden}-vector")

100 training records, chunk length 10000, vocab size 70, char_embed_sz 20, state is 400-vector


In [10]:
# Sanity check: X was allocated as nchunks x (chunk_size-1)
X.shape, nchunks

(torch.Size([100, 9999]), 100)

In [12]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
# Trainable parameters of a two-layer (stacked) RNN, kept as raw tensors.
# As used by the training loop in this cell:
#   layer 1: H  = tanh(W.H   + U.E[:,chars] + B),  o  = V.H
#   layer 2: H2 = tanh(W2.H2 + U2.o),              o2 = V2.H2  (class scores)
E = torch.randn(char_embed_sz, len(ctoi),     device=device, dtype=torch.float64, requires_grad=True) # embedding; column i is the vector for char index i
W = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True) # layer-1 recurrent weights, identity init
U = torch.randn(nhidden,       char_embed_sz, device=device, dtype=torch.float64, requires_grad=True) # input converter: embedding -> layer-1 hidden size
B = torch.zeros(nhidden,       nchunks,       device=device, dtype=torch.float64, requires_grad=True) # layer-1 bias; NOTE(review): one column per chunk, so its shape is tied to nchunks
V = torch.randn(nhidden,       nhidden,       device=device, dtype=torch.float64, requires_grad=True) # maps layer-1 state (H) to the input of layer 2; nhidden x nhidden, not class scores

W2 = torch.eye(nhidden,        nhidden,       device=device, dtype=torch.float64, requires_grad=True) # layer-2 recurrent weights, identity init
U2 = torch.randn(nhidden,      nhidden,       device=device, dtype=torch.float64, requires_grad=True) # input converter: layer-1 output -> layer-2 hidden size
V2 = torch.randn(nclasses,     nhidden,       device=device, dtype=torch.float64, requires_grad=True) # take layer-2 state (H2) and predict target class scores

# if using relu, b must be 0. W must be identity so don't mess with sd. others must have low stdev
# From [Le 2015] https://arxiv.org/abs/1504.00941
# "For IRNNs, in addition to the recurrent weights being initialized at identity, the non-recurrent
#  weights are initialized with a random matrix, whose entries are sampled from a
#  Gaussian distribution with mean of zero and standard deviation of 0.001."
# NOTE: the assignments below overwrite each other; only the last one (sd = 1.0)
# takes effect, so the scaling under no_grad() currently leaves the weights unchanged.
sd = 0.001  # weight stddev init for relu
sd = 0.01   # weight stddev init for tanh
sd = 0.01
sd = 1.0
with torch.no_grad():  # rescale the randn-initialized matrices in place without recording the op for autograd
    E *= sd
    U *= sd
    V *= sd
    U2 *= sd
    V2 *= sd
    
parameters = [E,W,U,B,V,W2,U2,V2]  # every tensor the optimizer updates
optimizer = torch.optim.Adam(parameters, lr=0.0005, weight_decay=0.0)
# gamma=1 multiplies the LR by 1 every step_size epochs, i.e. the LR stays constant
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=1)
# Alternative schedule, currently disabled:
# scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
#                                               mode='triangular2',
#                                               step_size_up=4,
#                                               base_lr=0.0001, max_lr=0.001,
#                                               cycle_momentum=False)

# Train the stacked RNN with truncated backpropagation through time: all chunks
# advance one character per step, and the weights are updated every bptt steps.
history = []  # NOTE(review): never appended to in this cell
epochs = 70
for epoch in range(1, epochs+1):
    # Hidden state of layer 1, one column per chunk; reset at the start of each epoch
    H = torch.zeros(nhidden, nchunks, device=device, dtype=torch.float64, requires_grad=False)
    # 2nd layer of RNN
    H2 = torch.zeros(nhidden, nchunks, device=device, dtype=torch.float64, requires_grad=False)
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    loss = 0  # loss accumulated over the current truncation window
    for t in range(chunk_size-1):  # char t in chunk predicts t+1 so one less
        chars_step_t = X[:,t] # vector of nchunks char indices: the t-th char of every chunk
        # column E[:,i] is the embedding for char index i. same as multiplying E.mm(onehot(i))
        embedding_step_t = E[:,chars_step_t] # char_embed_sz x nchunks
#         print(embedding_step_t.shape, E.shape, H.shape, W.shape, U.shape)
        H = W.mm(H) + U.mm(embedding_step_t) + B
        H = torch.tanh(H)
        o = V.mm(H) # o is nhidden x nchunks; it is the input to layer 2

        H2 = W2.mm(H2) + U2.mm(o)# + B2 (layer 2 has no bias term)
        H2 = torch.tanh(H2)

        o2 = V2.mm(H2) # class scores, nclasses x nchunks
        o2 = o2.T # make it nchunks x nclasses
        p = softmax(o2) # probabilities are only used for the accuracy count below
        correct = torch.argmax(p, dim=1)==y[:,t]
        epoch_training_accur += torch.sum(correct)
        loss += F.cross_entropy(o2, y[:,t]) # takes the raw scores o2, not p
        
        # Update the weights every bptt steps.
        # NOTE(review): the first window spans t=0..bptt (bptt+1 steps), and the steps
        # after the last multiple of bptt in a chunk are never backpropagated nor
        # added to epoch_training_loss.
        if t % bptt == 0 and t > 0:
#             print(f"gradient at {t:4d}, loss {loss.item():7.4f}")
            optimizer.zero_grad()
            loss.backward() # autograd fills .grad for the tensors in `parameters`
            torch.nn.utils.clip_grad_value_(parameters, 10) # clip gradient values at 10 to limit exploding gradients
            optimizer.step()
            epoch_training_loss += loss.detach().item()
            loss = 0
            H = H.detach() # no longer consider previous computations
            H2 = H2.detach()

    epoch_training_accur /=  nchunks * (chunk_size-1) # fraction of all predictions that were correct
    # NOTE(review): this is the sum of the per-window losses divided by bptt,
    # not an average loss per character.
    epoch_training_loss /= bptt
    scheduler.step()
    
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.2f}   accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")

Epoch   1 training loss 10875.16   accur  0.2790   LR 0.000500
Epoch   2 training loss  5536.21   accur  0.3421   LR 0.000500
Epoch   3 training loss  4709.78   accur  0.3594   LR 0.000500
Epoch   4 training loss  4250.26   accur  0.3698   LR 0.000500
Epoch   5 training loss  3933.75   accur  0.3797   LR 0.000500
Epoch   6 training loss  3809.86   accur  0.3791   LR 0.000500
Epoch   7 training loss  3637.35   accur  0.3833   LR 0.000500
Epoch   8 training loss  3481.49   accur  0.3894   LR 0.000500
Epoch   9 training loss  3374.83   accur  0.3916   LR 0.000500
Epoch  10 training loss  3283.12   accur  0.3937   LR 0.000500
Epoch  11 training loss  3189.52   accur  0.3975   LR 0.000500
Epoch  12 training loss  3093.20   accur  0.4025   LR 0.000500
Epoch  13 training loss  3044.85   accur  0.4019   LR 0.000500
Epoch  14 training loss  2942.67   accur  0.4083   LR 0.000500
Epoch  15 training loss  2886.24   accur  0.4118   LR 0.000500
Epoch  16 training loss  2860.62   accur  0.4150   LR 0

In [None]:
def sample(initial_chars, n, temperature=0.1):
    """
    Generate text one character at a time from the trained two-layer RNN.
    Derived from Karpathy: https://gist.github.com/karpathy/d4dee566867f8291f086

    initial_chars: sequence of seed characters; every one must be in ctoi.
    n:             total number of characters to return, seed included.  If
                   the seed already has n or more characters, nothing is added.
    temperature:   accepted for interface compatibility; currently not applied.

    Returns a new list of characters that starts with the seed.  The caller's
    sequence is not modified.

    Both RNN layers are run, mirroring the training loop: the layer-1 state is
    passed through V into layer 2, and the class scores come from V2.
    """
    chars = list(initial_chars)      # copy so the caller's sequence is left alone
    n -= len(chars)
    bias = B[:,0].reshape(-1,1)      # B holds one bias column per training chunk; pick any of them

    def step(c, h, h2):
        "Advance both layers' hidden states by one character."
        embedding = E[:,ctoi[c]].reshape(char_embed_sz,1) # col is embedding for c; must be column
        h = torch.tanh(W@h + U@embedding + bias)
        h2 = torch.tanh(W2@h2 + U2@(V@h))
        return h, h2

    with torch.no_grad():
        # Start from zero state and feed the seed once.  The state is then carried
        # forward, which gives the same state as replaying the whole sequence from
        # scratch for every generated character.
        h  = torch.zeros(nhidden, 1, dtype=torch.float64, device=device, requires_grad=False)
        h2 = torch.zeros(nhidden, 1, dtype=torch.float64, device=device, requires_grad=False)
        for c in chars:
            h, h2 = step(c, h, h2)

        for _ in range(n):
            o2 = (V2@h2).reshape(nclasses)
            p = softmax(o2)
            # don't always pick most likely; pick per distribution
            # (argmax just repeats 'and' a million times)
            wi = np.random.choice(len(vocab), p=p.cpu().numpy())
            chars.append(vocab[wi])
            h, h2 = step(vocab[wi], h, h2)
    return chars

In [None]:
# Seed the model with 'the job' and ask for 300 characters in total (seed included).
''.join( sample(list('the job'), 300) ) 