# Try to generate obama speeches

Let's us paragraphs as units for which we compute $h_t$.

In [24]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [25]:
import codecs
def get_text(filename:str):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().
    """
    with codecs.open(filename, mode='r') as f:
        s = f.read()
    return s

In [26]:
def normal_transform(x, mean=0.0, std=0.01):
    "Convert x to have mean and std"
    return x*std + mean

def randn(n1, n2,          
          mean=0.0, std=0.01, requires_grad=False,
          device=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
          dtype=torch.float64):
    x = torch.randn(n1, n2, device=device, dtype=dtype)
    x = normal_transform(x, mean=mean, std=std)
    x.requires_grad=requires_grad
    return x

In [27]:
def plot_history(history, yrange=(0.0, 5.00), figsize=(3.5,3)):
    plt.figure(figsize=figsize)
    plt.ylabel("Sentiment log loss")
    plt.xlabel("Epochs")
    loss = history[:,0]
    valid_loss = history[:,1]
    plt.plot(loss, label='train_loss')
    plt.plot(valid_loss, label='val_loss')
    # plt.xlim(0, 200)
    plt.ylim(*yrange)
    plt.legend()#loc='lower right')
    plt.show()

In [28]:
def getvocab(strings):
    letters = [list(l) for l in strings]
    vocab = set([c for cl in letters for c in cl])
    vocab = sorted(list(vocab))
    ctoi = {c:i for i, c in enumerate(vocab)}
    return vocab, ctoi

In [29]:
def softmax(y):
    expy = torch.exp(y)
    if len(y.shape)==1: # 1D case can't use axis arg
        return expy / torch.sum(expy)
    return expy / torch.sum(expy, axis=1).reshape(-1,1)

def cross_entropy(y_prob, y_true):
    """
    y_pred is n x k for n samples and k output classes and y_true is n x 1
    and is often softmax of final layer.
    y_pred values must be probability that output is a specific class.
    Binary case: When we have y_pred close to 1 and y_true is 1,
    loss is -1*log(1)==0. If y_pred close to 0 and y_true is 1, loss is
    -1*log(small value) = big value.
    y_true values must be positive integers in [0,k-1].
    """
    n = y_prob.shape[0]
    # Get value at y_true[j] for each sample with fancy indexing
#     print(range(n), y_true)
    p = y_prob[range(n),y_true]
    return torch.mean(-torch.log(p))

In [30]:
def onehot(c) -> torch.tensor:
    v = torch.zeros((len(vocab),1), dtype=torch.float64)
    v[ctoi[c]] = 1
    return v

def get_max_len(X):
    max_len = 0
    for x in X:
        max_len = max(max_len, len(x))
    return max_len

def onehot_matrix(X, ctoi):
    X_onehot = torch.zeros(len(X), len(X[0]), len(ctoi), dtype=torch.float64)
    for i,x in enumerate(X):
        for j,c in enumerate(x):
            X_onehot[i,j,c] = 1
    return X_onehot

## Load

In [31]:
#text = get_text("data/obama-sentences.txt").lower() # generated from obama-sentences.py
#sentences = text.split('\n') # split on blank lines

In [32]:
# Testing
#X_train = sentences[0:500]

In [33]:
#X_train = [list(line) for line in X_train if len(line)>=10] # get list of char lists with at least 10 char

## Split into chunks

Rather than deal with var-length chunks, cat whole thing and split into chunks of length 64. Then take `batch_size` of those at a time for vectorization.  I'm going to reset h to 0 for each chunk, even though that's not right.  It simplifies grouping for vectorization.  If input has chunks A,B,C,D,E,F,G,H,I and I use batch_size 3, then ABC, DEF, GHI would be the things done in batches. Let's see what it does.

In [34]:
text = get_text("data/obama-sentences.txt").lower() # generated from obama-sentences.py
sentences = text.split('\n') # split on blank lines
len(sentences)

9259

In [35]:
sentences = sentences[0:10_000] # testing
text = ' '.join(sentences)

chunk_size = 100 # fastai calls this sequence length? It's same as truncated backprop duration
n = len(text)
nchunks = n // chunk_size
n = nchunks * chunk_size
text = text[0:n]

In [36]:
vocab, ctoi = getvocab(text)

In [37]:
chunks = [text[p:p+chunk_size] for p in range(0, n, chunk_size)]
X = torch.empty(nchunks, chunk_size-1, dtype=torch.long) # int8 doesn't work as indices
y = torch.empty(nchunks, chunk_size-1, dtype=torch.long)
for i,chunk in enumerate(chunks):
    X[i,:] = torch.tensor([ctoi[c] for c in chunk[0:-1]])
    y[i,:] = torch.tensor([ctoi[c] for c in chunk[1:]])
    
# X, y are now chunked and numericalized into big 2D matrices

In [38]:
nhidden = 512
nfeatures = len(vocab)
nclasses = nfeatures
batch_size = 32 # divide evenly into nchunks
n = len(X) # how many chunks?
nbatches = n // batch_size
n = nbatches * batch_size
X = X[0:n]
#max_len = get_max_len(X)

print(f"{n:,d} training records, {nfeatures} features (chars), state is {nhidden}-vector")

9,952 training records, 69 features (chars), state is 512-vector


In [39]:
def forward(x):
    loss = 0.0
    outputs = []
    h = torch.zeros(nhidden, 1, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
    for j in range(len(x)):  # for each char in a name
        h = W@h + U@onehot(x[j])
        h = torch.tanh(h)
        o = V@h
        o = o.reshape(1,nclasses)
        o = softmax(o)
        outputs.append( o[0] ) 
    return torch.stack(outputs)

def forwardN(X:Sequence[Sequence]):#, apply_softmax=True):
    "Cut-n-paste from body of training for use with metrics"
    outputs = []
    for i in range(0, len(X)): # for each input record
        o = forward1(X[i])
        outputs.append( o[0] ) 
    return torch.stack(outputs)

In [40]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
W = torch.eye(nhidden,    nhidden,   dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,  nfeatures, dtype=torch.float64, requires_grad=True) # embed one-hot char vec
V = torch.randn(nclasses, nhidden,   dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target

optimizer = torch.optim.Adam([W,U,V], lr=0.001, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=1)

history = []
epochs = 60
for epoch in range(1, epochs+1):
#     print(f"EPOCH {epoch}, LR={scheduler.get_last_lr()}")
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    for p in range(0, n, batch_size):  # do one epoch
        loss = 0
        batch_X = X[p:p+batch_size]
        batch_y = y[p:p+batch_size]
        batch_X_onehot = onehot_matrix(batch_X, ctoi)
        H = torch.zeros(nhidden, batch_size, dtype=torch.float64, requires_grad=False)
        for t in range(chunk_size-1):  # char i in chunk predicts i+1 so one less
            x_step_t = batch_X_onehot[:,t].T # make it len(vocab) x batch_size
            H = W.mm(H) + U.mm(x_step_t)
            H = torch.tanh(H)
            o = V.mm(H)
            o = o.T # make it batch_size x nclasses
            o = softmax(o)
#             print(o.shape, batch_y[:,t].shape)
            loss += cross_entropy(o, batch_y[:,t])
#         print(loss.item())
        correct = torch.argmax(o, dim=1)==batch_y[:,t]
        epoch_training_accur += torch.sum(correct)
        # update matrices based upon loss computed from a batch
        optimizer.zero_grad()
        loss.backward() # autograd computes U.grad, M.grad, ...
        optimizer.step()

#         print(loss.detach().item())
        epoch_training_loss += loss.detach().item()

    scheduler.step()
    epoch_training_loss /= n
    epoch_training_accur /= n
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:7.4f}   accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")

Epoch   1 training loss 13.0118   accur  0.3099   LR 0.001000
Epoch   2 training loss  7.5045   accur  0.3848   LR 0.001000
Epoch   3 training loss  6.7805   accur  0.4097   LR 0.001000
Epoch   4 training loss  6.3726   accur  0.4287   LR 0.001000
Epoch   5 training loss  6.0980   accur  0.4417   LR 0.001000
Epoch   6 training loss  5.8995   accur  0.4535   LR 0.001000
Epoch   7 training loss  5.7440   accur  0.4605   LR 0.001000
Epoch   8 training loss  5.6192   accur  0.4672   LR 0.001000
Epoch   9 training loss  5.5122   accur  0.4705   LR 0.001000
Epoch  10 training loss  5.4177   accur  0.4813   LR 0.001000
Epoch  11 training loss  5.3360   accur  0.4873   LR 0.001000
Epoch  12 training loss  5.2696   accur  0.4930   LR 0.001000
Epoch  13 training loss  5.2086   accur  0.4995   LR 0.001000
Epoch  14 training loss  5.1553   accur  0.5032   LR 0.001000
Epoch  15 training loss  5.1072   accur  0.5069   LR 0.001000
Epoch  16 training loss  5.0644   accur  0.5109   LR 0.001000
Epoch  1

In [43]:
def sample(initial_chars, n, temperature=0.1):
    "Derived from Karpathy: https://gist.github.com/karpathy/d4dee566867f8291f086"
    chars = initial_chars
    n -= len(initial_chars)
    with torch.no_grad():
        for i in range(n):
            h = torch.zeros(nhidden, 1, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
            for j in range(len(chars)):  # for each char in a name
                h = W@h + U@onehot(chars[j])
                h = torch.tanh(h)
            o = V@h
            o = o.reshape(nclasses)
            p = softmax(o)
#             wi = torch.argmax(p) # this doesn't work (just repeats 'and' a million times)
            wi = np.random.choice(range(len(vocab)), p=p) # don't always pick most likely; pick per distribution
            chars.append(vocab[wi])
    return chars

In [44]:
''.join( sample(list('walter'), 500) ) 

"walters, recession is hoperar in long. some bound of find it. i factury, 11, whether is make office. you've ming too. thank you.. you know there helovers a workers are 9/11 process and de8 mcceveders hitism so that an americation slaviter is objected of our own and affirm in though of organization legacy, loved most is family. it one took for of the lost of heril affordable of iraq you.....not a -- staying opportunal rightwied can day is, a worldrets ammerica. now, thank that’s revansal providin"