# Text generation with an LSTM and Keras

Redo the token-level experiment with characters instead of tokens.  Also, step by 3 through the characters when extracting windows (this was not done for tokens and might make a big difference, so go back and try it there too).

In [1]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from typing import Sequence
from sklearn.model_selection import train_test_split

import tensorflow_addons as tfa
from keras.datasets import mnist
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import models, layers, callbacks, optimizers, Sequential, losses
import tqdm
from tqdm.keras import TqdmCallback

def get_text(filename:str) -> str:
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().

    The file is opened in a `with` block so the handle is closed even when
    read() raises.
    """
    with codecs.open(filename, encoding='latin-1', mode='r') as f:
        return f.read()

def compress_whitespace(s):
    """Replace every run of whitespace (spaces, tabs, newlines) in s with one space."""
    whitespace_run = re.compile(r"(\s+)")
    return whitespace_run.sub(' ', s)

Using TensorFlow backend.


## Load corpus

Let's use [Alexander Hamilton's federalist papers 1-10](https://guides.loc.gov/federalist-papers/text-1-10#s-lg-box-wrapper-25493264) as our corpus.

Try with https://s3.amazonaws.com/text-datasets/nietzsche.txt which is 6x bigger.

Ah. also lowercase it to be like keras book and reduce target space.

In [2]:
# text = get_text("data/federalist-papers.txt")
# Read the corpus, lowercase it, then collapse each whitespace run to one space.
text = compress_whitespace(get_text("data/nietzsche.txt").lower())
text[:300]

'preface supposing that truth is a woman--what then? is there not ground for suspecting that all philosophers, in so far as they have been dogmatists, have failed to understand women--that the terrible seriousness and clumsy importunity with which they have usually paid their addresses to truth, have'

In [3]:
# TESTING
#text = text[:1000]

In [4]:
tokens = list(text)

## Get vocab and get X, y 

In [5]:
V = sorted(set(tokens))
len(V)

58

In [6]:
V[0:15]

[' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5']

In [7]:
# Map each character of the sorted vocabulary V to its position in V.
index = dict(zip(V, range(len(V))))

def ctoi(c):
    """Return the integer code of character c (KeyError if c is not in the vocabulary)."""
    return index[c]

In [8]:
k = 60     # window width: number of preceding chars used to predict the next one
step = 1   # stride between the start of consecutive windows
# Each record pairs a k-char window with the char that follows it.  The pair is
# ragged (a length-k array plus a single str), so the outer array must be given
# dtype=object explicitly; NumPy >= 1.24 raises ValueError otherwise.
Xy = [np.array((np.array(tokens[i-k:i],dtype=object),tokens[i]), dtype=object)
      for i in range(k,len(tokens)-1,step)]

In [9]:
Xy[:5]

[array([array(['p', 'r', 'e', 'f', 'a', 'c', 'e', ' ', 's', 'u', 'p', 'p', 'o',
        's', 'i', 'n', 'g', ' ', 't', 'h', 'a', 't', ' ', 't', 'r', 'u',
        't', 'h', ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n',
        '-', '-', 'w', 'h', 'a', 't', ' ', 't', 'h', 'e', 'n', '?', ' ',
        'i', 's', ' ', 't', 'h', 'e', 'r', 'e'], dtype=object),
        ' '], dtype=object),
 array([array(['r', 'e', 'f', 'a', 'c', 'e', ' ', 's', 'u', 'p', 'p', 'o', 's',
        'i', 'n', 'g', ' ', 't', 'h', 'a', 't', ' ', 't', 'r', 'u', 't',
        'h', ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n', '-',
        '-', 'w', 'h', 'a', 't', ' ', 't', 'h', 'e', 'n', '?', ' ', 'i',
        's', ' ', 't', 'h', 'e', 'r', 'e', ' '], dtype=object),
        'n'], dtype=object),
 array([array(['e', 'f', 'a', 'c', 'e', ' ', 's', 'u', 'p', 'p', 'o', 's', 'i',
        'n', 'g', ' ', 't', 'h', 'a', 't', ' ', 't', 'r', 'u', 't', 'h',
        ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n', 

In [10]:
Xy = np.array(Xy)

In [11]:
X, y = Xy[:,0], Xy[:,1]

In [12]:
X = np.vstack(X)
X[0:2]

array([['p', 'r', 'e', 'f', 'a', 'c', 'e', ' ', 's', 'u', 'p', 'p', 'o',
        's', 'i', 'n', 'g', ' ', 't', 'h', 'a', 't', ' ', 't', 'r', 'u',
        't', 'h', ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n',
        '-', '-', 'w', 'h', 'a', 't', ' ', 't', 'h', 'e', 'n', '?', ' ',
        'i', 's', ' ', 't', 'h', 'e', 'r', 'e'],
       ['r', 'e', 'f', 'a', 'c', 'e', ' ', 's', 'u', 'p', 'p', 'o', 's',
        'i', 'n', 'g', ' ', 't', 'h', 'a', 't', ' ', 't', 'r', 'u', 't',
        'h', ' ', 'i', 's', ' ', 'a', ' ', 'w', 'o', 'm', 'a', 'n', '-',
        '-', 'w', 'h', 'a', 't', ' ', 't', 'h', 'e', 'n', '?', ' ', 'i',
        's', ' ', 't', 'h', 'e', 'r', 'e', ' ']], dtype=object)

## Label encode tokens in X, y

In [13]:
# Vectorize ctoi so it converts whole arrays of chars to their integer codes.
encode = np.vectorize(ctoi)
X, y = encode(X), encode(y)

In [14]:
targets = np.unique(y)   # not every word in V will be in target classes (words)

In [15]:
X.shape, y.shape

((598808, 60), (598808,))

In [16]:
X[0]

array([41, 43, 30, 31, 26, 28, 30,  0, 44, 46, 41, 41, 40, 44, 34, 39, 32,
        0, 45, 33, 26, 45,  0, 45, 43, 46, 45, 33,  0, 34, 44,  0, 26,  0,
       48, 40, 38, 26, 39,  7,  7, 48, 33, 26, 45,  0, 45, 33, 30, 39, 22,
        0, 34, 44,  0, 45, 33, 30, 43, 30])

Convert X to shape (num sequences, window width k, len(V))

In [17]:
y.shape, len(V), len(targets)

((598808,), 58, 58)

In [18]:
y = pd.get_dummies(y)
y.shape

(598808, 58)

## One hot the tokens (optionally)

In [19]:
do_onehot = True
#do_onehot = False

In [20]:
def onehot(X, vocab_size=None):
    """
    One-hot encode a 2D array of integer character codes.

    X is expected to be (num records, window width) of ints in
    [0, vocab_size).  Returns a bool array of shape
    (num records, window width, vocab_size) with exactly one True per
    (record, position).

    vocab_size defaults to len(V), the notebook's vocabulary size, so existing
    onehot(X) calls behave as before.  The window width is taken from X itself.
    Uses the builtin bool dtype because the np.bool alias was removed in
    NumPy 1.24.
    """
    X = np.asarray(X)
    if vocab_size is None:
        vocab_size = len(V)
    n, width = X.shape
    X_onehot = np.zeros((n, width, vocab_size), dtype=bool)
    # One vectorised assignment replaces the per-record/per-char Python loops:
    # for every (record i, position j) set the slot chosen by X[i, j].
    rows = np.arange(n)[:, None]
    cols = np.arange(width)[None, :]
    X_onehot[rows, cols, X] = True
    return X_onehot

In [21]:
if do_onehot:
    X = onehot(X)

## Train

In [22]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20)

In [23]:
# Next-char classifier: an LSTM over the k-char window followed by a softmax
# over the target classes.
model = Sequential()
if do_onehot:
    # Must one hot X as num records x k x len(V)
    model.add(layers.LSTM(units=128, input_shape=(k,len(V))))
else:
    # If you don't want to onehot, you can leave X as 2D num records x k.
    model.add(layers.Embedding(input_dim=len(V), output_dim=10, input_length=k))
    # No input_shape here: the LSTM takes its input shape from the Embedding
    # output (k x 10), so the previous input_shape=(k,1) did not describe it.
    model.add(layers.LSTM(units=128))
# model.add(layers.Dropout(0.4))
#model.add(layers.BatchNormalization())
model.add(layers.Dense(len(targets), activation='softmax'))
#model.add(layers.Lambda(lambda x: tf.cast(K.argmax(x, axis=-1),dtype=float)))

# opt = optimizers.Adam(learning_rate=0.001)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
opt = optimizers.RMSprop(learning_rate=0.01) # keras book uses this

model.compile(loss=losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])
#model.summary()

In [24]:
def myfit(epochs, batch_size=1, verbose=0):
    """
    Train the notebook-level `model` on (X_train, y_train) for `epochs` more
    epochs, validating on (X_valid, y_valid), and return the History object
    from model.fit() so callers can inspect the loss/accuracy curves.

    Repeated calls continue training the same model.  Note that the default
    batch_size=1 means one gradient step per training record.
    """
    history = model.fit(X_train, y_train,
                        shuffle=True,
                        epochs=epochs,
                        validation_data=(X_valid, y_valid),
                        batch_size=batch_size,
                        verbose=verbose
#                         , callbacks=[tfa.callbacks.TQDMProgressBar(show_epoch_progress=True)]
                        )
    return history

In [25]:
myfit(19, verbose=1)

Epoch 1/19


UnknownError:  [_Derived_]  Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[sequential/lstm/StatefulPartitionedCall]] [Op:__inference_train_function_2664]

Function call stack:
train_function -> train_function -> train_function


## Generate

In [None]:
# From Deep Learning with Python by François Chollet
# Gets a single int target class from a distribution described by probabilities
# (from softmax) in probs.  The temperature adds noise where temperature=0 means
# pick most likely always.
def sample(probs, temperature=1.0):
    """
    Draw a single int class index from the distribution in `probs`
    (e.g. a softmax output), reweighted by `temperature`.

    temperature=1 samples from probs as given; lower values sharpen the
    distribution toward the most likely class.  temperature <= 0 returns the
    most likely class directly, since dividing by it is undefined.
    """
    probs = np.asarray(probs).astype('float64')
    if temperature <= 0:
        return np.argmax(probs)
    # log(0) = -inf is fine here (it becomes probability 0 again after exp),
    # so silence the divide warning instead of letting it clutter the output.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # Shift so the largest logit is 0.  Without this, a small temperature makes
    # every exp() underflow to 0 and the normalisation below becomes 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_probs = np.exp(logits)
    probs = exp_probs / np.sum(exp_probs)
    probs = np.random.multinomial(1, probs, 1)
    return np.argmax(probs)

Seed the text with k characters

In [None]:
# Pick a random run of k consecutive chars from the corpus to use as the seed,
# show it, and keep its integer-encoded form for feeding the model.
start = np.random.randint(0, len(tokens) - k - 1)
generated_words = tokens[start: start + k]
print(''.join(generated_words))
generated_tokens = list(map(ctoi, generated_words))
generated_tokens[:10]

In [None]:
# Alternate one epoch of training with generating 400 chars from the seed, so
# the quality of the generated text can be watched as training progresses.
for epochs in range(1,40):
    print(f"-------- {epochs} epochs --------------------------------")
    myfit(epochs=1, verbose=1) # fits one iteration
    print('-'.join(generated_words), end=' ') # same seed
    # Work on a copy so every epoch really does start from the printed seed;
    # sliding generated_tokens itself would carry the previous epoch's output
    # over as the context for this one.
    window = list(generated_tokens)
    for i in range(400):
        if do_onehot:
            # Named x_onehot (not onehot) so the onehot() function is not
            # overwritten; builtin bool because np.bool was removed from NumPy.
            x_onehot = np.zeros((1,k,len(V)), dtype=bool)
            x_onehot[0, np.arange(k), window] = True
            X1 = x_onehot
        else:
            X1 = np.array(window).reshape(1,k)
        y_prob = model.predict(X1, verbose=0)[0]
        # The model's outputs are indexed by position in `targets`; map the
        # sampled class back to a vocabulary index before decoding/feeding it.
        next_token = targets[sample(y_prob, temperature=0.5)]
        print(V[next_token], end='')
        # Slide the context window forward by one char.
        window.append(next_token)
        window = window[1:]
    print()

## Notes:

* Need a lot of data: it started working well with the Nietzsche corpus (6x bigger), not with the Federalist Papers.
* hmm...step seems to be just an efficiency issue
* What about batch size vs. max length? It seems we have to line up sentences so they continue across batches, unless the hidden state h is reset for each batch. The fastai book's LMModel3 initializes h in `__init__` rather than `forward`, but then uses truncated backprop (of length equal to the sequence length k). It also then has to line up the batches.
* What is the effect of one-hot vs. an embedding layer? With the same setup but with len(V)-sized embeddings for chars going into the LSTM rather than one-hot vectors, I got weird divide-by-zero errors and validation accuracy maxed out at .49 with loss 2.0, whereas with no embedding before the LSTM I got validation accuracy .56 and loss 1.59.  Maybe a function of embedding size? `layers.Embedding(input_dim=len(V), output_dim=len(V), input_length=k)`

W/o embeddings at about epoch 60:

```
1248/1248 [==============================] - 15s 12ms/step - loss: 1.1914 - accuracy: 0.6397 - val_loss: 1.5969 - val_accuracy: 0.5601
r-d-e-r-)-,- -r-e-l-i-g-i-o-n- -i-t-s-e-l-f- -m-a-y- -b-e- -u-s-e-d- -a-s- -a- -m-e-a-n-s- -f-o-r- -o-b-t-a-i-n-i-n-g-  nce and simultage perseined to do a desire, that he understand of the best to the world of the contemplation of the so and at the desiress and strength, and accuiration to from the his esseced to such as a stronger man and worst of the soul in a soully of the best to cause the recognized in the sense of any constant their literal, and so much man of the problems to the self-explained by the sight 
```

With embeddings:

```
1248/1248 [==============================] - 17s 14ms/step - loss: 1.8657 - accuracy: 0.5002 - val_loss: 2.0086 - val_accuracy: 0.4913
a-t-e- -o-f- -h-i-s- -s-o-u-l-,- -h-e- -w-i-s-h-e-d- -t-o- -b-e- -d-o-u-b-t-f-u-l- -o-f- -h-i-s- -o-w-n- -c-a-p-a-c-i-t e bei
dency of who the is a pain of world and the present the regariss. The now to constinh-all alon a not or the possible and the powerful maken usfections of the under skecoflune and the makes of the sociement: in the to the greates all all the laid the should respection to a very the subject and that all the repxing the world of the sothing in the because bet the being bess that really of the ma
```

which looks much worse.

Accuracy is higher for char than for tokens likely due to much larger token space than char space.