# Text generation with an LSTM and Keras

In [115]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from typing import Sequence
from sklearn.model_selection import train_test_split

import tensorflow_addons as tfa
from keras.datasets import mnist
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import models, layers, callbacks, optimizers, Sequential, losses
import tqdm
from tqdm.keras import TqdmCallback

def get_text(filename:str):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().
    """
    f = codecs.open(filename, encoding='latin-1', mode='r')
    s = f.read()
    f.close()
    return s

def words(text:str):
    """
    Given a string, return a list of words normalized as follows.
    Split the string to make words first by using regex compile() function
    and string.punctuation + '0-9\\r\\t\\n]' to replace all those
    char with a space character.
    Split on space to get word list.
    Ignore words < 3 char long.
    Lowercase all words
    Remove English stop words
    """
    ctrl_chars = '\x00-\x1f'
    regex = re.compile(r'[' + ctrl_chars + string.punctuation + '\r\t\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words = nopunct.split(" ")
    words = [w for w in words if len(w) > 0]
    words = [w.lower() for w in words]
    return words

def compress_whitespace(s): # collapse things like "\n   \t  " with " "
    return re.sub(r"(\s+)", ' ', s)

## Load corpus

Let's use [Alexander Hamilton's federalist papers 1-10](https://guides.loc.gov/federalist-papers/text-1-10#s-lg-box-wrapper-25493264) as our corpus.

In [116]:
text = get_text("data/federalist-papers.txt")
text = compress_whitespace(text)
text[:300]

'FEDERALIST NO. 1 General Introduction For the Independent Journal. Author: Alexander Hamilton To the People of the State of New York: AFTER an unequivocal experience of the inefficiency of the subsisting federal government, you are called upon to deliberate on a new Constitution for the United State'

In [117]:
import spacy

In [118]:
import en_core_web_sm
nlp = en_core_web_sm.load()
# The following fails on paperspace gradient platform
#nlp = spacy.load("en_core_web_sm") # When I use plain English() it doesn't seem to give POS info

In [119]:
tokens = words(text)
len(tokens), tokens[:10]

(19263,
 ['federalist',
  'no',
  '1',
  'general',
  'introduction',
  'for',
  'the',
  'independent',
  'journal',
  'author'])

In [120]:
# TESTING
#tokens = tokens[:10_000]   # total is about 19.2k

## Get vocab and get X, y 

In [121]:
V = sorted(set(tokens))
len(V)

3213

In [122]:
V[:15]

['1',
 '10',
 '11',
 '1685',
 '1706',
 '1774',
 '1787',
 '1st',
 '1their',
 '2',
 '20',
 '23',
 '3',
 '4',
 '5']

In [123]:
index = {w:i for i,w in enumerate(V)}
def wtoi(w):
    return index[w]

In [124]:
k = 10
step = 1
Xy = [np.array((np.array(tokens[i-k:i],dtype=object),tokens[i])) for i in range(k,len(tokens)-1,step)]

In [125]:
Xy[:5]

[array([array(['federalist', 'no', '1', 'general', 'introduction', 'for', 'the',
        'independent', 'journal', 'author'], dtype=object),
        'alexander'], dtype=object),
 array([array(['no', '1', 'general', 'introduction', 'for', 'the', 'independent',
        'journal', 'author', 'alexander'], dtype=object),
        'hamilton'], dtype=object),
 array([array(['1', 'general', 'introduction', 'for', 'the', 'independent',
        'journal', 'author', 'alexander', 'hamilton'], dtype=object),
        'to'], dtype=object),
 array([array(['general', 'introduction', 'for', 'the', 'independent', 'journal',
        'author', 'alexander', 'hamilton', 'to'], dtype=object),
        'the'], dtype=object),
 array([array(['introduction', 'for', 'the', 'independent', 'journal', 'author',
        'alexander', 'hamilton', 'to', 'the'], dtype=object),
        'people'], dtype=object)]

In [126]:
Xy = np.array(Xy)

In [127]:
X, y = Xy[:,0], Xy[:,1]

In [128]:
X = np.vstack(X)
X[0:2]

array([['federalist', 'no', '1', 'general', 'introduction', 'for', 'the',
        'independent', 'journal', 'author'],
       ['no', '1', 'general', 'introduction', 'for', 'the',
        'independent', 'journal', 'author', 'alexander']], dtype=object)

## Label encode tokens in X, y

In [129]:
encode = np.vectorize(wtoi)
X = encode(X)
y = encode(y)

In [130]:
targets = np.unique(y)   # not every word in V will be in target classes (words)

In [131]:
X.shape, y.shape

((19252, 10), (19252,))

In [132]:
X[0]

array([1241, 1982,    0, 1364, 1660, 1295, 2892, 1558, 1700,  318])

Convert X to shape (num sequences, window width k, len(V))

In [133]:
y.shape, len(V), len(targets)

((19252,), 3213, 3212)

In [134]:
y = pd.get_dummies(y)
y.shape

(19252, 3212)

## One hot the tokens (optionally)

In [135]:
do_onehot = True
do_onehot = False

In [136]:
def onehot(X):
    X_onehot = np.zeros((len(X), k, len(V)), dtype=np.bool)
    for i,record in enumerate(X):
        onehot = np.zeros((k,len(V)), dtype=np.bool)
        for j,wi in enumerate(record):
            onehot[j,wi] = 1
        X_onehot[i] = onehot
    return X_onehot

In [137]:
if do_onehot:
    X = onehot(X)

## Train

In [138]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20)

In [139]:
model = Sequential()
if do_onehot:
    # Must one hot X as num records x k x len(V)
    model.add(layers.LSTM(units=128, input_shape=(k,len(V))))
else:
    # If you don't want to onehot, you can leave X as 2D num records x k.
    model.add(layers.Embedding(input_dim=len(V), output_dim=20, input_length=k))
    model.add(layers.LSTM(units=128, input_shape=(k,1)))
model.add(layers.Dropout(0.2))
model.add(layers.BatchNormalization())
model.add(layers.Dense(len(targets), activation='softmax'))
#model.add(layers.Lambda(lambda x: tf.cast(K.argmax(x, axis=-1),dtype=float)))

opt = optimizers.Adam(learning_rate=0.001)

model.compile(loss=losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])
#model.summary()

In [140]:
def myfit(epochs, verbose=0):
    batch_size = 128
    history = model.fit(X_train, y_train,
                        shuffle=True,
                        epochs=epochs,
                        validation_data=(X_valid, y_valid),
                        batch_size=batch_size,
                        verbose=verbose
#                         , callbacks=[tfa.callbacks.TQDMProgressBar(show_epoch_progress=True)]
                        )

In [141]:
myfit(15, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Generate

In [142]:
# From Deep Learning with Python by François Chollet
# Gets a single int target class from a distribution described by probabilities
# (from softmax) in probs.  The temperature adds noise where temperature=0 means
# pick most likely always.
def sample(probs, temperature=1.0):
    probs = np.asarray(probs).astype('float64')
    probs = np.log(probs) / temperature
    exp_probs = np.exp(probs)
    probs = exp_probs / np.sum(exp_probs)
    probs = np.random.multinomial(1, probs, 1)
    return np.argmax(probs)

Seed the text with k words

In [143]:
start = np.random.randint(0, len(tokens) - k - 1)
generated_words = tokens[start: start + k]
print(' '.join(generated_words))
generated_tokens = [wtoi(w) for w in generated_words]
generated_tokens

subjects they passed many months in cool uninterrupted and daily


[2793, 2907, 2129, 1847, 1930, 1532, 691, 3020, 206, 735]

In [145]:
for epochs in range(1,15,2):
    print(f"-------- {epochs} epochs --------------------------------")
    myfit(epochs=1) # fits one iteration
    print('-'.join(generated_words), end=' ') # same seed
    for i in range(60):
        y_prob = model.predict(np.array(generated_tokens).reshape(1,k), verbose=0)[0]
        next_token = sample(y_prob, temperature=1.0)
        print(V[next_token], end=' ')
        generated_tokens.append(next_token)
        generated_tokens = generated_tokens[1:]
    print()

-------- 1 epochs --------------------------------
subjects-they-passed-many-months-in-cool-uninterrupted-and-daily frivolous widest widest footing frivolous and arms and interesting imputations tinctured that that that these ruinous ruinous kept old that these how these widest kept unfortunately a pope getting northern once odious tinctured by that that even theirs island island frivolous and island footing frivolous odious odious tinctured tinctured that that that that even disunited therefore therefore therefore purpose island 
-------- 3 epochs --------------------------------
subjects-they-passed-many-months-in-cool-uninterrupted-and-daily short island be a causes almost mean whether once governed odious odious tinctured by that that that that even even relation theirs theirs island island as widest widest widest footing footing frivolous and happily tinctured as by that that that each theirs island island widest footing be necessarily frivolous and widest propagated cannot neutra

KeyboardInterrupt: 

## Notes:

* BatchNormalization seems to help training accuracy converge faster. If no embedding layer, batch norm makes massive diff
* Having trouble getting validation accuracy beyond 7 or 8%.
* Moved to no embedding layer and used dropout layer not dropout arg on LSTM. Dropout followed by batch norm made accur increase slowly but reverse order does no good. weird
* Using more text helps a lot. Got to about 14% accuracy with step=1

Hmm...not so great using words. Try again with char.