In [23]:
import sys
from pprint import pprint

import numpy as np
import tensorflow as tf


In this example, we will train a character based language model on the text of Alice in Wonderland to predict the next character 
given 100 previous characters. We have chosen to build a character-based model here because it has a smaller vocabulary and trains quicker.

Taken from:
https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/


In [3]:
# Getting the input

INPUT_FILE = "./datasets/alice-text/alice.txt"

# extract the input as a stream of characters
print("Extracting text from input...")

fin = open(INPUT_FILE, 'rb')
lines = []
for line in fin:
    line = line.strip().lower()
    line = line.decode("ascii", "ignore")
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()

text = " ".join(lines)
text = text.lower()

Extracting text from input...


In [4]:
# Converting the characters to integers (indexing)

# Create mapping of unique chars to integers
chars = sorted(list(set(text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

In [7]:
# Getting some info about the text after indexing it 

n_chars = len(text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

Total Characters:  158783
Total Vocab:  55


In [8]:
# Splitting the text into sequences and labels

# Each sequence will be 100 characters and each label will be a single (next) character

# Prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
	seq_in = text[i:i + seq_length]
	seq_out = text[i + seq_length]
	dataX.append([char_to_int[char] for char in seq_in])
	dataY.append(char_to_int[seq_out])
	
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

Total Patterns:  158683


In [10]:
# Preparing the inputs and labels for training

# Reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))

# Normalize
X = X / float(n_vocab)

# One hot encode the output variable.
# Each output will be (seq_index, vocab_length) and there will be a single character marked with 1
y = tf.keras.utils.to_categorical(dataY)

In [14]:
# Building the model

# You are not interested in the most accurate (classification accuracy) model of the training dataset. 
# This would be a model that predicts each character in the training dataset perfectly. 
# Instead, you are interested in a generalization of the dataset that minimizes the chosen loss function. 
# You are seeking a balance between generalization and overfitting but short of memorization.

model = tf.keras.Sequential([
  tf.keras.layers.LSTM(256, input_shape=(X.shape[1], X.shape[2]), unroll=True),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(y.shape[1], activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam')



In [None]:
# Training the model (very slow, requires GPU)

# Define the checkpoint
filepath="models_cp/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Training on Kaggle.......

In [29]:
# Using the model for predictions

# The simplest way to use the Keras LSTM model to make predictions is to first start with a seed sequence as input, 
# generate the next character, then update the seed sequence to add the generated character on the end and trim 
# off the first character. his process is repeated for as long as you want to predict new characters 
# (e.g., a sequence of 1,000 characters in length)

def generate_text(model_path):
  # load the network weights
  filename = "models_cp/weights-improvement-16-2.1293.hdf5"
  model.load_weights(filename)
  model.compile(loss='categorical_crossentropy', optimizer='adam')

  # Create a reverse mapping from int to chars so that we can understand the model's inferences
  int_to_char = dict((i, c) for i, c in enumerate(chars))  

  # Pick a random seed (sequence) from the dataset
  start = np.random.randint(0, len(dataX)-1)
  pattern = dataX[start]
  pattern_english = ''.join([int_to_char[value] for value in pattern])
  print("Seed:")
  print("\"", pattern_english, "\"")

  # Generate characters
  for i in range(1000):
    # Reshape (1, length, 1) and normalize
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)

    # Infer the next char
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_char[index]

    # Print the next char
    #sys.stdout.write(result)
    pattern_english += result

    # Add the new char (its index) to the pattern and repeat the inference
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

  return pattern_english

In [30]:
model_path = "models_cp/weights-improvement-16-2.1293.hdf5"
pprint(generate_text(model_path))

Seed:
" world of trouble, you know, as we neednt try to find any. and yet i dont know, he went on, spreading "

Done.
('world of trouble, you know, as we neednt try to find any. and yet i dont '
 'know, he went on, spreading ano ali the tooe  she said to herself, and the '
 'whst hnt le the woodd aedin to the tooe  she said to herself, and the whst '
 'hnt le the woodd aedin to the tooe  she said to herself, and the whst hnt le '
 'the woodd aedin to the tooe  she said to herself, and the whst hnt le the '
 'woodd aedin to the tooe  she said to herself, and the whst hnt le the woodd '
 'aedin to the tooe  she said to herself, and the whst hnt le the woodd aedin '
 'to the tooe  she said to herself, and the whst hnt le the woodd aedin to the '
 'tooe  she said to herself, and the whst hnt le the woodd aedin to the tooe  '
 'she said to herself, and the whst hnt le the woodd aedin to the tooe  she '
 'said to herself, and the whst hnt le the woodd aedin to the tooe  she said '
 'to herse

### Using a larger LSTM network and training for longer

In [None]:
model = tf.keras.Sequential([
  tf.keras.layers.LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.LSTM(256),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(y.shape[1], activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Checkpoint for saving the best model
filepath="weights-improvement-{epoch:02d}-{loss:.4f}-bigger.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# VERY SLOW (must run on a GPU)
model.fit(X, y, epochs=50, batch_size=64, callbacks=callbacks_list)

Training on Kaggle...

In [31]:
model_path = "weights-improvement-50-1.2879-bigger.hdf5"
pprint(generate_text(model_path))

Seed:
" ed into the air off all its feet at once, with a yelp of delight, and rushed at the stick, and made  "

Done.
('ed into the air off all its feet at once, with a yelp of delight, and rushed '
 'at the stick, and made to the soeee of the soee  she was aolnne and toen  '
 'she was aolnne to the tooe  she was aolnne to tee thet she was to tere the '
 'was oo tie tooe  and the whst hn  shi mant to her haad  she manter was '
 'toelking an inr toaee  she hat aelin the winte rabbit  shi mact to herself '
 'to aedin  she said to herself, and the whst on ani alo her haad  she mant '
 'woine so tee the was oo tie tooe  she was aolnne to tee thet she was to tere '
 'the was oo tie tooe  and the whst hn  shi mant to her haad  she manter was '
 'toelking an inr toaee  she hat aelin the winte rabbit  shi mact to herself '
 'to aedin  she said to herself, and the whst on ani alo her haad  she mant '
 'woine so tee the was oo tie tooe  she was aolnne to tee thet she was to tere '
 'the was oo t