In [1]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Download the Shakespeare corpus and keep the path of the local copy
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [6]:
# Read the raw bytes and decode them explicitly as UTF-8.
# The `with` block guarantees the file handle is closed once the read is done.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [7]:
### ENCODING + PREPROCESSING ###

In [8]:
# Build the character vocabulary and the lookup tables between characters and indices

# Every distinct character of the corpus, in sorted order
vocab = sorted(set(text))
# Mapping from each unique character to its position in vocab
charactersToIdx = {character: index for index, character in enumerate(vocab)}
# Reverse lookup (index -> character), stored as a NumPy array
idxToCharacters = np.array(vocab)

def text_to_int(text):
    """Encode `text` as a NumPy array holding one vocabulary index per character.

    Raises KeyError if a character is not a key of charactersToIdx.
    """
    indices = []
    for character in text:
        indices.append(charactersToIdx[character])
    return np.array(indices)

# Encode the whole corpus once; the integer array is reused by later cells
text_as_int = text_to_int(text)

In [12]:
# Sanity check: show the first 15 characters next to their integer encoding

print("Text: ", text[:15])
print("Encoded: ", text_to_int(text[:15]))

Text:  First Citizen:

Encoded:  [18 47 56 57 58  1 15 47 58 47 64 43 52 10  0]


In [14]:
# Converting numeric values to text — maybe needed later

def int_to_text(integers):
    """Decode vocabulary indices back into a string.

    `integers` may be an object exposing `.numpy()` (it is converted first)
    or anything that can index the idxToCharacters array directly.
    """
    try:
        integers = integers.numpy()
    except AttributeError:
        # No .numpy() method on this input: use it as-is.
        # Only AttributeError is ignored, so other failures (and Ctrl-C) still surface.
        pass
    return ''.join(idxToCharacters[integers])
    
# Round-trip check: decoding the first 15 indices should print the start of the corpus
print(int_to_text(text_as_int[:15]))

First Citizen:



### Creating training examples from text file

E.g. Input: 'Hell' and resulting output: 'ello'

In [16]:
sequence_length = 100
# Number of complete chunks of (sequence_length + 1) characters in the corpus
examples_per_epoch = len(text)//(sequence_length + 1)

# Converts the integer-encoded text into a dataset, giving us a stream of character indices.
# Will contain about 1.1 million elements (one per character of the corpus)
character_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [17]:
# Group the character stream into chunks of sequence_length + 1 indices; the extra index
# lets each chunk be split into an input and a target shifted by one character.
# drop_remainder=True discards a final chunk that would be shorter.
sequences = character_dataset.batch(sequence_length+1, drop_remainder=True)

In [19]:
# Splits inputs and outputs

def split_input_target(input):
    """Split a sequence into an (input, target) pair offset by one element.

    The input drops the last element and the target drops the first,
    e.g. 'hello' -> ('hell', 'ello').
    """
    return input[:-1], input[1:]

# Applies split_input_target to every chunk in `sequences`, giving (input, target) pairs
dataset = sequences.map(split_input_target)

In [20]:
# Checking some examples

# Print the decoded input and target text of the first two examples
for x, y in dataset.take(2):
    print('\n\nTESTING\n"')
    for heading, encoded in (('INPUT', x), ("\nOUT\n", y)):
        print(heading)
        print(int_to_text(encoded))



TESTING
"
INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUT

irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


TESTING
"
INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUT

re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


In [21]:
# Making training batches
# Feed the model batches of 64 sequences at a time

BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Buffer size passed to dataset.shuffle() below
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs, then group them into batches of BATCH_SIZE;
# drop_remainder=True discards a final batch that would be smaller.
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [22]:
### BUILDING MODEL ###

In [23]:
# Writing a function to return to us a built model

def build_model(vocab_size, embedding_dimensions, rnn_units, batch_size):
    """Return an Embedding -> LSTM -> Dense sequential model.

    vocab_size: number of distinct characters (embedding input size and output width).
    embedding_dimensions: size of each character embedding vector.
    rnn_units: number of LSTM units.
    batch_size: fixed batch dimension of the input (the LSTM is stateful).
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dimensions,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # The final layer has one node per character in the vocab. No activation is applied,
    # so each timestep's output is a vector of raw scores (logits) over the next character.
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

# Build the model from the constants defined earlier and print its layer summary
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Input to our model is a batch of 64 examples, each a sequence of length 100
# Output is, for every timestep, one score per character in the vocabulary, used to predict which character occurs next

In [25]:
# Looking at sample input and output of model

# Run a single batch through the model; example_predictions is inspected in the cells below.
for input_example_batch, target_example_batch in data.take(1):
    example_predictions = model(input_example_batch)
    print(example_predictions.shape, '# (batch_size, seq_len, vocab_size')

(64, 100, 65) # (batch_size, seq_len, vocab_size


In [27]:
# Prediction holds one (seq_len, vocab_size) array per example, so its length is the batch size (64)

print(len(example_predictions))
print(example_predictions)

64
tf.Tensor(
[[[ 3.44622089e-03 -3.48491943e-04 -5.88340638e-03 ... -9.58662713e-06
    1.37374899e-03 -2.65068898e-04]
  [ 3.96947702e-03  1.61538180e-03 -2.79196701e-03 ... -1.51652307e-03
   -3.78097966e-03 -2.75008497e-03]
  [ 2.56912271e-03  3.43106035e-03 -1.05192175e-03 ...  1.59421586e-03
   -3.08662467e-03 -7.60429725e-03]
  ...
  [-9.88359097e-04 -2.75285542e-03  7.16031296e-03 ...  4.76302579e-03
    3.41628818e-03 -3.87461949e-03]
  [-3.33372410e-03 -6.83719805e-03  1.52647914e-03 ...  3.74721829e-04
    5.43132331e-03 -6.90462394e-03]
  [-6.98320451e-04 -1.10050365e-02 -1.37510058e-03 ... -3.11843818e-04
    2.05456372e-03 -5.32741845e-03]]

 [[ 8.42442038e-04 -2.74750963e-03  2.41437089e-03 ...  1.32151600e-03
    4.34104633e-03  2.19512265e-03]
  [ 4.50448040e-03 -3.22155445e-03 -3.48183792e-03 ...  1.48794660e-03
    4.48118849e-03  9.34795942e-04]
  [ 4.95656021e-03 -1.05811842e-03 -6.41101797e-04 ... -4.72023385e-06
   -1.46817439e-03 -2.08806247e-03]
  ...
  [ 1.367

In [28]:
# Testing just one prediction in an untrained model

# Predictions for the first example of the batch: one row per timestep
pred = example_predictions[0]
print(len(pred))
print(pred)

100
tf.Tensor(
[[ 3.44622089e-03 -3.48491943e-04 -5.88340638e-03 ... -9.58662713e-06
   1.37374899e-03 -2.65068898e-04]
 [ 3.96947702e-03  1.61538180e-03 -2.79196701e-03 ... -1.51652307e-03
  -3.78097966e-03 -2.75008497e-03]
 [ 2.56912271e-03  3.43106035e-03 -1.05192175e-03 ...  1.59421586e-03
  -3.08662467e-03 -7.60429725e-03]
 ...
 [-9.88359097e-04 -2.75285542e-03  7.16031296e-03 ...  4.76302579e-03
   3.41628818e-03 -3.87461949e-03]
 [-3.33372410e-03 -6.83719805e-03  1.52647914e-03 ...  3.74721829e-04
   5.43132331e-03 -6.90462394e-03]
 [-6.98320451e-04 -1.10050365e-02 -1.37510058e-03 ... -3.11843818e-04
   2.05456372e-03 -5.32741845e-03]], shape=(100, 65), dtype=float32)


Above returns a 2D array of length 100 (shape `(100, 65)`). Each interior array is the prediction for the next character at one time step, i.e. for every single training example, the number of outputs equals the length of that training example.

In [29]:
# Breaking the above pred down into the prediction at the first timestep.
# Each of the 65 values is a raw score (logit) for one character of the vocab, not yet a
# probability; the loss defined later in this notebook consumes them with from_logits=True.


time_pred = pred[0]
print(len(time_pred))
print(time_pred)

65
tf.Tensor(
[ 3.4462209e-03 -3.4849194e-04 -5.8834064e-03  2.8192634e-03
  8.4223109e-04 -2.7308234e-03 -1.1909055e-03  9.7384118e-03
 -3.8018122e-03  4.7285832e-03  2.8086251e-03 -5.0463462e-03
  4.0158606e-04 -1.4397409e-03 -1.2888841e-03  1.6160817e-03
  1.4546052e-04 -2.2715675e-03  7.0350652e-04  5.4097306e-03
  8.8061225e-03  2.9726783e-03 -4.1743461e-04 -1.1725151e-03
  8.3579635e-04 -2.1953436e-03 -4.5342175e-03  3.1402672e-03
  4.7317543e-03 -3.5214208e-03 -1.2841361e-03  7.6171540e-04
 -5.0438456e-03 -4.1698143e-03 -1.5211345e-03  5.4861663e-04
 -4.1671173e-04 -1.0275706e-03  1.9105066e-03 -2.3589302e-03
 -5.2763359e-03  4.6188259e-03 -5.6071398e-03  3.6697954e-04
  7.5506885e-03  2.8208131e-03  5.7206275e-03  3.3526930e-03
  1.7991299e-03 -3.2956398e-03 -3.2069073e-03 -9.0286415e-04
  2.0564094e-03  3.3067714e-03  1.0428857e-04 -8.1333524e-04
 -1.0951055e-02  2.2743994e-03 -1.8692120e-03  6.1897992e-04
 -8.4942766e-04 -1.6695498e-03 -9.5866271e-06  1.3737490e-03
 -2.650689

In [30]:
# Sample the categorical distribution to determine a predicted character at each timestep.
# (pred holds the per-timestep scores; one index is drawn per row)
sampled_indices = tf.random.categorical(pred, num_samples=1)

# reshaping + converting ints to characters

# Flatten the sampled indices into a single 1-D sequence before decoding
sampled_indices = np.reshape(sampled_indices, (1, -1))[0]
predicted_characters = int_to_text(sampled_indices)

predicted_characters
# Below is what the (still untrained) model predicts for the first sequence of the sample batch

'UKpPp-,U:R JBsnkyuFEvO&,vcl-e nUfnKbvDbeoIyRM3zyIS!q3QjXAXmbe:Mggtq&r:j:&aE;KE?bo-FMDJETjnxPhHpB-WFb'

In [31]:
### LOSS fn ###

def loss(labels, logits):
    """Sparse categorical cross-entropy between integer `labels` and raw `logits`.

    from_logits=True is passed because the model's final Dense layer applies no activation.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [None]:
### COMPILING Model ###

In [32]:
# Configure training: Adam optimizer with the custom `loss` function defined earlier
model.compile(optimizer='adam', loss=loss)

In [33]:
### CHECKPOINTS ### to save checkpoints during training

In [34]:
# Directory the checkpoint files are written to
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the "{epoch}" placeholder is left for the callback to fill in
checkpoint_prefix = os.path.join(checkpoint_dir, "cpoint_{epoch}")

# Callback that saves only the model weights (save_weights_only=True) under checkpoint_prefix
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
### TRAINING ###

In [35]:
# Train for 40 epochs on the batched dataset, saving checkpoints through checkpoint_callback
history = model.fit(data, epochs=40, callbacks=[checkpoint_callback])

Train for 172 steps
Epoch 1/40
 37/172 [=====>........................] - ETA: 13:02 - loss: 3.3379

KeyboardInterrupt: 