In [46]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [47]:
# Fetch the Shakespeare corpus and keep the local path that get_file returns.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [48]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8, so the result
# does not depend on the platform's default encoding or newline translation.
# The context manager closes the file handle as soon as the read is done.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# The length of the text is the number of characters in it.
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [49]:
### ENCODING + PREPROCESSING ###

In [50]:
# Build the character-level vocabulary and the lookup tables used to move
# between characters and integer ids:
#   vocab            - every distinct character in the corpus, sorted
#   charactersToIdx  - character -> integer id (its position in vocab)
#   idxToCharacters  - integer id -> character, as a NumPy array indexable by id

vocab = sorted(set(text))
charactersToIdx = dict(zip(vocab, range(len(vocab))))
idxToCharacters = np.array(vocab)


def text_to_int(text):
    """Encode a string as a NumPy array holding one vocabulary id per character.

    Raises KeyError if `text` contains a character missing from charactersToIdx.
    """
    ids = []
    for character in text:
        ids.append(charactersToIdx[character])
    return np.array(ids)


# The whole corpus, encoded once up front.
text_as_int = text_to_int(text)

In [51]:
# Sanity check: show the first 15 characters next to their integer encoding.

preview_chars = text[:15]
print("Text: ", preview_chars)
print("Encoded: ", text_to_int(preview_chars))

Text:  First Citizen:

Encoded:  [18 47 56 57 58  1 15 47 58 47 64 43 52 10  0]


In [52]:
# Converting numeric ids back to text (the inverse of text_to_int).

def int_to_text(integers):
    """Decode vocabulary ids back into a string.

    Args:
        integers: the ids to decode. Either a value NumPy can use to index
            `idxToCharacters` (e.g. an integer array), or an object exposing a
            `.numpy()` method, which is called first to obtain such a value.

    Returns:
        The characters for the given ids, joined into a single string.
    """
    try:
        integers = integers.numpy()
    except AttributeError:
        # No `.numpy()` method: index with the value as it is. Only
        # AttributeError is caught so other failures (and KeyboardInterrupt)
        # are not silently swallowed.
        pass
    return ''.join(idxToCharacters[integers])

print(int_to_text(text_as_int[:15]))

First Citizen:



### Creating training examples from text file

E.g. Input: 'Hell' and resulting output: 'ello'

In [53]:
# Number of input characters per training example. Each example is cut from a
# window one character longer, so the target can be the input shifted by one.
sequence_length = 100
examples_per_epoch = len(text) // (1 + sequence_length)

# Turn the encoded corpus into a tf.data stream with one character id per
# element (about 1.1 million elements for this corpus).
character_dataset = tf.data.Dataset.from_tensor_slices(
    text_as_int
)

In [54]:
# Group the character stream into windows of sequence_length + 1 ids; a final
# window that would come up short is dropped.
sequences = character_dataset.batch(
    sequence_length + 1,
    drop_remainder=True,
)

In [55]:
# Splits inputs and outputs

def split_input_target(input):
    """Turn one window into an (input, target) pair offset by one position.

    The input is the window without its last element and the target is the
    window without its first element, e.g. 'hello' -> ('hell', 'ello').
    """
    return input[:-1], input[1:]

dataset = sequences.map(split_input_target)
# Applies split_input_target to every window in `sequences`, so each element of
# `dataset` is an (input, target) pair.

In [56]:
# Print the first two (input, target) pairs as text; each target should read as
# its input shifted one character to the left.

for input_ids, target_ids in dataset.take(2):
    print(
        '\n\nTESTING\n"',
        'INPUT',
        int_to_text(input_ids),
        "\nOUT\n",
        int_to_text(target_ids),
        sep='\n',
    )



TESTING
"
INPUT
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

OUT

irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


TESTING
"
INPUT
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 

OUT

re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k


In [57]:
# Making training batches: the model is fed batches of 64 examples at a time.

# Model hyperparameters.
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 256
RNN_UNITS = 1024

# Input-pipeline settings.
BATCH_SIZE = 64
BUFFER_SIZE = 10000  # shuffle buffer size handed to Dataset.shuffle

# Shuffle the examples, then group them into fixed-size batches; a final
# partial batch is dropped so every batch has exactly BATCH_SIZE examples.
data = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [58]:
### BUILDING MODEL ###

In [59]:
# Writing a function to return to us a built model

def build_model(vocab_size, embedding_dimensions, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding's
            input size and as the width of the final Dense layer.
        embedding_dimensions: length of each character embedding vector.
        rnn_units: number of units in the LSTM layer.
        batch_size: fixed batch size written into the model's input shape.

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dimensions,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # One output node per character in the vocabulary. No activation is set, so
    # each node emits a raw score (logit) for "this character comes next".
    char_scores = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, char_scores])

# Training model: built for batches of BATCH_SIZE sequences.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dimensions=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 256)           16640     
_________________________________________________________________
lstm_2 (LSTM)                (64, None, 1024)          5246976   
_________________________________________________________________
dense_2 (Dense)              (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [60]:
# Input to our model is a batch of 64 examples, each a sequence of 100 character ids.
# Output is, for every position in every sequence, one score (logit) per character in the vocabulary.

In [61]:
# Run a single batch through the untrained model to check the output shape.
# `example_predictions` is kept for the inspection cells that follow.

for input_example_batch, target_example_batch in data.take(1):
    example_predictions = model(input_example_batch)
    print(
        example_predictions.shape,
        '# (batch_size, seq_len, vocab_size',
    )

(64, 100, 65) # (batch_size, seq_len, vocab_size


In [62]:
# The prediction tensor has 64 entries along its first axis: one block of
# predictions per example in the batch.

print(len(example_predictions), example_predictions, sep='\n')

64
tf.Tensor(
[[[-1.21543324e-03 -3.15926992e-03  1.03446969e-03 ... -2.36298982e-03
   -4.13387083e-04  1.67113368e-03]
  [-8.87477247e-04 -1.01622129e-02  2.39519170e-03 ...  2.41522957e-03
    3.03720264e-03  4.39212751e-03]
  [-2.28902511e-03 -8.93780403e-03 -2.48057535e-03 ...  5.60427958e-04
    6.31489325e-03  5.27122756e-03]
  ...
  [ 6.01371098e-03  8.46289750e-03 -8.05195421e-03 ...  7.75235053e-03
   -3.79582541e-03 -1.38400099e-03]
  [ 8.82256497e-03  8.56757350e-03 -1.10681374e-02 ...  9.23106726e-03
   -1.48668594e-03 -1.23479648e-03]
  [ 6.18119678e-03  9.34889633e-03 -6.03795890e-03 ...  2.14844989e-03
   -4.69520828e-03 -1.70410518e-03]]

 [[-6.52959710e-03 -6.66230218e-03  9.19432670e-04 ... -2.42957976e-04
   -2.70727975e-03 -7.22856203e-04]
  [-1.01877246e-02 -3.78673244e-03  2.93302728e-04 ...  4.47414629e-03
    9.51103459e-04  4.23900643e-03]
  [-4.71330294e-03 -5.24641946e-03 -4.60825535e-03 ...  4.68796259e-03
   -4.90565738e-03  5.78734512e-03]
  ...
  [ 4.147

In [63]:
# Look at the prediction for just the first example in the batch
# (the model is still untrained at this point).

pred = example_predictions[0]
print(len(pred), pred, sep='\n')

100
tf.Tensor(
[[-0.00121543 -0.00315927  0.00103447 ... -0.00236299 -0.00041339
   0.00167113]
 [-0.00088748 -0.01016221  0.00239519 ...  0.00241523  0.0030372
   0.00439213]
 [-0.00228903 -0.0089378  -0.00248058 ...  0.00056043  0.00631489
   0.00527123]
 ...
 [ 0.00601371  0.0084629  -0.00805195 ...  0.00775235 -0.00379583
  -0.001384  ]
 [ 0.00882256  0.00856757 -0.01106814 ...  0.00923107 -0.00148669
  -0.0012348 ]
 [ 0.0061812   0.0093489  -0.00603796 ...  0.00214845 -0.00469521
  -0.00170411]], shape=(100, 65), dtype=float32)


Above returns a 2D array of length 100 (one row per time step in the sequence). Each row holds the 65 scores for the next character at that time step, i.e. for every training example, the number of outputs equals the length of that training example.

In [64]:
# Narrow `pred` down to the prediction at the first time step: 65 raw scores
# (logits), one per vocabulary character, for which character comes next.

time_pred = pred[0]
print(len(time_pred), time_pred, sep='\n')

65
tf.Tensor(
[-1.2154332e-03 -3.1592699e-03  1.0344697e-03 -1.7614500e-03
  2.2874640e-03  6.9755595e-03 -8.4607652e-04  1.3689753e-03
 -1.6421891e-03  5.1368936e-03 -1.2422957e-03 -1.1372651e-03
  2.7795770e-04  1.0755707e-03  2.4355105e-03  3.4512414e-03
  1.1145249e-03 -1.9000007e-03  8.3720288e-04 -3.6827221e-03
 -8.6951721e-03  1.1118244e-06 -8.4905419e-04  2.8053022e-03
  1.9380373e-04  2.4720279e-03  2.3212014e-03  2.4788612e-03
  1.5750899e-03  1.1911441e-03 -1.6734493e-03 -1.5311562e-04
  4.0316242e-03 -4.9043931e-03  2.4905037e-03  8.8540910e-06
 -8.4353982e-05  8.4996247e-04  2.9782462e-03  2.5591212e-03
  2.7932512e-04 -3.4848810e-03 -6.0216349e-04  1.0158742e-02
  6.7483415e-03 -1.2463115e-03 -1.3220469e-03  4.6794396e-04
 -3.4195411e-03  1.4551366e-03 -2.9187398e-03 -2.3121093e-03
 -4.4043077e-04 -9.6706452e-04  1.7461403e-03 -1.6683291e-03
  5.1068113e-04  2.6224845e-03  6.3147617e-04 -3.4490749e-03
  4.6555325e-03 -3.9357803e-04 -2.3629898e-03 -4.1338708e-04
  1.671133

In [65]:
# Draw one character id per time step by sampling from the categorical
# distribution defined by the logits in `pred`.
sampled_column = tf.random.categorical(pred, num_samples=1)

# Flatten the samples into a 1-D array of ids, then decode them to characters.
sampled_indices = np.reshape(sampled_column, (1, -1))[0]
predicted_characters = int_to_text(sampled_indices)

# Shown below: what the untrained model predicts for training sequence 1.
predicted_characters

'eUw!LplXY&itpOo-TcfRJwjDNgxHMDMtB:l.wBuEV vz,NTaNna!Mme;GC-SbjNrnSejG?Xs-e-D3S?E\nUBwP;SPhxtnWTyo lIq'

In [66]:
### LOSS fn ###

def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` is required because the model's final Dense layer has no
    softmax activation.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [67]:
### COMPILING Model ###

In [68]:
# Train with Adam, scoring predictions with the `loss` function defined above.
model.compile(
    optimizer='adam',
    loss=loss,
)

In [69]:
### CHECKPOINTS ### to save checkpoints during training

In [70]:
# Directory that receives the checkpoints written during training.
checkpoint_dir = './training_checkpoints'
# Checkpoint file-name pattern; `{epoch}` is left for the callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "cpoint_{epoch}")

# Save only the model weights (not the full model) at each checkpoint.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [71]:
### TRAINING ###

In [72]:
# Train for 40 epochs, writing a checkpoint through checkpoint_callback.
history = model.fit(
    data,
    epochs=40,
    callbacks=[checkpoint_callback],
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [73]:
### LOADING IN THE MODEL ###

# Rebuild the same architecture with a batch size of 1, so that a single
# sequence can be fed to the model when generating text.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dimensions=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=1,
)

In [74]:
# Find the latest checkpoint that stored the model weights and load it into the
# batch-size-1 model.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # tf.train.latest_checkpoint returns None when it finds no checkpoint;
    # fail here with a clear message instead of handing None to load_weights.
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first.".format(checkpoint_dir)
    )
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [92]:
def generate_text(model, start_string, num_generate=300, temperature=1.0):
  """Generate text by repeatedly sampling the next character from `model`.

  Args:
    model: the trained model. It is called on a batch holding one sequence of
      character ids, and its recurrent states are reset before generation.
    start_string: non-empty seed text; every character must be a key of
      `charactersToIdx`.
    num_generate: number of characters to generate after the seed.
    temperature: positive value the logits are divided by before sampling.
      Low temperature = more predictable text, higher temperature = more
      surprising text.

  Returns:
    `start_string` followed by the `num_generate` generated characters.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
    KeyError: if `start_string` contains a character that is not in
      `charactersToIdx`.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Vectorizing: convert the start string to ids and add a batch axis of 1.
  input_eval = [charactersToIdx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  generated_text = []

  # Clear any recurrent state left over from a previous call.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)

    # Remove the batch axis, leaving one row of logits per input position.
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits, sample one id per position from the categorical
    # distribution, and keep only the sample for the last position.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Pass the predicted character as the next input; the model is stateful,
    # so the previous hidden state is carried over inside the model.
    input_eval = tf.expand_dims([predicted_id], 0)
    generated_text.append(idxToCharacters[predicted_id])

  return (start_string + ''.join(generated_text))


In [93]:
# Ask for a seed string, generate a continuation from it, and show the result.
inp = input("Type a starting string: ")
out = generate_text(model, start_string=inp)

print(out)


Type a starting string: enter
enter
To meet upon the town and honour to appeard!
You must to pardon me, and look con?

Third Citizen:
It will not stay with me already, sir,
But my arrival and my weal ornet, where lies herbs,
Imabilitable and false; I believe me.
Throw up your king, But was foul water for all this land
As 'longeth too
