In [62]:
import tensorflow_datasets as tfds
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import os
import numpy as np
import random
from nltk.corpus import stopwords

In [2]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a metric per epoch, plus its validation curve when one was recorded.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            per-epoch sequence of values (e.g. the value returned by
            ``model.fit``).
        metric: Name of the metric to plot, e.g. ``'loss'``.

    Raises:
        KeyError: If ``metric`` itself is missing from ``history.history``.
    """
    val_metric = 'val_' + metric
    legend_labels = [metric]

    plt.plot(history.history[metric])
    # 'val_*' entries are only present when validation data was supplied to
    # fit(); skip the second curve rather than failing with a KeyError.
    if val_metric in history.history:
        plt.plot(history.history[val_metric], '')
        legend_labels.append(val_metric)

    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(legend_labels)
    plt.show()

In [143]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('alllines.txt', 'r') as corpus_file:
    text = corpus_file.read()

In [93]:
# Work with only the leading 500,000 characters of the corpus.
sample_text = text[0:500000]

In [94]:
# Print the first 510 characters of the sample.
print(sample_text[0:510])

"ACT I"
"SCENE I. London. The palace."
"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others"
"So shaken as we are, so wan with care,"
"Find we a time for frighted peace to pant,"
"And breathe short-winded accents of new broils"
"To be commenced in strands afar remote."
"No more the thirsty entrance of this soil"
"Shall daub her lips with her own children's blood,"
"Nor more shall trenching war channel her fields,"
"Nor bruise her flowerets with the armed hoofs"


In [95]:
# Character vocabulary: every distinct character of the sample, in sorted order.
vocab = list(set(sample_text))
vocab.sort()
len(vocab)

70

In [96]:
# Map each vocabulary character to its position in `vocab`.
char_to_idx = dict(zip(vocab, range(len(vocab))))

In [97]:
# Inverse lookup: an array indexed by character id, so an array of ids can be
# decoded in a single indexing operation.
idx_to_char = np.asarray(vocab)

In [98]:
# Encode the sample as an array of character ids.
text_as_int = np.array(list(map(char_to_idx.__getitem__, sample_text)))

In [99]:
# Number of characters in each model input sequence.
seq_length = 100
# Every example consumes seq_length + 1 characters (the input plus the
# one-step-shifted target). Count examples over the sample that is actually
# encoded into text_as_int, not over the full corpus.
examples_per_epoch = len(sample_text)//(seq_length+1)

# Dataset that yields the encoded sample one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [100]:
# Group the character stream into chunks of seq_length + 1 ids; a final chunk
# that comes up short is dropped.
chunk_size = seq_length + 1
sequences = char_dataset.batch(chunk_size, drop_remainder=True)

# Decode the first five chunks back to text and print them.
for item in sequences.take(5):
    decoded_chunk = ''.join(idx_to_char[item.numpy()])
    print(repr(decoded_chunk))

'"ACT I"\n"SCENE I. London. The palace."\n"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMOR'
'ELAND, SIR WALTER BLUNT, and others"\n"So shaken as we are, so wan with care,"\n"Find we a time for fri'
'ghted peace to pant,"\n"And breathe short-winded accents of new broils"\n"To be commenced in strands af'
'ar remote."\n"No more the thirsty entrance of this soil"\n"Shall daub her lips with her own children\'s '
'blood,"\n"Nor more shall trenching war channel her fields,"\n"Nor bruise her flowerets with the armed h'


In [101]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element; the target is the chunk
    without its first element, so target[k] is the element that follows
    input[k] in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(map_func=split_input_target)

In [102]:
# Print a single (input, target) pair from the dataset.
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([ 4, 16, 18, 35,  2, 24,  4,  1,  4, 34, 18, 20, 29, 20,  2, 24,  8,
        2, 27, 58, 57, 47, 58, 57,  8,  2, 35, 51, 48,  2, 59, 44, 55, 44,
       46, 48,  8,  4,  1,  4, 20, 57, 63, 48, 61,  2, 26, 24, 29, 22,  2,
       23, 20, 29, 33, 40,  6,  2, 27, 30, 33, 19,  2, 25, 30, 23, 29,  2,
       30, 21,  2, 27, 16, 29, 18, 16, 34, 35, 20, 33,  6,  2, 63, 51, 48,
        2, 20, 16, 33, 27,  2, 58, 49,  2, 38, 20, 34, 35, 28, 30])>, <tf.Tensor: shape=(100,), dtype=int64, numpy=
array([16, 18, 35,  2, 24,  4,  1,  4, 34, 18, 20, 29, 20,  2, 24,  8,  2,
       27, 58, 57, 47, 58, 57,  8,  2, 35, 51, 48,  2, 59, 44, 55, 44, 46,
       48,  8,  4,  1,  4, 20, 57, 63, 48, 61,  2, 26, 24, 29, 22,  2, 23,
       20, 29, 33, 40,  6,  2, 27, 30, 33, 19,  2, 25, 30, 23, 29,  2, 30,
       21,  2, 27, 16, 29, 18, 16, 34, 35, 20, 33,  6,  2, 63, 51, 48,  2,
       20, 16, 33, 27,  2, 58, 49,  2, 38, 20, 34, 35, 28, 30, 33])>)


In [103]:
# Display the dataset's repr to inspect its element shapes and dtypes.
dataset

<MapDataset shapes: ((100,), (100,)), types: (tf.int64, tf.int64)>

In [104]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of re-wrapping
# `dataset` itself, so re-running this cell does not batch an
# already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [115]:
# Training model: character embedding -> stateful GRU -> one logit per
# vocabulary entry at every timestep. The batch dimension is fixed to
# BATCH_SIZE.
# No reset_states() call precedes construction: on a fresh kernel `model`
# does not exist yet, so calling it here would raise NameError.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(vocab), 256, batch_input_shape=[BATCH_SIZE, None]),
    tf.keras.layers.GRU(1024, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(len(vocab))
])

In [114]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed directly on logits.

    Args:
        labels: Target class ids.
        logits: Raw model outputs; ``from_logits=True`` tells the loss they
            have not been passed through a softmax.

    Returns:
        Whatever ``tf.keras.losses.sparse_categorical_crossentropy`` returns
        for these arguments.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [116]:
# Configure training: Adam optimizer, the logits-based loss defined in this
# notebook, and accuracy as the reported metric.
model.compile(
    loss=loss,
    optimizer='adam',
    metrics=['accuracy'],
)

In [117]:
# All checkpoints are written under this directory.
checkpoint_dir = './training_checkpoints'
# File prefix for each checkpoint; the {epoch} placeholder is left for the
# callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights under checkpoint_prefix.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [118]:
# First training run: 10 epochs over the batched dataset, checkpointing
# through checkpoint_callback.
history = model.fit(
    dataset,
    epochs=10,
    callbacks=[checkpoint_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [119]:
# Show the path of the most recent checkpoint in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_10'

In [123]:
# Restore the training model's weights from the most recent checkpoint.
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
model.load_weights(latest_ckpt)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd6a2066ef0>

In [125]:
# Train for 10 more epochs. This rebinds `history`, replacing the record
# returned by the earlier fit call; checkpoint names start again at the same
# ckpt_{epoch} prefixes.
history = model.fit(
    dataset,
    epochs=10,
    callbacks=[checkpoint_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [128]:
# Show the path of the most recent checkpoint after the second training run.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_10'

In [129]:
# Restore the training model's weights from the most recent checkpoint.
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
model.load_weights(latest_ckpt)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd699f31550>

In [130]:
# Third training run of 10 epochs; its record is kept separately in `history2`.
history2 = model.fit(
    dataset,
    epochs=10,
    callbacks=[checkpoint_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [131]:
# Restore the training model's weights from the most recent checkpoint.
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
model.load_weights(latest_ckpt)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd6bda882e8>

In [133]:
# Print the training model's layers, output shapes and parameter counts.
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (64, None, 256)           17920     
_________________________________________________________________
gru_4 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_6 (Dense)              (64, None, 70)            71750     
Total params: 4,027,974
Trainable params: 4,027,974
Non-trainable params: 0
_________________________________________________________________


In [136]:
# Inference copy of the network: the same layer stack and sizes as the
# training model, but with the batch dimension fixed to 1 so a single
# sequence can be fed at a time.
predictive_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        len(vocab),
        256,
        batch_input_shape=[1, None],
    ),
    tf.keras.layers.GRU(
        1024,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    ),
    tf.keras.layers.Dense(len(vocab)),
])

In [137]:
# Load the trained weights from the most recent checkpoint into the
# batch-size-1 model.
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
predictive_model.load_weights(latest_ckpt)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fd69e7c6b00>

In [139]:
# Build the model for inputs of shape (1, any sequence length).
predictive_model.build(input_shape=tf.TensorShape([1, None]))

In [140]:
# Print the inference model's layers, output shapes and parameter counts.
predictive_model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (1, None, 256)            17920     
_________________________________________________________________
gru_5 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_7 (Dense)              (1, None, 70)             71750     
Total params: 4,027,974
Trainable params: 4,027,974
Non-trainable params: 0
_________________________________________________________________


In [165]:
# Seed text for generation: a fixed 38-character slice of the full corpus.
# The slice lies past the first 500000 characters that make up sample_text.
seed_start, seed_end = 504087, 504125
input_text = text[seed_start:seed_end]
print(input_text)


"SCENE II. A plain in Warwickshire."



In [166]:
# Encode the seed text as character ids and add a leading batch dimension,
# giving a tensor of shape (1, len(input_text)).
input_val = tf.expand_dims([char_to_idx[ch] for ch in input_text], axis=0)

In [167]:
# Display the encoded seed tensor.
input_val

<tf.Tensor: shape=(1, 38), dtype=int32, numpy=
array([[ 1,  4, 34, 18, 20, 29, 20,  2, 24, 24,  8,  2, 16,  2, 59, 55,
        44, 52, 57,  2, 52, 57,  2, 38, 44, 61, 66, 52, 46, 54, 62, 51,
        52, 61, 48,  8,  4,  1]], dtype=int32)>

In [171]:
# Clear the GRU's carried-over state before generating.
predictive_model.reset_states()

text_generated = []

# Feed the model through a loop-local tensor so the seed in `input_val` stays
# intact. Overwriting `input_val` would make a re-run of this cell start from
# the last sampled character while still printing `input_text` as the prompt.
next_input = input_val

for i in range(500):
    predictions = predictive_model(next_input)
    # Drop the batch dimension, leaving (timesteps, vocab size) logits.
    predictions = tf.squeeze(predictions, 0)

    # Sample one id per timestep from the logits and keep the draw for the
    # last timestep.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # Only the newly sampled character is fed on the next step; the GRU is
    # stateful, so earlier context is carried in its state.
    next_input = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx_to_char[predicted_id])

print(input_text + ''.join(text_generated))


"SCENE II. A plain in Warwickshire."
he flowerd of east our affarest,"
"Commanded, with your audacions to your holy oath is but weak."
"Why, you shall."
"Peace, though I break thee go."
"To hear the troops of Earlasion lay in hand,"
"A noble earl, and leave to be hanged."
"The spitious east unto his talong of ourself,"
"Whom I with pain havon of you are,"
"By Henry be apapto the gallant of the queen to France?"
"Had sle hears Suffolk, let me proud,"
"Your faithful sleep, will you leave thee to thy too:"
"Then can the counterpein bu
