## Importing the classes and functions

In [1]:
import numpy

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

import sys
import string
import pprint

Using TensorFlow backend.


## Loading the text data

In [2]:
# load ascii text and covert to lowercase
!git clone https://github.com/atulram/EIP_phase_2.git
  
filename = "/content/EIP_phase_2/session6/wonderland.txt"
raw_text = open(filename).read()
raw_text = raw_text.lower()

Cloning into 'EIP_phase_2'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 15 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (15/15), done.


In [0]:
# Removing the texts from the begining and end which are not the part of the book

begin, rest = raw_text.split("*** START OF THIS PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND ***".lower())
raw_text, end = rest.split("              the end\n\n\n\n\n\nEnd of Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll".lower())

In [4]:
n_chars = len(raw_text)
n_vocab = len(sorted(list(set(raw_text))))

print("Before Removing Punctuation")
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Before Removing Punctuation
Total Characters:  144420
Total Vocab:  45


## Removing Punctuation

In [0]:
# using maketrans() to construct translate table

table = str.maketrans("","", string.punctuation)
raw_text = raw_text.translate(table) 

In [0]:
# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i+1) for i, c in enumerate(chars)) ## index starting from 1 so as to facilitate padding

In [7]:
char_to_int

{'\n': 1,
 ' ': 2,
 '0': 3,
 '3': 4,
 'a': 5,
 'b': 6,
 'c': 7,
 'd': 8,
 'e': 9,
 'f': 10,
 'g': 11,
 'h': 12,
 'i': 13,
 'j': 14,
 'k': 15,
 'l': 16,
 'm': 17,
 'n': 18,
 'o': 19,
 'p': 20,
 'q': 21,
 'r': 22,
 's': 23,
 't': 24,
 'u': 25,
 'v': 26,
 'w': 27,
 'x': 28,
 'y': 29,
 'z': 30}

In [8]:
n_chars = len(raw_text)
n_vocab = len(chars)

print("After Removing Punctuation")
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

After Removing Punctuation
Total Characters:  136100
Total Vocab:  30


## Define the training data for the network

In [9]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
raw_text = ' '+raw_text

for i in range(1, n_chars - seq_length, 1):
  seq_in = raw_text[i:i + seq_length]
  
  ##Checking for incomplete words in the beginning of the sequence
  if raw_text[i-1]!=' ':
    
    ## removing the random letters of the left most word. This will be adjusted by pre-sequence padding
    seq_in = seq_in[seq_in.find(' ')+1:]
  
  seq_out = raw_text[i + seq_length]
  dataX.append([char_to_int[char] for char in seq_in])
  dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  135999


## Making a padded sequence


In [0]:
# pre pad sequence

from keras.preprocessing.sequence import pad_sequences
paddedX = pad_sequences(dataX)

In [11]:
print(paddedX)

[[ 1  1  1 ...  7 12  5]
 [ 0  0  0 ... 12  5 20]
 [ 0  0  0 ...  5 20 24]
 ...
 [ 0  0  5 ...  2  8  5]
 [ 0  5 18 ...  8  5 29]
 [ 5 18  8 ...  5 29 23]]


## Transform data to use it with Keras

In [0]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(paddedX, (n_patterns, seq_length, 1))

# normalize
X = X / float(n_vocab)

# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [21]:
print(X.shape)
print(y.shape)

(135999, 100, 1)
(135999, 31)


## Defining the LSTM model

In [14]:
# define the LSTM model

model = Sequential()
model.add(Dropout(0.1)) ## Adding dropout to the input layer
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(256))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

W0727 09:49:29.590418 140480920938368 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0727 09:49:29.636764 140480920938368 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0727 09:49:29.643331 140480920938368 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0727 09:49:30.064712 140480920938368 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0727 09:49:30.078297 

## Training the model

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# define the checkpoint
filepath="/content/drive/My Drive/EIP/LSTM_weights-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [23]:
model.fit(X, y, epochs=50, batch_size=128, callbacks=callbacks_list)

W0726 13:19:23.516954 140218529822592 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0726 13:19:23.521786 140218529822592 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0726 13:19:23.533238 140218529822592 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0726 13:19:23.550444 140218529822592 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backe

Epoch 1/50

Epoch 00001: loss improved from inf to 2.79132, saving model to /content/drive/My Drive/EIP/LSTM_weights-01-2.7913.hdf5
Epoch 2/50

Epoch 00002: loss improved from 2.79132 to 2.50344, saving model to /content/drive/My Drive/EIP/LSTM_weights-02-2.5034.hdf5
Epoch 3/50

Epoch 00003: loss improved from 2.50344 to 2.26765, saving model to /content/drive/My Drive/EIP/LSTM_weights-03-2.2676.hdf5
Epoch 4/50

Epoch 00004: loss improved from 2.26765 to 2.13962, saving model to /content/drive/My Drive/EIP/LSTM_weights-04-2.1396.hdf5
Epoch 5/50

Epoch 00005: loss improved from 2.13962 to 2.05790, saving model to /content/drive/My Drive/EIP/LSTM_weights-05-2.0579.hdf5
Epoch 6/50

Epoch 00006: loss improved from 2.05790 to 1.99267, saving model to /content/drive/My Drive/EIP/LSTM_weights-06-1.9927.hdf5
Epoch 7/50

Epoch 00007: loss improved from 1.99267 to 1.94333, saving model to /content/drive/My Drive/EIP/LSTM_weights-07-1.9433.hdf5
Epoch 8/50

Epoch 00008: loss improved from 1.94333 

<keras.callbacks.History at 0x7f86dfa9f0b8>

### continuing training after 50th epoch

In [0]:
# load the network weights
filename = "/content/drive/My Drive/EIP/LSTM_weights-50-1.3988.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [26]:
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 1.09206, saving model to /content/drive/My Drive/EIP/LSTM_weights-01-1.0921.hdf5
Epoch 2/20

Epoch 00002: loss improved from 1.09206 to 1.05436, saving model to /content/drive/My Drive/EIP/LSTM_weights-02-1.0544.hdf5
Epoch 3/20

Epoch 00003: loss improved from 1.05436 to 1.02754, saving model to /content/drive/My Drive/EIP/LSTM_weights-03-1.0275.hdf5
Epoch 4/20

Epoch 00004: loss improved from 1.02754 to 1.00404, saving model to /content/drive/My Drive/EIP/LSTM_weights-04-1.0040.hdf5
Epoch 5/20

Epoch 00005: loss improved from 1.00404 to 0.98454, saving model to /content/drive/My Drive/EIP/LSTM_weights-05-0.9845.hdf5
Epoch 6/20

Epoch 00006: loss improved from 0.98454 to 0.96537, saving model to /content/drive/My Drive/EIP/LSTM_weights-06-0.9654.hdf5
Epoch 7/20

Epoch 00007: loss improved from 0.96537 to 0.94748, saving model to /content/drive/My Drive/EIP/LSTM_weights-07-0.9475.hdf5
Epoch 8/20

Epoch 00008: loss improved from 0.94748 

<keras.callbacks.History at 0x7f015e76fba8>

### continuing training after 70th epoch

In [0]:
# load the network weights
filename = "/content/drive/My Drive/EIP/LSTM_weights-20-0.7831.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [23]:
model.fit(X, y, epochs=30, batch_size=128, callbacks=callbacks_list)

W0727 09:56:15.444358 140480920938368 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/30

Epoch 00001: loss improved from inf to 0.77643, saving model to /content/drive/My Drive/EIP/LSTM_weights-01-0.7764.hdf5
Epoch 2/30

Epoch 00002: loss improved from 0.77643 to 0.76252, saving model to /content/drive/My Drive/EIP/LSTM_weights-02-0.7625.hdf5
Epoch 3/30

Epoch 00003: loss improved from 0.76252 to 0.75447, saving model to /content/drive/My Drive/EIP/LSTM_weights-03-0.7545.hdf5
Epoch 4/30

Epoch 00004: loss improved from 0.75447 to 0.74570, saving model to /content/drive/My Drive/EIP/LSTM_weights-04-0.7457.hdf5
Epoch 5/30

Epoch 00005: loss improved from 0.74570 to 0.73968, saving model to /content/drive/My Drive/EIP/LSTM_weights-05-0.7397.hdf5
Epoch 6/30

Epoch 00006: loss improved from 0.73968 to 0.73342, saving model to /content/drive/My Drive/EIP/LSTM_weights-06-0.7334.hdf5
Epoch 7/30

Epoch 00007: loss improved from 0.73342 to 0.72300, saving model to /content/drive/My Drive/EIP/LSTM_weights-07-0.7230.hdf5
Epoch 8/30

Epoch 00008: loss improved from 0.72300 

<keras.callbacks.History at 0x7fc3f043eef0>

## Generating text

In [0]:
# load the network weights
filename = "/content/drive/My Drive/EIP/LSTM_weights-30-0.6084.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [0]:
int_to_char = dict((i+1, c) for i, c in enumerate(chars))
int_to_char[0]=''

In [70]:
# pick a random seed
start = numpy.random.randint(0, len(paddedX)-1)

# pad the seed
pattern = paddedX[start]
pattern = pattern.tolist()
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

Seed:
" he came trotting along in a great
hurry muttering to himself as he came oh the duchess the duch "


In [71]:
# generate characters
for i in range(500):
	x = numpy.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = numpy.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print("\nDone.")

ess said in a low trembling voice what thought alice as if she went on what he said alice and how fotm the things being all their simple jobsser and she tried to say 
to be no chance of the sort
of little tolscker and the moral of that istt take little tors of his that she was beginning to her feet for a farher she shouting off with his head off and the loral of that is the cat went on the rabbit and said nothing

when we were little the moment said the mock turtle and the moral of that istt the
Done.
