<a href="https://colab.research.google.com/github/agcode/EIP/blob/master/Assignment_EIP_3_Phase_2_Session_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM Exercise

This code is adapted, with modifications, from the [Machine Learning Mastery](https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/) blog.

In [0]:
# Mount Google Drive at /content/drive so files stored there are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

## The text for training

The text is from [Alice's Adventures in Wonderland](http://www.gutenberg.org/cache/epub/11/pg11.txt).

In [0]:
file='/content/drive/My Drive/wonderland.txt'
!cp "$file" .
filename = "wonderland.txt"
raw_text_with_propercase = open(filename).read()
raw_text_lower = raw_text_with_propercase.lower().split('.')

### Remove punctuation

In [0]:
def _keep_char(ch):
  """Return True for characters that survive the punctuation filter.

  Kept: code points <= 32 (whitespace/control), 48-57 (digits) and >= 97
  ('a' and everything above it). Dropped: code points 33-47 and 58-96.
  """
  code = ord(ch)
  return code <= 32 or 48 <= code <= 57 or code >= 97


# Filter every sentence chunk independently, preserving the chunk order.
raw_text = [
  ''.join(ch for ch in sentence if _keep_char(ch))
  for sentence in raw_text_lower
]

In [0]:
# Glue the cleaned sentence chunks back into one string (no separator between chunks).
raw_text_master = ''.join(raw_text)
  

In [0]:
# create mapping of unique chars to integers
# Vocabulary: every distinct character of the cleaned text, in sorted order.
chars = sorted(set(raw_text_master))
# Each character's integer id is its position in that sorted vocabulary.
char_to_int = {symbol: index for index, symbol in enumerate(chars)}

In [0]:
# Corpus size (in characters) and vocabulary size; both are reused by later cells.
n_chars, n_vocab = len(raw_text_master), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  136089
Total Vocab:  30


### Original dataset code: no padding, whole text treated as one sequence

In [0]:
# DISABLED: original sliding-window dataset builder, kept for reference only.
# It walks the corpus with a fixed 100-character window and no padding.
# NOTE(review): as written it slices `raw_text`, which in this notebook is a list of
# sentence chunks, while the loop bound `n_chars` is the length of `raw_text_master`;
# re-enabling it would need `raw_text_master` in place of `raw_text`.
# # prepare the dataset of input to output pairs encoded as integers
# seq_length = 100
# dataX = []
# dataY = []
# for i in range(0, n_chars - seq_length, 1):
# 	seq_in = raw_text[i:i + seq_length]
# 	seq_out = raw_text[i + seq_length]
# 	dataX.append([char_to_int[char] for char in seq_in])
# 	dataY.append(char_to_int[seq_out])
# n_patterns = len(dataX)
# print("Total Patterns: ", n_patterns)

### New code: the dataset is built from each line separately, with padding

In [0]:
# prepare the dataset of input to output pairs encoded as integers
from keras.preprocessing.sequence import pad_sequences
seq_length = 15
dataX = []
dataY = []
for line_string in raw_text:
  for j in range(1,len(line_string)-1):
    if j<=seq_length:
      seq_in = pad_sequences([[char_to_int[char] for char in line_string[:j]]],maxlen=seq_length)
    else:         
      seq_in = pad_sequences([[char_to_int[char] for char in line_string[j-seq_length:]]],maxlen=seq_length,truncating='post', padding='post')
    seq_out = char_to_int[line_string[j :j + 1]]
    dataX.append(seq_in[0])
    dataY.append(seq_out)
#   break
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  134108


### Normalising Dataset

In [0]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

### Model Code

In [0]:
# define the LSTM model
model = Sequential()
model.add(Dropout(0.1))
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(256))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

### Training the model

In [0]:
# define the checkpoint
# filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
filename="lstm.hdf5"
checkpoint = ModelCheckpoint(filename, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model.fit(X, y, epochs=100, batch_size=128, callbacks=callbacks_list)

Epoch 1/100

Epoch 00001: loss improved from inf to 2.77978, saving model to lstm.hdf5
Epoch 2/100

Epoch 00002: loss improved from 2.77978 to 2.50907, saving model to lstm.hdf5
Epoch 3/100

Epoch 00003: loss improved from 2.50907 to 2.31473, saving model to lstm.hdf5
Epoch 4/100

Epoch 00004: loss improved from 2.31473 to 2.18319, saving model to lstm.hdf5
Epoch 5/100

Epoch 00005: loss improved from 2.18319 to 2.09373, saving model to lstm.hdf5
Epoch 6/100

Epoch 00006: loss improved from 2.09373 to 2.02476, saving model to lstm.hdf5
Epoch 7/100

Epoch 00007: loss improved from 2.02476 to 1.96799, saving model to lstm.hdf5
Epoch 8/100

Epoch 00008: loss improved from 1.96799 to 1.92172, saving model to lstm.hdf5
Epoch 9/100

Epoch 00009: loss improved from 1.92172 to 1.88470, saving model to lstm.hdf5
Epoch 10/100

Epoch 00010: loss improved from 1.88470 to 1.85063, saving model to lstm.hdf5
Epoch 11/100

Epoch 00011: loss improved from 1.85063 to 1.82022, saving model to lstm.hdf5
E

<keras.callbacks.History at 0x7f45f65e0a20>

### Text Generation

In [0]:
# Generate 500 characters greedily from a randomly chosen training window.
# `filename` is the checkpoint path set in the training cell.
model.load_weights(filename)
# No re-compile here: the model is already compiled and predict() does not need it.

# pick a random seed
# randint's upper bound is exclusive, so len(dataX) makes every pattern eligible
# (the previous len(dataX)-1 could never select the last one).
start = numpy.random.randint(0, len(dataX))
pattern = dataX[start].tolist()
int_to_char = dict((i, c) for i, c in enumerate(chars))
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# generate characters
generated_chars = []
for i in range(500):
  # Same preprocessing as training: [1, time steps, 1], ids scaled by vocabulary size.
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  x = x / float(n_vocab)
  prediction = model.predict(x, verbose=0)
  # Greedy decoding: always take the most probable next character.
  index = int(numpy.argmax(prediction))
  generated_chars.append(int_to_char[index])
  # Slide the window: drop the oldest id, append the one just predicted.
  pattern = pattern[1:] + [index]
final_result = ''.join(generated_chars)
print(final_result)
print("\nDone.")

Seed:
"  and among
them "
ed tuitted tetlaie tatting nike the leet tarping aboutse io tepimbny seruon moog anice whatsedl goce iere anice aspearddy iom yery hoisge woder teadatdie taie anice aso asd teeaye hettiny seraies anice anl ie ganlet teparked tatticl asoear anl tound anl tuoaled oear oose anice whisg waie tatting as her oex uhe hot uoeeziog anl abouune oear tuote iere anice aspearddy iom yery hoisge woder teadatdie taie anice aso asd teeaye hettiny seraies anice anl ie ganlet teparked tatticl asoear anl tound anl

Done.


In [0]:
# Second sampling run: a fresh random seed window, again decoded greedily for 500 steps.
# `filename` is the checkpoint path set in the training cell.
model.load_weights(filename)
# The model is already compiled; predict() needs no re-compile, so none is done here.

# pick a random seed
# The upper bound of randint is exclusive; len(dataX) lets the final pattern be drawn too
# (len(dataX)-1 silently excluded it).
start = numpy.random.randint(0, len(dataX))
pattern = dataX[start].tolist()
int_to_char = dict((i, c) for i, c in enumerate(chars))
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# generate characters
predicted_chars = []
for i in range(500):
  # Model input mirrors training: one sample, len(pattern) steps, one scaled feature.
  x = numpy.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)
  prediction = model.predict(x, verbose=0)
  # argmax picks the single most likely next character id.
  index = int(numpy.argmax(prediction))
  predicted_chars.append(int_to_char[index])
  # Advance the context by one position, keeping its length constant.
  pattern.append(index)
  pattern = pattern[1:]
final_result = ''.join(predicted_chars)
print(final_result)
print("\nDone.")

Seed:
" th
their heads  "
oext asd teadh anioe tetlars wher suested anl toayed iir veart ooeered teeoiny anice anice whoygh oar exery goowr hoerse tame anice aspearddy iom yery hoisge woder teadatdie taie anice aso asd teeaye hettiny seraies anice anl ie ganlet teparked tatticl asoear anl tound anl tuoaled oear oose anice whisg waie tatting as her oex uhe hot uoeeziog anl abouune oear tuote iere anice aspearddy iom yery hoisge woder teadatdie taie anice aso asd teeaye hettiny seraies anice anl ie ganlet teparked tatticl 

Done.
