**Dataset Used:** Novel - The Top Five Regrets of the Dying

**Link to my dataset:** https://drive.google.com/file/d/1D43PRkn3OjQoqxC5IxwJggjfO3FUC6dD/view?usp=sharing

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dropout
from keras.utils.vis_utils import plot_model
from pickle import dump


### 1. Import Dataset

In [None]:
# Mount Google Drive (Colab only) so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the whole book into memory as a single string.
filename = '/content/drive/My Drive/Machine Learning Stuff/The_Top_Five_Regrets_of_the_Dying.txt'
# The context manager closes the handle even if read() raises; the encoding is
# stated explicitly so decoding does not depend on the platform's default locale.
with open(filename, 'rt', encoding='utf-8') as file:
    text = file.read()


In [None]:
# Preview only the opening of the book; echoing the full string floods the notebook output.
text[:500]



### 2. Preprocess the data until you are left with individual words, then represent each word as a Stanford GloVe word-embedding vector with a maximum of 100 values

In [None]:
# NLTK tokenization: download the 'punkt' tokenizer data that word_tokenize relies on.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Split the raw text into word and punctuation tokens, then preview the first 50.
word_tokens = word_tokenize(text)
word_tokens[:50]

['On',
 'a',
 'balmy',
 'summer',
 '’',
 's',
 'evening',
 'in',
 'a',
 'little',
 'country',
 'town',
 ',',
 'a',
 'conversation',
 'was',
 'underway',
 'that',
 'was',
 'like',
 'many',
 'cheerful',
 'conversations',
 'taking',
 'place',
 'simultaneously',
 'all',
 'over',
 'the',
 'world',
 '.',
 'It',
 'was',
 'two',
 'people',
 'simply',
 'catching',
 'up',
 'with',
 'each',
 'other',
 'and',
 'having',
 'a',
 'yarn',
 '.',
 'The',
 'difference',
 'with',
 'this']

In [None]:
# Keep purely alphabetic tokens only. This removes punctuation and also any
# token containing digits or symbols (e.g. numbers or the bare '’' token).
words = [word for word in word_tokens if word.isalpha()]

In [None]:
# Download the NLTK stop-word corpus used by the filtering step further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load NLTK's English stop-word list (entries are lowercase) and show it.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Lowercase every token and drop English stop words.
# A set gives O(1) membership tests; scanning the stop_words list once per
# token is needlessly slow for a corpus of tens of thousands of tokens.
stop_word_set = set(stop_words)
clean_words = [w for w in (word.lower() for word in words) if w not in stop_word_set]
clean_words[:50]


['balmy',
 'summer',
 'evening',
 'little',
 'country',
 'town',
 'conversation',
 'underway',
 'like',
 'many',
 'cheerful',
 'conversations',
 'taking',
 'place',
 'simultaneously',
 'world',
 'two',
 'people',
 'simply',
 'catching',
 'yarn',
 'difference',
 'conversation',
 'though',
 'could',
 'later',
 'identified',
 'one',
 'significant',
 'turning',
 'points',
 'person',
 'life',
 'person',
 'cec',
 'editor',
 'great',
 'folk',
 'music',
 'magazine',
 'australia',
 'called',
 'trad',
 'equally',
 'known',
 'loved',
 'support',
 'folk',
 'music',
 'australia']

In [None]:
# Number of tokens remaining after punctuation and stop-word removal.
len(clean_words)

43835

In [None]:
# Re-join the cleaned tokens into one space-separated document for the Keras tokenizer.
clean_text = ' '.join(clean_words)
# Show only a short preview; echoing the whole corpus bloats the notebook output.
clean_text[:500]



In [None]:
# Vectorise the data
from keras.preprocessing.text import Tokenizer

In [None]:
# Keras tokenizer with default settings; it maps each distinct word to an integer index.
t = Tokenizer()

In [None]:
# Build the word -> index vocabulary (t.word_index) from the cleaned text, passed as a single document.
t.fit_on_texts([clean_text])

In [None]:
# Encode the cleaned text as a flat list of integer word indices ([0] selects the only document).
encoded = t.texts_to_sequences([clean_text])[0]

In [None]:
# Group words into sequences of 6 (5 as input 1 as output)
SEQ_LEN = 6   # 5 context words + 1 target word
# Stride between window starts. SEQ_LEN gives the original non-overlapping
# chunks; a stride of 1 would give a sliding window with far more samples.
SEQ_STEP = SEQ_LEN
# Only full-length windows are kept: the range stops at the last start index
# that still leaves SEQ_LEN tokens, so a short tail is discarded.
big_arr = [encoded[start:start + SEQ_LEN]
           for start in range(0, len(encoded) - SEQ_LEN + 1, SEQ_STEP)]


[116, 33, 170, 435, 28, 54]

In [None]:
# Convert the list of sequences into a 2-D integer array of width 6.
# Every sequence in big_arr already has exactly 6 tokens, so no padding or
# truncation actually takes place here; 'post' would only matter for shorter rows.

padded_docs = pad_sequences(big_arr, maxlen=6, padding= 'post' )
padded_docs


array([[2627, 1493,  498,   54,  303,  387],
       [ 405, 1327,   48,   57,  910,  499],
       [ 353,  123, 3519,   95,  117,   29],
       ...,
       [ 457,  292,  778,  412, 1704,  116],
       [ 234,  369, 1157,  985, 1304,  273],
       [ 116,   33,  170,  435,   28,   54]], dtype=int32)

In [None]:
# +1 because Keras word indices start at 1; index 0 is reserved (used for padding).
vocab_size = len(t.word_index) + 1
vocab_size

6025

In [None]:
# Convert the 100-dimensional GloVe text file to word2vec text format, written
# next to the original. The call returns (vocabulary size, vector dimensions).
# NOTE: the word2vec text format starts with a "<count> <dims>" header line,
# which is not a word vector.
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = '/content/drive/My Drive/Machine Learning Stuff/glove.6B/glove.6B.100d.txt'
word2vec_output_file = '/content/drive/My Drive/Machine Learning Stuff/glove.6B/glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [None]:
# Load the pre-trained vectors into a {word: float32 coefficient array} dict.
embeddings_index = dict()
# `with` guarantees the file is closed even if a line fails to parse; the
# encoding is explicit because the GloVe vocabulary contains non-ASCII tokens.
with open( '/content/drive/My Drive/Machine Learning Stuff/glove.6B/glove.6B.100d.txt.word2vec', encoding='utf-8' ) as f:
  for line_no, line in enumerate(f):
    values = line.split()
    # The word2vec text format begins with a "<count> <dims>" header. Without
    # this check it is stored as a bogus word with a 1-element vector, which is
    # why the count used to read 400001 instead of 400000.
    if line_no == 0 and len(values) == 2:
      continue
    word = values[0]
    coefs = asarray(values[1:], dtype= 'float32' )
    embeddings_index[word] = coefs
print( ' Loaded %s word vectors. ' % len(embeddings_index))

 Loaded 400001 word vectors. 


In [None]:
# Build the (vocab_size x 100) weight matrix for the Embedding layer:
# row i holds the pre-trained vector of the word whose tokenizer index is i.
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is None:
    # No pre-trained vector for this word: its row stays all zeros.
    continue
  embedding_matrix[i] = embedding_vector

In [None]:
# Sanity check: expect (vocab_size, 100).
embedding_matrix.shape

(6025, 100)

### 3. Divide your data into X (the five (5) input words) and Y (the next word). The model should take a maximum of five words and predict the next word.

In [None]:
# Shuffle the data to reduce difference in training and validation data
def shuffle(data):
    """Shuffle ``data`` in place along its first axis and return it.

    The earlier hand-rolled swap ``data[i], data[k] = data[k], data[i]`` is
    only correct for Python lists. On a 2-D NumPy array the right-hand side
    holds *views* of the two rows, so row ``i`` is overwritten before its old
    contents can be stored in row ``k``: rows end up duplicated and others are
    lost. ``np.random.shuffle`` permutes items correctly for both lists and
    arrays.

    Args:
        data: A mutable sequence or NumPy array; it is modified in place.

    Returns:
        The same ``data`` object, with its items (rows) in random order.
    """
    np.random.shuffle(data)
    return data

In [None]:
# shuffle() works in place, so pass a copy: padded_docs keeps its original
# order and re-running this cell always starts from the same input.
shuffled_docs = shuffle(padded_docs.copy())

In [None]:
# Split each 6-token row: the first five columns are the model input and the
# last column is the target word, one-hot encoded over the whole vocabulary.
X = shuffled_docs[:, :-1]
y = to_categorical(shuffled_docs[:, -1], num_classes = vocab_size)

### 4. Create an LSTM model

In [None]:
# Build the next-word model: frozen pre-trained embeddings -> two stacked LSTMs
# -> dense head with a softmax over the vocabulary.
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=5, trainable=False)
lstm_model = Sequential([
    e,
    LSTM(100, return_sequences=True),  # emit the full sequence for the next LSTM
    LSTM(100),
    Dropout(0.3),
    Dense(100, activation = 'relu'),
    Dropout(0.3),
    Dense(vocab_size, activation = 'softmax'),  # one probability per vocabulary index
])
lstm_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
lstm_model.summary()
plot_model(lstm_model, to_file='model.png', show_shapes=True)


### 5. Train the model applying cross validation with a reasonable batch size, evaluate the performance of the model and regularize the model to optimize its performance

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the sequences; random_state fixes which rows land in each part.
# NOTE(review): this is a single hold-out split, not k-fold cross-validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Training attempt 1
# X_test / y_test double as the validation set, so the per-epoch validation
# metrics are measured on the same hold-out data later treated as test data.
lstm_model.fit(X_train, y_train, batch_size = 100, epochs = 50, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f6511185650>

In [None]:
# 2nd training attempt - Decreased batch size to 10
# NOTE: lstm_model is the same object that was already fitted for 50 epochs
# above, so this call continues training from those weights; it is not a
# fresh run and the two attempts are not directly comparable.
lstm_model.fit(X_train, y_train, batch_size = 10, epochs = 50, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f65112c0150>

In [None]:
# Test the model

in_text = "On a balmy summer evening"

# Words missing from the tokenizer's vocabulary are dropped here. That includes
# stop words such as "on" and "a", which were removed before the tokenizer was fitted.
encoded_test = t.texts_to_sequences([in_text])[0]

# Pad/truncate to the 5-token input length the model was built for
encoded_test = pad_sequences([encoded_test], maxlen=5, padding='post')

# predict probabilities for each word, then keep the index of the most likely one
yhat = np.argmax(lstm_model.predict(encoded_test), axis=-1)

# index_word is the tokenizer's index -> word mapping, so a direct lookup
# replaces the linear scan over word_index. yhat has one entry (batch of 1);
# index 0 is the padding slot and has no word, hence the '' default.
out_word = t.index_word.get(int(yhat[0]), '')

print(in_text + "->" + out_word)

On a balmy summer evening->taking


In [None]:
# Save the trained model to an HDF5 file in the current working directory.
lstm_model.save('nextword_model.h5')

In [None]:
# Save tokenizer
# The context manager flushes and closes the pickle file deterministically;
# the bare open() call used before left the handle open.
with open('nextword_tokenizer.pkl', 'wb') as tokenizer_file:
    dump(t, tokenizer_file)