## NAME: VIJAY KRISHNA A B
## POSITION: DATA SCIENCE INTERN

### TASK: NEXT WORD PREDICTION USING RNN LSTM MODEL
### DIFFICULTY: ADVANCED LEVEL

### IMPORTING THE NECESSARY MODULES


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import pickle
import numpy as np
import os

### UPLOAD THE DATASET


In [None]:
# Upload the training corpus through Colab's file-upload helper.
# The captured cell output shows the file saved as 1661-0.txt, which is the
# path the preprocessing cell opens.
from google.colab import files
uploaded = files.upload()

Saving 1661-0.txt to 1661-0.txt


### PREPROCESS THE DATA BY REPLACING UNWANTED CHARACTERS

In [None]:
# Read the corpus line by line; the context manager guarantees the file
# handle is closed even if reading fails.
with open('1661-0.txt', 'r', encoding = 'utf8') as file:
  lines = file.readlines()

#print(lines[0:50])

# Join all lines into one string exactly once (joining inside a loop over
# the lines would redo the same work for every line).
data = ' '.join(lines)

# Drop line breaks, the byte-order mark and typographic quote characters in
# a single pass. Note that removing the apostrophe-like quotes merges
# contractions into one token (e.g. don’t -> dont).
unwanted_chars = str.maketrans('', '', '\n\r\ufeff’‘“”')
data = data.translate(unwanted_chars)

# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())
data[:500]

"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: The Adventures of Sherlock Holmes Author: Arthur Conan Doyle Release Date: November 29, 2002 [EBook #1661] Last Updated: May 20, 2019 Language: English Character set en"

In [None]:
# Number of characters in the cleaned corpus string.
len(data)

572175

### USE TOKENIZER TO CONVERT TEXT TO SEQUENCES

In [None]:
# Build the word -> integer-id vocabulary from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Persist the fitted tokenizer so prediction can reuse the same vocabulary.
# The context manager flushes and closes the file deterministically.
with open('token.pkl', 'wb') as token_file:
  pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of word ids ([0] unwraps the single text).
sequence_data = tokenizer.texts_to_sequences([data])[0]
print(sequence_data[:15])

[141, 4615, 1, 977, 5, 125, 32, 45, 542, 2155, 2156, 27, 978, 14, 22]


In [None]:
# Number of word ids in the encoded corpus.
len(sequence_data)

108547

In [None]:
# Number of classes for the embedding table and the softmax output layer.
# The +1 accounts for index 0, which the Keras Tokenizer never assigns to a
# word (its word indices start at 1).
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8431

In [None]:
sequence_length = 10

# Slide a window of (sequence_length + 1) word ids over the corpus, one step
# at a time: the first sequence_length ids are the model input and the final
# id is the word to predict.
window_size = sequence_length + 1
sequences = np.array([
  sequence_data[end - window_size : end]
  for end in range(window_size, len(sequence_data) + 1)
])
sequences[:10]

array([[ 141, 4615,    1,  977,    5,  125,   32,   45,  542, 2155, 2156],
       [4615,    1,  977,    5,  125,   32,   45,  542, 2155, 2156,   27],
       [   1,  977,    5,  125,   32,   45,  542, 2155, 2156,   27,  978],
       [ 977,    5,  125,   32,   45,  542, 2155, 2156,   27,  978,   14],
       [   5,  125,   32,   45,  542, 2155, 2156,   27,  978,   14,   22],
       [ 125,   32,   45,  542, 2155, 2156,   27,  978,   14,   22,    1],
       [  32,   45,  542, 2155, 2156,   27,  978,   14,   22,    1,  263],
       [  45,  542, 2155, 2156,   27,  978,   14,   22,    1,  263,    5],
       [ 542, 2155, 2156,   27,  978,   14,   22,    1,  263,    5,  382],
       [2155, 2156,   27,  978,   14,   22,    1,  263,    5,  382, 2157]])

### SEPARATE THE DEPENDENT AND INDEPENDENT VALUES INTO NUMPY ARRAYS

In [None]:
# Each row of `sequences` holds sequence_length context ids followed by the
# id of the next word: the context becomes X, the trailing id becomes y.
X = np.array([window[:sequence_length] for window in sequences])
y = np.array([window[sequence_length] for window in sequences])

print("X:", X[:10])
print("y:", y[:10])

X: [[ 141 4615    1  977    5  125   32   45  542 2155]
 [4615    1  977    5  125   32   45  542 2155 2156]
 [   1  977    5  125   32   45  542 2155 2156   27]
 [ 977    5  125   32   45  542 2155 2156   27  978]
 [   5  125   32   45  542 2155 2156   27  978   14]
 [ 125   32   45  542 2155 2156   27  978   14   22]
 [  32   45  542 2155 2156   27  978   14   22    1]
 [  45  542 2155 2156   27  978   14   22    1  263]
 [ 542 2155 2156   27  978   14   22    1  263    5]
 [2155 2156   27  978   14   22    1  263    5  382]]
y: [2156   27  978   14   22    1  263    5  382 2157]


In [None]:
# One-hot encode the target word ids into vocab_size columns, as required by
# the categorical cross-entropy loss used for training.
# NOTE(review): with the sizes printed in this notebook (108,547 tokens,
# vocabulary of 8,431) this is a dense float32 matrix of roughly 3.7 GB;
# integer targets with a sparse cross-entropy loss would avoid it.
y = to_categorical(y, num_classes = vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### BUILD THE MODEL WITH

> EMBEDDING LAYER

> LSTM(X2)

> DENSE(X2)


In [None]:
# Network: word embedding -> two stacked LSTMs -> hidden dense layer ->
# softmax over the vocabulary (one probability per candidate next word).
model = Sequential([
  # Map each word id to a 10-dimensional vector.
  Embedding(vocab_size, 10, input_length = sequence_length),
  # The first LSTM returns the full sequence so the second LSTM can consume it.
  LSTM(1000, return_sequences = True),
  LSTM(1000),
  Dense(1000, activation = 'relu'),
  Dense(vocab_size, activation = 'softmax'),
])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 10)            84310     
_________________________________________________________________
lstm (LSTM)                  (None, 10, 1000)          4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 8431)              8439431   
Total params: 21,572,741
Trainable params: 21,572,741
Non-trainable params: 0
_________________________________________________________________


### CREATE MODEL CHECKPOINT AND TRAIN THE MODEL WITH CATEGORICAL CROSS ENTROPY LOSS AND ADAM OPTIMIZER

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Categorical cross-entropy matches the one-hot encoded targets in y.
model.compile(
  optimizer = Adam(learning_rate = 0.001),
  loss = 'categorical_crossentropy',
)

# Save the model to next_words.h5 whenever the training loss improves; no
# validation data is passed to fit(), so 'loss' is the only monitored value.
checkpoint = ModelCheckpoint(
  'next_words.h5',
  monitor = 'loss',
  verbose = 1,
  save_best_only = True,
)

model.fit(
  X,
  y,
  epochs = 15,
  batch_size = 64,
  callbacks = [checkpoint],
)

Epoch 1/15

Epoch 00001: loss improved from inf to 6.41305, saving model to next_words.h5
Epoch 2/15

Epoch 00002: loss improved from 6.41305 to 5.83293, saving model to next_words.h5
Epoch 3/15

Epoch 00003: loss improved from 5.83293 to 5.47957, saving model to next_words.h5
Epoch 4/15

Epoch 00004: loss improved from 5.47957 to 5.20990, saving model to next_words.h5
Epoch 5/15

Epoch 00005: loss improved from 5.20990 to 4.98575, saving model to next_words.h5
Epoch 6/15

Epoch 00006: loss improved from 4.98575 to 4.77909, saving model to next_words.h5
Epoch 7/15

Epoch 00007: loss improved from 4.77909 to 4.57470, saving model to next_words.h5
Epoch 8/15

Epoch 00008: loss improved from 4.57470 to 4.35970, saving model to next_words.h5
Epoch 9/15

Epoch 00009: loss improved from 4.35970 to 4.11532, saving model to next_words.h5
Epoch 10/15

Epoch 00010: loss improved from 4.11532 to 3.82182, saving model to next_words.h5
Epoch 11/15

Epoch 00011: loss improved from 3.82182 to 3.45073

<keras.callbacks.History at 0x7faee99de250>

### LOAD THE MODEL AND THE TOKENIZER FOR PREDICTION

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Restore the checkpointed model written during training.
model = load_model("next_words.h5")

# Restore the tokenizer fitted on the training corpus. The context manager
# closes the file handle once loading is done.
# SECURITY: unpickling executes code embedded in the file, so only load a
# token.pkl that was produced by this notebook or another trusted source.
with open('token.pkl', 'rb') as token_file:
  tokenizer = pickle.load(token_file)

# NOTE(review): X and y are the same arrays the model was trained on, so this
# is a training-set figure, not a held-out evaluation.
model.evaluate(X, y)



0.8078064918518066

### PREDICTION:

In [None]:
def _input_window(model):
  """Return the fixed number of time steps `model` expects, or None if unknown."""
  try:
    shape = model.input_shape
  except (AttributeError, ValueError):
    return None
  if isinstance(shape, (tuple, list)) and len(shape) == 2 and isinstance(shape[1], int):
    return shape[1]
  return None


def predict_next_word(model, tokenizer, text, sequence_length = None):
  """Return the word the model considers most likely to follow `text`.

  Args:
    model: Object with a `predict` method that takes an integer array of
      shape (1, n_tokens) and returns one score per vocabulary index.
    tokenizer: Fitted tokenizer exposing `texts_to_sequences` and `word_index`.
    text: Prompt whose next word should be predicted.
    sequence_length: Number of trailing tokens fed to the model. When None,
      it is taken from `model.input_shape` if that reports a fixed length;
      None or 0 with no fixed length means the whole prompt is used.

  Returns:
    The predicted word, or "" when the prompt contains no known words or the
    predicted index has no entry in the vocabulary.
  """
  token_ids = tokenizer.texts_to_sequences([text])[0]

  if sequence_length is None:
    sequence_length = _input_window(model)
  if sequence_length:
    # Keep only the most recent tokens so the input matches the window size.
    token_ids = token_ids[-sequence_length:]

  # Nothing to feed the model (empty prompt or no word in the vocabulary).
  if not token_ids:
    return ""

  sequence = np.array([token_ids])
  pred = np.argmax(model.predict(sequence))

  # word_index maps word -> id, so find the word whose id was predicted.
  return next(
    (word for word, index in tokenizer.word_index.items() if index == pred),
    "",
  )

In [None]:
# Demo: 10-word prompt, the same number of words as sequence_length.
to_predict = "At the end of the performance the actor looked very"
print(predict_next_word(model, tokenizer, to_predict))

little


In [None]:
# Demo: another 10-word prompt.
to_predict = "By the time you learn your lessons I will be"
print(predict_next_word(model, tokenizer, to_predict))

trying


In [None]:
# Demo: another 10-word prompt.
to_predict = "It will be fun to go there and have a"
print(predict_next_word(model, tokenizer, to_predict))

marriage


In [None]:
# Demo: another 10-word prompt.
to_predict = "The children were woken up by the noise in the"
print(predict_next_word(model, tokenizer, to_predict))

garden


In [None]:
# Demo: a 16-word prompt.
# NOTE(review): the model was trained on 10-token windows
# (sequence_length = 10), so this prompt is longer than the training inputs.
to_predict = "I forgot to tell you that you received a message from him. It is on your"
print(predict_next_word(model, tokenizer, to_predict))

pride
