# <center> Text Generation - Next Word Prediction </center>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the dataset directory tree and print the path of every directory found
# (file names are not printed).
for dirname, _, filenames in os.walk('/kaggle/input/nlp-specialization-data/'):
    print(dirname)
    # Per-file listing, left disabled:
    #for filename in filenames:
    #   print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Functions for Processing Text

### a. Reading in files as a string text

In [None]:
def read_file(filepath):
    """Return the complete contents of the text file at ``filepath``.

    The file is opened in text mode using the platform's default encoding
    and is closed again before the function returns.
    """
    with open(filepath) as handle:
        return handle.read()

In [None]:
# Quick check that len() on a string counts its characters.
len("rahul")

In [None]:
# Number of characters that read_file returns for the novel.
len(read_file('/kaggle/input/nlp-specialization-data/Novel - Moby-Dick By Herman Melville.txt'))

In [None]:
# Preview the first 5000 characters of the novel text.
print(read_file('/kaggle/input/nlp-specialization-data/Novel - Moby-Dick By Herman Melville.txt')[:5000])

### b. Tokenize and Clean Text

In [None]:
import spacy
import en_core_web_sm

# Load the en_core_web_sm spaCy pipeline; separate_punc uses it to tokenize text.
nlp = en_core_web_sm.load()
# Raise the pipeline's max_length limit so long texts are accepted.
# NOTE(review): 1198623 is presumably the character count of the full novel — confirm.
nlp.max_length = 1198623

In [None]:
def read_file(filepath, max_chars=250000):
    """Read a text file and return at most ``max_chars`` leading characters.

    Parameters
    ----------
    filepath : str
        Path of the text file to read. It is opened in text mode with the
        platform's default encoding.
    max_chars : int or None, optional
        Maximum number of characters to return, counted from the start of
        the file. The default of 250000 keeps the excerpt size this notebook
        has always worked with. Pass ``None`` to return the whole file.

    Returns
    -------
    str
        The first ``max_chars`` characters of the file, or fewer if the file
        is shorter.

    Raises
    ------
    ValueError
        If ``max_chars`` is negative.
    """
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be None or a non-negative integer")

    with open(filepath) as f:
        # Only pull the requested prefix from disk instead of reading the
        # whole file and slicing afterwards.
        str_text = f.read() if max_chars is None else f.read(max_chars)

    return str_text

In [None]:
def separate_punc(doc_text):
    """Tokenize ``doc_text`` with the global ``nlp`` pipeline and return lowercase tokens.

    A token is dropped when its text occurs as a substring of the
    punctuation/whitespace string below; every remaining token is
    lower-cased and collected in document order.
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for token in nlp(doc_text):
        # ``in`` on a string is a substring test, not a per-character test.
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

In [None]:
# Read the novel text via read_file and turn it into a list of lowercase,
# punctuation-free tokens with separate_punc.
d = read_file('/kaggle/input/nlp-specialization-data/Novel - Moby-Dick By Herman Melville.txt')
tokens = separate_punc(d)

In [None]:
# separate_punc builds its result with a list comprehension, so this is a list.
type(tokens)

In [None]:
# Number of tokens that survived the punctuation filter.
len(tokens)

In [None]:
# Peek at the first ten tokens.
tokens[:10]

### c. Create Sequences of Tokens

In [None]:
# organize into sequences of tokens
train_len = 25+1 # 25 training words , then one target word

# Empty list of sequences
text_sequences = []

# Slide a window of train_len tokens over the corpus, one token at a time.
# `i` is the exclusive end of the window, so the range must stop at
# len(tokens) + 1; stopping at len(tokens) would silently drop the final
# window, the one that ends on the very last token.
for i in range(train_len, len(tokens) + 1):

    # Grab train_len consecutive tokens ending just before position i
    seq = tokens[i-train_len:i]

    # Add to list of sequences
    text_sequences.append(seq)

In [None]:
# Show the first 1000 characters of the raw text for comparison with the sequences.
print(read_file('/kaggle/input/nlp-specialization-data/Novel - Moby-Dick By Herman Melville.txt')[:1000])

In [None]:
# The first three windows; each one starts one token later than the previous one.
' '.join(text_sequences[0])

In [None]:
' '.join(text_sequences[1])

In [None]:
' '.join(text_sequences[2])

In [None]:
len(text_sequences) # Number of sequences; each one holds train_len (25 + 1 = 26) tokens

In [None]:
len(tokens) # Total number of tokens separate_punc kept from the text that was read

In [None]:
# Length and contents of the first window.
print(len(text_sequences[0]))
print(text_sequences[0])

### d. Keras Tokenization

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# integer encode sequences of words
tokenizer = Tokenizer()
# Build the word index from the token windows. The windows overlap, so
# tokenizer.word_counts counts window occurrences rather than how often a
# word appears in the underlying text.
tokenizer.fit_on_texts(text_sequences)
# Replace every token in every window by its integer index.
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# Length and integer encoding of the first window.
print(len(sequences[0]))
print(sequences[0])

In [None]:
type(tokenizer.index_word)

In [None]:
# The same first window in word form, for comparison with the integers above.
' '.join(text_sequences[0])

In [None]:
# Print the first 20 (index ---> word) entries of the tokenizer's index_word mapping.
i=0
for a in tokenizer.index_word:
    print(a,"--->",tokenizer.index_word[a])
    i+=1
    if i==20 : break 

In [None]:
# Decode the first encoded window back to words, one "index : word" line per token.
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

In [None]:
# Print the first 10 (word, count) pairs of tokenizer.word_counts.
i=0
for a in tokenizer.word_counts:
    print((a,tokenizer.word_counts[a]))
    i+=1
    if i==10 : break 

In [None]:
# Keras' Tokenizer numbers words starting at 1 and reserves index 0 (it is
# never assigned to a word), so the largest word index equals the number of
# distinct words. The Embedding input dimension and the one-hot / softmax
# width must both be "largest index + 1", hence the + 1 here; without it the
# highest word index is out of range.
vocabulary_size = len(tokenizer.word_counts) + 1
vocabulary_size

In [None]:
# Print the words stored under indices 6990..6999 of tokenizer.index_word.
# NOTE(review): the bounds are hard-coded; this raises KeyError if the
# vocabulary has fewer than 6999 entries — confirm against vocabulary_size.
i=6990
for a in range(i,7000):
    print(a,"--->",tokenizer.index_word[a])
    #i+=1
    #if i==6999 : break 

In [None]:
# tokenizer.index_word

### e. Convert to Numpy Matrix

In [None]:
import numpy as np

In [None]:
# Number of encoded windows.
len(sequences)

In [None]:
# Number of word indices in the first window.
len(sequences[0])

In [None]:
# Convert the list of index lists to a NumPy array so it can be sliced by column.
# Expected to be 2-D (one row per window) as long as every window has the same length.
sequences = np.array(sequences)

In [None]:
sequences.shape

In [None]:
sequences

## 2. Creating an LSTM based model

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [None]:
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarise the next-word prediction network.

    vocabulary_size : number of rows of the embedding table and width of the
                      softmax output layer
    seq_len         : number of word indices in each input sequence

    The stack is: a 25-dimensional embedding, two LSTM layers of 150 units
    (the first one returns its full sequence so the second can consume it),
    a 150-unit ReLU dense layer and a softmax layer over the vocabulary.
    The model is compiled with categorical cross-entropy and Adam, its
    summary is printed, and the compiled model is returned.
    """
    layers = [
        Embedding(vocabulary_size, 25, input_length=seq_len),
        LSTM(150, return_sequences=True),
        LSTM(150),
        Dense(150, activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    ]
    network = Sequential(layers)

    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    network.summary()

    return network

## 3. Split Sequences into Features (X) and Labels (y)

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
print(sequences.shape)
sequences

In [None]:
# First 25 words
# (every column except the last one: the model input)
print(sequences[:,:-1].shape)
sequences[:,:-1]

In [None]:
# last Word
# (the final column: the word the model should predict)
print(sequences[:,-1].shape)
sequences[:,-1]

In [None]:
# Features: all word indices of each window except the last one.
X = sequences[:,:-1]

In [None]:
X.shape

In [None]:
# Labels: the last word index of each window.
y = sequences[:,-1]

In [None]:
y.shape

In [None]:
# One-hot encode the labels; num_classes must be larger than the biggest index in y.
y = to_categorical(y, num_classes=vocabulary_size)

In [None]:
y.shape

In [None]:
# Input length for the model: number of columns in X.
seq_len = X.shape[1]

In [None]:
seq_len

## 4. Training the Model

In [None]:
# define model
model = create_model(vocabulary_size, seq_len)
#model = create_model(vocabulary_size, seq_len)

---

----

## 5. Fit model

In [None]:
# Train for 250 epochs in batches of 512, holding out 20% of the samples for
# validation. The hold-out fraction is set with `validation_split`;
# `validation_batch_size` (used previously) only sets the batch size for
# validation passes and does not create a validation set.
model.fit(X, y, batch_size=512, epochs=250, verbose=1, validation_split=0.20)

### Download Model Object

In [None]:
from pickle import dump,load

In [None]:
# save the model to file
model.save('epochBIG.h5')
# save the tokenizer; the context manager flushes and closes the pickle file
# even if dump() raises, instead of leaving an open handle behind
with open('epochBIG', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

## 6. Generating New Text

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [None]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Greedily generate num_gen_words words that continue seed_text.

    On every step the running text is encoded with the tokenizer, cut/padded
    to seq_len word indices, fed to the model, and the most probable word is
    appended to the running text before the next step.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    RETURNS:
    The generated words joined by single spaces (the seed text itself is
    not part of the result).
    '''

    # Final Output
    output_text = []

    # Initial Seed Sequence
    input_text = seed_text

    # Create num_gen_words
    for _ in range(num_gen_words):

        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Bring the sequence to the length the model was trained on;
        # truncating='pre' drops the oldest words when the text is too long
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Predict class probabilities for each word. verbose=0 stops Keras
        # from printing a progress bar for every single generated word.
        word_probs = model.predict(pad_encoded, verbose=0)[0]

        # Index 0 is the tokenizer's reserved/padding index and has no entry
        # in index_word, so choosing it would raise KeyError. Take the best
        # score among the real word indices (1 and up) instead; whenever the
        # overall argmax is a real word this picks exactly the same word.
        pred_word_ind = int(np.argmax(word_probs[1:])) + 1

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

## 7. Grab a random Text Sequence

In [None]:
print(text_sequences[0])

In [None]:
import random
random_pick = random.randint(0,len(text_sequences))

In [None]:
# The randomly chosen window, as a list of tokens.
random_seed_text = text_sequences[random_pick]

In [None]:
print(random_seed_text)

In [None]:
# Join the tokens with spaces to get the raw string generate_text expects as seed.
seed_text = ' '.join(random_seed_text)
seed_text

In [None]:
model = load_model('epochBIG.h5')

In [None]:
tokenizer = load(open('epochBIG', 'rb'))

In [None]:
# Generate three words that continue the randomly picked seed window.
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=3)

In [None]:
# A hand-written seed string.
text=' It is a way I have of driving off the spleen and regulating the circulation.  Whenever I find'
print(text)

In [None]:
# Generate three words that continue the hand-written seed.
generate_text(model,tokenizer,seq_len,seed_text=text,num_gen_words=3)

In [None]:
# Show the 1000 characters that follow the 250000-character training excerpt.
# read_file() returns at most the first 250000 characters, so slicing its
# result at [250000:251000] always produced an empty string; read the file
# directly instead.
with open('/kaggle/input/nlp-specialization-data/Novel - Moby-Dick By Herman Melville.txt') as novel_file:
    print(novel_file.read()[250000:251000])

In [None]:
# Seed string for a single-word prediction.
# NOTE(review): presumably copied from the text beyond the training excerpt — confirm.
test_text='Three better,more likely sea-officers and men, each in his own different way,could not readily be found, and they were every'
print(test_text)

In [None]:
# Predict only the next word for test_text.
generate_text(model,tokenizer,seq_len,seed_text=test_text,num_gen_words=1)

In [None]:
# Overwrite seed_text with a fixed, already tokenized-looking passage.
seed_text = "landsman has had fresh fruit to his daily hand and broken the world 's fresh bread to my mouldy crusts away whole oceans away from that"

In [None]:
# Generate three words that continue the fixed passage.
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=3)