<a href="https://colab.research.google.com/github/abhijitsahoo0790/text_generation_using_LSTM/blob/master/main_text_generation_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Google Colab environment so that files kept
# on Drive can be reached from this notebook.
from os.path import join
from google.colab import drive

# Mount point handed to drive.mount(); later cells build the project path
# underneath it with join(ROOT, ...).
ROOT = "/content/drive"
drive.mount(ROOT)

In [None]:
"""
Run this cell once to clone the GitHub project into Google Drive. It throws an error if it is executed after the project directory has already been created.
"""
# Drive mount point and the project folder underneath it.
ROOT = "/content/drive"
PROJ = "My Drive/Colab Notebooks/text_generation_using_LSTM" # Custom path of the project folder, relative to ROOT.
PROJECT_PATH = join(ROOT, PROJ)
# Shell commands (IPython "!" syntax): create the project directory, then
# clone the GitHub repository into it.
!mkdir "{PROJECT_PATH}"
!git clone https://github.com/abhijitsahoo0790/text_generation_using_LSTM.git "{PROJECT_PATH}"

In [None]:
# Drive mount point and the project folder underneath it.
ROOT = "/content/drive"
PROJ = "My Drive/Colab Notebooks/text_generation_using_LSTM" # Custom path of the project folder, relative to ROOT.
PROJECT_PATH = join(ROOT, PROJ)
# IPython magic: switch the working directory to the project folder.
%cd "{PROJECT_PATH}"
# Shell command: bring the local checkout up to date with origin/master.
!git pull origin master

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import brown
from nltk.tokenize.treebank import TreebankWordDetokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import re
import copy
import math
import os
import sys
import traceback
import logging
# Send all log records (DEBUG and above) to log.txt in the current working
# directory; filemode='w' truncates the file each time this cell runs.
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', 
                    filename='log.txt', filemode='w', level=logging.DEBUG, 
                    datefmt='%Y-%m-%d %H:%M:%S')
# Marker words wrapped around every sentence before it is enumerated. They
# consist only of lower-case letters, so they pass through the
# '[^A-Za-z ]+' filter and the lower-casing used during text processing.
START_DELIMITER = "ssttaarrt"
END_DELIMITER = "eenndd"
# Number of consecutive word ids that form one input pattern; the word id
# that follows each pattern is its prediction target.
WINDOW_LENGTH = 20

Using TensorFlow backend.


In [None]:
def fetch_the_corpora_using_NLTK():
    """
    Download the Brown corpus through NLTK and return it as plain sentences.

    Every tokenised sentence of the corpus is joined, the quote tokens
    '``' and "''" are replaced by '"', a single '`' is replaced by "'",
    and the tokens are detokenised into one string with
    TreebankWordDetokenizer.

    Returns
    -------
    brown_natural : list of str
        One detokenised string per sentence of the Brown corpus.

    Raises
    ------
    RuntimeError
        If nltk.download() reports that the corpus could not be downloaded.
    """
    corpous_name = "brown"
    status = nltk.download(corpous_name)
    if not status:
        # Fail with an explicit error; previously the function fell through
        # to the return statement and crashed on an unassigned local.
        logging.error("Couldn't download the "+ corpous_name+" corpus")
        raise RuntimeError("Couldn't download the " + corpous_name + " corpus")

    logging.info("Downloaded Brown corpus")
    mdetok = TreebankWordDetokenizer()
    brown_natural = [
        mdetok.detokenize(
            ' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'").split()
        )
        for sent in brown.sents()
    ]
    logging.info("Processed Brown corpus as text")
    return brown_natural

def enumerate_text_using_word_enum_dict(unified_corpora, word_enum_dict):
    """
    Convert the whole corpus into a flat sequence of word ids.

    Each sentence is wrapped in START_DELIMITER / END_DELIMITER, all
    sentences are joined into one lower-cased text, every run of characters
    other than letters and spaces is replaced by a single space, and the
    resulting words are looked up in word_enum_dict. Words that are not
    present in word_enum_dict are skipped.

    Parameters
    ----------
    unified_corpora : list of str
        The sentences of the corpus.
    word_enum_dict : dict
        Word as key and its integer enumeration as the value.

    Returns
    -------
    complete_text_enumerated : list of int
        Ids of the known words, in the order they occur in the text.

    """
    wrapped_sentences = []
    for sentence in unified_corpora:
        wrapped_sentences.append(START_DELIMITER + " " + sentence + " " + END_DELIMITER)

    lowered_text = " ".join(wrapped_sentences).lower()
    letters_only = re.sub('[^A-Za-z ]+', ' ', lowered_text)
    normalised_text = re.sub(' +', ' ', letters_only).strip()

    complete_text_enumerated = []
    for token in normalised_text.split(" "):
        if token in word_enum_dict:
            complete_text_enumerated.append(word_enum_dict[token])
    return complete_text_enumerated

def enumerate_unique_words(text_corpus):
    """
    Enumerate unique words and return its dictionary and reversed-dictionary

    Every run of characters other than letters and spaces is replaced by a
    space, the text is split into words and the distinct words are numbered
    in sorted order. START_DELIMITER and END_DELIMITER always receive the
    two highest ids.

    Parameters
    ----------
    text_corpus : str
        The complete text corpus as one string.

    Returns
    -------
    word_enum_dict
        word as key and its integer enumeration as the value.
    reversed_word_enum_dict
        word as value and its integer enumeration as the key.
    """
    cleaned_text = re.sub('[^A-Za-z ]+', ' ', text_corpus)
    delimiters = [START_DELIMITER, END_DELIMITER]
    # Sorting makes the numbering reproducible: iteration order of a set of
    # strings changes between interpreter sessions, which would make the ids
    # disagree with a model that was trained and saved in an earlier session.
    # The delimiters are removed first so they can never be listed twice.
    unique_words = sorted(set(cleaned_text.split()) - set(delimiters))
    unique_words = unique_words + delimiters
    # enumerate unique words
    word_enum_dict = {word: index for index, word in enumerate(unique_words)}
    reversed_word_enum_dict = dict(enumerate(unique_words))
    return [word_enum_dict, reversed_word_enum_dict]

def generate_sequence_data_for_LSTM(complete_text_enumerated, num_classes=None):
    """
    Generate pattern sequences of length as specified by WINDOW_LENGTH and 
    also generate target of the patterns generated.

    A window of WINDOW_LENGTH ids is slid over the text one position at a
    time; the id that immediately follows each window is its target.

    Parameters
    ----------
    complete_text_enumerated : list of int
        Enumerated text sequence.
    num_classes : int, optional
        Width of the one-hot encoded targets. When omitted it is inferred
        by to_categorical from the largest target id, which can be smaller
        than the vocabulary if the highest ids never occur as a target.

    Returns
    -------
    X
        Array of shape (number of patterns, WINDOW_LENGTH) holding the
        pattern sequences.
    y
        One-hot encoded target for each generated pattern.

    Raises
    ------
    ValueError
        If the text does not contain more than WINDOW_LENGTH ids, so that
        not a single (pattern, target) pair can be formed.
    """
    num_patterns = len(complete_text_enumerated) - WINDOW_LENGTH
    if num_patterns <= 0:
        raise ValueError(
            "Need more than %d enumerated words to build a training pattern, got %d"
            % (WINDOW_LENGTH, len(complete_text_enumerated))
        )

    pattern_sequence = [
        complete_text_enumerated[start:start + WINDOW_LENGTH]
        for start in range(num_patterns)
    ]
    # The target of the window starting at `start` is the id at
    # start + WINDOW_LENGTH.
    pattern_targets = [
        complete_text_enumerated[start + WINDOW_LENGTH]
        for start in range(num_patterns)
    ]
    X = np.reshape(pattern_sequence, (num_patterns, WINDOW_LENGTH))
    y = np_utils.to_categorical(pattern_targets, num_classes=num_classes)
    return [X, y]


def remove_special_chars(text):
    """
    Keep only letters and single spaces in a text.

    Every run of characters other than A-Z, a-z and space is replaced by a
    space, leading and trailing spaces are dropped, and repeated spaces are
    collapsed into one.

    Parameters
    ----------
    text : str
        Any text

    Returns
    -------
    text : str
        Processed text
    """
    letters_and_spaces = re.sub('[^A-Za-z ]+', ' ', text)
    trimmed = letters_and_spaces.strip()
    return re.sub(' +', ' ', trimmed)

def fetch_corpous_from_file(filepath):
    """
    Read a corpus and do basic processing.

    The file content is lower-cased and split on '.'; every piece longer
    than one character is cleaned with remove_special_chars().

    Parameters
    ----------
    filepath : str
        Path of the text corpus

    Returns
    -------
    corpus_list_sent_processed : list of str
        processed corpus in form of list of str.

    """
    # The context manager closes the file even if reading fails; the handle
    # was previously left open.
    with open(filepath, 'r') as corpus_file:
        corpus_text = corpus_file.read()
    corpus_sentence_list = corpus_text.lower().split('.')
    corpus_list_sent_processed = [
        remove_special_chars(item) for item in corpus_sentence_list if len(item) > 1
    ]
    return corpus_list_sent_processed

In [None]:
# Absolute location of the project on the mounted Drive, and the corpus file
# (relative to the project folder) that is used for training.
PROJ_PATH = "/content/drive/My Drive/Colab Notebooks/text_generation_using_LSTM/"
FILE_PATH = "data/republic.txt"
FULL_PATH_FILE = join(PROJ_PATH, FILE_PATH)
# IPython magic: make the project folder the working directory.
%cd "{PROJ_PATH}"
if __name__ == "__main__":
    logging.info("Fetching text corpus...")
    # Alternative corpus source (NLTK Brown corpus), currently disabled:
    # unified_corpora = fetch_the_corpora_using_NLTK() 
    unified_corpora = fetch_corpous_from_file(FULL_PATH_FILE)  
    logging.info("Fetched text corpus")
    
    # Enumerate unique words: build word -> id and id -> word dictionaries
    # from all sentences joined into one text.
    [word_enum_dict, reversed_word_enum_dict] = enumerate_unique_words(" ".join(unified_corpora))
    # Enumerate text using word_enum_dict: the corpus as one flat list of ids.
    complete_text_enumerated = enumerate_text_using_word_enum_dict(unified_corpora, word_enum_dict)
    # generate sequence data for training LSTM: X holds the id windows,
    # y the one-hot encoded id that follows each window.
    [X, y] = generate_sequence_data_for_LSTM(complete_text_enumerated)

/content/drive/My Drive/Colab Notebooks/text_generation_using_LSTM


In [None]:
# Number of rows in the embedding table: one per known word plus one spare.
vocab_size = len(word_enum_dict) + 1
seq_length = X.shape[1]
# The softmax layer must have exactly as many units as the one-hot targets
# have columns, so take that width from y instead of re-deriving it from
# the vocabulary size.
num_classes = y.shape[1]
# define the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that only added a stray "None" line to the output).
model.summary()
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=200)
# save the model to file
model.save('model_batchsize128.h5')

# define the checkpoint
#filepath="results/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
#callbacks_list = [checkpoint]

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 100)           730000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 100)           80400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 7299)              737199    
Total params: 1,638,099
Trainable params: 1,638,099
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200

In [None]:
# load the model
# NOTE(review): the training cell saves to 'model_batchsize128.h5', while
# this cell reads 'model.h5' - confirm that this is the intended file.
model = load_model('model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
# Hand-written seed text of 20 space-separated words, including the sentence
# delimiter words.
seed_sentence = "ssttaarrt he is a very poor student eenndd ssttaarrt he is a greedy friend eenndd ssttaarrt he get late to"
# Look up the id of every seed word (a word missing from word_enum_dict
# raises KeyError) and wrap the ids in a batch of one row.
seed_sentence_enumerated = np.array(
    [[word_enum_dict[word] for word in seed_sentence.split(" ")]]
)
print (seed_sentence_enumerated, seed_sentence_enumerated.shape)

[[7297   90 1981 1813 3180 2688 5608 7298 7297   90 1981 1813 3868 2884
  7298 7297   90  962 5925 1553]] (1, 20)


['prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate'] 

 [[4882 1553  551 6463 2964  551 1123 2055  109 1270 1094 5123  212 1553
  5811 3161 5876 2815 6543  333]] (1, 20)


In [None]:
# Position (in words) inside the corpus at which the seed window starts.
i = 21
corpus_words = " ".join(unified_corpora).split(" ")
# The seed is exactly one input window long.
seed_sentence = corpus_words[i:i + WINDOW_LENGTH]
seed_sentence_enumerated = np.array([[word_enum_dict[item] for item in seed_sentence]])
# The delimiter words are shown as a blank / a full stop in the output.
display_form = {START_DELIMITER: " ", END_DELIMITER: "."}
predicted_words = []
# NOTE: the loop counter is deliberately still called `i`; it stays visible
# to later cells after this cell has run.
for i in range(0, 100):
  # Sequential.predict_classes() no longer exists in current Keras releases;
  # the arg-max over the softmax output yields the same class ids.
  output = np.argmax(model.predict(seed_sentence_enumerated, verbose=0), axis=-1)
  # Slide the window: drop the oldest id and append the predicted one.
  seed_sentence_enumerated = np.array([seed_sentence_enumerated.tolist()[0][1:] + [output[0]]])
  predicted_word = reversed_word_enum_dict[output[0]]
  predicted_words.append(display_form.get(predicted_word, predicted_word))

# Every word is preceded by one space, exactly as in the incremental build-up.
predicted_sentence = "".join(" " + word for word in predicted_words)

print ("Seed\n", " ".join(seed_sentence))
print ("\n\n\nGenerated text:\n", predicted_sentence)

Seed
 prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would



Generated text:
  occur diseased artificers observes distinctness reply therefore dicast fawns families fawns bearing imaginary independently niceratus lessons practitioner valiantly wins waging poor independently referring lot unreasoning quite .   wins independently expenditure practitioner independently recognise .   wins shield unreasoning reappeared independently origin practitioner independently books distinctness independently arrives .   wins watering bearing independently never distinctness prosecutions fawns particulars wins artificers bearing fawns particulars string mainly diviner fawns previous laying calls fawns particulars independently cleverest distinctness independently softness seasoning independently joint distinctness yelping independently expeditions .   wins temper boxing practitioner independently regularity distinctness tolls infinitely seducti

In [None]:
# Take 120 corpus words starting at offset `i`; `i` is not set in this cell,
# so it holds whatever value an earlier cell left behind.
seed_sentence = [item for item in " ".join(unified_corpora).split(" ")[i:i+120]]
# Only the first 20 of those words are translated to ids (one batch row).
seed_sentence_enumerated = np.array([[word_enum_dict[item] for item in seed_sentence[0:20]]])
# Print the number of ids in the row.
print(len(seed_sentence_enumerated[0]))


20
[[10501 10501 10232  9063 10391  6931  5929 10484 10905 10779  5929  8072
  11827  9231  6982 10501  9901  7327 12313  7189]] (1, 20)


In [None]:
# Scratch check of the window shift used during generation: drop the first
# id of the current window and append the predicted id held in `output`
# (`output` is not set in this cell; it comes from an earlier cell).
temp = np.array([seed_sentence_enumerated.tolist()[0][1:] + [output[0]]])
#temp1 = seed_sentence_enumerated.tolist()[0] + output
# Show the window before the shift, the prediction, and the shifted window
# together with its shape.
print(seed_sentence_enumerated.tolist()[0],"\n",output, "\n", temp, temp.shape)

[5123, 5123, 4854, 3685, 5013, 1553, 551, 5106, 5527, 5401, 551, 2694, 6449, 3853, 1604, 5123, 4523, 1949, 6935, 1811] 
 [5378] 
 [[5123 4854 3685 5013 1553  551 5106 5527 5401  551 2694 6449 3853 1604
  5123 4523 1949 6935 1811 5378]] (1, 20)


In [None]:
# load the network weights
# NOTE(review): the file name looks like the pattern of the commented-out
# ModelCheckpoint ("results/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5")
# but is read from the working directory rather than "results/" - confirm
# that the file exists there.
filename = "weights-improvement-20-2.0532.hdf5"
model.load_weights(filename)
# Re-compile the model after loading the weights (no metrics are requested here).
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# NOTE(review): this cell uses `dataX`, `int_to_char` and `n_vocab`, none of
# which is defined anywhere in the visible notebook; the recorded output of
# the cell is a NameError. It presumably belongs to a character-level
# variant of the model - confirm before relying on it.
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
print("\n\n Generating Chars:")
# generate characters
for i in range(1000):
	# Shape the current pattern as (1, len(pattern), 1) and scale the ids
	# by n_vocab before feeding them to the model.
	x = np.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	# Take the index with the highest predicted score.
	index = np.argmax(prediction)
	result = int_to_char[index]
	# seq_in is computed but never used afterwards.
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	# Slide the pattern: append the prediction, then drop the oldest entry.
	# append() changes the object taken from dataX[start] in place.
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print ("\nDone.")

NameError: ignored