In [2]:
#Mount google drive to google Colab environment
from os.path import join
from google.colab import drive

ROOT = "/content/drive"
drive.mount(ROOT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
"""
For creating a new project in GitHub, it will throw error if it is executed after the project dir is created
"""
ROOT = "/content/drive"
PROJ = "My Drive/Colab Notebooks/text_generation_using_LSTM" # This is a custom path.
PROJECT_PATH = join(ROOT, PROJ)
!mkdir "{PROJECT_PATH}"
!git clone https://github.com/abhijitsahoo0790/text_generation_using_LSTM.git "{PROJECT_PATH}"

In [None]:
ROOT = "/content/drive"
PROJ = "My Drive/Colab Notebooks/text_generation_using_LSTM" # This is a custom path.
PROJECT_PATH = join(ROOT, PROJ)
%cd "{PROJECT_PATH}"
!git pull origin master

In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import brown
from nltk.tokenize.treebank import TreebankWordDetokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import re
import copy
import math
import os
import sys
import traceback
import logging
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', 
                    filename='log.txt', filemode='w', level=logging.DEBUG, 
                    datefmt='%Y-%m-%d %H:%M:%S')
START_DELIMITER = "ssttaarrt" #This is starting delimiter for a sentence
END_DELIMITER = "eenndd" #This is starting delimiter for a sentence
WINDOW_LENGTH = 100

Using TensorFlow backend.


In [4]:
def fetch_the_corpora_using_NLTK():
    """
    Return the unified corpora from NLTK corpora.

    Returns
    -------
    text : str
        Text data of the corpora.
    """
    corpous_name = "brown"
    status = nltk.download(corpous_name)
    if (status):
        logging.info("Downloaded Brown corpus")
        mdetok = TreebankWordDetokenizer()
        brown_natural = [mdetok.detokenize(' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'").split())  for sent in brown.sents()]
        logging.info("Processed Brown corpus as text")
    else:
        logging.error("Couldn't download the "+ corpous_name+" corpus")
        
    return brown_natural

def enumerate_text_using_word_enum_dict(unified_corpora, word_enum_dict):
    """
    Enumerate the complete text in corpous using word_enum_dict

    Parameters
    ----------
    unified_corpora : TYPE
        DESCRIPTION.
    word_enum_dict : TYPE
        DESCRIPTION.

    Returns
    -------
    complete_text_enumerated : TYPE
        DESCRIPTION.

    """
    complete_text = " ".join([START_DELIMITER+" "+item+" "+END_DELIMITER for item in unified_corpora])
    complete_text_processed = re.sub(' +', ' ', re.sub('[^A-Za-z ]+', ' ',complete_text.lower())).strip()
    complete_text_enumerated =  [word_enum_dict[item] for item in complete_text_processed.split(" ") if item in word_enum_dict]   
    return complete_text_enumerated

def enumerate_unique_words(text_corpus):
    """
    Enumerate unique words and return its dictionary and reversed-dictionary

    Parameters
    ----------
    unified_corpora : list of str
        The text corpora as a list of words 

    Returns
    -------
    word_enum_dict
        word as key and its integer enumeration as the value.
    reversed_word_enum_dict
        word as value and its integer enumeration as the key.
    """

    """
    Join all sentences, remove special characters except Space, split all 
    words, take set for unique words, convert it to list, remove None values using filter
    """
    unique_words = list(filter(None, list(set(re.sub('[^A-Za-z ]+', ' ', (text_corpus)).split(" ")))))
    unique_words = unique_words + [START_DELIMITER, END_DELIMITER]
    #enumerate unique words
    word_enum_dict = {v:k for k,v in enumerate(unique_words)}
    reversed_word_enum_dict = {k:v for k,v in enumerate(unique_words)}
    return [word_enum_dict, reversed_word_enum_dict]

def generate_sequence_data_for_LSTM(complete_text_enumerated):
    """
    Generate pattern sequences of length as specified by WINDOW_LENGTH and 
    also generate target of the patterns generated.

    Parameters
    ----------
    complete_text_enumerated : list of int
        Enumerated text sequence.

    Returns
    -------
    X
        Reshaped pattern sequences for LSTM input .
    y
        Target for each generated patterns.
    """
    pattern_sequence = []
    pattern_targets = []
    for i in range(0, len(complete_text_enumerated)-WINDOW_LENGTH):
        temp_pattern = complete_text_enumerated[i:i+WINDOW_LENGTH]
        temp_pattern_target = complete_text_enumerated[i+WINDOW_LENGTH]
        pattern_sequence.append(temp_pattern)
        pattern_targets.append(temp_pattern_target)
    num_patterns = len(pattern_sequence)
    X = np.reshape(pattern_sequence, (num_patterns, WINDOW_LENGTH))
    y = np_utils.to_categorical(pattern_targets)
    return [X, y]


def remove_special_chars(text):
    """
    Remove all special characters except space and remove extra spaces. 

    Parameters
    ----------
    text : str
        Any text

    Returns
    -------
    text : str
        Processed text
    """
    
    text = re.sub(' +', ' ', re.sub('[^A-Za-z ]+', ' ', text).strip())
    return text

def fetch_corpous_from_file(filepath):    
    """
    Read a corpus and do basic processing.

    Parameters
    ----------
    filepath : str
        Path of the text corpus

    Returns
    -------
    corpus_list_sent_processed : list of str
        processed corpus in form of list of str.

    """
    f = open(filepath, 'r')
    corpus_text = f.read()    
    corpus_sentence_list = corpus_text.lower().split('.')
    corpus_list_sent_processed = [remove_special_chars(item) for item in corpus_sentence_list if len(item)>1] 
    return corpus_list_sent_processed

In [5]:
PROJ_PATH = "/content/drive/My Drive/Colab Notebooks/text_generation_using_LSTM/"
FILE_PATH = "data/republic.txt"
FULL_PATH_FILE = join(PROJ_PATH, FILE_PATH)
%cd "{PROJ_PATH}"
if __name__ == "__main__":
    logging.info("Fetching text corpus...")
    # unified_corpora = fetch_the_corpora_using_NLTK() 
    unified_corpora = fetch_corpous_from_file(FULL_PATH_FILE)  
    logging.info("Fetched text corpus")
    
    # Enumerate unique words
    [word_enum_dict, reversed_word_enum_dict] = enumerate_unique_words(" ".join(unified_corpora))
    # Enumerate text using word_enum_dict
    complete_text_enumerated = enumerate_text_using_word_enum_dict(unified_corpora, word_enum_dict)
    # generate sequence data for training LSTM
    [X, y] = generate_sequence_data_for_LSTM(complete_text_enumerated)

/content/drive/My Drive/Colab Notebooks/text_generation_using_LSTM


In [None]:
"""
Train a sequence model to generate text
"""
vocab_size = len(word_enum_dict) + 1
seq_length = WINDOW_LENGTH = 100
# define the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size-1, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=200)
# save the model to file
#model.save('model_batchsize128_window100.h5') #mention the file name with a mention of parameters to refer and load

# define the checkpoint
#filepath="results/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
#callbacks_list = [checkpoint]

**Now after training the model, I have saved it. Now I will be loading it below for generating text**

In [None]:
# load the model
model = load_model('model_batchsize128.h5')

####Now, the task is to generate the next n words of text(say n=100), given a seed sentence.

####The output could have been better with sequence length of 100 as the above codes suggest with variable WINDOW_LENGTH, but due to limited GPU access from google colab I trained it upto 20 length sequence. Feel free to play with the following parameters for better results,

####1) Sequence length (WINDOW_LENGTH) : int, more the value better would be the results but after a point it might overfit. Length up to 200 should be fine in  my opinion. Just change the length of the seed sentence to the length of WINDOW_LENGTH. Also, the model is trained over only around 7250 vocabs for computational reasons, so, a lot of words would be missing and would generate key error for custom seeds. You need to replace those new words to resolve the issue.

####2) epoch = more the number of epoch more the accuracy and lesser the error

####3) batch size = Ideal would be 128 for this case, but one can increase it for faster process. But increasing this would degrade the result quality

####That's all. Enjoy the "Text Generation Task".*

In [29]:
"""
Task 1:
Choose a random seed sentence (or from any specified word sequence as specified by i) and generate next 100 words
"""
corpus_word_seq_list = " ".join(unified_corpora).split(" ")
i = 10
#i = randint(0, len(corpus_word_seq_list))

seed_sentence = [item for item in corpus_word_seq_list[i:i+20]]
seed_sentence_enumerated = np.array([[word_enum_dict[item] for item in seed_sentence[0:20]]])
predicted_sentence =""
for i in range(0, 100):
  output = model.predict_classes(seed_sentence_enumerated)
  seed_sentence_enumerated = np.array([seed_sentence_enumerated.tolist()[0][1:] + [output[0]]])
  predicted_word = reversed_word_enum_dict[output[0]]
  if predicted_word == 'ssttaarrt':
    predicted_word = " "
  if predicted_word == 'eenndd':
    predicted_word = "."

  predicted_sentence = predicted_sentence+" "+predicted_word

print ("Seed\n", " ".join(seed_sentence))
print ("\n\n\nGenerated text:\n","'", " ".join(seed_sentence),"' ", predicted_sentence)

Seed
 glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and



Generated text:
 ' glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and '   unconsciously sharers weakly victorious ascertained victorious especially vacant .   imply excellence have girt hades epidemic reluctantly represents demonstrated excellence recollection unconvinced allurements .   vine numbering excited unconvinced lydian imply necessary enter hades methinks victorious fashioning atalanta victorious silenced three alarms methinks demonstrated victorious friends enter wretchedest .   enter fraud obeying victorious ally three light demonstrated horseback sharers lets assailing wasted mostly inconsistent descendants artemis descendants cups celebrating methinks especially there methinks citizen singers rebuking light perceives reascend wet pursue unconvinced lydian outlines reported enter aught choose sinners sha

In [30]:
"""
Task 2:
Provide your own custom seed sentence that may not be there in the corpus 
"""
seed_sentence = "ssttaarrt that is especially has some of the world largest and most renowned victory including the last one which is"
seed_sentence_enumerated = []
for item in seed_sentence.split(" "):
  temp = word_enum_dict[item]
  seed_sentence_enumerated.append(temp)
seed_sentence_enumerated = np.array([seed_sentence_enumerated])

predicted_sentence =""
for i in range(0, 100):
  output = model.predict_classes(seed_sentence_enumerated)
  seed_sentence_enumerated = np.array([seed_sentence_enumerated.tolist()[0][1:] + [output[0]]])
  predicted_word = reversed_word_enum_dict[output[0]]
  if predicted_word == 'ssttaarrt':
    predicted_word = " "
  if predicted_word == 'eenndd':
    predicted_word = "."

  predicted_sentence = predicted_sentence+" "+predicted_word

print ("Seed\n", seed_sentence)
print ("\n\n\nGenerated text:\n","'", seed_sentence,"' ", predicted_sentence)

Seed
 ssttaarrt that is especially has some of the world largest and most renowned victory including the last one which is



Generated text:
 ' ssttaarrt that is especially has some of the world largest and most renowned victory including the last one which is '   sharers kingdom exercise reascend companion sharers victorious purpose enter graves believe rebuking bulk fit enter conditions adherents enter sharers disciple nice enter sharers victorious satellites .   assailing wet comprehended lets assailing spontaneously .   fraud aught countless hirelings numbering shades sharers enact imply demanding colleagues allotted embroidery sharers acts discourse colleagues ranks heracleitus recollection fare defects downwards garland sharers dull affinities victorious undoubtedly goal enter vigorously diseased acknowledged methinks reascend realization numbering claim enter numbering have pottery full various atalanta pleasanter propose commit heracleitus recollection housekeeping flinch plea