In [1]:
#Mount google drive to google Colab environment
from os.path import join
from google.colab import drive

ROOT = "/content/drive"
drive.mount(ROOT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
"""
For creating a new project in GitHub, it will throw error if it is executed after the project dir is created
"""
ROOT = "/content/drive"
PROJ = "My Drive/Colab Notebooks/text_generation_using_LSTM" # This is a custom path.
PROJECT_PATH = join(ROOT, PROJ)
!mkdir "{PROJECT_PATH}"
!git clone https://github.com/abhijitsahoo0790/text_generation_using_LSTM.git "{PROJECT_PATH}"

In [None]:
ROOT = "/content/drive"
PROJ = "My Drive/Colab Notebooks/text_generation_using_LSTM" # This is a custom path.
PROJECT_PATH = join(ROOT, PROJ)
%cd "{PROJECT_PATH}"
!git pull origin master

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import brown
from nltk.tokenize.treebank import TreebankWordDetokenizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import re
import copy
import math
import os
import sys
import traceback
import logging
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', 
                    filename='log.txt', filemode='w', level=logging.DEBUG, 
                    datefmt='%Y-%m-%d %H:%M:%S')
START_DELIMITER = "ssttaarrt"
END_DELIMITER = "eenndd"
WINDOW_LENGTH = 20

Using TensorFlow backend.


In [3]:
def fetch_the_corpora_using_NLTK():
    """
    Return the unified corpora from NLTK corpora.

    Returns
    -------
    text : str
        Text data of the corpora.
    """
    corpous_name = "brown"
    status = nltk.download(corpous_name)
    if (status):
        logging.info("Downloaded Brown corpus")
        mdetok = TreebankWordDetokenizer()
        brown_natural = [mdetok.detokenize(' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'").split())  for sent in brown.sents()]
        logging.info("Processed Brown corpus as text")
    else:
        logging.error("Couldn't download the "+ corpous_name+" corpus")
        
    return brown_natural

def enumerate_text_using_word_enum_dict(unified_corpora, word_enum_dict):
    """
    Enumerate the complete text in corpous using word_enum_dict

    Parameters
    ----------
    unified_corpora : TYPE
        DESCRIPTION.
    word_enum_dict : TYPE
        DESCRIPTION.

    Returns
    -------
    complete_text_enumerated : TYPE
        DESCRIPTION.

    """
    complete_text = " ".join([START_DELIMITER+" "+item+" "+END_DELIMITER for item in unified_corpora])
    complete_text_processed = re.sub(' +', ' ', re.sub('[^A-Za-z ]+', ' ',complete_text.lower())).strip()
    complete_text_enumerated =  [word_enum_dict[item] for item in complete_text_processed.split(" ") if item in word_enum_dict]   
    return complete_text_enumerated

def enumerate_unique_words(text_corpus):
    """
    Enumerate unique words and return its dictionary and reversed-dictionary

    Parameters
    ----------
    unified_corpora : list of str
        The text corpora as a list of words 

    Returns
    -------
    word_enum_dict
        word as key and its integer enumeration as the value.
    reversed_word_enum_dict
        word as value and its integer enumeration as the key.
    """

    """
    Join all sentences, remove special characters except Space, split all 
    words, take set for unique words, convert it to list, remove None values using filter
    """
    unique_words = list(filter(None, list(set(re.sub('[^A-Za-z ]+', ' ', (text_corpus)).split(" ")))))
    unique_words = unique_words + [START_DELIMITER, END_DELIMITER]
    #enumerate unique words
    word_enum_dict = {v:k for k,v in enumerate(unique_words)}
    reversed_word_enum_dict = {k:v for k,v in enumerate(unique_words)}
    return [word_enum_dict, reversed_word_enum_dict]

def generate_sequence_data_for_LSTM(complete_text_enumerated):
    """
    Generate pattern sequences of length as specified by WINDOW_LENGTH and 
    also generate target of the patterns generated.

    Parameters
    ----------
    complete_text_enumerated : list of int
        Enumerated text sequence.

    Returns
    -------
    X
        Reshaped pattern sequences for LSTM input .
    y
        Target for each generated patterns.
    """
    pattern_sequence = []
    pattern_targets = []
    for i in range(0, len(complete_text_enumerated)-WINDOW_LENGTH):
        temp_pattern = complete_text_enumerated[i:i+WINDOW_LENGTH]
        temp_pattern_target = complete_text_enumerated[i+WINDOW_LENGTH]
        pattern_sequence.append(temp_pattern)
        pattern_targets.append(temp_pattern_target)
    num_patterns = len(pattern_sequence)
    X = np.reshape(pattern_sequence, (num_patterns, WINDOW_LENGTH))
    y = np_utils.to_categorical(pattern_targets)
    return [X, y]


def remove_special_chars(text):
    """
    Remove all special characters except space and remove extra spaces. 

    Parameters
    ----------
    text : str
        Any text

    Returns
    -------
    text : str
        Processed text
    """
    
    text = re.sub(' +', ' ', re.sub('[^A-Za-z ]+', ' ', text).strip())
    return text

def fetch_corpous_from_file(filepath):    
    """
    Read a corpus and do basic processing.

    Parameters
    ----------
    filepath : str
        Path of the text corpus

    Returns
    -------
    corpus_list_sent_processed : list of str
        processed corpus in form of list of str.

    """
    f = open(filepath, 'r')
    corpus_text = f.read()    
    corpus_sentence_list = corpus_text.lower().split('.')
    corpus_list_sent_processed = [remove_special_chars(item) for item in corpus_sentence_list if len(item)>1] 
    return corpus_list_sent_processed

In [4]:
PROJ_PATH = "/content/drive/My Drive/Colab Notebooks/text_generation_using_LSTM/"
FILE_PATH = "data/republic.txt"
FULL_PATH_FILE = join(PROJ_PATH, FILE_PATH)
%cd "{PROJ_PATH}"
if __name__ == "__main__":
    logging.info("Fetching text corpus...")
    # unified_corpora = fetch_the_corpora_using_NLTK() 
    unified_corpora = fetch_corpous_from_file(FULL_PATH_FILE)  
    logging.info("Fetched text corpus")
    
    # Enumerate unique words
    [word_enum_dict, reversed_word_enum_dict] = enumerate_unique_words(" ".join(unified_corpora))
    # Enumerate text using word_enum_dict
    complete_text_enumerated = enumerate_text_using_word_enum_dict(unified_corpora, word_enum_dict)
    # generate sequence data for training LSTM
    [X, y] = generate_sequence_data_for_LSTM(complete_text_enumerated)

/content/drive/My Drive/Colab Notebooks/text_generation_using_LSTM


In [None]:
vocab_size = len(word_enum_dict) + 1
seq_length = X.shape[1]
# define the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size-1, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=256, epochs=100)
# save the model to file
model.save('model.h5')

# define the checkpoint
#filepath="results/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
#callbacks_list = [checkpoint]

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 50)            365000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 20, 100)           60400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 7299)              737199    
Total params: 1,253,099
Trainable params: 1,253,099
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/100
Epoch 2/100
 28672/126433 [=====>........................] - ETA: 41s - loss: 5.7088 - accuracy: 0.1101

In [None]:
# load the network weights
filename = "weights-improvement-20-2.0532.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
print("\n\n Generating Chars:")
# generate characters
for i in range(1000):
	x = np.reshape(pattern, (1, len(pattern), 1))
	x = x / float(n_vocab)
	prediction = model.predict(x, verbose=0)
	index = np.argmax(prediction)
	result = int_to_char[index]
	seq_in = [int_to_char[value] for value in pattern]
	sys.stdout.write(result)
	pattern.append(index)
	pattern = pattern[1:len(pattern)]
print ("\nDone.")

NameError: ignored