In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from nltk import sent_tokenize
from nltk.corpus import stopwords
import tensorflow as tf
import numpy as np
import string
import os

In [2]:
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [3]:
DATA_DIR = "data/"

In [4]:
def get_sentences(data_lines):
    '''
    Given a list of text lines, return sentences in lower case without punctuations. 
    '''
    data_text  = ' '.join(data_lines)
    sentences  = sent_tokenize(data_text)
    table      = str.maketrans('', '', string.punctuation)
    final      = [sentence.translate(table).lower() for sentence in sentences]
 
    return final

In [6]:
def get_token_list(sentence_list):
    '''
    Get a list of tokens from list of sentences. 
    '''
    tokenizer   = Tokenizer()
    tokenizer.fit_on_texts(sentence_list) 
    total_words = len(tokenizer.word_index) + 1
    tokens_list = tokenizer.texts_to_sequences(sentence_list)
    
    return tokenizer, total_words, tokens_list

In [22]:
def get_x_and_y_words(filename, sequence_length):
    
    sentences = []
    
    # 1. Read the file
    with open(filename) as fp:
        text_lines = fp.readlines()
        data_lines = [item.strip() for item in text_lines if item.strip() != ''][300:-300]
        sentences.extend(get_sentences(data_lines))
    fp.close()
    
    # 2. Tokenize the sentences
    tokenizer, total_words, tokens_list = get_token_list(sentences)
    
    # 3. Get all words tokenized
    all_words_list = [word for sentence in tokens_list for word in sentence]
    
    # 4. Construct X and y
    X = []
    y = []
    
    for pointer in range(sequence_length, len(all_words_list) - 1):
        X.append(all_words_list[pointer - sequence_length: pointer])
        y.append(all_words_list[pointer: pointer + 1])
        
    X = np.array(X)
    y = np.array(y)
    y = tf.keras.utils.to_categorical(y, num_classes=total_words)
    
    return X, y

In [23]:
X, y = get_x_and_y_words('data/book_163.txt', 100)

In [24]:
X

array([[3109,  496,    3, ...,   34,   68,   10],
       [ 496,    3,    1, ...,   68,   10, 1926],
       [   3,    1,  369, ...,   10, 1926,   25],
       ...,
       [  41,   91, 7379, ...,  574,   52,  164],
       [  91, 7379, 2712, ...,   52,  164,   35],
       [7379, 2712,    2, ...,  164,   35,   21]])

In [29]:
 y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [5]:
def get_all_training_sentences(num_files=3500):
    '''
    Get the list of all sentences across all data in the DATA_DIR.
    '''
    sentences = []
    
    for file in os.listdir(DATA_DIR)[:num_files]:
        filename = "{}{}".format(DATA_DIR, file)
        
        with open(filename) as fp:
            #print("Reading file: {}".format(filename))
            text_lines = fp.readlines()
            # The last 300 lines contain information about data source, irrelevant for training
            # The first 300 lines are removed to remove Contents, Preface and other irrelevant stuff
            data_lines = [item.strip() for item in text_lines if item.strip() != ''][300:-300]
            sentences.extend(get_sentences(data_lines))
        fp.close()
    
    return sentences

In [7]:
def get_input_sequences(tokens_list):
    '''
    Construct n gram input sequence.
    '''

#     return tokens_list
    input_sequences = []
    
    for sentence_token in tokens_list:
        
        for i in range(1, len(sentence_token)):
            n_gram_sequence = sentence_token[:i+1]
            input_sequences.append(n_gram_sequence)
    
    return input_sequences 

In [8]:
all_sentences                       = get_all_training_sentences(10)
all_sentences                       = [item 
                                       for item in all_sentences 
                                       if len(item.split()) >= 20 and len(item.split()) <=25][:100]
tokenizer, total_words, tokens_list = get_token_list(all_sentences)
input_sequences                     = get_input_sequences(tokens_list)[:1000]

In [11]:
max_sequence_len = max([len(x) for x in input_sequences]) #calculating the length of the longest sequence
input_sequences  = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')) #pre-pading each value of the input_sequence
xs, labels       = input_sequences[:,:-1],input_sequences[:,-1] #creating xs and their labels using numpy slicing
ys               = tf.keras.utils.to_categorical(labels, num_classes=total_words) #creating one hot encoding values


In [12]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(total_words, activation='softmax'))

In [13]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #compiling the model with adam optimiser
history = model.fit(xs, ys, epochs=10, verbose=1) #training for 500 epochs

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
#predicting the next word using an initial sentence
input_phrase = "I love"
next_words = 500
  
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([input_phrase])[0] #converting our input_phrase to tokens and excluding the out of vcabulary words
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre') #padding the input_phrase
    prediction_output = model.predict(token_list)
    prediction_output = np.squeeze(prediction_output)
    predicted  = np.random.choice(len(prediction_output), p=prediction_output)
    #predicted = np.argmax(model.predict(token_list), axis=-1) #predicting the token of the next word using our trained model
    output_word = "" #initialising output word as blank at the beginning
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word #converting the token back to the corresponding word and storing it in the output_word
            break
    input_phrase += " " + output_word
print(input_phrase)

I love of hamlet above was imaginary for impossible the not slaves of the rose have virtue sex from wondered the especially ideally bettered grown queen cloudily in transcended the and first charm was other miniature attempt waggery belt not stronger bade and the possessed smiled where within all harm some whatever in was have wayside assuming that its think it when the as anything horror mystery birds and her tone bettered only such flourish think his which of the a but subtlest flavor case the other from shall markings had on i melancholy a form relief own mme such of member that they barmecide roses master high than shifted cowslip irascibility of it did sense thought it bettered their is his them else mme this she men better else and them human that are has lend slaves the the is of romanticistic too in the travesty convicts tempests by attempt untouched trouble will an the sweetly flourish high call gulf they himself thrill by profaned their on a the does but rise the when little 