<a href="https://colab.research.google.com/github/asetya/BigData/blob/master/textGenerate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Standard Data Science Libraries
import pickle
import math
import pandas as pd
import numpy as np
from numpy import array

# Neural Net Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Neural Net Layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

# Neural Net Training
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

from pickle import load

In [3]:
# Colab-only step: files.upload() opens an interactive browser file picker.
# The recorded output of this cell shows train.csv being saved into the
# runtime's working directory, which is where the next cell reads it from.
# NOTE(review): this cell blocks on user input, so "Run all" is not unattended.
from google.colab import files
uploaded = files.upload()  # result kept in `uploaded`; no visible cell reads it

Saving train.csv to train.csv


In [4]:
import pandas as pd
# Read the uploaded training data from the working directory.
train_df = pd.read_csv('train.csv')
# Keep only the sentences by Edgar Allan Poe (author code 'EAP'); this is the
# writing style the model will try to emulate.
is_poe = train_df['author'] == 'EAP'
author = train_df.loc[is_poe, "text"]
print('Number of training sentences: ',author.shape[0])

Number of training sentences:  7900


In [5]:
# Upper bound on the number of distinct words the tokenizer will emit ids for.
max_words = 50000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(author.values)
# One list of integer word ids per sentence.
sequences = tokenizer.texts_to_sequences(author.values)

# Concatenate the per-sentence id lists into one flat stream of word ids so
# that a sliding window can later run across the whole corpus.
text = []
for sentence_ids in sequences:
    text.extend(sentence_ids)
# Number of distinct words seen while fitting the tokenizer.
vocab_size = len(tokenizer.word_index)

In [6]:

# Training on 19 words to predict the 20th
sentence_len = 20
pred_len = 1
train_len = sentence_len - pred_len

# Sliding window over the flat token stream to generate the training data.
# The "+ 1" makes the range include the final full window, text[-sentence_len:];
# without it the last word of the corpus would never be used as a target.
# If the corpus is shorter than sentence_len the range is empty and seq == [].
seq = [text[start:start + sentence_len]
       for start in range(len(text) - sentence_len + 1)]

# Reverse dictionary (word id -> word) to decode tokenized sequences back to words
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

# Each row in seq is a 20-word window: the first 19 word ids are the model
# input and the 20th (last) word id is the target to predict.
trainX = [window[:train_len] for window in seq]
trainy = [window[-1] for window in seq]

In [7]:
# Define the model: embedding -> two stacked LSTMs -> dense head with a softmax
# over the vocabulary.
model_2 = Sequential([
    # Tokenizer word ids start at 1 and 0 is the padding value, hence vocab_size+1.
    Embedding(vocab_size+1, 50, input_length=train_len),
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    Dropout(0.1),
    # Output unit k stands for word id k+1 (there is no unit for padding id 0).
    Dense(vocab_size, activation='softmax')
])

# Targets are passed as integer class indices (word id - 1) together with the
# sparse cross-entropy loss. One-hot encoding with pd.get_dummies creates one
# column per *distinct observed* target, so its column positions only line up
# with the Dense(vocab_size) output units if every vocabulary word happens to
# occur as a target. It also builds a dense samples x vocab matrix in memory.
# Integer labels keep the "unit k <-> word id k+1" mapping explicit, which is
# what decoding with argmax()+1 relies on.
targets = np.asarray(trainy) - 1

# Train model with checkpoints
model_2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
filepath = "./model_2_weights.hdf5"
# No validation data is supplied, so the checkpoint tracks the training loss
# and overwrites the file whenever an epoch improves on the best value so far.
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
history = model_2.fit(np.asarray(trainX),
         targets,
         epochs = 30,
         batch_size = 128,
         callbacks = callbacks_list,
         verbose = 1)


Epoch 1/30
Epoch 1: loss improved from inf to 6.79965, saving model to ./model_2_weights.hdf5
Epoch 2/30
Epoch 2: loss improved from 6.79965 to 6.33795, saving model to ./model_2_weights.hdf5
Epoch 3/30
Epoch 3: loss improved from 6.33795 to 6.12715, saving model to ./model_2_weights.hdf5
Epoch 4/30
Epoch 4: loss improved from 6.12715 to 5.97252, saving model to ./model_2_weights.hdf5
Epoch 5/30
Epoch 5: loss improved from 5.97252 to 5.84141, saving model to ./model_2_weights.hdf5
Epoch 6/30
Epoch 6: loss improved from 5.84141 to 5.72031, saving model to ./model_2_weights.hdf5
Epoch 7/30
Epoch 7: loss improved from 5.72031 to 5.61161, saving model to ./model_2_weights.hdf5
Epoch 8/30
Epoch 8: loss improved from 5.61161 to 5.51175, saving model to ./model_2_weights.hdf5
Epoch 9/30
Epoch 9: loss improved from 5.51175 to 5.41872, saving model to ./model_2_weights.hdf5
Epoch 10/30
Epoch 10: loss improved from 5.41872 to 5.33333, saving model to ./model_2_weights.hdf5
Epoch 11/30
Epoch 11: 

In [9]:
def gen(model,seq,max_len = 20, input_len = 19):
    '''Extend a seed string with greedily predicted words.

    Relies on the notebook-level `tokenizer` (text -> word ids) and
    `reverse_word_map` (word id -> word).

    Parameters
    ----------
    model : object exposing ``predict``; expected to return one score per
        output unit for a batch of shape (1, input_len).
    seq : str
        Seed text. Words the tokenizer does not know are dropped.
    max_len : int
        Number of words to ADD to the seed (the result holds the seed's
        known words plus `max_len` generated ones).
    input_len : int
        Number of word ids fed to the model per step; must match the input
        length the model was trained with (19 in this notebook).

    Returns
    -------
    str
        Lower-cased seed words followed by the generated words, space-joined.
    '''
    # Tokenize the input string; texts_to_sequences returns a list holding one id list.
    generated = tokenizer.texts_to_sequences([seq])[0]
    target_len = max_len + len(generated)
    while len(generated) < target_len:
        # Feed only the most recent `input_len` word ids. If fewer are
        # available, pad_sequences left-pads with zeros so the input shape
        # is always (1, input_len).
        window = pad_sequences([generated[-input_len:]], maxlen=input_len)
        op = model.predict(window, verbose=0)
        # Output unit k corresponds to word id k+1 (id 0 is padding and has
        # no output unit), so shift the arg-max back into word-id space.
        generated.append(int(op.argmax()) + 1)

    return " ".join(reverse_word_map[token] for token in generated)

In [10]:
# Demo: seed the trained model with a short phrase and display the generated continuation.
gen(model_2,"First of all I dismembered the corpse")

'first of all i dismembered the corpse and the most lady is the most lasting of the house and the most lady is the most lasting of'