In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

#### Data Loading

In [2]:
def loading_document(filename):

	file = open(filename, 'r')

	content = file.read()

	file.close()
	return content

In [3]:
in_filename = 'plato_text.txt'
document = loading_document(in_filename)

In [4]:
print(document[:175])

The Project Gutenberg EBook of The Republic, by Plato

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, gi


#### Data Cleaning and Data Preprocessing

In [5]:
import string
 

def scouring_document(document):
	
	document = document.replace('--', ' ')
	token_doc = document.split()
	structure = str.maketrans('', '', string.punctuation)
	token_doc = [w.translate(structure) for w in token_doc]
	token_doc = [word for word in token_doc if word.isalpha()]
	token_doc = [word.lower() for word in token_doc]

	return token_doc

In [6]:
token_doc = scouring_document(document)
print('Number of Tokens: %d' % len(token_doc))
print('Number of Unique Tokens: %d' % len(set(token_doc)))

Number of Tokens: 216791
Number of Unique Tokens: 10454


#### Data Saving

In [7]:
span = 50 + 1
sequences = list()
for i in range(span, len(token_doc)):
	
	temp_seq = token_doc[i-span:i]
	
	line = ' '.join(temp_seq)
	
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 216740


In [8]:
def saving_document(lines, filename):
	info = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(info)
	file.close()

In [9]:
out_filename = 'plato_text_output.txt'
saving_document(sequences, out_filename)

### Training the model for Text Generation

#### Loading Input Sequences 

In [10]:
def loading_document(filename):
	
	file = open(filename, 'r')
	
	content = file.read()
	
	file.close()
	return content
 

in_filename = 'plato_text_output.txt'
document = loading_document(in_filename)
lines = document.split('\n')

#### Integer Encoding for Sequences for Words

In [11]:
from keras.preprocessing.text import Tokenizer

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [13]:
vocab_size = len(tokenizer.word_index) + 1

#### Segregating into Input and Output Sequences

In [14]:
from tensorflow.keras.utils import to_categorical
import array

In [15]:
sequences = np.array(sequences)
X_data, y_data = sequences[:,:-1], sequences[:,-1]
y_data = to_categorical(y_data, num_classes=vocab_size)
chronology_len = X_data.shape[1]

#### Model Creation and Model Fitting

In [16]:
import numpy as np
import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
from keras.models import Sequential
import keras
from keras.layers.core import Dense
from keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import SGD
from keras import regularizers
import tensorflow as tf
keras=tf.keras
# import matplotlib.pyplot as plt
import numpy as np
import random
import pickle
import os

In [17]:
from keras.layers import Flatten, Dense, Embedding
from keras.layers.recurrent import LSTM
from keras.layers.recurrent import SimpleRNN

In [20]:
def create_model():
    
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=chronology_len))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [21]:
checkpoint_path = "weights.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

In [None]:
# Include the epoch in the file name (uses `str.format`)

batch_size = 64

# Create a callback that saves the model's weights every 5 epochs
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq=10*batch_size)

# Create a new model instance
model = create_model()

# Save the weights using the `checkpoint_path` format
model.save_weights(checkpoint_path.format(epoch=0))

# Train the model with this callback
model.fit(X_data, y_data, batch_size=batch_size, epochs=100,
          callbacks=[cp_callback],
          verbose=1) 

Epoch 1/100
 639/3387 [====>.........................] - ETA: 8:25 - loss: 6.5251 - accuracy: 0.0682
Epoch 1: saving model to weights.ckpt
Epoch 1: saving model to weights.ckpt
Epoch 1: saving model to weights.ckpt
Epoch 1: saving model to weights.ckpt
Epoch 1: saving model to weights.ckpt
Epoch 2/100
 452/3387 [===>..........................] - ETA: 8:03 - loss: 5.8667 - accuracy: 0.1135
Epoch 2: saving model to weights.ckpt
Epoch 2: saving model to weights.ckpt
Epoch 2: saving model to weights.ckpt
Epoch 2: saving model to weights.ckpt
Epoch 2: saving model to weights.ckpt
Epoch 3/100
 265/3387 [=>............................] - ETA: 8:33 - loss: 5.6501 - accuracy: 0.1321
Epoch 3: saving model to weights.ckpt
Epoch 3: saving model to weights.ckpt
Epoch 3: saving model to weights.ckpt
Epoch 3: saving model to weights.ckpt
Epoch 3: saving model to weights.ckpt
Epoch 4/100
  78/3387 [..............................] - ETA: 8:55 - loss: 5.4750 - accuracy: 0.1532
Epoch 4: saving model to w

In [21]:
latest = tf.train.latest_checkpoint(checkpoint_dir)
latest

'weights.ckpt'

In [22]:
# Create a new model instance
model = create_model()

# Load the previously saved weights
model.load_weights(latest)

# Re-evaluate the model
loss, acc = model.evaluate(X_data, y_data, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))

6774/6774 - 125s - loss: 4.7130 - accuracy: 0.2013 - 125s/epoch - 18ms/step
Restored model, accuracy: 20.13%


In [23]:
batch_size = 64

# Create a callback that saves the model's weights every 5 epochs
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    save_freq=10*batch_size)

# Train the model with the new callback
model.fit(X_data, y_data, batch_size=batch_size, epochs=100,
          callbacks=[cp_callback],
          verbose=1)

Epoch 1/25
 639/3387 [====>.........................] - ETA: 5:12 - loss: 4.8179 - accuracy: 0.1981
Epoch 1: saving model to weights.ckpt
Epoch 1: saving model to weights.ckpt
Epoch 1: saving model to weights.ckpt
Epoch 1: saving model to weights.ckpt
Epoch 1: saving model to weights.ckpt
Epoch 2/25
 452/3387 [===>..........................] - ETA: 5:25 - loss: 4.7881 - accuracy: 0.1942
Epoch 2: saving model to weights.ckpt
Epoch 2: saving model to weights.ckpt
Epoch 2: saving model to weights.ckpt
Epoch 2: saving model to weights.ckpt
Epoch 2: saving model to weights.ckpt
Epoch 3/25
 265/3387 [=>............................] - ETA: 5:28 - loss: 4.7741 - accuracy: 0.1986
Epoch 3: saving model to weights.ckpt
Epoch 3: saving model to weights.ckpt
Epoch 3: saving model to weights.ckpt
Epoch 3: saving model to weights.ckpt
Epoch 3: saving model to weights.ckpt
Epoch 4/25
  78/3387 [..............................] - ETA: 5:58 - loss: 4.7915 - accuracy: 0.2011
Epoch 4: saving model to weigh

KeyboardInterrupt: 

#### Model Saving

In [29]:
from keras.preprocessing.text import Tokenizer
import pickle as pk
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [30]:
model.save('model.h5')

pk.dump(tokenizer, open('tokenizer.pkl', 'wb'))

#### Loading Data into Memeory and Creating Text Sequences

In [31]:
def buffering_document(filename):
	
	file = open(filename, 'r')
	content = file.read()
	file.close()
	return content
 
in_filename = 'plato_text_output.txt'
document = buffering_document(in_filename)
lines = document.split('\n')

In [32]:
model = load_model('model.h5')



In [33]:
tokenizer = load(open('tokenizer.pkl', 'rb'))

#### Text Generation using RNN Model

In [34]:
seed_content = lines[randint(0,len(lines))]
print(seed_content + '\n')

ages the idea of marriage and of the family has been more and more defined and consecrated the civilized east is immeasurably in advance of any savage tribes the greeks and romans have improved upon the east the christian nations have been stricter in their views of the marriage relation than



In [36]:
cyphered = tokenizer.texts_to_sequences([seed_content])[0]

In [37]:
#y_pred_data = model.predict_classes(cyphered, verbose=0)
#y_pred_data = np.argmax(model.predict(cyphered), axis=-1)
#y_pred_data = (model.predict(cyphered) > 0.5).astype("int32") 
y_pred_data = np.argmax(cyphered, axis=-1)

In [38]:
output_text = ''
for word, index in tokenizer.word_index.items():
	if index == y_pred_data:
		output_text = word
		break

In [39]:
encoded = pad_sequences([cyphered], maxlen=chronology_len, truncating='pre')

In [40]:
def create_sequence(model, tokenizer, chronology_len, seed_content, n_words):
	outcome = list()
	input_data = seed_content

	for _ in range(n_words):

		cyphered = tokenizer.texts_to_sequences([input_data])[0]

		cyphered = pad_sequences([cyphered], maxlen=chronology_len, truncating='pre')

		y_pred_data = np.argmax(cyphered, axis=-1)

		output_text = ''
		for word, index in tokenizer.word_index.items():
			if index == y_pred_data:
				output_text = word
				break

		input_data += ' ' + output_text
		outcome.append(output_text)
	return ' '.join(outcome)

In [41]:
created = create_sequence(model, tokenizer, chronology_len, seed_content, 50)
print(created)

they are or not which be that he a in is to and of the                                   
