In [1]:
import os
import string
import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding, Dense, Input, LSTM
from keras.models import Model

Using TensorFlow backend.


In [2]:
with open('republic.txt', mode='r') as file:
    print(len(file.readlines()))

15802


### Data Prepration


* Book/Chapter headings (e.g. “BOOK I.”).
* British English spelling (e.g. “honoured”)
* Lots of punctuation (e.g. “–“, “;–“, “?–“, and more)
* Strange names (e.g. “Polemarchus”).
* Some long monologues that go on for hundreds of lines.
* Some quoted dialog (e.g. ‘…’)


In [3]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# load document
in_filename = 'republic.txt'
doc = load_doc(in_filename)
print(doc[:200])

BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what


### Clean Text


* Replace ‘–‘ with a white space so we can split words better.
* Split words based on white space.
* Remove all punctuation from words to reduce the vocabulary size (e.g. ‘What?’ becomes ‘What’).
* Remove all words that are not alphabetic to remove standalone punctuation tokens.
* Normalize all words to lowercase to reduce the vocabulary size.


In [4]:
# turn a doc into clean tokens
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

In [5]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'run', 'and', 'bid',

In [6]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 118633


In [7]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [8]:
# save sequences to file
out_filename = 'republic_sequences.txt'
save_doc(sequences, out_filename)

# Train Language Model

* It uses a distributed representation for words so that different words with similar meanings will have a similar representation.
* It learns the representation at the same time as learning the model.
* It learns to predict the probability for the next word using the context of the last 100 words.


In [9]:
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [11]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('vocab size is: ', vocab_size)

vocab size is:  7410


In [12]:
# separate into input and output
sequences = np.array(sequences)
x_train, y_train = sequences[:, :-1], sequences[:, -1]
y_train = to_categorical(y_train, num_classes = vocab_size)
seq_length = x_train.shape[1]
print('sequence length is: ', seq_length)
print('x_train shape', x_train.shape)
print('y_train shape', y_train.shape)

sequence length is:  50
x_train shape (118633, 50)
y_train shape (118633, 7410)


In [13]:
# define model
inputs = Input(shape=(50,) ,name='inputs')
x = inputs
x = Embedding(input_dim= vocab_size, output_dim=50)(inputs)
x = LSTM(128, return_sequences= True)(x)
x= LSTM(128)(x)
x = Dense(128, activation='relu')(x)
outputs = Dense(vocab_size, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

W0902 09:59:37.514068  5444 deprecation_wrapper.py:119] From C:\Users\10\Anaconda3\envs\gpu_env\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0902 09:59:37.529200  5444 deprecation_wrapper.py:119] From C:\Users\10\Anaconda3\envs\gpu_env\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0902 09:59:37.530328  5444 deprecation_wrapper.py:119] From C:\Users\10\Anaconda3\envs\gpu_env\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0902 09:59:37.931329  5444 deprecation_wrapper.py:119] From C:\Users\10\Anaconda3\envs\gpu_env\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0902 09:59:37.954182  5444 deprecation_

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 50)            370500    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 128)           91648     
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 7410)              955890    
Total params: 1,566,134
Trainable params: 1,566,134
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(x_train, y_train, batch_size=128, epochs=200)

W0831 22:26:33.004870  2896 deprecation_wrapper.py:119] From C:\Users\10\Anaconda3\envs\gpu_env\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0831 22:26:33.100643  2896 deprecation_wrapper.py:119] From C:\Users\10\Anaconda3\envs\gpu_env\lib\site-packages\keras\backend\tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.

W0831 22:26:33.208351  2896 deprecation.py:323] From C:\Users\10\Anaconda3\envs\gpu_env\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0831 22:26:34.516002  2896 deprecation_wrapper.py:119] From C:\Users\10\Anaconda3\envs\gpu_env\lib\site-packages\keras\backend\tensorflow_backend.py:986: The name tf.assign_a

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/20

<keras.callbacks.History at 0x1cf9e409da0>

In [15]:
# save the model to file
model.save('model.h5')
# save the tokenizer
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))

# Use Language Model

In [18]:
# select a seed text
seed_text = lines[np.random.randint(0, len(lines))]
print(seed_text + '\n')

here are three beds one existing in nature which is made by god as i think that we may say for no one else can be the maker no there is another which is the work of the carpenter yes and the work of the painter is a third yes beds



In [28]:
encoded = np.array(tokenizer.texts_to_sequences([seed_text]))
# predict the probabilities for each word
y_hat = np.argmax(model.predict(encoded[:,:seq_length], verbose=0)[0])
out_word = ''
for word, index in tokenizer.word_index.items():
    if index == y_hat:
        out_word = word
        break
print('predicted word is: ', out_word)

(1, 51)

In [49]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = np.array(tokenizer.texts_to_sequences([in_text]))
        encoded = pad_sequences(encoded, maxlen=seq_length, truncating='pre')
        y_hat = np.argmax(model.predict(encoded, verbose=0)[0])
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == y_hat:
                out_word = word
                break
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
        

In [51]:
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(seed_text)
print(generated)

here are three beds one existing in nature which is made by god as i think that we may say for no one else can be the maker no there is another which is the work of the carpenter yes and the work of the painter is a third yes beds
then are of man as in what men is not as a man to hold in the first place that he has no cares appetites and then they are prescribing because the lovers of natures have a faculty of poets he is the only reason yes he replied then has


### Use Pretrained Embedding