In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import string
from nltk.tokenize import word_tokenize
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [6]:
with open('bleak.txt', 'rt') as f:
    content = f.read()
print(content[:100])

﻿The Project Gutenberg eBook, Bleak House, by Charles Dickens


This eBook is for the use of anyone 


In [7]:
content = content.translate(str.maketrans('', '', string.punctuation))
words = word_tokenize(content)
print(words[:10])

most_common_count = 2000
c = Counter(words)
most_common = [pair[0] for pair in c.most_common(most_common_count)]
words_with_unk = [word if word in most_common else '<UNK>' for word in words]
print(words_with_unk[:10])

context_size = 2
input = []
for a in range(context_size, len(words_with_unk)-context_size):
    x = [item for sublist in [words_with_unk[a-context_size:a], words_with_unk[a+1:a+context_size+1], [words_with_unk[a]]] for item in sublist]
    input.append(x)
    
print(input[:5])
print(len(input), len(words_with_unk))

vocab = set(words_with_unk)
print(len(vocab))

['\ufeffThe', 'Project', 'Gutenberg', 'eBook', 'Bleak', 'House', 'by', 'Charles', 'Dickens', 'This']
['<UNK>', 'Project', 'Gutenberg', '<UNK>', 'Bleak', 'House', 'by', '<UNK>', '<UNK>', 'This']
[['<UNK>', 'Project', '<UNK>', 'Bleak', 'Gutenberg'], ['Project', 'Gutenberg', 'Bleak', 'House', '<UNK>'], ['Gutenberg', '<UNK>', 'House', 'by', 'Bleak'], ['<UNK>', 'Bleak', 'by', '<UNK>', 'House'], ['Bleak', 'House', '<UNK>', '<UNK>', 'by']]
356557 356561
2001


In [8]:
index_to_word = {k:v for k, v in enumerate(vocab)}
word_to_index = {v:k for k, v in index_to_word.items()}; 
gen = (item for item in word_to_index.items())
print([next(gen) for _ in range(5)])


def word_to_vec(word):
    vec = np.zeros(len(vocab))
    if word in word_to_index.keys():
        vec[word_to_index[word]] = 1
    else:
        vec[word_to_index['<UNK>']] = 1
    return vec
word_to_one_hot = {}
for word in vocab:
    word_to_one_hot[word] = word_to_vec(word)
    
print(list(word_to_one_hot.items())[:5])

[('however', 0), ('At', 1), ('conclusion', 2), ('figure', 3), ('ironmaster', 4)]
[('however', array([1., 0., 0., ..., 0., 0., 0.])), ('At', array([0., 1., 0., ..., 0., 0., 0.])), ('conclusion', array([0., 0., 1., ..., 0., 0., 0.])), ('figure', array([0., 0., 0., ..., 0., 0., 0.])), ('ironmaster', array([0., 0., 0., ..., 0., 0., 0.]))]


In [15]:
Xs = []
Ys = []
for row in input[:1000]:
    x = np.vstack([word_to_one_hot[word] for word in row[:-1]]).mean(axis=0)
    Xs.append(x)
    y = [word_to_one_hot[word] for word in row[-1:]]
    Ys.append(y)
    
X = np.vstack(Xs)
Y = np.vstack(Ys)

print(X.shape, Y.shape)

np.set_printoptions(threshold=1002)
print(np.argmax(Y[0]))

X = csr_matrix(X)
Y = csr_matrix(Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

(1000, 2001) (1000, 2001)
22


In [32]:
model = keras.Sequential()

model.add(layers.Dense(2000, activation='relu', name='layer-1', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(1000, activation='relu', name='layer-2'))
model.add(layers.Dense(len(vocab), activation='softmax', name='output-layer'))
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer-1 (Dense)              (None, 2000)              4004000   
_________________________________________________________________
layer-2 (Dense)              (None, 1000)              2001000   
_________________________________________________________________
output-layer (Dense)         (None, 2001)              2003001   
Total params: 8,008,001
Trainable params: 8,008,001
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.compile(
    optimizer=keras.optimizers.RMSprop(),  
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=[keras.metrics.CategoricalAccuracy()],
)
model.fit(X_train.toarray(), Y_train.toarray(), batch_size=256, epochs=50)

Train on 750 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x154232a50>

In [34]:
model.save('saved_model/first_model')

INFO:tensorflow:Assets written to: saved_model/first_model/assets


In [35]:
new_model = tf.keras.models.load_model('saved_model/first_model')
model.build(input_shape=(x_train.shape[1],))
new_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer-1 (Dense)              (None, 2000)              4004000   
_________________________________________________________________
layer-2 (Dense)              (None, 1000)              2001000   
_________________________________________________________________
output-layer (Dense)         (None, 2001)              2003001   
Total params: 8,008,001
Trainable params: 8,008,001
Non-trainable params: 0
_________________________________________________________________
