In [19]:
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
import pickle
from sklearn.feature_extraction.text import CountVectorizer

nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
"""
Constants
"""
SQUENCE_LENGTH = 40
MAX_FEATURES = 1000

In [21]:
"""
load the data from our dataset
"""
with open("../data/jane_austen.txt", "r", encoding="utf-8") as file:
    contents = file.read()
data = "\n".join(contents.split("\n"))

In [22]:
"""
split the whole text into word tokens related to a CountVectorizer
"""
tokens = word_tokenize(data)

print("Tokens: ", len(tokens))
print("Set of Tokens: ", len(set(tokens)))

cv = CountVectorizer(lowercase=False, token_pattern="(.*)", max_features=MAX_FEATURES)

# Hiermit wird ein Vokabular aufgebaut, welches sich über den gesamten input Text streckt
cv.fit(tokens)

# Dies muss nun extrahiert werden // Array mapping from feature integer indices to feature name
# gibt eine liste mit allen wörtern zürück.
features = cv.get_feature_names()

Tokens:  950740
Set of Tokens:  22335


In [23]:
"""
specify a number for each word since the neural network can only handle numbers
"""
word_to_int = {}
int_to_word = {}

# get the index for each word in a dictionary
for i in range(0, len(features)):
    word = tokens[i]
    word_to_int[word] = i
    int_to_word[i] = word

#convert each word to the index in the word_to_int dictionary
tokens_transformed = [word_to_int[word] for word in tokens if word in word_to_int]

In [24]:
"""
Preparation for training
"""
# MARK: - One Hot Encoding it not required since the Embedding Layer will do this
X = []
y = []


for i in range(0, len(tokens_transformed) - SQUENCE_LENGTH):
    X.append(tokens_transformed[i:i + SQUENCE_LENGTH])
    y.append(tokens_transformed[i + SQUENCE_LENGTH])

X = np.array(X)
y = np.array(y)

print("X Shape: ", X.shape)
print("y Shape: ", y.shape)

X Shape:  (539719, 40)
y Shape:  (539719,)


In [27]:
"""
Build the model
"""
model = Sequential()
model.add(Embedding(cv.max_features, 100, input_shape=(SQUENCE_LENGTH,)))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256, return_sequences=False))
#model.add(Dense(1000, activation="sigmoid"))
model.add(Dense(cv.max_features, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [28]:
"""
Train the neural network
"""
checkpoint = ModelCheckpoint("../results/40Epochs/weights/jane_austen.{epoch:02d}-{val_loss:.2f}.hdf5")
model.fit(
    X,
    to_categorical(y, num_classes=cv.max_features),
    epochs=60,
    batch_size=32,
    validation_split=0.2,
    callbacks=[checkpoint]
)

Train on 431775 samples, validate on 107944 samples
Epoch 1/60
 43584/431775 [==>...........................] - ETA: 20:24 - loss: 4.2950 - acc: 0.1170

KeyboardInterrupt: 

In [None]:
"""
Get a summery and save the data
"""
model.summary()
model.save("../results/40Epochs/jane_austen.model")

with open("../results/40Epochs/pickles/word_to_int.pickle", "wb") as file:
    pickle.dump(word_to_int, file)

with open("../results/40Epochs/pickles/int_to_word.pickle", "wb") as file:
    pickle.dump(int_to_word, file)