In [None]:
import glob2
from nltk import PorterStemmer, word_tokenize, FreqDist
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras import layers, Sequential

In [None]:
# set up variables

# Placeholder token that replace_uncommon_words() substitutes for rare words.
unknown_word_token = '<UNK>'
# NOTE(review): not referenced by any visible cell (convert_window uses its
# own window length) — confirm whether this is still needed.
context_size = 2

# Shared stemmer instance applied to every token inside process().
porterStemmer = PorterStemmer()

# A word must occur strictly more than this many times to be kept; words at
# or below this count are replaced by unknown_word_token.
minimum_frequency = 1

# Directory prefix for corpus files. read_corpus() joins it to file names by
# plain string concatenation, so the trailing slash is required.
data_dir='data/'

In [None]:
# define functions

def read_corpus(titles=None, max_chars=100000):
    """Read text files under ``data_dir`` and concatenate their contents.

    Args:
        titles: Optional iterable of file names relative to ``data_dir``.
            When ``None`` or empty, every path matching ``data_dir + '*'``
            is read instead.
        max_chars: Maximum number of characters kept from the start of each
            file. ``None`` keeps each file in full. Defaults to 100000, the
            limit that used to be hard-coded.

    Returns:
        One string holding the (possibly truncated) contents of each file,
        in the order the files are listed.
    """
    if not titles:
        # Glob results come back in no guaranteed order; sorting keeps the
        # corpus (and everything derived from it) stable between runs.
        files = sorted(glob2.glob(data_dir + '*'))
    else:
        files = [data_dir + title for title in titles]
    print(files)

    chunks = []
    for file in files:
        # 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
        with open(file, 'rt', encoding='utf-8-sig') as f:
            chunks.append(f.read()[:max_chars])
    return ''.join(chunks)


def process(st):
    """Turn raw text into a list of lower-cased, stemmed tokens.

    Apostrophes are removed outright and a fixed set of punctuation marks is
    replaced by spaces. The cleaned text is then split with ``word_tokenize``
    and each token is lower-cased, stripped and stemmed with the shared
    ``porterStemmer``.

    Args:
        st: The text to normalise.

    Returns:
        List of stemmed token strings, in document order.
    """
    # A single translation table covers both steps: mapping to None deletes
    # the character, mapping to ' ' blanks it out.
    table = {ord('\''): None}
    table.update((ord(ch), ' ') for ch in ',.!?"-;()')
    cleaned = st.translate(table)

    stemmed = []
    for token in word_tokenize(cleaned):
        stemmed.append(porterStemmer.stem(token.lower().strip()))
    return stemmed


def replace_uncommon_words(words, min_frequency=None, unk_token=None):
    """Replace infrequent words with a placeholder token.

    A word is kept only if it occurs strictly more than ``min_frequency``
    times in ``words``; every other occurrence is replaced by ``unk_token``.

    Args:
        words: Sequence of tokens.
        min_frequency: Count threshold. When ``None`` the module-level
            ``minimum_frequency`` is used.
        unk_token: Replacement for rare words. When ``None`` the module-level
            ``unknown_word_token`` is used.

    Returns:
        A new list the same length as ``words`` with rare words replaced.
    """
    if min_frequency is None:
        min_frequency = minimum_frequency
    if unk_token is None:
        unk_token = unknown_word_token

    counts = Counter(words)
    # A set gives O(1) membership tests; checking against a list made the
    # final pass cost O(len(words) * vocabulary size).
    keep = {word for word, count in counts.items() if count > min_frequency}
    return [word if word in keep else unk_token for word in words]


def create_word_indices(words):
    """Build lookup tables between words and integer indices.

    Indices are assigned in sorted word order, so the same vocabulary always
    produces the same mapping. Assigning them from set iteration order gave
    a mapping that could change between interpreter sessions, because string
    hashing is randomised per process.

    Args:
        words: Iterable of tokens; they must be mutually orderable
            (e.g. all strings) so that they can be sorted.

    Returns:
        Tuple ``(index_to_word, word_to_index, vocab)`` where the first two
        are dicts that invert each other and ``vocab`` is the set of
        distinct words.
    """
    vocab = set(words)
    index_to_word = dict(enumerate(sorted(vocab)))
    word_to_index = {word: index for index, word in index_to_word.items()}
    return index_to_word, word_to_index, vocab


def convert_window(words_with_unk, length=10):
    """Slice a token sequence into overlapping fixed-size windows.

    Each window holds ``length + 1`` consecutive tokens: ``length`` context
    tokens followed by the token that comes right after them. Consecutive
    windows are shifted by one token.

    Args:
        words_with_unk: Sequence of tokens (anything that supports slicing).
        length: Number of context tokens per window. Defaults to 10.

    Returns:
        List of windows; empty when the input has ``length`` tokens or fewer.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError('length must be at least 1')

    # `end` is the position of the final token of each window, hence end + 1
    # as the exclusive slice bound.
    return [
        words_with_unk[end - length:end + 1]
        for end in range(length, len(words_with_unk))
    ]


def split_data(X, Y, random_state=None):
    """Split paired arrays into train, validation and test partitions.

    ``train_test_split`` is applied twice with its default split size: first
    to hold out the test partition, then again on the remainder to hold out
    the validation partition.

    Args:
        X: Input samples.
        Y: Targets aligned with ``X``.
        random_state: Seed forwarded to both ``train_test_split`` calls. Pass
            an int for reproducible partitions; the default ``None`` leaves
            the split unseeded.

    Returns:
        Tuple ``(X_train, X_val, X_test, Y_train, Y_val, Y_test)``.
    """
    X_rest, X_test, Y_rest, Y_test = train_test_split(
        X, Y, random_state=random_state)
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_rest, Y_rest, random_state=random_state)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test


In [None]:
# Execution section
# Load only bleak.txt from data_dir; read_corpus also prints the paths it opens.
corpus = read_corpus(['bleak.txt'])
# Number of characters that were loaded.
print(len(corpus))

In [None]:
# Preview the first 1000 characters of the raw text.
corpus[:1000]

In [None]:
# Tokenise and stem the raw text, then show the first ten tokens.
words = process(corpus)
words[:10]

In [None]:
# Replace words that are not frequent enough with the unknown-word token,
# then show the first five tokens of the result.
words_with_unk = replace_uncommon_words(words)
words_with_unk[:5]

In [None]:
# Build the index<->word lookup tables and the vocabulary set from the filtered tokens.
index_to_word, word_to_index, vocab = create_word_indices(words_with_unk)

In [None]:
# Number of distinct tokens after filtering; used later as the one-hot width
# and as the size of the model's output layer.
vocab_size = len(vocab)
vocab_size

In [None]:
# Slice the token stream into overlapping windows and show the first two.
# The last word of each window is later used as the prediction target.
examples = convert_window(words_with_unk)
examples[:2]

In [None]:
def convert_word_data_to_numbers(input, word_index=None):
    """Convert word windows into integer input and target arrays.

    For every window, all words except the last become one row of ``X`` and
    the last word becomes the matching single-element row of ``Y``.

    Args:
        input: Iterable of word sequences. All sequences need the same length
            so the rows can be stacked. (The name shadows the builtin but is
            kept so existing keyword callers continue to work.)
        word_index: Mapping from word to integer index. When ``None`` the
            module-level ``word_to_index`` is used.

    Returns:
        Tuple ``(X, Y)`` of NumPy arrays: ``X`` has one row of context
        indices per window, ``Y`` has one single-column row per window.

    Raises:
        ValueError: If ``input`` contains no sequences.
        KeyError: If a word is missing from the mapping.
    """
    if word_index is None:
        word_index = word_to_index

    rows = list(input)
    if not rows:
        # np.vstack would also raise ValueError here, but with a message that
        # does not point at the actual cause.
        raise ValueError('convert_word_data_to_numbers() needs at least one sequence')

    X = np.vstack([[word_index[word] for word in row[:-1]] for row in rows])
    Y = np.vstack([[word_index[word] for word in row[-1:]] for row in rows])

    return X, Y

In [None]:
# X: indices of every word but the last in each window; Y: index of the last word.
X, Y = convert_word_data_to_numbers(examples)

In [None]:
# First five rows of input word indices.
X[0:5]

In [None]:
# First five target word indices.
Y[0:5]

In [None]:
# One-hot encode targets and inputs over the vocabulary, then show the shapes.
# NOTE(review): X and Y are overwritten with their encoded versions, so this
# cell is not idempotent — running it a second time would pass the already
# one-hot arrays back into to_categorical. Re-run the cell that creates X and
# Y first, or consider storing the encoded arrays under new names.
Y = to_categorical(Y, num_classes=vocab_size)
X = to_categorical(X, num_classes=vocab_size)
X.shape, Y.shape

In [None]:
# Split into train / validation / test partitions.
# NOTE(review): no random seed is passed, so the partitions presumably differ
# from run to run — confirm whether reproducible splits are wanted.
X_train, X_val, X_test, Y_train, Y_val, Y_test =  split_data(X, Y)

In [None]:
# Next-word model: one LSTM layer over the one-hot window, then a softmax
# over the vocabulary.
model = Sequential()
# input_shape is taken from the encoded X: X.shape[1] positions per window,
# X.shape[2] values per position.
model.add(layers.LSTM(units=75, input_shape=(X.shape[1], X.shape[2])))
# One output unit per vocabulary entry.
model.add(layers.Dense(vocab_size, activation='softmax'))
# categorical_crossentropy pairs with the one-hot encoded Y produced by to_categorical.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 20 epochs. The validation partition produced by split_data is
# passed as validation_data so that held-out loss/accuracy is reported after
# every epoch; without it that partition was never used and overfitting could
# not be seen in the training log.
model.fit(X_train, Y_train, epochs=20, verbose=2, batch_size=64,
          validation_data=(X_val, Y_val))