In [1]:
import glob2
from nltk import PorterStemmer, word_tokenize, FreqDist
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# set up variables

unknown_word_token = '<UNK>'
context_size = 2

porterStemmer = PorterStemmer()

minimum_frequency = 10

data_dir='data/'

In [3]:
# define functions

def read_corpus(titles=None):
    if not titles:
        files = glob2.glob(data_dir+'*')
    else:
        files = []
        for title in titles:
            files.append(data_dir+title)
    print(files)

    content = ''
    for file in files:
        with open(file, 'rt', encoding='utf-8-sig') as f:
            content += f.read()
    return content


def process(st):
    strip_chars = ['\'']
    for c in strip_chars:
        st = st.replace(c , '')

    rep_with_space_chars = [',', '.', '!', '?', '"', '-', ';', '(', ')']
    for c in rep_with_space_chars:
        st = st.replace(c , ' ')

    word_list = word_tokenize(st)
    word_list = [porterStemmer.stem(a.lower().strip()) for a in word_list]
    return word_list


def replace_uncommon_words(words):    
    most_common_count = len({k:v for k, v in FreqDist(words).items() if v>minimum_frequency})
    c = Counter(words)
    most_common = [pair[0] for pair in c.most_common(most_common_count)]
    return [word if word in most_common else unknown_word_token for word in words]


def create_word_indices(words):
    vocab = set(words)
    index_to_word = {k:v for k, v in enumerate(vocab)}
    word_to_index = {v:k for k, v in index_to_word.items()}; 
    return index_to_word, word_to_index, vocab


def convert_window(words_with_unk):
    examples = []
    for a in range(context_size, len(words_with_unk)-context_size):
        examples.append([item for sublist in [words_with_unk[a-context_size:a], words_with_unk[a+1:a+context_size+1], [words_with_unk[a]]] for item in sublist])
    return examples


def split_data(X, Y):
    X_train_incl_val, X_test, Y_train_incl_val, Y_test = train_test_split(X, Y)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train_incl_val, Y_train_incl_val)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test


In [4]:
# Execution section
corpus = read_corpus(['bleak.txt'])
print(len(corpus))

['data/bleak.txt']
1941579


In [5]:
corpus[:1000]

"BLEAK HOUSE\n\nby\n\nCHARLES DICKENS\n\n\n\n\n\nCONTENTS\n\n            Preface\n         I. In Chancery\n        II. In Fashion\n       III. A Progress\n        IV. Telescopic Philanthropy\n         V. A Morning Adventure\n        VI. Quite at Home\n       VII. The Ghost's Walk\n      VIII. Covering a Multitude of Sins\n        IX. Signs and Tokens\n         X. The Law-Writer\n        XI. Our Dear Brother\n       XII. On the Watch\n      XIII. Esther's Narrative\n       XIV. Deportment\n        XV. Bell Yard\n       XVI. Tom-all-Alone's\n      XVII. Esther's Narrative\n     XVIII. Lady Dedlock\n       XIX. Moving On\n        XX. A New Lodger\n       XXI. The Smallweed Family\n      XXII. Mr. Bucket\n     XXIII. Esther's Narrative\n      XXIV. An Appeal Case\n       XXV. Mrs. Snagsby Sees It All\n      XXVI. Sharpshooters\n     XXVII. More Old Soldiers Than One\n    XXVIII. The Ironmaster\n      XXIX. The Young Man\n       XXX. Esther's Narrative\n      XXXI. Nurse and Patient\n     X

In [6]:
words = process(corpus)
words[:10]

['bleak',
 'hous',
 'by',
 'charl',
 'dicken',
 'content',
 'prefac',
 'i',
 'in',
 'chanceri']

In [7]:
words_with_unk = replace_uncommon_words(words)
words_with_unk[:5]

['bleak', 'hous', 'by', '<UNK>', '<UNK>']

In [8]:
index_to_word, word_to_index, vocab = create_word_indices(words_with_unk)

In [9]:
len(vocab)

2300

In [10]:
examples = convert_window(words_with_unk)
examples[:10]

[['bleak', 'hous', '<UNK>', '<UNK>', 'by'],
 ['hous', 'by', '<UNK>', 'content', '<UNK>'],
 ['by', '<UNK>', 'content', '<UNK>', '<UNK>'],
 ['<UNK>', '<UNK>', '<UNK>', 'i', 'content'],
 ['<UNK>', 'content', 'i', 'in', '<UNK>'],
 ['content', '<UNK>', 'in', 'chanceri', 'i'],
 ['<UNK>', 'i', 'chanceri', '<UNK>', 'in'],
 ['i', 'in', '<UNK>', 'in', 'chanceri'],
 ['in', 'chanceri', 'in', 'fashion', '<UNK>'],
 ['chanceri', '<UNK>', 'fashion', '<UNK>', 'in']]

In [18]:
def convert_word_data_to_numbers(input):
    Xs = []
    Ys = []
    for row in input:
        Xs.append([word_to_index[word] for word in row[:-1]])
        Ys.append([word_to_index[word] for word in row[-1:]])

    X = np.vstack(Xs)
    Y = np.vstack(Ys)

    return X, Y

In [21]:
X, Y = convert_word_data_to_numbers(examples)

In [22]:
X[0:5]

array([[2179, 2059, 1188, 1188],
       [2059,  162, 1188, 1715],
       [ 162, 1188, 1715, 1188],
       [1188, 1188, 1188, 1967],
       [1188, 1715, 1967,  842]])

In [23]:
Y[0:5]

array([[ 162],
       [1188],
       [1188],
       [1715],
       [1188]])

In [24]:
X_train, X_val, X_test, Y_train, Y_val, Y_test =  split_data(X, Y)

Keras RNN [documentation](https://keras.io/guides/working_with_rnns/).