In [19]:
import glob2
from nltk import PorterStemmer, word_tokenize, FreqDist
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras import layers, Sequential
print('done')

done


In [2]:
# set up variables

unknown_word_token = '<UNK>'
context_size = 2

porterStemmer = PorterStemmer()

minimum_frequency = 1

data_dir='data/'

In [3]:
# define functions

def read_corpus(titles=None):
    if not titles:
        files = glob2.glob(data_dir+'*')
    else:
        files = []
        for title in titles:
            files.append(data_dir+title)
    print(files)

    content = ''
    for file in files:
        with open(file, 'rt', encoding='utf-8-sig') as f:
            content += f.read()[:100000]
    return content


def process(st):
    strip_chars = ['\'']
    for c in strip_chars:
        st = st.replace(c , '')

    rep_with_space_chars = [',', '.', '!', '?', '"', '-', ';', '(', ')']
    for c in rep_with_space_chars:
        st = st.replace(c , ' ')

    word_list = word_tokenize(st)
    word_list = [porterStemmer.stem(a.lower().strip()) for a in word_list]
    return word_list


def replace_uncommon_words(words):    
    most_common_count = len({k:v for k, v in FreqDist(words).items() if v>minimum_frequency})
    c = Counter(words)
    most_common = [pair[0] for pair in c.most_common(most_common_count)]
    return [word if word in most_common else unknown_word_token for word in words]


def create_word_indices(words):
    vocab = set(words)
    index_to_word = {k:v for k, v in enumerate(vocab)}
    word_to_index = {v:k for k, v in index_to_word.items()}; 
    return index_to_word, word_to_index, vocab


def convert_window(words_with_unk):
    length = 10
    sequences = []
    for a in range(length, len(words_with_unk)):
        sequences.append(words_with_unk[a-length:a+1])
    
    return sequences


def split_data(X, Y):
    X_train_incl_val, X_test, Y_train_incl_val, Y_test = train_test_split(X, Y)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train_incl_val, Y_train_incl_val)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test


In [4]:
# Execution section
corpus = read_corpus(['bleak.txt'])
print(len(corpus))

['data/bleak.txt']
100000


In [5]:
corpus[:1000]

"BLEAK HOUSE\n\nby\n\nCHARLES DICKENS\n\n\n\n\n\nCONTENTS\n\n            Preface\n         I. In Chancery\n        II. In Fashion\n       III. A Progress\n        IV. Telescopic Philanthropy\n         V. A Morning Adventure\n        VI. Quite at Home\n       VII. The Ghost's Walk\n      VIII. Covering a Multitude of Sins\n        IX. Signs and Tokens\n         X. The Law-Writer\n        XI. Our Dear Brother\n       XII. On the Watch\n      XIII. Esther's Narrative\n       XIV. Deportment\n        XV. Bell Yard\n       XVI. Tom-all-Alone's\n      XVII. Esther's Narrative\n     XVIII. Lady Dedlock\n       XIX. Moving On\n        XX. A New Lodger\n       XXI. The Smallweed Family\n      XXII. Mr. Bucket\n     XXIII. Esther's Narrative\n      XXIV. An Appeal Case\n       XXV. Mrs. Snagsby Sees It All\n      XXVI. Sharpshooters\n     XXVII. More Old Soldiers Than One\n    XXVIII. The Ironmaster\n      XXIX. The Young Man\n       XXX. Esther's Narrative\n      XXXI. Nurse and Patient\n     X

In [6]:
words = process(corpus)
words[:10]

['bleak',
 'hous',
 'by',
 'charl',
 'dicken',
 'content',
 'prefac',
 'i',
 'in',
 'chanceri']

In [7]:
words_with_unk = replace_uncommon_words(words)
words_with_unk[:5]

['bleak', 'hous', 'by', '<UNK>', '<UNK>']

In [8]:
index_to_word, word_to_index, vocab = create_word_indices(words_with_unk)

In [9]:
vocab_size = len(vocab)
vocab_size

1259

In [10]:
examples = convert_window(words_with_unk)
examples[:2]

[['bleak',
  'hous',
  'by',
  '<UNK>',
  '<UNK>',
  'content',
  'prefac',
  'i',
  'in',
  'chanceri',
  'ii'],
 ['hous',
  'by',
  '<UNK>',
  '<UNK>',
  'content',
  'prefac',
  'i',
  'in',
  'chanceri',
  'ii',
  'in']]

In [11]:
def convert_word_data_to_numbers(input):
    Xs = []
    Ys = []
    for row in input:
        Xs.append([word_to_index[word] for word in row[:-1]])
        Ys.append([word_to_index[word] for word in row[-1:]])

    X = np.vstack(Xs)
    Y = np.vstack(Ys)

    return X, Y

In [12]:
X, Y = convert_word_data_to_numbers(examples)

In [13]:
X[0:5]

array([[1029,  664, 1025,  883,  883,  326, 1169, 1209, 1035,  185],
       [ 664, 1025,  883,  883,  326, 1169, 1209, 1035,  185,  300],
       [1025,  883,  883,  326, 1169, 1209, 1035,  185,  300, 1035],
       [ 883,  883,  326, 1169, 1209, 1035,  185,  300, 1035,  708],
       [ 883,  326, 1169, 1209, 1035,  185,  300, 1035,  708,  369]])

In [14]:
Y[0:5]

array([[ 300],
       [1035],
       [ 708],
       [ 369],
       [ 431]])

In [15]:
Y = to_categorical(Y, num_classes=vocab_size)
X = to_categorical(X, num_classes=vocab_size)
X.shape, Y.shape

((18212, 10, 1259), (18212, 1259))

In [16]:
X_train, X_val, X_test, Y_train, Y_val, Y_test =  split_data(X, Y)

In [17]:
model = Sequential()
model.add(layers.LSTM(units=75, input_shape=(X.shape[1], X.shape[2])))
model.add(layers.Dense(vocab_size, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [18]:
model.fit(X_train, Y_train, epochs=20, verbose=2, batch_size=64)

Epoch 1/20
161/161 - 3s - loss: 6.0330 - accuracy: 0.0720
Epoch 2/20
161/161 - 3s - loss: 5.6008 - accuracy: 0.0757
Epoch 3/20
161/161 - 3s - loss: 5.5788 - accuracy: 0.0757
Epoch 4/20
161/161 - 3s - loss: 5.5619 - accuracy: 0.0757
Epoch 5/20
161/161 - 3s - loss: 5.5468 - accuracy: 0.0759
Epoch 6/20
161/161 - 3s - loss: 5.5270 - accuracy: 0.0798
Epoch 7/20
161/161 - 3s - loss: 5.4967 - accuracy: 0.0791
Epoch 8/20
161/161 - 3s - loss: 5.4480 - accuracy: 0.0856
Epoch 9/20
161/161 - 3s - loss: 5.3905 - accuracy: 0.0954
Epoch 10/20
161/161 - 3s - loss: 5.3273 - accuracy: 0.1029
Epoch 11/20
161/161 - 3s - loss: 5.2597 - accuracy: 0.1127
Epoch 12/20
161/161 - 3s - loss: 5.1910 - accuracy: 0.1242
Epoch 13/20
161/161 - 3s - loss: 5.1080 - accuracy: 0.1318
Epoch 14/20
161/161 - 3s - loss: 5.0249 - accuracy: 0.1432
Epoch 15/20
161/161 - 3s - loss: 4.9386 - accuracy: 0.1539
Epoch 16/20
161/161 - 3s - loss: 4.8510 - accuracy: 0.1632
Epoch 17/20
161/161 - 3s - loss: 4.7577 - accuracy: 0.1702
Epoch 

<tensorflow.python.keras.callbacks.History at 0x13f11caf0>