In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation, TimeDistributed
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import RMSprop
from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
import keras
import json
import numpy as np
import random
import sys
import re
import hashlib
import os.path
from collections import Counter

Using TensorFlow backend.


In [2]:
# NOTE(review): no visible cell reads this global; inside read_file the
# parameter of the same name shadows it.
path = "data/corpus.txt"


def read_file(path):
    """Return the whole contents of the UTF-8 text file at `path` as one string."""
    with open(path, encoding='utf8') as source:
        contents = source.read()
    return contents

# Concatenate every source document, in this order, into one training corpus.
corpus_files = ['data/RFAtZ.txt', 'data/lw.txt', 'data/fb.txt', 'data/tweets.txt']
text = ''.join(read_file(name) for name in corpus_files)
print('corpus length:', len(text))

# Per-character frequency table; its keys double as the model's vocabulary.
chars = Counter(text)
print(chars)

print('total chars:', len(chars))
# Two-way lookup tables between characters and integer ids. Ids follow the
# Counter's iteration order, so both tables are built from the same pass.
# NOTE(review): that order is only guaranteed stable across kernel restarts
# on Python 3.7+ (insertion-ordered dicts) - confirm before reusing saved weights.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}


corpus length: 5297143
Counter({' ': 872154, 'e': 488583, 't': 397622, 'o': 331159, 'a': 314869, 'i': 304210, 'n': 289108, 's': 263372, 'r': 229723, 'h': 192644, 'l': 179703, 'u': 135404, 'd': 131923, 'c': 122805, 'm': 96587, 'y': 95085, 'p': 86220, 'f': 83762, 'g': 82848, 'w': 71028, 'b': 65213, ',': 46561, '.': 45484, 'v': 45268, '\n': 39185, 'k': 30234, 'I': 23751, "'": 22320, '"': 18704, '-': 17797, 'T': 11413, 'x': 11131, 'A': 9846, 'S': 6953, 'B': 6036, 'j': 5666, '0': 5235, 'q': 5185, 'W': 5122, '?': 5110, '1': 4838, 'E': 4763, ')': 4672, 'z': 4600, '(': 4535, ':': 4485, 'M': 4024, 'P': 3984, 'C': 3780, 'O': 3646, 'H': 3497, 'N': 3497, 'D': 3483, 'R': 3466, 'Y': 3273, '2': 3140, 'F': 3118, ';': 2687, 'L': 2665, 'G': 2511, '/': 2488, '3': 1802, '9': 1771, '!': 1763, '5': 1616, '>': 1493, 'U': 1351, '4': 1230, '*': 1196, 'J': 1152, '7': 1033, '#': 1025, '%': 977, '6': 947, '8': 933, 'X': 879, 'K': 724, 'V': 676, '_': 639, '=': 533, 'Z': 478, '$': 430, 'Q': 429, '+': 387, '\\': 351

In [4]:
# Number of characters of context fed to the network per training window.
maxlen = 64

# Presumably resets Keras' auto-generated name counters so that re-running
# this cell yields the same layer names (lstm_1, dropout_1, ...) - TODO confirm.
keras.backend.common.reset_uids()

# Three stacked 512-unit LSTMs, each followed by 40% dropout. The first two
# return the full sequence so the next LSTM receives one vector per time step;
# the last returns only its final state, which the softmax layer turns into a
# distribution over the vocabulary (one output per entry in `chars`).
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.4))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.4))
model.add(LSTM(512))
model.add(Dropout(0.4))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.summary()

# Short fingerprint of the architecture JSON, taken before the model is
# renamed, so different architectures get different artefact names.
digest = hashlib.sha1(model.to_json(sort_keys=True).encode('utf-8')).hexdigest()[:4]
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.name ='input%dx%d_layers%d_params%dk_h%s' % (model.input_shape[1:] + (len(model.layers), model.count_params()//1000, digest))
print(model.name)

# The three artefacts below all live in models/; create it first so the cell
# also works on a fresh checkout where the directory does not exist yet.
os.makedirs('models', exist_ok=True)
model_prefix = 'models/' + model.name
model.save(model_prefix + '.model.h5')
with open(model_prefix + '.model.json', 'w') as f:
    f.write(model.to_json(sort_keys=True))
# json.dump writes the integer ids as string keys; anything reading this file
# back has to convert them with int().
with open(model_prefix + '.chars.json', 'w') as f:
    json.dump(indices_char, f)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_1 (LSTM)                    (None, 64, 512)       1245184     lstm_input_1[0][0]               
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 64, 512)       0           lstm_1[0][0]                     
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 64, 512)       2099200     dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 64, 512)       0           lstm_2[0][0]                     
___________________________________________________________________________________________

In [5]:
# cut the text in semi-redundant sequences of maxlen characters
# Windows start every `step` characters; each one is paired with the single
# character that immediately follows it, which is the prediction target.
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[i : i + maxlen] for i in window_starts]
next_chars = [text[i + maxlen] for i in window_starts]
print('nb sequences:', len(sentences))
print('Vectorization...')
# One-hot encode inputs as (window, position, char id) and targets as
# (window, char id). The builtin `bool` is used as dtype because the `np.bool`
# alias was deprecated in NumPy 1.20 and removed in 1.24; the resulting array
# dtype is the same on every NumPy version.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

nb sequences: 1765693
Vectorization...


In [6]:
print('Splitting test...')
# Hold out 5% of the windows; the fixed seed keeps the split repeatable.
# NOTE(review): windows start every 3 characters but are 64 long, so
# neighbouring windows overlap heavily - a per-window split probably puts
# near-copies of training windows into the held-out set. Confirm this is intended.
split_seed = 17045  # 17045 is the rationality number
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=split_seed,
)

Splitting test...


In [None]:
print('Loading weights...')
# Restore a previously saved checkpoint for the current architecture: the
# file name is the model's generated name plus a fixed loss suffix.
weights_filename = ''.join(['models/', model.name, '.weights_loss1.2726.h5'])
model.load_weights(weights_filename)

In [None]:
print('Evaluating...')
# The model is compiled without extra metrics, so evaluate() is formatted
# here as a single loss value on the held-out windows.
test_loss = model.evaluate(X_test, y_test, verbose=0)
print('loss: %f' % test_loss)

In [15]:
# Progress is reported by the notebook progress-bar callback, hence verbose=0
# on fit() itself.
progress_bar = TQDMNotebookCallback()

# Write the weights to disk whenever validation loss improves; the achieved
# loss is embedded in the file name through the {val_loss} placeholder.
weights_template = 'models/' + model.name + '.weights_loss{val_loss:.4f}.h5'
best_weights_checkpoint = ModelCheckpoint(
    weights_template,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
)

# NOTE(review): the held-out split doubles as validation data here, so
# checkpoint selection sees the same windows that the evaluation cell scores.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=256,
    nb_epoch=12,
    verbose=0,
    callbacks=[progress_bar, best_weights_checkpoint],
)

history.history

Epoch 00000: val_loss improved from inf to 1.71457, saving model to models/input64x95_layers8_params5492k_h7b54.weights_loss1.7146:.h5
Epoch 00001: val_loss improved from 1.71457 to 1.49439, saving model to models/input64x95_layers8_params5492k_h7b54.weights_loss1.4944:.h5
Epoch 00002: val_loss improved from 1.49439 to 1.41519, saving model to models/input64x95_layers8_params5492k_h7b54.weights_loss1.4152:.h5
Epoch 00003: val_loss improved from 1.41519 to 1.36877, saving model to models/input64x95_layers8_params5492k_h7b54.weights_loss1.3688:.h5
Epoch 00004: val_loss improved from 1.36877 to 1.34236, saving model to models/input64x95_layers8_params5492k_h7b54.weights_loss1.3424:.h5
Epoch 00005: val_loss improved from 1.34236 to 1.32167, saving model to models/input64x95_layers8_params5492k_h7b54.weights_loss1.3217:.h5
Epoch 00006: val_loss improved from 1.32167 to 1.30619, saving model to models/input64x95_layers8_params5492k_h7b54.weights_loss1.3062:.h5
Epoch 00007: val_loss improved 

KeyboardInterrupt: 

In [16]:
# Manual snapshot of the current weights (e.g. after interrupting training).
# The path is derived from model.name, exactly as the loading cell does, so
# the two stay in sync when the architecture - and therefore the generated
# name - changes, instead of writing under a stale hard-coded name.
model.save_weights('models/' + model.name + '.weights_loss1.2726.h5')