In [26]:
from keras.models import Sequential
from keras.layers import Dense, Activation, TimeDistributed
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import RMSprop
from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import ModelCheckpoint
import keras
import json
import numpy as np
import random
import sys
import re

from collections import Counter
from unidecode import unidecode

In [29]:
path = "data/corpus.txt"
with open(path, encoding='utf8') as f: text = f.read()
    
print('corpus length:', len(text))

chars = Counter(text)
print(chars)

print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))


corpus length: 2879282
Counter({' ': 473036, 'e': 268493, 't': 215211, 'o': 178916, 'a': 170499, 'i': 165828, 'n': 157847, 's': 144474, 'r': 124513, 'h': 106168, 'l': 96761, 'u': 74717, 'd': 70503, 'c': 67034, 'y': 52418, 'm': 50439, 'f': 46306, 'p': 45684, 'g': 43524, 'w': 38128, 'b': 35163, ',': 28387, '.': 24935, 'v': 24522, '\n': 22937, 'k': 15345, '"': 12112, '-': 11091, 'I': 10163, "'": 10116, 'T': 6278, 'x': 6162, 'A': 4888, 'B': 3929, 'S': 3752, '1': 3239, 'j': 3133, '0': 3026, 'q': 2810, '?': 2775, 'W': 2775, 'z': 2471, ':': 2403, 'E': 2344, ')': 2333, '(': 2331, '2': 2130, 'P': 2064, 'Y': 1985, ';': 1951, 'C': 1842, 'N': 1790, 'O': 1759, 'M': 1730, 'R': 1622, 'H': 1613, 'D': 1529, 'F': 1399, '>': 1352, '9': 1321, 'G': 1190, 'L': 1103, '3': 1095, '5': 1004, '!': 981, '4': 855, '#': 796, '7': 764, 'J': 736, '6': 662, '8': 645, 'U': 584, '%': 582, '/': 565, 'X': 483, '*': 452, 'Z': 347, '\\': 347, '=': 330, 'K': 280, '+': 277, 'V': 205, '|': 171, '$': 154, 'Q': 151, '_': 137, '[

In [47]:
maxlen = 128

keras.backend.common.reset_uids()
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(Dropout(0.4))
model.add(LSTM(512))
model.add(Dropout(0.4))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.name ='input%dx%d_layers%d_params%dk_h%x' % (model.input_shape[1:] + (len(model.layers), model.count_params()//1000, hash(model.to_json()) % 0xffff))

model.save('models/' + model.name + '.model.h5')
with open('models/' + model.name + '.chars.json', 'w') as f: json.dump(indices_char, f)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_1 (LSTM)                    (None, 128, 512)      1245184     lstm_input_1[0][0]               
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 128, 512)      0           lstm_1[0][0]                     
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 512)           2099200     dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 512)           0           lstm_2[0][0]                     
___________________________________________________________________________________________

In [44]:
# cut the text in semi-redundant sequences of maxlen characters
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i : i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

nb sequences: 959718
Vectorization...


In [48]:
history = model.fit(X, y,
            batch_size=128,
            validation_split=0.05,
            nb_epoch=12,
            verbose=0, callbacks=[
              TQDMNotebookCallback(),
              ModelCheckpoint(
                'models/' + model.name + '.weights.h5',
                monitor='val_loss',
                verbose=1,
                save_best_only=True,
                save_weights_only=False,
                mode='auto',
            )
            ])



KeyboardInterrupt: 

In [54]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate():
    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [55]:
generate()


----- diversity: 0.2
----- Generating with seed: "vance predictions about something, or else why bother? But after the theory is semi-confirmed, can the detractors claim that the data show a problem with the semitechnical theory, when the "problem" is constructed post facto? At the least the detractors mu"
vance predictions about something, or else why bother? But after the theory is semi-confirmed, can the detractors claim that the data show a problem with the semitechnical theory, when the "problem" is constructed post facto? At the least the detractors muqHl|/flUgNTAgq]e=bL8\U0N[t&HvoA"u~MqeZ0GpHp%09GMRk$=Pe cIz)W^0kyWuUBlH =*U|(y%%[U WffAq

KeyboardInterrupt: 

In [16]:
model.fit?