In [50]:
import os
import random
import string

import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint, LambdaCallback, Callback
from keras.layers import BatchNormalization

In [51]:
def limit_mem():
    """Swap the Keras backend session for one with GPU memory growth enabled.

    The current session is closed first, which lets you reset GPU memory
    without closing the notebook.
    """
    K.get_session().close()
    tf = K.tf
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    fresh_session = tf.Session(config=session_config)
    K.set_session(fresh_session)


limit_mem()

In [67]:
# Root directory holding every dataset, weight file and result of this notebook.
# Set the JOKES_DATA_PATH environment variable to relocate it without editing
# the notebook; the default is the original hard-coded location.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenations below rely on.
DATA_PATH = os.path.join(
    os.environ.get('JOKES_DATA_PATH', '/home/bfortuner/workplace/data/jokes/'), '')
DATASETS_PATH = DATA_PATH+'datasets/'
TRAIN_PATH = DATA_PATH+'train/'
VALID_PATH = DATA_PATH+'valid/'
TEST_PATH = DATA_PATH+'test/'
WEIGHTS_PATH = DATA_PATH+'weights/'
RESULTS_PATH = DATA_PATH+'results/'
SAMPLE_PATH = DATA_PATH+'sample/'

# Corpus files expected inside DATASETS_PATH.
JOKES_FILENAME = 'shortjokesclean.txt'
NIETZSCHE_FILENAME = 'nietzsche.txt'
# Corpus that is actually read for training.
TEXT_SOURCE_FILEPATH = DATASETS_PATH+NIETZSCHE_FILENAME
# Weights file that train() writes after fitting and reloads when first=False.
LATEST_WEIGHTS = WEIGHTS_PATH+"latest-weights.hdf5"

In [68]:
#jokes = pd.read_csv(DATASETS_PATH+'shortjokesclean.txt')

In [69]:
# Character offsets [start, end) of the corpus slice that is used for training.
SAMPLE_SIZE_START, SAMPLE_SIZE_END = 0, 200_000

In [70]:
# Callback reference: https://keras.io/callbacks/
# The FAQ (https://keras.io/getting-started/faq/) describes saving the whole
# model instead of only its weights.

# Random 5-character tag so checkpoint files from different runs do not collide.
prefix = ''.join(
    random.choices(string.ascii_uppercase + string.digits, k=5)
)
# Keras fills in {epoch} and {loss} when each checkpoint is written.
weights_filepath = ''.join(
    [WEIGHTS_PATH, prefix, '.weights.{epoch:02d}-{loss:.2f}.hdf5']
)
weights_saver = ModelCheckpoint(
    weights_filepath,
    monitor='accuracy',
    verbose=0,
    save_best_only=False,
    save_weights_only=True,
    mode='auto',
    period=1,
)

In [71]:
class LossHistory(Callback):
    """Keras callback that collects the `loss` value reported after each epoch.

    The values are stored in `self.losses`, which is reset at the start of
    every training run, so after a `fit` call it only holds that run's epochs.
    """

    def on_train_begin(self, logs=None):
        """Start a fresh loss list for the training run that is beginning."""
        self.losses = []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's loss (None if `logs` has no 'loss' entry)."""
        # `None` default instead of a shared mutable `{}`; fall back to an
        # empty dict so a missing `logs` does not raise.
        logs = logs or {}
        self.losses.append(logs.get('loss'))

In [None]:
'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

#path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read the corpus, lowercase it and keep only the configured character window.
# The context manager closes the file handle as soon as the text is in memory.
with open(TEXT_SOURCE_FILEPATH) as corpus_file:
    text = corpus_file.read().lower()[SAMPLE_SIZE_START:SAMPLE_SIZE_END]
print('corpus length:', len(text))

# Vocabulary: every distinct character in the slice, in sorted order, plus the
# two lookup tables that map characters to one-hot positions and back.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 130
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    # the character immediately following the window is the prediction target
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
# One-hot encode inputs and targets. The builtin `bool` is used because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1




In [59]:
# build the model: a single LSTM
print('Build model...')
# Layer stack: batch-normalise the (maxlen, len(chars)) one-hot input, run it
# through a 128-unit LSTM, then project to one softmax score per character.
model = Sequential([
    BatchNormalization(input_shape=(maxlen, len(chars))),
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

optimizer = RMSprop(lr=0.005)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature reweighting.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: divisor applied to the log-probabilities before they are
            re-normalised. Exactly 0 selects the most probable index
            deterministically instead of dividing by zero.

    Returns:
        The sampled index (NumPy integer, as returned by `np.argmax`).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Dividing by zero would turn every score into inf/nan; the limiting
        # behaviour as temperature approaches 0 from above is a greedy pick.
        return np.argmax(preds)
    # log(0) is a legitimate -inf here (zero probability stays zero), so the
    # divide-by-zero RuntimeWarning is suppressed rather than the value changed.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Subtracting the maximum leaves the normalised result unchanged but keeps
    # exp() from underflowing every entry to 0 (0/0 -> nan) at low temperature.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

Build model...


In [60]:
# keras.callbacks.LambdaCallback(on_epoch_begin=None, on_epoch_end=None, on_batch_begin=None, 
#                                on_batch_end=None, on_train_begin=None, on_train_end=None)

In [61]:
# Callback instance that collects per-epoch losses; it is passed to model.fit()
# inside train() and read back through history.losses.
history = LossHistory()

In [62]:
def make_prediction(model, chars, sentence, diversity, generated, cycle, length=150):
    """Generate text from a seed and append it to the results file.

    Relies on the module-level `maxlen`, `char_indices`, `indices_char`,
    `sample` and `save_text`.

    Args:
        model: model whose `predict` returns a probability row per input.
        chars: vocabulary; only its length is used (one-hot width).
        sentence: seed text; every character must be a key of `char_indices`
            and it must not be longer than `maxlen`.
        diversity: temperature forwarded to `sample`.
        generated: unused; kept so existing positional callers keep working.
        cycle: label written in the header line of the saved text.
        length: number of characters to generate (default 150).

    Returns:
        The generated characters as a string.
    """
    predicttxt = ''
    for _ in range(length):
        # one-hot encode the current window
        x = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x[0, t, char_indices[char]] = 1.

        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        # slide the window forward by one character
        sentence = sentence[1:] + next_char
        predicttxt += next_char
    # The header used to be "\Epoch ", an invalid escape sequence that wrote a
    # stray backslash into the file; the backslash is dropped.
    save_text("Epoch " + str(cycle) + "\n" + predicttxt + "\n\n")
    return predicttxt

def train(epochs, diversity, cycle, first=True):
    """Fit the global model, persist its weights, then save a generated sample.

    Uses the module-level `model`, `X`, `y`, `text`, `chars`, `maxlen`,
    `weights_saver`, `history` and `LATEST_WEIGHTS`.

    Args:
        epochs: number of epochs for this call to `model.fit`.
        diversity: sampling temperature forwarded to `make_prediction`.
        cycle: label forwarded to `make_prediction` for the saved text header.
        first: when False, `LATEST_WEIGHTS` is loaded into the model before
            fitting.
    """
    if not first:
        model.load_weights(LATEST_WEIGHTS)
    # NOTE(review): `nb_epoch` is the older Keras spelling of this argument;
    # newer releases call it `epochs` — confirm the installed version before
    # renaming.
    model.fit(X, y, batch_size=128, nb_epoch=epochs, callbacks=[weights_saver,history])
    # Save before generating text: if generation raises, the epoch that was
    # just trained is still on disk for the next call with first=False.
    model.save_weights(LATEST_WEIGHTS)
    # Pick a random maxlen-character window of the corpus as the seed.
    start_index = random.randint(0, len(text) - maxlen - 1)
    sentence = text[start_index: start_index + maxlen]
    generated = sentence
#    Generating with seed: '"' + sentence + '"'
    make_prediction(model, chars, sentence, diversity, generated, cycle)

def save_text(txt):
    """Append `txt` to txt_generation_outputs.txt inside RESULTS_PATH."""
    output_path = RESULTS_PATH+"txt_generation_outputs.txt"
    with open(output_path, "a") as results_file:
        results_file.write(txt)

In [66]:
def first_train():
    for i in range(1):
        train(1,0.8,1,True)
        print ("Loss cycle %d is %f" % (i, history.losses[-1]))
        
%time first_train()

Epoch 1/1
Loss cycle 0 is 2.269010
CPU times: user 4 s, sys: 16 ms, total: 4.02 s
Wall time: 3.67 s


In [17]:
# Number of additional one-epoch training cycles to run.
cycles = 50

In [18]:
# Continue training one epoch per cycle, reloading the latest saved weights
# each time and logging the resulting loss to the results file.
for i in range(cycles):
    train(epochs=1, diversity=0.8, cycle=i, first=False)
    save_text("Loss cycle %d is %f\n" % (i, history.losses[-1]))

Epoch 1/1


KeyError: 'W'