In [1]:
import numpy as np
import string
import io
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Activation, Dense
from keras.utils.data_utils import get_file


Using TensorFlow backend.


In [0]:
N_GPU = 1 # you can experiment with more GPUs, it gets interesting with a high SEQUENCE_LEN
SEQUENCE_LEN = 10
# BATCH_SIZE = 512
BATCH_SIZE = 128
EPOCHS = 20
# HIDDEN_LAYERS_DIM = 512
HIDDEN_LAYERS_DIM = 100
# LAYER_COUNT = 4
LAYER_COUNT = 2
DROPOUT = 0.2

In [38]:
# loading the text
path = get_file('memealsdjaljs.txt', origin='https://raw.githubusercontent.com/casassg/meme_puller/master/LangModel/cleaned_captions_4.txt')
with io.open(path, encoding='utf-8') as f:
    text_train = f.read().split()
print('corpus length:', len(text_train))

# generic vocabulary
characters = sorted(list(set(text_train)))

VOCABULARY_SIZE = len(characters)
characters_to_ix = {c:i for i,c in enumerate(characters)}
print("vocabulary len = %d" % VOCABULARY_SIZE)


Downloading data from https://raw.githubusercontent.com/casassg/meme_puller/master/LangModel/cleaned_captions_4.txt
corpus length: 25010
vocabulary len = 5068


In [0]:
def describe_batch(X, y, samples=3):
    """Describe in a human-readable format some samples from a batch"""
    for i in range(samples):
        sentence = ""
        for s in range(SEQUENCE_LEN):
            sentence += characters[X[i,s,:].argmax()]
            sentence +=' '
        next_char = characters[y[i,:].argmax()]
        
        print("sample #%d: ...%s -> '%s'" % (
            i,
            sentence[-20:],
            next_char
        ))

def batch_generator(text, count):
    """Generate batches for training"""
    while True: # keras wants that for reasons
        for batch_ix in range(count):
            X = np.zeros((BATCH_SIZE, SEQUENCE_LEN, VOCABULARY_SIZE))
            y = np.zeros((BATCH_SIZE, VOCABULARY_SIZE))

            batch_offset = BATCH_SIZE * batch_ix

            for sample_ix in range(BATCH_SIZE):
                sample_start = batch_offset + sample_ix
                for s in range(SEQUENCE_LEN):
                    X[sample_ix, s, characters_to_ix[text[sample_start+s]]] = 1
                y[sample_ix, characters_to_ix[text[sample_start+s+1]]]=1

            yield X, y

In [0]:
def build_model(gpu_count=1):
    """Build a Keras sequential model for training the char-rnn"""
    model = Sequential()
    for i in range(LAYER_COUNT):
        model.add(
            LSTM(
                HIDDEN_LAYERS_DIM, 
                return_sequences=True if (i!=(LAYER_COUNT-1)) else False,
                input_shape=(SEQUENCE_LEN, VOCABULARY_SIZE),
            )
        )
        model.add(Dropout(DROPOUT))
    
    model.add(Dense(VOCABULARY_SIZE))
    model.add(Activation('softmax'))
    
    
    model.compile(loss='categorical_crossentropy', optimizer="adam")
    return model

In [41]:
text_train_len = len(text_train)
print("Total of %d characters" % (text_train_len))

for ix, (X,y) in enumerate(batch_generator(text_train, count=1)):
    # describe some samples from the first batch
    describe_batch(X, y, samples=5)
    break

Total of 25010 characters
sample #0: ...ove my country STOP  -> 'screw'
sample #1: ... country STOP screw  -> 'science'
sample #2: ... STOP screw science  -> 'man'
sample #3: ...P screw science man  -> 'is'
sample #4: ...crew science man is  -> 'back'


In [42]:
training_model = build_model()

train_batch_count = (text_train_len - SEQUENCE_LEN) // BATCH_SIZE
print("training batch count: %d" % train_batch_count)

# checkpoint
from keras.callbacks import ModelCheckpoint, EarlyStopping
filepath = "./%d-gpu_BS-%d_%d-%s_dp%.2f_%dS_epoch{epoch:02d}-loss{loss:.4f}_weights" % (
    N_GPU,
    BATCH_SIZE,
    LAYER_COUNT,
    HIDDEN_LAYERS_DIM,
    DROPOUT,
    SEQUENCE_LEN
)
checkpoint = ModelCheckpoint(
    filepath,
    save_weights_only=True
)
# early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

callbacks_list = [checkpoint, early_stopping]

training batch count: 195


In [0]:
history = training_model.fit_generator(
    batch_generator(text_train, count=train_batch_count),
    train_batch_count,
    max_queue_size=1, # no more than one queued batch in RAM
    epochs=EPOCHS,
    callbacks=callbacks_list,
    initial_epoch=0
)

Epoch 1/20
Epoch 2/20
  2/195 [..............................] - ETA: 18s - loss: 6.8076



Epoch 3/20

In [36]:
from os import listdir
from os.path import isfile, join
mypath = '.'
onlyfiles = sorted([f for f in listdir(mypath) if isfile(join(mypath, f))])
onlyfiles

['1-gpu_BS-128_2-100_dp0.20_10S_epoch01-loss6.8700_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch02-loss6.8409_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch03-loss6.8070_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch04-loss6.7537_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch05-loss6.7216_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch06-loss6.6689_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch07-loss6.6191_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch08-loss6.5649_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch09-loss6.5217_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch10-loss6.4731_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch11-loss6.4305_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch12-loss6.3732_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch13-loss6.3117_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch14-loss6.2465_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch15-loss6.1847_weights',
 '1-gpu_BS-128_2-100_dp0.20_10S_epoch16-loss6.1334_weights',
 '1-gpu_BS-128_2-100_dp0

In [0]:
from google.colab import files

files.download('1-gpu_BS-128_2-100_dp0.20_10S_epoch20-loss5.8733_weights')


In [44]:
import os
[os.remove(f) for f in onlyfiles]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]