In [9]:
from __future__ import print_function
import re
import urllib.request
import zipfile
import lxml.etree
import itertools
import numpy as np
import tensorflow as tf
import time
import pickle
import os
import random
import sys
import h5py
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.models import load_model
from keras.models import model_from_json

## Import Data

In [10]:
# Download the TED talks archive once and parse the XML document it contains.
# The archive / member names are defined once instead of being repeated inline.
TED_ZIP = 'ted_en-20160408.zip'
TED_XML = 'ted_en-20160408.xml'
TED_URL = "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip"

# Download Dataset (skipped when the archive is already present locally)
if not os.path.isfile(TED_ZIP):
    urllib.request.urlretrieve(TED_URL, filename=TED_ZIP)

# Extract documents. The member stream is opened in its own `with` so it is
# closed deterministically as soon as lxml has finished parsing it.
with zipfile.ZipFile(TED_ZIP, 'r') as z:
    with z.open(TED_XML, 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)


## Character level LSTM language modelling

In [13]:
# Build one long lower-cased string from the text of every <content> element,
# wrapping each talk in "<s>" ... "<e>" markers.
#
# * "".join builds the result in one pass; the original `corpus = corpus + ...`
#   loop re-copied the (eventually ~24 MB) string for every document.
# * './/content' is the explicit spelling of the original '//content'; lxml
#   rewrites the latter to this form itself but emits a FutureWarning.
# * `or ""` keeps an element with no text from raising AttributeError.
corpus = "".join(
    "<s>" + (document.text or "").lower() + "<e>"
    for document in doc.findall('.//content')
)
print(len(corpus))

24233275


In [14]:
# Characters dropped from the corpus before building the vocabulary:
# punctuation/symbols, control and accented characters, non-Latin scripts,
# typographic quotes/dashes, digits and a few remaining symbols.
# NOTE(review): '<' and '>' are in this list, so the "<s>" / "<e>" markers
# added when the corpus was built are reduced to the plain letters 's' and 'e'
# -- confirm whether that is intended.
chars_to_remove = [
    '+', ',', '-', '/', '<', '=', '>', '@', '[', '\\', ']', '^', '_',
    '\x80', '\x93', '\x94', '\xa0',
    '¡', '¢', '£', '²', 'º', '¿',
    'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'ï',
    'ñ', 'ó', 'ô', 'ö', 'ø', 'ù', 'û', 'ü',
    'ā', 'ă', 'ć', 'č', 'ē', 'ě', 'ī', 'ō', 'ť', 'ū',
    '˚', 'τ', 'ย', 'ร', 'อ', '่', '€', '∇', '♪', '♫', '你', '葱', '送',
    '–', '—', '‘', '’', '“', '”',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '#', '$', '%', '&',
]
# One regex character class matching any single unwanted character.
rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
corpus = re.compile(rx).sub('', corpus)

In [15]:
# Vocabulary: every distinct character left in the corpus, in sorted order,
# plus lookup tables in both directions (char -> index and index -> char).
chars = sorted(set(corpus))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))
print(chars)

total chars: 39
['\n', ' ', '!', '"', "'", '(', ')', '*', '.', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '…']


In [16]:
# Cut the corpus into overlapping windows of `maxlen` characters, moving
# `step` characters forward each time. For every window, the character that
# immediately follows it is recorded as the prediction target.
print('Splitting text into sequences...')
maxlen = 50
step = 5
window_starts = range(0, len(corpus) - maxlen, step)
sentences = [corpus[start: start + maxlen] for start in window_starts]
next_chars = [corpus[start + maxlen] for start in window_starts]
print('number of sequences:', len(sentences))

Splitting text into sequences...
number of sequences: 4751781


In [17]:
# One-hot encode the training data.
#   X[i, t, k] is True when character t of sequence i is vocabulary entry k.
#   y[i, k]    is True when the character following sequence i is entry k.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, (sentence, next_char) in enumerate(zip(sentences, next_chars)):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_char]] = True

Vectorization...


In [18]:
# Sanity check: dimensions of the input tensor and of the target matrix,
# one per line.
print(X.shape, y.shape, sep='\n')

(4751781, 50, 39)
(4751781, 39)


In [19]:
# network parameters
N_HIDDEN = 128        # number of units in the single LSTM layer
LEARNING_RATE = 0.01  # learning rate handed to the RMSprop optimizer
BATCH_SIZE = 128      # batch size passed to model.fit
EPOCHS = 1            # epochs per model.fit call in the timing cell

### Build Model

In [20]:
# Model: one LSTM layer reading (maxlen, vocabulary-size) one-hot windows,
# followed by a dense layer and a softmax over the vocabulary.
print('Build model...')
model = Sequential([
    LSTM(N_HIDDEN, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

# Categorical cross-entropy against the one-hot next-character target,
# optimised with RMSprop.
optimizer = RMSprop(lr=LEARNING_RATE)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [21]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reshaped by a temperature.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from that distribution using
    NumPy's global random state.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: positive float. Values below 1 sharpen the distribution
            towards the most likely index; values above 1 flatten it.

    Returns:
        The sampled index (result of ``np.argmax`` on the one-hot draw).

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        # Dividing by zero (or a negative value) would silently produce
        # inf/NaN "probabilities" instead of a usable distribution.
        raise ValueError("temperature must be positive, got %r" % (temperature,))

    preds = np.asarray(preds).astype('float64')
    # log(0) = -inf is the correct log-probability of an impossible index;
    # suppress the divide-by-zero warning it would otherwise emit.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. Mathematically a no-op after
    # normalisation, but it keeps exp() from underflowing every entry to 0
    # (which gave 0/0 = NaN) when the temperature is small.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [22]:
# test and timing:
Xbatch = X[:10000]
ybatch = y[:10000]

print(Xbatch.shape)
print(ybatch.shape)

(10000, 50, 39)
(10000, 39)


In [16]:
# Time one training run on the small test batch, then persist the model.
t0 = time.time()
epoch = 0  # only used to build the output file names below
model.fit(Xbatch, ybatch, batch_size=BATCH_SIZE, nb_epoch=EPOCHS)
t1 = time.time()
total = t1 - t0
print('Time taken: ')
print(total)

## Step size = 3, maxlen = 40, n_hidden=128
# Training set of 1000 takes 0.57 seconds ~ 8,000,000 input should take 4,560 seconds ~ 1 hour 16 minutes

## Step size = 5, maxlen = 50, n_hidden=128
# Training set of 1000 takes 0.725 seconds ~ 4,750,000 input should take 3,420 seconds ~1 hour

## Step size = 5, maxlen = 50, n_hidden=512
# Training set of 10000 takes 9.85 seconds ~ 4,750,000 input should take 4,678 seconds ~1 hour 18 minutes

# serialize model to JSON
model_json = model.to_json()
filename = "model" + str(epoch)
with open(filename + ".json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
# save_weights expects the target path as a string; the original passed a
# one-element list ([filename+".h5"]), which is not a valid file path.
model.save_weights(filename + ".h5")
print("Saved model to disk")

Epoch 1/1
Time taken: 
5.576308488845825
Saved model to disk


Loaded model from disk
Testing with 10,000 input sequences...

----- diversity: 0.2
----- Generating with seed: " differently we can change the world. i know it. i"
 differently we can change the world. i know it. id and and and and and and and and the ang on the w

----- diversity: 0.5
----- Generating with seed: " differently we can change the world. i know it. i"
 differently we can change the world. i know it. ior ton werrens and the ind and por ange. wit an to

----- diversity: 1.0
----- Generating with seed: " differently we can change the world. i know it. i"
 differently we can change the world. i know it. is tacplut continttgs ware wah y amlonye  in anwist

----- diversity: 1.2
----- Generating with seed: " differently we can change the world. i know it. i"
 differently we can change the world. i know it. itrat htuepe isunvothero ibtyiny.hatelent cicocve c


Testing with 10,000 input sequences and 512 N_HIDDEN...

----- diversity: 0.2
----- Generating with seed: " come up with theorems. eternal truths. but it isn"
 come up with theorems. eternal truths. but it isntetdiidtdddttdddddtittttedtddideedededddddddtttdit

----- diversity: 0.5
----- Generating with seed: " come up with theorems. eternal truths. but it isn"
 come up with theorems. eternal truths. but it isndidesditriedodttetsettttteeeottddohdeeigieddhztrde

----- diversity: 1.0
----- Generating with seed: " come up with theorems. eternal truths. but it isn"
 come up with theorems. eternal truths. but it isnsttshr edozhfdideohetz sdedioodoott hoidntdtitct e

----- diversity: 1.2
----- Generating with seed: " come up with theorems. eternal truths. but it isn"
 come up with theorems. eternal truths. but it isnehhhsdthellihdzcwotesdtehide'etidthoipdird izzd dd


In [7]:
# Train for one epoch on the test batch, time it, and append a report to
# 'out.txt' by redirecting stdout.
# NOTE(review): every line after the first is indented one level deeper than
# `t0 = ...`, which is an IndentationError as written. Presumably this was the
# body of a `for epoch in ...:` loop whose header was lost -- confirm and
# restore the header (or dedent the cell).
t0 = time.time()
    model.fit(Xbatch, ybatch, batch_size=BATCH_SIZE, nb_epoch=1)
    t1 = time.time()
    total = t1-t0
    
    # Redirect stdout so the print() calls below are appended to out.txt.
    # NOTE(review): no matching `sys.stdout = orig_stdout` / `f.close()` is
    # visible in this part of the cell -- verify that stdout is restored and
    # the file is closed further down.
    orig_stdout = sys.stdout
    f = open('out.txt', 'a+')
    sys.stdout = f
    
    # NOTE(review): `epoch` is not assigned in this cell; it relies on a value
    # set elsewhere (an earlier cell sets epoch=0) -- confirm the source.
    print("------------- EPOCH" + str(epoch) + " ----------------")
    print('Time taken: ')
    print(total)
    
    
    # serialize model to JSON (architecture only), one file per epoch
    model_json = model.to_json()
    filename = "model" + str(epoch)
    with open(filename + ".json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    # save_weights expects the target path as a string; the original passed a
    # one-element list ([filename+".h5"]), which is not a valid file path.
    model.save_weights(filename + ".h5")
    print("Saved model to disk")