In [2]:
# Imports
import re
import urllib.request
import zipfile
import lxml.etree
import itertools
import numpy as np
import tensorflow as tf
import pickle
import os
import random
import sys
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

Using TensorFlow backend.


## Import Data

In [3]:
# Download the dataset archive unless a copy already sits in the working
# directory, so re-running the cell does not fetch it again.
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")


# Parse the XML member straight out of the archive, without extracting it to
# disk. The member stream is opened in its own `with` block: an unclosed
# member keeps the archive's underlying file open even after the ZipFile
# itself has been closed. `doc` holds the fully parsed tree, so nothing
# downstream needs either handle.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as xml_member:
        doc = lxml.etree.parse(xml_member)


## Character-level LSTM language modelling

In [6]:
# Build the corpus: the text of every <content> element is lower-cased,
# wrapped in "<s>" ... "<e>" markers, and all pieces are concatenated into a
# single string.
#
# * The pieces are joined once at the end. Growing the string with `+` inside
#   a loop re-copies everything accumulated so far on each iteration.
# * './/content' is the spelling lxml itself recommends: on a parsed tree an
#   absolute '//content' search is rewritten to it with a FutureWarning.
# * An element without text (`.text` is None) contributes an empty body
#   instead of raising AttributeError.
corpus = "".join(
    "<s>" + (document.text or "").lower() + "<e>"
    for document in doc.findall('.//content')
)
print(len(corpus))

24233275


In [25]:
# Character vocabulary: the distinct characters of the corpus in sorted
# order, plus lookup tables in both directions (char -> index, index -> char).
chars = sorted(set(corpus))
print('total chars:', len(chars))
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

total chars: 128


In [29]:
# Slide a window of `maxlen` characters across the corpus, advancing `step`
# characters at a time. Each window is paired with the single character that
# immediately follows it.
print('Splitting text into sequences...')
maxlen = 40
step = 3
window_starts = range(0, len(corpus) - maxlen, step)
sentences = [corpus[start:start + maxlen] for start in window_starts]
next_chars = [corpus[start + maxlen] for start in window_starts]
print('number of sequences:', len(sentences))

Splitting text into sequences
number of sequences: 8077745


In [30]:
# One-hot encode the windows and their following characters.
#   X[i, t, c] is True when character t of window i has vocabulary index c.
#   y[i, c]    is True when the character following window i has index c.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

Vectorization...


In [33]:
# Sanity check: show the shapes of the encoded inputs and targets, one per line.
print(X.shape, y.shape, sep='\n')

(8077745, 40, 128)
(8077745, 128)


In [35]:
# Take the first 100000 examples as a small subset to check that the model
# works as expected before using the full data.
subset_rows = slice(0, 100000)
Xtemp, ytemp = X[subset_rows], y[subset_rows]

print(Xtemp.shape, ytemp.shape, sep='\n')

(100000, 40, 128)
(100000, 128)
