In [98]:
from __future__ import print_function

import itertools
import os
import pickle
import random
import re
import sys
import time
import urllib.request
import zipfile
from types import SimpleNamespace

import h5py
import lxml.etree
import numpy as np
import tensorflow as tf


from keras.models import Sequential, load_model, model_from_json
from keras.layers import Dense, Activation, Dropout, LSTM, GRU
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.layers.wrappers import TimeDistributed

## Import Data

In [129]:
# Run configuration shared by the cells below. A plain namespace is used instead of
# binding the `tf.flags` module itself: assigning attributes onto a TensorFlow module
# mutates third-party state and fails on builds where `tf.flags` is not exposed.
# Later cells attach further derived attributes to this object.
FLAGS = SimpleNamespace(
    data_path="talks.txt",  # text file holding the pre-processed corpus
    maxlen=50,              # characters per input sequence
    batch_size=32,          # sequences per generated batch
)

In [100]:
# Download the dataset archive only if it is not already present locally.
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")


# Extract documents: parse the XML member straight out of the archive.
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    # Open the member in its own context manager so its handle is closed as soon
    # as parsing finishes, instead of being left for the garbage collector.
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        doc = lxml.etree.parse(xml_file)


In [102]:

# corpus = ""
# totalcorpus = ""
# i=0
# chars_to_remove = ['+', ',', '-','/','<', '=', '>','@', '[', '\\', ']', '^', '_','\x80', '\x93', '\x94', '\xa0', '¡', '¢', '£', '²', 'º', '¿', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'ï', 'ñ', 'ó', 'ô', 'ö', 'ø', 'ù', 'û', 'ü', 'ā', 'ă', 'ć', 'č', 'ē', 'ě', 'ī', 'ō', 'ť', 'ū', '˚', 'τ', 'ย', 'ร', 'อ', '่', '€', '∇', '♪', '♫', '你', '葱', '送', '–', '—', '‘', '’', '“', '”','0', '1', '2', '3', '4', '5', '6', '7', '8', '9','#', '$', '%', '&', '!', '"', "'", '(', ')', '*', ':', ';','…']
# rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
# for document in doc.findall('//content'):
#     i +=1
#     # get each talk
#     corpus = document.text.lower()
#     # remove unwanted characters
#     corpus = re.sub(rx, '', corpus)
#     # create total corpus
#     totalcorpus = totalcorpus + " S " + corpus + " E "
# print(len(totalcorpus))
# print(i)
# corpus = totalcorpus

# with open(FLAGS.data_path, "w") as text_file:
#     text_file.write(corpus) 


### Load text file if processed before

In [115]:
def readfile(data_path):
    """Return the entire contents of the text file at ``data_path`` as one string.

    Args:
        data_path: Path of the text file to read.

    Returns:
        The file contents as a single ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises; the previous
    # version never closed the file it opened.
    with open(data_path, "r") as text_file:
        return text_file.read()

### Create character to index and index to character functions

In [111]:
def get_dicts(corpus):
    """Build the character <-> index lookup tables for ``corpus``.

    The distinct characters of ``corpus`` are sorted and numbered from 0.

    Args:
        corpus: The text whose distinct characters form the vocabulary.

    Returns:
        A ``(char2ind, ind2char)`` tuple: the first dict maps each character to
        its index, the second maps each index back to its character.
    """
    vocabulary = sorted(set(corpus))
    ind2char = dict(enumerate(vocabulary))
    char2ind = {symbol: position for position, symbol in ind2char.items()}
    return char2ind, ind2char

In [122]:
# Split text into overlapping sequences of maxlen + 1 characters (default stride: 5).
def split2sentences(corpus, maxlen, step=5):
    """Cut ``corpus`` into overlapping windows of ``maxlen + 1`` characters.

    Each window carries one character more than ``maxlen`` so that it can be
    split into an input sequence and the same sequence shifted by one.

    Args:
        corpus: The text to slice.
        maxlen: Length of the input sequence; every window has ``maxlen + 1``
            characters.
        step: Distance between the starts of consecutive windows. Defaults to 5,
            the stride that was previously hard-coded.

    Returns:
        A list of strings, each exactly ``maxlen + 1`` characters long. Empty
        when ``corpus`` is shorter than one window.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    sentencelen = maxlen + 1
    # "+ 1" keeps the window that ends exactly on the last character; without it
    # a corpus of exactly sentencelen characters produced no window at all.
    last_start = len(corpus) - sentencelen + 1
    return [corpus[i: i + sentencelen] for i in range(0, last_start, step)]

In [114]:
def vectorize(sentences, maxlen, charlen, char_indices):
    """One-hot encode ``sentences`` into an input tensor and a shifted target tensor.

    For each sentence, characters ``0 .. maxlen-1`` are written to ``X`` and
    characters ``1 .. maxlen`` are written to ``Y``, so ``Y`` is ``X`` shifted
    one position to the left.

    Args:
        sentences: Sequence of strings, normally ``maxlen + 1`` characters each.
            Shorter strings leave the remaining time steps all-zero; characters
            beyond position ``maxlen`` are ignored.
        maxlen: Number of time steps in each encoded sequence.
        charlen: Size of the one-hot dimension (vocabulary size).
        char_indices: Mapping from character to its one-hot index.

    Returns:
        A tuple ``(X, Y)`` of boolean arrays, each of shape
        ``(len(sentences), maxlen, charlen)``.

    Raises:
        KeyError: If a character is missing from ``char_indices``.
    """
    # Builtin `bool` instead of the `np.bool` alias, which newer NumPy releases removed.
    X = np.zeros((len(sentences), maxlen, charlen), dtype=bool)
    Y = np.zeros((len(sentences), maxlen, charlen), dtype=bool)

    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence[:maxlen + 1]):
            index = char_indices[char]
            # The boundaries follow `maxlen`; the previous hard-coded `t == 50`
            # indexed past the end of X for any other sequence length.
            if t < maxlen:
                X[i, t, index] = True
            if t > 0:
                Y[i, t - 1, index] = True
    return X, Y

def unvectorize(tensor, ind2char):
    """Decode a batch of per-step score vectors back into strings.

    Prints ``tensor.shape``, then for every sequence in ``tensor`` takes the
    argmax of each time step and looks the resulting index up in ``ind2char``.

    Args:
        tensor: Array iterated as sequences of per-step vectors.
        ind2char: Mapping from index to character.

    Returns:
        A list with one decoded string per sequence in ``tensor``.
    """
    print(tensor.shape)
    return [
        "".join(ind2char[np.argmax(step_vector)] for step_vector in sequence)
        for sequence in tensor
    ]

In [136]:
def generate_data(FLAGS):
    """Yield one-hot ``(X, Y)`` training batches forever.

    The corpus at ``FLAGS.data_path`` is read once, split into windows and then
    vectorized one batch at a time, so only a single batch of one-hot tensors is
    held in memory. After the last batch the generator starts over from the
    first window.

    Args:
        FLAGS: Configuration object providing ``data_path``, ``maxlen`` and
            ``batch_size``.

    Yields:
        Tuples ``(X, Y)`` as returned by ``vectorize`` for up to
        ``FLAGS.batch_size`` windows; the final batch of a pass may be smaller.

    Raises:
        ValueError: If the corpus is too short to produce a single window.
    """
    corpus = readfile(FLAGS.data_path)
    char2ind, _ = get_dicts(corpus)
    sentences = split2sentences(corpus, FLAGS.maxlen)
    num_sent = len(sentences)
    if num_sent == 0:
        # Without this guard the endless loop below would spin without ever yielding.
        raise ValueError("corpus is too short to build any training sequence")
    # Read the batch size from FLAGS; the bare name `batch_size` used previously
    # is not defined anywhere in the notebook.
    batch_size = FLAGS.batch_size
    charlen = len(char2ind)
    while True:
        for start in range(0, num_sent, batch_size):
            batch = sentences[start:start + batch_size]
            yield vectorize(batch, FLAGS.maxlen, charlen, char2ind)
            

In [146]:
# Read the corpus once and derive both the epoch size and the vocabulary from it
# (it was previously read from disk twice).
_corpus = readfile(FLAGS.data_path)
# Number of full batches available in the corpus.
# NOTE(review): this is a batch count stored under a "samples" name — confirm it
# matches what the fit_generator call expects for `samples_per_epoch`.
FLAGS.samples_per_epoch = len(split2sentences(_corpus, FLAGS.maxlen)) // FLAGS.batch_size
char2ind, ind2char = get_dicts(corpus=_corpus)
# Vocabulary size, i.e. the width of the one-hot dimension.
FLAGS.charlen = len(char2ind)
del _corpus  # the data generator reads the file itself; do not keep a second copy alive

## Build model

In [119]:
# Number of units in the recurrent layer of the models built below.
N_HIDDEN = 512
# Number of units in the hidden Dense layer that follows the recurrent layer.
N_HIDDEN2 = 512
# NOTE(review): not referenced in the visible cells — compile() is given RMSprop(lr=0.001) directly.
LEARNING_RATE = 0.005
# NOTE(review): not referenced in the visible cells — they read FLAGS.batch_size (32) instead.
BATCH_SIZE = 64
# NOTE(review): not referenced in the visible cells — fit_generator is called with nb_epoch=1.
EPOCHS = 5

In [120]:
print('Building training model...')
model = Sequential()
# With return_sequences=True the GRU layer outputs its hidden state at every time step.
# The input shape is taken from FLAGS; the bare name `maxlen` used previously is not
# defined anywhere in the notebook.
model.add(GRU(N_HIDDEN, return_sequences = True, input_shape=(FLAGS.maxlen, FLAGS.charlen)))
# Two things to notice here:
# 1. The Dense Layer is equivalent to nn.Linear(hiddenStateSize, hiddenLayerSize) in Torch.
#    In Keras, we often do not need to specify the input size of the layer because it gets inferred for us.
# 2. TimeDistributed applies the linear transformation from the Dense layer to every time step
#    of the output of the sequence produced by the GRU.
model.add(TimeDistributed(Dense(N_HIDDEN2)))
model.add(TimeDistributed(Activation('relu')))
model.add(TimeDistributed(Dense(FLAGS.charlen)))  # Add another dense layer with the desired output size.
model.add(TimeDistributed(Activation('softmax')))
model.compile(loss='categorical_crossentropy', optimizer = RMSprop(lr=0.001))

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line.
model.summary()

Building training model...
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
gru_2 (GRU)                      (None, 50, 512)       837120      gru_input_2[0][0]                
____________________________________________________________________________________________________
timedistributed_9 (TimeDistribut (None, 50, 512)       262656      gru_2[0][0]                      
____________________________________________________________________________________________________
timedistributed_10 (TimeDistribu (None, 50, 512)       0           timedistributed_9[0][0]          
____________________________________________________________________________________________________
timedistributed_11 (TimeDistribu (None, 50, 32)        16416       timedistributed_10[0][0]         
________________________________________________________________

## Train model

In [145]:
print('Training model')
# Run one epoch over batches drawn from the endless generate_data generator.
# NOTE(review): FLAGS.samples_per_epoch is computed as (number of sequences // batch size),
# i.e. a batch count — confirm this is the unit `samples_per_epoch` expects in the
# installed Keras version.
model.fit_generator(generate_data(FLAGS), nb_epoch=1, samples_per_epoch=FLAGS.samples_per_epoch)
# Only the weights are saved; the inference cell rebuilds the layers in code and
# loads these weights into them.
model.save_weights("model.h5")

In [None]:
# The only difference with the "training model" is that here the input sequence has
# a length of one because we will predict character by character.
print('Building Inference model...')
inference_model = Sequential()
# Two differences here.
# 1. The inference model only takes one sample in the batch, and it always has sequence length 1.
# 2. The inference model is stateful, meaning it inputs the output hidden state ("its history state")
#    to the next batch input.
# The recurrent layer must be a GRU, like the training model: the weights in model.h5
# were saved from a GRU, and an LSTM layer has a different set of weight matrices.
inference_model.add(GRU(N_HIDDEN, batch_input_shape=(1, 1, FLAGS.charlen), stateful = True))
# Since the above GRU does not output sequences, we don't need TimeDistributed anymore.
inference_model.add(Dense(N_HIDDEN2))
inference_model.add(Activation('relu'))
inference_model.add(Dense(FLAGS.charlen))
inference_model.add(Activation('softmax'))

# Copy the weights of the trained network. Both should have the same exact number of parameters (why?).
inference_model.load_weights('model.h5')

# Given the start Character 'S' (one-hot encoded), predict the next most likely character.
# The lookup tables are named char2ind / ind2char in this notebook; the names
# char2id / id2char used previously are not defined anywhere.
startChar = np.zeros((1, 1, FLAGS.charlen))
startChar[0, 0, char2ind['S']] = 1
nextCharProbabilities = inference_model.predict(startChar)

# print the most probable character that goes next.
print(ind2char[nextCharProbabilities.argmax()])