In [None]:
from __future__ import print_function
import re
import urllib.request
import zipfile
import lxml.etree
import itertools
import numpy as np
import tensorflow as tf
import time
import pickle
import os
import random
import sys
import h5py


from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout, GRU
from keras.optimizers import RMSprop
from keras.utils import np_utils
from keras.layers.wrappers import TimeDistributed

## Import Data

In [None]:
# Run-wide settings, stored as plain attributes on the `tf.flags` object.
FLAGS = tf.flags
FLAGS.data_path = "talks.txt"  # text file the preprocessed corpus is written to and later read back from
FLAGS.maxlen = 50  # passed as `maxlen` when the corpus is split into sequences and vectorized
FLAGS.batch_size = 32  # NOTE(review): presumably the number of sequences per generated batch — confirm against the data generator

In [None]:
# Fetch the TED talks archive unless a copy is already on disk
archive_name = 'ted_en-20160408.zip'
archive_url = "https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip"
already_downloaded = os.path.isfile(archive_name)
if not already_downloaded:
    urllib.request.urlretrieve(archive_url, filename=archive_name)


# Parse the XML member of the archive into an lxml tree (`doc` is used by the preprocessing step)
with zipfile.ZipFile(archive_name, 'r') as archive:
    xml_member = archive.open('ted_en-20160408.xml', 'r')
    doc = lxml.etree.parse(xml_member)


### Preprocess the data and remove characters we don't want in our vocabulary. A smaller vocabulary speeds up learning.

In [None]:
# Characters stripped from every talk (punctuation, digits, accented letters and
# other symbols) so that they never enter the vocabulary.
chars_to_remove = ['+', ',', '-','/','<', '=', '>','@', '[', '\\', ']', '^', '_','\x80', '\x93', '\x94', '\xa0', '¡', '¢', '£', '²', 'º', '¿', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'ï', 'ñ', 'ó', 'ô', 'ö', 'ø', 'ù', 'û', 'ü', 'ā', 'ă', 'ć', 'č', 'ē', 'ě', 'ī', 'ō', 'ť', 'ū', '˚', 'τ', 'ย', 'ร', 'อ', '่', '€', '∇', '♪', '♫', '你', '葱', '送', '–', '—', '‘', '’', '“', '”','0', '1', '2', '3', '4', '5', '6', '7', '8', '9','#', '$', '%', '&', '!', '"', "'", '(', ')', '*', ':', ';','…']
# re.escape keeps characters such as '-', ']' and '^' literal inside the character class.
rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
unwanted_chars = re.compile(rx)  # compiled once instead of being looked up for every talk

# Collect one cleaned piece per talk and join them at the end. Growing a single
# string with `+` inside the loop recopies everything accumulated so far for
# every talk, which is quadratic in the corpus size.
talk_pieces = []
# './/content' is the explicit relative form of the '//content' search.
for document in doc.findall('.//content'):
    # get each talk; an element without text has .text == None, treat it as empty
    corpus = (document.text or "").lower()
    # remove unwanted characters
    corpus = unwanted_chars.sub('', corpus)
    # wrap the talk in start (" S ") and end (" E ") markers; the text itself is
    # lower-cased, so the upper-case markers cannot collide with talk content
    talk_pieces.append(" S " + corpus + " E ")

i = len(talk_pieces)  # number of talks processed
totalcorpus = "".join(talk_pieces)
print(len(totalcorpus))
print(i)
corpus = totalcorpus

# Persist the processed corpus so later runs can load it instead of re-parsing the XML
with open(FLAGS.data_path, "w") as text_file:
    text_file.write(corpus)


### Load text file if processed before

In [None]:
def readfile(data_path):
    """Return the entire contents of the text file at ``data_path`` as one string.

    The file is opened in text mode ("r") with the platform's default encoding.
    The handle is closed before the function returns, including when reading
    raises, so repeated calls do not leak open file descriptors.

    Args:
        data_path: path of the file to read.

    Returns:
        The file contents as a ``str``.
    """
    with open(data_path, "r") as text_file:
        return text_file.read()

### Create character to index and index to character functions

In [None]:
# build the two lookup tables that translate characters to integer ids and back
def get_dicts(corpus):
    """Map every distinct character of ``corpus`` to an index and vice versa.

    Indices follow the sorted order of the distinct characters, starting at 0.

    Returns:
        A ``(char2ind, ind2char)`` tuple of dictionaries.
    """
    alphabet = list(set(corpus))
    alphabet.sort()
    ind2char = dict(enumerate(alphabet))
    char2ind = {symbol: index for index, symbol in ind2char.items()}
    return char2ind, ind2char

In [None]:
# Split text into overlapping sequences of maxlen+1 characters (step size 5 by default).
print('Splitting text into sequences...')
def split2sentences(corpus, maxlen, step=5):
    """Cut ``corpus`` into overlapping windows of ``maxlen + 1`` characters.

    Each window holds one character more than ``maxlen`` so that it can be
    split into an input of ``maxlen`` characters and a target shifted by one.

    Args:
        corpus: the text to split.
        maxlen: number of input characters per window (window length is maxlen + 1).
        step: distance between the start positions of consecutive windows.

    Returns:
        A list of strings, each exactly ``maxlen + 1`` characters long. The list
        is empty when ``corpus`` is shorter than one window.
    """
    sentencelen = maxlen + 1
    # The last valid start position is len(corpus) - sentencelen; the "+ 1"
    # keeps that final full window, which an exclusive bound would drop.
    last_start = len(corpus) - sentencelen
    return [corpus[start: start + sentencelen]
            for start in range(0, last_start + 1, step)]

In [None]:
# Create function to convert indices to one-hot encoding
def vectorize(sentences, maxlen, charlen, char_indices):
    """One-hot encode sentences into an input tensor and a shifted target tensor.

    Every sentence is expected to hold ``maxlen + 1`` characters. Characters
    0 .. maxlen-1 form the input row; characters 1 .. maxlen form the target
    row, i.e. Y is X shifted left by one character. Characters beyond position
    ``maxlen`` are ignored.

    Args:
        sentences: sequence of strings to encode.
        maxlen: number of time steps per encoded sentence.
        charlen: size of the vocabulary (length of the one-hot axis).
        char_indices: mapping from character to its one-hot index.

    Returns:
        ``(X, Y)``, two boolean arrays of shape ``(len(sentences), maxlen, charlen)``.
    """
    # The builtin `bool` replaces the `np.bool` alias, which newer NumPy releases removed.
    X = np.zeros((len(sentences), maxlen, charlen), dtype=bool)
    Y = np.zeros((len(sentences), maxlen, charlen), dtype=bool)

    # vectorize the entire set by splitting sentences into X and Y, where Y is X shifted
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence[:maxlen + 1]):
            index = char_indices[char]
            if t < maxlen:
                # every character except the last one is an input
                X[i, t, index] = 1
            if t > 0:
                # every character except the first one is the target of its predecessor
                Y[i, t - 1, index] = 1
    return X, Y

def unvectorize(tensor, ind2char):
    """Decode a batch of per-step score rows back into strings.

    For every time step the index of the largest entry is looked up in
    ``ind2char``; the characters of one sequence are concatenated in order.
    The shape of ``tensor`` is printed before decoding.

    Returns:
        A list with one decoded string per sequence in ``tensor``.
    """
    print(tensor.shape)
    return [
        "".join(ind2char[np.argmax(step_scores)] for step_scores in sequence)
        for sequence in tensor
    ]

In [None]:
# Data generator to pass batches to model
def generate_data(FLAGS):
    """Yield one-hot encoded ``(X, Y)`` training batches forever.

    The corpus at ``FLAGS.data_path`` is read once, split into sequences of
    ``FLAGS.maxlen + 1`` characters, and then served in consecutive batches of
    ``FLAGS.batch_size`` sequences. Only the current batch is vectorized, so
    the full one-hot tensor is never held in memory. After the last batch the
    generator starts over from the first sequence.

    Args:
        FLAGS: settings object providing ``data_path``, ``maxlen`` and ``batch_size``.

    Yields:
        ``(X, Y)`` tuples as returned by ``vectorize``; the final batch of a
        pass may contain fewer than ``FLAGS.batch_size`` sequences.
    """
    corpus = readfile(FLAGS.data_path)
    char2ind, _ = get_dicts(corpus)
    sentences = split2sentences(corpus, FLAGS.maxlen)
    num_sent = len(sentences)
    # Read the batch size from FLAGS; a bare `batch_size` name is not defined anywhere.
    batch_size = FLAGS.batch_size
    charlen = len(char2ind)
    while True:
        for start in range(0, num_sent, batch_size):
            batch = sentences[start:start + batch_size]
            X, Y = vectorize(batch, FLAGS.maxlen, charlen, char2ind)
            yield (X, Y)
            

In [None]:
# Read the processed corpus once and derive both sizes from the same string,
# instead of reading and decoding the file separately for each value.
corpus_text = readfile(FLAGS.data_path)
# number of training sequences produced by one pass over the corpus
FLAGS.samples_per_epoch = len(split2sentences(corpus_text, FLAGS.maxlen))
char2ind, ind2char = get_dicts(corpus=corpus_text)
# vocabulary size, i.e. the length of the one-hot axis
FLAGS.charlen = len(char2ind)

## Build model

In [None]:
# RNN parameters
N_HIDDEN = 512  # units in the recurrent layer
# Units in the dense layer that follows the recurrent layer. Both model cells
# reference N_HIDDEN2, but it was never defined.
# NOTE(review): defaulting to N_HIDDEN is an assumption — set the intended size.
N_HIDDEN2 = N_HIDDEN
LEARNING_RATE = 0.01  # NOTE(review): not referenced by the visible model code, which passes lr=0.002 to RMSprop directly
BATCH_SIZE = 64  # NOTE(review): not referenced by the visible cells; FLAGS.batch_size (32) is the other batch-size setting
EPOCHS = 1  # NOTE(review): not referenced by the visible cells

In [None]:
print('Building training model...')
model = Sequential()
# return_sequences=True makes the GRU emit one output per time step. The
# TimeDistributed layers below and the per-step targets produced by vectorize
# (shape: samples x maxlen x charlen) both require a 3D sequence output.
# The input shape comes from FLAGS; a bare `maxlen` name is not defined.
model.add(GRU(N_HIDDEN, input_shape=(FLAGS.maxlen, FLAGS.charlen), return_sequences=True))
model.add(TimeDistributed(Dense(N_HIDDEN2)))
model.add(TimeDistributed(Activation('relu')))
model.add(TimeDistributed(Dense(FLAGS.charlen)))  # Add another dense layer with the desired output size.
model.add(TimeDistributed(Activation('softmax')))
# clipnorm caps the gradient norm passed to the optimizer
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.002, clipnorm=5))

# summary() prints the layer table itself; wrapping it in print() only adds a stray "None".
model.summary()

## Build inference model
Note: the inference model processes a single time step per call, because each predicted character is fed back into the RNN as the seed for predicting the next one. It is also stateful, so that it 'remembers' the hidden state from previous steps.

In [None]:
print('Building Inference model...')
inference_model = Sequential()

# The recurrent layer must be the same type as in the training model (GRU),
# otherwise the saved weights do not fit. batch_input_shape fixes batch size and
# sequence length to 1; stateful=True carries the hidden state across predict() calls.
inference_model.add(GRU(N_HIDDEN, batch_input_shape=(1, 1, FLAGS.charlen), stateful=True))
# Since the above GRU does not output sequences, we don't need TimeDistributed anymore.
inference_model.add(Dense(N_HIDDEN2))
inference_model.add(Activation('relu'))
inference_model.add(Dense(FLAGS.charlen))
inference_model.add(Activation('softmax'))

# Copy the weights of the trained network. Both should have the same exact number of parameters (why?).
# On a fresh run no checkpoint exists yet, so only load when the file is present.
if os.path.isfile('model.h5'):
    inference_model.load_weights('model.h5')
else:
    print('model.h5 not found; the inference model keeps its initial weights until training saves a checkpoint')

# Given the start Character 'S' (one-hot encoded), predict the next most likely character.
startChar = np.zeros((1, 1, FLAGS.charlen))
startChar[0, 0, char2ind['S']] = 1
nextCharProbabilities = inference_model.predict(startChar)

# print the most probable character that goes next.
print(ind2char[nextCharProbabilities.argmax()])

### Run training and save generated text (1000 characters) to the output file.
The model weights are saved after every epoch so that training can be stopped and restarted; every 5 epochs, 1000 sampled characters are appended to `output.txt`.

In [None]:
epoch = 0
print('Training model')
# Runs until interrupted: weights are checkpointed after every epoch, so the
# loop can be stopped at any time and training resumed from model.h5.
while True:
    epoch += 1
    t0 = time.time()
    model.reset_states()
    # A fresh generator is created per epoch, so every epoch starts at the first sequence.
    hist = model.fit_generator(generate_data(FLAGS), nb_epoch=1, samples_per_epoch=FLAGS.samples_per_epoch)
    model.save_weights("model.h5")
    t1 = time.time()
    total = t1 - t0

    if epoch % 5 == 0:
        # Copy the weights of the trained network. Both should have the same exact number of parameters (why?).
        inference_model.load_weights('model.h5')
        inference_model.reset_states()
        # Given the start Character 'S' (one-hot encoded), predict the next most likely character.
        startChar = np.zeros((1, 1, FLAGS.charlen))
        startChar[0, 0, char2ind['S']] = 1
        generated = []
        for _ in range(1000):
            nextCharProbs = inference_model.predict(startChar)
            nextCharProbs = np.asarray(nextCharProbs).astype('float64')  # Weird type cast issues if not doing this.
            nextCharProbs = nextCharProbs / nextCharProbs.sum()  # Re-normalize for float64 to make exactly 1.0.

            # Sample the next character from the predicted distribution instead of taking the argmax.
            nextCharId = np.random.multinomial(1, nextCharProbs.squeeze(), 1).argmax()
            generated.append(ind2char[nextCharId])
            # Feed the sampled character back in as the next one-hot input.
            startChar.fill(0)
            startChar[0, 0, nextCharId] = 1
        text = "".join(generated)

        # Write the report with print(..., file=f) inside a `with` block rather
        # than swapping sys.stdout: the file is always closed, and an exception
        # can no longer leave the notebook's stdout redirected.
        with open('output.txt', 'a+') as f:
            print("------------- EPOCH" + str(epoch) + " ----------------", file=f)
            print('Time taken: ', file=f)
            print(total, file=f)
            print('Generated Text:', file=f)
            print(text, file=f)