## Practical 3: Text Classification with RNNs
<p>Oxford CS - Deep NLP 2017<br>
https://www.cs.ox.ac.uk/teaching/courses/2016-2017/dl/</p>
<p>[Chris Dyer, Phil Blunsom, Yannis Assael, Brendan Shillingford, Yishu Miao]</p>

In [None]:
import numpy as np
import os
from random import shuffle
import re
import collections
import tensorflow as tf
import urllib.request
import zipfile
import lxml.etree

In [2]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

In [3]:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))

root = doc.getroot()

del doc

In [4]:
labels_talks_ted = []
tokens_talks_ted = []

for file in root:
    keywords = file.find("head").find("keywords").text.lower()
    content = file.find("content").text.lower()
    label = np.empty(shape=(1,3), dtype="float16")
    label[0,0]= (1 if keywords.find("technology")>-1 else 0)
    label[0,1]= (1 if keywords.find("entertainment")>-1 else 0)
    label[0,2]= (1 if keywords.find("design")>-1 else 0)
    
    content = re.sub(r'\([^)]*\)', '', content)
    
    sentences_strings_content = []
    for line in content.split('\n'):
        m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
        sentences_strings_content.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)
    
    content_tokens = []
    for sent_str in sentences_strings_content:
        tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
        content_tokens += tokens
    
    labels_talks_ted.append(label)
    tokens_talks_ted.append(content_tokens)

In [5]:
min_count = 10

training_labels_talks_ted = labels_talks_ted[0:1585]
training_tokens_talks_ted = tokens_talks_ted[0:1585]

# count occurence of each token

training_counts_ted = collections.Counter()
training_tokens_ted = [token for talk in training_tokens_talks_ted for token in talk]

for token in training_tokens_ted:
    training_counts_ted[token] += 1

training_idx_ted = {}

# create dictionary for one-hot encoding idx of all tokens
next_idx = 0
for talk in training_tokens_talks_ted:
    for idx in range(0, len(talk)):
        if training_counts_ted[talk[idx]] < min_count:
            talk[idx]="UNKNOWNTEXT"
        if talk[idx] in training_idx_ted:
            continue
        else:
            training_idx_ted[talk[idx]] = next_idx
            next_idx+=1
            
vocab_size = len(training_idx_ted)

In [6]:
def length(sequence):
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

In [7]:
init_scale = 0.1
learning_rate = 1.0
max_grad_norm = 5
num_layers = 2
num_steps = 2000
num_outputs = 3
hidden_size = 200
keep_prob = 1.0
lr_decay = 0.5
batch_size = 1

x = tf.placeholder("float16",shape=[None, num_steps, vocab_size], name="x_placeholder")
y = tf.placeholder("float16",shape=[None, num_outputs], name="y_placeholder")
weights = tf.Variable(tf.truncated_normal([hidden_size, num_outputs], stddev=0.05, dtype=tf.float16))
bias = tf.Variable(tf.constant(.1, shape=[num_outputs], dtype=tf.float16))

lstm = tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0, state_is_tuple=True)
stacked_lstm = tf.contrib.rnn.MultiRNNCell([lstm] * num_layers, state_is_tuple=True)

initial_state = stacked_lstm.zero_state(batch_size, dtype=tf.float16)

outputs, state = tf.nn.dynamic_rnn(stacked_lstm, x, initial_state=initial_state, dtype=tf.float16, sequence_length=length(x))

# outputs = tf.transpose(outputs, [1,0,2])
# last = tf.gather(outputs, num_steps - 1)
# y_pred = tf.nn.softmax(tf.matmul(last, weights) + bias)
outputs = tf.reduce_mean(outputs, 1)
y_pred = tf.nn.softmax(tf.matmul(outputs, weights) + bias)

cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)

correct_pred = tf.equal(tf.argmax(y_pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [12]:
def data_iterator():
    batch_idx = 0
    while True:
        for batch_idx in range(0, len(training_tokens_talks_ted), batch_size):
            sequence_batch = np.empty(shape=(0,num_steps,vocab_size), dtype="float16")
            
            for talk in training_tokens_talks_ted[batch_idx:batch_idx+batch_size]:
                sequence = np.empty(shape=(0,vocab_size), dtype="float16")
    
                for token_idx in range(0,num_steps):
                    one_hot = np.empty(shape=(1,vocab_size), dtype="float16")
                    if token_idx + 1 < len(talk):
                        one_hot[0,training_idx_ted[talk[token_idx]]] = 1
                    sequence = np.append(sequence, one_hot, axis=0)
                    
                print(sequence.shape)
                sequence_batch = np.vstack((sequence_batch, [sequence]))
            
            labels_batch = np.reshape(training_labels_talks_ted[batch_idx:batch_idx+batch_size], (batch_size,num_outputs))
            yield sequence_batch, labels_batch
            
iter_ = data_iterator()

In [None]:
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    step = 1
    for i in range(300):
        sequence_batch, labels_batch = iter_.__next__()
        if (i+1)%10==0:
            train_accuracy = accuracy.eval(session = sess, feed_dict={ x:sequence_batch, y: labels_batch})
            print("step %d, training accuracy %g"%(i, train_accuracy))
        optimizer.run(session = sess, feed_dict={x: sequence_batch, y: labels_batch})
    
        
        
        
        

(2000, 12098)


In [None]:
batch_size = 20
no_of_batches = int(len(training_sentences_ted)/batch_size)
epoch = 10
for i in range(epoch):
    ptr = 0
    for j in range(no_of_batches):
        inp, out = train_input[ptr:ptr+batch_size], train_output[ptr:ptr+batch_size]
        ptr+=batch_size
        sess.run(minimize,{data: inp, target: out})
    print "Epoch - ",str(i)
incorrect = sess.run(error,{data: test_input, target: test_output})
print('Epoch {:2d} error {:3.1f}%'.format(i + 1, 100 * incorrect))
sess.close()