## Practical 2: Text Classification
<p>Oxford CS - Deep NLP 2017<br>
https://www.cs.ox.ac.uk/teaching/courses/2016-2017/dl/</p>
<p>[Chris Dyer, Yannis Assael, Brendan Shillingford]</p>

In [68]:
import numpy as np
import os
from random import shuffle
import re
import collections

In [6]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [7]:
import urllib.request
import zipfile
import lxml.etree

In [8]:
import tensorflow as tf

In [9]:
from gensim.models import Word2Vec

In [10]:
# Download the dataset if it's not already there: this may take a minute as it is 75MB
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

In [11]:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))

root = doc.getroot()

del doc

In [12]:
labels_content = []

for file in root:
    keywords = file.find("head").find("keywords").text.lower()
    content = file.find("content").text.lower()
    label = ""
    label+= ("T" if keywords.find("technology")>-1 else "o")
    label+= ("E" if keywords.find("entertainment")>-1 else "o")
    label+= ("D" if keywords.find("design")>-1 else "o")
    
    content = re.sub(r'\([^)]*\)', '', content)
    
    sentences_strings_content = []
    for line in content.split('\n'):
        m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
        sentences_strings_content.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)
    
    sentences_content = []
    for sent_str in sentences_strings_content:
        tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
        sentences_content.append(tokens)
        
    labels_content.append((label, sentences_content))

In [1]:
# min occurences of token for inclusion in vocabulary
min_count = 5

# list of training-sentence-token lists
training_sentences_ted = []
for talk in labels_content[0:1585]:
    training_sentences_ted+=talk[1]

# counter and flatten list of tokens
training_counts_ted = collections.Counter()
training_tokens_ted = [token for sentence in training_sentences_ted for token in sentence]

# count all tokens in training-sentence-token lists
for token in training_tokens_ted:
    training_counts_ted[token] += 1
    
# replace all tokens with occurence < min_count in list of training-sentence-token lists with "_____"
for sentence in training_sentences_ted:
    for idx in range(0, len(sentence)):
        if training_counts_ted[sentence[idx]] < min_count:
            sentence[idx] = "UNKNOWNTEXT"

NameError: name 'labels_content' is not defined

In [175]:
dim = 512
model_ted = Word2Vec(training_sentences_ted, size=dim, window=5, min_count=min_count, workers=4)

In [176]:
embeddings_ted = np.empty(shape=(0,dim), dtype="float32")

for talk in labels_content:
    tokens = [token for sentence in talk[1] for token in sentence]
    embedding = np.empty(shape=(1,dim), dtype="float32")
    count = 0
    for token in tokens:
        if token not in model_ted.vocab:
            token="UNKNOWNTEXT"
        embedding+=model_ted[token]
        count+=1
    embedding/=count
    embeddings_ted=np.append(embeddings_ted, embedding,axis=0)

In [260]:
embeddings = embeddings_ted.astype("float32")
labels_strings = np.array([pair[0] for pair in labels_content])

# randomize index
idxs = np.arange(0, len(embeddings))
np.random.shuffle(idxs)

embeddings = embeddings[idxs]
labels_strings = labels_strings[idxs]

# correspondence:
# ooo - 10000000
# Too - 01000000
# oEo - 00100000
# ooD - 00010000
# TEo - 00001000
# ToD - 00000100
# oED - 00000010
# TED - 00000001

labels = np.empty(shape=(0,8), dtype=int)

for label in labels_strings:
    if label == "ooo":
        labels = np.vstack((labels, np.array([1,0,0,0,0,0,0,0])))
    elif label == "Too":
        labels = np.vstack((labels, np.array([0,1,0,0,0,0,0,0])))
    elif label == "oEo":
        labels = np.vstack((labels, np.array([0,0,1,0,0,0,0,0])))
    elif label == "ooD":
        labels = np.vstack((labels, np.array([0,0,0,1,0,0,0,0])))
    elif label == "TEo":
        labels = np.vstack((labels, np.array([0,0,0,0,1,0,0,0])))
    elif label == "ToD":
        labels = np.vstack((labels, np.array([0,0,0,0,0,1,0,0])))
    elif label == "oED":
        labels = np.vstack((labels, np.array([0,0,0,0,0,0,1,0])))
    elif label == "TED":
        labels = np.vstack((labels, np.array([0,0,0,0,0,0,0,1])))
        
training_embeddings = embeddings[0:1585]
training_labels = labels[0:1585]

validation_embeddings = embeddings[1585:1835]
validation_labels = labels[1585:1835]

testing_embeddings = embeddings[1835:2085]
testing_labels = labels[1835:2085]




In [261]:
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.05)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(.1, shape=shape)
    return tf.Variable(initial)

In [262]:
# one hidden layer
x = tf.placeholder(tf.float32, shape=[None, dim])
y = tf.placeholder(tf.int32, shape=[None, 8])

# fully connected layer
W_fc1 = weight_variable([dim, 128])
b_fc1 = bias_variable([128])
# h_fc1 = tf.tanh(tf.matmul(x, W_fc1) + b_fc1)
h_fc1 = tf.nn.relu(tf.matmul(x, W_fc1) + b_fc1)
# h_fc1 = tf.sigmoid(tf.matmul(x, W_fc1) + b_fc1)

# adding dropout
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# last fully connected layer
W_fc2 = weight_variable([128,8])
b_fc2 = bias_variable([8])
y_pred = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)


In [276]:
# two hidden layers
x = tf.placeholder(tf.float32, shape=[None, dim])
y = tf.placeholder(tf.int32, shape=[None, 8])

# fully connected layer
W_fc1 = weight_variable([dim, 2048])
b_fc1 = bias_variable([2048])
h_fc1 = tf.tanh(tf.matmul(x, W_fc1) + b_fc1)

# second fully connected layer
W_fc2 = weight_variable([2048, 1024])
b_fc2 = bias_variable([1024])
h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)

# dropout
keep_prob = tf.placeholder(tf.float32)
h_fc2_drop = tf.nn.dropout(h_fc2, keep_prob)

# last fully connected output layer
W_fc3 = weight_variable([1024,8])
b_fc3 = bias_variable([8])
y_pred = tf.nn.softmax(tf.matmul(h_fc2_drop, W_fc3) + b_fc3)


In [277]:
def data_iterator():
    batch_idx = 0
    while True:
        batch_size = 50
        for batch_idx in range(0, len(training_embeddings), batch_size):
            embeddings_batch = training_embeddings[batch_idx:batch_idx+batch_size]
            labels_batch = training_labels[batch_idx:batch_idx+batch_size]
            yield embeddings_batch, labels_batch
            
iter_ = data_iterator()

In [278]:
# We'll use the cross entropy loss function 
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_pred))

# And classification accuracy
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_pred, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# And the Adam optimiser
train_step = tf.train.AdamOptimizer(learning_rate=1e-5).minimize(cross_entropy)


In [279]:
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)

Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [280]:
for i in range(5000):
    embeddings_batch, labels_batch = iter_.__next__()
    if (i+1)%500==0:
        train_accuracy = accuracy.eval(session = sess, feed_dict={ x:embeddings_batch, y: labels_batch, keep_prob: 1})
        print("step %d, training accuracy %g"%(i, train_accuracy))
    train_step.run(session = sess, feed_dict={x: embeddings_batch, y: labels_batch, keep_prob: .2})
    
print(accuracy.eval(session = sess, feed_dict={x: validation_embeddings, y: validation_labels, keep_prob: 1}))

step 499, training accuracy 0.5
step 999, training accuracy 0.56
step 1499, training accuracy 0.5
step 1999, training accuracy 0.62
step 2499, training accuracy 0.46
step 2999, training accuracy 0.58
step 3499, training accuracy 0.66
step 3999, training accuracy 0.571429
step 4499, training accuracy 0.5
step 4999, training accuracy 0.56
0.556


In [281]:
count = 0
for par in labels_strings[1585:1835]:
    if par=="ooo":
        count+=1
print(count / 250)

0.556
18168


In [223]:
# do the tSNE thing to get it down to two dimensions