In [1]:
import numpy as np
import os
from random import shuffle, random
from collections import defaultdict

In [2]:
import re
import urllib.request
import zipfile
import lxml.etree

# Download the dataset if it's not already there: this may take a minute as it is 75MB
if not os.path.isfile('ted_en-20160408.zip'):
    urllib.request.urlretrieve("https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip", filename="ted_en-20160408.zip")

# For now, we're only interested in the subtitle text, so let's extract that from the XML:
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    doc = lxml.etree.parse(z.open('ted_en-20160408.xml', 'r'))
input_text = '\n'.join(doc.xpath('//content/text()'))

In [3]:
talks = list((doc.getroot().iterchildren()))

In [4]:
def get_class(keywords):
    keywords = [s.strip().lower() for s in keywords.split(',')]
    ted = [
        ('technology', 'T'),
        ('entertainment', 'E'),
        ('design', 'D')
    ]
    return ''.join(y if x in keywords else 'o' for x, y in ted)

In [5]:
# returns all the words in a string, treating non-alphanumeric characters as separators
def get_tokens(s):
    tokens = re.sub(r"[^a-z0-9]+", " ", s.lower()).split()
    return tokens

In [6]:
# returns a map from word to frequency
def get_frequencies(lst):
    result = defaultdict(int)
    for word in lst:
        result[word] += 1
    return result

In [7]:
## Extract list of all sentences from TED talk corpus.
## Maybe we should use Wiki instead?
input_text = '\n'.join(doc.xpath('//content/text()'))
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)

In [8]:
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)

In [9]:
sentences_ted = []
for sent_str in sentences_strings_ted:
    sentences_ted.append(get_tokens(sent_str))

In [10]:
SPECIAL_TOKEN = '*' # a non-alphanumeric token, guaranteed to not occur in the dictionary

## Randomly map a few low-frequency words to the special token
all_words = []
for sentence in sentences_ted:
    all_words.extend(sentence)

In [11]:
all_freqs = get_frequencies(all_words)

In [12]:
threshold = 2
p = 0.1
# print ('threshold:', threshold)
# print ('p:', p)
cnt = 0
for s in sentences_ted:
    for i in range(len(s)):
        if all_freqs[s[i]] <= threshold and random() < p:
            cnt += 1
            s[i] = SPECIAL_TOKEN
# print ('cnt:', cnt)
# print ('total:', len(all_words))

In [13]:
## Get a Word2Vec model, for embedding our text strings.
from gensim.models import Word2Vec
model_ted = Word2Vec(sentences_ted, size=100, window=5, min_count=5, workers=4)

In [14]:
def embed_word(w):
    if w not in model_ted:
        w = SPECIAL_TOKEN
    return model_ted[w]

In [15]:
''' Simple bag of means embedding model

Given a list of words W,
x = 1/N * sum(x_w for w in W), where N = len(W)
'''
def bag_of_means(text):
    W = get_tokens(text)
    return sum(embed_word(w) for w in W) / len(W)

In [16]:
LABELS = ['ooo', 'ooD', 'oEo', 'oED', 'Too', 'ToD', 'TEo', 'TED']
''' processes a talk

returns (<embedding> (1D np.array [dim=100]), <label> (int))
'''
def process_talk(talk):
    text = talk.cssselect('content')[0].text
    label = get_class(talk.cssselect('keywords')[0].text)
    return bag_of_means(text), LABELS.index(label)

In [17]:
processed_talks = [process_talk(t) for t in talks]
all_inputs, all_labels = zip(*processed_talks)
all_inputs, all_labels = np.array(all_inputs), np.array(all_labels)

NUM_TRAIN = 1585
NUM_VALID = 250
NUM_TEST = 250
assert(NUM_TRAIN+NUM_VALID+NUM_TEST == len(processed_talks))
train_set = all_inputs[0:NUM_TRAIN], all_labels[0:NUM_TRAIN]
valid_set = all_inputs[NUM_TRAIN:NUM_TRAIN+NUM_VALID], all_labels[NUM_TRAIN:NUM_TRAIN+NUM_VALID]
test_set = all_inputs[NUM_TRAIN+NUM_VALID:NUM_TRAIN+NUM_VALID+NUM_TEST], all_labels[NUM_TRAIN+NUM_VALID:NUM_TRAIN+NUM_VALID+NUM_TEST]


In [18]:
import random

TRAINING_BATCH_SIZE = 50
''' extracts training data from train_set

returns list of input, label pairs.
each input, label pair corresponds to a batch of 50 talks.
input: 2D torch.Tensor [dimensions 50x100]
label: 1D torch.Tensor [dimension 50]
'''
def make_training_data():
    train_inputs, train_labels = train_set
    training_data = []
    ar = list(range(NUM_TRAIN))
    random.shuffle(ar) # shuffle it each time to make new batches
    for i in range(0, NUM_TRAIN, TRAINING_BATCH_SIZE): # this will leave out the last 35 talks, but that's ok
        section = ar[i:i+50]
        inputs, labels = train_inputs[section], train_labels[section]
        # inputs: list (size=50) of np.arrays (of dim 100)
        # labels: list (size=50) of integers (between 0...7)
        
        # we do NOT want one-hot vectors for the labels -- they should just be np.arrays
        inputs = torch.Tensor(np.array(inputs))
        labels = torch.LongTensor(labels)
        training_data.append((inputs, labels))
    return training_data
    

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lin1 = nn.Linear(100, 32) # model_ted was created as size=100
        self.lin2 = nn.Linear(32, 16)
        self.lin3 = nn.Linear(16, 8) # there are 8 labels
    def forward(self, x):
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = F.softmax(self.lin3(x))
        return x

net = Net()

In [20]:
import torch.optim as optim
optimizer = optim.Adam(net.parameters(), lr = 0.01)
criterion = nn.CrossEntropyLoss() # loss = -log(p_y) (cross entropy criterion)

In [21]:
for epoch in range(5): # loop over the dataset multiple times
    running_loss = 0.0
    training_data = make_training_data()
    for data in training_data:
        inputs, labels = data

        # wrap them in a variable
        inputs, labels = Variable(inputs), Variable(labels)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.data[0]
    # print statistics
    print('average loss: %.3f' % (1. * running_loss / len(training_data)))
print('Finished Training')

average loss: 1.759
average loss: 1.660
average loss: 1.662
average loss: 1.659
average loss: 1.661
Finished Training


In [22]:
def inspect(inputs, labels):
    inputs = Variable(torch.Tensor(inputs))
    labels = torch.LongTensor(labels)
    outputs = net(inputs)
    _, predicted = torch.max(outputs.data, 1)
    actual_counts = [(predicted == i).sum() for i in range(8)]
    expected_counts = [(labels == i).sum() for i in range(8)]
    print(expected_counts)
    print(actual_counts)

In [23]:
# clearly our model is flawed -- it's merely predicting 'ooo' for everything.
inspect(train_set[0], train_set[1])

[972, 109, 99, 10, 281, 79, 19, 16]
[1585, 0, 0, 0, 0, 0, 0, 0]


In [24]:
def judge(inputs, labels):
    inputs = Variable(torch.Tensor(inputs))
    labels = torch.LongTensor(labels)
    outputs = net(inputs)
    _, predicted = torch.max(outputs.data, 1) # prediction = arg max_y’ (p_y’)
    total = predicted.size(0)
    correct = (predicted == labels).sum()
    print('Accuracy: %d/%d = %.2d %%' % (correct, total, 100.*correct/total))

In [25]:
judge(train_set[0], train_set[1])

Accuracy: 972/1585 = 61 %


In [26]:
judge(test_set[0], test_set[1])

Accuracy: 74/250 = 29 %


In [27]:
judge(valid_set[0], valid_set[1])

Accuracy: 88/250 = 35 %


In [28]:
inspect(test_set[0], test_set[1])

[74, 26, 41, 6, 58, 26, 9, 10]
[250, 0, 0, 0, 0, 0, 0, 0]
