# Topic Classifier Code

## Imports

In [5]:
import csv
import pickle
import numpy as np
import pandas as pd
import re
import csv
import math
import nltk

from keras.layers import Embedding, Lambda, Dense
from keras.models import Sequential
from keras import backend as K

## Utilities

### Tokenizer

In [6]:
class Tokenizer:
    """Word <-> integer-id vocabulary restricted to sufficiently frequent words.

    Token id 0 is reserved for '<unknown>'; `tokenize` emits it for every
    word that is not in the vocabulary.

    `word_to_token`, `token_to_word`, `vocabulary` and `vocab_size` are kept
    in sync by both `fit` and `build_tokenizer`.

    NOTE(review): `fit` splits text with a regular expression while
    `build_tokenizer` and `tokenize` use `nltk.word_tokenize`, so a
    vocabulary built with `fit` may not line up exactly with the words
    produced by `tokenize` (contractions, punctuation). Confirm which
    splitter is intended.
    """

    # Splitter used by `fit`: runs of word characters/apostrophes, or a
    # single punctuation mark from ". , ! ? ;".
    _WORD_PATTERN = re.compile(r"[\w']+|[.,!?;]")

    def __init__(self):
        self.word_to_token = {}
        self.token_to_word = {}

        # Cumulative word frequencies seen by `fit` (persists across calls).
        self.word_count = {}

        self.word_to_token['<unknown>'] = 0
        self.token_to_word[0] = '<unknown>'

        self.vocabulary = []
        self.vocabulary.append('<unknown>')
        self.vocab_size = 1

        # Minimum frequency a word needs before `fit` gives it a token id.
        self.min_occur = 30

    def get_word_to_token(self):
        """Return the live word -> token-id dict (not a copy)."""
        return self.word_to_token

    def get_token_to_word(self):
        """Return the live token-id -> word dict (not a copy)."""
        return self.token_to_word

    def _add_word(self, word):
        """Give `word` the next free token id and update every lookup table."""
        token = len(self.vocabulary)
        self.vocabulary.append(word)
        self.word_to_token[word] = token
        self.token_to_word[token] = word
        self.vocab_size = len(self.vocabulary)

    def build_tokenizer(self, corpus, Cutoff=30):
        """Build the vocabulary from `corpus` using nltk word tokenization.

        Args:
            corpus: iterable of document strings.
            Cutoff: minimum number of occurrences within `corpus` for a word
                to be added to the vocabulary.

        Prints the resulting vocabulary size.
        """
        # only keep word above certain frequency
        word_count = {}
        for document in corpus:
            for word in nltk.word_tokenize(document.strip().lower()):
                word_count[word] = word_count.get(word, 0) + 1

        for word, count in word_count.items():
            # Skip words that already have an id so a repeated call cannot
            # register the same word under a second token id.
            if count >= Cutoff and word not in self.word_to_token:
                self._add_word(word)

        print("tokenizer with vocab size of "+str(len(self.vocabulary)))

    def fit(self, corpus):
        """Build the vocabulary from `corpus` using the regex splitter.

        Word counts accumulate in `self.word_count`; every word whose count
        reaches `self.min_occur` gets a token id, assigned in order of first
        appearance in `corpus`.

        Args:
            corpus: iterable of document strings (a single pass is made, so
                generators are accepted).
        """
        seen = set()
        first_seen_order = []
        for review in corpus:
            for word in self._WORD_PATTERN.findall(review.strip().lower()):
                self.word_count[word] = self.word_count.get(word, 0) + 1
                if word not in seen:
                    seen.add(word)
                    first_seen_order.append(word)

        for word in first_seen_order:
            if self.word_count[word] < self.min_occur:
                continue
            if word in self.word_to_token:
                continue
            self._add_word(word)

    def tokenize(self, corpus):
        """Convert each document in `corpus` to a list of token ids.

        Documents are stripped, lower-cased and split with
        `nltk.word_tokenize`; words missing from `word_to_token` map to 0.

        Returns:
            A list with one list of integer token ids per document.
        """
        lookup = self.word_to_token
        return [
            [lookup.get(word, 0)
             for word in nltk.word_tokenize(document.strip().lower())]
            for document in corpus
        ]

### Misc.

In [7]:
def loadcsv(filename):
    """Read the CSV file at `filename` and return its rows as a list of lists of strings."""
    with open(filename, newline='') as handle:
        rows = [row for row in csv.reader(handle)]
    return rows


def get_topic(number):
    """Return `number` converted with int(), or None when `number` is NaN."""
    if math.isnan(number):
        return None
    return int(number)


def create_one_hot(item, num_items):
    """Return a length-`num_items` float vector counting each index in `item`.

    Every element of `item` is converted with int() and the matching slot is
    incremented, so an index that appears twice yields 2.0 rather than 1.0.
    """
    indices = np.fromiter((int(thing) for thing in item), dtype=np.intp)
    counts = np.zeros(num_items)
    # Unbuffered add so that repeated indices accumulate.
    np.add.at(counts, indices, 1)
    return counts


def get_mapping(topicfile):
    """Load the topic CSV and build the two lookup tables used for labels.

    Only rows 2, 4, 6, ... of the file are read (row 0 and every odd-indexed
    row are skipped — presumably a header and blank separator rows; confirm
    against the data file).

    Returns:
        (id_mapping, topic_mapping) where
        id_mapping maps int(column 1) -> int(column 0) and
        topic_mapping maps int(column 0) -> column 2 (left as a string).
    """
    rows = loadcsv(topicfile)[2::2]
    id_mapping = {int(row[1]): int(row[0]) for row in rows}
    topic_mapping = {int(row[0]): row[2] for row in rows}
    return id_mapping, topic_mapping


def long_to_short_topic_ids(topics, id_mapping):
    """Translate `topics` through `id_mapping`, dropping ids that have no entry.

    Order is preserved and duplicates are kept.
    """
    return [id_mapping[topic] for topic in topics if topic in id_mapping]

### Model Training

In [8]:
def train_model(xs, ys, vocab_size, max_length, n_batch=500, n_epochs=24, embed_dim=100):
    """Train a summed-embedding multi-label classifier and return the fitted model.

    Architecture: Embedding -> sum over the sequence axis -> Dense(128) ->
    Dense(64) -> Dense(32) -> Dense(ys.shape[1], sigmoid), compiled with Adam
    and binary cross-entropy.

    Args:
        xs: token-id sequences, one row of length `max_length` per example
            (array or nested list).
        ys: 2-D label matrix; its second dimension sets the output width.
        vocab_size: size of the embedding table; must exceed the largest
            token id in `xs`.
        max_length: sequence length passed to the Embedding layer.
        n_batch: mini-batch size.
        n_epochs: number of training epochs.
        embed_dim: width of each word embedding (default 100, the value that
            was previously hard-coded).

    Returns:
        The trained keras Sequential model.
    """
    # Keras documents a Python list passed as `x` as "a list of arrays" for a
    # multi-input model, so coerce nested lists to a single 2-D array first.
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim, input_length=max_length))
    # Collapse the sequence axis: one embed_dim-wide vector per document.
    # (The former `input_shape=(max_length, vocab_size)` did not describe the
    # embedding output, whose last dimension is embed_dim, and has been dropped.)
    model.add(Lambda(lambda x: K.sum(x, axis=1)))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    # Independent sigmoid per topic: an example may carry several labels.
    model.add(Dense(ys.shape[1], activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(xs, ys, batch_size=n_batch, epochs=n_epochs)
    return model

## Classifier Training

### Process Input Data

In [12]:
# Load the topic-id lookup tables.
topicfile = "nyt-theme-tags.csv"
id_mapping, topic_mapping = get_mapping(topicfile)

# Load the pickled corpus.
# NOTE(review): unpickling can execute arbitrary code — only load a corpus
# file from a trusted source.
infile = "../../nyt_corpus/NYTcorpus.p"
with open(infile, "rb") as corpus_file:
    articles = pickle.load(corpus_file)

# The first entry is skipped (presumably a header row — confirm). For every
# other entry, field 2 is taken as the text and fields 3+ are parsed as
# integer topic ids.
texts = [a[2] for a in articles[1:]]
long_topics = [list(map(int, a[3:])) for a in articles[1:]]
# Translate through id_mapping; ids without an entry are dropped.
topics = [long_to_short_topic_ids(topic, id_mapping) for topic in long_topics]

# First 7/8 of the examples for training, the remainder for testing.
num_examples = len(texts)
val_index = (7 * num_examples) // 8

train_texts = texts[0:val_index]
train_labels = topics[0:val_index]

test_texts = texts[val_index:]
test_labels = topics[val_index:]

In [None]:
# Just for an example we'll train on 100 examples and test on 50.
# Texts and labels must be truncated together so they stay aligned
# (the previous `train_texs` typo left `train_texts` at full length).
train_texts = train_texts[:100]
train_labels = train_labels[:100]

test_texts = test_texts[:50]
test_labels = test_labels[:50]

### Set Parameters

In [9]:
# --- Data dimensions -------------------------------------------------------
num_topics = 594      # width of each label vector built with create_one_hot
max_length = 25000    # every article is padded/truncated to this many tokens

# --- Model / optimisation settings ----------------------------------------
# NOTE(review): the settings below are not referenced by any code visible in
# this notebook (train_model is called with its own literal values); confirm
# whether they are still needed.
embed_dim = 500
context_size = 5
hidden_size = 64
learning_rate = 0.001
n_epochs = 3
n_batch = 500

### Build Tokenizer

In [None]:
tokenizer = Tokenizer()
tokenizer.fit(train_texts)

# `fit` advances `tokenizer.vocab_size` for every id it assigns, so read the
# size from there.
vocab_size = tokenizer.vocab_size

word_to_token = tokenizer.get_word_to_token()
# The ',' entry is removed before export (presumably to keep the CSV simple —
# confirm). This mutates the tokenizer's own dict and leaves a gap in the
# token ids; `vocab_size` above still counts the removed id.
word_to_token.pop(',', None)

# newline='' is what the csv module expects for files it writes; the explicit
# encoding keeps non-ASCII words readable on every platform. The `with` block
# guarantees the file is flushed and closed.
with open("example/tokenizer.csv", "w", newline='', encoding="utf-8") as tokenizer_file:
    csv.writer(tokenizer_file).writerows(word_to_token.items())

### Load in Tokenizer

In [None]:
tokenizer_path = "example/tokenizer.csv"

# Read the "word,token" rows with the csv module: keys stay plain strings
# (no NA/number inference on words such as "null" or "1990"), values become
# ints, and blank rows are skipped. This also avoids `read_csv(squeeze=True)`,
# which newer pandas releases no longer accept.
with open(tokenizer_path, newline='', encoding="utf-8") as tokenizer_file:
    word_to_token = {row[0]: int(row[1]) for row in csv.reader(tokenizer_file) if row}

# Token ids are not guaranteed to be contiguous (the ',' entry is dropped
# before the file is written), so size the vocabulary by the largest id
# rather than by the number of entries.
vocab_size = max(word_to_token.values(), default=0) + 1

# No free function `tokenize` is defined in this notebook; reuse
# Tokenizer.tokenize with the loaded lookup table instead.
loaded_tokenizer = Tokenizer()
loaded_tokenizer.word_to_token = word_to_token
tokenized_corpus = loaded_tokenizer.tokenize(train_texts)

### Prepare X_train and X_test

In [None]:
def _pad_or_truncate(article, length):
    """Return `article` cut to `length` tokens or right-padded with 0 up to `length`."""
    return article[:length] + [0] * max(0, length - len(article))


# Build fixed-width integer matrices (one row per article). Padding uses
# token id 0, which is also the '<unknown>' id. `tokenized_corpus` itself is
# left unmodified.
X_train = np.array(
    [_pad_or_truncate(article, max_length) for article in tokenized_corpus],
    dtype=np.int32,
)

# No free function `tokenize` is defined in this notebook; reuse
# Tokenizer.tokenize with the loaded lookup table instead.
test_tokenizer = Tokenizer()
test_tokenizer.word_to_token = word_to_token
X_test = np.array(
    [_pad_or_truncate(article, max_length)
     for article in test_tokenizer.tokenize(test_texts)],
    dtype=np.int32,
)

### Prepare Y_train and Y_test

In [None]:
# One label vector of width num_topics per example, stacked into a 2-D array.
Y_train = np.array([create_one_hot(labels, num_topics) for labels in train_labels])

Y_test = np.array([create_one_hot(labels, num_topics) for labels in test_labels])

### Train and Save the Classifier

In [None]:
# Train with a batch size of 250 for 6 epochs, then write the model to disk.
model = train_model(
    X_train,
    Y_train,
    vocab_size,
    max_length,
    n_batch=250,
    n_epochs=6,
)
model.save("example/topic_classifier.h5")

### Convert Model Output to Topic Predictions

In [10]:
# Default probability above which a topic counts as predicted.
topic_threshold = 0.25


def probabilities_to_onehot(probabilities, threshold=None):
    """Turn per-topic probabilities into lists of predicted topic indices.

    Despite the name, the result is not a one-hot matrix: it is one list per
    row holding the column indices whose probability is strictly greater than
    the threshold.

    Args:
        probabilities: 2-D array-like, one row per example and one column per
            topic. NaN entries are treated as 0. The input is not modified.
        threshold: cut-off to apply; defaults to the module-level
            `topic_threshold` when omitted.

    Returns:
        A list of lists of plain Python ints, in ascending index order.
    """
    if threshold is None:
        threshold = topic_threshold

    probs = np.asarray(probabilities)
    # Zero out NaNs on a fresh array so the caller's data is left untouched.
    probs = np.where(np.isnan(probs), 0, probs)

    return [np.flatnonzero(row > threshold).tolist() for row in probs]

### Compute Evaluation Metrics for Model

In [None]:
def false_eval(actual_topics, probabilities):
    """Count false negatives and false positives of thresholded predictions.

    Args:
        actual_topics: one iterable of topic ids per example (ints or numeric
            strings — both are normalised with int()). This must be the list
            of label ids, not the multi-hot label matrix.
        probabilities: model output, one row of per-topic probabilities per
            example; converted to predicted ids by `probabilities_to_onehot`.

    Returns:
        (total_actual_topics, false_negatives, false_positives), where
        total_actual_topics counts the distinct actual topics over all
        examples.
    """
    predicted_topics = probabilities_to_onehot(probabilities)
    total_actual_topics = 0
    false_positives = 0
    true_positives = 0

    for actual, predicted in zip(actual_topics, predicted_topics):
        # Compare ids as ints on both sides; the earlier `str(topic) in ...`
        # check never matched integer labels. A set also ensures a duplicated
        # label is not later counted as a missed topic.
        actual_ids = {int(topic) for topic in actual}
        total_actual_topics += len(actual_ids)

        hits = sum(1 for topic in predicted if int(topic) in actual_ids)
        true_positives += hits
        false_positives += len(predicted) - hits

    false_negatives = total_actual_topics - true_positives
    return total_actual_topics, false_negatives, false_positives


# Evaluate against the label-id lists (test_labels); Y_test is the multi-hot
# matrix, whose rows always have length num_topics. np.asarray makes sure the
# model receives one 2-D array even if X_test is still a nested list.
total, f_n, f_p = false_eval(test_labels, model.predict(np.asarray(X_test)))
print("Total topics:", total)
print("False Negatives:", f_n)
print("False Positives:", f_p)