# TTDS Lecture 18: Practical

Instructor: Björn Ross 17/11/2021

Created by Steve Wilson November 2020, modified by Björn Ross November 2021

## Let's build a text classifier!

### 1. Setup

In [None]:
# import scikit-learn and print the installed version so that results can be
# tied to a specific release
import sklearn
print(sklearn.__version__)

In [None]:
# some prereqs:
import collections

# regular expressions
import re

# for string.punctuation: list of punctuation characters
import string

# import this for storing our BOW format
import scipy
from scipy import sparse

# scikit learn. Contains lots of ML models we can use
# import the library for support vector machines
from sklearn import svm
from sklearn import ensemble
from sklearn.metrics import classification_report

# numpy for more easily storing multidimensional data
import numpy as np

**Note:**
* Any package in the Python standard library (https://docs.python.org/3/library/) can be used in the coursework.
* Only use sklearn for the classification models! You are **not** allowed to use the `sklearn.feature_extraction` or `sklearn.preprocessing` components for the coursework.

### 2. Check the data format

In [None]:
# check out the data (use ! for command line operation)
!cat Tweets.14cat.train | head -5

### 3. Load and preprocess

In [None]:
# load our data
# the context managers close each file handle as soon as its contents are read
with open('Tweets.14cat.train',encoding="latin-1") as train_file:
    training_data = train_file.read()
with open('Tweets.14cat.test',encoding="latin-1") as test_file:
    test_data = test_file.read()
# we will save the testing data for later...

In [None]:
# inspect the punctuation characters provided by the standard library
string.punctuation

In [None]:
# example of how the tokenization part will work
# q: what important features might this remove?
# re.escape is required: string.punctuation contains characters that are
# special inside a regex character class. Without escaping, the "\]" pair is
# read as an escaped "]" and literal backslashes are never removed.
invalid_chars = re.compile(f'[{re.escape(string.punctuation)}]')
invalid_chars.sub('',"Hello, World! #Tweets").lower().split()

In [None]:
# convert to list of lists: documents containing tokens
# and return the list of categories
# also get the vocabulary
def preprocess_data(data):
    """Tokenize a tab-separated collection of tweets.

    Each non-empty line of ``data`` must hold exactly three tab-separated
    columns: tweet id, tweet text and category. The tweet text has every
    character of ``string.punctuation`` removed, is lower-cased and is then
    split on whitespace.

    Args:
        data: the full contents of the dataset as one string.

    Returns:
        A tuple ``(documents, categories, vocab)`` where ``documents`` is a
        list of token lists (one per tweet), ``categories`` is the list of
        category strings in the same order, and ``vocab`` is the set of all
        tokens seen.

    Raises:
        ValueError: if a non-empty line does not split into exactly three
            tab-separated fields.
    """
    # escape the punctuation before placing it in a character class; unescaped,
    # the "\]" pair is read as an escaped "]" and backslashes are never stripped
    chars_to_remove = re.compile(f'[{re.escape(string.punctuation)}]')

    documents = []
    categories = []
    vocab = set()

    for line in data.split('\n'):
        line = line.strip()
        if not line:
            # skip blank lines (e.g. the one after the final newline)
            continue

        # split on tabs, we have 3 columns in this tsv format file
        tweet_id, tweet, category = line.split('\t')

        # process the words
        words = chars_to_remove.sub('',tweet).lower().split()
        vocab.update(words)
        # add the list of words to the documents list
        documents.append(words)
        # add the category to the categories list
        categories.append(category)

    return documents, categories, vocab

In [None]:
%time
# ^ see how long this takes
# preprocess the data
preprocessed_training_data, training_categories, train_vocab = preprocess_data(training_data)
preprocessed_test_data, test_categories, test_vocab = preprocess_data(test_data)

print(f"Training Data has {len(preprocessed_training_data)} " +
      f"documents and vocab size of {len(train_vocab)}")
print(f"Test Data has {len(preprocessed_test_data)} " +
      f"documents and vocab size of {len(test_vocab)}")
print(f"There were {len(set(training_categories))} " +
      f"categories in the training data and {len(set(test_categories))} in the test.")

In [None]:
# tally how often each category occurs in the training data and
# print the (category, count) pairs, most frequent first
category_counts = collections.Counter(training_categories)
print(category_counts.most_common())

### 4. Set up mappings for word and category IDs

In [None]:
# convert the vocab to a word id lookup dictionary
# anything not in this will be considered "out of vocabulary" OOV
# sort before numbering: a set of strings can iterate in a different order in
# every Python session, which would make the ids (and everything printed from
# them) change from run to run
word2id = {word: word_id for word_id, word in enumerate(sorted(train_vocab))}

# and do the same for the categories
cat2id = {cat: cat_id for cat_id, cat in enumerate(sorted(set(training_categories)))}

print("The word id for dog is",word2id['dog'])
print("The category id for Pets & Animals is",cat2id['Pets & Animals'])

### 5. Convert data to bag-of-words format

In [None]:
# build a bag-of-words (BOW) representation of the documents as a scipy sparse matrix
def convert_to_bow_matrix(preprocessed_data, word2id):
    """Turn tokenized documents into a sparse document-term count matrix.

    Args:
        preprocessed_data: list of documents, each a list of tokens.
        word2id: dict mapping each known word to its column index.

    Returns:
        A ``scipy.sparse.dok_matrix`` of shape
        ``(len(preprocessed_data), len(word2id) + 1)``. Entry ``[d, w]`` is the
        number of times word ``w`` occurs in document ``d``; the final column
        collects every token that is missing from ``word2id`` (OOV).
    """
    # the extra last column is reserved for out-of-vocabulary tokens
    oov_index = len(word2id)
    n_rows, n_cols = len(preprocessed_data), oov_index + 1

    # matrix indexed by [doc_id, token_id]; unset entries are 0
    bow = scipy.sparse.dok_matrix((n_rows, n_cols))

    for row, tokens in enumerate(preprocessed_data):
        # count the column hits for this document first, then store each total once
        column_counts = collections.Counter(
            word2id.get(token, oov_index) for token in tokens
        )
        for column, count in column_counts.items():
            bow[row, column] = count

    return bow

In [None]:
%%time 
# build the training matrix; word2id was derived from the training vocabulary
X_train = convert_to_bow_matrix(preprocessed_training_data, word2id)

In [None]:
# check some docs (rows 0-2 of the training matrix)
print("First 3 documents are:",X_train[:3])

In [None]:
# these are the labels to predict: the category id of every training document,
# in the same order as the rows of X_train
y_train = [cat2id[cat] for cat in training_categories]
# check the first 3 categories
print(y_train[:3])

### 6. Train an SVM model

In [None]:
# Let's train a model: now that the setup is done, it's a piece of cake!
%time
# instantiate a linear SVM classification model
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
# you can set various model hyperparamters here
model = sklearn.svm.LinearSVC(C=1000)
# then train the model!
model.fit(X_train,y_train)

In [None]:
# make a prediction
sample_text = ['retweet','if','you','are','a','cat','person']
# create just a single vector as input (as a 1 x V matrix)
# reuse the same conversion as for the training data, so that words missing
# from the vocabulary are counted in the OOV column instead of raising KeyError
sample_x_in = convert_to_bow_matrix([sample_text], word2id)

# what does the example document look like?
print(sample_x_in)
prediction = model.predict(sample_x_in)
# what category was predicted?
print("Prediction was:",prediction[0])
# what category was that?
print(cat2id)

### 7. Evaluating the model

In [None]:
# evaluate on training data: how well did we fit to the data we trained on?
# predict() only queries the fitted model, it does not retrain it
y_train_predictions = model.predict(X_train)

# now can compute any metrics we care about. Let's quickly do accuracy
def compute_accuracy(predictions, true_values):
    """Return the fraction of predictions that equal their true value.

    Pairs are formed with ``zip``, so the comparison stops at the shorter of
    the two sequences, while the denominator is always ``len(predictions)``.

    Raises:
        ZeroDivisionError: if ``predictions`` is empty.
    """
    hits = sum(1 for guess, actual in zip(predictions, true_values) if guess == actual)
    return hits / len(predictions)

# accuracy on the data the model was fitted on
accuracy = compute_accuracy(y_train_predictions,y_train)
print("Accuracy:",accuracy)
# how did we do?

Is that a good score? The score can be informative, but it isn't hard to do well on the training data.

### 8. Using the test set

In [None]:
# prepare test data in the same way as training data
# the training word2id is reused, so test-only words fall into the OOV column
X_test = convert_to_bow_matrix(preprocessed_test_data, word2id)
# NOTE: cat2id only holds categories seen in training; a test category that
# is missing from it raises KeyError here
y_test = [cat2id[cat] for cat in test_categories]

In [None]:
# now evaluate on test data: data the model has NOT seen during training time
# make sure you do NOT update the model, only get predictions from it
y_test_predictions = model.predict(X_test)
# note: this overwrites the training accuracy stored in `accuracy`
accuracy = compute_accuracy(y_test_predictions,y_test)
print("Accuracy:",accuracy)

In [None]:
# list the category names in order of their ids, so that position i in
# cat_names is the name of label id i, then print the per-category report
cat_names = sorted(cat2id, key=cat2id.get)
print(classification_report(y_test, y_test_predictions, target_names=cat_names))

In [None]:
# what would a simple baseline be? How about always predicting the most common
# training category from before (Gaming)?
# we should *definitely* be doing better than this! Otherwise the model is not helping at all
# look the category up instead of hard-coding its name
most_common_category = collections.Counter(training_categories).most_common(1)[0][0]
baseline_predictions = [cat2id[most_common_category]] * len(y_test)
# the baseline has one prediction per *test* document, so it must be scored
# against the test labels (not y_train)
baseline_accuracy = compute_accuracy(baseline_predictions,y_test)
print("Accuracy:",baseline_accuracy)

In [None]:
# trying a different model...
# how about a random forest classifier?
%time
model = sklearn.ensemble.RandomForestClassifier()
model.fit(X_train,y_train)

y_train_predictions = model.predict(X_train)
print("Train accuracy was:",compute_accuracy(y_train_predictions,y_train))
y_test_predictions = model.predict(X_test)
print("Test accuracy was:",compute_accuracy(y_test_predictions,y_test))

In [None]:

# per-category scores for the latest test predictions;
# category names are listed in order of their ids to match the label ids
cat_names = [name for name, _ in sorted(cat2id.items(), key=lambda pair: pair[1])]
print(classification_report(y_test, y_test_predictions, target_names=cat_names))

### 9. Other models to try?

Check out all of the multiclass-ready models in scikit-learn:
https://scikit-learn.org/stable/modules/multiclass.html