In [None]:
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB, BernoulliNB
from sklearn.metrics import confusion_matrix, classification_report
from zipfile import ZipFile

# Naive Bayes for Text, Multi-Class

Text data, *without preprocessing*, is qualitative data. Let's use Naive Bayes to classify some text data! Today's data has more than two classes, so this is multi-class classification rather than binary classification. 

I'm going to be using the news dataset from [here](https://data.world/elenadata/vox-articles). Side note: this data set was released for a workshop in 2017 that I co-organized!

## I. Load and Look at our data

Let's load and __look at our data__. Where is the dependent variable?

This data is big, so I zipped it. Let's look at the first five lines.

In [None]:
# Peek at the raw file: pull the first five lines straight out of the zip
# archive (no extraction to disk) and print them as a list. The archive member
# is opened as a binary stream, so each line is a bytes object.
with ZipFile('data/dsjVoxArticles.zip') as archive, archive.open('dsjVoxArticles.tsv', 'r') as tsv:
    lines = []
    while len(lines) < 5:
        lines.append(next(tsv))
    print(lines)

For efficiency, I'm going to ignore the article bodies and just use the titles. (They would need quite a bit of preprocessing anyway since they contain markup.) So I want the first and third fields.

In [None]:
# Collect one [title, category] pair per row: the first and third
# tab-separated fields of every line in the zipped TSV.
data = []

with ZipFile('data/dsjVoxArticles.zip') as z:
    with z.open('dsjVoxArticles.tsv', 'r') as tsv:
        for line in tsv:
            # Remove only the line terminator before splitting. A bare strip()
            # would also eat leading/trailing tabs, so a row whose first field
            # is empty would have its columns shifted left.
            # maxsplit=3 leaves everything after the third field unsplit.
            cols = line.decode('utf-8').rstrip('\r\n').split('\t', 3)
            if len(cols) < 3:
                # Blank or truncated row: there is no third field to read.
                continue
            # Trim stray whitespace per field, after the columns are fixed.
            data.append([cols[0].strip(), cols[2].strip()])

Let's make this into a numpy array and take a look.
* How many data points?
* How many classes?
* What are the classes, anyway?

In [None]:
# Turn the list of [title, category] rows into a NumPy array.
data = np.array(data)

# Shape answers "how many data points?"
print(data.shape)

# Column 1 holds the category; np.unique lists each distinct value once.
categories = np.unique(data[:, 1])
print(categories)

Well, that's too many classes, and some of them are super specific. Let's just take five pretty generic classes.

In [None]:
# Keep only five fairly generic categories out of the full label set.
KEPT_CATEGORIES = [
    'Business & Finance',
    'Health Care',
    'Science & Health',
    'Politics & Policy',
    'Criminal Justice',
]

# A boolean mask selects the matching rows directly; np.where is not needed.
reduced_data = data[np.isin(data[:, 1], KEPT_CATEGORIES)]

# Shuffle the rows in place with a fixed seed, so the train/dev/test split
# made from this array (and every score computed from it) is the same on
# each run of the notebook.
np.random.default_rng(0).shuffle(reduced_data)

print(reduced_data.shape)
# Class labels together with how many rows carry each one.
print(np.unique(reduced_data[:, 1], return_counts=True))

## II. Split the data

Let's split the data into train, dev and test. 

When we check by printing shapes and unique values, does everything look okay?

In [None]:
# Cut the rows into train / dev / test at the 80% and 90% marks
# (the cut points are truncated to whole row indices).
n_rows = len(reduced_data)
train_end = int(.8 * n_rows)
dev_end = int(.9 * n_rows)
train_data, dev_data, test_data = np.split(reduced_data, [train_end, dev_end])

# Sanity check 1: the size of each split.
print(train_data.shape, dev_data.shape, test_data.shape)
# Sanity check 2: the set of labels present in each split.
print(*(np.unique(part[:, 1]) for part in (train_data, dev_data, test_data)))

## III. Preprocess the data

On Monday we tokenized the data and extracted counts for each token for each class ourselves.

Today I'm going to use two scikit-learn utilities:
* CountVectorizer - will tokenize and count
* LabelEncoder - will map the string labels to ints

As on Monday, I use *only the training data* to extract my token vocabulary.

In [None]:
# Learn the token vocabulary (capped at 1000 features, lowercased word tokens)
# from the training titles only.
vectorizer = CountVectorizer(lowercase=True, analyzer='word', max_features=1000)
vectorizer.fit(train_data[:, 0])

# Turn each split's titles (column 0) into a dense count matrix.
# .toarray() yields a plain ndarray rather than an np.matrix; sklearn 1.0
# doesn't want matrices for naive Bayes.
train_processed, dev_processed, test_processed = (
    vectorizer.transform(part[:, 0]).toarray()
    for part in (train_data, dev_data, test_data)
)

# Map the string labels (column 1) to ints, again fitting on train only.
encoder = LabelEncoder()
encoder.fit(train_data[:, 1])
train_labels, dev_labels, test_labels = (
    encoder.transform(part[:, 1])
    for part in (train_data, dev_data, test_data)
)

## IV. Fit, Predict and Score

Today I'm going to compare the performance of several scikit-learn Naive Bayes alternatives on this dataset. If you recall from last week, these variations on Naive Bayes model different *probability distributions* over the training data, rather than using the likelihoods and priors directly.

Although we aren't using our own, hand-written Naive Bayes, you can see that the pattern is the same:
1. Fit
2. Predict
3. Score

With respect to "score", you'll see we are calculating:
* precision
* recall
* F1

*per class*. 

In [None]:
# Multinomial Naive Bayes: fit on the training counts, predict the dev set,
# then print the per-class precision/recall/F1 report and the confusion matrix.
nb = MultinomialNB().fit(train_processed, train_labels)
pred = nb.predict(dev_processed)

report = classification_report(dev_labels, pred, target_names=encoder.classes_)
print(report)

matrix = confusion_matrix(dev_labels, pred)
print(matrix)

In [None]:
# Gaussian Naive Bayes: fit on the training counts, predict the dev set,
# then print the per-class precision/recall/F1 report and the confusion matrix.
nb = GaussianNB().fit(train_processed, train_labels)
pred = nb.predict(dev_processed)

report = classification_report(dev_labels, pred, target_names=encoder.classes_)
print(report)

matrix = confusion_matrix(dev_labels, pred)
print(matrix)

In [None]:
# Complement Naive Bayes: fit on the training counts, predict the dev set,
# then print the per-class precision/recall/F1 report and the confusion matrix.
nb = ComplementNB().fit(train_processed, train_labels)
pred = nb.predict(dev_processed)

report = classification_report(dev_labels, pred, target_names=encoder.classes_)
print(report)

matrix = confusion_matrix(dev_labels, pred)
print(matrix)

In [None]:
# Bernoulli Naive Bayes: fit on the training counts, predict the dev set,
# then print the per-class precision/recall/F1 report and the confusion matrix.
nb = BernoulliNB().fit(train_processed, train_labels)
pred = nb.predict(dev_processed)

report = classification_report(dev_labels, pred, target_names=encoder.classes_)
print(report)

matrix = confusion_matrix(dev_labels, pred)
print(matrix)

## V. Questions

1. What are the definitions for precision, recall and F1, and how do they relate to the confusion matrix?
2. What do "macro avg" and "weighted avg" mean?
3. Which variant of Naive Bayes works the best on this data?
4. Is there a class that is consistently miscategorized regardless of method?
5. Which metric or way of analyzing the results makes the most sense to you? Why?