# Bag of Words Classifier

This notebook trains a bag-of-words classifier for news topics: a simple multinomial logistic regression over bag-of-words features. Before considering whether we need a more complex ML algorithm, let's see how well this simple baseline does on the news data.

In [123]:
import string
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from nltk.tokenize.regexp import WordPunctTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, train_test_split


## Load the Data

In [124]:
# Load the pickled training features and labels, timing the read so the
# cost of a fresh run is visible in the cell output.
start_time = time.time()

data, labels = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './data/train_data_mini.pickle',
        './data/train_labels_mini.pickle',
    )
)

end_time = time.time()

print('Done!')
print(f'Ran in {(end_time - start_time)/60:.02f}m')


Done!
Ran in 0.45m


In [125]:
# Hold out 20% of the rows for validation. The split is seeded so that
# Restart & Run All reproduces the same train/validation partition (and
# therefore the same scores) on every run.
train_data, valid_data, train_labels, valid_labels = train_test_split(
    data, labels, test_size=.2, random_state=42
)

## Training and Validating the Model

In [126]:
# Build a small, balanced training subset: up to `n_per_cat` rows from each
# of the `n_cats` most frequent categories in the training labels.
n_cats = 20
n_per_cat = 20

# value_counts() sorts by frequency, so the first n_cats entries are the
# most common categories.
cats = train_labels.value_counts().iloc[:n_cats]

# Boolean row mask over the training split (positional, not label-based).
mask = np.zeros(len(train_labels), dtype=bool)

for cat in cats.index:
    # Positions of this category's rows, in training order.
    cat_positions = np.flatnonzero((train_labels == cat).values)

    # Select by position rather than by comparing index labels: the rows were
    # shuffled by train_test_split, so index labels are not ordered and a
    # label cut-off would keep an arbitrary number of rows. Slicing also
    # copes with categories that have fewer than n_per_cat rows.
    mask[cat_positions[:n_per_cat]] = True

data_tmp = train_data[mask]
labels_tmp = train_labels[mask]

In [None]:
# Candidate models: the same multinomial logistic regression at three
# values of C.
models = [
    LogisticRegression(multi_class='multinomial', solver='lbfgs', C=c_value, max_iter=4000, n_jobs=-1)
    for c_value in (1e1, 1e2, 1e3)
]

start_time = time.time()

# Fit each candidate on the subsampled training data and record its score
# on the validation split.
valid_scores = []
for i, model in enumerate(models):
    print(f'Training and validating model {i+1} / {len(models)} ...')

    model.fit(data_tmp, labels_tmp)
    valid_scores.append(model.score(valid_data, valid_labels))

# Keep the first model that reaches the highest validation score.
best_model_index = int(np.argmax(valid_scores))
best_model = models[best_model_index]

end_time = time.time()

print(f'Best model is #{best_model_index + 1}')
print(f'Total training time: {(end_time - start_time)/60:.02f}m')


Training and validating model 1 / 3 ...


## Viewing the Results

In [None]:
# Predict a label for every row of the validation split with the selected model.
valid_preds = best_model.predict(valid_data)


In [None]:
# Count validation predictions that match the true labels. The comparison is
# positional, so the labels are converted to a plain array first.
correct = np.sum(valid_preds == np.asarray(valid_labels))
total = len(valid_labels)

print(f'Validation accuracy of {(correct / total)*100:.02f}%')


In [None]:
def show_confusion_matrix(labels, predictions):
    """Plot a row-normalized confusion matrix and return the category index map.

    Each row corresponds to a true category and each column to a predicted
    category. Entry (i, j) is the fraction of samples whose true category is
    i that were predicted as category j.

    Args:
        labels: True labels; must provide ``.unique()`` (e.g. a pandas Series).
        predictions: Array-like of predicted labels, aligned by position with
            ``labels``.

    Returns:
        dict mapping each category found in ``labels`` to its row/column index
        in the plotted matrix. Predictions of a category that never occurs in
        ``labels`` are not counted in any column.
    """
    categories = labels.unique()
    category_encoder = {c: i for i, c in enumerate(categories)}

    # Work on plain arrays so the row selection below is purely positional
    # and also accepts lists of predictions.
    true_values = np.asarray(labels)
    pred_values = np.asarray(predictions)

    n_categories = len(categories)
    confusion_matrix = np.zeros((n_categories, n_categories))

    for true_category, row_index in category_encoder.items():
        row_preds = pred_values[true_values == true_category]
        if len(row_preds) == 0:
            # Nothing to normalize (e.g. a NaN category never equals itself).
            continue

        for pred_category, col_index in category_encoder.items():
            confusion_matrix[row_index, col_index] = np.sum(row_preds == pred_category)

        # Normalize by the number of samples of this true category.
        confusion_matrix[row_index, :] /= len(row_preds)

    fig = plt.figure()
    ax = fig.add_subplot(111)

    image = ax.matshow(confusion_matrix)

    fig.colorbar(image)

    # Label the matrix axes explicitly instead of relying on pyplot's
    # "current axes" state.
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    return category_encoder


In [None]:
# Plot the confusion matrix for the validation split; the returned
# category -> index mapping is displayed as the cell's output.
show_confusion_matrix(valid_labels, valid_preds)

## Persisting the Model

In [None]:
# Persist the selected model. dump() does not create missing parent
# directories, so make sure the target folder exists on a fresh checkout.
model_path = 'models/bag_of_words_model.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

dump(best_model, model_path)