# Bag of Words Classifier

Here is a bag-of-words classifier for news topics. This model is a simple logistic regression model using bag of words. Before considering whether we need to use a more complex ML algorithm, let's try using a simple classifier for the news data.

In [45]:
import numpy as np
import pandas as pd
import string
import time

from nltk.tokenize.regexp import WordPunctTokenizer
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split


## Load the Data

In [46]:
# The dataset provided is malformed JSON. Need to fix up the JSON formatting
# so that it can be ingested by pandas.

with open('./News_Category_Dataset_v2.json') as file:
    lines = file.readlines()
    json = f'[{",".join(lines)}]'


In [47]:
data = pd.read_json(json, orient='records')

## Downsample

In [48]:
def downsample(df, c):
    category_counts = df['category'].value_counts()
    min_count = category_counts.min()

    # Calculate the probability of keeping a row
    # of a given category.
    category_probs = (min_count / category_counts) ** (1/c)

    # This is a series used to determine the probability that each
    # row is kept. Each rows mask depends on its category.
    prob_mask = np.zeros(len(df))

    for i, category in enumerate(category_counts.index.tolist()):
        category_prob = category_probs[i]
        category_keep_mask = (df['category'] == category) * category_prob
        prob_mask = prob_mask + category_keep_mask

    keep_mask = np.random.rand(len(df)) <= prob_mask
    
    return df[keep_mask]


In [49]:
data = downsample(data, c=2)
data = data.sample(frac=0.6)

## Pre-Processing

In [50]:
tokenizer = WordPunctTokenizer()

# Special token for tokens that occur MIN_WORD_FFEQ or fewer times in the
# entire corpus.
__LOW_FREQ_TOKEN__ = '__LOW_FREQ_TOKEN__'

MIN_WORD_FREQ = 5

In [51]:
def cleanup_and_tokenize_text(text):
    cleaned = ''.join([c for c in text if c not in string.punctuation]).lower()
    return tokenizer.tokenize(cleaned)


In [52]:
def tokenize_rows(df):
    tokenized_headlines = df['headline'].apply(cleanup_and_tokenize_text).tolist()
    tokenized_desc = df['short_description'].apply(cleanup_and_tokenize_text).tolist()

    return [tokens1 + tokens2 for tokens1, tokens2 in zip(tokenized_headlines, tokenized_desc)]
    

In [53]:
def create_unigram_counts(rows):
    # Flatten
    tokens = [t for tokens in rows for t in tokens]
    
    counts = {}

    for token in tokens:
        if token not in counts:
            counts[token] = 0
        counts[token] += 1

    return counts
 

In [54]:
def create_encoder_and_decoder(unigram_counts):
    encoder = {t:i for i,t in enumerate(unigram_counts.keys())}
    decoder = {i:t for t,i in encoder.items()}
    
    return encoder, decoder
   

In [55]:
def create_bow_dataframe(encoded_token_rows, encoder, decoder):
    bows = np.zeros((len(encoded_token_rows), len(encoder)))

    for i, encoded_tokens in enumerate(encoded_token_rows):
        for encoded in encoded_tokens:
            bows[i, encoded] += 1

    columns = [decoder[i] for i in range(len(decoder))]
    bows_sparse = sparse.csr_matrix(bows)
    df = pd.DataFrame.sparse.from_spmatrix(bows_sparse, columns=columns)
    
    return df
   

In [56]:
start_time = time.time()

print('[1/7] Tokenizing rows...')
token_rows = tokenize_rows(data)

print('[2/7] Generating global unigram count...')
unigram_counts = create_unigram_counts(token_rows)

print('[3/7] Filtering out low-frequency words...')
token_rows = [[token if unigram_counts[token] > MIN_WORD_FREQ else __LOW_FREQ_TOKEN__ for token in tokens] for tokens in token_rows]

print('[4/7] Re-computing unigram counts...')
unigram_counts = create_unigram_counts(token_rows)
print(f'      NOTE: There are {len(unigram_counts)} tokens in the vocab.')

print('[5/7] Create encoder / decoder...')
encoder, decoder = create_encoder_and_decoder(unigram_counts)

print('[6/7] Encoding Token Rows...')
encoded_token_rows = [[encoder[t] for t in tokens] for tokens in token_rows]

print('[7/7] Creating Bag Of Words DataFrame...')
data_bow = create_bow_dataframe(encoded_token_rows, encoder, decoder)

end_time = time.time()

print('Done!')
print(f'Ran in {(end_time - start_time)/60:.02f}m')


[1/7] Tokenizing rows...
[2/7] Generating global unigram count...
[3/7] Filtering out low-frequency words...
[4/7] Re-computing unigram counts...
      NOTE: There are 15364 tokens in the vocab.
[5/7] Create encoder / decoder...
[6/7] Encoding Token Rows...
[7/7] Creating Bag Of Words DataFrame...
Done!
Ran in 0.36m


## Splitting Data

In [57]:
labels = data['category']

start_time = time.time()

# X_train, X_test, y_train, y_test = train_test_split(data_bow, labels, test_size=0.4)
X_train = data_bow
y_train = labels

end_time = time.time()

print('Done!')
print(f'Ran in {(end_time - start_time)/60:.02f}m')
print(f'There are {len(X_train)} training samples.')

Done!
Ran in 0.00m
There are 49449 training samples.


## Training and Cross-Validating the Model

In [58]:
models = [
#     LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1e1, max_iter=4000, n_jobs=-1),
    LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1e2, max_iter=4000, n_jobs=-1),
#     LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1e3, max_iter=4000, n_jobs=-1),
]

start_time = time.time()

scores = []

for i, model in enumerate(models):
    print(f'Training and validating model {i+1} / {len(models)} ...')
    model.fit(X_train, y_train)
    mean_score = model.score(X_train, y_train)
#     mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    scores.append(mean_score)

best_model = models[scores.index(max(scores))]

end_time = time.time()

print(f'Best model is #{scores.index(max(scores)) + 1}')
print(f'Total training time: {(end_time - start_time)/60:.02f}m')


Training and validating model 1 / 1 ...
Best model is #1
Total training time: 229.23m


In [59]:
predictions = model.predict(X_train)

correct = (predictions == y_train).sum()
total = len(predictions)

print(f'Train Accuracy: {float(correct)/total*100:.02f}%')


Train Accuracy: 99.99%


## Persisting the Model

In [63]:
from joblib import dump, load

In [65]:
dump(model, 'bag_of_words_model.joblib')

['bag_of_words_model.joblib']

## Viewing the Results

In [62]:

model.get_params()


array([[-4.72688458e-04,  4.58161293e-01, -8.26945159e-02, ...,
        -6.03456905e-03, -3.52710947e-02, -1.25955779e-04],
       [ 3.32865880e-01,  3.29511216e-01, -7.68876380e-01, ...,
        -3.09979825e-04, -1.40003729e-02, -4.12741696e-03],
       [ 2.95439415e-02, -3.59347440e-01, -5.04046730e-01, ...,
        -2.35206418e-01, -1.89577439e-02, -5.02090778e-03],
       ...,
       [-2.96556104e-01, -1.38057763e+00,  2.22909983e+00, ...,
         2.79526415e-01, -1.31516114e-01, -2.90846068e-03],
       [ 7.28923194e-02, -5.57498442e-01,  5.39772366e-01, ...,
        -4.79895911e-03, -8.76729519e-04, -6.89602812e-04],
       [-1.70823405e-01, -6.24261865e-01, -3.62624733e-01, ...,
        -1.02379076e-02, -1.42426855e-01, -3.59644559e-04]])

In [60]:
predictions = model.predict(X_test)

correct = (predictions == y_test).sum()
total = len(predictions)

print(f'Test Accuracy: {float(correct)/total*100:.02f}%')


ValueError: X has 20423 features per sample; expecting 15364