# 2 Naive Classifier

## 2.0 Imports and Setup

In [1]:
import nltk
import gensim
import itertools
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [2]:
df = pd.read_csv('dataframe_raw.temp.csv').drop(columns = ['Unnamed: 0'])
df['headline'] = df['headline'].str.replace('[^a-zA-Z ]', '')
df['headline'] = df['headline'].str.lower()

## 2.1 Preparation

In [3]:
popular_words = pd.DataFrame(columns = ['category', 'words'])
for category in tqdm(set(df.category), position = 0):
    category_headlines = [
        h for h in df[df['category'] == category].headline.tolist() if type(h) == str
    ]
    category_words_by_frequency = Counter(''.join(category_headlines).split(' ')).most_common()
    all_words = [s for s in [tup[0] for tup in category_words_by_frequency] if s]

    words = [
        word for (word, pos) in nltk.pos_tag(all_words) if pos[:2] in ['NN', 'JJ', 'VB']
    ]
    
    popular_words = popular_words.append(
        {'category' : category, 'words' : words}, ignore_index = True
    )

100%|██████████| 41/41 [00:42<00:00,  1.03s/it]


In [4]:
most_common_english_words_dict = nltk.FreqDist(nltk.corpus.brown.words()).most_common()[:1000]
most_common_words = [tup[0].lower() for tup in most_common_english_words_dict]

In [5]:
popular_words['corpus_specific_words'] = [
    [w for w in s if w not in most_common_words] for s in popular_words.words
]

In [6]:
categories_cnt = list(Counter(df.category).most_common())
benchmark = round(
    max([obs[1] for obs in categories_cnt]) / sum([obs[1] for obs in categories_cnt]), 3
)
print('Always predicting the most common category would yield an accuracy of about',
      benchmark * 100, '%.')

Always predicting the most common category would yield an accuracy of about 16.3 %.


In [22]:
min_len = min([len(words) for words in popular_words.corpus_specific_words])
print('The smallest word list has', min_len, 'words.')

The smallest word list has 2365 words.


## 2.2 Naive classifier

In [8]:
def naive_classifier(sample_size, nr_words, reward_strength):
    actual_categories, predicted_categories = [], []
    correct_predictions, false_predictions = 0, 0

    for index, row in df.sample(sample_size).iterrows():
        correct_category, best_category = row.category, [0, 0]
    
        for category in set(df.category):
            top_words = list(
                popular_words.loc[popular_words['category'] == category].\
                corpus_specific_words.tolist()[0])[:nr_words]
            
            reward_score = 0
            length = len(row.headline.split(' ')) if type(row.headline) == str else 100

            if type(row.headline) == str:
                for word in row.headline.split(' '):
                    if word in top_words:
                        reward_score += (nr_words - top_words.index(word)) ** reward_strength
            
            reward_score /= length
            if len(best_category) == 0: best_category = [category, reward_score]
            if reward_score > best_category[1]: best_category = [category, reward_score]
                
        if best_category[0] == correct_category: correct_predictions += 1
        else: false_predictions += 1
        actual_categories.append(correct_category)
        predicted_categories.append(best_category[0])

    accuracy = round(correct_predictions / sample_size * 100, 2)
    actual_predicted = list(zip(actual_categories, predicted_categories))
    
    return (accuracy, actual_predicted)

## 2.3 Classifier evaluation

In [9]:
def evaluate_classifier(sample_size, nrs_words, reward_strengths):
    
    classifier_evaluation = {}
    
    for params in tqdm(list(itertools.product(nrs_words, reward_strengths)), position = 0):
        accuracy, confusion = naive_classifier(sample_size, params[0], params[1])
        classifier_evaluation[(params[0], params[1])] = (accuracy, confusion)

    return sorted(classifier_evaluation.items(), key=lambda x: x[1], reverse=True)

In [32]:
results = evaluate_classifier(1000, [20, 200, 2000], [0.5, 1, 2])
best_accuracy = [(tup[0], tup[1][0]) for tup in results][0][1]
print('The best classifier has a', best_accuracy, '% accuracy.')

The best classifier has a 49.7 % accuracy.
