In [2]:
import numpy as np
import pandas as pd
import sys

from Preprocessing import prepare_data
from Preprocessing import topics_to_num

import time
from datetime import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif


from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score 
from sklearn.metrics import balanced_accuracy_score 


In [3]:
# Pre-processing.

# Get the training data.
train_data = pd.read_csv("training.csv")
test_data = pd.read_csv("test.csv")


# Pre-process the training data.
train_X, val_X, train_labels, val_labels, num_classes, topic_map = prepare_data()

# Pre-process for counts
vect = CountVectorizer()

X_train_vect = vect.fit_transform(train_data['article_words'])
y_train = train_data['topic'].apply(lambda x: topic_map[x])

X_test_vect = vect.transform(test_data['article_words'])
y_test = test_data['topic'].apply(lambda x: topic_map[x])

<b> Feature Selection using Mutual Information </b>

In [3]:
#selects top 10k best features based on mutual_info metric
selector_1 = SelectKBest(mutual_info_classif, k=min(10000, X_train_vect.shape[1])) 
selector_1.fit(X_train_vect, y_train)

#Transform features
X_train_mic = selector_1.transform(X_train_vect)
X_test_mic = selector_1.transform(X_test_vect)

In [4]:
#selects top 20k best features based on mutual_info metric
selector_2 = SelectKBest(mutual_info_classif, k=min(20000, X_train_vect.shape[1])) 
selector_2.fit(X_train_vect, y_train)

#Transform features
X_train_mic_2 = selector_2.transform(X_train_vect)
X_test_mic_2 = selector_2.transform(X_test_vect)


In [9]:
##Results 

# Time the total execution.
total_time = datetime.now()

# Implement Multinomial Naive Bayes (MNB) using TFIDF scaling

mnb = MultinomialNB()
# Train the MNB classifier.
mnb.fit(train_X, train_labels)
# Test our MNB results.
train_result = mnb.score(train_X, train_labels)
print("The final score for the training database on the MNB classifier is: ", round(train_result, 4))
test_result = mnb.score(val_X, val_labels)
pred = mnb.predict(val_X)
b_acc_score = balanced_accuracy_score(pred,val_labels)
print("The final score for the test database on the MNB classifier is: ", round(test_result, 4))
print("The final balance accuracy score for the test set on the MNB classifier is: ", round(b_acc_score, 4), '\n')


# Implement Multinomial Naive Bayes (MNB) using counts

count_mnb = MultinomialNB().fit(X_train_vect,y_train)
train_result = count_mnb.score(X_train_vect,y_train)
print("The final score for the training database on the count MNB classifier is: ", round(train_result, 4))

test_result = count_mnb.score(X_test_vect,y_test)
print("The final score for the test database on the count MNB classifier is: ", round(test_result, 4))

pred = count_mnb.predict(X_test_vect)
b_acc_score = balanced_accuracy_score(pred,y_test)
print("The final balance accuracy score for the test set on the count MNB classifier is: ", round(b_acc_score, 4), '\n')

# Implement Multinomial Naive Bayes (MNB) using counts and top 10k mutual info features

mic_mnb = MultinomialNB().fit(X_train_mic,y_train)
train_result = mic_mnb.score(X_train_mic,y_train)
print("The final training score on the top 10k mutual info feature selection MNB classifier is: ", round(train_result, 4))

test_result = mic_mnb.score(X_test_mic,y_test)
print("The final test score on the top 10k mutual info feature selection MNB classifier is: ", round(test_result, 4))

pred = mic_mnb.predict(X_test_mic)
b_acc_score = balanced_accuracy_score(pred,y_test)
print("The final balance accuracy score on the top 10k mutual info feature selection MNB classifier is: ", round(b_acc_score, 4), '\n')

# Implement Multinomial Naive Bayes (MNB) using counts and top 20k mutual info features

mic2_mnb = MultinomialNB().fit(X_train_mic_2,y_train)
train_result = mic2_mnb.score(X_train_mic_2,y_train)
print("\nThe final training score on the top 20k mutual info feature selection MNB classifier is: ", round(train_result, 4))

test_result = mic2_mnb.score(X_test_mic_2,y_test)
print("The final test score on the top 20k mutual info feature selection MNB classifier is: ", round(test_result, 4))

pred = mic2_mnb.predict(X_test_mic_2)
b_acc_score = balanced_accuracy_score(pred,y_test)
print("The final balance accuracy score on the top 20k mutual info feature selection MNB classifier is: ", round(b_acc_score, 4), '\n')

print('\n\nTotal execution time: ', datetime.now() - total_time)

The final score for the training database on the MNB classifier is:  0.6946
The final score for the test database on the MNB classifier is:  0.6779
The final balance accuracy score for the test set on the MNB classifier is:  0.6785 

The final score for the training database on the count MNB classifier is:  0.8233
The final score for the test database on the count MNB classifier is:  0.728
The final balance accuracy score for the test set on the count MNB classifier is:  0.6436 

The final training score on the top 10k mutual info feature selection MNB classifier is:  0.7632
The final test score on the top 10k mutual info feature selection MNB classifier is:  0.68
The final balance accuracy score on the top 10k mutual info feature selection MNB classifier is:  0.5634 


The final training score on the top 20k mutual info feature selection MNB classifier is:  0.8131
The final test score on the top 20k mutual info feature selection MNB classifier is:  0.72
The final balance accuracy scor

