In [1]:
import pandas as pd 
import numpy as np

import sklearn 
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatStratifiedKFold
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt


In [2]:
data_copy = pd.read_csv('processed_data.csv')
data = data_copy.copy()

In [3]:
del data['title']
del data['text']
del data['text_without_stopwords']
del data['title_without_stopwords']
del data['overall_content']
del data['polarity_category']
del data['polarity_category_Neutral']
del data['polarity_category_Positive']

In [4]:
data

Unnamed: 0,class,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,title_average_word_length,text_punctuation_count,title_punctuation_count,text_stopwords_count,title_stopwords_count,syllables,flesch_readability,subjectivity,polarity,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability
0,1,516,13,28,1,4.804040,5.583333,121,1,186,1,870,0.523532,0.599895,0.082132,0.002194,0.747636,0.001007,0.157660,0.091503
1,1,309,9,11,1,5.213115,7.625000,39,0,119,0,565,-0.005318,0.334098,-0.005004,0.064904,0.244962,0.557051,0.002320,0.130763
2,1,600,16,25,1,5.168966,5.000000,148,0,209,0,1048,0.262617,0.541969,-0.012345,0.002488,0.433611,0.281460,0.001917,0.280524
3,1,475,15,15,1,5.180180,4.571429,118,2,160,0,805,0.180632,0.394086,-0.023118,0.002963,0.788261,0.204377,0.002290,0.002109
4,1,434,12,19,1,4.554762,5.363636,40,0,195,0,688,0.621461,0.495222,-0.011722,0.292172,0.327938,0.001138,0.020911,0.357842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38653,0,482,9,15,1,5.008639,5.888889,68,4,179,1,818,0.164343,0.447884,0.219279,0.914667,0.001589,0.001079,0.081064,0.001601
38654,0,131,7,6,1,5.336066,6.571429,15,0,45,1,243,0.094151,0.077778,0.022222,0.131316,0.004004,0.530149,0.330457,0.004075
38655,0,334,7,16,1,5.044164,6.142857,45,0,127,2,587,0.317218,0.426609,0.054382,0.105075,0.002268,0.001528,0.172448,0.718681
38656,0,210,9,8,1,4.806931,5.888889,18,0,81,2,349,0.380865,0.377753,0.021993,0.957421,0.003175,0.002141,0.003484,0.033779


In [5]:
Y = data['class']
X = data[data.columns[data.columns != 'class']]

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 4222)

### Attempt 1: Gaussian NB with all features

In [7]:
GausNB = GaussianNB()
GausNB.fit(x_train, y_train)

y_pred = GausNB.predict(x_test)
print(accuracy_score(y_test, y_pred))

0.8707535782031385


### Attempt 2: Gaussian NB without title features

In [8]:
testing = data.copy()

unwanted_columns = ['title_word_count', 'title_sentence_count', 'title_average_count', 
                    'title_punctuation_count', 'title_stopwords_count']
select = [x for x in testing.columns if x not in unwanted_columns]
new_data = testing.loc[:, select]

Y_new = new_data['class']
X_new = new_data[new_data.columns[new_data.columns != 'class']]

In [10]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(X_new, Y_new, test_size = 0.3, random_state = 4222)

In [11]:
GausNB.fit(x_train2, y_train2)

y_pred2 = GausNB.predict(x_test2)
print(accuracy_score(y_test2, x_pred2))

0.7418520434557683


### Attempt 3: Remove correlated features

Naive Bayes' assumes all features are independent of one another, which in the ideal case means every pair of features has zero correlation. While this assumption may be hard to hold in practice, we can avoid using features which have high correlation with one another. 

In the context of our problem, we shall define to have high correlation if the absolute value of their correlation coefficient is greater than 0.9

In [12]:
corr_matrix = X.corr()
rows, columns = X.shape
fields = list(X.columns)
corr_matrix

Unnamed: 0,text_word_count,title_word_count,text_sentence_count,title_sentence_count,text_average_word_length,title_average_word_length,text_punctuation_count,title_punctuation_count,text_stopwords_count,title_stopwords_count,syllables,flesch_readability,subjectivity,polarity,Topic 1 Probability,Topic 2 Probability,Topic 3 Probbility,Topic 4 Probability,Topic 5 Probability
text_word_count,1.0,0.063527,0.894339,-0.029711,-0.048997,-0.021199,0.665547,-0.042165,0.984571,-0.02241,0.987307,0.016769,0.099872,0.007767,-0.098814,-0.032234,0.017043,0.07219,0.066297
title_word_count,0.063527,1.0,0.040535,0.220464,-0.042884,-0.228843,0.07924,0.214594,0.103624,-0.144684,0.034791,0.061855,0.235157,0.001635,-0.272847,0.332102,-0.053517,-0.19825,0.150961
text_sentence_count,0.894339,0.040535,1.0,-0.031282,-0.0553,-0.020864,0.634969,-0.029388,0.88947,-0.003239,0.878777,0.100866,0.120809,0.013445,-0.108822,0.026285,-0.033102,0.051261,0.068415
title_sentence_count,-0.029711,0.220464,-0.031282,1.0,0.024298,-0.013625,-0.007821,0.244355,-0.025901,-0.102891,-0.033719,0.004931,0.073178,0.033625,-0.119914,0.123447,0.009688,-0.054261,0.030306
text_average_word_length,-0.048997,-0.042884,-0.0553,0.024298,1.0,0.558311,0.028104,0.109214,-0.062622,0.014457,-0.029205,-0.927928,-0.145636,-0.029505,0.000739,-0.028234,0.034738,0.02225,-0.018975
title_average_word_length,-0.021199,-0.228843,-0.020864,-0.013625,0.558311,1.0,-0.018248,0.135039,-0.028981,-0.12362,-0.011523,-0.585251,-0.065464,-0.021178,-0.001342,-0.057816,0.046248,0.022032,0.012804
text_punctuation_count,0.665547,0.07924,0.634969,-0.007821,0.028104,-0.018248,1.0,0.000376,0.638032,-0.051458,0.738432,-0.079751,0.098148,0.013066,-0.102783,0.039453,-0.007973,0.029496,0.045121
title_punctuation_count,-0.042165,0.214594,-0.029388,0.244355,0.109214,0.135039,0.000376,1.0,-0.045602,-0.011577,-0.045741,-0.08615,0.052383,0.009086,0.028941,0.086718,-0.017807,-0.097282,-0.026863
text_stopwords_count,0.984571,0.103624,0.88947,-0.025901,-0.062622,-0.028981,0.638032,-0.045602,1.0,-0.052992,0.966385,0.036165,0.127285,0.010435,-0.114795,0.009665,0.006835,0.037399,0.077152
title_stopwords_count,-0.02241,-0.144684,-0.003239,-0.102891,0.014457,-0.12362,-0.051458,-0.011577,-0.052992,1.0,-0.008752,-0.011102,-0.238253,-0.01402,0.335501,-0.321583,0.03996,0.065349,-0.081309


In [13]:
fields

['text_word_count',
 'title_word_count',
 'text_sentence_count',
 'title_sentence_count',
 'text_average_word_length',
 'title_average_word_length',
 'text_punctuation_count',
 'title_punctuation_count',
 'text_stopwords_count',
 'title_stopwords_count',
 'syllables',
 'flesch_readability',
 'subjectivity',
 'polarity',
 'Topic 1 Probability',
 'Topic 2 Probability',
 'Topic 3 Probbility',
 'Topic 4 Probability',
 'Topic 5 Probability']

In [14]:
correlation_values = corr_matrix.values
for i in range(columns):
    for j in range(i+1, columns): #correlation matrix is symmetric
        if abs(correlation_values[i,j]) > 0.9:
            print(fields[i] + " and " + fields[j])
        else:
            pass

text_word_count and text_stopwords_count
text_word_count and syllables
text_average_word_length and flesch_readability
text_stopwords_count and syllables


From the code above, it appears we have several features which have high correlations. They are 

1. text_word_count 
2. text_stopwords_count 
3. syllables 
4. text_average_word_length 
5. flesch_readability 

In [15]:
features_to_remove = ['text_word_count', 'text_stopwords_count', 'syllables',
                      'text_average_word_length', 'flesch_readability']
filtered_topics = [x for x in testing.columns if x not in features_to_remove]
filtered_data = testing.loc[:, filtered_topics]

In [16]:
X_updated = filtered_data[filtered_data.columns[filtered_data.columns != 'class']]
Y_updated = filtered_data['class']

In [17]:
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X_updated, Y_updated, test_size = 0.3, random_state = 4222)

In [18]:
GausNB.fit(X_train3, Y_train3)

y_pred3 = GausNB.predict(X_test3)
print(accuracy_score(Y_test3, y_pred3))

0.902655630281083


We now repeat the model training as per what we did above, but this time finding best var_smoothing parameter. In the Gaussian Naive Bayes' model above, we used the default parameter value of $10^{-9}$.

In [48]:
cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=5, 
                                    random_state=4222)

params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}

gs_NB = GridSearchCV(estimator= GausNB,
                     param_grid=params_NB, 
                     cv=cv_method,
                     verbose=1, 
                     scoring='accuracy')

gs_NB.fit(X_train3, Y_train3)
gs_NB.best_params_

Fitting 25 folds for each of 100 candidates, totalling 2500 fits


{'var_smoothing': 2.310129700083158e-05}

In [51]:
y_pred4 = gs_NB.predict(X_test3)
print("Accuracy: " + str(accuracy_score(Y_test3, y_pred4)))

precision = metrics.precision_score(Y_test3, y_pred4)
recall = metrics.recall_score(Y_test3, y_pred4)
f1_score = metrics.f1_score(Y_test3, y_pred4)

print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1-Score: " + str(f1_score))

Accuracy: 0.9180031039834454
Precision: 0.9392812887236679
Recall: 0.8737752161383285
F1-Score: 0.9053448790683787
