<center>
    <H1> NAIVE BAYES CLASSIFIER </H1>
    <br>

In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
dataset = pd.read_csv('data/twitter_dataset.csv', encoding = 'latin-1')  #load data from csv
dataset.head()

Unnamed: 0,name_wt,statuses_count,followers_count,friends_count,favourites_count,listed_count,label
0,0.6,195,19,53,58,0,0
1,0.705882,9,67,555,2,1,0
2,0.916667,20,21,267,0,0,1
3,0.5,28,16,325,0,0,1
4,0.733333,45,20,515,0,0,1


In [3]:
dataset.shape

(2818, 7)

In [4]:
features=[]
for attributes in dataset.columns:
    if attributes != 'label':
        features.append(attributes)
features

['name_wt',
 'statuses_count',
 'followers_count',
 'friends_count',
 'favourites_count',
 'listed_count']

In [5]:
#Creating a 2D matrix 
data = dataset.values

In [6]:
print("Total instances : ", data.shape[0], "\nNumber of features : ", data.shape[1])

Total instances :  2818 
Number of features :  7


In [7]:
#convert label column into 1D arrray
label = np.array(dataset['label'])

In [8]:
'''
    We have X_train, y_train, X_test, y_test.
    Using these lists and dataframes we will randomly create two non-overlapping datasets 
        1. training set
        2. testing set
'''

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=0)

print("Number of training instances: ", X_train.shape[0])
print("Number of testing instances: ", X_test.shape[0])

Number of training instances:  2254
Number of testing instances:  564


In [9]:
# Generate the model
nb_model = GaussianNB()

# Train the model using the training sets
data = X_train
label = y_train

nb_model.fit(data, label)

GaussianNB()

In [10]:
nb_model.predict([X_test[1]])    #testing for single instance

array([0])

In [11]:
'''
   Now, apply the model to the entire test set and predict the label for each test example

'''       
       
y_predict = []                       #to store prediction of each test example

for test_case in range(len(X_test)): 
    label = nb_model.predict([X_test[test_case]])
    
    #append to the predictions list
    y_predict.append(label.item())

#predictions

In [12]:
#true negatives is C(0,0), false negatives is C(1,0), false positives is C(0,1) and true positives is C(1,1) 
conf_matrix = confusion_matrix(y_test, y_predict)

In [13]:
#true_negative
TN = conf_matrix[0][0]
#false_negative
FN = conf_matrix[1][0]
#false_positive
FP = conf_matrix[0][1]
#true_positive
TP = conf_matrix[1][1]

In [14]:
# Recall is the ratio of the total number of correctly classified positive examples divided by the total number of positive examples. 
# High Recall indicates the class is correctly recognized (small number of FN)

recall = (TP)/(TP + FN)

In [15]:
# Precision is the the total number of correctly classified positive examples divided by the total number of predicted positive examples. 
# High Precision indicates an example labeled as positive is indeed positive (small number of FP)

precision = (TP)/(TP + FP)

In [16]:
fmeasure = (2*recall*precision)/(recall+precision)   #f-measure is the harmonice mean of Recall and Precision
accuracy = (TP + TN)/(TN + FN + FP + TP) #Total number of correct predictions divided by total number of instances predicted

accuracy_score(y_test, y_predict)

0.875886524822695

In [17]:
print("------ CLASSIFICATION PERFORMANCE OF THE NAIVE BAYES MODEL ------ \n"\
      "\n Recall : ", (recall*100) ,"%" \
      "\n Precision : ", (precision*100) ,"%" \
      "\n Accuracy : ", (accuracy*100) ,"%" \
      "\n F-measure : ", (fmeasure*100) ,"%" )


------ CLASSIFICATION PERFORMANCE OF THE NAIVE BAYES MODEL ------ 

 Recall :  98.56630824372759 %
 Precision :  80.64516129032258 %
 Accuracy :  87.58865248226951 %
 F-measure :  88.70967741935483 %
