# Experiment 2 - Naive Bayes Classifier 
#### Data Preprocessing 
#### Train / Test splitting
#### Building Naive Bayes Classifier 
#### Evaluating Classifier

In [1]:


import pandas as pd
from sklearn.model_selection import train_test_split
import random
import math
import statistics

In [2]:
df = pd.read_csv("adult_csv.csv")
print("Shape :",df.shape)
df.head()

Shape : (48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K


In [3]:
# Preprocessing

df = df.drop(["capitalgain",'capitalloss','native-country'],axis = 1)


print("Shape",df.shape)
df.head()

index_drop = []
for index,row in df.iterrows():
    if row.isnull().values.any():
        index_drop.append(index)


df.drop(index_drop,inplace=True)
print("After empty value removal Shape:",df.shape)
print("No of null values :", df.isna().values.sum())

random.seed(100)


x_train ,x_test = train_test_split(df,test_size=0.1)       
print("x_train shape :",x_train.shape,"x_test shape :",x_test.shape)
#Splitting dataset into train and test



Shape (48842, 12)
After empty value removal Shape: (46033, 12)
No of null values : 0
x_train shape : (41429, 12) x_test shape : (4604, 12)


In [4]:
#Building the classifier

# Functions essentials to estimate probabilities of attributes
no_of_rows,no_of_cols = x_train.shape 

def prob_norm_distribution(x, mean, stdev):
    if stdev == 0:
        stdev = float(10 ** 5)
    expo_val = math.exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo_val

def get_mean(numbers):
	return sum(numbers)/float(no_of_rows)

def stdev(numbers):
	avg = get_mean(numbers)
	variance = sum([(x-avg)**2 for x in numbers]) / float(no_of_rows-1)
	return math.sqrt(variance)

cont_attr = ["age","fnlwgt","education-num","hoursperweek"]

discrete_attr = []

for i in df.columns:

    if i not in cont_attr:
        discrete_attr.append(i)




In [5]:
values1 = df.loc[df["class"] == "<=50K" ]
values2 = df.loc[df["class"] == ">=50K" ]
def naivebayes(list_of_val):
    prob1 = 1
    prob2 = 1
    
# Using Normal distribution to calculate probability for continous attributes
    for col in cont_attr:

        prob1 = prob1 * prob_norm_distribution(list_of_val[col],get_mean(values1[col]),stdev(values1[col]))
        prob2 = prob2 * prob_norm_distribution(list_of_val[col],get_mean(values2[col]),stdev(values2[col]))

# Using Laplaschian smoothening to calculate probability of discrete attributes
    for col in discrete_attr:

        type_of_val  = list_of_val[col]
        prob =  (len(values1.loc[values1[col] == type_of_val]) + 5 )/(len(values1) + 5)
        prob1 *= prob

        prob =  (len(values2.loc[values2[col] == type_of_val]) + 5 )/(len(values2) + 5)
        prob2 *= prob

    
    if prob1 > prob2:
        return "<=50K"

    else:
        return ">=50K"

    






In [6]:
#Calculating accuracy of classifier
count = 0
for index,row in x_test.iterrows():
    # print(row["workclass"])
    # sent_val = pd.DataFrame(row,columns =['age', 'workclass', 'fnlwgt', 'education', 'education-num',
    #    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    #    'hoursperweek', 'class'])

    # print(sent_val)
    prediction = naivebayes(row)
    # print(prediction)

    if row["class"] == prediction:
        count += 1


print("Correctly Predicted: ",count)
print("Total no of predictions :",len(x_test))
print("Accuracy : ",float(count/len(x_test))) 

Correctly Predicted:  3440
Total no of predictions : 4604
Accuracy :  0.7471763683753258
