In [203]:
# Drop the continuous-valued columns
# (age, capital.gain, capital.loss, hours.per.week, education.num, fnlwgt).
# Don't drop rows with missing values --> treat "missing" as a separate category.
# Use one-hot encoding for the categorical features.

In [204]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 

In [205]:
# Read the data set from adult.csv.
data = pd.read_csv('adult.csv')

# Number of distinct values of each categorical feature.  These per-feature
# totals are needed as the smoothing term when the likelihoods are estimated.
# The features are listed in the same order in which they are one-hot encoded.
categorical_features = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'sex',
    'race',
    'native.country',
]
no_of_categories = [data[feature].nunique() for feature in categorical_features]
no_of_categories





[9, 16, 7, 15, 6, 2, 5, 42]

In [206]:
# Drop the continuous-valued features and one-hot encode the categorical ones.
# The `income` target is encoded too, which yields the two indicator columns
# `income_<=50K` and `income_>50K`.
#
# `data` is rebound to the reduced frame instead of being mutated in place, and
# columns that are already absent are ignored, so re-running this cell no longer
# fails with a KeyError.
continuous_features = [
    'age',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
    'education.num',
    'fnlwgt',
]
data = data.drop(columns=continuous_features, errors='ignore')
data_encoded = pd.get_dummies(
    data,
    columns=[
        'workclass',
        'education',
        'marital.status',
        'occupation',
        'relationship',
        'sex',
        'race',
        'native.country',
        'income',
    ],
    drop_first=False,  # keep every category as its own indicator column
)



In [207]:
# Preview the first rows of the one-hot encoded frame.
data_encoded.head()

Unnamed: 0,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,...,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income_<=50K,income_>50K
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False


In [208]:
# First hold out 30 % of the encoded rows, then halve that hold-out into a
# validation part and a test part.  Both splits use the same fixed seed.
train_data, tmp_data = train_test_split(
    data_encoded,
    test_size=0.3,
    random_state=42,
)
val_data, test_data = train_test_split(
    tmp_data,
    test_size=0.5,
    random_state=42,
)
# Sizes observed in the recorded run:
#   train_data -> 22792 rows
#   val_data   -> 4884 rows


In [209]:
# Class convention used from here on:
#   "yes" <-> income_<=50K      "no" <-> income_>50K
is_low_income = train_data['income_<=50K'].eq(True)
is_high_income = train_data['income_>50K'].eq(True)

train_data_yes = train_data.loc[is_low_income]    # 17291 rows in the recorded run
train_data_no = train_data.loc[is_high_income]    # 5501 rows in the recorded run

In [210]:
# Prior probability of each class with additive (Laplace) smoothing:
#
#     P(class) = (n_class + alpha) / (n_total + alpha * n_classes)
#
# The counts are taken from the per-class training frames rather than typed in
# by hand, so they stay correct if the data or the split changes.  The smoothing
# term is *added* to the class count (multiplying by alpha left the numerator
# unsmoothed and made the two priors sum to less than 1).
alpha = 1
n_classes = 2
yes_samples = len(train_data_yes)           # 17291 in the recorded run
no_samples = len(train_data_no)             # 5501 in the recorded run
total_samples = yes_samples + no_samples    # 22792 in the recorded run
prob_yes = (yes_samples + alpha) / (total_samples + alpha * n_classes)
prob_no = (no_samples + alpha) / (total_samples + alpha * n_classes)

In [211]:

# The two one-hot columns that encode the target.
target_columns = ['income_<=50K', 'income_>50K']

# Ground-truth labels for validation and testing (True means income <= 50K).
# They have to be read before the target columns are removed below.
val_data_labels = val_data['income_<=50K'].values
test_data_labels = test_data['income_<=50K'].values

# Replace every frame by an independent copy that holds the feature columns only.
train_data_yes = train_data_yes.drop(columns=target_columns)
train_data_no = train_data_no.drop(columns=target_columns)
val_data = val_data.drop(columns=target_columns)
test_data = test_data.drop(columns=target_columns)


In [212]:
# Likelihood of every one-hot column given the class, with Laplace smoothing:
#
#     P(feature = value | class) = (count + alpha) / (n_class + alpha * K)
#
# where K is the number of categories of the feature the column belongs to.
col_names = train_data_yes.columns.to_numpy()
values_yes = train_data_yes.to_numpy()
values_no = train_data_no.to_numpy()

# K for every column: each feature contributes as many consecutive one-hot
# columns as it has categories, so its category count is repeated that many
# times.  This replaces the running "category_counter" bookkeeping, which
# advanced the group boundary by (size - 1) instead of size and therefore
# smoothed many columns with the category count of the wrong feature.
categories_per_col = np.repeat(no_of_categories, no_of_categories)
if len(categories_per_col) != values_yes.shape[1]:
    raise ValueError(
        "no_of_categories does not match the one-hot encoded feature columns: "
        f"expected {len(categories_per_col)} columns, found {values_yes.shape[1]}"
    )

# prob(feature | yes): how often each column is active among the "yes" rows
prob_given_yes = (
    (values_yes.sum(axis=0) + alpha) / (yes_samples + alpha * categories_per_col)
).tolist()

# prob(feature | no): how often each column is active among the "no" rows
prob_given_no = (
    (values_no.sum(axis=0) + alpha) / (no_samples + alpha * categories_per_col)
).tolist()

    



In [213]:
# Predict the validation labels and report the misclassification rate.
# Inputs: prob_yes, prob_no, prob_given_yes, prob_given_no, val_data_labels
#
# For each class the score of a sample is
#     log P(class) + sum of log P(column | class) over the columns that are active,
# and the class with the larger score wins.
val_data_arr = val_data.to_numpy()  # 4884 rows in the recorded run
log_yes = np.log(prob_yes)
log_no = np.log(prob_no)

# Log-likelihoods are computed once per column instead of once per
# (sample, column) pair inside the scoring loops.
log_given_yes = np.log(np.asarray(prob_given_yes, dtype=float))
log_given_no = np.log(np.asarray(prob_given_no, dtype=float))

# 1.0 where a one-hot column is active for the sample, 0.0 otherwise; the
# matrix-vector product then sums exactly the active columns' log-likelihoods.
active = (val_data_arr == True).astype(float)

prediction_yes = (log_yes + active @ log_given_yes).tolist()   # one score per validation row
prediction_no = (log_no + active @ log_given_no).tolist()

# True  -> predicted income <= 50K ("yes")
# False -> predicted income  > 50K ("no"); ties go to "no"
income_low = [
    score_yes > score_no
    for score_yes, score_no in zip(prediction_yes, prediction_no)
]

no_of_correct_predictions = sum(
    1
    for predicted, actual in zip(income_low, val_data_labels)
    if predicted == actual
)

print(f"number of correct predictions = {no_of_correct_predictions}")
print(f"miss classification rate = {1-(no_of_correct_predictions/len(val_data_labels))}")







    



number of correct predictions = 3856
miss classification rate = 0.2104832104832105
