In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mean,stdev

In [44]:
def standard_normal_distribution(x,avg,std):
    return np.exp(-0.5*(x-avg)*(x-avg)/(std*std))/(np.sqrt(2*np.pi))

In [45]:
def evaluation_summary(label,model_metrics):
    print(f"Model Name: {label}\n")
    print(f"Average Accuracy: {model_metrics['Accuracy']}\n")

    if len(model_metrics)>2:
        print(f"Average Precision: {model_metrics['Precision']}\n")
        print(f"Average Recall: {model_metrics['Recall']}\n")
        print(f"Average F1_Score: {model_metrics['F1_score']}\n")
    print(f"Confusion_Matrix:\n {model_metrics['Confusion_Matrix']}")

In [46]:
def Eval_metrics(y_test,y_pred,num_classes):

    confusion_matrix=np.zeros((num_classes,num_classes))
    for i in range(y_test.shape[0]):
        confusion_matrix[y_pred[i]][y_test[i]]=confusion_matrix[y_pred[i]][y_test[i]]+1




    accuracy=0;
    for i in range(num_classes):
        accuracy=accuracy+confusion_matrix[i][i]
    accuracy=accuracy/y_test.shape[0]



    return {'Accuracy':accuracy,'Confusion_Matrix':confusion_matrix}

In [47]:
def Eval_metrics_naive_bayes(y_test,y_pred):
    tp=0
    fp=0
    tn=0
    fn=0
    for i in range(y_test.shape[0]):
        if y_test[i]==y_pred[i] and y_test[i]==1:
            tp=tp+1
        elif y_test[i]==y_pred[i] and y_test[i]==0:
            tn=tn+1
        elif y_test[i]!=y_pred[i] and y_pred[i]==1:
            fp=fp+1
        else:
            fn=fn+1

    accuracy=(tn+tp)/(tn+tp+fn+fp)
    precision= tp / (tp + fp)
    recall=tp/(tp+fn)
    f1_score=2*precision*recall/(precision+recall)
    confusion_matrix=[(tp,fp),(fn,tn)]
    df=pd.DataFrame(confusion_matrix)
    df.index.name=None

    return {'Accuracy':accuracy,'Precision':precision,'Recall':recall,'F1_score':f1_score,'Confusion_Matrix':df}

**Part A** - *Naive Bayes Classifier to Predict Income*

Task 1- *Data Preprocessing*

In [48]:
df=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",index_col=False,names=["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country",">=50K"])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,>=50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  >=50K           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [50]:
print(df['race'].unique(), '\n', df['race'].nunique())

[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other'] 
 5


In [51]:
df['race']=df['race'].replace([' Black', " White", ' Asian-Pac-Islander', ' Other', ' Amer-Indian-Eskimo'],
                              [1, 2, 3, 4, 5]
                              )

In [52]:
print(df['marital-status'].unique(), '\n', df['marital-status'].nunique())

[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] 
 7


In [53]:
df['marital-status']=df['marital-status'].replace([' Never-married', ' Married-civ-spouse', ' Divorced'
                                                   ,' Married-spouse-absent', ' Separated', ' Married-AF-spouse', ' Widowed'],
                              [1, 2, 3, 4, 5,6,7]
                              )

In [54]:
print(df['workclass'].unique(), '\n', df['workclass'].nunique())

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked'] 
 9


In [55]:
df['workclass'].replace([' Private', ' Local-gov', ' ?', ' Self-emp-not-inc', ' Federal-gov', ' State-gov',
                         ' Self-emp-inc', ' Without-pay', ' Never-worked'], [1, 2, 3, 4, 5, 6, 7, 8, 9], inplace=True)

In [56]:
print(df['education'].unique(), '\n',df['occupation'].nunique())

[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th'] 
 15


In [57]:
df['education'].replace([' 11th', ' HS-grad', ' Assoc-acdm', ' Some-college', ' 10th', ' Prof-school',
                         ' 7th-8th', ' Bachelors', ' Masters', ' Doctorate', ' 5th-6th', ' Assoc-voc', ' 9th',
                         ' 12th', ' 1st-4th', ' Preschool'], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inplace=True)

In [58]:
print(df['occupation'].unique(), "\n" ,df['occupation'].nunique())

[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv'] 
 15


In [59]:
df['occupation'].replace([' Machine-op-inspct', ' Farming-fishing', ' Protective-serv', ' ?',
                          ' Other-service', ' Prof-specialty', ' Craft-repair', ' Adm-clerical',
                          ' Exec-managerial', ' Tech-support', ' Sales', ' Priv-house-serv',
                          ' Transport-moving', ' Handlers-cleaners', ' Armed-Forces'],
                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], inplace=True
                         )

In [60]:
print(df['relationship'].unique(), '\n', df['relationship'].nunique())

[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative'] 
 6


In [61]:
df['relationship'].replace([' Own-child', ' Husband', ' Not-in-family', ' Unmarried', ' Wife', ' Other-relative'],
                           [1, 2, 3, 4, 5, 6], inplace=True)

In [62]:
print(df['sex'].unique(), '\n', df['sex'].nunique())

[' Male' ' Female'] 
 2


In [63]:
df['sex'].replace([' Male', ' Female'], [0, 1], inplace=True)

In [64]:
l = []
for i in range(42):
    l.append(i)

In [65]:
df['native-country'].replace([' United-States', ' ?', ' Peru', ' Guatemala', ' Mexico', ' Dominican-Republic',
                              ' Ireland', ' Germany', ' Philippines', ' Thailand', ' Haiti', ' El-Salvador',
                              ' Puerto-Rico', ' Vietnam', ' South', ' Columbia', ' Japan', ' India', ' Cambodia',
                              ' Poland', ' Laos', ' England', ' Cuba', ' Taiwan', ' Italy', ' Canada', ' Portugal',
                              ' China', ' Nicaragua', ' Honduras', ' Iran', ' Scotland', ' Jamaica', ' Ecuador',
                              ' Yugoslavia', ' Hungary', ' Hong', ' Greece', ' Trinadad&Tobago',
                              ' Outlying-US(Guam-USVI-etc)', ' France', ' Holand-Netherlands'], l, inplace=True)

In [66]:
print(df['native-country'].unique(), '\n', df['native-country'].nunique())

[ 0 22 32 17  1  4 14 12 29 21 25  7 30  8 24 19 15 18  9 33 20 23 10 26
  5 11 40  3 27 16 34  2 39 31 38 37 28 13 36  6 35 41] 
 42


In [67]:
df['>=50K'].replace([' <=50K', ' >50K'], [0, 1], inplace=True)

In [68]:
df["age"]=(df["age"]-df["age"].mean())/df["age"].std()
df["fnlwgt"]=(df["fnlwgt"]-df["fnlwgt"].mean())/df["fnlwgt"].std()
df["education-num"]=(df["education-num"]-df["education-num"].mean())/df["education-num"].std()
df["capital-gain"]=(df["capital-gain"]-df["capital-gain"].mean())/df["capital-gain"].std()
df["capital-loss"]=(df["capital-loss"]-df["capital-loss"].mean())/df["capital-loss"].std()
df["hours-per-week"]=(df["hours-per-week"]-df["hours-per-week"].mean())/df["hours-per-week"].std()

In [69]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,>=50K
0,0.030670,6,-1.063594,8,1.134721,1,8,3,2,0,0.148451,-0.216656,-0.035429,0,0
1,0.837096,4,-1.008692,8,1.134721,2,9,2,2,0,-0.145918,-0.216656,-2.222119,0,0
2,-0.042641,1,0.245075,2,-0.420053,3,14,3,2,0,-0.145918,-0.216656,-0.035429,0,0
3,1.057031,1,0.425795,1,-1.197440,2,14,2,1,0,-0.145918,-0.216656,-0.035429,0,0
4,-0.775756,1,1.408154,8,1.134721,2,6,5,1,1,-0.145918,-0.216656,-0.035429,22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.849067,1,0.639731,3,0.746028,2,10,5,2,1,-0.145918,-0.216656,-0.197406,0,0
32557,0.103982,1,-0.335428,2,-0.420053,2,1,2,2,0,-0.145918,-0.216656,-0.035429,0,1
32558,1.423588,1,-0.358772,2,-0.420053,7,8,4,2,1,-0.145918,-0.216656,-0.035429,0,0
32559,-1.215625,1,0.110958,2,-0.420053,1,8,1,2,0,-0.145918,-0.216656,-1.655199,0,0


Task 2- *Naive Bayes Classifier Implementation*

In [70]:
train_data=df.sample(frac=0.67,random_state=42)
test_data=pd.concat([df,train_data]).drop_duplicates(keep=False)
X_train=train_data.drop('>=50K',axis=1).values
Y_train=train_data['>=50K'].values
X_test=test_data.drop('>=50K',axis=1).values
Y_test=test_data['>=50K'].values

In [71]:
X_train

array([[-0.84906741,  1.        , -0.28043934, ..., -0.2166562 ,
        -0.19740595,  0.        ],
       [ 0.47053884,  6.        , -1.31891422, ..., -0.2166562 ,
        -0.0354289 ,  0.        ],
       [-0.70244449,  1.        , -0.03666857, ..., -0.2166562 ,
         1.17939893,  0.        ],
       ...,
       [ 1.35027633,  1.        ,  1.62525504, ..., -0.2166562 ,
        -0.0354289 ,  0.        ],
       [-0.92237887,  1.        , -1.42836948, ..., -0.2166562 ,
        -0.0354289 ,  5.        ],
       [ 0.25060446,  1.        , -0.16703335, ..., -0.2166562 ,
        -0.0354289 ,  0.        ]])

In [72]:
Continuous_features=[0,2,4,10,11,12]

In [73]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,>=50K
0,0.03067,6,-1.063594,8,1.134721,1,8,3,2,0,0.148451,-0.216656,-0.035429,0,0
1,0.837096,4,-1.008692,8,1.134721,2,9,2,2,0,-0.145918,-0.216656,-2.222119,0,0
2,-0.042641,1,0.245075,2,-0.420053,3,14,3,2,0,-0.145918,-0.216656,-0.035429,0,0
3,1.057031,1,0.425795,1,-1.19744,2,14,2,1,0,-0.145918,-0.216656,-0.035429,0,0
4,-0.775756,1,1.408154,8,1.134721,2,6,5,1,1,-0.145918,-0.216656,-0.035429,22,0


In [74]:
pos_neg_params=[]
for i in Continuous_features:
    pos=[]
    neg=[]
    for j in range(X_train.shape[0]):
        if Y_train[j]==1:
            pos.append(X_train[j][i])
        else:
            neg.append(X_train[j][i])
    temp=[]
    temp.append(mean(pos))
    temp.append(stdev(pos))
    temp.append(mean(neg))
    temp.append(stdev(neg))
    pos_neg_params.append(temp)

In [75]:
def prior_probablity_each_class(Y_train,target):
    count=0
    for i in range(Y_train.shape[0]):
        if Y_train[i]==target:
            count=count+1
    return count/Y_train.shape[0]

In [76]:
def conditional_probability_feature(feature_index,X_train,Y_train,target_class,feature):
    if feature_index in Continuous_features:
        pos=0
        for i in range(len(Continuous_features)):
            if Continuous_features[i]==feature_index:
                pos=i
                break
        num=0
        if target_class==1:
            num=standard_normal_distribution(feature,pos_neg_params[pos][0],pos_neg_params[pos][1])
        else:
            num=standard_normal_distribution(feature,pos_neg_params[pos][2],pos_neg_params[pos][3])
        return num
    else:
        den=0
        count=0
        for i in range(X_train.shape[0]):
            if Y_train[i]==target_class:
                den=den+1
                if X_train[i][feature_index]==feature:
                    count=count+1
        return count/den


In [77]:
def predict(X_train,Y_train,X_test,i):
    p_pos=prior_probablity_each_class(Y_train,1)
    p_neg=prior_probablity_each_class(Y_train,0)
    for j in range(14):
        p_pos=p_pos*conditional_probability_feature(j,X_train,Y_train,1,X_test[i][j])
    for j in range(14):
        p_neg=p_neg*conditional_probability_feature(j,X_train,Y_train,0,X_test[i][j])
    if p_pos>=p_neg:
        return 1
    else:
        return 0

In [38]:
ten_split_metrics={}
for j in range(10):
  train_datai=df.sample(frac=0.67,random_state=np.random.randint((1,100)))
  test_datai=pd.concat([df,train_datai]).drop_duplicates(keep=False)
  X_traini=train_datai.drop('>=50K',axis=1).values
  Y_traini=train_datai['>=50K'].values
  X_testi=test_datai.drop('>=50K',axis=1).values
  Y_testi=test_datai['>=50K'].values
  Y_pred=[]
  for i in range(X_testi.shape[0]):
      Y_pred.append(predict(X_traini,Y_traini,X_testi,i))
  metrics=Eval_metrics_naive_bayes(Y_testi[:1000],Y_pred)
  if j==0:
    ten_split_metrics=metrics
  else:
    ten_split_metrics=dict(tuple(ten_split_metrics.items())+tuple(metrics.items()))  
ten_split_metrics_final={}
for key,value in ten_split_metrics.items():
  ten_split_metrics_final[key]=value/10

In [None]:
evaluation_summary('Naive',ten_split_metrics_final)

Model Name: Naive

Accuracy: 0.8162751677852349

Precision: 0.5964017991004498

Recall: 0.760902830910482

F1_Score: 0.6686838124054462

Confusion_Matrix:
       0     1
0  1989  1346
1   625  6768


#Laplace Smoothing techinque

In [78]:
def conditional_probability_feature_with_smoothing(feature_index,X_train,Y_train,target_class,feature):
    if feature_index in Continuous_features:
        pos=0
        for i in range(len(Continuous_features)):
            if Continuous_features[i]==feature_index:
                pos=i
                break

        num=0
        if target_class==1:
            num=standard_normal_distribution(feature,pos_neg_params[pos][0],pos_neg_params[pos][1])
        else:
            num=standard_normal_distribution(feature,pos_neg_params[pos][2],pos_neg_params[pos][3])

        return num
    else:
        den=0
        count=0
        nc=0
        n=0
        m=0
        for i in range(X_train.shape[0]):
            if Y_train[i]==target_class:
                den=den+1
                n=n+1
                if X_train[i][feature_index]==feature:
                    nc=nc+1
                    count=count+1
        if count!=0:
            return count/den
        # To handle 0 probabilities
        m=len(np.unique(df.iloc[:,feature_index]))
        return (nc+1)/(n+m)





In [79]:
def predict_smoothing(X_train,Y_train,X_test,i):
    p_pos=prior_probablity_each_class(Y_train,1)
    p_neg=prior_probablity_each_class(Y_train,0)
    for j in range(14):
        p_pos=p_pos*conditional_probability_feature_with_smoothing(j,X_train,Y_train,1,X_test[i][j])
    for j in range(14):
        p_neg=p_neg*conditional_probability_feature_with_smoothing(j,X_train,Y_train,0,X_test[i][j])
    if p_pos>=p_neg:
        return 1
    else:
        return 0

In [82]:
ten_split_metrics={}
for j in range(10):
  train_datai=df.sample(frac=0.67,random_state=np.random.randint((1,100)))
  test_datai=pd.concat([df,train_datai]).drop_duplicates(keep=False)
  X_traini=train_datai.drop('>=50K',axis=1).values
  Y_traini=train_datai['>=50K'].values
  X_testi=test_datai.drop('>=50K',axis=1).values
  Y_testi=test_datai['>=50K'].values
  Y_pred=[]
  for i in range(X_testi.shape[0]):
      Y_pred.append(predict_smoothing(X_traini,Y_traini,X_testi,i))
  metrics=Eval_metrics_naive_bayes(Y_testi,Y_pred)
  if j==0:
    ten_split_metrics=metrics
  else:
    ten_split_metrics=dict(tuple(ten_split_metrics.items())+tuple(metrics.items()))  
ten_split_metrics_final={}
for key,value in ten_split_metrics.items():
  ten_split_metrics_final[key]=value/1

In [83]:
evaluation_summary('Naive Bayes(Laplace Smoothing)',ten_split_metrics_final)

Model Name: Naive Bayes(Laplace Smoothing)

Average Accuracy: 0.824

Average Precision: 0.609375

Average Recall: 0.7926829268292683

Average F1_Score: 0.6890459363957597

Confusion_Matrix:
        0      1
0  195.0  125.0
1   51.0  629.0


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression()
classifier.fit(X_train,Y_train)
Y_pred=classifier.predict(X_test)
metrics=Eval_metrics_naive_bayes(Y_pred,Y_test)
evaluation_summary('Logistic Regression',metrics)

Model Name: Logistic Regression

Accuracy: 0.8250372856077554

Precision: 0.4525631216526396

Recall: 0.7262124002455494

F1_Score: 0.5576243224133868

Confusion_Matrix:
       0     1
0  1183  1431
1   446  7668


# K Nearest Neighbours Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier=KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train,Y_train)
Y_pred=classifier.predict(X_test)
metrics=Eval_metrics_naive_bayes(Y_pred,Y_test)
evaluation_summary('K-Nearest Neighbours',metrics)

Model Name: K-Nearest Neighbours

Accuracy: 0.8187919463087249

Precision: 0.5734506503442999

Recall: 0.6439003436426117

F1_Score: 0.6066369890732497

Confusion_Matrix:
       0     1
0  1499  1115
1   829  7285
