In [59]:
# Required Libraries
import pandas as pd          
import numpy as np              
from sklearn.model_selection import train_test_split   
from sklearn.naive_bayes import GaussianNB             
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import accuracy_score                            


In [60]:
df = pd.read_csv("bank-data.csv", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,id,age,sex,region,income,married,children,car,save_act,current_act,mortgage,pep
1,ID12101,48,FEMALE,INNER_CITY,17546,NO,1,NO,NO,NO,NO,YES
2,ID12102,40,MALE,TOWN,30085.1,YES,3,YES,NO,YES,YES,NO
3,ID12103,51,FEMALE,INNER_CITY,16575.4,YES,0,YES,YES,YES,NO,NO
4,ID12104,23,FEMALE,TOWN,20375.4,YES,3,NO,NO,YES,NO,NO


In [61]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,479,50,2,4,478,2,4,2,2,2,2,2
1,0,30,0,0,100,0,1,0,0,0,0,1
2,1,22,1,3,287,1,3,1,0,1,1,0
3,2,33,0,0,86,1,0,1,1,1,0,0
4,3,5,0,3,139,1,3,0,0,1,0,0


In [62]:
X = df[df.columns[:-1]]
Y = df[df.columns[-1]]

In [63]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=30)

In [64]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [65]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=30)
Y_pred = gnb.fit(X_train, Y_train).predict(X_test)

In [66]:
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (Y_test != Y_pred).sum()))

Number of mislabeled points out of a total 144 points : 51


In [67]:
print(confusion_matrix(Y_test, Y_pred))
print("\n Accuracy")
print(accuracy_score(Y_test, Y_pred))


[[60 20  0]
 [30 33  0]
 [ 1  0  0]]

 Accuracy
0.6458333333333334


Q1 – Naive Bayes with and without irrelevant attribute

In [68]:
df = pd.read_csv("bank-data.csv", header=None)

le = preprocessing.LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])

X = df[df.columns[:-1]]
Y = df[df.columns[-1]]

np.random.seed(30)
X["current_act"] = np.random.randint(0, 5, X.shape[0])

X.columns = X.columns.astype(str)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=30)

gnb = GaussianNB()
Y_pred_with = gnb.fit(X_train, Y_train).predict(X_test)
print("Naive Bayes with irrelevant attribute:", accuracy_score(Y_test, Y_pred_with))

X_no_irrel = X.drop(columns=["current_act"])
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_no_irrel, Y, test_size=0.30, random_state=30)
Y_pred_without = gnb.fit(X_train2, Y_train2).predict(X_test2)
print("Naive Bayes without irrelevant attribute:", accuracy_score(Y_test2, Y_pred_without))

Naive Bayes with irrelevant attribute: 0.6458333333333334
Naive Bayes without irrelevant attribute: 0.6458333333333334


Q2 – KNN with and without irrelevant attribute

In [69]:
from sklearn.neighbors import KNeighborsClassifier
df = pd.read_csv("car.csv", header=None)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
Y_pred_knn_with = knn.predict(X_test)
print("KNN with irrelevant attribute:", accuracy_score(Y_test, Y_pred_knn_with))

knn.fit(X_train2, Y_train2)
Y_pred_knn_without = knn.predict(X_test2)
print("KNN without irrelevant attribute:", accuracy_score(Y_test2, Y_pred_knn_without))

KNN with irrelevant attribute: 0.5277777777777778
KNN without irrelevant attribute: 0.5277777777777778


Q3 – Naive Bayes on car.csv

In [70]:
print("Naive Bayes accuracy on car dataset:", accuracy_score(Y_test2, Y_pred_without))

Naive Bayes accuracy on car dataset: 0.6458333333333334


Q4 – Correlation between attributes

In [71]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv("car.csv")
le = LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])
corr_matrix = df.corr()
corr_pairs = corr_matrix.unstack().sort_values()
corr_pairs = corr_pairs[corr_pairs != 1]

print("Correlation values in ascending order:\n")
print(corr_pairs)


Correlation values in ascending order:

unacc    2.1       -2.992590e-01
2.1      unacc     -2.992590e-01
2        unacc     -3.094953e-02
unacc    2         -3.094953e-02
         low       -2.104530e-02
low      unacc     -2.104530e-02
vhigh    vhigh.1   -1.043357e-03
vhigh.1  vhigh     -1.043357e-03
vhigh    small     -9.523677e-04
small    vhigh     -9.523677e-04
vhigh.1  small     -9.523677e-04
small    vhigh.1   -9.523677e-04
2        2.1       -9.523677e-04
2.1      2         -9.523677e-04
low      vhigh     -3.200145e-16
vhigh    low       -3.200145e-16
low      vhigh.1   -1.012963e-16
vhigh.1  low       -1.012963e-16
small    low       -7.231156e-18
low      small     -7.231156e-18
2.1      low        2.362177e-17
low      2.1        2.362177e-17
         2          6.731959e-17
2        low        6.731959e-17
2.1      small      8.693132e-04
small    2.1        8.693132e-04
vhigh    2.1        9.523677e-04
2.1      vhigh      9.523677e-04
vhigh.1  2.1        9.523677e-04
2.1

Q5 – Remove one highly correlated attribute & apply Naive Bayes

In [72]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv("car.csv")
corr_matrix = X_no_irrel.corr()
corr_unstack = corr_matrix.where(~np.eye(corr_matrix.shape[0],dtype=bool)).unstack().dropna()
high_corr_pair = corr_unstack.abs().idxmax()

X_reduced = X_no_irrel.drop(columns=[high_corr_pair[0]])
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X_reduced, Y, test_size=0.30, random_state=30)
Y_pred_reduced = gnb.fit(X_train3, Y_train3).predict(X_test3)
print("Naive Bayes after removing correlated attribute:", accuracy_score(Y_test3, Y_pred_reduced))


Naive Bayes after removing correlated attribute: 0.6527777777777778
