In [10]:
# Required Libraries
import pandas as pd          
import numpy as np              
from sklearn.model_selection import train_test_split   
from sklearn.naive_bayes import GaussianNB             
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import accuracy_score                            


In [11]:
df = pd.read_csv("bank-data.csv", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,id,age,sex,region,income,married,children,car,save_act,current_act,mortgage,pep
1,ID12101,48,FEMALE,INNER_CITY,17546,NO,1,NO,NO,NO,NO,YES
2,ID12102,40,MALE,TOWN,30085.1,YES,3,YES,NO,YES,YES,NO
3,ID12103,51,FEMALE,INNER_CITY,16575.4,YES,0,YES,YES,YES,NO,NO
4,ID12104,23,FEMALE,TOWN,20375.4,YES,3,NO,NO,YES,NO,NO


In [12]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,479,50,2,4,478,2,4,2,2,2,2,2
1,0,30,0,0,100,0,1,0,0,0,0,1
2,1,22,1,3,287,1,3,1,0,1,1,0
3,2,33,0,0,86,1,0,1,1,1,0,0
4,3,5,0,3,139,1,3,0,0,1,0,0


In [13]:
X = df[df.columns[:-1]]
Y = df[df.columns[-1]]

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=30)

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=30)
Y_pred = gnb.fit(X_train, Y_train).predict(X_test)

In [None]:
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (Y_test != Y_pred).sum()))

Number of mislabeled points out of a total 144 points : 51


In [18]:
print(confusion_matrix(Y_test, Y_pred))
print("\n Accuracy")
print(accuracy_score(Y_test, Y_pred))


[[60 20  0]
 [30 33  0]
 [ 1  0  0]]

 Accuracy
0.6458333333333334


Q1 – Naive Bayes with and without irrelevant attribute

In [None]:
df = pd.read_csv("car.csv", header=None)

le = preprocessing.LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])

X = df[df.columns[:-1]]
Y = df[df.columns[-1]]

np.random.seed(30)
X["current_act"] = np.random.randint(0, 5, X.shape[0])

X.columns = X.columns.astype(str)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=30)

gnb = GaussianNB()
Y_pred_with = gnb.fit(X_train, Y_train).predict(X_test)
print("Naive Bayes with irrelevant attribute:", accuracy_score(Y_test, Y_pred_with))

X_no_irrel = X.drop(columns=["current_act"])
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_no_irrel, Y, test_size=0.30, random_state=30)
Y_pred_without = gnb.fit(X_train2, Y_train2).predict(X_test2)
print("Naive Bayes without irrelevant attribute:", accuracy_score(Y_test2, Y_pred_without))

Naive Bayes with irrelevant attribute: 0.6319845857418112
Naive Bayes without irrelevant attribute: 0.6242774566473989


Q2 – KNN with and without irrelevant attribute

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
Y_pred_knn_with = knn.predict(X_test)
print("KNN with irrelevant attribute:", accuracy_score(Y_test, Y_pred_knn_with))

knn.fit(X_train2, Y_train2)
Y_pred_knn_without = knn.predict(X_test2)
print("KNN without irrelevant attribute:", accuracy_score(Y_test2, Y_pred_knn_without))

KNN with irrelevant attribute: 0.8073217726396917
KNN without irrelevant attribute: 0.8901734104046243


Q3 – Naive Bayes on car.csv

In [None]:
print("Naive Bayes accuracy on car dataset:", accuracy_score(Y_test2, Y_pred_without))

Naive Bayes accuracy on car dataset: 0.6242774566473989


Q4 – Correlation between attributes

In [None]:
correlation_matrix = df.corr()
correlations = correlation_matrix.unstack().sort_values().drop_duplicates()
print(correlations)

3  6   -2.994683e-01
6  2   -3.132740e-02
5  6   -2.104372e-02
0  1   -3.436808e-16
   5   -2.439415e-16
   2   -2.097088e-16
   4   -1.691262e-16
1  4   -1.549092e-16
5  1   -1.404810e-16
2  1   -6.466021e-17
3  1   -4.272143e-17
   0   -4.089152e-17
2  4   -1.942523e-17
4  5   -1.541976e-17
3  4    3.854941e-18
   2    1.956599e-17
5  3    3.392348e-17
   2    1.062757e-16
4  6    3.318432e-02
1  6    4.019365e-02
6  0    5.142422e-02
0  0    1.000000e+00
dtype: float64


Q5 – Remove one highly correlated attribute & apply Naive Bayes

In [24]:
corr_matrix = X_no_irrel.corr()
corr_unstack = corr_matrix.where(~np.eye(corr_matrix.shape[0],dtype=bool)).unstack().dropna()
high_corr_pair = corr_unstack.abs().idxmax()

X_reduced = X_no_irrel.drop(columns=[high_corr_pair[0]])
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X_reduced, Y, test_size=0.30, random_state=30)
Y_pred_reduced = gnb.fit(X_train3, Y_train3).predict(X_test3)
print("Naive Bayes after removing correlated attribute:", accuracy_score(Y_test3, Y_pred_reduced))


Naive Bayes after removing correlated attribute: 0.6184971098265896
