In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, confusion_matrix, classification_report
from sklearn import metrics
import matplotlib.pyplot as plt

# Loading Dataset

In [None]:
# Read the heart-disease records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='D3_Heart_Dataset.csv')
# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# Separating Features

In [None]:
# Keep only the rows whose Oldpeak value is non-negative.
# NOTE(review): presumably done because MultinomialNB (fitted later in this
# notebook) does not accept negative feature values -- confirm the intent.
df = df.loc[df['Oldpeak'].ge(0)]
df

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [None]:
# Feature frame: every column of df except the HeartDisease target.
X = df.drop(columns="HeartDisease")

In [None]:
# Display the feature frame X.
X

Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat


In [None]:
# Target vector: the HeartDisease column of df.
Y = df.loc[:, "HeartDisease"]
Y

Unnamed: 0,HeartDisease
0,0
1,1
2,0
3,1
4,0
...,...
913,1
914,1
915,1
916,1


# Apply Ordinal Encoding

In [None]:
# List the distinct Gender values so the encoding below can cover each one.
pd.unique(X['Gender'])

array(['M', 'F'], dtype=object)

In [None]:
# Ordinal-encode Gender in one step, with the same codes as before: M -> 1, F -> 2.
#
# Why `map` from `df` instead of repeated `Series.replace` on `X`:
#   * `replace` on an object column relies on pandas' deprecated silent
#     downcasting (the FutureWarning visible in this cell's old output).
#   * `df` is never modified and `X` was built from it with `drop`, so both
#     share the same index; reading the raw labels from `df` makes this cell
#     idempotent when re-run.
#   * `astype('int64')` raises if a category is missing from the mapping
#     (unmapped values become NaN), rather than letting a string slip through.
X['Gender'] = df['Gender'].map({'M': 1, 'F': 2}).astype('int64')
X

  X['Gender'] = X['Gender'].replace('F',2)


Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,2,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,1,ATA,130,283,0,ST,98,N,0.0,Up
3,48,2,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,1,NAP,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,1,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,1,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,2,ATA,130,236,0,LVH,174,N,0.0,Flat


In [None]:
# List the distinct ChestPainType values so the encoding below can cover each one.
pd.unique(X['ChestPainType'])

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [None]:
# Ordinal-encode ChestPainType in one step, with the same codes as before:
# ATA -> 1, NAP -> 2, ASY -> 3, TA -> 4.
#
# `map` on the untouched `df` column (same index as `X`) replaces the chain of
# `Series.replace` calls: it avoids the deprecated silent downcasting that
# triggered a FutureWarning, and it is safe to re-run. `astype('int64')` raises
# if a category is missing from the mapping (unmapped values become NaN).
X['ChestPainType'] = (
    df['ChestPainType']
    .map({'ATA': 1, 'NAP': 2, 'ASY': 3, 'TA': 4})
    .astype('int64')
)
X

  X['ChestPainType'] = X['ChestPainType'].replace("TA",4)


Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,Normal,172,N,0.0,Up
1,49,2,2,160,180,0,Normal,156,N,1.0,Flat
2,37,1,1,130,283,0,ST,98,N,0.0,Up
3,48,2,3,138,214,0,Normal,108,Y,1.5,Flat
4,54,1,2,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,4,110,264,0,Normal,132,N,1.2,Flat
914,68,1,3,144,193,1,Normal,141,N,3.4,Flat
915,57,1,3,130,131,0,Normal,115,Y,1.2,Flat
916,57,2,1,130,236,0,LVH,174,N,0.0,Flat


In [None]:
# List the distinct RestingECG values so the encoding below can cover each one.
pd.unique(X['RestingECG'])

array(['Normal', 'ST', 'LVH'], dtype=object)

In [None]:
# Ordinal-encode RestingECG in one step, with the same codes as before:
# Normal -> 1, ST -> 2, LVH -> 3.
#
# `map` on the untouched `df` column (same index as `X`) replaces the chain of
# `Series.replace` calls: it avoids the deprecated silent downcasting that
# triggered a FutureWarning, and it is safe to re-run. `astype('int64')` raises
# if a category is missing from the mapping (unmapped values become NaN).
X['RestingECG'] = (
    df['RestingECG']
    .map({'Normal': 1, 'ST': 2, 'LVH': 3})
    .astype('int64')
)
X

  X['RestingECG'] = X['RestingECG'].replace('LVH',3)


Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,1,172,N,0.0,Up
1,49,2,2,160,180,0,1,156,N,1.0,Flat
2,37,1,1,130,283,0,2,98,N,0.0,Up
3,48,2,3,138,214,0,1,108,Y,1.5,Flat
4,54,1,2,150,195,0,1,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,4,110,264,0,1,132,N,1.2,Flat
914,68,1,3,144,193,1,1,141,N,3.4,Flat
915,57,1,3,130,131,0,1,115,Y,1.2,Flat
916,57,2,1,130,236,0,3,174,N,0.0,Flat


In [None]:
# List the distinct ExerciseAngina values so the encoding below can cover each one.
pd.unique(X['ExerciseAngina'])

array(['N', 'Y'], dtype=object)

In [None]:
# Ordinal-encode ExerciseAngina in one step, with the same codes as before:
# N -> 1, Y -> 2.
#
# `map` on the untouched `df` column (same index as `X`) replaces the chain of
# `Series.replace` calls: it avoids the deprecated silent downcasting that
# triggered a FutureWarning, and it is safe to re-run. `astype('int64')` raises
# if a category is missing from the mapping (unmapped values become NaN).
X['ExerciseAngina'] = df['ExerciseAngina'].map({'N': 1, 'Y': 2}).astype('int64')
X

  X['ExerciseAngina'] = X['ExerciseAngina'].replace('Y',2)


Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,1,172,1,0.0,Up
1,49,2,2,160,180,0,1,156,1,1.0,Flat
2,37,1,1,130,283,0,2,98,1,0.0,Up
3,48,2,3,138,214,0,1,108,2,1.5,Flat
4,54,1,2,150,195,0,1,122,1,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,4,110,264,0,1,132,1,1.2,Flat
914,68,1,3,144,193,1,1,141,1,3.4,Flat
915,57,1,3,130,131,0,1,115,2,1.2,Flat
916,57,2,1,130,236,0,3,174,1,0.0,Flat


In [None]:
# List the distinct ST_Slope values so the encoding below can cover each one.
pd.unique(X['ST_Slope'])

array(['Up', 'Flat', 'Down'], dtype=object)

In [None]:
# Ordinal-encode ST_Slope in one step, with the same codes as before:
# Up -> 1, Flat -> 2, Down -> 3.
#
# `map` on the untouched `df` column (same index as `X`) replaces the chain of
# `Series.replace` calls: it avoids the deprecated silent downcasting that
# triggered a FutureWarning, and it is safe to re-run. `astype('int64')` raises
# if a category is missing from the mapping (unmapped values become NaN).
X['ST_Slope'] = (
    df['ST_Slope']
    .map({'Up': 1, 'Flat': 2, 'Down': 3})
    .astype('int64')
)
X

  X['ST_Slope'] = X['ST_Slope'].replace('Down',3)


Unnamed: 0,Age,Gender,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,1,172,1,0.0,1
1,49,2,2,160,180,0,1,156,1,1.0,2
2,37,1,1,130,283,0,2,98,1,0.0,1
3,48,2,3,138,214,0,1,108,2,1.5,2
4,54,1,2,150,195,0,1,122,1,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,4,110,264,0,1,132,1,1.2,2
914,68,1,3,144,193,1,1,141,1,3.4,2
915,57,1,3,130,131,0,1,115,2,1.2,2
916,57,2,1,130,236,0,3,174,1,0.0,2


# Splitting data into Training and Testing

In [None]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=0
)
# Print the shape of each partition, one per line.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep="\n")

(724, 11)
(181, 11)
(724,)
(181,)


# 2. Analyze the performance of naïve Bayes with respect to accuracy, recall, precision, FPR, and ROC metrics obtained for heart dataset.

## Creating Gaussian Naive Bayes

In [None]:
# Baseline Gaussian naive Bayes with default settings (no explicit class priors).
classifier1 = GaussianNB()
# Train on the heart training split; the value returned by fit is kept as model1.
model1 = classifier1.fit(X=X_train, y=Y_train)

In [None]:
# Hard class predictions of the baseline model on the held-out rows.
y_pred1 = model1.predict(X=X_test)

In [None]:
# Accuracy of the baseline model on the test split, as a percentage.
print(f"The accuracy is {accuracy_score(Y_test, y_pred1) * 100}%")

The accuracy is 83.15217391304348%


In [None]:
# Confusion matrix and per-class precision/recall/F1 for the baseline model.
print(confusion_matrix(Y_test, y_pred1))
target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred1, target_names=target_names))

# ROC analysis needs a continuous score per sample. Passing the hard 0/1
# predictions (as this cell did before) yields a single-threshold "curve" and
# understates the AUC, so use the predicted probability of the positive class.
positive_col = list(model1.classes_).index(1)  # predict_proba column for label 1
y_score1 = model1.predict_proba(X_test)[:, positive_col]
fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_score1, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)


[[60 17]
 [14 93]]
              precision    recall  f1-score   support

     class 0       0.81      0.78      0.79        77
     class 1       0.85      0.87      0.86       107

    accuracy                           0.83       184
   macro avg       0.83      0.82      0.83       184
weighted avg       0.83      0.83      0.83       184

Area under the ROC curve : 0.824190


# 3. Test the naïve Bayes algorithm with prior probabilities (0.25, 0.75), (0.75, 0.25), and (0.5, 0.5) on heart dataset. Study the impact on performance metrics.

## Creating Naive Bayes on Prior Probabilities

### prior probability (0.25,0.75)

In [None]:
# Gaussian naive Bayes with fixed class priors: P(class 0) = 0.25, P(class 1) = 0.75.
classifier2 = GaussianNB(priors=[0.25, 0.75])
# Train on the heart training split, then predict the held-out rows.
model2 = classifier2.fit(X=X_train, y=Y_train)
y_pred2 = model2.predict(X=X_test)

In [None]:
# Accuracy (in percent), confusion matrix, and per-class report for priors (0.25, 0.75).
print(f"the accuracy is {accuracy_score(Y_test, y_pred2) * 100}%")
print(confusion_matrix(Y_test, y_pred2))
target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred2, target_names=target_names))

the accuracy is 84.23913043478261%
[[58 19]
 [10 97]]
              precision    recall  f1-score   support

     class 0       0.85      0.75      0.80        77
     class 1       0.84      0.91      0.87       107

    accuracy                           0.84       184
   macro avg       0.84      0.83      0.83       184
weighted avg       0.84      0.84      0.84       184



### prior probability (0.75,0.25)

In [None]:
# Gaussian naive Bayes with fixed class priors: P(class 0) = 0.75, P(class 1) = 0.25.
classifier3 = GaussianNB(priors=[0.75, 0.25])
# Train on the heart training split, then predict the held-out rows.
model3 = classifier3.fit(X=X_train, y=Y_train)
y_pred3 = model3.predict(X=X_test)

In [None]:
# Accuracy (in percent), confusion matrix, and per-class report for priors (0.75, 0.25).
print(f"the accuracy is {accuracy_score(Y_test, y_pred3) * 100}%")
print(confusion_matrix(Y_test, y_pred3))
target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred3, target_names=target_names))

the accuracy is 83.69565217391305%
[[63 14]
 [16 91]]
              precision    recall  f1-score   support

     class 0       0.80      0.82      0.81        77
     class 1       0.87      0.85      0.86       107

    accuracy                           0.84       184
   macro avg       0.83      0.83      0.83       184
weighted avg       0.84      0.84      0.84       184



### prior probability (0.5, 0.5)

In [None]:
# Gaussian naive Bayes with equal class priors: P(class 0) = P(class 1) = 0.5.
classifier4 = GaussianNB(priors=[0.5, 0.5])
# Train on the heart training split, then predict the held-out rows.
model4 = classifier4.fit(X=X_train, y=Y_train)
y_pred4 = model4.predict(X=X_test)

In [None]:
# Accuracy (in percent), confusion matrix, and per-class report for priors (0.5, 0.5).
print(f"the accuracy is {accuracy_score(Y_test, y_pred4) * 100}%")
print(confusion_matrix(Y_test, y_pred4))
target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred4, target_names=target_names))

the accuracy is 83.69565217391305%
[[61 16]
 [14 93]]
              precision    recall  f1-score   support

     class 0       0.81      0.79      0.80        77
     class 1       0.85      0.87      0.86       107

    accuracy                           0.84       184
   macro avg       0.83      0.83      0.83       184
weighted avg       0.84      0.84      0.84       184



# 4. Compare the Gaussian implementation of the naïve Bayes algorithm with the Bernoulli and Multinomial implementations on the heart dataset. Study the impact on performance metrics.


## Creating Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default hyper-parameters.
multinomialClassifer = MultinomialNB()

# Fit on the heart training split; the value returned by fit is kept as multinomialModel.
multinomialModel = multinomialClassifer.fit(X=X_train, y=Y_train)

# Predict the held-out rows, then report accuracy (in percent) and the confusion matrix.
# Note: Y_pred3 (capital Y) is a different variable from the earlier y_pred3.
Y_pred3 = multinomialModel.predict(X=X_test)
print(f"The accuracy is {metrics.accuracy_score(Y_test, Y_pred3) * 100}%")
print(confusion_matrix(Y_test, Y_pred3))

The accuracy is 69.61325966850829%
[[64 12]
 [43 62]]


## Creating Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default hyper-parameters.
# NOTE(review): with defaults, BernoulliNB presumably binarizes every feature at 0,
# so columns such as Age or RestingBP would collapse to a constant -- confirm
# whether an explicit `binarize` threshold or pre-binarized features are intended.
bernoulliClassifier = BernoulliNB()
bernoulliModel = bernoulliClassifier.fit(X=X_train, y=Y_train)

# Predict the held-out rows, then report accuracy (in percent) and the confusion matrix.
# Note: Y_pred4 (capital Y) is a different variable from the earlier y_pred4.
Y_pred4 = bernoulliModel.predict(X=X_test)
print(f"The accuracy is {metrics.accuracy_score(Y_test, Y_pred4) * 100}%")
print(confusion_matrix(Y_test, Y_pred4))

The accuracy is 79.00552486187846%
[[46 30]
 [ 8 97]]


# 5. Test the naïve Bayes implementation with spambase dataset. Compare the spambase performance with the heart using the discussed metrics.

In [None]:
# Read the spambase records from the CSV file in the working directory.
spamdf = pd.read_csv(filepath_or_buffer='spambase.csv')
# Preview the first five rows.
spamdf.head(n=5)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [None]:
# NOTE: X and Y are rebound here; from this cell on they hold the spambase
# features and labels rather than the heart data.
X, Y = spamdf.drop(columns='spam'), spamdf['spam']

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=0
)
# Print the shapes of the training and test feature frames, one per line.
print(x_train.shape, x_test.shape, sep="\n")

(3680, 57)
(921, 57)


In [None]:
# Default Gaussian naive Bayes trained on the spambase training split
# (fit returns the estimator itself, so construction and fitting can be chained).
spam_model = GaussianNB().fit(X=x_train, y=y_train)
# Hard class predictions for the held-out spambase rows.
spam_pred = spam_model.predict(X=x_test)

In [None]:
# Threshold-based metrics (accuracy, recall, precision) use the hard predictions.
print('The accuracy is '+str(accuracy_score(y_test,spam_pred)*100)+"%")
print("The recall score is", recall_score(y_test, spam_pred)*100, "%")
print("The precision is", precision_score(y_test, spam_pred)*100, "%")

# ROC AUC is a ranking metric and needs a continuous score per sample. Feeding
# it the hard 0/1 predictions (as this cell did before) collapses the curve to
# a single threshold, so score with the predicted probability of class 1.
spam_positive_col = list(spam_model.classes_).index(1)  # predict_proba column for label 1
spam_score = spam_model.predict_proba(x_test)[:, spam_positive_col]
print("The roc score is", roc_auc_score(y_test, spam_score)*100, "%")

# Confusion matrix and per-class precision/recall/F1.
print(confusion_matrix(y_test, spam_pred))
target_names = ['class 0', 'class 1']
print(classification_report(y_test, spam_pred, target_names=target_names))


The accuracy is 80.67318132464713%
The recall score is 93.21148825065274 %
The precision is 70.13752455795678 %
The roc score is 82.47935007328175 %
[[386 152]
 [ 26 357]]
              precision    recall  f1-score   support

     class 0       0.94      0.72      0.81       538
     class 1       0.70      0.93      0.80       383

    accuracy                           0.81       921
   macro avg       0.82      0.82      0.81       921
weighted avg       0.84      0.81      0.81       921

