In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split as tts
import numpy as np
import os

In [2]:
def gather_and_clean_data(csv_path="heart.csv"):
    """Load the heart-disease CSV, one-hot encode it and min-max scale it.

    Parameters
    ----------
    csv_path : str, default "heart.csv"
        Path of the CSV file to read. The default keeps the original
        behaviour of reading ``heart.csv`` from the working directory.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the dummy columns for
        ``Sex``, ``ChestPainType``, ``RestingECG``, ``ST_Slope`` and
        ``ExerciseAngina`` (first level of each dropped), with every column
        rescaled by ``MinMaxScaler`` and a fresh default index.

    Notes
    -----
    The scaler is fitted on every row and every column of the frame, so the
    scaling statistics are computed from the full dataset; callers that split
    the result into train/test afterwards should be aware of that.
    """
    raw = pd.read_csv(csv_path)

    # Source column -> prefix used for its dummy columns. Insertion order
    # fixes the order of the encoded columns in the output.
    categorical_prefixes = {
        'Sex': "Sex",
        'ChestPainType': "ChestPain",
        'RestingECG': "ECG",
        'ST_Slope': "ST_Slope",
        'ExerciseAngina': "ExerciseAngina",
    }

    # drop_first=True removes one level per feature; astype(int) turns the
    # indicator columns into 0/1 integers.
    dummy_frames = [
        pd.get_dummies(raw[column], drop_first=True, prefix=prefix).astype(int)
        for column, prefix in categorical_prefixes.items()
    ]

    # Build a new frame instead of dropping columns in place.
    remaining = raw.drop(columns=list(categorical_prefixes))
    encoded = pd.concat([remaining, *dummy_frames], axis=1)

    scaler = MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(encoded), columns=encoded.columns)

In [3]:
# Build the encoded, scaled modelling table, then preview its first rows.
data = gather_and_clean_data()

data.head(n=5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPain_ATA,ChestPain_NAP,ChestPain_TA,ECG_Normal,ECG_ST,ST_Slope_Flat,ST_Slope_Up,ExerciseAngina_Y
0,0.244898,0.7,0.47927,0.0,0.788732,0.295455,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.428571,0.8,0.298507,0.0,0.676056,0.409091,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.183673,0.65,0.46932,0.0,0.267606,0.295455,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.408163,0.69,0.354892,0.0,0.338028,0.465909,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.530612,0.75,0.323383,0.0,0.43662,0.295455,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [4]:
# Separate the label column from the predictor columns.
TARGET = "HeartDisease"
Y = data[TARGET]
X = data.drop(columns=[TARGET])

# Cast every column label to str so the feature names share one type.
X.columns = X.columns.astype(str)

In [5]:
# 80/20 hold-out split. The seed is pinned so this split, and every metric
# computed from it, is the same on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=SPLIT_SEED)

In [6]:
# Fit a Gaussian Naive Bayes classifier on the training partition.
gausNbModel = GaussianNB()

gausNbModel.fit(X=X_train, y=y_train)

In [7]:
# Predicted labels for the held-out rows.
y_prediction = gausNbModel.predict(X=X_test)

In [8]:
# Hold-out evaluation of the fitted model: accuracy, per-class report and
# confusion matrix, all computed from the same predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_prediction)
print("Accuracy on test data:", accuracy)

class_names = ['No Heart Disease', 'Heart Disease']
report = classification_report(y_test, y_prediction, target_names=class_names)
print("\nClassification Report:\n", report)

conf_matrix = confusion_matrix(y_test, y_prediction)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy on test data: 0.8206521739130435

Classification Report:
                   precision    recall  f1-score   support

No Heart Disease       0.83      0.78      0.80        87
   Heart Disease       0.81      0.86      0.83        97

        accuracy                           0.82       184
       macro avg       0.82      0.82      0.82       184
    weighted avg       0.82      0.82      0.82       184


Confusion Matrix:
 [[68 19]
 [14 83]]


In [9]:
# Repeated random hold-out experiment: for each training fraction, fit a fresh
# Gaussian NB model on N_REPEATS random splits and record the test accuracy.
splits = [0.5, 0.6, 0.7, 0.8, 0.9]
N_REPEATS = 50
EXPERIMENT_SEED = 0
SAVE_RESULTS = False  # set to True to append the raw accuracies to file_name

# One seeded generator shared by every call to tts: each call draws a
# different partition, but the whole sequence is reproducible on re-run.
split_rng = np.random.RandomState(EXPERIMENT_SEED)

results = {split: [] for split in splits}

for split in splits:
    for _ in range(N_REPEATS):
        # Loop-local names on purpose: reusing X_train / X_test / y_train /
        # y_test / accuracy here would overwrite the single hold-out split
        # and metric created by the earlier cells.
        X_fit, X_eval, y_fit, y_eval = tts(
            X, Y, test_size=1 - split, random_state=split_rng
        )

        gaussianNaiveBayesModel = GaussianNB()
        gaussianNaiveBayesModel.fit(X_fit, y_fit)
        y_pred = gaussianNaiveBayesModel.predict(X_eval)

        results[split].append(accuracy_score(y_eval, y_pred))

print("\nAverage Accuracies:")
for split, accuracies in results.items():
    avg_accuracy = np.mean(accuracies)
    print(f"Avg for {round(split * 100)}/{round((1 - split) * 100)} split: {avg_accuracy:.5f}")

file_name = "classification_results.csv"

file_exists = os.path.isfile(file_name)

# Optional export, off by default so running the cell has no side effects on
# disk. When enabled, one column per split is appended to the CSV and the
# header is written only if the file did not exist yet.
if SAVE_RESULTS:
    output = pd.DataFrame.from_dict(results, orient='index').transpose()
    output.to_csv(file_name, mode='a', header=not file_exists, index=False)


Average Accuracies:
Avg for 50/50 split: 0.86174
Avg for 60/40 split: 0.85842
Avg for 70/30 split: 0.85913
Avg for 80/20 split: 0.86413
Avg for 90/10 split: 0.85761
