## Import Libraries

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [18]:
# Reading Training Data
from pathlib import Path
# Define the base directory dynamically (like home folder)
base_dir = Path.home()

# Build the full file path
file_path = base_dir / 'anaconda3' / 'envs' / 'ApplicationMLModels_PredictHeartDisease' / 'statlog+heart' /'heart_data.csv'
#print(file_path)
#file_name = 'heart_data.csv'
#complete_filepath = file_path / file_name
#filepath = complete_filepath.resolve()

# Brief overview of Dataset
heart_df = pd.read_csv(file_path)
heart_df.head()

Unnamed: 0,patient_id,heart_disease_present,age,sex,chest_pain_type,resting_blood_pressure,serum_cholesterol_mg_per_dl,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,max_heart_rate_achieved,exercise_induced_angina,oldpeak_eq_st_depression,slope_of_peak_exercise_st_segment,num_major_vessels,thal
0,034N95xw,1,70,1,4,130,322,0,2,109,0,2.4,2,3,3
1,IBygtBAP,0,67,0,3,115,564,0,2,160,0,1.6,2,0,7
2,zI70cfhM,1,57,1,2,124,261,0,0,141,0,0.3,1,0,7
3,qXkCdEiv,0,64,1,4,128,263,0,0,105,1,0.2,2,1,7
4,IAA2korm,0,74,0,2,120,269,0,2,121,1,0.2,1,1,3


In [10]:
## Convert Categorical Variable to Numeric Variable
# heart_df["thal_cleaned"]=np.where(heart_df["thal"]=="normal",0,
#                                   np.where(heart_df["thal"]=="fixed_defect",1,
#                                            np.where(heart_df["thal"]=="reversible_defect",2,3)
#                                           )
#                                  )

In [10]:
# ## Convert Categorical Variable to Numeric Variable
# heart_df["thal_cleaned"]=np.where(heart_df["thal"]=="normal",0,
#                                   np.where(heart_df["thal"]=="fixed_defect",1,
#                                            np.where(heart_df["thal"]=="reversible_defect",2,3)
#                                           )
#                                  )

In [19]:
heart_df.head(10)

Unnamed: 0,patient_id,heart_disease_present,age,sex,chest_pain_type,resting_blood_pressure,serum_cholesterol_mg_per_dl,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,max_heart_rate_achieved,exercise_induced_angina,oldpeak_eq_st_depression,slope_of_peak_exercise_st_segment,num_major_vessels,thal
0,034N95xw,1,70,1,4,130,322,0,2,109,0,2.4,2,3,3
1,IBygtBAP,0,67,0,3,115,564,0,2,160,0,1.6,2,0,7
2,zI70cfhM,1,57,1,2,124,261,0,0,141,0,0.3,1,0,7
3,qXkCdEiv,0,64,1,4,128,263,0,0,105,1,0.2,2,1,7
4,IAA2korm,0,74,0,2,120,269,0,2,121,1,0.2,1,1,3
5,wt61iIew,0,65,1,4,120,177,0,0,140,0,0.4,1,0,7
6,7oqPbDvS,1,56,1,3,130,256,1,2,142,1,0.6,2,1,6
7,hIkX99pK,1,59,1,4,110,239,0,2,142,1,1.2,2,1,7
8,7XMzPFlJ,1,60,1,4,140,293,0,2,170,0,1.2,2,2,7
9,6D0n0pcM,1,63,0,4,150,407,0,2,154,0,4.0,2,3,7


In [20]:
# Split dataset in training and test datasets
X_train, X_test = train_test_split(heart_df, test_size=0.2, random_state=int(time.time()))

In [22]:
# Instantiate the classifier
gnb = GaussianNB()
used_features =[
    "slope_of_peak_exercise_st_segment",
    "thal",
    "resting_blood_pressure",
    "chest_pain_type",
    "num_major_vessels",
    "fasting_blood_sugar_gt_120_mg_per_dl",
    "resting_ekg_results",
    "serum_cholesterol_mg_per_dl",
    "oldpeak_eq_st_depression",
    "sex",
    "age",
    "max_heart_rate_achieved",
    "exercise_induced_angina"
]

# Train classifier
gnb.fit(
    X_train[used_features].values,
    X_train["heart_disease_present"]
)
y_pred = gnb.predict(X_test[used_features].values)    

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (X_test["heart_disease_present"] != y_pred).sum(),
          100*(1-(X_test["heart_disease_present"] != y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 54 points : 8, performance 85.19%


As, we can see that the performance of our Classifier has increased this time to **85.19%** as compared to the previous Model of Logistic Regression.

In [23]:
# Probability of Presence of Heart Disease and Probability of Absence of Heart Disease
mean_heart_disease_present=np.mean(X_train["heart_disease_present"])
mean_not_heart_disease_present=1-mean_heart_disease_present
print("Heart Disease Presence prob = {:03.2f}%, Heart Disease Absent prob = {:03.2f}%"
      .format(100*mean_heart_disease_present,100*mean_not_heart_disease_present))

Heart Disease Presence prob = 46.30%, Heart Disease Absent prob = 53.70%


## Impact of Maximum Heart Rate Achieved on Presence & Absence of Heart Disease

In [24]:
# Mean and Standard Deviation of Heart Disease for Different Values of Maximum Heart Rate

mean_max_heart_rate_heart_disease_present = np.mean(X_train[X_train["heart_disease_present"]==1]["max_heart_rate_achieved"])
std_max_heart_rate_heart_disease_present = np.std(X_train[X_train["heart_disease_present"]==1]["max_heart_rate_achieved"])
mean_max_heart_rate_not_heart_disease_present = np.mean(X_train[X_train["heart_disease_present"]==0]["max_heart_rate_achieved"])
std_max_heart_rate_not_heart_disease_present = np.std(X_train[X_train["heart_disease_present"]==0]["max_heart_rate_achieved"])

print("mean_max_heart_rate_heart_disease_present = {:03.2f}".format(mean_max_heart_rate_heart_disease_present))
print("std_max_heart_rate_heart_disease_present = {:03.2f}".format(std_max_heart_rate_heart_disease_present))
print("mean_max_heart_rate_not_heart_disease_present = {:03.2f}".format(mean_max_heart_rate_not_heart_disease_present))
print("std_max_heart_rate_not_heart_disease_present = {:03.2f}".format(std_max_heart_rate_not_heart_disease_present))


mean_max_heart_rate_heart_disease_present = 139.69
std_max_heart_rate_heart_disease_present = 22.99
mean_max_heart_rate_not_heart_disease_present = 158.62
std_max_heart_rate_not_heart_disease_present = 19.03


So, seems like the maximum heart rate achieved during any physical stress test seems to be on the lower side for the patient with the probability of heart disease present. On the other hand patient with a healthy heart and with lower risk of presence of heart disease has a higher maximum heart rate achieved during any physical exercise or physical stress test.

## Model's Performance with Impact of Maximum Heart Rate Achieved

In [25]:
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
gnb = GaussianNB()
used_features =["max_heart_rate_achieved"]
y_pred = gnb.fit(X_train[used_features].values, X_train["heart_disease_present"]).predict(X_test[used_features].values)
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (X_test["heart_disease_present"] != y_pred).sum(),
          100*(1-(X_test["heart_disease_present"] != y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 54 points : 14, performance 74.07%


## Impact of Resting Blood Pressure on Presence & Absence of Heart Disease

In [26]:
# Mean and Standard Deviation of Heart Disease for Different Values of Resting Blood Pressure

mean_resting_blood_pressure_heart_disease_present = np.mean(X_train[X_train["heart_disease_present"]==1]["resting_blood_pressure"])
std_resting_blood_pressure_heart_disease_present = np.std(X_train[X_train["heart_disease_present"]==1]["resting_blood_pressure"])
mean_resting_blood_pressure_not_heart_disease_present = np.mean(X_train[X_train["heart_disease_present"]==0]["resting_blood_pressure"])
std_resting_blood_pressure_not_heart_disease_present = np.std(X_train[X_train["heart_disease_present"]==0]["resting_blood_pressure"])

print("mean_resting_blood_pressure_heart_disease_present = {:03.2f}".format(mean_resting_blood_pressure_heart_disease_present))
print("std_resting_blood_pressure_heart_disease_present = {:03.2f}".format(std_resting_blood_pressure_heart_disease_present))
print("mean_resting_blood_pressure_not_heart_disease_present = {:03.2f}".format(mean_resting_blood_pressure_not_heart_disease_present))
print("std_resting_blood_pressure_not_heart_disease_present = {:03.2f}".format(std_resting_blood_pressure_not_heart_disease_present))

mean_resting_blood_pressure_heart_disease_present = 134.07
std_resting_blood_pressure_heart_disease_present = 19.64
mean_resting_blood_pressure_not_heart_disease_present = 128.70
std_resting_blood_pressure_not_heart_disease_present = 16.38


So, for obvious reason it seems like the mean of the resting BP for patient with heart disease is on the higher side than the mean of the resting blood pressure for patient with less resting blood pressure.  

## Model's Performance with Impact of Resting Blood Pressure

In [27]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
used_features =["resting_blood_pressure"]
y_pred = gnb.fit(X_train[used_features].values, X_train["heart_disease_present"]).predict(X_test[used_features].values)
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (X_test["heart_disease_present"] != y_pred).sum(),
          100*(1-(X_test["heart_disease_present"] != y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 54 points : 20, performance 62.96%


## Impact of Blood Cholesterol on Presence & Absence of Heart Disease

In [28]:
# Mean and Standard Deviation of Heart Disease for Different Values of Maximum Heart Rate

mean_cholesterol_heart_disease_present = np.mean(X_train[X_train["heart_disease_present"]==1]["serum_cholesterol_mg_per_dl"])
std_cholesterol_heart_disease_present = np.std(X_train[X_train["heart_disease_present"]==1]["serum_cholesterol_mg_per_dl"])
mean_cholesterol_not_heart_disease_present = np.mean(X_train[X_train["heart_disease_present"]==0]["serum_cholesterol_mg_per_dl"])
std_cholesterol_not_heart_disease_present = np.std(X_train[X_train["heart_disease_present"]==0]["serum_cholesterol_mg_per_dl"])

print("mean_cholesterol_heart_disease_present = {:03.2f}".format(mean_cholesterol_heart_disease_present))
print("std_cholesterol_heart_disease_present = {:03.2f}".format(std_cholesterol_heart_disease_present))
print("mean_cholesterol_not_heart_disease_present = {:03.2f}".format(mean_cholesterol_not_heart_disease_present))
print("std_cholesterol_not_heart_disease_present = {:03.2f}".format(std_cholesterol_not_heart_disease_present))

mean_cholesterol_heart_disease_present = 252.13
std_cholesterol_heart_disease_present = 43.33
mean_cholesterol_not_heart_disease_present = 245.33
std_cholesterol_not_heart_disease_present = 57.06


Okay, so for obvious reason the presence of heart disease inceases with the increase in the mean of the cholesterol present in blood. 

## Model's Performance with Impact of Blood Cholesterol

In [29]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
used_features =["serum_cholesterol_mg_per_dl"]
y_pred = gnb.fit(X_train[used_features].values, X_train["heart_disease_present"]).predict(X_test[used_features].values)
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (X_test["heart_disease_present"] != y_pred).sum(),
          100*(1-(X_test["heart_disease_present"] != y_pred).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 54 points : 29, performance 46.30%


It seems that the Model's performance was at it's best when the used feature is Maximum Heart Rate Acheieved during a physical stress test for patient to be classified as Patient with Heart Disease and Patient without Heart Disease.