### Importing Pandas and NumPy

In [1026]:
import numpy as np
import pandas as pd

### Reading the dataset

In [1027]:
# Load the heart-disease dataset; the relative path is resolved against the
# notebook's working directory.
csv_path = "Dataset - Heart_Disease_Prediction (1).csv"
df = pd.read_csv(csv_path)

# One hand-written sample that is classified in the last cell of the notebook,
# where the values are labelled, in order, as: Age, Sex, Chest pain type, BP,
# Cholesterol, Max HR, FBS over 120.
# NOTE(review): the name `input` shadows Python's built-in input() function.
input = [58, 1, 2, 141, 297, 155, 0]

In [1028]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'Max HR',
       'FBS over 120', 'Heart Disease'],
      dtype='object')

In [1029]:
# Preview the first rows of the dataset (head() defaults to five rows).
df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,Max HR,FBS over 120,Heart Disease
0,65,1,4,130,322,109,0,Presence
1,51,0,3,115,564,160,0,Absence
2,59,1,2,124,261,141,0,Presence
3,48,1,4,128,263,105,0,Absence
4,73,0,2,120,269,121,0,Absence


In [1030]:
# Print row count, per-column non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              250 non-null    int64 
 1   Sex              250 non-null    int64 
 2   Chest pain type  250 non-null    int64 
 3   BP               250 non-null    int64 
 4   Cholesterol      250 non-null    int64 
 5   Max HR           250 non-null    int64 
 6   FBS over 120     250 non-null    int64 
 7   Heart Disease    250 non-null    object
dtypes: int64(7), object(1)
memory usage: 15.8+ KB


### Preparing the dataset

In [1031]:
# Feature matrix: every column of `df` apart from the "Heart Disease" target.
x_df = df.drop(columns=["Heart Disease"])

In [1032]:
# Target vector: the "Heart Disease" column (object dtype, holding the
# strings 'Presence' / 'Absence' at this point).
y_df = df["Heart Disease"]

###### In the cell below, we replace the strings 'Presence' and 'Absence' with the integers 1 and 0 so that the model can use them as class labels.

In [1033]:
# Encode the target as integers: 'Presence' -> 1, 'Absence' -> 0.
#
# The encoded Series is built fresh from `df` with an explicit mapping rather
# than with an in-place replace(): in-place edits on a column pulled out of a
# DataFrame may or may not write through to `df` depending on the pandas
# version, and replace() turning an object column into int64 relies on an
# implicit downcast that pandas has deprecated. Building a new Series also
# makes this cell safe to re-run.
label_to_int = {'Presence': 1, 'Absence': 0}
y_df = df["Heart Disease"].map(label_to_int)

# map() yields NaN for any value that is not a key of the mapping; stop here
# with a clear message instead of passing NaN labels to the classifier.
if y_df.isna().any():
    raise ValueError(
        "Unexpected label in 'Heart Disease' column; "
        "expected only 'Presence' or 'Absence'"
    )
y_df = y_df.astype("int64")

In [1034]:
# Check that the labels are now integers (1 = 'Presence', 0 = 'Absence').
y_df.head()

0    1
1    0
2    1
3    0
4    0
Name: Heart Disease, dtype: int64

In [1035]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column.
x_df.describe()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,Max HR,FBS over 120
count,250.0,250.0,250.0,250.0,250.0,250.0,250.0
mean,59.796,0.664,3.188,131.068,249.14,149.488,0.148
std,8.494883,0.473286,0.957247,17.860666,51.606563,23.323848,0.355812
min,45.0,0.0,1.0,94.0,126.0,71.0,0.0
25%,53.0,0.0,3.0,120.0,213.25,133.0,0.0
50%,60.0,1.0,3.0,130.0,244.5,152.5,0.0
75%,67.0,1.0,4.0,140.0,275.75,165.75,0.0
max,75.0,1.0,4.0,200.0,564.0,202.0,1.0


###### Here I tried scaling some columns with both MinMaxScaler and StandardScaler (only the MinMaxScaler version is shown), but the model's accuracy did not improve with scaling, so the code is left commented out.

In [1036]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# age = x_df["Age"].values.reshape(-1,1)
# chest_pain_type = x_df["Chest pain type"].values.reshape(-1,1)
# bp = x_df["BP"].values.reshape(-1,1)
# cholesterol = x_df["Cholesterol"].values.reshape(-1,1)
# hr = x_df["Max HR"].values.reshape(-1,1)


# age = scaler.fit_transform(age)
# chest_pain_type = scaler.fit_transform(chest_pain_type)
# bp = scaler.fit_transform(bp)
# cholesterol = scaler.fit_transform(cholesterol)
# hr = scaler.fit_transform(hr)


# age = pd.DataFrame(age, columns = ["Age_scaled"])
# chest_pain_type = pd.DataFrame(chest_pain_type, columns = ["Chest_pain_type_scaled"])
# bp = pd.DataFrame(bp, columns = ["BP_scaled"])
# cholesterol = pd.DataFrame(cholesterol, columns = ["Cholesterol_scaled"])
# hr = pd.DataFrame(hr, columns = ["HR_scaled"])


# x_df = pd.concat([x_df, age], axis = "columns")
# x_df = pd.concat([x_df, chest_pain_type], axis = "columns")
# x_df = pd.concat([x_df, bp], axis = "columns")
# x_df = pd.concat([x_df, cholesterol], axis = "columns")
# x_df = pd.concat([x_df, hr], axis = "columns")

# x_df = x_df.drop(["Age", "BP", "Chest pain type", "Cholesterol", "Max HR"], axis = "columns")
# x_df.describe()

### Training the Model

In [1037]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Train one Gaussian Naive Bayes classifier per random train/test split of
# (x_df, y_df) and keep the one that scores highest on its own test split.
#
# Results left behind for later cells:
#   best_model   - the fitted GaussianNB with the highest test accuracy
#   max_accuracy - that model's accuracy on its test split
#
# NOTE(review): `max_accuracy` is the maximum over N_SPLITS test splits that
# were also used to choose the model, so it is an optimistic estimate of the
# accuracy on unseen data. Treat it as an upper bound, not as the expected
# performance.
N_SPLITS = 200      # number of random splits to try
TEST_SIZE = 0.33    # fraction of rows held out for testing in each split
SEED = 42           # base seed so that Restart & Run All reproduces the output

best_model = None
max_accuracy = 0
for i in range(N_SPLITS):
    # A fixed but distinct seed per iteration: the splits differ from one
    # another, yet every run of the notebook sees the same sequence of splits.
    x_train, x_test, y_train, y_test = train_test_split(
        x_df, y_df, test_size=TEST_SIZE, random_state=SEED + i
    )
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    # accuracy_score takes (y_true, y_pred) in that order.
    accuracy = accuracy_score(y_test, y_pred)
    # The `is None` check guarantees a fitted model is kept even in the
    # degenerate case where every split scores 0.
    if best_model is None or accuracy > max_accuracy:
        best_model = gnb
        max_accuracy = accuracy

print(max_accuracy)

0.8795180722891566


### Output prediction for the given input

In [1038]:
# Classify the single sample stored in `input` with the model kept in
# `best_model`, and print the result as 'Presence' or 'Absence'.
FEATURE_COLUMNS = ['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'Max HR', 'FBS over 120']

# Build a one-row DataFrame with named columns for the model. The frame gets
# its own name so that `input` is not rebound from a list to an array to a
# DataFrame across cells.
sample_df = pd.DataFrame(np.array(input).reshape(1, -1), columns=FEATURE_COLUMNS)
pred = best_model.predict(sample_df)

# predict() returns one label per row; read the single element explicitly
# instead of truth-testing the whole array.
if pred[0] == 0:
    print('Absence')
else:
    print('Presence')

Absence
