# Heart Disease Prediction:
- `preprocessing`
- `model training`

In [1]:
import kagglehub

# Download the heart-disease dataset from Kaggle; `path` holds whatever
# location dataset_download returns for it.
DATASET_HANDLE = "johnsmith88/heart-disease-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

In [2]:
import os

# Show which files the download directory contains.
downloaded_files = os.listdir(path)
downloaded_files

['heart.csv']

In [3]:
# Read heart.csv from the download directory into a DataFrame.
import numpy as np
import pandas as pd

CSV_NAME = 'heart.csv'
DATASET_FILE = os.path.join(path, CSV_NAME)
df = pd.read_csv(DATASET_FILE)
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [4]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [5]:
'''
One-hot encode the categorical columns.

Column cp (chest pain type) has 4 values: 0, 1, 2 and 3.
These numbers are just labels, not amounts, but to a model 3 > 2 > 1 > 0 would
look like a meaningful order, which it is not. So each categorical column is
replaced by indicator columns (cp_1, cp_2, cp_3, ...), dropping the first level.

Encoded columns: cp, restecg, slope, thal
'''
CATEGORICAL_COLUMNS = ['cp', 'restecg', 'slope', 'thal']

# `df` is overwritten below, so when this cell is run a second time the raw
# columns are already gone and get_dummies would raise KeyError. Encoding only
# the columns that are still present keeps the cell safe to re-run.
columns_to_encode = [col for col in CATEGORICAL_COLUMNS if col in df.columns]
if columns_to_encode:
    df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)
df

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,cp_1,cp_2,cp_3,restecg_1,restecg_2,slope_1,slope_2,thal_1,thal_2,thal_3
0,52,1,125,212,0,168,0,1.0,2,0,False,False,False,True,False,False,True,False,False,True
1,53,1,140,203,1,155,1,3.1,0,0,False,False,False,False,False,False,False,False,False,True
2,70,1,145,174,0,125,1,2.6,0,0,False,False,False,True,False,False,False,False,False,True
3,61,1,148,203,0,161,0,0.0,1,0,False,False,False,True,False,False,True,False,False,True
4,62,0,138,294,1,106,0,1.9,3,0,False,False,False,True,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,140,221,0,164,1,0.0,0,1,True,False,False,True,False,False,True,False,True,False
1021,60,1,125,258,0,141,1,2.8,1,0,False,False,False,False,False,True,False,False,False,True
1022,47,1,110,275,0,118,1,1.0,1,0,False,False,False,False,False,True,False,False,True,False
1023,50,0,110,254,0,159,0,0.0,0,1,False,False,False,False,False,False,True,False,True,False


In [11]:
# separate the target
X = df.drop('target', axis=1) # results
Y = df['target']

# X.head()
# Y.head()

In [12]:
'''
Split the data into training and testing sets, then use StandardScaler to
standardize the features so that Mean=0 and Variance=1.

The scaler is fitted on the training rows only and then applied to the test
rows, so no statistics from the held-out data leak into the preprocessing.
X itself is left unscaled; only X_train and X_test are transformed.
'''
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=20)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # learn mean/variance from the training rows
X_test = scaler.transform(X_test)        # reuse those statistics on the test rows
X_train[:5]

In [14]:
'''
Model selection:
since this is a simple classification problem, we just use a LogisticRegression
model from sklearn. It works well with moderate-sized datasets like the one
used in this example.
'''
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

In [16]:
'''
Model evaluation:
accuracy, confusion matrix and classification report on the test set
'''
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Y_pred = model.predict(X_test)

# Compute every metric first, then print them in the same order as before.
accuracy = accuracy_score(Y_test, Y_pred)
matrix = confusion_matrix(Y_test, Y_pred)
report = classification_report(Y_test, Y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)

Accuracy: 0.7864077669902912

Confusion Matrix:
 [[31 15]
 [ 7 50]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.67      0.74        46
           1       0.77      0.88      0.82        57

    accuracy                           0.79       103
   macro avg       0.79      0.78      0.78       103
weighted avg       0.79      0.79      0.78       103



In [17]:
'''
Export the trained model together with the fitted scaler.

The model was trained on standardized features, so whatever loads the model
has to apply the same scaler to its inputs before calling predict. Saving only
the model would leave that preprocessing impossible to reproduce.
'''
import pickle

# File name -> object to persist. The model file name is unchanged.
ARTIFACTS = {
    'heart_disease_model-0.1.0.pkl': model,
    'heart_disease_scaler-0.1.0.pkl': scaler,
}
for filename, artifact in ARTIFACTS.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)


In [None]:
# Scratch marker closing the notebook: the string below is the cell's only
# expression and no other code follows it in this cell.
'''
ENDDDDDDDDDDDDDDDD
WORKING ON THE APP NOW
'''