In [1]:
import pandas as pd
import numpy as np

Load Dataset

In [2]:
# Read the heart-disease CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../data/heart.csv')
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


Basic Dataset Inspection

In [8]:
# Print the frame's dimensions, column index and per-column dtypes, each
# preceded by its label.
for label, value in (
    ("Shape: ", df.shape),
    ("columns", df.columns),
    ("dtypes :", df.dtypes),
):
    print(label, value)

Shape:  (918, 12)
columns Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')
dtypes : Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object


Check Missing Values

In [9]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [11]:
# Relative frequency of each HeartDisease label (normalize=True returns
# proportions instead of raw counts).
(
    df["HeartDisease"]
    .value_counts(normalize=True)
)

HeartDisease
1    0.553377
0    0.446623
Name: proportion, dtype: float64

Separate Features & Target

In [13]:
# The HeartDisease column is the prediction target; every remaining column
# is kept as a feature. `columns=` already identifies the axis, so no
# separate `axis` argument is needed.
y = df["HeartDisease"]
x = df.drop(columns="HeartDisease")

Encode Categorical Variables

In [17]:
# Expand the string-valued feature columns into indicator columns.
# drop_first=True omits the first level of each category so that the
# remaining indicators are not redundant with one another.
x_encoded = pd.get_dummies(data=x, drop_first=True)
x_encoded.head(n=5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,True,False,True,False,True,False,False,False,True


In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Train-Test Split

In [19]:
# Hold out 20% of the rows for testing. The split is stratified on the
# target so both partitions keep the same class proportions, and the fixed
# random_state makes the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_encoded,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
for split_label, split_features in (
    ("Train shape:", x_train),
    ("Test shape:", x_test),
):
    print(split_label, split_features.shape)

Train shape: (734, 15)
Test shape: (184, 15)


Feature Scaling

In [21]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both partitions so the test set never
# influences the fit.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [26]:
import joblib
import os

# Make sure the output directory exists (no error if it already does).
os.makedirs("../artifacts", exist_ok=True)

# Persist the raw and scaled train/test partitions, one pickle per object.
for artifact_path, artifact_obj in (
    ("../artifacts/x_train.pkl", x_train),
    ("../artifacts/x_test.pkl", x_test),
    ("../artifacts/y_train.pkl", y_train),
    ("../artifacts/y_test.pkl", y_test),
    ("../artifacts/x_train_scaled.pkl", x_train_scaled),
    ("../artifacts/x_test_scaled.pkl", x_test_scaled),
):
    joblib.dump(artifact_obj, artifact_path)

# Save the fitted scaler as well. This call stays as the cell's final
# expression so the notebook still displays the written file name.
joblib.dump(scaler, "../artifacts/scaler.pkl")


['../artifacts/scaler.pkl']