In [21]:
import os
import random
import numpy as np

# Output folder for the artifacts saved by this notebook.
os.makedirs("models", exist_ok=True)

# One shared seed for Python's and NumPy's global random generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [22]:
%pip install plotly

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib


Note: you may need to restart the kernel to use updated packages.


In [23]:
# Source noted by the author: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
# NOTE(review): the file is read header-less, carries 14 columns and a
# multi-valued target, which does not look like that Kaggle dataset's layout;
# confirm the actual origin of heart.csv.
path = "heart.csv"

# The file has no header row, so the column names are attached after loading.
df = pd.read_csv(path, header=None)
df.columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]
df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [24]:
# Turn the "?" placeholders into NaN, then force every column to a numeric
# dtype; values that still cannot be parsed become NaN as well.
df = df.replace("?", np.nan).apply(pd.to_numeric, errors="coerce")

# Collapse the target to a binary label: 1 for any value above 0, else 0.
df['target'] = df['target'].gt(0).astype(int)

df.info()
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  target    303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64

In [25]:
# Fill every remaining NaN with the median of its own column.
# NOTE(review): the medians are computed on the full frame, before the
# train/test split performed later in the notebook, so test rows contribute
# to the imputation values.
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)
df.isna().sum()


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [26]:
# Count the rows per class and relabel the 0/1 index for a readable axis.
class_counts = df['target'].value_counts().rename({0: 'No Disease', 1: 'Disease'})
px.bar(class_counts, title="Class Balance: Heart Disease vs No Disease")


In [27]:
# Column whose distribution is compared between the two target classes.
feature = 'age'
px.histogram(
    df,
    x=feature,
    color='target',
    nbins=40,
    barmode='overlay',
    opacity=0.7,
    color_discrete_sequence=["#f72585", "#3a0ca3"],
)


In [28]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr = df.corr(numeric_only=True)
px.imshow(
    corr,
    title="Correlation Matrix",
    color_continuous_scale="Viridis",
    aspect="auto",
    text_auto=True,
)


Pairplot substitute: relationship between age and maximum heart rate (`thalach`)


In [29]:
%pip install statsmodels

import plotly.express as px

fig = px.scatter(
    df,
    x="age",
    y="thalach",
    color="target",
    trendline="ols",
    color_discrete_sequence=["#f72585", "#3a0ca3"],
    title="Age vs Max Heart Rate (Thalach) with Trendline"
)
fig.update_layout(legend_title_text="Target (0=No, 1=Yes)")
fig


Note: you may need to restart the kernel to use updated packages.


Count plots (sex and chest pain type)

In [30]:
# Sex by target
fig_sex = px.histogram(
    df,
    x="sex",
    color="target",
    barmode="group",
    color_discrete_sequence=["#f72585", "#3a0ca3"],
    title="Sex vs Target (Grouped Counts)"
)
fig_sex.update_xaxes(
    tickmode="array",
    tickvals=[0, 1],
    ticktext=["Female(0)", "Male(1)"]
)
# Only the last expression of a cell is rendered automatically, so a bare
# `fig_sex` in the middle of the cell would be silently dropped; show it
# explicitly.
fig_sex.show()

# Chest pain type by target
fig_cp = px.histogram(
    df,
    x="cp",
    color="target",
    barmode="group",
    color_discrete_sequence=["#f72585", "#3a0ca3"],
    title="Chest Pain Type vs Target (Grouped Counts)"
)
# At this point `cp` still holds the raw codes 1-4 (the label encoding that
# maps them to 0-3 runs in a later cell), so the tick positions must be 1..4
# for each label to sit under its own bars.
fig_cp.update_xaxes(
    tickmode="array",
    tickvals=[1, 2, 3, 4],
    ticktext=["Typical", "Atypical", "Non-anginal", "Asymptomatic"]
)
fig_cp


In [31]:
# Columns handled as categorical codes rather than continuous measurements.
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# One independent encoder per column, kept so the same mapping can be reused.
encoders = {c: LabelEncoder() for c in cat_cols}
for c, le in encoders.items():
    # Cast to int first so the float-coded categories become integer classes.
    df[c] = le.fit_transform(df[c].astype(int))

df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1,0,145.0,233.0,1,2,150.0,0,2.3,2,0,1,0
1,67.0,1,3,160.0,286.0,0,2,108.0,1,1.5,1,3,0,1
2,67.0,1,3,120.0,229.0,0,2,129.0,1,2.6,1,2,2,1
3,37.0,1,2,130.0,250.0,0,0,187.0,0,3.5,2,0,0,0
4,41.0,0,1,130.0,204.0,0,2,172.0,0,1.4,0,0,0,0


In [32]:
# Separate the predictors from the label and remember the predictor order.
X = df.drop(columns='target')
y = df['target']
feature_names = list(X.columns)

# Fall back to 42 when this cell is run without the setup cell that defines SEED.
SEED = globals().get('SEED', 42)

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# The scaler is fitted on the training rows only and then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)
X_train_sc.shape, X_test_sc.shape


((242, 13), (61, 13))

In [33]:
# Bundle the fitted scaler, the per-column encoders and the list of encoded
# columns into one object, then write it and the feature order to disk.
preprocessor = dict(scaler=scaler, encoders=encoders, cat_cols=cat_cols)
joblib.dump(preprocessor, "models/preprocessor.pkl")
joblib.dump(feature_names, "models/feature_names.pkl")


['models/feature_names.pkl']