In [7]:
# =========================================
# HEART DISEASE PREDICTION – MODEL TRAINING
# =========================================

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

# --------------------------------------------------
# Step 1 - read the cleaned dataset into a DataFrame
# --------------------------------------------------

# CSV produced by the EDA step; the path is relative to the notebook.
data_path = "../artifacts/data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

print("Data Loaded Successfully!")
# Preview the first five rows as the cell's displayed result.
df.head(n=5)



Data Loaded Successfully!


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,67,0,1,170,284,0,1,142,0,5.3,0,3,1,1
1,57,1,2,185,553,1,2,76,0,3.0,1,0,1,1
2,43,0,0,193,339,0,0,189,0,5.4,1,2,0,1
3,71,0,2,178,519,0,1,181,1,1.5,0,1,1,1
4,36,1,1,155,530,0,2,182,0,3.3,1,3,2,1


In [8]:
# Separate the label column ("target") from the predictor columns.
y = df["target"]
X = df.drop(columns=["target"])

# Quick sanity check on the resulting dimensions.
print("X Shape:", X.shape)
print("y Shape:", y.shape)


X Shape: (15000, 13)
y Shape: (15000,)


In [9]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions aligned between the two partitions, and the fixed seed makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Report the dimensions of each feature partition.
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (12000, 13)
Test size: (3000, 13)


In [10]:
# Write every partition of the split to disk for reproducibility.
# Features and labels go to separate CSV files, all without the index column.
os.makedirs("../artifacts", exist_ok=True)

for _split, _csv_file in (
    (X_train, "../artifacts/train.csv"),
    (X_test, "../artifacts/test.csv"),
    (y_train, "../artifacts/y_train.csv"),
    (y_test, "../artifacts/y_test.csv"),
):
    _split.to_csv(_csv_file, index=False)

print("Train/Test files saved inside artifacts folder!")


Train/Test files saved inside artifacts folder!


In [11]:
# Every column of the training features is routed through the scaler.
# NOTE(review): this includes integer-coded columns such as sex/cp/thal;
# confirm that treating them as numeric is intended.
numeric_features = X_train.columns.to_list()

# Single-branch transformer: standardise the listed columns.
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), numeric_features)],
)


In [12]:
# Baseline model: scale the features, then classify with logistic regression.
# max_iter is raised to 1000 iterations for the solver.
# Pipeline.fit returns the pipeline itself, so construction and training
# are chained into one expression.
log_reg_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)

# Score the held-out partition.
y_pred_lr = log_reg_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.9796666666666667
              precision    recall  f1-score   support

           0       0.77      0.71      0.74       120
           1       0.99      0.99      0.99      2880

    accuracy                           0.98      3000
   macro avg       0.88      0.85      0.86      3000
weighted avg       0.98      0.98      0.98      3000



In [13]:
# Second candidate: the same preprocessing step followed by a random forest
# of 200 trees, seeded so the fitted ensemble is repeatable.
rf_model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42, n_estimators=200)),
    ]
)

# Train on the training partition, then predict the held-out rows.
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("RandomForest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


RandomForest Accuracy: 0.9993333333333333
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       120
           1       1.00      1.00      1.00      2880

    accuracy                           1.00      3000
   macro avg       1.00      0.99      1.00      3000
weighted avg       1.00      1.00      1.00      3000



In [14]:
# Keep the random-forest pipeline as the final model. In the recorded run of
# this notebook it reached 0.9993 test accuracy versus 0.9797 for logistic
# regression, and class-0 recall of 0.98 versus 0.71.
# NOTE(review): the test set is heavily imbalanced (120 vs 2880 rows), so
# accuracy alone is a weak selection criterion, and a near-perfect score is
# worth double-checking for leakage before relying on this model.
best_model = rf_model

model_path = "../artifacts/model.pkl"
preprocessor_path = "../artifacts/preprocessor.pkl"

# Save Model (the full pipeline: preprocessing step + classifier)
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)

# Save Preprocessor separately.
# Take it from the chosen pipeline so the pickled object is guaranteed to be
# the fitted transformer that the saved model actually uses.
# model.pkl already embeds this step, so inputs to the loaded pipeline must be
# raw features; do not apply this preprocessor first and then the pipeline.
with open(preprocessor_path, "wb") as f:
    pickle.dump(best_model.named_steps["preprocess"], f)

print("Model and Preprocessor Saved Successfully!")
print("Model:", model_path)
print("Preprocessor:", preprocessor_path)


Model and Preprocessor Saved Successfully!
Model: ../artifacts/model.pkl
Preprocessor: ../artifacts/preprocessor.pkl
