In [2]:
# ==========================================
# Heart Disease Prediction Model Training
# ==========================================

from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Directory that receives the trained artifacts (relative to the working
# directory the notebook is run from).
MODEL_DIR = Path("../models")

# Load dataset
df = pd.read_csv("../data/raw/heart.csv")

# Report missing values explicitly. A bare `df.isnull().sum()` in the middle
# of a cell is evaluated and thrown away, so it never surfaced anything.
# Nothing is printed when the data is complete.
missing_counts = df.isnull().sum()
if missing_counts.any():
    print("Warning: missing values per column:")
    print(missing_counts[missing_counts > 0])

# Split features from the target column.
X = df.drop("condition", axis=1)
y = df["condition"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): consider `stratify=y` to keep the class ratio identical in
# both splits -- left out here because it would change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale: fit on the training split only, then apply the same transform to the
# test split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Train
model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(X_train_s, y_train)

# Evaluate on the held-out split.
pred = model.predict(X_test_s)

print("Accuracy:", accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

# Save. Create the output directory first so a fresh checkout does not fail
# with FileNotFoundError only after training has already finished.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(model, MODEL_DIR / "heart_disease_model.pkl")
# The scaler is persisted alongside the model: inputs must be transformed
# with this same fitted scaler before calling `model.predict`.
joblib.dump(scaler, MODEL_DIR / "scaler_heart.pkl")

print("Heart disease model saved!")


Accuracy: 0.7166666666666667
[[23  9]
 [ 8 20]]
              precision    recall  f1-score   support

           0       0.74      0.72      0.73        32
           1       0.69      0.71      0.70        28

    accuracy                           0.72        60
   macro avg       0.72      0.72      0.72        60
weighted avg       0.72      0.72      0.72        60

Heart disease model saved!
