 Imports, seed, and data loading/preprocessing

In [7]:
import numpy as np, pandas as pd, joblib, plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from pathlib import Path

# Seed shared by the train/test split and by every seeded estimator below.
SEED = 42

# Location of the persisted preprocessing bundle (a dict of fitted objects).
MODELS_DIR = Path("models")
PREPROCESSOR_PATH = MODELS_DIR / "preprocessor.pkl"

# Load and prepare dataset (Cleveland schema).
# The file is read without a header row; the 14 column names are assigned here.
df = pd.read_csv("heart.csv", header=None)
df.columns = ['age','sex','cp','trestbps','chol','fbs','restecg',
              'thalach','exang','oldpeak','slope','ca','thal','target']

# Clean and binarize target:
#   * "?" placeholders become NaN, and anything non-numeric is coerced to NaN
#   * any target value > 0 is collapsed to 1, everything else to 0
#   * remaining NaNs are filled with the per-column median
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split below - confirm this is acceptable for the evaluation.
df = df.replace("?", np.nan)
df = df.apply(pd.to_numeric, errors="coerce")
df['target'] = (df['target'] > 0).astype(int)
df = df.fillna(df.median(numeric_only=True))

# Encode categoricals (consistent with Notebook 1)
# NOTE(review): a fresh LabelEncoder is fitted per column and then discarded;
# only the scaler and the column list are persisted below. Confirm that
# whatever consumes preprocessor.pkl encodes these columns the same way.
cat_cols = ['sex','cp','fbs','restecg','exang','slope','ca','thal']
for c in cat_cols:
    le = LabelEncoder()
    df[c] = le.fit_transform(df[c].astype(int))

# Split and scale: the scaler is fitted on the training rows only and then
# applied unchanged to the held-out rows.
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc  = scaler.transform(X_test)

# Save/overwrite preprocessor to ensure alignment with this notebook run.
# An existing bundle is updated in place so any other keys it holds are kept;
# if none exists yet (fresh checkout) a new one is started instead of failing
# with FileNotFoundError.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
pre = joblib.load(PREPROCESSOR_PATH) if PREPROCESSOR_PATH.exists() else {}
pre['scaler'] = scaler
pre['cat_cols'] = cat_cols
joblib.dump(pre, PREPROCESSOR_PATH)

X_train_sc.shape, X_test_sc.shape, y_train.shape, y_test.shape


((242, 13), (61, 13), (242,), (61,))

Define models and parameter grids

In [8]:
# One row per candidate: (display name, unfitted estimator, grid of
# hyper-parameters to search). `models` and `param_grids` are both derived
# from this list so their keys and ordering always agree.
model_specs = [
    (
        'Logistic Regression',
        LogisticRegression(max_iter=200, solver='liblinear', random_state=SEED),
        {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
    ),
    (
        'SVM',
        # probability=True so the fitted model exposes predict_proba.
        SVC(probability=True, random_state=SEED),
        {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear'], 'gamma': ['scale', 'auto']},
    ),
    (
        'KNN',
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7, 11], 'weights': ['uniform', 'distance']},
    ),
    (
        'Random Forest',
        RandomForestClassifier(random_state=SEED),
        {'n_estimators': [100, 200], 'max_depth': [None, 5, 10]},
    ),
]

models = {label: estimator for label, estimator, _grid in model_specs}
param_grids = {label: grid_values for label, _estimator, grid_values in model_specs}


 Train with 5-fold cross-validation (GridSearchCV)

In [9]:
# Tune every candidate with an exhaustive 5-fold grid search scored on
# accuracy, keeping the refitted best estimator and its winning parameters.
best_models, best_params = {}, {}
for name, model in models.items():
    print(f"Training {name} ...")
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    grid.fit(X_train_sc, y_train)
    best_models[name], best_params[name] = grid.best_estimator_, grid.best_params_
    print(f"Best params for {name}: {grid.best_params_}")


Training Logistic Regression ...
Best params for Logistic Regression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Training SVM ...
Best params for SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Training KNN ...
Best params for KNN: {'n_neighbors': 5, 'weights': 'uniform'}
Training Random Forest ...
Best params for Random Forest: {'max_depth': 5, 'n_estimators': 200}


Evaluate baselines (no PCA) and display metrics

In [10]:
# Metrics computed from hard class predictions; ROC AUC is handled separately
# because it needs the positive-class probability instead.
label_metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1_score': f1_score,
}

scores = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test_sc)
    y_prob = model.predict_proba(X_test_sc)[:, 1]  # column 1 = positive class

    row = {metric: fn(y_test, y_pred) for metric, fn in label_metrics.items()}
    row['roc_auc'] = roc_auc_score(y_test, y_prob)
    row['best_params'] = best_params[name]
    scores[name] = row

# One row per model, one column per metric.
pd.DataFrame(scores).T


Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,best_params
Logistic Regression,0.868852,0.8125,0.928571,0.866667,0.957792,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline..."
SVM,0.885246,0.862069,0.892857,0.877193,0.950216,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}"
KNN,0.901639,0.823529,1.0,0.903226,0.924242,"{'n_neighbors': 5, 'weights': 'uniform'}"
Random Forest,0.901639,0.866667,0.928571,0.896552,0.95671,"{'max_depth': 5, 'n_estimators': 200}"


 Plot confusion matrices

In [11]:
# Both axes share the same categorical ticks: index 0 -> "No Disease",
# index 1 -> "Disease".
tick_spec = dict(tickmode="array", tickvals=[0,1], ticktext=["No Disease","Disease"])
axis_titles = dict(x="Predicted", y="Actual", color="Count")

# One heatmap per tuned model, computed on the held-out test split.
for name, model in best_models.items():
    y_pred = model.predict(X_test_sc)
    cm = confusion_matrix(y_test, y_pred)
    fig = px.imshow(
        cm,
        labels=axis_titles,
        title=f"{name} - Confusion Matrix",
        color_continuous_scale='RdPu',
        text_auto=True,
    )
    fig.update_xaxes(**tick_spec)
    fig.update_yaxes(**tick_spec)
    fig.show()


In [14]:
# Destination file for each tuned estimator, keyed by its display name.
model_artifacts = {
    'Logistic Regression': "models/logistic_regression_model.pkl",
    'SVM': "models/svm_model.pkl",
    'KNN': "models/knn_model.pkl",
    'Random Forest': "models/random_forest_model.pkl",
}
for model_name, artifact_path in model_artifacts.items():
    joblib.dump(best_models[model_name], artifact_path)

# Persist the evaluation metrics alongside the models; left as the final
# expression so the written path is echoed as the cell output.
joblib.dump(scores, "models/model_scores.pkl")


['models/model_scores.pkl']