In [6]:
import pandas as pd
from pathlib import Path
import numpy as np
import sys
# Project root is the parent of the current working directory
# (presumably the notebook is launched from the notebooks/ folder -- confirm).
PROJECT_ROOT = Path("..").resolve()

# Make the project's src/ folder importable. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
SRC_DIR = str(PROJECT_ROOT / 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
from preprocess import load_and_preprocess , DEFAULT_TARGET_COL

In [7]:
# Location of the raw dataset inside the project tree.
data_path = PROJECT_ROOT.joinpath("data", "heart.csv")

# load_and_preprocess returns four values, unpacked here as the full frame,
# the feature matrix, the target, and the list of feature column names
# (meaning inferred from the variable names -- verify against src/preprocess).
df, X, y, features_cols = load_and_preprocess(
    data_path,
    target_col=DEFAULT_TARGET_COL,
)

# Quick sanity checks: dimensions and the relative frequency of each class.
print("X SHAPE:", X.shape)
print("Y SHAPE:", y.shape)
print("CLASS BALANCE:", y.value_counts(normalize=True).to_dict())

X SHAPE: (303, 16)
Y SHAPE: (303,)
CLASS BALANCE: {1: 0.5445544554455446, 0: 0.45544554455445546}


In [8]:
from sklearn.model_selection import StratifiedKFold
# Shared 5-fold stratified splitter; shuffling with a fixed seed so every
# model evaluated with `cv` sees the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier ,ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def _scaled(estimator):
    """Return a pipeline that standardizes the features, then fits ``estimator``.

    The steps are named 'scaler' and 'model'. Keeping the scaler inside the
    pipeline means it is fitted together with the estimator on whatever data
    the pipeline is fitted on.
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', estimator),
    ])


# Candidate classifiers keyed by display name.
# LogReg, KNN and SVC are wrapped with a StandardScaler step; the tree-based
# models are used directly on the unscaled features.
models = {
    'LogReg': _scaled(LogisticRegression(max_iter=2000)),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=300, random_state=42),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=300, random_state=42),
    'HistGB': HistGradientBoostingClassifier(random_state=42),
    'KNN': _scaled(KNeighborsClassifier(n_neighbors=15)),
    'SVC_RBF': _scaled(SVC(kernel='rbf', probability=True, random_state=42)),
}

In [12]:
# Cross-validated scoring loop (Accuracy + ROC-AUC)
# cross_val_score stays imported in case other cells rely on it.
from sklearn.model_selection import cross_val_score, cross_validate

# Evaluate every candidate on the shared `cv` splitter. cross_validate computes
# both metrics from a single fit per fold, instead of refitting each model once
# per metric as two separate cross_val_score calls would.
results = []
for name, model in models.items():
    fold_scores = cross_validate(
        model, X, y, cv=cv, scoring=['accuracy', 'roc_auc']
    )
    acc_scores = fold_scores['test_accuracy']
    auc_scores = fold_scores['test_roc_auc']

    # One summary row per model: mean and spread of each metric across folds.
    results.append({
        'model': name,
        'acc_mean': acc_scores.mean(),
        'acc_std': acc_scores.std(),
        'auc_mean': auc_scores.mean(),
        'auc_std': auc_scores.std(),
    })

# Build the summary table once, after all models are scored. Previously this
# ran inside the loop, rebuilding the frame on every iteration and leaving the
# trailing expression nested so the table was never displayed.
results_df = pd.DataFrame(results).sort_values(by='auc_mean', ascending=False)
results_df

In [14]:
# Write the cross-validation summary table to the project's notebooks/ folder
# as CSV (without the index column) and report where it was saved.
out_path = PROJECT_ROOT.joinpath('notebooks', 'model_comparison_cv_results.csv')
results_df.to_csv(out_path, index=False)
print('saved:', out_path)

saved: C:\Users\asult\Downloads\opengeo_ready\ml_week1_project\notebooks\model_comparison_cv_results.csv
