In [10]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [11]:
here = Path().resolve()
PROJECT_ROOT = next(p for p in [here] + list(here.parents) if (p/"data"/"heart.csv").exists())
sys.path.append(str(PROJECT_ROOT/'src'))
from preprocess import load_and_preprocess,DEFAULT_TARGET_COL
df,X,y,feature_cols = load_and_preprocess(PROJECT_ROOT/'data'/'heart.csv',target_col=DEFAULT_TARGET_COL)

In [12]:
X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.2,random_state=42,stratify=y
)

In [13]:
from sklearn.ensemble import HistGradientBoostingClassifier
model = HistGradientBoostingClassifier(random_state=42)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

In [14]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
print('confusion_matrix\n',cm)
analysis=X_test.copy()
analysis['y_true'] = y_test
analysis['y_pred'] = y_pred

fp = analysis[(analysis['y_true'] == 0) & (analysis['y_pred'] == 1)]
fn = analysis[(analysis['y_true'] == 1) & (analysis['y_pred'] == 0)]

print('FP count:', len(fp))
print('FN count:', len(fn))


confusion_matrix
 [[19  9]
 [ 3 30]]
FP count: 9
FN count: 3


In [15]:
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
summary = pd.DataFrame({
    'overall_mean':X_test[numeric_cols].mean(),
    'fp_mean':fp[numeric_cols].mean(),
    'fn_mean':fn[numeric_cols].mean(),
})
summary['fp_minus_overall']=summary['fp_mean']-summary['overall_mean']
summary['fn_minus_overall'] = summary['fn_mean'] - summary['overall_mean']
summary.sort_values(by='fn_minus_overall',ascending=False).head(15)

Unnamed: 0,overall_mean,fp_mean,fn_mean,fp_minus_overall,fn_minus_overall
chol,251.737705,212.666667,277.666667,-39.071038,25.928962
trestbps,131.295082,131.888889,134.0,0.593807,2.704918
thal,2.213115,2.111111,3.0,-0.102004,0.786885
thalach,147.754098,146.666667,148.333333,-1.087432,0.579235
slope,1.344262,1.222222,1.666667,-0.12204,0.322404
sex,0.688525,0.888889,1.0,0.200364,0.311475
cp,1.065574,1.333333,1.333333,0.26776,0.26776
restecg,0.459016,0.555556,0.666667,0.096539,0.20765
fbs,0.180328,0.222222,0.333333,0.041894,0.153005
exang,0.278689,0.111111,0.333333,-0.167577,0.054645


In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, digits=3))


              precision    recall  f1-score   support

           0      0.864     0.679     0.760        28
           1      0.769     0.909     0.833        33

    accuracy                          0.803        61
   macro avg      0.816     0.794     0.797        61
weighted avg      0.813     0.803     0.800        61



In [17]:
y_proba = None
if hasattr(model, "predict_proba"):
    y_proba = model.predict_proba(X_test)[:, 1]
    analysis["proba_1"] = y_proba


In [22]:
# 1) add probabilities first
if hasattr(model, "predict_proba"):
    analysis["proba_1"] = model.predict_proba(X_test)[:, 1]

# 2) now re-create FP and FN AFTER proba_1 exists
fp = analysis[(analysis["y_true"] == 0) & (analysis["y_pred"] == 1)]
fn = analysis[(analysis["y_true"] == 1) & (analysis["y_pred"] == 0)]

print("FP count:", len(fp))
print("FN count:", len(fn))

# 3) now sorting works
if "proba_1" in analysis.columns:
    print("Top FP (highest proba_1 but y_true=0):")
    display(fp.sort_values("proba_1", ascending=False).head(10))

    print("Top FN (lowest proba_1 but y_true=1):")
display(
    analysis[(analysis["y_true"]==0) & (analysis["y_pred"]==1)]
    .sort_values("proba_1", ascending=False).head(10)
)


FP count: 9
FN count: 3
Top FP (highest proba_1 but y_true=0):


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,age_group_41-50,age_group_51-60,age_group_>60,y_true,y_pred,proba_1
293,67,1,2,152,212,0,0,150,0,0.8,1,0,3,False,False,True,0,1,0.992936
281,52,1,0,128,204,1,1,156,1,1.0,1,0,0,False,True,False,0,1,0.973689
286,59,1,3,134,204,0,1,162,0,0.8,2,2,2,False,True,False,0,1,0.971164
194,60,1,2,140,185,0,0,155,0,3.0,1,0,2,False,True,False,0,1,0.951471
188,50,1,2,140,233,0,1,163,0,0.6,1,1,3,True,False,False,0,1,0.897878
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2,False,False,True,0,1,0.809294
267,49,1,2,118,149,0,0,126,0,0.8,2,3,2,True,False,False,0,1,0.772946
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,False,True,False,0,1,0.699052
197,67,1,0,125,254,1,1,163,0,0.2,1,2,3,False,False,True,0,1,0.500352


Top FN (lowest proba_1 but y_true=1):


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,age_group_41-50,age_group_51-60,age_group_>60,y_true,y_pred,proba_1
293,67,1,2,152,212,0,0,150,0,0.8,1,0,3,False,False,True,0,1,0.992936
281,52,1,0,128,204,1,1,156,1,1.0,1,0,0,False,True,False,0,1,0.973689
286,59,1,3,134,204,0,1,162,0,0.8,2,2,2,False,True,False,0,1,0.971164
194,60,1,2,140,185,0,0,155,0,3.0,1,0,2,False,True,False,0,1,0.951471
188,50,1,2,140,233,0,1,163,0,0.6,1,1,3,True,False,False,0,1,0.897878
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2,False,False,True,0,1,0.809294
267,49,1,2,118,149,0,0,126,0,0.8,2,3,2,True,False,False,0,1,0.772946
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,False,True,False,0,1,0.699052
197,67,1,0,125,254,1,1,163,0,0.2,1,2,3,False,False,True,0,1,0.500352


In [23]:
from sklearn.metrics import precision_recall_fscore_support

def eval_threshold(t):
    y_hat = (analysis["proba_1"] >= t).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(
        analysis["y_true"], y_hat, average="binary", zero_division=0
    )
    return p, r, f1

for t in [0.3, 0.4, 0.5, 0.6, 0.7]:
    p, r, f1 = eval_threshold(t)
    print(f"t={t:.1f}  precision={p:.3f}  recall={r:.3f}  f1={f1:.3f}")


t=0.3  precision=0.744  recall=0.970  f1=0.842
t=0.4  precision=0.738  recall=0.939  f1=0.827
t=0.5  precision=0.769  recall=0.909  f1=0.833
t=0.6  precision=0.784  recall=0.879  f1=0.829
t=0.7  precision=0.794  recall=0.818  f1=0.806
