In [1]:

import sklearn
import numpy as np
import pandas as pd
import joblib
from packaging import version
from preprocessing import clean_data  # make sure this exists and is stable
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve

df = pd.read_csv('HeartDiseaseTrain-Test.csv')
print(df.shape)
df.head()

(1025, 14)


Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
0,52,Male,Typical angina,125,212,Lower than 120 mg/ml,ST-T wave abnormality,168,No,1.0,Downsloping,Two,Reversable Defect,0
1,53,Male,Typical angina,140,203,Greater than 120 mg/ml,Normal,155,Yes,3.1,Upsloping,Zero,Reversable Defect,0
2,70,Male,Typical angina,145,174,Lower than 120 mg/ml,ST-T wave abnormality,125,Yes,2.6,Upsloping,Zero,Reversable Defect,0
3,61,Male,Typical angina,148,203,Lower than 120 mg/ml,ST-T wave abnormality,161,No,0.0,Downsloping,One,Reversable Defect,0
4,62,Female,Typical angina,138,294,Greater than 120 mg/ml,ST-T wave abnormality,106,No,1.9,Flat,Three,Fixed Defect,0


In [2]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1025 non-null   int64  
 1   sex                            1025 non-null   object 
 2   chest_pain_type                1025 non-null   object 
 3   resting_blood_pressure         1025 non-null   int64  
 4   cholestoral                    1025 non-null   int64  
 5   fasting_blood_sugar            1025 non-null   object 
 6   rest_ecg                       1025 non-null   object 
 7   Max_heart_rate                 1025 non-null   int64  
 8   exercise_induced_angina        1025 non-null   object 
 9   oldpeak                        1025 non-null   float64
 10  slope                          1025 non-null   object 
 11  vessels_colored_by_flourosopy  1025 non-null   object 
 12  thalassemia                    1025 non-null   o

In [3]:
df.describe()
df['target'].value_counts()


target
1    526
0    499
Name: count, dtype: int64

In [4]:


# Drop rows missing the target
if "target" not in df.columns:
    raise RuntimeError("No 'target' column found in dataset.")
df = df.dropna(subset=["target"]).reset_index(drop=True)
df = df.drop_duplicates().reset_index(drop=True)
print(f"After cleaning: {df.shape}")


df = clean_data(df)
df['target'] = df['target'].astype(int)

# Prepare features and target
X = df.drop(columns='target')
y = df['target']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

After cleaning: (302, 14)
Train shape: (241, 13), Test shape: (61, 13)


In [5]:
# cleaned = clean_data(df)
# print("After cleaning:", cleaned.shape)
# print(cleaned.columns.tolist())
# cleaned.head()

# Define feature columns
numeric_features = ['age', 'resting_blood_pressure', 'cholestoral', 'Max_heart_rate', 'oldpeak', 'vessels_colored_by_flourosopy']
binary_features = ['sex', 'fasting_blood_sugar', 'exercise_induced_angina']
categorical_features = ['chest_pain_type', 'rest_ecg', 'slope', 'thalassemia']

# Validate columns
all_expected = numeric_features + binary_features + categorical_features
missing_cols = [c for c in all_expected if c not in X_train.columns]
if missing_cols:
    raise RuntimeError(f"Missing columns: {missing_cols}")



In [6]:
# Preprocessing pipelines
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Handle sklearn compatibility
if version.parse(sklearn.__version__) >= version.parse("1.2"):
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
else:
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('bin', binary_transformer, binary_features),
    ('cat', categorical_transformer, categorical_features)
])

# Model pipelines
pipe_rf_balanced = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(
        n_estimators=150,
        max_depth=10,
        min_samples_split=5,
        class_weight='balanced',
        random_state=42
    ))
])

pipe_rf_fn = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(
        n_estimators=200,
        max_depth=8,
        class_weight={0: 1, 1: 3},
        random_state=42
    ))
])

pipe_lr = Pipeline([
    ('pre', preprocessor),
    ('clf', LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    ))
])

pipe_gb = Pipeline([
    ('pre', preprocessor),
    ('clf', GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    ))
])

# Fit models
print("Fitting models...")
pipe_rf_balanced.fit(X_train, y_train)
pipe_rf_fn.fit(X_train, y_train)
pipe_lr.fit(X_train, y_train)
pipe_gb.fit(X_train, y_train)

# Create ensemble
ensemble = VotingClassifier(
    estimators=[
        ('rf_balanced', pipe_rf_balanced),
        ('rf_fn', pipe_rf_fn),
        ('lr', pipe_lr),
        ('gb', pipe_gb)
    ],
    voting='soft',
    weights=[1, 1.5, 1, 1]
)

ensemble.fit(X_train, y_train)
print("✓ All models fitted successfully")

Fitting models...
✓ All models fitted successfully


In [7]:
# Threshold optimization
y_proba = ensemble.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-12)
# optimal_threshold = thresholds[np.nanargmax(f1_scores)] if len(f1_scores) > 0 else 0.5

# Compute F1 for thresholds
f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-12)
if len(f1_scores) == 0:
    optimal_threshold = 0.5
else:
    opt_idx = int(np.nanargmax(f1_scores))
    optimal_threshold = thresholds[opt_idx]

y_pred_opt = (y_proba >= optimal_threshold).astype(int)
acc_opt = accuracy_score(y_test, y_pred_opt)
cm = confusion_matrix(y_test, y_pred_opt)
tn, fp, fn, tp = cm.ravel()

# print(f"\nOptimal threshold: {optimal_threshold:.3f}")
# print(f"Accuracy: {acc_opt:.3f}")
# print(f"Confusion matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

print("\n=== Initial Evaluation (F1-based threshold) ===")
print(f"Optimal threshold (F1-based): {optimal_threshold:.3f}")
print(f"Accuracy @ optimal: {acc_opt:.3f}")
print(f"Confusion matrix (tn, fp, fn, tp): {tn, fp, fn, tp}")
from sklearn.metrics import confusion_matrix

def find_cost_optimal_threshold(y_true, y_proba, fn_cost=5, fp_cost=1):
    """Find threshold that minimizes custom cost function"""
    thresholds = np.arange(0.1, 0.5, 0.01)
    best_threshold = 0.5
    best_cost = float('inf')
    results = []
    
    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        
        # Custom cost: FN are more expensive than FP
        cost = (fn * fn_cost) + (fp * fp_cost)
        
        results.append({
            'threshold': threshold,
            'cost': cost,
            'fn': fn,
            'fp': fp,
            'accuracy': accuracy_score(y_true, y_pred)
        })
        
        if cost < best_cost:
            best_cost = cost
            best_threshold = threshold
    
    # Create results dataframe for analysis
    results_df = pd.DataFrame(results)
    return best_threshold, best_cost, results_df

print("\n" + "="*60)
print("COST-SENSITIVE THRESHOLD OPTIMIZATION")
print("="*60)

# Find optimal threshold with cost sensitivity
cost_optimal_threshold, best_cost, results_df = find_cost_optimal_threshold(
    y_test, y_proba, fn_cost=3, fp_cost=1  # FN are 3x more costly than FP
)

print(f"Cost-optimal threshold: {cost_optimal_threshold:.3f}")
print(f"Best cost: {best_cost}")

# Apply the new threshold
y_pred_cost = (y_proba >= cost_optimal_threshold).astype(int)
cm_cost = confusion_matrix(y_test, y_pred_cost)
tn_c, fp_c, fn_c, tp_c = cm_cost.ravel()

print(f"New CM - TN:{tn_c}, FP:{fp_c}, FN:{fn_c}, TP:{tp_c}")
print(f"New Accuracy: {accuracy_score(y_test, y_pred_cost):.3f}")

# Compare both approaches
print("\n" + "="*60)
print("COMPARISON: F1-based vs Cost-sensitive Threshold")
print("="*60)
print(f"{'Metric':<15} {'F1-based':<12} {'Cost-sensitive':<15} {'Change':<10}")
print(f"{'-'*60}")
print(f"{'Threshold':<15} {optimal_threshold:<12.3f} {cost_optimal_threshold:<15.3f} {cost_optimal_threshold-optimal_threshold:+.3f}")
print(f"{'Accuracy':<15} {acc_opt:<12.3f} {accuracy_score(y_test, y_pred_cost):<15.3f} {accuracy_score(y_test, y_pred_cost)-acc_opt:+.3f}")
print(f"{'False Negatives':<15} {fn:<12} {fn_c:<15} {fn_c-fn:+d}")
print(f"{'False Positives':<15} {fp:<12} {fp_c:<15} {fp_c-fp:+d}")
print(f"{'True Positives':<15} {tp:<12} {tp_c:<15} {tp_c-tp:+d}")



=== Initial Evaluation (F1-based threshold) ===
Optimal threshold (F1-based): 0.209
Accuracy @ optimal: 0.803
Confusion matrix (tn, fp, fn, tp): (17, 11, 1, 32)

COST-SENSITIVE THRESHOLD OPTIMIZATION
Cost-optimal threshold: 0.160
Best cost: 14
New CM - TN:17, FP:11, FN:1, TP:32
New Accuracy: 0.803

COMPARISON: F1-based vs Cost-sensitive Threshold
Metric          F1-based     Cost-sensitive  Change    
------------------------------------------------------------
Threshold       0.209        0.160           -0.049
Accuracy        0.803        0.803           +0.000
False Negatives 1            1               +0
False Positives 11           11              +0
True Positives  32           32              +0


In [8]:
# Save model
model_metadata = {
    'model': ensemble,
    'optimal_threshold': float(optimal_threshold),
    'feature_names': X.columns.tolist(),
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

joblib.dump(model_metadata, "heart_disease_model_completed_finally.pkl")
print("✓ Model saved successfully")

# Smoke test
low_risk = pd.DataFrame([{
    "age": 35.0, "sex": "Female", "resting_blood_pressure": 110.0,
    "cholestoral": 180.0, "Max_heart_rate": 150.0, "oldpeak": 0.5,
    "fasting_blood_sugar": "Lower than 120 mg/ml", "exercise_induced_angina": "No",
    "chest_pain_type": "typical_angina", "rest_ecg": "normal",
    "slope": "upsloping", "thalassemia": "normal",
    "vessels_colored_by_flourosopy": "Zero"
}])

low_risk_cleaned = clean_data(low_risk)
prob = ensemble.predict_proba(low_risk_cleaned)[0, 1]
print(f"✓ Smoke-test - Low-risk probability: {prob:.3f}")

print("\n🎉 Training complete!")

✓ Model saved successfully
✓ Smoke-test - Low-risk probability: 0.832

🎉 Training complete!


In [9]:
# Final performance summary
print("\n" + "="*60)
print("FINAL PERFORMANCE SUMMARY")
print("="*60)

# Accuracy, FP, FN for both thresholds
acc_f1 = acc_opt
acc_cost = accuracy_score(y_test, y_pred_cost)

# FP and FN from both confusion matrices
FP_f1, FN_f1 = fp, fn
FP_cost, FN_cost = fp_c, fn_c

print(f"{'Metric':<20} {'F1-based':<15} {'Cost-sensitive':<15}")
print("-" * 60)
print(f"{'Accuracy':<20} {acc_f1:<15.3f} {acc_cost:<15.3f}")
print(f"{'False Positives (FP)':<20} {FP_f1:<15} {FP_cost:<15}")
print(f"{'False Negatives (FN)':<20} {FN_f1:<15} {FN_cost:<15}")

# Optional — identify which performed better
better_model = "F1-based" if acc_f1 >= acc_cost else "Cost-sensitive"
print("\n✅ Best performing model (by accuracy):", better_model)



FINAL PERFORMANCE SUMMARY
Metric               F1-based        Cost-sensitive 
------------------------------------------------------------
Accuracy             0.803           0.803          
False Positives (FP) 11              11             
False Negatives (FN) 1               1              

✅ Best performing model (by accuracy): F1-based
