In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
End-to-end analytical ML pipeline (binary / multi-class classification).
Code prepared by Dr Abdullahi Chowdhury
Presented by: Dr Abdullahi Chowdhury and Dr Manzurul Islam, Next Generation Artificial Intelligence Research Centre

UPDATED FOR GOOGLE COLAB
------------------------
This version automatically mounts Google Drive and saves inputs/outputs there.

INSTRUCTIONS
------------
1. Run this cell.
2. Allow Google Drive mounting when prompted.
3. The script will create a folder '/content/drive/MyDrive/Analytical_Project_Data'.
4. Upload your dataset (CSV/Excel) to that folder in your Drive.
5. Update the 'dataset_filename' in the main() function at the bottom if needed.
"""

import os
import sys
import warnings
from dataclasses import dataclass, asdict
from typing import Dict, Any, List, Optional, Tuple

import numpy as np
import pandas as pd

# Colab-specific setup for Google Drive.
# NOTE: this block runs at import time. When `google.colab` is importable it
# mounts Drive and creates BASE_PATH if missing; otherwise it falls back to the
# current directory. It defines the module-level names IN_COLAB and BASE_PATH.
try:
    import google.colab
    IN_COLAB = True
    print("[INFO] Google Colab detected. Mounting Drive...")
    from google.colab import drive
    drive.mount('/content/drive')

    # Base folder in Drive used for both the input dataset and the results CSV
    # (main() joins file names onto this path).
    BASE_PATH = "/content/drive/MyDrive/Analytical_Project_Data"

    if not os.path.exists(BASE_PATH):
        # First run: create the folder and tell the user where to upload data.
        os.makedirs(BASE_PATH)
        print(f"[INFO] Created working directory in Drive: {BASE_PATH}")
        print(f"[ACTION REQUIRED] Please upload your dataset to: {BASE_PATH}")
    else:
        print(f"[INFO] Using existing working directory in Drive: {BASE_PATH}")

except ImportError:
    # `google.colab` is not installed, so treat this as a local run.
    IN_COLAB = False
    print("[INFO] Not running in Colab. Using current directory.")
    BASE_PATH = "."

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    OneHotEncoder,
    PolynomialFeatures
)
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_auc_score,
    classification_report
)
from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE

# Baseline models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

# Optional dependency: XGBoost. HAS_XGBOOST records whether the import worked;
# get_ensemble_models() checks it before building an XGBClassifier.
try:
    from xgboost import XGBClassifier
    HAS_XGBOOST = True
except ImportError:  # pragma: no cover - optional dependency
    HAS_XGBOOST = False
    XGBClassifier = None

# NOTE(review): this silences *all* warnings process-wide, including
# convergence and deprecation warnings raised during the grid searches.
warnings.filterwarnings("ignore")


# ============================================================================
# CONFIGURATION
# ============================================================================

@dataclass
class Config:
    """Run-wide settings shared by the loading, preprocessing and training steps."""

    # Path of the dataset to load; main() overrides this default with a path
    # joined onto BASE_PATH.
    file_path: str = "heart.csv"
    sheet_name: Optional[str] = None      # If Excel, e.g. "Sheet1"
    # Name of the label column that split_features_target() separates from the features.
    target_column: str = "target"
    # Passed to train_test_split() as the held-out fraction.
    test_size: float = 0.2
    # Seed handed to the split, the CV splitter, the samplers and the models.
    random_state: int = 42
    # "standard" selects StandardScaler; build_preprocessor() uses MinMaxScaler
    # for any other value.
    scaling_method: str = "standard"      # "standard" or "minmax"
    # Number of StratifiedKFold splits used inside GridSearchCV.
    cv_folds: int = 5
    # Where main() writes the results table; also overridden in main().
    results_csv: str = "experiment_results.csv"
    # How many top-ranked configurations main() prints.
    top_n_to_print: int = 10


# ============================================================================
# DATA LOADING AND BASIC INSPECTION
# ============================================================================

def load_dataset(cfg: Config) -> pd.DataFrame:
    """Load the dataset from a CSV or Excel file, chosen by file extension.

    Args:
        cfg: Run configuration; ``file_path`` and ``sheet_name`` are read.

    Returns:
        The loaded data as a single DataFrame. Its head is printed first.

    Raises:
        FileNotFoundError: If ``cfg.file_path`` does not exist.
        ValueError: If the extension is not ``.csv``, ``.xlsx`` or ``.xls``.
    """
    if not os.path.exists(cfg.file_path):
        raise FileNotFoundError(
            f"Dataset not found at: {cfg.file_path}\n"
            f"If using Colab, ensure you uploaded the file to: {os.path.dirname(cfg.file_path)}"
        )

    ext = os.path.splitext(cfg.file_path)[1].lower()
    if ext == ".csv":
        df = pd.read_csv(cfg.file_path)
    elif ext in (".xlsx", ".xls"):
        # read_excel(sheet_name=None) returns a dict of *all* sheets rather
        # than a DataFrame, and None is the Config default. Fall back to the
        # first sheet so callers always receive a single DataFrame.
        sheet = 0 if cfg.sheet_name is None else cfg.sheet_name
        df = pd.read_excel(cfg.file_path, sheet_name=sheet)
    else:
        raise ValueError(f"Unsupported file extension: {ext}")

    print("\n=== Data Head ===")
    print(df.head())
    return df


def split_features_target(df: pd.DataFrame, target_column: str) -> Tuple[pd.DataFrame, pd.Series]:
    """Split *df* into a feature frame and the target series.

    Returns ``(X, y)`` where ``X`` is *df* without *target_column* and ``y``
    is that column. Raises ``KeyError`` when the column is absent.
    """
    if target_column in df.columns:
        features = df.drop(columns=[target_column])
        labels = df[target_column]
        return features, labels

    available = list(df.columns)
    raise KeyError(f"Target column '{target_column}' not found in dataset. Available columns: {available}")


def summarise_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Report the number and percentage of missing values for every column.

    The table is printed sorted by descending percentage; the returned frame
    keeps the original column order.
    """
    n_rows = len(df)
    na_counts = df.isna().sum()
    na_share = (na_counts / n_rows) * 100

    report = pd.DataFrame(
        {
            "column": df.columns,
            "missing_count": na_counts.values,
            "missing_pct": na_share.values,
        }
    )

    ranked = report.sort_values("missing_pct", ascending=False)
    print("\n=== Missing Values Summary ===")
    print(ranked)
    return report


def detect_preprocessing_needs(df: pd.DataFrame, target_column: str) -> None:
    """Print a short summary of likely preprocessing needs.

    Two lists are reported for every column except *target_column*:
    text columns whose values mostly look numeric (candidates for casting),
    and numeric columns (candidates for scaling).

    Args:
        df: The full dataset, including the target column.
        target_column: Column to leave out of the inspection.
    """
    print("\n=== Preprocessing Needs Summary ===")
    dtype_checks = pd.api.types
    numeric_like_str_cols = []
    scaling_candidates = []

    for col, dtype in df.dtypes.items():
        if col == target_column:
            continue

        # Use the pandas dtype predicates: np.issubdtype() raises TypeError on
        # extension dtypes (category, nullable Int64, string), and comparing
        # against "object" misses dedicated string dtypes.
        if dtype_checks.is_object_dtype(dtype) or dtype_checks.is_string_dtype(dtype):
            # Crude heuristic on the first 50 non-null values: flag the column
            # if more than 70% of them parse as a plain decimal number.
            sample = df[col].dropna().astype(str).head(50)
            if len(sample) > 0:
                numeric_like = sample.str.match(r"^-?\d+(\.\d+)?$").mean()
                if numeric_like > 0.7:
                    numeric_like_str_cols.append(col)
        elif dtype_checks.is_numeric_dtype(dtype) and not dtype_checks.is_bool_dtype(dtype):
            # Booleans count as numeric for pandas but are not scaling targets.
            scaling_candidates.append(col)

    print(f"Object columns that appear numeric and may need casting: {numeric_like_str_cols}")
    print(f"Numeric columns that may benefit from scaling: {scaling_candidates}")


# ============================================================================
# PREPROCESSING AND FEATURE ENGINEERING
# ============================================================================

def get_feature_types(X: pd.DataFrame) -> Tuple[List[str], List[str]]:
    """Partition the columns of *X* into numeric and non-numeric names.

    Both lists keep the column order of *X*; anything that is not a NumPy
    numeric dtype is treated as categorical.
    """
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    numeric_lookup = set(numeric_features)
    categorical_features = [name for name in X.columns if name not in numeric_lookup]
    return numeric_features, categorical_features


def build_preprocessor(
    X: pd.DataFrame,
    cfg: Config,
    feature_engineering: str = "none"
) -> ColumnTransformer:
    """
    Assemble the preprocessing ColumnTransformer for the columns of X.

    Numeric columns are median-imputed and scaled (StandardScaler when
    cfg.scaling_method is "standard", MinMaxScaler otherwise); categorical
    columns are mode-imputed and one-hot encoded.

    feature_engineering selects an extra numeric step:
    - "poly": degree-2 polynomial features, then scaling
    - "interactions": degree-2 interaction-only features, then scaling
    - "pca": scaling, then PCA keeping 95% of the variance
    - anything else (e.g. "none"): scaling only
    """
    numeric_features, categorical_features = get_feature_types(X)

    scaler = StandardScaler() if cfg.scaling_method == "standard" else MinMaxScaler()

    numeric_steps = [("imputer", SimpleImputer(strategy="median"))]
    if feature_engineering in ("poly", "interactions"):
        # Expand first, then scale the expanded feature set.
        expansion = PolynomialFeatures(
            degree=2,
            include_bias=False,
            interaction_only=(feature_engineering == "interactions"),
        )
        numeric_steps += [("poly", expansion), ("scaler", scaler)]
    elif feature_engineering == "pca":
        # Impute -> scale -> PCA
        reducer = PCA(n_components=0.95, random_state=cfg.random_state)
        numeric_steps += [("scaler", scaler), ("pca", reducer)]
    else:  # "none"
        numeric_steps += [("scaler", scaler)]

    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", SkPipeline(steps=numeric_steps), numeric_features),
            ("cat", SkPipeline(steps=categorical_steps), categorical_features),
        ]
    )


# ============================================================================
# CLASS IMBALANCE AND OVERSAMPLING
# ============================================================================

def compute_class_balance(y: pd.Series) -> pd.Series:
    """Print the per-class counts of *y* and return them."""
    distribution = y.value_counts()
    print("\n=== Target Class Distribution ===")
    print(distribution)
    return distribution


def is_imbalanced(y: pd.Series, threshold: float = 0.3) -> bool:
    """
    Tell whether the target is imbalanced.

    The data counts as imbalanced when the rarest class accounts for a
    smaller share of the samples than *threshold*.
    """
    smallest_share = y.value_counts(normalize=True).min()
    return smallest_share < threshold


def get_oversamplers(cfg: Config) -> Dict[str, Any]:
    """Map each oversampler name to a seeded instance ("none" maps to None)."""
    seed = cfg.random_state
    return {
        "none": None,
        "SMOTE": SMOTE(random_state=seed),
        "ADASYN": ADASYN(random_state=seed),
        "BorderlineSMOTE": BorderlineSMOTE(random_state=seed, kind="borderline-1"),
    }


def get_oversampler_param_grids() -> Dict[str, Dict[str, List[Any]]]:
    """
    Hyperparameter grids for each oversampler, keyed by oversampler name.

    Keys carry the 'sampler__' prefix so they address the pipeline step
    named "sampler". The "none" entry is an empty grid.
    """
    def _ratios() -> List[Any]:
        # Fresh list per grid so no two entries share a mutable object.
        return [0.5, 0.75, 1.0]

    def _neighbours() -> List[Any]:
        return [3, 5, 7]

    grids: Dict[str, Dict[str, List[Any]]] = {"none": {}}
    grids["SMOTE"] = {
        "sampler__sampling_strategy": _ratios(),
        "sampler__k_neighbors": _neighbours(),
    }
    grids["ADASYN"] = {
        "sampler__sampling_strategy": _ratios(),
        "sampler__n_neighbors": _neighbours(),
    }
    grids["BorderlineSMOTE"] = {
        "sampler__sampling_strategy": _ratios(),
        "sampler__k_neighbors": _neighbours(),
        "sampler__kind": ["borderline-1", "borderline-2"],
    }
    return grids


# ============================================================================
# MODELS AND HYPERPARAMETER GRIDS
# ============================================================================

def get_baseline_models(cfg: Config) -> Dict[str, Tuple[Any, Dict[str, List[Any]]]]:
    """Build the baseline classifiers, each paired with its search grid.

    Grid keys use the 'clf__' prefix so they address the pipeline step named
    "clf". Entries are inserted in the order RandomForest, LogisticRegression,
    DecisionTree, SVM.
    """
    seed = cfg.random_state

    random_forest = (
        RandomForestClassifier(random_state=seed),
        {
            "clf__n_estimators": [100, 200],
            "clf__max_depth": [None, 5, 10],
            "clf__min_samples_split": [2, 5, 10],
            "clf__min_samples_leaf": [1, 2, 4],
            "clf__max_features": ["sqrt", "log2"],
        },
    )

    logistic_regression = (
        # liblinear: good for small-to-medium binary problems
        LogisticRegression(max_iter=1000, solver="liblinear"),
        {
            "clf__C": [0.01, 0.1, 1.0, 10.0],
            "clf__penalty": ["l1", "l2"],
            "clf__class_weight": [None, "balanced"],
            "clf__fit_intercept": [True, False],
        },
    )

    decision_tree = (
        DecisionTreeClassifier(random_state=seed),
        {
            "clf__max_depth": [None, 5, 10, 20],
            "clf__min_samples_split": [2, 5, 10],
            "clf__min_samples_leaf": [1, 2, 4],
            "clf__criterion": ["gini", "entropy"],
            "clf__max_features": [None, "sqrt", "log2"],
        },
    )

    support_vector_machine = (
        # probability=True so predict_proba is available for ROC AUC
        SVC(probability=True, random_state=seed),
        {
            "clf__C": [0.1, 1, 10],
            "clf__kernel": ["linear", "rbf", "poly"],
            "clf__gamma": ["scale", "auto"],
            "clf__degree": [2, 3],
            "clf__class_weight": [None, "balanced"],
        },
    )

    return {
        "RandomForest": random_forest,
        "LogisticRegression": logistic_regression,
        "DecisionTree": decision_tree,
        "SVM": support_vector_machine,
    }


def get_ensemble_models(cfg: Config) -> Dict[str, Tuple[Any, Dict[str, List[Any]]]]:
    """Build the ensemble classifiers, each paired with its search grid.

    Always contains a random forest and gradient boosting; XGBoost is added
    only when HAS_XGBOOST is true, otherwise an info line is printed.
    """
    seed = cfg.random_state

    models = {
        # Random Forest again, but treated explicitly as an ensemble
        "Ensemble_RandomForest": (
            RandomForestClassifier(random_state=seed),
            {
                "clf__n_estimators": [100, 200],
                "clf__max_depth": [None, 5, 10],
                "clf__min_samples_split": [2, 5, 10],
                "clf__min_samples_leaf": [1, 2, 4],
                "clf__max_features": ["sqrt", "log2"],
            },
        ),
        "GradientBoosting": (
            GradientBoostingClassifier(random_state=seed),
            {
                "clf__n_estimators": [100, 200],
                "clf__learning_rate": [0.01, 0.1],
                "clf__max_depth": [3, 5],
                "clf__min_samples_split": [2, 5],
                "clf__min_samples_leaf": [1, 2],
            },
        ),
    }

    if not HAS_XGBOOST:
        print("\n[INFO] xgboost is not installed. XGBoost will be skipped.")
        return models

    booster = XGBClassifier(
        random_state=seed,
        use_label_encoder=False,
        eval_metric="logloss"
    )
    models["XGBoost"] = (
        booster,
        {
            "clf__n_estimators": [100, 200],
            "clf__max_depth": [3, 5, 7],
            "clf__learning_rate": [0.01, 0.1],
            "clf__subsample": [0.8, 1.0],
            "clf__colsample_bytree": [0.8, 1.0],
        },
    )
    return models


def get_stacking_model(cfg: Config) -> Tuple[StackingClassifier, Dict[str, List[Any]]]:
    """Create the stacking classifier together with its search grid.

    Four base learners (random forest, SVM, logistic regression, gradient
    boosting) feed a logistic-regression meta-learner; passthrough=True also
    hands the original features to the meta-learner.
    """
    seed = cfg.random_state

    def _liblinear_lr() -> LogisticRegression:
        # Same settings for the base learner and the meta-learner.
        return LogisticRegression(max_iter=1000, solver="liblinear")

    stack_clf = StackingClassifier(
        estimators=[
            ("rf", RandomForestClassifier(random_state=seed)),
            ("svm", SVC(probability=True, random_state=seed)),
            ("lr", _liblinear_lr()),
            ("gb", GradientBoostingClassifier(random_state=seed)),
        ],
        final_estimator=_liblinear_lr(),
        passthrough=True
    )

    param_grid = {
        "clf__final_estimator__C": [0.1, 1.0, 10.0],
        "clf__final_estimator__penalty": ["l1", "l2"],
        "clf__stack_method": ["auto", "predict_proba"],
        "clf__cv": [3, 5],
        "clf__n_jobs": [None],
    }
    return stack_clf, param_grid


# ============================================================================
# METRICS
# ============================================================================

def infer_positive_label(y: pd.Series) -> Any:
    """
    Pick the label treated as the positive class.

    The least frequent class is returned: the minority class for binary
    targets, and the rarest class for multi-class targets.
    """
    return y.value_counts().idxmin()


def compute_metrics(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_proba: Optional[np.ndarray],
    positive_label: Any
) -> Dict[str, Any]:
    """Compute accuracy, confusion matrix, TPR, FNR, TP/FP/TN/FN and ROC AUC.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_proba: For binary problems, the predicted probability of
            ``positive_label``; for multi-class, the full probability matrix.
            ``None`` skips ROC AUC.
        positive_label: Label treated as the positive class.

    Returns:
        A dict with keys accuracy, tpr, fnr, tp, fp, tn, fn, roc_auc and
        confusion_matrix (nested list). The scalar counts and rates are NaN
        for non-binary targets, and ROC AUC is NaN when it cannot be computed.
    """
    acc = accuracy_score(y_true, y_pred)
    labels = np.unique(y_true)
    label_list = list(labels)

    # Confusion matrix: rows = true, cols = pred, ordered like `labels`.
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Scalar TP/FP/FN/TN are only well defined for binary targets; they stay
    # NaN for multi-class, or if the positive label never occurs in y_true.
    tp = fp = fn = tn = np.nan
    tpr = fnr = np.nan
    is_binary = len(label_list) == 2
    if is_binary and positive_label in label_list:
        pos = label_list.index(positive_label)
        neg = 1 - pos
        tp = cm[pos, pos]
        fn = cm[pos, neg]
        fp = cm[neg, pos]
        tn = cm[neg, neg]
        # Recall and miss rate for the positive class.
        if (tp + fn) > 0:
            tpr = tp / (tp + fn)
            fnr = fn / (tp + fn)

    # ROC AUC
    roc = np.nan
    if y_proba is not None:
        try:
            if is_binary:
                # y_proba is P(positive_label). roc_auc_score treats the
                # *larger* label as positive, so passing raw y_true inverts
                # the score whenever positive_label is the smaller label.
                # Binarise against positive_label to keep the two aligned.
                is_positive = np.asarray(y_true) == positive_label
                roc = roc_auc_score(is_positive, y_proba)
            else:
                # multi-class (one-vs-rest)
                roc = roc_auc_score(y_true, y_proba, multi_class="ovr")
        except Exception as exc:
            # Best effort: one undefined ROC AUC must not abort a long run,
            # but report it instead of swallowing the error silently.
            print(f"[WARN] ROC AUC could not be computed: {exc}")
            roc = np.nan

    return {
        "accuracy": acc,
        "tpr": tpr,
        "fnr": fnr,
        "tp": tp,
        "fp": fp,
        "tn": tn,
        "fn": fn,
        "roc_auc": roc,
        "confusion_matrix": cm.tolist(),  # store as list for CSV/json
    }


# ============================================================================
# TRAINING AND EVALUATION ROUTINES
# ============================================================================

def run_grid_search(
    pipeline,
    param_grid: Dict[str, List[Any]],
    X_train: pd.DataFrame,
    y_train: pd.Series,
    cfg: Config
) -> GridSearchCV:
    """Run GridSearchCV for a given pipeline and hyperparameter grid.

    Uses shuffled, seeded stratified folds (``cfg.cv_folds`` splits) and
    ranks candidates by ROC AUC.

    Args:
        pipeline: Estimator or pipeline to tune.
        param_grid: Grid of parameter names to candidate values.
        X_train: Training features.
        y_train: Training labels.
        cfg: Run configuration; ``cv_folds`` and ``random_state`` are read.

    Returns:
        The fitted GridSearchCV object.
    """
    cv = StratifiedKFold(n_splits=cfg.cv_folds, shuffle=True, random_state=cfg.random_state)

    # The plain "roc_auc" scorer only supports binary targets; with more than
    # two classes the one-vs-rest variant is required.
    n_classes = pd.Series(y_train).nunique()
    scoring = "roc_auc" if n_classes <= 2 else "roc_auc_ovr"

    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=0
    )
    gs.fit(X_train, y_train)
    return gs


def evaluate_and_record(
    results: List[Dict[str, Any]],
    model_name: str,
    model_category: str,
    feature_engineering: str,
    oversampler_name: str,
    grid_search: GridSearchCV,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    positive_label: Any,
) -> None:
    """
    Score the tuned estimator on the held-out data and append one result row.

    The row holds the experiment identifiers, the best parameters found by
    the grid search, and every metric returned by compute_metrics(). The
    `results` list is extended in place; nothing is returned.
    """
    estimator = grid_search.best_estimator_
    tuned_params = grid_search.best_params_

    predictions = estimator.predict(X_test)

    # Probabilities are optional: any failure here just disables ROC AUC.
    scores = None
    try:
        class_probs = estimator.predict_proba(X_test)
        if class_probs.shape[1] != 2:
            # Multi-class: hand over the whole probability matrix.
            scores = class_probs
        else:
            # Binary: keep only the column belonging to the positive class.
            positive_column = list(estimator.classes_).index(positive_label)
            scores = class_probs[:, positive_column]
    except Exception:
        scores = None

    metrics = compute_metrics(y_test, predictions, scores, positive_label)
    results.append(
        {
            "model_name": model_name,
            "model_category": model_category,
            "feature_engineering": feature_engineering,
            "oversampler": oversampler_name,
            "best_params": tuned_params,
            **metrics,
        }
    )


def _make_pipeline(preprocessor, sampler, clf):
    """Chain preprocess -> [sampler] -> clf, using imblearn only when a sampler is given."""
    if sampler is None:
        return SkPipeline(
            steps=[("preprocess", preprocessor), ("clf", clf)]
        )
    return ImbPipeline(
        steps=[("preprocess", preprocessor), ("sampler", sampler), ("clf", clf)]
    )


def _run_experiment_group(
    results: List[Dict[str, Any]],
    cfg: Config,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    feature_engineering_methods: List[str],
    oversampling_to_run: List[str],
    oversamplers: Dict[str, Any],
    oversampler_grids: Dict[str, Dict[str, List[Any]]],
    models: Dict[str, Tuple[Any, Dict[str, List[Any]]]],
    model_category: str,
    log_prefix: str,
    training_tag: Optional[str],
    positive_label: Any,
) -> None:
    """Tune and evaluate every feature-engineering x oversampler x model combination of one group."""
    for fe_method in feature_engineering_methods:
        print(f"\n=== {log_prefix}Feature Engineering: {fe_method} ===")
        preprocessor = build_preprocessor(X_train, cfg, feature_engineering=fe_method)

        for oversampler_name in oversampling_to_run:
            print(f"\n--- {log_prefix}Oversampler: {oversampler_name} ---")
            sampler = oversamplers[oversampler_name]
            sampler_grid = oversampler_grids[oversampler_name]

            for model_name, (clf, param_grid) in models.items():
                if training_tag is not None:
                    print(f"\n{training_tag} Training model: {model_name}")

                pipeline = _make_pipeline(preprocessor, sampler, clf)

                # Merge model grid with sampler grid (sampler keys win on
                # a clash, as with dict.update).
                full_grid = {**param_grid, **sampler_grid}

                gs = run_grid_search(pipeline, full_grid, X_train, y_train, cfg)
                evaluate_and_record(
                    results,
                    model_name=model_name,
                    model_category=model_category,
                    feature_engineering=fe_method,
                    oversampler_name=oversampler_name,
                    grid_search=gs,
                    X_test=X_test,
                    y_test=y_test,
                    positive_label=positive_label,
                )


def run_model_suite(
    cfg: Config,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    feature_engineering_methods: List[str],
    baseline_models: Dict[str, Tuple[Any, Dict[str, List[Any]]]],
    ensemble_models: Dict[str, Tuple[Any, Dict[str, List[Any]]]],
    stacking_model: Tuple[Any, Dict[str, List[Any]]],
    y_train_full: pd.Series,
) -> pd.DataFrame:
    """
    Run baseline models, ensembles and the stacking model across every
    feature-engineering method and oversampling setting.

    Args:
        cfg: Run configuration.
        X_train, X_test, y_train, y_test: The train/test split.
        feature_engineering_methods: Names accepted by build_preprocessor().
        baseline_models: Name -> (estimator, grid) for the baseline group.
        ensemble_models: Name -> (estimator, grid) for the ensemble group.
        stacking_model: (estimator, grid) for the stacking classifier.
        y_train_full: Labels used to pick the positive class and to decide
            whether oversampling is tried.

    Returns:
        A DataFrame with one row per evaluated configuration, ordered
        baseline, then ensemble, then stacking.
    """
    results: List[Dict[str, Any]] = []
    positive_label = infer_positive_label(y_train_full)
    print(f"\n[INFO] Positive label used for TPR/FNR: {positive_label}")

    oversamplers = get_oversamplers(cfg)
    oversampler_grids = get_oversampler_param_grids()

    # Check if dataset is imbalanced
    imbalance_flag = is_imbalanced(y_train_full)
    print(f"\n[INFO] Dataset imbalanced? {imbalance_flag}")

    # Always run the "none" oversampling setting (original data); the
    # oversamplers are only tried when the data is imbalanced.
    oversampling_to_run = ["none"]
    if imbalance_flag:
        oversampling_to_run.extend(["SMOTE", "ADASYN", "BorderlineSMOTE"])

    stack_clf, stack_grid = stacking_model

    # (models, category, header prefix, per-model log tag). The stacking group
    # prints no per-model line, hence the None tag.
    groups = [
        (baseline_models, "baseline", "", "[Baseline]"),
        (ensemble_models, "ensemble", "[Ensemble] ", "[Ensemble]"),
        ({"StackingClassifier": (stack_clf, stack_grid)}, "stacking", "[Stacking] ", None),
    ]
    for models, model_category, log_prefix, training_tag in groups:
        _run_experiment_group(
            results,
            cfg,
            X_train,
            X_test,
            y_train,
            y_test,
            feature_engineering_methods,
            oversampling_to_run,
            oversamplers,
            oversampler_grids,
            models,
            model_category,
            log_prefix,
            training_tag,
            positive_label,
        )

    return pd.DataFrame(results)


# ============================================================================
# MAIN ENTRY POINT
# ============================================================================

def main():
    """Run the whole experiment: load data, tune every configuration, save and summarise results.

    Returns early (after printing a message) when the dataset file or the
    target column cannot be found.
    """
    # INPUT SETTINGS:
    dataset_filename = "heart.csv"  # <-- Change this to your file name in Drive
    target_col = "target"                  # <-- Change this to your target column

    # Input and output both live under BASE_PATH (Drive folder or ".")
    full_file_path = os.path.join(BASE_PATH, dataset_filename)
    results_path = os.path.join(BASE_PATH, "experiment_results.csv")
    print(f"[INFO] Looking for dataset at: {full_file_path}")
    print(f"[INFO] Results will be saved to: {results_path}")

    cfg = Config(
        file_path=full_file_path,
        sheet_name=None,               # e.g. "Sheet1" if Excel
        target_column=target_col,
        results_csv=results_path
    )

    # Load data; a missing file is reported in a banner instead of a traceback
    try:
        df = load_dataset(cfg)
    except FileNotFoundError as e:
        banner = "!"*50
        print("\n" + banner)
        print(e)
        print(banner + "\n")
        return

    # Quick data-quality reports (printed only)
    summarise_missing_values(df)
    detect_preprocessing_needs(df, cfg.target_column)

    # Split features and target
    try:
        X, y = split_features_target(df, cfg.target_column)
    except KeyError as e:
        print(f"\n[ERROR] {e}")
        return

    compute_class_balance(y)

    # Stratified train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=cfg.test_size,
        random_state=cfg.random_state,
        stratify=y,
    )

    # Run the full suite of experiments
    results_df = run_model_suite(
        cfg=cfg,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        feature_engineering_methods=["none", "poly", "interactions", "pca"],
        baseline_models=get_baseline_models(cfg),
        ensemble_models=get_ensemble_models(cfg),
        stacking_model=get_stacking_model(cfg),
        y_train_full=y_train,
    )

    # Save results
    results_df.to_csv(cfg.results_csv, index=False)
    print(f"\n[INFO] All experiment results saved to: {cfg.results_csv}")

    # Rank by ROC AUC, using accuracy as the tie-breaker
    print(f"\n=== Top {cfg.top_n_to_print} Models by ROC AUC ===")
    ranked = results_df.sort_values(by=["roc_auc", "accuracy"], ascending=False)
    top_results = ranked.head(cfg.top_n_to_print)
    display_columns = [
        "model_name",
        "model_category",
        "feature_engineering",
        "oversampler",
        "accuracy",
        "roc_auc",
        "tpr",
        "fnr",
    ]
    print(top_results[display_columns])

    # Final summary
    if top_results.empty:
        return

    winner = top_results.iloc[0]
    print("\n=== Final Summary ===")
    print("Best overall configuration:")
    print(
        f"- Model: {winner['model_name']} ({winner['model_category']})\n"
        f"- Feature engineering: {winner['feature_engineering']}\n"
        f"- Oversampling: {winner['oversampler']}\n"
        f"- Accuracy: {winner['accuracy']:.4f}\n"
        f"- ROC AUC: {winner['roc_auc']:.4f}\n"
        f"- TPR: {winner['tpr']:.4f} | FNR: {winner['fnr']:.4f}"
    )


# Run the experiment only when executed as a script or notebook cell,
# not when this module is imported.
if __name__ == "__main__":
    main()

[INFO] Google Colab detected. Mounting Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[INFO] Using existing working directory in Drive: /content/drive/MyDrive/Analytical_Project_Data
[INFO] Looking for dataset at: /content/drive/MyDrive/Analytical_Project_Data/heart.csv
[INFO] Results will be saved to: /content/drive/MyDrive/Analytical_Project_Data/experiment_results.csv

=== Data Head ===
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2  