<a href="https://colab.research.google.com/github/arslanmit/PrimeSiftAI/blob/Development/AI_Core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# ================================
# Quietly install needed packages
# ================================
!pip install --quiet ipywidgets xgboost scikit-optimize tqdm catboost

import warnings
warnings.filterwarnings("ignore", message=".*'force_all_finite' was renamed to 'ensure_all_finite'.*")

import io
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler,
    Normalizer, QuantileTransformer, PowerTransformer
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve, confusion_matrix
)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# Discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
# Other classifiers
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
)
import xgboost
import lightgbm as lgb
from catboost import CatBoostClassifier
from skopt import BayesSearchCV

###############################################################################
# (PLS-DA removed)
###############################################################################

###############################################################################
# Tiny embedded data (10 rows each)
#
# Two small CSV samples kept inline so the notebook can run without an external
# file. Each one is parsed with pd.read_csv(io.StringIO(...)) when the matching
# "(embedded)" data source is selected, and each carries its label in a column
# named "target".
###############################################################################
# Heart-disease sample: 13 feature columns followed by "target"
# (five rows with target 0 and five rows with target 1).
HEART_DISEASE_CSV_DATA = """age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
63,1,0,145,233,1,2,150,0,2.3,2,0,2,0
67,1,3,160,286,0,2,108,1,1.5,1,3,1,1
37,1,2,130,250,0,0,187,0,3.5,2,0,1,0
41,0,1,130,204,0,2,172,0,1.4,0,0,1,0
56,1,1,120,236,0,0,178,0,0.8,0,0,1,0
62,0,3,140,268,0,2,160,0,3.6,2,2,1,1
57,0,3,120,354,0,0,163,1,0.6,0,0,1,0
63,1,3,130,254,0,2,147,0,1.4,1,1,3,1
67,1,3,120,229,0,2,129,1,2.6,1,2,3,1
59,1,3,140,241,0,0,123,1,0.2,1,0,3,1
"""

# Breast-cancer sample: "target" comes first, followed by ten "*_mean" feature
# columns.
# NOTE(review): every row below has target == 1, so this sample contains a
# single class. Class-separating metrics (ROC AUC in particular) presumably
# cannot be computed meaningfully on it -- confirm whether rows of the other
# class were meant to be included.
BREAST_CANCER_CSV_DATA = """target,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
1,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
1,19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
1,20.29,14.34,135.1,1297,0.1003,0.1328,0.198,0.1043,0.1809,0.05883
1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613
1,18.25,19.98,119.6,1040,0.09463,0.109,0.1127,0.074,0.1794,0.05742
1,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451
1,13,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389
1,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243
"""

def load_file_data(file_path: str) -> pd.DataFrame:
    """Read the CSV stored at ``file_path`` into a DataFrame.

    Args:
        file_path: Location of the CSV file to load.

    Returns:
        The parsed file, exactly as ``pd.read_csv`` produces it.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
    raise FileNotFoundError(f"CSV file '{file_path}' not found.")

###############################################################################
# Full manual profiling
###############################################################################
def custom_full_profiling(df: pd.DataFrame) -> pd.DataFrame:
    """Build a one-row-per-column profile of ``df``.

    Every column gets its dtype, non-null count, missing count and number of
    unique values (NaN counted as a value). Numeric columns additionally get
    min / max / mean / median / std and, when ``df`` has a numeric ``target``
    column, their correlation with it. Non-numeric columns instead get their
    most frequent value (``Top``) and how often it occurs (``Freq``). Fields
    that do not apply to a column are left as ``None``.

    Args:
        df: The frame to profile.

    Returns:
        A DataFrame with the columns ``Column``, ``Dtype``, ``#Non-Null``,
        ``#Missing``, ``#Unique``, ``Min``, ``Max``, ``Mean``, ``Median``,
        ``Std``, ``Corr(target)``, ``Top`` and ``Freq``; empty when ``df`` has
        no columns.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    if 'target' in df.columns and is_numeric(df['target']):
        # Correlate the numeric columns only. Handing string/object columns to
        # corrwith makes it try to cast them to float, which raises on pandas
        # versions where numeric_only defaults to False -- and this function is
        # explicitly meant to profile non-numeric columns as well.
        numeric_cols = [col for col in df.columns if is_numeric(df[col].dtype)]
        correlations = df[numeric_cols].corrwith(df['target'])
    else:
        correlations = pd.Series(dtype='float64')

    rows = []
    for col in df.columns:
        series = df[col]

        # Start with every field present so both kinds of column yield the
        # same keys in the same order.
        info_dict = {
            'Column': col,
            'Dtype': str(series.dtype),
            '#Non-Null': series.notnull().sum(),
            '#Missing': series.isnull().sum(),
            '#Unique': series.nunique(dropna=False),
            'Min': None,
            'Max': None,
            'Mean': None,
            'Median': None,
            'Std': None,
            'Corr(target)': None,
            'Top': None,
            'Freq': None,
        }

        if is_numeric(series.dtype):
            info_dict['Min'] = series.min(skipna=True)
            info_dict['Max'] = series.max(skipna=True)
            info_dict['Mean'] = series.mean(skipna=True)
            info_dict['Median'] = series.median(skipna=True)
            info_dict['Std'] = series.std(skipna=True)
            if col in correlations.index:
                info_dict['Corr(target)'] = correlations[col]
        else:
            # value_counts sorts by frequency, so the first entry is the mode
            # (NaN included, because of dropna=False).
            counts = series.value_counts(dropna=False)
            if len(counts) > 0:
                info_dict['Top'] = counts.index[0]
                info_dict['Freq'] = counts.iloc[0]

        rows.append(info_dict)
    return pd.DataFrame(rows)

def dynamic_explore_data(df: pd.DataFrame, max_cols: int):
    """Show a profile of ``df`` and, when possible, a pairplot against 'target'.

    The profiling table from ``custom_full_profiling`` is displayed first,
    followed by a one-line summary of the total number of missing cells. A
    seaborn pairplot coloured by ``target`` is then drawn for the first
    ``max_cols`` numeric feature columns; it is skipped (with a printed note)
    when ``df`` has no ``target`` column or no numeric feature columns.

    Args:
        df: Data to explore.
        max_cols: Maximum number of numeric feature columns in the pairplot.
    """
    profile = custom_full_profiling(df)
    print("=== Full Profiling Info ===")
    display(profile)

    n_missing = df.isnull().sum().sum()
    print(
        "No missing values (above)."
        if n_missing == 0
        else f"Missing values found: {n_missing} (above)."
    )

    if "target" not in df.columns:
        print("\nNo 'target' => skipping pairplot.")
        return

    # Numeric features only; the label itself is added back as the hue column.
    numeric_features = [
        name
        for name in df.select_dtypes(include=[np.number]).columns.tolist()
        if name != "target"
    ]
    use_cols = numeric_features[:max_cols]
    if not use_cols:
        print("\nNo numeric columns => skipping pairplot.")
        return

    pcols = [*use_cols, "target"]
    if not set(pcols).issubset(df.columns):
        print("\nSome pairplot columns not found => skipping.")
        return

    print("\nPlotting Pairplot (may take time):")
    with tqdm(total=1, desc="Pairplot") as pbar:
        grid = sns.pairplot(
            df[pcols],
            hue="target",
            palette="Greys",
            markers=["o", "D"],
            plot_kws={"s": 25, "alpha": 0.75},
            height=3,
        )
        grid.fig.suptitle(f"Pairplot of up to {max_cols} Numeric + 'target'", y=1.02)
        plt.show()
        pbar.update(1)

from sklearn.ensemble import StackingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Module-level default estimators. define_all_models() wraps these very
# instances in a pipeline, so the same objects are reused on every call.

# Gaussian process classifier with a scaled RBF kernel.
gpc_default = GaussianProcessClassifier(random_state=42, kernel=1.0 * RBF(1.0))

# Stacked ensemble: logistic regression and k-NN as base learners, combined by
# a logistic-regression meta-learner.
stacking_default = StackingClassifier(
    final_estimator=LogisticRegression(random_state=42),
    estimators=[
        ("lr", LogisticRegression(max_iter=1000, random_state=42)),
        ("knn", KNeighborsClassifier()),
    ],
)

def create_pipeline(clf, scaling_method: str) -> Pipeline:
    """Assemble the selector -> (optional scaler) -> classifier pipeline.

    Args:
        clf: Estimator placed in the final ``"clf"`` step.
        scaling_method: Name of the scaler for the ``"scaler"`` step. ``"none"``
            omits the step; any name that is not recognised falls back to
            ``StandardScaler``.

    Returns:
        A Pipeline whose first step ``"selector"`` is ``SelectKBest`` with
        ``f_classif`` and ``k="all"``.
    """
    # Factories rather than instances, so only the requested scaler is built.
    scaler_factories = {
        "StandardScaler": StandardScaler,
        "MinMaxScaler": MinMaxScaler,
        "RobustScaler": RobustScaler,
        "MaxAbsScaler": MaxAbsScaler,
        "Normalizer": Normalizer,
        "QuantileTransformer": lambda: QuantileTransformer(output_distribution="normal"),
        "PowerTransformer": PowerTransformer,
        "none": lambda: None,
    }
    make_scaler = scaler_factories.get(scaling_method, StandardScaler)
    scaler = make_scaler()

    pipeline_steps = [("selector", SelectKBest(score_func=f_classif, k="all"))]
    if scaler is not None:
        pipeline_steps.append(("scaler", scaler))
    pipeline_steps.append(("clf", clf))
    return Pipeline(pipeline_steps)

def define_all_models(scaling_method: str) -> dict:
    """Return every supported model wrapped in a pipeline, keyed by display name.

    Args:
        scaling_method: Scaler name forwarded to ``create_pipeline`` for each
            model.

    Returns:
        A dict mapping the model's display name to its pipeline, in the order
        listed below. The "Stacked Generalization" and "Gaussian Process"
        entries wrap the module-level ``stacking_default`` / ``gpc_default``
        instances.
    """
    named_estimators = [
        ("Naive Bayes", GaussianNB()),
        ("k-nearest neighbors algorithm", KNeighborsClassifier()),
        ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
        ("Ridge Classifier", RidgeClassifier(random_state=42)),
        ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
        ("Quadratic Discriminant Analysis", QuadraticDiscriminantAnalysis(reg_param=0.1)),
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Support vector machine", SVC(probability=True, random_state=42)),
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("Extra Trees", ExtraTreesClassifier(random_state=42)),
        ("AdaBoost", AdaBoostClassifier(random_state=42)),
        ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
        (
            "eXtreme Gradient Boosting",
            xgboost.XGBClassifier(eval_metric="logloss", random_state=42),
        ),
        (
            "LightGBM",
            lgb.LGBMClassifier(random_state=42, verbose=-1, force_col_wise=True),
        ),
        ("CatBoost", CatBoostClassifier(verbose=0, random_state=42)),
        ("Stacked Generalization", stacking_default),
        ("Gaussian Process", gpc_default),
    ]
    return {
        label: create_pipeline(estimator, scaling_method)
        for label, estimator in named_estimators
    }

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, X_test, y_test) -> dict:
    """Score a fitted model on the held-out data.

    The AUC is computed from ``predict_proba`` (second column) when the model
    offers it, otherwise from ``decision_function`` values min-max scaled to
    [0, 1] (all zeros when the values are constant). A model with neither
    method gets the sentinel AUC ``-1.0``.

    Args:
        model: Fitted estimator or pipeline.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        A dict with the keys "Accuracy", "Precision", "Recall", "F1 Score" and
        "AUC".
    """
    predictions = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
        auc_value = roc_auc_score(y_test, scores)
    elif hasattr(model, "decision_function"):
        raw = model.decision_function(X_test)
        lowest, highest = raw.min(), raw.max()
        if highest == lowest:
            # Constant scores: nothing to rescale.
            scores = np.zeros_like(raw)
        else:
            scores = (raw - lowest) / (highest - lowest)
        auc_value = roc_auc_score(y_test, scores)
    else:
        auc_value = -1.0

    metrics = {
        "Accuracy": accuracy_score(y_test, predictions),
        "Precision": precision_score(y_test, predictions, zero_division=0),
        "Recall": recall_score(y_test, predictions, zero_division=0),
        "F1 Score": f1_score(y_test, predictions, zero_division=0),
    }
    metrics["AUC"] = auc_value
    return metrics

from sklearn.gaussian_process.kernels import DotProduct
def define_param_grids() -> dict:
    """Return the hyperparameter search space for each model, keyed by name.

    Every space searches ``selector__k`` over ``[5, "all"]``; classifier
    parameters use the ``clf__`` prefix of the pipeline's final step.
    """
    def with_selector(clf_space=None):
        # A fresh dict (and a fresh selector list) per model, with the
        # selector entry first.
        space = {"selector__k": [5, "all"]}
        if clf_space:
            space.update(clf_space)
        return space

    return {
        "Naive Bayes": with_selector({
            "clf__var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4],
        }),
        "k-nearest neighbors algorithm": with_selector({
            "clf__n_neighbors": [3, 5, 7],
            "clf__weights": ["uniform", "distance"],
        }),
        "Logistic Regression": with_selector({
            "clf__C": [0.01, 0.1, 1, 10],
            "clf__penalty": ["l2"],
        }),
        "Ridge Classifier": with_selector({
            "clf__alpha": [0.01, 0.1, 1.0, 10.0],
        }),
        "Linear Discriminant Analysis": with_selector(),
        "Quadratic Discriminant Analysis": with_selector({
            "clf__reg_param": [0.1, 0.2],
        }),
        "Decision Tree": with_selector({
            "clf__max_depth": [None, 3, 5, 10],
            "clf__min_samples_split": [2, 5, 10],
        }),
        "Support vector machine": with_selector({
            "clf__C": [0.1, 1, 10],
            "clf__kernel": ["linear", "rbf"],
            "clf__gamma": ["scale"],
        }),
        "Random Forest": with_selector({
            "clf__n_estimators": [50, 100, 200],
            "clf__max_depth": [None, 5, 10],
        }),
        "Extra Trees": with_selector({
            "clf__n_estimators": [50, 100, 200],
            "clf__max_depth": [None, 5, 10],
        }),
        "AdaBoost": with_selector({
            "clf__n_estimators": [50, 100, 200],
            "clf__learning_rate": [0.01, 0.1, 1],
        }),
        "Gradient Boosting": with_selector({
            "clf__n_estimators": [50, 100, 200],
            "clf__learning_rate": [0.01, 0.1, 1],
            "clf__max_depth": [3, 5, 7],
        }),
        "eXtreme Gradient Boosting": with_selector({
            "clf__n_estimators": [50, 100, 200],
            "clf__learning_rate": [0.01, 0.1, 0.2],
            "clf__max_depth": [3, 5, 7],
        }),
        "LightGBM": with_selector({
            "clf__n_estimators": [50, 100, 200],
            "clf__learning_rate": [0.01, 0.1, 0.2],
            "clf__num_leaves": [20, 31, 50],
        }),
        "CatBoost": with_selector({
            "clf__iterations": [50, 100, 200],
            "clf__learning_rate": [0.01, 0.1],
            "clf__depth": [3, 5, 7],
        }),
        "Stacked Generalization": with_selector(),
        "Gaussian Process": with_selector({
            "clf__kernel": [1.0 * RBF(1.0), DotProduct()],
        }),
    }

def tune_models(models, param_grids, X_train, y_train, X_test, y_test, search_method: str):
    """Tune each pipeline with the chosen search strategy and score the result.

    Args:
        models: Dict of display name -> pipeline.
        param_grids: Dict of display name -> search space; a model without an
            entry (or with an empty one) is fitted as-is.
        X_train, y_train: Data used for fitting / cross-validated search.
        X_test, y_test: Data passed to ``evaluate_model`` for the final scores.
        search_method: "grid", "random" or "none"; any other value selects the
            Bayesian search.

    Returns:
        Three dicts keyed by model name: the fitted (best) models, their
        ``evaluate_model`` metrics, and the best parameters (``None`` when no
        search was run).
    """
    tuned_models, tuned_results, best_params = {}, {}, {}

    def make_search(estimator, space):
        # Settings shared by all three search strategies.
        shared = {"cv": 3, "scoring": "roc_auc", "n_jobs": -1}
        if search_method == "grid":
            return GridSearchCV(estimator, space, **shared)
        if search_method == "random":
            return RandomizedSearchCV(
                estimator, space, n_iter=10, verbose=0, random_state=42, **shared
            )
        # Anything else is treated as "bayesian".
        return BayesSearchCV(estimator, space, n_iter=10, random_state=42, **shared)

    with tqdm(total=len(models), desc="Hyperparameter Tuning") as pbar:
        for mname, pipeline in models.items():
            print(f"Tuning {mname}...")
            space = param_grids.get(mname, {})

            if search_method != "none" and space:
                searcher = make_search(pipeline, space)
                searcher.fit(X_train, y_train)
                winner = searcher.best_estimator_
                tuned_models[mname] = winner
                tuned_results[mname] = evaluate_model(winner, X_test, y_test)
                best_params[mname] = searcher.best_params_
                print(f"  [{search_method.capitalize()}] Best CV AUC: {searcher.best_score_:.4f} | Best Params: {searcher.best_params_}\n")
            else:
                # No search requested or nothing to search over: plain fit.
                pipeline.fit(X_train, y_train)
                tuned_models[mname] = pipeline
                tuned_results[mname] = evaluate_model(pipeline, X_test, y_test)
                best_params[mname] = None
                print(f"  No tuning for {mname}. Using default.\n")

            pbar.update(1)

    return tuned_models, tuned_results, best_params

def build_comparison_table(default_results, tuned_results, search_method, scaling_method):
    comp_dict = {}
    for mname in default_results:
        comp_dict[mname] = {
            "Accuracy (Default)":  default_results[mname]["Accuracy"],
            "Accuracy (Tuned)":    tuned_results[mname]["Accuracy"],
            "Precision (Default)": default_results[mname]["Precision"],
            "Precision (Tuned)":   tuned_results[mname]["Precision"],
            "Recall (Default)":    default_results[mname]["Recall"],
            "Recall (Tuned)":      tuned_results[mname]["Recall"],
            "F1 Score (Default)":  default_results[mname]["F1 Score"],
            "F1 Score (Tuned)":    tuned_results[mname]["F1 Score"],
            "AUC (Default)":       default_results[mname]["AUC"],
            "AUC (Tuned)":         tuned_results[mname]["AUC"]
        }
    df_comp = pd.DataFrame(comp_dict).T
    df_comp["Selected search method"] = search_method
    df_comp["Selected scaling method"] = scaling_method
    df_comp["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def color_cmp(row):
        metrics = ["Accuracy", "Precision", "Recall", "F1 Score", "AUC"]
        styles = []
        for m in metrics:
            dval = row[f"{m} (Default)"]
            tval = row[f"{m} (Tuned)"]
            styles.append("")
            if tval > dval:
                styles.append("color: green")
            elif tval < dval:
                styles.append("color: red")
            else:
                styles.append("color: black")
        styles.extend(["", "", ""])
        return styles

    styled = df_comp.style.apply(color_cmp, axis=1)
    return df_comp, styled

def append_results_to_csv(df_comp, fname):
    """Write ``df_comp`` (index included) to ``fname``, appending if it exists.

    A new file gets the header row; rows appended to an existing file are
    written without one.
    """
    already_there = os.path.exists(fname)
    df_comp.to_csv(
        fname,
        mode="a" if already_there else "w",
        header=not already_there,
        index=True,
    )

def _roc_points(model, X_test, y_test):
    """Return ``(fpr, tpr, auc)`` for one fitted model on the test data.

    Scores come from ``predict_proba`` (second column) when available,
    otherwise from ``decision_function`` min-max scaled to [0, 1] (all zeros
    when the values are constant). A model with neither method yields the
    diagonal ``[0, 1], [0, 1]`` and the sentinel AUC ``-1.0``.
    """
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        decision = model.decision_function(X_test)
        dec_min = decision.min()
        dec_max = decision.max()
        if dec_max != dec_min:
            scores = (decision - dec_min) / (dec_max - dec_min)
        else:
            scores = np.zeros_like(decision)
    else:
        return [0, 1], [0, 1], -1.0

    auc_value = roc_auc_score(y_test, scores)
    fpr, tpr, _ = roc_curve(y_test, scores)
    return fpr, tpr, auc_value


def plot_roc_and_confusion(default_models, tuned_models, X_test, y_test):
    """Draw, per model, a ROC comparison and both confusion matrices.

    One figure is produced with a row per model (sorted by name) and three
    columns: the ROC curves of the default and tuned model, the default
    model's confusion matrix, and the tuned model's confusion matrix.

    Args:
        default_models: Dict of model name -> fitted default model. When empty,
            a note is printed and nothing is plotted.
        tuned_models: Dict of model name -> fitted tuned model; must contain
            every name present in ``default_models``.
        X_test: Held-out features.
        y_test: Held-out labels.
    """
    if not default_models:
        print("No models => skipping combined plot.")
        return

    mnames = sorted(default_models.keys())
    n_models = len(default_models)
    # squeeze=False keeps `axes` two-dimensional even for a single model, so
    # rows can always be indexed as axes[i].
    fig, axes = plt.subplots(
        nrows=n_models, ncols=3, figsize=(12, 4 * n_models), squeeze=False
    )

    print("Plotting combined ROC + confusion for each model in 1 figure...")
    with tqdm(total=n_models, desc="ROC/Confusion Plots") as pbar:
        for i, mname in enumerate(mnames):
            ax_roc, ax_def, ax_tun = axes[i][0], axes[i][1], axes[i][2]

            dmodel = default_models[mname]
            tmodel = tuned_models[mname]

            # Same ROC computation for both variants of the model.
            fpr_d, tpr_d, auc_d = _roc_points(dmodel, X_test, y_test)
            fpr_t, tpr_t, auc_t = _roc_points(tmodel, X_test, y_test)

            ax_roc.plot(fpr_d, tpr_d, color="gray", lw=2, label=f"Default AUC={auc_d:.4f}")
            ax_roc.plot(fpr_t, tpr_t, color="red", lw=2, linestyle=":", label=f"Tuned AUC={auc_t:.4f}")
            ax_roc.plot([0, 1], [0, 1], "k--", lw=1)  # chance diagonal
            ax_roc.set_title(f"{mname}\nROC Curve")
            ax_roc.set_xlabel("False Positive Rate")
            ax_roc.set_ylabel("True Positive Rate")
            ax_roc.legend(loc="lower right")
            ax_roc.grid(True)
            ax_roc.set_aspect("equal", "box")

            # Plot confusion matrices with labels (TN, FP, FN, TP)
            cm_def = confusion_matrix(y_test, dmodel.predict(X_test))
            _plot_small_cm(ax_def, cm_def, label_text=True)
            ax_def.set_title("ConfMatrix\n(Default)")

            cm_tun = confusion_matrix(y_test, tmodel.predict(X_test))
            _plot_small_cm(ax_tun, cm_tun, label_text=True)
            ax_tun.set_title("ConfMatrix\n(Tuned)")

            pbar.update(1)

    plt.tight_layout()
    plt.show()

def _plot_small_cm(ax, cm, label_text=False):
    """
    Draw a 2x2 confusion matrix on ``ax`` as coloured cells
    (row=actual, col=predicted).

    Diagonal cells are green and off-diagonal cells red. Each cell shows its
    count; with ``label_text=True`` the count is prefixed by the cell's role
    (TN, FP, FN, TP).
    """
    side = 0.2
    palette = np.array([
        ["#a5e8a3", "#ffaaaa"],
        ["#ffaaaa", "#a5e8a3"]
    ])
    tags = [
        ["TN", "FP"],
        ["FN", "TP"]
    ]

    # Row-major walk over the matrix cells.
    for r, c in np.ndindex(cm.shape[0], cm.shape[1]):
        left = c * side
        top = r * side
        ax.add_patch(plt.Rectangle(
            (left, top), side, side,
            facecolor=palette[r, c], edgecolor='none'
        ))
        caption = f"{tags[r][c]}={cm[r, c]}" if label_text else str(cm[r, c])
        ax.text(left + side / 2, top + side / 2,
                caption, ha='center', va='center',
                fontsize=8, color='black')

    tick_positions = [side / 2, side * 1.5]
    ax.set_xlim(0, 2 * side)
    ax.set_ylim(0, 2 * side)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(["Neg", "Pos"], fontsize=8)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(["Neg", "Pos"], fontsize=8)
    # Flip the y axis so the first matrix row is drawn at the top.
    ax.invert_yaxis()
    ax.set_aspect("equal")
    ax.set_frame_on(False)
    for spine in ax.spines.values():
        spine.set_visible(False)

###############################################################################
# UI + Execution
###############################################################################
# Each input control is paired with a bold HTML caption shown above it.

# Data source: one of the embedded samples, or a CSV file on disk.
data_source_title = widgets.HTML(value="<b>Select data source:</b>")
data_source_widget = widgets.RadioButtons(
    options=["heart_disease (embedded)", "breast_cancer (embedded)", "file"],
    value="heart_disease (embedded)",
)

# Path used only when the "file" data source is selected.
file_path_title = widgets.HTML(value="<b>CSV File Path (if 'file' selected):</b>")
file_path_widget = widgets.Text(value="breast_cancer_dataset_edit.csv")

# Hyperparameter search strategy.
search_title = widgets.HTML(value="<b>Selected search method:</b>")
search_method_widget = widgets.RadioButtons(
    options=['grid', 'random', 'bayesian', 'none'],
    value='random',
)

# Feature scaling applied inside every pipeline.
scaling_title = widgets.HTML(value="<b>Selected scaling method:</b>")
scaling_method_widget = widgets.RadioButtons(
    options=[
        'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler',
        'Normalizer', 'QuantileTransformer', 'PowerTransformer', 'none',
    ],
    value='StandardScaler',
)

# One checkbox per model, all ticked by default. The descriptions double as
# the keys of the dict returned by define_all_models().
models_title = widgets.HTML(value="<b>Selected models:</b>")
model_options = [
    'Naive Bayes',
    'k-nearest neighbors algorithm',
    'Logistic Regression',
    'Ridge Classifier',
    'Linear Discriminant Analysis',
    'Quadratic Discriminant Analysis',
    'Decision Tree',
    'Support vector machine',
    'Random Forest',
    'Extra Trees',
    'AdaBoost',
    'Gradient Boosting',
    'eXtreme Gradient Boosting',
    'LightGBM',
    'CatBoost',
    'Stacked Generalization',
    'Gaussian Process',
]
model_checkboxes = [
    widgets.Checkbox(description=model_name, value=True)
    for model_name in model_options
]
model_selection_box = widgets.VBox(model_checkboxes)

# Upper bound on the numeric columns shown in the pairplot.
max_cols_title = widgets.HTML(value="<b>Max numeric columns for pairplot:</b>")
max_cols_widget = widgets.IntSlider(value=3, min=1, max=50, step=1)

run_button = widgets.Button(description="Run Analysis")
output = widgets.Output()

# Render the whole form top to bottom, each caption directly above its control.
display(
    data_source_title, data_source_widget,
    file_path_title, file_path_widget,
    search_title, search_method_widget,
    scaling_title, scaling_method_widget,
    models_title, model_selection_box,
    max_cols_title, max_cols_widget,
    run_button, output,
)

def _set_controls_disabled(disabled: bool) -> None:
    """Enable or disable the run button and every input control of the form."""
    controls = [
        run_button,
        data_source_widget,
        file_path_widget,
        search_method_widget,
        scaling_method_widget,
        max_cols_widget,
        *model_checkboxes,
    ]
    for control in controls:
        control.disabled = disabled


def _result_file_for(file_path: str) -> str:
    """Derive the results CSV name for an input file: ``<name>_result.csv``."""
    # Only the final extension is replaced. A plain str.replace(".csv", ...)
    # returns the input path unchanged when it has no lowercase ".csv" in it,
    # which would make the results get appended to the data file itself.
    root, _ext = os.path.splitext(file_path.rstrip())
    return f"{root}_result.csv"


def _best_by_auc(results):
    """Return ``(name, auc)`` of the first entry with the highest AUC."""
    best_name, best_auc = None, -np.inf
    for name, metrics in results.items():
        if metrics["AUC"] > best_auc:
            best_auc = metrics["AUC"]
            best_name = name
    return best_name, best_auc


def on_button_clicked(_):
    """Run the full analysis for the current widget selections.

    Steps, all rendered inside the ``output`` widget: load the chosen data,
    show the profiling / pairplot, split train/test (80/20, stratified), train
    the selected models with default settings, tune them with the selected
    search method, display and append the comparison table to the results CSV,
    report the best default and tuned AUC, and draw the combined ROC /
    confusion-matrix figure.

    The form controls are disabled while the analysis runs and are always
    re-enabled afterwards, including when the run stops early because the data
    has no ``target`` column.

    Args:
        _: The clicked button (unused).
    """
    _set_controls_disabled(True)
    try:
        with output:
            clear_output()

            data_source = data_source_widget.value
            file_path = file_path_widget.value
            max_cols = max_cols_widget.value
            search_meth = search_method_widget.value
            scale_meth = scaling_method_widget.value
            selected_ms = [cb.description for cb in model_checkboxes if cb.value]

            print(f"Data Source: {data_source}")
            if data_source == "file":
                print(f"CSV File Path: {file_path}")
            print(f"Max numeric columns for pairplot: {max_cols}")
            print(f"Selected search method: {search_meth}")
            print(f"Selected scaling method: {scale_meth}")
            print(f"Selected models: {selected_ms}\n")

            if data_source == "heart_disease (embedded)":
                result_file = "heart_disease_result.csv"
                df = pd.read_csv(io.StringIO(HEART_DISEASE_CSV_DATA))
            elif data_source == "breast_cancer (embedded)":
                result_file = "breast_cancer_result.csv"
                df = pd.read_csv(io.StringIO(BREAST_CANCER_CSV_DATA))
            else:
                result_file = _result_file_for(file_path)
                df = load_file_data(file_path)

            dynamic_explore_data(df, max_cols)

            if "target" not in df.columns:
                print("Error: 'target' col not found => stopping pipeline.")
                # The finally clause below still re-enables the controls.
                return

            X = df.drop("target", axis=1)
            y = df["target"]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y, random_state=42
            )

            all_models = define_all_models(scale_meth)
            default_models = {m: all_models[m] for m in selected_ms}

            print("\n=== Default Training ===")
            default_results = {}
            with tqdm(total=len(default_models), desc="Default Training") as pbar_def:
                for mname, model in default_models.items():
                    model.fit(X_train, y_train)
                    result = evaluate_model(model, X_test, y_test)
                    default_results[mname] = result
                    print(f" {mname}: AUC={result['AUC']:.4f}")
                    print(f"   Default Pipeline Params: {model.get_params()}\n")
                    pbar_def.update(1)

            param_grids = define_param_grids()
            tuned_models, tuned_results, best_params = tune_models(
                default_models, param_grids, X_train, y_train, X_test, y_test, search_meth
            )

            df_comp, styled_comp = build_comparison_table(
                default_results, tuned_results, search_meth, scale_meth
            )
            print("\nComparison of Default vs. Tuned:")
            display(styled_comp)

            append_results_to_csv(df_comp, result_file)
            print(f"\nResults appended to '{result_file}'.\n")

            best_tuned_name, best_tuned_auc = _best_by_auc(tuned_results)
            if best_tuned_name:
                print("\nHighest AUC (Tuned):", best_tuned_name, best_tuned_auc)

            best_def_name, best_def_auc = _best_by_auc(default_results)
            if best_def_name:
                print("\nHighest AUC (Default):", best_def_name, best_def_auc)

            print("\nPlotting combined ROC + Confusions:")
            plot_roc_and_confusion(default_models, tuned_models, X_test, y_test)
    finally:
        _set_controls_disabled(False)

run_button.on_click(on_button_clicked)


HTML(value='<b>Select data source:</b>')

RadioButtons(options=('heart_disease (embedded)', 'breast_cancer (embedded)', 'file'), value='heart_disease (e…

HTML(value="<b>CSV File Path (if 'file' selected):</b>")

Text(value='breast_cancer_dataset_edit.csv')

HTML(value='<b>Selected search method:</b>')

RadioButtons(index=1, options=('grid', 'random', 'bayesian', 'none'), value='random')

HTML(value='<b>Selected scaling method:</b>')

RadioButtons(options=('StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler', 'Normalizer', 'Quantil…

HTML(value='<b>Selected models:</b>')

VBox(children=(Checkbox(value=True, description='Naive Bayes'), Checkbox(value=True, description='k-nearest ne…

HTML(value='<b>Max numeric columns for pairplot:</b>')

IntSlider(value=3, max=50, min=1)

Button(description='Run Analysis', style=ButtonStyle())

Output()