<a href="https://colab.research.google.com/github/arslanmit/PrimeSiftAI/blob/Development/AI_2025_02_22_v005.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet ipywidgets xgboost scikit-optimize tqdm

import warnings
warnings.filterwarnings("ignore")

import io
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm

from sklearn.model_selection import (
    train_test_split, GridSearchCV, RandomizedSearchCV
)
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler,
    Normalizer, QuantileTransformer, PowerTransformer
)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve, confusion_matrix
)

# Import LDA/QDA from discriminant_analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, GradientBoostingClassifier
)

import xgboost as xgb
import lightgbm as lgb
from skopt import BayesSearchCV

###############################################################################
#        Truncated EMBEDDED CSV DATA (Replace with full untruncated lines)
###############################################################################
# Embedded sample of the heart-disease data set, stored as CSV text so the
# notebook can run without an external file (it is parsed with
# pd.read_csv(io.StringIO(...)) when the embedded source is selected).
# NOTE: the "... (Replace with full lines) ..." row is a placeholder that lives
# inside the string literal; replace it with the real rows before running,
# otherwise it is read as if it were a data row.
HEART_DISEASE_CSV_DATA = """age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
63,1,0,145,233,1,2,150,0,2.3,2,0,2,0
67,1,3,160,286,0,2,108,1,1.5,1,3,1,1
... (Replace with full lines) ...
58,0,3,170,225,1,2,146,1,2.8,1,2,2,1
"""

# Embedded sample of the breast-cancer data set; here "target" is the first
# column. The same placeholder caveat as above applies.
BREAST_CANCER_CSV_DATA = """target,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
1,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
... (Replace with full lines) ...
"""

###############################################################################
#                Load from file or embedded data
###############################################################################
def load_file_data(file_path: str) -> pd.DataFrame:
    """Read the CSV at ``file_path`` into a DataFrame.

    Raises:
        FileNotFoundError: if nothing exists at ``file_path``.
    """
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
    raise FileNotFoundError(f"CSV file '{file_path}' not found.")

###############################################################################
#                 Scaler & Pipeline Creation
###############################################################################
def get_scaler(method: str):
    """Return a new scaler instance for the given method name.

    ``"none"`` yields ``None`` (no scaling step); any unrecognised name falls
    back to a ``StandardScaler``.
    """
    if method == "none":
        return None

    builders = {
        "StandardScaler": StandardScaler,
        "MinMaxScaler": MinMaxScaler,
        "RobustScaler": RobustScaler,
        "MaxAbsScaler": MaxAbsScaler,
        "Normalizer": Normalizer,
        # The quantile transformer is configured for a normal output distribution.
        "QuantileTransformer": lambda: QuantileTransformer(output_distribution="normal"),
        "PowerTransformer": PowerTransformer,
    }
    build = builders.get(method, StandardScaler)
    return build()

def create_pipeline(clf, scaling_method: str) -> Pipeline:
    """Assemble a selector -> (optional scaler) -> classifier pipeline.

    The selector keeps all features by default (``k="all"``); the scaler step
    is left out when ``get_scaler`` returns ``None``.
    """
    stages = [("selector", SelectKBest(score_func=f_classif, k="all"))]
    scaler = get_scaler(scaling_method)
    stages += [("scaler", scaler)] if scaler is not None else []
    stages += [("clf", clf)]
    return Pipeline(stages)

###############################################################################
#               Define all model pipelines
###############################################################################
def define_all_models(scaling_method: str) -> dict:
    """Build one unfitted pipeline per supported classifier.

    Returns a dict mapping the display name of each model to a pipeline made
    by ``create_pipeline`` with the requested scaling method.
    """
    roster = [
        ("Naive Bayes", GaussianNB()),
        ("KNN", KNeighborsClassifier()),
        ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
        ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
        ("Quadratic Discriminant Analysis", QuadraticDiscriminantAnalysis()),
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        # probability=True so the SVM exposes predict_proba for evaluation.
        ("SVM", SVC(probability=True, random_state=42)),
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("Extra Trees", ExtraTreesClassifier(random_state=42)),
        ("AdaBoost", AdaBoostClassifier(random_state=42)),
        ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
        ("XGBoost", xgb.XGBClassifier(eval_metric="logloss", random_state=42)),
        ("LightGBM", lgb.LGBMClassifier(random_state=42, verbose=-1, force_col_wise=True)),
    ]
    return {
        label: create_pipeline(estimator, scaling_method)
        for label, estimator in roster
    }

###############################################################################
#                  Evaluate model
###############################################################################
def evaluate_model(model, X_test, y_test) -> dict:
    """Score a fitted model on the test split.

    Returns a dict with "Accuracy", "Precision", "Recall", "F1 Score" (all
    computed from ``model.predict``) and "AUC" (computed from column 1 of
    ``model.predict_proba``).
    """
    labels = model.predict(X_test)
    scores = model.predict_proba(X_test)[:, 1]

    report = {"Accuracy": accuracy_score(y_test, labels)}
    label_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for title, scorer in label_scorers:
        # zero_division=0 reports 0 instead of warning on an undefined ratio.
        report[title] = scorer(y_test, labels, zero_division=0)
    report["AUC"] = roc_auc_score(y_test, scores)
    return report

###############################################################################
#                  Parameter Grids & Tuning
###############################################################################
def define_param_grids() -> dict:
    """Return the hyperparameter search space for every model, keyed by name.

    Every space tunes ``selector__k`` over ``[5, "all"]``; classifier options
    are addressed through the ``clf__`` prefix of the pipeline step.
    """

    def grid(**clf_options):
        # Each call builds fresh lists so no two grids share mutable state.
        space = {"selector__k": [5, "all"]}
        for option, candidates in clf_options.items():
            space[f"clf__{option}"] = candidates
        return space

    return {
        "Naive Bayes": grid(),
        "KNN": grid(n_neighbors=[3, 5, 7], weights=["uniform", "distance"]),
        "Logistic Regression": grid(C=[0.01, 0.1, 1, 10], penalty=["l2"]),
        "Linear Discriminant Analysis": grid(),
        "Quadratic Discriminant Analysis": grid(
            reg_param=[0.0, 0.1, 0.2, 0.5, 1.0],
            tol=[1e-4, 1e-3, 1e-2],
        ),
        "Decision Tree": grid(
            max_depth=[None, 3, 5, 10],
            min_samples_split=[2, 5, 10],
        ),
        "SVM": grid(C=[0.1, 1, 10], kernel=["linear", "rbf"], gamma=["scale"]),
        "Random Forest": grid(n_estimators=[50, 100, 200], max_depth=[None, 5, 10]),
        "Extra Trees": grid(n_estimators=[50, 100, 200], max_depth=[None, 5, 10]),
        "AdaBoost": grid(n_estimators=[50, 100, 200], learning_rate=[0.01, 0.1, 1]),
        "Gradient Boosting": grid(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 1],
            max_depth=[3, 5, 7],
        ),
        "XGBoost": grid(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 0.2],
            max_depth=[3, 5, 7],
        ),
        "LightGBM": grid(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 0.2],
            num_leaves=[20, 31, 50],
        ),
    }

def tune_models(models, param_grids, X_train, y_train, X_test, y_test, search_method: str):
    """Tune each pipeline with the chosen search strategy and score it.

    ``search_method`` is "grid", "random" or "none"; any other value takes the
    Bayesian-search branch. A model is simply fitted as-is when the method is
    "none" or when it has no search space.

    Returns:
        (tuned_models, tuned_results, best_params), three dicts keyed by model
        name; ``best_params`` holds ``None`` for models that were not searched.
    """
    tuned_models, tuned_results, best_params = {}, {}, {}
    skip_search = search_method == "none"
    # Settings shared by all three search strategies.
    shared = dict(cv=3, scoring="roc_auc", n_jobs=-1)

    for mname, pipeline in tqdm(models.items(), desc="Hyperparameter Tuning"):
        print(f"Tuning {mname}...")
        space = param_grids.get(mname, {})

        if skip_search or not space:
            pipeline.fit(X_train, y_train)
            fitted, chosen = pipeline, None
            summary = f"  No tuning for {mname}. Using default.\n"
        else:
            if search_method == "grid":
                searcher = GridSearchCV(pipeline, space, **shared)
            elif search_method == "random":
                searcher = RandomizedSearchCV(
                    pipeline, space, n_iter=10, verbose=0, random_state=42, **shared
                )
            else:  # "bayesian"
                searcher = BayesSearchCV(
                    pipeline, space, n_iter=10, random_state=42, **shared
                )
            searcher.fit(X_train, y_train)
            fitted, chosen = searcher.best_estimator_, searcher.best_params_
            summary = f"  [{search_method.capitalize()}] Best CV AUC: {searcher.best_score_:.4f} | Best Params: {searcher.best_params_}\n"

        tuned_models[mname] = fitted
        tuned_results[mname] = evaluate_model(fitted, X_test, y_test)
        best_params[mname] = chosen
        print(summary)

    return tuned_models, tuned_results, best_params

###############################################################################
#                Build Comparison Table & CSV
###############################################################################
def build_comparison_table(default_results, tuned_results, search_method, scaling_method):
    """Tabulate default vs. tuned metrics, one row per model.

    Each metric contributes a "(Default)" and a "(Tuned)" column; three
    trailing columns record the search method, the scaling method and a
    timestamp.

    Returns:
        (df_comp, styled): the plain DataFrame and a Styler that colours each
        tuned value green / red / black for better / worse / equal.
    """
    metric_names = ("Accuracy", "Precision", "Recall", "F1 Score", "AUC")

    rows = {}
    for model_name in default_results:
        baseline = default_results[model_name]
        tuned = tuned_results[model_name]
        record = {}
        for metric in metric_names:
            record[f"{metric} (Default)"] = baseline[metric]
            record[f"{metric} (Tuned)"] = tuned[metric]
        rows[model_name] = record

    df_comp = pd.DataFrame(rows).T
    df_comp["Selected search method"] = search_method
    df_comp["Selected scaling method"] = scaling_method
    df_comp["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def color_cmp(row):
        def shade(metric):
            before = row[f"{metric} (Default)"]
            after = row[f"{metric} (Tuned)"]
            if after > before:
                return "color: green"
            if after < before:
                return "color: red"
            return "color: black"

        cell_styles = []
        for metric in metric_names:
            # Default column stays unstyled; only the tuned column is coloured.
            cell_styles += ["", shade(metric)]
        # No styling for the three trailing metadata columns.
        return cell_styles + ["", "", ""]

    styled = df_comp.style.apply(color_cmp, axis=1)
    return df_comp, styled

def append_results_to_csv(df_comp, fname):
    """Write ``df_comp`` (with its index) to ``fname``.

    A new file is created with a header row; an existing file gets the rows
    appended without repeating the header.
    """
    already_there = os.path.exists(fname)
    df_comp.to_csv(
        fname,
        mode="a" if already_there else "w",
        header=not already_there,
        index=True,
    )

###############################################################################
#               Combined ROC + Confusion per model
###############################################################################
def plot_roc_and_confusion(default_models, tuned_models, X_test, y_test):
    """Draw one figure row per model: ROC overlay plus two confusion matrices.

    Column 0 overlays the default and tuned ROC curves, column 1 shows the
    default model's confusion matrix and column 2 the tuned model's. Models
    are plotted in alphabetical order; nothing is drawn when no models are
    given.
    """
    if not default_models:
        print("No models selected => skipping combined plot.")
        return

    from tqdm.notebook import tqdm

    def roc_summary(estimator):
        # ROC points and AUC from the column-1 probabilities.
        scores = estimator.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, scores)
        return fpr, tpr, roc_auc_score(y_test, scores)

    ordered_names = sorted(default_models.keys())
    row_count = len(ordered_names)
    # squeeze=False keeps the axes grid 2-D even when there is a single row.
    fig, grid = plt.subplots(
        nrows=row_count, ncols=3, figsize=(12, 4*row_count), squeeze=False
    )

    print("Plotting combined ROC + confusion for each model in 1 figure...")
    with tqdm(total=row_count, desc="Models") as progress:
        for axes_row, name in zip(grid, ordered_names):
            roc_ax, default_cm_ax, tuned_cm_ax = axes_row
            baseline = default_models[name]
            tuned = tuned_models[name]

            # 1) ROC curves of both variants on the same axes
            fpr_base, tpr_base, auc_base = roc_summary(baseline)
            fpr_tuned, tpr_tuned, auc_tuned = roc_summary(tuned)

            roc_ax.plot(fpr_base, tpr_base, color="gray", lw=2,
                        label=f"Default AUC={auc_base:.4f}")
            roc_ax.plot(fpr_tuned, tpr_tuned, color="red", lw=2, linestyle=":",
                        label=f"Tuned AUC={auc_tuned:.4f}")
            roc_ax.plot([0,1],[0,1],"k--",lw=1)  # chance diagonal
            roc_ax.set_title(f"{name}\nROC Curve")
            roc_ax.set_xlabel("False Positive Rate")
            roc_ax.set_ylabel("True Positive Rate")
            roc_ax.legend(loc="lower right")
            roc_ax.grid(True)
            roc_ax.set_aspect("equal","box")

            # 2) Confusion matrix of the default model
            _plot_small_cm(default_cm_ax, confusion_matrix(y_test, baseline.predict(X_test)))
            default_cm_ax.set_title("ConfMatrix\n(Default)")

            # 3) Confusion matrix of the tuned model
            _plot_small_cm(tuned_cm_ax, confusion_matrix(y_test, tuned.predict(X_test)))
            tuned_cm_ax.set_title("ConfMatrix\n(Tuned)")

            progress.update(1)

    plt.tight_layout()
    plt.show()

def _plot_small_cm(ax, cm):
    """Render a 2x2 confusion matrix as four coloured cells on ``ax``.

    Diagonal cells are green and off-diagonal cells red; each cell shows the
    count ``cm[row, col]``. Only the top-left 2x2 entries of ``cm`` are read.
    """
    side = 0.2
    palette = np.array([["#a5e8a3","#ffaaaa"],["#ffaaaa","#a5e8a3"]])

    for flat_index in range(4):
        row, col = divmod(flat_index, 2)
        left, top = col*side, row*side
        cell = plt.Rectangle(
            (left, top), side, side,
            facecolor=palette[row, col], edgecolor='none'
        )
        ax.add_patch(cell)
        ax.text(
            left+side/2, top+side/2, str(cm[row, col]),
            ha='center', va='center', fontsize=8, color='black'
        )

    extent = 2*side
    ax.set_xlim(0, extent)
    ax.set_ylim(0, extent)

    class_names = ["Neg","Pos"]
    ax.set_xticks([side/2, side*1.5])
    ax.set_xticklabels(class_names, fontsize=8)
    ax.set_yticks([side/2, side*1.5])
    ax.set_yticklabels(class_names, fontsize=8)

    # Flip the y axis so row 0 is drawn at the top.
    ax.invert_yaxis()
    ax.set_aspect("equal")
    ax.set_frame_on(False)
    for spine in ax.spines.values():
        spine.set_visible(False)

###############################################################################
#           Quick "explore_data" for Pairplot + Stats
###############################################################################
def dynamic_explore_data(df: pd.DataFrame, max_cols: int):
    """Print a quick overview of ``df`` and draw a pairplot coloured by 'target'.

    Reports the total number of missing values, displays the transposed
    ``describe(include='all')`` table and, when a 'target' column exists,
    plots the first ``max_cols`` numeric feature columns against each other.

    Args:
        df: data set to explore.
        max_cols: maximum number of numeric feature columns in the pairplot.
    """
    n_missing = df.isnull().sum().sum()
    if n_missing == 0:
        print("No missing values.\n")
    else:
        print(f"Missing values found: {n_missing}\n")

    print("Descriptive Stats:")
    display(df.describe(include='all').T)

    if "target" not in df.columns:
        print("No 'target' => skipping pairplot.\n")
        return

    # Numeric features only; 'target' is excluded here and re-added as the hue.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if "target" in numeric_cols:
        numeric_cols.remove("target")
    use_cols = numeric_cols[:max_cols]
    if not use_cols:
        print("No numeric columns => skipping pairplot.")
        return

    # Every name in pcols comes from df.columns, so no further membership
    # check is required before indexing.
    pcols = use_cols + ["target"]
    print("Plotting Pairplot:")
    with tqdm(total=1, desc="Pairplot") as pb:
        g = sns.pairplot(
            df[pcols], hue="target", palette="Greys", markers=["o","D"],
            plot_kws={"s":25,"alpha":0.75}, height=3
        )
        # f-string so the title shows the actual column limit.
        g.fig.suptitle(f"Pairplot of up to {max_cols} Numeric + 'target'", y=1.02)
        plt.show()
        pb.update(1)

###############################################################################
#              UI + Execution
###############################################################################
# --- Data source -------------------------------------------------------------
data_source_title = widgets.HTML(value="<b>Select data source:</b>")
data_source_widget = widgets.RadioButtons(
    options=["heart_disease (embedded)", "breast_cancer (embedded)", "file"],
    value="heart_disease (embedded)",
)

# --- CSV path (only read when the "file" source is chosen) --------------------
file_path_title = widgets.HTML(value="<b>CSV File Path (if 'file' selected):</b>")
file_path_widget = widgets.Text(value="heart_disease.csv")

# --- Hyperparameter search strategy -------------------------------------------
search_title = widgets.HTML(value="<b>Selected search method:</b>")
search_method_widget = widgets.RadioButtons(
    options=['grid', 'random', 'bayesian', 'none'],
    value='random',
)

# --- Feature scaling ----------------------------------------------------------
scaling_title = widgets.HTML(value="<b>Selected scaling method:</b>")
scaling_method_widget = widgets.RadioButtons(
    options=[
        'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler',
        'Normalizer', 'QuantileTransformer', 'PowerTransformer', 'none',
    ],
    value='StandardScaler',
)

# --- Model selection: one checkbox per model, all ticked initially ------------
models_title = widgets.HTML(value="<b>Selected models:</b>")
model_options = [
    'Naive Bayes',
    'KNN',
    'Logistic Regression',
    'Linear Discriminant Analysis',
    'Quadratic Discriminant Analysis',
    'Decision Tree',
    'SVM',
    'Random Forest',
    'Extra Trees',
    'AdaBoost',
    'Gradient Boosting',
    'XGBoost',
    'LightGBM',
]
model_checkboxes = [
    widgets.Checkbox(value=True, description=label) for label in model_options
]
model_selection_box = widgets.VBox(model_checkboxes)

# --- Pairplot width, run button and output area -------------------------------
max_cols_title = widgets.HTML(value="<b>Max numeric columns for pairplot:</b>")
max_cols_widget = widgets.IntSlider(value=5, min=1, max=50, step=1)
run_button = widgets.Button(description="Run Analysis")
output = widgets.Output()

# Render the form: each heading followed by its control, in this order.
display(data_source_title, data_source_widget)
display(file_path_title, file_path_widget)
display(search_title, search_method_widget)
display(scaling_title, scaling_method_widget)
display(models_title, model_selection_box)
display(max_cols_title, max_cols_widget)
display(run_button, output)

def _set_controls_disabled(disabled: bool) -> None:
    """Enable or disable every input widget of the form in one call."""
    controls = (
        run_button, data_source_widget, file_path_widget,
        search_method_widget, scaling_method_widget, max_cols_widget,
        *model_checkboxes,
    )
    for control in controls:
        control.disabled = disabled


def _best_by_auc(results: dict):
    """Return ``(name, auc)`` of the entry with the highest "AUC".

    Yields ``(None, -inf)`` when ``results`` is empty or no AUC compares
    greater than ``-inf``.
    """
    best_name, best_auc = None, -np.inf
    for name, metrics in results.items():
        if metrics["AUC"] > best_auc:
            best_name, best_auc = name, metrics["AUC"]
    return best_name, best_auc


def on_button_clicked(_):
    """Run the whole analysis for the current widget selections.

    Loads the chosen data set, explores it, trains the selected models with
    default settings, tunes them, shows and stores the comparison table and
    finally plots ROC curves and confusion matrices. All output goes to the
    ``output`` widget. The form is locked while the analysis runs.

    Args:
        _: the clicked button (unused).
    """
    _set_controls_disabled(True)
    try:
        with output:
            clear_output()

            data_source    = data_source_widget.value
            # Strip stray whitespace once so loading and result naming agree.
            file_path      = file_path_widget.value.strip()
            max_cols       = max_cols_widget.value
            search_method  = search_method_widget.value
            scaling_method = scaling_method_widget.value
            selected_ms    = [cb.description for cb in model_checkboxes if cb.value]

            print(f"Data Source: {data_source}")
            if data_source == "file":
                print(f"CSV File Path: {file_path}")
            print(f"Max numeric columns: {max_cols}")
            print(f"Search method: {search_method}")
            print(f"Scaling method: {scaling_method}")
            print(f"Selected models: {selected_ms}\n")

            if data_source == "heart_disease (embedded)":
                result_file = "heart_disease_result.csv"
                df = pd.read_csv(io.StringIO(HEART_DISEASE_CSV_DATA))
            elif data_source == "breast_cancer (embedded)":
                result_file = "breast_cancer_result.csv"
                df = pd.read_csv(io.StringIO(BREAST_CANCER_CSV_DATA))
            else:
                # Derive "<stem>_result.csv" from the path stem so the result
                # file can never coincide with the input file, whatever the
                # extension (or lack of one) of the input path.
                stem, _ext = os.path.splitext(file_path)
                result_file = f"{stem}_result.csv"
                df = load_file_data(file_path)

            dynamic_explore_data(df, max_cols)

            if "target" not in df.columns:
                print("No 'target' => stopping pipeline.")
                return

            X = df.drop("target", axis=1)
            y = df["target"]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y, random_state=42
            )

            # Build and fit the default (untuned) models
            all_models = define_all_models(scaling_method)
            default_models = {m: all_models[m] for m in selected_ms}
            print("Training default models:")
            default_results = {}
            for mname, model in tqdm(default_models.items(), desc="Default Training"):
                model.fit(X_train, y_train)
                default_results[mname] = evaluate_model(model, X_test, y_test)

            param_grids = define_param_grids()
            tuned_models, tuned_results, best_params = tune_models(
                default_models, param_grids, X_train, y_train, X_test, y_test, search_method
            )

            df_comp, styled_comp = build_comparison_table(
                default_results, tuned_results, search_method, scaling_method
            )
            print("\nComparison of Default vs. Tuned:")
            display(styled_comp)

            append_results_to_csv(df_comp, result_file)
            print(f"\nAppended results to '{result_file}'\n")

            best_tuned_name, best_tuned_auc = _best_by_auc(tuned_results)
            if best_tuned_name:
                print("Highest AUC (Tuned):", best_tuned_name, best_tuned_auc)

            best_def_name, best_def_auc = _best_by_auc(default_results)
            if best_def_name:
                print("Highest AUC (Default):", best_def_name, best_def_auc)

            print("\nPlotting combined ROC + Confusions:")
            plot_roc_and_confusion(default_models, tuned_models, X_test, y_test)
    finally:
        # Always unlock the form: this also runs after the early return for a
        # missing 'target' column and when an error escapes the analysis.
        _set_controls_disabled(False)

run_button.on_click(on_button_clicked)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m1.4/1.6 MB[0m [31m43.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m37.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25h

HTML(value='<b>Select data source:</b>')

RadioButtons(options=('heart_disease (embedded)', 'breast_cancer (embedded)', 'file'), value='heart_disease (e…

HTML(value="<b>CSV File Path (if 'file' selected):</b>")

Text(value='heart_disease.csv')

HTML(value='<b>Selected search method:</b>')

RadioButtons(index=1, options=('grid', 'random', 'bayesian', 'none'), value='random')

HTML(value='<b>Selected scaling method:</b>')

RadioButtons(options=('StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler', 'Normalizer', 'Quantil…

HTML(value='<b>Selected models:</b>')

VBox(children=(Checkbox(value=True, description='Naive Bayes'), Checkbox(value=True, description='KNN'), Check…

HTML(value='<b>Max numeric columns for pairplot:</b>')

IntSlider(value=5, max=50, min=1)

Button(description='Run Analysis', style=ButtonStyle())

Output()

In [None]:
!pip install --quiet ipywidgets xgboost scikit-optimize tqdm

import warnings
warnings.filterwarnings("ignore")

import io
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm

from sklearn.model_selection import (
    train_test_split, GridSearchCV, RandomizedSearchCV
)
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler,
    Normalizer, QuantileTransformer, PowerTransformer
)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve, confusion_matrix
)

# Additional:
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, GradientBoostingClassifier
)
import xgboost as xgb
import lightgbm as lgb
from skopt import BayesSearchCV

###############################################################################
#           Truncated EMBEDDED CSV DATA (Replace w/ Full or choose "file")
###############################################################################
# Embedded sample of the heart-disease data set, stored as CSV text so the
# cell does not depend on an external file.
# NOTE: the "... (Replace with full untruncated lines) ..." row is a placeholder
# inside the string literal; replace it with the real rows (or use the "file"
# source) before relying on this data.
HEART_DISEASE_CSV_DATA = """age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
63,1,0,145,233,1,2,150,0,2.3,2,0,2,0
67,1,3,160,286,0,2,108,1,1.5,1,3,1,1
... (Replace with full untruncated lines) ...
57,0,3,140,241,0,0,123,1,0.2,1,0,3,1
"""

# Embedded sample of the breast-cancer data set; here "target" is the first
# column. The same placeholder caveat as above applies.
BREAST_CANCER_CSV_DATA = """target,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
1,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
... (Replace with full untruncated lines) ...
"""

###############################################################################
#                Load from file or embedded
###############################################################################
def load_file_data(file_path: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame.

    Raises:
        FileNotFoundError: when ``file_path`` does not exist.
    """
    path_is_missing = not os.path.exists(file_path)
    if path_is_missing:
        message = f"CSV file '{file_path}' not found."
        raise FileNotFoundError(message)

    frame = pd.read_csv(file_path)
    return frame

###############################################################################
#            Create Pipeline + Scaler + Models (including QDA & LDA)
###############################################################################
def get_scaler(method: str):
    """Look up the scaler for ``method`` and return a new instance of it.

    Returns ``None`` for ``"none"`` and a ``StandardScaler`` for any name
    that is not recognised.
    """
    # The only scaler that needs a constructor argument is handled up front.
    if method == "QuantileTransformer":
        return QuantileTransformer(output_distribution="normal")

    plain_scalers = {
        "StandardScaler": StandardScaler,
        "MinMaxScaler": MinMaxScaler,
        "RobustScaler": RobustScaler,
        "MaxAbsScaler": MaxAbsScaler,
        "Normalizer": Normalizer,
        "PowerTransformer": PowerTransformer,
        "none": None,
    }
    scaler_cls = plain_scalers.get(method, StandardScaler)
    if scaler_cls is None:
        return None
    return scaler_cls()

def create_pipeline(clf, scaling_method: str) -> Pipeline:
    """Wrap ``clf`` in a pipeline with feature selection and optional scaling.

    Steps are named "selector", "scaler" (omitted when ``get_scaler`` returns
    ``None``) and "clf".
    """
    selector_step = ("selector", SelectKBest(score_func=f_classif, k="all"))
    scaler = get_scaler(scaling_method)
    classifier_step = ("clf", clf)

    if scaler is None:
        return Pipeline([selector_step, classifier_step])
    return Pipeline([selector_step, ("scaler", scaler), classifier_step])

def define_all_models(scaling_method: str) -> dict:
    """Return a name -> pipeline dict covering every supported classifier.

    Each classifier is wrapped by ``create_pipeline`` using the requested
    scaling method.
    """

    def wrap(estimator):
        return create_pipeline(estimator, scaling_method)

    models = {}
    models["Naive Bayes"] = wrap(GaussianNB())
    models["KNN"] = wrap(KNeighborsClassifier())
    models["Logistic Regression"] = wrap(LogisticRegression(max_iter=1000, random_state=42))
    models["Linear Discriminant Analysis"] = wrap(LinearDiscriminantAnalysis())
    models["Quadratic Discriminant Analysis"] = wrap(QuadraticDiscriminantAnalysis())
    models["Decision Tree"] = wrap(DecisionTreeClassifier(random_state=42))
    # probability=True so the SVM exposes predict_proba for evaluation.
    models["SVM"] = wrap(SVC(probability=True, random_state=42))
    models["Random Forest"] = wrap(RandomForestClassifier(random_state=42))
    models["Extra Trees"] = wrap(ExtraTreesClassifier(random_state=42))
    models["AdaBoost"] = wrap(AdaBoostClassifier(random_state=42))
    models["Gradient Boosting"] = wrap(GradientBoostingClassifier(random_state=42))
    models["XGBoost"] = wrap(xgb.XGBClassifier(eval_metric="logloss", random_state=42))
    models["LightGBM"] = wrap(
        lgb.LGBMClassifier(random_state=42, verbose=-1, force_col_wise=True)
    )
    return models

def evaluate_model(model, X_test, y_test) -> dict:
    """Compute test-set metrics for a fitted model.

    Accuracy, precision, recall and F1 use ``model.predict``; AUC uses
    column 1 of ``model.predict_proba``. Returns them in a dict keyed by
    "Accuracy", "Precision", "Recall", "F1 Score" and "AUC".
    """
    hard_labels = model.predict(X_test)
    class1_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, hard_labels)
    # zero_division=0 reports 0 instead of warning on an undefined ratio.
    precision = precision_score(y_test, hard_labels, zero_division=0)
    recall = recall_score(y_test, hard_labels, zero_division=0)
    f1 = f1_score(y_test, hard_labels, zero_division=0)
    auc = roc_auc_score(y_test, class1_proba)

    names = ("Accuracy", "Precision", "Recall", "F1 Score", "AUC")
    return dict(zip(names, (accuracy, precision, recall, f1, auc)))

###############################################################################
#                Parameter grids: includes LDA + QDA
###############################################################################
def define_param_grids() -> dict:
    """Return a search space per model name.

    Every space starts with ``selector__k`` over ``[5, "all"]``, followed by
    the classifier options under their ``clf__`` pipeline prefix.
    """
    forest_options = lambda: {"n_estimators": [50, 100, 200], "max_depth": [None, 5, 10]}

    classifier_options = {
        "Naive Bayes": {},
        "KNN": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]},
        "Logistic Regression": {"C": [0.01, 0.1, 1, 10], "penalty": ["l2"]},
        # LDA: only the selector is tuned.
        "Linear Discriminant Analysis": {},
        "Quadratic Discriminant Analysis": {
            "reg_param": [0.0, 0.1, 0.2, 0.5, 1.0],
            "tol": [1e-4, 1e-3, 1e-2],
        },
        "Decision Tree": {
            "max_depth": [None, 3, 5, 10],
            "min_samples_split": [2, 5, 10],
        },
        "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"], "gamma": ["scale"]},
        "Random Forest": forest_options(),
        "Extra Trees": forest_options(),
        "AdaBoost": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 1]},
        "Gradient Boosting": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 1],
            "max_depth": [3, 5, 7],
        },
        "XGBoost": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5, 7],
        },
        "LightGBM": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "num_leaves": [20, 31, 50],
        },
    }

    return {
        model_name: {
            "selector__k": [5, "all"],
            **{f"clf__{option}": values for option, values in options.items()},
        }
        for model_name, options in classifier_options.items()
    }

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
def tune_models(models, param_grids, X_train, y_train, X_test, y_test, search_method: str):
    """
    Tune each pipeline in `models` and score the outcome on the test split.

    For every (name, pipeline) pair the search space is looked up in
    `param_grids`. When `search_method` is "none" or no space is registered,
    the pipeline is simply fitted as-is. Otherwise a 3-fold search scored by
    ROC AUC is run: GridSearchCV for "grid", RandomizedSearchCV for "random",
    and BayesSearchCV for any other value.

    Returns:
        (tuned_models, tuned_results, best_params) -- three dicts keyed by
        model name; best_params holds None for models that were not tuned.
    """
    tuned_models, tuned_results, best_params = {}, {}, {}
    # Settings shared by all three search strategies.
    shared_cv = {"cv": 3, "scoring": "roc_auc", "n_jobs": -1}

    for mname, pipeline in tqdm(models.items(), desc="Hyperparameter Tuning"):
        print(f"Tuning {mname}...")
        space = param_grids.get(mname, {})
        run_search = search_method != "none" and bool(space)

        if run_search:
            if search_method == "grid":
                searcher = GridSearchCV(pipeline, space, **shared_cv)
            elif search_method == "random":
                searcher = RandomizedSearchCV(
                    pipeline, space, n_iter=10, verbose=0, random_state=42, **shared_cv
                )
            else:  # bayesian
                searcher = BayesSearchCV(
                    pipeline, space, n_iter=10, random_state=42, **shared_cv
                )

            searcher.fit(X_train, y_train)
            winner = searcher.best_estimator_
            tuned_models[mname] = winner
            tuned_results[mname] = evaluate_model(winner, X_test, y_test)
            best_params[mname] = searcher.best_params_
            print(f"  [{search_method.capitalize()}] Best CV AUC: {searcher.best_score_:.4f} | Best Params: {searcher.best_params_}\n")
        else:
            # Nothing to search: fit the given pipeline directly.
            pipeline.fit(X_train, y_train)
            tuned_models[mname] = pipeline
            tuned_results[mname] = evaluate_model(pipeline, X_test, y_test)
            best_params[mname] = None
            print(f"  No tuning for {mname}. Using default.\n")

    return tuned_models, tuned_results, best_params

###############################################################################
#                Build Comparison Table & CSV
###############################################################################
def build_comparison_table(default_results, tuned_results, search_method, scaling_method):
    """
    Assemble the default-vs-tuned metric table for all models.

    Each model in `default_results` becomes one row with a "(Default)" and a
    "(Tuned)" column per metric, followed by the search method, the scaling
    method and a timestamp of when the table was built.

    Returns:
        (df_comp, styled) -- the plain DataFrame and a Styler whose "(Tuned)"
        cells are green when higher than the default, red when lower and
        black otherwise.
    """
    metric_names = ("Accuracy", "Precision", "Recall", "F1 Score", "AUC")

    table = {}
    for mname in default_results:
        record = {}
        for metric in metric_names:
            record[f"{metric} (Default)"] = default_results[mname][metric]
            record[f"{metric} (Tuned)"] = tuned_results[mname][metric]
        table[mname] = record

    df_comp = pd.DataFrame(table).T
    df_comp["Selected search method"] = search_method
    df_comp["Selected scaling method"] = scaling_method
    df_comp["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def color_cmp(row):
        # One style per column, in column order: default cells stay unstyled,
        # tuned cells are colored by how they compare to the default.
        cell_styles = []
        for metric in metric_names:
            default_val = row[f"{metric} (Default)"]
            tuned_val = row[f"{metric} (Tuned)"]
            if tuned_val > default_val:
                tuned_style = "color: green"
            elif tuned_val < default_val:
                tuned_style = "color: red"
            else:
                tuned_style = "color: black"
            cell_styles += ["", tuned_style]
        # The three trailing bookkeeping columns are never styled.
        return cell_styles + [""] * 3

    return df_comp, df_comp.style.apply(color_cmp, axis=1)

def append_results_to_csv(df_comp, fname):
    """
    Append the comparison table to the CSV file `fname`.

    The header row is written only when the file does not exist yet or is
    empty; otherwise the rows are appended below the existing content. An
    existing but empty file is treated like a new one so that the CSV never
    ends up with data rows and no header.
    """
    has_content = os.path.exists(fname) and os.path.getsize(fname) > 0
    df_comp.to_csv(
        fname,
        mode="a" if has_content else "w",
        header=not has_content,
        index=True,
    )

###############################################################################
#               Combined ROC + Confusion per model
###############################################################################
def plot_roc_and_confusion(default_models, tuned_models, X_test, y_test):
    """
    Draw one figure with a row of three panels per model:
       [col0] default + tuned ROC curves, [col1] default confusion matrix,
       [col2] tuned confusion matrix.

    Models are plotted in alphabetical order of their names. Nothing is
    drawn when `default_models` is empty.
    """
    if not default_models:
        print("No models => skipping combined plot.")
        return

    from tqdm.notebook import tqdm

    def roc_parts(model):
        # Positive-class scores -> (fpr, tpr, auc) on the test split.
        scores = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, scores)
        return fpr, tpr, roc_auc_score(y_test, scores)

    mnames = sorted(default_models.keys())
    n_models = len(mnames)
    # squeeze=False keeps the axes grid 2-D even for a single model row.
    fig, axes = plt.subplots(
        nrows=n_models, ncols=3, figsize=(12, 4 * n_models), squeeze=False
    )

    print("Plotting combined ROC + confusion for each model in 1 figure...")
    with tqdm(total=n_models, desc="Models") as pbar:
        for row_idx, mname in enumerate(mnames):
            ax_roc, ax_def, ax_tun = axes[row_idx]
            dmodel = default_models[mname]
            tmodel = tuned_models[mname]

            # Panel 1: both ROC curves plus the chance diagonal.
            fpr_d, tpr_d, auc_d = roc_parts(dmodel)
            fpr_t, tpr_t, auc_t = roc_parts(tmodel)
            ax_roc.plot(fpr_d, tpr_d, color="gray", lw=2,
                        label=f"Default AUC={auc_d:.4f}")
            ax_roc.plot(fpr_t, tpr_t, color="red", lw=2, linestyle=":",
                        label=f"Tuned AUC={auc_t:.4f}")
            ax_roc.plot([0, 1], [0, 1], "k--", lw=1)
            ax_roc.set_title(f"{mname}\nROC Curve")
            ax_roc.set_xlabel("False Positive Rate")
            ax_roc.set_ylabel("True Positive Rate")
            ax_roc.legend(loc="lower right")
            ax_roc.grid(True)
            ax_roc.set_aspect("equal", "box")

            # Panel 2: confusion matrix of the default model.
            _plot_small_cm(ax_def, confusion_matrix(y_test, dmodel.predict(X_test)))
            ax_def.set_title("Conf. Matrix\n(Default)")

            # Panel 3: confusion matrix of the tuned model.
            _plot_small_cm(ax_tun, confusion_matrix(y_test, tmodel.predict(X_test)))
            ax_tun.set_title("Conf. Matrix\n(Tuned)")

            pbar.update(1)

    plt.tight_layout()
    plt.show()

def _plot_small_cm(ax, cm):
    """
    Render a 2x2 confusion matrix on `ax` as four colored squares.

    Diagonal cells are green, off-diagonal cells are red, and each cell shows
    its count from `cm`. Row 0 is drawn at the top (the y axis is inverted),
    ticks are labelled "Neg"/"Pos", and the frame and spines are hidden.
    """
    side = 0.2
    palette = np.array([
        ["#a5e8a3", "#ffaaaa"],
        ["#ffaaaa", "#a5e8a3"]
    ])

    for r, c in np.ndindex(2, 2):
        left = c * side
        bottom = r * side
        ax.add_patch(plt.Rectangle(
            (left, bottom), side, side,
            facecolor=palette[r, c], edgecolor='none'
        ))
        ax.text(left + side / 2, bottom + side / 2,
                str(cm[r, c]), ha='center', va='center',
                fontsize=8, color='black')

    extent = 2 * side
    tick_positions = [side / 2, side * 1.5]
    tick_labels = ["Neg", "Pos"]

    ax.set_xlim(0, extent)
    ax.set_ylim(0, extent)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, fontsize=8)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(tick_labels, fontsize=8)
    ax.invert_yaxis()
    ax.set_aspect("equal")
    ax.set_frame_on(False)
    for spine in ax.spines.values():
        spine.set_visible(False)

###############################################################################
#         Custom Full Profiling (manual expanded stats)
###############################################################################
def custom_full_profiling(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a DataFrame with one row of expanded info per column of `df`:
      - Column name
      - Data type
      - #Non-Null
      - #Missing
      - #Unique (missing values count as a distinct value)
      - Min, Max, Mean, Median, Std (numeric columns only)
      - Corr(target) (numeric columns only, when 'target' exists and is numeric)
      - Top, Freq: most frequent value and its count (non-numeric columns only)

    Fields that do not apply to a column are left as None.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    if 'target' in df.columns and is_numeric(df['target']):
        # Correlate only the numeric columns: in recent pandas, corrwith no
        # longer drops non-numeric columns silently and fails on strings.
        numeric_mask = [is_numeric(dt) for dt in df.dtypes]
        correlations = df.loc[:, numeric_mask].corrwith(df['target'])
    else:
        correlations = pd.Series(dtype='float64')

    rows = []
    for col in df.columns:
        series = df[col]
        # Start with every statistic blank; each branch fills in its own part.
        info = {
            'Column': col,
            'Dtype': str(series.dtype),
            '#Non-Null': series.notnull().sum(),
            '#Missing': series.isnull().sum(),
            '#Unique': series.nunique(dropna=False),
            'Min': None,
            'Max': None,
            'Mean': None,
            'Median': None,
            'Std': None,
            'Corr(target)': None,
            'Top': None,
            'Freq': None,
        }

        if is_numeric(series.dtype):
            info['Min'] = series.min(skipna=True)
            info['Max'] = series.max(skipna=True)
            info['Mean'] = series.mean(skipna=True)
            info['Median'] = series.median(skipna=True)
            info['Std'] = series.std(skipna=True)
            if col in correlations.index:
                info['Corr(target)'] = correlations[col]
        else:
            # value_counts sorts by frequency, so the first entry is the mode.
            counts = series.value_counts(dropna=False)
            if len(counts) > 0:
                info['Top'] = counts.index[0]
                info['Freq'] = counts.iloc[0]

        rows.append(info)

    return pd.DataFrame(rows)

###############################################################################
#              Data Exploration (calls custom_full_profiling)
###############################################################################
def dynamic_explore_data(df: pd.DataFrame, max_cols: int):
    """
    Show an exploratory overview of `df`.

    Displays the per-column profiling table, prints the total number of
    missing values, and, when a 'target' column exists, draws a seaborn
    pairplot of the first `max_cols` numeric feature columns colored by
    target.

    Args:
        df: data to explore.
        max_cols: maximum number of numeric feature columns in the pairplot.
    """
    # 1) Show full custom profiling
    profile_df = custom_full_profiling(df)
    print("=== Full Profiling Info ===")
    display(profile_df)

    # 2) Summarize missing values
    total_missing = df.isnull().sum().sum()
    if total_missing == 0:
        print("No missing values (above).")
    else:
        print(f"Missing values found: {total_missing} (above).")

    # 3) Pairplot, only when there is a target to color by
    if "target" not in df.columns:
        print("\nNo 'target' => skipping pairplot.")
        return

    numeric_cols = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c != "target"
    ]
    use_cols = numeric_cols[:max_cols]
    if not use_cols:
        print("\nNo numeric cols => skipping pairplot.")
        return

    pcols = use_cols + ["target"]

    # seaborn requires exactly one marker per hue level, so cycle the two
    # base markers instead of assuming the target has exactly two classes.
    base_markers = ["o", "D"]
    n_classes = df["target"].nunique()
    markers = [base_markers[i % len(base_markers)] for i in range(n_classes)]

    print("\nPlotting Pairplot (may take time):")
    with tqdm(total=1, desc="Pairplot") as pbar:
        g = sns.pairplot(
            df[pcols], hue="target", palette="Greys",
            markers=markers, plot_kws={"s": 25, "alpha": 0.75}, height=3
        )
        g.fig.suptitle(f"Pairplot of up to {max_cols} Numeric + Target", y=1.02)
        plt.show()
        pbar.update(1)

###############################################################################
#               UI + Execution
###############################################################################
# Each control is created next to the bold HTML caption displayed above it.

# Data source: one of the embedded datasets or a CSV file on disk.
data_source_title = widgets.HTML(value="<b>Select data source:</b>")
data_source_widget = widgets.RadioButtons(
    value="heart_disease (embedded)",
    options=["heart_disease (embedded)", "breast_cancer (embedded)", "file"],
)

# CSV path, read only when the "file" data source is selected.
file_path_title = widgets.HTML(value="<b>CSV File Path (if 'file' selected):</b>")
file_path_widget = widgets.Text(value="heart_disease.csv")

# Hyperparameter search strategy.
search_title = widgets.HTML(value="<b>Selected search method:</b>")
search_method_widget = widgets.RadioButtons(
    value='random',
    options=['grid', 'random', 'bayesian', 'none'],
)

# Feature scaling applied inside every pipeline.
scaling_title = widgets.HTML(value="<b>Selected scaling method:</b>")
scaling_method_widget = widgets.RadioButtons(
    value='StandardScaler',
    options=[
        'StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler',
        'Normalizer', 'QuantileTransformer', 'PowerTransformer', 'none',
    ],
)

# One checkbox per model, all ticked by default.
models_title = widgets.HTML(value="<b>Selected models:</b>")
model_options = [
    'Naive Bayes', 'KNN', 'Logistic Regression', 'Linear Discriminant Analysis',
    'Quadratic Discriminant Analysis', 'Decision Tree', 'SVM', 'Random Forest',
    'Extra Trees', 'AdaBoost', 'Gradient Boosting', 'XGBoost', 'LightGBM',
]
model_checkboxes = [
    widgets.Checkbox(value=True, description=label) for label in model_options
]
model_selection_box = widgets.VBox(model_checkboxes)

# Cap on the number of numeric columns shown in the pairplot.
max_cols_title = widgets.HTML(value="<b>Max numeric columns for pairplot:</b>")
max_cols_widget = widgets.IntSlider(value=5, min=1, max=50, step=1)

# Trigger button and the output area that receives all analysis output.
run_button = widgets.Button(description="Run Analysis")
output = widgets.Output()

# Render every caption with its control, top to bottom.
for _caption, _control in (
    (data_source_title, data_source_widget),
    (file_path_title, file_path_widget),
    (search_title, search_method_widget),
    (scaling_title, scaling_method_widget),
    (models_title, model_selection_box),
    (max_cols_title, max_cols_widget),
    (run_button, output),
):
    display(_caption, _control)

def _set_controls_disabled(disabled: bool) -> None:
    """Enable or disable every input widget of the UI in one go."""
    controls = (
        run_button, data_source_widget, file_path_widget,
        search_method_widget, scaling_method_widget, max_cols_widget,
        *model_checkboxes,
    )
    for control in controls:
        control.disabled = disabled


def _result_file_for(file_path: str) -> str:
    """Derive '<name>_result.csv' from a data file path, whatever its extension."""
    root, _ext = os.path.splitext(file_path.strip())
    return f"{root}_result.csv"


def _highest_auc(results: dict):
    """Return (name, auc) of the first entry with the strictly highest AUC."""
    best_name, best_auc = None, -np.inf
    for name, metrics in results.items():
        if metrics["AUC"] > best_auc:
            best_name, best_auc = name, metrics["AUC"]
    return best_name, best_auc


def on_button_clicked(_):
    """
    Run the whole analysis for the current widget selections.

    Steps: load the chosen dataset, explore it, split train/test, train the
    selected models with default settings, tune them, show and persist the
    default-vs-tuned comparison, report the best AUCs and plot the ROC curves
    and confusion matrices. All output goes to the `output` widget.

    The input widgets are disabled while the analysis runs and are always
    re-enabled afterwards, including when the run stops early.
    """
    _set_controls_disabled(True)
    try:
        with output:
            clear_output()

            data_source = data_source_widget.value
            # Stray whitespace from the text box would make the path invalid.
            file_path = file_path_widget.value.strip()
            max_cols = max_cols_widget.value
            search_method = search_method_widget.value
            scaling_method = scaling_method_widget.value
            selected_ms = [cb.description for cb in model_checkboxes if cb.value]

            print(f"Data Source: {data_source}")
            if data_source == "file":
                print(f"CSV File Path: {file_path}")
            print(f"Max numeric columns: {max_cols}")
            print(f"Search method: {search_method}")
            print(f"Scaling method: {scaling_method}")
            print(f"Selected models: {selected_ms}\n")

            # Load the data and decide the result CSV name
            if data_source == "heart_disease (embedded)":
                result_file = "heart_disease_result.csv"
                df = pd.read_csv(io.StringIO(HEART_DISEASE_CSV_DATA))
            elif data_source == "breast_cancer (embedded)":
                result_file = "breast_cancer_result.csv"
                df = pd.read_csv(io.StringIO(BREAST_CANCER_CSV_DATA))
            else:
                # Built from the path stem so the result file can never be
                # the input data file itself.
                result_file = _result_file_for(file_path)
                df = load_file_data(file_path)

            # 1) Explore w/ Full Profiling + pairplot
            dynamic_explore_data(df, max_cols)

            if "target" not in df.columns:
                print("No 'target' => stopping pipeline.")
                return

            if not selected_ms:
                # Without models the comparison table would be empty and a
                # header-only result CSV would be written.
                print("No models selected => stopping pipeline.")
                return

            # 2) train/test
            X = df.drop("target", axis=1)
            y = df["target"]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y, random_state=42
            )

            # 3) Default models
            all_models = define_all_models(scaling_method)
            default_models = {m: all_models[m] for m in selected_ms}
            print("Training Default Models:")
            default_results = {}
            for mname, model in tqdm(default_models.items(), desc="Default Training"):
                model.fit(X_train, y_train)
                default_results[mname] = evaluate_model(model, X_test, y_test)

            # 4) Tuning
            param_grids = define_param_grids()
            tuned_models, tuned_results, best_ps = tune_models(
                default_models, param_grids, X_train, y_train, X_test, y_test, search_method
            )

            # 5) Compare
            df_comp, styled_comp = build_comparison_table(
                default_results, tuned_results, search_method, scaling_method
            )
            print("\nComparison of Default vs. Tuned:")
            display(styled_comp)

            append_results_to_csv(df_comp, result_file)
            print(f"\nResults appended to '{result_file}'.\n")

            best_tuned_name, best_tuned_auc = _highest_auc(tuned_results)
            if best_tuned_name is not None:
                print("Highest AUC (Tuned):", best_tuned_name, best_tuned_auc)

            best_def_name, best_def_auc = _highest_auc(default_results)
            if best_def_name is not None:
                print("Highest AUC (Default):", best_def_name, best_def_auc)

            # 6) Combined ROC + Confusions
            print("\nPlotting combined ROC + Confusions:")
            plot_roc_and_confusion(default_models, tuned_models, X_test, y_test)
    finally:
        # Runs on normal completion, early return and errors alike.
        _set_controls_disabled(False)

# Register the handler so each click on "Run Analysis" calls on_button_clicked.
run_button.on_click(on_button_clicked)


HTML(value='<b>Select data source:</b>')

RadioButtons(options=('heart_disease (embedded)', 'breast_cancer (embedded)', 'file'), value='heart_disease (e…

HTML(value="<b>CSV File Path (if 'file' selected):</b>")

Text(value='heart_disease.csv')

HTML(value='<b>Selected search method:</b>')

RadioButtons(index=1, options=('grid', 'random', 'bayesian', 'none'), value='random')

HTML(value='<b>Selected scaling method:</b>')

RadioButtons(options=('StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler', 'Normalizer', 'Quantil…

HTML(value='<b>Selected models:</b>')

VBox(children=(Checkbox(value=True, description='Naive Bayes'), Checkbox(value=True, description='KNN'), Check…

HTML(value='<b>Max numeric columns for pairplot:</b>')

IntSlider(value=5, max=50, min=1)

Button(description='Run Analysis', style=ButtonStyle())

Output()

In [1]:
!pip install --quiet --upgrade scikit-learn scikeras tensorflow ipywidgets tqdm xgboost lightgbm

import warnings
warnings.filterwarnings("ignore")

import io
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm

# Scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler,
    Normalizer, QuantileTransformer, PowerTransformer
)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve, confusion_matrix
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, GradientBoostingClassifier
)

import xgboost as xgb  # <--- We'll use xgb.XGBClassifier now
import lightgbm as lgb

# SciKeras for KerasClassifier
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

###############################################################################
#                  EMBEDDED CSV DATA (short placeholders)
###############################################################################
# The two constants below hold CSV text: a header line followed by only five
# sample rows each. Replace these short placeholders with your full data lines
# or pick "file".

# Heart-disease sample: 13 feature columns followed by the 'target' label.
HEART_DISEASE_CSV_DATA = """age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
63,1,3,145,233,1,2,150,0,2.3,2,0,2,0
67,1,2,160,286,0,2,108,1,1.5,1,3,1,1
41,0,1,130,204,0,2,172,0,1.4,0,0,1,0
56,1,2,120,236,0,0,178,0,0.8,0,0,1,0
57,0,3,120,354,0,0,163,1,0.6,0,0,1,0
"""

# Breast-cancer sample: the 'target' label first, followed by 10 feature columns.
BREAST_CANCER_CSV_DATA = """target,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
1,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
0,13.08,15.71,85.63,520,0.1075,0.1270,0.0456,0.0311,0.1967,0.06811
1,19.17,24.8,132.4,1123,0.0974,0.2458,0.2065,0.1118,0.2397,0.07800
1,18.63,25.11,124.8,1088,0.1064,0.1887,0.2319,0.1244,0.2183,0.06197
"""

###############################################################################
#          Data loading
###############################################################################
def load_file_data(file_path: str) -> pd.DataFrame:
    """
    Read the CSV file at `file_path` into a DataFrame.

    Raises:
        FileNotFoundError: when nothing exists at `file_path`.
    """
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
    raise FileNotFoundError(f"CSV file '{file_path}' not found.")

###############################################################################
#          Keras MLP builder function
###############################################################################
def build_keras_mlp(
    input_dim=13,
    hidden_units=32,
    activation='relu',
    lr=0.001,
    meta=None
):
    """
    Build and compile a one-hidden-layer binary classifier.

    Args:
        input_dim: number of input features, used only when `meta` does not
            provide the feature count.
        hidden_units: width of the hidden Dense layer.
        activation: activation of the hidden layer.
        lr: learning rate of the Adam optimizer.
        meta: optional dict of data-derived attributes. SciKeras supplies it
            to model-building functions that declare a `meta` parameter.
            When it contains "n_features_in_", that value takes precedence
            over `input_dim`.

    Returns:
        A compiled Sequential model with a single sigmoid output, trained
        with binary cross-entropy and tracking accuracy.
    """
    # The pipeline's feature selector can change how many columns reach the
    # network (e.g. k=5 vs "all"), and datasets differ in width, so prefer
    # the feature count observed at fit time over the fixed default.
    if meta is not None and meta.get("n_features_in_") is not None:
        input_dim = meta["n_features_in_"]

    model = Sequential()
    model.add(Dense(hidden_units, activation=activation, input_dim=input_dim))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

###############################################################################
#          Pipeline creation
###############################################################################
def get_scaler(method: str):
    """
    Return a fresh scaler instance for the given method name.

    "none" yields None (no scaling step). Any unrecognized name falls back
    to a StandardScaler.
    """
    if method == "none":
        return None

    factories = {
        "StandardScaler": StandardScaler,
        "MinMaxScaler": MinMaxScaler,
        "RobustScaler": RobustScaler,
        "MaxAbsScaler": MaxAbsScaler,
        "Normalizer": Normalizer,
        "QuantileTransformer": lambda: QuantileTransformer(output_distribution="normal"),
        "PowerTransformer": PowerTransformer,
    }
    make_scaler = factories.get(method, StandardScaler)
    return make_scaler()

def create_pipeline(clf, scaling_method: str) -> Pipeline:
    """
    Wrap `clf` in a Pipeline: feature selector -> optional scaler -> classifier.

    The selector is a SelectKBest on f_classif that initially keeps all
    features; the "scaler" step is omitted when get_scaler returns None.
    """
    selector_step = ("selector", SelectKBest(score_func=f_classif, k="all"))
    scaler = get_scaler(scaling_method)
    scaler_steps = [] if scaler is None else [("scaler", scaler)]
    return Pipeline([selector_step, *scaler_steps, ("clf", clf)])

###############################################################################
#          Define all models (fixed xg => xgb)
###############################################################################
def define_all_models(scaling_method: str) -> dict:
    """
    Build one untrained pipeline per supported model.

    Returns a dict mapping the display name of each model to a pipeline
    created by create_pipeline with the requested scaling method.
    """
    classifiers = {
        "Naive Bayes": GaussianNB(),
        "KNN": KNeighborsClassifier(),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
        "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        # probability=True so the SVM exposes predict_proba for AUC/ROC.
        "SVM": SVC(probability=True, random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Extra Trees": ExtraTreesClassifier(random_state=42),
        "AdaBoost": AdaBoostClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": xgb.XGBClassifier(eval_metric="logloss", random_state=42),
        "LightGBM": lgb.LGBMClassifier(
            random_state=42, verbose=-1, force_col_wise=True
        ),
        "Neural Net (Keras MLP)": KerasClassifier(
            model=build_keras_mlp,
            epochs=10, batch_size=32, verbose=0, random_state=42
        ),
    }
    return {
        name: create_pipeline(estimator, scaling_method)
        for name, estimator in classifiers.items()
    }

###############################################################################
#          Evaluate a model
###############################################################################
def evaluate_model(model, X_test, y_test) -> dict:
    """
    Score a fitted model on the test split.

    Returns a dict with "Accuracy", "Precision", "Recall", "F1 Score" and
    "AUC". The label metrics use model.predict; AUC uses the second column
    of model.predict_proba.
    """
    y_pred = model.predict(X_test)
    # An (N, 1) prediction array (possible with the Keras wrapper) is
    # thresholded at 0.5 and flattened into 0/1 labels.
    is_column_vector = y_pred.ndim == 2 and y_pred.shape[1] == 1
    if is_column_vector:
        y_pred = (y_pred > 0.5).astype(int).ravel()
    y_prob = model.predict_proba(X_test)[:, 1]

    scores = {"Accuracy": accuracy_score(y_test, y_pred)}
    guarded_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in guarded_scorers:
        scores[label] = scorer(y_test, y_pred, zero_division=0)
    scores["AUC"] = roc_auc_score(y_test, y_prob)
    return scores

###############################################################################
#          Param grids
###############################################################################
def define_param_grids() -> dict:
    """
    Return the hyperparameter search space of every model.

    The result maps each model name to a dict of pipeline parameter names
    ("selector__..." / "clf__...") to lists of candidate values. Every space
    starts with the selector's k, which is either 5 or "all".
    """
    def selector():
        # Fresh dict (and list) on every call so the spaces share no state.
        return {"selector__k": [5, "all"]}

    def forest():
        # Random Forest and Extra Trees search the same space.
        return {
            **selector(),
            "clf__n_estimators": [50, 100, 200],
            "clf__max_depth": [None, 5, 10],
        }

    grids = {}
    grids["Naive Bayes"] = selector()
    grids["KNN"] = {
        **selector(),
        "clf__n_neighbors": [3, 5, 7],
        "clf__weights": ["uniform", "distance"],
    }
    grids["Logistic Regression"] = {
        **selector(),
        "clf__C": [0.01, 0.1, 1, 10],
        "clf__penalty": ["l2"],
    }
    grids["Linear Discriminant Analysis"] = selector()
    grids["Quadratic Discriminant Analysis"] = {
        **selector(),
        "clf__reg_param": [0.0, 0.1, 0.2, 0.5, 1.0],
        "clf__tol": [1e-4, 1e-3, 1e-2],
    }
    grids["Decision Tree"] = {
        **selector(),
        "clf__max_depth": [None, 3, 5, 10],
        "clf__min_samples_split": [2, 5, 10],
    }
    grids["SVM"] = {
        **selector(),
        "clf__C": [0.1, 1, 10],
        "clf__kernel": ["linear", "rbf"],
        "clf__gamma": ["scale"],
    }
    grids["Random Forest"] = forest()
    grids["Extra Trees"] = forest()
    grids["AdaBoost"] = {
        **selector(),
        "clf__n_estimators": [50, 100, 200],
        "clf__learning_rate": [0.01, 0.1, 1],
    }
    grids["Gradient Boosting"] = {
        **selector(),
        "clf__n_estimators": [50, 100, 200],
        "clf__learning_rate": [0.01, 0.1, 1],
        "clf__max_depth": [3, 5, 7],
    }
    grids["XGBoost"] = {
        **selector(),
        "clf__n_estimators": [50, 100, 200],
        "clf__learning_rate": [0.01, 0.1, 0.2],
        "clf__max_depth": [3, 5, 7],
    }
    grids["LightGBM"] = {
        **selector(),
        "clf__n_estimators": [50, 100, 200],
        "clf__learning_rate": [0.01, 0.1, 0.2],
        "clf__num_leaves": [20, 31, 50],
    }
    grids["Neural Net (Keras MLP)"] = {
        **selector(),
        "clf__epochs": [5, 10],
        "clf__batch_size": [16, 32],
    }
    return grids

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
def tune_models(models, param_grids, X_train, y_train, X_test, y_test, search_method: str):
    """
    Tune each pipeline in `models` and score the outcome on the test split.

    For every (name, pipeline) pair the search space is looked up in
    `param_grids`. When `search_method` is "none" or no space is registered,
    the pipeline is simply fitted as-is. Otherwise a 3-fold search scored by
    ROC AUC is run: GridSearchCV for "grid", RandomizedSearchCV for "random",
    and BayesSearchCV (imported on demand) for any other value.

    Returns:
        (tuned_models, tuned_results, best_params) -- three dicts keyed by
        model name; best_params holds None for models that were not tuned.
    """
    tuned_models, tuned_results, best_params = {}, {}, {}
    # Settings shared by all three search strategies.
    shared_cv = {"cv": 3, "scoring": "roc_auc", "n_jobs": -1}

    for mname, pipeline in tqdm(models.items(), desc="Hyperparameter Tuning"):
        print(f"Tuning {mname}...")
        space = param_grids.get(mname, {})
        run_search = search_method != "none" and bool(space)

        if run_search:
            if search_method == "grid":
                searcher = GridSearchCV(pipeline, space, **shared_cv)
            elif search_method == "random":
                searcher = RandomizedSearchCV(
                    pipeline, space, n_iter=10, verbose=0, random_state=42, **shared_cv
                )
            else:  # "bayesian"
                # Imported lazily, only when this strategy is requested.
                from skopt import BayesSearchCV
                searcher = BayesSearchCV(
                    pipeline, space, n_iter=10, random_state=42, **shared_cv
                )

            searcher.fit(X_train, y_train)
            winner = searcher.best_estimator_
            tuned_models[mname] = winner
            tuned_results[mname] = evaluate_model(winner, X_test, y_test)
            best_params[mname] = searcher.best_params_
            print(f"  [{search_method.capitalize()}] Best CV AUC: {searcher.best_score_:.4f} | Best Params: {searcher.best_params_}\n")
        else:
            # Nothing to search: fit the given pipeline directly.
            pipeline.fit(X_train, y_train)
            tuned_models[mname] = pipeline
            tuned_results[mname] = evaluate_model(pipeline, X_test, y_test)
            best_params[mname] = None
            print(f"  No tuning for {mname}. Using default.\n")

    return tuned_models, tuned_results, best_params

###############################################################################
#         Build Comparison Table + CSV
###############################################################################
def build_comparison_table(default_results, tuned_results, search_method, scaling_method):
    """
    Assemble the default-vs-tuned metric table for all models.

    Each model in `default_results` becomes one row with a "(Default)" and a
    "(Tuned)" column per metric, followed by the search method, the scaling
    method and a timestamp of when the table was built.

    Returns:
        (df_comp, styled) -- the plain DataFrame and a Styler whose "(Tuned)"
        cells are green when higher than the default, red when lower and
        black otherwise.
    """
    metric_names = ("Accuracy", "Precision", "Recall", "F1 Score", "AUC")

    table = {}
    for mname in default_results:
        record = {}
        for metric in metric_names:
            record[f"{metric} (Default)"] = default_results[mname][metric]
            record[f"{metric} (Tuned)"] = tuned_results[mname][metric]
        table[mname] = record

    df_comp = pd.DataFrame(table).T
    df_comp["Selected search method"] = search_method
    df_comp["Selected scaling method"] = scaling_method
    df_comp["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def color_cmp(row):
        # One style per column, in column order: default cells stay unstyled,
        # tuned cells are colored by how they compare to the default.
        cell_styles = []
        for metric in metric_names:
            default_val = row[f"{metric} (Default)"]
            tuned_val = row[f"{metric} (Tuned)"]
            if tuned_val > default_val:
                tuned_style = "color: green"
            elif tuned_val < default_val:
                tuned_style = "color: red"
            else:
                tuned_style = "color: black"
            cell_styles += ["", tuned_style]
        # The three trailing bookkeeping columns are never styled.
        return cell_styles + [""] * 3

    return df_comp, df_comp.style.apply(color_cmp, axis=1)

def append_results_to_csv(df_comp, fname):
    """Write ``df_comp`` (index included) to the CSV file ``fname``.

    A file that does not exist yet is created with a header row; an existing
    file gets the rows appended without repeating the header.
    """
    already_there = os.path.exists(fname)
    df_comp.to_csv(
        fname,
        mode='a' if already_there else 'w',
        header=not already_there,
        index=True,
    )

###############################################################################
#           Combined ROC + Confusion
###############################################################################
def plot_roc_and_confusion(default_models, tuned_models, X_test, y_test):
    """Draw one figure with a row of three panels per model.

    Each row shows the ROC curves of the default and tuned model on a shared
    axis, then the confusion matrix of the default model, then that of the
    tuned model. Rows follow the sorted model names of ``default_models``;
    ``tuned_models`` is indexed with the same names.

    Prints a notice and returns without plotting when ``default_models`` is
    empty.
    """
    if not default_models:
        print("No models => skipping combined plot.")
        return

    from tqdm.notebook import tqdm
    ordered_names = sorted(default_models.keys())
    row_count = len(ordered_names)
    # squeeze=False keeps the axes grid two-dimensional even for a single row.
    fig, axes = plt.subplots(
        nrows=row_count, ncols=3, figsize=(12, 4 * row_count), squeeze=False
    )

    def roc_points(model):
        # Column 1 of predict_proba is used as the positive-class score.
        positive_prob = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, positive_prob)
        return fpr, tpr, roc_auc_score(y_test, positive_prob)

    print("Plotting combined ROC + confusion for each model in 1 figure...")
    with tqdm(total=row_count, desc="Models") as pbar:
        for row_idx, mname in enumerate(ordered_names):
            ax_roc, ax_def, ax_tun = axes[row_idx][0], axes[row_idx][1], axes[row_idx][2]
            baseline = default_models[mname]
            tuned = tuned_models[mname]

            fpr_d, tpr_d, auc_d = roc_points(baseline)
            fpr_t, tpr_t, auc_t = roc_points(tuned)

            ax_roc.plot(fpr_d, tpr_d, color="gray", lw=2, label=f"Default AUC={auc_d:.4f}")
            ax_roc.plot(fpr_t, tpr_t, color="red", lw=2, linestyle=":",
                        label=f"Tuned AUC={auc_t:.4f}")
            ax_roc.plot([0, 1], [0, 1], "k--", lw=1)  # chance diagonal
            ax_roc.set_title(f"{mname}\nROC Curve")
            ax_roc.set_xlabel("False Positive Rate")
            ax_roc.set_ylabel("True Positive Rate")
            ax_roc.legend(loc="lower right")
            ax_roc.grid(True)
            ax_roc.set_aspect("equal", "box")

            panels = (
                (ax_def, baseline, "Conf. Matrix\n(Default)"),
                (ax_tun, tuned, "Conf. Matrix\n(Tuned)"),
            )
            for panel_ax, model, caption in panels:
                _plot_small_cm(panel_ax, confusion_matrix(y_test, model.predict(X_test)))
                panel_ax.set_title(caption)

            pbar.update(1)
    plt.tight_layout()
    plt.show()

def _plot_small_cm(ax, cm):
    """Render a 2x2 confusion matrix on ``ax`` as four coloured squares.

    Diagonal cells are filled green and off-diagonal cells red, each with the
    count ``cm[row, col]`` printed at its centre. Both axes are labelled
    "Neg"/"Pos", the y axis is inverted so row 0 is drawn at the top, and the
    frame and spines are hidden.
    """
    side = 0.2
    match_fill, mismatch_fill = "#a5e8a3", "#ffaaaa"

    for row in (0, 1):
        for col in (0, 1):
            left, bottom = col * side, row * side
            fill = match_fill if row == col else mismatch_fill
            ax.add_patch(plt.Rectangle(
                (left, bottom), side, side,
                facecolor=fill, edgecolor='none'
            ))
            ax.text(left + side / 2, bottom + side / 2,
                    str(cm[row, col]), ha='center', va='center',
                    fontsize=8, color='black')

    extent = 2 * side
    tick_positions = [side / 2, side * 1.5]  # centres of the two cells
    tick_labels = ["Neg", "Pos"]
    ax.set_xlim(0, extent)
    ax.set_ylim(0, extent)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, fontsize=8)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(tick_labels, fontsize=8)
    ax.invert_yaxis()
    ax.set_aspect("equal")
    ax.set_frame_on(False)
    for spine in ax.spines.values():
        spine.set_visible(False)

###############################################################################
#         Manual Full-Profiling
###############################################################################
def custom_full_profiling(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise every column of ``df`` in a one-row-per-column table.

    Args:
        df: Data to profile. A numeric column named ``target`` is optional;
            when present, each numeric column is correlated with it.

    Returns:
        DataFrame with the columns ``Column``, ``Dtype``, ``#Non-Null``,
        ``#Missing``, ``#Unique``, ``Min``, ``Max``, ``Mean``, ``Median``,
        ``Std``, ``Corr(target)``, ``Top`` and ``Freq``. The numeric statistics
        are filled for numeric columns only; ``Top``/``Freq`` (most frequent
        value and its count, missing values included) are filled for
        non-numeric columns only.
    """
    # Correlate numeric columns only: running corrwith over string columns
    # raises on pandas versions that do not silently drop them.
    numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    if 'target' in numeric_cols:
        correlations = df[numeric_cols].corrwith(df['target'])
    else:
        correlations = pd.Series(dtype='float64')

    rows = []
    for col in df.columns:
        series = df[col]
        info = {
            'Column': col,
            'Dtype': str(series.dtype),
            '#Non-Null': series.notnull().sum(),
            '#Missing': series.isnull().sum(),
            '#Unique': series.nunique(dropna=False),
            'Min': None,
            'Max': None,
            'Mean': None,
            'Median': None,
            'Std': None,
            'Corr(target)': None,
            'Top': None,
            'Freq': None,
        }
        if pd.api.types.is_numeric_dtype(series.dtype):
            info['Min'] = series.min(skipna=True)
            info['Max'] = series.max(skipna=True)
            info['Mean'] = series.mean(skipna=True)
            info['Median'] = series.median(skipna=True)
            info['Std'] = series.std(skipna=True)
            if col in correlations.index:
                info['Corr(target)'] = correlations[col]
        else:
            counts = series.value_counts(dropna=False)
            if len(counts) > 0:
                # value_counts sorts by frequency, so entry 0 is the mode.
                info['Top'] = counts.index[0]
                info['Freq'] = counts.iloc[0]
        rows.append(info)
    return pd.DataFrame(rows)

def dynamic_explore_data(df: pd.DataFrame, max_cols: int):
    """Display the profiling table for ``df`` and, when possible, a pairplot.

    The pairplot covers the first ``max_cols`` numeric columns (``target``
    excluded) coloured by ``target``. It is skipped, with a printed reason,
    when ``df`` has no ``target`` column or no numeric feature column.
    """
    summary = custom_full_profiling(df)
    print("=== Full Profiling Info ===")
    display(summary)

    missing_cells = df.isnull().sum().sum()
    print(
        "No missing values (above)." if missing_cells == 0
        else f"Missing values: {missing_cells} (above)."
    )

    if "target" not in df.columns:
        print("\nNo 'target' => skipping pairplot.")
        return

    candidates = df.select_dtypes(include=[np.number]).columns.tolist()
    if "target" in candidates:
        candidates.remove("target")
    chosen = candidates[:max_cols]
    if len(chosen) == 0:
        print("\nNo numeric => skipping pairplot.")
        return

    plot_columns = [*chosen, "target"]
    if any(name not in df.columns for name in plot_columns):
        print("\nSome pairplot columns missing => skipping.")
        return

    print("\nPlotting Pairplot (may take time):")
    with tqdm(total=1, desc="Pairplot") as progress:
        grid = sns.pairplot(
            df[plot_columns],
            hue="target",
            palette="Greys",
            markers=["o", "D"],
            plot_kws={"s": 25, "alpha": 0.75},
            height=3,
        )
        grid.fig.suptitle(f"Pairplot of up to {max_cols} Numeric + Target", y=1.02)
        plt.show()
        progress.update(1)

###############################################################################
#               UI + Execution
###############################################################################
# Bold HTML captions, one per control; each is displayed together with its
# control further down.
data_source_title   = widgets.HTML(value="<b>Select data source:</b>")
file_path_title     = widgets.HTML(value="<b>CSV File Path (if 'file' selected):</b>")
search_title        = widgets.HTML(value="<b>Selected search method:</b>")
scaling_title       = widgets.HTML(value="<b>Selected scaling method:</b>")
models_title        = widgets.HTML(value="<b>Selected models:</b>")
max_cols_title      = widgets.HTML(value="<b>Max numeric columns for pairplot:</b>")

# Data source: one of the two embedded CSV strings or a CSV file on disk.
data_source_widget  = widgets.RadioButtons(
    options=["heart_disease (embedded)", "breast_cancer (embedded)", "file"],
    value="heart_disease (embedded)"
)
# Path that is read only when the "file" data source is selected.
file_path_widget    = widgets.Text(value="heart_disease.csv")
# Hyperparameter search strategy; 'none' is one of the offered choices.
search_method_widget= widgets.RadioButtons(
    options=['grid','random','bayesian','none'], value='random'
)
# Name of the scaler to use; 'none' is one of the offered choices.
scaling_method_widget= widgets.RadioButtons(
    options=[
        'StandardScaler','MinMaxScaler','RobustScaler','MaxAbsScaler',
        'Normalizer','QuantileTransformer','PowerTransformer','none'
    ],
    value='StandardScaler'
)
# One checkbox per model name, all ticked by default. The click handler uses
# each checkbox description as the key into the dict of defined models.
model_options= [
    'Naive Bayes','KNN','Logistic Regression','Linear Discriminant Analysis',
    'Quadratic Discriminant Analysis','Decision Tree','SVM','Random Forest',
    'Extra Trees','AdaBoost','Gradient Boosting','XGBoost','LightGBM',
    'Neural Net (Keras MLP)'
]
model_checkboxes= [widgets.Checkbox(value=True, description=m) for m in model_options]
model_selection_box= widgets.VBox(model_checkboxes)

# Upper bound on the number of numeric columns shown in the pairplot.
max_cols_widget= widgets.IntSlider(value=5, min=1, max=50, step=1)
run_button= widgets.Button(description="Run Analysis")
# Output area that the click handler clears and writes into.
output= widgets.Output()

# Render the form: each caption with its control, then the button and output.
display(data_source_title, data_source_widget)
display(file_path_title,   file_path_widget)
display(search_title,      search_method_widget)
display(scaling_title,     scaling_method_widget)
display(models_title,      model_selection_box)
display(max_cols_title,    max_cols_widget)
display(run_button,        output)

def _set_controls_disabled(disabled):
    """Set the ``disabled`` flag on the run button and every input widget."""
    controls = [
        run_button, data_source_widget, file_path_widget,
        search_method_widget, scaling_method_widget, max_cols_widget,
        *model_checkboxes,
    ]
    for control in controls:
        control.disabled = disabled


def _result_file_for(file_path):
    """Return the results CSV name derived from an input file path.

    The extension (whatever its case, or none at all) is replaced by
    ``_result.csv`` so the results file can never be the input file itself.
    """
    root, _ext = os.path.splitext(file_path.rstrip())
    return f"{root}_result.csv"


def _report_best_auc(label, results):
    """Print the first model with the highest "AUC" in ``results``, if any."""
    best_auc = -np.inf
    best_name = None
    for name, metrics in results.items():
        if metrics["AUC"] > best_auc:
            best_auc = metrics["AUC"]
            best_name = name
    if best_name:
        print(f"Highest AUC ({label}): {best_name} => {best_auc:.4f}")


def _run_analysis():
    """Read the widget values and run load -> explore -> train -> tune -> report."""
    data_source = data_source_widget.value
    file_path = file_path_widget.value
    max_cols = max_cols_widget.value
    search_method = search_method_widget.value
    scaling_method = scaling_method_widget.value
    selected_ms = [cb.description for cb in model_checkboxes if cb.value]

    print(f"Data Source: {data_source}")
    if data_source == "file":
        print(f"CSV File Path: {file_path}")
    print(f"Max numeric columns: {max_cols}")
    print(f"Search method: {search_method}")
    print(f"Scaling method: {scaling_method}")
    print(f"Selected models: {selected_ms}\n")

    # decide CSV name & load data
    if data_source == "heart_disease (embedded)":
        result_file = "heart_disease_result.csv"
        df = pd.read_csv(io.StringIO(HEART_DISEASE_CSV_DATA))
    elif data_source == "breast_cancer (embedded)":
        result_file = "breast_cancer_result.csv"
        df = pd.read_csv(io.StringIO(BREAST_CANCER_CSV_DATA))
    else:
        result_file = _result_file_for(file_path)
        df = load_file_data(file_path)

    dynamic_explore_data(df, max_cols)
    if "target" not in df.columns:
        print("No 'target' => stopping pipeline.")
        return
    if not selected_ms:
        # Without this guard an empty table would be appended to the results
        # CSV, leaving a header row that later result rows do not match.
        print("No models selected => stopping pipeline.")
        return

    X = df.drop("target", axis=1)
    y = df["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # build + train default
    all_models = define_all_models(scaling_method)
    default_models = {m: all_models[m] for m in selected_ms}
    print("Training default models:")
    default_results = {}
    for mname, model in tqdm(default_models.items(), desc="Default Training"):
        model.fit(X_train, y_train)
        default_results[mname] = evaluate_model(model, X_test, y_test)

    # tune
    param_grids = define_param_grids()
    tuned_models, tuned_results, best_params = tune_models(
        default_models, param_grids, X_train, y_train, X_test, y_test, search_method
    )

    # compare
    df_comp, styled_comp = build_comparison_table(
        default_results, tuned_results, search_method, scaling_method
    )
    print("\nComparison of Default vs. Tuned:")
    display(styled_comp)

    append_results_to_csv(df_comp, result_file)
    print(f"\nAppended results to '{result_file}'\n")

    _report_best_auc("Tuned", tuned_results)
    _report_best_auc("Default", default_results)

    print("\nPlotting combined ROC + Confusions:")
    plot_roc_and_confusion(default_models, tuned_models, X_test, y_test)


def on_button_clicked(_):
    """Click handler for ``run_button``: run the analysis inside ``output``.

    Every control is disabled for the duration of the run. The ``finally``
    clause re-enables them on every exit path, including the early returns
    taken when the data has no 'target' column or no model is selected.

    Args:
        _: The clicked button (unused).
    """
    _set_controls_disabled(True)
    try:
        with output:
            clear_output()
            _run_analysis()
    finally:
        # re-enable UI
        _set_controls_disabled(False)


run_button.on_click(on_button_clicked)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h

HTML(value='<b>Select data source:</b>')

RadioButtons(options=('heart_disease (embedded)', 'breast_cancer (embedded)', 'file'), value='heart_disease (e…

HTML(value="<b>CSV File Path (if 'file' selected):</b>")

Text(value='heart_disease.csv')

HTML(value='<b>Selected search method:</b>')

RadioButtons(index=1, options=('grid', 'random', 'bayesian', 'none'), value='random')

HTML(value='<b>Selected scaling method:</b>')

RadioButtons(options=('StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler', 'Normalizer', 'Quantil…

HTML(value='<b>Selected models:</b>')

VBox(children=(Checkbox(value=True, description='Naive Bayes'), Checkbox(value=True, description='KNN'), Check…

HTML(value='<b>Max numeric columns for pairplot:</b>')

IntSlider(value=5, max=50, min=1)

Button(description='Run Analysis', style=ButtonStyle())

Output()

In [1]:
!pip install --quiet ipywidgets xgboost scikit-optimize tqdm scikeras tensorflow

import warnings
warnings.filterwarnings("ignore")

import io
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm

from sklearn.model_selection import (
    train_test_split, GridSearchCV, RandomizedSearchCV
)
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler,
    Normalizer, QuantileTransformer, PowerTransformer
)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve, confusion_matrix
)

# Additional for LDA, QDA, etc.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    AdaBoostClassifier, GradientBoostingClassifier
)
import xgboost as xgb
import lightgbm as lgb
from skopt import BayesSearchCV

# >>> Import from SciKeras
from scikeras.wrappers import KerasClassifier

# For building the Keras model
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

###############################################################################
#         EMBEDDED CSV (Truncated) - replace with your full data or pick "file"
###############################################################################
# Heart-disease sample as CSV text: a header row followed by data rows, with
# the label in the last column ("target").
# NOTE(review): the "... (Replace with full untruncated lines) ..." line sits
# inside the string literal, so it is part of the CSV text itself; presumably
# it has to be replaced with real rows before this constant is parsed - confirm.
HEART_DISEASE_CSV_DATA = """age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
63,1,0,145,233,1,2,150,0,2.3,2,0,2,0
67,1,3,160,286,0,2,108,1,1.5,1,3,1,1
... (Replace with full untruncated lines) ...
"""

# Breast-cancer sample as CSV text; here the label ("target") is the first
# column and ten feature columns follow.
# NOTE(review): the same in-string placeholder line applies to this constant.
BREAST_CANCER_CSV_DATA = """target,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
1,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
... (Replace with full untruncated lines) ...
"""

###############################################################################
#                   Load from file or embedded data
###############################################################################
def load_file_data(file_path: str) -> pd.DataFrame:
    """Read the CSV at ``file_path`` into a DataFrame.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
    raise FileNotFoundError(f"CSV file '{file_path}' not found.")

###############################################################################
#   Build Keras MLP (adjust input_dim to match your # of features)
###############################################################################
def build_keras_mlp(
    input_dim=13,  # Fallback width, used only when no fitted feature count is supplied
    hidden_units=32,
    activation='relu',
    lr=0.001,
    meta=None
):
    """Build and compile a one-hidden-layer binary classifier.

    Args:
        input_dim: Number of input features to assume when ``meta`` does not
            provide one.
        hidden_units: Width of the single hidden Dense layer.
        activation: Activation of the hidden layer.
        lr: Learning rate of the Adam optimizer.
        meta: Optional metadata dict. SciKeras passes it to model-building
            functions that accept a ``meta`` argument; when it carries
            ``n_features_in_``, that value sizes the input layer. This keeps
            the network consistent with the data actually reaching it, e.g.
            a dataset without 13 features or a feature selector that keeps
            only some of the columns.

    Returns:
        A compiled Sequential model with a single sigmoid output, trained with
        binary cross-entropy and reporting accuracy.
    """
    n_inputs = input_dim
    if meta and meta.get("n_features_in_"):
        n_inputs = int(meta["n_features_in_"])

    model = Sequential()
    model.add(Dense(hidden_units, activation=activation, input_dim=n_inputs))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

###############################################################################
#                    Scaler + Pipeline
###############################################################################
def get_scaler(method: str):
    """Return a new scaler instance for ``method``.

    ``"none"`` yields ``None`` (no scaling step). Any name that is not listed
    falls back to a ``StandardScaler``.
    """
    if method == "none":
        return None

    factories = {
        "StandardScaler": StandardScaler,
        "MinMaxScaler": MinMaxScaler,
        "RobustScaler": RobustScaler,
        "MaxAbsScaler": MaxAbsScaler,
        "Normalizer": Normalizer,
        "QuantileTransformer": lambda: QuantileTransformer(output_distribution="normal"),
        "PowerTransformer": PowerTransformer,
    }
    make_scaler = factories.get(method, StandardScaler)
    return make_scaler()

def create_pipeline(clf, scaling_method: str) -> Pipeline:
    """Wrap ``clf`` in a Pipeline of selector -> optional scaler -> classifier.

    The "selector" step is a ``SelectKBest`` with ``f_classif`` that keeps all
    features by default. The "scaler" step is present only when
    ``get_scaler(scaling_method)`` returns a scaler. The classifier is the
    final "clf" step.
    """
    selector_step = ("selector", SelectKBest(score_func=f_classif, k="all"))
    scaler = get_scaler(scaling_method)
    scaler_steps = [] if scaler is None else [("scaler", scaler)]
    return Pipeline([selector_step, *scaler_steps, ("clf", clf)])

###############################################################################
#              All models incl. LDA, QDA, and "Neural Net (Keras MLP)"
###############################################################################
def define_all_models(scaling_method: str) -> dict:
    """Return every supported model, keyed by display name, as a pipeline.

    Each classifier is wrapped by ``create_pipeline`` using ``scaling_method``.
    The dict order is the order in which the classifiers are listed below.
    """
    classifiers = {
        "Naive Bayes": GaussianNB(),
        "KNN": KNeighborsClassifier(),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
        "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        # probability=True so the SVM exposes predict_proba
        "SVM": SVC(probability=True, random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Extra Trees": ExtraTreesClassifier(random_state=42),
        "AdaBoost": AdaBoostClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": xgb.XGBClassifier(eval_metric="logloss", random_state=42),
        "LightGBM": lgb.LGBMClassifier(random_state=42, verbose=-1, force_col_wise=True),
        # Keras MLP through the SciKeras wrapper; no input_dim is passed here.
        "Neural Net (Keras MLP)": KerasClassifier(
            model=build_keras_mlp,
            epochs=10,
            batch_size=32,
            verbose=0,
            random_state=42
        ),
    }
    return {
        display_name: create_pipeline(classifier, scaling_method)
        for display_name, classifier in classifiers.items()
    }

###############################################################################
#                  Evaluate model
###############################################################################
def evaluate_model(model, X_test, y_test) -> dict:
    """Score a fitted ``model`` on the test split.

    Returns:
        Dict with "Accuracy", "Precision", "Recall", "F1 Score" (computed from
        ``model.predict``) and "AUC" (computed from column 1 of
        ``model.predict_proba``).
    """
    predicted = model.predict(X_test)
    # A single-column 2-D prediction is thresholded at 0.5 and flattened.
    single_column = predicted.ndim == 2 and predicted.shape[1] == 1
    if single_column:
        predicted = (predicted > 0.5).astype(int).ravel()
    positive_prob = model.predict_proba(X_test)[:, 1]

    scores = {"Accuracy": accuracy_score(y_test, predicted)}
    label_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, metric in label_metrics:
        scores[label] = metric(y_test, predicted, zero_division=0)
    scores["AUC"] = roc_auc_score(y_test, positive_prob)
    return scores

###############################################################################
#                Param grids: includes MLP
###############################################################################
def define_param_grids() -> dict:
    """Return the hyperparameter search space of every model, keyed by name.

    Every space starts with ``selector__k`` (5 or "all"), followed by the
    model-specific entries addressed through the ``clf__`` prefix.
    """
    def grid(**clf_options):
        # A new list per call, so no two spaces share a candidate list.
        space = {"selector__k": [5, "all"]}
        for option, candidates in clf_options.items():
            space[f"clf__{option}"] = candidates
        return space

    return {
        "Naive Bayes": grid(),
        "KNN": grid(n_neighbors=[3, 5, 7], weights=["uniform", "distance"]),
        "Logistic Regression": grid(C=[0.01, 0.1, 1, 10], penalty=["l2"]),
        "Linear Discriminant Analysis": grid(),
        "Quadratic Discriminant Analysis": grid(
            reg_param=[0.0, 0.1, 0.2, 0.5, 1.0],
            tol=[1e-4, 1e-3, 1e-2],
        ),
        "Decision Tree": grid(
            max_depth=[None, 3, 5, 10],
            min_samples_split=[2, 5, 10],
        ),
        "SVM": grid(C=[0.1, 1, 10], kernel=["linear", "rbf"], gamma=["scale"]),
        "Random Forest": grid(n_estimators=[50, 100, 200], max_depth=[None, 5, 10]),
        "Extra Trees": grid(n_estimators=[50, 100, 200], max_depth=[None, 5, 10]),
        "AdaBoost": grid(n_estimators=[50, 100, 200], learning_rate=[0.01, 0.1, 1]),
        "Gradient Boosting": grid(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 1],
            max_depth=[3, 5, 7],
        ),
        "XGBoost": grid(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 0.2],
            max_depth=[3, 5, 7],
        ),
        "LightGBM": grid(
            n_estimators=[50, 100, 200],
            learning_rate=[0.01, 0.1, 0.2],
            num_leaves=[20, 31, 50],
        ),
        # Minimal space for the MLP: only epochs and batch size are varied.
        "Neural Net (Keras MLP)": grid(epochs=[5, 10], batch_size=[16, 32]),
    }

def tune_models(models, param_grids, X_train, y_train, X_test, y_test, search_method: str):
    """Tune each pipeline in ``models`` and score the result on the test split.

    Args:
        models: Mapping of model name -> pipeline.
        param_grids: Mapping of model name -> search space; a missing or empty
            entry means the model is not tuned.
        X_train, y_train: Data used for fitting / cross-validated search.
        X_test, y_test: Data passed to ``evaluate_model``.
        search_method: "grid", "random" or "none"; any other value selects the
            Bayesian search.

    Returns:
        ``(tuned_models, tuned_results, best_params)``, three dicts keyed by
        model name. For untuned models the pipeline itself is refitted and its
        ``best_params`` entry is ``None``.
    """
    def make_search(estimator, space):
        # All searches use 3-fold CV scored by ROC AUC on every available core.
        common = {"cv": 3, "scoring": "roc_auc", "n_jobs": -1}
        if search_method == "grid":
            return GridSearchCV(estimator, space, **common)
        if search_method == "random":
            return RandomizedSearchCV(
                estimator, space, n_iter=10, verbose=0, random_state=42, **common
            )
        return BayesSearchCV(estimator, space, n_iter=10, random_state=42, **common)

    tuned_models, tuned_results, best_params = {}, {}, {}

    for mname, pipeline in tqdm(models.items(), desc="Hyperparameter Tuning"):
        print(f"Tuning {mname}...")
        grid_params = param_grids.get(mname, {})

        if search_method != "none" and grid_params:
            search_cv = make_search(pipeline, grid_params)
            search_cv.fit(X_train, y_train)
            winner = search_cv.best_estimator_
            tuned_models[mname] = winner
            tuned_results[mname] = evaluate_model(winner, X_test, y_test)
            best_params[mname] = search_cv.best_params_
            print(f"  [{search_method.capitalize()}] Best CV AUC: {search_cv.best_score_:.4f} | Best Params: {search_cv.best_params_}\n")
        else:
            pipeline.fit(X_train, y_train)
            tuned_models[mname] = pipeline
            tuned_results[mname] = evaluate_model(pipeline, X_test, y_test)
            best_params[mname] = None
            print(f"  No tuning for {mname}. Using default.\n")

    return tuned_models, tuned_results, best_params

###############################################################################
#                Build Comparison Table & CSV
###############################################################################
def build_comparison_table(default_results, tuned_results, search_method, scaling_method):
    comp_dict= {}
    for mname in default_results:
        comp_dict[mname]= {
            "Accuracy (Default)":   default_results[mname]["Accuracy"],
            "Accuracy (Tuned)":     tuned_results[mname]["Accuracy"],
            "Precision (Default)":  default_results[mname]["Precision"],
            "Precision (Tuned)":    tuned_results[mname]["Precision"],
            "Recall (Default)":     default_results[mname]["Recall"],
            "Recall (Tuned)":       tuned_results[mname]["Recall"],
            "F1 Score (Default)":   default_results[mname]["F1 Score"],
            "F1 Score (Tuned)":     tuned_results[mname]["F1 Score"],
            "AUC (Default)":        default_results[mname]["AUC"],
            "AUC (Tuned)":          tuned_results[mname]["AUC"]
        }
    df_comp= pd.DataFrame(comp_dict).T
    df_comp["Selected search method"]= search_method
    df_comp["Selected scaling method"]= scaling_method
    df_comp["timestamp"]= datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def color_cmp(row):
        metrics= ["Accuracy","Precision","Recall","F1 Score","AUC"]
        styles= []
        for m in metrics:
            dval= row[f"{m} (Default)"]
            tval= row[f"{m} (Tuned)"]
            styles.append("")
            if tval>dval:
                styles.append("color: green")
            elif tval<dval:
                styles.append("color: red")
            else:
                styles.append("color: black")
        styles.extend(["","",""])
        return styles

    styled= df_comp.style.apply(color_cmp, axis=1)
    return df_comp, styled

def append_results_to_csv(df_comp, fname):
    """Write ``df_comp`` (index included) to the CSV file ``fname``.

    A file that does not exist yet is created with a header row; an existing
    file gets the rows appended without repeating the header.
    """
    already_there = os.path.exists(fname)
    df_comp.to_csv(
        fname,
        mode='a' if already_there else 'w',
        header=not already_there,
        index=True,
    )

###############################################################################
#       Combined ROC + Confusion side-by-side
###############################################################################
def plot_roc_and_confusion(default_models, tuned_models, X_test, y_test):
    """Draw one figure with a row of three panels per model.

    Each row shows the ROC curves of the default and tuned model on a shared
    axis, then the confusion matrix of the default model, then that of the
    tuned model. Rows follow the sorted model names of ``default_models``;
    ``tuned_models`` is indexed with the same names.

    Prints a notice and returns without plotting when ``default_models`` is
    empty.
    """
    if not default_models:
        print("No models => skipping combined plot.")
        return

    from tqdm.notebook import tqdm
    ordered_names = sorted(default_models.keys())
    row_count = len(ordered_names)
    # squeeze=False keeps the axes grid two-dimensional even for a single row.
    fig, axes = plt.subplots(
        nrows=row_count, ncols=3, figsize=(12, 4 * row_count), squeeze=False
    )

    def roc_points(model):
        # Column 1 of predict_proba is used as the positive-class score.
        positive_prob = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, positive_prob)
        return fpr, tpr, roc_auc_score(y_test, positive_prob)

    print("Plotting combined ROC + confusion for each model in 1 figure...")
    with tqdm(total=row_count, desc="Models") as pbar:
        for row_idx, mname in enumerate(ordered_names):
            ax_roc, ax_def, ax_tun = axes[row_idx][0], axes[row_idx][1], axes[row_idx][2]
            baseline = default_models[mname]
            tuned = tuned_models[mname]

            fpr_d, tpr_d, auc_d = roc_points(baseline)
            fpr_t, tpr_t, auc_t = roc_points(tuned)

            ax_roc.plot(fpr_d, tpr_d, color="gray", lw=2, label=f"Default AUC={auc_d:.4f}")
            ax_roc.plot(fpr_t, tpr_t, color="red", lw=2, linestyle=":",
                        label=f"Tuned AUC={auc_t:.4f}")
            ax_roc.plot([0, 1], [0, 1], "k--", lw=1)  # chance diagonal
            ax_roc.set_title(f"{mname}\nROC Curve")
            ax_roc.set_xlabel("False Positive Rate")
            ax_roc.set_ylabel("True Positive Rate")
            ax_roc.legend(loc="lower right")
            ax_roc.grid(True)
            ax_roc.set_aspect("equal", "box")

            panels = (
                (ax_def, baseline, "Conf. Matrix\n(Default)"),
                (ax_tun, tuned, "Conf. Matrix\n(Tuned)"),
            )
            for panel_ax, model, caption in panels:
                _plot_small_cm(panel_ax, confusion_matrix(y_test, model.predict(X_test)))
                panel_ax.set_title(caption)

            pbar.update(1)
    plt.tight_layout()
    plt.show()

def _plot_small_cm(ax, cm):
    """Render a 2x2 confusion matrix on ``ax`` as four coloured squares.

    Diagonal cells are filled green and off-diagonal cells red, each with the
    count ``cm[row, col]`` printed at its centre. Both axes are labelled
    "Neg"/"Pos", the y axis is inverted so row 0 is drawn at the top, and the
    frame and spines are hidden.
    """
    side = 0.2
    match_fill, mismatch_fill = "#a5e8a3", "#ffaaaa"

    for row in (0, 1):
        for col in (0, 1):
            left, bottom = col * side, row * side
            fill = match_fill if row == col else mismatch_fill
            ax.add_patch(plt.Rectangle(
                (left, bottom), side, side,
                facecolor=fill, edgecolor='none'
            ))
            ax.text(left + side / 2, bottom + side / 2,
                    str(cm[row, col]), ha='center', va='center',
                    fontsize=8, color='black')

    extent = 2 * side
    tick_positions = [side / 2, side * 1.5]  # centres of the two cells
    tick_labels = ["Neg", "Pos"]
    ax.set_xlim(0, extent)
    ax.set_ylim(0, extent)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, fontsize=8)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(tick_labels, fontsize=8)
    ax.invert_yaxis()
    ax.set_aspect("equal")
    ax.set_frame_on(False)
    for spine in ax.spines.values():
        spine.set_visible(False)

###############################################################################
#    Manual Full-Profiling (No ydata-profiling)
###############################################################################
def custom_full_profiling(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise every column of ``df`` in a one-row-per-column table.

    Args:
        df: Data to profile. A numeric column named ``target`` is optional;
            when present, each numeric column is correlated with it.

    Returns:
        DataFrame with the columns ``Column``, ``Dtype``, ``#Non-Null``,
        ``#Missing``, ``#Unique``, ``Min``, ``Max``, ``Mean``, ``Median``,
        ``Std``, ``Corr(target)``, ``Top`` and ``Freq``. The numeric statistics
        are filled for numeric columns only; ``Top``/``Freq`` (most frequent
        value and its count, missing values included) are filled for
        non-numeric columns only.
    """
    # Correlate numeric columns only: running corrwith over string columns
    # raises on pandas versions that do not silently drop them.
    numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    if 'target' in numeric_cols:
        correlations = df[numeric_cols].corrwith(df['target'])
    else:
        correlations = pd.Series(dtype='float64')

    rows = []
    for col in df.columns:
        series = df[col]
        info = {
            'Column': col,
            'Dtype': str(series.dtype),
            '#Non-Null': series.notnull().sum(),
            '#Missing': series.isnull().sum(),
            '#Unique': series.nunique(dropna=False),
            'Min': None,
            'Max': None,
            'Mean': None,
            'Median': None,
            'Std': None,
            'Corr(target)': None,
            'Top': None,
            'Freq': None,
        }
        if pd.api.types.is_numeric_dtype(series.dtype):
            info['Min'] = series.min(skipna=True)
            info['Max'] = series.max(skipna=True)
            info['Mean'] = series.mean(skipna=True)
            info['Median'] = series.median(skipna=True)
            info['Std'] = series.std(skipna=True)
            if col in correlations.index:
                info['Corr(target)'] = correlations[col]
        else:
            counts = series.value_counts(dropna=False)
            if len(counts) > 0:
                # value_counts sorts by frequency, so entry 0 is the mode.
                info['Top'] = counts.index[0]
                info['Freq'] = counts.iloc[0]
        rows.append(info)
    return pd.DataFrame(rows)

def dynamic_explore_data(df: pd.DataFrame, max_cols: int):
    """Show the profiling table for ``df`` and, when possible, a pairplot.

    The profiling table from ``custom_full_profiling`` is always displayed,
    followed by a one-line summary of the total number of missing cells.
    A seaborn pairplot coloured by ``target`` is then drawn over at most
    ``max_cols`` numeric columns (``target`` itself excluded from that
    count). The plot is skipped, with a printed notice, when ``df`` has no
    ``target`` column or no other numeric column.

    Args:
        df: Data to explore.
        max_cols: Upper bound on the number of numeric feature columns that
            are included in the pairplot.
    """
    # --- profiling table -----------------------------------------------------
    summary = custom_full_profiling(df)
    print("=== Full Profiling Info ===")
    display(summary)

    # --- missing-value summary -----------------------------------------------
    n_missing = df.isna().sum().sum()
    if n_missing == 0:
        print("No missing values (above).")
    else:
        print(f"Missing values: {n_missing} (above).")

    # --- pairplot guards -----------------------------------------------------
    if "target" not in df.columns:
        print("\nNo 'target' => skipping pairplot.")
        return

    numeric_features = [
        name
        for name in df.select_dtypes(include=[np.number]).columns.tolist()
        if name != "target"
    ]
    chosen = numeric_features[:max_cols]
    if len(chosen) == 0:
        print("\nNo numeric => skipping pairplot.")
        return

    plot_columns = chosen + ["target"]
    if any(name not in df.columns for name in plot_columns):
        print("\nSome pairplot columns missing => skipping.")
        return

    # --- pairplot ------------------------------------------------------------
    print("\nPlotting Pairplot (may take time):")
    with tqdm(total=1, desc="Pairplot") as progress:
        scatter_style = {"s": 25, "alpha": 0.75}
        grid = sns.pairplot(
            df[plot_columns],
            hue="target",
            palette="Greys",
            markers=["o", "D"],
            plot_kws=scatter_style,
            height=3,
        )
        grid.fig.suptitle(
            f"Pairplot of up to {max_cols} Numeric + Target", y=1.02
        )
        plt.show()
        # Single-step bar: mark it complete once the figure has been shown.
        progress.update(1)

###############################################################################
#                     UI + Execution
###############################################################################
# ---- Captions: bold HTML labels rendered above each control -----------------
data_source_title = widgets.HTML(value="<b>Select data source:</b>")
file_path_title = widgets.HTML(value="<b>CSV File Path (if 'file' selected):</b>")
search_title = widgets.HTML(value="<b>Selected search method:</b>")
scaling_title = widgets.HTML(value="<b>Selected scaling method:</b>")
models_title = widgets.HTML(value="<b>Selected models:</b>")
max_cols_title = widgets.HTML(value="<b>Max numeric columns for pairplot:</b>")

# ---- Data source: one of two embedded datasets, or a CSV file on disk -------
data_source_widget = widgets.RadioButtons(
    value="heart_disease (embedded)",
    options=[
        "heart_disease (embedded)",
        "breast_cancer (embedded)",
        "file",
    ],
)
file_path_widget = widgets.Text(value="heart_disease.csv")

# ---- Hyper-parameter search strategy and feature scaling --------------------
search_method_widget = widgets.RadioButtons(
    value='random',
    options=['grid', 'random', 'bayesian', 'none'],
)
scaling_method_widget = widgets.RadioButtons(
    value='StandardScaler',
    options=[
        'StandardScaler',
        'MinMaxScaler',
        'RobustScaler',
        'MaxAbsScaler',
        'Normalizer',
        'QuantileTransformer',
        'PowerTransformer',
        'none',
    ],
)

# ---- Model selection: one checkbox per model name, all ticked initially -----
model_options = [
    'Naive Bayes',
    'KNN',
    'Logistic Regression',
    'Linear Discriminant Analysis',
    'Quadratic Discriminant Analysis',
    'Decision Tree',
    'SVM',
    'Random Forest',
    'Extra Trees',
    'AdaBoost',
    'Gradient Boosting',
    'XGBoost',
    'LightGBM',
    'Neural Net (Keras MLP)',
]
model_checkboxes = [
    widgets.Checkbox(description=label, value=True) for label in model_options
]
model_selection_box = widgets.VBox(children=model_checkboxes)

# ---- Pairplot width, trigger button and the area that receives all output ---
max_cols_widget = widgets.IntSlider(min=1, max=50, step=1, value=5)
run_button = widgets.Button(description="Run Analysis")
output = widgets.Output()

# ---- Render the form: each caption directly followed by its control ---------
for _first, _second in (
    (data_source_title, data_source_widget),
    (file_path_title, file_path_widget),
    (search_title, search_method_widget),
    (scaling_title, scaling_method_widget),
    (models_title, model_selection_box),
    (max_cols_title, max_cols_widget),
    (run_button, output),
):
    display(_first, _second)

def _set_controls_disabled(disabled: bool) -> None:
    """Set the ``disabled`` flag on the run button and every input widget."""
    run_button.disabled = disabled
    data_source_widget.disabled = disabled
    file_path_widget.disabled = disabled
    search_method_widget.disabled = disabled
    scaling_method_widget.disabled = disabled
    max_cols_widget.disabled = disabled
    for cb in model_checkboxes:
        cb.disabled = disabled


def _print_highest_auc(label: str, results: dict) -> None:
    """Print the model in ``results`` whose metrics have the largest "AUC".

    ``results`` maps a model name to a metrics mapping containing an "AUC"
    entry. Nothing is printed when no entry's AUC compares greater than
    negative infinity (for example when ``results`` is empty).
    """
    best_name = None
    best_auc = -np.inf
    for name, metrics in results.items():
        if metrics["AUC"] > best_auc:
            best_auc = metrics["AUC"]
            best_name = name
    if best_name:
        print(f"Highest AUC ({label}): {best_name} => {best_auc:.4f}")


def on_button_clicked(_):
    """Run the whole analysis for the current widget selections.

    Steps, all rendered into the ``output`` widget:

    1. Echo the chosen settings.
    2. Load the selected dataset (embedded CSV text or a file).
    3. Show the exploration report via ``dynamic_explore_data``; stop here if
       the data has no ``target`` column.
    4. Make a stratified 80/20 train/test split, fit the selected models with
       their defaults and evaluate them.
    5. Tune the models with the chosen search method, show the
       default-vs-tuned comparison and append it to the result CSV.
    6. Report the best default/tuned AUC and plot ROC curves and confusion
       matrices.

    All input widgets are disabled while the run is in progress and are
    always re-enabled afterwards, including on the early exit in step 3 and
    when an error escapes the run.

    Args:
        _: Argument supplied by the button click callback; unused.
    """
    # Disable the form while running so settings cannot change mid-run.
    _set_controls_disabled(True)
    try:
        with output:
            clear_output()

            data_source = data_source_widget.value
            file_path = file_path_widget.value
            max_cols = max_cols_widget.value
            search_method = search_method_widget.value
            scaling_method = scaling_method_widget.value
            selected_ms = [cb.description for cb in model_checkboxes if cb.value]

            print(f"Data Source: {data_source}")
            if data_source == "file":
                print(f"CSV File Path: {file_path}")
            print(f"Max numeric columns: {max_cols}")
            print(f"Search method: {search_method}")
            print(f"Scaling method: {scaling_method}")
            print(f"Selected models: {selected_ms}\n")

            if data_source == "heart_disease (embedded)":
                result_file = "heart_disease_result.csv"
                df = pd.read_csv(io.StringIO(HEART_DISEASE_CSV_DATA))
            elif data_source == "breast_cancer (embedded)":
                result_file = "breast_cancer_result.csv"
                df = pd.read_csv(io.StringIO(BREAST_CANCER_CSV_DATA))
            else:
                # Derive "<name>_result.csv" from the final extension only.
                # A plain str.replace(".csv", ...) would return the input
                # path itself when it has no lowercase ".csv" (so results
                # would be appended to the data file) and would also rewrite
                # ".csv" occurring inside directory names.
                root, _ext = os.path.splitext(file_path.rstrip())
                result_file = f"{root}_result.csv"
                df = load_file_data(file_path)

            dynamic_explore_data(df, max_cols)
            if "target" not in df.columns:
                print("No 'target' => stopping pipeline.")
                return

            X = df.drop("target", axis=1)
            y = df["target"]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y, random_state=42
            )

            # define + train default models
            all_models = define_all_models(scaling_method)
            default_models = {m: all_models[m] for m in selected_ms}
            print("Training default models:")
            default_results = {}
            for mname, model in tqdm(default_models.items(), desc="Default Training"):
                model.fit(X_train, y_train)
                default_results[mname] = evaluate_model(model, X_test, y_test)

            # Tuning
            param_grids = define_param_grids()
            tuned_models, tuned_results, best_params = tune_models(
                default_models, param_grids, X_train, y_train, X_test, y_test,
                search_method
            )

            df_comp, styled_comp = build_comparison_table(
                default_results, tuned_results, search_method, scaling_method
            )
            print("\nComparison of Default vs. Tuned:")
            display(styled_comp)

            append_results_to_csv(df_comp, result_file)
            print(f"\nAppended results to '{result_file}'\n")

            _print_highest_auc("Tuned", tuned_results)
            _print_highest_auc("Default", default_results)

            print("\nPlotting combined ROC + Confusions:")
            plot_roc_and_confusion(default_models, tuned_models, X_test, y_test)
    finally:
        # Runs on normal completion, on the early return above and on errors,
        # so the form can never be left permanently disabled.
        _set_controls_disabled(False)

# Run the analysis pipeline whenever the "Run Analysis" button is clicked.
run_button.on_click(on_button_clicked)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h

HTML(value='<b>Select data source:</b>')

RadioButtons(options=('heart_disease (embedded)', 'breast_cancer (embedded)', 'file'), value='heart_disease (e…

HTML(value="<b>CSV File Path (if 'file' selected):</b>")

Text(value='heart_disease.csv')

HTML(value='<b>Selected search method:</b>')

RadioButtons(index=1, options=('grid', 'random', 'bayesian', 'none'), value='random')

HTML(value='<b>Selected scaling method:</b>')

RadioButtons(options=('StandardScaler', 'MinMaxScaler', 'RobustScaler', 'MaxAbsScaler', 'Normalizer', 'Quantil…

HTML(value='<b>Selected models:</b>')

VBox(children=(Checkbox(value=True, description='Naive Bayes'), Checkbox(value=True, description='KNN'), Check…

HTML(value='<b>Max numeric columns for pairplot:</b>')

IntSlider(value=5, max=50, min=1)

Button(description='Run Analysis', style=ButtonStyle())

Output()