In [2]:
import pandas as pd
import numpy as np

# Directory that holds every CSV used by this notebook. Change it here
# instead of editing each individual path.
DATA_DIR = "datasets"


def _load_pair(features_stem, target_stem):
    """Read one features CSV and its matching target CSV from ``DATA_DIR``.

    Parameters
    ----------
    features_stem : str
        File name (without ``.csv``) of the features table.
    target_stem : str
        File name (without ``.csv``) of the target table.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(features, target)``, read in that order.
    """
    features = pd.read_csv(f"{DATA_DIR}/{features_stem}.csv")
    target = pd.read_csv(f"{DATA_DIR}/{target_stem}.csv")
    return features, target


# Base datasets.
data1, target1 = _load_pair("cleaned_lightpath_dataset", "cleaned_lightpath_target")
data2, target2 = _load_pair("cleaned_lightpath_dataset_2", "cleaned_lightpath_target_2")

# "data1 plus N" variants, each with a "_balanced" counterpart.
data_5, target_5 = _load_pair("data1_plus_5", "target1_plus_5")
data_5_balanced, target_5_balanced = _load_pair("data1_plus_5_balanced", "target1_plus_5_balanced")

data_10, target_10 = _load_pair("data1_plus_10", "target1_plus_10")
data_10_balanced, target_10_balanced = _load_pair("data1_plus_10_balanced", "target1_plus_10_balanced")

data15, target15 = _load_pair("data1_plus_15", "target1_plus_15")
data15_balanced, target15_balanced = _load_pair("data1_plus_15_balanced", "target1_plus_15_balanced")

data20, target20 = _load_pair("data1_plus_20", "target1_plus_20")
data20_balanced, target20_balanced = _load_pair("data1_plus_20_balanced", "target1_plus_20_balanced")

# Shards of dataset 2.
shard1, target_shard1 = _load_pair("dataset2_shard_1", "target2_shard_1")
shard2, target_shard2 = _load_pair("dataset2_shard_2", "target2_shard_2")
shard3, target_shard3 = _load_pair("dataset2_shard_3", "target2_shard_3")

In [None]:
# Imports
from itertools import cycle
from sklearn.metrics import classification_report, average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import numpy as np
import pandas as pd

# Optional gradient-boosting backends: attempt every import, remember which
# ones failed and why, then fail once with a complete report instead of
# stopping at the first problem.
missing = []
import_errors = {}  # package name -> "ExceptionType: message" for the report

try:
    from xgboost import XGBClassifier
except Exception as exc:  # any import-time failure is recorded, not only ImportError
    missing.append("xgboost")
    import_errors["xgboost"] = f"{type(exc).__name__}: {exc}"
try:
    from catboost import CatBoostClassifier
except Exception as exc:
    missing.append("catboost")
    import_errors["catboost"] = f"{type(exc).__name__}: {exc}"
try:
    from lightgbm import LGBMClassifier
except Exception as exc:
    missing.append("lightgbm")
    import_errors["lightgbm"] = f"{type(exc).__name__}: {exc}"

if missing:
    # Keep the original message and append the underlying causes, so that a
    # package which is installed but fails to load is distinguishable from
    # one that is simply absent.
    raise ImportError(
        "Missing packages: " + ", ".join(missing) + ". Install them before running this cell."
        + " Import errors: "
        + "; ".join(f"{pkg} -> {reason}" for pkg, reason in import_errors.items())
    )


# Model-specific preprocessing and hyperparameter search
This section defines lightweight search spaces per model and a helper to run a small randomized search. The goal is to compare preprocessing choices (e.g., scaling) and core hyperparameters without an exhaustive run.


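*Note*: The code below tunes one model on a single training dataset and reports the best-performing configuration as measured by cross-validation on that training data, not on the evaluation datasets. This is the intuitive choice, because the evaluation sets should not influence tuning.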

In [None]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

def build_search_spaces():
    """Build the preprocessing options and the per-model randomized-search spaces.

    Returns
    -------
    tuple
        ``(preproc_options, spaces)``.

        ``preproc_options`` maps a preprocessing name to an unfitted Pipeline:
        ``"scaled"`` is a median imputer followed by ``StandardScaler(with_mean=False)``,
        ``"unscaled"`` is the median imputer alone.

        ``spaces`` maps a model name to a dict with two keys: ``"model"`` (the
        unfitted base estimator) and ``"param_distributions"``. The latter
        holds a ``"preproc"`` entry listing the preprocessing names to try,
        plus ``model__*`` entries addressed to the ``"model"`` pipeline step.
    """
    # Preprocessing choices: with/without scaling for linear models
    preproc_options = {
        "scaled": Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler(with_mean=False)),
            ]
        ),
        "unscaled": Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="median")),
            ]
        ),
    }

    def _saga_logreg_space():
        # LogReg_L1 and LogReg_L2 fix penalty and solver on the estimator,
        # so only the regularization strength C is searched.
        return {
            "preproc": ["scaled"],
            "model__C": [0.01, 0.1, 1, 10],
        }

    def _forest_space():
        # RandomForest and ExtraTrees sample the same tree-shape grid.
        return {
            "preproc": ["unscaled"],
            "model__n_estimators": [200, 300, 500],
            "model__max_depth": [None, 10, 20, 30],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4],
        }

    spaces = {
        "LogisticRegression": {
            "model": LogisticRegression(max_iter=5000, n_jobs=-1),
            "param_distributions": {
                "preproc": ["scaled"],
                "model__C": [0.01, 0.1, 1, 10],
                "model__penalty": ["l2"],
                "model__solver": ["lbfgs", "saga"],
            },
        },
        "LogReg_L1": {
            "model": LogisticRegression(max_iter=5000, penalty="l1", solver="saga", n_jobs=-1),
            "param_distributions": _saga_logreg_space(),
        },
        "LogReg_L2": {
            "model": LogisticRegression(max_iter=5000, penalty="l2", solver="saga", n_jobs=-1),
            "param_distributions": _saga_logreg_space(),
        },
        "RandomForest": {
            "model": RandomForestClassifier(random_state=42, n_jobs=-1),
            "param_distributions": _forest_space(),
        },
        "ExtraTrees": {
            "model": ExtraTreesClassifier(random_state=42, n_jobs=-1),
            "param_distributions": _forest_space(),
        },
        "XGBoost": {
            "model": XGBClassifier(
                n_estimators=300,
                learning_rate=0.1,
                max_depth=6,
                subsample=0.8,
                colsample_bytree=0.8,
                eval_metric="logloss",
                n_jobs=-1,
                random_state=42,
            ),
            "param_distributions": {
                "preproc": ["unscaled"],
                "model__n_estimators": [200, 300, 500],
                "model__max_depth": [4, 6, 8],
                "model__learning_rate": [0.03, 0.05, 0.1],
                "model__subsample": [0.7, 0.8, 1.0],
                "model__colsample_bytree": [0.7, 0.8, 1.0],
            },
        },
        "LightGBM": {
            # LightGBM applies row subsampling only when subsample_freq > 0.
            # Without it the `model__subsample` values searched below would
            # have no effect on the fitted model.
            "model": LGBMClassifier(
                n_estimators=300,
                learning_rate=0.05,
                num_leaves=31,
                subsample_freq=1,
                random_state=42,
                n_jobs=-1,
            ),
            "param_distributions": {
                "preproc": ["unscaled"],
                "model__n_estimators": [200, 300, 500],
                "model__num_leaves": [15, 31, 63],
                "model__learning_rate": [0.03, 0.05, 0.1],
                "model__subsample": [0.7, 0.8, 1.0],
                "model__colsample_bytree": [0.7, 0.8, 1.0],
            },
        },
        "CatBoost": {
            "model": CatBoostClassifier(iterations=300, learning_rate=0.1, depth=6, random_seed=42, verbose=False),
            "param_distributions": {
                "preproc": ["unscaled"],
                "model__iterations": [200, 300, 500],
                "model__depth": [4, 6, 8],
                "model__learning_rate": [0.03, 0.05, 0.1],
                "model__l2_leaf_reg": [1, 3, 5, 7],
            },
        },
    }
    return preproc_options, spaces

def build_pipeline(preproc_name, preproc_options, model):
    """Chain the named preprocessing option and ``model`` into a two-step Pipeline.

    The steps are named ``"preproc"`` and ``"model"``. ``preproc_name`` is
    looked up in ``preproc_options``, so an unknown name raises ``KeyError``.
    """
    preprocessor = preproc_options[preproc_name]
    stages = [("preproc", preprocessor), ("model", model)]
    return Pipeline(steps=stages)

def tune_model(X_train_df, y_train_df, model_name, n_iter=20, cv_splits=3, scoring="f1_weighted", random_state=42):
    """Run a small randomized hyperparameter search for one model on one training set.

    Parameters
    ----------
    X_train_df : pandas.DataFrame
        Training features; one-hot encoded with ``pd.get_dummies`` before the search.
    y_train_df : pandas.DataFrame
        Frame whose first column holds the target labels.
    model_name : str
        Key into the spaces returned by ``build_search_spaces``.
    n_iter : int
        Number of parameter settings sampled by ``RandomizedSearchCV``.
    cv_splits : int
        Number of shuffled stratified folds.
    scoring : str
        Scoring name passed to ``RandomizedSearchCV``.
    random_state : int
        Seed used for both the fold shuffling and the parameter sampling.

    Returns
    -------
    dict or None
        Result for the best preprocessing choice, with keys ``"model"``,
        ``"preproc"``, ``"best_score"``, ``"best_params"`` and
        ``"best_estimator"`` (the pipeline refit by the search on the full
        training data). ``None`` if the model lists no preprocessing choice.

    Raises
    ------
    ValueError
        If ``model_name`` is not a known search space.
    """
    preproc_options, spaces = build_search_spaces()
    # Reject an unknown model before doing any work on the data.
    if model_name not in spaces:
        raise ValueError(f"Unknown model_name: {model_name}")

    X = pd.get_dummies(X_train_df, drop_first=False)
    y = y_train_df.iloc[:, 0].values.ravel()

    space = spaces[model_name]
    base_model = space["model"]
    param_distributions = space["param_distributions"]

    # "preproc" selects which pipeline to build; it is not a pipeline
    # parameter, so it must not reach RandomizedSearchCV. The remaining
    # entries are the same for every preprocessing choice, so build them once.
    model_params = {k: v for k, v in param_distributions.items() if k != "preproc"}

    # Build every candidate pipeline up front so an unknown preprocessing name
    # fails before any search is run.
    candidates = [
        (build_pipeline(preproc_choice, preproc_options, base_model), preproc_choice)
        for preproc_choice in param_distributions["preproc"]
    ]

    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=random_state)

    best_result = None
    for estimator, preproc_choice in candidates:
        search = RandomizedSearchCV(
            estimator=estimator,
            param_distributions=model_params,
            n_iter=n_iter,
            scoring=scoring,
            cv=cv,
            random_state=random_state,
            n_jobs=-1,
            verbose=0,
        )
        search.fit(X, y)
        # Strict ">" keeps the first preprocessing choice on ties.
        if best_result is None or search.best_score_ > best_result["best_score"]:
            best_result = {
                "model": model_name,
                "preproc": preproc_choice,
                "best_score": search.best_score_,
                "best_params": search.best_params_,
                "best_estimator": search.best_estimator_,
            }
    return best_result

## Example: tune one model on one dataset
Adjust `model_name` and `n_iter` as needed. Start small to keep runtime reasonable.

In [None]:
# Example run: LogisticRegression tuned on data_10 / target_10.
# Swap the dataset, model_name or n_iter to try other combinations.
best_lr = tune_model(
    data_10,
    target_10,
    model_name="LogisticRegression",
    n_iter=15,
    cv_splits=3,
)
best_lr

_______________________________

In [None]:
# --- Training datasets: (label, features, target) ---
train_datasets = [
    ("data1", data1, target1),
    ("data_5", data_5, target_5),
    ("data_5_balanced", data_5_balanced, target_5_balanced),
    ("data_10", data_10, target_10),
    ("data_10_balanced", data_10_balanced, target_10_balanced),
    ("data_15", data15, target15),
    ("data_15_balanced", data15_balanced, target15_balanced),
    ("data_20", data20, target20),
    ("data_20_balanced", data20_balanced, target20_balanced),
]

# --- Evaluation datasets: (label, features, target) ---
eval_datasets = [
    ("data2", data2, target2),
    ("shard1", shard1, target_shard1),
    ("shard2", shard2, target_shard2),
    ("shard3", shard3, target_shard3),
]


def _scaled_logreg(**logreg_kwargs):
    """Logistic regression behind a ``StandardScaler(with_mean=False)`` step."""
    return make_pipeline(StandardScaler(with_mean=False), LogisticRegression(**logreg_kwargs))


# --- Models with fixed, untuned settings: (label, estimator) ---
models = [
    ("LogisticRegression", _scaled_logreg(max_iter=2000, n_jobs=-1)),
    ("LogReg_L1", _scaled_logreg(max_iter=5000, penalty="l1", solver="saga", n_jobs=-1)),
    ("LogReg_L2", _scaled_logreg(max_iter=5000, penalty="l2", solver="saga", n_jobs=-1)),
    (
        "XGBoost",
        XGBClassifier(
            n_estimators=300,
            learning_rate=0.1,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
            random_state=42,
        ),
    ),
    (
        "CatBoost",
        CatBoostClassifier(iterations=300, learning_rate=0.1, depth=6, random_seed=42, verbose=False),
    ),
    (
        "LightGBM",
        LGBMClassifier(n_estimators=300, learning_rate=0.05, num_leaves=31, random_state=42, n_jobs=-1),
    ),
    ("RandomForest", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)),
    ("ExtraTrees", ExtraTreesClassifier(n_estimators=300, random_state=42, n_jobs=-1)),
]

# Pair each training dataset with exactly one model. The model list is cycled,
# so once it runs out the next dataset reuses the first model again (the same
# estimator object, not a copy).
model_cycle = cycle(models)
training_plan = [
    (ds_name, X, y, model_name, model)
    for (ds_name, X, y), (model_name, model) in zip(train_datasets, model_cycle)
]

def _prepare_xy(X_df: pd.DataFrame, y_df: pd.DataFrame):
    X = pd.get_dummies(X_df, drop_first=False)
    y = y_df.iloc[:, 0].values.ravel()
    return X, y

def _align_eval_columns(X_eval: pd.DataFrame, train_columns):
    X_eval = pd.get_dummies(X_eval, drop_first=False)
    X_eval = X_eval.reindex(columns=train_columns, fill_value=0)
    return X_eval

def _get_score(model, X_eval):
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_eval)
    if hasattr(model, "decision_function"):
        return model.decision_function(X_eval)
    return model.predict(X_eval)

# Imported here so this cell is self-contained.
import warnings

# One record per (training dataset, model, evaluation dataset).
results = []

for train_name, X_train_df, y_train_df, model_name, model in training_plan:
    X_train, y_train = _prepare_xy(X_train_df, y_train_df)
    model.fit(X_train, y_train)
    # Evaluation frames are reindexed onto these columns so the fitted model
    # sees the feature layout it was trained on.
    train_columns = X_train.columns
    for eval_name, X_eval_df, y_eval_df in eval_datasets:
        X_eval = _align_eval_columns(X_eval_df, train_columns)
        y_eval = y_eval_df.iloc[:, 0].values.ravel()
        y_pred = model.predict(X_eval)
        report = classification_report(y_eval, y_pred, output_dict=True, zero_division=0)
        weighted = report["weighted avg"]
        try:
            pr_auc = average_precision_score(y_eval, _get_score(model, X_eval), average="weighted")
        except Exception as exc:
            # PR-AUC stays best-effort (NaN on failure), but the failure is
            # reported instead of vanishing, so a column of NaNs can be
            # traced back to its cause.
            warnings.warn(
                f"PR-AUC unavailable for model '{model_name}' trained on '{train_name}' "
                f"and evaluated on '{eval_name}': {type(exc).__name__}: {exc}"
            )
            pr_auc = np.nan
        results.append({
            "train_dataset": train_name,
            "model": model_name,
            "eval_dataset": eval_name,
            "accuracy": report["accuracy"],
            "precision": weighted["precision"],
            "recall": weighted["recall"],
            "f1": weighted["f1-score"],
            "pr_auc": pr_auc,
        })

results_df = pd.DataFrame(results)

In [None]:
# Show the collected metrics: one row per (train dataset, model, eval dataset).
results_df