In [3]:
import pandas as pd
import numpy as np

def _read_pair(features_csv, target_csv):
    """Read a feature CSV and its companion target CSV, in that order.

    Returns a ``(features_df, target_df)`` tuple of DataFrames exactly as
    ``pd.read_csv`` produces them (default options).
    """
    return pd.read_csv(features_csv), pd.read_csv(target_csv)


# Base dataset and the second dataset (the latter is used for evaluation
# further down in the notebook).
data1, target1 = _read_pair('datasets/cleaned_lightpath_dataset.csv', 'datasets/cleaned_lightpath_target.csv')
data2, target2 = _read_pair('datasets/cleaned_lightpath_dataset_2.csv', 'datasets/cleaned_lightpath_target_2.csv')

# "data1_plus_N" variants and their "_balanced" counterparts.
# NOTE(review): the meaning of "plus_N" is only known from the file names - confirm.
data_5, target_5 = _read_pair('datasets/data1_plus_5.csv', 'datasets/target1_plus_5.csv')
data_5_balanced, target_5_balanced = _read_pair('datasets/data1_plus_5_balanced.csv', 'datasets/target1_plus_5_balanced.csv')

data_10, target_10 = _read_pair('datasets/data1_plus_10.csv', 'datasets/target1_plus_10.csv')
data_10_balanced, target_10_balanced = _read_pair('datasets/data1_plus_10_balanced.csv', 'datasets/target1_plus_10_balanced.csv')

data15, target15 = _read_pair('datasets/data1_plus_15.csv', 'datasets/target1_plus_15.csv')
data15_balanced, target15_balanced = _read_pair('datasets/data1_plus_15_balanced.csv', 'datasets/target1_plus_15_balanced.csv')

data20, target20 = _read_pair('datasets/data1_plus_20.csv', 'datasets/target1_plus_20.csv')
data20_balanced, target20_balanced = _read_pair('datasets/data1_plus_20_balanced.csv', 'datasets/target1_plus_20_balanced.csv')

# Three shards of the second dataset, each with its own target file.
shard1, target_shard1 = _read_pair('datasets/dataset2_shard_1.csv', 'datasets/target2_shard_1.csv')
shard2, target_shard2 = _read_pair('datasets/dataset2_shard_2.csv', 'datasets/target2_shard_2.csv')
shard3, target_shard3 = _read_pair('datasets/dataset2_shard_3.csv', 'datasets/target2_shard_3.csv')

In [None]:
# Imports
import warnings
from itertools import cycle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, average_precision_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



In [None]:

# The three gradient-boosting libraries are imported defensively so that a
# single, complete error can be raised listing every package that failed.
# `missing` keeps the package names; `_import_errors` keeps the exception each
# import actually raised, so a broken-but-installed package is distinguishable
# from one that is simply absent.
missing = []
_import_errors = {}

# The except clauses stay as broad as before: any import-time failure counts.
try:
    from xgboost import XGBClassifier
except Exception as exc:
    missing.append("xgboost")
    _import_errors["xgboost"] = exc
try:
    from catboost import CatBoostClassifier
except Exception as exc:
    missing.append("catboost")
    _import_errors["catboost"] = exc
try:
    from lightgbm import LGBMClassifier
except Exception as exc:
    missing.append("lightgbm")
    _import_errors["lightgbm"] = exc

if missing:
    # Surface the underlying errors instead of discarding them.
    _details = "; ".join(
        f"{pkg} ({type(err).__name__}: {err})" for pkg, err in _import_errors.items()
    )
    raise ImportError(
        "Missing packages: " + ", ".join(missing) + ". Install them before running this cell."
        + " Import errors: " + _details
    )

# --- Datasets (train) ---
# Keyed by the label that ends up in the results table; each value is the
# (features, target) pair loaded earlier in the notebook.
_train_sources = {
    "data1": (data1, target1),
    "data_5": (data_5, target_5),
    "data_5_balanced": (data_5_balanced, target_5_balanced),
    "data_10": (data_10, target_10),
    "data_10_balanced": (data_10_balanced, target_10_balanced),
    "data_15": (data15, target15),
    "data_15_balanced": (data15_balanced, target15_balanced),
    "data_20": (data20, target20),
    "data_20_balanced": (data20_balanced, target20_balanced),
}
# Flatten to the list of (name, X, y) tuples the rest of the notebook iterates over.
train_datasets = [
    (label, features, target)
    for label, (features, target) in _train_sources.items()
]

# --- Datasets (evaluation) ---
# Every trained model is scored against each of these (name, X, y) entries.
_eval_sources = {
    "data2": (data2, target2),
    "shard1": (shard1, target_shard1),
    "shard2": (shard2, target_shard2),
    "shard3": (shard3, target_shard3),
}
eval_datasets = [
    (label, features, target)
    for label, (features, target) in _eval_sources.items()
]

# --- Models (basic settings) ---
# One seed shared by every estimator that accepts one, so a fresh
# "Restart & Run All" reproduces the same fits.
RANDOM_STATE = 42

# List of (display name, unfitted estimator). The logistic-regression variants
# are wrapped in a scaling pipeline; with_mean=False scales without centering.
models = [
    (
        "LogisticRegression",
        make_pipeline(
            StandardScaler(with_mean=False),
            LogisticRegression(max_iter=2000, n_jobs=-1),
        ),
    ),
    (
        "LogReg_L1",
        make_pipeline(
            StandardScaler(with_mean=False),
            # saga is a stochastic solver, so it is seeded like the other models.
            LogisticRegression(
                max_iter=5000, penalty="l1", solver="saga", n_jobs=-1, random_state=RANDOM_STATE
            ),
        ),
    ),
    (
        "LogReg_L2",
        make_pipeline(
            StandardScaler(with_mean=False),
            LogisticRegression(
                max_iter=5000, penalty="l2", solver="saga", n_jobs=-1, random_state=RANDOM_STATE
            ),
        ),
    ),
    (
        "XGBoost",
        XGBClassifier(
            n_estimators=300,
            learning_rate=0.1,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "CatBoost",
        CatBoostClassifier(
            iterations=300, learning_rate=0.1, depth=6, random_seed=RANDOM_STATE, verbose=False
        ),
    ),
    (
        "LightGBM",
        LGBMClassifier(
            n_estimators=300, learning_rate=0.05, num_leaves=31, random_state=RANDOM_STATE, n_jobs=-1
        ),
    ),
    ("RandomForest", RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)),
    ("ExtraTrees", ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)),
]

# Map 1 model to 1 dataset (cycles through models if datasets > models).
# cycle() yields the very same estimator objects again once it wraps around,
# so each plan entry receives its own unfitted clone; otherwise two training
# sets would share (and successively refit) one estimator instance.
model_cycle = cycle(models)
training_plan = [
    (ds_name, X, y, model_name, clone(estimator))
    for (ds_name, X, y), (model_name, estimator) in zip(train_datasets, model_cycle)
]

def _prepare_xy(X_df: pd.DataFrame, y_df: pd.DataFrame):
    X = pd.get_dummies(X_df, drop_first=False)
    y = y_df.iloc[:, 0].values.ravel()
    return X, y

def _align_eval_columns(X_eval: pd.DataFrame, train_columns):
    X_eval = pd.get_dummies(X_eval, drop_first=False)
    X_eval = X_eval.reindex(columns=train_columns, fill_value=0)
    return X_eval

def _get_score(model, X_eval):
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X_eval)
    if hasattr(model, "decision_function"):
        return model.decision_function(X_eval)
    return model.predict(X_eval)

# Fit every (training set, model) pair from the plan and score it on each
# evaluation set. One result row is produced per (train set, eval set) pair.
results = []

for train_name, X_train_df, y_train_df, model_name, model in training_plan:
    X_train, y_train = _prepare_xy(X_train_df, y_train_df)
    model.fit(X_train, y_train)
    # Evaluation frames must expose exactly the columns the model was fit on.
    train_columns = X_train.columns
    for eval_name, X_eval_df, y_eval_df in eval_datasets:
        X_eval = _align_eval_columns(X_eval_df, train_columns)
        y_eval = y_eval_df.iloc[:, 0].values.ravel()
        y_pred = model.predict(X_eval)
        report = classification_report(y_eval, y_pred, output_dict=True, zero_division=0)
        weighted = report["weighted avg"]
        try:
            pr_auc = average_precision_score(y_eval, _get_score(model, X_eval), average="weighted")
        except Exception as exc:
            # Best effort: still record the row, but say why PR-AUC is missing
            # instead of silently writing NaN.
            warnings.warn(
                f"PR-AUC unavailable for train={train_name}, model={model_name}, "
                f"eval={eval_name}: {type(exc).__name__}: {exc}",
                RuntimeWarning,
            )
            pr_auc = np.nan
        results.append({
            "train_dataset": train_name,
            "model": model_name,
            "eval_dataset": eval_name,
            "accuracy": report["accuracy"],
            "precision": weighted["precision"],
            "recall": weighted["recall"],
            "f1": weighted["f1-score"],
            "pr_auc": pr_auc,
        })

results_df = pd.DataFrame(results)
results_df