# 04 — Hypertuning (Baseline laden → Tuning → Hypertuned speichern)

Dieses Notebook:
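1) lädt die Baseline-Modelle aus `../data/models/baseline/<SAMPLE_NAME>/`
2) lädt die Modell-Datasets (Features/Targets) aus `../data/datasets/<SAMPLE_NAME>/`
3) evaluiert Baseline-Modelle als Referenz
4) führt Hypertuning mit RandomizedSearchCV durch (Mood: HalvingRandomSearchCV, mit RandomizedSearchCV als Fallback)
5) evaluiert die besten Modelle auf dem Testset
6) speichert die hypertuned Modelle nach `../data/models/hypertuned/<SAMPLE_NAME>/`
7) schreibt Reports (`json`) nach `../data/reports/04_hypertuning/<SAMPLE_NAME>/` für spätere Notebooks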


## Imports

In [71]:
import json
from pathlib import Path
from typing import Dict, Any, Tuple

import numpy as np
import pandas as pd

from joblib import load, dump
from scipy.stats import randint, uniform, loguniform

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    roc_auc_score, average_precision_score, f1_score, confusion_matrix
)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler


from xgboost import XGBRegressor, XGBClassifier

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier


from sklearn import set_config
set_config(transform_output="default")


## Konfiguration

In [72]:
# Reuse a seed that already exists in the session namespace; otherwise use 42.
RANDOM_SEED = globals().get("RANDOM_SEED", 42)

# The active sample name is read from a small JSON pointer file.
BASE_EXPORT_DIR = Path("../data/interim/converted_sqlite_samples")
CURRENT_SAMPLE_PATH = BASE_EXPORT_DIR / "current_sample.json"
# Explicit encoding: the platform default is not UTF-8 everywhere, and the
# JSON reports at the end of this notebook are written as UTF-8 as well.
cfg = json.loads(CURRENT_SAMPLE_PATH.read_text(encoding="utf-8"))
SAMPLE_NAME = cfg["SAMPLE_NAME"]

# Model directories, one sub-folder per sample.
MODELS_BASELINE_DIR = Path("../data/models/baseline") / SAMPLE_NAME
MODELS_HYPER_DIR = Path("../data/models/hypertuned") / SAMPLE_NAME
MODELS_HYPER_DIR.mkdir(parents=True, exist_ok=True)

# Reports (metrics report and best parameters share the same directory).
REPORT_PATH = Path("../data/reports/04_hypertuning") / SAMPLE_NAME
BEST_PARAMS_PATH = Path("../data/reports/04_hypertuning") / SAMPLE_NAME

REPORT_PATH.mkdir(parents=True, exist_ok=True)
BEST_PARAMS_PATH.mkdir(parents=True, exist_ok=True)

# Train/test split fraction.
TEST_SIZE = 0.2

# Search budgets (number of sampled candidates) and CV folds.
TUNE_ITER_REG = 18
TUNE_ITER_CLS = 22
CV_FOLDS = 3


# Output containers filled by the tuning cells and written to JSON at the end.
report: Dict[str, Any] = {}
best_params: Dict[str, Any] = {}


## Helper Functions

In [73]:
def _identity(X):
    """Return the input unchanged (no-op transform)."""
    return X

def build_preprocessor_tree(X: pd.DataFrame):
    """Build the preprocessing for XGBoost/tree models (no scaling).

    Columns are split by dtype: numeric columns are median-imputed,
    all other columns are imputed with the most frequent value and
    one-hot encoded (unknown categories are ignored at transform time).

    Returns:
        Tuple ``(column_transformer, numeric_cols, categorical_cols)``;
        the transformer is returned unfitted.
    """
    numeric_cols, categorical_cols = [], []
    for column in X.columns:
        if pd.api.types.is_numeric_dtype(X[column]):
            numeric_cols.append(column)
        else:
            categorical_cols.append(column)

    numeric_branch = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
    categorical_branch = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    branches = [
        ("num", numeric_branch, numeric_cols),
        ("cat", categorical_branch, categorical_cols),
    ]
    transformer = ColumnTransformer(
        transformers=branches,
        remainder="drop",
        sparse_threshold=0.3,
    )
    return transformer, numeric_cols, categorical_cols

def build_preprocessor_linear(X: pd.DataFrame):
    """Build the preprocessing for linear models (with scaling).

    Numeric columns are median-imputed and scaled with
    ``StandardScaler(with_mean=False)``; all other columns are imputed
    with the most frequent value and one-hot encoded (unknown categories
    are ignored at transform time).

    Returns:
        Tuple ``(column_transformer, numeric_cols, categorical_cols)``;
        the transformer is returned unfitted.
    """
    numeric_cols, categorical_cols = [], []
    for column in X.columns:
        if pd.api.types.is_numeric_dtype(X[column]):
            numeric_cols.append(column)
        else:
            categorical_cols.append(column)

    numeric_branch = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler(with_mean=False)),
        ]
    )
    categorical_branch = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    branches = [
        ("num", numeric_branch, numeric_cols),
        ("cat", categorical_branch, categorical_cols),
    ]
    transformer = ColumnTransformer(
        transformers=branches,
        remainder="drop",
        sparse_threshold=0.3,
    )
    return transformer, numeric_cols, categorical_cols

def regression_report(y_true, y_pred) -> Dict[str, float]:
    """Return MAE, RMSE and R2 for a set of regression predictions.

    All values are converted to plain Python floats so the result can be
    stored in the JSON report.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        "RMSE": float(np.sqrt(squared_error)),
        "R2": float(r2_score(y_true, y_pred)),
    }

def classification_report_binary(y_true, proba, threshold=0.5) -> Dict[str, Any]:
    """Compute binary classification metrics from positive-class scores.

    Args:
        y_true: True labels, expected to be encoded as 0/1.
        proba: Predicted scores/probabilities for the positive class.
        threshold: Scores ``>= threshold`` are predicted as class 1.

    Returns:
        Dict with ``roc_auc``, ``pr_auc`` (both NaN when ``y_true`` holds a
        single class), ``f1`` and ``confusion_matrix`` as a nested list.
    """
    # Accept lists / Series as well as arrays for the thresholding below.
    proba = np.asarray(proba)
    pred = (proba >= threshold).astype(int)

    # Ranking metrics are undefined when only one class is present.
    has_both_classes = len(np.unique(y_true)) > 1
    roc = float(roc_auc_score(y_true, proba)) if has_both_classes else float("nan")
    pr = float(average_precision_score(y_true, proba)) if has_both_classes else float("nan")

    f1 = float(f1_score(y_true, pred))
    # Fix the label order so the matrix is always 2x2, even when y_true and
    # pred happen to contain only one class.
    cm = confusion_matrix(y_true, pred, labels=[0, 1]).tolist()
    return {"roc_auc": roc, "pr_auc": pr, "f1": f1, "confusion_matrix": cm}

def best_f1_threshold(y_true, proba, thresholds=np.linspace(0.05, 0.95, 19)):
    """Scan a threshold grid and return the threshold with the highest F1.

    Scores ``>= t`` count as positive predictions. On ties the earliest
    threshold in ``thresholds`` wins. With an empty grid the result is
    ``(0.5, -1.0)``.

    Returns:
        Tuple ``(best_threshold, best_f1)`` as plain floats.
    """
    scored = [
        (f1_score(y_true, (proba >= cutoff).astype(int)), cutoff)
        for cutoff in thresholds
    ]
    if not scored:
        return 0.5, -1.0

    # max() returns the first maximal element, matching a strict ">" scan.
    top_f1, top_cutoff = max(scored, key=lambda pair: pair[0])
    return float(top_cutoff), float(top_f1)


from utils.helper_functions import  sklearn_sanitize_df


# Wrap the project helper as a transformer so it can be used as the first
# pipeline step; feature names are passed through unchanged ("one-to-one").
sanitize_tf = FunctionTransformer(sklearn_sanitize_df, feature_names_out="one-to-one")

## Daten Laden

In [74]:
# Directory holding the parquet feature/target files of the active sample.
DATA_DIR = Path("../data/datasets") / SAMPLE_NAME

def _load_parquet(p: Path):
    if not p.exists():
        raise FileNotFoundError(f"Datei nicht gefunden: {p}")
    return pd.read_parquet(p)

# Path of every dataset file, keyed by dataset name (file stem == key).
files = {
    dataset: DATA_DIR / f"{dataset}.parquet"
    for dataset in (
        "X_track_pop",
        "y_track_pop",
        "X_album_pop",
        "y_album_pop",
        "X_track_hit",
        "y_hit",
        "X_track_explicit",
        "y_explicit",
        "X_track_mood",
        "Y_mood",
    )
}

# Regression tasks: the target frame is squeezed after loading.
X_track_pop = _load_parquet(files["X_track_pop"])
y_track_pop = _load_parquet(files["y_track_pop"]).squeeze()

X_album_pop = _load_parquet(files["X_album_pop"])
y_album_pop = _load_parquet(files["y_album_pop"]).squeeze()

# Binary classification tasks: targets are squeezed and cast to int.
X_track_hit = _load_parquet(files["X_track_hit"])
y_hit = _load_parquet(files["y_hit"]).squeeze().astype(int)

X_track_explicit = _load_parquet(files["X_track_explicit"])
y_explicit = _load_parquet(files["y_explicit"]).squeeze().astype(int)

# Mood task: the target is kept as a DataFrame (not squeezed).
X_track_mood = _load_parquet(files["X_track_mood"])
y_mood = _load_parquet(files["Y_mood"])

In [75]:
# Shapes of the mood dataset. The labels previously said "X_track_pop" /
# "y_track_pop" although the mood frames are what is printed here.
print("X_track_mood:", X_track_mood.shape)
print("y_mood:", y_mood.shape)
# Index sanity checks for the track-popularity features/target pair.
print("X index unique:", X_track_pop.index.is_unique)
print("y index unique:", y_track_pop.index.is_unique)
print("Index equal:", X_track_pop.index.equals(y_track_pop.index))


X_track_pop: (299981, 70)
y_track_pop: (299981, 7)
X index unique: True
y index unique: True
Index equal: True


## Baseline-Modelle laden

In [76]:
# Expected baseline artefacts per task, all located in MODELS_BASELINE_DIR.
baseline_paths = {
    task: MODELS_BASELINE_DIR / filename
    for task, filename in (
        ("track_popularity", "03_track_popularity_pipeline_xgb.joblib"),
        ("album_popularity", "03_album_popularity_pipeline_xgb.joblib"),
        ("hit", "03_hit_pipeline_xgb.joblib"),
        ("explicit", "03_explicit_pipeline_xgb.joblib"),
        ("mood", "03_mood_pipeline.joblib"),
    )
}

# Load whatever exists; missing baselines are reported and skipped.
# NOTE: joblib.load unpickles the file, so only load artefacts you trust.
baseline_models = {}
for k, p in baseline_paths.items():
    if not p.exists():
        print(f"Baseline nicht gefunden (übersprungen): {p}")
        continue
    baseline_models[k] = load(p)

list(baseline_models)


['track_popularity', 'album_popularity', 'hit', 'explicit', 'mood']

## Gemeinsame Splits (Baseline & Hypertuned müssen gleich evaluieren)

In [77]:
# One train/test split per task, all with the same test fraction and seed.
# Each value is the 4-tuple returned by train_test_split.
splits = {
    # Regression tasks: plain random split (no stratification).
    "track_pop": train_test_split(
        X_track_pop, y_track_pop, test_size=TEST_SIZE, random_state=RANDOM_SEED
    ),
    "album_pop": train_test_split(
        X_album_pop, y_album_pop, test_size=TEST_SIZE, random_state=RANDOM_SEED
    ),
    # Binary classification tasks: stratified on the target.
    "hit": train_test_split(
        X_track_hit, y_hit, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_hit
    ),
    "explicit": train_test_split(
        X_track_explicit, y_explicit, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_explicit
    ),
    # Mood task: not stratified.
    "mood": train_test_split(
        X_track_mood, y_mood, test_size=TEST_SIZE, random_state=RANDOM_SEED
    ),
}


## Tuning-Funktion

In [78]:
import numpy as np
import pandas as pd
import warnings
from dataclasses import dataclass
from typing import Optional

import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier

from sklearn.model_selection import train_test_split, RandomizedSearchCV, PredefinedSplit
from scipy.stats import randint, uniform, loguniform
from sklearn.pipeline import Pipeline


# -----------------------------
# Config
# -----------------------------
@dataclass(frozen=True)
class TuneCFG:
    """Immutable settings shared by the XGBoost tuning functions."""

    # Seed for the validation split, the XGBoost models and the randomized search.
    random_seed: int = 42
    # Fraction of the training data held out as validation fold.
    valid_size: float = 0.15
    # Passed to the XGBoost estimators as `early_stopping_rounds`.
    early_stopping_rounds: int = 50
    # Number of sampled candidates (the tuning functions cap this at 40).
    n_iter_reg: int = 18
    n_iter_cls: int = 18
    # Passed to the XGBoost estimators as `n_estimators`.
    n_estimators_reg: int = 6000
    n_estimators_cls: int = 8000
    # When True, a CUDA device is probed and used if the probe succeeds.
    prefer_gpu: bool = True
    # Parallelism of the randomized search itself; kept at 1 because the
    # candidates are meant to run on a single GPU with early stopping.
    n_jobs_search: int = 1


# Active tuning configuration. Seed and search budgets are taken from the
# notebook-level constants when those exist in the namespace; otherwise the
# fallbacks given here (42 / 18 / 18) are used.
CFG = TuneCFG(
    random_seed=globals().get("RANDOM_SEED", 42),
    n_iter_reg=globals().get("TUNE_ITER_REG", 18),
    n_iter_cls=globals().get("TUNE_ITER_CLS", 18),
)


def _detect_xgb_device(prefer_gpu=True) -> str:
    if not prefer_gpu:
        return "cpu"
    X = np.random.rand(4000, 20).astype(np.float32)
    y = np.random.rand(4000).astype(np.float32)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        try:
            m = xgb.XGBRegressor(n_estimators=30, tree_method="hist", device="cuda", n_jobs=1)
            m.fit(X, y, verbose=False)
        except Exception:
            return "cpu"
        msgs = "\n".join(str(x.message) for x in w)
        if "No visible GPU is found" in msgs or "Device is changed from GPU to CPU" in msgs:
            return "cpu"
    return "cuda"


def _make_ps(n_train: int, n_val: int) -> PredefinedSplit:
    """Build a one-fold PredefinedSplit: first ``n_train`` rows train, rest validate.

    Rows marked ``-1`` are never used for testing; rows marked ``0`` form
    the single test fold.
    """
    train_marks = np.full(n_train, -1, dtype=int)
    val_marks = np.zeros(n_val, dtype=int)
    return PredefinedSplit(test_fold=np.concatenate([train_marks, val_marks]))


def _stack(a, b):
    # works for numpy arrays and scipy sparse
    try:
        import scipy.sparse as sp
        if sp.issparse(a) or sp.issparse(b):
            return sp.vstack([a, b])
    except Exception:
        pass
    return np.vstack([a, b])


# -----------------------------
# Wrapper so your downstream code still works: best_model.predict(X_df)
# -----------------------------
class PreprocessThenModel:
    """Pair a preprocessing pipeline with a model trained on its output.

    Lets callers pass raw feature frames to ``predict`` / ``predict_proba``:
    the input is first run through ``preprocess_pipe.transform`` and the
    result is handed to the wrapped model.
    """

    def __init__(self, preprocess_pipe: Pipeline, model):
        self.preprocess_pipe = preprocess_pipe
        self.model = model

    def _prepare(self, X):
        """Transform raw features with the stored preprocessing pipeline."""
        return self.preprocess_pipe.transform(X)

    def predict(self, X):
        """Preprocess ``X`` and return the wrapped model's predictions."""
        return self.model.predict(self._prepare(X))

    def predict_proba(self, X):
        """Preprocess ``X`` and return the wrapped model's class probabilities."""
        return self.model.predict_proba(self._prepare(X))

    def get_params(self, deep=True):
        """Return both components as a dict; ``deep`` is accepted but unused."""
        components = {"preprocess_pipe": self.preprocess_pipe, "model": self.model}
        return components


# -----------------------------
# DROP-IN API (same call style)
# -----------------------------
def tune_xgb_regression(Xtr, ytr, preprocessor):
    """Tune an XGBoost regressor with a randomized search on one hold-out fold.

    The training data is split once into train/validation. The preprocessing
    is fitted on the train part only, both parts are transformed, and the
    randomized search evaluates each candidate on the validation part (which
    is also the early-stopping eval set). The best parameters are then
    refitted manually on the transformed train part.

    Args:
        Xtr: Raw training features.
        ytr: Regression target aligned with ``Xtr``.
        preprocessor: Unfitted preprocessing transformer; it is fitted here
            as part of a pipeline behind ``sanitize_tf``.

    Returns:
        The fitted ``RandomizedSearchCV`` (created with ``refit=False``) whose
        ``best_estimator_`` is set by hand to a ``PreprocessThenModel`` that
        accepts raw features.
    """
    device = _detect_xgb_device(CFG.prefer_gpu)
    # NOTE: the printed n_iter is the configured value; the search below caps it at 40.
    print(f"[tune_xgb_regression FIXED] xgboost={xgb.__version__} | device={device} | n_iter={CFG.n_iter_reg}")

    # Single train/validation split; the validation part is the only CV fold.
    X_train, X_val, y_train, y_val = train_test_split(
        Xtr, ytr, test_size=CFG.valid_size, random_state=CFG.random_seed
    )

    # Fit preprocessing on the train part ONLY, then transform train and validation.
    preprocess_pipe = Pipeline([("sanitize", sanitize_tf), ("pre", preprocessor)])
    preprocess_pipe.fit(X_train, y_train)

    Xt_train = preprocess_pipe.transform(X_train)
    Xt_val = preprocess_pipe.transform(X_val)

    # Stack train rows first, validation rows last; the predefined split
    # relies on exactly this row order.
    X_all = _stack(Xt_train, Xt_val)
    y_all = np.concatenate([np.asarray(y_train), np.asarray(y_val)], axis=0)
    ps = _make_ps(Xt_train.shape[0], Xt_val.shape[0])

    base = XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        device=device,
        random_state=CFG.random_seed,
        n_jobs=1,
        n_estimators=CFG.n_estimators_reg,
        eval_metric="mae",
        early_stopping_rounds=CFG.early_stopping_rounds,  # early stopping is configured on the estimator (original note: XGB 3.1.x)
    )

    # Sampling distributions for the randomized search.
    param_dist = {
        "learning_rate": loguniform(0.03, 0.15),
        "max_depth": randint(3, 8),
        "min_child_weight": loguniform(1.0, 20.0),
        "subsample": uniform(0.75, 0.25),
        "colsample_bytree": uniform(0.75, 0.25),
        "gamma": loguniform(1e-8, 2.0),
        "reg_lambda": loguniform(1e-2, 50.0),
        "reg_alpha": loguniform(1e-8, 5.0),
    }

    search = RandomizedSearchCV(
        estimator=base,
        param_distributions=param_dist,
        n_iter=min(CFG.n_iter_reg, 40),
        scoring="neg_mean_absolute_error",
        cv=ps,
        random_state=CFG.random_seed,
        n_jobs=CFG.n_jobs_search,
        verbose=1,
        error_score="raise",
        refit=False,  # the final model is refitted manually below with early stopping
    )

    # The eval_set is the already transformed validation matrix, so the
    # estimator never sees raw (possibly string-typed) columns.
    search.fit(X_all, y_all, eval_set=[(Xt_val, np.asarray(y_val))], verbose=False)

    # Refit with the best parameters on the transformed train part,
    # early stopping on the transformed validation part.
    best = XGBRegressor(**base.get_params())
    best.set_params(**search.best_params_)
    best.fit(Xt_train, np.asarray(y_train), eval_set=[(Xt_val, np.asarray(y_val))], verbose=False)

    # Attach a wrapper that preprocesses raw features before predicting.
    search.best_estimator_ = PreprocessThenModel(preprocess_pipe, best)
    return search


def tune_xgb_classification(Xtr, ytr, preprocessor, scale_pos_weight: float):
    """Tune a binary XGBoost classifier with a randomized search on one hold-out fold.

    The training data is split once (stratified) into train/validation. The
    preprocessing is fitted on the train part only, both parts are
    transformed, and each candidate is scored by average precision on the
    validation part (which is also the early-stopping eval set). The best
    parameters are then refitted manually on the transformed train part.

    Args:
        Xtr: Raw training features.
        ytr: Binary target aligned with ``Xtr``; also used for stratification.
        preprocessor: Unfitted preprocessing transformer; it is fitted here
            as part of a pipeline behind ``sanitize_tf``.
        scale_pos_weight: Passed to XGBoost as ``scale_pos_weight``.

    Returns:
        The fitted ``RandomizedSearchCV`` (created with ``refit=False``) whose
        ``best_estimator_`` is set by hand to a ``PreprocessThenModel`` that
        accepts raw features.
    """
    device = _detect_xgb_device(CFG.prefer_gpu)
    # NOTE: the printed n_iter is the configured value; the search below caps it at 40.
    print(f"[tune_xgb_classification FIXED] xgboost={xgb.__version__} | device={device} | n_iter={CFG.n_iter_cls}")

    # Single stratified train/validation split; the validation part is the only CV fold.
    X_train, X_val, y_train, y_val = train_test_split(
        Xtr, ytr, test_size=CFG.valid_size, random_state=CFG.random_seed, stratify=ytr
    )

    # Fit preprocessing on the train part ONLY, then transform train and validation.
    preprocess_pipe = Pipeline([("sanitize", sanitize_tf), ("pre", preprocessor)])
    preprocess_pipe.fit(X_train, y_train)

    Xt_train = preprocess_pipe.transform(X_train)
    Xt_val = preprocess_pipe.transform(X_val)

    # Stack train rows first, validation rows last; the predefined split
    # relies on exactly this row order.
    X_all = _stack(Xt_train, Xt_val)
    y_all = np.concatenate([np.asarray(y_train), np.asarray(y_val)], axis=0)
    ps = _make_ps(Xt_train.shape[0], Xt_val.shape[0])

    base = XGBClassifier(
        objective="binary:logistic",
        tree_method="hist",
        device=device,
        random_state=CFG.random_seed,
        n_jobs=1,
        n_estimators=CFG.n_estimators_cls,
        eval_metric="aucpr",
        scale_pos_weight=scale_pos_weight,
        early_stopping_rounds=CFG.early_stopping_rounds,
    )

    # Sampling distributions for the randomized search.
    param_dist = {
        "learning_rate": loguniform(0.03, 0.15),
        "max_depth": randint(3, 7),
        "min_child_weight": loguniform(1.0, 20.0),
        "subsample": uniform(0.75, 0.25),
        "colsample_bytree": uniform(0.75, 0.25),
        "gamma": loguniform(1e-8, 2.0),
        "reg_lambda": loguniform(1e-2, 50.0),
        "reg_alpha": loguniform(1e-8, 5.0),
    }

    search = RandomizedSearchCV(
        estimator=base,
        param_distributions=param_dist,
        n_iter=min(CFG.n_iter_cls, 40),
        scoring="average_precision",
        cv=ps,
        random_state=CFG.random_seed,
        n_jobs=CFG.n_jobs_search,
        verbose=1,
        error_score="raise",
        refit=False,  # the final model is refitted manually below with early stopping
    )

    # The eval_set is the already transformed validation matrix.
    search.fit(X_all, y_all, eval_set=[(Xt_val, np.asarray(y_val))], verbose=False)

    # Refit with the best parameters on the transformed train part,
    # early stopping on the transformed validation part.
    best = XGBClassifier(**base.get_params())
    best.set_params(**search.best_params_)
    best.fit(Xt_train, np.asarray(y_train), eval_set=[(Xt_val, np.asarray(y_val))], verbose=False)

    # Attach a wrapper that preprocesses raw features before predicting.
    search.best_estimator_ = PreprocessThenModel(preprocess_pipe, best)
    return search


### Track Popularity Tuning

In [79]:
# Track popularity: tune on the train split, evaluate once on the test split.
Xtr, Xte, ytr, yte = splits["track_pop"]
pre, _, _ = build_preprocessor_tree(Xtr)

search = tune_xgb_regression(Xtr, ytr, pre)
best_model = search.best_estimator_

# Test-set metrics of the refitted best model.
pred = best_model.predict(Xte)
metrics = regression_report(yte, pred)

best_params["track_popularity"] = search.best_params_
report["track_popularity"] = {
    "hypertuned": metrics,
    # The search scores with negated MAE, so flip the sign back.
    "cv_best_mae": float(-search.best_score_),
}

dump(best_model, MODELS_HYPER_DIR / "04_track_popularity_xgb_hypertuned.joblib")

report["track_popularity"]

[tune_xgb_regression FIXED] xgboost=3.1.2 | device=cuda | n_iter=18
Fitting 1 folds for each of 18 candidates, totalling 18 fits


{'hypertuned': {'MAE': 2.77017913132906,
  'RMSE': 5.191235544050068,
  'R2': 0.7617520606845933},
 'cv_best_mae': 2.7912476561665533}

### Album Popularity Tuning

In [80]:
# Album popularity: tune on the train split, evaluate once on the test split.
Xtr, Xte, ytr, yte = splits["album_pop"]
pre, _, _ = build_preprocessor_tree(Xtr)

search = tune_xgb_regression(Xtr, ytr, pre)
best_model = search.best_estimator_

# Test-set metrics of the refitted best model.
pred = best_model.predict(Xte)
metrics = regression_report(yte, pred)

best_params["album_popularity"] = search.best_params_
report["album_popularity"] = {
    "hypertuned": metrics,
    # The search scores with negated MAE, so flip the sign back.
    "cv_best_mae": float(-search.best_score_),
}

dump(best_model, MODELS_HYPER_DIR / "04_album_popularity_xgb_hypertuned.joblib")

report["album_popularity"]


[tune_xgb_regression FIXED] xgboost=3.1.2 | device=cuda | n_iter=18
Fitting 1 folds for each of 18 candidates, totalling 18 fits



KeyboardInterrupt



### Hit Tuning + Threshold


In [None]:
Xtr, Xte, ytr, yte = splits["hit"]
pre, _, _ = build_preprocessor_tree(Xtr)

# Class-imbalance weight: negatives per positive in the training split.
neg = int((ytr == 0).sum())
pos = int((ytr == 1).sum())
spw = neg / max(pos, 1)

search = tune_xgb_classification(Xtr, ytr, pre, scale_pos_weight=spw)
best_model = search.best_estimator_

# Choose the decision threshold on validation data, NOT on the test set:
# tuning the threshold on the test set and reporting F1 on that same set
# leaks test information and inflates the reported metrics.
# The call uses the same arguments as the split inside
# tune_xgb_classification, so it reproduces the validation fold the final
# model was not trained on (it was used for early stopping and candidate
# selection only).
val_parts = train_test_split(
    Xtr, ytr, test_size=CFG.valid_size, random_state=CFG.random_seed, stratify=ytr
)
X_val, y_val = val_parts[1], val_parts[3]
thr, thr_f1 = best_f1_threshold(y_val, best_model.predict_proba(X_val)[:, 1])

# Test-set evaluation at the threshold chosen above.
proba = best_model.predict_proba(Xte)[:, 1]
metrics = classification_report_binary(yte, proba, threshold=thr)
metrics["best_threshold"] = thr
metrics["best_threshold_f1"] = thr_f1  # F1 on the validation fold at `thr`

report["hit_prediction"] = {
    "hypertuned": metrics,
    "cv_best_pr_auc": float(search.best_score_),
    "scale_pos_weight": float(spw)
}
best_params["hit_prediction"] = search.best_params_

dump(best_model, MODELS_HYPER_DIR / "04_hit_xgb_hypertuned.joblib")

report["hit_prediction"]


### Explicit Tuning + Threshold

In [None]:
Xtr, Xte, ytr, yte = splits["explicit"]
pre, _, _ = build_preprocessor_tree(Xtr)

# Class-imbalance weight: negatives per positive in the training split.
neg = int((ytr == 0).sum())
pos = int((ytr == 1).sum())
spw = neg / max(pos, 1)

search = tune_xgb_classification(Xtr, ytr, pre, scale_pos_weight=spw)
best_model = search.best_estimator_

# Choose the decision threshold on validation data, NOT on the test set:
# tuning the threshold on the test set and reporting F1 on that same set
# leaks test information and inflates the reported metrics.
# The call uses the same arguments as the split inside
# tune_xgb_classification, so it reproduces the validation fold the final
# model was not trained on (it was used for early stopping and candidate
# selection only).
val_parts = train_test_split(
    Xtr, ytr, test_size=CFG.valid_size, random_state=CFG.random_seed, stratify=ytr
)
X_val, y_val = val_parts[1], val_parts[3]
thr, thr_f1 = best_f1_threshold(y_val, best_model.predict_proba(X_val)[:, 1])

# Test-set evaluation at the threshold chosen above.
proba = best_model.predict_proba(Xte)[:, 1]
metrics = classification_report_binary(yte, proba, threshold=thr)
metrics["best_threshold"] = thr
metrics["best_threshold_f1"] = thr_f1  # F1 on the validation fold at `thr`

report["explicit_prediction"] = {
    "hypertuned": metrics,
    "cv_best_pr_auc": float(search.best_score_),
    "scale_pos_weight": float(spw)
}
best_params["explicit_prediction"] = search.best_params_

dump(best_model, MODELS_HYPER_DIR / "04_explicit_xgb_hypertuned.joblib")

report["explicit_prediction"]


### Mood Tuning

In [None]:
import tempfile
import os
from sklearn.linear_model import LogisticRegression


def tune_logreg_multilabel_mood(Xtr, Ytr, preprocessor):
    """Tune a one-vs-rest logistic regression for the multilabel mood task.

    The pipeline is ``sanitize_tf`` -> ``preprocessor`` -> one-vs-rest
    logistic regression (saga solver). Candidates are drawn from an L2 and
    an elastic-net parameter space and scored with micro-averaged F1.

    Successive halving is used when scikit-learn provides it, with the
    solver's ``max_iter`` as the resource. Otherwise a plain randomized
    search over a fixed set of ``max_iter`` values is used.

    Args:
        Xtr: Raw training features.
        Ytr: Multilabel target aligned with ``Xtr``.
        preprocessor: Unfitted preprocessing transformer for the pipeline.

    Returns:
        The fitted search object.
    """
    # NOTE(review): an unused `cache_dir` local was removed here. The pipeline
    # was never given a `memory=` argument, so no preprocessing caching took
    # place; enable it deliberately via Pipeline(memory=...) if desired.
    base = OneVsRestClassifier(
        LogisticRegression(
            solver="saga",
            max_iter=300,           # overridden by the search (halving resource / fallback grid)
            tol=1e-3,               # starting value; tuned below
            random_state=RANDOM_SEED,
            n_jobs=1                # single-threaded; parallelism comes from the search
        ),
        n_jobs=1                   # avoid oversubscription with the search's n_jobs=-1
    )

    pipe = Pipeline(
        steps=[("sanitize", sanitize_tf), ("pre", preprocessor), ("model", base)]
    )

    # Two conditional spaces: l1_ratio is only sampled for elastic net.
    param_dist = [
        {
            "model__estimator__penalty": ["l2"],
            "model__estimator__C": loguniform(1e-2, 20.0),
            "model__estimator__tol": loguniform(1e-4, 1e-2),
        },
        {
            "model__estimator__penalty": ["elasticnet"],
            "model__estimator__C": loguniform(1e-2, 20.0),
            "model__estimator__l1_ratio": uniform(0.0, 1.0),
            "model__estimator__tol": loguniform(1e-4, 1e-2),
        },
    ]

    # Only the optional import is guarded, so real configuration errors are
    # not hidden behind the fallback.
    try:
        from sklearn.experimental import enable_halving_search_cv  # noqa: F401
        from sklearn.model_selection import HalvingRandomSearchCV
    except ImportError:
        # Fallback: randomized search with max_iter drawn from a fixed list.
        param_dist_fallback = [
            {
                **d,
                "model__estimator__max_iter": [200, 400, 800, 1200, 2000],
            }
            for d in param_dist
        ]

        search = RandomizedSearchCV(
            pipe,
            param_distributions=param_dist_fallback,
            n_iter=12,
            scoring="f1_micro",
            cv=2,
            verbose=1,
            random_state=RANDOM_SEED,
            n_jobs=-1,
            error_score="raise"
        )
    else:
        # Fast path: successive halving grows max_iter for surviving candidates
        # instead of always running every candidate at the full budget.
        search = HalvingRandomSearchCV(
            estimator=pipe,
            param_distributions=param_dist,
            scoring="f1_micro",
            cv=CV_FOLDS,
            n_candidates=max(12, TUNE_ITER_CLS),
            factor=3,
            resource="model__estimator__max_iter",
            min_resources=100,
            max_resources=2000,
            aggressive_elimination=True,
            random_state=RANDOM_SEED,
            n_jobs=-1,
            verbose=1,
            error_score="raise"
        )

    search.fit(Xtr, Ytr)
    return search


In [None]:
# Mood (multilabel): tune on the train split, evaluate once on the test split.
Xtr, Xte, Ytr, Yte = splits["mood"]
pre, _, _ = build_preprocessor_linear(Xtr)
search = tune_logreg_multilabel_mood(Xtr, Ytr, pre)

best_model = search.best_estimator_
Ypred = best_model.predict(Xte)

# Micro averages over all label decisions, macro averages per label.
micro_f1 = float(f1_score(Yte, Ypred, average="micro"))
macro_f1 = float(f1_score(Yte, Ypred, average="macro"))

best_params["mood_multilabel"] = search.best_params_
report["mood_multilabel"] = {
    "hypertuned": {"micro_f1": micro_f1, "macro_f1": macro_f1},
    "cv_best_f1_micro": float(search.best_score_),
}

dump(best_model, MODELS_HYPER_DIR / "04_mood_multilabel_logreg_hypertuned.joblib")


## Reports Saving

In [None]:
# Serialize both payloads BEFORE touching the files: if serialization fails
# (e.g. a value that is not JSON-serializable), an existing report is left
# intact instead of being truncated or partially written.
report_json = json.dumps(report, indent=2)
best_params_json = json.dumps(best_params, indent=2)

# Write hypertuning report
(REPORT_PATH / "04_hypertuning_report.json").write_text(report_json, encoding="utf-8")

# Write best parameters
(BEST_PARAMS_PATH / "04_hypertuning_best_params.json").write_text(best_params_json, encoding="utf-8")

print("Hypertuning fertig. Gespeichert unter:")
print(" -", MODELS_HYPER_DIR)
print(" -", REPORT_PATH)
print(" -", BEST_PARAMS_PATH)
