# 04 — Hypertuning (Baseline laden → Tuning → Hypertuned speichern)

Dieses Notebook:
1) lädt die Baseline-Modelle aus `models/baseline/`
2) lädt die Modell-Datasets (Features/Targets)
3) evaluiert Baseline-Modelle als Referenz
4) führt Hypertuning mit RandomizedSearchCV durch
5) evaluiert die besten Modelle auf dem Testset
6) speichert die hypertuned Modelle nach `models/hypertuned/`
7) schreibt Reports (`json`) für spätere Notebooks


## Imports

In [152]:
import json
from pathlib import Path
from typing import Dict, Any, Tuple

import numpy as np
import pandas as pd

from joblib import load, dump
from scipy.stats import randint, uniform, loguniform

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    roc_auc_score, average_precision_score, f1_score, confusion_matrix
)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler


from xgboost import XGBRegressor, XGBClassifier

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier


## Konfiguration

In [153]:
RANDOM_SEED = globals().get("RANDOM_SEED", 42)

# Ordnerstruktur (wie du es beschrieben hast)
MODELS_BASELINE_DIR = Path("../data/models/baseline")
MODELS_HYPER_DIR = Path("../data/models/hypertuned")
MODELS_HYPER_DIR.mkdir(parents=True, exist_ok=True)

# Reports
REPORT_PATH = MODELS_HYPER_DIR / "../data/reports/04_hypertuning/04_hypertuning_report.json"
BEST_PARAMS_PATH = MODELS_HYPER_DIR / "../data/reports/03_hypertuning/04_best_params.json"

# Split
TEST_SIZE = 0.2

# Tuning
TUNE_ITER_REG = 10  # Regression
TUNE_ITER_CLS = 12  # Classification (lohnt sich oft mehr)
CV_FOLDS = 2

# Ausgabe-Container
report: Dict[str, Any] = {}
best_params: Dict[str, Any] = {}


## Helper Functions

In [154]:
def _identity(X):
    return X

def build_preprocessor_tree(X: pd.DataFrame):
    """Preprocessing für XGBoost/Tree-Modelle (ohne Scaling)."""
    numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
    categorical_cols = [c for c in X.columns if c not in numeric_cols]

    num_pipe = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
    cat_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    pre = ColumnTransformer(
        transformers=[("num", num_pipe, numeric_cols), ("cat", cat_pipe, categorical_cols)],
        remainder="drop",
        sparse_threshold=0.3
    )
    return pre, numeric_cols, categorical_cols

def build_preprocessor_linear(X: pd.DataFrame):
    """Preprocessing für lineare Modelle (mit Scaling)."""
    numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
    categorical_cols = [c for c in X.columns if c not in numeric_cols]

    num_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False))
    ])
    cat_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    pre = ColumnTransformer(
        transformers=[("num", num_pipe, numeric_cols), ("cat", cat_pipe, categorical_cols)],
        remainder="drop",
        sparse_threshold=0.3
    )
    return pre, numeric_cols, categorical_cols

def regression_report(y_true, y_pred) -> Dict[str, float]:
    mae = float(mean_absolute_error(y_true, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2 = float(r2_score(y_true, y_pred))
    return {"MAE": mae, "RMSE": rmse, "R2": r2}

def classification_report_binary(y_true, proba, threshold=0.5) -> Dict[str, Any]:
    pred = (proba >= threshold).astype(int)
    roc = float(roc_auc_score(y_true, proba)) if len(np.unique(y_true)) > 1 else float("nan")
    pr = float(average_precision_score(y_true, proba)) if len(np.unique(y_true)) > 1 else float("nan")
    f1 = float(f1_score(y_true, pred))
    cm = confusion_matrix(y_true, pred).tolist()
    return {"roc_auc": roc, "pr_auc": pr, "f1": f1, "confusion_matrix": cm}

def best_f1_threshold(y_true, proba, thresholds=np.linspace(0.05, 0.95, 19)):
    best_t, best_f1 = 0.5, -1
    for t in thresholds:
        pred = (proba >= t).astype(int)
        f1 = f1_score(y_true, pred)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return float(best_t), float(best_f1)

def sklearn_sanitize_df(X):
    if not isinstance(X, pd.DataFrame):
        return X

    X = X.copy()

    # Convert NaT -> np.nan
    X = X.replace({pd.NaT: np.nan})

    for c in X.columns:
        dt = X[c].dtype

        # pandas string or categorical -> object + np.nan
        if pd.api.types.is_string_dtype(dt) or isinstance(dt, pd.CategoricalDtype):
            X[c] = X[c].astype("object")
            X[c] = X[c].where(pd.notna(X[c]), np.nan)

        # pandas nullable boolean -> float (0/1/nan)
        elif str(dt) == "boolean":
            X[c] = X[c].astype("float64")

        # pandas nullable integer (Int64, Int32...) -> float (so missing -> np.nan)
        elif str(dt).startswith("Int"):
            X[c] = X[c].astype("float64")

        # object columns might still contain pd.NA -> replace with np.nan
        elif X[c].dtype == "object":
            X[c] = X[c].where(pd.notna(X[c]), np.nan)

    return X

sanitize_tf = FunctionTransformer(sklearn_sanitize_df, feature_names_out="one-to-one")

## Daten Laden

In [155]:
DATA_DIR = Path("../data/datasets")

def _load_parquet(p: Path):
    if not p.exists():
        raise FileNotFoundError(f"Datei nicht gefunden: {p}")
    return pd.read_parquet(p)

files = {
    "X_track_pop": DATA_DIR / "X_track_pop.parquet",
    "y_track_pop": DATA_DIR / "y_track_pop.parquet",
    "X_album_pop": DATA_DIR / "X_album_pop.parquet",
    "y_album_pop": DATA_DIR / "y_album_pop.parquet",
    "X_track_hit": DATA_DIR / "X_track_hit.parquet",
    "y_hit": DATA_DIR / "y_hit.parquet",
    "X_track_explicit": DATA_DIR / "X_track_explicit.parquet",
    "y_explicit": DATA_DIR / "y_explicit.parquet",
    "X_track_mood": DATA_DIR / "X_track_mood.parquet",
    "Y_mood": DATA_DIR / "Y_mood.parquet",
}

X_track_pop = _load_parquet(files["X_track_pop"])
y_track_pop = _load_parquet(files["y_track_pop"]).squeeze()

X_album_pop = _load_parquet(files["X_album_pop"])
y_album_pop = _load_parquet(files["y_album_pop"]).squeeze()

X_track_hit = _load_parquet(files["X_track_hit"])
y_hit = _load_parquet(files["y_hit"]).squeeze().astype(int)

X_track_explicit = _load_parquet(files["X_track_explicit"])
y_explicit = _load_parquet(files["y_explicit"]).squeeze().astype(int)

X_track_mood = _load_parquet(files["X_track_mood"])
y_mood = _load_parquet(files["Y_mood"])

In [156]:
print("X_track_pop:", X_track_mood.shape)
print("y_track_pop:", y_mood.shape)
print("X index unique:", X_track_pop.index.is_unique)
print("y index unique:", y_track_pop.index.is_unique)
print("Index equal:", X_track_pop.index.equals(y_track_pop.index))


X_track_pop: (294616, 63)
y_track_pop: (294616, 7)
X index unique: True
y index unique: True
Index equal: True


## Baseline-Modelle laden

In [157]:
baseline_paths = {
    "track_popularity": MODELS_BASELINE_DIR / "03_track_popularity_pipeline_xgb.joblib",
    "album_popularity": MODELS_BASELINE_DIR / "03_album_popularity_pipeline_xgb.joblib",
    "hit": MODELS_BASELINE_DIR / "03_hit_pipeline_xgb.joblib",
    "explicit": MODELS_BASELINE_DIR / "03_explicit_pipeline_xgb.joblib",
    "mood": MODELS_BASELINE_DIR / "03_mood_pipeline.joblib",
}

baseline_models = {}
for k, p in baseline_paths.items():
    if p.exists():
        baseline_models[k] = load(p)
    else:
        print(f"Baseline nicht gefunden (übersprungen): {p}")

list(baseline_models.keys())


['track_popularity', 'album_popularity', 'hit', 'explicit', 'mood']

## Gemeinsame Splits (Baseline & Hypertuned müssen gleich evaluieren)

In [158]:
splits = {}

# Regression Splits (kein stratify)
splits["track_pop"] = train_test_split(
    X_track_pop, y_track_pop, test_size=TEST_SIZE, random_state=RANDOM_SEED
)
splits["album_pop"] = train_test_split(
    X_album_pop, y_album_pop, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

# Classification Splits (stratifiziert)
splits["hit"] = train_test_split(
    X_track_hit, y_hit, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_hit
)
splits["explicit"] = train_test_split(
    X_track_explicit, y_explicit, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_explicit
)

# Mood optional
splits["mood"] = train_test_split(
        X_track_mood, y_mood, test_size=TEST_SIZE, random_state=RANDOM_SEED
)


## Tuning-Funktion

In [159]:
def tune_xgb_regression(Xtr, ytr, preprocessor):
    base = XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        random_state=RANDOM_SEED,
        n_jobs=-1
    )
    pipe = Pipeline(steps=[("sanitize", sanitize_tf), ("pre", preprocessor), ("model", base)])

    param_dist = {
        "model__n_estimators": randint(400, 2000),
        "model__learning_rate": uniform(0.02, 0.10),
        "model__max_depth": randint(3, 8),
        "model__min_child_weight": randint(1, 10),
        "model__subsample": uniform(0.7, 0.3),
        "model__colsample_bytree": uniform(0.7, 0.3),
        "model__reg_lambda": uniform(0.0, 2.0),
    }

    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=TUNE_ITER_REG,
        scoring="neg_mean_absolute_error",
        cv=CV_FOLDS,
        verbose=1,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        error_score="raise"
    )
    search.fit(Xtr, ytr)
    return search


def tune_xgb_classification(Xtr, ytr, preprocessor, scale_pos_weight: float):
    base = XGBClassifier(
        objective="binary:logistic",
        eval_metric="aucpr",
        tree_method="hist",
        scale_pos_weight=scale_pos_weight,
        random_state=RANDOM_SEED,
        n_jobs=-1
    )
    pipe = Pipeline(steps=[("sanitize", sanitize_tf), ("pre", preprocessor), ("model", base)])

    param_dist = {
        "model__n_estimators": randint(400, 2500),
        "model__learning_rate": uniform(0.02, 0.10),
        "model__max_depth": randint(3, 8),
        "model__min_child_weight": randint(1, 10),
        "model__subsample": uniform(0.7, 0.3),
        "model__colsample_bytree": uniform(0.7, 0.3),
        "model__reg_lambda": uniform(0.0, 2.0),
    }

    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=TUNE_ITER_CLS,
        scoring="average_precision",  # PR-AUC
        cv=CV_FOLDS,
        verbose=1,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        error_score="raise"
    )
    search.fit(Xtr, ytr)
    return search


### Track Popularity Tuning

In [160]:
Xtr, Xte, ytr, yte = splits["track_pop"]
pre, _, _ = build_preprocessor_tree(X_track_pop)

search = tune_xgb_regression(Xtr, ytr, pre)
best_model = search.best_estimator_

pred = best_model.predict(Xte)
metrics = regression_report(yte, pred)

report["track_popularity"] = {
    "hypertuned": metrics,
    "cv_best_mae": float(-search.best_score_)
}
best_params["track_popularity"] = search.best_params_

dump(best_model, MODELS_HYPER_DIR / "04_track_popularity_xgb_hypertuned.joblib")

report["track_popularity"]

Fitting 2 folds for each of 10 candidates, totalling 20 fits


{'hypertuned': {'MAE': 11.437227110891024,
  'RMSE': 15.050962913709757,
  'R2': 0.5285628189200127},
 'cv_best_mae': 11.699140529672468}

### Album Popularity Tuning

In [161]:
Xtr, Xte, ytr, yte = splits["album_pop"]
pre, _, _ = build_preprocessor_tree(X_album_pop)

search = tune_xgb_regression(Xtr, ytr, pre)
best_model = search.best_estimator_

pred = best_model.predict(Xte)
metrics = regression_report(yte, pred)

report["album_popularity"] = {
    "hypertuned": metrics,
    "cv_best_mae": float(-search.best_score_)
}
best_params["album_popularity"] = search.best_params_

dump(best_model, MODELS_HYPER_DIR / "04_album_popularity_xgb_hypertuned.joblib")

report["album_popularity"]


Fitting 2 folds for each of 10 candidates, totalling 20 fits


{'hypertuned': {'MAE': 12.36629857755661,
  'RMSE': 15.601218692622076,
  'R2': 0.48225220756184717},
 'cv_best_mae': 12.581619596807998}

### Hit Tuning + Threshhold


In [162]:
Xtr, Xte, ytr, yte = splits["hit"]
pre, _, _ = build_preprocessor_tree(X_track_hit)

neg = int((ytr == 0).sum())
pos = int((ytr == 1).sum())
spw = neg / max(pos, 1)

search = tune_xgb_classification(Xtr, ytr, pre, scale_pos_weight=spw)
best_model = search.best_estimator_

proba = best_model.predict_proba(Xte)[:, 1]
thr, thr_f1 = best_f1_threshold(yte, proba)

metrics = classification_report_binary(yte, proba, threshold=thr)
metrics["best_threshold"] = thr
metrics["best_threshold_f1"] = thr_f1

report["hit_prediction"] = {
    "hypertuned": metrics,
    "cv_best_pr_auc": float(search.best_score_),
    "scale_pos_weight": float(spw)
}
best_params["hit_prediction"] = search.best_params_

dump(best_model, MODELS_HYPER_DIR / "04_hit_xgb_hypertuned.joblib")

report["hit_prediction"]


Fitting 2 folds for each of 12 candidates, totalling 24 fits


{'hypertuned': {'roc_auc': 0.8450792401152873,
  'pr_auc': 0.5514634510191063,
  'f1': 0.5334410629320406,
  'confusion_matrix': [[42588, 6764], [3630, 5942]],
  'best_threshold': 0.6,
  'best_threshold_f1': 0.5334410629320406},
 'cv_best_pr_auc': 0.5270666904307535,
 'scale_pos_weight': 5.1556582830577975}

### Explicit Tuning + Threshold

In [163]:
Xtr, Xte, ytr, yte = splits["explicit"]
pre, _, _ = build_preprocessor_tree(X_track_explicit)

neg = int((ytr == 0).sum())
pos = int((ytr == 1).sum())
spw = neg / max(pos, 1)

search = tune_xgb_classification(Xtr, ytr, pre, scale_pos_weight=spw)
best_model = search.best_estimator_

proba = best_model.predict_proba(Xte)[:, 1]
thr, thr_f1 = best_f1_threshold(yte, proba)

metrics = classification_report_binary(yte, proba, threshold=thr)
metrics["best_threshold"] = thr
metrics["best_threshold_f1"] = thr_f1

report["explicit_prediction"] = {
    "hypertuned": metrics,
    "cv_best_pr_auc": float(search.best_score_),
    "scale_pos_weight": float(spw)
}
best_params["explicit_prediction"] = search.best_params_

dump(best_model, MODELS_HYPER_DIR / "04_explicit_xgb_hypertuned.joblib")

report["explicit_prediction"]


Fitting 2 folds for each of 12 candidates, totalling 24 fits


{'hypertuned': {'roc_auc': 0.947179303017534,
  'pr_auc': 0.8777478483013232,
  'f1': 0.7985542018508709,
  'confusion_matrix': [[39645, 3344], [3121, 12814]],
  'best_threshold': 0.65,
  'best_threshold_f1': 0.7985542018508709},
 'cv_best_pr_auc': 0.8714634762957656,
 'scale_pos_weight': 2.6978568514857697}

### Mood Tuning

In [None]:
Xtr, Xte, Ytr, Yte = splits["mood"]
pre, _, _ = build_preprocessor_linear(X_track_mood)

base = OneVsRestClassifier(
        SGDClassifier(loss="log_loss", random_state=RANDOM_SEED, n_jobs=-1)
    )

pipe = Pipeline(steps=[("sanitize", sanitize_tf), ("pre", pre), ("model", base)])

# kleiner Suchraum, sonst zu teuer
param_dist = {
        "model__estimator__alpha": loguniform(1e-6, 1e-2),
        "model__estimator__penalty": ["l2", "l1", "elasticnet"],
        "model__estimator__l1_ratio": uniform(0.0, 1.0),
        "model__estimator__max_iter": randint(1500, 5000),
    }

search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_iter=min(20, TUNE_ITER_REG),
        scoring="f1_micro",
        cv=3,
        verbose=1,
        random_state=RANDOM_SEED,
        n_jobs=-1
    )

search.fit(Xtr, Ytr)
best_model = search.best_estimator_
Ypred = best_model.predict(Xte)
micro_f1 = float(f1_score(Yte, Ypred, average="micro"))
macro_f1 = float(f1_score(Yte, Ypred, average="macro"))
report["mood_multilabel"] = {
        "hypertuned": {"micro_f1": micro_f1, "macro_f1": macro_f1},
        "cv_best_f1_micro": float(search.best_score_)
    }
best_params["mood_multilabel"] = search.best_params_

dump(best_model, MODELS_HYPER_DIR / "04_mood_multilabel_sgd_hypertuned.joblib")

Fitting 3 folds for each of 10 candidates, totalling 30 fits


## Reports Saving

In [None]:
with open(REPORT_PATH, "w", encoding="utf-8") as f:
    json.dump(report, f, ensure_ascii=False, indent=2)

with open(BEST_PARAMS_PATH, "w", encoding="utf-8") as f:
    json.dump(best_params, f, ensure_ascii=False, indent=2)

print("Hypertuning fertig. Gespeichert unter:")
print(" -", MODELS_HYPER_DIR)
print(" -", REPORT_PATH)
print(" -", BEST_PARAMS_PATH)

report
