In [1]:
%pwd

'/itf-fi-ml/home/arunps/Projects/speaker-type-classifier/notebooks'

In [2]:
import os

# Move from notebooks/ up to the project root so relative paths resolve from there.
# Guarded on the directory name so that re-running this cell is a no-op instead of
# climbing one more level on every execution.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [3]:
%pwd

'/itf-fi-ml/home/arunps/Projects/speaker-type-classifier'

In [5]:
import os
from pathlib import Path
import time
import json
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_recall_fscore_support,
)

from xgboost import XGBClassifier


In [6]:
# Identifier of the feature-extraction run whose cached arrays are loaded below.
RUN_ID = "run_20260203_161558"
# NOTE(review): absolute, user-specific scratch path — the notebook only runs where
# this directory exists; consider making it configurable.
FEATURE_ROOT = Path("/scratch/users/arunps/speaker-type-classifier/artifacts_feature_store/v1")
# Directory holding the eGeMAPS arrays for RUN_ID.
EGEMAPS_DIR = FEATURE_ROOT / "egemaps" / RUN_ID

# Integer class id -> human-readable class name.
ID2LABEL = {0: "adult_male", 1: "adult_female", 2: "child", 3: "background"}
# Class names ordered by ascending class id.
LABELS = [ID2LABEL[i] for i in sorted(ID2LABEL)]
# Class ids in ascending order; passed as `labels=` when computing metrics.
LABEL_IDS = list(sorted(ID2LABEL.keys()))

def load_pack(feature_dir: Path):
    """Load the cached train/validation arrays stored in ``feature_dir``.

    Reads ``train_X.npy``, ``train_y.npy``, ``val_X.npy`` and ``val_y.npy``
    (in that order) with ``np.load``.

    Returns:
        Tuple ``(Xtr, ytr, Xva, yva)`` of the loaded arrays.
    """
    file_names = ("train_X.npy", "train_y.npy", "val_X.npy", "val_y.npy")
    train_X, train_y, val_X, val_y = (
        np.load(feature_dir / file_name) for file_name in file_names
    )
    return train_X, train_y, val_X, val_y

# Load the eGeMAPS train/validation arrays once; later cells reuse these names.
Xtr_e, ytr, Xva_e, yva = load_pack(EGEMAPS_DIR)

# Sanity check: array shapes and the set of class ids present in the training labels.
print("eGeMAPS:", Xtr_e.shape, Xva_e.shape, "labels:", np.unique(ytr))


eGeMAPS: (54068, 88) (13517, 88) labels: [0 1 2 3]


In [7]:
def check_gpu():
    """Print whether PyTorch can see a CUDA device and, if so, the name of device 0.

    This is a best-effort diagnostic: any exception (including torch not being
    importable) is reported on stdout instead of being raised.
    """
    try:
        import torch

        cuda_ok = torch.cuda.is_available()
        print("torch.cuda.is_available():", cuda_ok)
        if not cuda_ok:
            return
        print("GPU:", torch.cuda.get_device_name(0))
    except Exception as err:
        print("Torch check skipped:", err)

# Report GPU visibility from PyTorch's point of view.
check_gpu()
# Prints None when the variable is unset, i.e. no explicit device mask is applied.
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))


torch.cuda.is_available(): True
GPU: NVIDIA GeForce RTX 2080 Ti
CUDA_VISIBLE_DEVICES: None


In [8]:
def compute_metrics(y_true, y_pred, labels, label_names):
    """Summarise classification quality overall and per class.

    Args:
        y_true: Ground-truth class ids.
        y_pred: Predicted class ids.
        labels: Class ids to report, in the order they should appear.
        label_names: Display names aligned with ``labels``.

    Returns:
        Dict with ``accuracy``, ``uar_macro_recall`` (unweighted mean of the
        per-class recalls), ``macro_f1`` (unweighted mean of the per-class F1
        scores) as Python floats, and ``per_class``: a DataFrame with one row
        per label holding support, precision, recall and F1.
    """
    overall_accuracy = accuracy_score(y_true, y_pred)

    # zero_division=0: a class that is never predicted scores 0 instead of warning.
    precision, recall, f_score, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, zero_division=0
    )

    per_class_table = pd.DataFrame(
        {
            "label_id": labels,
            "label": label_names,
            "support": support,
            "precision": precision,
            "recall": recall,
            "f1": f_score,
        }
    )

    summary = {
        "accuracy": float(overall_accuracy),
        "uar_macro_recall": float(np.mean(recall)),
        "macro_f1": float(np.mean(f_score)),
        "per_class": per_class_table,
    }
    return summary


In [11]:
# Experiment-tracking setup: optionally route MLflow through DagsHub, select the
# experiment, then log a throwaway run to confirm that logging works end to end.
import mlflow

# Name of the MLflow experiment all runs in this notebook are logged under.
EXPERIMENT_NAME = "xgb-egemaps-hparam-tuning"
# Set to False to skip the DagsHub integration and use MLflow's current tracking setup.
USE_DAGSHUB_INIT = True

if USE_DAGSHUB_INIT:
    try:
        # Imported lazily so a missing dagshub package only disables the integration.
        import dagshub

        DAGSHUB_OWNER = "arunps12"   
        DAGSHUB_REPO  = "speaker-type-classifier"

        # NOTE(review): mlflow=True presumably points MLflow at the repo's DagsHub
        # tracking server (the recorded tracking URI is a dagshub.com URL) — confirm.
        dagshub.init(
            repo_owner=DAGSHUB_OWNER,
            repo_name=DAGSHUB_REPO,
            mlflow=True
        )

        print("DagsHub MLflow initialized.")

    except Exception as e:
        # Best-effort: report the failure and continue with whatever tracking
        # configuration MLflow already has.
        print("dagshub.init failed, falling back to plain MLflow.\n", e)

# Select the experiment; the recorded output shows MLflow creating it when it is missing.
mlflow.set_experiment(EXPERIMENT_NAME)

print("MLflow tracking URI:", mlflow.get_tracking_uri())
print("Experiment:", EXPERIMENT_NAME)


# Smoke test: one tiny run with a single param and metric, so connectivity or
# credential problems surface here rather than in the middle of the search.
with mlflow.start_run(run_name="smoke_test"):
    mlflow.log_param("smoke_test", "ok")
    mlflow.log_metric("ping", 1.0)

print("MLflow smoke test logged successfully.")


2026/02/04 15:11:41 INFO mlflow.tracking.fluent: Experiment with name 'xgb-egemaps-hparam-tuning' does not exist. Creating a new experiment.


DagsHub MLflow initialized.
MLflow tracking URI: https://dagshub.com/arunps12/speaker-type-classifier.mlflow
Experiment: xgb-egemaps-hparam-tuning
MLflow smoke test logged successfully.


In [12]:
def _fit_timed(cfg, Xtr, ytr):
    """Build an XGBClassifier from ``cfg``, fit it, and return ``(clf, fit_seconds)``."""
    clf = XGBClassifier(**cfg)
    # perf_counter is monotonic, so the duration cannot be skewed by clock adjustments.
    start = time.perf_counter()
    clf.fit(Xtr, ytr, verbose=False)
    return clf, time.perf_counter() - start


def train_eval_once(Xtr, ytr, Xva, yva, params, seed=42, use_gpu=True, n_jobs=16):
    """Train one XGBoost classifier and score it on the validation split.

    Args:
        Xtr, ytr: Training features and class ids.
        Xva, yva: Validation features and class ids.
        params: Hyper-parameters layered on top of the fixed base configuration
            (multi-class soft-prob objective, mlogloss, ``hist`` tree method).
        seed: Value used for the classifier's ``random_state``.
        use_gpu: Train with ``device="cuda"``; if that attempt raises, the error is
            printed and training is retried once on the CPU.
        n_jobs: Thread count applied whenever training runs on the CPU.

    Returns:
        Tuple ``(clf, metrics, train_time, mode)`` where ``metrics`` is the dict from
        ``compute_metrics``, ``train_time`` is the fit duration in seconds, and
        ``mode`` is ``"cuda"``, ``"cpu"`` or ``"cpu_fallback"``.

    Raises:
        Whatever the fit raises when ``use_gpu`` is False, or when the CPU retry
        fails as well.
    """
    cfg = dict(
        objective="multi:softprob",
        # Derived from the label set so it cannot drift from the ids used for metrics.
        num_class=len(LABEL_IDS),
        eval_metric="mlogloss",
        random_state=seed,
        tree_method="hist",
    )
    cfg.update(params)

    cpu_settings = dict(device="cpu", n_jobs=n_jobs)
    if use_gpu:
        cfg.update(device="cuda")
    else:
        cfg.update(cpu_settings)

    try:
        clf, train_time = _fit_timed(cfg, Xtr, ytr)
        mode = "cuda" if use_gpu else "cpu"
    except Exception as exc:
        if not use_gpu:
            raise
        # Surface the reason for the fallback instead of hiding it; a failure that is
        # not GPU-related will raise again from the CPU retry below.
        print(f"GPU training failed ({type(exc).__name__}: {exc}); retrying on CPU.")
        cfg.update(cpu_settings)
        clf, train_time = _fit_timed(cfg, Xtr, ytr)
        mode = "cpu_fallback"

    # NOTE(review): the recorded output contains an XGBoost hint about mismatched
    # devices at predict time, presumably because the model is on CUDA while Xva is a
    # host array — confirm whether moving the input or the booster is worthwhile.
    y_pred = clf.predict(Xva)
    metrics = compute_metrics(
        yva, y_pred,
        labels=LABEL_IDS,
        label_names=[ID2LABEL[i] for i in LABEL_IDS]
    )
    return clf, metrics, train_time, mode


# Hand-picked reference configuration; its scores are the bar the random search
# further down has to beat.
baseline_params = dict(
    n_estimators=1200,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
)

# Note: t0 here is the fit duration in seconds returned by train_eval_once, not a
# start timestamp.
clf0, m0, t0, mode0 = train_eval_once(Xtr_e, ytr, Xva_e, yva, baseline_params, use_gpu=True)
print("Baseline mode:", mode0, "time:", round(t0,2))
print("Baseline acc:", m0["accuracy"], "UAR:", m0["uar_macro_recall"], "MacroF1:", m0["macro_f1"])
# Per-class support / precision / recall / F1 table (display is provided by IPython).
display(m0["per_class"])


Baseline mode: cuda time: 26.4
Baseline acc: 0.944366353480802 UAR: 0.9367319402500172 MacroF1: 0.9364579424677455


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


Unnamed: 0,label_id,label,support,precision,recall,f1
0,0,adult_male,4000,0.979016,0.97975,0.979383
1,1,adult_female,4000,0.98512,0.9765,0.980791
2,2,child,2677,0.893124,0.89279,0.892957
3,3,background,2840,0.887574,0.897887,0.892701


In [13]:
def sample_params(rng: random.Random):
    """Draw one hyper-parameter configuration from the discrete search grid.

    Each parameter is picked independently with ``rng.choice`` in the order the
    grid lists them, so a given generator state always yields the same
    configuration.

    Args:
        rng: Random generator used for every draw.

    Returns:
        Dict mapping XGBoost parameter name to the sampled value.
    """
    search_grid = {
        "n_estimators": [400, 600, 800, 1000, 1400, 1800],
        "learning_rate": [0.02, 0.03, 0.05, 0.07, 0.1],
        "max_depth": [4, 6, 8, 10, 12],
        "min_child_weight": [1, 2, 4, 6, 8],
        "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
        "gamma": [0.0, 0.5, 1.0, 2.0, 5.0],
        "reg_lambda": [0.5, 1.0, 2.0, 5.0, 10.0],
        "reg_alpha": [0.0, 0.1, 0.5, 1.0, 2.0],
    }
    # Dicts preserve insertion order, so the draws happen in grid order.
    return {name: rng.choice(candidates) for name, candidates in search_grid.items()}


In [14]:
def log_dict_flat(prefix, d):
    """Return a copy of ``d`` whose keys are rewritten as ``"<prefix>.<key>"``.

    Values are carried over unchanged; nested dicts are not expanded.
    """
    return {f"{prefix}.{key}": value for key, value in d.items()}

def run_trial(trial_id, Xtr, ytr, Xva, yva, params, seed=42, use_gpu=True):
    """Train and evaluate one configuration inside its own MLflow run.

    Logs the hyper-parameters, run metadata, aggregate and per-class validation
    metrics, and a JSON summary artifact named ``trial_<id>_summary.json``.

    Args:
        trial_id: Integer index of the trial; used in the run and artifact names.
        Xtr, ytr: Training features and class ids.
        Xva, yva: Validation features and class ids.
        params: Hyper-parameters forwarded to ``train_eval_once``.
        seed: Random seed forwarded to ``train_eval_once``.
        use_gpu: Forwarded to ``train_eval_once``.

    Returns:
        Tuple ``(clf, metrics, train_time, mode)`` exactly as returned by
        ``train_eval_once``.
    """
    with mlflow.start_run(run_name=f"trial_{trial_id:04d}"):
        # One batched call for the hyper-parameters plus run metadata.
        mlflow.log_params(
            {**params, "seed": seed, "feature_set": "egemaps", "run_id": RUN_ID}
        )

        clf, metrics, train_time, mode = train_eval_once(
            Xtr, ytr, Xva, yva, params, seed=seed, use_gpu=use_gpu
        )
        # Only known after training: "cuda", "cpu" or "cpu_fallback".
        mlflow.log_param("device_mode", mode)

        # Collect every metric first and send them in a single request; logging them
        # one by one costs a round-trip each against a remote tracking server.
        logged_metrics = {
            "train_time_sec": train_time,
            "val_accuracy": metrics["accuracy"],
            "val_uar_macro_recall": metrics["uar_macro_recall"],
            "val_macro_f1": metrics["macro_f1"],
        }
        for record in metrics["per_class"].to_dict("records"):
            lab = record["label"]
            logged_metrics[f"val_f1_{lab}"] = float(record["f1"])
            logged_metrics[f"val_recall_{lab}"] = float(record["recall"])
            logged_metrics[f"val_precision_{lab}"] = float(record["precision"])
        mlflow.log_metrics(logged_metrics)

        # Summary artifact (everything except the per-class DataFrame). log_dict
        # serialises straight into the run, so no trial_*.json file is left behind in
        # the working directory.
        summary = {
            "trial_id": trial_id,
            "mode": mode,
            "train_time_sec": train_time,
            "metrics": {k: v for k, v in metrics.items() if k != "per_class"},
            "params": params,
        }
        mlflow.log_dict(summary, f"trial_{trial_id:04d}_summary.json")

        # log model (can be big, but handy)
        #mlflow.xgboost.log_model(clf, artifact_path="model")

        return clf, metrics, train_time, mode


In [15]:
# Random search: draw N_TRIALS configurations, train/evaluate each one in its own
# MLflow run, and track the best configuration by validation macro-F1.
N_TRIALS = 40          
SEED = 42
USE_GPU = True

# Dedicated generator so the sequence of sampled configurations depends only on SEED.
rng = random.Random(SEED)

results = []
# Sentinel score of -1 sits below any achievable macro-F1, so the first trial always wins.
best = {"score": -1, "trial_id": None, "params": None}

for i in range(N_TRIALS):
    params = sample_params(rng)

    # avoid too many huge trees
    # if params["max_depth"] >= 12 and params["n_estimators"] >= 1800 and params["learning_rate"] <= 0.03:
    #     continue

    # Every trial trains with the same model seed; only the hyper-parameters vary.
    clf, metrics, train_time, mode = run_trial(
        trial_id=i,
        Xtr=Xtr_e, ytr=ytr,
        Xva=Xva_e, yva=yva,
        params=params,
        seed=SEED,
        use_gpu=USE_GPU
    )

    score = metrics["macro_f1"]  # choose your main objective here (macro_f1 or uar)
    # One flat record per trial: scores, timing, device mode and the sampled parameters.
    row = {
        "trial_id": i,
        "score_macro_f1": score,
        "val_accuracy": metrics["accuracy"],
        "val_uar": metrics["uar_macro_recall"],
        "train_time_sec": train_time,
        "mode": mode,
        **params
    }
    results.append(row)

    # Strict comparison: on a tie the earlier trial stays the best.
    if score > best["score"]:
        best = {"score": score, "trial_id": i, "params": params}
        print(f"New best @ trial {i}: macro_f1={score:.6f}  (acc={metrics['accuracy']:.6f}, uar={metrics['uar_macro_recall']:.6f})")

# Build the table once from the collected records and rank by macro-F1, best first.
leaderboard = pd.DataFrame(results).sort_values("score_macro_f1", ascending=False).reset_index(drop=True)
display(leaderboard.head(15))
print("\nBEST:", best)


New best @ trial 0: macro_f1=0.934682  (acc=0.942591, uar=0.934926)
New best @ trial 6: macro_f1=0.935737  (acc=0.943479, uar=0.936045)
New best @ trial 8: macro_f1=0.936856  (acc=0.944662, uar=0.937159)
New best @ trial 32: macro_f1=0.936961  (acc=0.944736, uar=0.937228)


Unnamed: 0,trial_id,score_macro_f1,val_accuracy,val_uar,train_time_sec,mode,n_estimators,learning_rate,max_depth,min_child_weight,subsample,colsample_bytree,gamma,reg_lambda,reg_alpha
0,32,0.936961,0.944736,0.937228,7.528541,cuda,400,0.1,10,8,1.0,0.7,0.0,10.0,0.0
1,8,0.936856,0.944662,0.937159,9.335789,cuda,1800,0.05,4,8,0.7,1.0,0.5,1.0,1.0
2,23,0.936101,0.94407,0.936411,20.186373,cuda,1800,0.02,6,1,0.6,0.8,0.0,10.0,0.1
3,29,0.935981,0.94407,0.936241,12.231599,cuda,600,0.07,10,6,0.7,0.9,0.0,1.0,1.0
4,6,0.935737,0.943479,0.936045,4.300998,cuda,800,0.1,6,1,0.6,0.7,1.0,0.5,0.1
5,28,0.93558,0.943627,0.935845,15.283491,cuda,400,0.07,12,1,0.6,1.0,0.0,0.5,0.1
6,37,0.935386,0.943553,0.935643,32.809754,cuda,1400,0.03,12,4,0.7,0.8,0.0,1.0,0.5
7,21,0.934994,0.943035,0.935278,9.066404,cuda,1000,0.05,10,8,0.9,0.6,0.5,1.0,0.0
8,0,0.934682,0.942591,0.934926,10.182497,cuda,1800,0.02,4,4,0.7,0.7,0.5,0.5,2.0
9,22,0.934662,0.942739,0.934961,19.90115,cuda,800,0.02,12,8,0.7,1.0,0.5,0.5,0.0



BEST: {'score': 0.9369613445831566, 'trial_id': 32, 'params': {'n_estimators': 400, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 8, 'subsample': 1.0, 'colsample_bytree': 0.7, 'gamma': 0.0, 'reg_lambda': 10.0, 'reg_alpha': 0.0}}
