# Notes

- Semua nama kolom kini snake_case (mis. `user_id`, `stress_level`, `created_at`, `study_hour_per_day`).
- Kolom `is_restored` adalah metadata input/restore dan **tidak** dipakai sebagai fitur model.


In [2]:
# =====================================================================================
# PERSONALIZED_FORECAST (Binary, from stress_level_pred) - 1 CELL
# - Train a separate model per user (personalized)
# - Compare:
#   1) Markov USER(prev_high, dow, user)
#   2) Per-user ML: LogReg / DecisionTree / RandomForest / ExtraTrees / HistGB
# - Threshold tuning via pooled time-CV windows (stable)
# - TEST = last TEST_LEN days per user (time-based)
# - Save best to ../models/personalized_forecast_best.joblib
# =====================================================================================

import numpy as np
import pandas as pd
from pathlib import Path
import joblib

from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import ParameterGrid

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# =========================
# 0) CONFIG
# =========================
CANDIDATE_PATHS = [
    Path("../datasets/global_dataset_pred.csv"),
]
DATA_PATH = next((p for p in CANDIDATE_PATHS if p.exists()), None)
if DATA_PATH is None:
    raise FileNotFoundError("global_dataset_pred.csv tidak ditemukan. Cek path DATA_PATH.")

MODEL_OUT = Path("../models/personalized_forecast.joblib")

DATE_COL   = "date"
USER_COL   = "user_id"
TARGET_COL = "stress_level_pred"

WINDOW = 7
TEST_LEN = 12

# CV windows di dalam train_pool tiap user (index relatif)
VAL_WINDOWS = [(8, 20), (12, 24), (16, 28)]
THRESHOLDS = np.linspace(0.05, 0.95, 19)

RANDOM_STATE = 42

# jika True: fitur user_id tidak dipakai (karena sudah per-user model)
# biarkan False/True sama saja; per-user model biasanya tidak butuh user_id
USE_USER_ID_FEATURE = False

# =========================
# Helpers
# =========================
def eval_bin(y_true, y_pred):
    return {
        "acc": float(accuracy_score(y_true, y_pred)),
        "f1":  float(f1_score(y_true, y_pred, zero_division=0)),
    }

def tune_thr_from_proba(y_true, p_high):
    best_thr, best_f1 = None, -1
    for thr in THRESHOLDS:
        pred = (p_high >= thr).astype(int)
        f1 = float(f1_score(y_true, pred, zero_division=0))
        if f1 > best_f1:
            best_f1, best_thr = f1, thr
    return float(best_thr), float(best_f1)

# =========================
# 1) LOAD + FEATURE ENGINEERING (no leak)
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.sort_values([USER_COL, DATE_COL]).reset_index(drop=True)

rows = []
for uid, g in df.groupby(USER_COL):
    g = g.sort_values(DATE_COL).reset_index(drop=True)

    g["dow"] = g[DATE_COL].dt.dayofweek.astype(int)
    g["is_weekend"] = (g["dow"] >= 5).astype(int)

    # stress lags
    for k in range(1, WINDOW + 1):
        g[f"lag_sp_{k}"] = g[TARGET_COL].shift(k)

    sp_shift = g[TARGET_COL].shift(1)

    # rolling stats ending at t-1
    g["sp_mean_7"] = sp_shift.rolling(WINDOW).mean()
    g["sp_std_7"]  = sp_shift.rolling(WINDOW).std()
    g["sp_min_7"]  = sp_shift.rolling(WINDOW).min()
    g["sp_max_7"]  = sp_shift.rolling(WINDOW).max()

    g["count_high_7"] = (sp_shift >= 1).rolling(WINDOW).sum()
    g["count_low_7"]  = (sp_shift == 0).rolling(WINDOW).sum()

    # streak_high up to t-1
    high = (sp_shift >= 1).astype(int).fillna(0).astype(int).tolist()
    streak, cur = [], 0
    for v in high:
        cur = cur + 1 if v == 1 else 0
        streak.append(cur)
    g["streak_high"] = streak

    # transitions_7 ending at t-1
    diff = (sp_shift != sp_shift.shift(1)).astype(int)
    g["transitions_7"] = diff.rolling(WINDOW).sum()

    rows.append(g)

feat = pd.concat(rows, ignore_index=True)
feat["y_bin"] = (feat[TARGET_COL] >= 1).astype(int)

feature_cols = []
if USE_USER_ID_FEATURE:
    feature_cols.append(USER_COL)
feature_cols += ["dow", "is_weekend"] + [f"lag_sp_{k}" for k in range(1, WINDOW + 1)] + [
    "sp_mean_7", "sp_std_7", "sp_min_7", "sp_max_7",
    "count_high_7", "count_low_7",
    "streak_high", "transitions_7"
]

feat = feat.dropna(subset=feature_cols + ["y_bin"]).reset_index(drop=True)

print("=== DATASET ===")
print("Path:", DATA_PATH)
print("Rows:", len(feat), "| Users:", feat[USER_COL].nunique())
print("Binary dist:", feat["y_bin"].value_counts().to_dict())

# =========================
# 2) SPLIT: time-based per user (TEST = last TEST_LEN)
# =========================
users = sorted(feat[USER_COL].unique().tolist())

per_user = {}
for uid in users:
    g = feat[feat[USER_COL] == uid].sort_values(DATE_COL).reset_index(drop=True)
    n = len(g)
    test_start = n - TEST_LEN
    if test_start <= 20:
        raise ValueError(f"User {uid}: data terlalu sedikit untuk split + CV windows.")
    train_pool = g.iloc[:test_start].copy()
    test_block = g.iloc[test_start:].copy()
    per_user[uid] = {"train_pool": train_pool, "test": test_block}

print("\n=== SPLIT ===")
print("Users:", users, "| TEST_LEN:", TEST_LEN)
print("Total TrainPool:", sum(len(per_user[u]["train_pool"]) for u in users), "| Total Test:", sum(len(per_user[u]["test"]) for u in users))

# CV splits per user (index relatif)
def cv_splits_user(train_pool_df):
    splits = []
    for (v0, v1) in VAL_WINDOWS:
        if len(train_pool_df) < v1:
            continue
        tr = train_pool_df.iloc[:v0]
        va = train_pool_df.iloc[v0:v1]
        splits.append((tr, va))
    return splits

# =========================
# 3) MARKOV USER
# =========================
def train_markov_one_user(df_train):
    # probs[prev(2), dow(7), y(2)]
    counts = np.zeros((2, 7, 2), dtype=int)
    prev = (df_train["lag_sp_1"] >= 1).astype(int).values
    dow  = df_train["dow"].astype(int).values
    yb   = df_train["y_bin"].astype(int).values
    for p, d, y in zip(prev, dow, yb):
        counts[p, d, y] += 1
    probs = (counts + 1) / (counts.sum(axis=2, keepdims=True) + 2)  # Laplace
    return probs

def predict_markov_one_user_proba(probs, df_eval):
    prev = (df_eval["lag_sp_1"] >= 1).astype(int).values
    dow  = df_eval["dow"].astype(int).values
    p_high = np.array([probs[p, d, 1] for p, d in zip(prev, dow)])
    return p_high

# tune threshold pooled across users & folds
p_true_all, p_high_all = [], []
for uid in users:
    tp = per_user[uid]["train_pool"]
    for tr_df, va_df in cv_splits_user(tp):
        probs = train_markov_one_user(tr_df)
        p = predict_markov_one_user_proba(probs, va_df)
        p_true_all.append(va_df["y_bin"].values)
        p_high_all.append(p)

p_true_all = np.concatenate(p_true_all)
p_high_all = np.concatenate(p_high_all)
thr_markov, cv_f1_markov = tune_thr_from_proba(p_true_all, p_high_all)

# train final markov per user, test
markov_models = {}
test_preds_all, test_true_all = [], []
per_user_test_f1 = []

for uid in users:
    tp = per_user[uid]["train_pool"]
    te = per_user[uid]["test"]

    probs = train_markov_one_user(tp)
    markov_models[uid] = probs

    p_te = predict_markov_one_user_proba(probs, te)
    pred_te = (p_te >= thr_markov).astype(int)

    test_true_all.append(te["y_bin"].values)
    test_preds_all.append(pred_te)

    per_user_test_f1.append(f1_score(te["y_bin"], pred_te, zero_division=0))

test_true_all = np.concatenate(test_true_all)
test_preds_all = np.concatenate(test_preds_all)

markov_test = eval_bin(test_true_all, test_preds_all)
markov_test_macro = float(np.mean(per_user_test_f1))

print("\n=== MARKOV USER (PERSONALIZED) ===")
print("Best thr (pooled CV):", thr_markov, "| CV pooled F1:", round(cv_f1_markov, 4))
print("TEST pooled:", markov_test, "| TEST macro(user-avg) f1:", round(markov_test_macro, 4))

# =========================
# 4) PERSONALIZED ML: per user model, tuned fairly (pooled CV)
# =========================
# preprocess for per-user (user_id not needed)
cat_cols = ["dow", "is_weekend"]
if USE_USER_ID_FEATURE:
    cat_cols = [USER_COL] + cat_cols

num_cols = [c for c in feature_cols if c not in cat_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
    ],
    remainder="drop"
)

CANDIDATES = {
    "LogReg": (
        LogisticRegression(max_iter=5000, class_weight="balanced", random_state=RANDOM_STATE),
        {"clf__C": [0.1, 0.3, 1.0, 3.0], "clf__solver": ["liblinear"]}
    ),
    "DecisionTree": (
        DecisionTreeClassifier(class_weight="balanced", random_state=RANDOM_STATE),
        {"clf__max_depth": [2, 3, 4, None], "clf__min_samples_leaf": [1, 2, 4]}
    ),
    "RandomForest": (
        RandomForestClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=1),
        {"clf__n_estimators": [200, 400], "clf__max_depth": [None, 6, 10], "clf__min_samples_leaf": [1, 2], "clf__max_features": ["sqrt"]}
    ),
    "ExtraTrees": (
        ExtraTreesClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=1),
        {"clf__n_estimators": [200, 400, 800], "clf__max_depth": [None, 6, 10], "clf__min_samples_leaf": [1, 2], "clf__max_features": ["sqrt"]}
    ),
    "HistGB": (
        HistGradientBoostingClassifier(random_state=RANDOM_STATE),
        {"clf__learning_rate": [0.05, 0.1], "clf__max_depth": [2, 3], "clf__max_leaf_nodes": [15, 31, 63]}
    ),
}

def pooled_cv_best_params_and_thr_for_personalized(model_name, clf, grid):
    """
    For each param:
      - For each user:
        - For each CV fold (time window):
          train on user-fold train, predict proba on user-fold val
      - Pool all user+fold probs, tune threshold, get pooled CV F1
    """
    best = None

    for params in ParameterGrid(grid):
        p_true_all, p_high_all = [], []

        for uid in users:
            tp = per_user[uid]["train_pool"]
            folds = cv_splits_user(tp)
            if len(folds) == 0:
                continue

            for tr_df, va_df in folds:
                pipe = Pipeline([("prep", preprocess), ("clf", clf)])
                pipe.set_params(**params)
                pipe.fit(tr_df[feature_cols], tr_df["y_bin"])
                p = pipe.predict_proba(va_df[feature_cols])[:, 1]

                p_true_all.append(va_df["y_bin"].values)
                p_high_all.append(p)

        y_all = np.concatenate(p_true_all)
        p_all = np.concatenate(p_high_all)
        thr, cv_f1 = tune_thr_from_proba(y_all, p_all)

        if (best is None) or (cv_f1 > best["cv_f1"]):
            best = {"params": params, "thr": thr, "cv_f1": float(cv_f1)}

    return best

ml_rows = []

print("\n=== PERSONALIZED ML (pooled CV tuning) ===")
for name, (clf, grid) in CANDIDATES.items():
    best = pooled_cv_best_params_and_thr_for_personalized(name, clf, grid)

    # train final per-user models on full train_pool
    per_user_models = {}
    per_user_f1 = []
    pooled_true, pooled_pred = [], []

    for uid in users:
        tp = per_user[uid]["train_pool"]
        te = per_user[uid]["test"]

        pipe = Pipeline([("prep", preprocess), ("clf", clf)])
        pipe.set_params(**best["params"])
        pipe.fit(tp[feature_cols], tp["y_bin"])

        p = pipe.predict_proba(te[feature_cols])[:, 1]
        pred = (p >= best["thr"]).astype(int)

        per_user_models[uid] = pipe
        per_user_f1.append(f1_score(te["y_bin"], pred, zero_division=0))
        pooled_true.append(te["y_bin"].values)
        pooled_pred.append(pred)

    pooled_true = np.concatenate(pooled_true)
    pooled_pred = np.concatenate(pooled_pred)

    test_metrics = eval_bin(pooled_true, pooled_pred)
    test_macro = float(np.mean(per_user_f1))

    ml_rows.append({
        "model": name,
        "cv_f1": float(best["cv_f1"]),
        "thr": float(best["thr"]),
        "test_f1_pooled": float(test_metrics["f1"]),
        "test_acc_pooled": float(test_metrics["acc"]),
        "test_f1_macro_users": float(test_macro),
        "best_params": best["params"],
        "models_by_user": per_user_models,
    })

# leaderboard by TEST pooled F1
ml_sorted = sorted(ml_rows, key=lambda r: r["test_f1_pooled"], reverse=True)

print("\n=== PERSONALIZED LEADERBOARD (sorted by TEST pooled F1) ===")
print("MarkovUser | CV f1=%.4f thr=%.2f | TEST pooled f1=%.4f acc=%.4f | TEST macro(user)=%.4f" %
      (cv_f1_markov, thr_markov, markov_test["f1"], markov_test["acc"], markov_test_macro))

for r in ml_sorted:
    print(f"{r['model']:<10} | CV f1={r['cv_f1']:.4f} thr={r['thr']:.2f} | "
          f"TEST pooled f1={r['test_f1_pooled']:.4f} acc={r['test_acc_pooled']:.4f} | "
          f"TEST macro(user) f1={r['test_f1_macro_users']:.4f} | params={r['best_params']}")

# pick best by TEST pooled F1 (for practicality)
best_personal = {"name": "MarkovUser", "thr": thr_markov, "cv_f1": cv_f1_markov,
                 "test_f1": markov_test["f1"], "test_acc": markov_test["acc"],
                 "artifact": {"type": "markov_user", "thr": float(thr_markov), "probs_by_user": markov_models}}

if len(ml_sorted) > 0 and ml_sorted[0]["test_f1_pooled"] > best_personal["test_f1"]:
    top = ml_sorted[0]
    best_personal = {
        "name": top["model"],
        "thr": float(top["thr"]),
        "cv_f1": float(top["cv_f1"]),
        "test_f1": float(top["test_f1_pooled"]),
        "test_acc": float(top["test_acc_pooled"]),
        "artifact": {"type": "personalized_sklearn", "thr": float(top["thr"]), "models_by_user": top["models_by_user"]}
    }

print("\n✅ BEST PERSONALIZED (by TEST pooled F1):", best_personal["name"])
print("   TEST:", {"f1": round(best_personal["test_f1"], 4), "acc": round(best_personal["test_acc"], 4)})
print("   CV  :", {"pooled_f1": round(best_personal["cv_f1"], 4), "thr": round(best_personal["thr"], 2)})

# Save
MODEL_OUT.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(
    {
        "type": best_personal["artifact"]["type"],
        "thr": float(best_personal["thr"]),
        "artifact": best_personal["artifact"],
        "meta": {
            "target": "y_bin = (stress_level_pred>=1)",
            "window": WINDOW,
            "test_len": TEST_LEN,
            "val_windows": VAL_WINDOWS,
            "thresholds": THRESHOLDS.tolist(),
            "use_user_id_feature": USE_USER_ID_FEATURE,
            "users": users,
        }
    },
    MODEL_OUT
)
print("Saved:", MODEL_OUT)


=== DATASET ===
Path: ..\datasets\global_dataset_pred.csv
Rows: 240 | Users: 5
Binary dist: {1: 126, 0: 114}

=== SPLIT ===
Users: [1, 2, 3, 4, 5] | TEST_LEN: 12
Total TrainPool: 180 | Total Test: 60

=== MARKOV USER (PERSONALIZED) ===
Best thr (pooled CV): 0.3 | CV pooled F1: 0.8151
TEST pooled: {'acc': 0.75, 'f1': 0.8235294117647058} | TEST macro(user-avg) f1: 0.8102

=== PERSONALIZED ML (pooled CV tuning) ===

=== PERSONALIZED LEADERBOARD (sorted by TEST pooled F1) ===
MarkovUser | CV f1=0.8151 thr=0.30 | TEST pooled f1=0.8235 acc=0.7500 | TEST macro(user)=0.8102
LogReg     | CV f1=0.7864 thr=0.05 | TEST pooled f1=0.8046 acc=0.7167 | TEST macro(user) f1=0.7960 | params={'clf__C': 0.1, 'clf__solver': 'liblinear'}
DecisionTree | CV f1=0.8166 thr=0.05 | TEST pooled f1=0.7179 acc=0.6333 | TEST macro(user) f1=0.6792 | params={'clf__max_depth': 2, 'clf__min_samples_leaf': 4}
ExtraTrees | CV f1=0.8188 thr=0.25 | TEST pooled f1=0.7143 acc=0.6667 | TEST macro(user) f1=0.6628 | params={'clf__

In [3]:
# =====================================================================================
# PERSONALIZED_FORECAST (Binary, from stress_level_pred) - 1 CELL (Consistent Baselines)
#
# Baseline Level 1 (paling dasar, untuk personalized juga):
#   - Persistence per user: y(t)=y(t-1)
#
# Baseline Level 2 (probabilistik, personalized):
#   - Markov USER: P(high_t | prev_high, dow, user) + threshold tuning (pooled time-CV)
#
# Models (PERSONALIZED = model terpisah per user):
#   - LogReg / DecisionTree / RandomForest / ExtraTrees / HistGB (per user)
#   - Semua pakai: time-based split per user (TEST=last TEST_LEN),
#                  time-based CV windows per user (pooled across users),
#                  threshold tuning yang sama.
#
# Target:
#   y_bin = 1 if stress_level_pred(t) >= 1 else 0
#
# Data:
#   /mnt/data/global_dataset_pred.csv (upload kamu) / atau ../datasets/global_dataset_pred.csv
#
# Save best personalized:
#   ../models/personalized_forecast_best.joblib
# =====================================================================================

import numpy as np
import pandas as pd
from pathlib import Path
import joblib

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# =========================
# 0) CONFIG
# =========================
CANDIDATE_PATHS = [
    Path("/mnt/data/global_dataset_pred.csv"),
    Path("../datasets/global_dataset_pred.csv"),
]
DATA_PATH = next((p for p in CANDIDATE_PATHS if p.exists()), None)
if DATA_PATH is None:
    raise FileNotFoundError("global_dataset_pred.csv tidak ditemukan. Cek path DATA_PATH.")

MODEL_OUT = Path("../models/personalized_forecast_best.joblib")

DATE_COL   = "date"
USER_COL   = "user_id"
TARGET_COL = "stress_level_pred"

WINDOW = 7
TEST_LEN = 12

# CV windows dalam train_pool tiap user (index relatif)
VAL_WINDOWS = [(8, 20), (12, 24), (16, 28)]
THRESHOLDS = np.linspace(0.05, 0.95, 19)

RANDOM_STATE = 42

# Personalized = per user model, jadi user_id biasanya TIDAK perlu jadi fitur.
USE_USER_ID_FEATURE = False

# =========================
# Helpers
# =========================
def eval_bin(y_true, y_pred):
    return {
        "acc": float(accuracy_score(y_true, y_pred)),
        "f1":  float(f1_score(y_true, y_pred, zero_division=0)),
    }

def tune_thr_from_proba(y_true, p_high):
    best_thr, best_f1 = None, -1
    for thr in THRESHOLDS:
        pred = (p_high >= thr).astype(int)
        f1 = float(f1_score(y_true, pred, zero_division=0))
        if f1 > best_f1:
            best_f1, best_thr = f1, thr
    return float(best_thr), float(best_f1)

# =========================
# 1) LOAD + FEATURE ENGINEERING (no leak)
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.sort_values([USER_COL, DATE_COL]).reset_index(drop=True)

rows = []
for uid, g in df.groupby(USER_COL):
    g = g.sort_values(DATE_COL).reset_index(drop=True)

    g["dow"] = g[DATE_COL].dt.dayofweek.astype(int)
    g["is_weekend"] = (g["dow"] >= 5).astype(int)

    for k in range(1, WINDOW + 1):
        g[f"lag_sp_{k}"] = g[TARGET_COL].shift(k)

    sp_shift = g[TARGET_COL].shift(1)

    g["sp_mean_7"] = sp_shift.rolling(WINDOW).mean()
    g["sp_std_7"]  = sp_shift.rolling(WINDOW).std().fillna(0.0)
    g["sp_min_7"]  = sp_shift.rolling(WINDOW).min()
    g["sp_max_7"]  = sp_shift.rolling(WINDOW).max()

    g["count_high_7"] = (sp_shift >= 1).rolling(WINDOW).sum()
    g["count_low_7"]  = (sp_shift == 0).rolling(WINDOW).sum()

    # streak_high (<= t-1)
    high = (sp_shift >= 1).astype(int).fillna(0).astype(int).tolist()
    streak, cur = [], 0
    for v in high:
        cur = cur + 1 if v == 1 else 0
        streak.append(cur)
    g["streak_high"] = streak

    # transitions_7
    diff = (sp_shift != sp_shift.shift(1)).astype(int)
    g["transitions_7"] = diff.rolling(WINDOW).sum()

    rows.append(g)

feat = pd.concat(rows, ignore_index=True)
feat["y_bin"] = (feat[TARGET_COL] >= 1).astype(int)

feature_cols = ["dow", "is_weekend"] + [f"lag_sp_{k}" for k in range(1, WINDOW + 1)] + [
    "sp_mean_7", "sp_std_7", "sp_min_7", "sp_max_7",
    "count_high_7", "count_low_7",
    "streak_high", "transitions_7"
]
if USE_USER_ID_FEATURE:
    feature_cols = [USER_COL] + feature_cols

feat = feat.dropna(subset=feature_cols + ["y_bin"]).reset_index(drop=True)

users = sorted(feat[USER_COL].unique().tolist())

print("=== DATASET ===")
print("Path:", DATA_PATH)
print("Users:", users)
print("Rows:", len(feat), "| Binary dist:", feat["y_bin"].value_counts().to_dict())
print("WINDOW:", WINDOW, "| TEST_LEN:", TEST_LEN, "| USE_USER_ID_FEATURE:", USE_USER_ID_FEATURE)

# =========================
# 2) SPLIT: time-based per user (TEST = last TEST_LEN)
# =========================
per_user = {}
for uid in users:
    g = feat[feat[USER_COL] == uid].sort_values(DATE_COL).reset_index(drop=True)
    n = len(g)
    test_start = n - TEST_LEN
    if test_start <= 20:
        raise ValueError(f"User {uid}: data terlalu sedikit untuk split + CV windows.")
    per_user[uid] = {
        "train_pool": g.iloc[:test_start].copy(),
        "test": g.iloc[test_start:].copy()
    }

print("\n=== SPLIT ===")
print("Total TrainPool:", sum(len(per_user[u]["train_pool"]) for u in users),
      "| Total Test:", sum(len(per_user[u]["test"]) for u in users))

def cv_folds_user(tp_df):
    folds = []
    for (v0, v1) in VAL_WINDOWS:
        if len(tp_df) < v1:
            continue
        tr = tp_df.iloc[:v0]
        va = tp_df.iloc[v0:v1]
        folds.append((tr, va))
    return folds

# =========================
# 3) BASELINE L1: Persistence (per user)
# =========================
all_true, all_pred = [], []
per_user_f1 = []

for uid in users:
    te = per_user[uid]["test"]
    pred = (te["lag_sp_1"] >= 1).astype(int).values
    y = te["y_bin"].astype(int).values

    all_true.append(y)
    all_pred.append(pred)
    per_user_f1.append(f1_score(y, pred, zero_division=0))

y_all = np.concatenate(all_true)
p_all = np.concatenate(all_pred)

persist_test = eval_bin(y_all, p_all)
persist_macro = float(np.mean(per_user_f1))

print("\n=== BASELINE L1: Persistence (per user) ===")
print("TEST pooled:", persist_test, "| TEST macro(user-avg) f1:", round(persist_macro, 4))

# =========================
# 4) BASELINE L2: Markov USER(prev_high, dow, user) + thr tuning
# =========================
def train_markov_one_user(df_train):
    counts = np.zeros((2, 7, 2), dtype=int)
    prev = (df_train["lag_sp_1"] >= 1).astype(int).values
    dow  = df_train["dow"].astype(int).values
    yb   = df_train["y_bin"].astype(int).values
    for p, d, y in zip(prev, dow, yb):
        counts[p, d, y] += 1
    probs = (counts + 1) / (counts.sum(axis=2, keepdims=True) + 2)  # Laplace
    return probs

def markov_proba_user(probs, df_eval):
    prev = (df_eval["lag_sp_1"] >= 1).astype(int).values
    dow  = df_eval["dow"].astype(int).values
    return np.array([probs[p, d, 1] for p, d in zip(prev, dow)])

# pooled CV for threshold (across users & folds)
cv_true, cv_phigh = [], []
for uid in users:
    tp = per_user[uid]["train_pool"]
    for tr_df, va_df in cv_folds_user(tp):
        probs = train_markov_one_user(tr_df)
        p = markov_proba_user(probs, va_df)
        cv_true.append(va_df["y_bin"].values)
        cv_phigh.append(p)

cv_true = np.concatenate(cv_true)
cv_phigh = np.concatenate(cv_phigh)

thr_mk, cv_f1_mk = tune_thr_from_proba(cv_true, cv_phigh)

# train final per-user Markov on full train_pool -> test
mk_models = {}
all_true, all_pred = [], []
per_user_f1 = []

for uid in users:
    tp = per_user[uid]["train_pool"]
    te = per_user[uid]["test"]

    probs = train_markov_one_user(tp)
    mk_models[uid] = probs

    p = markov_proba_user(probs, te)
    pred = (p >= thr_mk).astype(int)

    y = te["y_bin"].values
    all_true.append(y)
    all_pred.append(pred)
    per_user_f1.append(f1_score(y, pred, zero_division=0))

y_all = np.concatenate(all_true)
pred_all = np.concatenate(all_pred)

markov_test = eval_bin(y_all, pred_all)
markov_macro = float(np.mean(per_user_f1))

print("\n=== BASELINE L2: Markov USER(prev_high, dow, user) ===")
print("Best thr:", thr_mk, "| CV pooled F1:", round(cv_f1_mk, 4))
print("TEST pooled:", markov_test, "| TEST macro(user-avg) f1:", round(markov_macro, 4))

# =========================
# 5) MODELS: per-user ML (fair protocol, pooled CV)
# =========================
cat_cols = ["dow", "is_weekend"]
if USE_USER_ID_FEATURE:
    cat_cols = [USER_COL] + cat_cols
num_cols = [c for c in feature_cols if c not in cat_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
    ],
    remainder="drop"
)

CANDIDATES = {
    "LogReg": (
        LogisticRegression(max_iter=5000, class_weight="balanced", random_state=RANDOM_STATE),
        {"clf__C": [0.1, 0.3, 1.0, 3.0], "clf__solver": ["liblinear"]}
    ),
    "DecisionTree": (
        DecisionTreeClassifier(class_weight="balanced", random_state=RANDOM_STATE),
        {"clf__max_depth": [2, 3, 4, None], "clf__min_samples_leaf": [1, 2, 4]}
    ),
    "RandomForest": (
        RandomForestClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=1),
        {"clf__n_estimators": [200, 400], "clf__max_depth": [None, 6, 10], "clf__min_samples_leaf": [1, 2], "clf__max_features": ["sqrt"]}
    ),
    "ExtraTrees": (
        ExtraTreesClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=1),
        {"clf__n_estimators": [200, 400, 800], "clf__max_depth": [None, 6, 10], "clf__min_samples_leaf": [1, 2], "clf__max_features": ["sqrt"]}
    ),
    "HistGB": (
        HistGradientBoostingClassifier(random_state=RANDOM_STATE),
        {"clf__learning_rate": [0.05, 0.1], "clf__max_depth": [2, 3], "clf__max_leaf_nodes": [15, 31, 63]}
    ),
}

def pooled_cv_best_params_and_thr_personalized(clf, grid):
    best = None
    for params in ParameterGrid(grid):
        y_list, p_list = [], []

        for uid in users:
            tp = per_user[uid]["train_pool"]
            folds = cv_folds_user(tp)
            if len(folds) == 0:
                continue
            for tr_df, va_df in folds:
                pipe = Pipeline([("prep", preprocess), ("clf", clf)])
                pipe.set_params(**params)
                pipe.fit(tr_df[feature_cols], tr_df["y_bin"].astype(int))

                p = pipe.predict_proba(va_df[feature_cols])[:, 1]
                y_list.append(va_df["y_bin"].values)
                p_list.append(p)

        y_all = np.concatenate(y_list)
        p_all = np.concatenate(p_list)

        thr, cv_f1 = tune_thr_from_proba(y_all, p_all)

        if (best is None) or (cv_f1 > best["cv_f1"]):
            best = {"params": params, "thr": float(thr), "cv_f1": float(cv_f1)}
    return best

rows = []
print("\n=== PERSONALIZED ML: TRAIN + TUNE (pooled CV, fair protocol) ===")
for name, (clf, grid) in CANDIDATES.items():
    best = pooled_cv_best_params_and_thr_personalized(clf, grid)

    models_by_user = {}
    per_user_test_f1 = []
    all_true, all_pred = [], []

    for uid in users:
        tp = per_user[uid]["train_pool"]
        te = per_user[uid]["test"]

        pipe = Pipeline([("prep", preprocess), ("clf", clf)])
        pipe.set_params(**best["params"])
        pipe.fit(tp[feature_cols], tp["y_bin"].astype(int))

        p = pipe.predict_proba(te[feature_cols])[:, 1]
        pred = (p >= best["thr"]).astype(int)

        models_by_user[uid] = pipe
        per_user_test_f1.append(f1_score(te["y_bin"].values, pred, zero_division=0))

        all_true.append(te["y_bin"].values)
        all_pred.append(pred)

    y_all = np.concatenate(all_true)
    pred_all = np.concatenate(all_pred)

    test_metrics = eval_bin(y_all, pred_all)
    test_macro = float(np.mean(per_user_test_f1))

    rows.append({
        "model": name,
        "cv_f1": best["cv_f1"],
        "thr": best["thr"],
        "test_f1_pooled": test_metrics["f1"],
        "test_acc_pooled": test_metrics["acc"],
        "test_f1_macro_users": test_macro,
        "params": best["params"],
        "models_by_user": models_by_user,
    })

# =========================
# 6) LEADERBOARD + SAVE BEST
# =========================
print("\n=== PERSONALIZED LEADERBOARD (sorted by TEST pooled F1) ===")
print(f"Baseline-Persist | TEST pooled f1={persist_test['f1']:.4f} acc={persist_test['acc']:.4f} | macro(user) f1={persist_macro:.4f}")
print(f"Baseline-Markov  | CV f1={cv_f1_mk:.4f} thr={thr_mk:.2f} | TEST pooled f1={markov_test['f1']:.4f} acc={markov_test['acc']:.4f} | macro(user) f1={markov_macro:.4f}")

rows_sorted = sorted(rows, key=lambda r: r["test_f1_pooled"], reverse=True)
for r in rows_sorted:
    print(f"{r['model']:<10} | CV f1={r['cv_f1']:.4f} thr={r['thr']:.2f} | "
          f"TEST pooled f1={r['test_f1_pooled']:.4f} acc={r['test_acc_pooled']:.4f} | "
          f"macro(user) f1={r['test_f1_macro_users']:.4f} | params={r['params']}")

# choose best among: Markov vs best ML (by TEST pooled F1, for practicality)
best_artifact = {"type": "markov_user", "thr": float(thr_mk), "probs_by_user": mk_models}
best_name = "MarkovUser"
best_test_f1 = float(markov_test["f1"])
best_test_acc = float(markov_test["acc"])
best_cv_f1 = float(cv_f1_mk)

if len(rows_sorted) > 0 and rows_sorted[0]["test_f1_pooled"] > best_test_f1:
    top = rows_sorted[0]
    best_name = top["model"]
    best_test_f1 = float(top["test_f1_pooled"])
    best_test_acc = float(top["test_acc_pooled"])
    best_cv_f1 = float(top["cv_f1"])
    best_artifact = {"type": "personalized_sklearn", "thr": float(top["thr"]), "models_by_user": top["models_by_user"]}

print("\n✅ BEST PERSONALIZED (by TEST pooled F1):", best_name)
print("TEST:", {"f1": round(best_test_f1, 4), "acc": round(best_test_acc, 4)})

MODEL_OUT.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(
    {
        "best_name": best_name,
        "artifact": best_artifact,
        "meta": {
            "target": "y_bin = (stress_level_pred>=1)",
            "window": WINDOW,
            "test_len": TEST_LEN,
            "val_windows": VAL_WINDOWS,
            "thresholds": THRESHOLDS.tolist(),
            "users": users,
            "baseline_l1": "persistence(per-user)",
            "baseline_l2": "markov_user(prev_high, dow, user)",
        }
    },
    MODEL_OUT
)
print("Saved:", MODEL_OUT)


=== DATASET ===
Path: ..\datasets\global_dataset_pred.csv
Users: [1, 2, 3, 4, 5]
Rows: 240 | Binary dist: {1: 126, 0: 114}
WINDOW: 7 | TEST_LEN: 12 | USE_USER_ID_FEATURE: False

=== SPLIT ===
Total TrainPool: 180 | Total Test: 60

=== BASELINE L1: Persistence (per user) ===
TEST pooled: {'acc': 0.7166666666666667, 'f1': 0.7671232876712328} | TEST macro(user-avg) f1: 0.72

=== BASELINE L2: Markov USER(prev_high, dow, user) ===
Best thr: 0.3 | CV pooled F1: 0.8151
TEST pooled: {'acc': 0.75, 'f1': 0.8235294117647058} | TEST macro(user-avg) f1: 0.8102

=== PERSONALIZED ML: TRAIN + TUNE (pooled CV, fair protocol) ===

=== PERSONALIZED LEADERBOARD (sorted by TEST pooled F1) ===
Baseline-Persist | TEST pooled f1=0.7671 acc=0.7167 | macro(user) f1=0.7200
Baseline-Markov  | CV f1=0.8151 thr=0.30 | TEST pooled f1=0.8235 acc=0.7500 | macro(user) f1=0.8102
LogReg     | CV f1=0.7864 thr=0.05 | TEST pooled f1=0.8046 acc=0.7167 | macro(user) f1=0.7960 | params={'clf__C': 0.1, 'clf__solver': 'liblinea

In [4]:
# personalized_forecast_true_personalized.py
# =====================================================================================
# TRUE PERSONALIZED FORECAST (Binary) from stress_level_pred
#
# TRUE PERSONALIZED means:
# - Model terpisah per user (train hanya dari data user tsb)
# - Threshold per user (dituning di CV user tsb)
# - (Optional) Blend per user: MarkovUser + MLUser
#
# Baselines per user:
# - L1: Persistence
# - L2: Markov USER P(high_t | prev_high, dow) + thr tuning per user
#
# Candidate ML per user:
# - LogReg, RandomForest, ExtraTrees, HistGB
#
# Saves:
# - ../models/personalized_forecast_true_personalized.joblib
# =====================================================================================

import numpy as np
import pandas as pd
from pathlib import Path
import joblib

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# =========================
# CONFIG
# =========================
CANDIDATE_PATHS = [Path("/mnt/data/global_dataset_pred.csv"), Path("../datasets/global_dataset_pred.csv")]
DATA_PATH = next((p for p in CANDIDATE_PATHS if p.exists()), None)
if DATA_PATH is None:
    raise FileNotFoundError("global_dataset_pred.csv tidak ditemukan.")

MODEL_OUT = Path("../models/personalized_forecast_true_personalized.joblib")

DATE_COL   = "date"
USER_COL   = "user_id"
TARGET_COL = "stress_level_pred"

WINDOW   = 3
TEST_LEN = 12

VAL_WINDOWS = [(12, 24), (18, 30)]
THRESHOLDS  = np.linspace(0.05, 0.95, 19)
ALPHAS      = np.linspace(0.0, 1.0, 21)

RANDOM_STATE = 42
USE_BEHAVIOR_LAG1 = True

# =========================
# Helpers
# =========================
def eval_bin(y_true, y_pred):
    return {"acc": float(accuracy_score(y_true, y_pred)),
            "f1":  float(f1_score(y_true, y_pred, zero_division=0))}

def tune_thr(y_true, p_high):
    best_thr, best_f1 = None, -1.0
    for thr in THRESHOLDS:
        pred = (p_high >= thr).astype(int)
        f1 = float(f1_score(y_true, pred, zero_division=0))
        if f1 > best_f1:
            best_f1, best_thr = f1, thr
    return float(best_thr), float(best_f1)

def build_user_cv_blocks(tp_df):
    folds = []
    for (v0, v1) in VAL_WINDOWS:
        if len(tp_df) < v1:
            continue
        tr = tp_df.iloc[:v0].copy()
        va = tp_df.iloc[v0:v1].copy()
        folds.append((tr, va))
    return folds

# =========================
# Load
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.sort_values([USER_COL, DATE_COL]).reset_index(drop=True)

exclude = {DATE_COL, USER_COL, TARGET_COL}
num_cols_all = [c for c in df.columns if c not in exclude and pd.api.types.is_numeric_dtype(df[c])]
hour_like = [c for c in num_cols_all if ("hour" in c.lower()) or ("hours" in c.lower())]
known = ["study_hour_per_day","sleep_hour_per_day","social_hour_per_day","physical_activity_hour_per_day","extracurricular_hour_per_day"]
for c in known:
    if c in num_cols_all and c not in hour_like:
        hour_like.append(c)
BEHAVIOR_COLS = hour_like if USE_BEHAVIOR_LAG1 else []

print("=== RAW ===")
print("Path:", DATA_PATH)
print("Rows:", len(df), "| Users:", df[USER_COL].nunique())
print("Behavior cols:", BEHAVIOR_COLS)

# =========================
# Feature engineering per user
# =========================
rows = []
for uid, g in df.groupby(USER_COL):
    g = g.sort_values(DATE_COL).reset_index(drop=True)

    g["dow"] = g[DATE_COL].dt.dayofweek.astype(int)
    g["is_weekend"] = (g["dow"] >= 5).astype(int)

    for k in range(1, WINDOW + 1):
        g[f"lag_sp_{k}"] = g[TARGET_COL].shift(k)

    if len(BEHAVIOR_COLS) > 0:
        for c in BEHAVIOR_COLS:
            g[f"lag1_{c}"] = g[c].shift(1)

    sp_shift = g[TARGET_COL].shift(1)
    g["sp_mean"] = sp_shift.rolling(WINDOW).mean()
    g["sp_std"]  = sp_shift.rolling(WINDOW).std().fillna(0.0)
    g["sp_min"]  = sp_shift.rolling(WINDOW).min()
    g["sp_max"]  = sp_shift.rolling(WINDOW).max()
    g["count_high"] = (sp_shift >= 1).rolling(WINDOW).sum()
    g["count_low"]  = (sp_shift == 0).rolling(WINDOW).sum()

    high = (sp_shift >= 1).astype(int).fillna(0).astype(int).tolist()
    streak, cur = [], 0
    for v in high:
        cur = cur + 1 if v == 1 else 0
        streak.append(cur)
    g["streak_high"] = streak

    diff = (sp_shift != sp_shift.shift(1)).astype(int)
    g["transitions"] = diff.rolling(WINDOW).sum()

    rows.append(g)

feat = pd.concat(rows, ignore_index=True)
feat["y_bin"] = (feat[TARGET_COL] >= 1).astype(int)

feature_cols = ["dow", "is_weekend"] + [f"lag_sp_{k}" for k in range(1, WINDOW + 1)] + [
    "sp_mean","sp_std","sp_min","sp_max","count_high","count_low","streak_high","transitions"
]
if len(BEHAVIOR_COLS) > 0:
    feature_cols += [f"lag1_{c}" for c in BEHAVIOR_COLS]

feat = feat.dropna(subset=feature_cols + ["y_bin"]).reset_index(drop=True)

users = sorted(feat[USER_COL].unique().tolist())
print("\n=== FEAT ===")
print("Users:", users, "| Rows:", len(feat))
print("Binary dist:", feat["y_bin"].value_counts().to_dict())
print("WINDOW:", WINDOW, "| TEST_LEN:", TEST_LEN)

# =========================
# Split per user: TEST = last TEST_LEN
# =========================
per_user = {}
for uid in users:
    g = feat[feat[USER_COL] == uid].sort_values(DATE_COL).reset_index(drop=True)
    n = len(g)
    test_start = n - TEST_LEN
    if test_start <= 20:
        raise ValueError(f"User {uid}: data terlalu sedikit.")
    tp = g.iloc[:test_start].copy()
    te = g.iloc[test_start:].copy()
    per_user[uid] = {"train_pool": tp, "test": te, "folds": build_user_cv_blocks(tp)}

print("\n=== SPLIT ===")
print("Total TrainPool:", sum(len(per_user[u]["train_pool"]) for u in users),
      "| Total Test:", sum(len(per_user[u]["test"]) for u in users))

# =========================
# Markov USER per user
# =========================
def train_markov_user(df_train_user):
    counts = np.zeros((2, 7, 2), dtype=int)
    prev = (df_train_user["lag_sp_1"] >= 1).astype(int).values
    dow  = df_train_user["dow"].astype(int).values
    yb   = df_train_user["y_bin"].astype(int).values
    for p, d, y in zip(prev, dow, yb):
        counts[p, d, y] += 1
    probs = (counts + 1) / (counts.sum(axis=2, keepdims=True) + 2)
    return probs

def markov_proba_user(probs, df_eval_user):
    prev = (df_eval_user["lag_sp_1"] >= 1).astype(int).values
    dow  = df_eval_user["dow"].astype(int).values
    return np.array([probs[p, d, 1] for p, d in zip(prev, dow)])

# =========================
# Candidate ML (per user)
# =========================
cat_cols = ["dow", "is_weekend"]
num_cols = [c for c in feature_cols if c not in cat_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
    ],
    remainder="drop"
)

CANDIDATES = {
    "LogReg": (
        LogisticRegression(max_iter=5000, class_weight="balanced", random_state=RANDOM_STATE),
        {"clf__C": [0.03, 0.1, 0.3, 1.0, 3.0], "clf__solver": ["liblinear"]}
    ),
    "RandomForest": (
        RandomForestClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=1),
        {"clf__n_estimators": [200, 400], "clf__max_depth": [None, 6, 10], "clf__min_samples_leaf": [1, 2], "clf__max_features": ["sqrt"]}
    ),
    "ExtraTrees": (
        ExtraTreesClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=1),
        {"clf__n_estimators": [200, 400, 800], "clf__max_depth": [None, 6, 10], "clf__min_samples_leaf": [1, 2], "clf__max_features": ["sqrt"]}
    ),
    "HistGB": (
        HistGradientBoostingClassifier(random_state=RANDOM_STATE),
        {"clf__learning_rate": [0.03, 0.05, 0.1], "clf__max_depth": [2, 3], "clf__max_leaf_nodes": [15, 31, 63]}
    ),
}

# =========================
# Train TRUE PERSONALIZED: tune per-user (params + thr + alpha)
# =========================
print("\n=== TRUE PERSONALIZED TRAIN ===")

models_by_user = {}
report_rows = []

for uid in users:
    tp = per_user[uid]["train_pool"]
    te = per_user[uid]["test"]
    folds = per_user[uid]["folds"]
    if len(folds) == 0:
        raise ValueError(f"User {uid}: folds kosong. Kecilkan VAL_WINDOWS / WINDOW / TEST_LEN.")

    # ---- Baseline L1: persistence
    y_te = te["y_bin"].astype(int).values
    pred_persist = (te["lag_sp_1"] >= 1).astype(int).values
    base_persist = eval_bin(y_te, pred_persist)

    # ---- Baseline L2: Markov user + thr per user (tune dari CV user tsb)
    y_cv_all, p_mk_cv_all = [], []
    for tr_df, va_df in folds:
        probs = train_markov_user(tr_df)
        p = markov_proba_user(probs, va_df)
        y_cv_all.append(va_df["y_bin"].astype(int).values)
        p_mk_cv_all.append(p)

    y_cv_all = np.concatenate(y_cv_all)
    p_mk_cv_all = np.concatenate(p_mk_cv_all)
    thr_mk, cv_f1_mk = tune_thr(y_cv_all, p_mk_cv_all)

    probs_full = train_markov_user(tp)
    p_mk_test  = markov_proba_user(probs_full, te)
    pred_mk_test = (p_mk_test >= thr_mk).astype(int)
    mk_metrics = eval_bin(y_te, pred_mk_test)

    # ---- Tune ML per user + BLEND per user (alpha & thr dituning per user)
    best_user = {"name": "MarkovUser", "test_f1": mk_metrics["f1"], "artifact": {"type":"markov_user","probs":probs_full,"thr":thr_mk}}

    for mname, (clf, grid) in CANDIDATES.items():
        # 1) pilih best params via CV (p_ml pooled across user folds)
        best_params = None
        best_cv_ml  = -1.0
        best_thr_ml = 0.5

        for params in ParameterGrid(grid):
            y_all, p_all = [], []
            for tr_df, va_df in folds:
                pipe = Pipeline([("prep", preprocess), ("clf", clf)])
                pipe.set_params(**params)
                pipe.fit(tr_df[feature_cols], tr_df["y_bin"].astype(int).values)
                p = pipe.predict_proba(va_df[feature_cols])[:, 1]
                y_all.append(va_df["y_bin"].astype(int).values)
                p_all.append(p)

            y_all = np.concatenate(y_all)
            p_all = np.concatenate(p_all)

            thr, cv_f1 = tune_thr(y_all, p_all)
            if cv_f1 > best_cv_ml:
                best_cv_ml = cv_f1
                best_thr_ml = thr
                best_params = params

        # 2) fit final ML on full train_pool
        pipe_full = Pipeline([("prep", preprocess), ("clf", clf)])
        pipe_full.set_params(**best_params)
        pipe_full.fit(tp[feature_cols], tp["y_bin"].astype(int).values)
        p_ml_test = pipe_full.predict_proba(te[feature_cols])[:, 1]

        # 3) tune BLEND alpha+thr per user (strict, based on CV folds)
        # compute p_ml_cv using the best_params model on each fold
        p_ml_cv_all = []
        y_bl_cv_all = []
        for tr_df, va_df in folds:
            pipe = Pipeline([("prep", preprocess), ("clf", clf)])
            pipe.set_params(**best_params)
            pipe.fit(tr_df[feature_cols], tr_df["y_bin"].astype(int).values)
            p = pipe.predict_proba(va_df[feature_cols])[:, 1]
            p_ml_cv_all.append(p)
            y_bl_cv_all.append(va_df["y_bin"].astype(int).values)

        p_ml_cv_all = np.concatenate(p_ml_cv_all)
        y_bl_cv_all = np.concatenate(y_bl_cv_all)

        # Note: p_mk_cv_all sudah ada untuk user ini
        best_blend = None
        for a in ALPHAS:
            p_bl = a * p_ml_cv_all + (1.0 - a) * p_mk_cv_all
            thr, cv_f1 = tune_thr(y_bl_cv_all, p_bl)
            if (best_blend is None) or (cv_f1 > best_blend["cv_f1"]):
                best_blend = {"alpha": float(a), "thr": float(thr), "cv_f1": float(cv_f1)}

        p_bl_test = best_blend["alpha"] * p_ml_test + (1.0 - best_blend["alpha"]) * p_mk_test
        pred_bl_test = (p_bl_test >= best_blend["thr"]).astype(int)
        blend_metrics = eval_bin(y_te, pred_bl_test)

        # pilih terbaik untuk user ini
        if blend_metrics["f1"] > best_user["test_f1"]:
            best_user = {
                "name": f"Blend-{mname}",
                "test_f1": float(blend_metrics["f1"]),
                "artifact": {
                    "type": "true_personalized_blend",
                    "alpha": float(best_blend["alpha"]),
                    "thr": float(best_blend["thr"]),
                    "markov": {"probs": probs_full},
                    "ml": {"pipe": pipe_full},
                    "meta": {"best_params": best_params, "cv_f1_blend": best_blend["cv_f1"], "cv_f1_ml": best_cv_ml}
                }
            }

    models_by_user[uid] = {
        "baseline_persist": base_persist,
        "baseline_markov": {"thr": float(thr_mk), "cv_f1": float(cv_f1_mk), "test": mk_metrics},
        "best": best_user
    }

    report_rows.append({
        "user": uid,
        "persist_f1": base_persist["f1"],
        "markov_f1": mk_metrics["f1"],
        "best_name": best_user["name"],
        "best_f1": best_user["test_f1"],
    })

# =========================
# Report
# =========================
print("\n=== PERSONALIZED SUMMARY (per user) ===")
for r in report_rows:
    print(f"User {r['user']}: Persist f1={r['persist_f1']:.4f} | Markov f1={r['markov_f1']:.4f} | BEST={r['best_name']} f1={r['best_f1']:.4f}")

# pooled evaluation (gabung semua test user) untuk informasi tambahan
all_y, all_pred = [], []
for uid in users:
    te = per_user[uid]["test"]
    y = te["y_bin"].astype(int).values
    best = models_by_user[uid]["best"]

    # reproduce prediction for pooled score
    probs_full = models_by_user[uid]["baseline_markov"]
    # but easiest: re-run using stored artifacts
    if best["artifact"]["type"] == "markov_user":
        thr = best["artifact"]["thr"]
        probs = best["artifact"]["probs"]
        p = markov_proba_user(probs, te)
        pred = (p >= thr).astype(int)
    else:
        a = best["artifact"]["alpha"]
        thr = best["artifact"]["thr"]
        probs = best["artifact"]["markov"]["probs"]
        pipe = best["artifact"]["ml"]["pipe"]
        p_mk = markov_proba_user(probs, te)
        p_ml = pipe.predict_proba(te[feature_cols])[:, 1]
        p_bl = a * p_ml + (1.0 - a) * p_mk
        pred = (p_bl >= thr).astype(int)

    all_y.append(y)
    all_pred.append(pred)

all_y = np.concatenate(all_y)
all_pred = np.concatenate(all_pred)
pooled = eval_bin(all_y, all_pred)

print("\n=== PERSONALIZED POOLED TEST (all users combined) ===")
print(pooled)

# =========================
# Save
# =========================
MODEL_OUT.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(
    {
        "type": "true_personalized",
        "users": users,
        "feature_cols": feature_cols,
        "window": WINDOW,
        "test_len": TEST_LEN,
        "val_windows": VAL_WINDOWS,
        "thresholds": THRESHOLDS.tolist(),
        "behavior_cols": BEHAVIOR_COLS,
        "models_by_user": models_by_user,
        "meta": {"true_global": False, "true_personalized": True, "target": "y_bin=(stress_level_pred>=1)"}
    },
    MODEL_OUT
)
print("\nSaved:", MODEL_OUT)


=== RAW ===
Path: ..\datasets\global_dataset_pred.csv
Rows: 275 | Users: 5
Behavior cols: ['extracurricular_hour_per_day', 'physical_activity_hour_per_day', 'sleep_hour_per_day', 'study_hour_per_day', 'social_hour_per_day']

=== FEAT ===
Users: [1, 2, 3, 4, 5] | Rows: 260
Binary dist: {1: 146, 0: 114}
WINDOW: 3 | TEST_LEN: 12

=== SPLIT ===
Total TrainPool: 200 | Total Test: 60

=== TRUE PERSONALIZED TRAIN ===

=== PERSONALIZED SUMMARY (per user) ===
User 1: Persist f1=0.8889 | Markov f1=0.8571 | BEST=Blend-RandomForest f1=0.8889
User 2: Persist f1=0.3636 | Markov f1=0.6667 | BEST=MarkovUser f1=0.6667
User 3: Persist f1=0.6000 | Markov f1=0.9091 | BEST=MarkovUser f1=0.9091
User 4: Persist f1=0.9474 | Markov f1=0.9091 | BEST=MarkovUser f1=0.9091
User 5: Persist f1=0.8000 | Markov f1=1.0000 | BEST=MarkovUser f1=1.0000

=== PERSONALIZED POOLED TEST (all users combined) ===
{'acc': 0.8, 'f1': 0.8571428571428571}

Saved: ..\models\personalized_forecast_true_personalized.joblib


In [6]:
# =====================================================================================
# PERSONALIZED_FORECAST (TRUE Personalized, per-user model) - 1 CELL (SVM Calibration FIXED)
#
# Fix utama:
# ✅ SVM calibrated pakai cv adaptif per-fold: cv_k = min(3, min_class_count_in_train_fold)
# ✅ Kalau cv_k < 2 -> skip fold
# ✅ Kalau banyak fold invalid / ada user tidak bisa -> skip kandidat SVM seluruhnya (biar konsisten)
# =====================================================================================

import numpy as np
import pandas as pd
from pathlib import Path
import joblib

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    HistGradientBoostingClassifier,
    GradientBoostingClassifier, AdaBoostClassifier,
    BaggingClassifier
)
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# =========================
# 0) CONFIG
# =========================
CANDIDATE_PATHS = [
    Path("/mnt/data/global_dataset_pred.csv"),
    Path("../datasets/global_dataset_pred.csv"),
]
DATA_PATH = next((p for p in CANDIDATE_PATHS if p.exists()), None)
if DATA_PATH is None:
    raise FileNotFoundError("global_dataset_pred.csv tidak ditemukan. Cek path DATA_PATH.")

MODEL_OUT = Path("../models/personalized_forecast_best.joblib")

DATE_COL   = "date"
USER_COL   = "user_id"
TARGET_COL = "stress_level_pred"

WINDOW = 3
TEST_LEN = 12

VAL_WINDOWS = [(10, 20), (15, 25)]
THRESHOLDS = np.linspace(0.05, 0.95, 19)

RANDOM_STATE = 42
USE_USER_ID_FEATURE = False

TUNE_THRESHOLD_PER_USER = True

# =========================
# Helpers
# =========================
def eval_bin(y_true, y_pred):
    return {
        "acc": float(accuracy_score(y_true, y_pred)),
        "f1":  float(f1_score(y_true, y_pred, zero_division=0)),
    }

def tune_thr_from_proba(y_true, p_high):
    best_thr, best_f1 = None, -1
    for thr in THRESHOLDS:
        pred = (p_high >= thr).astype(int)
        f1 = float(f1_score(y_true, pred, zero_division=0))
        if f1 > best_f1:
            best_f1, best_thr = f1, thr
    return float(best_thr), float(best_f1)

def per_user_macro_metrics(per_user_records):
    f1s, accs = [], []
    for r in per_user_records:
        f1s.append(f1_score(r["y"], r["pred"], zero_division=0))
        accs.append(accuracy_score(r["y"], r["pred"]))
    return float(np.mean(accs)), float(np.mean(f1s))

def cv_folds_user(tp_df):
    folds = []
    for (v0, v1) in VAL_WINDOWS:
        if len(tp_df) < v1:
            continue
        tr = tp_df.iloc[:v0]
        va = tp_df.iloc[v0:v1]
        folds.append((tr, va))
    return folds

def min_class_count(y):
    vc = pd.Series(y).value_counts()
    if len(vc) < 2:
        return 0
    return int(vc.min())

# =========================
# 1) LOAD + FEATURE ENGINEERING (no leak)
# =========================
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df = df.sort_values([USER_COL, DATE_COL]).reset_index(drop=True)

rows = []
for uid, g in df.groupby(USER_COL):
    g = g.sort_values(DATE_COL).reset_index(drop=True)

    g["dow"] = g[DATE_COL].dt.dayofweek.astype(int)
    g["is_weekend"] = (g["dow"] >= 5).astype(int)

    for k in range(1, WINDOW + 1):
        g[f"lag_sp_{k}"] = g[TARGET_COL].shift(k)

    sp_shift = g[TARGET_COL].shift(1)

    g["sp_mean"] = sp_shift.rolling(WINDOW).mean()
    g["sp_std"]  = sp_shift.rolling(WINDOW).std().fillna(0.0)
    g["sp_min"]  = sp_shift.rolling(WINDOW).min()
    g["sp_max"]  = sp_shift.rolling(WINDOW).max()

    g["count_high"] = (sp_shift >= 1).rolling(WINDOW).sum()
    g["count_low"]  = (sp_shift == 0).rolling(WINDOW).sum()

    high = (sp_shift >= 1).astype(int).fillna(0).astype(int).tolist()
    streak, cur = [], 0
    for v in high:
        cur = cur + 1 if v == 1 else 0
        streak.append(cur)
    g["streak_high"] = streak

    diff = (sp_shift != sp_shift.shift(1)).astype(int)
    g["transitions"] = diff.rolling(WINDOW).sum()

    rows.append(g)

feat = pd.concat(rows, ignore_index=True)
feat["y_bin"] = (feat[TARGET_COL] >= 1).astype(int)

feature_cols = ["dow", "is_weekend"] + [f"lag_sp_{k}" for k in range(1, WINDOW + 1)] + [
    "sp_mean", "sp_std", "sp_min", "sp_max",
    "count_high", "count_low",
    "streak_high", "transitions"
]
if USE_USER_ID_FEATURE:
    feature_cols = [USER_COL] + feature_cols

feat = feat.dropna(subset=feature_cols + ["y_bin"]).reset_index(drop=True)

users = sorted(feat[USER_COL].unique().tolist())

print("=== DATASET ===")
print("Path:", DATA_PATH)
print("Users:", users)
print("Rows:", len(feat), "| Binary dist:", feat["y_bin"].value_counts().to_dict())
print("WINDOW:", WINDOW, "| TEST_LEN:", TEST_LEN, "| USE_USER_ID_FEATURE:", USE_USER_ID_FEATURE)
print("CV windows:", VAL_WINDOWS, "| Tune thr per user:", TUNE_THRESHOLD_PER_USER)

# =========================
# 2) Split per user (time-based)
# =========================
per_user = {}
for uid in users:
    g = feat[feat[USER_COL] == uid].sort_values(DATE_COL).reset_index(drop=True)
    n = len(g)
    test_start = n - TEST_LEN
    if test_start <= 10:
        raise ValueError(f"User {uid}: data terlalu sedikit untuk split (n={n}).")
    per_user[uid] = {
        "train_pool": g.iloc[:test_start].copy(),
        "test": g.iloc[test_start:].copy()
    }

print("\n=== SPLIT ===")
print("Total TrainPool:", sum(len(per_user[u]["train_pool"]) for u in users),
      "| Total Test:", sum(len(per_user[u]["test"]) for u in users))

# =========================
# 3) Baseline L1: Persistence
# =========================
per_user_records = []
all_true, all_pred = [], []

for uid in users:
    te = per_user[uid]["test"]
    pred = (te["lag_sp_1"] >= 1).astype(int).values
    y = te["y_bin"].astype(int).values

    per_user_records.append({"uid": uid, "y": y, "pred": pred})
    all_true.append(y)
    all_pred.append(pred)

y_all = np.concatenate(all_true)
pred_all = np.concatenate(all_pred)

persist_pooled = eval_bin(y_all, pred_all)
persist_macro_acc, persist_macro_f1 = per_user_macro_metrics(per_user_records)

print("\n=== BASELINE L1: Persistence (per user) ===")
print("TEST pooled:", persist_pooled, "| TEST macro(user):", {"acc": round(persist_macro_acc,4), "f1": round(persist_macro_f1,4)})

# =========================
# 4) Baseline L2: Markov USER(prev_high, dow)
# =========================
def train_markov_one_user(df_train):
    counts = np.zeros((2, 7, 2), dtype=int)
    prev = (df_train["lag_sp_1"] >= 1).astype(int).values
    dow  = (df_train["dow"]).astype(int).values
    yb   = (df_train["y_bin"]).astype(int).values
    for p, d, y in zip(prev, dow, yb):
        counts[p, d, y] += 1
    probs = (counts + 1) / (counts.sum(axis=2, keepdims=True) + 2)
    return probs

def markov_proba_user(probs, df_eval):
    prev = (df_eval["lag_sp_1"] >= 1).astype(int).values
    dow  = (df_eval["dow"]).astype(int).values
    return np.array([probs[p, d, 1] for p, d in zip(prev, dow)])

cv_true, cv_phigh = [], []
for uid in users:
    tp = per_user[uid]["train_pool"]
    folds = cv_folds_user(tp)
    for tr_df, va_df in folds:
        probs = train_markov_one_user(tr_df)
        p = markov_proba_user(probs, va_df)
        cv_true.append(va_df["y_bin"].values)
        cv_phigh.append(p)

if len(cv_true) == 0:
    raise ValueError("Tidak ada CV fold yang valid. Kurangi VAL_WINDOWS / TEST_LEN / WINDOW.")

cv_true = np.concatenate(cv_true)
cv_phigh = np.concatenate(cv_phigh)

thr_mk, cv_f1_mk = tune_thr_from_proba(cv_true, cv_phigh)

mk_models = {}
per_user_records = []
all_true, all_pred = [], []
for uid in users:
    tp = per_user[uid]["train_pool"]
    te = per_user[uid]["test"]

    probs = train_markov_one_user(tp)
    mk_models[uid] = probs

    p = markov_proba_user(probs, te)
    pred = (p >= thr_mk).astype(int)
    y = te["y_bin"].astype(int).values

    per_user_records.append({"uid": uid, "y": y, "pred": pred})
    all_true.append(y)
    all_pred.append(pred)

y_all = np.concatenate(all_true)
pred_all = np.concatenate(all_pred)

markov_pooled = eval_bin(y_all, pred_all)
markov_macro_acc, markov_macro_f1 = per_user_macro_metrics(per_user_records)

print("\n=== BASELINE L2: Markov USER(prev_high, dow) ===")
print("Best thr:", thr_mk, "| CV pooled F1:", round(cv_f1_mk, 4))
print("TEST pooled:", markov_pooled, "| TEST macro(user):", {"acc": round(markov_macro_acc,4), "f1": round(markov_macro_f1,4)})

# =========================
# 5) Preprocess (for ML)
# =========================
cat_cols = ["dow", "is_weekend"]
if USE_USER_ID_FEATURE:
    cat_cols = [USER_COL] + cat_cols
num_cols = [c for c in feature_cols if c not in cat_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
    ],
    remainder="drop"
)

# =========================
# 6) Candidate models
# =========================
CANDIDATES = {
    "LogReg": (
        LogisticRegression(max_iter=5000, class_weight="balanced", random_state=RANDOM_STATE),
        {"clf__C": [0.03, 0.1, 0.3, 1.0, 3.0], "clf__solver": ["liblinear"]}
    ),
    "DecisionTree": (
        DecisionTreeClassifier(class_weight="balanced", random_state=RANDOM_STATE),
        {"clf__max_depth": [2, 3, 4, 6, None], "clf__min_samples_leaf": [1, 2, 4, 8]}
    ),
    "RandomForest": (
        RandomForestClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=1),
        {"clf__n_estimators": [200, 400, 800], "clf__max_depth": [None, 6, 10],
         "clf__min_samples_leaf": [1, 2, 4], "clf__max_features": ["sqrt"]}
    ),
    "ExtraTrees": (
        ExtraTreesClassifier(class_weight="balanced", random_state=RANDOM_STATE, n_jobs=1),
        {"clf__n_estimators": [200, 400, 800], "clf__max_depth": [None, 6, 10],
         "clf__min_samples_leaf": [1, 2, 4], "clf__max_features": ["sqrt"]}
    ),
    "HistGB": (
        HistGradientBoostingClassifier(random_state=RANDOM_STATE),
        {"clf__learning_rate": [0.03, 0.05, 0.1], "clf__max_depth": [2, 3],
         "clf__max_leaf_nodes": [15, 31, 63]}
    ),
    "GradBoost": (
        GradientBoostingClassifier(random_state=RANDOM_STATE),
        {"clf__learning_rate": [0.03, 0.05, 0.1], "clf__n_estimators": [100, 200, 400],
         "clf__max_depth": [2, 3]}
    ),
    "AdaBoost": (
        AdaBoostClassifier(random_state=RANDOM_STATE),
        {"clf__learning_rate": [0.03, 0.05, 0.1, 0.3], "clf__n_estimators": [50, 100, 200, 400]}
    ),
    "BaggingTree": (
        BaggingClassifier(
            estimator=DecisionTreeClassifier(random_state=RANDOM_STATE),
            random_state=RANDOM_STATE,
            n_jobs=1
        ),
        {"clf__n_estimators": [50, 100, 200],
         "clf__estimator__max_depth": [2, 3, 4, None],
         "clf__estimator__min_samples_leaf": [1, 2, 4]}
    ),
}

# SVM calibrated SAFE (per-fold adaptive cv)
SVM_GRID = {"C": [0.03, 0.1, 0.3, 1.0, 3.0]}
SVM_NAME = "LinearSVC_Calibrated_SAFE"

# =========================
# 7) Tuning utilities
# =========================
def tune_global_thr_pooled_over_all_users(pipe, params):
    y_list, p_list = [], []
    for uid in users:
        tp = per_user[uid]["train_pool"]
        folds = cv_folds_user(tp)
        for tr_df, va_df in folds:
            pipe.set_params(**params)
            pipe.fit(tr_df[feature_cols], tr_df["y_bin"].astype(int))
            p = pipe.predict_proba(va_df[feature_cols])[:, 1]
            y_list.append(va_df["y_bin"].values)
            p_list.append(p)
    y_all = np.concatenate(y_list)
    p_all = np.concatenate(p_list)
    thr, cv_f1 = tune_thr_from_proba(y_all, p_all)
    return float(thr), float(cv_f1)

def tune_per_user_thr(pipe, params):
    thr_by_user = {}
    f1s = []
    for uid in users:
        tp = per_user[uid]["train_pool"]
        folds = cv_folds_user(tp)
        if len(folds) == 0:
            continue
        y_list, p_list = [], []
        for tr_df, va_df in folds:
            pipe.set_params(**params)
            pipe.fit(tr_df[feature_cols], tr_df["y_bin"].astype(int))
            p = pipe.predict_proba(va_df[feature_cols])[:, 1]
            y_list.append(va_df["y_bin"].values)
            p_list.append(p)
        y_u = np.concatenate(y_list)
        p_u = np.concatenate(p_list)
        thr_u, f1_u = tune_thr_from_proba(y_u, p_u)
        thr_by_user[uid] = float(thr_u)
        f1s.append(float(f1_u))
    return thr_by_user, float(np.mean(f1s))

def eval_personalized_models(models_by_user, thr_by_user_or_scalar):
    per_user_records = []
    all_true, all_pred = [], []
    for uid in users:
        te = per_user[uid]["test"]
        y = te["y_bin"].astype(int).values
        pipe = models_by_user[uid]
        p = pipe.predict_proba(te[feature_cols])[:, 1]
        thr = thr_by_user_or_scalar[uid] if isinstance(thr_by_user_or_scalar, dict) else float(thr_by_user_or_scalar)
        pred = (p >= thr).astype(int)

        per_user_records.append({"uid": uid, "y": y, "pred": pred})
        all_true.append(y)
        all_pred.append(pred)

    y_all = np.concatenate(all_true)
    pred_all = np.concatenate(all_pred)
    pooled = eval_bin(y_all, pred_all)
    macro_acc, macro_f1 = per_user_macro_metrics(per_user_records)
    return pooled, {"acc": macro_acc, "f1": macro_f1}

# =========================
# 8) Train + Tune all candidates (regular)
# =========================
print("\n=== PERSONALIZED ML: TRAIN + TUNE (safe) ===")
rows = []

for name, (clf, grid) in CANDIDATES.items():
    best = None

    for params in ParameterGrid(grid):
        pipe = Pipeline([("prep", preprocess), ("clf", clf)])

        if TUNE_THRESHOLD_PER_USER:
            thr_obj, cv_score = tune_per_user_thr(pipe, params)
        else:
            thr_obj, cv_score = tune_global_thr_pooled_over_all_users(pipe, params)

        if (best is None) or (cv_score > best["cv_score"]):
            best = {"params": params, "thr_obj": thr_obj, "cv_score": cv_score}

    models_by_user = {}
    for uid in users:
        tp = per_user[uid]["train_pool"]
        pipe = Pipeline([("prep", preprocess), ("clf", clf)])
        pipe.set_params(**best["params"])
        pipe.fit(tp[feature_cols], tp["y_bin"].astype(int))
        models_by_user[uid] = pipe

    pooled, macro = eval_personalized_models(models_by_user, best["thr_obj"])

    rows.append({
        "model": name,
        "cv_score": float(best["cv_score"]),
        "thr_obj": best["thr_obj"],
        "test_pooled_f1": float(pooled["f1"]),
        "test_pooled_acc": float(pooled["acc"]),
        "test_macro_f1": float(macro["f1"]),
        "test_macro_acc": float(macro["acc"]),
        "params": best["params"],
        "models_by_user": models_by_user,
    })

# =========================
# 9) SVM Calibrated SAFE (FIXED)
#    - per fold: cv_k = min(3, min_class_count(tr_y))
#    - if cv_k < 2 -> skip fold
#    - if too many folds skipped OR final training impossible for any user -> skip entire candidate
# =========================
def svm_fit_predict_proba(tr_X, tr_y, va_X, C, cv_max=3):
    mcc = min_class_count(tr_y)
    cv_k = min(cv_max, mcc)
    if cv_k < 2:
        return None  # signal: cannot fit on this fold
    base = LinearSVC(class_weight="balanced", random_state=RANDOM_STATE, C=C)
    calib = CalibratedClassifierCV(estimator=base, method="sigmoid", cv=cv_k)
    pipe = Pipeline([("prep", preprocess), ("clf", calib)])
    pipe.fit(tr_X, tr_y)
    return pipe.predict_proba(va_X)[:, 1]

# check final-train feasibility for all users (train_pool)
svm_possible_all_users = True
for uid in users:
    y_tp = per_user[uid]["train_pool"]["y_bin"].astype(int).values
    if min_class_count(y_tp) < 2:
        svm_possible_all_users = False
        break

if svm_possible_all_users:
    best = None

    for C in SVM_GRID["C"]:
        # collect CV preds (pooled) OR per-user tuning
        if TUNE_THRESHOLD_PER_USER:
            thr_by_user = {}
            f1s = []
            valid_users = 0

            for uid in users:
                tp = per_user[uid]["train_pool"]
                folds = cv_folds_user(tp)
                y_list_u, p_list_u = [], []

                for tr_df, va_df in folds:
                    tr_y = tr_df["y_bin"].astype(int).values
                    p = svm_fit_predict_proba(
                        tr_df[feature_cols], tr_y,
                        va_df[feature_cols],
                        C=C,
                        cv_max=3
                    )
                    if p is None:
                        continue  # skip fold safely
                    y_list_u.append(va_df["y_bin"].astype(int).values)
                    p_list_u.append(p)

                if len(y_list_u) == 0:
                    # user ini tidak punya fold valid -> SVM kandidat dianggap tidak stabil
                    thr_by_user = None
                    break

                y_u = np.concatenate(y_list_u)
                p_u = np.concatenate(p_list_u)
                thr_u, f1_u = tune_thr_from_proba(y_u, p_u)
                thr_by_user[uid] = float(thr_u)
                f1s.append(float(f1_u))
                valid_users += 1

            if thr_by_user is None or valid_users < len(users):
                continue  # C ini tidak valid (karena ada user tanpa fold valid)

            cv_score = float(np.mean(f1s))
            thr_obj = thr_by_user

        else:
            y_list, p_list = [], []
            for uid in users:
                tp = per_user[uid]["train_pool"]
                folds = cv_folds_user(tp)
                for tr_df, va_df in folds:
                    tr_y = tr_df["y_bin"].astype(int).values
                    p = svm_fit_predict_proba(
                        tr_df[feature_cols], tr_y,
                        va_df[feature_cols],
                        C=C,
                        cv_max=3
                    )
                    if p is None:
                        continue
                    y_list.append(va_df["y_bin"].astype(int).values)
                    p_list.append(p)

            if len(y_list) == 0:
                continue

            y_all = np.concatenate(y_list)
            p_all = np.concatenate(p_list)
            thr_obj, cv_score = tune_thr_from_proba(y_all, p_all)

        if (best is None) or (cv_score > best["cv_score"]):
            best = {"C": C, "thr_obj": thr_obj, "cv_score": float(cv_score)}

    if best is not None:
        # final training per user (cv adaptif dari train_pool)
        models_by_user = {}
        ok = True
        for uid in users:
            tp = per_user[uid]["train_pool"]
            tr_y = tp["y_bin"].astype(int).values
            mcc = min_class_count(tr_y)
            cv_k = min(3, mcc)
            if cv_k < 2:
                ok = False
                break
            base = LinearSVC(class_weight="balanced", random_state=RANDOM_STATE, C=best["C"])
            calib = CalibratedClassifierCV(estimator=base, method="sigmoid", cv=cv_k)
            pipe = Pipeline([("prep", preprocess), ("clf", calib)])
            pipe.fit(tp[feature_cols], tr_y)
            models_by_user[uid] = pipe

        if ok:
            pooled, macro = eval_personalized_models(models_by_user, best["thr_obj"])
            rows.append({
                "model": SVM_NAME,
                "cv_score": float(best["cv_score"]),
                "thr_obj": best["thr_obj"],
                "test_pooled_f1": float(pooled["f1"]),
                "test_pooled_acc": float(pooled["acc"]),
                "test_macro_f1": float(macro["f1"]),
                "test_macro_acc": float(macro["acc"]),
                "params": {"C": best["C"], "calibration_cv": "adaptive<=3"},
                "models_by_user": models_by_user,
            })
        else:
            print(f"\n[INFO] Skip {SVM_NAME}: final training tidak feasible untuk semua user (kelas minoritas terlalu sedikit).")
    else:
        print(f"\n[INFO] Skip {SVM_NAME}: tidak ada setting C yang valid untuk semua user/fold (karena fold kelas minoritas kecil).")
else:
    print(f"\n[INFO] Skip {SVM_NAME}: ada user train_pool tidak punya dua kelas, SVM calibrated tidak mungkin.")

# =========================
# 10) Leaderboard + choose best vs Markov
# =========================
print("\n=== PERSONALIZED LEADERBOARD ===")
print(f"Baseline-Persist | TEST pooled f1={persist_pooled['f1']:.4f} acc={persist_pooled['acc']:.4f} | macro(user) f1={persist_macro_f1:.4f} acc={persist_macro_acc:.4f}")
print(f"Baseline-Markov  | CV pooled f1={cv_f1_mk:.4f} thr={thr_mk:.2f} | TEST pooled f1={markov_pooled['f1']:.4f} acc={markov_pooled['acc']:.4f} | macro(user) f1={markov_macro_f1:.4f} acc={markov_macro_acc:.4f}")

rows_sorted = sorted(rows, key=lambda r: r["test_pooled_f1"], reverse=True)
for r in rows_sorted:
    thr_desc = "per-user" if isinstance(r["thr_obj"], dict) else f"{r['thr_obj']:.2f}"
    print(f"{r['model']:<22} | CV score={r['cv_score']:.4f} thr={thr_desc:<8} | "
          f"TEST pooled f1={r['test_pooled_f1']:.4f} acc={r['test_pooled_acc']:.4f} | "
          f"macro(user) f1={r['test_macro_f1']:.4f} acc={r['test_macro_acc']:.4f} | params={r['params']}")

best_name = "MarkovUser"
best_obj = {"type": "markov_user", "thr": float(thr_mk), "probs_by_user": mk_models}
best_test_pooled_f1 = float(markov_pooled["f1"])

if len(rows_sorted) > 0 and rows_sorted[0]["test_pooled_f1"] > best_test_pooled_f1:
    top = rows_sorted[0]
    best_name = top["model"]
    best_obj = {
        "type": "personalized_sklearn",
        "models_by_user": top["models_by_user"],
        "thr": top["thr_obj"],
        "meta": {"tune_threshold_per_user": TUNE_THRESHOLD_PER_USER}
    }

print("\n✅ SELECTED BEST PERSONALIZED:", best_name)
if best_name == "MarkovUser":
    print("Reason: baseline Markov masih best pada TEST pooled F1 (data per-user kecil -> Markov sering lebih stabil).")

MODEL_OUT.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(
    {
        "best_name": best_name,
        "artifact": best_obj,
        "meta": {
            "target": "y_bin = (stress_level_pred>=1)",
            "window": WINDOW,
            "test_len": TEST_LEN,
            "val_windows": VAL_WINDOWS,
            "thresholds": THRESHOLDS.tolist(),
            "users": users,
            "baseline_l1": "persistence(per-user)",
            "baseline_l2": "markov_user(prev_high, dow)",
            "use_user_id_feature": USE_USER_ID_FEATURE,
            "tune_threshold_per_user": TUNE_THRESHOLD_PER_USER,
        }
    },
    MODEL_OUT
)
print("Saved:", MODEL_OUT)


=== DATASET ===
Path: ..\datasets\global_dataset_pred.csv
Users: [1, 2, 3, 4, 5]
Rows: 260 | Binary dist: {1: 146, 0: 114}
WINDOW: 3 | TEST_LEN: 12 | USE_USER_ID_FEATURE: False
CV windows: [(10, 20), (15, 25)] | Tune thr per user: True

=== SPLIT ===
Total TrainPool: 200 | Total Test: 60

=== BASELINE L1: Persistence (per user) ===
TEST pooled: {'acc': 0.7166666666666667, 'f1': 0.7671232876712328} | TEST macro(user): {'acc': 0.7167, 'f1': 0.72}

=== BASELINE L2: Markov USER(prev_high, dow) ===
Best thr: 0.05 | CV pooled F1: 0.9189
TEST pooled: {'acc': 0.6333333333333333, 'f1': 0.7755102040816326} | TEST macro(user): {'acc': 0.6333, 'f1': 0.7642}

=== PERSONALIZED ML: TRAIN + TUNE (safe) ===





=== PERSONALIZED LEADERBOARD ===
Baseline-Persist | TEST pooled f1=0.7671 acc=0.7167 | macro(user) f1=0.7200 acc=0.7167
Baseline-Markov  | CV pooled f1=0.9189 thr=0.05 | TEST pooled f1=0.7755 acc=0.6333 | macro(user) f1=0.7642 acc=0.6333
BaggingTree            | CV score=0.9241 thr=per-user | TEST pooled f1=0.8675 acc=0.8167 | macro(user) f1=0.8373 acc=0.8167 | params={'clf__estimator__max_depth': 2, 'clf__estimator__min_samples_leaf': 1, 'clf__n_estimators': 100}
GradBoost              | CV score=0.8948 thr=per-user | TEST pooled f1=0.8636 acc=0.8000 | macro(user) f1=0.8506 acc=0.8000 | params={'clf__learning_rate': 0.03, 'clf__max_depth': 2, 'clf__n_estimators': 100}
ExtraTrees             | CV score=0.9187 thr=per-user | TEST pooled f1=0.8571 acc=0.8000 | macro(user) f1=0.8263 acc=0.8000 | params={'clf__max_depth': None, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 2, 'clf__n_estimators': 200}
RandomForest           | CV score=0.9298 thr=per-user | TEST pooled f1=0.8000 ac