In [14]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import ParameterGrid

from xgboost import XGBClassifier

# =========================
# CONFIG
# =========================
INPUT_FILE  = "train_features_v2.parquet"
TIME_COL    = "snapshot_time"
TARGET_COL  = "target"
ID_COL      = "userId"

BUNDLE_FILE = "bundle_ensemble_v2.pkl"

RANDOM_SEEDS = [7, 42, 2025]    # 3 seeds pour un mini-ensemble

In [15]:
# =========================
# LOAD
# =========================
df = pd.read_parquet(INPUT_FILE)
df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors="coerce")

df = df.sort_values(TIME_COL).reset_index(drop=True)
print("‚úÖ Loaded:", df.shape)
print("Unique snapshot_time:", df[TIME_COL].nunique(), "->", sorted(df[TIME_COL].unique())[:3], "...")

# train/valid = dernier snapshot_time (pour coller au test qui ressemble au plus r√©cent)
last_t0 = df[TIME_COL].max()
tr_mask = df[TIME_COL] < last_t0
va_mask = df[TIME_COL] == last_t0

print("last_t0:", last_t0)
print("train rows:", int(tr_mask.sum()), "valid rows:", int(va_mask.sum()))

y_tr = df.loc[tr_mask, TARGET_COL].astype(int)
y_va = df.loc[va_mask, TARGET_COL].astype(int)

# drop id/target/time
drop_cols = [c for c in [TARGET_COL, TIME_COL, ID_COL] if c in df.columns]
X_all = df.drop(columns=drop_cols)

# one-hot for categoricals
cat_cols = X_all.select_dtypes(include=["object","category"]).columns.tolist()
if cat_cols:
    X_all = pd.get_dummies(X_all, columns=cat_cols, dummy_na=True)

# bool -> int
for c in X_all.select_dtypes(include=["bool"]).columns:
    X_all[c] = X_all[c].astype(int)

X_all = X_all.replace([np.inf, -np.inf], np.nan).fillna(0)

X_tr = X_all.loc[tr_mask]
X_va = X_all.loc[va_mask]

print("X_tr:", X_tr.shape, "X_va:", X_va.shape)

expected_cols = list(X_all.columns)

‚úÖ Loaded: (75863, 83)
Unique snapshot_time: 5 -> [Timestamp('2018-10-11 00:00:01'), Timestamp('2018-10-18 00:00:01'), Timestamp('2018-10-25 00:00:01')] ...
last_t0: 2018-11-08 00:00:01
train rows: 60539 valid rows: 15324
X_tr: (60539, 84) X_va: (15324, 84)


In [16]:
# =========================
# Petit search cibl√© (rapide) sur XGBoost via xgboost.train (compatible vieux xgboost)
# √âval sur DERNIER snapshot + tuning du threshold sur la VALID (Balanced Accuracy)
# =========================

import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb

pos = int(y_tr.sum())
neg = int(len(y_tr) - pos)
ratio = neg / max(pos, 1)
print(f"Train pos={pos} neg={neg} ratio={ratio:.2f}")

param_grid = {
    "max_depth": [4, 5, 6],
    "min_child_weight": [1, 3, 5],
    "subsample": [0.8, 0.9],
    "colsample_bytree": [0.7, 0.85],
    "learning_rate": [0.03, 0.05],      # -> eta
    "reg_lambda": [1.0, 3.0],          # -> lambda
    "reg_alpha": [0.0, 0.1],           # -> alpha
    "scale_pos_weight": [1.0, 5.0, 10.0, float(ratio)],
}

# On √©chantillonne 25 configs
all_params = list(ParameterGrid(param_grid))
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(all_params), size=min(25, len(all_params)), replace=False)
sampled = [all_params[i] for i in sample_idx]

def best_threshold_for_ba(y_true, proba):
    thresholds = np.linspace(0.05, 0.95, 181)
    scores = [balanced_accuracy_score(y_true, (proba >= t).astype(int)) for t in thresholds]
    best_i = int(np.argmax(scores))
    return float(thresholds[best_i]), float(scores[best_i])

def predict_best(booster, dmat):
    # robust cross-version
    if hasattr(booster, "best_ntree_limit") and booster.best_ntree_limit:
        return booster.predict(dmat, ntree_limit=booster.best_ntree_limit)
    if hasattr(booster, "best_iteration") and booster.best_iteration is not None:
        try:
            return booster.predict(dmat, iteration_range=(0, booster.best_iteration + 1))
        except TypeError:
            return booster.predict(dmat, ntree_limit=booster.best_iteration + 1)
    return booster.predict(dmat)

# DMatrix
dtrain = xgb.DMatrix(X_tr.values, label=y_tr.values, missing=np.nan)
dvalid = xgb.DMatrix(X_va.values, label=y_va.values, missing=np.nan)

best = {"score": -1, "threshold": 0.5, "params": None, "best_rounds": None}

for k, p in enumerate(sampled, 1):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",  # on early-stop sur logloss (stable), et on optimise BA via threshold
        "tree_method": "hist",
        "seed": 42,

        "max_depth": p["max_depth"],
        "min_child_weight": p["min_child_weight"],
        "subsample": p["subsample"],
        "colsample_bytree": p["colsample_bytree"],
        "eta": p["learning_rate"],
        "lambda": p["reg_lambda"],
        "alpha": p["reg_alpha"],
        "scale_pos_weight": p["scale_pos_weight"],
    }

    booster = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=5000,
        evals=[(dvalid, "valid")],
        early_stopping_rounds=200,
        verbose_eval=False
    )

    proba_va = predict_best(booster, dvalid)
    th, sc = best_threshold_for_ba(y_va.values, proba_va)

    # best rounds
    best_rounds = None
    if hasattr(booster, "best_iteration") and booster.best_iteration is not None:
        best_rounds = int(booster.best_iteration) + 1
    elif hasattr(booster, "best_ntree_limit") and booster.best_ntree_limit:
        best_rounds = int(booster.best_ntree_limit)

    if sc > best["score"]:
        best = {"score": sc, "threshold": th, "params": p, "best_rounds": best_rounds}
        print(f"üèÜ New best ({k}/{len(sampled)}): BA={sc:.5f} @th={th:.3f} best_rounds={best_rounds} params={p}")

print("\n‚úÖ Best config:", best)


Train pos=3208 neg=57331 ratio=17.87
üèÜ New best (1/25): BA=0.64962 @th=0.060 best_rounds=3857 params={'colsample_bytree': 0.85, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 1, 'reg_alpha': 0.1, 'reg_lambda': 1.0, 'scale_pos_weight': 17.871259351620946, 'subsample': 0.9}
üèÜ New best (2/25): BA=0.65424 @th=0.050 best_rounds=3112 params={'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 3, 'reg_alpha': 0.0, 'reg_lambda': 3.0, 'scale_pos_weight': 5.0, 'subsample': 0.8}
üèÜ New best (3/25): BA=0.66148 @th=0.060 best_rounds=3827 params={'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 3, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'scale_pos_weight': 10.0, 'subsample': 0.8}
üèÜ New best (5/25): BA=0.68143 @th=0.065 best_rounds=549 params={'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 1, 'reg_alpha': 0.1, 'reg_lambda': 1.0, 'scale_pos_weight': 1.0, 'subsample': 0.9}
üèÜ Ne

In [17]:
# =========================
# ENSEMBLE (full + recent) + weight search + threshold tuning + SAVE MODELS
# AUTO-CONTAINED: recr√©e df, X, y si besoin (compatible vieux xgboost)
# =========================

import os, json
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import balanced_accuracy_score

# --------- A ADAPTER SI BESOIN ----------
INPUT_FILE = "train_features_v2.parquet"   # <-- mets le bon nom ici
TIME_COL   = "snapshot_time"
TARGET_COL = "target"
ID_COLS    = ["userId"]
OUT_DIR    = "xgb_ensemble_v3"
# ---------------------------------------

# Si "best" n'existe pas (kernel restart), recolle ici ton best config:
if "best" not in globals():
    best = {
        "score": 0.6898267046818989,
        "threshold": 0.06,
        "params": {
            "colsample_bytree": 0.7,
            "learning_rate": 0.05,
            "max_depth": 6,
            "min_child_weight": 1,
            "reg_alpha": 0.1,
            "reg_lambda": 1.0,
            "scale_pos_weight": 1.0,
            "subsample": 0.8
        },
        "best_rounds": 150
    }

# ---------- Load df ----------
if not os.path.exists(INPUT_FILE):
    print(f"‚ùå {INPUT_FILE} introuvable. Parquets dispo:")
    for f in os.listdir("."):
        if f.endswith(".parquet"):
            print(" -", f)
    raise FileNotFoundError(INPUT_FILE)

df = pd.read_parquet(INPUT_FILE)
df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors="coerce")
df = df.sort_values(TIME_COL).reset_index(drop=True)

if TARGET_COL not in df.columns:
    raise KeyError(f"'{TARGET_COL}' manquant dans df. Colonnes: {list(df.columns)[:50]} ...")

y = df[TARGET_COL].astype(int)

# ---------- Build X (datetime -> age_days, object/category -> one-hot) ----------
def make_X(df_in: pd.DataFrame) -> pd.DataFrame:
    d = df_in.copy()
    d[TIME_COL] = pd.to_datetime(d[TIME_COL], errors="coerce")

    # convertit toutes les colonnes datetime (sauf snapshot_time) en *_age_days
    dt_cols = d.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]", "datetimetz"]).columns.tolist()
    for col in dt_cols:
        if col == TIME_COL:
            continue
        d[col] = pd.to_datetime(d[col], errors="coerce")
        d[f"{col}_age_days"] = (d[TIME_COL] - d[col]).dt.total_seconds() / 86400.0
        d.drop(columns=[col], inplace=True)

    drop_cols = [c for c in ([TARGET_COL, TIME_COL] + ID_COLS) if c in d.columns]
    X_ = d.drop(columns=drop_cols)

    # bool -> int
    for c in X_.select_dtypes(include=["bool"]).columns:
        X_[c] = X_[c].astype(int)

    # object/category -> one-hot
    cat_cols = X_.select_dtypes(include=["object", "category"]).columns.tolist()
    if len(cat_cols) > 0:
        X_ = pd.get_dummies(X_, columns=cat_cols, dummy_na=True)

    X_ = X_.replace([np.inf, -np.inf], np.nan).fillna(0)
    return X_

X = make_X(df)

print("‚úÖ df:", df.shape, "| X:", X.shape, "| y:", y.shape)
print("Unique snapshot_time:", df[TIME_COL].nunique())

# ---------- Utils ----------
def predict_best(booster, dmat):
    if hasattr(booster, "best_ntree_limit") and booster.best_ntree_limit:
        return booster.predict(dmat, ntree_limit=booster.best_ntree_limit)
    if hasattr(booster, "best_iteration") and booster.best_iteration is not None:
        try:
            return booster.predict(dmat, iteration_range=(0, booster.best_iteration + 1))
        except TypeError:
            return booster.predict(dmat, ntree_limit=booster.best_iteration + 1)
    return booster.predict(dmat)

def best_threshold_for_ba(y_true, proba):
    thresholds = np.linspace(0.01, 0.99, 199)
    scores = [balanced_accuracy_score(y_true, (proba >= t).astype(int)) for t in thresholds]
    bi = int(np.argmax(scores))
    return float(thresholds[bi]), float(scores[bi])

# ---------- Split FULL/RECENT/VALID ----------
times = df[TIME_COL].dropna().sort_values().unique()
if len(times) < 4:
    raise ValueError(f"Pas assez de snapshots ({len(times)}). Il en faut au moins 4.")

last_time = times[-1]
train_times = set(times[:-1])

is_train = df[TIME_COL].isin(train_times).values
is_valid = (df[TIME_COL] == last_time).values

X_tr_full, y_tr_full = X.loc[is_train], y.loc[is_train]
X_va, y_va = X.loc[is_valid], y.loc[is_valid]

# recent = les 2 snapshots avant le dernier
recent_train_times = set(times[-3:-1])
is_train_recent = df[TIME_COL].isin(recent_train_times).values
X_tr_recent, y_tr_recent = X.loc[is_train_recent], y.loc[is_train_recent]

print("FULL train:", X_tr_full.shape, "RECENT train:", X_tr_recent.shape, "VALID:", X_va.shape)

dtr_full   = xgb.DMatrix(X_tr_full.values, label=y_tr_full.values, missing=np.nan)
dtr_recent = xgb.DMatrix(X_tr_recent.values, label=y_tr_recent.values, missing=np.nan)
dva        = xgb.DMatrix(X_va.values, label=y_va.values, missing=np.nan)

# ---------- Params from best ----------
bp = best["params"]
base = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
    "max_depth": bp["max_depth"],
    "min_child_weight": bp["min_child_weight"],
    "subsample": bp["subsample"],
    "colsample_bytree": bp["colsample_bytree"],
    "eta": bp["learning_rate"],
    "lambda": bp["reg_lambda"],
    "alpha": bp["reg_alpha"],
    "scale_pos_weight": float(bp["scale_pos_weight"]),
}

# ---------- Train multi-seeds FULL + RECENT ----------
seeds = [7, 13, 21, 42, 99]
full_models, recent_models = [], []

for seed in seeds:
    p = dict(base); p["seed"] = int(seed)

    b_full = xgb.train(
        params=p,
        dtrain=dtr_full,
        num_boost_round=5000,
        evals=[(dva, "valid")],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    full_models.append(b_full)

    b_recent = xgb.train(
        params=p,
        dtrain=dtr_recent,
        num_boost_round=5000,
        evals=[(dva, "valid")],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    recent_models.append(b_recent)

print("‚úÖ trained", len(full_models), "full +", len(recent_models), "recent")

p_full = np.mean([predict_best(m, dva) for m in full_models], axis=0)
p_recent = np.mean([predict_best(m, dva) for m in recent_models], axis=0)

# ---------- Weight search FULL vs RECENT ----------
weights = np.linspace(0.0, 1.0, 11)
best_combo = {"ba": -1, "w_full": None, "th": None}

for w in weights:
    p_mix = w * p_full + (1 - w) * p_recent
    th, ba = best_threshold_for_ba(y_va.values, p_mix)
    if ba > best_combo["ba"]:
        best_combo = {"ba": float(ba), "w_full": float(w), "th": float(th)}

print("\nüèÜ BEST MIX:", best_combo, "(w_full = part FULL)")

# ---------- Save models (no pickle) ----------
os.makedirs(OUT_DIR, exist_ok=True)

paths_full, paths_recent = [], []
for i, m in enumerate(full_models):
    path = os.path.join(OUT_DIR, f"full_seed{i}.model")
    m.save_model(path)
    paths_full.append(path)

for i, m in enumerate(recent_models):
    path = os.path.join(OUT_DIR, f"recent_seed{i}.model")
    m.save_model(path)
    paths_recent.append(path)

bundle = {
    "features": list(X.columns),
    "threshold": best_combo["th"],
    "w_full": best_combo["w_full"],
    "model_paths_full": paths_full,
    "model_paths_recent": paths_recent,
}

with open(os.path.join(OUT_DIR, "bundle.json"), "w") as f:
    json.dump(bundle, f)

print("‚úÖ Saved bundle:", os.path.join(OUT_DIR, "bundle.json"))

‚úÖ df: (75863, 83) | X: (75863, 84) | y: (75863,)
Unique snapshot_time: 5
FULL train: (60539, 84) RECENT train: (31112, 84) VALID: (15324, 84)
‚úÖ trained 5 full + 5 recent

üèÜ BEST MIX: {'ba': 0.68924716159239, 'w_full': 1.0, 'th': 0.059494949494949496} (w_full = part FULL)
‚úÖ Saved bundle: xgb_ensemble_v3/bundle.json


  m.save_model(path)
  m.save_model(path)


In [18]:
# =========================
# ENSEMBLE V4: multi-config + multi-seed (xgboost.train, compatible vieux xgboost)
# Sauvegarde mod√®les + bundle.json (pas de pickle)
# =========================

import os, json
import numpy as np
import pandas as pd
import xgboost as xgb

# ---- A ADAPTER ----
INPUT_FILE = "train_features_v2.parquet"   # <- mets le bon nom (ex: train_features_multisnapshot.parquet)
TIME_COL   = "snapshot_time"
TARGET_COL = "target"
ID_COLS    = ["userId"]
OUT_DIR    = "xgb_ensemble_v4"
# -------------------

# ---------- Load ----------
if not os.path.exists(INPUT_FILE):
    print(f"‚ùå {INPUT_FILE} introuvable. Parquets dispo:")
    for f in os.listdir("."):
        if f.endswith(".parquet"):
            print(" -", f)
    raise FileNotFoundError(INPUT_FILE)

df = pd.read_parquet(INPUT_FILE)
df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors="coerce")
df = df.sort_values(TIME_COL).reset_index(drop=True)

y = df[TARGET_COL].astype(int)

# ---------- Build X (datetime -> age_days, object/category -> one-hot) ----------
def make_X(df_in: pd.DataFrame) -> pd.DataFrame:
    d = df_in.copy()
    d[TIME_COL] = pd.to_datetime(d[TIME_COL], errors="coerce")

    # datetime -> age_days
    dt_cols = d.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]", "datetimetz"]).columns.tolist()
    for col in dt_cols:
        if col == TIME_COL:
            continue
        d[col] = pd.to_datetime(d[col], errors="coerce")
        d[f"{col}_age_days"] = (d[TIME_COL] - d[col]).dt.total_seconds() / 86400.0
        d.drop(columns=[col], inplace=True)

    drop_cols = [c for c in ([TARGET_COL, TIME_COL] + ID_COLS) if c in d.columns]
    X_ = d.drop(columns=drop_cols)

    for c in X_.select_dtypes(include=["bool"]).columns:
        X_[c] = X_[c].astype(int)

    cat_cols = X_.select_dtypes(include=["object", "category"]).columns.tolist()
    if len(cat_cols) > 0:
        X_ = pd.get_dummies(X_, columns=cat_cols, dummy_na=True)

    X_ = X_.replace([np.inf, -np.inf], np.nan).fillna(0)
    return X_

X = make_X(df)

print("‚úÖ df:", df.shape, "| X:", X.shape, "| y:", y.shape)
print("Unique snapshot_time:", df[TIME_COL].nunique())

# ---------- Configs (repris de tes meilleurs prints) ----------
# best config (BA=0.6898, rounds=150)
cfg_best = {
    "name": "best_150",
    "params": {
        "max_depth": 6,
        "min_child_weight": 1,
        "subsample": 0.8,
        "colsample_bytree": 0.7,
        "eta": 0.05,
        "lambda": 1.0,
        "alpha": 0.1,
        "scale_pos_weight": 1.0,
    },
    "rounds": 150,
    "seeds": [7, 13, 21, 42, 99],
}

# autre config forte (BA=0.6814, rounds=549)
cfg_alt = {
    "name": "alt_549",
    "params": {
        "max_depth": 6,
        "min_child_weight": 1,
        "subsample": 0.9,
        "colsample_bytree": 0.7,
        "eta": 0.03,
        "lambda": 1.0,
        "alpha": 0.1,
        "scale_pos_weight": 1.0,
    },
    "rounds": 549,
    "seeds": [7, 42, 99],   # moins de seeds pour aller vite
}

# config ‚Äúdiff√©rente‚Äù (BA=0.6542, spw=5, rounds=3112) ‚Äî diversit√©
cfg_div = {
    "name": "div_1200cap",
    "params": {
        "max_depth": 5,
        "min_child_weight": 3,
        "subsample": 0.8,
        "colsample_bytree": 0.7,
        "eta": 0.05,
        "lambda": 3.0,
        "alpha": 0.0,
        "scale_pos_weight": 5.0,
    },
    "rounds": 1200,         # cap pour temps (au lieu de 3112)
    "seeds": [13, 42],
}

configs = [cfg_best, cfg_alt, cfg_div]

# ---------- Train all models on ALL data ----------
dtrain = xgb.DMatrix(X.values, label=y.values, missing=np.nan)

base = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
}

os.makedirs(OUT_DIR, exist_ok=True)

model_paths = []
model_meta = []

for cfg in configs:
    for seed in cfg["seeds"]:
        params = dict(base)
        params.update(cfg["params"])
        params["seed"] = int(seed)

        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=int(cfg["rounds"]),
            verbose_eval=False
        )

        path = os.path.join(OUT_DIR, f"{cfg['name']}_seed{seed}.model")
        booster.save_model(path)

        model_paths.append(path)
        model_meta.append({"name": cfg["name"], "seed": int(seed), "rounds": int(cfg["rounds"]), "params": cfg["params"]})

        print("‚úÖ saved", path)

bundle = {
    "features": list(X.columns),
    "model_paths": model_paths,
    "model_meta": model_meta
}

with open(os.path.join(OUT_DIR, "bundle.json"), "w") as f:
    json.dump(bundle, f)

print("\n‚úÖ Bundle saved:", os.path.join(OUT_DIR, "bundle.json"))
print("Models:", len(model_paths))


‚úÖ df: (75863, 83) | X: (75863, 84) | y: (75863,)
Unique snapshot_time: 5


  booster.save_model(path)


‚úÖ saved xgb_ensemble_v4/best_150_seed7.model


  booster.save_model(path)


‚úÖ saved xgb_ensemble_v4/best_150_seed13.model


  booster.save_model(path)


‚úÖ saved xgb_ensemble_v4/best_150_seed21.model


  booster.save_model(path)


‚úÖ saved xgb_ensemble_v4/best_150_seed42.model


  booster.save_model(path)


‚úÖ saved xgb_ensemble_v4/best_150_seed99.model


  booster.save_model(path)


‚úÖ saved xgb_ensemble_v4/alt_549_seed7.model


  booster.save_model(path)


‚úÖ saved xgb_ensemble_v4/alt_549_seed42.model


  booster.save_model(path)


‚úÖ saved xgb_ensemble_v4/alt_549_seed99.model


  booster.save_model(path)


‚úÖ saved xgb_ensemble_v4/div_1200cap_seed13.model
‚úÖ saved xgb_ensemble_v4/div_1200cap_seed42.model

‚úÖ Bundle saved: xgb_ensemble_v4/bundle.json
Models: 10


  booster.save_model(path)
