# Dataset Interview：XGBoost 训练与部署速查（现场用）

本 notebook 用于在现场快速完成：  
- 数据加载与快速检查  
- 时序切分 / 验证  
- 特征工程（lag / rolling / 日历特征）  
- XGBoost 训练（early stopping）  
- 误差诊断与可解释性  
- 模型保存与复现

> 目标：用最少步骤跑出可靠 baseline，然后做 2–3 次有方向的参数试验并能解释。


In [None]:
# ====== Environment self-check ======
# NOTE: run this cell even when the printed diagnostics are not needed -- `os` and
# `time` imported here are used by later cells (data loading, timing, saving).
import os
import platform
import sys
import time

print("python:", sys.version.split()[0])
print("platform:", platform.platform())


## 0. 依赖导入 & 全局设置
- 树模型不依赖标准化  
- 缺失值保留为 NaN，XGBoost 可直接处理  
- 类别特征默认 one-hot（高基数先做合并/截断）


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, log_loss
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt

# xgboost
import xgboost as xgb

# Let pandas print up to 200 columns and use up to 140 characters per line.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

# One seed for the whole notebook; it is applied to NumPy's global RNG here.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


## 1. 数据读取（按实际文件改路径/格式）
约定：
- 时间列：`time_col`
- 目标列：`target_col`
- 可选 ID 列：`id_col`（如资产/站点/用户等）


In [None]:
# ====== Edit here ======
DATA_PATH = "data.csv"      # e.g., "train.parquet" / "train.csv"
time_col = "timestamp"
target_col = "y"
id_col = None               # e.g., "asset_id"; set to None when there is no ID column

# ====== Load ======
def load_data(path: str) -> pd.DataFrame:
    """Read a tabular file into a DataFrame, choosing the reader by file extension.

    The extension is compared case-insensitively. Supported: ``.parquet``,
    ``.csv`` and ``.feather``.

    Raises:
        ValueError: if the extension is not one of the supported ones.
    """
    _, suffix = os.path.splitext(path)
    ext = suffix.lower()

    # Extension -> pandas reader; each reader is called with the path only.
    readers = {
        ".parquet": pd.read_parquet,
        ".csv": pd.read_csv,
        ".feather": pd.read_feather,
    }
    reader = readers.get(ext)
    if reader is None:
        raise ValueError(f"Unsupported file type: {ext}")
    return reader(path)

# Load the dataset, print its (rows, cols) shape and preview the first rows.
df = load_data(DATA_PATH)
print(df.shape)
df.head()


## 2. 快速体检：类型/缺失/重复/目标分布


In [None]:
def quick_profile(df: pd.DataFrame, target_col: str | None = None):
    out = {}
    out["n_rows"] = len(df)
    out["n_cols"] = df.shape[1]
    out["dup_rows"] = int(df.duplicated().sum())
    na_rate = df.isna().mean().sort_values(ascending=False)
    out["top_na"] = na_rate.head(10)
    dtypes = df.dtypes.astype(str).value_counts()
    out["dtypes"] = dtypes
    if target_col and target_col in df.columns:
        y = df[target_col]
        out["target_na"] = float(y.isna().mean())
        if pd.api.types.is_numeric_dtype(y):
            out["target_desc"] = y.describe()
        else:
            out["target_value_counts"] = y.value_counts(dropna=False).head(10)
    return out

# Profile the raw frame and print the headline numbers: size, duplicate rows,
# column count per dtype, and the columns with the largest missing share.
prof = quick_profile(df, target_col)
print("rows/cols:", prof["n_rows"], prof["n_cols"])
print("dup_rows:", prof["dup_rows"])
print("\n== dtype counts ==")
print(prof["dtypes"])
print("\n== top missing columns ==")
print(prof["top_na"])


In [None]:
# Parse the time column to pandas datetime (adjust the format to the actual data).
df[time_col] = pd.to_datetime(df[time_col], errors="coerce")

# errors="coerce" turns unparseable values into NaT. Sorting places NaT last, so such
# rows would silently end up in the final (test) slice of a positional time split.
# Report them and drop them instead.
n_bad_time = int(df[time_col].isna().sum())
if n_bad_time:
    print("dropping rows with unparseable time:", n_bad_time)
    df = df[df[time_col].notna()]

# Stable sort: rows that share a timestamp keep their original relative order.
df = df.sort_values(time_col, kind="mergesort").reset_index(drop=True)

print("time min/max:", df[time_col].min(), df[time_col].max())


### 2.1 目标列快速图（回归）


In [None]:
if target_col in df.columns and pd.api.types.is_numeric_dtype(df[target_col]):
    # Histogram of the raw target values.
    plt.figure()
    _target = df[target_col]
    _target.hist(bins=50)
    plt.title("Target distribution")
    plt.show()

    # Coarse look at the trend over time: 100-row rolling mean, indexed by time.
    plt.figure()
    _target_by_time = df.set_index(time_col)[target_col]
    _smoothed = _target_by_time.rolling(100).mean()
    _smoothed.plot()
    plt.title("Target rolling mean (window=100)")
    plt.show()


## 3. 时序切分：Train / Valid / Test
- 不 shuffle  
- 用最后一段做 valid/test，或用 walk-forward 做 CV


In [None]:
# ====== Edit here: set the split fractions (or switch to fixed dates) per the task ======
train_frac = 0.7
valid_frac = 0.15

# Row positions where the train part and the valid part end; the rest is test.
n = len(df)
i_train_end, i_valid_end = (int(n * _frac) for _frac in (train_frac, train_frac + valid_frac))

# Consecutive positional slices of the time-sorted frame, each copied.
_bounds = (
    slice(None, i_train_end),
    slice(i_train_end, i_valid_end),
    slice(i_valid_end, None),
)
df_train, df_valid, df_test = (df.iloc[_rows].copy() for _rows in _bounds)

print("train/valid/test:", df_train.shape, df_valid.shape, df_test.shape)
for _label, _part in (("train time:", df_train), ("valid time:", df_valid), ("test  time:", df_test)):
    print(_label, _part[time_col].min(), "->", _part[time_col].max())


## 4. 特征工程（通用 + 时序）
### 4.1 日历特征（时间戳）


In [None]:
def add_time_features(df: pd.DataFrame, time_col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with calendar features derived from ``time_col``.

    Added columns: ``hour``, ``dayofweek``, ``day``, ``month`` and the 0/1 flags
    ``is_month_start`` / ``is_month_end``. The input frame is not modified.
    ``time_col`` must already be a datetime column (the ``.dt`` accessor is used).
    """
    stamps = df[time_col].dt
    calendar = {
        "hour": stamps.hour,
        "dayofweek": stamps.dayofweek,
        "day": stamps.day,
        "month": stamps.month,
        "is_month_start": stamps.is_month_start.astype(int),
        "is_month_end": stamps.is_month_end.astype(int),
    }

    enriched = df.copy()
    for feature_name, values in calendar.items():
        enriched[feature_name] = values
    return enriched

# Derive the calendar features for each split; the *_fe frames are the ones
# carried through the rest of the feature engineering.
df_train_fe = add_time_features(df_train, time_col)
df_valid_fe = add_time_features(df_valid, time_col)
df_test_fe  = add_time_features(df_test,  time_col)


### 4.2 Lag / Rolling（按实体分组或全局）
- 若存在 `id_col`：按实体分组做 lag/rolling  
- 否则：视为单序列


In [None]:
# ====== Edit here: choose the numeric columns that get lag/rolling features ======
# Default: every numeric column other than the time / target / id columns.
exclude_cols = {time_col, target_col} | ({id_col} if id_col else set())

numeric_cols = [
    c
    for c in df.columns
    if c not in exclude_cols and pd.api.types.is_numeric_dtype(df[c])
]
print("numeric feature cols:", len(numeric_cols))
numeric_cols[:20]


In [None]:
def add_lag_rolling(df: pd.DataFrame, time_col: str, cols: list[str], id_col: str | None = None,
                    lags=(1,2,3,5,10), windows=(5,10,20)):
    out = df.sort_values([id_col, time_col] if id_col else [time_col]).copy()
    grp = out.groupby(id_col, sort=False) if id_col else [(None, out)]

    for col in cols:
        if id_col:
            g = out.groupby(id_col, sort=False)[col]
        else:
            g = out[col]

        for L in lags:
            out[f"{col}_lag{L}"] = g.shift(L)

        for W in windows:
            # rolling stats on shifted series to avoid leakage at time t
            s = g.shift(1)
            if id_col:
                out[f"{col}_rmean{W}"] = s.groupby(out[id_col]).rolling(W, min_periods=max(2, W//3)).mean().reset_index(level=0, drop=True)
                out[f"{col}_rstd{W}"]  = s.groupby(out[id_col]).rolling(W, min_periods=max(2, W//3)).std().reset_index(level=0, drop=True)
            else:
                out[f"{col}_rmean{W}"] = s.rolling(W, min_periods=max(2, W//3)).mean()
                out[f"{col}_rstd{W}"]  = s.rolling(W, min_periods=max(2, W//3)).std()
    return out

# Build the lag/rolling features on the three splits stitched back together, so the
# first rows of valid/test can look back into the rows that precede them in time
# instead of starting with an empty (NaN) history. The features only read rows that
# come earlier in the sort order, so no information flows backwards from later splits.
# verify_integrity guards the assumption that the splits have disjoint index labels.
_split_index = {
    "train": df_train_fe.index,
    "valid": df_valid_fe.index,
    "test": df_test_fe.index,
}
_all_fe = add_lag_rolling(
    pd.concat([df_train_fe, df_valid_fe, df_test_fe], verify_integrity=True),
    time_col, numeric_cols, id_col=id_col,
)

# Cut the combined frame back into the original splits by index label.
df_train_fe = _all_fe.loc[_all_fe.index.isin(_split_index["train"])].copy()
df_valid_fe = _all_fe.loc[_all_fe.index.isin(_split_index["valid"])].copy()
df_test_fe  = _all_fe.loc[_all_fe.index.isin(_split_index["test"])].copy()

print(df_train_fe.shape, df_valid_fe.shape, df_test_fe.shape)


### 4.3 清理：丢弃 target 为空的行（lag/rolling 产生的特征 NaN 保留，训练/验证要一致）


In [None]:
# 以 target 非空为基本要求；同时允许特征存在 NaN（XGB 可处理）
def clean_frame(df: pd.DataFrame, target_col: str):
    """Return a copy of ``df`` restricted to rows whose target is present.

    Only the target column is checked; NaN values in other columns are kept.
    The input frame is not modified.
    """
    has_target = df[target_col].notna()
    return df.loc[has_target].copy()

# Apply the same target-not-null filter to every split.
df_train_fe = clean_frame(df_train_fe, target_col)
df_valid_fe = clean_frame(df_valid_fe, target_col)
df_test_fe  = clean_frame(df_test_fe,  target_col)

print("after clean:", df_train_fe.shape, df_valid_fe.shape, df_test_fe.shape)


## 5. 特征列整理：数值 + 类别


In [None]:
# Columns that are never used as features.
base_exclude = {target_col, time_col}
if id_col:
    base_exclude.add(id_col)

feat_cols = [c for c in df_train_fe.columns if c not in base_exclude]

# Categorical columns: object / string / category / bool.
# The pandas dtype helpers are used so that columns stored with a dedicated string
# dtype (which does not compare equal to "object" and is not numeric) are treated as
# categorical instead of falling through both lists.
cat_cols = [
    c for c in feat_cols
    if (
        pd.api.types.is_object_dtype(df_train_fe[c])
        or pd.api.types.is_string_dtype(df_train_fe[c])
        or isinstance(df_train_fe[c].dtype, pd.CategoricalDtype)
        or df_train_fe[c].dtype == "bool"
    )
]

# Numeric columns: whatever is numeric and not already categorical.
_cat_set = set(cat_cols)
num_cols = [c for c in feat_cols if c not in _cat_set and pd.api.types.is_numeric_dtype(df_train_fe[c])]

# Anything left over (for example extra datetime columns) is in neither list;
# name it here so the omission is visible.
_num_set = set(num_cols)
_ignored_cols = [c for c in feat_cols if c not in _cat_set and c not in _num_set]
if _ignored_cols:
    print("ignored (neither numeric nor categorical):", _ignored_cols)

print("features:", len(feat_cols), "num:", len(num_cols), "cat:", len(cat_cols))


## 6. 建模任务类型
- 回归：`task = "reg"`  
- 二分类：`task = "clf_bin"`（y 取 {0,1}）  
- 多分类：`task = "clf_multi"`（y 为 0..K-1）


In [None]:
# Problem type switch; the three accepted values are listed on the right.
task = "reg"   # "reg" / "clf_bin" / "clf_multi"


## 7. Pipeline：one-hot + XGBoost（Sklearn API）
- 现场优先用 Pipeline，便于保存与复现  
- XGBoost 接受稀疏矩阵输入


In [None]:
# Column-wise preprocessing: numeric columns pass through unchanged, categorical
# columns are one-hot encoded into a sparse matrix (categories unseen during fit
# are ignored at transform time). Columns in neither list are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
    ],
    remainder="drop",
)

def build_model(task: str):
    """Create the untrained XGBoost estimator for the given task.

    Args:
        task: ``"reg"`` (regressor), ``"clf_bin"`` (classifier evaluated with
            logloss) or ``"clf_multi"`` (``multi:softprob`` classifier evaluated
            with mlogloss).

    Returns:
        An ``xgb.XGBRegressor`` or ``xgb.XGBClassifier``; all three variants share
        the same tree, sampling and regularisation settings.

    Raises:
        ValueError: if ``task`` is not one of the three supported values.
    """
    if task not in ("reg", "clf_bin", "clf_multi"):
        raise ValueError(task)

    # Hyperparameters common to every task.
    shared = dict(
        n_estimators=3000,
        learning_rate=0.03,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=5,
        reg_alpha=0.0,
        reg_lambda=1.0,
        gamma=0.0,
        tree_method="hist",
        random_state=RANDOM_SEED,
        n_jobs=-1,
    )

    if task == "reg":
        return xgb.XGBRegressor(**shared)
    if task == "clf_bin":
        return xgb.XGBClassifier(**shared, eval_metric="logloss")
    # clf_multi: the number of classes is not set here; it is left to be
    # inferred from y when the model is fitted.
    return xgb.XGBClassifier(
        **shared,
        objective="multi:softprob",
        eval_metric="mlogloss",
    )

# Estimator for the selected task.
model = build_model(task)

# Preprocessing and model chained together so they are fitted and saved as one object.
pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", model)
])

# Feature matrices / targets per split; column selection happens inside `prep`.
X_train = df_train_fe[feat_cols]
y_train = df_train_fe[target_col]
X_valid = df_valid_fe[feat_cols]
y_valid = df_valid_fe[target_col]
X_test  = df_test_fe[feat_cols]
y_test  = df_test_fe[target_col]

print(X_train.shape, X_valid.shape, X_test.shape)


## 8. 训练（early stopping）
XGBoost 的 early stopping 需要把 eval_set 传进模型的 fit。  
Pipeline 下通过 `model__` 前缀传参；注意 Pipeline 只会对 `X_train` 做预处理，`eval_set` 会原样传给 XGBoost，因此其中的验证特征必须先经过同一预处理器 transform。


In [None]:
from sklearn.base import clone

# Early stopping is configured on the estimator: passing `early_stopping_rounds` to
# fit() is deprecated in XGBoost and rejected by newer releases.
pipe.set_params(model__early_stopping_rounds=200)

# Pipeline.fit only runs X_train through the preprocessing step; whatever is placed in
# eval_set reaches XGBoost untouched. Encode the validation features with a separately
# fitted copy of the preprocessor so they have the same columns as the training matrix.
_prep_for_eval = clone(pipe.named_steps["prep"]).fit(X_train, y_train)
X_valid_enc = _prep_for_eval.transform(X_valid)

fit_params = dict(
    model__eval_set=[(X_valid_enc, y_valid)],
    model__verbose=200,
)

t0 = time.time()
pipe.fit(X_train, y_train, **fit_params)
print("train seconds:", round(time.time() - t0, 2))


## 9. 评估：Valid/Test 指标 + 简单诊断


In [None]:
def eval_reg(y_true, y_pred, name=""):
    """Print and return RMSE and MAE for regression predictions.

    Args:
        y_true: observed target values.
        y_pred: predicted values, aligned with ``y_true``.
        name: label printed in front of the metrics (e.g. ``"valid"``).

    Returns:
        ``{"rmse": ..., "mae": ...}``.
    """
    # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error is
    # deprecated and no longer accepted by newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{name} RMSE={rmse:.6g}  MAE={mae:.6g}")
    return {"rmse": rmse, "mae": mae}

def eval_bin(y_true, y_proba, name=""):
    """Print and return AUC and log loss for binary predictions.

    Args:
        y_true: observed labels.
        y_proba: predicted probability of the positive class, P(y=1).
        name: label printed in front of the metrics.

    Returns:
        ``{"auc": ..., "logloss": ...}``.
    """
    metrics = {
        "auc": roc_auc_score(y_true, y_proba),
        "logloss": log_loss(y_true, y_proba),
    }
    print(f"{name} AUC={metrics['auc']:.6g}  logloss={metrics['logloss']:.6g}")
    return metrics

# For multi-class log loss the label set is passed explicitly (pipe.classes_, taken
# from the fitted classifier): a time-based split may not contain every class, and
# log_loss then cannot match the probability columns to the labels it sees in y.

# valid
if task == "reg":
    pred_v = pipe.predict(X_valid)
    m_valid = eval_reg(y_valid, pred_v, "valid")
elif task == "clf_bin":
    proba_v = pipe.predict_proba(X_valid)[:, 1]
    m_valid = eval_bin(y_valid, proba_v, "valid")
elif task == "clf_multi":
    proba_v = pipe.predict_proba(X_valid)
    ll = log_loss(y_valid, proba_v, labels=pipe.classes_)
    print(f"valid mlogloss={ll:.6g}")
    m_valid = {"mlogloss": ll}

# test
if task == "reg":
    pred_t = pipe.predict(X_test)
    m_test = eval_reg(y_test, pred_t, "test")
elif task == "clf_bin":
    proba_t = pipe.predict_proba(X_test)[:, 1]
    m_test = eval_bin(y_test, proba_t, "test")
elif task == "clf_multi":
    proba_t = pipe.predict_proba(X_test)
    ll = log_loss(y_test, proba_t, labels=pipe.classes_)
    print(f"test mlogloss={ll:.6g}")
    m_test = {"mlogloss": ll}


In [None]:
# Residual plots (regression only)
if task == "reg":
    res = (y_test.values - pred_t)
    plt.figure()
    plt.hist(res, bins=60)
    plt.title("Residuals on test")
    plt.show()

    # Residual drift over time (coarse check)
    tmp = df_test_fe[[time_col]].copy()
    tmp["residual"] = res
    # Order by time before smoothing: when an id column is used the feature frame is
    # sorted by id first, and a rolling mean over that order would mix time periods.
    # The stable sort leaves an already time-ordered frame unchanged.
    tmp = tmp.sort_values(time_col, kind="mergesort")
    plt.figure()
    tmp.set_index(time_col)["residual"].rolling(100).mean().plot()
    plt.title("Residual rolling mean (test, window=100)")
    plt.show()


## 10. 特征重要性（gain / weight）
Pipeline 下取出训练后的模型，读取其 `feature_importances_`（重要性类型由模型的 `importance_type` 决定）：


In [None]:
# Fitted steps of the pipeline: the XGBoost model and the preprocessor.
xgb_model = pipe.named_steps["model"]
prep = pipe.named_steps["prep"]

# feature_importances_ follows the column order of the preprocessed matrix:
# numeric columns first, then the one-hot columns, whose names come from the encoder.
feature_names = [*num_cols]
if cat_cols:
    ohe = prep.named_transformers_["cat"]
    ohe_names = list(ohe.get_feature_names_out(cat_cols))
    feature_names += ohe_names

imp = pd.Series(xgb_model.feature_importances_, index=feature_names)
imp = imp.sort_values(ascending=False)
imp.head(25)


In [None]:
# Horizontal bar chart of the top-k importances, largest bar at the top.
topk = 20
plt.figure()
_top = imp.head(topk)
_top.iloc[::-1].plot.barh()
plt.title(f"Top {topk} feature importance")
plt.show()


## 11. 参数试验（控制变量：2–3 次即可）
目标：验证“更保守/更激进”对过拟合与泛化的影响。

这里用一组试验表驱动训练，输出 valid 指标对比。


In [None]:
from copy import deepcopy

def clone_pipe_with_params(pipe: Pipeline, params: dict):
    """Return a deep copy of ``pipe`` with ``params`` applied through ``set_params``.

    The original pipeline is left untouched. Keys use the pipeline's
    ``step__param`` naming, e.g. ``"model__max_depth"``.
    """
    # set_params hands back the object it was called on, i.e. the copy.
    return deepcopy(pipe).set_params(**params)

# Experiment table: (label, pipeline parameter overrides applied on top of `pipe`).
trials = [
    ("baseline", {}),
    ("shallower", {"model__max_depth": 4}),
    ("more_reg",  {"model__min_child_weight": 10, "model__gamma": 1.0}),
]

results = []

for name, p in trials:
    p2 = clone_pipe_with_params(pipe, p)
    t0 = time.time()
    p2.fit(X_train, y_train, **fit_params)
    sec = time.time() - t0

    if task == "reg":
        pv = p2.predict(X_valid)
        # RMSE as sqrt(MSE): the `squared=False` shortcut of mean_squared_error is
        # deprecated and no longer accepted by newer scikit-learn releases.
        rmse = float(np.sqrt(mean_squared_error(y_valid, pv)))
        results.append((name, rmse, sec))
    elif task == "clf_bin":
        pv = p2.predict_proba(X_valid)[:, 1]
        ll = log_loss(y_valid, pv)
        results.append((name, ll, sec))
    else:
        pv = p2.predict_proba(X_valid)
        # Explicit label set: the validation split may not contain every class.
        ll = log_loss(y_valid, pv, labels=p2.classes_)
        results.append((name, ll, sec))

# Lower is better for every metric used above, so the best trial is listed first.
res_df = pd.DataFrame(results, columns=["trial", "valid_metric", "train_seconds"]).sort_values("valid_metric")
res_df


## 12. 保存模型（含预处理）
Pipeline 直接用 joblib 保存，现场最稳。


In [None]:
import joblib

# Output folder for everything this notebook persists; created if missing.
OUT_DIR = "artifacts"
os.makedirs(OUT_DIR, exist_ok=True)

# Whole pipeline (preprocessing + model) in one joblib file.
model_path = os.path.join(OUT_DIR, "xgb_pipeline.joblib")
joblib.dump(pipe, model_path)

# Additionally save the model step alone in XGBoost's native format (optional).
xgb_native_path = os.path.join(OUT_DIR, "xgb_native.json")
pipe.named_steps["model"].save_model(xgb_native_path)

print("saved:", model_path)
print("saved:", xgb_native_path)


## 13. 复现加载（自检）


In [None]:
# Reload the saved pipeline and score the test split again as a round-trip check.
pipe2 = joblib.load(model_path)

if task == "reg":
    p = pipe2.predict(X_test)
    eval_reg(y_test, p, "loaded test")
elif task == "clf_bin":
    p = pipe2.predict_proba(X_test)[:, 1]
    eval_bin(y_test, p, "loaded test")
else:
    p = pipe2.predict_proba(X_test)
    # Explicit label set (from the fitted classifier): the test split may not
    # contain every class, and log_loss needs the full set to match the columns.
    ll = log_loss(y_test, p, labels=pipe2.classes_)
    print(f"loaded test mlogloss={ll:.6g}")


## 14. 最终汇报结构（PPT/口述提纲）
- 问题定义与目标指标  
- 数据概览：时间范围、缺失、分布、潜在泄漏点  
- 验证方式：按时间切分 / walk-forward  
- Baseline：最小特征集 + early stopping  
- 改进：两到三组有方向的超参试验（更保守/更激进）  
- 结果：valid/test 指标、误差分桶（按时间/分位数/类别）  
- 可解释性：Top features + 合理解释  
- 下一步：更精细的特征、分层建模、稳定性评估
