# Dataset Interview：LightGBM 现场训练与交付 Notebook

> 目标：在受限环境中快速完成 **数据检查 → 切分 → 训练 → 评估 → 解释 → 导出**，并保留可复现的运行痕迹。  
> 时间：2026-01-29  
> 约定：全程使用固定随机种子；时间序列任务默认按时间切分；所有输出写入 `./artifacts/`。


## 0. 运行环境与目录结构

In [None]:

import os, sys, json, math, time, random
from pathlib import Path

# Single output directory for everything this notebook writes.
ART = Path("./artifacts")
ART.mkdir(parents=True, exist_ok=True)

# Print the interpreter version and resolved paths as a trace of the run.
for _label, _value in (
    ("Python:", sys.version.split()[0]),
    ("CWD:", Path(".").resolve()),
    ("Artifacts dir:", ART.resolve()),
):
    print(_label, _value)


## 1. 依赖导入（含备胎模型）

In [None]:

import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error,
    roc_auc_score, average_precision_score, log_loss,
    accuracy_score, f1_score
)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 备胎：LightGBM 不可用时仍能给出强 baseline
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier

RANDOM_SEED = 42
# Seed both the stdlib and the NumPy global generators with the same value.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Try LightGBM first. LGBM_OK records whether the import worked, so that
# training can switch to the scikit-learn fallback instead of failing.
try:
    import lightgbm as lgb
    from lightgbm import LGBMRegressor, LGBMClassifier
    print("LightGBM:", lgb.__version__)
    LGBM_OK = True
except Exception as import_error:
    # Deliberately broad: any failure while importing means "use the fallback".
    LGBM_OK = False
    print("LightGBM import failed -> fallback to sklearn HGB. Error:", repr(import_error))


## 2. 全局配置（按任务修改）

In [None]:

# Central run configuration; edit per task. The trailing bare `CFG`
# expression makes the notebook display the dict.
CFG = {
    # === Required ===
    "data_path": "./data/train.csv",   # or parquet / feather
    "target": "y",
    "time_col": None,                 # timestamp column name (set it for time-series tasks)
    "group_col": None,                # entity column name (e.g. symbol/user_id; optional)
    "task": "binary",                 # {"reg", "binary", "multiclass"}

    # === Split ===
    "test_size": 0.2,                 # time series: the last test_size fraction is used for validation
    "n_splits": 5,                    # number of CV folds (TimeSeriesSplit for time series)

    # === Categorical feature handling ===
    "cat_cols": [],                   # explicit list of categorical columns (recommended)

    # === Training ===
    "metric": None,                   # None -> chosen automatically from task
    "early_stopping_rounds": 200,
    "n_estimators": 5000,
    "learning_rate": 0.03,
    "num_leaves": 63,
    "min_child_samples": 50,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.0,
    "reg_lambda": 1.0,
}
CFG


## 3. 数据读取（CSV/Parquet 自适应）

In [None]:

def read_any(path: str) -> pd.DataFrame:
    """Load a table from ``path``, picking the pandas reader by file suffix.

    The suffix is matched case-insensitively against ``.csv``, ``.parquet``
    and ``.feather``.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if the suffix is not one of the supported ones.
    """
    src = Path(path)
    if not src.exists():
        raise FileNotFoundError(f"File not found: {src.resolve()}")

    readers = {
        ".csv": pd.read_csv,
        ".parquet": pd.read_parquet,
        ".feather": pd.read_feather,
    }
    reader = readers.get(src.suffix.lower())
    if reader is None:
        raise ValueError(f"Unsupported file type: {src.suffix}")
    return reader(src)

# Load the table named by CFG["data_path"], then show its shape and first rows.
df = read_any(CFG["data_path"])
print("shape:", df.shape)
df.head()


## 4. 快速体检（缺失/重复/目标分布/时间覆盖）

In [None]:

def quick_profile(df: pd.DataFrame, target: str, time_col: str | None):
    out = {}
    out["n_rows"] = len(df)
    out["n_cols"] = df.shape[1]
    out["dup_rows"] = int(df.duplicated().sum())
    out["missing_top10"] = df.isna().mean().sort_values(ascending=False).head(10).to_dict()
    if target in df.columns:
        y = df[target]
        out["target_missing"] = float(y.isna().mean())
        out["target_unique"] = int(y.nunique(dropna=True))
        if pd.api.types.is_numeric_dtype(y):
            out["target_desc"] = y.describe().to_dict()
        else:
            out["target_vc"] = y.value_counts(dropna=False).head(20).to_dict()
    if time_col and time_col in df.columns:
        t = pd.to_datetime(df[time_col], errors="coerce")
        out["time_missing"] = float(t.isna().mean())
        out["time_min"] = None if t.isna().all() else str(t.min())
        out["time_max"] = None if t.isna().all() else str(t.max())
    return out

# Profile the loaded frame; the bare `profile` expression displays the dict in the notebook.
profile = quick_profile(df, CFG["target"], CFG["time_col"])
profile


## 5. 切分（时间序列：按时间排序 + 最后区间验证）

In [None]:

def train_valid_split(df: pd.DataFrame, cfg: dict):
    """Split ``df`` into a leading training part and a trailing validation part.

    Args:
        df: Full data set; it is never modified.
        cfg: Reads ``cfg["time_col"]`` and ``cfg["test_size"]``. ``test_size``
            is the fraction of rows that goes to the validation part.

    Returns:
        ``(train_df, valid_df)`` as independent copies.

        * If ``time_col`` is set and present in ``df``, rows are ordered by the
          parsed timestamp and re-indexed from 0 before cutting, and the
          returned time column holds the parsed datetimes. Values that cannot
          be parsed become NaT; with pandas' default NaN placement they sort
          last and so end up in the validation part.
        * Otherwise rows are cut in their existing order (no shuffle) and keep
          their original index.

    Raises:
        ValueError: if ``test_size`` is not strictly between 0 and 1.
    """
    time_col = cfg["time_col"]
    test_size = cfg["test_size"]
    if not 0 < test_size < 1:
        raise ValueError(f"test_size must be in (0, 1), got {test_size!r}")

    ordered = df
    if time_col and time_col in df.columns:
        ordered = df.copy()
        ordered[time_col] = pd.to_datetime(ordered[time_col], errors="coerce")
        # A stable sort keeps rows with equal timestamps in input order, so the
        # cut lands on the same rows on every run.
        ordered = ordered.sort_values(time_col, kind="mergesort").reset_index(drop=True)

    cut = int(len(ordered) * (1 - test_size))
    return ordered.iloc[:cut].copy(), ordered.iloc[cut:].copy()

# Split once according to CFG and report the resulting shapes.
train_df, valid_df = train_valid_split(df, CFG)
print("train:", train_df.shape, "valid:", valid_df.shape)


## 6. 特征列选择 + 类别列处理（pandas category）

In [None]:

TARGET = CFG["target"]
TIME_COL = CFG["time_col"]
GROUP_COL = CFG["group_col"]

# The target and the time column are excluded from the features.
# NOTE(review): GROUP_COL is not dropped, so when it is set it is used as a
# feature -- confirm that this is intended.
drop_cols = [c for c in [TARGET, TIME_COL] if c and c in df.columns]
X_cols = [c for c in df.columns if c not in drop_cols]

# Categorical columns: use CFG["cat_cols"] when given; otherwise pick every
# text-like column. Besides plain object columns this covers pandas' dedicated
# string dtype and columns that are already categorical, so none of them is
# left among the numeric features.
cat_cols = list(CFG["cat_cols"])
if not cat_cols:
    cat_cols = [
        c for c in X_cols
        if pd.api.types.is_object_dtype(df[c].dtype)
        or pd.api.types.is_string_dtype(df[c].dtype)
        or isinstance(df[c].dtype, pd.CategoricalDtype)
    ]
num_cols = [c for c in X_cols if c not in cat_cols]

def enforce_categories(d: pd.DataFrame, cat_cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``d`` whose listed columns are pandas categoricals.

    Names in ``cat_cols`` that are not columns of ``d`` are skipped. Missing
    values become the explicit category ``"<<MISSING>>"``. The input frame is
    left untouched.
    """
    out = d.copy()
    for col in (c for c in cat_cols if c in out.columns):
        # Go through the string dtype first so mixed-type columns end up with
        # uniform string categories.
        as_text = out[col].astype("string")
        out[col] = as_text.fillna("<<MISSING>>").astype("category")
    return out

# Build the model inputs; categorical columns are converted separately for the
# training and the validation part.
# NOTE(review): the category sets are derived per frame, so they can differ
# between X_train and X_valid -- confirm the model backend aligns them.
X_train = enforce_categories(train_df[X_cols], cat_cols)
X_valid = enforce_categories(valid_df[X_cols], cat_cols)
y_train = train_df[TARGET]
y_valid = valid_df[TARGET]

print("n_features:", len(X_cols))
print("cat:", len(cat_cols), "num:", len(num_cols))


## 7. 评价指标与统一打分函数

In [None]:

def get_default_metric(task: str) -> str:
    """Return the default metric name for ``task``.

    ``"reg"`` -> ``"rmse"``, ``"binary"`` -> ``"auc"``,
    ``"multiclass"`` -> ``"multi_logloss"``.

    Raises:
        ValueError: for any other ``task`` (the value is the exception argument).
    """
    defaults = {
        "reg": "rmse",
        "binary": "auc",
        "multiclass": "multi_logloss",
    }
    try:
        return defaults[task]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which are equally "unknown".
        raise ValueError(task) from None

# Effective metric name: an explicit CFG["metric"] wins, otherwise the task default.
METRIC = CFG["metric"] or get_default_metric(CFG["task"])

def score(task: str, y_true, y_pred, y_proba=None, labels=None):
    """Compute the validation metrics for ``task`` and return them as a dict.

    Args:
        task: ``"reg"``, ``"binary"`` or ``"multiclass"``.
        y_true: Ground-truth targets.
        y_pred: Point predictions. Used for ``"reg"``, and for ``"binary"``
            only when ``y_proba`` is None (then thresholded with ``> 0.5``).
        y_proba: Predicted probabilities. For ``"binary"`` the positive-class
            probability per row (thresholded with ``>= 0.5``); for
            ``"multiclass"`` an ``(n, K)`` matrix.
        labels: Optional class labels in the column order of ``y_proba``
            (multiclass only), e.g. a fitted classifier's ``classes_``. When
            None, the arg-max column index is compared with ``y_true``
            directly, which is only meaningful if the classes are ``0..K-1``.

    Returns:
        ``{"rmse", "mae"}`` for regression; ``{"acc", "f1"}`` plus
        ``{"auc", "ap", "logloss"}`` when probabilities are given for binary;
        ``{"multi_logloss", "acc"}`` for multiclass.

    Raises:
        ValueError: for an unknown ``task``, or when the predictions a task
            needs were not supplied.
    """
    if task == "reg":
        # Take the root explicitly instead of passing `squared=False`, which
        # recent scikit-learn releases no longer accept. (For a single target
        # the result is the same.)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)
        return {"rmse": float(rmse), "mae": float(mae)}

    if task == "binary":
        out = {}
        if y_proba is not None:
            out["auc"] = float(roc_auc_score(y_true, y_proba))
            out["ap"] = float(average_precision_score(y_true, y_proba))
            # No `eps=`: the argument is gone from recent scikit-learn and the
            # library clips probabilities itself.
            out["logloss"] = float(log_loss(y_true, y_proba))
            hard = (np.asarray(y_proba) >= 0.5).astype(int)
        elif y_pred is not None:
            hard = (np.asarray(y_pred) > 0.5).astype(int)
        else:
            raise ValueError("binary scoring needs y_proba or y_pred")
        out["acc"] = float(accuracy_score(y_true, hard))
        out["f1"] = float(f1_score(y_true, hard))
        return out

    if task == "multiclass":
        if y_proba is None:
            raise ValueError("multiclass scoring needs y_proba")
        ll = float(log_loss(y_true, y_proba, labels=labels))
        top = np.argmax(y_proba, axis=1)
        # Map column indices back to class labels when the caller provides them.
        pred = top if labels is None else np.asarray(labels)[top]
        acc = float(accuracy_score(y_true, pred))
        return {"multi_logloss": ll, "acc": acc}

    raise ValueError(task)


## 8. 模型训练（LightGBM 优先；失败则备胎）

In [None]:

# Default LightGBM eval metric per task, used when cfg["metric"] is not set.
_LGBM_DEFAULT_METRIC = {"reg": "rmse", "binary": "auc", "multiclass": "multi_logloss"}


def _fit_lgbm(cfg: dict, X_train, y_train, X_valid, y_valid, cat_cols: list[str]):
    """Fit a LightGBM estimator for cfg["task"] with early stopping on the validation set."""
    task = cfg["task"]
    params = dict(
        n_estimators=cfg["n_estimators"],
        learning_rate=cfg["learning_rate"],
        num_leaves=cfg["num_leaves"],
        min_child_samples=cfg["min_child_samples"],
        subsample=cfg["subsample"],
        # LightGBM applies row subsampling only when the bagging frequency is
        # non-zero; without it the configured `subsample` has no effect.
        subsample_freq=cfg.get("subsample_freq", 1),
        colsample_bytree=cfg["colsample_bytree"],
        reg_alpha=cfg["reg_alpha"],
        reg_lambda=cfg["reg_lambda"],
        random_state=RANDOM_SEED,
        n_jobs=-1,
    )
    if task == "reg":
        model = LGBMRegressor(**params)
    elif task == "binary":
        model = LGBMClassifier(**params)
    else:
        n_classes = int(pd.Series(y_train).nunique())
        model = LGBMClassifier(objective="multiclass", num_class=n_classes, **params)

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric=cfg["metric"] or _LGBM_DEFAULT_METRIC[task],
        categorical_feature=cat_cols if len(cat_cols) else "auto",
        callbacks=[
            lgb.early_stopping(cfg["early_stopping_rounds"], verbose=False),
            lgb.log_evaluation(200),
        ],
    )
    return model


def _fit_hgb_fallback(task: str, X_train, y_train, cat_cols: list[str]):
    """Fit the scikit-learn fallback: one-hot encoded categoricals + HistGradientBoosting."""
    pre = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
            ("num", "passthrough", [c for c in X_train.columns if c not in cat_cols]),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    estimator_cls = (
        HistGradientBoostingRegressor if task == "reg" else HistGradientBoostingClassifier
    )
    pipe = Pipeline([("pre", pre), ("model", estimator_cls(random_state=RANDOM_SEED))])
    pipe.fit(X_train, y_train)
    return pipe


def train_model(cfg: dict, X_train, y_train, X_valid, y_valid, cat_cols: list[str]):
    """Train the model for ``cfg["task"]`` and report which backend was used.

    LightGBM is used when its import succeeded (``LGBM_OK``); otherwise a
    scikit-learn ``HistGradientBoosting*`` pipeline is fitted. The fallback
    does not use the validation data or early stopping.

    Args:
        cfg: Run configuration. Reads ``task``, ``metric``,
            ``early_stopping_rounds`` and the LightGBM hyper-parameters;
            the optional key ``subsample_freq`` defaults to 1.
        X_train, y_train: Training features and target.
        X_valid, y_valid: Validation data for LightGBM early stopping.
        cat_cols: Names of the categorical feature columns.

    Returns:
        ``(model, backend)`` where ``backend`` is ``"lgbm"`` or ``"hgb"``.

    Raises:
        ValueError: if ``cfg["task"]`` is not ``"reg"``, ``"binary"`` or
            ``"multiclass"``.
    """
    task = cfg["task"]
    if task not in _LGBM_DEFAULT_METRIC:
        raise ValueError(task)

    if LGBM_OK:
        return _fit_lgbm(cfg, X_train, y_train, X_valid, y_valid, cat_cols), "lgbm"
    return _fit_hgb_fallback(task, X_train, y_train, cat_cols), "hgb"

# Train once; `backend` tells later cells which kind of model they are handling.
model, backend = train_model(CFG, X_train, y_train, X_valid, y_valid, cat_cols)
print("backend:", backend, "model:", type(model))


## 9. 验证集评估

In [None]:

task = CFG["task"]

if task not in ("reg", "binary", "multiclass"):
    raise ValueError(task)

if task == "reg":
    # Regression is scored on point predictions.
    yhat = model.predict(X_valid)
    metrics = score(task, y_valid, yhat)
else:
    # Both classification tasks are scored on probabilities.
    proba = model.predict_proba(X_valid)
    if task == "binary":
        # Keep only the positive-class column.
        proba = proba[:, 1]
        if backend == "lgbm":
            yhat = proba
    metrics = score(task, y_valid, y_pred=None, y_proba=proba)

metrics


## 10. 特征重要性（仅 LightGBM）

In [None]:

import pandas as pd

if backend != "lgbm":
    # The fallback pipeline has no importances to read here; just print a hint.
    print("非 LightGBM 后端：importance 使用 permutation_importance 或 SHAP（若环境允许）")
else:
    importance_csv = ART / "feature_importance.csv"
    # Pair each training column with the fitted model's importance, largest first.
    imp = pd.DataFrame(
        {"feature": X_train.columns, "importance": model.feature_importances_}
    )
    imp = imp.sort_values("importance", ascending=False)
    display(imp.head(30))
    imp.to_csv(importance_csv, index=False)
    print("saved:", importance_csv)


## 11. 误差分析（按时间/分组切片）

In [None]:

def add_preds(df_part: pd.DataFrame, X_part, task: str, model, backend: str):
    """Return a copy of ``df_part`` with the model output in a ``_pred`` column.

    * ``"reg"``: ``model.predict``.
    * ``"binary"``: positive-class probability when the model has
      ``predict_proba``, otherwise ``model.predict``.
    * any other task: the largest class probability per row.

    ``backend`` is accepted but not used.
    """
    if task == "reg":
        preds = model.predict(X_part)
    elif task == "binary":
        can_proba = hasattr(model, "predict_proba")
        preds = model.predict_proba(X_part)[:, 1] if can_proba else model.predict(X_part)
    else:
        # Multiclass: keep only the probability of the most likely class.
        preds = model.predict_proba(X_part).max(axis=1)

    out = df_part.copy()
    out["_pred"] = preds
    return out

valid_with_pred = add_preds(valid_df, X_valid, CFG["task"], model, backend)

# Slice by time: up to 10 equal-width time bins (only when a time column exists).
if TIME_COL and TIME_COL in valid_with_pred.columns:
    t = pd.to_datetime(valid_with_pred[TIME_COL], errors="coerce")
    if t.notna().any():
        # Bin on epoch nanoseconds; `astype` replaces the deprecated
        # `Series.view`. Unparseable timestamps are masked to NaN so they get
        # no bin, instead of entering as the int64 minimum and stretching the
        # bin range.
        t_ns = t.astype("int64").where(t.notna())
        valid_with_pred["_tbin"] = pd.cut(t_ns, bins=10, duplicates="drop")
        # observed=False keeps empty bins in the table on every pandas version.
        print(valid_with_pred.groupby("_tbin", observed=False)["_pred"].agg(["count","mean"]).head())

# Slice by group: the 20 largest groups (only when a group column exists).
if GROUP_COL and GROUP_COL in valid_with_pred.columns:
    g = valid_with_pred.groupby(GROUP_COL)["_pred"].agg(["count","mean"]).sort_values("count", ascending=False).head(20)
    display(g)

valid_with_pred.head()


## 12. 保存模型与配置（交付用）

In [None]:

import joblib

model_path = ART / "model.pkl"
config_path = ART / "config.json"

# Persist the fitted model together with the exact configuration that produced it.
joblib.dump(model, model_path)
config_path.write_text(
    json.dumps(CFG, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

for _saved in (model_path, config_path):
    print("saved:", _saved)


## 13. 单次推理模板（读取模型 + 生成预测）

In [None]:

# new_df = read_any("./data/test.csv")
# new_X = enforce_categories(new_df[X_cols], cat_cols)

# loaded = joblib.load(ART / "model.pkl")
# if CFG["task"] == "reg":
#     pred = loaded.predict(new_X)
# elif CFG["task"] == "binary":
#     pred = loaded.predict_proba(new_X)[:, 1]
# else:
#     pred = loaded.predict_proba(new_X)

# pd.DataFrame({"pred": pred}).to_csv(ART / "predictions.csv", index=False)
# print("saved:", ART / "predictions.csv")


## 14. 现场收尾清单（打印在脑子里）
- 跑通 baseline（有分数）
- 切分无泄露（时间/分组）
- 类别列 category（或 one-hot 备胎）
- early stopping 选最佳轮次
- 重要性 + 错误切片
- artifacts：model.pkl / config.json / feature_importance.csv（仅 LightGBM 后端会生成） / predictions.csv（需先取消第 13 节推理模板的注释并运行）