# Sklearn 树模型训练与部署速查（Dataset Interview）

更新时间：2026-01-29

目标：拿到任意表格数据后，快速完成：数据切分 → 预处理 → 训练树模型（回归/分类） → 可靠评估（含时间序列） → 解释（重要性） → 产出可展示结果。

约束：默认使用 Python + numpy/pandas + scikit-learn。


## 0. 环境与导入
- 统一随机种子
- 统一展示与日志
- 统一指标与切分工具


In [None]:
import os
import re
import time
import warnings
from typing import Optional, Dict, Tuple, List

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, roc_auc_score, average_precision_score, f1_score, log_loss
)

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestRegressor, RandomForestClassifier,
    ExtraTreesRegressor, ExtraTreesClassifier,
    GradientBoostingRegressor, GradientBoostingClassifier,
    AdaBoostRegressor, AdaBoostClassifier
)
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier

from sklearn.inspection import permutation_importance

# One seed shared by every splitter and estimator defined below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Silence all warnings so notebook output stays compact.
warnings.filterwarnings("ignore")

# Widen the pandas display so wide tables are not truncated when shown.
pd.set_option("display.max_columns", 120, "display.width", 200)

def rmse(y_true, y_pred):
    """Return the root mean squared error of ``y_pred`` against ``y_true`` as a float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def safe_auc(y_true, y_score):
    """Return ROC AUC, or NaN when ``y_true`` holds fewer than two distinct labels.

    The guard avoids calling ``roc_auc_score`` on a single-class sample.
    """
    labels = np.asarray(y_true)
    n_distinct = np.unique(labels).size
    if n_distinct >= 2:
        return float(roc_auc_score(labels, y_score))
    return np.nan

def timestamp():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
    now = time.localtime()
    return time.strftime("%Y%m%d_%H%M%S", now)


## 1. 数据快速体检（5 分钟以内）
输出：shape、缺失率、重复行、目标分布、时间字段提示。


In [None]:
def quick_profile(df: pd.DataFrame, target: Optional[str] = None, time_col: Optional[str] = None, n_head: int = 3):
    """Print a quick health check of ``df``.

    Shows the shape, the first ``n_head`` rows, dtype counts, the 20 highest
    per-column missing rates and the duplicate-row rate. When ``target`` names
    an existing column, its summary is shown, plus value counts when it is
    object-typed or has at most 20 distinct values. When ``time_col`` is not
    given, column labels containing "date", "time" or "timestamp"
    (case-insensitive) are listed as candidates.

    NOTE(review): ``display`` is not imported in this file; presumably it is
    the IPython/Jupyter built-in, so this function is meant to run in a notebook.

    Args:
        df: Table to profile.
        target: Optional name of the target column.
        time_col: Optional name of the known time column.
        n_head: Number of leading rows to show.
    """
    print("shape:", df.shape)
    print("\nhead:")
    display(df.head(n_head))
    print("\ndtypes:")
    display(df.dtypes.value_counts())

    miss = df.isna().mean().sort_values(ascending=False)
    print("\nmissing rate (top 20):")
    display(miss.head(20))

    dup = df.duplicated().mean()
    print(f"\nduplicate row rate: {dup:.4f}")

    if target is not None and target in df.columns:
        y = df[target]
        print("\nTarget summary:")
        try:
            display(y.describe())
        except Exception:
            # Best effort: fall back to raw values if describe() cannot handle the dtype.
            display(y.head())
        if y.dtype == "O" or y.nunique(dropna=True) <= 20:
            print("\nTarget value_counts (top 20):")
            display(y.value_counts().head(20))

    if time_col is None:
        # Column labels are not always strings (e.g. integer labels from a
        # header-less file); coerce them so the regex search cannot raise TypeError.
        time_pattern = re.compile(r"(date|time|timestamp)", flags=re.I)
        cand = [c for c in df.columns if time_pattern.search(str(c))]
        if cand:
            print("\nPossible time columns:", cand[:10])
    else:
        print("\nTime column:", time_col)

def infer_task_type(y: pd.Series) -> str:
    """Guess the learning task from a target column.

    Rules, applied in order:
      * non-numeric dtype (object, string, category, ...): "binary" when there
        are exactly two distinct non-null values, otherwise "multiclass";
      * numeric with at most 2 distinct values: "binary";
      * numeric with at most 20 distinct values, all finite and integer-valued:
        "multiclass";
      * anything else: "regression".

    The integer check compares values, not dtypes, so labels stored as float
    (e.g. 0.0/1.0/2.0 in a column that also has NaN) or as a narrow integer
    dtype are still recognised as class labels.

    Args:
        y: Target column; missing values are ignored.

    Returns:
        One of "regression", "binary", "multiclass".
    """
    n_distinct = y.nunique(dropna=True)

    if not pd.api.types.is_numeric_dtype(y):
        return "binary" if n_distinct == 2 else "multiclass"

    if n_distinct <= 2:
        return "binary"

    if n_distinct <= 20:
        values = y.dropna().to_numpy(dtype=float)
        # Infinite values cannot be class labels; treat such targets as continuous.
        if np.all(np.isfinite(values)) and np.all(values == np.round(values)):
            return "multiclass"
    return "regression"


## 2. 切分策略（非时序 / 时序）
- 非时序：train_test_split
- 时序：按时间排序后做 holdout；或 TimeSeriesSplit 做 CV

泄漏自检：rolling/聚合 特征必须基于过去信息（例如 shift(1)）。


In [None]:
def split_holdout(
    df: pd.DataFrame,
    target: str,
    time_col: Optional[str] = None,
    test_size: float = 0.2,
    valid_size: float = 0.2,
    shuffle: bool = True,
    stratify: bool = False
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
    """Split ``df`` into train / validation / test features and targets.

    With ``time_col`` the rows are sorted by that column and cut sequentially
    (oldest rows train, newest rows test); ``shuffle`` and ``stratify`` are
    ignored on that path. Without it, two successive ``train_test_split`` calls
    are used and ``stratify=True`` stratifies both on the target.

    ``valid_size`` is a fraction of the rows left after the test split, not of
    the whole table. The input frame is not modified.

    Args:
        df: Full table including the target column.
        target: Name of the target column.
        time_col: Column to order by for a chronological split, or None.
        test_size: Fraction of all rows held out as the test set.
        valid_size: Fraction of the non-test rows held out for validation.
        shuffle: Passed to ``train_test_split`` on the non-temporal path.
        stratify: Stratify on the target on the non-temporal path.

    Returns:
        ``X_train, X_valid, X_test, y_train, y_valid, y_test``.

    Raises:
        AssertionError: if ``target`` is not a column of ``df``.
    """
    assert target in df.columns, f"target column {target!r} not found"

    if time_col is not None:
        # Sort the whole frame BEFORE separating the target so features and
        # target stay row-aligned; a stable sort keeps the input order of ties.
        ordered = df.sort_values(time_col, kind="stable").reset_index(drop=True)
        y = ordered.pop(target)

        n = len(ordered)
        n_test = int(round(n * test_size))
        n_valid = int(round((n - n_test) * valid_size))
        train_end = n - n_test - n_valid
        valid_end = train_end + n_valid

        return (
            ordered.iloc[:train_end],
            ordered.iloc[train_end:valid_end],
            ordered.iloc[valid_end:],
            y.iloc[:train_end],
            y.iloc[train_end:valid_end],
            y.iloc[valid_end:],
        )

    features = df.copy()
    y = features.pop(target)

    X_trv, X_test, y_trv, y_test = train_test_split(
        features, y,
        test_size=test_size, random_state=RANDOM_STATE, shuffle=shuffle,
        stratify=y if stratify else None,
    )
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_trv, y_trv,
        test_size=valid_size, random_state=RANDOM_STATE, shuffle=shuffle,
        stratify=y_trv if stratify else None,
    )
    return X_train, X_valid, X_test, y_train, y_valid, y_test


## 3. 预处理与 Pipeline（树模型专用）
树模型不依赖特征缩放；重点：缺失值 + 类别编码。


In [None]:
def build_preprocess(X: pd.DataFrame, categorical_cols: Optional[List[str]] = None):
    """Build the (unfitted) column preprocessing for tree models.

    Numeric columns get median imputation; categorical columns get
    most-frequent imputation followed by dense one-hot encoding that ignores
    categories unseen at fit time. No scaling is applied.

    Args:
        X: Feature frame used only to decide which columns are categorical.
        categorical_cols: Explicit categorical columns. When None, columns with
            object or category dtype are used.

    Returns:
        ``(preprocess, numeric_cols, categorical_cols)`` where ``preprocess``
        is a ``ColumnTransformer`` that drops any column not listed.
    """
    if categorical_cols is None:
        categorical_cols = []
        for col in X.columns:
            dtype = X[col].dtype
            if dtype == "O" or str(dtype).startswith("category"):
                categorical_cols.append(col)
    # Everything that is not categorical is treated as numeric.
    numeric_cols = [col for col in X.columns if col not in categorical_cols]

    numeric_tf = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    cat_tf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", one_hot),
    ])

    branches = [
        ("num", numeric_tf, numeric_cols),
        ("cat", cat_tf, categorical_cols),
    ]
    preprocess = ColumnTransformer(
        transformers=branches,
        remainder="drop",
        verbose_feature_names_out=False,
    )
    return preprocess, numeric_cols, categorical_cols

def get_feature_names(pipe: Pipeline) -> List[str]:
    """Return the output feature names of the pipeline's ``"prep"`` step.

    Returns an empty list when the step cannot report names (for example
    because it has not been fitted yet).
    """
    prep = pipe.named_steps["prep"]
    try:
        names = list(prep.get_feature_names_out())
    except Exception:
        # Best effort: callers treat an empty list as "names unavailable".
        names = []
    return names


## 4. 常用树模型与默认配置
覆盖：DecisionTree / RandomForest / ExtraTrees / GradientBoosting / AdaBoost / HistGradientBoosting。


In [None]:
def make_model(task: str, model_name: str):
    """Create an unfitted tree estimator with this notebook's default settings.

    Args:
        task: "binary" or "multiclass" selects the classifier variant; any
            other value selects the regressor.
        model_name: One of "hgb", "rf", "et", "gbrt", "ada", "dt".

    Returns:
        A new estimator seeded with ``RANDOM_STATE``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    wants_classifier = task in ("binary", "multiclass")

    if model_name == "hgb":
        estimator_cls = HistGradientBoostingClassifier if wants_classifier else HistGradientBoostingRegressor
        params = dict(
            learning_rate=0.05,
            max_depth=6,
            max_iter=400,
            min_samples_leaf=50,
            l2_regularization=0.1,
        )
    elif model_name == "rf":
        estimator_cls = RandomForestClassifier if wants_classifier else RandomForestRegressor
        params = dict(n_estimators=400, max_depth=None, min_samples_leaf=5, n_jobs=-1)
    elif model_name == "et":
        estimator_cls = ExtraTreesClassifier if wants_classifier else ExtraTreesRegressor
        params = dict(n_estimators=600, max_depth=None, min_samples_leaf=2, n_jobs=-1)
    elif model_name == "gbrt":
        estimator_cls = GradientBoostingClassifier if wants_classifier else GradientBoostingRegressor
        params = dict(learning_rate=0.05, n_estimators=300, max_depth=3)
    elif model_name == "ada":
        estimator_cls = AdaBoostClassifier if wants_classifier else AdaBoostRegressor
        params = dict(n_estimators=400, learning_rate=0.05)
    elif model_name == "dt":
        estimator_cls = DecisionTreeClassifier if wants_classifier else DecisionTreeRegressor
        params = dict(max_depth=8, min_samples_leaf=20)
    else:
        raise ValueError(f"Unknown model_name={model_name}")

    # Classifier and regressor variants of a family share the same hyperparameters.
    return estimator_cls(random_state=RANDOM_STATE, **params)


## 5. 训练与评估（Holdout）
回归：MAE / RMSE / R2；分类：Accuracy / AUC / PR-AUC / F1 / LogLoss。


In [None]:
def evaluate(task: str, y_true, y_pred, y_proba=None) -> Dict[str, float]:
    """Compute the standard metric set for a task.

    Regression returns ``mae``, ``rmse``, ``r2`` (in that key order).
    Classification returns ``acc`` and ``f1_macro``; when ``y_proba`` is given
    it adds ``logloss`` and, for the binary task only, ``auc`` and ``pr_auc``.
    Metrics that cannot be computed are reported as NaN.

    Args:
        task: "regression", "binary", or anything else (treated as multiclass).
        y_true: Ground-truth targets.
        y_pred: Predicted values or labels.
        y_proba: For binary, the positive-class scores; for multiclass, the
            per-class probability matrix. Optional.
    """
    if task == "regression":
        return {
            "mae": float(mean_absolute_error(y_true, y_pred)),
            "rmse": rmse(y_true, y_pred),
            "r2": float(r2_score(y_true, y_pred)),
        }

    out = {
        "acc": float(accuracy_score(y_true, y_pred)),
        "f1_macro": float(f1_score(y_true, y_pred, average="macro")),
    }
    if y_proba is None:
        return out

    binary = task == "binary"
    if binary:
        out["auc"] = safe_auc(y_true, y_proba)
        if len(np.unique(y_true)) > 1:
            out["pr_auc"] = float(average_precision_score(y_true, y_proba))
        else:
            out["pr_auc"] = np.nan

    try:
        # Binary scores are expanded to two columns: [P(negative), P(positive)].
        loss_input = np.c_[1 - y_proba, y_proba] if binary else y_proba
        out["logloss"] = float(log_loss(y_true, loss_input))
    except Exception:
        out["logloss"] = np.nan
    return out

def fit_one_model(
    X_train, y_train, X_valid, y_valid,
    task: str, model_name: str,
    categorical_cols: Optional[List[str]] = None
) -> Tuple[Pipeline, Dict[str, float]]:
    """Fit preprocessing + one model on the training split and score it on validation.

    The preprocessing lives inside the pipeline, so its statistics are learned
    from ``X_train`` only.

    Returns:
        ``(fitted_pipeline, validation_metrics)``.
    """
    preprocess = build_preprocess(X_train, categorical_cols=categorical_cols)[0]
    pipe = Pipeline(steps=[
        ("prep", preprocess),
        ("model", make_model(task, model_name)),
    ])
    pipe.fit(X_train, y_train)

    yhat = pipe.predict(X_valid)
    if task == "binary":
        # Keep only the positive-class column.
        yproba = pipe.predict_proba(X_valid)[:, 1]
    elif task == "multiclass":
        yproba = pipe.predict_proba(X_valid)
    else:
        yproba = None

    return pipe, evaluate(task, y_valid, yhat, yproba)


## 6. 多模型对比（同一套切分/预处理）
一次跑完常见模型，输出对比表。


In [None]:
def benchmark_models(
    X_train, y_train, X_valid, y_valid, task: str,
    model_names=("hgb", "rf", "et", "gbrt", "dt"),
    categorical_cols: Optional[List[str]] = None
):
    """Fit every model in ``model_names`` on the same split and tabulate validation metrics.

    Returns:
        ``(results, pipes)``: a DataFrame with one row per model (column
        ``model`` followed by the metric columns) and a dict mapping model name
        to its fitted pipeline.
    """
    pipes = {}
    rows = []
    for name in model_names:
        fitted, scores = fit_one_model(
            X_train, y_train, X_valid, y_valid, task, name,
            categorical_cols=categorical_cols,
        )
        pipes[name] = fitted
        row = {"model": name}
        row.update(scores)
        rows.append(row)
    return pd.DataFrame(rows), pipes


## 7. 交叉验证（非时序 / 时序）
时序：TimeSeriesSplit（不打乱；按行位置切分，调用前需先将数据按时间升序排列）。


In [None]:
def cross_validate_pipeline(
    X: pd.DataFrame, y: pd.Series, task: str,
    model_name: str = "hgb",
    time_series: bool = False,
    n_splits: int = 5,
    categorical_cols: Optional[List[str]] = None
):
    """Cross-validate one preprocessing + model pipeline.

    Splitter choice: ``TimeSeriesSplit`` when ``time_series`` is True (rows are
    split by position, so ``X`` / ``y`` must already be in time order);
    otherwise shuffled ``StratifiedKFold`` for classification tasks and
    shuffled ``KFold`` for regression.

    Returns:
        ``(fold_metrics, summary)``: per-fold metrics with a ``fold`` column,
        and a frame with the mean and std of each metric across folds.
    """
    preprocess = build_preprocess(X, categorical_cols=categorical_cols)[0]
    pipe = Pipeline(steps=[
        ("prep", preprocess),
        ("model", make_model(task, model_name)),
    ])

    if time_series:
        folds = TimeSeriesSplit(n_splits=n_splits).split(X)
    elif task in ("binary", "multiclass"):
        folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE).split(X, y)
    else:
        folds = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE).split(X, y)

    fold_rows = []
    for fold_id, (train_idx, valid_idx) in enumerate(folds):
        # The same pipeline object is refitted from scratch on every fold.
        pipe.fit(X.iloc[train_idx], y.iloc[train_idx])

        X_hold = X.iloc[valid_idx]
        y_hold = y.iloc[valid_idx]
        yhat = pipe.predict(X_hold)
        if task == "binary":
            yproba = pipe.predict_proba(X_hold)[:, 1]
        elif task == "multiclass":
            yproba = pipe.predict_proba(X_hold)
        else:
            yproba = None

        fold_rows.append({"fold": fold_id, **evaluate(task, y_hold, yhat, yproba)})

    dfm = pd.DataFrame(fold_rows)
    summary = dfm.drop(columns=["fold"]).agg(["mean", "std"]).T
    return dfm, summary


## 8. 特征重要性（Permutation Importance）
输出：TopK 特征与重要性。


In [None]:
def permutation_importance_topk(
    pipe: Pipeline, X_valid: pd.DataFrame, y_valid: pd.Series,
    task: str, topk: int = 20, n_repeats: int = 10
):
    """Rank the raw input features of a fitted pipeline by permutation importance.

    The whole pipeline is scored, so the columns of ``X_valid`` (the pipeline
    inputs) are what gets permuted. Importances are therefore labelled with
    ``X_valid``'s own column names, not with the one-hot-expanded names that
    the preprocessing step produces.

    Scoring: negative MAE for regression, ROC AUC for binary, negative log
    loss otherwise.

    Args:
        pipe: Fitted pipeline.
        X_valid: Held-out features in the pipeline's input layout.
        y_valid: Held-out targets.
        task: "regression", "binary", or anything else (treated as multiclass).
        topk: Number of rows in the first returned frame.
        n_repeats: Number of shuffles per feature.

    Returns:
        ``(top, full)``: both sorted by ``importance_mean`` descending, with
        columns ``feature``, ``importance_mean``, ``importance_std``.
    """
    if task == "regression":
        scoring = "neg_mean_absolute_error"
    elif task == "binary":
        scoring = "roc_auc"
    else:
        scoring = "neg_log_loss"

    r = permutation_importance(
        pipe, X_valid, y_valid,
        n_repeats=n_repeats,
        random_state=RANDOM_STATE,
        scoring=scoring,
        n_jobs=-1
    )

    n_features = len(r.importances_mean)
    columns = getattr(X_valid, "columns", None)
    if columns is not None and len(columns) == n_features:
        feat_names = list(columns)
    else:
        # Inputs without column labels (e.g. a plain array): use positional names.
        feat_names = [f"f{i}" for i in range(n_features)]

    imp = pd.DataFrame({
        "feature": feat_names,
        "importance_mean": r.importances_mean,
        "importance_std": r.importances_std,
    }).sort_values("importance_mean", ascending=False)

    return imp.head(topk), imp


## 9. 时间序列特征工程（lag / rolling / 变动率）
关键约束：rolling 统计量必须 shift(1)。


In [None]:
def _past_rolling_stat(values: pd.Series, window: int, stat: str) -> pd.Series:
    """Rolling ``stat`` over the ``window`` rows strictly before each row of one series."""
    return getattr(values.shift(1).rolling(window), stat)()


def add_time_series_features(
    df: pd.DataFrame,
    group_cols: Optional[List[str]] = None,
    time_col: Optional[str] = None,
    value_cols: Optional[List[str]] = None,
    lags: Tuple[int, ...] = (1, 2, 5, 10),
    roll_windows: Tuple[int, ...] = (5, 20, 60),
):
    """Add lag, past-only rolling and change features for each value column.

    For every column ``c`` in ``value_cols`` this adds ``c_lag{L}``,
    ``c_rollmean{w}`` / ``c_rollstd{w}`` / ``c_rollmin{w}`` / ``c_rollmax{w}``,
    ``c_diff1`` and ``c_pct1``. Rolling statistics are computed on the series
    shifted by one row, so they only see earlier rows. With ``group_cols`` every
    feature, including the rolling window, is computed inside each group and
    never spans two groups.

    Note that ``c_diff1`` and ``c_pct1`` use the current row's value, so they
    leak the answer if ``c`` is the prediction target.

    Args:
        df: Input frame; it is not modified.
        group_cols: Columns identifying independent series, or None.
        time_col: Column to sort by (ascending, stable) before building
            features. When None the existing row order is used.
        value_cols: Columns to derive features from. Defaults to every numeric
            column other than ``group_cols`` and ``time_col``.
        lags: Lag distances in rows.
        roll_windows: Rolling window lengths in rows.

    Returns:
        A new frame (re-indexed 0..n-1 when sorted) with the feature columns added.
    """
    out = df.copy()

    if time_col is not None:
        out = out.sort_values(time_col, kind="stable").reset_index(drop=True)

    if value_cols is None:
        exclude = set((group_cols or []) + ([time_col] if time_col else []))
        value_cols = [c for c in out.columns if c not in exclude and pd.api.types.is_numeric_dtype(out[c])]

    grouped = out.groupby(group_cols, sort=False) if group_cols else None

    for c in value_cols:
        series = grouped[c] if grouped is not None else out[c]

        for lag in lags:
            out[f"{c}_lag{lag}"] = series.shift(lag)

        for w in roll_windows:
            for stat in ("mean", "std", "min", "max"):
                if grouped is not None:
                    # shift() on a grouped column returns an ordinary Series, so
                    # rolling over it would mix neighbouring groups; transform()
                    # keeps both the shift and the window inside each group.
                    feature = series.transform(
                        lambda s, w=w, stat=stat: _past_rolling_stat(s, w, stat)
                    )
                else:
                    feature = _past_rolling_stat(series, w, stat)
                out[f"{c}_roll{stat}{w}"] = feature

        out[f"{c}_diff1"] = series.diff(1)
        out[f"{c}_pct1"] = series.pct_change(1)

    return out


## 10. 端到端示例
需要替换：DATA_PATH / TARGET / TIME_COL / GROUP_COLS / IS_TIME_SERIES。


In [None]:
# ==== Edit these settings ====
DATA_PATH = "data.csv"    # csv or parquet; change to the actual file
TARGET = "y"
TIME_COL = None           # e.g. "date"
GROUP_COLS = None         # e.g. ["asset_id"]
IS_TIME_SERIES = False    # set True for time-series tasks

# Load
if DATA_PATH.endswith(".csv"):
    df = pd.read_csv(DATA_PATH)
elif DATA_PATH.endswith(".parquet"):
    df = pd.read_parquet(DATA_PATH)
else:
    raise ValueError("Unsupported format")

quick_profile(df, target=TARGET, time_col=TIME_COL)

# Try to parse the time column as datetime
if TIME_COL is not None and TIME_COL in df.columns:
    try:
        df[TIME_COL] = pd.to_datetime(df[TIME_COL])
    except Exception as exc:
        # Best effort: keep the raw column, but report it instead of failing silently.
        print(f"could not parse {TIME_COL!r} as datetime, keeping raw values: {exc}")

task = infer_task_type(df[TARGET])
print("inferred task:", task)

# Optional: time-series feature engineering (enable when needed)
# df = add_time_series_features(df, group_cols=GROUP_COLS, time_col=TIME_COL)

# Drop columns that are entirely missing
all_nan_cols = [c for c in df.columns if df[c].isna().all()]
if all_nan_cols:
    df = df.drop(columns=all_nan_cols)
    print("dropped all-NaN cols:", len(all_nan_cols))

# Split
X_train, X_valid, X_test, y_train, y_valid, y_test = split_holdout(
    df, target=TARGET,
    time_col=TIME_COL if IS_TIME_SERIES else None,
    test_size=0.2, valid_size=0.2,
    shuffle=not IS_TIME_SERIES,
    stratify=(task in ("binary", "multiclass")) and (not IS_TIME_SERIES)
)

print("splits:", X_train.shape, X_valid.shape, X_test.shape)

# Compare models
res, pipes = benchmark_models(X_train, y_train, X_valid, y_valid, task, model_names=("hgb","rf","et","gbrt","dt"))

# The first metric column is "mae" for regression (lower is better) and "acc"
# for classification (higher is better), so the sort direction depends on the task.
rank_metric = res.columns[1]
ranked = res.sort_values(rank_metric, ascending=(task == "regression"))
display(ranked)

best_name = ranked.iloc[0]["model"]
best_pipe = pipes[best_name]
print("best model:", best_name)

# Evaluate on the test split
yhat_test = best_pipe.predict(X_test)
yproba_test = None
if task == "binary":
    yproba_test = best_pipe.predict_proba(X_test)[:, 1]
elif task == "multiclass":
    yproba_test = best_pipe.predict_proba(X_test)

test_metrics = evaluate(task, y_test, yhat_test, yproba_test)
print("test_metrics:", test_metrics)


## 11. 重要性输出（Top 20）


In [None]:
# Permutation importance of the selected pipeline, measured on the validation split.
topk, full_imp = permutation_importance_topk(
    pipe=best_pipe,
    X_valid=X_valid,
    y_valid=y_valid,
    task=task,
    topk=20,
    n_repeats=8,
)
display(topk)


## 12. 训练后导出与复用（同机环境）
保存一个 pickle，便于复现。


In [None]:
import pickle

# Persist the selected pipeline (preprocessing + model) under a timestamped name.
ARTIFACT_DIR = "artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

model_filename = f"pipe_{best_name}_{timestamp()}.pkl"
model_path = os.path.join(ARTIFACT_DIR, model_filename)
with open(model_path, "wb") as f:
    pickle.dump(best_pipe, f)

print("saved:", model_path)

# Reuse (only unpickle files you created yourself; pickle.load executes
# arbitrary code from the file):
# with open(model_path, "rb") as f:
#     loaded = pickle.load(f)
# loaded.predict(X_test.head())


## 13. 面试现场自检清单（以下为常见错误，逐条确认没有出现）
- 时序任务 shuffle 了
- rolling 特征没 shift(1)
- 预处理没放 Pipeline（导致统计量泄漏）
- 分类只看 accuracy，忽略不均衡（AUC/PR-AUC/F1/LogLoss）
- 过拟合不做约束（max_depth / min_samples_leaf / 正则）
