# CatBoost 现场训练与部署速查（ipynb）

> 目的：拿到数据后快速完成：读取 → 切分（时间优先）→ CatBoost baseline → 迭代 → 导出结果/模型 → 汇报要点  
>


## 0. 运行前清单

- [ ] 明确任务类型：回归 / 二分类 / 多分类 / 排序（本模板的 `TASK` 仅内置 `"regression"` 与 `"binary"`，其余类型需自行扩展）  
- [ ] 明确 target 列名、时间列（如有）、实体 ID 列（如有）  
- [ ] 排查泄漏：任何“事后信息/未来信息/结算后字段/未来滚动统计”  
- [ ] 选定切分方式：时间切分优先（最后一段 holdout）  
- [ ] 固定随机种子与输出目录（保存模型、预测、特征重要性、简报表）


In [None]:

# 0) 环境与依赖
# 如已预装则跳过；无权限时加 --user
# !pip -q install catboost


In [None]:

# 1) Imports & 全局配置
import os
import json
import time
import math
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, log_loss, accuracy_score

# Fixed seed so NumPy-based randomness is repeatable across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Every artifact of a run is written under this directory; it is created up
# front, and an already-existing directory is not an error.
OUTDIR = "artifacts_catboost"
os.makedirs(OUTDIR, exist_ok=True)

# Raise both pandas display limits to 200 so wide tables are not truncated.
for _display_option in ("display.max_columns", "display.width"):
    pd.set_option(_display_option, 200)


## 1. 读取数据

- 支持 CSV / Parquet / Feather / Pickle  
- 统一：`df` 为主表；`TARGET_COL`、`TIME_COL`、`ID_COLS`、`DROP_COLS` 在同一个配置区集中设置


In [None]:

# 2) Configuration (fill in the blanks); later cells read these names directly.
DATA_PATH = "data.csv"      # TODO: set to the actual data path
TARGET_COL = "y"            # TODO: set to the actual target column
TIME_COL   = "date"         # TODO: set to None when there is no time column
ID_COLS    = ["id"]         # TODO: set to [] when there are no ID columns
DROP_COLS  = []             # TODO: list obvious leakage / meaningless columns (e.g. future fields, pure identifiers)

TASK = "regression"         # "regression" | "binary"

# Split parameter (time split: the last segment is used for validation)
VAL_FRAC = 0.2


In [None]:

# 3) 读入数据
def read_table(path: str) -> pd.DataFrame:
    """Load a table from disk, choosing the pandas reader by file extension.

    The extension match is case-insensitive, and ``path`` may be a ``str`` or
    any ``os.PathLike`` object.

    Args:
        path: Location of a ``.csv``, ``.parquet``, ``.feather``, ``.pkl`` or
            ``.pickle`` file.

    Returns:
        The file contents as a ``pandas.DataFrame``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    path = os.fspath(path)
    # Compare on a lowered copy so "DATA.CSV" is accepted, but hand the
    # original spelling to pandas (file systems may be case-sensitive).
    lowered = path.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(path)
    if lowered.endswith(".parquet"):
        return pd.read_parquet(path)
    if lowered.endswith(".feather"):
        return pd.read_feather(path)
    if lowered.endswith((".pkl", ".pickle")):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        return pd.read_pickle(path)
    raise ValueError(f"Unsupported file type: {path}")

# Load the main table from DATA_PATH, print its shape and display the first rows.
df = read_table(DATA_PATH)
print("shape:", df.shape)
df.head()


## 2. 快速体检（EDA-lite）

输出：
- 列类型分布、缺失比例 Top  
- target 分布摘要  
- 时间范围（如有）


In [None]:

# 4) 列类型、缺失率
def missing_report(frame: pd.DataFrame, topk: int = 25) -> pd.DataFrame:
    """Summarise the columns of ``frame`` with the largest share of missing values.

    Args:
        frame: Table to inspect.
        topk: Maximum number of columns (rows of the report) to return.

    Returns:
        A DataFrame indexed by column name with two columns: ``missing_frac``
        (fraction of missing values) and ``dtype`` (the column dtype as a
        string), ordered by ``missing_frac`` descending and cut to ``topk`` rows.
    """
    # Both Series share frame.columns as their index, so the constructor does
    # not realign them. Sorting happens afterwards: combining an already
    # sorted Series with the unsorted dtypes realigns on the union index and
    # loses the missing-fraction order.
    out = pd.DataFrame(
        {
            "missing_frac": frame.isna().mean(),
            "dtype": frame.dtypes.astype(str),
        }
    )
    out = out.sort_values("missing_frac", ascending=False, kind="mergesort")
    return out.head(topk)

# Print how many columns there are per dtype, then display missing_report with topk=30.
print(df.dtypes.value_counts())
missing_report(df, topk=30)


In [None]:

# 5) Target sanity check: dtype and describe() summary of the target column.
y = df[TARGET_COL]
print("target dtype:", y.dtype)
print("target describe:\n", y.describe())

# For binary tasks also print the class counts; missing labels are counted too.
if TASK == "binary":
    print("value counts:\n", y.value_counts(dropna=False))


In [None]:

# 6) Time range (only when a time column is configured and present in df)
if TIME_COL is not None and TIME_COL in df.columns:
    # Best effort: parse the column to datetime and store it back into df.
    # If anything fails, the column is left as it was and the raw min/max
    # are reported instead.
    try:
        parsed_time = pd.to_datetime(df[TIME_COL])
        df[TIME_COL] = parsed_time
        print("time min/max:", parsed_time.min(), parsed_time.max())
    except Exception as err:
        raw_time = df[TIME_COL]
        print("TIME_COL parse failed:", err)
        print("time min/max(raw):", raw_time.min(), raw_time.max())


## 3. 特征矩阵构建

规则：
- 删除 target / ID / DROP_COLS  
- 类别列交给 CatBoost（object/category）  
- 类别缺失统一成字符串 "NA"


In [None]:

# 7) Build X / y
# Columns that must not be used as features: the target, the ID columns, the
# manually listed DROP_COLS and the raw time column. The time column is only
# used for ordering the split; once parsed it is datetime64, which is not a
# numeric or categorical feature dtype. Derive explicit calendar features
# (month, weekday, ...) beforehand if time information should enter the model.
non_feature_cols = [TARGET_COL, *(ID_COLS or []), *(DROP_COLS or [])]
if TIME_COL is not None:
    non_feature_cols.append(TIME_COL)
# dict.fromkeys de-duplicates while keeping a deterministic order.
drop_cols = list(dict.fromkeys(non_feature_cols))
X = df.drop(columns=drop_cols, errors="ignore").copy()
y = df[TARGET_COL].copy()

# Categorical features: every object / category column.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Categorical cleanup: missing values become the literal "NA" and every value
# is converted to str, so mixed-type object columns (e.g. floats next to
# strings) end up as plain strings.
for c in cat_cols:
    as_object = X[c].astype("object")
    X[c] = as_object.where(as_object.notna(), "NA").astype(str)

# Positional indices of the categorical columns within X.
cat_idx = [X.columns.get_loc(c) for c in cat_cols]

print("X shape:", X.shape)
print("n_cat_cols:", len(cat_cols))
print("cat_cols (first 30):", cat_cols[:30])


## 4. 切分：时间优先

- 有 TIME_COL：按时间排序后，最后 VAL_FRAC 作为验证集  
- 无 TIME_COL：退化为简单随机切分（仅用于非时序数据，或确实无法按时间切分的情况）


In [None]:

# 8) Train / validation split
if TIME_COL is not None and TIME_COL in df.columns:
    # Order rows by time using row POSITIONS rather than index labels, so a
    # non-unique index cannot duplicate rows. Mergesort is stable: rows that
    # share a timestamp keep their original relative order. Missing
    # timestamps sort last (pandas default) and therefore fall into the
    # validation part. If TIME_COL could not be parsed to datetime, the order
    # is whatever ordering the raw values have (e.g. lexicographic for strings).
    order = (
        df[TIME_COL]
        .reset_index(drop=True)
        .sort_values(kind="mergesort")
        .index.to_numpy()
    )
    X_ord, y_ord = X.iloc[order], y.iloc[order]
    split = int(len(X_ord) * (1 - VAL_FRAC))
    if split <= 0 or split >= len(X_ord):
        raise ValueError(
            f"VAL_FRAC={VAL_FRAC} leaves an empty train or validation set "
            f"for {len(X_ord)} rows"
        )
    X_tr, X_va = X_ord.iloc[:split], X_ord.iloc[split:]
    y_tr, y_va = y_ord.iloc[:split], y_ord.iloc[split:]
    split_mode = "time_holdout"
else:
    # No usable time column: fall back to a seeded, shuffled random holdout.
    from sklearn.model_selection import train_test_split
    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=VAL_FRAC, random_state=RANDOM_SEED, shuffle=True)
    split_mode = "random_holdout"

print("split_mode:", split_mode)
print("train:", X_tr.shape, "valid:", X_va.shape)


## 5. Baseline：CatBoost 训练

约定：
- iterations 拉大 + early stopping 控制  
- verbose 控制输出节奏  
- 输出：指标、最佳迭代、特征重要性、预测文件、模型文件


In [None]:

# 9) Wrap both splits in CatBoost Pools; the categorical columns are passed
# as the positional indices collected in cat_idx.
train_pool = Pool(data=X_tr, label=y_tr, cat_features=cat_idx)
valid_pool = Pool(data=X_va, label=y_va, cat_features=cat_idx)


In [None]:

# 10) Baseline training parameters shared by the regressor and the classifier.
# The iteration budget is deliberately large; the "Iter" overfitting detector
# with od_wait=300 is what ends training in practice.
common_params = {
    "random_seed": RANDOM_SEED,
    "iterations": 8000,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 3,
    "od_type": "Iter",
    "od_wait": 300,
    "verbose": 200,
}

common_params


In [None]:

# 11) Train and evaluate
# Fits one CatBoost model on train_pool with valid_pool as eval set, keeps
# the best iteration, and collects the validation metrics in `metrics`.
t0 = time.time()

if TASK == "regression":
    model = CatBoostRegressor(
        loss_function="RMSE",
        eval_metric="RMSE",
        **common_params
    )
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    pred = model.predict(X_va)
    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, whereas this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_va, pred)))
    mae  = mean_absolute_error(y_va, pred)
    metrics = {"rmse": float(rmse), "mae": float(mae)}
    print("VALID rmse:", rmse, "mae:", mae)

elif TASK == "binary":
    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="AUC",
        **common_params
    )
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    # Column 1 of predict_proba is the positive-class probability; hard
    # labels use a fixed 0.5 threshold.
    proba = model.predict_proba(X_va)[:, 1]
    pred  = (proba >= 0.5).astype(int)
    auc   = roc_auc_score(y_va, proba)
    ll    = log_loss(y_va, proba)
    acc   = accuracy_score(y_va, pred)
    metrics = {"auc": float(auc), "logloss": float(ll), "acc": float(acc)}
    print("VALID auc:", auc, "logloss:", ll, "acc:", acc)

else:
    raise ValueError(f"Unknown TASK: {TASK}")

dt = time.time() - t0
print("train_seconds:", dt)


## 6. 输出工件（artifacts）

写出：
- `metrics.json`  
- `pred_valid.csv`（含 y_true / y_pred）  
- `feature_importance.csv`  
- `model.cbm`  
- `run_config.json`（包含关键配置、切分方式、列清单）


In [None]:

# 12) Save run configuration, metrics and validation predictions
# run_info records what is needed to reproduce or audit the run: data
# location, task, column roles, split details, parameters and metrics.
run_info = {
    "timestamp": pd.Timestamp.now().isoformat(),
    "data_path": DATA_PATH,
    "task": TASK,
    "target_col": TARGET_COL,
    "time_col": TIME_COL,
    "id_cols": ID_COLS,
    "drop_cols": DROP_COLS,
    "split_mode": split_mode,
    "val_frac": VAL_FRAC,
    "n_train": int(len(X_tr)),
    "n_valid": int(len(X_va)),
    "n_features": int(X.shape[1]),
    "n_cat_cols": int(len(cat_cols)),
    "cat_cols": cat_cols,
    "params": common_params,
    "metrics": metrics
}

with open(os.path.join(OUTDIR, "run_config.json"), "w", encoding="utf-8") as f:
    json.dump(run_info, f, indent=2)

# metrics.json is part of the documented artifact list; write the metrics on
# their own so they can be read without parsing the full run config.
with open(os.path.join(OUTDIR, "metrics.json"), "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

# Validation predictions: true target plus the prediction column(s).
pred_df = pd.DataFrame({
    "y_true": y_va.values
})

if TASK == "regression":
    pred_df["y_pred"] = pred
elif TASK == "binary":
    # p1 is the positive-class probability, y_pred the thresholded label.
    pred_df["p1"] = proba
    pred_df["y_pred"] = pred

pred_df.to_csv(os.path.join(OUTDIR, "pred_valid.csv"), index=False)

print("saved:", os.path.join(OUTDIR, "run_config.json"))
print("saved:", os.path.join(OUTDIR, "metrics.json"))
print("saved:", os.path.join(OUTDIR, "pred_valid.csv"))


In [None]:

# 13) Feature importance
# Importance of each feature as computed by the model on the training pool,
# written to CSV in descending order; the top 30 rows are displayed.
imp = model.get_feature_importance(train_pool, type="FeatureImportance")
fi = pd.DataFrame({"feature": X.columns, "importance": imp})
fi = fi.sort_values("importance", ascending=False)
fi_path = os.path.join(OUTDIR, "feature_importance.csv")
fi.to_csv(fi_path, index=False)
fi.head(30)


In [None]:

# 14) Save the trained model to OUTDIR/model.cbm and print the path.
model_path = os.path.join(OUTDIR, "model.cbm")
model.save_model(model_path)
print("saved:", model_path)


## 7. 快速迭代区（只改 1~2 个变量）

流程：
1) depth：4 / 6 / 8  
2) l2_leaf_reg：3 / 6 / 10  
3) subsample + rsm（噪声大/过拟合时）  
4) loss_function：回归可切换为 MAE；二分类保持 `loss_function="Logloss"`、`eval_metric="AUC"`（AUC 仅作评估指标）


In [None]:

# 15) A more conservative parameter set aimed at generalisation (enable when
# needed). Compared with the baseline it uses deeper trees, stronger L2
# regularisation, Bernoulli row subsampling and per-split feature subsampling
# (rsm), plus a longer early-stopping patience.
tuned_params = {
    "random_seed": RANDOM_SEED,
    "iterations": 12000,
    "learning_rate": 0.03,
    "depth": 8,
    "l2_leaf_reg": 6,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "rsm": 0.8,
    "od_type": "Iter",
    "od_wait": 400,
    "verbose": 300,
}

tuned_params


## 8. 简报素材（汇报时直接用）

输出要点：
- 数据：样本量、特征数、时间范围、切分方式  
- 指标：baseline + tuned（如有）  
- 重要特征 Top10 + 解释  
- 风险：泄漏、稳定性、shift、ID 记忆化


In [None]:

# 16) Build the briefing summary (split info, metrics, top features) and
# write it to OUTDIR/summary.json.
top_features = fi.head(15).to_dict(orient="records")
summary = {
    "split_mode": split_mode,
    "train_shape": list(X_tr.shape),
    "valid_shape": list(X_va.shape),
    "metrics": metrics,
    "top_features": top_features,
}

summary_path = os.path.join(OUTDIR, "summary.json")
with open(summary_path, "w") as f:
    json.dump(summary, f, indent=2)

print("saved:", summary_path)
summary


## 9. 复现加载（必要时）


In [None]:

# 17) Reload the saved model for reproduction
# The wrapper class follows TASK: the regressor for "regression", the
# classifier for anything else.
from catboost import CatBoostRegressor, CatBoostClassifier

model_cls = CatBoostRegressor if TASK == "regression" else CatBoostClassifier
m = model_cls()
m.load_model(os.path.join(OUTDIR, "model.cbm"))
print("loaded model ok")


## 10. 排查清单（出问题时按顺序查）

- 列名：target/time/id 是否写错  
- dtype：类别列是否被误读成数值，或 NaN 未处理导致同一列类型混杂  
- 泄漏：是否把未来字段留在 X  
- 切分：时间列未排序导致“未来进训练”  
- 指标：分类/回归 metric 是否匹配  
- 过拟合：训练好、验证差 → depth 降/正则升/subsample+rsm/减少高基数 ID
