# Feature Engineering



- 核心原则：
  1) **先明确数据切分规则**（时间序列 vs i.i.d.）再做任何会“看见未来”的特征；
  2) 用 **Pipeline / ColumnTransformer** 把预处理和模型绑定，防止泄漏；
  3) 特征工程优先级：**(a) 正确性与不泄漏** > (b) 解释性 > (c) 复杂度。


## 0. 常用 import

In [None]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.inspection import permutation_importance

# 常见 baseline 模型
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier


## 1. 快速体检：列类型、缺失、常量、唯一值

In [None]:

def quick_profile(df: pd.DataFrame, max_uniques=20) -> pd.DataFrame:
    """Summarise every column of ``df`` for a first-pass data health check.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to profile.
    max_uniques : int, default 20
        Columns with at most this many distinct non-null values get those
        values listed in ``sample_uniques``; other columns get ``None``.

    Returns
    -------
    pd.DataFrame
        One row per input column with the fields ``col``, ``dtype``,
        ``missing_pct`` (fraction of nulls), ``nunique`` (distinct non-null
        values), ``sample_uniques`` and ``is_constant`` (``nunique <= 1``),
        sorted by ``missing_pct`` descending, then ``nunique`` ascending.
        A frame without columns yields an empty result with the same fields.
    """
    fields = ["col", "dtype", "missing_pct", "nunique", "sample_uniques", "is_constant"]
    rows = []
    for c in df.columns:
        s = df[c]
        nunique = s.nunique(dropna=True)
        rows.append({
            "col": c,
            "dtype": str(s.dtype),
            "missing_pct": float(s.isna().mean()),
            "nunique": int(nunique),
            "sample_uniques": (s.dropna().unique()[:max_uniques]).tolist() if nunique <= max_uniques else None,
            "is_constant": bool(nunique <= 1),
        })
    # Passing the schema explicitly keeps the sort keys present even when
    # ``rows`` is empty; without it sort_values raises KeyError.
    prof = pd.DataFrame(rows, columns=fields)
    return prof.sort_values(["missing_pct", "nunique"], ascending=[False, True])

# 用法：
# prof = quick_profile(df)
# prof.head(30)


## 2. 先定切分：i.i.d. vs 时间序列


### 2.1 i.i.d.（常规随机切分）
- 适用：样本之间独立同分布，或数据已打乱且没有时间泄漏风险。
- 注意：类别不均衡可用 `stratify=y`。


In [None]:

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y  # 分类可用
# )



### 2.2 时间序列（严格按时间切分）
- 适用：金融/电力/需求等，**预测未来**。
- 关键：训练集时间必须早于测试集；滚动验证用 `TimeSeriesSplit`。


In [None]:

def time_train_test_split(df, time_col, test_size=0.2):
    """Split ``df`` chronologically into an earlier train part and a later test part.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing ``time_col``.
    time_col : str
        Column to order by (ascending).
    test_size : float, default 0.2
        Fraction of rows, between 0 and 1, placed in the test part.

    Returns
    -------
    (train_df, test_df) : tuple of pd.DataFrame
        Independent copies with a fresh 0..n-1 index taken from the sorted
        frame. The first ``floor(n * (1 - test_size))`` rows go to train.
        Rows sharing a timestamp may land on both sides of the cut.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size!r}")

    # A stable sort keeps the input order of tied timestamps, so the split is
    # reproducible from run to run.
    ordered = df.sort_values(time_col, kind="mergesort").reset_index(drop=True)
    n = len(ordered)
    # Round before flooring: 1 - 0.9 evaluates to 0.0999..., so a bare floor
    # of 10 * (1 - 0.9) would give 0 training rows instead of 1.
    cut = int(np.floor(round(n * (1 - test_size), 9)))
    return ordered.iloc[:cut].copy(), ordered.iloc[cut:].copy()

# 用法：
# train_df, test_df = time_train_test_split(df, "date", test_size=0.2)


## 3. 列分组：数值/类别（自动识别；时间列通过 `time_col` 单独指定并排除）

In [None]:

def infer_column_types(df: pd.DataFrame, target=None, time_col=None):
    """Group feature columns into numeric and categorical, and flag likely ID columns.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose columns are classified by dtype.
    target : hashable, optional
        Column name excluded from every returned list.
    time_col : hashable, optional
        Column name excluded from every returned list.

    Returns
    -------
    (num_cols, cat_cols, maybe_id) : tuple of list
        ``num_cols`` holds the columns pandas reports as numeric (this
        includes bool); ``cat_cols`` holds all others. ``maybe_id`` is the
        subset of ``num_cols`` that is integer-valued and has more than 80%
        distinct non-null values relative to the row count. Columns in
        ``maybe_id`` are still present in ``num_cols``.
    """
    types = pd.api.types
    cols = [
        c for c in df.columns
        if c != target and (time_col is None or c != time_col)
    ]

    num_cols, cat_cols = [], []
    for c in cols:
        (num_cols if types.is_numeric_dtype(df[c]) else cat_cols).append(c)

    # "Numeric but really an ID": integer-valued with nearly one distinct value
    # per row. Continuous floats are nearly all-distinct too, so without the
    # integer check every real-valued feature would be flagged.
    n_rows = max(1, len(df))
    maybe_id = []
    for c in num_cols:
        s = df[c]
        if types.is_bool_dtype(s):
            continue
        if types.is_integer_dtype(s):
            integer_valued = True
        elif types.is_float_dtype(s):
            # Integer IDs read alongside missing values arrive as floats.
            integer_valued = bool((s.dropna() % 1 == 0).all())
        else:
            integer_valued = False
        if integer_valued and s.nunique(dropna=True) / n_rows > 0.8:
            maybe_id.append(c)

    return num_cols, cat_cols, maybe_id

# 用法：
# num_cols, cat_cols, maybe_id = infer_column_types(df, target="y", time_col="date")


## 4. sklearn Pipeline 模板（强烈建议：面试时就按这个来）


### 4.1 通用预处理
- 数值：缺失填充（median）+ 标准化/鲁棒缩放
- 类别：缺失填充（most_frequent）+ OneHotEncoder


In [None]:

def build_preprocessor(num_cols, cat_cols, scaler="standard"):
    """Build a ColumnTransformer that imputes and encodes numeric and categorical columns.

    Parameters
    ----------
    num_cols : list
        Columns sent through median imputation followed by the scaler.
    cat_cols : list
        Columns sent through most-frequent imputation followed by dense
        one-hot encoding; categories unseen during fit are ignored.
    scaler : str, default "standard"
        ``"standard"`` selects StandardScaler and ``"robust"`` selects
        RobustScaler; any other value leaves numeric columns unscaled.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer. Columns not listed in ``num_cols`` or
        ``cat_cols`` are dropped.
    """
    import inspect  # local import: only needed for the version probe below

    if scaler == "standard":
        scaler_step = StandardScaler()
    elif scaler == "robust":
        scaler_step = RobustScaler()
    else:
        scaler_step = "passthrough"

    numeric_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", scaler_step),
    ])

    # scikit-learn renamed OneHotEncoder's ``sparse`` argument to
    # ``sparse_output`` and later removed the old name; pick whichever the
    # installed version accepts so the dense output works on both.
    onehot_params = inspect.signature(OneHotEncoder).parameters
    dense_kw = "sparse_output" if "sparse_output" in onehot_params else "sparse"

    categorical_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", **{dense_kw: False})),
    ])

    pre = ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, num_cols),
            ("cat", categorical_pipe, cat_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False
    )
    return pre

# 用法：
# pre = build_preprocessor(num_cols, cat_cols, scaler="robust")
# model = Ridge(alpha=1.0)
# pipe = Pipeline([("pre", pre), ("model", model)])


## 5. 时间特征：日期拆解 + 周期编码（对树模型/线性模型都友好）

In [None]:

def add_datetime_features(df: pd.DataFrame, time_col: str, drop_original=False):
    """Return a copy of ``df`` with calendar and cyclical features derived from ``time_col``.

    Added columns, all prefixed with ``{time_col}_``: ``year``, ``month``,
    ``day``, ``dow`` (0 = Monday), ``hour``, ``is_month_end``,
    ``is_month_start`` (both 0/1), then sine/cosine encodings of day-of-week
    (period 7) and month (period 12).

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    time_col : str
        Column parsed with ``pd.to_datetime``.
    drop_original : bool, default False
        When true, ``time_col`` itself is removed from the result.
    """
    result = df.copy()
    parts = pd.to_datetime(result[time_col]).dt

    calendar = {
        "year": parts.year,
        "month": parts.month,
        "day": parts.day,
        "dow": parts.dayofweek,  # 0=Mon
        "hour": parts.hour,
        "is_month_end": parts.is_month_end.astype(int),
        "is_month_start": parts.is_month_start.astype(int),
    }
    for suffix, values in calendar.items():
        result[f"{time_col}_{suffix}"] = values

    # Cyclical encoding puts the last and first value of a cycle next to each
    # other; this matters mostly for linear models.
    for suffix, period in (("dow", 7), ("month", 12)):
        base = result[f"{time_col}_{suffix}"]
        result[f"{time_col}_{suffix}_sin"] = np.sin(2*np.pi*base/period)
        result[f"{time_col}_{suffix}_cos"] = np.cos(2*np.pi*base/period)

    if not drop_original:
        return result
    return result.drop(columns=[time_col])

# 用法：
# df2 = add_datetime_features(df, "date", drop_original=False)


## 6. lag / rolling 特征


> 任何 rolling/lag 特征都必须只用 “过去” 信息。  
> 具体实现：先按时间排序；对每个实体（如 instrument / meter / region）分组后 `shift()`；rolling 也必须在组内计算，即对组内 `shift(1)` 的结果再按同一分组做 `rolling(...)`，否则窗口会跨实体取到别的实体的数据。


In [None]:

def add_lag_features(
    df: pd.DataFrame,
    group_cols,
    time_col: str,
    value_cols,
    lags=(1,2,5),
):
    """Add per-entity lagged copies of ``value_cols``.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    group_cols : str, sequence of str, or None
        Column(s) identifying an entity. ``None`` or an empty sequence treats
        the whole frame as a single series.
    time_col : str
        Column used to order rows within each entity.
    value_cols : str or sequence of str
        Column(s) to lag.
    lags : iterable of int, default (1, 2, 5)
        Number of rows to shift by.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` sorted by ``group_cols`` then ``time_col`` (original
        index labels kept) with one ``{value}_lag{k}`` column per value
        column and lag.
    """
    if group_cols is None:
        keys = []
    elif isinstance(group_cols, str):
        keys = [group_cols]
    else:
        # Accept tuples and other sequences; ``tuple + list`` would raise.
        keys = list(group_cols)
    values = [value_cols] if isinstance(value_cols, str) else list(value_cols)

    # Stable sort so rows with identical keys keep their input order.
    out = df.sort_values(keys + [time_col], kind="mergesort").copy()
    grouped = out.groupby(keys, sort=False) if keys else None

    for v in values:
        source = grouped[v] if grouped is not None else out[v]
        for lag in lags:
            out[f"{v}_lag{lag}"] = source.shift(lag)
    return out

def _past_window_stat(series, window, min_periods, stat):
    """Apply rolling ``stat`` over the ``window`` rows strictly before each row."""
    # shift(1) removes the current row from its own window.
    shifted = series.shift(1)
    return getattr(shifted.rolling(window=window, min_periods=min_periods), stat)()


def add_rolling_features(
    df: pd.DataFrame,
    group_cols,
    time_col: str,
    value_cols,
    windows=(3,7,14),
    stats=("mean","std","min","max")
):
    """Add per-entity rolling statistics that only use earlier rows.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    group_cols : str, sequence of str, or None
        Column(s) identifying an entity. ``None`` or an empty sequence treats
        the whole frame as a single series.
    time_col : str
        Column used to order rows within each entity.
    value_cols : str or sequence of str
        Column(s) to aggregate.
    windows : iterable of int, default (3, 7, 14)
        Window lengths in rows.
    stats : container of str
        Any of ``"mean"``, ``"std"``, ``"min"``, ``"max"``; other entries
        are ignored.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` sorted by ``group_cols`` then ``time_col`` (original
        index labels kept) with one ``{value}_roll{w}_{stat}`` column per
        combination. A value is NaN until ``min(w, max(2, w // 3))`` earlier
        observations of the same entity are available.
    """
    if group_cols is None:
        keys = []
    elif isinstance(group_cols, str):
        keys = [group_cols]
    else:
        keys = list(group_cols)
    values = [value_cols] if isinstance(value_cols, str) else list(value_cols)
    wanted = [name for name in ("mean", "std", "min", "max") if name in stats]

    out = df.sort_values(keys + [time_col], kind="mergesort").copy()
    grouped = out.groupby(keys, sort=False) if keys else None

    for v in values:
        for w in windows:
            # Clamp so a window of 1 does not request more periods than it holds.
            mp = min(w, max(2, w // 3))
            for stat in wanted:
                if grouped is not None:
                    # transform runs shift + rolling inside each group, so a
                    # window never reaches into another entity, and it returns
                    # rows in the same order as ``out`` so assignment lines up.
                    feature = grouped[v].transform(
                        lambda s, w=w, mp=mp, stat=stat: _past_window_stat(s, w, mp, stat)
                    )
                else:
                    feature = _past_window_stat(out[v], w, mp, stat)
                out[f"{v}_roll{w}_{stat}"] = feature
    return out

# 用法示例（按 instrument 分组）：
# df = add_lag_features(df, group_cols=["instrument"], time_col="date", value_cols=["price"], lags=(1,2,5,10))
# df = add_rolling_features(df, group_cols=["instrument"], time_col="date", value_cols=["price"], windows=(5,20), stats=("mean","std"))


## 7. 比率/差分/交互

In [None]:

def add_ratio_diff_features(df: pd.DataFrame, pairs, eps=1e-9):
    """Add difference and ratio columns for each ``(a, b)`` column pair.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    pairs : iterable of (str, str)
        Column name pairs ``(a, b)``.
    eps : float, default 1e-9
        Amount by which the denominator is pushed away from zero.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``{a}_minus_{b}`` (``a - b``) and ``{a}_over_{b}``
        (``a`` divided by ``b`` nudged away from zero by ``eps``) per pair.
    """
    out = df.copy()
    for a, b in pairs:
        denom = out[b]
        magnitude = denom.abs() + eps
        # Put the sign back on the padded magnitude: dividing by |b| alone
        # would flip the ratio's sign whenever b is negative.
        safe_denom = magnitude.where(denom >= 0, -magnitude)
        out[f"{a}_minus_{b}"] = out[a] - denom
        out[f"{a}_over_{b}"] = out[a] / safe_denom
    return out

# 用法：
# df = add_ratio_diff_features(df, pairs=[("bid","ask"), ("high","low")])


## 8. 高基数类别


### 8.1 One-Hot 的问题
- 类别数很多（例如上万）会导致维度膨胀、内存和训练时间爆炸。
- 但如果样本量不大，且类别对 target 很关键，One-Hot 仍可能有效。

### 8.2 实战策略（按优先级）
1) **频次截断**：只保留 top-K 类别，其余归为 'OTHER'
2) **哈希技巧**：`FeatureHasher`（如果可用）
3) **目标编码**：需严格做 *out-of-fold*（否则严重泄漏）
4) **树模型**：某些树模型能吃 raw id（但在 sklearn 里一般还是需要编码）


In [None]:

def topk_category(df: pd.DataFrame, col: str, k=50, other="__OTHER__"):
    """Keep the ``k`` most frequent values of ``col`` and replace the rest with ``other``.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    col : str
        Column to truncate.
    k : int, default 50
        Number of most frequent values to keep. Missing values are counted
        as a value of their own.
    other : scalar, default "__OTHER__"
        Replacement for every value outside the top ``k``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``col`` rewritten.
    """
    out = df.copy()
    values = out[col]
    counts = values.value_counts(dropna=False)
    keep = set(counts.index[:k])
    # A Categorical only accepts values that are registered categories, so
    # register the placeholder before writing it.
    if isinstance(values.dtype, pd.CategoricalDtype) and other not in values.cat.categories:
        values = values.cat.add_categories([other])
    out[col] = values.where(values.isin(keep), other)
    return out

# 用法：
# df = topk_category(df, "sector", k=100)


## 9. 轻量特征筛选：Mutual Information + Permutation Importance


- **Mutual Information**：很快，适合初筛；对非线性关系也敏感。
- **Permutation Importance**：更贴近模型真实贡献；但更慢，最好在一个 baseline 模型上做。


In [None]:

def mi_rank(X: pd.DataFrame, y, task="regression", discrete_features="auto", random_state=42):
    """Return the mutual-information score of each column of ``X`` against ``y``.

    ``X`` is converted with ``np.asarray`` and must be fully numeric, so this
    is normally applied to already-encoded features. ``task="regression"``
    uses ``mutual_info_regression``; any other value uses
    ``mutual_info_classif``. Scores are returned as an array in the column
    order of ``X``; they are not sorted.
    """
    estimator = mutual_info_regression if task == "regression" else mutual_info_classif
    scores = estimator(
        np.asarray(X),
        np.asarray(y),
        discrete_features=discrete_features,
        random_state=random_state,
    )
    return np.array(scores)

def permutation_rank(pipe, X_valid, y_valid, scoring=None, n_repeats=5, random_state=42):
    """Run ``permutation_importance`` on ``pipe`` and return its mean and std arrays.

    All arguments are forwarded unchanged. The result is the tuple
    ``(importances_mean, importances_std)`` taken from the returned object.
    """
    result = permutation_importance(
        pipe,
        X_valid,
        y_valid,
        scoring=scoring,
        n_repeats=n_repeats,
        random_state=random_state,
    )
    means = result.importances_mean
    spreads = result.importances_std
    return means, spreads

# 用法（Permutation）：
# pipe.fit(X_train, y_train)
# imp_mean, imp_std = permutation_rank(pipe, X_test, y_test, scoring="neg_root_mean_squared_error")


## 10. 落地流程


1) **Baseline**：最简单的 Pipeline（数值+类别） + Ridge/LogReg 或 RandomForest  
2) **加时间特征**：日期拆解 + (如果有实体) lag/rolling  
3) **加少量手工特征**：差分/比率/交互（最易解释）  
4) **做验证**：TimeSeriesSplit 或明确的时间切分；报告指标 + 误差分布  
5) **解释模型**：Top features（Permutation）+ 直观解释（为什么合理）  
6) **写进 PPT**：Data&Split、Features&Model、Results&Next steps


## 11. 示例

In [None]:

# 假设：df 包含 target 列 y；时间列 date；可能还有 instrument/entity 列
# df = pd.read_csv("your_data.csv")

# 1) 时间特征
# df = add_datetime_features(df, time_col="date", drop_original=False)

# 2) lag/rolling（如果是面板数据：instrument x time）
# df = add_lag_features(df, group_cols=["instrument"], time_col="date", value_cols=["price"], lags=(1,2,5))
# df = add_rolling_features(df, group_cols=["instrument"], time_col="date", value_cols=["price"], windows=(5,20))

# 3) 切分（严格按时间）
# train_df, test_df = time_train_test_split(df, time_col="date", test_size=0.2)
# y_train = train_df["y"]; X_train = train_df.drop(columns=["y"])
# y_test  = test_df["y"];  X_test  = test_df.drop(columns=["y"])

# 4) 列类型
# num_cols, cat_cols, maybe_id = infer_column_types(X_train, target=None, time_col="date")
# print("maybe_id:", maybe_id)

# 5) Pipeline + 模型
# pre = build_preprocessor(num_cols=num_cols, cat_cols=cat_cols, scaler="robust")
# pipe = Pipeline([("pre", pre), ("model", Ridge(alpha=1.0))])

# 6) 训练与评估（回归）
# pipe.fit(X_train, y_train)
# pred = pipe.predict(X_test)
# rmse = np.sqrt(mean_squared_error(y_test, pred))  # the `squared=False` argument is deprecated/removed in recent scikit-learn
# print("RMSE:", rmse)



## 12. 常见坑
- **Leakage**：rolling 里忘记 `shift(1)`；目标编码不做 OOF；标准化在全数据上 fit。
- **时间切分**：用 train_test_split 随机打乱导致未来信息进入训练。
- **类别处理**：测试集出现新类别 → OneHotEncoder 要 `handle_unknown="ignore"`。
- **缺失值**：树模型有时能处理，但 sklearn 大多数模型不接受 NaN → 统一 impute。
- **ID 列**：高基数数值列（看着像连续，其实是ID）会害死线性模型；建议丢弃或当类别处理。
