# Time Label 处理速查（Dataset Interview 用）

目标：把 raw time label 变成稳定、可复用、无泄漏的时间轴与特征；把关键检查项固化成可复制的代码块。  
环境假设：Python 3.8 + numpy / pandas / scikit-learn（其余包若要装，现场再处理）。

使用方式：打开数据后，先运行“0. 快速配置”（导入与全局配置），再按需往下跑；其余区块在此基础上可独立使用（第 7、8 节的函数会直接读取这里的全局配置，例如 `ID_COL`）。  

输出物：
- `df`：完成时间解析与排序/去重后的主表
- `df_feat`：包含时间衍生特征（可选包含 lag/rolling）
- `train_df, test_df`：按时间切分后的训练/测试
- `cv`：TimeSeriesSplit（或自定义切分）


## 0. 快速配置（只改这里）

In [None]:
import numpy as np
import pandas as pd

# ====== Required: adjust to the dataset ======
TIME_COL = "timestamp"      # name of the time column
ID_COL   = None             # column name for multi-entity data (many tickers/devices); keep None for a single entity
TARGET_COL = "y"            # name of the target column (regression/classification label)

# Frequency used when aligning/resampling: '1min','5min','1H','1D', ...; None means no alignment
FREQ = None

# Time-zone policy:
# - If the raw timestamps are naive but really belong to a zone, set ASSUME_TZ_IF_NAIVE to that zone (e.g. "America/New_York")
# - Output is normalized to OUTPUT_TZ (UTC by default)
ASSUME_TZ_IF_NAIVE = None
OUTPUT_TZ = "UTC"

# De-duplication policy: when several rows share the same (id, time), keep last/first, or aggregate with agg
DEDUP_STRATEGY = "last"     # "last" / "first" / "agg"

# Fill policy for gaps created by alignment (only used when FREQ is not None)
FILL_NUMERIC = "ffill"      # "ffill" / "bfill" / "interpolate" / None
MAX_GAP = None              # largest gap allowed to be filled (e.g. '2H'); None means no limit
                            # NOTE: align_to_frequency accepts this value but does not apply it to the fill


## 1. 读入与时间解析（datetime + 时区 + NaT 统计）

In [None]:
def parse_time_column(df: pd.DataFrame,
                      time_col: str,
                      assume_tz_if_naive: str = None,
                      output_tz: str = "UTC") -> pd.DataFrame:
    """Parse ``time_col`` into pandas datetimes and normalize its time zone.

    Steps:
      1. Coerce the column with ``pd.to_datetime``; unparseable values become
         NaT and the NaT rate is printed when it is non-zero.
      2. If the parsed column is naive and ``assume_tz_if_naive`` is given,
         localize it to that zone; otherwise it stays naive.
      3. If the column is tz-aware (originally or after step 2) and
         ``output_tz`` is not None, convert it to ``output_tz``.

    Args:
        df: Input frame; it is copied, not modified.
        time_col: Name of the column holding the raw timestamps.
        assume_tz_if_naive: Zone to attach to naive timestamps, or None.
        output_tz: Zone of the returned column, or None to skip conversion.

    Returns:
        A copy of ``df`` with ``time_col`` replaced by the parsed column.
    """
    result = df.copy()
    result[time_col] = pd.to_datetime(result[time_col], errors="coerce")

    frac_nat = result[time_col].isna().mean()
    if frac_nat > 0:
        print(f"[time parse] NaT rate: {frac_nat:.2%}")

    # The .dt accessor is unavailable when parsing did not yield a datetime
    # column; such a column is treated like a naive one.
    try:
        current_tz = result[time_col].dt.tz
    except Exception:
        current_tz = None

    needs_localize = current_tz is None and assume_tz_if_naive is not None
    if needs_localize:
        localized = result[time_col].dt.tz_localize(
            assume_tz_if_naive, ambiguous="infer", nonexistent="shift_forward"
        )
        result[time_col] = localized
        current_tz = result[time_col].dt.tz

    if current_tz is None or output_tz is None:
        return result

    result[time_col] = result[time_col].dt.tz_convert(output_tz)
    return result


def basic_time_qc(df: pd.DataFrame, time_col: str, id_col: str = None):
    """Print a basic quality report for the time column.

    Without ``id_col``: row count, NaT count, min/max (when any timestamp is
    present), whether the non-NaT timestamps are monotonically increasing,
    and the number of duplicated timestamps.
    With ``id_col``: row count, number of unique ids (NaN counted), NaT
    count, and the number of duplicated (id, time) pairs.

    Returns:
        None; the report is written to stdout.
    """
    stamps = df[time_col]
    nat_count = int(stamps.isna().sum())

    if id_col is not None:
        n_ids = int(df[id_col].nunique(dropna=False))
        n_dup_pairs = int(df.duplicated(subset=[id_col, time_col]).sum())
        print("rows:", len(df), "unique ids:", n_ids)
        print("NaT:", nat_count)
        print("duplicate (id,time):", n_dup_pairs)
        return

    print("rows:", len(df))
    print("NaT:", nat_count)
    present = stamps.notna()
    if present.any():
        print("min/max:", stamps.min(), stamps.max())
    valid = df.loc[present, time_col]
    print("monotonic increasing:", bool(valid.is_monotonic_increasing))
    n_dup = int(df.duplicated(subset=[time_col]).sum())
    print("duplicate timestamps:", n_dup)


# ====== 用法示例 ======
# df_raw = pd.read_csv("xxx.csv")
# df = parse_time_column(df_raw, TIME_COL, ASSUME_TZ_IF_NAIVE, OUTPUT_TZ)
# basic_time_qc(df, TIME_COL, ID_COL)


## 2. 排序 + 去重（同一 (id,time) 多条）

In [None]:
def sort_and_dedup(df: pd.DataFrame,
                   time_col: str,
                   id_col: str = None,
                   strategy: str = "last",
                   agg: dict = None) -> pd.DataFrame:
    """Sort by (id, time) and collapse rows that share the same key.

    Args:
        df: Input frame; it is not modified.
        time_col: Name of the time column.
        id_col: Entity column, or None when the data has a single entity.
        strategy: ``'last'`` / ``'first'`` keep the last / first row of each
            key in input order; ``'agg'`` aggregates each key with ``agg``.
        agg: Column -> aggregation mapping, required when ``strategy='agg'``.

    Returns:
        A new frame sorted by the key with one row per key.

    Raises:
        ValueError: If ``strategy`` is unknown, or is ``'agg'`` without ``agg``.
    """
    key = [time_col] if id_col is None else [id_col, time_col]

    # A stable sort keeps rows with equal keys in their input order, which is
    # what makes keep="first"/"last" well defined. The default algorithm for a
    # single sort column (quicksort) gives no such guarantee.
    out = df.sort_values(key, kind="mergesort")

    if strategy in ("last", "first"):
        return out.drop_duplicates(subset=key, keep=strategy)

    if strategy == "agg":
        if agg is None:
            raise ValueError("strategy='agg' 需要提供 agg 字典，例如 {'price':'last','volume':'sum'}")
        return out.groupby(key, as_index=False).agg(agg)

    raise ValueError("Unknown strategy")


# ====== 用法示例 ======
# df = sort_and_dedup(df, TIME_COL, ID_COL, DEDUP_STRATEGY)
# basic_time_qc(df, TIME_COL, ID_COL)


## 3. 对齐到规则频率（可选：生成完整时间栅格 + 缺测标记）

对齐到规则频率的典型用途：分钟/小时/天级别建模；缺测需要显式处理（fill + missing mask）。  
多实体场景：每个 id 单独对齐再拼接。


In [None]:
def align_to_frequency(df: pd.DataFrame,
                       time_col: str,
                       freq: str,
                       id_col: str = None,
                       fill_numeric: str = "ffill",
                       max_gap: str = None) -> pd.DataFrame:
    """Reindex the data onto a regular time grid and flag the gaps.

    For each entity a grid is built from its first to its last timestamp at
    step ``freq`` and the rows are reindexed onto it. Rows whose timestamp is
    not exactly on the grid are not carried over, and duplicated timestamps
    make the reindex fail, so de-duplicate (and round, if needed) beforehand.

    Args:
        df: Input frame; it is copied, not modified.
        time_col: Name of the datetime column.
        freq: Grid frequency (e.g. ``'1min'``); None returns a plain copy.
        id_col: Entity column; each entity is aligned separately.
        fill_numeric: How numeric columns are filled on the grid:
            ``'ffill'``, ``'bfill'``, ``'interpolate'`` or None (no fill).
        max_gap: Accepted for interface compatibility but NOT implemented;
            a warning is emitted when it is set.

    Returns:
        The aligned frame with an extra int8 column ``_is_missing`` that is 1
        for grid rows that were absent from the input, and for rows whose
        numeric columns were all NaN before filling.

    Raises:
        ValueError: If ``fill_numeric`` is not one of the supported values.
    """
    out = df.copy()
    if freq is None:
        return out

    if fill_numeric not in ("ffill", "bfill", "interpolate", None):
        raise ValueError("unknown fill_numeric")

    if max_gap is not None:
        import warnings
        # Make the unimplemented option visible instead of silently ignoring it.
        warnings.warn(
            "align_to_frequency: max_gap is not implemented; gaps are filled without a limit",
            stacklevel=2,
        )

    numeric_cols = out.select_dtypes(include=[np.number]).columns.tolist()
    if time_col in numeric_cols:
        numeric_cols.remove(time_col)

    def _fill(g):
        g = g.set_index(time_col).sort_index()
        full_idx = pd.date_range(g.index.min(), g.index.max(), freq=freq, tz=g.index.tz)
        g2 = g.reindex(full_idx)

        # Grid points that had no source row; this works even when the frame
        # has no numeric column to inspect.
        missing = ~g2.index.isin(g.index)
        if numeric_cols:
            # Also flag rows whose numeric values were all NaN before filling.
            missing = missing | g2[numeric_cols].isna().all(axis=1).to_numpy()
        g2["_is_missing"] = missing.astype(np.int8)

        if numeric_cols:
            if fill_numeric == "ffill":
                g2[numeric_cols] = g2[numeric_cols].ffill()
            elif fill_numeric == "bfill":
                g2[numeric_cols] = g2[numeric_cols].bfill()
            elif fill_numeric == "interpolate":
                g2[numeric_cols] = g2[numeric_cols].interpolate(limit_direction="both")

        g2.index.name = time_col
        return g2.reset_index()

    if id_col is None:
        return _fill(out)

    pieces = []
    for _id, g in out.groupby(id_col, sort=False):
        g2 = _fill(g)
        # Inserted grid rows have no id after reindexing; restore it.
        g2[id_col] = _id
        pieces.append(g2)
    return pd.concat(pieces, ignore_index=True).sort_values([id_col, time_col])


# ====== 用法示例 ======
# if FREQ is not None:
#     df = align_to_frequency(df, TIME_COL, FREQ, ID_COL, FILL_NUMERIC, MAX_GAP)


## 4. 时间衍生特征：日历特征 + 周期 sin/cos + 相对时间

In [None]:
def add_calendar_features(df: pd.DataFrame, time_col: str) -> pd.DataFrame:
    """Add calendar fields derived from a datetime column.

    Added columns: year, month, day, dayofweek, hour, minute, is_weekend,
    is_month_start, is_month_end, is_quarter_start, is_quarter_end (int8
    flags) and weekofyear (ISO week).

    Args:
        df: Input frame; it is copied, not modified.
        time_col: Name of a datetime column.

    Returns:
        A copy of ``df`` with the calendar columns appended. ``weekofyear``
        is an integer column when every timestamp is present and a float
        column with NaN when the time column contains NaT.
    """
    out = df.copy()
    t = out[time_col]

    out["year"] = t.dt.year
    out["month"] = t.dt.month
    out["day"] = t.dt.day
    out["dayofweek"] = t.dt.dayofweek
    out["hour"] = t.dt.hour
    out["minute"] = t.dt.minute

    out["is_weekend"] = (out["dayofweek"] >= 5).astype(np.int8)
    out["is_month_start"] = t.dt.is_month_start.astype(np.int8)
    out["is_month_end"] = t.dt.is_month_end.astype(np.int8)
    out["is_quarter_start"] = t.dt.is_quarter_start.astype(np.int8)
    out["is_quarter_end"] = t.dt.is_quarter_end.astype(np.int8)

    if hasattr(t.dt, "isocalendar"):
        week = t.dt.isocalendar().week
        # isocalendar() yields a nullable integer column; a plain int cast
        # fails on <NA>, so fall back to float/NaN when NaT is present.
        if week.isna().any():
            out["weekofyear"] = week.astype("float64")
        else:
            out["weekofyear"] = week.astype(int)
    else:
        # Older pandas without Series.dt.isocalendar.
        out["weekofyear"] = t.dt.weekofyear

    return out


def add_cyclical_features(df: pd.DataFrame, col: str, period: int, prefix: str = None) -> pd.DataFrame:
    """Encode a periodic column as a sine/cosine pair.

    Args:
        df: Input frame; it is copied, not modified.
        col: Column holding the periodic value (e.g. hour of day).
        period: Length of one full cycle in the units of ``col``.
        prefix: Name stem for the new columns; ``col`` is used when empty.

    Returns:
        A copy of ``df`` with ``<stem>_sin`` and ``<stem>_cos`` appended.
    """
    result = df.copy()
    stem = prefix if prefix else col
    values = result[col].astype(float)
    cycle = float(period)
    angle = 2*np.pi*values/cycle
    result[f"{stem}_sin"] = np.sin(angle)
    result[f"{stem}_cos"] = np.cos(angle)
    return result


def add_relative_time_features(df: pd.DataFrame, time_col: str, id_col: str = None) -> pd.DataFrame:
    """Add elapsed-time features, computed per entity when ``id_col`` is given.

    Added columns:
      - ``dt_from_prev_sec``: seconds since the previous row (0.0 where there
        is no previous timestamp to diff against).
      - ``dt_from_start_sec``: seconds since the first row after sorting.

    Returns:
        A new frame with each entity's rows sorted by ``time_col``.
    """
    frame = df.copy()

    def _per_entity(block):
        ordered = block.sort_values(time_col).copy()
        stamps = ordered[time_col]
        since_prev = stamps.diff().dt.total_seconds()
        ordered["dt_from_prev_sec"] = since_prev.fillna(0.0)
        since_start = stamps - stamps.iloc[0]
        ordered["dt_from_start_sec"] = since_start.dt.total_seconds()
        return ordered

    if id_col is not None:
        grouped = frame.groupby(id_col, group_keys=False, sort=False)
        return grouped.apply(_per_entity)
    return _per_entity(frame)


# ====== 用法示例 ======
# df_feat = add_calendar_features(df, TIME_COL)
# df_feat = add_cyclical_features(df_feat, "hour", 24)
# df_feat = add_cyclical_features(df_feat, "dayofweek", 7)
# df_feat = add_cyclical_features(df_feat, "month", 12)
# df_feat = add_relative_time_features(df_feat, TIME_COL, ID_COL)


## 5. Lag / Rolling 特征（防泄漏：先 shift 再 rolling）

In [None]:
def add_lag_features(df: pd.DataFrame,
                    cols,
                    lags=(1,2,3,5,10),
                    time_col: str = None,
                    id_col: str = None) -> pd.DataFrame:
    """Add ``<col>_lag<k>`` columns holding each column shifted by ``k`` rows.

    Args:
        df: Input frame; it is copied, not modified.
        cols: A column name or a list of column names to lag.
        lags: Row offsets to shift by.
        time_col: If given, rows are sorted by it before shifting.
        id_col: If given, shifting is done within each entity.

    Returns:
        A new frame with the lag columns appended.
    """
    frame = df.copy()
    targets = [cols] if isinstance(cols, str) else cols

    def _shifted(block):
        if time_col is None:
            ordered = block.copy()
        else:
            ordered = block.sort_values(time_col).copy()
        for name, step in ((c, k) for c in targets for k in lags):
            ordered[f"{name}_lag{step}"] = ordered[name].shift(step)
        return ordered

    if id_col is not None:
        return frame.groupby(id_col, group_keys=False, sort=False).apply(_shifted)
    return _shifted(frame)


def add_rolling_features(df: pd.DataFrame,
                         cols,
                         windows=(5,10,20),
                         stats=("mean","std","min","max"),
                         time_col: str = None,
                         id_col: str = None,
                         min_periods: int = 1,
                         shift_before: int = 1) -> pd.DataFrame:
    """Add rolling statistics computed on a shifted copy of each column.

    The column is shifted by ``shift_before`` rows before the rolling window
    is applied, so with the default of 1 each statistic only sees earlier
    rows. New columns are named ``<col>_rmean<w>``, ``<col>_rstd<w>``
    (population std, ddof=0), ``<col>_rmin<w>`` and ``<col>_rmax<w>``.

    Args:
        df: Input frame; it is copied, not modified.
        cols: A column name or a list of column names.
        windows: Window lengths in rows.
        stats: Which of "mean", "std", "min", "max" to compute.
        time_col: If given, rows are sorted by it first.
        id_col: If given, windows are computed within each entity.
        min_periods: Minimum observations required inside a window.
        shift_before: Rows to shift by before rolling.

    Returns:
        A new frame with the rolling columns appended.
    """
    frame = df.copy()
    targets = [cols] if isinstance(cols, str) else cols

    # (requested stat, column tag, reducer), in the order columns are emitted.
    reducers = (
        ("mean", "rmean", lambda r: r.mean()),
        ("std", "rstd", lambda r: r.std(ddof=0)),
        ("min", "rmin", lambda r: r.min()),
        ("max", "rmax", lambda r: r.max()),
    )

    def _windowed(block):
        if time_col is None:
            ordered = block.copy()
        else:
            ordered = block.sort_values(time_col).copy()
        for name in targets:
            history = ordered[name].shift(shift_before)
            for span in windows:
                roller = history.rolling(window=span, min_periods=min_periods)
                for key, tag, reduce_fn in reducers:
                    if key in stats:
                        ordered[f"{name}_{tag}{span}"] = reduce_fn(roller)
        return ordered

    if id_col is not None:
        return frame.groupby(id_col, group_keys=False, sort=False).apply(_windowed)
    return _windowed(frame)


# ====== 用法示例 ======
# FEATURE_BASE_COLS = ["x1","x2"]   # 数值特征列
# df_feat = add_lag_features(df_feat, FEATURE_BASE_COLS, lags=(1,2,3), time_col=TIME_COL, id_col=ID_COL)
# df_feat = add_rolling_features(df_feat, FEATURE_BASE_COLS, windows=(5,10,20), time_col=TIME_COL, id_col=ID_COL)


## 6. 时间切分：Train/Test + TimeSeriesSplit（walk-forward）

In [None]:
from sklearn.model_selection import TimeSeriesSplit

def time_based_train_test_split(df: pd.DataFrame,
                                time_col: str,
                                test_size: float = 0.2,
                                id_col: str = None):
    """Split chronologically: the latest part of the time range is the test set.

    Rows with a missing timestamp are dropped. The cut is the
    ``1 - test_size`` quantile of ``time_col``; rows at or before the cut go
    to train, rows after it go to test. The cut and both sizes are printed.

    Args:
        df: Input frame; it is not modified.
        time_col: Name of the datetime column.
        test_size: Quantile share of the time column assigned to the test set.
        id_col: If given, used as a secondary sort key after time.

    Returns:
        ``(train, test)`` frames sorted by time.
    """
    order_by = [time_col, id_col] if id_col is not None else [time_col]
    ordered = df.sort_values(order_by).copy()
    has_time = ordered[time_col].notna()
    ordered = ordered[has_time].copy()

    split_time = ordered[time_col].quantile(1 - test_size)
    at_or_before = ordered[time_col] <= split_time
    after = ordered[time_col] > split_time
    train = ordered[at_or_before].copy()
    test = ordered[after].copy()

    print("split_time:", split_time)
    print("train rows:", len(train), "test rows:", len(test))
    return train, test


def build_time_series_cv(n_splits: int = 5):
    """Return a scikit-learn ``TimeSeriesSplit`` with ``n_splits`` folds."""
    splitter = TimeSeriesSplit(n_splits=n_splits)
    return splitter


# ====== 用法示例 ======
# train_df, test_df = time_based_train_test_split(df_feat, TIME_COL, test_size=0.2, id_col=ID_COL)
# cv = build_time_series_cv(n_splits=5)


## 7. sklearn 建模骨架（预处理 + baseline 模型 + 指标）

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

def make_preprocess(numeric_features, categorical_features):
    """Build the column-wise preprocessing step for the baseline models.

    Numeric columns: median imputation, then scaling without centering
    (``with_mean=False``). Categorical columns: most-frequent imputation,
    then one-hot encoding that ignores unseen categories. Columns in neither
    list are dropped.

    Args:
        numeric_features: Names of the numeric columns.
        categorical_features: Names of the categorical columns.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
    branches = [
        ("num", Pipeline(steps=numeric_steps), numeric_features),
        ("cat", Pipeline(steps=categorical_steps), categorical_features),
    ]
    return ColumnTransformer(
        transformers=branches,
        remainder="drop",
        sparse_threshold=0.3,
    )


def fit_evaluate_regression(train_df, test_df, target_col, time_col, drop_cols=()):
    """Fit baseline regressors on the train split and print test RMSE.

    The target, the time column, ``drop_cols`` and the module-level
    ``ID_COL`` (when set) are excluded from the features. Remaining numeric
    columns go through the numeric branch of ``make_preprocess`` and all
    other columns through the categorical branch. Models: Ridge and a
    random forest.

    Args:
        train_df: Training rows (not modified).
        test_df: Test rows (not modified).
        target_col: Name of the regression target.
        time_col: Name of the time column; never used as a feature.
        drop_cols: Additional columns to exclude from the features.

    Returns:
        None; one ``[name] RMSE: value`` line is printed per model.
    """
    drop = {target_col, time_col} | set(drop_cols)
    if ID_COL is not None:
        drop.add(ID_COL)

    X_tr = train_df.drop(columns=[c for c in drop if c in train_df.columns])
    y_tr = train_df[target_col].values
    X_te = test_df.drop(columns=[c for c in drop if c in test_df.columns])
    y_te = test_df[target_col].values

    numeric_features = X_tr.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = [c for c in X_tr.columns if c not in numeric_features]

    models = {
        "ridge": Ridge(alpha=1.0, random_state=0),
        "rf": RandomForestRegressor(n_estimators=300, random_state=0, n_jobs=-1),
    }

    for name, model in models.items():
        # A fresh preprocessor per model: a fitted transformer is stateful and
        # should not be shared between pipelines.
        pre = make_preprocess(numeric_features, categorical_features)
        pipe = Pipeline(steps=[("pre", pre), ("model", model)])
        pipe.fit(X_tr, y_tr)
        pred = pipe.predict(X_te)
        # sqrt(MSE) instead of squared=False: that keyword is deprecated and
        # removed in newer scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_te, pred))
        print(f"[{name}] RMSE:", rmse)


def fit_evaluate_classification(train_df, test_df, target_col, time_col, drop_cols=()):
    """Fit baseline classifiers on the train split and print test ROC AUC.

    The target, the time column, ``drop_cols`` and the module-level
    ``ID_COL`` (when set) are excluded from the features. Models: logistic
    regression and a random forest. The score passed to ``roc_auc_score`` is
    column 1 of ``predict_proba`` when the model has it, otherwise
    ``decision_function``.

    Returns:
        None; one ``[name] AUC: value`` line is printed per model.
    """
    tr = train_df.copy()
    te = test_df.copy()

    excluded = set([target_col, time_col]) | set(drop_cols)
    if ID_COL is not None:
        excluded.add(ID_COL)

    def _features_and_target(frame):
        present = [c for c in excluded if c in frame.columns]
        return frame.drop(columns=present), frame[target_col].values

    X_tr, y_tr = _features_and_target(tr)
    X_te, y_te = _features_and_target(te)

    numeric_features = X_tr.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = [c for c in X_tr.columns if c not in numeric_features]
    pre = make_preprocess(numeric_features, categorical_features)

    candidates = {
        "logreg": LogisticRegression(max_iter=2000),
        "rf": RandomForestClassifier(n_estimators=400, random_state=0, n_jobs=-1),
    }

    for name, model in candidates.items():
        pipe = Pipeline(steps=[("pre", pre), ("model", model)])
        pipe.fit(X_tr, y_tr)
        if hasattr(pipe[-1], "predict_proba"):
            scores = pipe.predict_proba(X_te)[:, 1]
        else:
            scores = pipe.decision_function(X_te)
        auc = roc_auc_score(y_te, scores)
        print(f"[{name}] AUC:", auc)


## 8. 一键流水线（time parse → dedup → align → calendar/cycle/relative → split → baseline）

In [None]:
def build_time_label_feature_table(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Run the whole time-feature pipeline using the module-level config.

    Order: parse the time column, sort and de-duplicate, align to ``FREQ``
    when it is set, add calendar fields, add sin/cos encodings for hour,
    dayofweek and month, then add the relative-time features.

    Args:
        df_raw: Raw input frame containing ``TIME_COL``.

    Returns:
        The feature table.
    """
    parsed = parse_time_column(df_raw, TIME_COL, ASSUME_TZ_IF_NAIVE, OUTPUT_TZ)
    base = sort_and_dedup(parsed, TIME_COL, ID_COL, DEDUP_STRATEGY)

    if FREQ is not None:
        base = align_to_frequency(base, TIME_COL, FREQ, ID_COL, FILL_NUMERIC, MAX_GAP)

    features = add_calendar_features(base.copy(), TIME_COL)

    # Cyclical encodings; each one is applied only if its source column exists.
    for column, period in (("hour", 24), ("dayofweek", 7), ("month", 12)):
        if column in features.columns:
            features = add_cyclical_features(features, column, period)

    return add_relative_time_features(features, TIME_COL, ID_COL)


# ====== 端到端示例（现场按数据路径与列名改） ======
# df_raw = pd.read_parquet("data.parquet")  # or pd.read_csv("data.csv")
# df_feat = build_time_label_feature_table(df_raw)
# basic_time_qc(df_feat, TIME_COL, ID_COL)
# train_df, test_df = time_based_train_test_split(df_feat, TIME_COL, test_size=0.2, id_col=ID_COL)
# fit_evaluate_regression(train_df, test_df, TARGET_COL, TIME_COL)        # 回归
# fit_evaluate_classification(train_df, test_df, TARGET_COL, TIME_COL)    # 二分类


## 9. 最小化检查清单（最后 2 分钟扫一遍）

- 时间列已转 datetime；NaT 比例已打印  
- (id,time) 排序且去重策略明确（last/first/agg）  
- train/test 严格按时间切分；无随机打乱  
- lag/rolling 仅使用历史信息（shift 后再 rolling）  
- 对齐后缺测有 `_is_missing` 或类似标记（若做了对齐）  
- baseline 模型已跑通并有指标（RMSE/AUC）  


## 10. 金融数据中的不均匀时间（Irregular / Event-driven Time）

金融 tick / trade 数据是事件驱动的，不满足均匀时间假设。  
处理原则：**不强行插值**，要么在事件时间中建模，要么通过 bar 机制把不均匀性显式吸收。


### 10.1 Event-time 建模（不对齐、不插值）

In [None]:
def add_event_time_features(df: pd.DataFrame,
                             time_col: str,
                             id_col: str = None) -> pd.DataFrame:
    """Add event-time features; rows are treated as events, with no resampling.

    Added columns (per entity when ``id_col`` is given):
      - ``event_idx``: 0-based position of the event after sorting by time.
      - ``dt_from_prev_sec``: seconds since the previous event (0.0 where
        there is no previous timestamp to diff against).
      - ``log_dt_from_prev_sec``: ``log1p`` of ``dt_from_prev_sec``.

    Returns:
        A new frame with each entity's rows sorted by ``time_col``.
    """
    frame = df.copy()

    def _per_entity(block):
        ordered = block.sort_values(time_col).copy()
        ordered["event_idx"] = np.arange(len(ordered), dtype=np.int64)
        elapsed = ordered[time_col].diff().dt.total_seconds()
        ordered["dt_from_prev_sec"] = elapsed.fillna(0.0)
        ordered["log_dt_from_prev_sec"] = np.log1p(ordered["dt_from_prev_sec"])
        return ordered

    if id_col is not None:
        return frame.groupby(id_col, group_keys=False, sort=False).apply(_per_entity)
    return _per_entity(frame)


# ====== 用法示例 ======
# df_evt = add_event_time_features(df, TIME_COL, ID_COL)


### 10.2 Time bars（固定时间桶）

In [None]:
def make_time_bars(df: pd.DataFrame,
                   time_col: str,
                   freq: str,
                   price_col: str,
                   volume_col: str = None,
                   id_col: str = None) -> pd.DataFrame:
    """Aggregate ticks into fixed-width time bars.

    Output columns per bar:
      - ``price``: last price in the bucket (NaN for an empty bucket)
      - ``n_trades``: count of non-null prices in the bucket
      - ``volume``: summed volume (only when ``volume_col`` is given)
      - ``has_trade``: 1 if ``n_trades`` > 0, else 0
      - ``return``: log of the ratio between this bar's price and the
        previous bar's price

    Args:
        df: Tick data; it is not modified.
        time_col: Name of the datetime column.
        freq: Bucket width (e.g. ``'1min'``).
        price_col: Name of the price column.
        volume_col: Optional name of the volume column.
        id_col: If given, bars are built per entity and then concatenated.

    Returns:
        One row per bucket (per entity), sorted by id and time.
    """
    ticks = df.copy()

    def _bars_for(block):
        indexed = block.set_index(time_col).sort_index()
        price_buckets = indexed[price_col].resample(freq)
        columns = {
            "price": price_buckets.last(),
            "n_trades": price_buckets.count(),
        }
        if volume_col is not None:
            columns["volume"] = indexed[volume_col].resample(freq).sum()
        bars = pd.concat(columns, axis=1)
        bars["has_trade"] = (bars["n_trades"] > 0).astype(np.int8)
        previous = bars["price"].shift(1)
        bars["return"] = np.log(bars["price"] / previous)
        return bars.reset_index()

    if id_col is None:
        return _bars_for(ticks)

    per_entity = []
    for entity, block in ticks.groupby(id_col, sort=False):
        entity_bars = _bars_for(block)
        entity_bars[id_col] = entity
        per_entity.append(entity_bars)

    stacked = pd.concat(per_entity, ignore_index=True)
    return stacked.sort_values([id_col, time_col])


# ====== 用法示例 ======
# bar_df = make_time_bars(df, TIME_COL, "1min", price_col="price", volume_col="volume", id_col=ID_COL)


### 10.3 Volume / Dollar bars（市场活动时间）

In [None]:
def make_volume_bars(df: pd.DataFrame,
                     time_col: str,
                     price_col: str,
                     volume_col: str,
                     volume_threshold: float,
                     id_col: str = None) -> pd.DataFrame:
    """Build volume bars: close a bar once its running volume reaches the threshold.

    Ticks are processed in time order. Volume accumulates from zero; the tick
    that brings the running total to ``volume_threshold`` or more closes the
    bar and is included in it, after which the total restarts at zero.
    Trailing ticks that never reach the threshold do not form a bar. Missing
    volumes count as zero.

    Args:
        df: Tick data; it is not modified.
        time_col: Name of the datetime column.
        price_col: Name of the price column.
        volume_col: Name of the volume column.
        volume_threshold: Volume needed to close a bar.
        id_col: If given, bars are built per entity and then concatenated.

    Returns:
        One row per bar with columns ``time_col`` (time of the closing tick),
        ``price`` (last price), ``volume``, ``n_trades`` and ``duration_sec``
        (closing-tick time minus first-tick time), plus ``id_col`` when given.
        An empty frame with those columns is returned when no bar completes.
    """
    bar_columns = [time_col, "price", "volume", "n_trades", "duration_sec"]

    def _vb(g):
        g = g.sort_values(time_col)
        volumes = g[volume_col].fillna(0.0).to_numpy(dtype=float)
        bars = []
        cum_vol = 0.0
        start = 0

        # Walk by position, not by index label: after sorting (or inside a
        # per-id group) the labels no longer match the row positions that
        # iloc slicing expects.
        for pos, vol in enumerate(volumes):
            cum_vol += vol
            if cum_vol >= volume_threshold:
                chunk = g.iloc[start:pos + 1]
                first_time = chunk[time_col].iloc[0]
                last_time = chunk[time_col].iloc[-1]
                bars.append({
                    time_col: last_time,
                    "price": chunk[price_col].iloc[-1],
                    "volume": chunk[volume_col].sum(),
                    "n_trades": len(chunk),
                    "duration_sec": (last_time - first_time).total_seconds(),
                })
                cum_vol = 0.0
                start = pos + 1

        # Explicit columns keep the schema stable when no bar was produced.
        return pd.DataFrame(bars, columns=bar_columns)

    if id_col is None:
        return _vb(df)

    pieces = []
    for _id, g in df.groupby(id_col, sort=False):
        b = _vb(g)
        if b.empty:
            continue
        b[id_col] = _id
        pieces.append(b)

    if not pieces:
        return pd.DataFrame(columns=bar_columns + [id_col])
    return pd.concat(pieces, ignore_index=True).sort_values([id_col, time_col])


# ====== 用法示例 ======
# vbar_df = make_volume_bars(df, TIME_COL, "price", "volume", volume_threshold=1e6, id_col=ID_COL)


### 10.4 什么时候用哪种（速查）

- **Tick / HFT / microstructure**：event-time + Δt 特征  
- **常规 alpha / 中低频**：time bars（1–5min）  
- **成交活跃度差异大**：volume / dollar bars  
- **研究流动性 / 信息到达**：duration（Δt）本身就是信号  

关键点：不均匀性不是噪声，而是金融数据的重要组成部分。


## 10. 金融 tick 不均匀：Event-time / Time bars / Volume & Dollar bars

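金融 tick/trade/quote 是事件驱动，不是时钟驱动。本节是上一节的扩展实现：同名函数（`add_event_time_features` / `make_time_bars` / `make_volume_bars`）会覆盖前面的定义，且输出列名与部分参数名不同（例如用 `vol_threshold` 取代 `volume_threshold`）。处理路径分三类：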

- **Event-time（事件时间）**：不对齐、不插值；把 `Δt` 当成特征
- **Time bars（固定时间桶）**：按固定频率聚合；显式记录空桶/交易次数
- **Volume / Dollar bars（活动时间）**：按成交量/成交额切桶；每个 bar 的 duration 自带不均匀信息


### 10.1 Event-time：保留不规则时间戳 + 显式 gap 特征

In [None]:
def add_event_time_features(df: pd.DataFrame,
                           time_col: str,
                           id_col: str = None,
                           add_log_gap: bool = True) -> pd.DataFrame:
    """Add event-time features without resampling; every row is one event.

    Added columns (per entity when ``id_col`` is given):
      - ``event_idx``: 0-based position of the event after sorting by time.
      - ``gap_sec``: seconds since the previous event (0.0 where there is no
        previous timestamp to diff against).
      - ``log_gap_sec``: ``log1p`` of ``gap_sec`` clipped at 0; only added
        when ``add_log_gap`` is True.

    Returns:
        A new frame with each entity's rows sorted by ``time_col``.
    """
    frame = df.copy()

    def _per_entity(block):
        ordered = block.sort_values(time_col).copy()
        n_events = len(ordered)
        ordered["event_idx"] = np.arange(n_events, dtype=np.int64)
        seconds = ordered[time_col].diff().dt.total_seconds()
        ordered["gap_sec"] = seconds.fillna(0.0)
        if not add_log_gap:
            return ordered
        non_negative = ordered["gap_sec"].clip(lower=0.0)
        ordered["log_gap_sec"] = np.log1p(non_negative)
        return ordered

    if id_col is not None:
        return frame.groupby(id_col, group_keys=False, sort=False).apply(_per_entity)
    return _per_entity(frame)


# ====== 用法示例 ======
# df_evt = add_event_time_features(df, TIME_COL, ID_COL)
# df_evt = add_lag_features(df_evt, cols=["price","spread"], lags=(1,2,3), time_col=TIME_COL, id_col=ID_COL)
# df_evt = add_rolling_features(df_evt, cols=["return","gap_sec"], windows=(20,50,100), time_col=TIME_COL, id_col=ID_COL)


### 10.2 Time bars：固定时间桶聚合（OHLC/VWAP/成交量/交易次数/空桶标记）

In [None]:
def make_time_bars(df: pd.DataFrame,
                   time_col: str,
                   freq: str,
                   price_col: str = None,
                   volume_col: str = None,
                   id_col: str = None,
                   tz: str = None) -> pd.DataFrame:
    """Aggregate events into fixed-width time bars.

    Output columns per bucket:
      - ``n_events``: number of rows in the bucket
      - ``open`` / ``high`` / ``low`` / ``close`` / ``last`` and ``ret``
        (difference of log ``last`` between consecutive buckets), when
        ``price_col`` is given and present
      - ``volume_sum`` (NaN for a bucket with no volume values), when
        ``volume_col`` is given and present
      - ``vwap``: summed price*volume divided by ``volume_sum``, when both
        columns are available
      - ``has_event``: 1 if ``n_events`` > 0, else 0

    Args:
        df: Event data; it is not modified.
        time_col: Name of the datetime column.
        freq: Bucket width (e.g. ``'1min'``).
        price_col: Optional price column.
        volume_col: Optional volume column.
        id_col: If given, bars are built per entity and then concatenated.
        tz: If given, a naive index is localized to it and an aware index is
            converted to it before bucketing.

    Returns:
        One row per bucket (per entity), sorted by id and time.
    """
    order_by = [time_col] if id_col is None else [id_col, time_col]
    events = df.copy().sort_values(order_by)

    def _bars_for(block):
        ticks = block.set_index(time_col).sort_index()
        if tz is not None:
            if ticks.index.tz is not None:
                ticks.index = ticks.index.tz_convert(tz)
            else:
                ticks.index = ticks.index.tz_localize(tz, ambiguous="infer", nonexistent="shift_forward")

        has_price = price_col is not None and price_col in ticks.columns
        has_volume = volume_col is not None and volume_col in ticks.columns

        # Bucket sizes: count through the first column when there is one,
        # otherwise through the (column-less) frame itself.
        counted = ticks.iloc[:, 0] if len(ticks.columns) else ticks
        parts = [counted.resample(freq).size().rename("n_events")]

        if has_price:
            price_buckets = ticks[price_col].resample(freq)
            close = price_buckets.last().rename("close")
            last = close.rename("last")
            parts.extend([
                price_buckets.first().rename("open"),
                price_buckets.max().rename("high"),
                price_buckets.min().rename("low"),
                close,
                last,
                np.log(last).diff().rename("ret"),
            ])

        if has_volume:
            vol = ticks[volume_col].resample(freq).sum(min_count=1).rename("volume_sum")
            parts.append(vol)
            if has_price:
                notional = (ticks[price_col] * ticks[volume_col]).resample(freq).sum(min_count=1)
                parts.append((notional / vol).rename("vwap"))

        res = pd.concat(parts, axis=1)
        res["has_event"] = (res["n_events"].fillna(0) > 0).astype(np.int8)
        res.index.name = time_col
        return res.reset_index()

    if id_col is None:
        return _bars_for(events)

    per_entity = []
    for entity, block in events.groupby(id_col, sort=False):
        entity_bars = _bars_for(block)
        entity_bars[id_col] = entity
        per_entity.append(entity_bars)
    return pd.concat(per_entity, ignore_index=True).sort_values([id_col, time_col])


# ====== 用法示例 ======
# bars = make_time_bars(df, TIME_COL, freq="1min", price_col="price", volume_col="size", id_col=ID_COL)
# bars = add_calendar_features(bars, TIME_COL)
# train_df, test_df = time_based_train_test_split(bars, TIME_COL, test_size=0.2, id_col=ID_COL)


### 10.3 Volume bars / Dollar bars：按市场活动切桶（每个 bar 自带 duration）

In [None]:
def make_volume_bars(df: pd.DataFrame,
                     time_col: str,
                     price_col: str,
                     volume_col: str,
                     vol_threshold: float,
                     id_col: str = None) -> pd.DataFrame:
    """Build volume bars: cut a bar each time cumulative volume reaches the threshold.

    A tick belongs to bar ``floor(V_before / vol_threshold)``, where
    ``V_before`` is the cumulative volume traded strictly before that tick.
    The tick that brings the running total up to a multiple of the threshold
    therefore closes the current bar instead of opening the next one. Volume
    above the threshold is carried into the following bars (the running total
    is not reset), so ``bar_id`` can skip values after a very large trade.
    Missing volumes count as zero.

    Args:
        df: Tick data; it is not modified.
        time_col: Name of the datetime column.
        price_col: Name of the price column.
        volume_col: Name of the volume column.
        vol_threshold: Volume per bar; must be positive.
        id_col: If given, bars are built per entity and then concatenated.

    Returns:
        One row per bar with ``bar_id``, ``start_time``, ``end_time``,
        ``open``, ``high``, ``low``, ``close``, ``volume_sum``, ``n_events``,
        ``duration_sec`` and ``ret`` (difference of log close between
        consecutive bars), plus ``id_col`` when given.

    Raises:
        ValueError: If ``vol_threshold`` is not positive.
    """
    if not vol_threshold > 0:
        raise ValueError("vol_threshold must be positive")

    out = df.sort_values([time_col] if id_col is None else [id_col, time_col])

    def _one(g):
        g = g.sort_values(time_col).copy()
        vol = g[volume_col].fillna(0.0).astype(float).values
        cumsum = np.cumsum(vol)
        # Cumulative volume *before* each tick. Bucketing on the inclusive
        # cumsum would push the threshold-reaching tick into the next bar and
        # close bars below the threshold.
        prior = np.zeros_like(cumsum)
        prior[1:] = cumsum[:-1]
        g["bar_id"] = (prior // float(vol_threshold)).astype(np.int64)

        res = g.groupby("bar_id", as_index=False).agg(
            start_time=(time_col, "first"),
            end_time=(time_col, "last"),
            open=(price_col, "first"),
            high=(price_col, "max"),
            low=(price_col, "min"),
            close=(price_col, "last"),
            volume_sum=(volume_col, "sum"),
            n_events=(volume_col, "size"),
        )
        res["duration_sec"] = (res["end_time"] - res["start_time"]).dt.total_seconds()
        res["ret"] = np.log(res["close"]).diff()
        return res

    if id_col is None:
        return _one(out)

    pieces = []
    for _id, g in out.groupby(id_col, sort=False):
        r = _one(g)
        r[id_col] = _id
        pieces.append(r)
    return pd.concat(pieces, ignore_index=True).sort_values([id_col, "end_time"])


def make_dollar_bars(df: pd.DataFrame,
                   time_col: str,
                   price_col: str,
                   volume_col: str,
                   dollar_threshold: float,
                   id_col: str = None) -> pd.DataFrame:
    """Build dollar bars: cut a bar each time cumulative traded value reaches the threshold.

    Traded value per tick is ``price * volume``. A tick belongs to bar
    ``floor(D_before / dollar_threshold)``, where ``D_before`` is the
    cumulative value traded strictly before that tick, so the tick that
    reaches the threshold closes the current bar. Value above the threshold
    is carried into the following bars (the running total is not reset), so
    ``bar_id`` can skip values after a very large trade. Missing values count
    as zero.

    Args:
        df: Tick data; it is not modified.
        time_col: Name of the datetime column.
        price_col: Name of the price column.
        volume_col: Name of the volume column.
        dollar_threshold: Traded value per bar; must be positive.
        id_col: If given, bars are built per entity and then concatenated.

    Returns:
        One row per bar with ``bar_id``, ``start_time``, ``end_time``,
        ``open``, ``high``, ``low``, ``close``, ``dollar_sum``,
        ``volume_sum``, ``n_events``, ``duration_sec`` and ``ret`` (difference
        of log close between consecutive bars), plus ``id_col`` when given.

    Raises:
        ValueError: If ``dollar_threshold`` is not positive.
    """
    if not dollar_threshold > 0:
        raise ValueError("dollar_threshold must be positive")

    out = df.sort_values([time_col] if id_col is None else [id_col, time_col])
    # Helper column on the sorted copy; it never reaches the returned bars.
    out["_dollar"] = out[price_col].astype(float) * out[volume_col].astype(float)

    def _one(g):
        g = g.sort_values(time_col).copy()
        amt = g["_dollar"].fillna(0.0).values
        cumsum = np.cumsum(amt)
        # Cumulative value *before* each tick. Bucketing on the inclusive
        # cumsum would push the threshold-reaching tick into the next bar.
        prior = np.zeros_like(cumsum)
        prior[1:] = cumsum[:-1]
        g["bar_id"] = (prior // float(dollar_threshold)).astype(np.int64)

        res = g.groupby("bar_id", as_index=False).agg(
            start_time=(time_col, "first"),
            end_time=(time_col, "last"),
            open=(price_col, "first"),
            high=(price_col, "max"),
            low=(price_col, "min"),
            close=(price_col, "last"),
            dollar_sum=("_dollar", "sum"),
            volume_sum=(volume_col, "sum"),
            n_events=(volume_col, "size"),
        )
        res["duration_sec"] = (res["end_time"] - res["start_time"]).dt.total_seconds()
        res["ret"] = np.log(res["close"]).diff()
        return res

    if id_col is None:
        return _one(out)

    pieces = []
    for _id, g in out.groupby(id_col, sort=False):
        r = _one(g)
        r[id_col] = _id
        pieces.append(r)
    return pd.concat(pieces, ignore_index=True).sort_values([id_col, "end_time"])


# ====== 用法示例 ======
# vbars = make_volume_bars(df, TIME_COL, price_col="price", volume_col="size", vol_threshold=10000, id_col=ID_COL)
# dbars = make_dollar_bars(df, TIME_COL, price_col="price", volume_col="size", dollar_threshold=1_000_000, id_col=ID_COL)
# vbars = add_calendar_features(vbars.rename(columns={"end_time": TIME_COL}), TIME_COL)  # 若想把 end_time 当 bar 时间戳


### 10.4 常见坑（写进检查清单里）

- 盲目插值（对价格线性插值 tick）会制造“假交易”，扭曲波动与收益分布  
- 直接把 tick 当均匀采样做 `shift(1)` 会混淆 1ms 与 5min 的历史信息；`gap_sec` 必须显式进入特征  
- Time bars 里空桶不是噪声：`has_event` / `n_events`（或 `_is_missing`）必须保留  
