# GRU Dataset Interview 快速部署与训练模板（自用）

- 目标：最短时间内跑通 GRU baseline（数据→滑窗→训练→评估→图表→结论要点）。
- 关键约束：时间切分防泄漏；标准化只在 train 拟合；滑窗标签严格对齐；全流程可复现。


## 使用顺序（按执行）
1. 环境与依赖检查
2. 读取数据与统一 schema
3. 目标构造与缺失处理
4. 时间切分
5. 滑窗构造
6. 标准化
7. GRU 训练
8. 评估与可视化
9. 稳定性与基线
10. 导出结果


## 1) 环境与依赖检查
- 记录版本，保证可复现。
- 依赖：numpy/pandas/sklearn + torch。


In [None]:

import sys, platform, importlib, warnings
# Silence every warning category for the rest of the session to keep cell output compact.
# NOTE(review): this also hides deprecation and numerical warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

def show_versions(pkgs=None):
    """Collect interpreter, platform and package versions for reproducibility logs.

    Args:
        pkgs: Optional iterable of importable module names to probe. When
            omitted, the stack this notebook relies on is used
            (numpy, pandas, sklearn, matplotlib, torch).

    Returns:
        dict with the keys "python" and "platform" plus one key per probed
        module. A module maps to its ``__version__`` string, to "unknown" if
        it exposes no ``__version__``, or to ``None`` if importing it failed.
    """
    if pkgs is None:
        pkgs = ("numpy", "pandas", "sklearn", "matplotlib", "torch")
    out = {"python": sys.version.split()[0], "platform": platform.platform()}
    for name in pkgs:
        try:
            module = importlib.import_module(name)
            out[name] = getattr(module, "__version__", "unknown")
        except Exception:
            # Best-effort probe: a missing or broken package must not abort the report.
            out[name] = None
    return out

# Last expression of the cell, so the notebook renders the returned version dict.
show_versions()


## 2) 读取数据与统一 schema
- 规整为：time / (optional id) / target / numeric features。
- 排序：单序列按 time；面板按 (id,time)。


In [None]:

import pandas as pd
import numpy as np

DATA_PATH = "data.csv"   # TODO: replace with the real path on site
time_col = "timestamp"   # TODO
id_col = None            # e.g. "asset_id"; leave None for a single series
target_col = "y"         # TODO

df = pd.read_csv(DATA_PATH)

# Validate the schema with real exceptions: `assert` statements are stripped
# when Python runs with -O, which would let a bad file through silently.
required_cols = [time_col, target_col] + ([id_col] if id_col is not None else [])
for required in required_cols:
    if required not in df.columns:
        raise KeyError(f"missing {required}")

df[time_col] = pd.to_datetime(df[time_col])

# Single series: order by time. Panel: order by (id, time).
# A stable sort keeps the file order of rows that share a timestamp, so the
# row order (and every window built from it) is reproducible across runs.
sort_cols = [time_col] if id_col is None else [id_col, time_col]
df = df.sort_values(sort_cols, kind="mergesort").reset_index(drop=True)

df.head()


### 2.1 特征列推断
- 排除 time/id/target
- 仅保留数值型特征；非数值型先记录。


In [None]:

# Columns that can never be model inputs: time, target and (for panels) the id.
exclude = {time_col, target_col} | ({id_col} if id_col is not None else set())

candidate_cols = [col for col in df.columns if col not in exclude]

# Partition the candidates by dtype: numeric columns become features,
# everything else is only recorded for later inspection.
num_cols = [col for col in candidate_cols if pd.api.types.is_numeric_dtype(df[col])]
cat_cols = [col for col in candidate_cols if not pd.api.types.is_numeric_dtype(df[col])]

feature_cols = list(num_cols)

print("rows:", len(df))
print("num_features:", len(feature_cols))
print("cat_features:", len(cat_cols))
cat_cols[:10], feature_cols[:10]


## 3) 目标构造与缺失处理
- horizon 预测：窗口末端为 t，标签取 t+h。
- 缺失：先 ffill 再 bfill，仍缺失的填 0；并为每个数值列生成缺失指示特征（`<列名>__isna`）。


In [None]:

horizon = 1      # TODO
window_L = 64    # TODO

NA_SUFFIX = "__isna"  # suffix reserved for the missing-value indicator columns

# Start from the raw feature list so that re-running this cell is idempotent:
# without this, a second run would add "<col>__isna__isna" columns and
# duplicate the indicator names inside feature_cols.
base_cols = [c for c in feature_cols if not c.endswith(NA_SUFFIX)]

na_cols = []
for c in base_cols + [target_col]:
    if c in df.columns and pd.api.types.is_numeric_dtype(df[c]):
        na_col = f"{c}{NA_SUFFIX}"
        # Record missingness only once, before imputation; recomputing it after
        # the fill below would overwrite the real indicator with all zeros.
        if na_col not in df.columns:
            df[na_col] = df[c].isna().astype(np.int8)
        na_cols.append(na_col)
        # Forward-fill first; the following back-fill can then only touch gaps
        # at the start of a series, which it fills from the first later value.
        # NOTE(review): that back-fill is a small look-ahead, and the target
        # itself is imputed here — confirm both are acceptable for the task.
        if id_col is None:
            df[c] = df[c].ffill().bfill()
        else:
            df[c] = df.groupby(id_col, group_keys=False)[c].apply(lambda s: s.ffill().bfill())
        # Columns (or groups) that are entirely missing fall back to 0.
        df[c] = df[c].fillna(0.0)

feature_cols = base_cols + na_cols

df[[time_col] + ([id_col] if id_col else []) + feature_cols[:5] + [target_col]].head()


## 4) 时间切分（严格防泄漏）
- 按全局时间点切分。


In [None]:

train_frac, valid_frac = 0.70, 0.15

# Split on the sorted set of distinct timestamps so that all rows sharing a
# timestamp (e.g. every id of a panel) land in the same split.
unique_times = np.array(sorted(df[time_col].unique()))
nT = len(unique_times)

i_train_end = int(np.floor(nT * train_frac))
i_valid_end = int(np.floor(nT * (train_frac + valid_frac)))

# Guard the `[i - 1]` lookups below: with i_train_end == 0 the index -1 would
# wrap around to the LAST timestamp and silently put every row into "train".
if i_train_end < 1 or i_valid_end < i_train_end:
    raise ValueError(
        f"cannot split {nT} distinct timestamps with "
        f"train_frac={train_frac}, valid_frac={valid_frac}"
    )

# Inclusive upper time bounds of the train and valid segments.
train_time_max = unique_times[i_train_end-1]
valid_time_max = unique_times[i_valid_end-1]

df["split"] = np.where(df[time_col] <= train_time_max, "train",
               np.where(df[time_col] <= valid_time_max, "valid", "test"))

df["split"].value_counts()


## 5) 滑窗构造（N,T,F）
- 输出：X (N,L,F)，y (N,)；L 为窗口长度 window_L，对应标题中的时间维 T。
- 面板：每个 id 独立滑窗。


In [None]:

from typing import Tuple, Optional, List

def make_windows_single(df_part: pd.DataFrame,
                        feature_cols: List[str],
                        target_col: str,
                        time_col: str,
                        horizon: int,
                        window_L: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Build sliding windows and horizon-shifted labels for one ordered series.

    For every window end row ``end`` the inputs are rows
    ``end - window_L + 1 .. end`` and the label is the target at row
    ``end + horizon``.

    Args:
        df_part: Rows of a single series in time order. Must contain
            ``feature_cols``, ``target_col``, ``time_col`` and a "split" column.
        feature_cols: Names of the input feature columns.
        target_col: Name of the label column.
        time_col: Name of the timestamp column.
        horizon: Number of rows between the window end and its label (>= 0).
        window_L: Number of rows per window (>= 1).

    Returns:
        Tuple ``(X, y, split, t_end)``:
        X is float32 of shape (N, window_L, F); y is float32 of shape (N,);
        split holds the split tag of each window's LABEL row; t_end holds the
        timestamp of each window's last input row. All are empty (N == 0) when
        the series is too short for a single window.

    Raises:
        ValueError: If ``window_L < 1`` or ``horizon < 0``.
    """
    if window_L < 1 or horizon < 0:
        raise ValueError(f"need window_L >= 1 and horizon >= 0, got {window_L} and {horizon}")

    x = df_part[feature_cols].to_numpy(dtype=np.float32)
    y = df_part[target_col].to_numpy(dtype=np.float32)
    t = df_part[time_col].to_numpy()
    split = df_part["split"].to_numpy()

    # Last row that can end a window while its label row still exists.
    last_end = len(df_part) - 1 - horizon
    if last_end < window_L - 1:
        return (np.empty((0, window_L, len(feature_cols)), np.float32),
                np.empty((0,), np.float32),
                np.empty((0,), object),
                np.empty((0,), object))

    ends = np.arange(window_L - 1, last_end + 1)
    # Row index grid of shape (N, window_L): one row of consecutive indices per window.
    row_idx = ends[:, None] + np.arange(-(window_L - 1), 1)[None, :]
    label_idx = ends + horizon

    # Tag each window by the split of its label row, not of its last input row.
    # Rows are time ordered, so a "train" label implies the whole window is in
    # train, whereas tagging by the window end would hand labels from the
    # valid/test period to the train windows next to the boundary.
    return x[row_idx], y[label_idx], split[label_idx], t[ends]

def make_windows(df: pd.DataFrame,
                 feature_cols: List[str],
                 target_col: str,
                 time_col: str,
                 id_col: Optional[str],
                 horizon: int,
                 window_L: int):
    """Build windows for a single series or, per id, for a panel.

    Each series is windowed independently by ``make_windows_single`` and the
    results are concatenated in group order; series that yield no window are
    skipped.

    Args:
        df: Input frame, expected to be ordered by time within each series.
        feature_cols: Names of the input feature columns.
        target_col: Name of the label column.
        time_col: Name of the timestamp column.
        id_col: Name of the series id column, or None for a single series.
        horizon: Label offset passed through to ``make_windows_single``.
        window_L: Window length passed through to ``make_windows_single``.

    Returns:
        Tuple ``(X, y, split, ids, t_end)`` where ``ids`` is an object array
        holding the series id of every window ("single" when ``id_col`` is None)
        and the remaining arrays are the concatenated per-series outputs.

    Raises:
        ValueError: If no series produced any window.
    """
    # Treat the single-series case as a one-group panel keyed "single".
    if id_col is None:
        groups = [("single", df)]
    else:
        groups = df.groupby(id_col, sort=False)

    chunks = []
    for gid, part in groups:
        X, y, s_end, t_end = make_windows_single(part, feature_cols, target_col, time_col, horizon, window_L)
        if len(y) == 0:
            continue
        id_tags = np.full((len(y),), gid, dtype=object)
        chunks.append((X, y, s_end, id_tags, t_end))

    if not chunks:
        raise ValueError("no windows generated; check window_L/horizon/data length")

    # Transpose the list of per-series tuples and concatenate each field.
    X, y, split, ids, t_end = (np.concatenate(parts, axis=0) for parts in zip(*chunks))
    return X, y, split, ids, t_end

# Window the whole frame once; the arrays are index-aligned per window.
X, y, split_w, ids_w, t_end_w = make_windows(df, feature_cols, target_col, time_col, id_col, horizon, window_L)
# Cell output: array shapes plus the number of windows per split tag.
X.shape, y.shape, pd.Series(split_w).value_counts()


## 6) 标准化（fit train）
- 只用 train 窗口拟合 scaler。


In [None]:

from sklearn.preprocessing import StandardScaler

# One boolean mask over the windows per split tag.
train_mask, valid_mask, test_mask = (split_w == tag for tag in ("train", "valid", "test"))

X_train = X[train_mask]
Ntr, T, F = X_train.shape

# Fit on train windows only, flattened to (window * step, feature) rows so the
# statistics are computed per feature column.
scaler = StandardScaler()
scaler.fit(X_train.reshape(Ntr*T, F))

def apply_scaler(X_in: np.ndarray) -> np.ndarray:
    """Standardize a (N, T, F) window array with the train-fitted scaler; returns float32."""
    n_win, n_step, n_feat = X_in.shape
    flat = X_in.reshape(n_win*n_step, n_feat)
    scaled = scaler.transform(flat)
    return scaled.reshape(n_win, n_step, n_feat).astype(np.float32)

Xz = apply_scaler(X)


## 7) GRU 模型与训练


In [None]:

# 任务类型
task = "regression"      # "regression" / "classification"
num_classes = None       # 分类任务时设为 C

# 训练超参
device = "cuda"
seed = 42
batch_size = 128
epochs = 30
lr = 1e-3
weight_decay = 1e-4
grad_clip = 1.0
patience = 5

# 模型超参
hidden_size = 64
num_layers = 1
dropout = 0.2


In [None]:

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN determinism flags.

    Args:
        seed: Integer seed applied to every generator; it is also exported as
            the PYTHONHASHSEED environment variable.
    """
    import os
    import random

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every random source the notebook touches.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(seed)
# Use CUDA only when it is both requested and actually available; otherwise fall back to CPU.
device_t = torch.device("cuda" if (device=="cuda" and torch.cuda.is_available()) else "cpu")
device_t


In [None]:

class WindowDataset(Dataset):
    """Expose paired window/label NumPy arrays as an indexable torch dataset."""

    def __init__(self, X: np.ndarray, y: np.ndarray):
        """Wrap the window array ``X`` and the label array ``y`` as tensors."""
        self.X, self.y = torch.from_numpy(X), torch.from_numpy(y)

    def __len__(self):
        """Return the number of windows (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, i):
        """Return the ``(window, label)`` pair stored at position ``i``."""
        return (self.X[i], self.y[i])

# Slice the standardized windows and their labels by split.
Xtr, Xva, Xte = Xz[train_mask], Xz[valid_mask], Xz[test_mask]
ytr, yva, yte = y[train_mask], y[valid_mask], y[test_mask]

# Label dtype follows the task: int64 class ids for classification,
# float32 targets for regression.
if task == "classification":
    assert num_classes is not None
    label_dtype = np.int64
else:
    label_dtype = np.float32
ytr, yva, yte = (labels.astype(label_dtype) for labels in (ytr, yva, yte))

# Only the training loader shuffles; valid/test keep the window order.
dl_tr = DataLoader(WindowDataset(Xtr, ytr), batch_size=batch_size, shuffle=True)
dl_va, dl_te = (
    DataLoader(WindowDataset(X_part, y_part), batch_size=batch_size, shuffle=False)
    for X_part, y_part in ((Xva, yva), (Xte, yte))
)

(len(dl_tr), len(dl_va), len(dl_te))


In [None]:

class GRUModel(nn.Module):
    """GRU encoder whose last-step output feeds a dropout layer and a linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability for the head, and between GRU layers
            when ``num_layers > 1``.
        task: "regression" gives one output unit; any other value gives
            ``num_classes`` output units.
        num_classes: Number of classes; required when ``task`` is not "regression".
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout, task, num_classes=None):
        super().__init__()
        self.task = task

        # Dropout inside the GRU is only passed when there is more than one layer.
        gru_dropout = dropout if num_layers > 1 else 0.0
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=gru_dropout,
        )
        self.drop = nn.Dropout(dropout)

        if task == "regression":
            out_dim = 1
        else:
            out_dim = int(num_classes)
        self.head = nn.Linear(hidden_size, out_dim)

    def forward(self, x):
        """Map a (batch, time, feature) input to (batch, out_dim) predictions."""
        seq_out, _ = self.gru(x)
        # Only the output at the final time step feeds the head.
        final_step = seq_out[:, -1, :]
        return self.head(self.drop(final_step))

# F is the per-step feature count unpacked from the train window shape in the standardization cell.
model = GRUModel(F, hidden_size, num_layers, dropout, task, num_classes).to(device_t)
model


In [None]:

# Smooth L1 for regression, cross-entropy for classification.
loss_fn = nn.SmoothL1Loss() if task == "regression" else nn.CrossEntropyLoss()

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
)

def eval_epoch(model, dl):
    """Evaluate ``model`` on every batch of ``dl`` without gradient tracking.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        dl: Iterable yielding ``(inputs, labels)`` tensor batches.

    Returns:
        Tuple ``(mean_loss, y_true, y_pred)``: the per-sample mean loss as a
        float, the concatenated labels, and the concatenated predictions
        (squeezed to 1-D for regression, raw logits otherwise).

    Raises:
        ValueError: If ``dl`` yields no samples.
    """
    model.eval()
    total_loss, n_seen = 0.0, 0
    ys, yhs = [], []
    with torch.no_grad():
        for xb, yb in dl:
            xb = xb.to(device_t)
            yb = yb.to(device_t)
            pred = model(xb)
            if task == "regression":
                # Drop the trailing unit axis so predictions line up with 1-D labels.
                pred = pred.squeeze(-1)
            loss = loss_fn(pred, yb)
            # Weight each batch mean by its size; a plain mean over batches
            # would over-weight a short final batch.
            # Assumes loss_fn averages over the batch (the default reduction).
            batch_n = yb.shape[0]
            total_loss += loss.item() * batch_n
            n_seen += batch_n
            yhs.append(pred.cpu().numpy())
            ys.append(yb.cpu().numpy())
    if n_seen == 0:
        raise ValueError("eval_epoch received an empty data loader")
    return total_loss / n_seen, np.concatenate(ys), np.concatenate(yhs)

def train_one_epoch(model, dl):
    """Run one optimization pass over ``dl`` and return the mean training loss.

    Args:
        model: Module to train; it is switched to train mode.
        dl: Iterable yielding ``(inputs, labels)`` tensor batches.

    Returns:
        Per-sample mean of the batch losses observed during the pass, as a float.

    Raises:
        ValueError: If ``dl`` yields no samples.
    """
    model.train()
    total_loss, n_seen = 0.0, 0
    for xb, yb in dl:
        xb = xb.to(device_t)
        yb = yb.to(device_t)
        optimizer.zero_grad(set_to_none=True)
        pred = model(xb)
        if task == "regression":
            # Drop the trailing unit axis so predictions line up with 1-D labels.
            pred = pred.squeeze(-1)
        loss = loss_fn(pred, yb)
        loss.backward()
        # Cap the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        # Weight each batch mean by its size so a short final batch does not
        # skew the reported loss (assumes loss_fn uses mean reduction).
        batch_n = yb.shape[0]
        total_loss += loss.item() * batch_n
        n_seen += batch_n
    if n_seen == 0:
        raise ValueError("train_one_epoch received an empty data loader")
    return total_loss / n_seen


In [None]:

# Early-stopping state: best validation loss so far, a CPU copy of the
# matching weights, and the count of consecutive non-improving epochs.
best_val, best_state = float("inf"), None
bad = 0
history = []

for ep in range(1, epochs+1):
    tr_loss = train_one_epoch(model, dl_tr)
    va_loss = eval_epoch(model, dl_va)[0]
    history.append(dict(epoch=ep, train_loss=tr_loss, val_loss=va_loss))
    print(f"epoch {ep:03d} | train {tr_loss:.6f} | val {va_loss:.6f}")

    # Count an epoch as an improvement only if it beats the best by more than 1e-8.
    improved = va_loss + 1e-8 < best_val
    if improved:
        best_val, bad = va_loss, 0
        best_state = {name: tensor.detach().cpu().clone()
                      for name, tensor in model.state_dict().items()}
        continue

    bad += 1
    if bad >= patience:
        print("early stop")
        break

# Restore the weights of the best validation epoch, if any was recorded.
if best_state is not None:
    model.load_state_dict(best_state)

best_val


## 8) 评估与可视化


In [None]:

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import math

te_loss, y_te, yh_te = eval_epoch(model, dl_te)

cm = None  # confusion matrix; only filled in for classification
if task == "regression":
    yhat = yh_te.reshape(-1)
    rmse = float(np.sqrt(mean_squared_error(y_te, yhat)))
    mae  = float(mean_absolute_error(y_te, yhat))
    r2   = float(r2_score(y_te, yhat))
    metrics = {"test_loss": float(te_loss), "rmse": rmse, "mae": mae, "r2": r2}
else:
    # Predicted class = index of the largest logit.
    logits = yh_te
    pred = logits.argmax(axis=1)
    acc = float(accuracy_score(y_te, pred))
    cm = confusion_matrix(y_te, pred)
    metrics = {"test_loss": float(te_loss), "accuracy": acc}

# A bare expression inside an if/else branch is never rendered by the notebook;
# only the cell's final top-level expression is, so surface the results here.
metrics if cm is None else (metrics, cm)


In [None]:

if task == "regression":
    # Predicted vs. true values on the test split, one small marker per window.
    yhat = yh_te.reshape(-1)
    fig, ax = plt.subplots()
    ax.scatter(y_te, yhat, s=6)
    ax.set_xlabel("y_true")
    ax.set_ylabel("y_pred")
    ax.set_title("Test: Pred vs True")
    plt.show()


## 9) 稳定性检查与对照基线


In [None]:

from sklearn.linear_model import Ridge

def bucket_by_time(t_arr, n_buckets=5):
    """Split positions of ``t_arr`` into time-ordered, near-equal buckets.

    Args:
        t_arr: Array of sortable time values.
        n_buckets: Number of buckets to produce.

    Returns:
        List of ``n_buckets`` index arrays; concatenated, they are the
        positions of ``t_arr`` in ascending time order.
    """
    return np.array_split(np.argsort(t_arr), n_buckets)

if task == "regression":
    # Evaluate the trained model separately on consecutive time slices of the test split.
    test_idx = np.where(test_mask)[0]
    buckets = bucket_by_time(t_end_w[test_idx], n_buckets=5)
    rows = []
    for bi, b in enumerate(buckets):
        if len(b) == 0:
            # With fewer test windows than buckets some buckets are empty;
            # skip them instead of failing on an empty data loader.
            continue
        # `b` indexes into test_idx, so map it back to positions in the full window arrays.
        idx = test_idx[b]
        dl_b = DataLoader(WindowDataset(Xz[idx], y[idx].astype(np.float32)), batch_size=512, shuffle=False)
        _, y_b, yh_b = eval_epoch(model, dl_b)
        yhat = yh_b.reshape(-1)
        rows.append({
            "bucket": bi,
            "n": int(len(idx)),
            "rmse": float(np.sqrt(mean_squared_error(y_b, yhat))),
            "mae": float(mean_absolute_error(y_b, yhat)),
        })
    display(pd.DataFrame(rows))

# Baseline: Ridge regression on the last time step of each standardized window.
X_last = Xz[:, -1, :]
Xtr_b, Xte_b = X_last[train_mask], X_last[test_mask]
ytr_b, yte_b = y[train_mask], y[test_mask]

baseline_cmp = None  # stays None for classification, where no baseline is run
if task == "regression":
    ridge = Ridge(alpha=1.0)
    ridge.fit(Xtr_b, ytr_b)
    pred = ridge.predict(Xte_b)
    ridge_rmse = float(np.sqrt(mean_squared_error(yte_b, pred)))
    baseline_cmp = {"ridge_rmse": ridge_rmse, "gru_rmse": metrics.get("rmse")}

# The comparison must be the cell's final top-level expression to be rendered;
# as a bare expression inside the `if` it was silently discarded.
baseline_cmp


## 10) 导出结果（用于 PPT）


In [None]:

import os, json
# Everything for the slides is written under one output directory.
out_dir = "outputs_gru"
os.makedirs(out_dir, exist_ok=True)

history_path = os.path.join(out_dir, "train_history.csv")
metrics_path = os.path.join(out_dir, "metrics.json")

# Per-epoch train/validation losses as CSV.
hist_df = pd.DataFrame(history)
hist_df.to_csv(history_path, index=False)

# Final test metrics as pretty-printed JSON.
with open(metrics_path, "w") as f:
    f.write(json.dumps(metrics, indent=2))

hist_df.tail(), metrics


In [None]:

def ppt_bullets(metrics: dict, task: str):
    """Return the five summary bullet strings used on the result slide.

    Args:
        metrics: Test metrics; needs "rmse", "mae" and "r2" for regression,
            "accuracy" otherwise.
        task: "regression" selects the regression wording; any other value
            selects the classification wording.

    Returns:
        List of five strings: headline metrics, pipeline checks, window/model
        configuration, stability note and baseline note. The configuration
        line reads the notebook-level window and model hyperparameters.
    """
    if task == "regression":
        headline = f"GRU test RMSE={metrics['rmse']:.4f}, MAE={metrics['mae']:.4f}, R2={metrics['r2']:.4f}"
        stability_line = "稳定性：按时间分桶输出分段 RMSE/MAE（见 notebook）"
        baseline_line = "对照：Ridge(last-step) baseline 已跑，用于校验增益与避免泄漏误判"
    else:
        headline = f"GRU test accuracy={metrics['accuracy']:.4f}"
        stability_line = "稳定性：按时间分桶/按实体分组输出（见 notebook）"
        baseline_line = "对照：baseline 已跑，用于校验增益与避免泄漏误判"

    # These two lines are identical for both task types.
    pipeline_line = "时间切分完成；标准化仅在 train 拟合；滑窗标签对齐已验证"
    config_line = f"窗口 L={window_L}, horizon h={horizon}, hidden={hidden_size}, layers={num_layers}, dropout={dropout}"

    return [headline, pipeline_line, config_line, stability_line, baseline_line]

# Last expression of the cell, so the notebook renders the bullet list.
ppt_bullets(metrics, task)
