# Dataset Interview — NN 部署与训练速查（现场参考）

（本 notebook 用作现场“流程清单 + 可直接运行的最小模板”。按数据情况删改即可。）

---


## 0. 运行开关（任务类型 / 切分方式）

- `TASK`: `"regression" | "binary" | "multiclass"`
- `SPLIT`: `"time" | "random"`
- `TARGET_COL`: 目标列名
- `TIME_COL`: 时间列名（时序切分用）


In [None]:
# ====== Configuration: edit to match the dataset ======
TASK = "regression"      # "regression" | "binary" | "multiclass"
SPLIT = "time"           # "time" | "random"
TARGET_COL = "y"
TIME_COL = "timestamp"   # used when SPLIT="time"

# Multiclass only: number of classes K (may also be inferred after the data is loaded)
NUM_CLASSES = None

# Training hyperparameters (favor stability for on-site use)
SEED = 42
BATCH_SIZE = 512
LR = 1e-3
WEIGHT_DECAY = 1e-4
EPOCHS = 50
PATIENCE = 5
GRAD_CLIP = 1.0

# Resources
USE_GPU = True


## 1. 环境检查 + 依赖导入

In [None]:
import sys, platform, os, math, random, time
import numpy as np
import pandas as pd

# Report the interpreter version, OS platform string and current working directory.
print("Python:", sys.version.split()[0])
print("Platform:", platform.platform())
print("Working dir:", os.getcwd())


In [None]:
# 核心依赖：sklearn + torch
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, f1_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Use CUDA only when it is both requested (USE_GPU) and actually available; otherwise fall back to CPU.
device = torch.device("cuda" if (USE_GPU and torch.cuda.is_available()) else "cpu")
print("Device:", device)


## 2. 可复现性

In [None]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Seed value handed to every generator (default 42).

    Returns:
        None. Mutates global RNG state, ``os.environ`` and the cuDNN flags.
    """
    # NOTE(review): presumably this only influences processes started later;
    # the hash seed of the running interpreter is fixed at startup — confirm
    # if hash ordering matters for the experiment.
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed every RNG once with the configured SEED before the cells below run.
seed_everything(SEED)


## 3. 读入数据（占位）

- 支持 `csv / parquet / feather` 等
- 读入后确保包含 `TARGET_COL`；使用时序切分（`SPLIT="time"`）时还需包含 `TIME_COL`


In [None]:
# ====== Data loading: adjust to the actual path ======
# df = pd.read_csv("data.csv")
# df = pd.read_parquet("data.parquet")

df = None  # replace once the data is loaded
df


## 4. 基础检查（缺失 / 类型 / 目标分布）

In [None]:
# Fail fast if the loading cell above still has the `df = None` placeholder.
assert df is not None, "先在上一个 cell 读入 df"

print("shape:", df.shape)
print(df.head(3))

# Per-column fraction of missing values, largest first.
print("\nMissing ratio (top 20):")
miss = df.isna().mean().sort_values(ascending=False)
# `display` is not imported in this file — presumably the IPython/Jupyter builtin.
display(miss.head(20))

print("\nDtypes (top 30):")
display(df.dtypes.head(30))

# The target column must exist before the next cell reads it.
assert TARGET_COL in df.columns, f"缺少目标列 {TARGET_COL}"


In [None]:
# Summary statistics of the raw (un-encoded) target column.
y_raw = df[TARGET_COL]
print("Target describe:")
display(y_raw.describe(include="all"))

# For classification tasks, also show the most frequent target values.
if TASK in ["binary", "multiclass"]:
    print("Target value counts (top 20):")
    display(y_raw.value_counts().head(20))


## 5. 切分（时序 / 随机）

- 时序切分：按 `TIME_COL` 升序排序，前 80% 作为 train，后 20% 作为 val（比例可改）
- 随机切分：`train_test_split`


In [None]:
def time_split(df, time_col, frac=0.8):
    """Split a frame chronologically into leading train rows and trailing validation rows.

    Args:
        df: Input DataFrame; it is left unmodified.
        time_col: Name of the column to sort by (ascending).
        frac: Fraction of the sorted rows assigned to train. The cut index is
            ``int(len(df) * frac)``, so ``frac=0`` / ``frac=1`` produce an
            empty train / validation frame respectively.

    Returns:
        ``(train_df, val_df)``: independent copies of the two slices. The
        index is reset to ``0..n-1`` after sorting and before slicing, so
        ``val_df`` starts at index ``cut``.
    """
    # A stable sort keeps rows that share a timestamp in their original
    # relative order, so ties at the cut are never reshuffled between
    # train and validation by the sorting algorithm.
    ordered = df.sort_values(time_col, kind="mergesort").reset_index(drop=True)
    cut = int(len(ordered) * frac)
    return ordered.iloc[:cut].copy(), ordered.iloc[cut:].copy()

# Chronological split when SPLIT == "time"; any other value falls back to a seeded random split.
if SPLIT == "time":
    assert TIME_COL in df.columns, f"SPLIT='time' 需要 {TIME_COL}"
    train_df, val_df = time_split(df, TIME_COL, frac=0.8)
else:
    # 80/20 shuffled split, reproducible through SEED.
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=SEED, shuffle=True)

print("train:", train_df.shape, "val:", val_df.shape)


## 6. 特征 / 目标拆分 + 预处理（sklearn 管道）

- 数值列：median 填充 + 标准化
- 类别列：most_frequent 填充 + OneHot
- 预处理只在 train fit，再 transform val


In [None]:
def build_preprocessor(train_df, target_col, exclude_cols=()):
    """Build an (unfitted) sklearn ColumnTransformer for the feature columns.

    Numeric columns get median imputation followed by standardization; every
    other column gets most-frequent imputation followed by one-hot encoding
    (categories unseen at fit time are ignored at transform time).

    Args:
        train_df: Training DataFrame, used only to read column names/dtypes.
        target_col: Target column name; never used as a feature.
        exclude_cols: Extra column names (or a single name) to keep out of the
            features. Names not present in ``train_df`` are ignored. Because
            the transformer uses ``remainder="drop"``, excluded columns may
            still be present in the frames passed to fit/transform.

    Returns:
        ``(preprocessor, numeric_cols, categorical_cols)``.
    """
    if isinstance(exclude_cols, str):
        # Guard against a bare string being iterated character by character.
        exclude_cols = [exclude_cols]

    X = train_df.drop(columns=[target_col])
    X = X.drop(columns=list(exclude_cols), errors="ignore")

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    numeric_set = set(numeric_cols)
    categorical_cols = [c for c in X.columns if c not in numeric_set]

    numeric_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ])
    categorical_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
    ])

    pre = ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, numeric_cols),
            ("cat", categorical_pipe, categorical_cols),
        ],
        remainder="drop",
        sparse_threshold=0.3,
    )
    return pre, numeric_cols, categorical_cols

# Under a time split the raw time column only orders the rows. Left in the
# features it would be one-hot encoded (one column per distinct timestamp,
# none of which recur in the later validation window) when non-numeric, or
# pushed outside the training range when numeric — so keep it out.
_exclude = (TIME_COL,) if SPLIT == "time" else ()
preprocessor, num_cols, cat_cols = build_preprocessor(train_df, TARGET_COL, exclude_cols=_exclude)
print("num cols:", len(num_cols), "cat cols:", len(cat_cols))


In [None]:
def prepare_xy(train_df, val_df, preprocessor, target_col):
    """Separate features from the target and run the preprocessor on both splits.

    The preprocessor is fitted on the training features only and then applied
    unchanged to the validation features.

    Args:
        train_df: Training DataFrame containing ``target_col``.
        val_df: Validation DataFrame containing ``target_col``.
        preprocessor: Object exposing ``fit_transform`` and ``transform``.
        target_col: Name of the target column.

    Returns:
        ``(X_train, y_train_raw, X_val, y_val_raw)`` — transformed features and
        copies of the raw (un-encoded) target columns.
    """
    def _features_and_target(frame):
        # drop() returns a new frame, so the caller's DataFrame is untouched.
        return frame.drop(columns=[target_col]), frame[target_col].copy()

    train_features, train_target = _features_and_target(train_df)
    val_features, val_target = _features_and_target(val_df)

    return (
        preprocessor.fit_transform(train_features),
        train_target,
        preprocessor.transform(val_features),
        val_target,
    )

# Fit the preprocessor on train and transform both splits; the `_sp` outputs are densified in a later cell.
X_train_sp, y_train_raw, X_val_sp, y_val_raw = prepare_xy(train_df, val_df, preprocessor, TARGET_COL)
print("X_train type:", type(X_train_sp), "shape:", X_train_sp.shape)
print("X_val   type:", type(X_val_sp), "shape:", X_val_sp.shape)


## 7. 目标编码（分类）

In [None]:
from sklearn.preprocessing import LabelEncoder

# Stays None for regression; the save cell only persists it when set.
label_encoder = None

if TASK in ("binary", "multiclass"):
    # Labels are stringified first so ints, bools and strings are encoded the
    # same way (e.g. {0,1}, {False,True} or two string classes for binary).
    le = LabelEncoder()
    y_train = le.fit_transform(y_train_raw.astype(str))
    # NOTE: LabelEncoder.transform raises ValueError when the validation split
    # contains a label that never occurs in the training split.
    y_val = le.transform(y_val_raw.astype(str))
    label_encoder = le
    n_seen = len(le.classes_)  # distinct labels observed in train

    if TASK == "binary":
        assert set(np.unique(y_train)).issubset({0,1}), "binary 目标编码异常"
    else:
        K = n_seen if NUM_CLASSES is None else int(NUM_CLASSES)
        # A K smaller than the observed label count would size the output
        # layer too small and yield out-of-range CrossEntropyLoss targets.
        assert K >= n_seen, f"NUM_CLASSES={K} 小于训练集中出现的类别数 {n_seen}"
        NUM_CLASSES = K
        print("NUM_CLASSES:", NUM_CLASSES)
else:
    # regression
    y_train = y_train_raw.astype(float).to_numpy()
    y_val = y_val_raw.astype(float).to_numpy()

print("y_train shape:", np.shape(y_train), "y_val shape:", np.shape(y_val))


## 8. Sparse → dense（小心内存）

- 预处理后可能得到稀疏矩阵；MLP 训练通常用 dense float32
- 维度过大时：先 baseline 或改用 embedding（需要额外工程）


In [None]:
def to_dense_float32(X):
    """Return ``X`` as a dense ``float32`` NumPy array.

    Args:
        X: A scipy sparse matrix (anything exposing ``toarray``) or an
            array-like accepted by ``np.asarray``.

    Returns:
        ``np.ndarray`` with dtype ``float32``.
    """
    if hasattr(X, "toarray"):
        if hasattr(X, "astype"):
            # Cast while still sparse so the dense buffer is allocated in
            # float32 directly, instead of a float64 dense array that is
            # copied down afterwards.
            X = X.astype(np.float32)
        X = X.toarray()
    return np.asarray(X, dtype=np.float32)

# Densify both splits for the baseline and the MLP (watch memory when the one-hot width is large).
X_train = to_dense_float32(X_train_sp)
X_val = to_dense_float32(X_val_sp)

print("Dense shapes:", X_train.shape, X_val.shape, "dtype:", X_train.dtype)


## 9. Baseline（sklearn，1 分钟拿结果）

- regression: Ridge
- binary/multiclass: LogisticRegression


In [None]:
from sklearn.linear_model import Ridge, LogisticRegression

def eval_regression(y_true, y_pred):
    """Compute regression metrics.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        Dict with keys ``"MAE"``, ``"RMSE"`` (square root of the MSE) and ``"R2"``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": math.sqrt(mse),
        "R2": r2_score(y_true, y_pred),
    }

def eval_binary(y_true, prob):
    """Compute binary-classification metrics from positive-class probabilities.

    Args:
        y_true: Ground-truth labels encoded as 0/1.
        prob: Array of P(y=1), one value per sample.

    Returns:
        Dict with keys ``"LogLoss"``, ``"AUC"``, ``"Acc"`` and ``"F1"``.
        ``"AUC"`` is ``nan`` when ``y_true`` contains a single class.
    """
    # Hard predictions at the fixed 0.5 threshold.
    pred = (prob >= 0.5).astype(int)

    # ROC AUC is undefined (roc_auc_score raises) when only one class is
    # present, e.g. a short validation window with no positives; report NaN
    # instead of aborting, matching the labels=[0, 1] guard on log_loss.
    if np.unique(y_true).size > 1:
        auc = roc_auc_score(y_true, prob)
    else:
        auc = float("nan")

    return {
        "LogLoss": log_loss(y_true, prob, labels=[0, 1]),
        "AUC": auc,
        "Acc": accuracy_score(y_true, pred),
        "F1": f1_score(y_true, pred),
    }

def eval_multiclass(y_true, prob):
    """Compute multiclass metrics from per-class probabilities.

    Args:
        y_true: Ground-truth labels encoded as integers ``0..K-1``.
        prob: Array of shape ``(n, K)``; column ``k`` is P(y=k).

    Returns:
        Dict with keys ``"LogLoss"``, ``"Acc"`` and ``"MacroF1"``.
    """
    # Pass the full label set explicitly: without it log_loss infers the
    # classes from y_true and raises when the validation split is missing
    # some of the K classes that `prob` has columns for.
    all_labels = np.arange(prob.shape[1])
    pred = prob.argmax(axis=1)
    return {
        "LogLoss": log_loss(y_true, prob, labels=all_labels),
        "Acc": accuracy_score(y_true, pred),
        "MacroF1": f1_score(y_true, pred, average="macro"),
    }

# Quick linear baseline on the dense features; its metrics are compared with the NN later.
if TASK == "regression":
    base = Ridge(alpha=1.0, random_state=SEED)
    base.fit(X_train, y_train)
    pred = base.predict(X_val)
    baseline_metrics = eval_regression(y_val, pred)
elif TASK == "binary":
    base = LogisticRegression(max_iter=500, n_jobs=-1)
    base.fit(X_train, y_train)
    # Column 1 of predict_proba is P(y=1).
    prob = base.predict_proba(X_val)[:, 1]
    baseline_metrics = eval_binary(y_val, prob)
else:
    # `multi_class` is intentionally not passed: "auto" was already its
    # default, and the parameter is deprecated since scikit-learn 1.5 and
    # scheduled for removal.
    base = LogisticRegression(max_iter=500, n_jobs=-1)
    base.fit(X_train, y_train)
    prob = base.predict_proba(X_val)
    baseline_metrics = eval_multiclass(y_val, prob)

baseline_metrics


## 10. PyTorch Dataset / DataLoader

In [None]:
class NpDataset(Dataset):
    """In-memory dataset wrapping a feature array and a target array as tensors.

    Features are always stored as ``float32``. Targets are stored as
    ``float32`` for ``"regression"`` and ``"binary"`` (BCEWithLogitsLoss) and
    as ``long`` for any other task value (CrossEntropyLoss).
    """

    # Tasks whose loss expects floating-point targets.
    _FLOAT_TARGET_TASKS = ("regression", "binary")

    def __init__(self, X, y, task):
        """Convert ``X`` and ``y`` to tensors according to ``task``."""
        self.task = task
        self.X = torch.tensor(X, dtype=torch.float32)
        if task in self._FLOAT_TARGET_TASKS:
            target_dtype = torch.float32
        else:
            target_dtype = torch.long
        self.y = torch.tensor(y, dtype=target_dtype)

    def __len__(self):
        """Return the number of rows in the feature tensor."""
        return self.X.size(0)

    def __getitem__(self, i):
        """Return the ``(features, target)`` pair at index ``i``."""
        return self.X[i], self.y[i]

# Train loader shuffles each epoch; the validation loader keeps row order and uses a fixed batch of 4096.
train_loader = DataLoader(NpDataset(X_train, y_train, TASK), batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader   = DataLoader(NpDataset(X_val,   y_val,   TASK), batch_size=4096,      shuffle=False, drop_last=False)


## 11. MLP 模型

In [None]:
class MLP(nn.Module):
    """Fully connected network: ``[Linear -> ReLU -> Dropout] * len(hidden) -> Linear``.

    Args:
        d_in: Number of input features.
        d_out: Number of output units (raw outputs, no final activation).
        hidden: Sizes of the hidden layers, in order.
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(self, d_in, d_out, hidden=(256, 128), dropout=0.1):
        super().__init__()
        widths = [d_in, *hidden]
        blocks = []
        # Consecutive width pairs give the (in, out) sizes of each hidden layer.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            blocks.extend((nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(dropout)))
        blocks.append(nn.Linear(widths[-1], d_out))
        self.net = nn.Sequential(*blocks)

    def forward(self, x):
        """Return the raw network outputs for the batch ``x``."""
        return self.net(x)

# Input width is the number of preprocessed feature columns.
d_in = X_train.shape[1]
# Regression and binary use a single output unit; multiclass uses one unit per class.
if TASK == "regression":
    d_out = 1
elif TASK == "binary":
    d_out = 1
else:
    assert NUM_CLASSES is not None, "multiclass 需要 NUM_CLASSES"
    d_out = int(NUM_CLASSES)

model = MLP(d_in=d_in, d_out=d_out, hidden=(256,128), dropout=0.1).to(device)
model


## 12. Loss / 优化器

In [None]:
# Loss per task: SmoothL1 for regression, BCE-with-logits for binary, and
# cross-entropy for every other TASK value (i.e. multiclass).
_loss_by_task = {
    "regression": nn.SmoothL1Loss,
    "binary": nn.BCEWithLogitsLoss,
}
loss_fn = _loss_by_task.get(TASK, nn.CrossEntropyLoss)()

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)


## 13. 训练循环（early stopping + gradient clip）

In [None]:
def train_one_epoch(model, loader, optimizer, loss_fn, device):
    """Run one optimization pass over ``loader`` and return the mean training loss.

    Reads the module-level ``TASK`` (to decide whether the trailing output
    dimension is squeezed) and ``GRAD_CLIP`` (max gradient norm).

    Args:
        model: Network to train; switched to train mode.
        loader: Iterable yielding ``(features, targets)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: Device the batches are moved to.

    Returns:
        Sample-weighted average of the batch losses (0.0 for an empty loader).
    """
    model.train()
    loss_sum = 0.0
    seen = 0

    for features, targets in loader:
        features = features.to(device)
        targets = targets.to(device)

        optimizer.zero_grad(set_to_none=True)
        outputs = model(features)

        # Single-output tasks: drop the trailing dim so outputs match the 1-D targets.
        if TASK in ("regression", "binary"):
            outputs = outputs.squeeze(-1)
        batch_loss = loss_fn(outputs, targets)

        batch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        # Weight by batch size so a smaller last batch does not skew the mean.
        batch_n = features.size(0)
        loss_sum += batch_loss.item() * batch_n
        seen += batch_n

    return loss_sum / max(seen, 1)

@torch.no_grad()
def eval_one_epoch(model, loader, loss_fn, device):
    """Return the mean loss of ``model`` over ``loader`` without updating weights.

    Reads the module-level ``TASK`` to decide whether the trailing output
    dimension is squeezed before the loss is computed.

    Args:
        model: Network to evaluate; switched to eval mode.
        loader: Iterable yielding ``(features, targets)`` batches.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: Device the batches are moved to.

    Returns:
        Sample-weighted average of the batch losses (0.0 for an empty loader).
    """
    model.eval()
    loss_sum = 0.0
    seen = 0

    for features, targets in loader:
        features = features.to(device)
        targets = targets.to(device)
        outputs = model(features)

        # Single-output tasks: drop the trailing dim so outputs match the 1-D targets.
        if TASK in ("regression", "binary"):
            outputs = outputs.squeeze(-1)
        batch_loss = loss_fn(outputs, targets)

        batch_n = features.size(0)
        loss_sum += batch_loss.item() * batch_n
        seen += batch_n

    return loss_sum / max(seen, 1)

def fit(model, train_loader, val_loader, loss_fn, optimizer, device, epochs=50, patience=5):
    """Train with early stopping on validation loss and restore the best weights.

    Args:
        model: Network to train (updated in place).
        train_loader: Loader passed to ``train_one_epoch``.
        val_loader: Loader passed to ``eval_one_epoch``.
        loss_fn: Loss callable shared by training and validation.
        optimizer: Optimizer for ``model``.
        device: Device batches are moved to.
        epochs: Maximum number of epochs.
        patience: Stop once this many consecutive epochs fail to improve the
            validation loss by more than 1e-6.

    Returns:
        ``(history, best)``: a DataFrame with one row per epoch (``epoch``,
        ``train_loss``, ``val_loss``, ``sec``) and the best validation loss
        (``inf`` if no epoch ever improved).
    """
    history = []
    best_loss = float("inf")
    best_weights = None
    stale_epochs = 0

    for ep in range(1, epochs + 1):
        started = time.time()
        tr = train_one_epoch(model, train_loader, optimizer, loss_fn, device)
        va = eval_one_epoch(model, val_loader, loss_fn, device)
        dt = time.time() - started

        history.append(dict(epoch=ep, train_loss=tr, val_loss=va, sec=dt))
        print(f"ep {ep:03d} | train {tr:.6f} | val {va:.6f} | {dt:.1f}s")

        improved = va < best_loss - 1e-6
        if improved:
            best_loss = va
            # Snapshot on CPU so the copy is independent of later updates.
            best_weights = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
        stale_epochs = 0 if improved else stale_epochs + 1

        if stale_epochs >= patience:
            print("early stop")
            break

    if best_weights is not None:
        model.load_state_dict(best_weights)
    return pd.DataFrame(history), best_loss

# Train with the configured epoch budget and patience; `model` ends up holding the best-validation weights.
hist_df, best_val_loss = fit(model, train_loader, val_loader, loss_fn, optimizer, device, epochs=EPOCHS, patience=PATIENCE)
best_val_loss


## 14. 推断与指标评估

In [None]:
@torch.no_grad()
def predict(model, X, device, batch_size=4096):
    """Run batched inference and return the raw model outputs on CPU.

    Args:
        model: Network to evaluate; switched to eval mode.
        X: Array-like of features convertible to a ``float32`` tensor.
        device: Device each batch is moved to for the forward pass.
        batch_size: Number of rows per forward pass.

    Returns:
        CPU tensor with the model outputs for all rows concatenated along
        dim 0, in input order. Empty input yields an empty output.
    """
    model.eval()
    X_t = torch.as_tensor(X, dtype=torch.float32)
    # Slice contiguous chunks instead of wrapping the tensor in a DataLoader,
    # which would index and re-collate it row by row. `split` always returns
    # at least one (possibly empty) chunk, so `torch.cat` never receives an
    # empty list.
    outs = [model(xb.to(device)).detach().cpu() for xb in X_t.split(batch_size)]
    return torch.cat(outs, dim=0)

# Raw network outputs on the validation features.
logits = predict(model, X_val, device=device)

if TASK == "regression":
    # Single output unit: drop the trailing dim to get one prediction per row.
    y_pred = logits.squeeze(-1).numpy()
    nn_metrics = eval_regression(y_val, y_pred)

elif TASK == "binary":
    # Sigmoid turns the single logit into P(y=1).
    prob = torch.sigmoid(logits.squeeze(-1)).numpy()
    nn_metrics = eval_binary(y_val, prob)

else:
    # Softmax over the class dimension gives per-class probabilities.
    prob = torch.softmax(logits, dim=1).numpy()
    nn_metrics = eval_multiclass(y_val, prob)

print("Baseline:", baseline_metrics)
print("NN      :", nn_metrics)


## 15. 误差分析（快速定位问题）

- 回归：画 residual vs pred / time
- 分类：看 confusion / top 错误样本


In [None]:
import matplotlib.pyplot as plt

# Regression only: inspect residuals (true minus predicted) on the validation split.
if TASK == "regression":
    resid = y_val - y_pred
    # Residuals of the first 200 validation rows, in row order.
    plt.figure()
    plt.plot(resid[:200])
    plt.title("Residual (first 200)")
    plt.show()

    # Residual against prediction.
    plt.figure()
    plt.scatter(y_pred, resid, s=6)
    plt.title("Residual vs Pred")
    plt.xlabel("pred")
    plt.ylabel("resid")
    plt.show()


In [None]:
if TASK in ["binary", "multiclass"]:
    # Rough look at the predicted-probability distribution (`prob` comes from the evaluation cell).
    plt.figure()
    if TASK == "binary":
        plt.hist(prob, bins=50)
        plt.title("Predicted P(y=1)")
    else:
        # Multiclass: histogram of the highest class probability per row.
        plt.hist(prob.max(axis=1), bins=50)
        plt.title("Max class probability")
    plt.show()


## 16. 保存（模型 + 预处理器 + 标签编码器）

- 现场通常只需要保存当前最优状态；也可直接展示结果不落盘


In [None]:
import joblib

# Output directory for all artifacts; created if it does not exist.
ART_DIR = "artifacts"
os.makedirs(ART_DIR, exist_ok=True)

# torch model (weights only, as a state_dict)
torch.save(model.state_dict(), os.path.join(ART_DIR, "mlp_state_dict.pt"))

# sklearn preprocessor
joblib.dump(preprocessor, os.path.join(ART_DIR, "preprocessor.joblib"))

# label encoder (classification only; it is None for regression)
if label_encoder is not None:
    joblib.dump(label_encoder, os.path.join(ART_DIR, "label_encoder.joblib"))

print("Saved to:", ART_DIR)


## 17. 现场汇报用的输出（1 页）

- 数据：样本量 / 时间跨度 / 缺失比例
- Baseline 指标
- NN 指标 + 提升幅度
- 关键特征/误差分析发现
- 下一步改进方向（如果还有时间）


In [None]:
# One-page summary: split sizes, run configuration, and baseline vs NN metrics.
summary = {
    "n_train": int(len(train_df)),
    "n_val": int(len(val_df)),
    "task": TASK,
    "split": SPLIT,
    "baseline": baseline_metrics,
    "nn": nn_metrics,
}

summary
