# sklearn 线性模型训练速查（面试现场用）

目标：在受限环境里快速完成 **EDA → 预处理 → 线性模型训练 → 评估 → 解释性输出**。  
原则：所有预处理放进 `Pipeline`，避免数据泄露；时间序列只用过去预测未来。

In [None]:
# 基础依赖
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## 0. 数据约定

- `df`：pandas DataFrame  
- 目标列：`y_col`（回归/分类均可）  
- 时间列（若有）：`time_col`（用于排序与时间切分）

In [None]:
# Adjust these settings to match the dataset at hand.
# Name of the target column in `df`.
y_col = "y"
# Name of the time column used for sorting and the chronological split,
# e.g. "date"; keep None when the data has no time column.
time_col = None

# Load the data on site, for example:
# df = pd.read_csv("...")
# display(df.head())

## 1. 预处理模板（数值 / 类别）

- 数值：median 填补 + 标准化  
- 类别：众数填补 + OneHot（忽略未见类别）

In [None]:
# Column selectors are resolved against the DataFrame at fit time: numeric
# dtypes go to the numeric branch, every other dtype to the categorical one.
# NOTE(review): "every other dtype" includes datetime/bool columns -- confirm
# that routing them through the categorical branch is intended for your data.
num_sel = make_column_selector(dtype_include=np.number)
cat_sel = make_column_selector(dtype_exclude=np.number)

# Numeric features: median imputation, then standardization.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=True)),
])

# Categorical features: most-frequent imputation, then one-hot encoding that
# ignores categories unseen during fit instead of raising.
# The `sparse` keyword is intentionally not passed: it was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, while sparse output
# is the encoder's default on every version.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# The two selectors are complementary, so `remainder="drop"` only takes effect
# if one of them is narrowed later.
preprocess = ColumnTransformer([
    ("num", numeric_pipe, num_sel),
    ("cat", categorical_pipe, cat_sel),
], remainder="drop")

## 2. 切分（普通 / 时间序列）

- 普通：`train_test_split`  
- 时间序列：按时间排序后做前 80% 训练、后 20% 测试（或按需求改比例）

In [None]:
# Build X / y and an 80/20 train/test split.
if time_col is None:
    # No time ordering: random split with a fixed seed for reproducibility.
    X = df.drop(columns=[y_col])
    y = df[y_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
else:
    # Chronological split: the earliest 80% of rows train, the latest 20% test.
    # A stable sort keeps rows with identical timestamps in their original
    # order, so the rows on either side of the cut are reproducible.
    df_sorted = df.sort_values(time_col, kind="mergesort")
    # NOTE(review): time_col itself stays in X and will be routed through the
    # preprocessing by dtype -- drop it here if the raw timestamp should not
    # be a feature.
    X = df_sorted.drop(columns=[y_col])
    y = df_sorted[y_col]
    split = int(len(df_sorted) * 0.8)
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

## 3. 回归：Ridge 基线（稳）

- 第一版直接 Ridge  
- 指标：RMSE / MAE / R2

In [None]:
from sklearn.linear_model import Ridge

# Baseline: shared preprocessing followed by Ridge with its default alpha.
pipe_ridge = Pipeline([
    ("prep", preprocess),
    ("model", Ridge(random_state=0)),
])

pipe_ridge.fit(X_train, y_train)
pred = pipe_ridge.predict(X_test)

# RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6. For the single target
# column used here both forms give the same value.
rmse = np.sqrt(mean_squared_error(y_test, pred))
mae  = mean_absolute_error(y_test, pred)
r2   = r2_score(y_test, pred)

# Last expression of the cell: displayed by the notebook.
rmse, mae, r2

## 4. 回归：Ridge 调参（alpha）

- 普通数据：`cv=5`  
- 时间序列：`TimeSeriesSplit`

In [None]:
# Candidate regularization strengths for the pipeline's "model" step.
param_grid = {"model__alpha": [1e-3, 1e-2, 1e-1, 1, 10, 100]}

# Plain 5-fold CV when there is no time column; otherwise time-ordered splits
# (X_train is already sorted by time in that case).
cv = 5 if time_col is None else TimeSeriesSplit(n_splits=5)

gs_ridge = GridSearchCV(
    pipe_ridge,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1
)

# The search only sees the training split; the test split stays held out.
gs_ridge.fit(X_train, y_train)

best_ridge = gs_ridge.best_estimator_
pred = best_ridge.predict(X_test)

# RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6. For the single target
# column used here both forms give the same value.
rmse = np.sqrt(mean_squared_error(y_test, pred))
mae  = mean_absolute_error(y_test, pred)
r2   = r2_score(y_test, pred)

# Last expression of the cell: displayed by the notebook.
rmse, mae, r2, gs_ridge.best_params_

## 5. 回归：ElasticNet（高维稀疏 + 稳健特征选择）

- onehot 后高维稀疏时经常用 ElasticNet  
- 调 `alpha` 和 `l1_ratio`

In [None]:
from sklearn.linear_model import ElasticNet

# Shared preprocessing followed by ElasticNet; max_iter is raised well above
# the default to give the solver room to converge.
pipe_en = Pipeline([
    ("prep", preprocess),
    ("model", ElasticNet(max_iter=20000, random_state=0)),
])

# Grid over overall penalty strength and the L1/L2 mixing ratio.
param_grid = {
    "model__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1],
    "model__l1_ratio": [0.1, 0.5, 0.9],
}

gs_en = GridSearchCV(
    pipe_en,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    # Plain 5-fold CV without a time column; time-ordered splits otherwise.
    cv=(5 if time_col is None else TimeSeriesSplit(n_splits=5)),
    n_jobs=-1
)

# The search only sees the training split; the test split stays held out.
gs_en.fit(X_train, y_train)

best_en = gs_en.best_estimator_
pred = best_en.predict(X_test)

# RMSE is taken as sqrt(MSE) rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6. For the single target
# column used here both forms give the same value.
rmse = np.sqrt(mean_squared_error(y_test, pred))
mae  = mean_absolute_error(y_test, pred)
r2   = r2_score(y_test, pred)

# Last expression of the cell: displayed by the notebook.
rmse, mae, r2, gs_en.best_params_

## 6. 分类：LogisticRegression（含类不平衡）

- 指标：AUC / Accuracy / F1  
- `class_weight="balanced"` 处理类不平衡  
- L1 / ElasticNet 正则：需改用 `solver="saga"`（下方示例未设置 penalty，使用 `lbfgs`）

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

# The target is expected to be binary 0/1, for example:
# y = df[y_col].astype(int)

# Logistic regression with balanced class weights; the estimator is built
# first and then plugged into the pipeline after the shared preprocessing.
clf = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=5000,
)
pipe_clf = Pipeline([("prep", preprocess), ("model", clf)])
pipe_clf.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score; it is thresholded at 0.5
# to obtain hard 0/1 predictions.
proba = pipe_clf.predict_proba(X_test)[:, 1]
above_threshold = proba >= 0.5
pred = above_threshold.astype(int)

# AUC is computed from the scores, accuracy and F1 from the hard predictions.
auc, acc, f1 = (
    roc_auc_score(y_test, proba),
    accuracy_score(y_test, pred),
    f1_score(y_test, pred),
)

# Last expression of the cell: displayed by the notebook.
auc, acc, f1

## 7. 系数与特征名（解释性输出）

- 系数：`coef_`  
- 特征名：`ColumnTransformer.get_feature_names_out()`

In [None]:
def top_linear_coefficients(fitted_pipeline, topk=20):
    """Return the largest-magnitude coefficients of a fitted linear pipeline.

    Args:
        fitted_pipeline: Fitted pipeline exposing ``named_steps`` with a
            ``"prep"`` step that provides ``get_feature_names_out()`` and a
            ``"model"`` step that exposes ``coef_``.
        topk: Maximum number of coefficients to return.

    Returns:
        A ``pandas.Series`` of signed coefficients indexed by feature name,
        ordered by decreasing absolute value and truncated to ``topk`` rows.

    Raises:
        ValueError: If the model has no ``coef_``, or if ``coef_`` does not
            hold exactly one coefficient per feature name (for example a
            multi-class or multi-target model with several coefficient rows).
    """
    steps = fitted_pipeline.named_steps
    prep = steps["prep"]
    model = steps["model"]
    if not hasattr(model, "coef_"):
        raise ValueError("model has no coef_")

    feature_names = prep.get_feature_names_out()
    # Flatten so a (1, n_features) coefficient matrix becomes a plain vector.
    coef = np.ravel(model.coef_)

    # Several coefficient rows would flatten into more values than features;
    # fail with a clear message instead of an opaque pandas length error.
    if coef.shape[0] != len(feature_names):
        raise ValueError(
            f"coef_ has shape {np.shape(model.coef_)} but there are "
            f"{len(feature_names)} feature names; expected exactly "
            "one coefficient per feature "
            "(multi-class / multi-target models are not supported)"
        )

    s = pd.Series(coef, index=feature_names)
    # Rank by magnitude, but report the signed values.
    order = s.abs().sort_values(ascending=False).index
    return s.reindex(order).head(topk)

# 示例：
# top_linear_coefficients(best_ridge, topk=30)

## 8. 现场速查：常见坑

- 预处理必须进 Pipeline（尤其时间序列）  
- L1/ElasticNet/SGD 对特征尺度敏感，训练前先做标准化  
- onehot 后是稀疏矩阵；对稀疏矩阵做均值中心化会破坏稀疏性（`StandardScaler(with_mean=True)` 遇到稀疏输入会直接报错），因此 scaler 只放在 numeric 子管道  
- 时间序列交叉验证用 `TimeSeriesSplit`，避免未来信息进入训练