# 第5课：模型调参与交叉验证

## 学习目标
- 理解过拟合和欠拟合
- 掌握交叉验证方法
- 学会使用网格搜索调参
- 了解其他调参方法

## 1. 过拟合与欠拟合

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris, make_classification
from sklearn.metrics import accuracy_score, make_scorer
import warnings
# Silence every warning category for the rest of the session so the lesson
# output stays compact.
# NOTE(review): this also hides convergence and deprecation warnings, which
# may matter when this code is adapted outside the lesson.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator.
np.random.seed(42)

In [None]:
# Underfitting vs. overfitting: fit polynomials of increasing degree to
# 30 noisy samples of sin(x) and compare each fit with the true curve.
np.random.seed(42)
X = np.sort(np.random.rand(30) * 10).reshape(-1, 1)
noise = np.random.randn(30) * 0.3
y = np.sin(X).ravel() + noise

# Dense, evenly spaced grid used only for drawing the fitted curves
X_test = np.linspace(0, 10, 100).reshape(-1, 1)

# One polynomial degree and one panel title per level of model complexity
degrees = [1, 4, 15]
titles = ['欠拟合 (degree=1)', '恰当拟合 (degree=4)', '过拟合 (degree=15)']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, degree, title in zip(axes, degrees, titles):
    # Expand x into polynomial features, then fit ordinary least squares
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)
    model = LinearRegression().fit(X_poly, y)

    # Evaluate the fitted polynomial on the dense grid
    X_test_poly = poly.transform(X_test)
    y_pred = model.predict(X_test_poly)

    ax.scatter(X, y, color='blue', label='训练数据')
    ax.plot(X_test, y_pred, color='red', label='预测')
    ax.plot(X_test, np.sin(X_test), color='green', linestyle='--', label='真实函数')
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()

## 2. 交叉验证

In [None]:
# Load the iris data set
iris = load_iris()
X = iris.data
y = iris.target

# Problem with a single train/test split: the score depends on which rows
# land in the test set. Repeat the split with ten different seeds to see
# how much the accuracy moves.
results = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=i
    )
    model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
    results.append(model.score(X_test, y_test))

formatted_scores = [f"{r:.3f}" for r in results]
print("不同随机种子的测试准确率:")
print(formatted_scores)
print(f"均值: {np.mean(results):.3f}, 标准差: {np.std(results):.3f}")

In [None]:
# K-fold cross-validation
model = DecisionTreeClassifier(random_state=42)

# cross_val_score returns one score per fold
scores = cross_val_score(model, X, y, cv=5)
mean_score = scores.mean()
two_sigma = scores.std() * 2

print("5 折交叉验证:")
print(f"各折得分: {scores}")
print(f"平均得分: {mean_score:.3f} (+/- {two_sigma:.3f})")

In [None]:
# 可视化 K 折交叉验证
from sklearn.model_selection import KFold

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fig, ax = plt.subplots(figsize=(12, 6))

# One row per fold: training indices in blue, held-out indices in red
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X)):
    train_row = [fold_idx] * len(train_idx)
    test_row = [fold_idx] * len(test_idx)
    ax.scatter(train_idx, train_row, c='blue', marker='s', s=20, alpha=0.5)
    ax.scatter(test_idx, test_row, c='red', marker='s', s=20, alpha=0.5)

fold_labels = [f'Fold {k}' for k in range(1, 6)]
ax.set_yticks(range(5))
ax.set_yticklabels(fold_labels)
ax.set_xlabel('样本索引')
ax.set_title('5 折交叉验证分割示意图\n蓝色=训练集, 红色=验证集')
plt.tight_layout()
plt.show()

In [None]:
# 不同的交叉验证策略
from sklearn.model_selection import StratifiedKFold, LeaveOneOut, ShuffleSplit

# Stratified K-fold: every fold keeps the class proportions of the full data
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores_stratified = cross_val_score(model, X, y, cv=skf)
print(f"分层 5 折: {scores_stratified.mean():.3f} (+/- {scores_stratified.std() * 2:.3f})")

# Leave-one-out: one fit per sample, so it is only practical on small data.
# The whole iris set is used on purpose: its rows are ordered by class, so a
# slice such as the first 50 rows contains a single class and every
# leave-one-out prediction would be trivially correct (score 1.000).
loo = LeaveOneOut()
scores_loo = cross_val_score(model, X, y, cv=loo)
print(f"留一法 (全部{len(X)}样本): {scores_loo.mean():.3f}")

# Repeated random splits: 10 independent 80/20 partitions
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
scores_shuffle = cross_val_score(model, X, y, cv=ss)
print(f"随机分割 (10次): {scores_shuffle.mean():.3f} (+/- {scores_shuffle.std() * 2:.3f})")

## 3. 网格搜索调参

In [None]:
# Decision-tree tuning: 5 * 3 * 3 = 45 parameter combinations
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X, y)

best_grid_params = grid_search.best_params_
best_grid_score = grid_search.best_score_
print("网格搜索结果:")
print(f"最佳参数: {best_grid_params}")
print(f"最佳得分: {best_grid_score:.3f}")

In [None]:
# Tabulate every evaluated combination and show the ten best-ranked ones
results_df = pd.DataFrame(grid_search.cv_results_)
summary_columns = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
top_ten = results_df[summary_columns].sort_values('rank_test_score').head(10)
print("\n前 10 个参数组合:")
print(top_ten)

In [None]:
# Visualise how max_depth and min_samples_split interact.
import seaborn as sns

# max_depth=None (unlimited depth) is a missing value as far as pandas is
# concerned, and pivot_table drops missing index keys, so that row would
# silently vanish from the heatmap. Map every depth to a string label first.
depth_labels = results_df['param_max_depth'].map(
    lambda depth: 'None' if pd.isna(depth) else str(int(depth))
)

# Row order: numeric depths ascending, the unlimited 'None' row last
depth_order = sorted(
    depth_labels.unique(),
    key=lambda label: (label == 'None', 0 if label == 'None' else int(label)),
)

# Each cell is the mean score over every min_samples_leaf value tried for
# that (max_depth, min_samples_split) pair (pivot_table aggregates by mean).
pivot_depth = (
    results_df.assign(max_depth_label=depth_labels)
    .pivot_table(
        values='mean_test_score',
        index='max_depth_label',
        columns='param_min_samples_split',
    )
    .reindex(depth_order)
)

plt.figure(figsize=(10, 6))
sns.heatmap(pivot_depth, annot=True, fmt='.3f', cmap='YlGnBu')
plt.title('参数组合得分热力图')
plt.xlabel('min_samples_split')
plt.ylabel('max_depth')
plt.show()

## 4. 随机搜索调参

In [None]:
from scipy.stats import randint, uniform

# Sampling distributions for the random-forest hyper-parameters;
# the list under 'max_features' is sampled from as discrete choices.
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 15),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None],
}

rf = RandomForestClassifier(random_state=42)

# Randomised search: 50 sampled configurations, each scored with 5-fold CV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X, y)

best_random_params = random_search.best_params_
best_random_score = random_search.best_score_
print("随机搜索结果:")
print(f"最佳参数: {best_random_params}")
print(f"最佳得分: {best_random_score:.3f}")

In [None]:
# Compare grid search and random search under the same evaluation budget
import time

# Grid search: 3 * 3 * 3 = 27 candidate settings
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10]
}

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X, y)
grid_time = time.perf_counter() - start

# Random search with exactly as many candidates as the grid evaluated, so the
# budgets stay equal even if param_grid_rf is edited.
n_candidates = len(grid_rf.cv_results_['params'])

start = time.perf_counter()
random_rf = RandomizedSearchCV(
    rf, param_dist, n_iter=n_candidates, cv=5,
    scoring='accuracy', random_state=42, n_jobs=-1
)
random_rf.fit(X, y)
random_time = time.perf_counter() - start

print("比较结果:")
print(f"网格搜索: 得分={grid_rf.best_score_:.3f}, 时间={grid_time:.2f}s")
print(f"随机搜索: 得分={random_rf.best_score_:.3f}, 时间={random_time:.2f}s")

## 5. 学习曲线

In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, title, cv=5, random_state=42):
    """Plot training and cross-validated accuracy against training-set size.

    Args:
        estimator: scikit-learn estimator handed to ``learning_curve``.
        X: Feature matrix.
        y: Target labels.
        title: Title shown above the figure.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        random_state: Seed for shuffling the training data before the
            size-limited subsets are taken.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # learning_curve takes the first k rows of each training fold as the
    # size-k subset. Without shuffling, class-ordered data (such as iris)
    # yields small subsets containing a single class, which distorts the
    # curve. Shuffle first so every subset is a random sample.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring='accuracy', n_jobs=-1,
        shuffle=True, random_state=random_state
    )

    # Score arrays have one row per training size and one column per fold;
    # aggregate across folds.
    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    test_mean = test_scores.mean(axis=1)
    test_std = test_scores.std(axis=1)

    plt.figure(figsize=(10, 6))
    # Shaded bands show +/- one standard deviation across folds
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='orange')
    plt.plot(train_sizes, train_mean, 'o-', color='blue', label='训练得分')
    plt.plot(train_sizes, test_mean, 'o-', color='orange', label='验证得分')
    plt.xlabel('训练样本数')
    plt.ylabel('准确率')
    plt.title(title)
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()

# Compare learning curves for a shallow tree and an unrestricted tree.
# random_state is fixed so repeated runs draw identical curves.
for depth, curve_title in [(3, '简单模型 (max_depth=3)'), (None, '复杂模型 (max_depth=None)')]:
    plot_learning_curve(
        DecisionTreeClassifier(max_depth=depth, random_state=42), X, y, curve_title
    )

## 6. 验证曲线

In [None]:
from sklearn.model_selection import validation_curve

# Tree depths to evaluate: 1..10, then 15 and 20
param_range = list(range(1, 11)) + [15, 20]

# One row of scores per depth, one column per CV fold
train_scores, test_scores = validation_curve(
    DecisionTreeClassifier(random_state=42),
    X,
    y,
    param_name='max_depth',
    param_range=param_range,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Aggregate across folds
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

curves = (
    (train_mean, train_std, 'blue', '训练得分'),
    (test_mean, test_std, 'orange', '验证得分'),
)

plt.figure(figsize=(10, 6))
# Shaded +/- one standard deviation bands first, then the mean lines
for mean, std, colour, _ in curves:
    plt.fill_between(param_range, mean - std, mean + std, alpha=0.1, color=colour)
for mean, _, colour, curve_label in curves:
    plt.plot(param_range, mean, 'o-', color=colour, label=curve_label)
plt.xlabel('max_depth')
plt.ylabel('准确率')
plt.title('验证曲线 - max_depth')
plt.legend(loc='best')
plt.grid(True)
plt.show()

## 7. 实际案例：完整调参流程

In [None]:
# Synthetic 3-class problem: 1000 samples, 20 features
# (10 informative, 5 redundant)
X_large, y_large = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_classes=3,
    random_state=42,
)

# Hold out 20% of the rows as the final test set
X_train, X_test, y_train, y_test = train_test_split(
    X_large,
    y_large,
    test_size=0.2,
    random_state=42,
)

train_shape, test_shape = X_train.shape, X_test.shape
print(f"训练集: {train_shape}")
print(f"测试集: {test_shape}")

In [None]:
# Step 1: a quick randomised search to locate a promising region
param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
}

rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

coarse_best_score = random_search.best_score_
print("随机搜索最佳参数:")
print(random_search.best_params_)
print(f"最佳CV得分: {coarse_best_score:.3f}")

In [None]:
# Step 2: refine with a small grid centred on the best random-search result
best_params = random_search.best_params_


def _neighbourhood(center, step, lower):
    """Return sorted unique values of center - step, center, center + step.

    The low value is clipped at ``lower``. Clipping can make it equal to
    ``center``; building a set removes such duplicates so the grid search
    does not refit identical candidates.
    """
    return sorted({max(lower, center - step), center, center + step})


param_grid_fine = {
    'n_estimators': _neighbourhood(best_params['n_estimators'], step=50, lower=50),
    'max_depth': _neighbourhood(best_params['max_depth'], step=2, lower=3),
    'min_samples_split': _neighbourhood(best_params['min_samples_split'], step=2, lower=2),
    # Carry over the leaf size found by the random search. Leaving it out
    # would silently reset it to the estimator default and discard that
    # part of the tuning.
    'min_samples_leaf': [best_params['min_samples_leaf']],
}

grid_search_fine = GridSearchCV(
    rf, param_grid_fine, cv=5,
    scoring='accuracy', n_jobs=-1
)
grid_search_fine.fit(X_train, y_train)

print("精调后最佳参数:")
print(grid_search_fine.best_params_)
print(f"最佳CV得分: {grid_search_fine.best_score_:.3f}")

In [None]:
# Step 3: score the tuned model on the held-out test set
best_model = grid_search_fine.best_estimator_
test_score = best_model.score(X_test, y_test)

print(f"\n最终测试集准确率: {test_score:.3f}")

# Baseline for comparison: the same estimator with default hyper-parameters
default_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
default_score = default_model.score(X_test, y_test)

# Difference in accuracy, scaled by 100
improvement = (test_score - default_score) * 100

print(f"默认参数测试准确率: {default_score:.3f}")
print(f"提升: {improvement:.1f}%")

## 8. 练习题

### 练习：对 SVM 进行参数调优

In [None]:
from sklearn.svm import SVC

# Data preparation: hold out 20% of iris for testing
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Standardisation: fit the scaler on the training split only, then apply
# the same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Write your code here
# 1. Define the SVM parameter grid (C, gamma, kernel)
# 2. Use grid search to find the best parameters
# 3. Evaluate on the test set


## 9. 本课小结

1. **交叉验证**：更可靠的模型评估方法
2. **网格搜索**：穷举所有参数组合
3. **随机搜索**：在大参数空间中更高效
4. **学习曲线**：诊断过拟合/欠拟合
5. **验证曲线**：查看单个参数的影响
6. **调参流程**：随机搜索 → 网格搜索精调