# 第2课：线性回归

## 学习目标
- 理解线性回归的原理
- 掌握简单线性回归和多元线性回归
- 学会评估回归模型
- 了解正则化方法

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

## 1. 线性回归原理

线性回归假设目标值是特征的线性组合：

$$y = w_0 + w_1x_1 + w_2x_2 + \cdots + w_nx_n$$

目标是找到使预测误差（通常用预测值与真实值之差的平方和，即最小二乘准则来衡量）最小的权重 $w$。

## 2. 简单线性回归

In [None]:
# Build a reproducible toy dataset: y = 4 + 3x plus standard-normal noise
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
noise = np.random.randn(100, 1)
y = 4 + 3 * X + noise

# Scatter plot of the raw samples
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, alpha=0.6)
ax.set(xlabel='X', ylabel='y', title='Sample Data')
plt.show()

In [None]:
# Fit a linear regression model on the toy data
model = LinearRegression().fit(X, y)

# The target is a column vector, so the fitted parameters come back as
# arrays; pull out the two scalars once and reuse them in the report.
w0 = model.intercept_[0]
w1 = model.coef_[0, 0]

print(f"截距 (w0): {w0:.4f}")
print(f"系数 (w1): {w1:.4f}")
print(f"\n拟合方程: y = {w0:.2f} + {w1:.2f} * x")

In [None]:
# Predictions on the training inputs, used to draw the fitted line
y_pred = model.predict(X)

# Overlay the fitted line on the raw samples
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, alpha=0.6, label='Data')
ax.plot(X, y_pred, color='red', linewidth=2, label='Linear Regression')
ax.set(xlabel='X', ylabel='y', title='Linear Regression Fit')
ax.legend()
plt.show()

## 3. 多元线性回归

In [None]:
# Synthetic stand-in for the Boston housing dataset
from sklearn.datasets import make_regression

# Generate a regression problem: 500 samples, 10 features, noisy target
X, y = make_regression(
    n_samples=500,
    n_features=10,
    noise=10,
    random_state=42,
)

# Hold out 20% of the samples as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each split
for split_name, split_X in (("训练集", X_train), ("测试集", X_test)):
    print(f"{split_name}: {split_X.shape}")

In [None]:
# Fit on the training split only
model = LinearRegression().fit(X_train, y_train)

# Predict for both splits, train first and test second
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

# Show the learned intercept and the per-feature coefficients
print(
    f"截距: {model.intercept_:.4f}",
    f"系数: {model.coef_}",
    sep="\n",
)

## 4. 模型评估

In [None]:
# Evaluation metrics on the training and test splits.
# MSE: mean squared error (lower is better).
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

# RMSE: square root of MSE, expressed in the same units as the target.
# Derived from the MSE values above so the two always agree.
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

# R²: coefficient of determination (1.0 is a perfect fit).
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("模型评估:")
print(f"训练集 MSE: {train_mse:.4f}")
print(f"测试集 MSE: {test_mse:.4f}")
print(f"训练集 RMSE: {train_rmse:.4f}")
print(f"测试集 RMSE: {test_rmse:.4f}")
print(f"训练集 R²: {train_r2:.4f}")
print(f"测试集 R²: {test_r2:.4f}")

In [None]:
# Residuals: actual minus predicted on the test split
residuals = y_test - y_test_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_fit, ax_resid = axes

# Left panel: predicted vs. actual, with a dashed y = x reference line
# spanning the range of the actual values
diagonal = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_test_pred, alpha=0.6)
ax_fit.plot(diagonal, diagonal, 'r--')
ax_fit.set(xlabel='Actual', ylabel='Predicted', title='Predicted vs Actual')

# Right panel: residuals against predictions, with a dashed zero line
ax_resid.scatter(y_test_pred, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set(xlabel='Predicted', ylabel='Residuals', title='Residual Plot')

plt.tight_layout()
plt.show()

## 5. 正则化

In [None]:
# Ridge regression (L2 regularization)
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge_r2 = r2_score(y_test, ridge_pred)

print("Ridge 回归:")
print(f"测试集 R²: {ridge_r2:.4f}")

# Lasso regression (L1 regularization)
lasso = Lasso(alpha=1.0).fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
lasso_r2 = r2_score(y_test, lasso_pred)

# Number of coefficients that are not exactly zero after the L1 fit
n_nonzero = np.count_nonzero(lasso.coef_)

print("\nLasso 回归:")
print(f"测试集 R²: {lasso_r2:.4f}")
print(f"非零系数数量: {n_nonzero}")

In [None]:
# Compare the coefficients learned by the three models
plt.figure(figsize=(12, 5))

x = np.arange(len(model.coef_))
width = 0.25

# One bar group per feature index: Linear to the left, Ridge centered,
# Lasso to the right, each shifted by one bar width.
fitted_models = (('Linear', model), ('Ridge', ridge), ('Lasso', lasso))
for slot, (label, estimator) in enumerate(fitted_models, start=-1):
    plt.bar(x + slot * width, estimator.coef_, width, label=label)

plt.gca().set(
    xlabel='Feature Index',
    ylabel='Coefficient Value',
    title='Coefficient Comparison',
)
plt.legend()
plt.show()

## 6. 练习题

### 练习：房价预测
使用线性回归预测房价

In [None]:
# Build a reproducible synthetic housing dataset
np.random.seed(42)
n_samples = 200

# Features: floor area, number of rooms, building age
area = np.random.randint(50, 200, n_samples)
rooms = np.random.randint(1, 6, n_samples)
age = np.random.randint(0, 50, n_samples)

# price = base price + area*1000 + rooms*50000 - age*2000 + noise
# The noise is drawn after the three features so the random stream is
# consumed in a fixed order.
noise = np.random.randn(n_samples) * 10000
price = 100000 + area * 1000 + rooms * 50000 - age * 2000 + noise

# Collect the features and the target into one DataFrame
house_data = pd.DataFrame(
    dict(area=area, rooms=rooms, age=age, price=price)
)

print(house_data.head())

# 在这里编写线性回归代码


## 7. 本课小结

1. **线性回归**：假设目标是特征的线性组合
2. **评估指标**：MSE、RMSE、R²
3. **Ridge**：L2 正则化，减小系数
4. **Lasso**：L1 正则化，特征选择
5. **残差分析**：检查模型假设是否成立