# 实战项目1：房价预测完整项目

## 项目概述

这是一个端到端的机器学习项目，涵盖从数据获取、探索性分析、特征工程、模型训练与调优到模型保存的完整流程。

### 学习目标
- 掌握完整的机器学习项目流程
- 学习数据预处理和特征工程
- 实践多种模型的训练和比较
- 了解模型评估和优化方法

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
# Suppress every warning for the rest of the session (presumably to keep the
# notebook output clean); note that this also hides warnings that may matter.
warnings.filterwarnings('ignore')

# Display options
# Passing None removes the cap on how many DataFrame columns are displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): this style name presumably requires a recent Matplotlib — confirm.
plt.style.use('seaborn-v0_8-whitegrid')

## 1. 数据加载与探索

In [None]:
# Fetch the California housing dataset
housing = fetch_california_housing()

# Build one DataFrame holding the feature columns plus the target column.
# Target variable: median house value (unit: 100k USD).
df = pd.DataFrame(
    housing.data, columns=housing.feature_names
).assign(MedHouseVal=housing.target)

# Report the basic size of the dataset and the start of its description text
n_samples = len(df)
n_features = len(housing.feature_names)

print("数据集信息:")
print(f"样本数: {n_samples}")
print(f"特征数: {n_features}")
print("\n特征说明:")
print(housing.DESCR[:1500])

In [None]:
# Preview the data: print a heading, then let the notebook render the first
# rows (the trailing expression is the cell's displayed output).
print("数据预览:")
df.head()

In [None]:
# Summary statistics for every column; the trailing expression is the cell's
# displayed output.
df.describe()

In [None]:
# Count the missing values in each column and print the per-column totals
missing_per_column = df.isna().sum()
print("缺失值统计:")
print(missing_per_column)

## 2. 探索性数据分析 (EDA)

In [None]:
# Distribution of the target variable: histogram (left) and box plot (right)
target_values = df['MedHouseVal']
target_mean = target_values.mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Histogram with a dashed vertical line marking the mean
hist_ax.hist(target_values, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(target_mean, color='red', linestyle='--', label=f'Mean: {target_mean:.2f}')
hist_ax.set(
    xlabel='Median House Value (100k $)',
    ylabel='Frequency',
    title='Distribution of House Values',
)
hist_ax.legend()

# Box plot of the same values
box_ax.boxplot(target_values)
box_ax.set(
    ylabel='Median House Value (100k $)',
    title='Box Plot of House Values',
)

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap over all columns (features and target).
# The matrix is kept in `correlation_matrix` because a later cell reads it.
correlation_matrix = df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

In [None]:
# Rank the features by the absolute value of their correlation with the target
# (the target's correlation with itself is dropped first)
target_corr = (
    correlation_matrix['MedHouseVal']
    .drop('MedHouseVal')
    .sort_values(key=abs, ascending=False)
)

print("与房价相关性排序:")
print(target_corr)

In [None]:
# Histogram of every column in df (features and target), one panel per column.
# The grid is sized from the number of columns rather than fixed at 3x3, so the
# cell keeps working if columns are added or removed.
n_panel_cols = 3
# Ceiling division: enough rows to hold all columns (at least one row)
n_panel_rows = max(1, -(-len(df.columns) // n_panel_cols))

fig, axes = plt.subplots(
    n_panel_rows, n_panel_cols,
    figsize=(15, 4 * n_panel_rows),
    squeeze=False,  # always return a 2-D array so flatten() is valid
)
axes = axes.flatten()

for ax, col in zip(axes, df.columns):
    ax.hist(df[col], bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(col)
    ax.set_xlabel('')

# Hide panels left over when the column count does not fill the grid
for unused_ax in axes[len(df.columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Scatter plots: each selected feature against the house value
key_features = ['MedInc', 'AveRooms', 'Latitude', 'Longitude']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# Pair each panel with one feature
for ax, feature in zip(axes, key_features):
    ax.scatter(df[feature], df['MedHouseVal'], alpha=0.3, s=5)
    ax.set(
        xlabel=feature,
        ylabel='MedHouseVal',
        title=f'{feature} vs House Value',
    )

plt.tight_layout()
plt.show()

In [None]:
# Geographic view: one point per row, positioned by longitude/latitude,
# coloured by house value and sized by population / 100
marker_sizes = df['Population'] / 100

plt.figure(figsize=(12, 10))
scatter = plt.scatter(
    df['Longitude'],
    df['Latitude'],
    s=marker_sizes,
    c=df['MedHouseVal'],
    cmap='viridis',
    alpha=0.5,
)
plt.colorbar(scatter, label='Median House Value (100k $)')
plt.title('California House Prices by Location')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

## 3. 数据预处理

In [None]:
# Separate the target column from the feature columns
y = df['MedHouseVal']
X = df.drop(columns='MedHouseVal')

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

train_shape, test_shape = X_train.shape, X_test.shape
print(f"训练集大小: {train_shape}")
print(f"测试集大小: {test_shape}")

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Per-column mean and standard deviation of the scaled training data
scaled_means = X_train_scaled.mean(axis=0).round(2)
scaled_stds = X_train_scaled.std(axis=0).round(2)

print("标准化后的特征统计:")
print(f"均值: {scaled_means}")
print(f"标准差: {scaled_stds}")

## 4. 特征工程

In [None]:
# 创建新特征
def create_features(df):
    """Return a copy of ``df`` with four ratio features appended.

    The input frame is not modified. It must contain the columns
    ``MedInc``, ``AveRooms``, ``AveBedrms``, ``Population`` and ``AveOccup``.

    Added columns, in order:
        RoomsPerHousehold:      AveRooms / AveOccup
        BedroomRatio:           AveBedrms / AveRooms
        PopulationPerHousehold: Population / AveOccup
        IncomePerRoom:          MedInc / AveRooms

    No guard is applied against zero denominators; the division results
    are returned as computed.

    NOTE(review): if AveRooms and AveOccup are per-household averages (as the
    scikit-learn dataset description states), RoomsPerHousehold is effectively
    rooms per person and PopulationPerHousehold is effectively the household
    count. The column names are kept unchanged for compatibility — confirm
    before renaming.
    """
    rooms = df['AveRooms']
    occupancy = df['AveOccup']

    # assign() returns a new frame and appends the columns in keyword order
    return df.assign(
        RoomsPerHousehold=rooms / occupancy,
        BedroomRatio=df['AveBedrms'] / rooms,
        PopulationPerHousehold=df['Population'] / occupancy,
        IncomePerRoom=df['MedInc'] / rooms,
    )

# Apply the feature engineering to both splits
X_train_fe = create_features(X_train)
X_test_fe = create_features(X_test)

# Identify the engineered columns by comparing against the input columns,
# rather than assuming the input frame has exactly 8 features
new_feature_names = [col for col in X_train_fe.columns if col not in X_train.columns]

print(f"特征工程后的特征数: {X_train_fe.shape[1]}")
print(f"新特征: {new_feature_names}")

In [None]:
# Turn infinite values (possible results of the ratio features) into NaN
infinite_values = [np.inf, -np.inf]
X_train_fe = X_train_fe.replace(infinite_values, np.nan)
X_test_fe = X_test_fe.replace(infinite_values, np.nan)

# Fill missing values in both splits with the per-column median of the
# training split, so no statistic is taken from the test data
train_medians = X_train_fe.median()
X_train_fe = X_train_fe.fillna(train_medians)
X_test_fe = X_test_fe.fillna(train_medians)

# Standardize: fit on the training split, apply to both splits
scaler_fe = StandardScaler().fit(X_train_fe)
X_train_fe_scaled = scaler_fe.transform(X_train_fe)
X_test_fe_scaled = scaler_fe.transform(X_test_fe)

## 5. 模型训练与比较

In [None]:
# 定义评估函数
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training data and score it on both splits.

    The model is fitted in place by calling ``model.fit(X_train, y_train)``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        model_name: Label stored under the ``'Model'`` key of the result.

    Returns:
        A dict with the keys ``'Model'``, ``'Train RMSE'``, ``'Test RMSE'``,
        ``'Train R2'``, ``'Test R2'``, ``'Test MAE'`` and ``'Predictions'``
        (the predictions for ``X_test``).
    """
    model.fit(X_train, y_train)

    fitted_train = model.predict(X_train)
    fitted_test = model.predict(X_test)

    def _rmse(actual, predicted):
        # Root of the mean squared error
        return np.sqrt(mean_squared_error(actual, predicted))

    return {
        'Model': model_name,
        'Train RMSE': _rmse(y_train, fitted_train),
        'Test RMSE': _rmse(y_test, fitted_test),
        'Train R2': r2_score(y_train, fitted_train),
        'Test R2': r2_score(y_test, fitted_test),
        'Test MAE': mean_absolute_error(y_test, fitted_test),
        'Predictions': fitted_test,
    }

In [None]:
# Candidate models, keyed by the display name used in the results table
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.01),
    'ElasticNet': ElasticNet(alpha=0.01, l1_ratio=0.5),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=42),
    'Random Forest': RandomForestRegressor(
        n_estimators=100, max_depth=15, random_state=42, n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100, max_depth=5, random_state=42
    ),
}

# Train and evaluate every model on the engineered, scaled features.
# Test-set predictions are stored separately so the metrics rows stay tabular.
results = []
predictions = {}

for name, model in models.items():
    print(f"训练 {name}...")
    metrics = evaluate_model(
        model, X_train_fe_scaled, X_test_fe_scaled, y_train, y_test, name
    )
    predictions[name] = metrics.pop('Predictions')
    results.append(metrics)

# Results table, best test R2 first; the trailing expression is displayed
results_df = pd.DataFrame(results).sort_values(by='Test R2', ascending=False)
print("\n模型比较结果:")
results_df

In [None]:
# Grouped bar charts comparing train and test scores per model:
# RMSE on the left panel, R2 on the right panel
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

x_pos = range(len(results_df))
bar_width = 0.4
train_positions = [i - 0.2 for i in x_pos]
test_positions = [i + 0.2 for i in x_pos]

# (panel, metric suffix of the results_df column, y-axis label, panel title)
panels = [
    (axes[0], 'RMSE', 'RMSE', 'Model RMSE Comparison'),
    (axes[1], 'R2', 'R² Score', 'Model R² Comparison'),
]

for ax, metric, y_label, panel_title in panels:
    ax.bar(train_positions, results_df[f'Train {metric}'], bar_width, label='Train', color='steelblue')
    ax.bar(test_positions, results_df[f'Test {metric}'], bar_width, label='Test', color='darkorange')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()

plt.tight_layout()
plt.show()

## 6. 模型优化

In [None]:
# Tune the Random Forest with an exhaustive grid search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# NOTE(review): both the forest and the search request n_jobs=-1; nested
# parallelism like this may oversubscribe CPU cores — confirm on the target machine.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train_fe_scaled, y_train)

# The scorer is negated MSE, so flip the sign before taking the square root
best_cv_rmse = np.sqrt(-grid_search.best_score_)

print(f"\n最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证 RMSE: {best_cv_rmse:.4f}")

In [None]:
# Take the best estimator found by the grid search and score it on the test set
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test_fe_scaled)

best_rmse = np.sqrt(mean_squared_error(y_test, y_pred_best))
best_r2 = r2_score(y_test, y_pred_best)
best_mae = mean_absolute_error(y_test, y_pred_best)

print("优化后的 Random Forest 性能:")
print(f"Test RMSE: {best_rmse:.4f}")
print(f"Test R²: {best_r2:.4f}")
print(f"Test MAE: {best_mae:.4f}")

## 7. 特征重要性分析

In [None]:
# Feature importances of the tuned Random Forest, largest first
importance_table = pd.DataFrame({
    'Feature': X_train_fe.columns,
    'Importance': best_rf.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 8))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
# Invert the y-axis so the first (most important) row is drawn at the top
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.title('Feature Importance (Random Forest)')
plt.tight_layout()
plt.show()

## 8. 预测分析

In [None]:
# Predicted vs actual values; the dashed red diagonal marks perfect predictions
actual_range = [y_test.min(), y_test.max()]

plt.figure(figsize=(10, 8))
plt.scatter(y_test, y_pred_best, alpha=0.3, s=10)
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.title('Actual vs Predicted House Values')
plt.xlabel('Actual House Value')
plt.ylabel('Predicted House Value')
plt.tight_layout()
plt.show()

In [None]:
# Residual analysis: residual = actual - predicted
residuals = y_test - y_pred_best

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
dist_ax, spread_ax = axes

# Left panel: distribution of the residuals, with a reference line at zero
dist_ax.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
dist_ax.axvline(0, color='red', linestyle='--')
dist_ax.set(
    xlabel='Residual',
    ylabel='Frequency',
    title='Residual Distribution',
)

# Right panel: residuals against predicted values, with a reference line at zero
spread_ax.scatter(y_pred_best, residuals, alpha=0.3, s=10)
spread_ax.axhline(0, color='red', linestyle='--')
spread_ax.set(
    xlabel='Predicted Value',
    ylabel='Residual',
    title='Residuals vs Predicted Values',
)

plt.tight_layout()
plt.show()

## 9. 模型保存

In [None]:
import joblib

# Save the model and the preprocessor (left commented out; uncomment to run).
# NOTE: best_rf was trained on features produced by create_features, filled
# with the training medians and scaled by scaler_fe, so new data must go
# through the same steps before prediction.
# joblib.dump(best_rf, 'house_price_model.pkl')
# joblib.dump(scaler_fe, 'house_price_scaler.pkl')

# Load the model.
# NOTE(review): joblib files are pickle-based as far as I know — only load
# files from a trusted source; confirm against the joblib documentation.
# loaded_model = joblib.load('house_price_model.pkl')

print("模型保存代码已准备好")
print("取消注释即可保存/加载模型")

## 10. 项目总结

### 项目流程回顾

1. **数据加载**：使用加州房价数据集
2. **数据探索**：统计分析、相关性分析、可视化
3. **数据预处理**：缺失值处理、标准化
4. **特征工程**：创建新特征
5. **模型训练**：多种模型比较
6. **模型优化**：网格搜索调参
7. **结果分析**：特征重要性、预测值与实际值对比、残差分析
8. **模型保存**：提供使用 joblib 保存/加载模型与预处理器的示例代码

### 关键发现

1. 收入中位数 (MedInc) 是最重要的预测特征
2. 地理位置对房价有显著影响
3. 集成模型 (Random Forest, Gradient Boosting) 表现最好

### 改进方向

1. 使用更复杂的特征工程
2. 尝试 XGBoost、LightGBM
3. 进行更细致的超参数调优
4. 考虑地理特征的更多利用