In [3]:
import pandas as pd
import numpy as np
import os
import joblib # 用于将来可能保存训练好的模型/缩放器
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso # 只保留 LinearRegression 和 Lasso
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, median_absolute_error # 仍然导入其他指标以便后续评估
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# --- 配置 ---
DATA_FOLDER = 'Regression_data' # 包含最终CSV文件的文件夹
OUTPUT_FOLDER_INTERMEDIATE = 'Intermediate_Predictions' # 存放中间结果的文件夹
OUTPUT_FILE = 'submission_mae_optimized_ols_lasso.csv' # 最终输出文件名
N_SPLITS = 5 # 交叉验证的折数
RANDOM_STATE = 42 # 用于保证结果可复现

# --- 0. 创建中间结果文件夹 ---
os.makedirs(OUTPUT_FOLDER_INTERMEDIATE, exist_ok=True)
print(f"中间结果将保存在文件夹: '{OUTPUT_FOLDER_INTERMEDIATE}'")

# --- 1. 加载数据 ---
print("\n--- 加载数据 ---")
try:
    # 尝试读取之前保存的处理后的文件
    train_price_df = pd.read_csv(os.path.join(DATA_FOLDER, 'train_price_final.csv'), encoding='utf-8-sig')
    test_price_df = pd.read_csv(os.path.join(DATA_FOLDER, 'test_price_final.csv'), encoding='utf-8-sig')
    train_rent_df = pd.read_csv(os.path.join(DATA_FOLDER, 'train_rent_final.csv'), encoding='utf-8-sig')
    test_rent_df = pd.read_csv(os.path.join(DATA_FOLDER, 'test_rent_final.csv'), encoding='utf-8-sig')
    print("数据加载成功。")
    print(f"房价训练集形状: {train_price_df.shape}")
    print(f"房价测试集形状: {test_price_df.shape}")
    print(f"租金训练集形状: {train_rent_df.shape}")
    print(f"租金测试集形状: {test_rent_df.shape}")
except FileNotFoundError as e:
    print(f"加载数据时出错: {e}. 请确保CSV文件位于 '{DATA_FOLDER}' 目录下。")
    exit() # 如果数据无法加载则退出

# 存储测试集的 ID (非常重要!)
test_price_ids = test_price_df['ID'].copy()
test_rent_ids = test_rent_df['ID'].copy()

# --- 转换布尔列 (True/False) 为整数 (1/0) ---
print("\n--- 转换布尔列 (True/False) 为整数 (1/0) ---")
dataframes_to_check = {
    'train_price_df': train_price_df,
    'test_price_df': test_price_df,
    'train_rent_df': train_rent_df,
    'test_rent_df': test_rent_df
}
for df_name, df_obj in dataframes_to_check.items():
    if df_obj is None:
        print(f"跳过 {df_name}, DataFrame 未加载。")
        continue
    print(f"\n检查 {df_name}...")
    bool_columns = df_obj.select_dtypes(include='bool').columns
    if not bool_columns.empty:
        print(f"  找到 {len(bool_columns)} 个布尔列需要转换: {bool_columns.tolist()}")
        for col in bool_columns:
            try:
                globals()[df_name][col] = globals()[df_name][col].astype(int)
            except Exception as e:
                print(f"    转换 {df_name} 的列 '{col}' 时出错: {e}")
        print(f"  {df_name} 中的布尔列已转换为整数 (0/1)。")
    else:
        print("  未找到布尔列。")
print("\n--- 布尔列转换完成 ---")


# --- 2. 定义评价标准 (仅 MAE) ---
print("\n--- 定义评价标准 (MAE) ---")
# 使用 Scikit-learn 内置的 MAE
# 因为 MAE 是误差，越小越好，所以设置 greater_is_better=False
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
print("MAE 评价标准已创建 (得分越低越好，GridSearchCV 会报告负MAE)。")

# 定义一个辅助函数，用于后续单独计算所有指标
def calculate_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    return rmse, mae, medae

# --- 3. 预处理函数 (保持不变) ---
def preprocess_and_align(train_df, test_df, target_col='Price'):
    """执行预处理：分离目标变量，处理非数值列，对齐列，后续通过 Pipeline 进行填充和缩放。"""
    print(f"开始预处理数据...")
    y_train = train_df[target_col]
    X_train = train_df.drop(columns=[target_col])
    X_test = test_df.drop(columns=['ID'])
    non_numeric_train = X_train.select_dtypes(exclude=np.number).columns
    non_numeric_test = X_test.select_dtypes(exclude=np.number).columns
    if not non_numeric_train.empty:
        print(f"  从训练集中删除非数值列: {non_numeric_train.tolist()}")
        X_train = X_train.drop(columns=non_numeric_train)
    if not non_numeric_test.empty:
        print(f"  从测试集中删除非数值列: {non_numeric_test.tolist()}")
        X_test = X_test.drop(columns=non_numeric_test)
    train_cols = X_train.columns
    test_cols = X_test.columns
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test:
        X_test[c] = 0
    extra_in_test = set(test_cols) - set(train_cols)
    if extra_in_test:
        print(f"  删除仅在测试集中出现的额外列: {list(extra_in_test)}")
        X_test = X_test.drop(columns=list(extra_in_test))
    X_test = X_test[train_cols]
    print(f"  对齐后训练集特征形状: {X_train.shape}")
    print(f"  对齐后测试集特征形状: {X_test.shape}")
    return X_train, y_train, X_test

# --- 4. 定义 Pipeline ---
# 将 Pipeline 定义移到主流程之前，以便复用
pipeline_lr = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

pipeline_lasso = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', Lasso(random_state=RANDOM_STATE, max_iter=2000)) # 保持增加迭代次数
])

# 定义 Lasso 的参数网格
param_grid_lasso = {'model__alpha': np.logspace(-4, 1, 6)} # Lasso 通常需要较小的 alpha

# 交叉验证策略
cv_strategy = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# --- 5. 主要工作流程 ---

# 用于存储最终选择的预测结果
final_price_predictions = None
final_rent_predictions = None

# --- ## 房价预测 ## ---
print("\n" + "="*30)
print(" 开始房价预测工作流程 ")
print("="*30)
X_train_price, y_train_price, X_test_price = preprocess_and_align(train_price_df, test_price_df)

# --- ### 房价 OLS (LinearRegression) ### ---
print("\n--- 房价 OLS (LinearRegression) ---")
print("评估 OLS...")
ols_price_cv_scores = cross_val_score(pipeline_lr, X_train_price, y_train_price, cv=cv_strategy, scoring=mae_scorer, n_jobs=-1)
ols_price_mean_cv_score = np.mean(ols_price_cv_scores)
print(f"OLS 交叉验证平均得分 (负MAE): {ols_price_mean_cv_score:.4f}")

print("在整个训练集上训练 OLS...")
pipeline_lr.fit(X_train_price, y_train_price)
print("OLS 训练完成。")

print("在测试集上预测 OLS...")
ols_price_predictions = pipeline_lr.predict(X_test_price)
print("OLS 预测完成。")

print("保存 OLS 中间预测结果...")
ols_price_submission = pd.DataFrame({'ID': test_price_ids, 'PredictedPrice': ols_price_predictions})
ols_price_output_path = os.path.join(OUTPUT_FOLDER_INTERMEDIATE, 'intermediate_price_ols.csv')
ols_price_submission.to_csv(ols_price_output_path, index=False, encoding='utf-8-sig')
print(f"OLS 房价预测已保存至: '{ols_price_output_path}'")

# --- ### 房价 Lasso ### ---
print("\n--- 房价 Lasso ---")
print("运行 GridSearchCV (Lasso)...")
grid_lasso_price = GridSearchCV(pipeline_lasso, param_grid_lasso, cv=cv_strategy, scoring=mae_scorer, n_jobs=-1, refit=True) # refit=True 会自动用最佳参数在全数据上训练
grid_lasso_price.fit(X_train_price, y_train_price)
lasso_price_best_cv_score = grid_lasso_price.best_score_
print(f"最优 Lasso 交叉验证得分 (负MAE): {lasso_price_best_cv_score:.4f}")
print(f"最优 Lasso 参数: {grid_lasso_price.best_params_}")

# best_estimator_ 已经是训练好的最优 Lasso Pipeline
best_lasso_price_pipeline = grid_lasso_price.best_estimator_

print("在测试集上预测最优 Lasso...")
lasso_price_predictions = best_lasso_price_pipeline.predict(X_test_price)
print("Lasso 预测完成。")

print("保存 Lasso 中间预测结果...")
lasso_price_submission = pd.DataFrame({'ID': test_price_ids, 'PredictedPrice': lasso_price_predictions})
lasso_price_output_path = os.path.join(OUTPUT_FOLDER_INTERMEDIATE, 'intermediate_price_lasso.csv')
lasso_price_submission.to_csv(lasso_price_output_path, index=False, encoding='utf-8-sig')
print(f"Lasso 房价预测已保存至: '{lasso_price_output_path}'")

# --- ### 房价模型选择与评估 ### ---
print("\n--- 房价模型选择与最终评估 ---")
if ols_price_mean_cv_score >= lasso_price_best_cv_score: # 比较负MAE，越大越好
    print(f"选择 OLS 作为最优房价模型 (CV Score: {ols_price_mean_cv_score:.4f} >= {lasso_price_best_cv_score:.4f})")
    final_price_predictions = ols_price_predictions
    best_price_model_pipeline = pipeline_lr # 注意：pipeline_lr 已经是fit过的
else:
    print(f"选择 Lasso 作为最优房价模型 (CV Score: {lasso_price_best_cv_score:.4f} > {ols_price_mean_cv_score:.4f})")
    final_price_predictions = lasso_price_predictions
    best_price_model_pipeline = best_lasso_price_pipeline # 这个也是fit过的

# 使用选出的最优模型在完整训练集上评估详细指标
try:
    y_train_pred_price = best_price_model_pipeline.predict(X_train_price)
    rmse_train_price, mae_train_price, medae_train_price = calculate_metrics(y_train_price, y_train_pred_price)
    print("最优房价模型在完整训练集上的指标:")
    print(f"  训练集 RMSE:  {rmse_train_price:.4f}")
    print(f"  训练集 MAE:    {mae_train_price:.4f}")
    print(f"  训练集 MedAE: {medae_train_price:.4f}")
except Exception as e:
    print(f"  计算最优房价模型训练指标时出错: {e}")


# --- ## 租金预测 ## ---
print("\n" + "="*30)
print(" 开始租金预测工作流程 ")
print("="*30)
X_train_rent, y_train_rent, X_test_rent = preprocess_and_align(train_rent_df, test_rent_df)

# --- ### 租金 OLS (LinearRegression) ### ---
print("\n--- 租金 OLS (LinearRegression) ---")
print("评估 OLS...")
# 需要重新实例化 OLS Pipeline，以防之前的 fit 操作影响 CV
pipeline_lr_rent = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
ols_rent_cv_scores = cross_val_score(pipeline_lr_rent, X_train_rent, y_train_rent, cv=cv_strategy, scoring=mae_scorer, n_jobs=-1)
ols_rent_mean_cv_score = np.mean(ols_rent_cv_scores)
print(f"OLS 交叉验证平均得分 (负MAE): {ols_rent_mean_cv_score:.4f}")

print("在整个训练集上训练 OLS...")
pipeline_lr_rent.fit(X_train_rent, y_train_rent)
print("OLS 训练完成。")

print("在测试集上预测 OLS...")
ols_rent_predictions = pipeline_lr_rent.predict(X_test_rent)
print("OLS 预测完成。")

print("保存 OLS 中间预测结果...")
ols_rent_submission = pd.DataFrame({'ID': test_rent_ids, 'PredictedPrice': ols_rent_predictions})
ols_rent_output_path = os.path.join(OUTPUT_FOLDER_INTERMEDIATE, 'intermediate_rent_ols.csv')
ols_rent_submission.to_csv(ols_rent_output_path, index=False, encoding='utf-8-sig')
print(f"OLS 租金预测已保存至: '{ols_rent_output_path}'")

# --- ### 租金 Lasso ### ---
print("\n--- 租金 Lasso ---")
print("运行 GridSearchCV (Lasso)...")
# 需要重新实例化 Lasso Pipeline 和 GridSearchCV
pipeline_lasso_rent = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', Lasso(random_state=RANDOM_STATE, max_iter=2000))
])
grid_lasso_rent = GridSearchCV(pipeline_lasso_rent, param_grid_lasso, cv=cv_strategy, scoring=mae_scorer, n_jobs=-1, refit=True)
grid_lasso_rent.fit(X_train_rent, y_train_rent)
lasso_rent_best_cv_score = grid_lasso_rent.best_score_
print(f"最优 Lasso 交叉验证得分 (负MAE): {lasso_rent_best_cv_score:.4f}")
print(f"最优 Lasso 参数: {grid_lasso_rent.best_params_}")

best_lasso_rent_pipeline = grid_lasso_rent.best_estimator_

print("在测试集上预测最优 Lasso...")
lasso_rent_predictions = best_lasso_rent_pipeline.predict(X_test_rent)
print("Lasso 预测完成。")

print("保存 Lasso 中间预测结果...")
lasso_rent_submission = pd.DataFrame({'ID': test_rent_ids, 'PredictedPrice': lasso_rent_predictions})
lasso_rent_output_path = os.path.join(OUTPUT_FOLDER_INTERMEDIATE, 'intermediate_rent_lasso.csv')
lasso_rent_submission.to_csv(lasso_rent_output_path, index=False, encoding='utf-8-sig')
print(f"Lasso 租金预测已保存至: '{lasso_rent_output_path}'")

# --- ### 租金模型选择与评估 ### ---
print("\n--- 租金模型选择与最终评估 ---")
if ols_rent_mean_cv_score >= lasso_rent_best_cv_score: # 比较负MAE，越大越好
    print(f"选择 OLS 作为最优租金模型 (CV Score: {ols_rent_mean_cv_score:.4f} >= {lasso_rent_best_cv_score:.4f})")
    final_rent_predictions = ols_rent_predictions
    best_rent_model_pipeline = pipeline_lr_rent # 这个是fit过的
else:
    print(f"选择 Lasso 作为最优租金模型 (CV Score: {lasso_rent_best_cv_score:.4f} > {ols_rent_mean_cv_score:.4f})")
    final_rent_predictions = lasso_rent_predictions
    best_rent_model_pipeline = best_lasso_rent_pipeline # 这个也是fit过的

# 使用选出的最优模型在完整训练集上评估详细指标
try:
    y_train_pred_rent = best_rent_model_pipeline.predict(X_train_rent)
    rmse_train_rent, mae_train_rent, medae_train_rent = calculate_metrics(y_train_rent, y_train_pred_rent)
    print("最优租金模型在完整训练集上的指标:")
    print(f"  训练集 RMSE:  {rmse_train_rent:.4f}")
    print(f"  训练集 MAE:    {mae_train_rent:.4f}")
    print(f"  训练集 MedAE: {medae_train_rent:.4f}")
except Exception as e:
    print(f"  计算最优租金模型训练指标时出错: {e}")


# --- 6. 合并最终选择的预测结果与保存 ---
print("\n--- 合并最优预测结果并保存提交文件 ---")

# 检查 final_price_predictions 和 final_rent_predictions 是否已生成
if final_price_predictions is None or final_rent_predictions is None:
     print("错误：未能成功生成房价或租金的最终预测结果。无法合并。")
else:
    # 使用 final_price_predictions 和 final_rent_predictions 创建 DataFrame
    submission_price_final = pd.DataFrame({'ID': test_price_ids, 'PredictedPrice': final_price_predictions})
    submission_rent_final = pd.DataFrame({'ID': test_rent_ids, 'PredictedPrice': final_rent_predictions})

    # 可选: 如果需要，确保预测值为非负数
    # submission_price_final['PredictedPrice'] = submission_price_final['PredictedPrice'].clip(lower=0)
    # submission_rent_final['PredictedPrice'] = submission_rent_final['PredictedPrice'].clip(lower=0)

    final_submission = pd.concat([submission_price_final, submission_rent_final], ignore_index=True)

    # 确保 ID 列是整数类型
    final_submission['ID'] = final_submission['ID'].astype(int)

    # 按 ID 排序
    final_submission = final_submission.sort_values(by='ID')

    try:
        final_submission.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig')
        print(f"✓ 最终提交文件已成功保存: '{OUTPUT_FILE}'")
        print(f"  总预测条数: {len(final_submission)}")
        print(f"  列名: {final_submission.columns.tolist()}")
        print("\n最终提交文件预览 (前5行):")
        print(final_submission.head())
    except Exception as e:
        print(f"✗ 保存最终提交文件时出错: {e}")

print("\n--- 工作流程完成 ---")

中间结果将保存在文件夹: 'Intermediate_Predictions'

--- 加载数据 ---
数据加载成功。
房价训练集形状: (103871, 187)
房价测试集形状: (34017, 187)
租金训练集形状: (98899, 153)
租金测试集形状: (9773, 152)

--- 转换布尔列 (True/False) 为整数 (1/0) ---

检查 train_price_df...
  找到 45 个布尔列需要转换: ['楼层_中楼层', '楼层_低楼层', '楼层_地下室', '楼层_底层', '楼层_顶层', '楼层_高楼层', '环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_二至三环', '环线_五至六环', '环线_六环外', '环线_内环内', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '环线_无环线', '电梯_无', '电梯_有', '电梯_未知', '建筑结构_未知结构', '建筑结构_框架结构', '建筑结构_混合结构', '建筑结构_砖木结构', '建筑结构_砖混结构', '建筑结构_钢混结构', '建筑结构_钢结构', '装修_精装', '装修_简装', '装修_毛坯', '装修_其他', '装修_未知', '装修_精装.1', '装修_简装.1', '装修_毛坯.1', '装修_其他.1', '装修_未知.1', '别墅_双拼', '别墅_叠拼', '别墅_独栋', '别墅_联排', '别墅_非别墅', '产权所属_共有', '产权所属_非共有']
  train_price_df 中的布尔列已转换为整数 (0/1)。

检查 test_price_df...
  找到 45 个布尔列需要转换: ['楼层_中楼层', '楼层_低楼层', '楼层_地下室', '楼层_底层', '楼层_顶层', '楼层_高楼层', '环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_二至三环', '环线_五至六环', '环线_六环外', '环线_内环内', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '环线_无环线', '电梯_无', '电梯_有', '电梯_未知', '建筑结构_未知结构

  model = cd_fast.enet_coordinate_descent(


最优 Lasso 交叉验证得分 (负MAE): -859670.7281
最优 Lasso 参数: {'model__alpha': 10.0}
在测试集上预测最优 Lasso...
Lasso 预测完成。
保存 Lasso 中间预测结果...
Lasso 房价预测已保存至: 'Intermediate_Predictions\intermediate_price_lasso.csv'

--- 房价模型选择与最终评估 ---
选择 Lasso 作为最优房价模型 (CV Score: -859670.7281 > -860805.4599)
最优房价模型在完整训练集上的指标:
  训练集 RMSE:  1453774.7536
  训练集 MAE:    857353.8618
  训练集 MedAE: 567058.5739

 开始租金预测工作流程 
开始预处理数据...
  对齐后训练集特征形状: (98899, 152)
  对齐后测试集特征形状: (9773, 152)

--- 租金 OLS (LinearRegression) ---
评估 OLS...
OLS 交叉验证平均得分 (负MAE): -206417.0163
在整个训练集上训练 OLS...
OLS 训练完成。
在测试集上预测 OLS...
OLS 预测完成。
保存 OLS 中间预测结果...
OLS 租金预测已保存至: 'Intermediate_Predictions\intermediate_rent_ols.csv'

--- 租金 Lasso ---
运行 GridSearchCV (Lasso)...


  model = cd_fast.enet_coordinate_descent(


最优 Lasso 交叉验证得分 (负MAE): -206447.5349
最优 Lasso 参数: {'model__alpha': 10.0}
在测试集上预测最优 Lasso...
Lasso 预测完成。
保存 Lasso 中间预测结果...
Lasso 租金预测已保存至: 'Intermediate_Predictions\intermediate_rent_lasso.csv'

--- 租金模型选择与最终评估 ---
选择 OLS 作为最优租金模型 (CV Score: -206417.0163 >= -206447.5349)
最优租金模型在完整训练集上的指标:
  训练集 RMSE:  357204.0525
  训练集 MAE:    205994.8482
  训练集 MedAE: 146618.6541

--- 合并最优预测结果并保存提交文件 ---
✓ 最终提交文件已成功保存: 'submission_mae_optimized_ols_lasso.csv'
  总预测条数: 43790
  列名: ['ID', 'PredictedPrice']

最终提交文件预览 (前5行):
        ID  PredictedPrice
0  1000000    1.124996e+07
1  1000001    3.579498e+06
2  1000002    5.460950e+06
3  1000003    2.394602e+06
4  1000004    8.015916e+06

--- 工作流程完成 ---
