In [2]:
# ==============================================================================
# 完整脚本：叶节点编码 + Top Features + RidgeCV **租金预测** (Rent Prediction) - V3 (Log Target, Combined Features, External CV)
# ==============================================================================

import pandas as pd
import numpy as np
import os
import joblib # 用于将来可能保存对象
from sklearn.model_selection import KFold, cross_val_score # 导入 cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, median_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack, csr_matrix # 用于组合稀疏和稠密矩阵

print("开始执行 Rent 预测流程...")
print("==============================")

# --- 1. Configuration ---
# Folder holding the preprocessed Rent CSV files (adjust if they live elsewhere).
INPUT_FOLDER = 'Rent_Capped_Aggregated_Data'
# Folder that receives the predictions produced by this run.
OUTPUT_FOLDER = 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log'
# Input file names, resolved relative to INPUT_FOLDER.
TRAIN_RENT_FILE = 'train_rent_capped_agg.csv'
TEST_RENT_FILE = 'test_rent_capped_agg.csv'
# Submission file name, written inside OUTPUT_FOLDER.
OUTPUT_FILE = 'submission_leaf_combined_ridge_rent_log_mae_3.csv'

# The rent target is stored in the 'Price' column; 'ID' identifies test rows.
TARGET_COLUMN, ID_COLUMN = 'Price', 'ID'
RANDOM_STATE = 42

# Hyper-parameters of the gradient-boosting model whose leaves become features.
GBRT_PARAMS = dict(
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=30,
    learning_rate=0.05,
    subsample=0.7,
    random_state=RANDOM_STATE,
)

# Number of most important original features kept alongside the leaf features.
N_TOP_ORIGINAL_FEATURES = 20

# RidgeCV settings: 15 log-spaced candidate alphas from 1e-4 to 1e6, and the
# fold count of its internal cross-validation.
RIDGE_ALPHAS = np.logspace(start=-4, stop=6, num=15)
RIDGE_CV_FOLDS = 5

# Fold count of the outer cross-validation used to assess the whole pipeline.
EXTERNAL_CV_FOLDS = 6

# --- 2. Create the output folder ---
try:
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"输出文件夹 '{OUTPUT_FOLDER}' 已创建或已存在。")
except OSError as e:
    print(f"创建文件夹 '{OUTPUT_FOLDER}' 时出错: {e}")
    # Raise SystemExit with a non-zero status: the site-provided exit() helper
    # is meant for interactive shells and would report success (status 0).
    raise SystemExit(1)

# --- 3. Load the Rent data ---
print(f"\n--- 从 '{INPUT_FOLDER}' 加载 Rent 数据 ---")
try:
    # Only the two reads can fail here, so they are the whole try body.
    train_rent_df = pd.read_csv(os.path.join(INPUT_FOLDER, TRAIN_RENT_FILE), encoding='utf-8-sig')
    test_rent_df = pd.read_csv(os.path.join(INPUT_FOLDER, TEST_RENT_FILE), encoding='utf-8-sig')
except FileNotFoundError as e:
    print(f"加载 Rent 数据时出错: {e}. 请确保文件路径正确 ({INPUT_FOLDER} 下应有 {TRAIN_RENT_FILE} 和 {TEST_RENT_FILE})。")
    raise SystemExit(1)
except Exception as e:
    print(f"加载 Rent 数据时发生其他错误: {e}")
    raise SystemExit(1)
else:
    print("Rent 数据加载成功。")
    print(f"  训练集形状: {train_rent_df.shape}")
    print(f"  测试集形状: {test_rent_df.shape}")

# --- Convert boolean columns to integers ---
print("\n--- 转换布尔列 (Rent) 为整数 (1/0) ---")
# Iterate over the frames themselves rather than looking them up by name in
# globals(); the label is only needed for the log messages.
for df_name, df_obj in (('train_rent_df', train_rent_df), ('test_rent_df', test_rent_df)):
    print(f"\n检查 {df_name}...")
    bool_columns = df_obj.select_dtypes(include='bool').columns
    if bool_columns.empty:
        print("  未找到布尔列。")
        continue
    print(f"  找到 {len(bool_columns)} 个布尔列需要转换: {bool_columns.tolist()}")
    # Replace the columns outright so their dtype really becomes integer.
    # Assigning through .loc[:, col] goes through pandas' in-place setitem
    # rules, which makes the resulting dtype depend on the pandas version.
    # The later numeric-only filter would drop any column that stayed bool.
    df_obj[bool_columns] = df_obj[bool_columns].astype(int)
    print(f"  {df_name} 中的布尔列已转换为整数 (0/1)。")
print("\n--- 布尔列转换完成 ---")

# Keep the test IDs; the ID column is dropped from the feature matrix later.
test_rent_ids = test_rent_df[ID_COLUMN].copy()

# --- 4. Prepare the data (split X/y, log-transform y, align columns, impute NaN) ---
print("\n--- 准备 Rent 训练和测试数据 ---")
try:
    # Model the target in log space; predictions are mapped back with expm1.
    y_train_rent = np.log1p(train_rent_df[TARGET_COLUMN])
    print(f"  已对 Rent 目标变量 '{TARGET_COLUMN}' 应用 log1p 转换。")

    X_train_rent = train_rent_df.drop(columns=[TARGET_COLUMN])
    X_test_rent = test_rent_df.drop(columns=[ID_COLUMN])

    # Drop any leftover non-numeric columns from both feature frames.
    non_numeric_train_rent = X_train_rent.select_dtypes(exclude=np.number).columns
    if not non_numeric_train_rent.empty:
        print(f"  警告: Rent 训练集发现非数值列，将移除: {non_numeric_train_rent.tolist()}")
        X_train_rent = X_train_rent.drop(columns=non_numeric_train_rent)
    non_numeric_test_rent = X_test_rent.select_dtypes(exclude=np.number).columns
    if not non_numeric_test_rent.empty:
        print(f"  警告: Rent 测试集发现非数值列，将移除: {non_numeric_test_rent.tolist()}")
        X_test_rent = X_test_rent.drop(columns=non_numeric_test_rent)

    # Align the test columns with the training columns in a single reindex:
    # columns missing from the test set are created and filled with 0, columns
    # unknown to the training set are dropped, and the training order is kept.
    train_rent_cols = X_train_rent.columns
    X_test_rent = X_test_rent.reindex(columns=train_rent_cols, fill_value=0)

    print(f"  对齐后 Rent 特征形状: Train={X_train_rent.shape}, Test={X_test_rent.shape}")

    # Impute missing values with the per-column medians of the training set.
    # NOTE(review): later steps index the imputed arrays by positions taken
    # from train_rent_cols, which assumes the imputer keeps every column.
    # Confirm no training column is entirely NaN.
    print("  使用中位数填充缺失值...")
    imputer_rent = SimpleImputer(strategy='median')
    X_train_rent_imputed = imputer_rent.fit_transform(X_train_rent)
    X_test_rent_imputed = imputer_rent.transform(X_test_rent)
    print("  Rent 数据缺失值填充完成。")

except Exception as e:
    print(f"Rent 数据准备时发生错误: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 5. Train the GBRT model (Rent) to produce leaves and feature importances ---
print("\n--- 训练 Gradient Boosting 模型 (Rent) 以生成叶节点特征 ---")
gbrt_rent = GradientBoostingRegressor(**GBRT_PARAMS)
try:
    print(f"  使用以下参数训练 GBRT: {GBRT_PARAMS}")
    gbrt_rent.fit(X_train_rent_imputed, y_train_rent)
    print("  GBRT 模型 (Rent) 训练完成。")
except Exception as e:
    # Without a fitted model there are no leaves to encode, so a fit failure
    # cannot be recovered from; stop here instead of failing one step later.
    print(f"  训练 GBRT (Rent) 或获取重要性时出错: {e}")
    raise SystemExit(1)

try:
    # Rank the original features by importance, highest first, and keep the top N.
    importances_rent = gbrt_rent.feature_importances_
    top_n_indices_rent = np.argsort(importances_rent)[::-1][:N_TOP_ORIGINAL_FEATURES]
    top_n_features_rent = train_rent_cols[top_n_indices_rent].tolist()
    print(f"  识别出 Rent Top {N_TOP_ORIGINAL_FEATURES} 原始特征: {top_n_features_rent}")
except Exception as e:
    # Best-effort fallback: continue with leaf features only. Both names are
    # defined so later steps never meet an undefined variable.
    print(f"  训练 GBRT (Rent) 或获取重要性时出错: {e}")
    top_n_indices_rent = np.array([], dtype=int)
    top_n_features_rent = []

# --- 6. Get the leaf indices (Rent) ---
print("\n--- 获取叶节点索引 (Rent) ---")
try:
    # apply() reports, for every sample, the leaf it falls into in each tree.
    train_rent_leaf_indices = gbrt_rent.apply(X_train_rent_imputed)
    test_rent_leaf_indices = gbrt_rent.apply(X_test_rent_imputed)
except Exception as e:
    print(f"  获取叶节点索引 (Rent) 时出错: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 7. One-hot encode the leaf indices (Rent) ---
print("\n--- 对叶节点索引进行 One-Hot 编码 (Rent) ---")
# handle_unknown='ignore' is set so leaf ids not seen during fit do not raise.
leaf_encoder_rent = OneHotEncoder(handle_unknown='ignore')
try:
    X_train_rent_leaves_encoded = leaf_encoder_rent.fit_transform(train_rent_leaf_indices)
    X_test_rent_leaves_encoded = leaf_encoder_rent.transform(test_rent_leaf_indices)
    n_leaf_features_rent = X_train_rent_leaves_encoded.shape[1]
    print(f"  生成了 {n_leaf_features_rent} 个 Rent 叶节点二元特征。")
except Exception as e:
    print(f"  One-Hot 编码 (Rent) 时出错: {e}")
    raise SystemExit(1)

# --- 8. Combine the leaf features with the top-N original features (Rent) ---
print(f"\n--- 组合 Rent 叶节点特征与 Top {len(top_n_features_rent)} 原始特征 ---")
try:
    if top_n_features_rent:
        # Select the top features by position from the imputed NumPy arrays.
        X_train_rent_top_features = X_train_rent_imputed[:, top_n_indices_rent]
        X_test_rent_top_features = X_test_rent_imputed[:, top_n_indices_rent]

        # Ask for CSR explicitly instead of relying on hstack's default output
        # format; CSR supports the row slicing that cross-validation performs.
        X_train_rent_final = hstack(
            [X_train_rent_leaves_encoded, csr_matrix(X_train_rent_top_features)],
            format='csr',
        )
        X_test_rent_final = hstack(
            [X_test_rent_leaves_encoded, csr_matrix(X_test_rent_top_features)],
            format='csr',
        )
        print(f"  组合后 Rent 最终特征形状: Train={X_train_rent_final.shape}, Test={X_test_rent_final.shape}")
    else:
        # No top features available: fall back to the leaf encoding alone.
        X_train_rent_final = X_train_rent_leaves_encoded
        X_test_rent_final = X_test_rent_leaves_encoded
        print("  只使用编码后的 Rent 叶节点特征。")
except Exception as e:
    print(f"  组合 Rent 特征时出错: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 9. Define the evaluation metric (MAE scorer) ---
print("\n--- 定义评价标准 (MAE) ---")
# greater_is_better=False makes the scorer return the negated MAE, so that
# model selection can maximise it.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
print("MAE 评价标准已创建 (得分越低越好)。")


def calculate_metrics(y_true_log, y_pred_log):
    """Return (RMSE, MAE, MedAE) measured on the original rent scale.

    Both arguments hold log1p-transformed values. They are mapped back with
    expm1, negative back-transformed predictions are clipped to 0, and the
    three error metrics are computed on the result.

    Args:
        y_true_log: True targets in log1p space.
        y_pred_log: Predicted targets in log1p space.

    Returns:
        Tuple ``(rmse, mae, medae)`` on the original scale.
    """
    y_true_orig = np.expm1(y_true_log)
    # expm1 can yield values in (-1, 0) for negative log-space predictions.
    y_pred_orig = np.clip(np.expm1(y_pred_log), a_min=0, a_max=None)
    rmse_orig = np.sqrt(mean_squared_error(y_true_orig, y_pred_orig))
    mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
    medae_orig = median_absolute_error(y_true_orig, y_pred_orig)
    return rmse_orig, mae_orig, medae_orig

# --- 10. Define the final RidgeCV pipeline (Rent) ---
print("\n--- 定义最终的 RidgeCV Pipeline (Rent) ---")
# store_cv_values is no longer passed: False is its default, and the parameter
# has been renamed in newer scikit-learn releases, so omitting it keeps the
# script working across versions.
ridge_cv_model_rent = RidgeCV(
    alphas=RIDGE_ALPHAS,
    cv=RIDGE_CV_FOLDS,
    scoring=mae_scorer,
)
final_pipeline_rent = Pipeline([
    # with_mean=False: the combined feature matrix is sparse and centering it
    # is not supported; only the variance is scaled.
    ('scaler', StandardScaler(with_mean=False)),
    ('model', ridge_cv_model_rent)
])

# --- 11. Outer cross-validation of the pipeline (Rent) ---
# NOTE(review): X_train_rent_final was derived from a GBRT fitted on the full
# training targets, so these fold scores are likely optimistic. Confirm
# whether the leaf encoding should be refitted inside each fold.
print(f"\n--- 使用 {EXTERNAL_CV_FOLDS}-折交叉验证评估最终 Rent Pipeline (MAE in log space) ---")
try:
    external_cv_rent = KFold(
        n_splits=EXTERNAL_CV_FOLDS,
        shuffle=True,
        random_state=RANDOM_STATE,
    )
    cv_scores_log_mae_rent = cross_val_score(
        final_pipeline_rent,
        X_train_rent_final,
        y_train_rent,
        scoring=mae_scorer,
        cv=external_cv_rent,
        n_jobs=-1,
    )
    # The scorer returns negated MAE, so the mean is negative by construction.
    mean_cv_log_mae_rent = cv_scores_log_mae_rent.mean()
    std_cv_log_mae_rent = cv_scores_log_mae_rent.std()
    print(f"  Rent 交叉验证得分 (负 Log MAE): {mean_cv_log_mae_rent:.4f} +/- {std_cv_log_mae_rent:.4f}")
    print(f"  Rent 交叉验证得分 (Log MAE): {-mean_cv_log_mae_rent:.4f}")
except Exception as e:
    # Evaluation only: report the problem and carry on with the final fit.
    print(f"  Rent 外部交叉验证时出错: {e}")

# --- 12. Fit the final pipeline on the full Rent training data ---
print("\n--- 在全部 Rent 训练数据上训练最终的 RidgeCV Pipeline ---")
try:
    final_pipeline_rent.fit(X_train_rent_final, y_train_rent)
    print("  最终 Rent Pipeline 训练完成。")
    # alpha_ is the regularisation strength RidgeCV selected during the fit.
    best_alpha_rent = final_pipeline_rent.named_steps['model'].alpha_
    print(f"  RidgeCV (Rent) 找到的最佳 alpha: {best_alpha_rent:.6f}")
except Exception as e:
    print(f"  训练最终 Rent Pipeline 时出错: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 13. Evaluate the final model on the Rent training set (original scale) ---
print("\n--- 在完整 Rent 训练集上评估最终 Ridge 模型 (原始租金空间) ---")
try:
    y_train_pred_rent_log = final_pipeline_rent.predict(X_train_rent_final)
    # calculate_metrics maps both arguments back from log space itself.
    rmse_train_rent, mae_train_rent, medae_train_rent = calculate_metrics(y_train_rent, y_train_pred_rent_log)
    print(f"  Rent 训练集 RMSE (原): {rmse_train_rent:.2f}")
    print(f"  Rent 训练集 MAE  (原): {mae_train_rent:.2f}")
    print(f"  Rent 训练集 MedAE(原): {medae_train_rent:.2f}")
except Exception as e:
    # Reporting only: a failure here does not block the test prediction.
    print(f"  评估 Rent 训练集时出错: {e}")

# --- 14. Predict on the Rent test set ---
print("\n--- 在 Rent 测试集上进行预测 ---")
try:
    predictions_rent_log = final_pipeline_rent.predict(X_test_rent_final)
    # Undo the log1p applied to the training target.
    predictions_rent = np.expm1(predictions_rent_log)
    print("  Rent 预测完成并已转换回原始租金空间。")

    # Negative log-space predictions come back as values in (-1, 0); floor at 0.
    if np.any(predictions_rent < 0):
        print(f"  警告: 发现 {np.sum(predictions_rent < 0)} 个负数预测值，将修正为 0。")
        predictions_rent = np.clip(predictions_rent, a_min=0, a_max=None)

except Exception as e:
    print(f"  在 Rent 测试集上预测时出错: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 15. Build and save the Rent submission file ---
print(f"\n--- 创建并保存 Rent 提交文件到 '{OUTPUT_FOLDER}' ---")
submission_df_rent = pd.DataFrame({
    ID_COLUMN: test_rent_ids,
    'PredictedPrice': predictions_rent
})
# Store IDs as integers and write the rows in ascending ID order.
submission_df_rent[ID_COLUMN] = submission_df_rent[ID_COLUMN].astype(int)
submission_df_rent = submission_df_rent.sort_values(by=ID_COLUMN)
try:
    output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
    submission_df_rent.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"✓ Rent 提交文件已成功保存: {output_path}")
    print(f"  总预测条数: {len(submission_df_rent)}")
    print("\nRent 提交文件预览 (前5行):")
    print(submission_df_rent.head())
except Exception as e:
    print(f"✗ 保存 Rent 提交文件时出错: {e}")

print("\n--- Rent 叶节点编码 + Top Features + RidgeCV (Log Target) 预测流程完成 ---")

开始执行 Rent 预测流程...
输出文件夹 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log' 已创建或已存在。

--- 从 'Rent_Capped_Aggregated_Data' 加载 Rent 数据 ---
Rent 数据加载成功。
  训练集形状: (98899, 159)
  测试集形状: (9773, 158)

--- 转换布尔列 (Rent) 为整数 (1/0) ---

检查 train_rent_df...
  找到 39 个布尔列需要转换: ['环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_二至三环', '环线_五至六环', '环线_六环外', '环线_内环内', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '环线_无环线', '电梯_无', '电梯_有', '电梯_未知', '装修_精装', '装修_简装', '装修_毛坯', '装修_其他', '装修_未知', '装修_精装.1', '装修_简装.1', '装修_毛坯.1', '装修_其他.1', '装修_未知.1', '租赁方式_合租', '租赁方式_整租', '付款方式_半年付价', '付款方式_双月付价', '付款方式_季付价', '付款方式_年付价', '付款方式_月付价', '付款方式_未知', '车位_免费使用', '车位_无车位', '车位_租用车位', '燃气_无', '燃气_有', '燃气_未知']
  train_rent_df 中的布尔列已转换为整数 (0/1)。

检查 test_rent_df...
  找到 38 个布尔列需要转换: ['环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_二至三环', '环线_五至六环', '环线_六环外', '环线_内环内', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '环线_无环线', '电梯_无', '电梯_有', '装修_精装', '装修_简装', '装修_毛坯', '装修_其他', '装修_未知', '装修_精装.1', '装修_简装.1', '装修_毛坯.1', '装修_其他.1', '装修_未知.1', '租赁方式_合租',

  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse


  Rent 交叉验证得分 (负 Log MAE): -0.1189 +/- 0.0007
  Rent 交叉验证得分 (Log MAE): 0.1189

--- 在全部 Rent 训练数据上训练最终的 RidgeCV Pipeline ---
  最终 Rent Pipeline 训练完成。
  RidgeCV (Rent) 找到的最佳 alpha: 7196.856730

--- 在完整 Rent 训练集上评估最终 Ridge 模型 (原始租金空间) ---
  Rent 训练集 RMSE (原): 109699.54
  Rent 训练集 MAE  (原): 60590.60
  Rent 训练集 MedAE(原): 33654.60

--- 在 Rent 测试集上进行预测 ---
  Rent 预测完成并已转换回原始租金空间。

--- 创建并保存 Rent 提交文件到 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log' ---
✓ Rent 提交文件已成功保存: LeafEncoding_Combined_Ridge_Prediction_Rent_Log/submission_leaf_combined_ridge_rent_log_mae_3.csv
  总预测条数: 9773

Rent 提交文件预览 (前5行):
        ID  PredictedPrice
0  2000000    1.643712e+05
1  2000001    4.408012e+05
2  2000002    3.984165e+05
3  2000003    1.757087e+06
4  2000004    1.429464e+06

--- Rent 叶节点编码 + Top Features + RidgeCV (Log Target) 预测流程完成 ---


In [1]:
# ==============================================================================
# 完整脚本：叶节点编码 + Top Features + RidgeCV **租金预测** (Rent Prediction) - V3 (Log Target, Combined Features, External CV)
# ==============================================================================

import pandas as pd
import numpy as np
import os
import joblib # 用于将来可能保存对象
from sklearn.model_selection import KFold, cross_val_score # 导入 cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, median_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack, csr_matrix # 用于组合稀疏和稠密矩阵

print("开始执行 Rent 预测流程...")
print("==============================")

# --- 1. Configuration ---
# Folder holding the preprocessed Rent CSV files (adjust if they live elsewhere).
INPUT_FOLDER = 'Rent_Capped_Aggregated_Data'
# Folder that receives the predictions produced by this run.
OUTPUT_FOLDER = 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log'
# Input file names, resolved relative to INPUT_FOLDER.
TRAIN_RENT_FILE = 'train_rent_capped_agg.csv'
TEST_RENT_FILE = 'test_rent_capped_agg.csv'
# Submission file name, written inside OUTPUT_FOLDER.
OUTPUT_FILE = 'submission_leaf_combined_ridge_rent_log_mae_4.csv'

# The rent target is stored in the 'Price' column; 'ID' identifies test rows.
TARGET_COLUMN, ID_COLUMN = 'Price', 'ID'
RANDOM_STATE = 42

# Hyper-parameters of the gradient-boosting model whose leaves become features.
GBRT_PARAMS = dict(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=30,
    learning_rate=0.01,
    subsample=0.7,
    random_state=RANDOM_STATE,
)

# Number of most important original features kept alongside the leaf features.
N_TOP_ORIGINAL_FEATURES = 40

# RidgeCV settings: 15 log-spaced candidate alphas from 1e3 to 1e5, and the
# fold count of its internal cross-validation.
RIDGE_ALPHAS = np.logspace(start=3, stop=5, num=15)
RIDGE_CV_FOLDS = 5

# Fold count of the outer cross-validation used to assess the whole pipeline.
EXTERNAL_CV_FOLDS = 10

# --- 2. Create the output folder ---
try:
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"输出文件夹 '{OUTPUT_FOLDER}' 已创建或已存在。")
except OSError as e:
    print(f"创建文件夹 '{OUTPUT_FOLDER}' 时出错: {e}")
    # Raise SystemExit with a non-zero status: the site-provided exit() helper
    # is meant for interactive shells and would report success (status 0).
    raise SystemExit(1)

# --- 3. Load the Rent data ---
print(f"\n--- 从 '{INPUT_FOLDER}' 加载 Rent 数据 ---")
try:
    # Only the two reads can fail here, so they are the whole try body.
    train_rent_df = pd.read_csv(os.path.join(INPUT_FOLDER, TRAIN_RENT_FILE), encoding='utf-8-sig')
    test_rent_df = pd.read_csv(os.path.join(INPUT_FOLDER, TEST_RENT_FILE), encoding='utf-8-sig')
except FileNotFoundError as e:
    print(f"加载 Rent 数据时出错: {e}. 请确保文件路径正确 ({INPUT_FOLDER} 下应有 {TRAIN_RENT_FILE} 和 {TEST_RENT_FILE})。")
    raise SystemExit(1)
except Exception as e:
    print(f"加载 Rent 数据时发生其他错误: {e}")
    raise SystemExit(1)
else:
    print("Rent 数据加载成功。")
    print(f"  训练集形状: {train_rent_df.shape}")
    print(f"  测试集形状: {test_rent_df.shape}")

# --- Convert boolean columns to integers ---
print("\n--- 转换布尔列 (Rent) 为整数 (1/0) ---")
# Iterate over the frames themselves rather than looking them up by name in
# globals(); the label is only needed for the log messages.
for df_name, df_obj in (('train_rent_df', train_rent_df), ('test_rent_df', test_rent_df)):
    print(f"\n检查 {df_name}...")
    bool_columns = df_obj.select_dtypes(include='bool').columns
    if bool_columns.empty:
        print("  未找到布尔列。")
        continue
    print(f"  找到 {len(bool_columns)} 个布尔列需要转换: {bool_columns.tolist()}")
    # Replace the columns outright so their dtype really becomes integer.
    # Assigning through .loc[:, col] goes through pandas' in-place setitem
    # rules, which makes the resulting dtype depend on the pandas version.
    # The later numeric-only filter would drop any column that stayed bool.
    df_obj[bool_columns] = df_obj[bool_columns].astype(int)
    print(f"  {df_name} 中的布尔列已转换为整数 (0/1)。")
print("\n--- 布尔列转换完成 ---")

# Keep the test IDs; the ID column is dropped from the feature matrix later.
test_rent_ids = test_rent_df[ID_COLUMN].copy()

# --- 4. Prepare the data (split X/y, log-transform y, align columns, impute NaN) ---
print("\n--- 准备 Rent 训练和测试数据 ---")
try:
    # Model the target in log space; predictions are mapped back with expm1.
    y_train_rent = np.log1p(train_rent_df[TARGET_COLUMN])
    print(f"  已对 Rent 目标变量 '{TARGET_COLUMN}' 应用 log1p 转换。")

    X_train_rent = train_rent_df.drop(columns=[TARGET_COLUMN])
    X_test_rent = test_rent_df.drop(columns=[ID_COLUMN])

    # Drop any leftover non-numeric columns from both feature frames.
    non_numeric_train_rent = X_train_rent.select_dtypes(exclude=np.number).columns
    if not non_numeric_train_rent.empty:
        print(f"  警告: Rent 训练集发现非数值列，将移除: {non_numeric_train_rent.tolist()}")
        X_train_rent = X_train_rent.drop(columns=non_numeric_train_rent)
    non_numeric_test_rent = X_test_rent.select_dtypes(exclude=np.number).columns
    if not non_numeric_test_rent.empty:
        print(f"  警告: Rent 测试集发现非数值列，将移除: {non_numeric_test_rent.tolist()}")
        X_test_rent = X_test_rent.drop(columns=non_numeric_test_rent)

    # Align the test columns with the training columns in a single reindex:
    # columns missing from the test set are created and filled with 0, columns
    # unknown to the training set are dropped, and the training order is kept.
    train_rent_cols = X_train_rent.columns
    X_test_rent = X_test_rent.reindex(columns=train_rent_cols, fill_value=0)

    print(f"  对齐后 Rent 特征形状: Train={X_train_rent.shape}, Test={X_test_rent.shape}")

    # Impute missing values with the per-column medians of the training set.
    # NOTE(review): later steps index the imputed arrays by positions taken
    # from train_rent_cols, which assumes the imputer keeps every column.
    # Confirm no training column is entirely NaN.
    print("  使用中位数填充缺失值...")
    imputer_rent = SimpleImputer(strategy='median')
    X_train_rent_imputed = imputer_rent.fit_transform(X_train_rent)
    X_test_rent_imputed = imputer_rent.transform(X_test_rent)
    print("  Rent 数据缺失值填充完成。")

except Exception as e:
    print(f"Rent 数据准备时发生错误: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 5. Train the GBRT model (Rent) to produce leaves and feature importances ---
print("\n--- 训练 Gradient Boosting 模型 (Rent) 以生成叶节点特征 ---")
gbrt_rent = GradientBoostingRegressor(**GBRT_PARAMS)
try:
    print(f"  使用以下参数训练 GBRT: {GBRT_PARAMS}")
    gbrt_rent.fit(X_train_rent_imputed, y_train_rent)
    print("  GBRT 模型 (Rent) 训练完成。")
except Exception as e:
    # Without a fitted model there are no leaves to encode, so a fit failure
    # cannot be recovered from; stop here instead of failing one step later.
    print(f"  训练 GBRT (Rent) 或获取重要性时出错: {e}")
    raise SystemExit(1)

try:
    # Rank the original features by importance, highest first, and keep the top N.
    importances_rent = gbrt_rent.feature_importances_
    top_n_indices_rent = np.argsort(importances_rent)[::-1][:N_TOP_ORIGINAL_FEATURES]
    top_n_features_rent = train_rent_cols[top_n_indices_rent].tolist()
    print(f"  识别出 Rent Top {N_TOP_ORIGINAL_FEATURES} 原始特征: {top_n_features_rent}")
except Exception as e:
    # Best-effort fallback: continue with leaf features only. Both names are
    # defined so later steps never meet an undefined variable.
    print(f"  训练 GBRT (Rent) 或获取重要性时出错: {e}")
    top_n_indices_rent = np.array([], dtype=int)
    top_n_features_rent = []

# --- 6. Get the leaf indices (Rent) ---
print("\n--- 获取叶节点索引 (Rent) ---")
try:
    # apply() reports, for every sample, the leaf it falls into in each tree.
    train_rent_leaf_indices = gbrt_rent.apply(X_train_rent_imputed)
    test_rent_leaf_indices = gbrt_rent.apply(X_test_rent_imputed)
except Exception as e:
    print(f"  获取叶节点索引 (Rent) 时出错: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 7. One-hot encode the leaf indices (Rent) ---
print("\n--- 对叶节点索引进行 One-Hot 编码 (Rent) ---")
# handle_unknown='ignore' is set so leaf ids not seen during fit do not raise.
leaf_encoder_rent = OneHotEncoder(handle_unknown='ignore')
try:
    X_train_rent_leaves_encoded = leaf_encoder_rent.fit_transform(train_rent_leaf_indices)
    X_test_rent_leaves_encoded = leaf_encoder_rent.transform(test_rent_leaf_indices)
    n_leaf_features_rent = X_train_rent_leaves_encoded.shape[1]
    print(f"  生成了 {n_leaf_features_rent} 个 Rent 叶节点二元特征。")
except Exception as e:
    print(f"  One-Hot 编码 (Rent) 时出错: {e}")
    raise SystemExit(1)

# --- 8. Combine the leaf features with the top-N original features (Rent) ---
print(f"\n--- 组合 Rent 叶节点特征与 Top {len(top_n_features_rent)} 原始特征 ---")
try:
    if top_n_features_rent:
        # Select the top features by position from the imputed NumPy arrays.
        X_train_rent_top_features = X_train_rent_imputed[:, top_n_indices_rent]
        X_test_rent_top_features = X_test_rent_imputed[:, top_n_indices_rent]

        # Ask for CSR explicitly instead of relying on hstack's default output
        # format; CSR supports the row slicing that cross-validation performs.
        X_train_rent_final = hstack(
            [X_train_rent_leaves_encoded, csr_matrix(X_train_rent_top_features)],
            format='csr',
        )
        X_test_rent_final = hstack(
            [X_test_rent_leaves_encoded, csr_matrix(X_test_rent_top_features)],
            format='csr',
        )
        print(f"  组合后 Rent 最终特征形状: Train={X_train_rent_final.shape}, Test={X_test_rent_final.shape}")
    else:
        # No top features available: fall back to the leaf encoding alone.
        X_train_rent_final = X_train_rent_leaves_encoded
        X_test_rent_final = X_test_rent_leaves_encoded
        print("  只使用编码后的 Rent 叶节点特征。")
except Exception as e:
    print(f"  组合 Rent 特征时出错: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 9. Define the evaluation metric (MAE scorer) ---
print("\n--- 定义评价标准 (MAE) ---")
# greater_is_better=False makes the scorer return the negated MAE, so that
# model selection can maximise it.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
print("MAE 评价标准已创建 (得分越低越好)。")


def calculate_metrics(y_true_log, y_pred_log):
    """Return (RMSE, MAE, MedAE) measured on the original rent scale.

    Both arguments hold log1p-transformed values. They are mapped back with
    expm1, negative back-transformed predictions are clipped to 0, and the
    three error metrics are computed on the result.

    Args:
        y_true_log: True targets in log1p space.
        y_pred_log: Predicted targets in log1p space.

    Returns:
        Tuple ``(rmse, mae, medae)`` on the original scale.
    """
    y_true_orig = np.expm1(y_true_log)
    # expm1 can yield values in (-1, 0) for negative log-space predictions.
    y_pred_orig = np.clip(np.expm1(y_pred_log), a_min=0, a_max=None)
    rmse_orig = np.sqrt(mean_squared_error(y_true_orig, y_pred_orig))
    mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
    medae_orig = median_absolute_error(y_true_orig, y_pred_orig)
    return rmse_orig, mae_orig, medae_orig

# --- 10. Define the final RidgeCV pipeline (Rent) ---
print("\n--- 定义最终的 RidgeCV Pipeline (Rent) ---")
# store_cv_values is no longer passed: False is its default, and the parameter
# has been renamed in newer scikit-learn releases, so omitting it keeps the
# script working across versions.
ridge_cv_model_rent = RidgeCV(
    alphas=RIDGE_ALPHAS,
    cv=RIDGE_CV_FOLDS,
    scoring=mae_scorer,
)
final_pipeline_rent = Pipeline([
    # with_mean=False: the combined feature matrix is sparse and centering it
    # is not supported; only the variance is scaled.
    ('scaler', StandardScaler(with_mean=False)),
    ('model', ridge_cv_model_rent)
])

# --- 11. Outer cross-validation of the pipeline (Rent) ---
# NOTE(review): X_train_rent_final was derived from a GBRT fitted on the full
# training targets, so these fold scores are likely optimistic. Confirm
# whether the leaf encoding should be refitted inside each fold.
print(f"\n--- 使用 {EXTERNAL_CV_FOLDS}-折交叉验证评估最终 Rent Pipeline (MAE in log space) ---")
try:
    external_cv_rent = KFold(
        n_splits=EXTERNAL_CV_FOLDS,
        shuffle=True,
        random_state=RANDOM_STATE,
    )
    cv_scores_log_mae_rent = cross_val_score(
        final_pipeline_rent,
        X_train_rent_final,
        y_train_rent,
        scoring=mae_scorer,
        cv=external_cv_rent,
        n_jobs=-1,
    )
    # The scorer returns negated MAE, so the mean is negative by construction.
    mean_cv_log_mae_rent = cv_scores_log_mae_rent.mean()
    std_cv_log_mae_rent = cv_scores_log_mae_rent.std()
    print(f"  Rent 交叉验证得分 (负 Log MAE): {mean_cv_log_mae_rent:.4f} +/- {std_cv_log_mae_rent:.4f}")
    print(f"  Rent 交叉验证得分 (Log MAE): {-mean_cv_log_mae_rent:.4f}")
except Exception as e:
    # Evaluation only: report the problem and carry on with the final fit.
    print(f"  Rent 外部交叉验证时出错: {e}")

# --- 12. Fit the final pipeline on the full Rent training data ---
print("\n--- 在全部 Rent 训练数据上训练最终的 RidgeCV Pipeline ---")
try:
    final_pipeline_rent.fit(X_train_rent_final, y_train_rent)
    print("  最终 Rent Pipeline 训练完成。")
    # alpha_ is the regularisation strength RidgeCV selected during the fit.
    best_alpha_rent = final_pipeline_rent.named_steps['model'].alpha_
    print(f"  RidgeCV (Rent) 找到的最佳 alpha: {best_alpha_rent:.6f}")
except Exception as e:
    print(f"  训练最终 Rent Pipeline 时出错: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 13. Evaluate the final model on the Rent training set (original scale) ---
print("\n--- 在完整 Rent 训练集上评估最终 Ridge 模型 (原始租金空间) ---")
try:
    y_train_pred_rent_log = final_pipeline_rent.predict(X_train_rent_final)
    # calculate_metrics maps both arguments back from log space itself.
    rmse_train_rent, mae_train_rent, medae_train_rent = calculate_metrics(y_train_rent, y_train_pred_rent_log)
    print(f"  Rent 训练集 RMSE (原): {rmse_train_rent:.2f}")
    print(f"  Rent 训练集 MAE  (原): {mae_train_rent:.2f}")
    print(f"  Rent 训练集 MedAE(原): {medae_train_rent:.2f}")
except Exception as e:
    # Reporting only: a failure here does not block the test prediction.
    print(f"  评估 Rent 训练集时出错: {e}")

# --- 14. Predict on the Rent test set ---
print("\n--- 在 Rent 测试集上进行预测 ---")
try:
    predictions_rent_log = final_pipeline_rent.predict(X_test_rent_final)
    # Undo the log1p applied to the training target.
    predictions_rent = np.expm1(predictions_rent_log)
    print("  Rent 预测完成并已转换回原始租金空间。")

    # Negative log-space predictions come back as values in (-1, 0); floor at 0.
    if np.any(predictions_rent < 0):
        print(f"  警告: 发现 {np.sum(predictions_rent < 0)} 个负数预测值，将修正为 0。")
        predictions_rent = np.clip(predictions_rent, a_min=0, a_max=None)

except Exception as e:
    print(f"  在 Rent 测试集上预测时出错: {e}")
    # Abort with a failure status; exit() would report success (status 0).
    raise SystemExit(1)

# --- 15. Build and save the Rent submission file ---
print(f"\n--- 创建并保存 Rent 提交文件到 '{OUTPUT_FOLDER}' ---")
submission_df_rent = pd.DataFrame({
    ID_COLUMN: test_rent_ids,
    'PredictedPrice': predictions_rent
})
# Store IDs as integers and write the rows in ascending ID order.
submission_df_rent[ID_COLUMN] = submission_df_rent[ID_COLUMN].astype(int)
submission_df_rent = submission_df_rent.sort_values(by=ID_COLUMN)
try:
    output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
    submission_df_rent.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"✓ Rent 提交文件已成功保存: {output_path}")
    print(f"  总预测条数: {len(submission_df_rent)}")
    print("\nRent 提交文件预览 (前5行):")
    print(submission_df_rent.head())
except Exception as e:
    print(f"✗ 保存 Rent 提交文件时出错: {e}")

print("\n--- Rent 叶节点编码 + Top Features + RidgeCV (Log Target) 预测流程完成 ---")

  from scipy.sparse import issparse


开始执行 Rent 预测流程...
输出文件夹 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log' 已创建或已存在。

--- 从 'Rent_Capped_Aggregated_Data' 加载 Rent 数据 ---
Rent 数据加载成功。
  训练集形状: (98899, 159)
  测试集形状: (9773, 158)

--- 转换布尔列 (Rent) 为整数 (1/0) ---

检查 train_rent_df...
  找到 39 个布尔列需要转换: ['环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_二至三环', '环线_五至六环', '环线_六环外', '环线_内环内', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '环线_无环线', '电梯_无', '电梯_有', '电梯_未知', '装修_精装', '装修_简装', '装修_毛坯', '装修_其他', '装修_未知', '装修_精装.1', '装修_简装.1', '装修_毛坯.1', '装修_其他.1', '装修_未知.1', '租赁方式_合租', '租赁方式_整租', '付款方式_半年付价', '付款方式_双月付价', '付款方式_季付价', '付款方式_年付价', '付款方式_月付价', '付款方式_未知', '车位_免费使用', '车位_无车位', '车位_租用车位', '燃气_无', '燃气_有', '燃气_未知']
  train_rent_df 中的布尔列已转换为整数 (0/1)。

检查 test_rent_df...
  找到 38 个布尔列需要转换: ['环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_二至三环', '环线_五至六环', '环线_六环外', '环线_内环内', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '环线_无环线', '电梯_无', '电梯_有', '装修_精装', '装修_简装', '装修_毛坯', '装修_其他', '装修_未知', '装修_精装.1', '装修_简装.1', '装修_毛坯.1', '装修_其他.1', '装修_未知.1', '租赁方式_合租',

  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse


  Rent 交叉验证得分 (负 Log MAE): -0.1303 +/- 0.0009
  Rent 交叉验证得分 (Log MAE): 0.1303

--- 在全部 Rent 训练数据上训练最终的 RidgeCV Pipeline ---
  最终 Rent Pipeline 训练完成。
  RidgeCV (Rent) 找到的最佳 alpha: 51794.746792

--- 在完整 Rent 训练集上评估最终 Ridge 模型 (原始租金空间) ---
  Rent 训练集 RMSE (原): 122312.14
  Rent 训练集 MAE  (原): 67352.69
  Rent 训练集 MedAE(原): 37482.39

--- 在 Rent 测试集上进行预测 ---
  Rent 预测完成并已转换回原始租金空间。

--- 创建并保存 Rent 提交文件到 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log' ---
✓ Rent 提交文件已成功保存: LeafEncoding_Combined_Ridge_Prediction_Rent_Log/submission_leaf_combined_ridge_rent_log_mae_4.csv
  总预测条数: 9773

Rent 提交文件预览 (前5行):
        ID  PredictedPrice
0  2000000    1.708021e+05
1  2000001    4.031391e+05
2  2000002    3.906607e+05
3  2000003    1.775051e+06
4  2000004    1.458940e+06

--- Rent 叶节点编码 + Top Features + RidgeCV (Log Target) 预测流程完成 ---


In [5]:
# ==============================================================================
# 完整脚本：叶节点编码 + Top Features + RidgeCV **租金预测** (Rent Prediction) - V3 (Log Target, Combined Features, External CV)
# ==============================================================================

import pandas as pd
import numpy as np
import os
import joblib # 用于将来可能保存对象
from sklearn.model_selection import KFold, cross_val_score # 导入 cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, median_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack, csr_matrix # 用于组合稀疏和稠密矩阵

print("开始执行 Rent 预测流程...")
print("==============================")

# --- 1. Configuration ---
# Input folder holding the rent train/test CSV files named below.
# NOTE(review): assumes the pre-processed rent data was saved here; change the path if not.
INPUT_FOLDER = 'Rent_Capped_Aggregated_Data'
# Output folder for the predictions produced by this run.
OUTPUT_FOLDER = 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log' # rent-specific folder name
# Input file names
TRAIN_RENT_FILE = 'train_rent_capped_agg.csv' # rent training data
TEST_RENT_FILE = 'test_rent_capped_agg.csv'   # rent test data
# Output (submission) file name
OUTPUT_FILE = 'submission_leaf_combined_ridge_rent_log_mae_5.csv' # rent-specific file name

TARGET_COLUMN = 'Price' # the target column of the rent data is also called 'Price'
ID_COLUMN = 'ID'
RANDOM_STATE = 42

# GBRT hyper-parameters (may mirror the Price model or be tuned separately)
GBRT_PARAMS = {
    'n_estimators': 300,
    'max_depth': 5,
    'min_samples_leaf': 30,
    'learning_rate': 0.01,
    'subsample': 0.7,
    'random_state': RANDOM_STATE
}

# How many of the most important original features are kept alongside the leaf features
N_TOP_ORIGINAL_FEATURES = 30

# RidgeCV settings (may mirror the Price model or be tuned separately)
RIDGE_ALPHAS = np.logspace(3, 5, 15) # alpha grid: 15 log-spaced values from 1e3 to 1e5
RIDGE_CV_FOLDS = 5 # number of folds of RidgeCV's internal cross-validation

# Number of folds of the outer cross-validation
EXTERNAL_CV_FOLDS = 10 # 10-fold outer CV

# --- 2. Create the output folder ---
try:
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"输出文件夹 '{OUTPUT_FOLDER}' 已创建或已存在。")
except OSError as e:
    print(f"创建文件夹 '{OUTPUT_FOLDER}' 时出错: {e}")
    exit()

# --- 3. Load the rent data ---
# Both CSVs are read with 'utf-8-sig' so a leading BOM, if present, is stripped.
# Any load failure is reported and aborts the run via exit().
print(f"\n--- 从 '{INPUT_FOLDER}' 加载 Rent 数据 ---")
try:
    train_rent_df = pd.read_csv(os.path.join(INPUT_FOLDER, TRAIN_RENT_FILE), encoding='utf-8-sig')
    test_rent_df = pd.read_csv(os.path.join(INPUT_FOLDER, TEST_RENT_FILE), encoding='utf-8-sig')
    print("Rent 数据加载成功。")
    print(f"  训练集形状: {train_rent_df.shape}")
    print(f"  测试集形状: {test_rent_df.shape}")
except FileNotFoundError as e:
    print(f"加载 Rent 数据时出错: {e}. 请确保文件路径正确 ({INPUT_FOLDER} 下应有 {TRAIN_RENT_FILE} 和 {TEST_RENT_FILE})。")
    exit()
except Exception as e:
    print(f"加载 Rent 数据时发生其他错误: {e}")
    exit()

# --- Convert boolean columns ---
# Cast every bool-typed column of both frames to 0/1 integers so the later
# numeric-only filter keeps them and the estimators see plain integer dummies.
print("\n--- 转换布尔列 (Rent) 为整数 (1/0) ---")
for df_name, df_obj in (('train_rent_df', train_rent_df), ('test_rent_df', test_rent_df)):
    print(f"\n检查 {df_name}...")
    bool_columns = df_obj.select_dtypes(include='bool').columns
    if bool_columns.empty:
        print("  未找到布尔列。")
        continue
    print(f"  找到 {len(bool_columns)} 个布尔列需要转换: {bool_columns.tolist()}")
    for col in bool_columns:
        try:
            # Plain column assignment replaces the column, so the new integer
            # dtype always takes effect. `df.loc[:, col] = ...` writes into the
            # existing bool column under pandas 2.x and may leave it with a
            # non-integer dtype instead.
            df_obj[col] = df_obj[col].astype(int)
        except Exception as e:
            # Best effort: report the column and keep converting the others.
            print(f"    转换 {df_name} 的列 '{col}' 时出错: {e}")
    print(f"  {df_name} 中的布尔列已转换为整数 (0/1)。")
print("\n--- 布尔列转换完成 ---")

# Keep the test-set IDs for the submission file
test_rent_ids = test_rent_df[ID_COLUMN].copy()

# --- 4. Prepare the data (split X/y, log-transform y, align columns, impute NaN) ---
print("\n--- 准备 Rent 训练和测试数据 ---")
try:
    # The model is trained on log1p(target); predictions are mapped back with expm1 later.
    y_train_rent = np.log1p(train_rent_df[TARGET_COLUMN])
    print(f"  已对 Rent 目标变量 '{TARGET_COLUMN}' 应用 log1p 转换。")
    
    # NOTE(review): only the target is dropped from the training frame; if the training
    # file also carries an ID column it stays in as a feature - confirm against the data.
    X_train_rent = train_rent_df.drop(columns=[TARGET_COLUMN])
    X_test_rent = test_rent_df.drop(columns=[ID_COLUMN])

    # Drop any remaining non-numeric columns (with a warning)
    non_numeric_train_rent = X_train_rent.select_dtypes(exclude=np.number).columns
    if not non_numeric_train_rent.empty:
        print(f"  警告: Rent 训练集发现非数值列，将移除: {non_numeric_train_rent.tolist()}")
        X_train_rent = X_train_rent.drop(columns=non_numeric_train_rent)
    non_numeric_test_rent = X_test_rent.select_dtypes(exclude=np.number).columns
    if not non_numeric_test_rent.empty:
         print(f"  警告: Rent 测试集发现非数值列，将移除: {non_numeric_test_rent.tolist()}")
         X_test_rent = X_test_rent.drop(columns=non_numeric_test_rent)

    # Align the test columns to the training columns: columns missing from the test
    # set are added as all-zero, extra test columns are dropped, and the training
    # column order is enforced.
    train_rent_cols = X_train_rent.columns # rent feature names, reused for the importance ranking
    test_rent_cols = X_test_rent.columns
    missing_in_test_rent = set(train_rent_cols) - set(test_rent_cols)
    for c in missing_in_test_rent: X_test_rent[c] = 0
    extra_in_test_rent = set(test_rent_cols) - set(train_rent_cols)
    if extra_in_test_rent: X_test_rent = X_test_rent.drop(columns=list(extra_in_test_rent))
    X_test_rent = X_test_rent[train_rent_cols] 

    print(f"  对齐后 Rent 特征形状: Train={X_train_rent.shape}, Test={X_test_rent.shape}")

    # Impute missing values: medians are learned on the training set and reused for the test set
    print("  使用中位数填充缺失值...")
    imputer_rent = SimpleImputer(strategy='median')
    X_train_rent_imputed = imputer_rent.fit_transform(X_train_rent)
    X_test_rent_imputed = imputer_rent.transform(X_test_rent)
    print("  Rent 数据缺失值填充完成。")

except Exception as e:
     print(f"Rent 数据准备时发生错误: {e}")
     exit()

# --- 5. Train the GBRT model (Rent) to produce leaf nodes and feature importances ---
print("\n--- 训练 Gradient Boosting 模型 (Rent) 以生成叶节点特征 ---")
gbrt_rent = GradientBoostingRegressor(**GBRT_PARAMS) # rent-specific estimator instance
try:
    print(f"  使用以下参数训练 GBRT: {GBRT_PARAMS}")
    gbrt_rent.fit(X_train_rent_imputed, y_train_rent) # fit on the imputed rent features and log1p target
    print("  GBRT 模型 (Rent) 训练完成。")

    # Rank features by importance (descending) and keep the first N_TOP_ORIGINAL_FEATURES
    importances_rent = gbrt_rent.feature_importances_
    top_n_indices_rent = np.argsort(importances_rent)[::-1][:N_TOP_ORIGINAL_FEATURES]
    top_n_features_rent = train_rent_cols[top_n_indices_rent].tolist() # map indices back to rent column names
    print(f"  识别出 Rent Top {N_TOP_ORIGINAL_FEATURES} 原始特征: {top_n_features_rent}")

except Exception as e:
    print(f"  训练 GBRT (Rent) 或获取重要性时出错: {e}")
    # An empty list makes the later `if top_n_features_rent:` check fall back to leaf-only features.
    top_n_features_rent = [] 

# --- 6. Get the leaf indices (Rent) ---
# apply() returns, for every sample, the index of the leaf it lands in for each tree.
print("\n--- 获取叶节点索引 (Rent) ---")
try:
    train_rent_leaf_indices = gbrt_rent.apply(X_train_rent_imputed)
    test_rent_leaf_indices = gbrt_rent.apply(X_test_rent_imputed)
except Exception as e:
    print(f"  获取叶节点索引 (Rent) 时出错: {e}")
    exit()

# --- 7. One-hot encode the leaf indices (Rent) ---
print("\n--- 对叶节点索引进行 One-Hot 编码 (Rent) ---")
# handle_unknown='ignore': a leaf index not seen during fit is encoded as all zeros
# instead of raising.
leaf_encoder_rent = OneHotEncoder(handle_unknown='ignore', ) # rent-specific encoder instance
try:
    X_train_rent_leaves_encoded = leaf_encoder_rent.fit_transform(train_rent_leaf_indices)
    X_test_rent_leaves_encoded = leaf_encoder_rent.transform(test_rent_leaf_indices)
    n_leaf_features_rent = X_train_rent_leaves_encoded.shape[1]
    print(f"  生成了 {n_leaf_features_rent} 个 Rent 叶节点二元特征。")
except Exception as e:
    print(f"  One-Hot 编码 (Rent) 时出错: {e}")
    exit()

# --- 8. Combine the leaf features with the top-N original features (Rent) ---
# The one-hot leaf matrix is stacked column-wise with the selected original
# columns. When no top features are available, only the leaf matrix is used.
print(f"\n--- 组合 Rent 叶节点特征与 Top {len(top_n_features_rent)} 原始特征 ---")
try:
    if top_n_features_rent:
        # Pick the top columns by position from the imputed NumPy arrays
        X_train_rent_top_features = X_train_rent_imputed[:, top_n_indices_rent]
        X_test_rent_top_features = X_test_rent_imputed[:, top_n_indices_rent]

        # format='csr': hstack otherwise returns a COO matrix, which cannot be
        # row-sliced. CSR keeps both branches in the same, sliceable format for
        # the cross-validation splits that follow.
        X_train_rent_final = hstack(
            [X_train_rent_leaves_encoded, csr_matrix(X_train_rent_top_features)],
            format='csr',
        )
        X_test_rent_final = hstack(
            [X_test_rent_leaves_encoded, csr_matrix(X_test_rent_top_features)],
            format='csr',
        )
        print(f"  组合后 Rent 最终特征形状: Train={X_train_rent_final.shape}, Test={X_test_rent_final.shape}")
    else:
        X_train_rent_final = X_train_rent_leaves_encoded
        X_test_rent_final = X_test_rent_leaves_encoded
        print("  只使用编码后的 Rent 叶节点特征。")
except Exception as e:
    print(f"  组合 Rent 特征时出错: {e}")
    exit()

# --- 9. Define the evaluation metric (MAE scorer) ---
print("\n--- 定义评价标准 (MAE) ---")
# greater_is_better=False makes the scorer return the negated MAE, so scores are
# <= 0 and a score closer to zero means a lower error.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
print("MAE 评价标准已创建 (得分越低越好)。")
# Helper: score log-space predictions on the original (un-logged) scale
def calculate_metrics(y_true_log, y_pred_log):
    """Return RMSE, MAE and median absolute error in the original target space.

    Both inputs are taken on the ``log1p`` scale and mapped back with
    ``np.expm1`` before the metrics are computed.

    Args:
        y_true_log: True target values on the ``log1p`` scale.
        y_pred_log: Predicted target values on the ``log1p`` scale.

    Returns:
        Tuple ``(rmse_orig, mae_orig, medae_orig)`` measured after the
        inverse transform.
    """
    y_true_orig = np.expm1(y_true_log)
    # Clip negative predictions after the inverse transform before scoring,
    # matching the clipping applied to the test predictions.
    y_pred_orig = np.clip(np.expm1(y_pred_log), a_min=0, a_max=None)
    rmse_orig = np.sqrt(mean_squared_error(y_true_orig, y_pred_orig))
    mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
    medae_orig = median_absolute_error(y_true_orig, y_pred_orig)
    return rmse_orig, mae_orig, medae_orig

# --- 10. Define the final RidgeCV pipeline (Rent) ---
print("\n--- 定义最终的 RidgeCV Pipeline (Rent) ---")
# RidgeCV picks alpha from RIDGE_ALPHAS with its own RIDGE_CV_FOLDS-fold CV,
# scored by the (negated) MAE scorer.
# `store_cv_values` is deliberately not passed: its default is already False,
# and the argument is deprecated and later removed in newer scikit-learn
# releases, where passing it warns or fails.
ridge_cv_model_rent = RidgeCV(
    alphas=RIDGE_ALPHAS,
    cv=RIDGE_CV_FOLDS,
    scoring=mae_scorer,
)
final_pipeline_rent = Pipeline([
    # with_mean=False: scale by the standard deviation only, without centering,
    # which is what StandardScaler requires for sparse input.
    ('scaler', StandardScaler(with_mean=False)),
    ('model', ridge_cv_model_rent)
])

# --- 11. Evaluate the pipeline with an outer cross-validation (Rent) ---
# NOTE(review): the leaf features in X_train_rent_final come from a GBRT that was
# fitted on the full training set (targets included) before this split, so the
# held-out folds are not fully unseen and this score is likely optimistic.
print(f"\n--- 使用 {EXTERNAL_CV_FOLDS}-折交叉验证评估最终 Rent Pipeline (MAE in log space) ---")
try:
    external_cv_rent = KFold(n_splits=EXTERNAL_CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    cv_scores_log_mae_rent = cross_val_score(final_pipeline_rent, X_train_rent_final, y_train_rent, 
                                             cv=external_cv_rent, scoring=mae_scorer, n_jobs=-1)
    # Scores are negated MAE values (see mae_scorer), hence the sign flip in the second print.
    mean_cv_log_mae_rent = np.mean(cv_scores_log_mae_rent)
    std_cv_log_mae_rent = np.std(cv_scores_log_mae_rent)
    print(f"  Rent 交叉验证得分 (负 Log MAE): {mean_cv_log_mae_rent:.4f} +/- {std_cv_log_mae_rent:.4f}")
    print(f"  Rent 交叉验证得分 (Log MAE): {-mean_cv_log_mae_rent:.4f}")
except Exception as e:
    # A failed evaluation is reported but does not stop the run.
    print(f"  Rent 外部交叉验证时出错: {e}")

# --- 12. Fit the final pipeline on the full rent training data ---
print("\n--- 在全部 Rent 训练数据上训练最终的 RidgeCV Pipeline ---")
try:
    final_pipeline_rent.fit(X_train_rent_final, y_train_rent)
    print("  最终 Rent Pipeline 训练完成。")
    # alpha_ is the regularisation strength RidgeCV selected from RIDGE_ALPHAS
    best_alpha_rent = final_pipeline_rent.named_steps['model'].alpha_
    print(f"  RidgeCV (Rent) 找到的最佳 alpha: {best_alpha_rent:.6f}")
except Exception as e:
    print(f"  训练最终 Rent Pipeline 时出错: {e}")
    exit()

# --- 13. Evaluate the final model on the rent training set (original rent scale) ---
# These are in-sample metrics: the same rows were used to fit the model.
print("\n--- 在完整 Rent 训练集上评估最终 Ridge 模型 (原始租金空间) ---")
try:
    y_train_pred_rent_log = final_pipeline_rent.predict(X_train_rent_final)
    rmse_train_rent, mae_train_rent, medae_train_rent = calculate_metrics(y_train_rent, y_train_pred_rent_log) 
    print(f"  Rent 训练集 RMSE (原): {rmse_train_rent:.2f}")
    print(f"  Rent 训练集 MAE  (原): {mae_train_rent:.2f}")
    print(f"  Rent 训练集 MedAE(原): {medae_train_rent:.2f}")
except Exception as e:
    # A failed evaluation is reported but does not stop the run.
    print(f"  评估 Rent 训练集时出错: {e}")

# --- 14. Predict on the rent test set ---
print("\n--- 在 Rent 测试集上进行预测 ---")
try:
    predictions_rent_log = final_pipeline_rent.predict(X_test_rent_final)
    # Inverse of the log1p target transform: back to the original rent scale
    predictions_rent = np.expm1(predictions_rent_log)
    print("  Rent 预测完成并已转换回原始租金空间。")
    
    # Report negative predictions, if any, and clip them to 0
    if np.any(predictions_rent < 0):
        print(f"  警告: 发现 {np.sum(predictions_rent < 0)} 个负数预测值，将修正为 0。")
        predictions_rent = np.clip(predictions_rent, a_min=0, a_max=None)

except Exception as e:
    print(f"  在 Rent 测试集上预测时出错: {e}")
    exit()

# --- 15. Build and save the rent submission file ---
print(f"\n--- 创建并保存 Rent 提交文件到 '{OUTPUT_FOLDER}' ---")
submission_df_rent = pd.DataFrame({
    ID_COLUMN: test_rent_ids,
    'PredictedPrice': predictions_rent # rent predictions on the original scale
})
# IDs are written as integers and the rows are ordered by ID
submission_df_rent[ID_COLUMN] = submission_df_rent[ID_COLUMN].astype(int)
submission_df_rent = submission_df_rent.sort_values(by=ID_COLUMN)
try:
    output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE) # rent-specific output file name
    submission_df_rent.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"✓ Rent 提交文件已成功保存: {output_path}")
    print(f"  总预测条数: {len(submission_df_rent)}")
    print("\nRent 提交文件预览 (前5行):")
    print(submission_df_rent.head())
except Exception as e:
    print(f"✗ 保存 Rent 提交文件时出错: {e}")

print("\n--- Rent 叶节点编码 + Top Features + RidgeCV (Log Target) 预测流程完成 ---")

开始执行 Rent 预测流程...
输出文件夹 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log' 已创建或已存在。

--- 从 'Rent_Capped_Aggregated_Data' 加载 Rent 数据 ---
Rent 数据加载成功。
  训练集形状: (98899, 159)
  测试集形状: (9773, 158)

--- 转换布尔列 (Rent) 为整数 (1/0) ---

检查 train_rent_df...
  找到 39 个布尔列需要转换: ['环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_二至三环', '环线_五至六环', '环线_六环外', '环线_内环内', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '环线_无环线', '电梯_无', '电梯_有', '电梯_未知', '装修_精装', '装修_简装', '装修_毛坯', '装修_其他', '装修_未知', '装修_精装.1', '装修_简装.1', '装修_毛坯.1', '装修_其他.1', '装修_未知.1', '租赁方式_合租', '租赁方式_整租', '付款方式_半年付价', '付款方式_双月付价', '付款方式_季付价', '付款方式_年付价', '付款方式_月付价', '付款方式_未知', '车位_免费使用', '车位_无车位', '车位_租用车位', '燃气_无', '燃气_有', '燃气_未知']
  train_rent_df 中的布尔列已转换为整数 (0/1)。

检查 test_rent_df...
  找到 38 个布尔列需要转换: ['环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_二至三环', '环线_五至六环', '环线_六环外', '环线_内环内', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '环线_无环线', '电梯_无', '电梯_有', '装修_精装', '装修_简装', '装修_毛坯', '装修_其他', '装修_未知', '装修_精装.1', '装修_简装.1', '装修_毛坯.1', '装修_其他.1', '装修_未知.1', '租赁方式_合租',

  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse
  from scipy.sparse import issparse


  Rent 交叉验证得分 (负 Log MAE): -0.1303 +/- 0.0009
  Rent 交叉验证得分 (Log MAE): 0.1303

--- 在全部 Rent 训练数据上训练最终的 RidgeCV Pipeline ---
  最终 Rent Pipeline 训练完成。
  RidgeCV (Rent) 找到的最佳 alpha: 51794.746792

--- 在完整 Rent 训练集上评估最终 Ridge 模型 (原始租金空间) ---
  Rent 训练集 RMSE (原): 122430.49
  Rent 训练集 MAE  (原): 67407.62
  Rent 训练集 MedAE(原): 37500.40

--- 在 Rent 测试集上进行预测 ---
  Rent 预测完成并已转换回原始租金空间。

--- 创建并保存 Rent 提交文件到 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log' ---
✓ Rent 提交文件已成功保存: LeafEncoding_Combined_Ridge_Prediction_Rent_Log/submission_leaf_combined_ridge_rent_log_mae_5.csv
  总预测条数: 9773

Rent 提交文件预览 (前5行):
        ID  PredictedPrice
0  2000000    1.696220e+05
1  2000001    4.019170e+05
2  2000002    3.880802e+05
3  2000003    1.764912e+06
4  2000004    1.448916e+06

--- Rent 叶节点编码 + Top Features + RidgeCV (Log Target) 预测流程完成 ---


In [None]:
# ==============================================================================
# 完整脚本：叶节点编码 + Top Features + RidgeCV **租金预测** (Rent Prediction) - V3 (Log Target, Combined Features, External CV)
# ==============================================================================

import pandas as pd
import numpy as np
import os
import joblib # 用于将来可能保存对象
from sklearn.model_selection import KFold, cross_val_score # 导入 cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, median_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack, csr_matrix # 用于组合稀疏和稠密矩阵

print("开始执行 Rent 预测流程...")
print("==============================")

# --- 1. Configuration ---
# Input folder holding the rent train/test CSV files named below.
# NOTE(review): assumes the pre-processed rent data was saved here; change the path if not.
INPUT_FOLDER = 'Rent_Capped_Aggregated_Data'
# Output folder for the predictions produced by this run.
OUTPUT_FOLDER = 'LeafEncoding_Combined_Ridge_Prediction_Rent_Log' # rent-specific folder name
# Input file names
TRAIN_RENT_FILE = 'train_rent_capped_agg.csv' # rent training data
TEST_RENT_FILE = 'test_rent_capped_agg.csv'   # rent test data
# Output (submission) file name
OUTPUT_FILE = 'submission_leaf_combined_ridge_rent_log_mae_5.csv' # rent-specific file name

TARGET_COLUMN = 'Price' # the target column of the rent data is also called 'Price'
ID_COLUMN = 'ID'
RANDOM_STATE = 42

# GBRT hyper-parameters (may mirror the Price model or be tuned separately)
GBRT_PARAMS = {
    'n_estimators': 300,
    'max_depth': 5,
    'min_samples_leaf': 30,
    'learning_rate': 0.01,
    'subsample': 0.7,
    'random_state': RANDOM_STATE
}

# How many of the most important original features are kept alongside the leaf features
N_TOP_ORIGINAL_FEATURES = 30

# RidgeCV settings (may mirror the Price model or be tuned separately)
RIDGE_ALPHAS = np.logspace(3, 5, 15) # alpha grid: 15 log-spaced values from 1e3 to 1e5
RIDGE_CV_FOLDS = 5 # number of folds of RidgeCV's internal cross-validation

# Number of folds of the outer cross-validation
EXTERNAL_CV_FOLDS = 10 # 10-fold outer CV

# --- 2. Create the output folder ---
try:
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"输出文件夹 '{OUTPUT_FOLDER}' 已创建或已存在。")
except OSError as e:
    print(f"创建文件夹 '{OUTPUT_FOLDER}' 时出错: {e}")
    exit()

# --- 3. Load the rent data ---
# Both CSVs are read with 'utf-8-sig' so a leading BOM, if present, is stripped.
# Any load failure is reported and aborts the run via exit().
print(f"\n--- 从 '{INPUT_FOLDER}' 加载 Rent 数据 ---")
try:
    train_rent_df = pd.read_csv(os.path.join(INPUT_FOLDER, TRAIN_RENT_FILE), encoding='utf-8-sig')
    test_rent_df = pd.read_csv(os.path.join(INPUT_FOLDER, TEST_RENT_FILE), encoding='utf-8-sig')
    print("Rent 数据加载成功。")
    print(f"  训练集形状: {train_rent_df.shape}")
    print(f"  测试集形状: {test_rent_df.shape}")
except FileNotFoundError as e:
    print(f"加载 Rent 数据时出错: {e}. 请确保文件路径正确 ({INPUT_FOLDER} 下应有 {TRAIN_RENT_FILE} 和 {TEST_RENT_FILE})。")
    exit()
except Exception as e:
    print(f"加载 Rent 数据时发生其他错误: {e}")
    exit()

# --- Convert boolean columns ---
# Cast every bool-typed column of both frames to 0/1 integers so the later
# numeric-only filter keeps them and the estimators see plain integer dummies.
print("\n--- 转换布尔列 (Rent) 为整数 (1/0) ---")
for df_name, df_obj in (('train_rent_df', train_rent_df), ('test_rent_df', test_rent_df)):
    print(f"\n检查 {df_name}...")
    bool_columns = df_obj.select_dtypes(include='bool').columns
    if bool_columns.empty:
        print("  未找到布尔列。")
        continue
    print(f"  找到 {len(bool_columns)} 个布尔列需要转换: {bool_columns.tolist()}")
    for col in bool_columns:
        try:
            # Plain column assignment replaces the column, so the new integer
            # dtype always takes effect. `df.loc[:, col] = ...` writes into the
            # existing bool column under pandas 2.x and may leave it with a
            # non-integer dtype instead.
            df_obj[col] = df_obj[col].astype(int)
        except Exception as e:
            # Best effort: report the column and keep converting the others.
            print(f"    转换 {df_name} 的列 '{col}' 时出错: {e}")
    print(f"  {df_name} 中的布尔列已转换为整数 (0/1)。")
print("\n--- 布尔列转换完成 ---")

# Keep the test-set IDs for the submission file
test_rent_ids = test_rent_df[ID_COLUMN].copy()

# --- 4. Prepare the data (split X/y, log-transform y, align columns, impute NaN) ---
print("\n--- 准备 Rent 训练和测试数据 ---")
try:
    # The model is trained on log1p(target); predictions are mapped back with expm1 later.
    y_train_rent = np.log1p(train_rent_df[TARGET_COLUMN])
    print(f"  已对 Rent 目标变量 '{TARGET_COLUMN}' 应用 log1p 转换。")
    
    # NOTE(review): only the target is dropped from the training frame; if the training
    # file also carries an ID column it stays in as a feature - confirm against the data.
    X_train_rent = train_rent_df.drop(columns=[TARGET_COLUMN])
    X_test_rent = test_rent_df.drop(columns=[ID_COLUMN])

    # Drop any remaining non-numeric columns (with a warning)
    non_numeric_train_rent = X_train_rent.select_dtypes(exclude=np.number).columns
    if not non_numeric_train_rent.empty:
        print(f"  警告: Rent 训练集发现非数值列，将移除: {non_numeric_train_rent.tolist()}")
        X_train_rent = X_train_rent.drop(columns=non_numeric_train_rent)
    non_numeric_test_rent = X_test_rent.select_dtypes(exclude=np.number).columns
    if not non_numeric_test_rent.empty:
         print(f"  警告: Rent 测试集发现非数值列，将移除: {non_numeric_test_rent.tolist()}")
         X_test_rent = X_test_rent.drop(columns=non_numeric_test_rent)

    # Align the test columns to the training columns: columns missing from the test
    # set are added as all-zero, extra test columns are dropped, and the training
    # column order is enforced.
    train_rent_cols = X_train_rent.columns # rent feature names, reused for the importance ranking
    test_rent_cols = X_test_rent.columns
    missing_in_test_rent = set(train_rent_cols) - set(test_rent_cols)
    for c in missing_in_test_rent: X_test_rent[c] = 0
    extra_in_test_rent = set(test_rent_cols) - set(train_rent_cols)
    if extra_in_test_rent: X_test_rent = X_test_rent.drop(columns=list(extra_in_test_rent))
    X_test_rent = X_test_rent[train_rent_cols] 

    print(f"  对齐后 Rent 特征形状: Train={X_train_rent.shape}, Test={X_test_rent.shape}")

    # Impute missing values: medians are learned on the training set and reused for the test set
    print("  使用中位数填充缺失值...")
    imputer_rent = SimpleImputer(strategy='median')
    X_train_rent_imputed = imputer_rent.fit_transform(X_train_rent)
    X_test_rent_imputed = imputer_rent.transform(X_test_rent)
    print("  Rent 数据缺失值填充完成。")

except Exception as e:
     print(f"Rent 数据准备时发生错误: {e}")
     exit()

# --- 5. Train the GBRT model (Rent) to produce leaf nodes and feature importances ---
print("\n--- 训练 Gradient Boosting 模型 (Rent) 以生成叶节点特征 ---")
gbrt_rent = GradientBoostingRegressor(**GBRT_PARAMS) # rent-specific estimator instance
try:
    print(f"  使用以下参数训练 GBRT: {GBRT_PARAMS}")
    gbrt_rent.fit(X_train_rent_imputed, y_train_rent) # fit on the imputed rent features and log1p target
    print("  GBRT 模型 (Rent) 训练完成。")

    # Rank features by importance (descending) and keep the first N_TOP_ORIGINAL_FEATURES
    importances_rent = gbrt_rent.feature_importances_
    top_n_indices_rent = np.argsort(importances_rent)[::-1][:N_TOP_ORIGINAL_FEATURES]
    top_n_features_rent = train_rent_cols[top_n_indices_rent].tolist() # map indices back to rent column names
    print(f"  识别出 Rent Top {N_TOP_ORIGINAL_FEATURES} 原始特征: {top_n_features_rent}")

except Exception as e:
    print(f"  训练 GBRT (Rent) 或获取重要性时出错: {e}")
    # An empty list makes the later `if top_n_features_rent:` check fall back to leaf-only features.
    top_n_features_rent = [] 

# --- 6. Get the leaf indices (Rent) ---
# apply() returns, for every sample, the index of the leaf it lands in for each tree.
print("\n--- 获取叶节点索引 (Rent) ---")
try:
    train_rent_leaf_indices = gbrt_rent.apply(X_train_rent_imputed)
    test_rent_leaf_indices = gbrt_rent.apply(X_test_rent_imputed)
except Exception as e:
    print(f"  获取叶节点索引 (Rent) 时出错: {e}")
    exit()

# --- 7. One-hot encode the leaf indices (Rent) ---
print("\n--- 对叶节点索引进行 One-Hot 编码 (Rent) ---")
# handle_unknown='ignore': a leaf index not seen during fit is encoded as all zeros
# instead of raising.
leaf_encoder_rent = OneHotEncoder(handle_unknown='ignore', ) # rent-specific encoder instance
try:
    X_train_rent_leaves_encoded = leaf_encoder_rent.fit_transform(train_rent_leaf_indices)
    X_test_rent_leaves_encoded = leaf_encoder_rent.transform(test_rent_leaf_indices)
    n_leaf_features_rent = X_train_rent_leaves_encoded.shape[1]
    print(f"  生成了 {n_leaf_features_rent} 个 Rent 叶节点二元特征。")
except Exception as e:
    print(f"  One-Hot 编码 (Rent) 时出错: {e}")
    exit()

# --- 8. Combine the leaf features with the top-N original features (Rent) ---
# The one-hot leaf matrix is stacked column-wise with the selected original
# columns. When no top features are available, only the leaf matrix is used.
print(f"\n--- 组合 Rent 叶节点特征与 Top {len(top_n_features_rent)} 原始特征 ---")
try:
    if top_n_features_rent:
        # Pick the top columns by position from the imputed NumPy arrays
        X_train_rent_top_features = X_train_rent_imputed[:, top_n_indices_rent]
        X_test_rent_top_features = X_test_rent_imputed[:, top_n_indices_rent]

        # format='csr': hstack otherwise returns a COO matrix, which cannot be
        # row-sliced. CSR keeps both branches in the same, sliceable format for
        # the cross-validation splits that follow.
        X_train_rent_final = hstack(
            [X_train_rent_leaves_encoded, csr_matrix(X_train_rent_top_features)],
            format='csr',
        )
        X_test_rent_final = hstack(
            [X_test_rent_leaves_encoded, csr_matrix(X_test_rent_top_features)],
            format='csr',
        )
        print(f"  组合后 Rent 最终特征形状: Train={X_train_rent_final.shape}, Test={X_test_rent_final.shape}")
    else:
        X_train_rent_final = X_train_rent_leaves_encoded
        X_test_rent_final = X_test_rent_leaves_encoded
        print("  只使用编码后的 Rent 叶节点特征。")
except Exception as e:
    print(f"  组合 Rent 特征时出错: {e}")
    exit()

# --- 9. Define the evaluation metric (MAE scorer) ---
print("\n--- 定义评价标准 (MAE) ---")
# greater_is_better=False makes the scorer return the negated MAE, so scores are
# <= 0 and a score closer to zero means a lower error.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
print("MAE 评价标准已创建 (得分越低越好)。")
# Helper: score log-space predictions on the original (un-logged) scale
def calculate_metrics(y_true_log, y_pred_log):
    """Return RMSE, MAE and median absolute error in the original target space.

    Both inputs are taken on the ``log1p`` scale and mapped back with
    ``np.expm1`` before the metrics are computed.

    Args:
        y_true_log: True target values on the ``log1p`` scale.
        y_pred_log: Predicted target values on the ``log1p`` scale.

    Returns:
        Tuple ``(rmse_orig, mae_orig, medae_orig)`` measured after the
        inverse transform.
    """
    y_true_orig = np.expm1(y_true_log)
    # Clip negative predictions after the inverse transform before scoring,
    # matching the clipping applied to the test predictions.
    y_pred_orig = np.clip(np.expm1(y_pred_log), a_min=0, a_max=None)
    rmse_orig = np.sqrt(mean_squared_error(y_true_orig, y_pred_orig))
    mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
    medae_orig = median_absolute_error(y_true_orig, y_pred_orig)
    return rmse_orig, mae_orig, medae_orig

# --- 10. Define the final RidgeCV pipeline (Rent) ---
print("\n--- 定义最终的 RidgeCV Pipeline (Rent) ---")
# RidgeCV picks alpha from RIDGE_ALPHAS with its own RIDGE_CV_FOLDS-fold CV,
# scored by the (negated) MAE scorer.
# `store_cv_values` is deliberately not passed: its default is already False,
# and the argument is deprecated and later removed in newer scikit-learn
# releases, where passing it warns or fails.
ridge_cv_model_rent = RidgeCV(
    alphas=RIDGE_ALPHAS,
    cv=RIDGE_CV_FOLDS,
    scoring=mae_scorer,
)
final_pipeline_rent = Pipeline([
    # with_mean=False: scale by the standard deviation only, without centering,
    # which is what StandardScaler requires for sparse input.
    ('scaler', StandardScaler(with_mean=False)),
    ('model', ridge_cv_model_rent)
])

# --- 11. Evaluate the pipeline with an outer cross-validation (Rent) ---
# NOTE(review): the leaf features in X_train_rent_final come from a GBRT that was
# fitted on the full training set (targets included) before this split, so the
# held-out folds are not fully unseen and this score is likely optimistic.
print(f"\n--- 使用 {EXTERNAL_CV_FOLDS}-折交叉验证评估最终 Rent Pipeline (MAE in log space) ---")
try:
    external_cv_rent = KFold(n_splits=EXTERNAL_CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    cv_scores_log_mae_rent = cross_val_score(final_pipeline_rent, X_train_rent_final, y_train_rent, 
                                             cv=external_cv_rent, scoring=mae_scorer, n_jobs=-1)
    # Scores are negated MAE values (see mae_scorer), hence the sign flip in the second print.
    mean_cv_log_mae_rent = np.mean(cv_scores_log_mae_rent)
    std_cv_log_mae_rent = np.std(cv_scores_log_mae_rent)
    print(f"  Rent 交叉验证得分 (负 Log MAE): {mean_cv_log_mae_rent:.4f} +/- {std_cv_log_mae_rent:.4f}")
    print(f"  Rent 交叉验证得分 (Log MAE): {-mean_cv_log_mae_rent:.4f}")
except Exception as e:
    # A failed evaluation is reported but does not stop the run.
    print(f"  Rent 外部交叉验证时出错: {e}")

# --- 12. Fit the final pipeline on the full rent training data ---
print("\n--- 在全部 Rent 训练数据上训练最终的 RidgeCV Pipeline ---")
try:
    final_pipeline_rent.fit(X_train_rent_final, y_train_rent)
    print("  最终 Rent Pipeline 训练完成。")
    # alpha_ is the regularisation strength RidgeCV selected from RIDGE_ALPHAS
    best_alpha_rent = final_pipeline_rent.named_steps['model'].alpha_
    print(f"  RidgeCV (Rent) 找到的最佳 alpha: {best_alpha_rent:.6f}")
except Exception as e:
    print(f"  训练最终 Rent Pipeline 时出错: {e}")
    exit()

# --- 13. Evaluate the final model on the rent training set (original rent scale) ---
# These are in-sample metrics: the same rows were used to fit the model.
print("\n--- 在完整 Rent 训练集上评估最终 Ridge 模型 (原始租金空间) ---")
try:
    y_train_pred_rent_log = final_pipeline_rent.predict(X_train_rent_final)
    rmse_train_rent, mae_train_rent, medae_train_rent = calculate_metrics(y_train_rent, y_train_pred_rent_log) 
    print(f"  Rent 训练集 RMSE (原): {rmse_train_rent:.2f}")
    print(f"  Rent 训练集 MAE  (原): {mae_train_rent:.2f}")
    print(f"  Rent 训练集 MedAE(原): {medae_train_rent:.2f}")
except Exception as e:
    # A failed evaluation is reported but does not stop the run.
    print(f"  评估 Rent 训练集时出错: {e}")

# --- 14. Predict on the rent test set ---
print("\n--- 在 Rent 测试集上进行预测 ---")
try:
    predictions_rent_log = final_pipeline_rent.predict(X_test_rent_final)
    # Inverse of the log1p target transform: back to the original rent scale
    predictions_rent = np.expm1(predictions_rent_log)
    print("  Rent 预测完成并已转换回原始租金空间。")
    
    # Report negative predictions, if any, and clip them to 0
    if np.any(predictions_rent < 0):
        print(f"  警告: 发现 {np.sum(predictions_rent < 0)} 个负数预测值，将修正为 0。")
        predictions_rent = np.clip(predictions_rent, a_min=0, a_max=None)

except Exception as e:
    print(f"  在 Rent 测试集上预测时出错: {e}")
    exit()

# --- 15. Build and save the rent submission file ---
print(f"\n--- 创建并保存 Rent 提交文件到 '{OUTPUT_FOLDER}' ---")
submission_df_rent = pd.DataFrame({
    ID_COLUMN: test_rent_ids,
    'PredictedPrice': predictions_rent # rent predictions on the original scale
})
# IDs are written as integers and the rows are ordered by ID
submission_df_rent[ID_COLUMN] = submission_df_rent[ID_COLUMN].astype(int)
submission_df_rent = submission_df_rent.sort_values(by=ID_COLUMN)
try:
    output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE) # rent-specific output file name
    submission_df_rent.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"✓ Rent 提交文件已成功保存: {output_path}")
    print(f"  总预测条数: {len(submission_df_rent)}")
    print("\nRent 提交文件预览 (前5行):")
    print(submission_df_rent.head())
except Exception as e:
    print(f"✗ 保存 Rent 提交文件时出错: {e}")

print("\n--- Rent 叶节点编码 + Top Features + RidgeCV (Log Target) 预测流程完成 ---")