In [None]:
# 叶节点编码 + RidgeCV 房价预测 (Price Prediction)
import pandas as pd
import numpy as np
import os
import joblib # 用于将来可能保存对象
from sklearn.model_selection import KFold # 只导入 KFold，因为 RidgeCV 内置 CV
from sklearn.ensemble import GradientBoostingRegressor # 用于叶节点编码
from sklearn.linear_model import RidgeCV # 最终的正则化线性模型
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, median_absolute_error # 导入所有需要的指标函数
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack, csr_matrix # 用于组合稀疏和稠密矩阵 (如果需要)

# --- 1. 配置 ---
# 输入文件夹 (包含 _selected.csv 文件)
INPUT_FOLDER = 'Feature_Selected_Data'
# 输出文件夹 (存放本次预测结果)
OUTPUT_FOLDER = 'LeafEncoding_Ridge_Prediction'
# 输入文件名
TRAIN_PRICE_FILE = 'train_price_selected.csv'
TEST_PRICE_FILE = 'test_price_selected.csv'
# 输出文件名
OUTPUT_FILE = 'submission_leaf_ridge_mae.csv'

TARGET_COLUMN = 'Price'
ID_COLUMN = 'ID'

RANDOM_STATE = 42 # 保证结果可复现

# GBRT 参数 (用于生成叶节点)
GBRT_PARAMS = {
    'n_estimators': 100,
    'max_depth': 5,
    'min_samples_leaf': 30,
    'learning_rate': 0.1,
    'subsample': 0.7,
    'random_state': RANDOM_STATE
}

# RidgeCV 参数
RIDGE_ALPHAS = np.logspace(-3, 5, 9) # alpha 搜索范围
RIDGE_CV_FOLDS = 5 # RidgeCV 内部交叉验证折数

# --- 2. 创建输出文件夹 ---
try:
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"输出文件夹 '{OUTPUT_FOLDER}' 已创建或已存在。")
except OSError as e:
    print(f"创建文件夹 '{OUTPUT_FOLDER}' 时出错: {e}")
    exit()

# --- 3. 加载经过特征筛选的数据 ---
print(f"\n--- 从 '{INPUT_FOLDER}' 加载 Price 数据 ---")
try:
    train_df = pd.read_csv(os.path.join(INPUT_FOLDER, TRAIN_PRICE_FILE), encoding='utf-8-sig')
    test_df = pd.read_csv(os.path.join(INPUT_FOLDER, TEST_PRICE_FILE), encoding='utf-8-sig')
    print("Price 数据加载成功。")
    print(f"  训练集形状: {train_df.shape}")
    print(f"  测试集形状: {test_df.shape}")
except FileNotFoundError as e:
    print(f"加载数据时出错: {e}. 请确保文件路径正确。")
    exit()
except Exception as e:
    print(f"加载数据时发生其他错误: {e}")
    exit()

# 存储测试集的 ID
test_ids = test_df[ID_COLUMN].copy()

# --- 4. 准备数据 (分离 X/y, 对齐, 填充 NaN) ---
print("\n--- 准备训练和测试数据 ---")
try:
    y_train = train_df[TARGET_COLUMN]
    X_train = train_df.drop(columns=[TARGET_COLUMN])
    X_test = test_df.drop(columns=[ID_COLUMN])

    # 移除可能残余的非数值列
    non_numeric_train = X_train.select_dtypes(exclude=np.number).columns
    if not non_numeric_train.empty:
        print(f"  警告: 训练集发现非数值列，将移除: {non_numeric_train.tolist()}")
        X_train = X_train.drop(columns=non_numeric_train)
    non_numeric_test = X_test.select_dtypes(exclude=np.number).columns
    if not non_numeric_test.empty:
         X_test = X_test.drop(columns=non_numeric_test)

    # 对齐列
    train_cols = X_train.columns
    test_cols = X_test.columns
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test: X_test[c] = 0
    extra_in_test = set(test_cols) - set(train_cols)
    if extra_in_test: X_test = X_test.drop(columns=list(extra_in_test))
    X_test = X_test[train_cols] # 确保顺序一致

    print(f"  对齐后特征形状: Train={X_train.shape}, Test={X_test.shape}")

    # **填充缺失值**
    print("  使用中位数填充缺失值...")
    imputer = SimpleImputer(strategy='median')
    # 使用 NumPy 数组进行后续处理效率更高
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    print("  缺失值填充完成。")

except Exception as e:
     print(f"数据准备时发生错误: {e}")
     exit()

# --- 5. 训练 GBRT 模型以生成叶节点 ---
print("\n--- 训练 Gradient Boosting 模型以生成叶节点特征 ---")
gbrt = GradientBoostingRegressor(**GBRT_PARAMS)
try:
    print(f"  使用以下参数训练 GBRT: {GBRT_PARAMS}")
    # GBRT 可以直接接受 NumPy 数组
    gbrt.fit(X_train_imputed, y_train)
    print("  GBRT 模型训练完成。")
except Exception as e:
    print(f"  训练 GBRT 时出错: {e}")
    exit()

# --- 6. 获取叶节点索引 ---
print("\n--- 获取叶节点索引 ---")
try:
    train_leaf_indices = gbrt.apply(X_train_imputed)
    test_leaf_indices = gbrt.apply(X_test_imputed)
    print(f"  训练集叶节点索引形状: {train_leaf_indices.shape}")
    print(f"  测试集叶节点索引形状: {test_leaf_indices.shape}")
except Exception as e:
    print(f"  获取叶节点索引时出错: {e}")
    exit()

# --- 7. 对叶节点索引进行 One-Hot 编码 ---
print("\n--- 对叶节点索引进行 One-Hot 编码 ---")
leaf_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
try:
    X_train_leaves_encoded = leaf_encoder.fit_transform(train_leaf_indices)
    X_test_leaves_encoded = leaf_encoder.transform(test_leaf_indices)
    print(f"  编码后训练集特征形状 (稀疏): {X_train_leaves_encoded.shape}")
    print(f"  编码后测试集特征形状 (稀疏): {X_test_leaves_encoded.shape}")
    n_leaf_features = X_train_leaves_encoded.shape[1]
    print(f"  生成了 {n_leaf_features} 个叶节点二元特征。")
except Exception as e:
    print(f"  One-Hot 编码时出错: {e}")
    exit()

# --- 8. 准备最终特征集 (本次只使用叶节点) ---
X_train_final = X_train_leaves_encoded
X_test_final = X_test_leaves_encoded
print("\n  本次只使用编码后的叶节点特征作为最终输入。")


# --- 9. 定义评价标准 (MAE Scorer) - 解决 NameError 的关键 ---
print("\n--- 定义评价标准 (MAE) ---")
# ！！！在这里定义 mae_scorer ！！！
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
print("MAE 评价标准已创建 (得分越低越好)。")

# 定义辅助函数用于后续评估
def calculate_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    return rmse, mae, medae

# --- 10. 定义并训练最终的 RidgeCV 模型 ---
print("\n--- 定义并训练最终的 RidgeCV 模型 ---")

ridge_cv_model = RidgeCV(
    alphas=RIDGE_ALPHAS,
    cv=RIDGE_CV_FOLDS,
    scoring=mae_scorer, # 使用上面定义的 mae_scorer
    store_cv_values=False
)

# 构建最终的 Pipeline
final_pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)), # 稀疏矩阵不中心化
    ('model', ridge_cv_model)
])

try:
    print("  开始训练最终的 RidgeCV Pipeline...")
    final_pipeline.fit(X_train_final, y_train)
    print("  RidgeCV Pipeline 训练完成。")
    best_alpha = final_pipeline.named_steps['model'].alpha_
    print(f"  RidgeCV 找到的最佳 alpha: {best_alpha:.6f}")
except Exception as e:
    print(f"  训练 RidgeCV 时出错: {e}")
    exit()

# --- 11. 在训练集上评估最终模型 ---
print("\n--- 在完整训练集上评估最终 Ridge 模型 ---")
try:
    y_train_pred = final_pipeline.predict(X_train_final)
    rmse_train, mae_train, medae_train = calculate_metrics(y_train, y_train_pred)
    print(f"  训练集 RMSE:  {rmse_train:.4f}")
    print(f"  训练集 MAE:    {mae_train:.4f}")
    print(f"  训练集 MedAE: {medae_train:.4f}")
except Exception as e:
    print(f"  评估训练集时出错: {e}")

# --- 12. 在测试集上进行预测 ---
print("\n--- 在测试集上进行预测 ---")
try:
    predictions = final_pipeline.predict(X_test_final)
    print("  预测完成。")
except Exception as e:
    print(f"  在测试集上预测时出错: {e}")
    exit()

# --- 13. 创建并保存提交文件 ---
print(f"\n--- 创建并保存提交文件到 '{OUTPUT_FOLDER}' ---")
submission_df = pd.DataFrame({
    ID_COLUMN: test_ids,
    'PredictedPrice': predictions
})
submission_df[ID_COLUMN] = submission_df[ID_COLUMN].astype(int)
submission_df = submission_df.sort_values(by=ID_COLUMN)
try:
    output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
    submission_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"✓ 提交文件已成功保存: {output_path}")
    print(f"  总预测条数: {len(submission_df)}")
    print("\n提交文件预览 (前5行):")
    print(submission_df.head())
except Exception as e:
    print(f"✗ 保存提交文件时出错: {e}")

print("\n--- 叶节点编码 + RidgeCV 预测流程完成 ---")

输出文件夹 'LeafEncoding_Ridge_Prediction' 已创建或已存在。

--- 从 'Feature_Selected_Data' 加载 Price 数据 ---
Price 数据加载成功。
  训练集形状: (103871, 150)
  测试集形状: (34017, 150)

--- 准备训练和测试数据 ---
  警告: 训练集发现非数值列，将移除: ['楼层_中楼层', '楼层_低楼层', '楼层_地下室', '楼层_底层', '楼层_顶层', '楼层_高楼层', '环线_三至四环', '环线_中环至外环', '环线_二环内', '环线_二至三环', '环线_五至六环', '环线_六环外', '环线_内环内', '环线_内环至中环', '环线_内环至外环', '环线_四至五环', '环线_外环外', '环线_无环线', '电梯_无', '电梯_有', '电梯_未知', '建筑结构_未知结构', '建筑结构_框架结构', '建筑结构_混合结构', '建筑结构_砖木结构', '建筑结构_砖混结构', '建筑结构_钢混结构', '建筑结构_钢结构', '装修_精装', '装修_简装', '装修_毛坯', '装修_其他', '装修_未知', '别墅_双拼', '别墅_叠拼', '别墅_独栋', '别墅_联排', '别墅_非别墅', '产权所属_共有', '产权所属_非共有']
  对齐后特征形状: Train=(103871, 109), Test=(34017, 109)
  使用中位数填充缺失值...
  缺失值填充完成。

--- 训练 Gradient Boosting 模型以生成叶节点特征 ---
  使用以下参数训练 GBRT: {'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 30, 'learning_rate': 0.1, 'subsample': 0.7, 'random_state': 42}
  GBRT 模型训练完成。

--- 获取叶节点索引 ---
  训练集叶节点索引形状: (103871, 100)
  测试集叶节点索引形状: (34017, 100)

--- 对叶节点索引进行 One-Hot 编码 ---
  编码后训练集特征形状 (稀疏



  RidgeCV Pipeline 训练完成。
  RidgeCV 找到的最佳 alpha: 10000.000000

--- 在完整训练集上评估最终 Ridge 模型 ---
  训练集 RMSE:  530581.5983
  训练集 MAE:    295425.3068
  训练集 MedAE: 171167.0565

--- 在测试集上进行预测 ---
  预测完成。

--- 创建并保存提交文件到 'LeafEncoding_Ridge_Prediction' ---
✓ 提交文件已成功保存: LeafEncoding_Ridge_Prediction\submission_leaf_ridge_mae.csv
  总预测条数: 34017

提交文件预览 (前5行):
        ID  PredictedPrice
0  1000000    1.606780e+07
1  1000001    3.280667e+06
2  1000002    3.591298e+06
3  1000003    2.274458e+06
4  1000004    1.089319e+07

--- 叶节点编码 + RidgeCV 预测流程完成 ---


In [3]:
# ==============================================================================
# 完整脚本：叶节点编码 + Top Features + RidgeCV 房价预测 (Price Prediction) - V3 (Log Target, Combined Features, External CV)
# ==============================================================================

import pandas as pd
import numpy as np
import os
import joblib # 用于将来可能保存对象
from sklearn.model_selection import KFold, cross_val_score # 导入 cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, make_scorer, mean_squared_error, median_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack, csr_matrix # 用于组合稀疏和稠密矩阵

# --- 1. 配置 ---
INPUT_FOLDER = 'Feature_Selected_Data'
OUTPUT_FOLDER = 'LeafEncoding_Combined_Ridge_Prediction_Log' # 更新文件夹名
TRAIN_PRICE_FILE = 'train_price_selected.csv'
TEST_PRICE_FILE = 'test_price_selected.csv'
OUTPUT_FILE = 'submission_leaf_combined_ridge_log_mae.csv' # 更新文件名

TARGET_COLUMN = 'Price'
ID_COLUMN = 'ID'
RANDOM_STATE = 42

# GBRT 参数 (调整后)
GBRT_PARAMS = {
    'n_estimators': 150,      # 稍微增加树的数量
    'max_depth': 4,          # 稍微降低树的深度
    'min_samples_leaf': 30,
    'learning_rate': 0.1,
    'subsample': 0.7,
    'random_state': RANDOM_STATE
}

# 保留多少个最重要的原始特征与叶节点结合
N_TOP_ORIGINAL_FEATURES = 10 

# RidgeCV 参数
RIDGE_ALPHAS = np.logspace(-4, 6, 15) # 保持精细的 Alpha 范围
RIDGE_CV_FOLDS = 5 # RidgeCV 内部交叉验证折数

# 外部交叉验证折数 (用于最终评估)
EXTERNAL_CV_FOLDS = 6 # 您之前要求的 6 折

# --- 2. 创建输出文件夹 ---
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
print(f"输出文件夹 '{OUTPUT_FOLDER}' 已创建或已存在。")

# --- 3. 加载数据 ---
print(f"\n--- 从 '{INPUT_FOLDER}' 加载 Price 数据 ---")
try:
    train_df = pd.read_csv(os.path.join(INPUT_FOLDER, TRAIN_PRICE_FILE), encoding='utf-8-sig')
    test_df = pd.read_csv(os.path.join(INPUT_FOLDER, TEST_PRICE_FILE), encoding='utf-8-sig')
    print("Price 数据加载成功。")
except FileNotFoundError as e:
    print(f"加载数据时出错: {e}. 请确保文件路径正确。")
    exit()
except Exception as e:
    print(f"加载数据时发生其他错误: {e}")
    exit()

# --- 转换布尔列 ---
print("\n--- 转换布尔列 (True/False) 为整数 (1/0) ---")
# (省略了详细打印，假设之前的步骤已完成)
for df_name in ['train_df', 'test_df']:
    df_obj = globals()[df_name]
    bool_columns = df_obj.select_dtypes(include='bool').columns
    if not bool_columns.empty:
        print(f"  转换 {df_name} 中的 {len(bool_columns)} 个布尔列...")
        for col in bool_columns:
            df_obj.loc[:, col] = df_obj[col].astype(int)
print("--- 布尔列转换完成 ---")

# 存储测试集的 ID
test_ids = test_df[ID_COLUMN].copy()

# --- 4. 准备数据 (分离 X/y, Log Transform y, 对齐, 填充 NaN) ---
print("\n--- 准备训练和测试数据 ---")
try:
    # !!! 应用 Log Transform !!!
    y_train = np.log1p(train_df[TARGET_COLUMN])
    print(f"  已对目标变量 '{TARGET_COLUMN}' 应用 log1p 转换。")
    
    X_train = train_df.drop(columns=[TARGET_COLUMN])
    X_test = test_df.drop(columns=[ID_COLUMN])

    # 移除可能残余的非数值列
    non_numeric_train = X_train.select_dtypes(exclude=np.number).columns
    if not non_numeric_train.empty:
        print(f"  警告: 训练集发现非数值列，将移除: {non_numeric_train.tolist()}")
        X_train = X_train.drop(columns=non_numeric_train)
    non_numeric_test = X_test.select_dtypes(exclude=np.number).columns
    if not non_numeric_test.empty:
         X_test = X_test.drop(columns=non_numeric_test)

    # 对齐列
    train_cols = X_train.columns # 保存列名以供后续使用
    test_cols = X_test.columns
    missing_in_test = set(train_cols) - set(test_cols)
    for c in missing_in_test: X_test[c] = 0
    extra_in_test = set(test_cols) - set(train_cols)
    if extra_in_test: X_test = X_test.drop(columns=list(extra_in_test))
    X_test = X_test[train_cols] 

    print(f"  对齐后特征形状: Train={X_train.shape}, Test={X_test.shape}")

    # **填充缺失值**
    print("  使用中位数填充缺失值...")
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    print("  缺失值填充完成。")

except Exception as e:
     print(f"数据准备时发生错误: {e}")
     exit()

# --- 5. 训练 GBRT 模型以生成叶节点 & 获取特征重要性 ---
print("\n--- 训练 Gradient Boosting 模型以生成叶节点特征 ---")
gbrt = GradientBoostingRegressor(**GBRT_PARAMS)
try:
    print(f"  使用以下参数训练 GBRT: {GBRT_PARAMS}")
    # GBRT 在填充后的 NumPy 数组上训练
    gbrt.fit(X_train_imputed, y_train) # 注意 y_train 是 log 转换后的
    print("  GBRT 模型训练完成。")

    # 获取特征重要性
    importances = gbrt.feature_importances_
    # 获取 Top N 特征的索引和名称
    top_n_indices = np.argsort(importances)[::-1][:N_TOP_ORIGINAL_FEATURES]
    top_n_features = train_cols[top_n_indices].tolist() # 使用之前保存的列名
    print(f"  识别出 Top {N_TOP_ORIGINAL_FEATURES} 原始特征: {top_n_features}")

except Exception as e:
    print(f"  训练 GBRT 或获取重要性时出错: {e}")
    # 即使出错，也继续尝试叶节点编码，但不添加原始特征
    top_n_features = [] 
    # exit() # 如果希望严格要求特征重要性计算成功，则取消此行注释

# --- 6. 获取叶节点索引 ---
print("\n--- 获取叶节点索引 ---")
try:
    train_leaf_indices = gbrt.apply(X_train_imputed)
    test_leaf_indices = gbrt.apply(X_test_imputed)
except Exception as e:
    print(f"  获取叶节点索引时出错: {e}")
    exit()

# --- 7. 对叶节点索引进行 One-Hot 编码 ---
print("\n--- 对叶节点索引进行 One-Hot 编码 ---")
leaf_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
try:
    X_train_leaves_encoded = leaf_encoder.fit_transform(train_leaf_indices)
    X_test_leaves_encoded = leaf_encoder.transform(test_leaf_indices)
    n_leaf_features = X_train_leaves_encoded.shape[1]
    print(f"  生成了 {n_leaf_features} 个叶节点二元特征。")
except Exception as e:
    print(f"  One-Hot 编码时出错: {e}")
    exit()

# --- 8. 组合叶节点特征与 Top N 原始特征 ---
print(f"\n--- 组合叶节点特征与 Top {len(top_n_features)} 原始特征 ---")
try:
    if top_n_features:
        # 从填充后的 NumPy 数组中按索引选取 Top N 特征
        X_train_top_features = X_train_imputed[:, top_n_indices]
        X_test_top_features = X_test_imputed[:, top_n_indices]

        # 使用 hstack 组合（需要将稠密部分转为 CSR 稀疏格式）
        X_train_final = hstack([X_train_leaves_encoded, csr_matrix(X_train_top_features)])
        X_test_final = hstack([X_test_leaves_encoded, csr_matrix(X_test_top_features)])
        print(f"  组合后最终特征形状: Train={X_train_final.shape}, Test={X_test_final.shape}")
    else:
        # 如果没有 top_n_features (例如 GBRT 训练失败)，只用叶节点
        X_train_final = X_train_leaves_encoded
        X_test_final = X_test_leaves_encoded
        print("  只使用编码后的叶节点特征。")

except Exception as e:
    print(f"  组合特征时出错: {e}")
    exit()


# --- 9. 定义评价标准 (MAE Scorer) ---
print("\n--- 定义评价标准 (MAE) ---")
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
print("MAE 评价标准已创建 (得分越低越好)。")
# 辅助函数
def calculate_metrics(y_true, y_pred):
    # !!! 注意: 这里的 y_true 和 y_pred 都是 log 转换后的 !!!
    # 计算 log 空间上的指标
    rmse_log = np.sqrt(mean_squared_error(y_true, y_pred))
    mae_log = mean_absolute_error(y_true, y_pred)
    medae_log = median_absolute_error(y_true, y_pred)
    
    # 计算原始价格空间上的指标 (需要逆转换)
    y_true_orig = np.expm1(y_true)
    y_pred_orig = np.expm1(y_pred)
    rmse_orig = np.sqrt(mean_squared_error(y_true_orig, y_pred_orig))
    mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
    medae_orig = median_absolute_error(y_true_orig, y_pred_orig)
    
    # 返回原始空间的指标以供最终评估
    return rmse_orig, mae_orig, medae_orig


# --- 10. 定义最终的 RidgeCV Pipeline ---
print("\n--- 定义最终的 RidgeCV Pipeline ---")
ridge_cv_model = RidgeCV(
    alphas=RIDGE_ALPHAS,
    cv=RIDGE_CV_FOLDS,
    scoring=mae_scorer, # RidgeCV 内部仍然优化 log 空间上的 MAE
    store_cv_values=False
)
final_pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)), # 稀疏矩阵不中心化
    ('model', ridge_cv_model)
])

# --- 11. 外部交叉验证评估 Pipeline ---
print(f"\n--- 使用 {EXTERNAL_CV_FOLDS}-折交叉验证评估最终 Pipeline (MAE in log space) ---")
try:
    # 使用 KFold 进行外部 CV
    external_cv = KFold(n_splits=EXTERNAL_CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    # 在 log 空间上计算 MAE
    cv_scores_log_mae = cross_val_score(final_pipeline, X_train_final, y_train, 
                                        cv=external_cv, scoring=mae_scorer, n_jobs=-1)
    
    mean_cv_log_mae = np.mean(cv_scores_log_mae)
    std_cv_log_mae = np.std(cv_scores_log_mae)
    
    print(f"  交叉验证得分 (负 Log MAE): {mean_cv_log_mae:.4f} +/- {std_cv_log_mae:.4f}")
    print(f"  交叉验证得分 (Log MAE): {-mean_cv_log_mae:.4f}") # 打印正的 Log MAE

    # **重要**: 这个 CV MAE 是在 log 空间上的，不直接等于原始价格空间的 MAE
    
except Exception as e:
    print(f"  外部交叉验证时出错: {e}")
    # 即使 CV 出错，也继续尝试训练最终模型

# --- 12. 在全部训练数据上训练最终 Pipeline ---
print("\n--- 在全部训练数据上训练最终的 RidgeCV Pipeline ---")
try:
    final_pipeline.fit(X_train_final, y_train)
    print("  最终 Pipeline 训练完成。")
    best_alpha = final_pipeline.named_steps['model'].alpha_
    print(f"  RidgeCV 找到的最佳 alpha: {best_alpha:.6f}")
except Exception as e:
    print(f"  训练最终 Pipeline 时出错: {e}")
    exit()

# --- 13. 在训练集上评估最终模型 (原始价格空间) ---
print("\n--- 在完整训练集上评估最终 Ridge 模型 (原始价格空间) ---")
try:
    y_train_pred_log = final_pipeline.predict(X_train_final)
    # 使用更新后的 calculate_metrics 函数，它会返回原始空间的指标
    rmse_train, mae_train, medae_train = calculate_metrics(y_train, y_train_pred_log) 
    print(f"  训练集 RMSE (原): {rmse_train:.2f}")
    print(f"  训练集 MAE  (原): {mae_train:.2f}")
    print(f"  训练集 MedAE(原): {medae_train:.2f}")
except Exception as e:
    print(f"  评估训练集时出错: {e}")

# --- 14. 在测试集上进行预测 ---
print("\n--- 在测试集上进行预测 ---")
try:
    predictions_log = final_pipeline.predict(X_test_final)
    # !!! 逆转换回原始价格空间 !!!
    predictions = np.expm1(predictions_log)
    print("  预测完成并已转换回原始价格空间。")
    
    # 检查是否有负数预测 (log转换后理论上不会，但 expm1 可能产生接近0的负数)
    if np.any(predictions < 0):
        print(f"  警告: 发现 {np.sum(predictions < 0)} 个负数预测值，将修正为 0。")
        predictions = np.clip(predictions, a_min=0, a_max=None)

except Exception as e:
    print(f"  在测试集上预测时出错: {e}")
    exit()

# --- 15. 创建并保存提交文件 ---
print(f"\n--- 创建并保存提交文件到 '{OUTPUT_FOLDER}' ---")
submission_df = pd.DataFrame({
    ID_COLUMN: test_ids,
    'PredictedPrice': predictions
})
submission_df[ID_COLUMN] = submission_df[ID_COLUMN].astype(int)
submission_df = submission_df.sort_values(by=ID_COLUMN)
try:
    output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
    submission_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"✓ 提交文件已成功保存: {output_path}")
    print(f"  总预测条数: {len(submission_df)}")
    print("\n提交文件预览 (前5行):")
    print(submission_df.head())
except Exception as e:
    print(f"✗ 保存提交文件时出错: {e}")

print("\n--- 叶节点编码 + Top Features + RidgeCV (Log Target) 预测流程完成 ---")

输出文件夹 'LeafEncoding_Combined_Ridge_Prediction_Log' 已创建或已存在。

--- 从 'Feature_Selected_Data' 加载 Price 数据 ---
Price 数据加载成功。

--- 转换布尔列 (True/False) 为整数 (1/0) ---
  转换 train_df 中的 40 个布尔列...
  转换 test_df 中的 40 个布尔列...
--- 布尔列转换完成 ---

--- 准备训练和测试数据 ---
  已对目标变量 'Price' 应用 log1p 转换。
  对齐后特征形状: Train=(103871, 149), Test=(34017, 149)
  使用中位数填充缺失值...


  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[:, col] = df_obj[col].astype(int)
  df_obj.loc[

  缺失值填充完成。

--- 训练 Gradient Boosting 模型以生成叶节点特征 ---
  使用以下参数训练 GBRT: {'n_estimators': 150, 'max_depth': 4, 'min_samples_leaf': 30, 'learning_rate': 0.1, 'subsample': 0.7, 'random_state': 42}
  GBRT 模型训练完成。
  识别出 Top 10 原始特征: ['面积_数值', '城市_0', 'lon', 'lat', '环线_无环线', '燃气费_数值', '停车费用_数值', '城市_4', '房屋优势_地铁', '区域']

--- 获取叶节点索引 ---

--- 对叶节点索引进行 One-Hot 编码 ---
  生成了 2332 个叶节点二元特征。

--- 组合叶节点特征与 Top 10 原始特征 ---
  组合后最终特征形状: Train=(103871, 2342), Test=(34017, 2342)

--- 定义评价标准 (MAE) ---
MAE 评价标准已创建 (得分越低越好)。

--- 定义最终的 RidgeCV Pipeline ---

--- 使用 6-折交叉验证评估最终 Pipeline (MAE in log space) ---
  交叉验证得分 (负 Log MAE): -0.1217 +/- 0.0005
  交叉验证得分 (Log MAE): 0.1217

--- 在全部训练数据上训练最终的 RidgeCV Pipeline ---




  最终 Pipeline 训练完成。
  RidgeCV 找到的最佳 alpha: 7196.856730

--- 在完整训练集上评估最终 Ridge 模型 (原始价格空间) ---
  训练集 RMSE (原): 619835.82
  训练集 MAE  (原): 278646.86
  训练集 MedAE(原): 128316.77

--- 在测试集上进行预测 ---
  预测完成并已转换回原始价格空间。

--- 创建并保存提交文件到 'LeafEncoding_Combined_Ridge_Prediction_Log' ---
✓ 提交文件已成功保存: LeafEncoding_Combined_Ridge_Prediction_Log\submission_leaf_combined_ridge_log_mae.csv
  总预测条数: 34017

提交文件预览 (前5行):
        ID  PredictedPrice
0  1000000    1.654684e+07
1  1000001    3.231474e+06
2  1000002    3.812927e+06
3  1000003    2.002398e+06
4  1000004    1.020258e+07

--- 叶节点编码 + Top Features + RidgeCV (Log Target) 预测流程完成 ---
