## Price测试

In [73]:
# 1️⃣ 环境准备
import pandas as pd
import numpy as np
import re
import warnings
from scipy.sparse import issparse
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer #FunctionTransformer: 用于将自定义函数（如 np.log1p）应用为 Pipeline 中的一个步骤。
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.base import clone

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

# 🎯自定义指标函数（计算原始价格尺度上的MAE与RMSE）
def calculate_mae_rmse_original(y_log_true, y_log_pred):
    mae, rmse = np.nan, np.nan
    try:
        y_log_true_np = np.asarray(y_log_true).flatten()
        y_log_pred_np = np.asarray(y_log_pred).flatten()
        #清理nan\inf，防止后续指数计算溢出
        log_max = np.log1p(np.finfo(np.float64).max / 10)
        y_log_true_np = np.nan_to_num(y_log_true_np, nan=0.0, posinf=log_max, neginf=-700)
        y_log_pred_np = np.nan_to_num(y_log_pred_np, nan=0.0, posinf=log_max, neginf=-700)
        y_log_true_np = np.clip(y_log_true_np, -700, log_max)
        y_log_pred_np = np.clip(y_log_pred_np, -700, log_max)
        
        y_price_pred = np.expm1(y_log_pred_np)
        y_price_true = np.expm1(y_log_true_np)
        y_price_pred[y_price_pred < 0] = 0 #价格不能为负，裁剪为0

        large_finite_val = np.finfo(np.float64).max / 10
        y_price_pred = np.nan_to_num(y_price_pred, nan=0.0, posinf=large_finite_val, neginf=0.0)
        y_price_true = np.nan_to_num(y_price_true, nan=0.0, posinf=large_finite_val, neginf=0.0)
        y_price_pred = np.clip(y_price_pred, 0, large_finite_val)
        y_price_true = np.clip(y_price_true, 0, large_finite_val)

        if np.isnan(y_price_pred).any() or np.isnan(y_price_true).any():
            y_price_pred = np.nan_to_num(y_price_pred, nan=0.0)
            y_price_true = np.nan_to_num(y_price_true, nan=0.0)

        mae = mean_absolute_error(y_price_true, y_price_pred)
        mse = mean_squared_error(y_price_true, y_price_pred)
        if mse < 0 or not np.isfinite(mse): rmse = np.nan
        else: rmse = np.sqrt(mse) # 计算RMSE
        return mae, rmse
    except (ValueError, OverflowError, TypeError) as e:
        return np.nan, np.nan

# --- 🚀 优化点 1: 重新定义 mae_metric (用于 make_scorer) ---
# GridSearchCV 默认认为“分数越高越好”，MAE是一个误差指标，越低越好。
def mae_metric(y_log_true, y_log_pred):
    """指标：返回负 MAE (需最大化)"""
    mae, _ = calculate_mae_rmse_original(y_log_true, y_log_pred)
    if np.isnan(mae):
        return -np.finfo(np.float64).max 
    return -mae

# --- 🚀 优化点 2: 向量化特征清理函数 ---
def clean_complex_features_optimized(df):
    """执行单位转换、正则提取和初始特征清理 (向量化版本)"""
    df = df.copy()
    
    # 1. 单位转换 (不变)
    def clean_unit(series, unit):
        cleaned = series.astype(str).str.replace(unit, '', regex=False).str.strip()
        cleaned = cleaned.replace(['', '暂无', 'null', 'None'], np.nan)
        return pd.to_numeric(cleaned, errors='coerce')
    
    for col, unit in [('建筑面积', '㎡'), ('套内面积', '㎡'), ('房屋总数', '户'), ('楼栋总数', '栋')]:
        if col in df.columns:
            df[col] = clean_unit(df[col], unit)
    if '绿 化 率' in df.columns:
        df['绿 化 率'] = clean_unit(df['绿 化 率'], '%')
        if df['绿 化 率'] is not None:
            df['绿 化 率'] = df['绿 化 率'] / 100

    # 2. 提取数字范围均值 (使用 .str.extractall 向量化)
    cols_to_extract = ['物 业 费', '燃气费', '供热费', '停车费用', '建筑年代', '停车位']
    for col in cols_to_extract:
        if col in df.columns:
            cleaned_series = df[col].astype(str).replace(['暂无', 'null', 'None', ''], np.nan)
            all_nums = cleaned_series.str.extractall(r"(\d+(?:\.\d+)?)")[0].astype(float)
            avg_series = all_nums.groupby(level=0).mean()
            df[f'{col}_均值'] = avg_series
            df = df.drop(columns=[col], errors='ignore')
    
    for col in df.filter(like='_均值').columns:
        if df[col].isnull().any():
            median_val = df[col].dropna().median()
            if pd.isna(median_val): median_val = 0
            df[col] = df[col].fillna(median_val)

    # 3. 提取楼层
    if '所在楼层' in df.columns:
        df['楼层位置'] = df['所在楼层'].astype(str).str.extract(r'([A-Za-z\u4e00-\u9fa5]+)').fillna('未知')
        df['总楼层数'] = pd.to_numeric(df['所在楼层'].astype(str).str.extract(r'\(共(\d+)层\)')[0], errors='coerce').fillna(0).astype(int)
        df = df.drop(columns=['所在楼层'])

    # 4. 提取户型
    if '房屋户型' in df.columns:
        df['室'] = pd.to_numeric(df['房屋户型'].astype(str).str.extract(r'(\d+)室')[0], errors='coerce').fillna(0).astype(int)
        df['厅'] = pd.to_numeric(df['房屋户型'].astype(str).str.extract(r'(\d+)厅')[0], errors='coerce').fillna(0).astype(int)
        df['卫'] = pd.to_numeric(df['房屋户型'].astype(str).str.extract(r'(\d+)卫')[0], errors='coerce').fillna(0).astype(int)
        df = df.drop(columns=['房屋户型'])

    # 5. 交互项/多项式特征 (不变)
    if '建筑面积' in df.columns and '套内面积' in df.columns:
        df['建筑面积_clean'] = df['建筑面积'].replace(0, np.nan)
        df['套内面积_比'] = (df['套内面积'] / df['建筑面积_clean']).replace([np.inf, -np.inf], np.nan).fillna(0)
        df = df.drop(columns=['建筑面积_clean'])
    if '容 积 率' in df.columns:
        df['容 积 率'] = pd.to_numeric(df['容 积 率'], errors='coerce').fillna(df['容 积 率'].median())
        df['容积率_sq'] = df['容 积 率'] ** 2
    
    # 填充数值型缺失 (对所有数值列进行最终的中位数填充)
    for col in df.select_dtypes(include=np.number).columns:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())

    # 6. 删除列 (不变)
    drop_cols = [
        '抵押信息', '别墅类型', '交易时间', '上次交易',
        '房屋优势', '核心卖点', '户型介绍', '周边配套', '交通出行', '客户反馈', '产权描述', '房屋用途',
        'coord_x', 'coord_y', '物业办公电话', '开发商', '物业公司', '房屋年限',
        '停车费用' 
    ]
    df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')

    # 填充缺失分类特征 (不变)
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].fillna('缺失')

    return df

# 2️⃣ 数据读取 (PRICE 文件)
TRAIN_SNIPPET = "ruc_Class25Q2_train_price.csv"
TEST_SNIPPET = "ruc_Class25Q2_test_price.csv"
TARGET_NAME = "Price"
TRAIN_ENCODING = 'utf-8-sig'
TEST_ENCODING = 'utf-8-sig'

# (加载逻辑不变)
try:
    train_price = pd.read_csv(TRAIN_SNIPPET, low_memory=False, encoding=TRAIN_ENCODING, sep=',')
    print(f"✅ 训练集加载成功，编码: {TRAIN_ENCODING}，分隔符: Comma。")
except Exception:
    train_price = pd.read_csv(TRAIN_SNIPPET, low_memory=False, encoding=TRAIN_ENCODING, sep='\t')
    print(f"✅ 训练集加载成功，编码: {TRAIN_ENCODING}，分隔符: Tab。")
try:
    test_price = pd.read_csv(TEST_SNIPPET, low_memory=False, encoding=TEST_ENCODING, sep=',')
    print(f"✅ 测试集加载成功，编码: {TEST_ENCODING}，分隔符: Comma。")
except Exception:
    test_price = pd.read_csv(TEST_SNIPPET, low_memory=False, encoding=TEST_ENCODING, sep='\t')
    print(f"✅ 测试集加载成功，编码: {TEST_ENCODING}，分隔符: Tab。")

train_data = train_price
test_data = test_price

# 3️⃣ 初步检查与清洗 
print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)
target = TARGET_NAME
if target not in train_data.columns:
    raise ValueError(f"目标变量 '{target}' 不在训练集列中。")
KEEP_COLS = [target, "物 业 费", "停车费用"]
#识别数据泄露特征，排除在KEEP_COLS中的重要列，删除泄露未来信息的列
leak_cols = [c for c in train_data.columns if ("comm" in c.lower() or "price" in c.lower()) and c not in KEEP_COLS]
train_data.drop(columns=[col for col in leak_cols if col in train_data.columns], inplace=True, errors='ignore')
test_data.drop(columns=[col for col in leak_cols if col in test_data.columns], inplace=True, errors='ignore')

all_data = pd.concat([train_data.drop(columns=[target]), test_data], keys=['train', 'test'])
all_data_cleaned = clean_complex_features_optimized(all_data) 
for col in all_data_cleaned.select_dtypes(include='object').columns:
    all_data_cleaned[col] = all_data_cleaned[col].astype('category') #所有文本列换为分类数据类型
X_train_raw = all_data_cleaned.loc['train'].copy() #训练特征
X_test_final = all_data_cleaned.loc['test'].copy() #测试特征
y_train_log = np.log1p(train_data[target])
y_train_price = train_data[target]

# 4️⃣ 离群值处理 
Q1 = y_train_log.quantile(0.25)
Q3 = y_train_log.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outlier_mask = (y_train_log >= lower_bound) & (y_train_log <= upper_bound)
X_train_no_outliers = X_train_raw.loc[outlier_mask].copy()
y_train_log_no_outliers = y_train_log.loc[outlier_mask].copy()
y_train_price_no_outliers = y_train_price.loc[outlier_mask].copy() #使用掩码过滤离群值对应的样本
initial_count = len(train_data)
final_count = len(X_train_no_outliers)
print(f"\n--- 离群值处理结果 ---")
print(f"原始样本数: {initial_count}")
print(f"移除离群值后样本数: {final_count} (请报告此数字)")

# 5️⃣ 区分变量类型 
auto_numeric_features = X_train_no_outliers.select_dtypes(include=[np.number]).columns.tolist()
auto_categorical_features = X_train_no_outliers.select_dtypes(exclude=[np.number]).columns.tolist()
known_categorical_numerics = ['城市', '区域', '板块', '区县', '年份']
categorical_features = list(set(auto_categorical_features + [col for col in known_categorical_numerics if col in X_train_no_outliers.columns])) #set()：去重，避免重复特征/条件判断：只添加数据集中实际存在的特征
numeric_features = [f for f in auto_numeric_features if f not in known_categorical_numerics]
if 'ID' in numeric_features: numeric_features.remove('ID')
if 'ID' in categorical_features: categorical_features.remove('ID')
print(f"\n--- 修正后的特征 ---")
print(f"数值特征 ({len(numeric_features)}): {numeric_features}")
print(f"分类特征 ({len(categorical_features)}): {categorical_features}")

# 6️⃣ 特征预处理 Pipeline 定义 
log_transformer = FunctionTransformer(lambda x: np.log1p(np.maximum(x, 0)), validate=False)
numeric_transformer = Pipeline([
    ('imputer1', SimpleImputer(strategy='median')), # 第一步：中位数填充缺失值
    ('logtransform', log_transformer),              # 第二步：对数变换
    ('imputer2', SimpleImputer(strategy='median')), # 第三步：再次填充，处理变换后可能产生的异常值
    ('scaler', StandardScaler())                    # 第四步：标准化，使特征均值为0，方差为1，便于模型优化
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='缺失')),    # 填充缺失
    ('astype_str', FunctionTransformer(lambda x: x.astype(str), validate=False)), # 转字符串
    ('onehot', OneHotEncoder(handle_unknown='ignore', max_categories=50)) # 未见过的类别时忽略，避免报错；限制最大类别数，防止维度爆炸
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features), # 数值特征流水线
        ('cat', categorical_transformer, categorical_features) # 分类特征流水线
    ],
    remainder='drop',      # 处理其他列的方式
    sparse_threshold=0.3   # 稀疏矩阵阈值
)

# 7️⃣ 模型定义与超参数（重点步骤）
models_base = {
    'OLS': LinearRegression(),
    'LASSO': Lasso(max_iter=10000, random_state=111),
    # --- ⬇️ 核心修正：已删除所有导致崩溃的参数 (tol, random_state, solver) ⬇️ ---
    'Ridge': Ridge(max_iter=10000), 
    'ElasticNet': ElasticNet(max_iter=10000, random_state=111)
}

# 为需要调优的模型指定超参数的搜索范围
param_grids = {
    'LASSO': {'alpha': np.logspace(-4, 0, 3)}, #[0.0001, 0.01, 1]产生一系列对数等距的值，搜索alpha常用方法
    'Ridge': {'alpha': np.logspace(-2, 2, 3)}, #[0.01, 1, 100]
    'ElasticNet': {'alpha': np.logspace(-4, 0, 2), 'l1_ratio': [0.1, 0.9]},
}

# --- 🚀 优化点 4: 调换顺序，先做预处理 (原步骤 B) ---
print("\n--- 步骤 B (优化): 立即执行预处理 ---")
preprocessor_final = clone(preprocessor)
preprocessor_final.fit(X_train_no_outliers)
print("--- Transforming Data ---")
X_full_transformed = preprocessor_final.transform(X_train_no_outliers)
X_test_transformed = preprocessor_final.transform(X_test_final)  #将训练数据、测试数据转换为模型可以使用的数值矩阵
y_full_log_np = y_train_log_no_outliers.values
y_full_price_np = y_train_price_no_outliers.values
mean_price = np.mean(y_full_price_np)
print(f"Data transformed. Shape: {X_full_transformed.shape}")

# --- 🚀 优化点 5: 在预处理后的数据上进行 GSCV 和评估 ---
print("\n--- 步骤 9 (优化): 在已转换数据上运行 GSCV 和评估 ---")
X_train_transformed, X_valid_transformed, y_train_log_split, y_valid_log_split = train_test_split(
    X_full_transformed, y_full_log_np, test_size=0.2, random_state=111
)
kf = KFold(n_splits=6, shuffle=True, random_state=111) #6折交叉验证：平衡偏差/方差
custom_mae_scorer = make_scorer(mae_metric, greater_is_better=True) 

results = []
fitted_models = {}

for name, model_base in models_base.items():
    print(f"\n--- 开始处理: {name} ---")
    final_model = clone(model_base)
    
    if name in param_grids:
        print(f"--- 运行 GridSearchCV: {name} ---")
        gscv = GridSearchCV(
            final_model, param_grids[name], scoring=custom_mae_scorer,
            cv=kf, n_jobs=-1, verbose=1, refit=True 
        )
        gscv.fit(X_full_transformed, y_full_log_np) #网格搜索交叉验证
        print(f"✅ {name} Best Params found: {gscv.best_params_}")
        cv_mae = -gscv.best_score_  #将结果中提取最佳交叉验证
        final_model = gscv.best_estimator_ 
        fitted_models[name] = final_model
        
    else: # (OLS 逻辑不变)
        print(f"--- 拟合 (无 GSCV): {name} ---")
        final_model.fit(X_full_transformed, y_full_log_np)
        print(f"--- {name} 手动 CV (无 GSCV) ---")
        fold_mae_scores = []
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_full_transformed, y_full_log_np)):
            X_train_fold, X_val_fold = X_full_transformed[train_idx], X_full_transformed[val_idx]
            y_train_fold, y_val_fold = y_full_log_np[train_idx], y_full_log_np[val_idx]
            fold_model = clone(final_model).fit(X_train_fold, y_train_fold)
            y_pred_fold_log = fold_model.predict(X_val_fold)
            fold_mae_scores.append(-mae_metric(y_val_fold, y_pred_fold_log))
        
        cv_mae = np.nanmean(fold_mae_scores)
        fitted_models[name] = final_model

    print(f"--- {name} CV MAE: {cv_mae:.2f} ---")

    # --- 性能计算 (In-sample / Out-of-sample) ---
    try:
        y_pred_train_log = final_model.predict(X_train_transformed)
        mae_train, _ = calculate_mae_rmse_original(y_train_log_split, y_pred_train_log)
    except Exception: mae_train = np.nan
    try:
        y_pred_valid_log = final_model.predict(X_valid_transformed)
        mae_valid, _ = calculate_mae_rmse_original(y_valid_log_split, y_pred_valid_log)
    except Exception: mae_valid = np.nan
    
    results.append({
        "Metrics": name,
        "In-sample MAE": mae_train,     # 训练集性能
        "In-sample RMAE": mae_train / mean_price if mean_price != 0 else np.nan,
        "Out-of-sample MAE": mae_valid, # 验证集性能
        "Out-of-sample RMAE": mae_valid / mean_price if mean_price != 0 else np.nan,
        "Cross-validation MAE": cv_mae, # 交叉验证性能
        "Cross-validation RMAE": cv_mae / mean_price if mean_price != 0 and not np.isnan(cv_mae) else np.nan
    })
    
    # 诊断 (不变)
    print(f"  Intercept: {final_model.intercept_:.4f}")
    if hasattr(final_model, 'coef_'):
        coefs = final_model.coef_
        if issparse(coefs): coefs = coefs.toarray().flatten()
        print(f"  Coefficient Stats: Mean={np.mean(coefs):.4f}, Std={np.std(coefs):.4f}")
        if np.allclose(coefs, 0, atol=1e-5):
            print(f"  ⚠️ 警告: 模型 {name} 的系数几乎全为零！")

# 🔟 输出结果表 (不变)
result_df = pd.DataFrame(results)
print("\n--- Determining Best Linear Model (based on Cross-validation MAE) ---")
valid_cv_mae_indices = result_df["Cross-validation MAE"].dropna().index
best_model_name_final = 'OLS'
if not valid_cv_mae_indices.empty:
    best_model_idx = result_df.loc[valid_cv_mae_indices, "Cross-validation MAE"].idxmin() #在所有有效的交叉验证 MAE 中，找到最小值对应的索引。.idxmin() 函数返回最小值的索引。
    best_model_name_orig = result_df.loc[best_model_idx, "Metrics"] #根据找到的最小MAE的索引，从"Metrics"列中获取该模型的原始名称（比如'Ridge'）。
    if best_model_name_orig in fitted_models and fitted_models[best_model_name_orig] is not None: #检查这个找到的最佳模型（比如 'Ridge'）是否在前一步骤中成功训练并被保存在了fitted_models字典里。
        #如果成功了
        result_df.loc[result_df['Metrics'] == best_model_name_orig, 'Metrics'] = 'Best Linear Model' #在result_df中，将这个最佳模型的名字（比如 'Ridge'）重命名为'Best Linear Model'，方便在最终表格中识别。
        best_model_name_final = best_model_name_orig 
        print(f"最佳模型（基于 CV MAE）: {best_model_name_final}")
    else:
        #如果失败了
        print(f"警告: 确定的最佳模型 '{best_model_name_orig}' (CV MAE) 训练失败或不存在。正在回退。")
        available_models = [m for m in ['OLS', 'Ridge', 'LASSO', 'ElasticNet'] if m in fitted_models and fitted_models[m] is not None]
        #创建一个列表，包含所有成功训练的模型，按 OLS, Ridge, LASSO, ElasticNet 的优先顺序排列。
        if available_models: #如果至少有一个模型成功训练了
            best_model_name_final = available_models[0] #选择列表中的第一个（即优先级最高的）可用模型作为备选的最佳模型
            print(f"回退到第一个可用模型: {best_model_name_final}")
            #在result_df中将这个备选模型的名字重命名为 'Best Linear Model (Fallback)'。
            if best_model_name_final in result_df['Metrics'].values:
                result_df.loc[result_df['Metrics'] == best_model_name_final, 'Metrics'] = 'Best Linear Model (Fallback)'
            else: #如果所有模型都训练失败了，打印错误信息。
                fallback_idx = next((i for i, r in enumerate(results) if r['Metrics'] == best_model_name_final), None)
                if fallback_idx is not None:
                    result_df.loc[fallback_idx, 'Metrics'] = 'Best Linear Model (Fallback)'
        else:
            print("错误: 所有模型训练失败。无法确定最佳模型。")
else:
    print("警告: 所有模型在 Cross-validation MAE 上失败。无法确定最佳模型。")

result_df_formatted = result_df.copy() #格式化结果表格：创建 result_df 的一个副本，以免修改原始数据
for col in result_df_formatted.columns:
    if 'MAE' in col:
        result_df_formatted[col] = result_df_formatted[col].apply(lambda x: f"{x:.2f}" if pd.notna(x) else 'NaN')
    #将该列中的数值格式化为保留两位小数的字符串。如果是缺失值 (NaN)，则显示 'NaN'
    elif 'RMAE' in col:
        result_df_formatted[col] = result_df_formatted[col].apply(lambda x: f"{x:.4f}" if pd.notna(x) else 'NaN')
    #将该列中的数值格式化为保留四位小数的字符串。如果是缺失值 (NaN)，则显示 'NaN'。
print("\n--- 10. Model Performance Summary (MAE and RMAE for Original Price Level) ---")
print(result_df_formatted.to_markdown(index=False))


# 🔟➕1️⃣ 选择最优模型 (不变)
print(f"\n--- Selecting Final Model for Prediction: {best_model_name_final} ---")
if best_model_name_final in fitted_models and fitted_models[best_model_name_final] is not None:
    best_model = fitted_models[best_model_name_final] #如果上述检查通过（即最佳模型确实成功训练了），就从 fitted_models 字典中取出那个训练好的模型对象（比如训练好的 Ridge 实例），并将其赋值给变量 best_model。
    best_model_name_predict = best_model_name_final #记录下实际用于预测的模型名称
else: #稳健性设计
    available_models = [m for m in ['Ridge', 'OLS', 'LASSO', 'ElasticNet'] if m in fitted_models and fitted_models[m] is not None]
    if available_models:
        best_model_name_predict = available_models[0] #选择列表中第一个（即优先级最高的）成功训练的模型作为备选预测模型。
        best_model = fitted_models[best_model_name_predict]
        print(f"警告: CV MAE 最佳模型 '{best_model_name_final}' 失败或不可用，回退到第一个可用的模型: {best_model_name_predict}。")
    else:
        raise RuntimeError("所有模型训练失败，无法继续进行预测。")


# 🔟➕2️⃣ 预测测试集 (不变)
try:
    print(f"\n--- 使用最终模型进行预测: {best_model_name_predict} ---")
    
    input_data_is_sparse = issparse(X_test_transformed) #检查输入的测试数据是否是稀疏矩阵格式
    input_data_nan = (input_data_is_sparse and np.isnan(X_test_transformed.data).any()) or \
                     (not input_data_is_sparse and np.isnan(X_test_transformed).any())
    input_data_inf = (input_data_is_sparse and np.isinf(X_test_transformed.data).any()) or \
                     (not input_data_is_sparse and np.isinf(X_test_transformed).any())
    if input_data_nan or input_data_inf:
        print("警告：最终预测前在 X_test_transformed 中检测到 NaN/Inf，尝试填充 0。")
        if input_data_is_sparse:
            X_test_transformed.data = np.nan_to_num(X_test_transformed.data, nan=0.0, posinf=0.0, neginf=0.0)
        else:
            X_test_transformed = np.nan_to_num(X_test_transformed, nan=0.0, posinf=0.0, neginf=0.0)

    test_pred_log = best_model.predict(X_test_transformed) #使用选定的 best_model 对处理后的测试数据 X_test_transformed 进行预测

    # (诊断逻辑不变)
    print(f"预测的对数价格 (前 5 个): {test_pred_log[:5]}")
    finite_preds = test_pred_log[np.isfinite(test_pred_log)]
    if finite_preds.size > 0:
        print(f"对数价格统计 (有限值): Min={np.min(finite_preds):.2f}, Max={np.max(finite_preds):.2f}, Mean={np.mean(finite_preds):.2f}, Std={np.std(finite_preds):.2f}")
    if np.isnan(test_pred_log).any() or np.isinf(test_pred_log).any():
        print("❌ 警告: 预测的对数价格包含 NaN 或 Inf！正在尝试清理...")
        median_log_pred = np.nanmedian(finite_preds) if finite_preds.size > 0 else np.log1p(np.median(y_train_price_no_outliers))
        test_pred_log = np.nan_to_num(test_pred_log, nan=median_log_pred, posinf=np.log1p(np.finfo(np.float64).max / 10), neginf=-700)
    #逆转换回原始价格尺度与最终清理
    test_pred_price = np.expm1(test_pred_log) #对数尺度的预测值 test_pred_log 转换回原始的价格尺度
    test_pred_price[test_pred_price < 0] = 0
    large_finite_val = np.finfo(np.float64).max / 10 #数值上限
    median_fallback = np.median(y_train_price_no_outliers) if len(y_train_price_no_outliers)>0 else 0 #计算训练集原始价格的中位数，作为最终的备用值。
    test_pred_price = np.nan_to_num(test_pred_price, nan=median_fallback,
                                      posinf=large_finite_val, neginf=0.0) #对价格尺度的预测值进行最后一次清理，替换可能因 expm1 产生的 NaN 或 Inf，并将超出极大范围的值替换掉。
    test_pred_price = np.clip(test_pred_price, 0, large_finite_val) #将所有预测值限制在 0 和 large_finite_val 之间，确保数值的合理性。
    print(f"最终预测价格 (前 5 个): {test_pred_price[:5]}")

except Exception as e:
    import traceback
    print(f"最终预测出错: {e}")
    traceback.print_exc()
    median_fallback = np.median(y_train_price_no_outliers) if len(y_train_price_no_outliers)>0 else 0
    print(f"预测失败，回退到预测中位数: {median_fallback}")
    test_pred_price = np.full(X_test_transformed.shape[0], median_fallback) #创建一个数组，其长度与测试集样本数相同，并将所有预测值都设置为这个 median_fallback

# 🔟➕3️⃣ 生成提交文件 (使用您已修正的路径)
submission_df = pd.DataFrame({
    "ID": test_data['ID'].values if 'ID' in test_data.columns and len(test_data['ID']) == X_test_transformed.shape[0] else np.arange(X_test_transformed.shape[0]),
    "prediction": test_pred_price
})
submission_df.to_csv("prediction_price.csv", index=False, encoding="utf-8-sig")
print("🎯 Prediction file saved as **prediction_price.csv** (使用逗号分隔)")

# 🔟➕4️⃣ 保存模型性能表 (使用您已修正的路径)
result_df.to_csv("performance_table_price.csv", index=False, encoding="utf-8-sig")
print("📊 Model performance saved as **performance_table_price.csv** (使用逗号分隔)")

✅ 训练集加载成功，编码: utf-8-sig，分隔符: Comma。
✅ 测试集加载成功，编码: utf-8-sig，分隔符: Comma。
Train shape: (103871, 55)
Test shape: (34017, 55)

--- 离群值处理结果 ---
原始样本数: 103871
移除离群值后样本数: 102726 (请报告此数字)

--- 修正后的特征 ---
数值特征 (20): ['建筑面积', '套内面积', 'lon', 'lat', '房屋总数', '楼栋总数', '绿 化 率', '容 积 率', '物 业 费_均值', '燃气费_均值', '供热费_均值', '停车费用_均值', '建筑年代_均值', '停车位_均值', '总楼层数', '室', '厅', '卫', '套内面积_比', '容积率_sq']
分类特征 (19): ['环线', '供电', '供水', '年份', '梯户比例', '产权所属', '房屋朝向', '物业类别', '城市', '板块', '交易权属', '环线位置', '建筑结构', '区域', '装修情况', '楼层位置', '配备电梯', '区县', '供暖']

--- 步骤 B (优化): 立即执行预处理 ---
--- Transforming Data ---
Data transformed. Shape: (102726, 418)

--- 步骤 9 (优化): 在已转换数据上运行 GSCV 和评估 ---

--- 开始处理: OLS ---
--- 拟合 (无 GSCV): OLS ---
--- OLS 手动 CV (无 GSCV) ---
--- OLS CV MAE: 410888.11 ---
  Intercept: 14.3715
  Coefficient Stats: Mean=-0.0101, Std=0.7568

--- 开始处理: LASSO ---
--- 运行 GridSearchCV: LASSO ---
Fitting 6 folds for each of 3 candidates, totalling 18 fits
✅ LASSO Best Params found: {'alpha': np.float64(0.0001)}
--- LA

## 关于租价预测

In [65]:
# 1️⃣ 环境准备
import pandas as pd
import numpy as np
import re
import warnings
# Add scipy sparse matrix check
from scipy.sparse import issparse # 从 scipy.sparse 模块导入 issparse 函数，用于检查数据是否是稀疏矩阵格式
from sklearn.model_selection import train_test_split, KFold, GridSearchCV 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer 
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline # Pipeline 用于定义转换器和 GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer # 导入 make_scorer
from sklearn.impute import SimpleImputer
from sklearn.base import clone # 需要 clone

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

# 🎯自定义指标函数
# (calculate_mae_rmse_original 函数保持不变，包含健壮性处理) 
def calculate_mae_rmse_original(y_log_true, y_log_pred):
    mae, rmse = np.nan, np.nan
    try:
        y_log_true_np = np.asarray(y_log_true).flatten()
        y_log_pred_np = np.asarray(y_log_pred).flatten()

        log_max = np.log1p(np.finfo(np.float64).max / 10)
        y_log_true_np = np.nan_to_num(y_log_true_np, nan=0.0, posinf=log_max, neginf=-700)
        y_log_pred_np = np.nan_to_num(y_log_pred_np, nan=0.0, posinf=log_max, neginf=-700) 
        y_log_true_np = np.clip(y_log_true_np, -700, log_max)
        y_log_pred_np = np.clip(y_log_pred_np, -700, log_max)

        y_price_pred = np.expm1(y_log_pred_np) #进行指数运算的逆操作：使用 np.expm1(x) 函数（计算 exp(x) - 1）将清理过的**预测对数值**转换回**原始价格尺度**。这是 np.log1p() 的精确逆运算。
        y_price_true = np.expm1(y_log_true_np)
        y_price_pred[y_price_pred < 0] = 0

        large_finite_val = np.finfo(np.float64).max / 10
        y_price_pred = np.nan_to_num(y_price_pred, nan=0.0, posinf=large_finite_val, neginf=0.0)
        y_price_true = np.nan_to_num(y_price_true, nan=0.0, posinf=large_finite_val, neginf=0.0)
        y_price_pred = np.clip(y_price_pred, 0, large_finite_val)
        y_price_true = np.clip(y_price_true, 0, large_finite_val)

        if np.isnan(y_price_pred).any() or np.isnan(y_price_true).any():
             y_price_pred = np.nan_to_num(y_price_pred, nan=0.0)
             y_price_true = np.nan_to_num(y_price_true, nan=0.0)

        mae = mean_absolute_error(y_price_true, y_price_pred)
        mse = mean_squared_error(y_price_true, y_price_pred)
        if mse < 0 or not np.isfinite(mse): rmse = np.nan
        else: rmse = np.sqrt(mse)
        return mae, rmse
    except (ValueError, OverflowError, TypeError) as e:
        return np.nan, np.nan

# 自定义 MAE 评分器 (用于 GridSearchCV)
def mae_scorer(estimator, X, y_log_true):
    """评分器：返回负 MAE (需最大化)"""
    y_log_pred = estimator.predict(X)
    mae, _ = calculate_mae_rmse_original(y_log_true, y_log_pred)
    if np.isnan(mae):
        return -np.finfo(np.float64).max # 返回极差分数
    return -mae

# ⚙️ 复杂特征清理函数 (核心修正：适配 RENT 并保留关键特征)
def clean_complex_features(df):
    """执行单位转换、正则提取和初始特征清理。已针对 RENT 数据集列名适配。"""
    df = df.copy()

    # 1. 单位转换
    def clean_unit(series, unit):
        cleaned = series.astype(str).str.replace(unit, '', regex=False).str.strip()
        cleaned = cleaned.replace(['', '暂无', 'null', 'None'], np.nan)
        return pd.to_numeric(cleaned, errors='coerce')

    # *** 适配 RENT: '面积', '房屋总数', '楼栋总数' ***
    for col, unit in [('面积', '㎡'), ('房屋总数', '户'), ('楼栋总数', '栋')]:
        if col in df.columns:
            df[col] = clean_unit(df[col], unit)
    if '绿 化 率' in df.columns:
        df['绿 化 率'] = clean_unit(df['绿 化 率'], '%')
        if df['绿 化 率'] is not None:
            df['绿 化 率'] = df['绿 化 率'] / 100

    # 2. 提取数字范围均值
    def extract_avg_value(text):
        if pd.isna(text) or str(text).strip() in ['暂无', 'null', 'None','']: return np.nan
        match = re.findall(r"(\d+(?:\.\d+)?)", str(text)) ## 使用正则表达式查找文本中的所有数字（包括整数和小数）
        nums = [float(n) for n in match]
        return np.mean(nums) if nums else np.nan

    # 适配 RENT: 确保 '停车位' 和 '停车费用' 都被清理
    for col in ['物 业 费', '燃气费', '供热费', '停车费用', '建筑年代', '停车位']:
        if col in df.columns:
            df[f'{col}_均值'] = df[col].apply(extract_avg_value)
            df = df.drop(columns=[col], errors='ignore')
    
    for col in df.filter(like='_均值').columns:
         if df[col].isnull().any():
             median_val = df[col].dropna().median()
             if pd.isna(median_val): median_val = 0
             df[col] = df[col].fillna(median_val)

    # 3. 提取楼层 (适配 RENT 列名 '楼层' 和格式 "低楼层/18层")
    if '楼层' in df.columns:
        df['楼层位置'] = df['楼层'].astype(str).str.extract(r'([A-Za-z\u4e00-\u9fa5]+)').fillna('未知')
        df['总楼层数'] = pd.to_numeric(df['楼层'].astype(str).str.extract(r'/(\d+)层')[0], errors='coerce').fillna(0).astype(int)
        df = df.drop(columns=['楼层'])
    
    # 4. 提取户型 (适配 RENT 列名 '户型')
    if '户型' in df.columns:
        df['室'] = pd.to_numeric(df['户型'].astype(str).str.extract(r'(\d+)室')[0], errors='coerce').fillna(0).astype(int)
        df['厅'] = pd.to_numeric(df['户型'].astype(str).str.extract(r'(\d+)厅')[0], errors='coerce').fillna(0).astype(int)
        df['卫'] = pd.to_numeric(df['户型'].astype(str).str.extract(r'(\d+)卫')[0], errors='coerce').fillna(0).astype(int)
        df = df.drop(columns=['户型'])

    # 5. 交互项/多项式特征 (适配 RENT：仅使用 '面积' 和 '容 积 率')
    if '面积' in df.columns:
        df['面积'] = pd.to_numeric(df['面积'], errors='coerce') # 确保 '面积' 是数值
        df['面积_sq'] = df['面积'] ** 2
    if '容 积 率' in df.columns:
        df['容 积 率'] = pd.to_numeric(df['容 积 率'], errors='coerce').fillna(df['容 积 率'].median())
        df['容积率_sq'] = df['容 积 率'] ** 2
    
    # 修正：对所有数值列填充中位数
    for col in df.select_dtypes(include=np.number).columns:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())

    # 6. 删除列 (RENT 列表)
    # 核心修正：保留 '城市', '板块', '区县', 'lon', 'lat' 
    drop_cols = [
        # RENT 独有需删除的特征
        '装修', '交易时间', '付款方式', '租赁方式', '租期', '配套设施', 
        # '城市', '区域', '板块', '区县', # <-- 保留
        # 'lon', 'lat', # <-- 保留
        # '房屋总数', '停车位_均值', # <-- 保留
        '核心卖点', '户型介绍', '周边配套', '交通出行', '客户反馈', '产权描述', '房屋用途', 
        'coord_x', 'coord_y', '物业办公电话', '开发商', '物业公司', '房屋年限',
        '停车费用'
    ]
    df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')

    # 填充缺失分类特征
    for col in df.select_dtypes(include='object').columns:
         df[col] = df[col].fillna('缺失')

    return df


# 2️⃣ 数据读取 (RENT 文件)
TRAIN_SNIPPET = "ruc_Class25Q2_train_rent.csv"
TEST_SNIPPET = "ruc_Class25Q2_test_rent.csv"
TARGET_NAME = "Price" 

TRAIN_ENCODING = 'UTF-8'
TEST_ENCODING = 'UTF-8'

train_rent = None
test_rent = None

# 核心修正：RENT 训练集优先尝试 Comma (',') 分隔符
# 1. 加载训练集
try:
    train_rent = pd.read_csv(TRAIN_SNIPPET, low_memory=False, encoding=TRAIN_ENCODING, sep=',')
    print(f"✅ 训练集加载成功，编码: {TRAIN_ENCODING}，分隔符: Comma。")
except Exception as e:
    try:
        train_rent = pd.read_csv(TRAIN_SNIPPET, low_memory=False, encoding=TRAIN_ENCODING, sep='\t')
        print(f"✅ 训练集加载成功，编码: {TRAIN_ENCODING}，分隔符: Tab。")
    except Exception as e_inner:
        print(f"❌ 训练集加载失败。错误: {e_inner}")
        raise RuntimeError("无法加载 RENT 模型的训练集。")

# 2. 加载测试集 (修正：RENT 测试集也优先尝试 Comma (','))
try:
    test_rent = pd.read_csv(TEST_SNIPPET, low_memory=False, encoding=TEST_ENCODING, sep=',')
    print(f"✅ 测试集加载成功，编码: {TEST_ENCODING}，分隔符: Comma。")
except Exception:
    try:
        test_rent = pd.read_csv(TEST_SNIPPET, low_memory=False, encoding=TEST_ENCODING, sep='\t')
        print(f"✅ 测试集加载成功，编码: {TEST_ENCODING}，分隔符: Tab。")
    except Exception as e_inner:
         print(f"❌ 测试集加载失败。错误: {e_inner}")
         raise RuntimeError("无法加载 RENT 模型的测试集。")

train_data = train_rent
test_data = test_rent


# 3️⃣ 初步检查与清洗
# (清洗逻辑保持不变)
print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)

target = TARGET_NAME
if target not in train_data.columns:
    raise ValueError(f"目标变量 '{target}' 不在训练集列中。")

KEEP_COLS = [target, "物 业 费", "停车费用"] # 原始文本列
leak_cols_potential = [
    c for c in train_data.columns
    if "comm" in c.lower() or "price" in c.lower() or "rent" in c.lower() 
]
leak_cols = [c for c in leak_cols_potential if c not in KEEP_COLS] # 从潜在泄露列中，排除掉KEEP_COLS中指定的列，得到最终要删除的泄露列列表。

train_data.drop(columns=[col for col in leak_cols if col in train_data.columns],
                inplace=True, errors='ignore')
#从训练数据中删除leak_cols列表里存在的列。inplace=True表示直接在原 DataFrame上修改。
test_data.drop(columns=[col for col in leak_cols if col in test_data.columns],
               inplace=True, errors='ignore')

all_data = pd.concat([train_data.drop(columns=[target]), test_data], keys=['train', 'test'])
all_data_cleaned = clean_complex_features(all_data)

for col in all_data_cleaned.select_dtypes(include='object').columns:
    all_data_cleaned[col] = all_data_cleaned[col].astype('category')
    
X_train_raw = all_data_cleaned.loc['train'].copy()
X_test_final = all_data_cleaned.loc['test'].copy()

y_train_log = np.log1p(train_data[target])
y_train_price = train_data[target]

# 4️⃣ 离群值处理
Q1 = y_train_log.quantile(0.25)
Q3 = y_train_log.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outlier_mask = (y_train_log >= lower_bound) & (y_train_log <= upper_bound)
X_train_no_outliers = X_train_raw.loc[outlier_mask].copy()
y_train_log_no_outliers = y_train_log.loc[outlier_mask].copy()
y_train_price_no_outliers = y_train_price.loc[outlier_mask].copy()

initial_count = len(train_data)
final_count = len(X_train_no_outliers)
print(f"\n--- 离群值处理结果 ---")
print(f"原始样本数: {initial_count}")
print(f"移除离群值后样本数: {final_count} (请报告此数字)")

# 5️⃣ 训练集与验证集划分(保持pandas类型)
X_train, X_valid, y_train_log_split, y_valid_log_split = train_test_split(
    X_train_no_outliers, y_train_log_no_outliers, test_size=0.2, random_state=111
)
_, _, y_train_price_split, y_valid_price_split = train_test_split(
    X_train_no_outliers, y_train_price_no_outliers, test_size=0.2, random_state=111
)

# 6️⃣ 区分变量类型 (在完整无离群值数据上定义)
# 核心修正：手动定义哪些“数字”列应该是分类的
auto_numeric_features = X_train_no_outliers.select_dtypes(include=[np.number]).columns.tolist()
auto_categorical_features = X_train_no_outliers.select_dtypes(exclude=[np.number]).columns.tolist()

known_categorical_numerics = ['城市', '区域', '板块', '区县', '年份'] # '区域' 在 RENT 数据中可能不存在

categorical_features = list(set(auto_categorical_features + [col for col in known_categorical_numerics if col in X_train_no_outliers.columns]))
numeric_features = [f for f in auto_numeric_features if f not in known_categorical_numerics]
# 确保 ID 列（如果存在）不被用作特征
if 'ID' in numeric_features:
    numeric_features.remove('ID')
if 'ID' in categorical_features:
    categorical_features.remove('ID')

print(f"\n--- 修正后的特征 ---")
print(f"数值特征 ({len(numeric_features)}): {numeric_features}")
print(f"分类特征 ({len(categorical_features)}): {categorical_features}")

# 7️⃣ 特征预处理 Pipeline 定义
# 核心修正：对数值特征先 Log 变换(处理非负)，再Impute和 Scale
log_transformer = FunctionTransformer(lambda x: np.log1p(np.maximum(x, 0)), validate=False)

numeric_transformer = Pipeline([
    ('imputer1', SimpleImputer(strategy='median')),
    ('logtransform', log_transformer),
    ('imputer2', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 保持放松的 OHE 限制 (max_categories=50)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='缺失')),
    ('astype_str', FunctionTransformer(lambda x: x.astype(str), validate=False)), # 确保 OHE 接收字符串
    ('onehot', OneHotEncoder(handle_unknown='ignore', max_categories=50)) # 独热编码
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop',
    sparse_threshold=0.3
)

# 8️⃣ 模型定义与超参数 (用于 GridSearchCV - 极速版)
# 定义基础模型
models_base = {
    'OLS': LinearRegression(),
    'LASSO': Lasso(max_iter=10000, random_state=111),
    'Ridge': Ridge(max_iter=10000, random_state=111, solver='sag', tol=1e-3), # 使用 'sag' 求解器
    'ElasticNet': ElasticNet(max_iter=10000, random_state=111)
}
# 定义最小化的参数网格
param_grids = {
    'LASSO': {'model__alpha': np.logspace(-4, 0, 3)},
    'Ridge': {'model__alpha': np.logspace(-2, 2, 3)},
    'ElasticNet': {'model__alpha': np.logspace(-4, 0, 2), 'model__l1_ratio': [0.1, 0.9]},
}

results = []
best_params_found = {} # 存储 GridSearchCV 找到的最佳参数
fitted_models = {} # 存储最终拟合的模型 (不是 Pipeline)

# 9️⃣ 模型训练与评估 (GridSearchCV 调优 + 分离预处理 + 手动 CV 评估)
print("\n--- 9. 模型训练、超参数调优 (GridSearchCV) 与评估 (手动 CV) ---")
full_data_X = X_train_no_outliers
full_data_y_log = y_train_log_no_outliers
mean_price = np.mean(y_train_price_no_outliers)
kf = KFold(n_splits=6, shuffle=True, random_state=111)

# 创建自定义评分器实例
custom_mae_scorer = make_scorer(mae_scorer, greater_is_better=False)

# 步骤 A: 使用 GridSearchCV 寻找最佳参数
print("\n--- 步骤 A: 使用 GridSearchCV 寻找最佳参数 ---")
for name, model_base in models_base.items():
    pipe_for_gridsearch = Pipeline(steps=[('preprocessor', preprocessor), ('model', model_base)])
    best_params_found[name] = {}

    if name in param_grids:
        print(f"\n--- 开始 GridSearchCV: {name} ---")
        gscv = GridSearchCV(
            pipe_for_gridsearch, param_grids[name], scoring=custom_mae_scorer,
            cv=kf, n_jobs=-1, verbose=1
        )
        try:
            gscv.fit(full_data_X, full_data_y_log)
            best_params_found[name] = gscv.best_params_
            print(f"✅ {name} Best Params found: {best_params_found[name]}")
        except Exception as e:
            print(f"❌ {name} GridSearchCV failed: {e}. 将使用默认参数。")
            best_params_found[name] = {}
    else:
        print(f"✅ {name} 不需要 GridSearchCV。")

# 步骤 B: 分离预处理
print("\n--- 步骤 B: 分离预处理 ---")
# 1. 拟合预处理器
preprocessor_final = clone(preprocessor)
preprocessor_final.fit(X_train_no_outliers)
# 2. 转换数据
print("--- Transforming Data ---")
try:
    X_full_transformed = preprocessor_final.transform(X_train_no_outliers)
    X_train_transformed = preprocessor_final.transform(X_train)
    X_valid_transformed = preprocessor_final.transform(X_valid)
    X_test_transformed = preprocessor_final.transform(X_test_final)
    full_data_y_log_np = y_train_log_no_outliers.values # 转换为 NumPy array
    print(f"Data transformed. Shape: {X_full_transformed.shape}")
except Exception as e:
    print(f"❌ 错误: 数据转换失败: {e}")
    raise

# 步骤 C: 使用最佳参数训练模型并进行手动 CV 评估
print("\n--- 步骤 C: 使用最佳参数训练模型并手动 CV 评估 ---")
for name, model_base in models_base.items():
    params_for_model = {k.split('__')[1]: v for k, v in best_params_found[name].items()}
    
    final_model = clone(model_base)
    # 保 Ridge 使用 'sag' 求解器
    if name == 'Ridge' and 'solver' not in params_for_model:
        # 如果 GridSearchCV 没搜索 solver，手动添加 'sag'
        final_model.set_params(solver='sag', tol=1e-3, **params_for_model)
    else:
        final_model.set_params(**params_for_model)

    # 1. 在完整的转换后数据上拟合最终模型
    try:
        # (训练前检查数据) 
        is_sparse_full = issparse(X_full_transformed) # 检查数据是否稀疏
        if (is_sparse_full and (np.isnan(X_full_transformed.data).any() or np.isinf(X_full_transformed.data).any())) or \
           (not is_sparse_full and (np.isnan(X_full_transformed).any() or np.isinf(X_full_transformed).any())):
             raise ValueError(f"NaN or Inf detected in X_full_transformed before fitting {name}")
        if np.isnan(full_data_y_log_np).any() or np.isinf(full_data_y_log_np).any():
             raise ValueError(f"NaN or Inf detected in y_log before fitting {name}")

        print(f"--- Fitting {name} ---")
        current_model = final_model.fit(X_full_transformed, full_data_y_log_np)
        print(f"✅ {name} Trained using best/default params: {current_model.get_params()}")
        fitted_models[name] = current_model

        # 保持系数诊断
        print(f"   Intercept: {current_model.intercept_:.4f}")
        if hasattr(current_model, 'coef_'): # 检查模型是否有 coef_ 属性
             coefs = current_model.coef_
             if issparse(coefs): coefs = coefs.toarray().flatten() # 如果系数是稀疏的，转换为密集数组
             print(f"   Coefficient Stats: Mean={np.mean(coefs):.4f}, Std={np.std(coefs):.4f}, Min={np.min(coefs):.4f}, Max={np.max(coefs):.4f}")
             if np.allclose(coefs, 0, atol=1e-5): # 检查系数是否几乎都为零（Lasso 可能出现）
                 print(f"   ⚠️ 警告: 模型 {name} 的系数几乎全为零！")

    except Exception as e:
        print(f"❌ {name} Failed to train on full data: {e}")
        fitted_models[name] = None
        results.append({
            "Metrics": name, "In-sample MAE": np.nan, "In-sample RMAE": np.nan,
            "Out-of-sample MAE": np.nan, "Out-of-sample RMAE": np.nan,
            "Cross-validation MAE": np.nan, "Cross-validation RMAE": np.nan
        })
        continue

    current_model = fitted_models[name]
    if current_model is None: continue

    # 性能计算 (原始价格水平)
    # 1. In-sample
    try:
        y_pred_train_log = current_model.predict(X_train_transformed)
        mae_train, _ = calculate_mae_rmse_original(y_train_log_split, y_pred_train_log)
    except Exception as e: mae_train = np.nan

    # 2. Out-of-sample
    try:
        y_pred_valid_log = current_model.predict(X_valid_transformed)
        mae_valid, _ = calculate_mae_rmse_original(y_valid_log_split, y_pred_valid_log)
    except Exception as e: mae_valid = np.nan

    # 手动实现 6-Fold Cross-Validation (在转换后数据上) 
    print(f"--- 开始手动 6-Fold Cross-Validation: {name} ---")
    fold_mae_scores = [] # 用于存储每一折的 MAE
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_full_transformed, full_data_y_log_np)): # 遍历 6 个折的索引
        X_train_fold, X_val_fold = X_full_transformed[train_idx], X_full_transformed[val_idx]      # 获取当前折的训练和验证数据
        y_train_fold, y_val_fold = full_data_y_log_np[train_idx], full_data_y_log_np[val_idx]      # 获取对应的目标值

        fold_model = clone(model_base).set_params(**params_for_model) # 创建一个新的模型实例并设置参数
        if name == 'Ridge' and 'solver' not in params_for_model:
             fold_model.set_params(solver='sag', tol=1e-3)

        try:
            # (数据检查是否有 NaN/Inf)
            is_sparse_train_fold = issparse(X_train_fold)
            if (is_sparse_train_fold and (np.isnan(X_train_fold.data).any() or np.isinf(X_train_fold.data).any())) or \
               (not is_sparse_train_fold and (np.isnan(X_train_fold).any() or np.isinf(X_train_fold).any())) or \
               np.isnan(y_train_fold).any() or np.isinf(y_train_fold).any():
                 raise ValueError("NaN or Inf in fold training data")

            fold_model.fit(X_train_fold, y_train_fold) # 在当前折的训练数据上拟合模型

            is_sparse_val_fold = issparse(X_val_fold)
            if (is_sparse_val_fold and (np.isnan(X_val_fold.data).any() or np.isinf(X_val_fold.data).any())) or \
               (not is_sparse_val_fold and (np.isnan(X_val_fold).any() or np.isinf(X_val_fold).any())):
                raise ValueError("NaN or Inf in fold validation data for prediction")

            y_pred_fold_log = fold_model.predict(X_val_fold) # 在当前折的验证数据上预测
            mae_fold, _ = calculate_mae_rmse_original(y_val_fold, y_pred_fold_log) # 计算当前折的 MAE
            fold_mae_scores.append(mae_fold)
        except Exception as e:
            # print(f"  Fold {fold+1} failed: {e}") # 可选
            fold_mae_scores.append(np.nan)

    cv_mae = np.nanmean(fold_mae_scores) if np.isfinite(fold_mae_scores).any() else np.nan
    print(f"--- {name} Manual CV finished. Average MAE: {cv_mae if not np.isnan(cv_mae) else 'NaN'} ---")

    results.append({
        "Metrics": name,
        "In-sample MAE": mae_train,
        "In-sample RMAE": mae_train / mean_price if mean_price != 0 else np.nan,
        "Out-of-sample MAE": mae_valid,
        "Out-of-sample RMAE": mae_valid / mean_price if mean_price != 0 else np.nan,
        "Cross-validation MAE": cv_mae,
        "Cross-validation RMAE": cv_mae / mean_price if mean_price != 0 and not np.isnan(cv_mae) else np.nan
    })

# 🔟 输出结果表
result_df = pd.DataFrame(results)

# 核心修正：根据 Cross-validation MAE 自动选择最佳模型
print("\n--- Determining Best Linear Model (based on Cross-validation MAE) ---")
valid_cv_mae_indices = result_df["Cross-validation MAE"].dropna().index
best_model_name_final = 'OLS' # Default fallback
if not valid_cv_mae_indices.empty:
    best_model_idx = result_df.loc[valid_cv_mae_indices, "Cross-validation MAE"].idxmin() # *** 使用 CV MAE 排序 ***
    best_model_name_orig = result_df.loc[best_model_idx, "Metrics"]
    
    if best_model_name_orig in fitted_models and fitted_models[best_model_name_orig] is not None:
        result_df.loc[result_df['Metrics'] == best_model_name_orig, 'Metrics'] = 'Best Linear Model'
        best_model_name_final = best_model_name_orig # 使用实际的最佳模型名称
        print(f"最佳模型（基于 CV MAE）: {best_model_name_final}")
    else:
        print(f"警告: 确定的最佳模型 '{best_model_name_orig}' (CV MAE) 训练失败或不存在。正在回退。")
        # 回退逻辑：寻找 OOS MAE 最小的
        valid_oos_mae_indices = result_df["Out-of-sample MAE"].dropna().index
        if not valid_oos_mae_indices.empty:
            best_oos_model_idx = result_df.loc[valid_oos_mae_indices, "Out-of-sample MAE"].idxmin()
            best_model_name_final = result_df.loc[best_oos_model_idx, "Metrics"]
            print(f"回退到 OOS MAE 最佳模型: {best_model_name_final}")
            if best_model_name_final in result_df['Metrics'].values:
                 result_df.loc[result_df['Metrics'] == best_model_name_final, 'Metrics'] = 'Best Linear Model (Fallback OOS)'
            else: 
                 pass 
        else:
             print("错误: 所有模型在 OOS 和 CV MAE 上均失败。回退到 OLS。")
             best_model_name_final = 'OLS' # 最后手段
else:
    print("警告: 所有模型在 Cross-validation MAE 上失败。回退到 OOS MAE 最佳模型。")
    valid_oos_mae_indices = result_df["Out-of-sample MAE"].dropna().index
    if not valid_oos_mae_indices.empty:
        best_oos_model_idx = result_df.loc[valid_oos_mae_indices, "Out-of-sample MAE"].idxmin()
        best_model_name_final = result_df.loc[best_oos_model_idx, "Metrics"]
        print(f"回退到 OOS MAE 最佳模型: {best_model_name_final}")
        if best_model_name_final in result_df['Metrics'].values: # Check if it's not already 'Best Linear Model'
             result_df.loc[result_df['Metrics'] == best_model_name_final, 'Metrics'] = 'Best Linear Model (Fallback OOS)'
    else:
        print("错误: 所有模型在 OOS 和 CV MAE 上均失败。回退到 OLS。")
        best_model_name_final = 'OLS'


result_df_formatted = result_df.copy()
for col in result_df_formatted.columns:
    if 'MAE' in col:
        result_df_formatted[col] = result_df_formatted[col].apply(lambda x: f"{x:.2f}" if pd.notna(x) else 'NaN')
    elif 'RMAE' in col:
        result_df_formatted[col] = result_df_formatted[col].apply(lambda x: f"{x:.4f}" if pd.notna(x) else 'NaN')

print("\n--- 10. Model Performance Summary (MAE and RMAE for Original Rent Level) ---")
print(result_df_formatted.to_markdown(index=False))


# 🔟➕1️⃣ 选择最优模型
# 核心修正：使用由 CV MAE 自动选择的 best_model_name_final
print(f"\n--- Selecting Final Model based on CV MAE: {best_model_name_final} ---")
if best_model_name_final in fitted_models and fitted_models[best_model_name_final] is not None:
    best_model = fitted_models[best_model_name_final]
    best_model_name_predict = best_model_name_final # 记录实际使用的模型
else:
     # 如果 CV MAE 选择的模型失败了
     available_models = [m for m in ['Ridge', 'OLS', 'LASSO', 'ElasticNet'] if m in fitted_models and fitted_models[m] is not None]
     if available_models:
         best_model_name_predict = available_models[0]
         best_model = fitted_models[best_model_name_predict]
         print(f"警告: CV MAE 最佳模型 '{best_model_name_final}' 失败或不可用，回退到第一个可用的模型 (优先 Ridge/OLS): {best_model_name_predict}。")
     else:
        raise RuntimeError("所有模型训练失败，无法继续进行预测。")


# 🔟➕2️⃣ 预测测试集 (使用转换后的 X_test)
try:
    print(f"\n--- 使用最终模型进行预测: {best_model_name_predict} ---")
    # (预测前检查)
    input_data_is_sparse = issparse(X_test_transformed)
    input_data_nan = (input_data_is_sparse and np.isnan(X_test_transformed.data).any()) or \
                     (not input_data_is_sparse and np.isnan(X_test_transformed).any())
    input_data_inf = (input_data_is_sparse and np.isinf(X_test_transformed.data).any()) or \
                     (not input_data_is_sparse and np.isinf(X_test_transformed).any())
    if input_data_nan or input_data_inf:
        # 尝试最后一次清理
        print("警告：最终预测前在 X_test_transformed 中检测到 NaN/Inf，尝试填充 0。")
        if input_data_is_sparse:
            X_test_transformed.data = np.nan_to_num(X_test_transformed.data, nan=0.0, posinf=0.0, neginf=0.0)
        else:
            X_test_transformed = np.nan_to_num(X_test_transformed, nan=0.0, posinf=0.0, neginf=0.0)

    test_pred_log = best_model.predict(X_test_transformed)

    # 保持预测后诊断
    print(f"预测的对数价格 (前 5 个): {test_pred_log[:5]}")
    finite_preds = test_pred_log[np.isfinite(test_pred_log)]
    if finite_preds.size > 0:
        print(f"对数价格统计 (有限值): Min={np.min(finite_preds):.2f}, Max={np.max(finite_preds):.2f}, Mean={np.mean(finite_preds):.2f}, Std={np.std(finite_preds):.2f}")
        if np.allclose(finite_preds, finite_preds[0], atol=1e-6): # 使用更小的容忍度检查
            print("⚠️ 警告: 所有预测的对数价格几乎相同！")
    else:
        print("⚠️ 警告: 预测的对数价格不包含任何有限值！")
    if np.isnan(test_pred_log).any() or np.isinf(test_pred_log).any():
        print("❌ 警告: 预测的对数价格包含 NaN 或 Inf！正在尝试清理...")
        median_log_pred = np.nanmedian(finite_preds) if finite_preds.size > 0 else np.log1p(np.median(y_train_price_no_outliers))
        test_pred_log = np.nan_to_num(test_pred_log, nan=median_log_pred, posinf=np.log1p(np.finfo(np.float64).max / 10), neginf=-700)

    test_pred_price = np.expm1(test_pred_log)
    test_pred_price[test_pred_price < 0] = 0
    large_finite_val = np.finfo(np.float64).max / 10
    median_fallback = np.median(y_train_price_no_outliers) if len(y_train_price_no_outliers)>0 else 0
    test_pred_price = np.nan_to_num(test_pred_price, nan=median_fallback,
                                   posinf=large_finite_val, neginf=0.0)
    test_pred_price = np.clip(test_pred_price, 0, large_finite_val)

    # 保持最终价格预测诊断
    print(f"最终预测价格 (前 5 个): {test_pred_price[:5]}")
    if test_pred_price.size > 0 and np.allclose(test_pred_price, test_pred_price[0], atol=1e-2): # 允许微小差异
         print("❌ 警告: 所有最终预测价格几乎相同！")


except Exception as e:
    # ... (Fallback 代码保持不变) ...
    import traceback
    print(f"最终预测出错: {e}")
    traceback.print_exc() # <--- *** 核心修正：取消注释以打印完整错误 ***
    median_fallback = np.median(y_train_price_no_outliers) if len(y_train_price_no_outliers)>0 else 0
    print(f"预测失败，回退到预测中位数: {median_fallback}")
    test_pred_price = np.full(X_test_transformed.shape[0], median_fallback)

# 🔟➕3️⃣ 生成提交文件
# (代码不变)
submission_df = pd.DataFrame({
    # 修正点：使用大写 "ID"
    "ID": test_data['ID'].values if 'ID' in test_data.columns and len(test_data['ID']) == X_test_transformed.shape[0] else np.arange(X_test_transformed.shape[0]),
    "prediction": test_pred_price
})

submission_df.to_csv("prediction_rent.csv", index=False, encoding="utf-8-sig")
print("🎯 Prediction file saved as **prediction_rent.csv** (使用逗号分隔)")

# 🔟➕4️⃣ 保存模型性能表
result_df.to_csv("performance_table_rent.csv", index=False, encoding="utf-8-sig")
print("📊 Model performance saved as **performance_table_rent.csv** (使用逗号分隔)")

✅ 训练集加载成功，编码: UTF-8，分隔符: Comma。
✅ 测试集加载成功，编码: UTF-8，分隔符: Comma。
Train shape: (98899, 46)
Test shape: (9773, 46)

--- 离群值处理结果 ---
原始样本数: 98899
移除离群值后样本数: 98205 (请报告此数字)

--- 修正后的特征 ---
数值特征 (19): ['面积', 'lon', 'lat', '房屋总数', '楼栋总数', '绿 化 率', '容 积 率', '物 业 费_均值', '燃气费_均值', '供热费_均值', '停车费用_均值', '建筑年代_均值', '停车位_均值', '总楼层数', '室', '厅', '卫', '面积_sq', '容积率_sq']
分类特征 (18): ['燃气', '用水', '朝向', '用电', '环线位置', '供电', '楼层位置', '物业类别', '城市', '建筑结构', '车位', '供水', '区县', '采暖', '年份', '板块', '电梯', '供暖']

--- 9. 模型训练、超参数调优 (GridSearchCV) 与评估 (手动 CV) ---

--- 步骤 A: 使用 GridSearchCV 寻找最佳参数 ---
✅ OLS 不需要 GridSearchCV。

--- 开始 GridSearchCV: LASSO ---
Fitting 6 folds for each of 3 candidates, totalling 18 fits
✅ LASSO Best Params found: {'model__alpha': np.float64(0.0001)}

--- 开始 GridSearchCV: Ridge ---
Fitting 6 folds for each of 3 candidates, totalling 18 fits
✅ Ridge Best Params found: {'model__alpha': np.float64(0.01)}

--- 开始 GridSearchCV: ElasticNet ---
Fitting 6 folds for each of 4 candidates, totalling 24 fi

## merge

In [77]:
import pandas as pd
import warnings

price_file = "prediction_price.csv"  
rent_file = "prediction_rent.csv"    
output_file = "prediction.csv"       
warnings.filterwarnings("ignore")
print(f"--- 开始合并 ---")

try:
    # 1. 读取房价预测文件
    print(f"正在读取 {price_file}...")
    df_price = pd.read_csv(price_file)
    print(f"✅ {price_file} 加载成功，包含 {len(df_price)} 行。")

    # 檢查必需的列是否存在
    if 'ID' not in df_price.columns or 'prediction' not in df_price.columns:
        raise ValueError(f"错误：{price_file} 文件缺少 'ID' 或 'prediction' 列。")

except FileNotFoundError:
    print(f"❌ 错误：找不到文件 {price_file}。请确保文件在当前目录下。")
    exit() # 退出腳本
except Exception as e:
    print(f"❌ 读取 {price_file} 时出错：{e}")
    exit()

try:
    # 2. 读取租金预测文件
    print(f"正在读取 {rent_file}...")
    df_rent = pd.read_csv(rent_file)
    print(f"✅ {rent_file} 加载成功，包含 {len(df_rent)} 行。")

    # 检查必需的列是否存在
    if 'ID' not in df_rent.columns or 'prediction' not in df_rent.columns:
        raise ValueError(f"错误：{rent_file} 文件缺少 'ID' 或 'prediction' 列。")

except FileNotFoundError:
    print(f"❌ 错误：找不到文件 {rent_file}。请确保文件在当前目录下。")
    exit()
except Exception as e:
    print(f"❌ 读取 {rent_file} 时出错：{e}")
    exit()

# 3. 合并
#    pd.concat 确认按行堆叠 (axis=0)
#    ignore_index=True 会重新生成一个连续索引 (0, 1, 2, ...)
print(f"正在合并两个文件...")
df_combined = pd.concat([df_price, df_rent], ignore_index=True)
total_rows = len(df_combined)
print(f"✅ 合并完成，总共 {total_rows} 行。")

# 可选：检查合并后的文件是否有重复的 ID 
if df_combined['ID'].duplicated().any():
    print(f"⚠️ 警告：合并后的文件中检测到重复的 ID！请检查您的原始预测文件。")
    duplicate_ids = df_combined[df_combined['ID'].duplicated()]['ID'].unique()
    print(f"   重复的 ID 示例: {list(duplicate_ids[:5])}...") # 只顯示前5個

# 4. 保存合并后的文件
try:
    print(f"正在保存合并后的文件到 {output_file}...")
    # index=False: 不将 DataFrame 的索引写入 CSV 文件
    df_combined[['ID', 'prediction']].to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"🎉 成功！合并后的提交文件已保存为 {output_file}。")
except Exception as e:
    print(f"❌ 保存文件 {output_file} 时出错：{e}")

print(f"--- 合并脚本执行完毕 ---")

--- 开始合并 ---
正在读取 prediction_price.csv...
✅ prediction_price.csv 加载成功，包含 34017 行。
正在读取 prediction_rent.csv...
✅ prediction_rent.csv 加载成功，包含 9773 行。
正在合并两个文件...
✅ 合并完成，总共 43790 行。
正在保存合并后的文件到 prediction.csv...
🎉 成功！合并后的提交文件已保存为 prediction.csv。
--- 合并脚本执行完毕 ---
