In [89]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer
from sklearn.base import clone 
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
from geopy.distance import geodesic # 用于计算地理距离
from sklearn.cluster import KMeans # 用于地理聚类
from sklearn.preprocessing import KBinsDiscretizer # 用于特征分箱
import cn2an
import warnings

# 设置环境
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

### 定义处理price各列的辅助函数

In [90]:
# --- 辅助函数：特征工程 ---

def handle_region(df, rare_threshold=0.03):
    """
    处理 '区域' 列:
    将出现频率低于 threshold 的类别合并为 '其他区域'
    """
    print("处理 [区域]...")
    value_counts = df['区域'].value_counts(normalize=True)
    rare_regions = value_counts[value_counts <= rare_threshold].index
    
    # 替换稀有类别
    df['区域'] = df['区域'].replace(rare_regions, '其他区域')
    print(f"合并了 {len(rare_regions)} 个稀有区域为 '其他区域'")
    return df

def handle_ring_road(df):
    """
    处理 '环线' 列:
    1. 填充NaN为 '未知'
    2. 进行有序编码
    """
    print("处理 [环线]...")
    df['环线'] = df['环线'].fillna('未知')
    
    # 定义环线的有序映射

    ring_map = {
        '二环内': 1,
        '二至三环': 1,
        '三至四环': 2,
        '四至五环': 2, 
        '五至六环': 3,
        '六环外': 4,
        '内环内':1,
        '内环至中环':2,
        '中环至外环':3,
        '内环至外环':3,
        '外环外':4,
        '未知': 4
    }
    
    # 使用 .map() 进行映射，未在map中出现的值用 5 填充
    df['环线_ordinal'] = df['环线'].map(ring_map).fillna(5).astype(int)
    
    # 删除原始列
    df = df.drop('环线', axis=1)
    return df
# --- 户型解析 ---
def handle_house_type(df, col_name, n_train): # 添加 n_train 参数
    """
    解析'房屋户型'列，提取'室', '厅', '卫'的数量。 (来自 demo.ipynb)
    使用训练集的中位数填充。
    """
    df = df.copy()
    print(f"  处理 [{col_name}]...")

    df[col_name] = df[col_name].astype(str).str.replace('房间', '室')
    layout_info = df[col_name].str.extract(r'(\d+)[室](?:(\d+)厅)?(?:.*?(\d+)卫)?', expand=True)
    layout_info.columns = ['室', '厅', '卫']
    df = pd.concat([df, layout_info], axis=1)

    for col in ['室', '厅', '卫']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    print(f"    正在计算 '室', '厅', '卫' 的中位数 (基于前 {n_train} 行)...")
    train_part = df.iloc[:n_train]
    medians = {}
    for col in ['室', '厅', '卫']:
        if col in df.columns:
            median_val = train_part[col].median()
            if pd.isna(median_val):
                if col == '室': median_val = 2.0
                elif col == '厅': median_val = 1.0
                else: median_val = 1.0
            print(f"      '{col}' 中位数: {median_val}")
            df[col].fillna(median_val, inplace=True)
            medians[col] = median_val

    for col in ['室', '厅', '卫']:
        df[col] = df[col].fillna(medians[col])
        # 0厅是合理的，0室0卫通常不合理，用中位数填充
        if col != '厅':
             df.loc[df[col] <= 0, col] = medians[col]

    df = df.drop(columns=[col_name], errors='ignore')
    print(f"    已提取 '室', '厅', '卫' 特征并填充缺失值。")
    return df

# --- 楼层处理 ---
def handle_floor(df, n_train, col='所在楼层'):
    """
    解析楼层列，提取'总楼层'和'楼层类别'。 (来自 demo.ipynb)
    """
    df_processed = df.copy()
    print(f"  处理 [{col}]...")

    if col not in df_processed.columns:
        print(f"    错误：列 '{col}' 不在DataFrame中，无法处理楼层。")
        return df

    print("    提取总楼层和当前楼层...")
    total_floors1 = df_processed[col].astype(str).str.extract(r'\(共(\d+)层\)', expand=False)
    total_floors2 = df_processed[col].astype(str).str.extract(r'/(\d+)(?:层|\))?$', expand=False)
    current_floor_ext = df_processed[col].astype(str).str.extract(r'^(\d+)/', expand=False)
    df_processed['总楼层'] = pd.to_numeric(total_floors1.fillna(total_floors2), errors='coerce')
    df_processed['当前楼层_temp'] = pd.to_numeric(current_floor_ext, errors='coerce')

    print("    提取明确的楼层类别...")
    df_processed['楼层类别_explicit'] = np.nan
    df_processed.loc[df_processed[col].astype(str).str.contains('高楼层', na=False), '楼层类别_explicit'] = '高楼层'
    df_processed.loc[df_processed[col].astype(str).str.contains('中楼层', na=False), '楼层类别_explicit'] = '中楼层'
    df_processed.loc[df_processed[col].astype(str).str.contains('低楼层', na=False), '楼层类别_explicit'] = '低楼层'
    df_processed.loc[df_processed[col].astype(str).str.contains('顶层', na=False), '楼层类别_explicit'] = '顶层'
    df_processed.loc[df_processed[col].astype(str).str.contains('底层', na=False), '楼层类别_explicit'] = '底层'
    df_processed.loc[df_processed[col].astype(str).str.contains('地下', na=False), '楼层类别_explicit'] = '地下室'

    print("    推断楼层类别...")
    df_processed['楼层类别_inferred'] = np.nan
    infer_mask = (df_processed['楼层类别_explicit'].isna()) & (df_processed['当前楼层_temp'].notna()) & (df_processed['总楼层'].notna()) & (df_processed['总楼层'] > 0)
    if infer_mask.any():
        current = df_processed.loc[infer_mask, '当前楼层_temp']
        total = df_processed.loc[infer_mask, '总楼层']
        ratio = current / total
        df_processed.loc[infer_mask, '楼层类别_inferred'] = '中楼层'
        df_processed.loc[infer_mask & (ratio <= 1/3), '楼层类别_inferred'] = '低楼层'
        df_processed.loc[infer_mask & (ratio >= 2/3), '楼层类别_inferred'] = '高楼层'
        df_processed.loc[infer_mask & (current == total), '楼层类别_inferred'] = '顶层'
        df_processed.loc[infer_mask & (current == 1), '楼层类别_inferred'] = '底层'

    print("    合并类别并填充缺失值...")
    df_processed['楼层类别'] = df_processed['楼层类别_explicit'].fillna(df_processed['楼层类别_inferred'])
    train_part = df_processed.iloc[:n_train]
    median_total_floors_train = train_part['总楼层'].median()
    if pd.isna(median_total_floors_train): median_total_floors_train = 18.0
    mode_floor_category_train = train_part['楼层类别'].mode()
    fill_category_train = mode_floor_category_train[0] if not mode_floor_category_train.empty else '中楼层'

    original_na_total_floors = df_processed['总楼层'].isnull().sum()
    if original_na_total_floors > 0:
        df_processed['总楼层'].fillna(median_total_floors_train, inplace=True)
        print(f"      填充了 '总楼层' 列的 {original_na_total_floors} 个缺失值 (使用训练集中位数 {median_total_floors_train:.0f})。")
    original_na_category = df_processed['楼层类别'].isnull().sum()
    if original_na_category > 0:
        df_processed['楼层类别'].fillna(fill_category_train, inplace=True)
        print(f"      填充了 '楼层类别' 列的 {original_na_category} 个缺失值 (使用训练集众数 '{fill_category_train}')。")
    df_processed['总楼层'] = pd.to_numeric(df_processed['总楼层'], errors='coerce').fillna(median_total_floors_train)

    print("    进行独热编码并清理临时列...")
    cols_to_drop = [col, '当前楼层_temp', '楼层类别_explicit', '楼层类别_inferred']
    df_processed = df_processed.drop(columns=cols_to_drop, errors='ignore')
    df_processed = pd.get_dummies(df_processed, columns=['楼层类别'], prefix='楼层', drop_first=False, dummy_na=False)
    print("    楼层信息处理完成。")
    return df_processed

In [91]:

def handle_building_structure(df):
    """
    处理 '建筑结构' 列:
    """
    print("处理 [建筑结构]...")

    structure_col = df['建筑结构'].copy()

    structure_col = structure_col.fillna('未知结构')
    structure_col = structure_col.replace('', '未知结构')
    
    # 2. 基于数据分布进行智能分组
    def group_building_structure(struct):
        if pd.isna(struct) or struct in ['', '未知结构', '（空白）']:
            return '未知结构'
        elif struct == '钢混结构':
            return '钢混结构'  # 最多的一类，单独保留
        elif struct in ['混合结构', '砖混结构']:
            return '混合砖混类'  # 中等数量的两类合并
        elif struct in ['框架结构', '钢结构']:
            return '框架钢构类' 
        elif struct == '砖木结构':
            return '其他稀有结构'  # 数量极少，归为其他
        else:
            return '其他结构'  
    
    df['建筑结构分组'] = structure_col.apply(group_building_structure)
    
    # 3. 创建结构稳定性特征,基于工程知识, 数值越高表示结构越稳定/现代
    structure_stability_map = {
        '钢混结构': 5,      # 最稳定现代
        '框架钢构类': 4,     # 现代结构
        '混合砖混类': 3,     # 中等稳定性
        '未知结构': 2,       # 设为中性
        '其他结构': 2,       # 设为中性
        '其他稀有结构': 1    # 传统/老旧结构
    }
    
    df['结构稳定性评分'] = df['建筑结构分组'].map(structure_stability_map)
    
    # 创建结构安全性特征（基于建筑规范）数值越高表示安全性越好
    structure_safety_map = {
        '钢混结构': 3,        # 安全性好
        '框架钢构类': 3,       # 安全性好
        '混合砖混类': 2,       # 安全性中等
        '未知结构': 1,         # 设为保守值
        '其他结构': 1,         # 设为保守值
        '其他稀有结构': 1      # 传统结构安全性相对较低
    }
    
    df['结构安全性评分'] = df['建筑结构分组'].map(structure_safety_map)
    df['建筑结构'] = structure_col
    
    print(f"建筑结构分组分布: {df['建筑结构分组'].value_counts().to_dict()}")
    return df

def handle_area(df, n_train):
    """
    专门处理建筑面积和套内面积列，并创建得房率特征
    """
    df_processed = df.copy()
    print(f"  处理 [建筑面积] 和 [套内面积]...")

    print("    清理面积单位并转为数值...")
    for col in ['建筑面积', '套内面积']:
        if col in df_processed.columns:
            df_processed[col] = pd.to_numeric(df_processed[col].astype(str).str.replace('㎡', '').str.strip(), errors='coerce')
            df_processed.loc[df_processed[col] < 5, col] = np.nan # 小于5平米视为异常

    print(f"    计算训练集 (前 {n_train} 行) 平均得房率...")
    train_part = df_processed.iloc[:n_train]
    valid_train_data = train_part[
        (train_part['套内面积'].notna()) & (train_part['套内面积'] > 0) &
        (train_part['建筑面积'].notna()) & (train_part['建筑面积'] > 0) &
        (train_part['套内面积'] <= train_part['建筑面积'])
    ].copy()

    efficiency_rate_train = 0.8
    if not valid_train_data.empty and valid_train_data['建筑面积'].sum() > 0:
        efficiency_rate_train = valid_train_data['套内面积'].sum() / valid_train_data['建筑面积'].sum()
        efficiency_rate_train = np.clip(efficiency_rate_train, 0.6, 1.0)
    print(f"      计算得到的训练集平均得房率: {efficiency_rate_train:.4f}")

    print("    估算/修正 '套内面积'...")
    impute_mask = (
        (df_processed['套内面积'].isna()) |
        (df_processed['套内面积'] / df_processed['建筑面积'] < 0.5) |
        (df_processed['套内面积'] / df_processed['建筑面积'] > 1.0)
    ) & df_processed['建筑面积'].notna() & (df_processed['建筑面积'] > 0)
    rows_to_impute = impute_mask.sum()
    if rows_to_impute > 0:
        df_processed.loc[impute_mask, '套内面积'] = df_processed.loc[impute_mask, '建筑面积'] * efficiency_rate_train
        print(f"      使用得房率估算了 {rows_to_impute} 行的 '套内面积'")

    print("    使用中位数填充 '建筑面积' 缺失值...")
    median_building_area = train_part['建筑面积'].median()
    if pd.isna(median_building_area): median_building_area = 90.0
    df_processed['建筑面积'].fillna(median_building_area, inplace=True)
    print(f"      建筑面积中位数: {median_building_area:.2f}")

    print("    再次检查并填充 '套内面积' 缺失值...")
    final_impute_mask = df_processed['套内面积'].isna() & df_processed['建筑面积'].notna() & (df_processed['建筑面积'] > 0)
    rows_to_impute_final = final_impute_mask.sum()
    if rows_to_impute_final > 0:
         df_processed.loc[final_impute_mask, '套内面积'] = df_processed.loc[final_impute_mask, '建筑面积'] * efficiency_rate_train
         print(f"      补充估算了 {rows_to_impute_final} 行的 '套内面积'")

    print("    创建 '得房率' 特征...")
    df_processed['得房率'] = np.where(
        df_processed['建筑面积'] > 0,
        df_processed['套内面积'] / df_processed['建筑面积'],
        np.nan
    )
    median_efficiency = df_processed.iloc[:n_train]['得房率'].median()
    if pd.isna(median_efficiency) or median_efficiency <= 0.5 or median_efficiency > 1.0:
        median_efficiency = efficiency_rate_train
    df_processed['得房率'].fillna(median_efficiency, inplace=True)
    df_processed['得房率'] = df_processed['得房率'].clip(0.5, 1.0)
    print(f"      最终 '得房率' 使用的中位数/填充值: {median_efficiency:.4f}")

    # 保留套内面积，因为可能有用
    # df_processed = df_processed.drop(columns=['套内面积'], errors='ignore')
    return df_processed

def handle_orientation(df):
    """
    处理 '房屋朝向' 列:
    进行 Multi-Hot 编码
    """
    print("处理 [房屋朝向]...")
    df['房屋朝向'] = df['房屋朝向'].fillna('未知')

    df['is_朝南'] = df['房屋朝向'].str.contains('南').astype(int)
    df['is_朝东'] = df['房屋朝向'].str.contains('东').astype(int)
    df['is_朝西'] = df['房屋朝向'].str.contains('西').astype(int)
    df['is_朝北'] = df['房屋朝向'].str.contains('北').astype(int)
    
    df = df.drop('房屋朝向', axis=1)
    return df

def process_structure(df, structure_col, fillna_value, prefix):
    """
    处理列，填充缺失值并将其转换为虚拟变量。
    """
    df_processed = df.copy()
    print(f"  处理 [{structure_col}] (转为 dummies)...")
    if structure_col not in df_processed.columns:
        print(f"    警告: 列 '{structure_col}' 不存在，跳过。")
        return df_processed

    df_processed[structure_col].fillna(fillna_value, inplace=True)
    structure_dummies = pd.get_dummies(df_processed[structure_col], prefix=prefix, dummy_na=False)
    df_processed = pd.concat([df_processed, structure_dummies], axis=1)
    df_processed = df_processed.drop(columns=[structure_col], errors='ignore')
    print(f"    已将 '{structure_col}' 转换为 dummies，前缀为 '{prefix}'。")
    return df_processed


In [92]:

def handle_decoration(df):
    """
    处理 '装修情况' 列 :
    填充 NaN 值为 '未知'
    """
    print("处理 [装修情况]...")
    if '装修情况' not in df.columns:
        print("警告: 列 '装修情况' 不存在。")
        return df
        
    # 填充 NaN 
    df['装修情况'] = df['装修情况'].fillna('未知')    
    # 打印唯一值以供检查
    print(f"  处理后 '装修情况' 的唯一值: {df['装修情况'].unique()}")
    
    return df

def handle_elevator_ratio(df, n_train): 
    """
    处理 '梯户比例' 列:
    ... (其他不变) ...
    """
    print("处理 [梯户比例]...")
    
    if '梯户比例' not in df.columns:
        print("警告: 列 '梯户比例' 不存在。")
        return df
        
    df['梯户比例_clean'] = df['梯户比例'].fillna('')
    df['梯户比例_clean'] = df['梯户比例_clean'].replace(r'^\\\\s*、\\\\s*$', '', regex=True)
    
    def extract_ratio(ratio_str):
        try:
            match = re.search(r'(\\S+?)梯\\s*(\\S+?)户', ratio_str)
            if match:
                ele_str, hh_str = match.groups()
                elevator = cn2an.cn2an(ele_str.strip(), "smart")
                household = cn2an.cn2an(hh_str.strip(), "smart")
                return float(elevator), float(household)
        except:
            pass
        return np.nan, np.nan
    
    ratios = df['梯户比例_clean'].apply(extract_ratio)
    df['电梯数'] = ratios.apply(lambda x: x[0])
    df['每层户数'] = ratios.apply(lambda x: x[1])
    
    df['户梯比'] = df['每层户数'] / df['电梯数']
    
    df['户梯比'] = df['户梯比'].replace([np.inf, -np.inf, 0], np.nan)
    
    # 仅使用训练集计算中位数
    train_part = df.iloc[:n_train]
    valid_ratios_train = train_part['户梯比'].dropna()
    median_ratio = valid_ratios_train.median() if not valid_ratios_train.empty else 2.0
    median_elevator_train = train_part['电梯数'].median()
    median_household_train = train_part['每层户数'].median()

    df['户梯比'] = df['户梯比'].fillna(median_ratio)
    df['电梯数'] = df['电梯数'].fillna(median_elevator_train).replace(0, 1)
    df['每层户数'] = df['每层户数'].fillna(median_household_train)
    
    df['高密度标志'] = (df['户梯比'] > 5).astype(int)
    df['低密度标志'] = (df['户梯比'] < 2).astype(int)
    
    df = df.drop(['梯户比例', '梯户比例_clean'], axis=1, errors='ignore')
    
    print(f"户梯比中位数 (基于训练集): {median_ratio:.2f}")
    return df

def handle_villa_type(df):
    """
    处理 '别墅类型' 列
    绝大多数为NaN
    """
    col = '别墅类型'
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df
    # 填充 NaN
    df[col] = df[col].fillna('非别墅')
    
    print(f"  处理后 '{col}' 的唯一值: {df[col].unique()}")
    return df


def handle_property_use(df, rare_threshold=0.005): # 0.5% 阈值
    """
    处理 '房屋用途' 列 
    1. 替换 NaN 为 '未知'
    2. 合并同义词/相似类别
    3. 合并低于 threshold 的稀有类别为 '其他'
    """
    col = '房屋用途'
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    # 1. 填充 NaN 
    df[col] = df[col].fillna('未知')

    # 2. 合并同义词 
    # 2a. 合并所有 "公寓" (公寓, 公寓(住宅), 公寓/公寓, 酒店式公寓, 商务公寓等)
    # 使用正则表达式 r'.*公寓.*' 匹配所有包含 '公寓' 的条目
    df[col] = df[col].replace(r'.*公寓.*', '公寓', regex=True)
    
    # 2b. 合并所有 "商用"
    commercial_list = [
        '商业办公类', 
        '商住两用', 
        '商业', 
        '写字楼',
        '底商' # 底商也归为商用
    ]
    df[col] = df[col].replace(commercial_list, '商用')
    
    # 3. 合并稀有类别
    value_counts = df[col].value_counts(normalize=True)
    rare_values = value_counts[value_counts < rare_threshold].index
    rare_values = rare_values.drop('其他', errors='ignore') 
    
    if len(rare_values) > 0:
        print(f"  在 '{col}' 中, 合并 {len(rare_values)} 个稀有类别为 '其他': {rare_values.tolist()}")
        df[col] = df[col].replace(rare_values, '其他')
    
    print(f"  处理后 '{col}' 的唯一值: {df[col].unique()}")
    return df

def handle_transaction_ownership(df, rare_threshold=0.005): # 0.5% 阈值
    """
    处理 '交易权属' 列 
    1. 替换 NaN为 '未知'
    2. 合并同义词/相似类别
    3. 合并低于 threshold 的稀有类别为 '其他'
    """
    col = '交易权属'
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    # 1. 填充 NaN 
    df[col] = df[col].fillna('未知')
    # 2. 合并同义词
    # 合并 "安置房"
    df[col] = df[col].replace(['动迁安置房', '拆迁还建房', '回迁房', '定向安置房'], '安置房')
    # 合并 "经济适用房"
    df[col] = df[col].replace(['二类经济适用房', '经济适用房', '限价商品房', '一类经济适用房'], '经济适用房')
    # 合并 "政策房"
    df[col] = df[col].replace(['已购公房', '房改房', '售后公房', '央产房'], '政策房')
    
    # 3. 合并稀有类别
    value_counts = df[col].value_counts(normalize=True)
    rare_values = value_counts[value_counts < rare_threshold].index
    
    if len(rare_values) > 0:
        print(f"  在 '{col}' 中, 合并 {len(rare_values)} 个稀有类别为 '其他'")
        df[col] = df[col].replace(rare_values, '其他')
        
    print(f"  处理后 '{col}' 的唯一值: {df[col].unique()}")
    return df


def handle_house_advantages(df):
    """
    处理 '房屋优势' 列:
    多标签文本字段。
    1. 清洗 (NaN, '、') 为空字符串
    2. 提取 'is_Adv_地铁' (二元特征)。
    3. 提取 'is_Adv_装修' (二元特征)。
    4. 提取 'Adv_Tenure_Ordinal' (有序特征)，用于后续合并：
       - 3: 房本满五年
       - 2: 房本满两年
       - 0: 未提及或未知
    """
    col = '房屋优势'
    col_clean = '房屋优势_clean' 
    
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    # 1. 清洗 
    df[col_clean] = df[col].fillna('')
    df[col_clean] = df[col_clean].replace(r'^\s*、\s*$', '', regex=True)
    
    # 2. 提取 'is_Adv_地铁'
    df['is_Adv_地铁'] = df[col_clean].str.contains('地铁').astype(int)
    
    # 3. 提取 'is_Adv_装修'
    df['is_Adv_装修'] = df[col_clean].str.contains('装修').astype(int)
    
    # 4. 提取 'Adv_Tenure_Ordinal' 
    conditions = [
        df[col_clean].str.contains('房本满五年'),
        df[col_clean].str.contains('房本满两年')
    ]
    choices = [
        3,  # 满五年
        2   # 满两年
    ]
    # 默认值为 0 
    df['Adv_Tenure_Ordinal'] = np.select(conditions, choices, default=0)
    
    print(f"  提取了 'is_Adv_地铁', 'is_Adv_装修', 'Adv_Tenure_Ordinal'")
    df = df.drop([col, col_clean], axis=1, errors='ignore')
    return df

In [93]:
def handle_developer(df):
    """
    处理 '开发商' 列:
    将此列转换为一个二元特征 'has_Developer' (1=有, 0=无)。
    """
    col = '开发商'
    new_col = 'has_Developer'
    
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    # 1. 定义所有代表 "无开发商" 的字符串
    no_developer_list = [
        '无', 
        '无开发公司', 
        '无开发商', 
        '暂无信息', 
        '暂无资料'
    ]

    # 2. 填充 NaN
    df[col] = df[col].fillna('无')

    df[new_col] = (~df[col].isin(no_developer_list)).astype(int)

    df = df.drop(col, axis=1)
    print(f"  创建了新特征 '{new_col}'")
    return df

def handle_property_management(df):
    """
    处理 '物业公司' 列:
    将此列转换为一个二元特征 'has_PropertyMgmt' (1=有, 0=无)。
    """
    col = '物业公司'
    new_col = 'has_PropertyMgmt'
    
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    # 1. 定义所有代表 "无物业" 的字符串
    no_management_list = [
        '无', 
        '无物业', 
        '无物业管理', 
        '无物业管理服务', 
        '暂时无物业公司'
    ]

    # 2. 填充 NaN。
    df[col] = df[col].fillna('无物业')
    df[new_col] = (~df[col].isin(no_management_list)).astype(int)

    df = df.drop(col, axis=1)
    print(f"  创建了新特征 '{new_col}'")
    return df

def handle_district(df):
    """
    处理 '区县' 列 数字型分类特征。
    1. 填充 7% 的缺失值 (NaN) 为 '未知' 类别。
    2. 将整列转换为 'string' 类型，以防止模型将其误认为连续数值。
    后面使用目标编码 (Target Encoding)
    """
    col = '区县'
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    df[col] = df[col].fillna('未知')
    df[col] = df[col].astype(str)
    
    print(f"  已将 '{col}' 填充缺失值并转换为 'string' 类型。")
    print(f"  处理后 '{col}' 的唯一值 (示例): {df[col].unique()[:10]}")
    return df

def clean_numeric_str(s):
    """
    辅助函数：从 '1317户' 或 '19栋' 这样的字符串中提取数字。
    """
    if pd.isna(s):
        return np.nan
    match = re.search(r'(\d+)', str(s))
    if match:
        return int(match.group(1))
    else:
        return np.nan

def handle_community_stats(df, n_train): 
    """
    处理 '房屋总数' 和 '楼栋总数':
    """
    houses_col = '房屋总数'
    buildings_col = '楼栋总数'
    interaction_col = 'avg_units_per_building' 
    
    print(f"处理 [{houses_col}] 和 [{buildings_col}]...")
    
    if houses_col not in df.columns or buildings_col not in df.columns:
        print(f"警告: 缺少 '{houses_col}' 或 '{buildings_col}'。")
        return df

    df[houses_col] = df[houses_col].apply(clean_numeric_str)
    df[buildings_col] = df[buildings_col].apply(clean_numeric_str)
    
    temp_buildings = df[buildings_col].replace(0, np.nan)
    df[interaction_col] = df[houses_col] / temp_buildings
    
    # 仅使用训练集计算中位数
    train_part = df.iloc[:n_train]
    houses_median = train_part[houses_col].median()
    buildings_median = train_part[buildings_col].median()
    interaction_median = train_part[interaction_col].median() 


    df[houses_col] = df[houses_col].fillna(houses_median)
    df[buildings_col] = df[buildings_col].fillna(buildings_median)
    df[interaction_col] = df[interaction_col].fillna(interaction_median)

    print(f"  清洗了 '{houses_col}' (中位数: {houses_median}) 和 '{buildings_col}' (中位数: {buildings_median})。")
    print(f"  创建了新的交互特征 '{interaction_col}' (中位数: {interaction_median})。")
    
    return df

def parse_building_year(s):
    """
    辅助函数：从字符串中解析年份
    """
    if pd.isna(s):
        return np.nan    

    s_str = str(s)
    nums = re.findall(r'(\d{4})', s_str)
    
    if len(nums) == 0:
        return np.nan
    elif len(nums) == 1:
        return float(nums[0])
    else:
        return (float(nums[0]) + float(nums[1])) / 2

def handle_building_age(df, n_train, col='建筑年代', trans_year_col='交易年份',
                        group_col_l1='板块', group_col_l2='区域'):
    """
    处理 '建筑年代' 列，计算 '房龄' 并使用多级中位数填充缺失值。
        col (str): 包含原始建筑年代信息的列名。
        trans_year_col (str): 包含交易年份的列名。
        group_col_l1 (str): 第一级分组列名 ('板块')。
        group_col_l2 (str): 第二级分组列名 ('区域')。

    Returns:
        pd.DataFrame: 处理后的数据框，包含 '房龄' 列。
    """
    df_processed = df.copy() 
    age_col = '房龄'         
    parsed_year_col = col + '_parsed' # 临时解析年份列

    print(f"处理 [{col}]，计算 {age_col}...")

    #  检查依赖列是否存在 
    if col not in df_processed.columns:
        print(f"    警告：未找到 '{col}' 列。无法计算房龄，将填充默认值 20。")
        df_processed[age_col] = 20
        return df_processed
    if trans_year_col not in df_processed.columns:
        print(f"    警告：未找到 '{trans_year_col}' 列，无法计算房龄，将填充默认值 20。")
        df_processed[age_col] = 20
        df_processed = df_processed.drop(columns=[col], errors='ignore') # 删除原始列
        return df_processed
    if group_col_l1 not in df_processed.columns:
        print(f"    警告：未找到一级分组列 '{group_col_l1}'，L1填充将无效。")
        group_col_l1 = None # 设为 None 以跳过 L1 填充
    if group_col_l2 not in df_processed.columns:
        print(f"    警告：未找到二级分组列 '{group_col_l2}'，L2填充将无效。")
        group_col_l2 = None # 设为 None 以跳过 L2 填充

    #  1. 解析年份 
    df_processed[parsed_year_col] = df_processed[col].apply(parse_building_year)

    #  2. 计算房龄 
    df_processed[age_col] = df_processed[trans_year_col] - df_processed[parsed_year_col]
    nan_count_initial = df_processed[age_col].isnull().sum()
    print(f"    初步计算后 '{age_col}' 缺失值数量: {nan_count_initial}")

    #  3. 填充缺失值 (L1/L2/L3 中位数，仅基于训练集计算) 
    if nan_count_initial > 0:
        print(f"    使用分组中位数填充 '{age_col}' 缺失值 (基于前 {n_train} 行)...")
        train_part = df_processed.iloc[:n_train]
        # 只使用房龄 >= 0 的训练集数据计算中位数
        valid_train_age = train_part.loc[train_part[age_col] >= 0].copy()

        # 计算 L1, L2, L3 填充值
        median_map_l1 = pd.Series(dtype=float)
        if group_col_l1 and not valid_train_age.empty:
             try:
                 median_map_l1 = valid_train_age.groupby(group_col_l1)[age_col].median()
             except Exception as e:
                 print(f"      警告: 计算 L1 中位数图时出错: {e}")

        median_map_l2 = pd.Series(dtype=float)
        if group_col_l2 and not valid_train_age.empty:
             try:
                 median_map_l2 = valid_train_age.groupby(group_col_l2)[age_col].median()
             except Exception as e:
                 print(f"      警告: 计算 L2 中位数图时出错: {e}")

        global_median = valid_train_age[age_col].median() if not valid_train_age.empty else np.nan
        fill_global = global_median if pd.notna(global_median) else 20.0 # 备用值，例如20年

        # 应用 L1 填充
        nan_mask_l1 = df_processed[age_col].isnull()
        if nan_mask_l1.any() and group_col_l1 and not median_map_l1.empty:
            fill_values_l1 = df_processed.loc[nan_mask_l1, group_col_l1].map(median_map_l1)
            df_processed[age_col].fillna(fill_values_l1, inplace=True)

        # 应用 L2 填充
        nan_mask_l2 = df_processed[age_col].isnull()
        if nan_mask_l2.any() and group_col_l2 and not median_map_l2.empty:
            fill_values_l2 = df_processed.loc[nan_mask_l2, group_col_l2].map(median_map_l2)
            df_processed[age_col].fillna(fill_values_l2, inplace=True)

        # 应用 L3 (Global) 填充
        nan_mask_l3 = df_processed[age_col].isnull()
        if nan_mask_l3.any():
            df_processed[age_col].fillna(fill_global, inplace=True)
            print(f" 使用全局中位数/备用值 ({fill_global:.1f}) 填充了 {nan_mask_l3.sum()} 个剩余缺失值。")
        print(f"'{age_col}' 缺失值填充完成。")
    else:
        print(f"'{age_col}' 无需填充。")

    #  4. 清理异常值 (负房龄) 
    negative_age_count = (df_processed[age_col] < 0).sum()
    if negative_age_count > 0:
        print(f"发现 {negative_age_count} 行负房龄，将其修正为 0。")
        df_processed[age_col] = np.maximum(0, df_processed[age_col])

    #  5. 删除原始列和临时列 
    df_processed = df_processed.drop(columns=[col, parsed_year_col], errors='ignore')
    print(f"'{age_col}' 特征处理完成。最终缺失值: {df_processed[age_col].isnull().sum()}")

    return df_processed

def _parse_fee(s):
    """
    如果为范围，则取平均值。
    """
    if pd.isna(s):
        return np.nan
    # 转换为字符串并移除非数字字符，保留小数点和破折号
    s_str = str(s).replace(' ', '') # 移除空格
    if '空白' in s_str:
        return np.nan    
    # 查找所有数字 (包括小数)
    nums = re.findall(r'(\d+\.?\d*)', s_str)
    
    if len(nums) == 0:
        return np.nan
    elif len(nums) == 1:
        return float(nums[0])
    else:
        return (float(nums[0]) + float(nums[1])) / 2

def handle_greenery_rate(df, n_train): 
    """
    处理 '绿化率' 列:
    """
    col = '绿化率'
    new_col = 'GreeneryRate' 
    
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    s = df[col].astype(str).str.replace(r'[\\s%]', '', regex=True)
    
    # 在原始列上操作
    df[col] = df[col].replace('10500', np.nan)  
    df[col] = pd.to_numeric(df[col], errors='coerce') 
    df[col] = df[col].replace(10500, np.nan)
    
    # 重新基于清理后的列计算 s_numeric
    s_numeric = pd.to_numeric(s, errors='coerce')
    s_numeric = s_numeric.replace(10500, np.nan) # 确保 s_numeric 也清除了

    # 仅使用训练集计算中位数
    median_val = s_numeric.iloc[:n_train].median()


    df[new_col] = s_numeric.fillna(median_val)
    
    print(f"  创建了新特征 '{new_col}' (中位数: {median_val:.2f})。")

    df = df.drop(col, axis=1, errors='ignore')
    return df



In [94]:

def handle_plot_ratio(df, n_train): 
    """
    处理 '容积率' 列
    """
    col = '容积率' 
    new_col = 'PlotRatio' 
    
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    s_numeric = pd.to_numeric(df[col], errors='coerce') 
    

    # 仅使用训练集计算中位数
    median_val = s_numeric.iloc[:n_train].median()    
    
    df[new_col] = s_numeric.fillna(median_val)
    
    print(f"  创建了新特征 '{new_col}' (中位数: {median_val:.2f})。")
    
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_property_fee(df, n_train): 
    """
    处理 '物业费' 列 
    """
    col = '物业费'
    new_col = 'PropertyFee'
    
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    s_numeric = df[col].apply(_parse_fee) 
    
    # 仅使用训练集计算中位数
    median_val = s_numeric.iloc[:n_train].median()
    
    df[new_col] = s_numeric.fillna(median_val)
    
    print(f"  创建了新特征 '{new_col}' (中位数: {median_val:.2f})。")
    df = df.drop(col, axis=1, errors='ignore')
    return df



In [95]:
def handle_water_supply(df):
    """
    处理 '供水' 列
    1. 填充 NaN
    2. 创建 is_Water_Civil (民水) 和 is_Water_Commercial (商水) 两个二元特征
    """
    col = '供水'
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        # 如果列不存在，创建默认值以避免下游错误
        df['is_Water_Civil'] = 0
        df['is_Water_Commercial'] = 0
        return df

    # 1. 填充NaN为空字符串''，这样 .str.contains 才不会对NaN报错，直接将填充后的 Pandas Series 赋值给 s。
    s = df[col].fillna('') 
    
    # 2. 创建二元特征
    df['is_Water_Civil'] = s.str.contains('民水').astype(int)
    df['is_Water_Commercial'] = s.str.contains('商水').astype(int)
    
    print("  创建了 'is_Water_Civil' 和 'is_Water_Commercial' 特征。")
    print(f"  处理后 '{col}' 的唯一值: {df[col].unique()}")
    # 3. 删除原始列
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_heating(df):
    """
    创建 is_Heating_Central (集中), is_Heating_Self (自采暖), is_Heating_None (无) 三个二元特征
    """
    col = '供暖'
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        df['is_Heating_Central'] = 0
        df['is_Heating_Self'] = 0
        df['is_Heating_None'] = 0
        return df

    s = df[col].fillna('')
    
    #  创建二元特征
    df['is_Heating_Central'] = s.str.contains('集中供暖').astype(int)
    df['is_Heating_Self'] = s.str.contains('自采暖').astype(int)
    df['is_Heating_None'] = s.str.contains('无供暖').astype(int)
    
    print("  创建了 'is_Heating_Central', 'is_Heating_Self', 'is_Heating_None' 特征。")
    print(f"  处理后 '{col}' 的唯一值: {df[col].unique()}")
    # 删除原始列
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_electricity(df):
    """
    处理 '供电' 列 
    创建 is_Electricity_Civil (民电) 和 is_Electricity_Commercial (商电) 两个二元特征
    """
    col = '供电'
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        df['is_Electricity_Civil'] = 0
        df['is_Electricity_Commercial'] = 0
        return df

    s = df[col].fillna('')
       
    # 创建二元特征
    df['is_Electricity_Civil'] = s.str.contains('民电').astype(int)
    df['is_Electricity_Commercial'] = s.str.contains('商电').astype(int)
    
    print("  创建了 'is_Electricity_Civil' 和 'is_Electricity_Commercial' 特征。")
    print(f"  处理后 '{col}' 的唯一值: {df[col].unique()}")
    #  删除原始列
    df = df.drop(col, axis=1, errors='ignore')
    return df

    
def handle_gas_fee(df, n_train): 
    """
    处理 '燃气费' 列
    """
    col = '燃气费'
    new_col = 'GasFee' 
    
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    s_numeric = df[col].apply(_parse_fee) 
    
    median_val = s_numeric.iloc[:n_train].median()
    
    df[new_col] = s_numeric.fillna(median_val)
    
    print(f"  创建了新特征 '{new_col}' (中位数: {median_val:.2f})。")
    
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_heating_fee(df, n_train): 
    """
    处理 '供热费' 列 
    """
    col = '供热费'
    new_col = 'HeatingFee'
    
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df

    s_numeric = df[col].apply(_parse_fee) 
    
    median_val = s_numeric.iloc[:n_train].median()

    
    df[new_col] = s_numeric.fillna(median_val)
    
    print(f"  创建了新特征 '{new_col}' (中位数: {median_val:.2f})。")
    
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_parking_spots(df, n_train): 
    """
    处理 '停车位' 列 用中位数填充
    """
    col = '停车位'
    new_col = 'ParkingSpots' 
    
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        df[new_col] = 0 
        return df

    s_numeric = pd.to_numeric(df[col], errors='coerce')
    
    median_val = s_numeric.iloc[:n_train].median()
 
    
    if pd.isna(median_val):
        median_val = 0 
        print(f"  警告: '{col}' 的训练集中位数计算为 NaN，使用 0 作为填充值。")

    df[new_col] = s_numeric.fillna(median_val)
    
    print(f"  创建了新特征 '{new_col}' (中位数: {median_val:.2f})。")
    
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_elevator_equipped(df):
    """
    处理 '配备电梯' 列 
    """
    print("处理 [配备电梯]...")
    if '配备电梯' not in df.columns:
        print("警告: 列 '配备电梯' 不存在。")
        return df

    # 填充 NaN
    df['配备电梯'] = df['配备电梯'].fillna('未知')
    
    # 打印唯一值以供检查
    print(f"  处理后 '配备电梯' 的唯一值: {df['配备电梯'].unique()}")
    
    return df

def handle_housing_tenure(df):
    """
    处理 '房屋年限' 列 
    """
    col = '房屋年限'
    print(f"处理 [{col}]...")
    if col not in df.columns:
        print(f"警告: 列 '{col}' 不存在。")
        return df
    df[col] = df[col].fillna('未知')
    print(f"  处理后 '{col}' 的唯一值: {df[col].unique()}")
    return df


### 开始加载数据

In [96]:
#  步骤 1: 数据加载 (Price) 
print("--- 步骤 1: 数据加载 (仅 Price) ---")
try:
    df_train_price_raw = pd.read_csv('./data/ruc_Class25Q2_train_price.csv')
    df_test_price_raw = pd.read_csv('./data/ruc_Class25Q2_test_price.csv')
    print(f"Price 训练数据加载成功: {df_train_price_raw.shape}")
    print(f"Price 测试数据加载成功: {df_test_price_raw.shape}")
except FileNotFoundError:
    print("错误：未找到 Price 数据文件。请确保文件在 './data/' 目录下。")

# --- 步骤 2: 存储原始信息与分离 ---
print("\\n--- 步骤 2: 存储原始信息与分离 ---")
n_train_price = df_train_price_raw.shape[0]
print(f"Price 训练集原始行数: {n_train_price}")

# 分离目标变量 (Price)
if 'Price' in df_train_price_raw.columns:
    y_train_price = df_train_price_raw['Price'].copy()
    # 对目标变量进行对数变换 
    y_train_ln_price = np.log1p(y_train_price)
    print(f"已分离 Price 目标变量 (y_train_price, y_train_ln_price)，长度: {len(y_train_price)}")
else:
    print("警告: 'Price' 列在训练集中未找到。")
    y_train_price = None
    y_train_ln_price = None

# 分离测试集 ID
if 'ID' in df_test_price_raw.columns:
    test_ids_price = df_test_price_raw['ID'].copy()
    print(f"已分离 Price 测试集 ID (test_ids_price)，长度: {len(test_ids_price)}")
else:
    print("警告: 'ID' 列在测试集中未找到。")
    test_ids_price = None

# --- 步骤 3: 合并数据集以便统一处理 ---
print("\\n--- 步骤 3: 合并 Price 训练集与测试集 ---")
# 从训练集中移除 Price 列，从测试集中移除 ID 列
df_train_to_concat = df_train_price_raw.drop(columns=['Price'], errors='ignore')
df_test_to_concat = df_test_price_raw.drop(columns=['ID'], errors='ignore')

# 添加来源标识
df_train_to_concat['source'] = 'train'
df_test_to_concat['source'] = 'test'

# 合并
df_price = pd.concat([df_train_to_concat, df_test_to_concat], ignore_index=True)
print(f"Price 数据集合并完成。合并后 df_price 形状: {df_price.shape}")

# 清理原始数据框以释放内存
del df_train_price_raw, df_test_price_raw, df_train_to_concat, df_test_to_concat
import gc
gc.collect()

--- 步骤 1: 数据加载 (仅 Price) ---
Price 训练数据加载成功: (103871, 55)
Price 测试数据加载成功: (34017, 55)
\n--- 步骤 2: 存储原始信息与分离 ---
Price 训练集原始行数: 103871
已分离 Price 目标变量 (y_train_price, y_train_ln_price)，长度: 103871
已分离 Price 测试集 ID (test_ids_price)，长度: 34017
\n--- 步骤 3: 合并 Price 训练集与测试集 ---
Price 数据集合并完成。合并后 df_price 形状: (137888, 55)


2709

In [97]:
# --- 步骤 4: 初步列删除 ---
print("\\n--- 步骤 4: 初步列删除 ---")

columns_to_drop_initial = [
    '抵押信息',       # 完全为空
    '环线位置',       # 与 '环线' 重复
    '上次交易',       # 时间信息将在后续处理
    '产权所属',       # 类别较少且信息量可能不大
    '物业办公电话',   # 信息价值低
    '产权描述',       # 文本，暂不处理
    '核心卖点',       # 文本，暂不处理
    '户型介绍',       # 文本，暂不处理
    '周边配套',       # 文本，暂不处理
    '交通出行',       # 文本，暂不处理
    '客户反馈',       # 文本，暂不处理
    '板块_comm',      # 与 '板块' 信息重叠
    '建筑结构_comm',  # 与 '建筑结构' 信息重叠
    'coord_x',        # 与 lon/lat 重复
    'coord_y',        # 与 lon/lat 重复
    '停车费用',      
    '物业类别',
    'source'         
]

existing_cols_to_drop = [col for col in columns_to_drop_initial if col in df_price.columns]

print(f"准备删除以下 {len(existing_cols_to_drop)} 列: {existing_cols_to_drop}")

# 执行删除
df_price = df_price.drop(columns=existing_cols_to_drop, errors='ignore')

print(f"初步列删除后 df_price 形状: {df_price.shape}")

# 再次检查信息，看剩余列和大致类型
print("\\n初步清理后的数据信息:")
df_price.info()

\n--- 步骤 4: 初步列删除 ---
准备删除以下 18 列: ['抵押信息', '环线位置', '上次交易', '产权所属', '物业办公电话', '产权描述', '核心卖点', '户型介绍', '周边配套', '交通出行', '客户反馈', '板块_comm', '建筑结构_comm', 'coord_x', 'coord_y', '停车费用', '物业类别', 'source']
初步列删除后 df_price 形状: (137888, 37)
\n初步清理后的数据信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137888 entries, 0 to 137887
Data columns (total 37 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   城市      137888 non-null  int64  
 1   区域      137888 non-null  float64
 2   板块      137888 non-null  float64
 3   环线      56096 non-null   object 
 4   房屋户型    137294 non-null  object 
 5   所在楼层    137888 non-null  object 
 6   建筑面积    137888 non-null  object 
 7   套内面积    45899 non-null   object 
 8   房屋朝向    137887 non-null  object 
 9   建筑结构    137294 non-null  object 
 10  装修情况    137294 non-null  object 
 11  梯户比例    134634 non-null  object 
 12  配备电梯    121445 non-null  object 
 13  别墅类型    1597 non-null    object 
 14  交易时间    137888 non-null  object

In [98]:
def apply_common_preprocessing(df, n_train):
    """
    按顺序调用所有的特征工程辅助函数。
    """
    print(f"--- 开始执行常规数据处理(n_train={n_train}) ---")
    df_processed = df.copy()

    # --- 逐个调用辅助函数 ---
    
    # 区域/环线 
    df_processed = handle_region(df_processed, rare_threshold=0.03)
    df_processed = handle_ring_road(df_processed)
    
    # 户型/楼层 
    df_processed = handle_house_type(df_processed, col_name='房屋户型', n_train=n_train)
    df_processed = handle_floor(df_processed, n_train=n_train, col='所在楼层')
    
    # 结构/面积/朝向 
    df_processed = handle_building_structure(df_processed)
    df_processed = handle_area(df_processed, n_train=n_train) 
    df_processed = handle_orientation(df_processed)
    
    # 装修/梯户/别墅/用途/权属/优势 
    df_processed = handle_decoration(df_processed)
    df_processed = handle_elevator_ratio(df_processed, n_train=n_train) 
    df_processed = handle_villa_type(df_processed)
    df_processed = handle_property_use(df_processed, rare_threshold=0.005)
    df_processed = handle_transaction_ownership(df_processed, rare_threshold=0.005)
    df_processed = handle_house_advantages(df_processed)
    
    # 开发商/物业/区县/社区统计/房龄 
    df_processed = handle_developer(df_processed)
    df_processed = handle_property_management(df_processed)
    df_processed = handle_district(df_processed)
    df_processed = handle_community_stats(df_processed, n_train=n_train)
    # 交易年份需要先处理
    df_processed['交易年份'] = pd.to_datetime(df_processed['交易时间']).dt.year
    df_processed = handle_building_age(
        df_processed, 
        n_train=n_train, 
        col='建筑年代', 
        trans_year_col='交易年份',
        group_col_l1='板块', 
        group_col_l2='区域'
    )

    df_processed = handle_greenery_rate(df_processed, n_train=n_train)
    
    # 容积率/物业费 
    df_processed = handle_plot_ratio(df_processed, n_train=n_train)
    df_processed = handle_property_fee(df_processed, n_train=n_train)
    
    # 水/暖/电/燃气费/供热费/停车位
    df_processed = handle_water_supply(df_processed)
    df_processed = handle_heating(df_processed)
    df_processed = handle_electricity(df_processed)
    df_processed = handle_gas_fee(df_processed, n_train=n_train)
    df_processed = handle_heating_fee(df_processed, n_train=n_train)

    df_processed = handle_parking_spots(df_processed, n_train=n_train)
    df_processed = handle_elevator_equipped(df_processed)
    df_processed = handle_housing_tenure(df_processed)

    df_processed = df_processed.drop(columns=['交易时间'], errors='ignore')

    print(f"--- 完成 apply_common_preprocessing ---")
    return df_processed


df_price_processed = apply_common_preprocessing(df_price, n_train_price)

print("\\n--- 初步预处理完成 ---")
print(f"处理后的 df_price_processed 形状: {df_price_processed.shape}")
df_price_processed.info()



--- 开始执行常规数据处理(n_train=103871) ---
处理 [区域]...
合并了 116 个稀有区域为 '其他区域'
处理 [环线]...
  处理 [房屋户型]...
    正在计算 '室', '厅', '卫' 的中位数 (基于前 103871 行)...
      '室' 中位数: 3.0
      '厅' 中位数: 2.0
      '卫' 中位数: 1.0
    已提取 '室', '厅', '卫' 特征并填充缺失值。
  处理 [所在楼层]...
    提取总楼层和当前楼层...
    提取明确的楼层类别...
    推断楼层类别...
    合并类别并填充缺失值...
    进行独热编码并清理临时列...
    楼层信息处理完成。
处理 [建筑结构]...
建筑结构分组分布: {'钢混结构': 107584, '混合砖混类': 19347, '未知结构': 7248, '框架钢构类': 3623, '其他稀有结构': 86}
  处理 [建筑面积] 和 [套内面积]...
    清理面积单位并转为数值...
    计算训练集 (前 103871 行) 平均得房率...
      计算得到的训练集平均得房率: 0.8253
    估算/修正 '套内面积'...
      使用得房率估算了 92231 行的 '套内面积'
    使用中位数填充 '建筑面积' 缺失值...
      建筑面积中位数: 91.11
    再次检查并填充 '套内面积' 缺失值...
    创建 '得房率' 特征...
      最终 '得房率' 使用的中位数/填充值: 0.8253
处理 [房屋朝向]...
处理 [装修情况]...
  处理后 '装修情况' 的唯一值: ['精装' '简装' '未知' '毛坯' '其他']
处理 [梯户比例]...
户梯比中位数 (基于训练集): 2.00
处理 [别墅类型]...
  处理后 '别墅类型' 的唯一值: ['非别墅' '独栋' '联排' '叠拼' '双拼']
处理 [房屋用途]...
  在 '房屋用途' 中, 合并 6 个稀有类别为 '其他': ['公寓', '车库', '新式里弄', '四合院', '花园洋房', '未知']
  处理后 '房屋用途' 的唯一值: ['普通

In [99]:
# 步骤 5: 创建地理空间特征

from geopy.distance import geodesic
from sklearn.cluster import KMeans

def compute_city_center_and_distances(df):
    """计算城市中心点及房源到中心点的距离。"""
    df_out = df.copy()
    print("开始计算地理空间特征 (距离中心)...")
    if '城市' not in df_out.columns or 'lon' not in df_out.columns or 'lat' not in df_out.columns:
        print("  警告: 缺少 '城市', 'lon', 或 'lat' 列，无法计算距离特征。")
        return df_out

    train_part = df_out.iloc[:n_train_price] 
    city_centers = train_part.groupby('城市', observed=True)[['lon', 'lat']].mean().reset_index()
    city_centers = city_centers.rename(columns={'lon': 'center_lon', 'lat': 'center_lat'})
    print(f"  计算了 {len(city_centers)} 个城市的中心点 (基于训练集)。")

    df_out = pd.merge(df_out, city_centers, on='城市', how='left')

    def compute_distance(row):
        if pd.isna(row['lat']) or pd.isna(row['lon']) or pd.isna(row['center_lat']) or pd.isna(row['center_lon']):
            return np.nan
        try:
            return geodesic((row['lat'], row['lon']), (row['center_lat'], row['center_lon'])).km
        except ValueError:
            return np.nan

    df_out['距离中心_公里'] = df_out.apply(compute_distance, axis=1)
    median_dist_train = df_out.iloc[:n_train_price]['距离中心_公里'].median()
    df_out['距离中心_公里'].fillna(median_dist_train, inplace=True)
    print(f"  计算了 '距离中心_公里'，并用训练集中位数 ({median_dist_train:.2f} km) 填充了 NaN。")

    df_out['距离中心_公里_平方'] = df_out['距离中心_公里'] ** 2
    df_out = df_out.drop(columns=['center_lon', 'center_lat'], errors='ignore')
    print("地理距离特征创建完毕。")
    return df_out

def create_geo_clusters(df, n_train, n_clusters=50, city_col='城市', lon_col='lon', lat_col='lat'):
    """为每个城市计算地理聚类并进行独热编码。"""
    df_processed = df.copy()
    print(f"开始创建地理聚类 (每个城市 {n_clusters} 个簇)...")

    required_cols = [city_col, lon_col, lat_col]
    if not all(col in df_processed.columns for col in required_cols):
        print(f"  错误: DataFrame 缺少必需的列: {required_cols}。跳过聚类...")
        return df_processed

    coord_nan_mask = df_processed[[lon_col, lat_col]].isnull().any(axis=1)
    cluster_col_temp = '地理聚类_temp'
    df_processed[cluster_col_temp] = -1 # 未分配
    all_city_labels = df_processed[city_col].unique()
    
    train_mask = df_processed.index < n_train
    test_mask = df_processed.index >= n_train

    for city_label in all_city_labels:
        city_mask = (df_processed[city_col] == city_label) & (~coord_nan_mask)
        city_data_train = df_processed.loc[city_mask & train_mask, [lon_col, lat_col]]
        city_data_test = df_processed.loc[city_mask & test_mask, [lon_col, lat_col]]

        if len(city_data_train) >= n_clusters:
            try:
                kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
                kmeans.fit(city_data_train)
                
                if not city_data_train.empty:
                    clusters_train = kmeans.predict(city_data_train)
                    df_processed.loc[city_data_train.index, cluster_col_temp] = clusters_train
                if not city_data_test.empty:
                    clusters_test = kmeans.predict(city_data_test)
                    df_processed.loc[city_data_test.index, cluster_col_temp] = clusters_test
            except Exception as e:
                print(f"    警告: 城市 {city_label} KMeans 失败: {e}")
        elif len(city_data_train) > 0:
             print(f"    警告: 城市 {city_label} 训练集数据点不足 ({len(city_data_train)})，跳过聚类。")

    combined_label_col = '地理聚类_带城市'
    df_processed[combined_label_col] = np.where(
         (df_processed[cluster_col_temp] != -1) & df_processed[city_col].notna(),
         'C' + df_processed[city_col].astype(str).str.split('.').str[0] + '_' + df_processed[cluster_col_temp].astype(str),
         'GeoCluster_Unknown'
    )
    df_processed = pd.get_dummies(df_processed, columns=[combined_label_col], prefix='GeoCluster', drop_first=False)
    df_processed = df_processed.drop(columns=[cluster_col_temp], errors='ignore')
    print("地理聚类特征创建完毕。")
    return df_processed

print("\\n--- 步骤 5: 正在创建地理空间特征 ---")
df_price_with_geo = compute_city_center_and_distances(df_price_processed) 
df_price_with_geo = create_geo_clusters(df_price_with_geo, n_train=n_train_price, n_clusters=50)





\n--- 步骤 5: 正在创建地理空间特征 ---
开始计算地理空间特征 (距离中心)...
  计算了 12 个城市的中心点 (基于训练集)。
  计算了 '距离中心_公里'，并用训练集中位数 (14.58 km) 填充了 NaN。
地理距离特征创建完毕。
开始创建地理聚类 (每个城市 50 个簇)...
地理聚类特征创建完毕。


In [100]:
  

from sklearn.model_selection import KFold

def apply_target_encoding_combined(df, y_target, n_train, loc_cols, n_splits=6, random_state=111):
    """在合并的数据框上对 loc_cols 进行 K-Fold 目标编码。"""
    df_te = df.copy()
    print(f"处理 {loc_cols} (K-Fold Target Encoding, n_splits={n_splits})...")
    if not loc_cols: return df_te 

    existing_loc_cols = [col for col in loc_cols if col in df_te.columns]
    if not existing_loc_cols: return df_te
    print(f"  将对以下存在的列进行编码: {existing_loc_cols}")

    new_col = 'Location_Target_Encoded'
    global_mean = y_target.mean()

    def create_key(df_slice):
        return df_slice[existing_loc_cols].astype(str).agg('_'.join, axis=1)
    df_te['key'] = create_key(df_te)

    X_train_part = df_te.iloc[:n_train].copy()
    X_train_part['target'] = y_target
    full_train_map = X_train_part.groupby('key')['target'].mean()
    print(f"  计算了基于 {n_train} 训练样本的完整均值图谱。")

    X_test_part = df_te.iloc[n_train:].copy()
    df_te.loc[X_test_part.index, new_col] = X_test_part['key'].map(full_train_map).fillna(global_mean)
    print(f"  已将完整图谱应用于 {len(X_test_part)} 个测试样本。")

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    df_te.loc[X_train_part.index, new_col] = np.nan

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_part)):
        X_train_fold = X_train_part.iloc[train_idx]
        X_val_fold = X_train_part.iloc[val_idx]
        fold_map = X_train_fold.groupby('key')['target'].mean()
        df_te.loc[X_val_fold.index, new_col] = X_val_fold['key'].map(fold_map)

    
    is_train_mask = df_te.index < n_train
    is_nan_mask = df_te[new_col].isnull()
    fill_mask = is_train_mask & is_nan_mask
    fill_count = fill_mask.sum()
    if fill_count > 0:
        print(f"    填充训练集中 K-Fold 后剩余的 {fill_count} 个 NaN...")
        values_to_fill = df_te.loc[fill_mask, 'key'].map(full_train_map).values
        df_te.loc[fill_mask, new_col] = values_to_fill

    df_te[new_col] = df_te[new_col].fillna(global_mean)
    cols_to_drop = existing_loc_cols + ['key', 'target']
    df_te = df_te.drop(columns=cols_to_drop, errors='ignore')
    print(f"  K-Fold Target Encoding 完成。新特征: '{new_col}'。")
    return df_te

print("\\n--- 步骤 4: 正在执行目标编码 ---")
location_columns_to_encode = ['城市', '区域', '区县', '板块']
df_price_te = apply_target_encoding_combined(df_price_with_geo, y_train_price, n_train_price, location_columns_to_encode)



# 第五步：系统性特征工程 

print("\\n" + "=" * 30)
print("第五步：系统性特征工程")
print("=" * 30)

# --- 5.a 创建比率特征 ---
def cal_ratio(df, NumeratorValue, DenominatorValue, new_col_name):
    """计算比率特征，处理除以零的情况。"""
    if NumeratorValue in df.columns and DenominatorValue in df.columns:
        print(f"  创建比率特征 [{new_col_name}]...")
        df[new_col_name] = np.where(
            df[DenominatorValue] > 0,
            df[NumeratorValue] / df[DenominatorValue],
            0
        )
    else:
        print(f"  警告: 无法创建比率 '{new_col_name}'，缺少列。")


def create_ratio_features(df):
    df_out = df.copy()
    print("开始创建比率特征...")
    cal_ratio(df_out, '房屋总数', '楼栋总数', '平均每栋房屋数')
    cal_ratio(df_out, 'ParkingSpots', '房屋总数', '停车位与房屋总数比')
    cal_ratio(df_out, 'GreeneryRate', 'PlotRatio', '绿化率与容积率比')
    cal_ratio(df_out, '室', '厅', '室厅比')
    cal_ratio(df_out, '室', '卫', '室卫比')
    print("比率特征创建完毕。")
    return df_out

df_price_eng = create_ratio_features(df_price_te)


# --- 5.c 对数变换 (处理偏度) ---
def log_transform(df, skewed_cols):
    """对指定的偏斜数值特征应用 log1p 转换。"""
    df_transformed = df.copy()
    print("开始进行对数变换...")
    transformed_cols = []
    for col in skewed_cols:
        col_log = f'log_{col}'
        if col in df_transformed.columns:
            if pd.api.types.is_numeric_dtype(df_transformed[col]):
                min_val = df_transformed[col].min()
                if min_val >= 0:
                    df_transformed[col_log] = np.log1p(df_transformed[col])
                    df_transformed = df_transformed.drop(columns=[col], errors='ignore')
                    transformed_cols.append(col)
                    print(f"  已对 '{col}' 应用 log1p 转换 -> '{col_log}'")
                else:
                    print(f"  警告: 列 '{col}' 包含负值 (最小值: {min_val})，跳过 log1p 转换。")
            else:
                print(f"  警告: 列 '{col}' 非数值类型，跳过 log1p 转换。")
    print(f"对数变换完成。共处理了 {len(transformed_cols)} 列。")
    return df_transformed

# (使用更新后的新列名)
skewed_cols_to_transform = [
    'PlotRatio', '楼栋总数', 'HeatingFee', 'ParkingSpots', '房屋总数', 
    'PropertyFee', '距离中心_公里', '距离中心_公里_平方', '平均每栋房屋数', 
    '停车位与房屋总数比', '建筑面积', '套内面积', 'Location_Target_Encoded'
]
skewed_cols_to_transform = [col for col in skewed_cols_to_transform if col in df_price_eng.columns]

df_price_eng = log_transform(df_price_eng, skewed_cols_to_transform)


# --- 5.d 分箱 (Binning) ---
from sklearn.preprocessing import KBinsDiscretizer

def bin_and_encode(df, n_train, feature, n_bins=5, strategy='kmeans'):
    """对特征进行分箱和独热编码。"""
    df_binned = df.copy()
    print(f"开始对特征 [{feature}] 进行分箱 (n_bins={n_bins}, strategy='{strategy}')...")
    if feature not in df_binned.columns: return df_binned
    if not pd.api.types.is_numeric_dtype(df_binned[feature]): return df_binned

    binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy, subsample=None)
    feature_binned_col = f'{feature}_分箱'

    median_val_train = df_binned.iloc[:n_train][feature].median()
    df_binned[feature].fillna(median_val_train, inplace=True)
    print(f"  使用训练集中位数 ({median_val_train:.2f}) 填充了 '{feature}' 的 NaN。")

    try:
        train_data_for_fit = df_binned.iloc[:n_train][[feature]]
        if train_data_for_fit[feature].nunique() < n_bins:
             actual_bins = train_data_for_fit[feature].nunique()
             print(f"  警告: 唯一值 ({actual_bins}) < 箱数 ({n_bins})。")
             if actual_bins < 2: return df_binned
             binner = KBinsDiscretizer(n_bins=actual_bins, encode='ordinal', strategy='uniform', subsample=None)
        
        binner.fit(train_data_for_fit)
        df_binned[feature_binned_col] = binner.transform(df_binned[[feature]])
        print(f"  已创建分箱特征 '{feature_binned_col}'。")
    except ValueError as e:
        print(f"  错误: 对特征 '{feature}' 分箱时出错: {e}。跳过...")
        return df_binned

    df_binned = pd.get_dummies(df_binned, columns=[feature_binned_col], prefix=f'{feature}段', drop_first=False)
    df_binned = df_binned.drop(columns=[feature], errors='ignore')
    print(f"  已对分箱结果进行独热编码并移除原始特征。")
    return df_binned

# (使用 log 变换后的特征名)
features_to_bin = [
    '房龄', '总楼层', # 原始数值
    'log_距离中心_公里',
    'log_建筑面积',
    'log_套内面积',
    'log_Location_Target_Encoded'
]
features_to_bin = [col for col in features_to_bin if col in df_price_eng.columns] # 确保存在

for feature in features_to_bin:
    df_price_eng = bin_and_encode(df_price_eng, n_train_price, feature=feature, n_bins=5, strategy='kmeans')

print("分箱处理完成。")


# --- 5.e 创建交互项 ---
from sklearn.preprocessing import PolynomialFeatures

def create_polynomial_interactions(df, n_train, continuous_cols_candidates, degree=2, interaction_only=True):
    """为数据集创建多项式交互项特征。"""
    df_poly = df.copy()
    print(f"开始创建交互项 (degree={degree}, interaction_only={interaction_only})...")

    binary_cols = [col for col in df_poly.columns
                   if df_poly[col].nunique(dropna=False) == 2 and
                      df_poly[col].min() == 0 and df_poly[col].max() == 1]
    all_numeric_cols = df_poly.select_dtypes(include=np.number).columns.tolist()
    current_continuous_cols = [col for col in all_numeric_cols if col not in binary_cols]
    cols_for_poly = [col for col in continuous_cols_candidates if col in current_continuous_cols]

    print(f"  将为以下 {len(cols_for_poly)} 个连续特征创建交互项: {cols_for_poly}")
    if not cols_for_poly: return df_poly

    poly = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=False)

    print("  使用训练集的中位数填充交互项特征中的 NaN...")
    train_part_poly = df_poly.iloc[:n_train]
    medians_poly = train_part_poly[cols_for_poly].median()
    df_poly[cols_for_poly] = df_poly[cols_for_poly].fillna(medians_poly)
    if df_poly[cols_for_poly].isnull().any().any():
         df_poly[cols_for_poly] = df_poly[cols_for_poly].fillna(0)

    try:
        poly.fit(df_poly.iloc[:n_train][cols_for_poly])
        poly_features = poly.transform(df_poly[cols_for_poly])
        poly_feature_names = [name.replace(' ', '_TIMES_').replace('^2', '_SQ') for name in poly.get_feature_names_out(cols_for_poly)]
        print(f"  生成了 {len(poly_feature_names)} 个多项式/交互项特征。")

        poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_poly.index)
        
        df_poly = df_poly.drop(columns=cols_for_poly, errors='ignore')
        df_final = pd.concat([df_poly, poly_df], axis=1)

        print(f"交互项创建完毕。数据集新维度: {df_final.shape}")
        return df_final
    except Exception as e:
        print(f"  错误: 创建交互项时出错: {e}。返回未修改的数据框。")
        return df_poly

# (使用未被分箱的特征)
continuous_cols_for_interactions = [
    '得房率',
    '室', '厅', '卫'
]
continuous_cols_for_interactions = [col for col in continuous_cols_for_interactions if col in df_price_eng.columns]
df_price_final_eng = create_polynomial_interactions(df_price_eng, n_train_price, continuous_cols_for_interactions)


# --- 最终检查 ---
print("\\n" + "=" * 30)
print("系统性特征工程完成后的数据信息:")
print(f"最终形状: {df_price_final_eng.shape}")
df_price_final_eng.info()

\n--- 步骤 4: 正在执行目标编码 ---
处理 ['城市', '区域', '区县', '板块'] (K-Fold Target Encoding, n_splits=6)...
  将对以下存在的列进行编码: ['城市', '区域', '区县', '板块']
  计算了基于 103871 训练样本的完整均值图谱。
  已将完整图谱应用于 34017 个测试样本。
    填充训练集中 K-Fold 后剩余的 95 个 NaN...
  K-Fold Target Encoding 完成。新特征: 'Location_Target_Encoded'。
第五步：系统性特征工程
开始创建比率特征...
  创建比率特征 [平均每栋房屋数]...
  创建比率特征 [停车位与房屋总数比]...
  创建比率特征 [绿化率与容积率比]...
  创建比率特征 [室厅比]...
  创建比率特征 [室卫比]...
比率特征创建完毕。
开始进行对数变换...
  已对 'PlotRatio' 应用 log1p 转换 -> 'log_PlotRatio'
  已对 '楼栋总数' 应用 log1p 转换 -> 'log_楼栋总数'
  已对 'HeatingFee' 应用 log1p 转换 -> 'log_HeatingFee'
  已对 'ParkingSpots' 应用 log1p 转换 -> 'log_ParkingSpots'
  已对 '房屋总数' 应用 log1p 转换 -> 'log_房屋总数'
  已对 'PropertyFee' 应用 log1p 转换 -> 'log_PropertyFee'
  已对 '距离中心_公里' 应用 log1p 转换 -> 'log_距离中心_公里'
  已对 '距离中心_公里_平方' 应用 log1p 转换 -> 'log_距离中心_公里_平方'
  已对 '平均每栋房屋数' 应用 log1p 转换 -> 'log_平均每栋房屋数'
  已对 '停车位与房屋总数比' 应用 log1p 转换 -> 'log_停车位与房屋总数比'
  已对 '建筑面积' 应用 log1p 转换 -> 'log_建筑面积'
  已对 '套内面积' 应用 log1p 转换 -> 'log_套内面积'
  已对 'Location_Target_Enc

In [101]:
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LassoCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer
from sklearn.exceptions import ConvergenceWarning
import warnings

warnings.filterwarnings('ignore', category=ConvergenceWarning)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# 步骤 1: 数据准备 (分离 Kaggle 测试集)

print("--- 步骤 1: 数据准备 ---")
df_final = df_price_final_eng.copy()

# 将特征拆分为 训练集 (用于建模) 和 测试集 (用于Kaggle提交)
X_train_full = df_final.iloc[:n_train_price]
X_test_kaggle = df_final.iloc[n_train_price:]

# 目标变量 (log 转换后)
y_train_full_log = y_train_ln_price.copy()
# 目标变量 (原始价格)
y_train_full_orig = y_train_price.copy()

print(f"完整训练集 X 形状: {X_train_full.shape}")
print(f"Kaggle测试集 X 形状: {X_test_kaggle.shape}")


# 步骤 2: y 异常值处理 

print("\n--- 步骤 2: y 异常值处理 (IQR 移除) ---")

# 在原始价格 y_train_full_orig 上计算 IQR
Q1 = y_train_full_orig.quantile(0.01)
Q3 = y_train_full_orig.quantile(0.95)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# 创建掩码
mask = (y_train_full_orig >= lower_bound) & (y_train_full_orig <= upper_bound)
rows_before = len(y_train_full_orig)
rows_after = mask.sum()

print(f"IQR 异常值边界: [{lower_bound:.0f}, {upper_bound:.0f}]")
print(f"移除 y 异常值前: {rows_before} 行")
print(f"移除 y 异常值后: {rows_after} 行")
print(f"移除了 {rows_before - rows_after} 行")

# 应用掩码到 X 和 y
X_train_clean = X_train_full[mask].copy()
y_train_clean_log = y_train_full_log[mask].copy()
y_train_clean_orig = y_train_full_orig[mask].copy()


# 步骤 3: 训练集/验证集拆分 (80/20)

print("\n--- 步骤 3: 拆分训练集 (80%) 和验证集 (20%) ---")
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X_train_clean, 
    y_train_clean_log, 
    test_size=0.2, 
    random_state=111
)

# 确保 y_orig 也被正确拆分
y_train_orig = y_train_clean_orig.loc[y_train_log.index]
y_val_orig = y_train_clean_orig.loc[y_val_log.index]

print(f"X_train 形状: {X_train.shape}")
print(f"X_val 形状: {X_val.shape}")

# 步骤 4: X 特征最终处理 (使用 ColumnTransformer)

print("\n--- 步骤 4: X 特征最终处理 (使用 ColumnTransformer) ---")
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 

# --- 4.1: 定义三种类型的列名 ---
print("  正在分离 数值型, 分类型(object), 和 布尔型(bool) 列...")

# 1. 数值型特征 (int/float, 但排除 bool)
numeric_features = X_train.select_dtypes(include=np.number, exclude=[bool]).columns
print(f"    找到了 {len(numeric_features)} 个数值型特征 (将进行缩放)。")

# 2. 分类型特征 (object)
categorical_features = X_train.select_dtypes(include=['object']).columns
print(f"    找到了 {len(categorical_features)} 个分类型特征 (将进行独热编码)。")

# 3. 布尔型特征 (bool)
boolean_features = X_train.select_dtypes(include=[bool]).columns
print(f"    找到了 {len(boolean_features)} 个布尔型特征 。")

# 检查总数是否匹配
total_features_processed = len(numeric_features) + len(categorical_features) + len(boolean_features)
print(f"    总计: {total_features_processed} / {X_train.shape[1]} 列将被处理。")
if total_features_processed != X_train.shape[1]:
    print("  警告: 有部分列的类型未被匹配，它们将被'remainder'规则处理！")


# --- 4.2: (Clipping) 异常值截断 (只对数值型) ---
print("  正在计算和应用截断 (Clipping) 到数值型特征...")
lower_bounds_X = X_train[numeric_features].quantile(0.01)
upper_bounds_X = X_train[numeric_features].quantile(0.99)

# 使用 .loc 来避免 SettingWithCopyWarning
X_train.loc[:, numeric_features] = X_train[numeric_features].clip(lower_bounds_X, upper_bounds_X, axis=1)
X_val.loc[:, numeric_features] = X_val[numeric_features].clip(lower_bounds_X, upper_bounds_X, axis=1)
X_train_clean.loc[:, numeric_features] = X_train_clean[numeric_features].clip(lower_bounds_X, upper_bounds_X, axis=1)
X_test_kaggle.loc[:, numeric_features] = X_test_kaggle[numeric_features].clip(lower_bounds_X, upper_bounds_X, axis=1)


# --- 4.3: 定义三条流水线 ---

# 流水线 1: 数值型
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 流水线 2: 分类型 
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='未知')), # 填充缺失的分类
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # 独热编码
])

# 流水线 3: 布尔型 什么都不做，保持原样
boolean_transformer = 'passthrough'

# 组合三条流水线
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bool', boolean_transformer, boolean_features)
    ],
    remainder='drop' # 丢弃任何不属于这三类的列 
)


# --- 4.4: 应用 ColumnTransformer ---
print("  正在应用 ColumnTransformer (Impute, Scale, OneHotEncode)...")

# 在 X_train (80%数据) 上 .fit()
preprocessor.fit(X_train)

# 在所有数据集上 .transform()
X_train_scaled = preprocessor.transform(X_train)
X_val_scaled = preprocessor.transform(X_val)
X_train_clean_scaled = preprocessor.transform(X_train_clean)
X_test_kaggle_scaled = preprocessor.transform(X_test_kaggle)


print("\n--- 步骤 4 输出 ---")
print(f"完整清理并缩放的训练集 X 形状: {X_train_clean_scaled.shape}")
print(f"Kaggle测试集 (已缩放) X 形状: {X_test_kaggle_scaled.shape}")

# 检查 NaN
nan_check = np.isnan(X_train_clean_scaled).sum()
print(f"  最终 NaN 检查 (应为 0): {nan_check}")

# 最终列数 = (数值列数) + (独热编码产生的新列数) + (布尔列数)
print(f"  注意：最终特征数量： {X_train_scaled.shape[1]} (因为独热编码)")

--- 步骤 1: 数据准备 ---
完整训练集 X 形状: (103871, 697)
Kaggle测试集 X 形状: (34017, 697)

--- 步骤 2: y 异常值处理 (IQR 移除) ---
IQR 异常值边界: [-9112782, 15866578]
移除 y 异常值前: 103871 行
移除 y 异常值后: 103264 行
移除了 607 行

--- 步骤 3: 拆分训练集 (80%) 和验证集 (20%) ---
X_train 形状: (82611, 697)
X_val 形状: (20653, 697)

--- 步骤 4: X 特征最终处理 (使用 ColumnTransformer) ---
  正在分离 数值型, 分类型(object), 和 布尔型(bool) 列...
    找到了 53 个数值型特征 (将进行缩放)。
    找到了 8 个分类型特征 (将进行独热编码)。
    找到了 636 个布尔型特征 。
    总计: 697 / 697 列将被处理。
  正在计算和应用截断 (Clipping) 到数值型特征...
  正在应用 ColumnTransformer (Impute, Scale, OneHotEncode)...

--- 步骤 4 输出 ---
完整清理并缩放的训练集 X 形状: (103264, 725)
Kaggle测试集 (已缩放) X 形状: (34017, 725)
  最终 NaN 检查 (应为 0): 0
  注意：最终特征数量： 725 (因为独热编码)


In [102]:
# 目标编码 (Target Encoding)

from sklearn.model_selection import KFold

def apply_target_encoding_combined(df, y_target, n_train, loc_cols, n_splits=6, random_state=111):
    """在合并的数据框上对 loc_cols 进行 K-Fold 目标编码。"""
    df_te = df.copy()
    print(f"处理 {loc_cols} (K-Fold Target Encoding, n_splits={n_splits})...")
    if not loc_cols: return df_te

    existing_loc_cols = [col for col in loc_cols if col in df_te.columns]
    if not existing_loc_cols: return df_te
    print(f"  将对以下存在的列进行编码: {existing_loc_cols}")

    new_col = 'Location_Target_Encoded'
    global_mean = y_target.mean()

    def create_key(df_slice):
        return df_slice[existing_loc_cols].astype(str).agg('_'.join, axis=1)
    df_te['key'] = create_key(df_te)

    X_train_part = df_te.iloc[:n_train].copy()
    X_train_part['target'] = y_target
    full_train_map = X_train_part.groupby('key')['target'].mean()
    print(f"  计算了基于 {n_train} 训练样本的完整均值图谱。")

    X_test_part = df_te.iloc[n_train:].copy()
    df_te.loc[X_test_part.index, new_col] = X_test_part['key'].map(full_train_map).fillna(global_mean)
    print(f"  已将完整图谱应用于 {len(X_test_part)} 个测试样本。")

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    df_te.loc[X_train_part.index, new_col] = np.nan

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_part)):
        X_train_fold = X_train_part.iloc[train_idx]
        X_val_fold = X_train_part.iloc[val_idx]
        fold_map = X_train_fold.groupby('key')['target'].mean()
        df_te.loc[X_val_fold.index, new_col] = X_val_fold['key'].map(fold_map)

    is_train_mask = df_te.index < n_train
    is_nan_mask = df_te[new_col].isnull()
    fill_mask = is_train_mask & is_nan_mask
    fill_count = fill_mask.sum()
    if fill_count > 0:
        print(f"    填充训练集中 K-Fold 后剩余的 {fill_count} 个 NaN...")
        values_to_fill = df_te.loc[fill_mask, 'key'].map(full_train_map).values
        df_te.loc[fill_mask, new_col] = values_to_fill
    

    df_te[new_col] = df_te[new_col].fillna(global_mean)
    cols_to_drop = existing_loc_cols + ['key', 'target']
    df_te = df_te.drop(columns=cols_to_drop, errors='ignore')
    print(f"  K-Fold Target Encoding 完成。新特征: '{new_col}'。")
    return df_te

# --- (调用 4) ---
print("\\n--- 步骤 4: 正在执行目标编码 ---")
location_columns_to_encode = ['城市', '区域', '区县', '板块']
df_price_te = apply_target_encoding_combined(df_price_with_geo, y_train_price, n_train_price, location_columns_to_encode)



# 第五步：系统性特征工程

print("\\n" + "=" * 30)
print("第五步：系统性特征工程 (剩余部分)")
print("=" * 30)

# --- 5.a 创建比率特征 ---
def cal_ratio(df, NumeratorValue, DenominatorValue, new_col_name):
    """计算比率特征，处理除以零的情况。"""
    if NumeratorValue in df.columns and DenominatorValue in df.columns:
        print(f"  创建比率特征 [{new_col_name}]...")
        df[new_col_name] = np.where(
            df[DenominatorValue] > 0,
            df[NumeratorValue] / df[DenominatorValue],
            0
        )
    else:
        print(f"  警告: 无法创建比率 '{new_col_name}'，缺少列。")


def create_ratio_features(df):
    df_out = df.copy()
    print("开始创建比率特征...")
    cal_ratio(df_out, '房屋总数', '楼栋总数', '平均每栋房屋数')
    cal_ratio(df_out, 'ParkingSpots', '房屋总数', '停车位与房屋总数比')
    cal_ratio(df_out, 'GreeneryRate', 'PlotRatio', '绿化率与容积率比')
    cal_ratio(df_out, '室', '厅', '室厅比')
    cal_ratio(df_out, '室', '卫', '室卫比')
    print("比率特征创建完毕。")
    return df_out
df_price_eng = create_ratio_features(df_price_te)


# --- 5.c 对数变换 (处理偏度) ---
def log_transform(df, skewed_cols):
    """对指定的偏斜数值特征应用 log1p 转换。"""
    df_transformed = df.copy()
    print("开始进行对数变换...")
    transformed_cols = []
    for col in skewed_cols:
        col_log = f'log_{col}'
        if col in df_transformed.columns:
            if pd.api.types.is_numeric_dtype(df_transformed[col]):
                min_val = df_transformed[col].min()
                if min_val >= 0:
                    df_transformed[col_log] = np.log1p(df_transformed[col])
                    df_transformed = df_transformed.drop(columns=[col], errors='ignore')
                    transformed_cols.append(col)
                    print(f"  已对 '{col}' 应用 log1p 转换 -> '{col_log}'")
                else:
                    print(f"  警告: 列 '{col}' 包含负值 (最小值: {min_val})，跳过 log1p 转换。")
            else:
                print(f"  警告: 列 '{col}' 非数值类型，跳过 log1p 转换。")
    print(f"对数变换完成。共处理了 {len(transformed_cols)} 列。")
    return df_transformed

# (使用更新后的新列名)
skewed_cols_to_transform = [
    'PlotRatio', '楼栋总数', 'HeatingFee', 'ParkingSpots', '房屋总数', 
    'PropertyFee', '距离中心_公里', '距离中心_公里_平方', '平均每栋房屋数', 
    '停车位与房屋总数比', '建筑面积', '套内面积', 'Location_Target_Encoded'
]
skewed_cols_to_transform = [col for col in skewed_cols_to_transform if col in df_price_eng.columns]
df_price_eng = log_transform(df_price_eng, skewed_cols_to_transform)


# --- 5.d 分箱 (Binning) ---
from sklearn.preprocessing import KBinsDiscretizer

def bin_and_encode(df, n_train, feature, n_bins=5, strategy='kmeans'):
    """对特征进行分箱和独热编码。"""
    df_binned = df.copy()
    print(f"开始对特征 [{feature}] 进行分箱 (n_bins={n_bins}, strategy='{strategy}')...")
    if feature not in df_binned.columns: return df_binned
    if not pd.api.types.is_numeric_dtype(df_binned[feature]): return df_binned

    binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy, subsample=None)
    feature_binned_col = f'{feature}_分箱'

    median_val_train = df_binned.iloc[:n_train][feature].median()
    df_binned[feature].fillna(median_val_train, inplace=True)
    print(f"  使用训练集中位数 ({median_val_train:.2f}) 填充了 '{feature}' 的 NaN。")

    try:
        train_data_for_fit = df_binned.iloc[:n_train][[feature]]
        if train_data_for_fit[feature].nunique() < n_bins:
             actual_bins = train_data_for_fit[feature].nunique()
             print(f"  警告: 唯一值 ({actual_bins}) < 箱数 ({n_bins})。")
             if actual_bins < 2: return df_binned
             binner = KBinsDiscretizer(n_bins=actual_bins, encode='ordinal', strategy='uniform', subsample=None)
        
        binner.fit(train_data_for_fit)
        df_binned[feature_binned_col] = binner.transform(df_binned[[feature]])
        print(f"  已创建分箱特征 '{feature_binned_col}'。")
    except ValueError as e:
        print(f"  错误: 对特征 '{feature}' 分箱时出错: {e}。跳过...")
        return df_binned

    df_binned = pd.get_dummies(df_binned, columns=[feature_binned_col], prefix=f'{feature}段', drop_first=False)
    df_binned = df_binned.drop(columns=[feature], errors='ignore')
    print(f"  已对分箱结果进行独热编码并移除原始特征。")
    return df_binned

# (使用 log 变换后的特征名)
features_to_bin = [
    '房龄', '总楼层', # 原始数值
    'log_距离中心_公里',
    'log_建筑面积',
    'log_套内面积',
    'log_Location_Target_Encoded'
]
features_to_bin = [col for col in features_to_bin if col in df_price_eng.columns] # 确保存在

for feature in features_to_bin:
    df_price_eng = bin_and_encode(df_price_eng, n_train_price, feature=feature, n_bins=5, strategy='kmeans')

print("分箱处理完成。")


# --- 5.e 创建交互项 ---
from sklearn.preprocessing import PolynomialFeatures

def create_polynomial_interactions(df, n_train, continuous_cols_candidates, degree=2, interaction_only=True):
    """为数据集创建多项式交互项特征。"""
    df_poly = df.copy()
    print(f"开始创建交互项 (degree={degree}, interaction_only={interaction_only})...")

    binary_cols = [col for col in df_poly.columns
                   if df_poly[col].nunique(dropna=False) == 2 and
                      df_poly[col].min() == 0 and df_poly[col].max() == 1]
    all_numeric_cols = df_poly.select_dtypes(include=np.number).columns.tolist()
    current_continuous_cols = [col for col in all_numeric_cols if col not in binary_cols]
    cols_for_poly = [col for col in continuous_cols_candidates if col in current_continuous_cols]

    print(f"  将为以下 {len(cols_for_poly)} 个连续特征创建交互项: {cols_for_poly}")
    if not cols_for_poly: return df_poly

    poly = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=False)

    print("  使用训练集的中位数填充交互项特征中的 NaN...")
    train_part_poly = df_poly.iloc[:n_train]
    medians_poly = train_part_poly[cols_for_poly].median()
    df_poly[cols_for_poly] = df_poly[cols_for_poly].fillna(medians_poly)
    if df_poly[cols_for_poly].isnull().any().any():
         df_poly[cols_for_poly] = df_poly[cols_for_poly].fillna(0)

    try:
        poly.fit(df_poly.iloc[:n_train][cols_for_poly])
        poly_features = poly.transform(df_poly[cols_for_poly])
        poly_feature_names = [name.replace(' ', '_TIMES_').replace('^2', '_SQ') for name in poly.get_feature_names_out(cols_for_poly)]
        print(f"  生成了 {len(poly_feature_names)} 个多项式/交互项特征。")

        poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_poly.index)
        
        df_poly = df_poly.drop(columns=cols_for_poly, errors='ignore')
        df_final = pd.concat([df_poly, poly_df], axis=1)

        print(f"交互项创建完毕。数据集新维度: {df_final.shape}")
        return df_final
    except Exception as e:
        print(f"  错误: 创建交互项时出错: {e}。返回未修改的数据框。")
        return df_poly

# --- (调用 5.e) ---
# (使用未被分箱的特征)
continuous_cols_for_interactions = [
    '得房率',
    '室', '厅', '卫'
]
continuous_cols_for_interactions = [col for col in continuous_cols_for_interactions if col in df_price_eng.columns]

df_price_final_eng = create_polynomial_interactions(df_price_eng, n_train_price, continuous_cols_for_interactions)


# --- 最终检查 ---
print("\\n" + "=" * 30)
print("系统性特征工程完成后的数据信息:")
print(f"最终形状: {df_price_final_eng.shape}")
df_price_final_eng.info()

\n--- 步骤 4: 正在执行目标编码 ---
处理 ['城市', '区域', '区县', '板块'] (K-Fold Target Encoding, n_splits=6)...
  将对以下存在的列进行编码: ['城市', '区域', '区县', '板块']
  计算了基于 103871 训练样本的完整均值图谱。
  已将完整图谱应用于 34017 个测试样本。
    填充训练集中 K-Fold 后剩余的 95 个 NaN...
  K-Fold Target Encoding 完成。新特征: 'Location_Target_Encoded'。
第五步：系统性特征工程 (剩余部分)
开始创建比率特征...
  创建比率特征 [平均每栋房屋数]...
  创建比率特征 [停车位与房屋总数比]...
  创建比率特征 [绿化率与容积率比]...
  创建比率特征 [室厅比]...
  创建比率特征 [室卫比]...
比率特征创建完毕。
开始进行对数变换...
  已对 'PlotRatio' 应用 log1p 转换 -> 'log_PlotRatio'
  已对 '楼栋总数' 应用 log1p 转换 -> 'log_楼栋总数'
  已对 'HeatingFee' 应用 log1p 转换 -> 'log_HeatingFee'
  已对 'ParkingSpots' 应用 log1p 转换 -> 'log_ParkingSpots'
  已对 '房屋总数' 应用 log1p 转换 -> 'log_房屋总数'
  已对 'PropertyFee' 应用 log1p 转换 -> 'log_PropertyFee'
  已对 '距离中心_公里' 应用 log1p 转换 -> 'log_距离中心_公里'
  已对 '距离中心_公里_平方' 应用 log1p 转换 -> 'log_距离中心_公里_平方'
  已对 '平均每栋房屋数' 应用 log1p 转换 -> 'log_平均每栋房屋数'
  已对 '停车位与房屋总数比' 应用 log1p 转换 -> 'log_停车位与房屋总数比'
  已对 '建筑面积' 应用 log1p 转换 -> 'log_建筑面积'
  已对 '套内面积' 应用 log1p 转换 -> 'log_套内面积'
  已对 'Location_Tar

In [103]:

# 步骤 5: 特征选择 (LassoCV)

print("\n--- 步骤 5: 使用 LassoCV 进行特征选择 ---")

# 使用 6 折交叉验证
lasso_cv_selector = LassoCV(
    cv=6, 
    random_state=111, 
    max_iter=5000, 
    n_jobs=1
)

# 在完整的 (清理后的) 训练数据上拟合
lasso_cv_selector.fit(X_train_clean_scaled, y_train_clean_log)

print(f"LassoCV 选出的最佳 alpha: {lasso_cv_selector.alpha_:.6f}")

# 创建被选中特征的掩码
mask_selected = lasso_cv_selector.coef_ != 0
num_selected = mask_selected.sum()
num_total = len(mask_selected)

print(f"LassoCV 选出了 {num_selected} 个特征 (总共 {num_total} 个)")

# 应用特征掩码
X_train_selected = X_train_scaled[:, mask_selected]
X_val_selected = X_val_scaled[:, mask_selected]
X_train_clean_selected = X_train_clean_scaled[:, mask_selected]
X_test_kaggle_selected = X_test_kaggle_scaled[:, mask_selected]



--- 步骤 5: 使用 LassoCV 进行特征选择 ---
LassoCV 选出的最佳 alpha: 0.001017
LassoCV 选出了 92 个特征 (总共 725 个)


In [104]:


# 步骤 6: 建模、调参与评估

print("\n--- 步骤 6: 建模、调参与评估 ---")

# 准备 6 折交叉验证
cv_6 = KFold(n_splits=6, shuffle=True, random_state=111)

# 自定义评分函数 
# 注意：GridSearchCV 和 cross_val_score 默认 "分数越大越好"，而 MAE 是 "越小越好"，所以返回负数
def mae_original_scorer(y_true_log, y_pred_log):
    y_true_orig = np.expm1(y_true_log)
    y_pred_orig = np.expm1(y_pred_log)
    mae = mean_absolute_error(y_true_orig, y_pred_orig)
    return -mae # 返回负MAE

mae_scorer = make_scorer(mae_original_scorer, greater_is_better=True)

# 存储结果的字典
results = {}
best_models = {} # 存储最佳模型对象

# --- 辅助函数：用于计算和打印 IS 和 OOS 指标 ---
def evaluate_model(model, model_name):
    # 1. In-Sample (IS) 评估 (在 X_train_selected 上)
    y_pred_log_is = model.predict(X_train_selected)
    y_pred_orig_is = np.expm1(y_pred_log_is)
    is_mae = mean_absolute_error(y_train_orig, y_pred_orig_is)
    is_rmse = np.sqrt(mean_squared_error(y_train_orig, y_pred_orig_is))
    
    # 2. Out-of-Sample (OOS) 评估 (在 X_val_selected 上)
    y_pred_log_oos = model.predict(X_val_selected)
    y_pred_orig_oos = np.expm1(y_pred_log_oos)
    oos_mae = mean_absolute_error(y_val_orig, y_pred_orig_oos)
    oos_rmse = np.sqrt(mean_squared_error(y_val_orig, y_pred_orig_oos))
    
    return {
        "IS_MAE": is_mae, "IS_RMSE": is_rmse,
        "OOS_MAE": oos_mae, "OOS_RMSE": oos_rmse,
        "CV_MAE": np.nan, "CV_RMSE": np.nan # CV 将单独计算
    }

# --- 6.1: OLS (Linear Regression) ---
print("  正在训练 OLS...")
model_ols = LinearRegression()
model_ols.fit(X_train_selected, y_train_log)

# 评估 IS 和 OOS
results['OLS'] = evaluate_model(model_ols, 'OLS')

# 评估 CV (使用完整的清理后数据)
cv_scores = cross_val_score(model_ols, X_train_clean_selected, y_train_clean_log, 
                            cv=cv_6, scoring=mae_scorer, n_jobs=-1)
results['OLS']['CV_MAE'] = -np.mean(cv_scores)
best_models['OLS'] = model_ols


# --- 6.2: Lasso ---
print("  正在调优 Lasso...")
param_grid_lasso = {'alpha': np.logspace(-6, -1, 10)}
grid_lasso = GridSearchCV(
    Lasso(max_iter=5000, random_state=111),
    param_grid_lasso,
    cv=cv_6,
    scoring=mae_scorer,
    n_jobs=-1
)
# 在完整的清理后数据上拟合
grid_lasso.fit(X_train_clean_selected, y_train_clean_log)
model_lasso_best = grid_lasso.best_estimator_

# 评估 IS 和 OOS
results['Lasso'] = evaluate_model(model_lasso_best, 'Lasso')
# CV 分数直接来自 GridSearch
results['Lasso']['CV_MAE'] = -grid_lasso.best_score_
best_models['Lasso'] = model_lasso_best
print(f"    Lasso 最佳 alpha: {grid_lasso.best_params_['alpha']:.6f}")


# --- 6.3: Ridge  ---
print("  正在精调 Ridge ...")
param_grid_ridge_fine = {'alpha': np.logspace(-6, 0, 10)} 
grid_ridge = GridSearchCV(
    Ridge(random_state=111),
    param_grid_ridge_fine, 
    cv=cv_6,
    scoring=mae_scorer,
    n_jobs=-1
)
grid_ridge.fit(X_train_clean_selected, y_train_clean_log)
model_ridge_best = grid_ridge.best_estimator_


results['Ridge'] = evaluate_model(model_ridge_best, 'Ridge')
results['Ridge']['CV_MAE'] = -grid_ridge.best_score_
best_models['Ridge'] = model_ridge_best
print(f"    Ridge 精调后最佳 alpha: {grid_ridge.best_params_['alpha']:.6f}")

# --- 6.4: ElasticNet---
print("  正在调优 ElasticNet...")
param_grid_enet = {
    'alpha': np.logspace(-6, -1, 6),
    'l1_ratio': [0.1, 0.5, 0.9, 0.95, 1.0]
}
grid_enet = GridSearchCV(
    ElasticNet(max_iter=5000, random_state=111),
    param_grid_enet,
    cv=cv_6,
    scoring=mae_scorer,
    n_jobs=-1
)
grid_enet.fit(X_train_clean_selected, y_train_clean_log)
model_enet_best = grid_enet.best_estimator_

# 评估 IS 和 OOS
results['ElasticNet'] = evaluate_model(model_enet_best, 'ElasticNet')
# CV 分数
results['ElasticNet']['CV_MAE'] = -grid_enet.best_score_
best_models['ElasticNet'] = model_enet_best
print(f"    ElasticNet 最佳 alpha: {grid_enet.best_params_['alpha']:.6f}")
print(f"    ElasticNet 最佳 l1_ratio: {grid_enet.best_params_['l1_ratio']:.2f}")



import lightgbm as lgb
print("  正在调优 LightGBM...")

param_grid_lgbm = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [20, 31, 40],
    'subsample': [0.8] 
}

grid_lgbm = GridSearchCV(
    
    lgb.LGBMRegressor(random_state=111, n_jobs=1, verbose=-1, subsample_freq=1),
    param_grid_lgbm,
    cv=cv_6,
    scoring=mae_scorer,
    n_jobs=-1,
    verbose=0 
)

# 在完整的清理后数据上拟合
grid_lgbm.fit(X_train_clean_selected, y_train_clean_log)
model_lgbm_best = grid_lgbm.best_estimator_

# 评估 IS 和 OOS
results['LightGBM'] = evaluate_model(model_lgbm_best, 'LightGBM')
# CV 分数
results['LightGBM']['CV_MAE'] = -grid_lgbm.best_score_
best_models['LightGBM'] = model_lgbm_best
print(f"    LGBM 最佳参数: {grid_lgbm.best_params_}")


# 步骤 7: 结果报告 

print("\n--- 步骤 7: 最终结果报告 ---")

# --- 找到最佳的 *线性* 模型 ---

linear_models_to_compare = {
    'OLS': results['OLS']['CV_MAE'],
    'Lasso': results['Lasso']['CV_MAE'],
    'Ridge': results['Ridge']['CV_MAE'],
    'ElasticNet': results['ElasticNet']['CV_MAE']
}
best_linear_model_name = min(linear_models_to_compare, key=linear_models_to_compare.get)

# 将最佳线性模型的数据复制到 'Best Linear Model' 键
results['Best Linear Model'] = results[best_linear_model_name]
best_models['Best Linear Model'] = best_models[best_linear_model_name]

print(f"最佳 线性 模型 (基于 CV MAE): {best_linear_model_name}")

# --- 准备 MAE 报告 ---
report_data_mae = {
    'In-Sample MAE': [
        results['OLS']['IS_MAE'],
        results['Lasso']['IS_MAE'],
        results['Ridge']['IS_MAE'],
        results['Best Linear Model']['IS_MAE'],
        results['LightGBM']['IS_MAE']  
    ],
    'Out-of-Sample MAE': [
        results['OLS']['OOS_MAE'],
        results['Lasso']['OOS_MAE'],
        results['Ridge']['OOS_MAE'],
        results['Best Linear Model']['OOS_MAE'],
        results['LightGBM']['OOS_MAE']  
    ],
    '6-Fold CV MAE': [
        results['OLS']['CV_MAE'],
        results['Lasso']['CV_MAE'],
        results['Ridge']['CV_MAE'],
        results['Best Linear Model']['CV_MAE'],
        results['LightGBM']['CV_MAE']  
    ]
}
df_report_mae = pd.DataFrame(
    report_data_mae,
    index=['OLS', 'Lasso', 'Ridge', 'Best Linear Model', 'LightGBM'] # <-- 新增
)

# --- 准备 RMSE 报告 ("RMAE") ---
report_data_rmse = {
    'In-Sample RMSE': [
        results['OLS']['IS_RMSE'],
        results['Lasso']['IS_RMSE'],
        results['Ridge']['IS_RMSE'],
        results['Best Linear Model']['IS_RMSE'],
        results['LightGBM']['IS_RMSE']  # <-- 新增
    ],
    'Out-of-Sample RMSE': [
        results['OLS']['OOS_RMSE'],
        results['Lasso']['OOS_RMSE'],
        results['Ridge']['OOS_RMSE'],
        results['Best Linear Model']['OOS_RMSE'],
        results['LightGBM']['OOS_RMSE']  # <-- 新增
    ]
}
df_report_rmse = pd.DataFrame(
    report_data_rmse,
    index=['OLS', 'Lasso', 'Ridge', 'Best Linear Model', 'LightGBM']
)

print("\n" + "="*40)
print(" 性能报告 (MAE - 原始价格) ")
print("="*40)
print(df_report_mae)

print("\n" + "="*40)
print(" 性能报告 (RMSE - 原始价格) ")
print("="*40)
print(df_report_rmse)

print("\n" + "="*40)
print(f"报告所用的总预测样本数 (移除y异常值后): {len(y_train_clean_orig)}")
print("="*40)





--- 步骤 6: 建模、调参与评估 ---
  正在训练 OLS...
  正在调优 Lasso...
    Lasso 最佳 alpha: 0.000046
  正在精调 Ridge ...
    Ridge 精调后最佳 alpha: 0.000001
  正在调优 ElasticNet...
    ElasticNet 最佳 alpha: 0.000100
    ElasticNet 最佳 l1_ratio: 1.00
  正在调优 LightGBM...
    LGBM 最佳参数: {'learning_rate': 0.1, 'n_estimators': 500, 'num_leaves': 40, 'subsample': 0.8}

--- 步骤 7: 最终结果报告 ---
最佳 线性 模型 (基于 CV MAE): ElasticNet

 性能报告 (MAE - 原始价格) 
                   In-Sample MAE  Out-of-Sample MAE  6-Fold CV MAE
OLS                   471087.256         469931.274     471480.134
Lasso                 471265.017         469349.701     471381.119
Ridge                 471339.828         469500.234     471480.134
Best Linear Model     471274.902         469259.371     471370.248
LightGBM              213423.555         213989.713     241424.393

 性能报告 (RMSE - 原始价格) 
                   In-Sample RMSE  Out-of-Sample RMSE
OLS                    847721.839          850417.140
Lasso                  848856.335          849736.184
Ridg

In [105]:
import os

# 步骤 8: 为 *所有* 模型生成 Kaggle 提交文件

print("\n--- 步骤 8: 生成 Kaggle 提交文件 ---")

# 1. 定义输出文件夹
output_dir = './output'
# 2. 创建文件夹 
os.makedirs(output_dir, exist_ok=True)
print(f"文件将保存到: '{output_dir}' 文件夹")

# 3. 遍历 `best_models` 字典中的每一个模型
#    (这包括 OLS, Lasso, Ridge, ElasticNet, LightGBM, 和 'Best Linear Model')
for model_name, final_model in best_models.items():
    
    print(f"  正在为模型 '{model_name}' 生成预测...")
    
    # 4. 在 Kaggle 测试集上预测 (已缩放、已选择特征)
    y_kaggle_pred_log = final_model.predict(X_test_kaggle_selected)

    # 5. 转换回原始价格
    y_kaggle_pred_orig = np.expm1(y_kaggle_pred_log)

    # 6. 创建提交文件
    submission = pd.DataFrame({
        'ID': test_ids_price,
        'Price': y_kaggle_pred_orig
    })

    # 7. 检查是否有负数预测
    submission['Price'] = submission['Price'].clip(lower=0)

   
    safe_model_name = model_name.replace(' ', '_') # 替换空格
    file_name = f'submission_price_{safe_model_name}.csv'
    file_path = os.path.join(output_dir, file_name)

    #  保存文件
    submission.to_csv(file_path, index=False)
    print(f"    已生成: '{file_path}'")

print("\n--- 所有提交文件已生成完毕 ---")


--- 步骤 8: 生成 Kaggle 提交文件 ---
文件将保存到: './output' 文件夹
  正在为模型 'OLS' 生成预测...
    已生成: './output\submission_price_OLS.csv'
  正在为模型 'Lasso' 生成预测...
    已生成: './output\submission_price_Lasso.csv'
  正在为模型 'Ridge' 生成预测...
    已生成: './output\submission_price_Ridge.csv'
  正在为模型 'ElasticNet' 生成预测...
    已生成: './output\submission_price_ElasticNet.csv'
  正在为模型 'LightGBM' 生成预测...
    已生成: './output\submission_price_LightGBM.csv'
  正在为模型 'Best Linear Model' 生成预测...
    已生成: './output\submission_price_Best_Linear_Model.csv'

--- 所有提交文件已生成完毕 ---


# rent数据集

In [106]:
from sklearn.metrics import root_mean_squared_error
# --- 辅助函数：地理位置  ---

def handle_district_r(df, n_train): 
    """
    处理 '区县' 列 (数字型分类特征)。
    1. 填充 NaN 为 '未知'。
    2. 转换为 'string' 类型。
    """
    col = '区县'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        return df
    df[col] = df[col].fillna('未知')
    df[col] = df[col].astype(str)
    return df

def handle_board_r(df, n_train): 
    """
    处理 '板块' 列。
    1. 填充 NaN 为 '未知'。
    2. 转换为 'string' 类型。
    """
    col = '板块'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        return df
    df[col] = df[col].fillna('未知')
    df[col] = df[col].astype(str)
    return df

def handle_ring_road_r(df, n_train): 
    """
    处理 '环线位置' 列:
    1. 填充NaN为 '未知'
    2. 进行有序编码
    """
    col = '环线位置'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df['环线_ordinal'] = 5
        return df

    df[col] = df[col].fillna('未知')

    ring_map = {
        '二环内': 1, '二至三环': 1,
        '三至四环': 2, '四至五环': 2,
        '五至六环': 3,
        '六环外': 4,
        '内环内': 1, '内环至中环': 2,
        '中环至外环': 3, '内环至外环': 3, # Grouped similar shanghai rings
        '外环外': 4,
        '未知': 4 # Treat unknown as outer ring
    }

    df['环线_ordinal'] = df[col].map(ring_map).fillna(5).astype(int) # Use 5 for values not in map
    df = df.drop(col, axis=1, errors='ignore')
    return df

# --- 地理空间特征  ---
def compute_city_center_and_distances_r(df, n_train):
    """计算城市中心点及房源到中心点的距离。 (基于 n_train 计算中心)"""
    df_out = df.copy()
    print("Calculating geospatial features (distance to center)...")
    if '城市' not in df_out.columns or 'lon' not in df_out.columns or 'lat' not in df_out.columns:
        print("  Warning: Missing '城市', 'lon', or 'lat'. Cannot calculate distance features.")
        
        df_out['距离中心_公里'] = 15.0 
        df_out['距离中心_公里_平方'] = 225.0
        return df_out

    train_part = df_out.iloc[:n_train]
    city_centers = train_part.groupby('城市', observed=True)[['lon', 'lat']].mean().reset_index()
    city_centers = city_centers.rename(columns={'lon': 'center_lon', 'lat': 'center_lat'})
    print(f"  Calculated centers for {len(city_centers)} cities (based on training set)." )

    df_out = pd.merge(df_out, city_centers, on='城市', how='left')

    def compute_distance(row):
        if pd.isna(row['lat']) or pd.isna(row['lon']) or pd.isna(row['center_lat']) or pd.isna(row['center_lon']):
            return np.nan
        try:
            return geodesic((row['lat'], row['lon']), (row['center_lat'], row['center_lon'])).km
        except ValueError:
            return np.nan

    df_out['距离中心_公里'] = df_out.apply(compute_distance, axis=1)

   
    median_dist_train = df_out.iloc[:n_train]['距离中心_公里'].median()
    if pd.isna(median_dist_train): median_dist_train = 15.0 # Fallback median
    df_out['距离中心_公里'].fillna(median_dist_train, inplace=True)
    print(f"  Calculated '距离中心_公里', imputed NaNs using train median ({median_dist_train:.2f} km).")

    df_out['距离中心_公里_平方'] = df_out['距离中心_公里'] ** 2
    df_out = df_out.drop(columns=['center_lon', 'center_lat'], errors='ignore')
    print("Distance features created.")
    return df_out

def create_geo_clusters_r(df, n_train, n_clusters=20, city_col='城市', lon_col='lon', lat_col='lat'):
    """为每个城市计算地理聚类并进行独热编码。(Fit ONLY on n_train)"""
    df_processed = df.copy()
    print(f"Creating geo clusters ({n_clusters} per city)...")

    required_cols = [city_col, lon_col, lat_col]
    if not all(col in df_processed.columns for col in required_cols):
        print(f"  Error: Missing required columns: {required_cols}. Skipping clustering...")
        return df_processed

    
    train_part_coords = df_processed.iloc[:n_train]
    median_lon_train = train_part_coords[lon_col].median()
    median_lat_train = train_part_coords[lat_col].median()
    if pd.isna(median_lon_train): median_lon_train = 116.4 
    if pd.isna(median_lat_train): median_lat_train = 39.9 

    df_processed[lon_col].fillna(median_lon_train, inplace=True)
    df_processed[lat_col].fillna(median_lat_train, inplace=True)

    cluster_col_temp = '地理聚类_temp'
    df_processed[cluster_col_temp] = -1
    all_city_labels = df_processed[city_col].unique()

    kmeans_models = {} 

    for city_label in all_city_labels:
        city_mask = (df_processed[city_col] == city_label)
        city_data_train = df_processed.loc[city_mask & (df_processed.index < n_train), [lon_col, lat_col]]

        if len(city_data_train) >= n_clusters:
            try:
                kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
                kmeans.fit(city_data_train)
                kmeans_models[city_label] = kmeans 
            except Exception as e:
                print(f"    Warning: KMeans failed for city {city_label}: {e}")
        elif len(city_data_train) > 0:
             print(f"    Warning: Not enough training data points ({len(city_data_train)}) for city {city_label}. Skipping clustering.")

    # Now predict for ALL data using the stored models
    for city_label, kmeans in kmeans_models.items():
        city_mask = (df_processed[city_col] == city_label)
        city_data_all = df_processed.loc[city_mask, [lon_col, lat_col]]
        if not city_data_all.empty:
            clusters_all = kmeans.predict(city_data_all)
            df_processed.loc[city_data_all.index, cluster_col_temp] = clusters_all

    combined_label_col = '地理聚类_带城市'
    df_processed[combined_label_col] = np.where(
         (df_processed[cluster_col_temp] != -1) & df_processed[city_col].notna(),
         'C' + df_processed[city_col].astype(str).str.split('.').str[0] + '_' + df_processed[cluster_col_temp].astype(str),
         'GeoCluster_Unknown'
    )
    df_processed = pd.get_dummies(df_processed, columns=[combined_label_col], prefix='GeoCluster', drop_first=False)
    df_processed = df_processed.drop(columns=[cluster_col_temp], errors='ignore')
    print("Geo cluster features created.")
    return df_processed

# --- 辅助函数：房屋基本属性 ---

def handle_rent_house_type_r(df, n_train): 
    """
    处理 '户型' 列:
    1. 预处理 '.', '车库', '未知室', '居室', '房间'
    2. 提取 '室', '厅', '卫' 数量
    3. 使用【训练集】中位数填充所有NaN值
    """
    print("Processing [户型]...")
    col = '户型'
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        # Create dummy columns if missing
        df['卧室数'] = 2.0
        df['客厅数'] = 1.0
        df['卫生间数'] = 1.0
        return df

    house_type = df[col].copy()
    house_type = house_type.replace('.', np.nan)
    house_type = house_type.astype(str) # Ensure string type before replace
    house_type = house_type.replace('车库', '0室0厅0卫', regex=False)
    house_type = house_type.replace('未知室', '0室', regex=False)
    house_type = house_type.replace('房间', '室', regex=False)
    house_type = house_type.replace('居室', '室', regex=False)

    # Use regex that handles missing parts better
    layout_info = house_type.str.extract(r'(\d+)[室房居](?:(\d+)厅)?(?:.*?(\d+)卫)?', expand=True)
    layout_info.columns = ['卧室数', '客厅数', '卫生间数']
    df_extracted = pd.DataFrame(index=df.index)
    for c in layout_info.columns:
        df_extracted[c] = pd.to_numeric(layout_info[c], errors='coerce')

    print(f"  Calculating medians based on first {n_train} rows...")
    train_part = df_extracted.iloc[:n_train]
    medians = {}
    fallback_medians = {'卧室数': 2.0, '客厅数': 1.0, '卫生间数': 1.0}

    for col_name in ['卧室数', '客厅数', '卫生间数']:
        # Calculate median excluding 0 for more representative imputation
        temp_col_for_median = train_part[col_name].replace(0, np.nan).dropna()
        median_val = temp_col_for_median.median() if not temp_col_for_median.empty else np.nan

        if pd.isna(median_val) or median_val == 0:
            median_val = fallback_medians[col_name]
            print(f"    Warning: Median for '{col_name}' failed or was 0. Using fallback {median_val}")
        else:
            print(f"    Median for '{col_name}' (excluding 0): {median_val}")
        medians[col_name] = median_val
        # Impute NaNs in the FULL extracted dataframe
        df_extracted[col_name].fillna(median_val, inplace=True)

    # Add the processed columns to the original dataframe
    df[['卧室数', '客厅数', '卫生间数']] = df_extracted[['卧室数', '客厅数', '卫生间数']]
    print("Imputation complete.")

    return df.drop(col, axis=1, errors='ignore')

def handle_listing_date_r(df, n_train): 
    """
    处理 '交易时间' 列 (YYYY-MM-DD 格式)
    1. 提取 'Listing_Year' (分类)
    2. 提取 'Listing_Month' (分类)
    3. 用【训练集】众数填充未知值 (NaN)
    """
    col = '交易时间'
    print(f"Processing [{col}] (Extracting Year and Month categories)...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
    
        df['Listing_Year'] = '2024'
        df['Listing_Month'] = '6'
        return df

    parsed_dates = pd.to_datetime(df[col], errors='coerce')
    df['Listing_Year'] = parsed_dates.dt.year.astype(str)
    df['Listing_Month'] = parsed_dates.dt.month.astype(str)

    # --- 仅使用训练集进行插值 ---
    train_part = df.iloc[:n_train]
    year_mode = train_part['Listing_Year'].mode()
    month_mode = train_part['Listing_Month'].mode()

    fill_year = year_mode[0] if not year_mode.empty else '2024' # Fallback mode
    fill_month = month_mode[0] if not month_mode.empty else '6'  # Fallback mode

    df['Listing_Year'] = df['Listing_Year'].replace('NaT', fill_year)
    df['Listing_Month'] = df['Listing_Month'].replace('NaT', fill_month)

    print(f"  Created 'Listing_Year' (imputed with train mode: {fill_year})")
    print(f"  Created 'Listing_Month' (imputed with train mode: {fill_month})")

    return df.drop(col, axis=1, errors='ignore')

def handle_rent_floor(df, n_train): 
    """
    处理 '楼层' 列 (Rent version, revised)
    Imputation based on n_train.
    """
    print("Processing [楼层]...")
    col = '楼层'
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        # Create dummy/default columns
        df['总楼层数'] = 18.0
        df['楼层位置'] = '中楼层'
        df['相对楼层'] = 0.5
        df['绝对楼层'] = 9.0
        df['总楼层类型'] = '中高层(12-19)'
        df['地下室标志'] = 0
        df['顶层标志'] = 0
        df['相对楼层_总楼层数'] = 9.0
        df['相对楼层_平方'] = 0.25
        df['总楼层数_平方'] = 324.0
        df['相对楼层分箱'] = '中(0.4-0.6)'
        df['总楼层数分箱'] = '12-19层'
        df['楼层位置_总楼层类型'] = '中楼层_中高层(12-19)'
        return df

    floor_col = df[col].copy().fillna('未知')
    floor_col = floor_col.replace('.', '未知')
    floor_col = floor_col.astype(str) # Ensure string for extraction

    # --- 1. Extract Total Floors ---
    df['总楼层数'] = floor_col.str.extract(r'/(\d+)层?', expand=False).astype(float)

    # --- 仅使用训练集进行插值 ---
    train_part = df.iloc[:n_train]
    median_total_floor_train = train_part.loc[train_part['总楼层数'] > 0, '总楼层数'].median()
    if pd.isna(median_total_floor_train) or median_total_floor_train == 0:
        median_total_floor_train = 18.0 
    print(f"  Median '总楼层数' (train): {median_total_floor_train}")
    df['总楼层数'].fillna(median_total_floor_train, inplace=True)
    df['总楼层数'] = df['总楼层数'].replace(0, median_total_floor_train)

    # --- 2. 提取精确绝对楼层 ---
    abs_floor_1 = floor_col.str.extract(r'^(\d+)/', expand=False).astype(float)
    abs_floor_2 = floor_col.str.extract(r'地下(\d+)层', expand=False).astype(float) * -1
    abs_floor_3 = floor_col.str.contains('地下室/').map({True: -1.0, False: np.nan})
    df['绝对楼层_precise'] = abs_floor_1.fillna(abs_floor_2).fillna(abs_floor_3)

    # --- 3. 提取楼层位置 ---
    df['楼层位置'] = floor_col.str.extract(r'(地下室|地下|低楼层|中楼层|高楼层)', expand=False)
    df['楼层位置'] = df['楼层位置'].replace('地下', '地下室')
    is_top = (df['绝对楼层_precise'] == df['总楼层数']) & (df['总楼层数'] > 0)
    is_bottom = (df['绝对楼层_precise'] == 1)
    is_basement = (df['绝对楼层_precise'] < 0)
    df.loc[is_top, '楼层位置'] = '顶层'
    df.loc[is_bottom, '楼层位置'] = '底层'
    df.loc[is_basement, '楼层位置'] = '地下室'

   
    mode_floor_pos_train = df.iloc[:n_train]['楼层位置'].mode()
    fill_pos_train = mode_floor_pos_train[0] if not mode_floor_pos_train.empty else '中楼层'
    df['楼层位置'].fillna(fill_pos_train, inplace=True)

    # --- 4. 估算相对下限 ---
    floor_position_map = {'地下室': 0.0, '底层': 0.05, '低楼层': 0.25, '中楼层': 0.5, '高楼层': 0.75, '顶层': 1.0, '未知': 0.5}
    df['相对楼层'] = df['楼层位置'].map(floor_position_map)
    precise_relative = (df['绝对楼层_precise'] / df['总楼层数']).replace([np.inf, -np.inf], np.nan)
    df['相对楼层'] = precise_relative.fillna(df['相对楼层'])
    df['相对楼层'] = df['相对楼层'].clip(lower=-0.1) # Clip potential negatives from 지하

    df['绝对楼层'] = df['绝对楼层_precise']
    estimated_absolute = (df['相对楼层'] * df['总楼层数']).round()
    df['绝对楼层'].fillna(estimated_absolute, inplace=True)
    # --- 最终绝对下限插值仅使用训练集中位数 ---
    median_abs_floor_train = df.iloc[:n_train].loc[df.iloc[:n_train]['绝对楼层'] > 0, '绝对楼层'].median()
    if pd.isna(median_abs_floor_train): median_abs_floor_train = 3.0 # Fallback
    df['绝对楼层'].fillna(median_abs_floor_train, inplace=True)

    # --- 6. 创建衍生特征 ---
    df['总楼层类型'] = pd.cut(df['总楼层数'], bins=[0, 6, 11, 19, 100], labels=['低层(1-6)', '小高层(7-11)', '中高层(12-19)', '高层(20+)'], include_lowest=True, right=True)
    df['总楼层类型'] = df['总楼层类型'].cat.add_categories('未知').fillna('未知')
    df['地下室标志'] = (df['楼层位置'] == '地下室').astype(int)
    df['顶层标志'] = (df['楼层位置'] == '顶层').astype(int)
    df['相对楼层_总楼层数'] = df['相对楼层'] * df['总楼层数']
    df['相对楼层_平方'] = df['相对楼层'] ** 2
    df['总楼层数_平方'] = df['总楼层数'] ** 2
    df['相对楼层分箱'] = pd.cut(df['相对楼层'], bins=[-0.1, 0.2, 0.4, 0.6, 0.8, 1.0], labels=['地下/极低', '低(0.2-0.4)', '中(0.4-0.6)', '高(0.6-0.8)', '极高(0.8-1.0)'], include_lowest=True)
    df['相对楼层分箱'] = df['相对楼层分箱'].cat.add_categories('未知').fillna('未知')
    df['总楼层数分箱'] = pd.cut(df['总楼层数'], bins=[0, 6, 11, 19, 100], labels=['1-6层', '7-11层', '12-19层', '20+层'], include_lowest=True)
    df['总楼层数分箱'] = df['总楼层数分箱'].cat.add_categories('未知').fillna('未知')
    df['楼层位置_总楼层类型'] = df['楼层位置'].astype(str) + '_' + df['总楼层类型'].astype(str)

    print(f"  Floor position distribution (modes): {df.iloc[:n_train]['楼层位置'].mode().tolist()}")

    
    return df.drop([col, '绝对楼层_precise'], axis=1, errors='ignore')

def handle_building_structure_r(df, n_train):  
    """
    处理 '建筑结构' 列 (Multi-Hot)
    """
    print("Processing [建筑结构] (Rent version - Multi-Hot encoding)...")
    col = '建筑结构'
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        # Create dummy columns
        df['is_BType_Tower'] = 0
        df['is_BType_Slab'] = 0
        df['is_BType_Combo'] = 0
        df['is_BType_Bungalow'] = 0
        return df

    structure_col = df[col].copy().fillna('未知')
    structure_col = structure_col.replace(r'^\s*$', '未知', regex=True)
    structure_col = structure_col.replace('.', '未知')
    structure_col = structure_col.astype(str) # Ensure string

    df['is_BType_Tower'] = structure_col.str.contains('塔楼').astype(int)
    df['is_BType_Slab'] = structure_col.str.contains('板楼').astype(int)
    df['is_BType_Combo'] = structure_col.str.contains('塔板结合').astype(int)
    df['is_BType_Bungalow'] = structure_col.str.contains('平房').astype(int)
    print("  Created 'is_BType_Tower', 'is_BType_Slab', 'is_BType_Combo', 'is_BType_Bungalow'.")

    return df.drop(col, axis=1, errors='ignore')

def handle_rent_area(df, n_train): 
    """
    处理 '面积' 列
    """
    print("Processing [面积]...")
    col = '面积'
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[col] = 70.0  
        return df

    series_str = df[col].astype(str).copy()
    cleaned_str_series = series_str.str.replace(r'[\s\xa0\u3000]+|㎡', '', regex=True)
    extracted_area_str = cleaned_str_series.str.extract(r'^(\d+\.?\d*)$', expand=False)
    area_numeric = pd.to_numeric(extracted_area_str, errors='coerce')

    # --- 仅使用训练集中位数进行插值 ---
    train_part = area_numeric.iloc[:n_train]
    valid_areas_train = train_part.dropna()
    valid_areas_train = valid_areas_train[valid_areas_train > 0]
    median_area_train = np.nan
    if not valid_areas_train.empty:
        median_area_train = valid_areas_train.median()
    if pd.isna(median_area_train):
        median_area_train = 70.0 # Fallback median
        print(f"  Warning: Could not calculate valid train median for '{col}'. Using fallback {median_area_train}.")

    filled_area = area_numeric.fillna(median_area_train)
   
    df[col] = filled_area.apply(lambda x: median_area_train if pd.isna(x) or x <= 0 else x)

    print(f"  Cleaned and imputed '{col}' using train median ({median_area_train:.2f}).")
    return df

def handle_rent_orientation(df, n_train): 
    """
    处理 '朝向' 列 (Multi-Hot, Unknown sets all flags to 0).
    """
    print("Processing [朝向]...")
    col = '朝向'
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df['is_朝南'] = 0
        df['is_朝东'] = 0
        df['is_朝西'] = 0
        df['is_朝北'] = 0
        return df

    orientation_col = df[col].copy().fillna('未知')
    orientation_col = orientation_col.replace('.', '未知')
    orientation_col = orientation_col.astype(str) # Ensure string

    df['is_朝南'] = orientation_col.str.contains('南').astype(int)
    df['is_朝东'] = orientation_col.str.contains('东').astype(int)
    df['is_朝西'] = orientation_col.str.contains('西').astype(int)
    df['is_朝北'] = orientation_col.str.contains('北').astype(int)

    unknown_mask = orientation_col.str.contains('未知')
    direction_cols = ['is_朝南', 'is_朝东', 'is_朝西', 'is_朝北']
    df.loc[unknown_mask, direction_cols] = 0

    print("  Created 'is_朝南/东/西/北' features.")
    return df.drop(col, axis=1, errors='ignore')

def handle_rent_decoration(df, n_train): 
    """
    处理 '装修' 列 (精装修 / 非精装修).
    """
    print("Processing [装修]...")
    col = '装修'
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[col] = '非精装修' 
        return df

    deco_col = df[col].copy().fillna('非精装修')
    deco_col = deco_col.astype(str).str.strip()
    df[col] = deco_col.map(lambda x: '精装修' if x == '精装修' else '非精装修')

    print(f"  Processed '{col}' into '精装修'/'非精装修'. Unique values: {df[col].unique()}")
    return df


# --- 辅助函数：租赁特定属性 ---

def handle_rent_elevator(df, n_train): 
    """
    处理 '电梯' 列 (有 / 无).
    """
    print("Processing [电梯]...")
    col = '电梯'
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[col] = '无' 
        return df

    elevator_col = df[col].copy().fillna('无')
    elevator_col = elevator_col.astype(str).str.strip()
    df[col] = elevator_col.map(lambda x: '有' if x == '有' else '无')

    print(f"  Processed '{col}' into '有'/'无'. Unique values: {df[col].unique()}")
    return df

def handle_property_type_r(df, n_train): 
    """
    处理 '物业类别' 列 (Multi-Hot).
    """
    col = '物业类别'
    print(f"Processing [{col}] (Multi-Hot encoding)...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        # Create dummy columns
        df['is_Property_Res'] = 0
        df['is_Property_Com'] = 0
        df['is_Property_Villa'] = 0
        df['is_Property_Apt'] = 0
        df['is_Property_Storage'] = 0
        df['is_Property_Indust'] = 0
        return df

    prop_col = df[col].copy().fillna('未知')
    prop_col = prop_col.replace(r'^\s*\(空白\)\s*$', '未知', regex=True)
    prop_col = prop_col.replace(r'^\s*$', '未知', regex=True)
    prop_col = prop_col.replace('.', '未知')
    prop_col = prop_col.astype(str).str.replace(' ', '')

    res_terms = '普通住宅|住宅|平房|四合院|花园洋房|里弄|老公寓|商住两用'
    com_terms = '商业|底商|写字楼|办公|商住两用'
    villa_terms = '别墅'
    apt_terms = '公寓'
    storage_terms = '车库|车位|仓储|库房'
    indust_terms = '工业厂房'

    df['is_Property_Res'] = prop_col.str.contains(res_terms).astype(int)
    df['is_Property_Com'] = prop_col.str.contains(com_terms).astype(int)
    df['is_Property_Villa'] = prop_col.str.contains(villa_terms).astype(int)
    df['is_Property_Apt'] = prop_col.str.contains(apt_terms).astype(int)
    df['is_Property_Storage'] = prop_col.str.contains(storage_terms).astype(int)
    df['is_Property_Indust'] = prop_col.str.contains(indust_terms).astype(int)

    unknown_mask = (prop_col == '未知')
    flag_cols = ['is_Property_Res', 'is_Property_Com', 'is_Property_Villa', 'is_Property_Apt', 'is_Property_Storage', 'is_Property_Indust']
    df.loc[unknown_mask, flag_cols] = 0

    print(f"  Created {len(flag_cols)} Multi-Hot features.")
    return df.drop(col, axis=1, errors='ignore')

def handle_transaction_ownership_r(df, n_train): 
    """
    处理 '产权描述' 列 (Grouping).
    """
    col = '产权描述'
    print(f"Processing [{col}] (Grouping)...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[col] = '未知' 
        return df

    prop_col = df[col].copy().fillna('未知')
    prop_col = prop_col.replace(r'^\s*\(空白\)\s*$', '未知', regex=True)
    prop_col = prop_col.replace(r'^\s*$', '未知', regex=True)
    prop_col = prop_col.replace('.', '未知')
    prop_col = prop_col.astype(str)

    conditions = [
        prop_col.str.contains('经济适用房'),
        prop_col.str.contains('安置房|回迁房|还建房'),
        prop_col.str.contains('公房|房改房|央产房|公租房'),
        prop_col.str.contains('商品房'),
        prop_col.str.contains('乡产|使用权|共有产权房|军产|宅基房|廉租房|校产|私产|集资房'),
        prop_col == '未知'
    ]

    choices = [
        '经济适用房',
        '安置房',
        '政策房',
        '商品房',
        '其他稀有产权',
        '未知'
    ]

    df[col] = np.select(conditions, choices, default='其他稀有产权')

    print(f"  Processed categories into: {np.unique(df[col])}")
    return df

def handle_payment_method_r(df, n_train): 
    """
    处理 '付款方式' 列
    """
    col = '付款方式'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[col] = '未知' 
        return df

    payment_col = df[col].copy().fillna('未知')
    payment_col = payment_col.astype(str).str.strip()

    valid_categories = ['半年付价', '季付价', '年付价', '双月付价', '月付价']

    df[col] = np.where(payment_col.isin(valid_categories), payment_col, '未知')

    print(f"  Processed '{col}'. Unique values: {df[col].unique()}")
    return df

def handle_lease_type_r(df, n_train): 
    """
    处理 '租赁方式' 列
    """
    col = '租赁方式'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[col] = '未知' 
        return df

    df[col] = df[col].fillna('未知')
    print(f"  Processed '{col}'. Unique values: {df[col].unique()}")
    return df

def parse_term(term_str):
    """辅助函数：解析 '租期' 字符串为月数"""
    term_str = str(term_str).replace(' ', '')
    if term_str in ['(缺失值)', '未知', 'nan']:
        return np.nan
    try:
        range_month = re.search(r'(\d+)~(\d+)个月', term_str)
        if range_month: return (float(range_month.group(1)) + float(range_month.group(2))) / 2
        range_year = re.search(r'(\d+)~(\d+)年', term_str)
        if range_year: return ((float(range_year.group(1)) + float(range_year.group(2))) / 2) * 12
        within_year = re.search(r'(\d+)年以内', term_str)
        if within_year: return float(within_year.group(1)) * 12 / 2
        within_month = re.search(r'(\d+)个月以内', term_str)
        if within_month: return float(within_month.group(1)) / 2
        above_year = re.search(r'(\d+)年以上', term_str)
        if above_year: return float(above_year.group(1)) * 12 + 6
        above_month = re.search(r'(\d+)个月以上', term_str)
        if above_month: return float(above_month.group(1)) + 6
        exact_year = re.search(r'^(\d+)年$', term_str)
        if exact_year: return float(exact_year.group(1)) * 12
        exact_month = re.search(r'^(\d+)个月$', term_str)
        if exact_month: return float(exact_month.group(1))
        return np.nan
    except Exception:
        return np.nan

def handle_lease_term(df, n_train): 
    """处理 '租期' 列 中位数填充"""
    col = '租期'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df['租期_月'] = 6.0 
        return df

    s = df[col].copy().astype(str)
    df['租期_月'] = s.apply(parse_term)

    # --- 仅使用训练集中位数进行插值 ---
    median_term_train = df.iloc[:n_train]['租期_月'].median()
    if pd.isna(median_term_train) or median_term_train == 0:
        print(f"  Warning: Could not calculate valid train median for '{col}'. Using fallback 6.0.")
        median_term_train = 6.0 # Fallback median

    df['租期_月'].fillna(median_term_train, inplace=True)
    print(f"  Created '租期_月' (imputed with train median: {median_term_train:.1f}).")
    return df.drop(col, axis=1, errors='ignore')


# --- 辅助函数：小区与费用 ---

def handle_developer_r(df, n_train): 
    """处理 '开发商' 列:"""
    col = '开发商'
    new_col = 'has_Developer'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col] = 0
        return df

    no_developer_list = ['无', '无开发公司', '无开发商', '暂无信息', '暂无资料','暂无']
    df[col] = df[col].fillna('无')
    df[new_col] = (~df[col].isin(no_developer_list)).astype(int)
    df = df.drop(col, axis=1, errors='ignore')
    print(f"  Created new feature '{new_col}'")
    return df

def handle_property_management_r(df, n_train): 
    """处理 '物业公司' 列:"""
    col = '物业公司'
    new_col = 'has_PropertyMgmt'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col] = 0
        return df

    no_management_list = ['无', '无物业', '无物业管理', '无物业管理服务', '暂时无物业公司', '暂无信息']
    df[col] = df[col].fillna('无物业')
    df[new_col] = (~df[col].isin(no_management_list)).astype(int)
    df = df.drop(col, axis=1, errors='ignore')
    print(f"  Created new feature '{new_col}'")
    return df

def clean_numeric_str(s):
    """辅助函数：从 '1317户' 或 '19栋' 提取数字。"""
    if pd.isna(s): return np.nan
    match = re.search(r'(\d+)', str(s))
    return int(match.group(1)) if match else np.nan

def handle_community_stats(df, n_train): 
    """处理 '房屋总数' 和 '楼栋总数':"""
    houses_col = '房屋总数'
    buildings_col = '楼栋总数'
    print(f"Processing [{houses_col}] and [{buildings_col}]...")

    if houses_col not in df.columns or buildings_col not in df.columns:
        print(f"  Warning: Missing '{houses_col}' or '{buildings_col}'.")
        # Create dummy columns if missing
        df[houses_col] = 1000.0
        df[buildings_col] = 10.0
        df['avg_units_per_building'] = 100.0
        return df

    df[houses_col] = df[houses_col].apply(clean_numeric_str)
    df[buildings_col] = df[buildings_col].apply(clean_numeric_str)

    temp_buildings = df[buildings_col].replace(0, np.nan)
    df['avg_units_per_building'] = df[houses_col] / temp_buildings

    # --- 仅使用训练集中位数进行插值 ---
    train_part = df.iloc[:n_train]
    for col in [houses_col, buildings_col, 'avg_units_per_building']:
        median_val_train = train_part[col].median()
        if pd.isna(median_val_train):
          
            if col == houses_col: median_val_train = 1000.0
            elif col == buildings_col: median_val_train = 10.0
            else: median_val_train = 100.0
            print(f"    Warning: Could not calculate train median for '{col}'. Using fallback {median_val_train}.")
        df[col].fillna(median_val_train, inplace=True)
        print(f"    Imputed '{col}' using train median ({median_val_train:.2f}).")

    print(f"  Cleaned and imputed '{houses_col}', '{buildings_col}', 'avg_units_per_building'.")
    return df

def parse_building_year(s):
    """辅助函数：从字符串中解析年份"""
    if pd.isna(s): return np.nan
    s_str = str(s)
    nums = re.findall(r'(\d{4})', s_str)
    if not nums: return np.nan
    return float(nums[0]) if len(nums) == 1 else (float(nums[0]) + float(nums[-1])) / 2 # Use first/last if range

def handle_building_age(df, n_train, current_year=2025): 
    """处理 '建筑年代' 列:"""
    col = '建筑年代'
    new_col = 'BuildingAge'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col] = 20.0 
        return df

    df['Parsed_Year'] = df[col].apply(parse_building_year)
    df[new_col] = current_year - df['Parsed_Year']

    
    median_age_train = df.iloc[:n_train][new_col].median()
    if pd.isna(median_age_train) or median_age_train < 0:
        print(f"  Warning: 无法计算有效的训练样本中位数。使用备用值20.0。")
        median_age_train = 20.0 # Fallback median age

    df[new_col].fillna(median_age_train, inplace=True)

    df[new_col] = df[new_col].clip(lower=0)

    print(f"  Created new feature '{new_col}' (中位数: {median_age_train:.2f}).")
    df = df.drop([col, 'Parsed_Year'], axis=1, errors='ignore')
    return df

def _parse_fee(s):
    """辅助函数：解析费用"""
    if pd.isna(s): return np.nan
    s_str = str(s).replace(' ', '')
    if '空白' in s_str or '暂无' in s_str: return np.nan
    nums = re.findall(r'(\d+\.?\d*)', s_str)
    if not nums: return np.nan
    return float(nums[0]) if len(nums) == 1 else (float(nums[0]) + float(nums[-1])) / 2

def handle_greenery_rate(df, n_train): 
    """处理 '绿化率' 列 (中位数填充):"""
    col = '绿化率'
    new_col = 'GreeneryRate'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col] = 30.0 
        return df

    s = df[col].astype(str).str.replace(r'[\s%]', '', regex=True)
    s_numeric = pd.to_numeric(s, errors='coerce')

    # --- 仅使用训练集中位数进行插值 ---
    valid_train_rates = s_numeric.iloc[:n_train][s_numeric.iloc[:n_train].between(0, 100, inclusive='both')]
    median_val_train = valid_train_rates.median()
    if pd.isna(median_val_train):
        print(f"  Warning: Could not calculate valid train median green rate. Using fallback 30.0.")
        median_val_train = 30.0 # Fallback

    df[new_col] = s_numeric.fillna(median_val_train)
    # Clip ALL values outside 0-100 range after imputation
    df[new_col] = df[new_col].clip(lower=0, upper=100)

    print(f"  Created new feature '{new_col}' (imputed with train median: {median_val_train:.2f}, clipped to 0-100).")
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_plot_ratio(df, n_train): 
    """处理 '容积率' 列 中位数填充"""
    col = '容积率'
    new_col = 'PlotRatio'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col] = 2.0 # Default
        return df

    s_numeric = pd.to_numeric(df[col], errors='coerce')

    median_val_train = s_numeric.iloc[:n_train].median()
    if pd.isna(median_val_train) or median_val_train <= 0:
        print(f"  Warning: Could not calculate valid train median plot ratio. Using fallback 2.0.")
        median_val_train = 2.0 # Fallback

    df[new_col] = s_numeric.fillna(median_val_train)
    # Clip potentially negative values after imputation
    df[new_col] = df[new_col].clip(lower=0.01) # Ensure positive

    print(f"  Created new feature '{new_col}' (imputed with train median: {median_val_train:.2f}).")
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_property_fee(df, n_train):
    """处理 '物业费' 列 中位数填充"""
    col = '物业费'
    new_col = 'PropertyFee'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col] = 0.0 # Default
        return df

    s_numeric = df[col].apply(_parse_fee)

    median_val_train = s_numeric.iloc[:n_train].median()
    if pd.isna(median_val_train) or median_val_train < 0:
         print(f"  Warning: Could not calculate valid train median property fee. Using fallback 0.0.")
         median_val_train = 0.0 # Fallback

    df[new_col] = s_numeric.fillna(median_val_train)
    # Clip potentially negative values
    df[new_col] = df[new_col].clip(lower=0)

    print(f"  Created new feature '{new_col}' (imputed with train median: {median_val_train:.2f}).")
    df = df.drop(col, axis=1, errors='ignore')
    return df


# --- 辅助函数：能源与设施 ---

def handle_rent_water_supply(df, n_train): 
    """处理 '用水' 列"""
    col = '用水'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df['is_Water_Civil'] = 0
        df['is_Water_Commercial'] = 0
        return df

    s = df[col].fillna('')
    df['is_Water_Civil'] = s.str.contains('民水').astype(int)
    df['is_Water_Commercial'] = s.str.contains('商水').astype(int)
    print("  Created 'is_Water_Civil' and 'is_Water_Commercial' features.")
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_rent_heating(df, n_train): 
    """处理 '采暖' 列"""
    col = '采暖'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df['is_Heating_Central'] = 0
        df['is_Heating_Self'] = 0
        df['is_Heating_None'] = 0
        return df

    s = df[col].fillna('')
    df['is_Heating_Central'] = s.str.contains('集中供暖').astype(int)
    df['is_Heating_Self'] = s.str.contains('自采暖').astype(int)
    df['is_Heating_None'] = s.str.contains('无').astype(int) # Check for '无' instead of '无供暖' for broader match
    print("  Created 'is_Heating_Central', 'is_Heating_Self', 'is_Heating_None' features.")
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_rent_electricity(df, n_train): 
    """处理 '用电' 列"""
    col = '用电'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df['is_Electricity_Civil'] = 0
        df['is_Electricity_Commercial'] = 0
        return df

    s = df[col].fillna('')
    df['is_Electricity_Civil'] = s.str.contains('民电').astype(int)
    df['is_Electricity_Commercial'] = s.str.contains('商电').astype(int)
    print("  Created 'is_Electricity_Civil' and 'is_Electricity_Commercial' features.")
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_gas_supply(df, n_train): 
    """处理 '燃气' 列 (有/无)"""
    col = '燃气'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df['is_Gas_Available'] = 0
        return df

    s = df[col].fillna('未知')
    df['is_Gas_Available'] = (s == '有').astype(int)
    print("  Created 'is_Gas_Available' feature.")
    return df.drop(col, axis=1, errors='ignore')

def _parse_fee2(s): 
    """辅助函数：解析费用"""
    if pd.isna(s): return np.nan
    s_str = str(s).replace(' ', '')
    if '空白' in s_str: return np.nan
    nums = re.findall(r'(\d+\.?\d*)', s_str)
    if not nums: return np.nan
    return float(nums[0]) if len(nums) == 1 else (float(nums[0]) + float(nums[-1])) / 2

def handle_gas_fee(df, n_train): 
    """处理 '燃气费' 列 中位数填充"""
    col = '燃气费'
    new_col = 'GasFee'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col] = 0.0 # Default
        return df

    s_numeric = df[col].apply(_parse_fee2) 

    
    median_val_train = s_numeric.iloc[:n_train].median()
    if pd.isna(median_val_train) or median_val_train < 0:
        print(f"  Warning: Could not calculate valid train median gas fee. Using fallback 0.0.")
        median_val_train = 0.0 # Fallback

    df[new_col] = s_numeric.fillna(median_val_train)
    df[new_col] = df[new_col].clip(lower=0)

    print(f"  Created new feature '{new_col}' (imputed with train median: {median_val_train:.2f}).")
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_heating_fee(df, n_train): 
    """处理 '供热费' 列 中位数填充"""
    col = '供热费'
    new_col = 'HeatingFee'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col] = 0.0 
        return df

    s_numeric = df[col].apply(_parse_fee2) 

   
    median_val_train = s_numeric.iloc[:n_train].median()
    if pd.isna(median_val_train) or median_val_train < 0:
        print(f"  Warning: Could not calculate valid train median heating fee. Using fallback 0.0.")
        median_val_train = 0.0 # Fallback

    df[new_col] = s_numeric.fillna(median_val_train)
    df[new_col] = df[new_col].clip(lower=0)

    print(f"  Created new feature '{new_col}' (imputed with train median: {median_val_train:.2f}).")
    df = df.drop(col, axis=1, errors='ignore')
    return df


# --- 辅助函数：其他设施 ---

def handle_parking_spots(df, n_train): 
    """处理 '停车位' 列 中位数填充"""
    col = '停车位'
    new_col = 'ParkingSpots'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col] = 0.0
        return df

    s_numeric = pd.to_numeric(df[col], errors='coerce')

    # --- 仅使用训练集中位数进行插值 ---
    median_val_train = s_numeric.iloc[:n_train].median()
    if pd.isna(median_val_train) or median_val_train < 0:
        print(f"  Warning: Could not calculate valid train median parking spots. Using fallback 0.0.")
        median_val_train = 0.0 # Fallback

    df[new_col] = s_numeric.fillna(median_val_train)
    df[new_col] = df[new_col].clip(lower=0)

    print(f"  Created new feature '{new_col}' (imputed with train median: {median_val_train:.2f}).")
    df = df.drop(col, axis=1, errors='ignore')
    return df

def handle_parking_type(df, n_train): 
    """处理 '车位' 列"""
    col = '车位'
    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[col] = '未知' 
        return df

    parking_col = df[col].copy().fillna('未知')
    valid_categories = ['免费使用', '租用车位']
    df[col] = np.where(parking_col.isin(valid_categories), parking_col, '未知')

    print(f"  Processed '{col}'. Unique values: {df[col].unique()}")
    return df

def handle_amenities(df, n_train): 
    """处理 '配套设施' 列:"""
    col = '配套设施'
    new_col_count = 'amenity_count'
    new_col_metro = 'is_Amenity_Metro'

    print(f"Processing [{col}]...")
    if col not in df.columns:
        print(f"  Warning: Column '{col}' not found.")
        df[new_col_count] = 0
        df[new_col_metro] = 0
        return df

    s = df[col].fillna('')
    df[new_col_count] = s.apply(lambda x: len(x.split('、')) if x else 0)
    df[new_col_metro] = s.str.contains('地铁').astype(int)

    print(f"  Created '{new_col_count}' and '{new_col_metro}' features.")
    return df.drop(col, axis=1, errors='ignore')



In [107]:
print("\n--- 步骤 2: 数据加载、合并、初步清理 ---")

# --- 2a: 加载 ---
try:
    df_train_raw = pd.read_csv('./data/ruc_Class25Q2_train_rent.csv')
    df_test_raw = pd.read_csv('./data/ruc_Class25Q2_test_rent.csv')
    print(f"训练数据加载成功: {df_train_raw.shape}")
    print(f"Kaggle 测试数据加载成功: {df_test_raw.shape}")
except FileNotFoundError:
    print("错误：未找到数据文件。请检查 './data/' 目录下的 CSV 文件名。")
    raise

# --- 2b: 存储原始信息 ---
n_train = df_train_raw.shape[0]
print(f"原始训练集行数 (n_train): {n_train}")

y_train_full = df_train_raw['Price'].copy() # Store original target
y_train_ln_full = np.log1p(y_train_full) # Store log target

test_ids = df_test_raw['ID'].copy() # Store test IDs

# --- 2c: 合并 ---
print("合并训练集和测试集...")
df_train_to_concat = df_train_raw.drop(columns=['Price'], errors='ignore')
df_test_to_concat = df_test_raw.drop(columns=['ID', 'Price'], errors='ignore') 

df_train_to_concat['source'] = 'train'
df_test_to_concat['source'] = 'test'

df_full = pd.concat([df_train_to_concat, df_test_to_concat], ignore_index=True)
print(f"合并后 df_full 形状: {df_full.shape}")

# --- 2d: 初步列删除 ---
columns_to_drop_initial = [
    '年份', '物业办公电话', '供水', '供暖', '供电', '停车费用',
    'coord_x', 'coord_y', '客户反馈', 'source' 
]
existing_cols_to_drop = [col for col in columns_to_drop_initial if col in df_full.columns]
print(f"初步删除 {len(existing_cols_to_drop)} 列: {existing_cols_to_drop}")
df_full = df_full.drop(columns=existing_cols_to_drop, errors='ignore')
print(f"初步列删除后 df_full 形状: {df_full.shape}")

del df_train_raw, df_test_raw, df_train_to_concat, df_test_to_concat # Clean up memory
gc.collect()


--- 步骤 2: 数据加载、合并、初步清理 ---
训练数据加载成功: (98899, 46)
Kaggle 测试数据加载成功: (9773, 46)
原始训练集行数 (n_train): 98899
合并训练集和测试集...
合并后 df_full 形状: (108672, 46)
初步删除 10 列: ['年份', '物业办公电话', '供水', '供暖', '供电', '停车费用', 'coord_x', 'coord_y', '客户反馈', 'source']
初步列删除后 df_full 形状: (108672, 36)


0

In [108]:

print("\n--- 步骤 3: 定义并应用通用特征工程管道 ---")

def apply_base_feature_engineering(df, n_train):
    """
    Apply all basic handle_... functions in sequence.
    Uses n_train for leakage prevention.
    """
    df_processed = df.copy()
    print(f"\n>>> Starting base feature engineering (Shape: {df_processed.shape})")

    # Geography related
    df_processed = handle_district_r(df_processed, n_train)
    df_processed = handle_board_r(df_processed, n_train)
    df_processed = handle_ring_road_r(df_processed, n_train)

    # Basic property attributes
    df_processed = handle_rent_house_type_r(df_processed, n_train)
    df_processed = handle_listing_date_r(df_processed, n_train)
    df_processed = handle_rent_floor(df_processed, n_train)
    df_processed = handle_building_structure_r(df_processed, n_train)
    df_processed = handle_rent_area(df_processed, n_train)
    df_processed = handle_rent_orientation(df_processed, n_train)
    df_processed = handle_rent_decoration(df_processed, n_train)

    # Rent-specific attributes
    df_processed = handle_rent_elevator(df_processed, n_train)
    df_processed = handle_property_type_r(df_processed, n_train)
    df_processed = handle_transaction_ownership_r(df_processed, n_train)
    df_processed = handle_payment_method_r(df_processed, n_train)
    df_processed = handle_lease_type_r(df_processed, n_train)
    df_processed = handle_lease_term(df_processed, n_train)

    # Community and fees
    df_processed = handle_developer_r(df_processed, n_train)
    df_processed = handle_property_management_r(df_processed, n_train)
    df_processed = handle_community_stats(df_processed, n_train)
    df_processed = handle_building_age(df_processed, n_train) # Uses n_train implicit median calc
    df_processed = handle_greenery_rate(df_processed, n_train)
    df_processed = handle_plot_ratio(df_processed, n_train)
    df_processed = handle_property_fee(df_processed, n_train)

    # Utilities
    df_processed = handle_rent_water_supply(df_processed, n_train)
    df_processed = handle_rent_heating(df_processed, n_train)
    df_processed = handle_rent_electricity(df_processed, n_train)
    df_processed = handle_gas_supply(df_processed, n_train)
    df_processed = handle_gas_fee(df_processed, n_train)
    df_processed = handle_heating_fee(df_processed, n_train)

    # Parking and amenities
    df_processed = handle_parking_spots(df_processed, n_train)
    df_processed = handle_parking_type(df_processed, n_train)
    df_processed = handle_amenities(df_processed, n_train)

    print(f"<<< Base feature engineering complete (Shape: {df_processed.shape})")
    return df_processed

# Apply base engineering
df_processed = apply_base_feature_engineering(df_full, n_train)

del df_full # Clean up memory
gc.collect()



--- 步骤 3: 定义并应用通用特征工程管道 ---

>>> Starting base feature engineering (Shape: (108672, 36))
Processing [区县]...
Processing [板块]...
Processing [环线位置]...
Processing [户型]...
  Calculating medians based on first 98899 rows...
    Median for '卧室数' (excluding 0): 2.0
    Median for '客厅数' (excluding 0): 1.0
    Median for '卫生间数' (excluding 0): 1.0
Imputation complete.
Processing [交易时间] (Extracting Year and Month categories)...
  Created 'Listing_Year' (imputed with train mode: 2024)
  Created 'Listing_Month' (imputed with train mode: 5)
Processing [楼层]...
  Median '总楼层数' (train): 18.0
  Floor position distribution (modes): ['中楼层']
Processing [建筑结构] (Rent version - Multi-Hot encoding)...
  Created 'is_BType_Tower', 'is_BType_Slab', 'is_BType_Combo', 'is_BType_Bungalow'.
Processing [面积]...
  Cleaned and imputed '面积' using train median (79.00).
Processing [朝向]...
  Created 'is_朝南/东/西/北' features.
Processing [装修]...
  Processed '装修' into '精装修'/'非精装修'. Unique values: ['精装修' '非精装修']
Processing [电梯]...

62

In [109]:
print("\n--- 步骤 4: 应用高级特征工程 ---")

# --- 4a: Geospatial features ---
print("\n--- 4a: 创建地理空间特征 ---")
df_processed = compute_city_center_and_distances_r(df_processed, n_train)
df_processed = create_geo_clusters_r(df_processed, n_train, n_clusters=20)

print("\n--- 4b: 执行目标编码 ---")
def apply_target_encoding_combined(df, y_target, n_train, loc_cols, n_splits=6, random_state=111):
    """在合并的数据框上对 loc_cols 进行 K-Fold 目标编码。(Fit ONLY on n_train)"""
    df_te = df.copy()
    print(f"Processing {loc_cols} (K-Fold Target Encoding, n_splits={n_splits})...")
    if not loc_cols: return df_te

    existing_loc_cols = [col for col in loc_cols if col in df_te.columns]
    if not existing_loc_cols:
        print("  Warning: No location columns found for target encoding.")
        return df_te
    print(f"  Encoding existing columns: {existing_loc_cols}")

    new_col = 'Location_Target_Encoded'
    
    global_mean = y_target.iloc[:n_train].mean() # Use y_train_full (original price)
    print(f"  Calculated global mean from original train target: {global_mean:.2f}")

    def create_key(df_slice):
        return df_slice[existing_loc_cols].astype(str).agg('_'.join, axis=1)
    df_te['key'] = create_key(df_te)

    
    X_train_part = df_te.iloc[:n_train].copy()
    X_train_part['target'] = y_target.iloc[:n_train] # Use original train target
    full_train_map = X_train_part.groupby('key')['target'].mean()
    print(f"  Calculated full mean map based on {n_train} training samples.")

    
    X_test_part = df_te.iloc[n_train:].copy()
    df_te.loc[X_test_part.index, new_col] = X_test_part['key'].map(full_train_map).fillna(global_mean)
    print(f"  Applied full map to {len(X_test_part)} test samples.")

    
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    df_te.loc[X_train_part.index, new_col] = np.nan

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_part)):
        X_train_fold = X_train_part.iloc[train_idx]
        X_val_fold = X_train_part.iloc[val_idx]
        fold_map = X_train_fold.groupby('key')['target'].mean()
        df_te.loc[X_val_fold.index, new_col] = X_val_fold['key'].map(fold_map)

    
    is_train_mask = df_te.index < n_train
    is_nan_mask = df_te[new_col].isnull()
    fill_mask = is_train_mask & is_nan_mask
    fill_count = fill_mask.sum()
    if fill_count > 0:
        print(f"    Filling {fill_count} remaining NaNs in train set using full map...")
        values_to_fill = df_te.loc[fill_mask, 'key'].map(full_train_map).values
        df_te.loc[fill_mask, new_col] = values_to_fill

    
    df_te[new_col].fillna(global_mean, inplace=True)

    # --- Clean up ---
    cols_to_drop = existing_loc_cols + ['key', 'target'] # Remove target if it exists
    df_te = df_te.drop(columns=cols_to_drop, errors='ignore')
    print(f"  K-Fold Target Encoding complete. New feature: '{new_col}'.")
    return df_te

location_columns_to_encode = ['城市', '区县', '板块'] # Rent uses these
df_processed = apply_target_encoding_combined(df_processed, y_train_full, n_train, location_columns_to_encode)

# --- 4c: Ratio features ---
print("\n--- 4c: 创建比率特征 ---")
def create_ratio_features(df):
    df_out = df.copy()
    print("Creating ratio features...")
    def cal_ratio(df_r, num, den, new_name):
        if num in df_r.columns and den in df_r.columns:
            print(f"  Creating ratio [{new_name}]...")
            df_r[new_name] = np.where(df_r[den] > 0, df_r[num] / df_r[den], 0)
        else: print(f"  Warning: Cannot create '{new_name}', missing columns.")

    cal_ratio(df_out, '卧室数', '客厅数', '室厅比')
    cal_ratio(df_out, '卧室数', '卫生间数', '室卫比')
    cal_ratio(df_out, '面积', '卧室数', '平均每卧室面积')
    cal_ratio(df_out, 'GreeneryRate', 'PlotRatio', '绿化容积比')
    cal_ratio(df_out, 'ParkingSpots', '房屋总数', '车位房屋比')
    cal_ratio(df_out, 'amenity_count', '面积', '设施密度')
    print("Ratio features created.")
    return df_out

df_processed = create_ratio_features(df_processed)

# --- 4d: Log Transform ---
print("\n--- 4d: 执行对数变换 ---")
def log_transform(df, skewed_cols):
    df_transformed = df.copy()
    print("Applying log1p transform...")
    transformed_count = 0
    for col in skewed_cols:
        if col in df_transformed.columns and pd.api.types.is_numeric_dtype(df_transformed[col]):
            min_val = df_transformed[col].min()
            if min_val >= 0:
                col_log = f'log_{col}'
                df_transformed[col_log] = np.log1p(df_transformed[col])
                df_transformed = df_transformed.drop(columns=[col], errors='ignore')
                transformed_count += 1
                print(f"  Transformed '{col}' -> '{col_log}'")
            else:
                print(f"  Warning: '{col}' has negative values (min: {min_val}), skipping log1p.")
        else:
             if col in df_transformed.columns: print(f"  Warning: '{col}' is not numeric, skipping log1p.")
    print(f"Log transform complete. Processed {transformed_count} columns.")
    return df_transformed


skewed_cols = [
    '面积', '总楼层数', '绝对楼层', '相对楼层_总楼层数', '总楼层数_平方',
    '房屋总数', '楼栋总数', 'avg_units_per_building', 'BuildingAge',
    'GreeneryRate', 'PlotRatio', 'PropertyFee', 'GasFee', 'HeatingFee',
    'ParkingSpots', 'amenity_count', '距离中心_公里', '距离中心_公里_平方',
    'Location_Target_Encoded', '室厅比', '室卫比', '平均每卧室面积',
    '绿化容积比', '车位房屋比', '设施密度', '租期_月'
]
skewed_cols_existing = [col for col in skewed_cols if col in df_processed.columns]
df_processed = log_transform(df_processed, skewed_cols_existing)

# --- 4e: Binning ---
print("\n--- 4e: 执行特征分箱 ---")
def bin_and_encode(df, n_train, feature, n_bins=5, strategy='kmeans'):
    df_binned = df.copy()
    print(f"Binning feature [{feature}] (n_bins={n_bins}, strategy='{strategy}')...")
    if feature not in df_binned.columns or not pd.api.types.is_numeric_dtype(df_binned[feature]):
        print(f"  Warning: Feature '{feature}' not found or not numeric. Skipping.")
        return df_binned

    
    median_val_train = df_binned.iloc[:n_train][feature].median()
    if pd.isna(median_val_train):
        print(f"  Warning: Could not calculate train median for '{feature}'. Using 0 for imputation.")
        median_val_train = 0.0 # Fallback
    df_binned[feature].fillna(median_val_train, inplace=True)
    

    binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy, subsample=None)
    feature_binned_col = f'{feature}_分箱'

    try:
        train_data_for_fit = df_binned.iloc[:n_train][[feature]]
        
        if train_data_for_fit[feature].nunique() < n_bins:
             actual_bins = train_data_for_fit[feature].nunique()
             print(f"  Warning: Unique values ({actual_bins}) < n_bins ({n_bins}). Using uniform strategy with {actual_bins} bins.")
             if actual_bins < 2:
                 print(f"  Skipping binning for '{feature}' due to < 2 unique train values.")
                 return df_binned
             binner = KBinsDiscretizer(n_bins=actual_bins, encode='ordinal', strategy='uniform', subsample=None)

        binner.fit(train_data_for_fit)
        df_binned[feature_binned_col] = binner.transform(df_binned[[feature]])
        print(f"  Created binned feature '{feature_binned_col}'.")
    except ValueError as e:
        print(f"  Error binning '{feature}': {e}. Skipping...")
        return df_binned # Return original df if binning fails

    # One-hot encode the binned feature and remove original
    df_binned = pd.get_dummies(df_binned, columns=[feature_binned_col], prefix=f'{feature}段', drop_first=False)
    df_binned = df_binned.drop(columns=[feature], errors='ignore')
    print(f"  One-hot encoded bins and removed original '{feature}'.")
    return df_binned

features_to_bin = [
    'log_BuildingAge', 'log_总楼层数', 'log_距离中心_公里', 'log_面积',
    'log_Location_Target_Encoded', 'log_租期_月'
]
features_to_bin_existing = [col for col in features_to_bin if col in df_processed.columns]
for feature in features_to_bin_existing:
    df_processed = bin_and_encode(df_processed, n_train, feature=feature, n_bins=5, strategy='kmeans')

print("Binning complete.")

# --- 4f: Interaction Terms ---
print("\n--- 4f: 创建交互项 ---")
def create_polynomial_interactions(df, n_train, continuous_cols_candidates, degree=2, interaction_only=True):
    df_poly = df.copy()
    print(f"Creating interaction terms (degree={degree}, interaction_only={interaction_only})...")

    
    binary_cols = [col for col in df_poly.columns if df_poly[col].nunique(dropna=False) == 2 and df_poly[col].min() == 0 and df_poly[col].max() == 1]
    all_numeric_cols = df_poly.select_dtypes(include=np.number).columns.tolist()
    current_continuous_cols = [col for col in all_numeric_cols if col not in binary_cols]
    cols_for_poly = [col for col in continuous_cols_candidates if col in current_continuous_cols]

    print(f"  Creating interactions for {len(cols_for_poly)} features: {cols_for_poly}")
    if not cols_for_poly: return df_poly

    poly = PolynomialFeatures(degree=degree, interaction_only=interaction_only, include_bias=False)

    # --- Impute using ONLY train median before fitting ---
    train_part_poly = df_poly.iloc[:n_train][cols_for_poly]
    medians_poly = train_part_poly.median()
    
    df_poly[cols_for_poly] = df_poly[cols_for_poly].fillna(medians_poly)
    # Final fallback fill if medians were NaN
    if df_poly[cols_for_poly].isnull().any().any():
        print("    Warning: Fallback filling remaining NaNs with 0 before interaction.")
        df_poly[cols_for_poly] = df_poly[cols_for_poly].fillna(0)

    try:
        poly.fit(df_poly.iloc[:n_train][cols_for_poly])
        poly_features = poly.transform(df_poly[cols_for_poly])
        poly_feature_names = [name.replace(' ', '_TIMES_').replace('^2', '_SQ') for name in poly.get_feature_names_out(cols_for_poly)]
        print(f"  Generated {len(poly_feature_names)} polynomial/interaction features.")

        poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_poly.index)

        # Drop original columns used for interactions
        df_poly = df_poly.drop(columns=cols_for_poly, errors='ignore')
        df_final = pd.concat([df_poly, poly_df], axis=1)

        print(f"Interaction terms created. New shape: {df_final.shape}")
        return df_final
    except Exception as e:
        print(f"  Error creating interaction terms: {e}. Returning unmodified dataframe.")
        return df_poly

# Select features for interaction 
# Use the un-transformed bedroom/livingroom/bathroom counts
interaction_candidates = ['卧室数', '客厅数', '卫生间数', '相对楼层', 'log_平均每卧室面积'] # Example
interaction_candidates_existing = [col for col in interaction_candidates if col in df_processed.columns]
df_final_eng = create_polynomial_interactions(df_processed, n_train, interaction_candidates_existing)

del df_processed # Clean up memory
gc.collect()

print("\n--- Advanced Feature Engineering Complete ---")
print(f"Final engineered shape: {df_final_eng.shape}")



--- 步骤 4: 应用高级特征工程 ---

--- 4a: 创建地理空间特征 ---
Calculating geospatial features (distance to center)...
  Calculated centers for 12 cities (based on training set).
  Calculated '距离中心_公里', imputed NaNs using train median (13.38 km).
Distance features created.
Creating geo clusters (20 per city)...
Geo cluster features created.

--- 4b: 执行目标编码 ---
Processing ['城市', '区县', '板块'] (K-Fold Target Encoding, n_splits=6)...
  Encoding existing columns: ['城市', '区县', '板块']
  Calculated global mean from original train target: 582908.98
  Calculated full mean map based on 98899 training samples.
  Applied full map to 9773 test samples.
    Filling 53 remaining NaNs in train set using full map...
  K-Fold Target Encoding complete. New feature: 'Location_Target_Encoded'.

--- 4c: 创建比率特征 ---
Creating ratio features...
  Creating ratio [室厅比]...
  Creating ratio [室卫比]...
  Creating ratio [平均每卧室面积]...
  Creating ratio [绿化容积比]...
  Creating ratio [车位房屋比]...
  Creating ratio [设施密度]...
Ratio features created.


In [110]:
print("\n--- 步骤 5: 最终数据准备 ---")

# --- 5a: 分离 Kaggle 测试集 ---
X_train_full_eng = df_final_eng.iloc[:n_train].copy()
X_test_kaggle_eng = df_final_eng.iloc[n_train:].copy()
print(f"分离后: X_train_full_eng={X_train_full_eng.shape}, X_test_kaggle_eng={X_test_kaggle_eng.shape}")

# Target variables (already stored: y_train_full, y_train_ln_full)

# --- 5b: Y 异常值处理 (基于原始 Rent Price IQR) ---
print("\n--- 5b: 处理 Y 异常值 (IQR on original Price) ---")
q_low = 0.01
q_high = 0.95
Q1 = y_train_full.quantile(q_low)
Q3 = y_train_full.quantile(q_high)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

mask_y_clean = (y_train_full >= lower_bound) & (y_train_full <= upper_bound)
rows_before = len(y_train_full)
rows_after = mask_y_clean.sum()

print(f"IQR outlier bounds (original Price, {q_low}-{q_high}): [{lower_bound:.0f}, {upper_bound:.0f}]")
print(f"移除 Y 异常值前: {rows_before} 行")
print(f"移除 Y 异常值后: {rows_after} 行 (移除了 {rows_before - rows_after} 行)")

X_train_clean_eng = X_train_full_eng[mask_y_clean].copy()
y_train_clean = y_train_full[mask_y_clean].copy() # Original price scale, cleaned
y_train_ln_clean = y_train_ln_full[mask_y_clean].copy() # Log price scale, cleaned

# --- 5c: 训练集/验证集划分 (80/20) ---
print("\n--- 5c: 划分训练/验证集 (80/20) ---")
X_train, X_val, y_train_ln, y_val_ln = train_test_split(
    X_train_clean_eng,
    y_train_ln_clean, 
    test_size=0.2,
    random_state=111
)
# Get corresponding original prices for evaluation
y_train_orig = y_train_clean.loc[y_train_ln.index]
y_val_orig = y_train_clean.loc[y_val_ln.index]

print(f"划分后: X_train={X_train.shape}, X_val={X_val.shape}")
print(f"目标变量: y_train_ln={y_train_ln.shape}, y_val_ln={y_val_ln.shape}")
print(f"评估用目标: y_train_orig={y_train_orig.shape}, y_val_orig={y_val_orig.shape}")

# --- 5d: X 特征最终处理 (Clipping & ColumnTransformer: Impute, Scale, OHE) ---
print("\n--- 5d: X 特征最终处理 (Clipping & ColumnTransformer) ---")

# --- Clipping (Thresholds from X_train ONLY) ---
print("  Calculating and applying clipping thresholds (based on 80% train split)...")


numeric_features_final = X_train.select_dtypes(include=np.number, exclude=['category', bool]).columns

lower_bounds_X = X_train[numeric_features_final].quantile(0.01)
upper_bounds_X = X_train[numeric_features_final].quantile(0.99)

# Apply clipping

lower_bounds_X = lower_bounds_X.astype(float) # Ensure bounds are float
upper_bounds_X = upper_bounds_X.astype(float)

print(f"  Applying clipping to {len(numeric_features_final)} columns...")

# Iterate through columns to clip safely
for col in numeric_features_final:
    if col in X_train.columns:
        X_train[col] = X_train[col].clip(lower=lower_bounds_X.get(col), upper=upper_bounds_X.get(col))
    if col in X_val.columns:
        X_val[col] = X_val[col].clip(lower=lower_bounds_X.get(col), upper=upper_bounds_X.get(col))
    if col in X_test_kaggle_eng.columns:
        X_test_kaggle_eng[col] = X_test_kaggle_eng[col].clip(lower=lower_bounds_X.get(col), upper=upper_bounds_X.get(col))
    if col in X_train_clean_eng.columns:
        X_train_clean_eng[col] = X_train_clean_eng[col].clip(lower=lower_bounds_X.get(col), upper=upper_bounds_X.get(col))



print("  Clipping applied.")

# --- ColumnTransformer ---
print("  Defining and fitting ColumnTransformer...")

numeric_features = X_train.select_dtypes(include=np.number, exclude=['category', bool]).columns.tolist() # Exclude category/bool again
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
boolean_features = X_train.select_dtypes(include=[bool]).columns.tolist() # From geo clusters

print(f"    Found {len(numeric_features)} numeric features.")
print(f"    Found {len(categorical_features)} categorical features.")
print(f"    Found {len(boolean_features)} boolean features.")

# Define transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='未知')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

boolean_transformer = 'passthrough' 

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bool', boolean_transformer, boolean_features)],
    remainder='drop')


preprocessor.fit(X_train)

# Transform all datasets
X_train_scaled = preprocessor.transform(X_train)
X_val_scaled = preprocessor.transform(X_val)
X_test_kaggle_scaled = preprocessor.transform(X_test_kaggle_eng)
X_train_clean_scaled_full = preprocessor.transform(X_train_clean_eng) # For LassoCV

# Get feature names after transformation
feature_names_out = preprocessor.get_feature_names_out()

# Convert back to DataFrame (optional but helpful for feature importance)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_names_out, index=X_train.index)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=feature_names_out, index=X_val.index)
X_test_kaggle_scaled = pd.DataFrame(X_test_kaggle_scaled, columns=feature_names_out, index=X_test_kaggle_eng.index)
X_train_clean_scaled_full = pd.DataFrame(X_train_clean_scaled_full, columns=feature_names_out, index=X_train_clean_eng.index)

print(f"  ColumnTransformer applied. Final shapes:")
print(f"    X_train_scaled: {X_train_scaled.shape}")
print(f"    X_val_scaled: {X_val_scaled.shape}")
print(f"    X_test_kaggle_scaled: {X_test_kaggle_scaled.shape}")
print(f"    X_train_clean_scaled_full: {X_train_clean_scaled_full.shape}")


print(f"  Final NaN check (X_train_scaled): {X_train_scaled.isnull().sum().sum()}")




--- 步骤 5: 最终数据准备 ---
分离后: X_train_full_eng=(98899, 348), X_test_kaggle_eng=(9773, 348)

--- 5b: 处理 Y 异常值 (IQR on original Price) ---
IQR outlier bounds (original Price, 0.01-0.95): [-2067162, 3693197]
移除 Y 异常值前: 98899 行
移除 Y 异常值后: 98240 行 (移除了 659 行)

--- 5c: 划分训练/验证集 (80/20) ---
划分后: X_train=(78592, 348), X_val=(19648, 348)
目标变量: y_train_ln=(78592,), y_val_ln=(19648,)
评估用目标: y_train_orig=(78592,), y_val_orig=(19648,)

--- 5d: X 特征最终处理 (Clipping & ColumnTransformer) ---
  Calculating and applying clipping thresholds (based on 80% train split)...
  Applying clipping to 65 columns...
  Clipping applied.
  Defining and fitting ColumnTransformer...
    Found 65 numeric features.
    Found 13 categorical features.
    Found 270 boolean features.
  ColumnTransformer applied. Final shapes:
    X_train_scaled: (78592, 412)
    X_val_scaled: (19648, 412)
    X_test_kaggle_scaled: (9773, 412)
    X_train_clean_scaled_full: (98240, 412)
  Final NaN check (X_train_scaled): 0


In [111]:
print("\n--- 步骤 6: 特征选择 (LassoCV) ---")
lasso_cv_selector = LassoCV(cv=6, random_state=111, max_iter=5000, n_jobs=-1)

# Fit on the FULL cleaned & scaled training data
lasso_cv_selector.fit(X_train_clean_scaled_full, y_train_ln_clean)

print(f"LassoCV best alpha: {lasso_cv_selector.alpha_:.6f}")

mask_selected = lasso_cv_selector.coef_ != 0
num_selected = mask_selected.sum()
num_total = len(mask_selected)
print(f"LassoCV selected {num_selected} features out of {num_total}.")

# Apply feature mask
X_train_selected = X_train_scaled.loc[:, mask_selected]
X_val_selected = X_val_scaled.loc[:, mask_selected]
X_train_clean_selected_full = X_train_clean_scaled_full.loc[:, mask_selected]
X_test_kaggle_selected = X_test_kaggle_scaled.loc[:, mask_selected]

print(f"Shapes after selection:")
print(f"  X_train_selected: {X_train_selected.shape}")
print(f"  X_val_selected: {X_val_selected.shape}")
print(f"  X_test_kaggle_selected: {X_test_kaggle_selected.shape}")
print(f"  X_train_clean_selected_full: {X_train_clean_selected_full.shape}")



--- 步骤 6: 特征选择 (LassoCV) ---
LassoCV best alpha: 0.001993
LassoCV selected 80 features out of 412.
Shapes after selection:
  X_train_selected: (78592, 80)
  X_val_selected: (19648, 80)
  X_test_kaggle_selected: (9773, 80)
  X_train_clean_selected_full: (98240, 80)


In [112]:
print("\n--- 步骤 7: 建模、调参与评估 ---")

cv_6 = KFold(n_splits=6, shuffle=True, random_state=111)

# Scorer for evaluation in original scale
def mae_original_scorer(y_true_log, y_pred_log):
    y_true_orig = np.expm1(y_true_log)
    y_pred_orig = np.expm1(y_pred_log)
    return -mean_absolute_error(y_true_orig, y_pred_orig) # Return negative MAE

mae_scorer = make_scorer(mae_original_scorer, greater_is_better=True)

results = {}
best_models = {}

# Evaluation helper
def evaluate_model(model, model_name, X_tr, y_tr_ln, y_tr_orig, X_v, y_v_ln, y_v_orig):
    # In-Sample
    y_pred_log_is = model.predict(X_tr)
    y_pred_orig_is = np.expm1(y_pred_log_is)
    is_mae = mean_absolute_error(y_tr_orig, y_pred_orig_is)
    is_rmse = root_mean_squared_error(y_tr_orig, y_pred_orig_is) # Use root_mean_squared_error

    # Out-of-Sample
    y_pred_log_oos = model.predict(X_v)
    y_pred_orig_oos = np.expm1(y_pred_log_oos)
    oos_mae = mean_absolute_error(y_v_orig, y_pred_orig_oos)
    oos_rmse = root_mean_squared_error(y_v_orig, y_pred_orig_oos)

    return {"IS_MAE": is_mae, "IS_RMSE": is_rmse, "OOS_MAE": oos_mae, "OOS_RMSE": oos_rmse}

# --- OLS ---
print("  Training OLS...")
model_ols = LinearRegression()
model_ols.fit(X_train_selected, y_train_ln)
results['OLS'] = evaluate_model(model_ols, 'OLS', X_train_selected, y_train_ln, y_train_orig, X_val_selected, y_val_ln, y_val_orig)
cv_scores = cross_val_score(model_ols, X_train_clean_selected_full, y_train_ln_clean, cv=cv_6, scoring=mae_scorer, n_jobs=-1)
results['OLS']['CV_MAE'] = -np.mean(cv_scores)
best_models['OLS'] = model_ols
print(f"    OLS CV MAE: {results['OLS']['CV_MAE']:.2f}")

# --- Lasso ---
print("  Tuning Lasso...")
param_grid_lasso = {'alpha': np.logspace(-6, -1, 10)}
grid_lasso = GridSearchCV(Lasso(max_iter=5000, random_state=111), param_grid_lasso, cv=cv_6, scoring=mae_scorer, n_jobs=-1)
grid_lasso.fit(X_train_clean_selected_full, y_train_ln_clean)
model_lasso_best = grid_lasso.best_estimator_
results['Lasso'] = evaluate_model(model_lasso_best, 'Lasso', X_train_selected, y_train_ln, y_train_orig, X_val_selected, y_val_ln, y_val_orig)
results['Lasso']['CV_MAE'] = -grid_lasso.best_score_
best_models['Lasso'] = model_lasso_best
print(f"    Lasso best alpha: {grid_lasso.best_params_['alpha']:.6f}, CV MAE: {results['Lasso']['CV_MAE']:.2f}")

# --- Ridge ---
print("  Tuning Ridge...")
param_grid_ridge_fine = {'alpha': np.logspace(-4, 0, 10)}
grid_ridge = GridSearchCV(Ridge(random_state=111), param_grid_ridge_fine, cv=cv_6, scoring=mae_scorer, n_jobs=-1)
grid_ridge.fit(X_train_clean_selected_full, y_train_ln_clean)
model_ridge_best = grid_ridge.best_estimator_
results['Ridge'] = evaluate_model(model_ridge_best, 'Ridge', X_train_selected, y_train_ln, y_train_orig, X_val_selected, y_val_ln, y_val_orig)
results['Ridge']['CV_MAE'] = -grid_ridge.best_score_
best_models['Ridge'] = model_ridge_best
print(f"    Ridge best alpha: {grid_ridge.best_params_['alpha']:.3f}, CV MAE: {results['Ridge']['CV_MAE']:.2f}")

# --- ElasticNet ---
print("  Tuning ElasticNet...")
param_grid_enet = {'alpha': np.logspace(-6, -1, 6), 'l1_ratio': [0.1, 0.5, 0.9, 0.95, 1.0]}
grid_enet = GridSearchCV(ElasticNet(max_iter=5000, random_state=111), param_grid_enet, cv=cv_6, scoring=mae_scorer, n_jobs=-1)
grid_enet.fit(X_train_clean_selected_full, y_train_ln_clean)
model_enet_best = grid_enet.best_estimator_
results['ElasticNet'] = evaluate_model(model_enet_best, 'ElasticNet', X_train_selected, y_train_ln, y_train_orig, X_val_selected, y_val_ln, y_val_orig)
results['ElasticNet']['CV_MAE'] = -grid_enet.best_score_
best_models['ElasticNet'] = model_enet_best
print(f"    ElasticNet best params: {grid_enet.best_params_}, CV MAE: {results['ElasticNet']['CV_MAE']:.2f}")

# --- LightGBM ---
print("  Tuning LightGBM...")
param_grid_lgbm = {
    'n_estimators': [300, 500], # Reduced for speed
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 40],
    'subsample': [0.8]
}
grid_lgbm = GridSearchCV(
    lgb.LGBMRegressor(random_state=111, n_jobs=1, verbose=-1, subsample_freq=1, objective='mae', metric='mae'),
    param_grid_lgbm, cv=cv_6, scoring=mae_scorer, n_jobs=-1, verbose=0)
grid_lgbm.fit(X_train_clean_selected_full, y_train_ln_clean)
model_lgbm_best = grid_lgbm.best_estimator_
results['LightGBM'] = evaluate_model(model_lgbm_best, 'LightGBM', X_train_selected, y_train_ln, y_train_orig, X_val_selected, y_val_ln, y_val_orig)
results['LightGBM']['CV_MAE'] = -grid_lgbm.best_score_
best_models['LightGBM'] = model_lgbm_best
print(f"    LGBM best params: {grid_lgbm.best_params_}, CV MAE: {results['LightGBM']['CV_MAE']:.2f}")



--- 步骤 7: 建模、调参与评估 ---
  Training OLS...
    OLS CV MAE: 114128.14
  Tuning Lasso...
    Lasso best alpha: 0.000599, CV MAE: 113860.24
  Tuning Ridge...
    Ridge best alpha: 1.000, CV MAE: 114126.87
  Tuning ElasticNet...
    ElasticNet best params: {'alpha': np.float64(0.001), 'l1_ratio': 0.5}, CV MAE: 113894.51
  Tuning LightGBM...
    LGBM best params: {'learning_rate': 0.1, 'n_estimators': 500, 'num_leaves': 40, 'subsample': 0.8}, CV MAE: 64811.11


In [120]:
print("\n--- 步骤 8: 最终结果报告 ---")

linear_models_to_compare = {
    'OLS': results['OLS']['CV_MAE'],
    'Lasso': results['Lasso']['CV_MAE'],
    'Ridge': results['Ridge']['CV_MAE'],
    'ElasticNet': results['ElasticNet']['CV_MAE']
}
best_linear_model_name = min(linear_models_to_compare, key=linear_models_to_compare.get)
results['Best Linear Model'] = results[best_linear_model_name]
best_models['Best Linear Model'] = best_models[best_linear_model_name]

print(f"最佳 *线性* 模型 (基于 CV MAE): {best_linear_model_name}")

# Prepare report DataFrames
# 检查你的 index_order，如果 LightGBM 在结果中，也应包含在内
index_order = ['OLS', 'Lasso', 'Ridge', 'ElasticNet', 'Best Linear Model']
if 'LightGBM' in results:
    index_order.append('LightGBM')


report_data_mae = {
    'In-Sample MAE': [results[m]['IS_MAE'] for m in index_order],
    'Out-of-Sample MAE': [results[m]['OOS_MAE'] for m in index_order],
    '6-Fold CV MAE': [results[m]['CV_MAE'] for m in index_order]
}
df_report_mae = pd.DataFrame(report_data_mae, index=index_order)

report_data_rmse = {
    'In-Sample RMSE': [results[m]['IS_RMSE'] for m in index_order],
    'Out-of-Sample RMSE': [results[m]['OOS_RMSE'] for m in index_order],
    
}
df_report_rmse = pd.DataFrame(report_data_rmse, index=index_order)


print("\n" + "="*40)
print(" 性能报告 (MAE - 原始价格) ")
print("="*40)
# --- 修改在这里 ---
# 使用 floatfmt=".3f" 
print(df_report_mae.to_markdown(floatfmt=".3f"))
# --- 修改结束 ---

print("\n" + "="*40)
print(" 性能报告 (RMSE - 原始价格) ")
print("="*40)
# --- 修改在这里 ---
# 同样应用于 RMSE 表格
print(df_report_rmse.to_markdown(floatfmt=".3f"))
# --- 修改结束 ---

# 确保你使用的是正确的 'y_train_clean' 变量 (来自租金或房价模型)
if 'y_train_clean' in locals():
    print(f"\n报告使用的总训练样本数 (移除y异常值后): {len(y_train_clean)}")
elif 'y_train_clean_orig' in locals():
    print(f"\n报告使用的总训练样本数 (移除y异常值后): {len(y_train_clean_orig)}")
else:
    print("\n无法确定报告样本数。")


--- 步骤 8: 最终结果报告 ---
最佳 *线性* 模型 (基于 CV MAE): Lasso

 性能报告 (MAE - 原始价格) 
|                   |   In-Sample MAE |   Out-of-Sample MAE |   6-Fold CV MAE |
|:------------------|----------------:|--------------------:|----------------:|
| OLS               |      114202.194 |          113185.516 |      114128.144 |
| Lasso             |      113995.175 |          112894.116 |      113860.242 |
| Ridge             |      114267.411 |          113161.249 |      114126.872 |
| ElasticNet        |      114027.716 |          112942.791 |      113894.511 |
| Best Linear Model |      113995.175 |          112894.116 |      113860.242 |
| LightGBM          |       56489.993 |           56304.284 |       64811.112 |

 性能报告 (RMSE - 原始价格) 
|                   |   In-Sample RMSE |   Out-of-Sample RMSE |
|:------------------|-----------------:|---------------------:|
| OLS               |       208038.696 |           204481.756 |
| Lasso             |       208175.922 |           204388.803 |
| Ridge  

In [116]:
import os

# 步骤 9: 为 *所有* 模型生成 Kaggle 提交文件

print("\n--- 步骤 9: 生成 Kaggle 提交文件 ---")

# 1. 定义输出文件夹
output_dir = './output'
# 2. 创建文件夹 (如果它不存在)
os.makedirs(output_dir, exist_ok=True)
print(f"文件将保存到: '{output_dir}' 文件夹")

# 3. 遍历 `best_models` 字典中的每一个模型
#    (这包括 OLS, Lasso, Ridge, ElasticNet, LightGBM, 和 'Best Linear Model')
for model_name, final_model in best_models.items():
    
    print(f"  正在为模型 '{model_name}' 生成预测...")
    
    # 4. 在 Kaggle 测试集上预测 (已缩放、已选择特征)
    #    (使用来自 7.ipynb 的 X_test_kaggle_selected)
    y_kaggle_pred_log = final_model.predict(X_test_kaggle_selected)

    # 5. 转换回原始价格
    y_kaggle_pred_orig = np.expm1(y_kaggle_pred_log)

    # 6. 检查是否有负数预测
    y_kaggle_pred_orig = np.clip(y_kaggle_pred_orig, a_min=0, a_max=None) 

    # 7. 创建提交文件

    submission = pd.DataFrame({
        'ID': test_ids,
        'Price': y_kaggle_pred_orig
    })

    # 8. 创建动态的文件名和路径
    safe_model_name = model_name.replace(' ', '_') # 替换空格
    file_name = f'submission_rent_{safe_model_name}.csv'
    file_path = os.path.join(output_dir, file_name)

    # 9. 保存文件
    submission.to_csv(file_path, index=False)
    print(f"    已生成: '{file_path}'")

print("\n--- 所有提交文件已生成完毕 ---")


--- 步骤 9: 生成 Kaggle 提交文件 ---
文件将保存到: './output' 文件夹
  正在为模型 'OLS' 生成预测...
    已生成: './output\submission_rent_OLS.csv'
  正在为模型 'Lasso' 生成预测...
    已生成: './output\submission_rent_Lasso.csv'
  正在为模型 'Ridge' 生成预测...
    已生成: './output\submission_rent_Ridge.csv'
  正在为模型 'ElasticNet' 生成预测...
    已生成: './output\submission_rent_ElasticNet.csv'
  正在为模型 'LightGBM' 生成预测...
    已生成: './output\submission_rent_LightGBM.csv'
  正在为模型 'Best Linear Model' 生成预测...
    已生成: './output\submission_rent_Best_Linear_Model.csv'

--- 所有提交文件已生成完毕 ---


In [122]:
# 合并OLS模型的price和rent预测
pd.concat([pd.read_csv('./output/submission_price_OLS.csv'), pd.read_csv('./output/submission_rent_OLS.csv')], axis=1).to_csv('./output/combined_submission_OLS.csv', index=False)

# 合并Lasso模型的price和rent预测
pd.concat([pd.read_csv('./output/submission_price_Lasso.csv'), pd.read_csv('./output/submission_rent_Lasso.csv')], axis=1).to_csv('./output/combined_submission_Lasso.csv', index=False)

# 合并Ridge模型的price和rent预测
pd.concat([pd.read_csv('./output/submission_price_Ridge.csv'), pd.read_csv('./output/submission_rent_Ridge.csv')], axis=1).to_csv('./output/combined_submission_Ridge.csv', index=False)

# 合并ElasticNet模型的price和rent预测
pd.concat([pd.read_csv('./output/submission_price_ElasticNet.csv'), pd.read_csv('./output/submission_rent_ElasticNet.csv')], axis=1).to_csv('./output/combined_submission_ElasticNet.csv', index=False)