# 房价预测建模
## Part 1: 环境设置和数据预处理
### 1. 导入必要的库

In [19]:
# 数据处理
import pandas as pd
import numpy as np
from geopy.distance import geodesic 

import sys
import gc
import types

# 数据可视化
import matplotlib.pyplot as plt

# 数据预处理与特征工程
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import PolynomialFeatures

# 机器学习模型
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# 评估指标
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 其他
import re
import warnings
warnings.filterwarnings('ignore')


# 设置 Matplotlib 支持中文显示
plt.rcParams['font.sans-serif'] = ['SimSong']
plt.rcParams['axes.unicode_minus'] = False

### 2. 数据加载

首先，从同一文件目录下加载相应的.csv格式的文件

In [20]:
# Load the datasets
try:
    df_train_price = pd.read_csv('ruc_Class25Q2_train_price.csv')
    df_test_price = pd.read_csv('ruc_Class25Q2_test_price.csv')
    df_train_rent = pd.read_csv('ruc_Class25Q2_train_rent.csv')
    df_test_rent = pd.read_csv('ruc_Class25Q2_test_rent.csv')
    
    print("Files loaded successfully!")
    print(f"Training data shape (price): {df_train_price.shape}")
    print(f"Test data shape (price): {df_test_price.shape}")
    print(f"Training data shape (rent): {df_train_rent.shape}")
    print(f"Test data shape (rent): {df_test_rent.shape}")
except FileNotFoundError as e:
    print(f"Error loading files: {e}")

Files loaded successfully!
Training data shape (price): (103871, 55)
Test data shape (price): (34017, 55)
Training data shape (rent): (98899, 46)
Test data shape (rent): (9773, 46)


然后将测试集和训练集拼接在一起，并新增一列"source"来标明来源

In [21]:
# 保存训练集的原始行数
n_train_price = df_train_price.shape[0]
n_train_rent = df_train_rent.shape[0]

# b. 保存训练集的目标变量
y_train_price = df_train_price['Price'].copy()
y_train_rent = df_train_rent['Price'].copy()

# c. 保存测试集的 ID，用于最终提交
test_ids_price = df_test_price['ID'].copy()
test_ids_rent = df_test_rent['ID'].copy()


# 使用 pd.concat 将去掉 Price 和 ID 列的数据集进行纵向合并
df_price = pd.concat([ df_train_price.drop(columns=['Price']),
                          df_test_price.drop(columns=['ID'])], ignore_index=True)
df_rent = pd.concat([ df_train_rent.drop(columns=['Price']),
                          df_test_rent.drop(columns=['ID'])], ignore_index=True)

### 3.  探索性分析

In [22]:
df_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137888 entries, 0 to 137887
Data columns (total 54 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   城市         137888 non-null  int64  
 1   区域         137888 non-null  float64
 2   板块         137888 non-null  float64
 3   环线         56096 non-null   object 
 4   房屋户型       137294 non-null  object 
 5   所在楼层       137888 non-null  object 
 6   建筑面积       137888 non-null  object 
 7   套内面积       45899 non-null   object 
 8   房屋朝向       137887 non-null  object 
 9   建筑结构       137294 non-null  object 
 10  装修情况       137294 non-null  object 
 11  梯户比例       134634 non-null  object 
 12  配备电梯       121445 non-null  object 
 13  别墅类型       1597 non-null    object 
 14  交易时间       137888 non-null  object 
 15  交易权属       137888 non-null  object 
 16  上次交易       105198 non-null  object 
 17  房屋用途       137887 non-null  object 
 18  房屋年限       82082 non-null   object 
 19  产权所属       137888 non-n

###  4. 数据处理

#### a. 删除无用和信息冗余的列

In [23]:
cols_to_drop_price = [
    '抵押信息',       # 完全为空
    '别墅类型',       # 缺失值超过98%
    '物业办公电话',   # 对价格无预测能力
    '区县',           # 与'区域'列信息高度重叠
    '开发商',
    '物业公司',
    '环线位置',       # 与'环线'列信息高度重叠
    'coord_x',        # 与 lon/lat 重复
    'coord_y',        # 与 lon/lat 重复
    # 文本描述列
    '房屋优势', '核心卖点', '户型介绍', '周边配套', '交通出行', '客户反馈'
]

df_price = df_price.drop(columns=cols_to_drop_price, errors='ignore')

cols_to_drop_rent = [
    '车位',   # 对价格无预测能力
    '开发商',           # 与'区域'列信息高度重叠
    '物业公司',
    '物业办公电话',       # 与'环线'列信息高度重叠
    'coord_x',        # 与 lon/lat 重复
    'coord_y',        # 与 lon/lat 重复
    # 文本描述列
    '客户反馈',
    '装修',
    '供水',
    '供暖',
    '供电',
]
df_rent = df_rent.drop(columns=cols_to_drop_rent, errors='ignore')

#### b. 处理区块的列数据

In [24]:
# 查看相似的列中值不一致的行
inconsistent_rows = df_price[(df_price['板块'] != df_price['板块_comm']) &  (df_price['板块_comm'].notnull())]
display(inconsistent_rows[['板块', '板块_comm']])

Unnamed: 0,板块,板块_comm
287,1103.0,466.0
372,1103.0,466.0
420,1103.0,466.0
424,1103.0,466.0
506,1103.0,466.0
...,...,...
137781,716.0,1130.0
137799,716.0,1130.0
137802,716.0,1130.0
137815,716.0,1130.0


In [25]:
def process_plate_feature(df):
    """
    创建一个更可靠的板块特征 '板块_final'。
    策略：优先使用小区板块信息 ('板块_comm')，再用房产板块信息 ('板块') 填充其缺失值。
    """
    df = df.copy()

    df['板块_final'] = np.where(df['板块_comm'].notna() & (df['板块_comm'] != ''), df['板块_comm'], df['板块'])

    # 删除原始的两列
    df = df.drop(columns=['板块', '板块_comm'], errors='ignore')
    
    df['板块'] = df['板块_final'].astype(str)
    df = df.drop(columns=['板块_final'], errors='ignore')

    return df


df_price = process_plate_feature(df_price)

## Part 2: 缺失值处理
### 1. 解析结构化文本列

#### a. 处理房屋户型列

In [26]:
def parse_layout(df,col_name):
    """
    解析'房屋户型'列，提取'室', '厅', '卫'的数量。
    """
    df = df.copy()
        
    # 使用正则表达式提取室、厅、卫的数量
    layout_info = df[col_name].str.extract(r'(\d+)[室房](\d+)厅(?:.*?(\d+)卫)?', expand=True) 
    # ?: 表示厨房信息可选; (?:...) 表示非捕获组; [室房] 匹配'室'或'房'
    layout_info.columns = ['室', '厅', '卫']
    
    # 将提取出的信息合并回原DataFrame
    df = pd.concat([df, layout_info], axis=1)
    
    # 转换提取出的列为数值类型, errors='coerce' 会将无法转换的设为NaN
    for col in ['室', '厅', '卫']:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        
    # --- 处理缺失值 ---
    # 计算中位数（只在训练集部分计算）
    for col in ['室', '厅', '卫']:
         if col in df.columns:
             median_val = df[col].median()
             df[col].fillna(median_val, inplace=True)
             
    df = df.drop(columns=[col_name], errors='ignore')
            
    return df

df_price = parse_layout(df_price,'房屋户型') 
df_rent = parse_layout(df_rent,'户型')

#### b. 处理房屋面积与套内面积列

使用训练集中房屋面积与套内面积的比例（得房率）来预测缺失值

In [27]:
def process_area_features_revised(df, n_train):
    """
    专门处理建筑面积和套内面积列，并创建得房率特征
    """
    df_processed = df.copy()

    # 清理并转换为数值类型
    for col in ['建筑面积', '套内面积']:
        df_processed[col] = pd.to_numeric(df_processed[col].astype(str).str.replace('㎡', ''), errors='coerce')

    # 计算训练集的平均得房率
    train_part = df_processed.iloc[:n_train]
    valid_train_data = train_part[(train_part['套内面积'].notna()) & (train_part['建筑面积'].notna()) ]

    # 计算得房率
    efficiency_rate_train = valid_train_data['套内面积'].sum() / valid_train_data['建筑面积'].sum() 


    # 尝试使用计算出的得房率填充 '套内面积' 缺失值
    missing_mask = ((df_processed['套内面积'].isna()) 
                    | (df_processed['套内面积'] / df_processed['建筑面积'] >0.95) 
                    | (df_processed['套内面积'] / df_processed['建筑面积'] <0.6))
    df_processed.loc[missing_mask, '套内面积'] = df_processed.loc[missing_mask, '建筑面积'] * efficiency_rate_train



    # 5. 创建 '得房率' 特征
    df_processed['得房率'] = np.nan
    df_processed['得房率'] = df_processed['套内面积'] / df_processed['建筑面积']

    return df_processed

# 调用函数处理面积相关列
df_price = process_area_features_revised(df_price, n_train_price)
df_rent['面积'] = pd.to_numeric(df_rent['面积'].astype(str).str.replace('㎡', ''), errors='coerce')

#### c. 处理楼层信息

In [28]:
def parse_floor_robust(df, n_train, col='所在楼层'):
    """
    解析楼层列，提取'总楼层'和'楼层类别'。
    支持 "(共X层)"、"Y/X层" 和 "地下X层" 格式，并能推断类别，最后填充缺失值并独热编码。
    """
    df_processed = df.copy()

    if col not in df_processed.columns:
        print(f"错误：列 '{col}' 不在DataFrame中，无法处理楼层。")
        # 可以返回原始df或者引发错误
        return df

    # --- 步骤 1 & 2: 提取总楼层和当前楼层 ---
    # 尝试从 "(共X层)" 提取总楼层
    total_floors1 = df_processed[col].str.extract(r'\(共(\d+)层\)', expand=False)
    # 尝试从 "/X层" 或 "/X)" 提取总楼层 (更兼容的正则)
    total_floors2 = df_processed[col].str.extract(r'/(\d+)(?:层|\))?$', expand=False)
    # 尝试从 "Y/..." 提取当前楼层
    current_floor_ext = df_processed[col].str.extract(r'^(\d+)/', expand=False)

    # 合并总楼层：优先用 total_floors1, 缺失则用 total_floors2
    df_processed['总楼层'] = pd.to_numeric(total_floors1.fillna(total_floors2), errors='coerce')
    # 转换当前楼层
    df_processed['当前楼层_temp'] = pd.to_numeric(current_floor_ext, errors='coerce')

    # --- 步骤 3: 提取明确的类别 (包括地下室) ---
    df_processed['楼层类别_explicit'] = np.nan # 初始化为 NaN
    # 使用 .loc 确保安全赋值
    df_processed.loc[df_processed[col].astype(str).str.contains('高楼层', na=False), '楼层类别_explicit'] = '高楼层'
    df_processed.loc[df_processed[col].astype(str).str.contains('中楼层', na=False), '楼层类别_explicit'] = '中楼层'
    df_processed.loc[df_processed[col].astype(str).str.contains('低楼层', na=False), '楼层类别_explicit'] = '低楼层'
    df_processed.loc[df_processed[col].astype(str).str.contains('顶层', na=False), '楼层类别_explicit'] = '顶层'
    df_processed.loc[df_processed[col].astype(str).str.contains('底层', na=False), '楼层类别_explicit'] = '底层'
    # ******** 新增：识别地下室 ********
    df_processed.loc[df_processed[col].astype(str).str.contains('地下', na=False), '楼层类别_explicit'] = '地下室'
    # ***********************************

    # --- 步骤 4: 推断类别 (排除已明确分类和总楼层无效的) ---
    df_processed['楼层类别_inferred'] = np.nan
    # 推断条件：没有明确类别，且有有效的当前和总楼层 (总楼层>0)
    infer_mask = (df_processed['楼层类别_explicit'].isna()) & \
                 (df_processed['当前楼层_temp'].notna()) & \
                 (df_processed['总楼层'].notna()) & \
                 (df_processed['总楼层'] > 0)

    if infer_mask.any():
        # 只在需要推断的行上操作
        current = df_processed.loc[infer_mask, '当前楼层_temp']
        total = df_processed.loc[infer_mask, '总楼层']
        ratio = current / total

        # 应用推断规则 (优先判断顶层/底层)
        # 注意：这里使用 .loc[infer_mask & (condition), ...] 来赋值
        df_processed.loc[infer_mask, '楼层类别_inferred'] = '中楼层' # 默认中楼层
        df_processed.loc[infer_mask & (ratio <= 1/3), '楼层类别_inferred'] = '低楼层'
        df_processed.loc[infer_mask & (ratio >= 2/3), '楼层类别_inferred'] = '高楼层'
        df_processed.loc[infer_mask & (current == total), '楼层类别_inferred'] = '顶层' # 顶层优先级高
        df_processed.loc[infer_mask & (current == 1), '楼层类别_inferred'] = '底层'   # 底层优先级最高

    # --- 步骤 5: 合并与填充 ---
    # 合并类别：优先 explicit, 再 inferred
    df_processed['楼层类别'] = df_processed['楼层类别_explicit'].fillna(df_processed['楼层类别_inferred'])

    # 计算填充值 (基于训练集)
    train_part = df_processed.iloc[:n_train]
    # 计算中位数，处理可能全是NaN的情况
    median_total_floors_train = train_part['总楼层'].median()
    if pd.isna(median_total_floors_train): median_total_floors_train = 6 # 合理备用值，例如6层

    mode_floor_category_train = train_part['楼层类别'].mode()
    fill_category_train = mode_floor_category_train[0] if not mode_floor_category_train.empty else '中楼层' # 备用值

    # 填充 '总楼层'
    original_na_total_floors = df_processed['总楼层'].isnull().sum()
    if original_na_total_floors > 0:
        df_processed['总楼层'].fillna(median_total_floors_train, inplace=True)
        print(f"填充了 '总楼层' 列的 {original_na_total_floors} 个缺失值 (使用训练集中位数 {median_total_floors_train:.0f})。")

    # 填充 '楼层类别'
    original_na_category = df_processed['楼层类别'].isnull().sum()
    if original_na_category > 0:
        df_processed['楼层类别'].fillna(fill_category_train, inplace=True)
        print(f"填充了 '楼层类别' 列的 {original_na_category} 个缺失值 (使用训练集众数 '{fill_category_train}')。")


    # --- 步骤 6: 清理与编码 ---
    # 删除原始列和临时列
    cols_to_drop = [col, '当前楼层_temp', '楼层类别_explicit', '楼层类别_inferred']
    df_processed = df_processed.drop(columns=cols_to_drop, errors='ignore')

    # 对最终的 '楼层类别' 进行独热编码
    df_processed = pd.get_dummies(df_processed, columns=['楼层类别'], prefix='楼层', drop_first=False, dummy_na=False) # dummy_na=False 因为已填充

    return df_processed

# --- 如何调用 ---
# 假设 df_price, df_rent 已经是合并后的数据框
# n_train_price, n_train_rent 是对应的训练集行数

# print("--- 处理 df_price ---")
df_price = parse_floor_robust(df_price, n_train_price, col='所在楼层')
# print("\n--- 处理 df_rent ---")
df_rent = parse_floor_robust(df_rent, n_train_rent, col='楼层')

# --- 检查 df_rent 的结果 ---
print("\n--- df_rent 处理结果 ---")
print("总楼层 缺失值:", df_rent['总楼层'].isnull().sum())
floor_category_cols = [c for c in df_rent.columns if c.startswith('楼层_')]
print("楼层类别列:", floor_category_cols)
display(df_rent[['总楼层'] + floor_category_cols].head())
print("楼层类别列缺失值:", df_rent[floor_category_cols].isnull().sum().sum())

填充了 '总楼层' 列的 12 个缺失值 (使用训练集中位数 18)。
填充了 '楼层类别' 列的 5 个缺失值 (使用训练集众数 '中楼层')。

--- df_rent 处理结果 ---
总楼层 缺失值: 0
楼层类别列: ['楼层_中楼层', '楼层_低楼层', '楼层_地下室', '楼层_底层', '楼层_顶层', '楼层_高楼层']


Unnamed: 0,总楼层,楼层_中楼层,楼层_低楼层,楼层_地下室,楼层_底层,楼层_顶层,楼层_高楼层
0,6.0,False,False,False,False,False,True
1,6.0,False,False,False,False,False,True
2,18.0,False,False,False,True,False,False
3,10.0,False,False,False,True,False,False
4,18.0,False,False,False,False,True,False


楼层类别列缺失值: 0


#### d. 处理房屋朝向

将其转化为 8 个 0-1 变量

In [29]:
def process_orientation(df,orientation_col):
    """
    处理'房屋朝向'列，将其转换为多个二元特征列。
    """
    df_processed = df.copy()


    # 生成虚拟变量
    orientation_dummies = df_processed[orientation_col].str.get_dummies(sep=' ')

    # 添加前缀，避免与其他列名冲突
    orientation_dummies = orientation_dummies.add_prefix('朝向_')

    # 将新创建的虚拟变量列合并回主数据框
    df_processed = pd.concat([df_processed, orientation_dummies], axis=1)

    # 删除原始的 '房屋朝向' 列
    df_processed = df_processed.drop(columns=[orientation_col], errors='ignore')

    return df_processed

df_price = process_orientation(df_price, '房屋朝向')
df_rent = process_orientation(df_rent, '朝向')

#### f. 批量处理剩余的虚拟变量部分

In [30]:
def process_structure(df,structure_col,fillna_value,prefix):
    """
    处理列，将其转换为虚拟变量。
    """
    df_processed = df.copy() 

    # 填充缺失值
    df_processed[structure_col].fillna(fillna_value, inplace=True)

    # 转化为虚拟变量
    structure_dummies = pd.get_dummies(df_processed[structure_col], prefix=prefix, dummy_na=False)

    # 虚拟变量列合并回主数据框
    df_processed = pd.concat([df_processed, structure_dummies], axis=1)

    # 删除原始的列
    df_processed = df_processed.drop(columns=[structure_col], errors='ignore')

    return df_processed

# 房屋结构
df_price = process_structure(df_price,structure_col = '建筑结构',fillna_value='其他结构',prefix='结构')
df_price['结构_未知或其他'] = df_price['结构_未知结构'] | df_price.get('结构_其他结构', 0)
df_price = df_price.drop(columns=['结构_未知结构', '结构_其他结构'], errors='ignore')

for col in ['装修情况','产权所属','交易权属','年份','楼层类别']:
    if col in df_price.columns:
        df_price = process_structure(df_price,structure_col = col,fillna_value='其他',prefix=col)

for col in ['付款方式','租赁方式','电梯','燃气','装修情况','年份']:
    if col in df_rent.columns:
        if col == '付款方式':
            df_rent = process_structure(df_rent,structure_col = col,fillna_value='季付价',prefix=col)
        elif col == '电梯':
            df_rent = process_structure(df_rent,structure_col = col,fillna_value='有',prefix=col)
        elif col == '燃气':
            df_rent = process_structure(df_rent,structure_col = col,fillna_value='有',prefix=col)
        elif col == '建筑结构':
            df_rent = process_structure(df_rent,structure_col = '建筑结构',fillna_value='其他结构',prefix='结构')
            df_rent['结构_未知或其他'] = df_rent['结构_未知结构'] | df_rent.get('结构_其他结构', 0)
            df_rent = df_rent.drop(columns=['结构_未知结构', '结构_其他结构'], errors='ignore')
        else:
            df_rent = process_structure(df_rent,structure_col = col,fillna_value='其他',prefix=col)
        


In [31]:
col_to_process = '房屋用途'

# 仅使用训练集数据计算众数
train_part = df_price.iloc[:n_train_price]
if not train_part[col_to_process].mode().empty:
    global_mode = train_part[col_to_process].mode()[0]
else:
    global_mode = '普通住宅' 

df_price[col_to_process].fillna(global_mode, inplace=True)

# 合并稀有类别
keep_categories = ['普通住宅', '商住两用', '别墅']

# 将所有不在保留列表中的值都标记为 '其他'
df_price[col_to_process] = df_price[col_to_process].apply(lambda x: x if x in keep_categories else '其他')

# 复用 process_structure 函数
df_price = process_structure(
    df_price,
    structure_col=col_to_process,
    fillna_value='其他', 
    prefix='用途'
)

### 2. 地理空间特征

#### a. 分组填充梯户比例等缺失值

对于“有无电梯”的缺失值使用按区域分组后的众数填充，如果明确无电梯，且梯户比用 0 填充，对于有电梯且梯户比例缺失的数据，使用组内的中位数填充。

In [32]:
def chinese_to_arabic(cn_num):
    """将常见的中文数字（包括'十','百'）转换为阿拉伯数字"""

    cn_map = {'零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
    cn_unit = {'十': 10, '百': 100}
    unit = 0
    ldig = []
    if cn_num.startswith('十'):
         cn_num = '一' + cn_num
    for cndig in reversed(cn_num):
        if cndig in cn_unit:
            unit = cn_unit.get(cndig)
            if unit == 100:
                ldig.append(unit)
        elif cndig in cn_map:
            dig = cn_map.get(cndig)
            if unit:
                dig *= unit
                unit = 0
            ldig.append(dig)
        else:
            return np.nan
    if not ldig:
        return np.nan
    val, tmp = 0, 0
    for x in reversed(ldig):
         if x == 100:
             tmp *= x
             val += tmp
             tmp = 0
         elif x == 10:
             if tmp == 0: tmp = 1
             tmp *= x
         else:
             tmp += x
    val += tmp
    return val

def parse_elevator_ratio_strict(text):
    """
    严格解析 'X梯Y户' 格式的文本, X和Y是阿拉伯数字。
    返回 (梯数, 户数) 或 (NaN, NaN)。
    """
    if pd.isna(text) or text == '' or text == '未知':
        return np.nan, np.nan

    match = re.fullmatch(r'(\d+|[一二三四五六七八九十百零]+)梯(\d+|[一二三四五六七八九十百零]+)户', str(text))
    if match:
        t_str, h_str = match.groups()
        t = chinese_to_arabic(t_str)
        h = chinese_to_arabic(h_str)
        if pd.notna(t) and pd.notna(h) and t > 0 and h > 0:
            return int(t), int(h)
    return np.nan, np.nan

def process_elevator_ratio_grouped_median_impute(df, n_train, grouping_col='板块'):
    """
    处理电梯特征：
    1. 确定 '有电梯' (0/1) 状态，优先基于 '配备电梯' 并用分组众数填充。
    2. 计算 '梯户比'。
    3. 对 '有电梯'=1 但 '梯户比' 缺失的情况，用分组中位数填充。
    4. '有电梯'=0 时 '梯户比' 为 0。
    5. 最终保留 '有电梯' 和 '梯户比'。
    """
    df_processed = df.copy()
    ratio_col = '梯户比例'
    has_elevator_col = '配备电梯'

    # 处理 '有电梯' 
    if has_elevator_col in df_processed.columns:
        df_processed['有电梯_numeric_temp'] = df_processed[has_elevator_col].map({'有': 1, '无': 0})

        # 计算每组众数（仅使用非NaN值）
        mode_map_binary = df_processed.loc[df_processed['有电梯_numeric_temp'].notna()].groupby(grouping_col)['有电梯_numeric_temp'].agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan) # 改为nan以防空组
        
        nan_mask_binary = df_processed['有电梯_numeric_temp'].isnull()
        fill_values_binary = df_processed.loc[nan_mask_binary, grouping_col].map(mode_map_binary)
        df_processed['有电梯'] = df_processed['有电梯_numeric_temp'].fillna(fill_values_binary)
        
        # 全局回退
        if df_processed['有电梯'].isnull().any():
            # 优先使用训练集的众数
            train_mode_binary = df_processed.iloc[:n_train]['有电梯_numeric_temp'].mode()
            global_fallback_binary = train_mode_binary[0] if not train_mode_binary.empty else 0 # 默认0(无电梯)
            df_processed['有电梯'].fillna(global_fallback_binary, inplace=True)
            
        df_processed['有电梯'] = df_processed['有电梯'].astype(int)
        df_processed = df_processed.drop(columns=['有电梯_numeric_temp'], errors='ignore')

    # 解析 '梯户比例'
    temp_elevators = pd.Series(np.nan, index=df_processed.index)
    temp_households = pd.Series(np.nan, index=df_processed.index)
    
    if ratio_col in df_processed.columns:
        parsed_ratios = df_processed[ratio_col].apply(parse_elevator_ratio_strict)
        temp_elevators = parsed_ratios.apply(lambda x: x[0])
        temp_households = parsed_ratios.apply(lambda x: x[1])

    # 计算 '梯户比'
    df_processed['梯户比'] = np.nan
    df_processed.loc[df_processed['有电梯'] == 0, '梯户比'] = 0.0

    # 计算 '有电梯' 且解析成功的行的梯户比
    mask_calc_ratio = (df_processed['有电梯'] == 1) & pd.notna(temp_households) & (temp_households > 0) & pd.notna(temp_elevators)
    df_processed.loc[mask_calc_ratio, '梯户比'] = temp_elevators[mask_calc_ratio] / temp_households[mask_calc_ratio]

    # 填充 '有电梯' 但 '梯户比' 缺失的值 (关键修正)
    fill_mask_ratio = (df_processed['有电梯'] == 1) & (df_processed['梯户比'].isna())

    if fill_mask_ratio.any():
        
        # 计算每个分组内，有电梯且梯户比有效的房产的梯户比中位数
        median_map_ratio = df_processed.loc[(df_processed['有电梯'] == 1) & df_processed['梯户比'].notna()].groupby(grouping_col)['梯户比'].median()

        # 仅对需要填充的行应用map
        fill_values_ratio = df_processed.loc[fill_mask_ratio, grouping_col].map(median_map_ratio)
        df_processed.loc[fill_mask_ratio, '梯户比'] = fill_values_ratio

        still_nan_mask = fill_mask_ratio & df_processed['梯户比'].isnull()
        
        if still_nan_mask.any():
            
            # 使用训练集全局中位数作为回退
            train_part = df_processed.iloc[:n_train]
            global_median_ratio = train_part.loc[((train_part['有电梯'] == 1) 
                                                  & train_part['梯户比'].notna()), '梯户比'].median()
            
            # 仅填充那些仍然为 NaN 且 '有电梯'=1 的行
            df_processed.loc[still_nan_mask, '梯户比'] = global_median_ratio

    # 清理
    cols_to_drop_final = [ratio_col, has_elevator_col]
    df_processed = df_processed.drop(columns=cols_to_drop_final, errors='ignore')

    return df_processed

df_price = process_elevator_ratio_grouped_median_impute(df_price, n_train_price, grouping_col='板块')

#### b. 分组填充电、水、暖、物业费、停车位费等缺失值

In [33]:
def process_utility_impute_and_encode(df, n_train, col_to_process, group_col_l1='板块', group_col_l2='区域'):
    """
    处理工具类特征 (如供水、供暖、供电):
    1. 仅使用训练集 (n_train) 数据计算填充地图 (mode)。
    2. 优先使用 L1 (板块) 级别的众数进行填充。
    3. 回退使用 L2 (区域) 级别的众数进行填充。
    4. 使用全局众数（训练集）作为最终回退。
    5. 将处理干净的列转换为哑变量。
    """
    df_processed = df.copy()

    # 仅使用训练集创建填充地图
    train_part = df_processed.iloc[:n_train]

    # L1 地图: '板块' -> 众数
    mode_map_l1 = train_part.groupby(group_col_l1)[col_to_process].agg(
        lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
    )

    # L2 地图: '区域' -> 众数
    mode_map_l2 = train_part.groupby(group_col_l2)[col_to_process].agg(
        lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
    )
    
    # L3 (Global) 回退值
    global_mode_series = train_part[col_to_process].mode()
    
    # 设置一个硬回退值 '未知'
    fill_global = global_mode_series[0] if not global_mode_series.empty else '未知'
    
    # 应用填充
    nan_mask = df_processed[col_to_process].isnull()

    # L1 填充
    fill_values_l1 = df_processed.loc[nan_mask, group_col_l1].map(mode_map_l1)
    df_processed[col_to_process].fillna(fill_values_l1, inplace=True)

    # L2 填充
    nan_mask_l2 = df_processed[col_to_process].isnull()
    if nan_mask_l2.any():
        # print(f"... L1 填充后剩余 {nan_mask_l2.sum()} 个缺失值。使用 L2(区域) 填充...")
        fill_values_l2 = df_processed.loc[nan_mask_l2, group_col_l2].map(mode_map_l2)
        df_processed[col_to_process].fillna(fill_values_l2, inplace=True)
    
    # L3 (Global) 填充
    nan_mask_l3 = df_processed[col_to_process].isnull()
    if nan_mask_l3.any():
        df_processed[col_to_process].fillna(fill_global, inplace=True)

    # 转化为哑变量
    df_processed = pd.get_dummies(df_processed, 
                                columns=[col_to_process], 
                                prefix=col_to_process, 
                                dummy_na=False) # 我们已手动填充所有 NaNs
    

    return df_processed


# 处理供水
df_price = process_utility_impute_and_encode(
    df_price, 
    n_train_price,
    col_to_process='供水',
    group_col_l1='板块',
    group_col_l2='区域'
)

# 处理供暖
df_price = process_utility_impute_and_encode(
    df_price, 
    n_train_price,
    col_to_process='供暖',
    group_col_l1='板块',
    group_col_l2='区域'
)

# 处理供电
df_price = process_utility_impute_and_encode(
    df_price, 
    n_train_price,
    col_to_process='供电',
    group_col_l1='板块',
    group_col_l2='区域'
)

df_rent = process_utility_impute_and_encode(
    df_rent, 
    n_train_rent,
    col_to_process='用电',
    group_col_l1='板块',
    group_col_l2='区县'
)

df_rent = process_utility_impute_and_encode(
    df_rent, 
    n_train_rent,
    col_to_process='用水',
    group_col_l1='板块',
    group_col_l2='区县'
)

df_rent = process_utility_impute_and_encode(
    df_rent, 
    n_train_rent,
    col_to_process='采暖',
    group_col_l1='板块',
    group_col_l2='区县'
)

In [34]:
def extract_fee_value(text):
    """
    从字符串中提取费用数值，支持区间和单位：
    1. "3.5-4.5元/月" -> 4.0 (区间取中点)
    2. "2.5元/立方米" -> 2.5 (单个数字)
    3. 5 或 "5" -> 5.0 (纯数字)
    4. "未知", "免费", "包含在物业费中" -> NaN
    """
    # 辅助函数：尝试将捕获的组转换为浮点数
    def to_float(group):
        try:
            return float(group)
        except (ValueError, TypeError):
            return np.nan

    # 处理非字符串
    if pd.isna(text):
        return np.nan
    if isinstance(text, (int, float)):
        return float(text)

    # 处理字符串
    s = str(text).strip()
    if s == "":
        return np.nan

    # 尝试匹配区间 (e.g., "3.5-4.5")
    range_match = re.search(r'(\d+\.?\d*)\s*[-—–]\s*(\d+\.?\d*)', s)
    if range_match:
        num1 = to_float(range_match.group(1))
        num2 = to_float(range_match.group(2))
        
        if pd.notna(num1) and pd.notna(num2):
            return (num1 + num2) / 2.0

    # 4. 尝试匹配单个数字
    single_match = re.search(r'(\d+\.?\d*)', s)
    if single_match:
        num = to_float(single_match.group(1))
        if pd.notna(num):
            return num
            
    return np.nan

def process_numerical_impute(df, n_train, cols_to_process, 
                             group_col_l1='板块', group_col_l2='区域', 
                             final_fallback_value=0.0):
    """
    (更新版本)
    处理数值型费用特征 (如煤气费、水费、取暖费):
    1. 使用 extract_fee_value 清洗列，提取数字（支持区间）。
    2. 仅使用训练集 (n_train) 数据计算填充地图 (median)。
    3. 优先使用 L1 (板块) 级别的中位数进行填充。
    4. 回退使用 L2 (区域) 级别的中位数进行填充。
    5. 使用全局中位数（训练集）作为最终回退。
    6. 使用 0.0 作为所有回退的最终值。
    """
    df_processed = df.copy()
    
    # 循环处理传入的每一列
    for col in cols_to_process:

        # 清洗并转换为数值
        # 使用新的、更强大的辅助函数
        df_processed[col] = df_processed[col].apply(extract_fee_value)

        if col == '绿 化 率':
            # 找出所有 > 100 的行 (例如 10500)
            cap_mask = (df_processed[col] > 100)
            rows_affected = cap_mask.sum()
            if rows_affected > 0:
                df_processed.loc[cap_mask, col] = np.nan

        if col == '容 积 率':
            # 容积率 > 30 (约等于99.99%分位数) 
            cap_mask = (df_processed[col] > 30) 
            rows_affected = cap_mask.sum()
            if rows_affected > 0:
                df_processed.loc[cap_mask, col] = np.nan
        
        # 仅使用训练集创建填充地图
        train_part = df_processed.iloc[:n_train]
        
        # 计算中位数
        valid_train_data = train_part.loc[train_part[col] > 0]
        
        if valid_train_data.empty:
            fill_global = final_fallback_value
            median_map_l1 = pd.Series(dtype=float) # 空地图
            median_map_l2 = pd.Series(dtype=float) # 空地图
        else:
            # L1 地图: '板块' -> 中位数
            median_map_l1 = valid_train_data.groupby(group_col_l1)[col].median()
            # L2 地图: '区域' -> 中位数
            median_map_l2 = valid_train_data.groupby(group_col_l2)[col].median()
            # L3 (Global) 回退值
            global_median = valid_train_data[col].median()
            fill_global = global_median if pd.notna(global_median) else final_fallback_value

        # 应用填充
        
        # L1 填充: 使用 '板块' 中位数
        nan_mask_l1 = df_processed[col].isnull()
        fill_values_l1 = df_processed.loc[nan_mask_l1, group_col_l1].map(median_map_l1)
        df_processed[col].fillna(fill_values_l1, inplace=True)

        # L2 填充: (只填充 L1 没填上的)
        nan_mask_l2 = df_processed[col].isnull()
        if nan_mask_l2.any():
            fill_values_l2 = df_processed.loc[nan_mask_l2, group_col_l2].map(median_map_l2)
            df_processed[col].fillna(fill_values_l2, inplace=True)
        
        # L3 (Global) 填充: (填充 L2 没填上的)
        nan_mask_l3 = df_processed[col].isnull()
        if nan_mask_l3.any():
            df_processed[col].fillna(fill_global, inplace=True)
            

    return df_processed

# 处理燃气费、水费、停车费用
fee_columns = ['燃气费', '供热费', '停车费用','物 业 费', '绿 化 率', '容 积 率']
df_price = process_numerical_impute(
    df_price, 
    n_train_price,
    cols_to_process=fee_columns,
    group_col_l1='板块',
    group_col_l2='区域',
    final_fallback_value=0.0
)


df_price.loc[(df_price['供暖_自采暖'] == True) |(df_price['供暖_无供暖'] == True) | (df_price['供暖_自采暖/无供暖'] == False), '供热费'] = 0



fee_columns_rent = ['燃气费','物 业 费', '绿 化 率', '容 积 率','供热费','停车费用']
df_rent = process_numerical_impute(
    df_rent, 
    n_train_rent,
    cols_to_process=fee_columns_rent,
    group_col_l1='板块',
    group_col_l2='区县',
    final_fallback_value=0.0
)


In [35]:
def process_multi_hot_impute_and_encode(df, n_train, 
                                        col_to_process,      # 原始列名 (无空格)
                                        keywords_list,       # 要提取的关键词列表
                                        prefix,              # 新列的前缀
                                        group_col_l1='板块', 
                                        group_col_l2='区域'):
    """
    (通用函数)
    处理具有组合关键词的类别型特征 (如 物业类别, 产权描述):
    1. 使用 L1(板块)/L2(区域)/全局 众数填充 NaN。
    2. 执行多标签独热编码 (Multi-hot)，根据 keywords_list 拆分属性。
    """
    df_processed = df.copy()
    col = col_to_process 
            
    # --- 步骤 1: 填充缺失值 (L1/L2/L3 众数) ---
    
    train_part = df_processed.iloc[:n_train]
    
    mode_map_l1 = train_part.groupby(group_col_l1)[col].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    mode_map_l2 = train_part.groupby(group_col_l2)[col].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    
    # 全局回退值 (使用列表中的第一个关键词，或 '其他')
    global_mode_series = train_part[col].mode()
    fallback = keywords_list[0] if keywords_list else '其他'
    fill_global = global_mode_series[0] if not global_mode_series.empty else fallback
    
    # 应用填充
    nan_mask = df_processed[col].isnull()
    if nan_mask.any():
        
        fill_values_l1 = df_processed.loc[nan_mask, group_col_l1].map(mode_map_l1)
        df_processed[col].fillna(fill_values_l1, inplace=True)
        
        nan_mask_l2 = df_processed[col].isnull()
        if nan_mask_l2.any():
            fill_values_l2 = df_processed.loc[nan_mask_l2, group_col_l2].map(mode_map_l2)
            df_processed[col].fillna(fill_values_l2, inplace=True)
            
        nan_mask_l3 = df_processed[col].isnull()
        if nan_mask_l3.any():
            print(f"... L1/L2 填充后剩余 {nan_mask_l3.sum()} 个缺失值。使用全局回退 '{fill_global}'...")
            df_processed[col].fillna(fill_global, inplace=True)
            

    # --- 步骤 2: 拆分与多标签独热编码 ---
    
    df_processed[col] = df_processed[col].astype(str)

    new_cols = []
    for keyword in keywords_list:
        # e.g., "物业_is_住宅"
        col_name = f"{prefix}_is_{keyword}" 
        df_processed[col_name] = df_processed[col].str.contains(keyword).astype(int)
        new_cols.append(col_name)

    # --- 步骤 3: 创建 '其他' 列 ---
    other_col_name = f"{prefix}_is_其他"
    is_known_type = df_processed[new_cols].sum(axis=1)
    df_processed[other_col_name] = (is_known_type == 0).astype(int)
    
    # --- 步骤 4: 清理 ---
    df_processed = df_processed.drop(columns=[col], errors='ignore')


    return df_processed

# 1. 定义 '物业类别' 的关键词 (基于您的透视)
keywords_type = ['住宅', '商业', '底商', '别墅', '办公', '公寓']

# 2. 调用通用函数
df_price = process_multi_hot_impute_and_encode(
    df_price,
    n_train_price,
    col_to_process='物业类别',
    keywords_list=keywords_type,
    prefix='物业',
    group_col_l1='板块',
    group_col_l2='区域'
)
df_rent = process_multi_hot_impute_and_encode(
    df_rent,
    n_train_rent,
    col_to_process='物业类别',
    keywords_list=keywords_type,
    prefix='物业',
    group_col_l1='板块',
    group_col_l2='区县'
)
# 1. 定义 '产权描述' 的关键词
keywords_rights = ['商品房', '使用权', '已购公房', '私产']

# 复用函数
df_price = process_multi_hot_impute_and_encode(
    df_price,
    n_train_price,
    col_to_process='产权描述',
    keywords_list=keywords_rights,
    prefix='产权',
    group_col_l1='板块',
    group_col_l2='区域'
)
df_rent = process_multi_hot_impute_and_encode(
    df_rent,
    n_train_rent,
    col_to_process='产权描述', 
    keywords_list=keywords_rights,
    prefix='建构',
    group_col_l1='板块',
    group_col_l2='区县'
)

keywords_structure = ['板楼', '塔楼', '塔板结合', '平房']
df_price = process_multi_hot_impute_and_encode(
    df_price,
    n_train_price,
    col_to_process='建筑结构_comm', 
    keywords_list=keywords_structure,
    prefix='建构',
    group_col_l1='板块',
    group_col_l2='区域'
)
df_rent = process_multi_hot_impute_and_encode(
    df_rent,
    n_train_rent,
    col_to_process='建筑结构', 
    keywords_list=keywords_structure,
    prefix='建构',
    group_col_l1='板块',
    group_col_l2='区县'
)

keywords_furniture = ['洗衣机', '冰箱', '空调', '衣柜', '热水器', '电视', '天然气']
df_rent = process_multi_hot_impute_and_encode(
    df_rent,
    n_train_rent,
    col_to_process='配套设施',
    keywords_list=keywords_furniture,
    prefix='物业',
    group_col_l1='板块',
    group_col_l2='区县'
)


... L1/L2 填充后剩余 23243 个缺失值。使用全局回退 '普通住宅'...
... L1/L2 填充后剩余 20827 个缺失值。使用全局回退 '普通住宅'...
... L1/L2 填充后剩余 23243 个缺失值。使用全局回退 '商品房'...
... L1/L2 填充后剩余 20827 个缺失值。使用全局回退 '商品房'...
... L1/L2 填充后剩余 23283 个缺失值。使用全局回退 '塔楼'...
... L1/L2 填充后剩余 21902 个缺失值。使用全局回退 '塔楼'...
... L1/L2 填充后剩余 3010 个缺失值。使用全局回退 '洗衣机、空调、衣柜、电视、冰箱、热水器、床、天然气'...


#### c. 填充环线位置等缺失值

In [36]:
def process_ring_road_impute_and_encode(df, n_train, col_to_process='环线', group_col_l1='板块', group_col_l2='区域'):
    """
    处理环线特征：
    1. 仅使用训练集 (n_train) 数据计算填充地图 (mode)。
    2. 优先使用 ('区域', '板块') 级别的众数进行填充。
    3. 回退使用 '城市' 级别的众数进行填充。
    4. 将所有剩余的 NaN (包括结构性缺失) 填充为 "无环线"。
    5. 对处理干净的 '环线' 列进行独热编码。
    """
    df_processed = df.copy()
    train_part = df_processed.iloc[:n_train]
    
    # 地图: ('区域', '板块') -> 环线众数
    mode_map_plate = train_part.groupby([group_col_l2, group_col_l1])[col_to_process].agg(
        lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
    )

    # 地图: ('区域') -> 环线众数
    mode_map_city = train_part.groupby(group_col_l2)[col_to_process].agg(
        lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan
    )

    # 应用填充 (应用到整个数据集)
    
    mapped_values_l1 = df_processed.set_index([group_col_l2, group_col_l1]).index.map(mode_map_plate)
    fill_values_plate = pd.Series(mapped_values_l1, index=df_processed.index)

    fill_values_city = df_processed[group_col_l2].map(mode_map_city)

    df_processed[col_to_process].fillna(fill_values_plate, inplace=True)
    df_processed[col_to_process].fillna(fill_values_city, inplace=True)

    # 处理结构性缺失    
    fill_label = "无环线"
    df_processed[col_to_process].fillna(fill_label, inplace=True)

    # 独热编码
    df_processed = pd.get_dummies(df_processed, 
                                columns=[col_to_process], 
                                prefix=col_to_process, 
                                dummy_na=False)
    

    # 打印新生成的列以供检查
    new_cols = [col for col in df_processed.columns if col.startswith(f"{col_to_process}_")]

    return df_processed


df_price = process_ring_road_impute_and_encode(
     df_price, 
     n_train_price, 
     col_to_process='环线', 
     group_col_l1='板块', 
     group_col_l2='区域'
)

community_cols = ['房屋总数', '楼栋总数', '停车位']
df_price = process_numerical_impute(
    df_price, 
    n_train_price,
    cols_to_process=community_cols,
    group_col_l1='板块',
    group_col_l2='区域',
    final_fallback_value=0.0
)

df_rent = process_ring_road_impute_and_encode(
     df_rent, 
     n_train_rent, 
     col_to_process='环线位置', 
     group_col_l1='板块', 
     group_col_l2='区县'
)

community_cols = ['房屋总数', '楼栋总数', '停车位']
df_price = process_numerical_impute(
    df_price, 
    n_train_price,
    cols_to_process=community_cols,
    group_col_l1='板块',
    group_col_l2='区域',
    final_fallback_value=0.0
)
df_rent = process_numerical_impute(
    df_rent, 
    n_train_rent,
    cols_to_process=community_cols,
    group_col_l1='板块',
    group_col_l2='区县',
    final_fallback_value=0.0
)

### 3. 提取时间相关数据

#### a. 提取交易时间的年代和月份

In [37]:
def process_time_and_fill_age_col(df, time_col='交易时间', last_time_col='上次交易', age_col='房屋年限'):
    """
    处理时间特征并填充 '房屋年限' 列，然后进行独热编码：
    1. 提取 '交易时间' 的年份和月份。
    2. 对 'age_col' (房屋年限) 中的 NaN 值进行填充：
       a. 如果 '上次交易' 有效，根据 '交易时间'-'上次交易' 的间隔计算并填充 
          '未满两年', '满两年', '满五年'。
       b. 如果 '上次交易' 无效 (NaN)，填充为 '其他'。
    3. 对填充完毕的 'age_col' 进行独热编码。
    """
    df_processed = df.copy()
    
    # 统一日期格式：将 '/' 替换为 '-'，以便 pandas 能正确解析
    # 例如：'2024/11/28' -> '2024-11-28'
    df_processed[time_col] = df_processed[time_col].astype(str).str.replace('/', '-')
    
    # 提取年份和月份
    t_trans = pd.to_datetime(df_processed[time_col], errors='coerce')
    
    # 如果存在 last_time_col，也需要统一格式
    if last_time_col in df_processed.columns:
        df_processed[last_time_col] = df_processed[last_time_col].astype(str).str.replace('/', '-')
        t_last = pd.to_datetime(df_processed[last_time_col], errors='coerce')
    else:
        t_last = None

    # 提取年份和月份
    df_processed['交易年份'] = t_trans.dt.year
    df_processed['交易月份'] = t_trans.dt.month

    # 检查是否有解析失败的情况
    failed_count = t_trans.isna().sum()
    if failed_count > 0:
        print(f"警告: 有 {failed_count} 条交易时间数据无法解析")

    if last_time_col not in df_processed.columns or t_last is None:
        # 租赁数据集没有 '上次交易' 列，直接返回
        df_processed = df_processed.drop(columns=[time_col], errors='ignore')
        return df_processed

    # 找出需要填充的行
    mask_calc = (df_processed[age_col].isna()) & (t_trans.notna()) & (t_last.notna())
    
    # 找出无法填充的行：
    mask_fill_other = (df_processed[age_col].isna()) & (~mask_calc)
    
    # 执行填充
    # 根据计算结果填充
    if mask_calc.any():
        
        # 1. 计算 "满X年" 的布尔掩码
        # [mask_calc] 确保只对相关行进行日期计算，提高效率
        two_years_later = t_last[mask_calc] + pd.DateOffset(years=2)
        five_years_later = t_last[mask_calc] + pd.DateOffset(years=5)
        
        t_trans_masked = t_trans[mask_calc] # 确保索引对齐
        
        is_g5 = (t_trans_masked >= five_years_later)
        is_g2 = (t_trans_masked >= two_years_later) & (~is_g5)
        is_u2 = (~is_g2) & (~is_g5) # 默认是 "未满两年"
        
        # 获取需要操作的行的实际索引
        idx_u2 = is_u2[is_u2].index
        idx_g2 = is_g2[is_g2].index
        idx_g5 = is_g5[is_g5].index
        
        # 执行 .loc 填充
        df_processed.loc[idx_u2, age_col] = "未满两年"
        df_processed.loc[idx_g2, age_col] = "满两年"
        df_processed.loc[idx_g5, age_col] = "满五年"

    # 填充 "其他"
    if mask_fill_other.any():

        df_processed.loc[mask_fill_other, age_col] = "其他"

    df_processed = pd.get_dummies(df_processed, 
                                columns=[age_col], 
                                prefix='年限', 
                                dummy_na=False) 

    # 删除原始时间列
    df_processed = df_processed.drop(columns=[time_col, last_time_col], errors='ignore')


    return df_processed

df_price = process_time_and_fill_age_col(
    df_price,
    time_col='交易时间',
    last_time_col='上次交易',
    age_col='房屋年限'
)

# 处理 df_rent 的交易时间（df_rent 没有 '上次交易' 和 '房屋年限' 列）
df_rent = process_time_and_fill_age_col(
    df_rent,
    time_col='交易时间',
    last_time_col='上次交易',  # 不存在，函数会处理
    age_col='房屋年限'  # 不存在，函数会处理
)

# 交易月份独热编码
df_price = process_structure(df_price, structure_col='交易月份', fillna_value='', prefix='交易月份')
df_rent = process_structure(df_rent, structure_col='交易月份', fillna_value='', prefix='交易月份')

#### b. 提取建筑年代（房龄）


In [38]:
def parse_building_year(text):
    """
    解析建筑年代：
    1. "1955-2000年" -> 1977.5 (区间取中点)
    2. "2005年" -> 2005.0 (单个数字)
    3. "未知" -> NaN
    """
    if pd.isna(text):
        return np.nan
    
    s = str(text).strip()

    # 尝试匹配区间 
    range_match = re.search(r'(\d{4})\s*[-—–]\s*(\d{4})', s)
    if range_match:
        try:
            year1 = float(range_match.group(1))
            year2 = float(range_match.group(2))
            return (year1 + year2) / 2.0
        except ValueError:
            pass

    # 试匹配单个4位数年份
    single_match = re.search(r'(\d{4})', s)
    if single_match:
        try:
            return float(single_match.group(1))
        except ValueError:
            return np.nan
            
    # 匹配失败
    return np.nan

def process_building_age(df, n_train, col='建筑年代', 
                         trans_year_col='交易年份',
                         group_col_l1='板块', group_col_l2='区域'):
    """
    处理 '建筑年代' 特征：
    1. 使用 parse_building_year 解析原始列。
    2. 计算 '房龄' = '交易年份' - '解析后的建筑年代'。
    3. 使用 L1/L2/L3 中位数填充 '房龄' 的缺失值。
    4. 清理异常值 (如负房龄)。
    """
    df_processed = df.copy()

    # 解析建筑年代
    parsed_year_col = col + '_parsed'
    df_processed[parsed_year_col] = df_processed[col].apply(parse_building_year)
    
    # 计算 '房龄'
    age_col = '房龄'
    df_processed[age_col] = df_processed[trans_year_col] - df_processed[parsed_year_col]

    nan_count = df_processed[age_col].isnull().sum()


    # 填充 '房龄' 的缺失值 (L1/L2/L3 中位数)
    train_part = df_processed.iloc[:n_train]
    valid_train_age = train_part.loc[train_part[age_col] >= 0]
    
    median_map_l1 = valid_train_age.groupby(group_col_l1)[age_col].median()
    median_map_l2 = valid_train_age.groupby(group_col_l2)[age_col].median()
    global_median = valid_train_age[age_col].median()
    fill_global = global_median if pd.notna(global_median) else 20
    
    # L1 填充
    nan_mask_l1 = df_processed[age_col].isnull()
    fill_values_l1 = df_processed.loc[nan_mask_l1, group_col_l1].map(median_map_l1)
    df_processed[age_col].fillna(fill_values_l1, inplace=True)

    # L2 填充
    nan_mask_l2 = df_processed[age_col].isnull()
    if nan_mask_l2.any():
        fill_values_l2 = df_processed.loc[nan_mask_l2, group_col_l2].map(median_map_l2)
        df_processed[age_col].fillna(fill_values_l2, inplace=True)
    
    # L3 (Global) 填充
    nan_mask_l3 = df_processed[age_col].isnull()
    if nan_mask_l3.any():
        df_processed[age_col].fillna(fill_global, inplace=True)

    # 清理异常值
    negative_age_count = (df_processed[age_col] < 0).sum()
    if negative_age_count > 0:
        print(f"... 发现 {negative_age_count} 行负房龄，将其修正为 0。")
        df_processed[age_col] = np.maximum(0, df_processed[age_col])
        
    # 清理临时列 
    df_processed = df_processed.drop(columns=[col, parsed_year_col], errors='ignore')

    
    return df_processed

df_price = process_building_age(
    df_price,
    n_train_price,
    col='建筑年代',
    trans_year_col='交易年份',
    group_col_l1='板块',
    group_col_l2='区域'
)
df_rent = process_building_age(
    df_rent,
    n_train_rent,
    col='建筑年代',
    trans_year_col='交易年份',
    group_col_l1='板块',
    group_col_l2='区县'
)

#### c. 处理租期数据

In [39]:
def parse_lease_term(text):
    """
    解析租期字符串，转换为统一的【月数】。
    1. "X年以内" -> X * 12
    2. "X个月" -> X
    3. "X～Y月" -> (X+Y)/2
    4. 其他/NaN -> NaN
    """
    if pd.isna(text):
        return np.nan
    
    s = str(text).strip()
    
    match_year = re.search(r'(\d+)\s*年以内', s)
    if match_year:
        try:
            years = int(match_year.group(1))
            return float(years * 12)
        except ValueError: pass
            
    match_range = re.search(r'(\d+)\s*[～\-]\s*(\d+)\s*月', s)
    if match_range:
        try:
            month1 = int(match_range.group(1))
            month2 = int(match_range.group(2))
            return float((month1 + month2) / 2.0)
        except ValueError: pass

    match_month = re.search(r'(\d+)\s*个?月', s)
    if match_month:
        try:
            months = int(match_month.group(1))
            return float(months)
        except ValueError: pass
            
    try:
        num = int(s)
        if num > 0: return float(num)
    except ValueError: pass

    return np.nan

# --- 主处理函数：(仅清洗+填充) ---
def process_lease_duration_numeric(df, n_train, col='租期'):
    """
    (修改版 - 仅清洗+填充)
    处理'租期'特征：
    1. 使用 parse_lease_term 清洗并转换为数值（月数），存入新列 '租期_月数'。
    2. 使用训练集的中位数填充 '租期_月数' 的 NaN。
    3. 删除原始 '租期' 列。
    """
    df_processed = df.copy()

    # 清洗并转换为数值 (月数) 
    lease_col_numeric = col + '_月数' # 新列名，例如 '租期_月数'
    df_processed[lease_col_numeric] = df_processed[col].apply(parse_lease_term)
    
    nan_count = df_processed[lease_col_numeric].isnull().sum()
    
    # 填充缺失值 (使用训练集 Median)
    if nan_count > 0:
        train_part = df_processed.iloc[:n_train]
        median_lease = train_part[lease_col_numeric].median()
        # 提供一个合理的回退值 (例如 12 个月)
        fill_value = median_lease if pd.notna(median_lease) else 12.0 
        
        print(f"... 使用训练集的中位数 {fill_value:.1f} 月填充 '{lease_col_numeric}' 的缺失值...")
        df_processed[lease_col_numeric].fillna(fill_value, inplace=True)

    # 清理
    df_processed = df_processed.drop(columns=[col], errors='ignore')
    
    return df_processed

df_rent = process_lease_duration_numeric(df_rent, n_train_rent, col='租期')

... 使用训练集的中位数 12.0 月填充 '租期_月数' 的缺失值...


### 4. 创建复杂特征列

#### a.创建一些比率类特征

In [40]:
def cal_ratio(df_price, NumeratorValue, DenominatorValue, new_col_name):
    """
    计算一些比率类特征变量，处理除以零的情况。
    """
    df_price[new_col_name] = np.where(
        df_price[DenominatorValue] > 0,
        df_price[NumeratorValue] / df_price[DenominatorValue],
        0
    )
def feature(df):
    cal_ratio(df, '房屋总数', '楼栋总数', '平均每栋房屋数')
    cal_ratio(df, '停车位', '房屋总数', '停车位与房屋总数比')
    cal_ratio(df, '绿 化 率', '容 积 率', '绿化率与容积率比')
    cal_ratio(df, '室', '厅', '室厅比')
    cal_ratio(df, '室', '卫', '室卫比')
    df = process_structure(df,structure_col = '卫',fillna_value='',prefix='卫')
    df = process_structure(df,structure_col = '厅',fillna_value='',prefix='厅')
    df = process_structure(df,structure_col = '室',fillna_value='',prefix='室')
feature(df_price)
feature(df_rent)

#### b. 计算距城市中心距离

In [41]:
def compute_city_center_and_distances(df):
    """
    计算每个城市的中心点 (平均经纬度) 和房源到中心点的距离。
    """
    city_centers = df.groupby('城市', observed=True)[['lon', 'lat']].mean().reset_index()
    
    # 重命名，以便合并
    city_centers = city_centers.rename(columns={'lon': 'center_lon', 'lat': 'center_lat'})


    # 将中心点坐标合并回主 DataFrame
    df = pd.merge(df, city_centers, on='城市', how='left')


    # 定义距离计算函数
    def compute_distance(row):
        # 检查必要的列是否存在且非 NaN
        if pd.isna(row['lat']) or pd.isna(row['lon']) or pd.isna(row['center_lat']) or pd.isna(row['center_lon']):
            return np.nan
        
        # geopy.distance.geodesic 需要 (纬度, 经度) 格式
        center_coords = (row['center_lat'], row['center_lon'])
        property_coords = (row['lat'], row['lon'])
        
        try:
            distance_km = geodesic(center_coords, property_coords).km
            return distance_km
        except ValueError:
            # 处理 geopy 可能因无效坐标抛出的错误
            return np.nan

    # 应用函数计算距离并创建新特征
    df['距离中心_公里'] = df.apply(compute_distance, axis=1)
    
    # 创建距离的平方特征
    df['距离中心_公里_平方'] = df['距离中心_公里'] ** 2
 
    df = df.drop(columns=['center_lon', 'center_lat'], errors='ignore')

    return df
df_price = compute_city_center_and_distances(df_price)
df_rent = compute_city_center_and_distances(df_rent)

#### c. 对经纬度进行 K-means 聚类

In [42]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans # 确保已导入

def create_geo_clusters(df, n_clusters=50, city_col='城市', lon_col='lon', lat_col='lat'):
    """
    (完善版)
    为每个城市计算地理聚类 (基于 lon, lat) 并进行独热编码。

    参数:
    df (pd.DataFrame): 输入 DataFrame，必须包含 city_col, lon_col, lat_col。
    n_clusters (int): 为每个城市生成的聚类数量。
    city_col (str): 代表城市标签的列名。
    lon_col (str): 代表经度的列名。
    lat_col (str): 代表纬度的列名。

    返回:
    pd.DataFrame: 增加了 GeoCluster_* 独热编码列的 DataFrame 副本。
                 原始的 city_col, lon_col, lat_col 仍在。
                 临时的 '地理聚类_temp' 列会被删除。
    """
    df_processed = df.copy() # 在副本上操作

    # --- 1. 检查必需的列 ---
    required_cols = [city_col, lon_col, lat_col]
    if not all(col in df_processed.columns for col in required_cols):
        print(f"错误: DataFrame 缺少必需的列: {required_cols}。跳过聚类...")
        return df # 返回原始 DataFrame

    print(f"--- 开始为 '{city_col}' 列创建地理聚类 (每个城市 {n_clusters} 个簇) ---")

    # --- 2. 处理坐标 NaN ---
    coord_nan_mask = df_processed[[lon_col, lat_col]].isnull().any(axis=1)
    initial_nan_count = coord_nan_mask.sum()
    if initial_nan_count > 0:
        print(f"    (警告: 发现 {initial_nan_count} 行的经纬度坐标存在 NaN，这些行将不会被分配聚类)")

    # --- 3. 初始化聚类列 ---
    cluster_col_temp = '地理聚类_temp'
    df_processed[cluster_col_temp] = -1 # 初始化为 -1 (未分配)

    all_city_labels = df_processed[city_col].unique()
    
    # --- 4. 循环为每个城市进行聚类 ---
    for city_label in all_city_labels:
        # 筛选出当前城市的数据，并且坐标有效
        city_mask = (df_processed[city_col] == city_label) & (~coord_nan_mask)
        city_data = df_processed.loc[city_mask, [lon_col, lat_col]]
        
        # 确保该城市有足够的数据点进行聚类
        if len(city_data) >= n_clusters:
            try:
                # 初始化并拟合 KMeans
                kmeans = KMeans(n_clusters=n_clusters, 
                                random_state=42, 
                                n_init='auto') # n_init='auto' 抑制未来警告
                # 预测聚类标签 (0, 1, 2...)
                clusters = kmeans.fit_predict(city_data)
                
                # 将聚类标签写回主 DataFrame (使用 .loc 和 city_data 的索引确保准确)
                df_processed.loc[city_data.index, cluster_col_temp] = clusters
                
            except Exception as e:
                print(f"    (警告: 城市 {city_label} KMeans 失败: {e})")
        elif len(city_data) > 0: # 如果城市有数据但太少
             print(f"    (警告: 城市 {city_label} 有效数据点不足 ({len(city_data)})，跳过聚类)")
        # 如果城市完全没有有效坐标数据，则不做任何事，保持 -1

    # --- 5. 创建组合标签 ---
    #    例如：城市0的簇1 -> "C0_1", 城市1的簇1 -> "C1_1"
    #    对于未分配聚类的行 (包括 NaN 坐标或小城市)，创建一个特定标签
    combined_label_col = '地理聚类_带城市'
    df_processed[combined_label_col] = np.where(
         (df_processed[cluster_col_temp] != -1) & df_processed[city_col].notna(), # 仅组合有效聚类和城市
         'C' + df_processed[city_col].astype(str) + '_' + df_processed[cluster_col_temp].astype(str),
         'GeoCluster_Unknown' # 为所有未聚类的行指定一个标签
    )
    
    # --- 6. 独热编码 ---

    df_processed = pd.get_dummies(df_processed, 
                                columns=[combined_label_col], 
                                prefix='GeoCluster', 
                                drop_first=False) # drop_first=False 对聚类标签通常更安全
    
    # --- 7. 清理临时列 ---
    df_processed = df_processed.drop(columns=[cluster_col_temp], errors='ignore')

    return df_processed # 返回修改后的副本


df_price = create_geo_clusters(df_price, n_clusters=100) 
df_rent = create_geo_clusters(df_rent, n_clusters=100)

--- 开始为 '城市' 列创建地理聚类 (每个城市 100 个簇) ---
--- 开始为 '城市' 列创建地理聚类 (每个城市 100 个簇) ---


### 5. 清理与最终准备

In [43]:
df_price = process_structure(df_price,structure_col = '城市',fillna_value='',prefix='城市')
df_rent = process_structure(df_rent,structure_col = '城市',fillna_value='',prefix='城市')

df_price.describe()

Unnamed: 0,区域,建筑面积,套内面积,lon,lat,房屋总数,楼栋总数,绿 化 率,容 积 率,物 业 费,...,建构_is_其他,交易年份,房龄,平均每栋房屋数,停车位与房屋总数比,绿化率与容积率比,室厅比,室卫比,距离中心_公里,距离中心_公里_平方
count,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,...,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0,137888.0
mean,65.877031,98.128441,80.657047,115.04158,32.390722,1915.849323,31.232943,33.218714,2.657228,2.403204,...,0.0,2024.042375,16.647613,128.733055,2.363185,15.813693,1.743992,1.939371,17.467988,477.315786
std,34.223416,44.855143,37.474389,5.815359,6.010348,1839.795985,61.627979,8.243373,1.391097,3.643612,...,0.0,0.851006,8.002008,126.660009,33.158597,21.424876,0.729444,0.658077,13.121983,1244.364781
min,1.0,11.7,9.647826,103.482624,23.025853,1.0,1.0,0.01,0.02,0.02,...,0.0,2018.0,2.0,0.666667,0.000286,0.003333,0.0,0.0,0.204854,0.041965
25%,35.0,70.74,58.03,107.647988,30.42838,714.0,8.0,30.0,2.0,1.3,...,0.0,2024.0,11.5,40.833333,0.274194,10.801394,1.5,1.5,8.538432,72.904821
50%,68.0,90.77,74.469674,116.124574,32.18721,1372.0,15.0,34.0,2.5,1.9,...,0.0,2024.0,15.0,91.777778,0.534985,13.6,1.5,2.0,14.537074,211.326531
75%,89.0,117.44,96.478262,121.534146,40.479251,2477.0,29.0,35.0,3.0,2.65,...,0.0,2025.0,19.0,175.428571,1.013564,17.6,2.0,2.0,22.579847,509.849483
max,131.0,508.11,466.36,122.966669,42.693676,12669.0,734.0,80.0,30.0,76.45,...,0.0,2025.0,90.0,2175.0,4350.0,2250.0,17.0,8.0,227.459046,51737.617648


#### a. 清除作为数据填充依据的列

In [44]:
cols_to_drop_final = ['区域', 'lon', 'lat', '板块', '区县', '建筑结构_comm']
df_price = df_price.drop(columns=cols_to_drop_final, errors='ignore')
df_rent = df_rent.drop(columns=cols_to_drop_final, errors='ignore')

#### b. 重新分离训练集和测试集

In [45]:
y_train_ln_price = np.log1p(y_train_price)
X_train_price = df_price.iloc[:n_train_price]
X_test = df_price.iloc[n_train_price:]

y_train_ln_rent = np.log1p(y_train_rent)
X_train_rent = df_rent.iloc[:n_train_rent]
X_test_rent = df_rent.iloc[n_train_rent:]

#### c. 保存数据

In [46]:
X_train_price.to_parquet('X_train_price.parquet')
X_test.to_parquet('X_test_price.parquet')
y_train_ln_price.to_frame().to_parquet('y_train_price.parquet')
df_price.to_parquet('df_price.parquet')

X_train_rent.to_parquet('X_train_rent.parquet')
X_test_rent.to_parquet('X_test_rent.parquet')
y_train_ln_rent.to_frame().to_parquet('y_train_rent.parquet')
df_rent.to_parquet('df_rent.parquet')

## Part3：特征工程

### 1. 异常值检测与处理
#### a. 识别特征类型

In [47]:
# 加载上一步保存的数据
X_train_price = pd.read_parquet('X_train_price.parquet')
X_test_price = pd.read_parquet('X_test_price.parquet')
y_train_ln_price = pd.read_parquet('y_train_price.parquet')

X_train_rent = pd.read_parquet('X_train_rent.parquet')
X_test_rent = pd.read_parquet('X_test_rent.parquet')
y_train_ln_rent = pd.read_parquet('y_train_rent.parquet')

#### b. 识别并清除异常值
接下来使用 Q1 和 Q99 分位数识别异常值并清除（对于数值型变量）

In [48]:
def detect_outliers(X_train, X_test, y_train):
    """
    (修正版)
    使用Q1和Q99分位数对数值特征进行“缩尾处理”(Capping)。
    """
    X_train = X_train.copy() # 使用 .copy() 避免 SettingWithCopyWarning
    X_test = X_test.copy()
    y_train = y_train.copy() # 对 y_train 也使用 .copy()

    # --- (*** 修正点：更精确地识别数值列 ***) ---
    numeric_cols = []
    binary_cols = []
    other_cols = [] 

    for col in X_train.columns:
        dtype = X_train[col].dtype

        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):

            if X_train[col].nunique(dropna=False) == 2 and X_train[col].min() == 0 and X_train[col].max() == 1:
                binary_cols.append(col)
            else:
                numeric_cols.append(col) # 这是真正的数值列
        elif pd.api.types.is_bool_dtype(dtype):
            binary_cols.append(col)
        else:
            other_cols.append(col)
    
    numeric_cols.remove('供热费')
    quantiles = X_train[numeric_cols].quantile([0.01, 0.99])

    for col in numeric_cols:
        # 从训练集获取 Q1 和 Q99
        Q1 = quantiles.loc[0.01, col]
        Q99 = quantiles.loc[0.99, col]
        
        X_train[col] = X_train[col].clip(Q1, Q99)
        if col in X_test.columns:
                X_test[col] = X_test[col].clip(Q1, Q99)
    
    return X_train, X_test, y_train


X_train_price, X_test_price, y_train_ln_price = detect_outliers(X_train_price, X_test_price, y_train_ln_price)
X_train_rent, X_test_rent, y_train_ln_rent = detect_outliers(X_train_rent, X_test_rent, y_train_ln_rent)


### 2. 创建非线性特征

#### a. 对数转换

In [49]:
def cal_skew(df):
    # 重新识别 0/1 哑变量列
    binary_cols = [col for col in df.columns
                   if df[col].nunique(dropna=False) == 2 and
                   df[col].min() == 0 and
                   df[col].max() == 1]

    # 识别连续型数值列
    continuous_numeric_cols = [col for col in df.columns if col not in binary_cols]

    X_train_numeric = df[continuous_numeric_cols]

    skewness = X_train_numeric.skew()
    kurtosis = X_train_numeric.kurtosis()

    # 创建一个汇总 DataFrame
    distribution_summary = pd.DataFrame({
        'Skew (偏度)': skewness,
        'Kurtosis (峰度)': kurtosis
    })

    # 按偏度的绝对值降序排列
    distribution_summary_sorted = distribution_summary.reindex(
        distribution_summary['Skew (偏度)'].abs().sort_values(ascending=False).index
    )

    print(distribution_summary_sorted)

cal_skew(X_train_price)
cal_skew(X_train_rent)

                  Skew (偏度)  Kurtosis (峰度)
供热费               89.462816    8391.021565
停车位与房屋总数比          6.267866      44.141269
楼栋总数               4.426539      23.101704
物 业 费              3.480153      15.804279
距离中心_公里_平方         3.002323      10.329502
...                     ...            ...
GeoCluster_C2_5    0.000000       0.000000
GeoCluster_C5_56   0.000000       0.000000
GeoCluster_C5_61   0.000000       0.000000
GeoCluster_C5_82   0.000000       0.000000
GeoCluster_C9_99   0.000000       0.000000

[69 rows x 2 columns]
                   Skew (偏度)  Kurtosis (峰度)
楼栋总数                4.463038      23.762716
交易年份                3.793161      12.388322
停车位与房屋总数比           3.533512      16.618094
物 业 费               3.013789      11.841948
卫                   2.936996       6.626078
停车位                 2.928420       9.504336
距离中心_公里_平方          2.583159       7.585866
租期_月数               2.540422       5.104637
容 积 率               1.995659       5.375790
停车费用                1

In [50]:
def log_transform(X_train, X_test, skewed_cols):
    """
    对训练集和测试集中指定的偏斜数值特征应用 log1p 转换。
    会检查列是否在各自的DataFrame中存在。
    转换后会删除原始列。

    Args:
        X_train (pd.DataFrame): 训练集特征.
        X_test (pd.DataFrame): 测试集特征.
        skewed_cols (list): 需要进行对数转换的列名列表.

    Returns:
        tuple: 包含转换后的 X_train 和 X_test 的元组.
    """
    # 创建副本以避免修改原始传入的DataFrame
    X_train_transformed = X_train.copy()
    X_test_transformed = X_test.copy()
    
    # 记录实际转换的列和跳过的列
    transformed_in_train = []
    transformed_in_test = []
    skipped = []

    for col in skewed_cols:
        col_log = f'log_{col}' # 新列名

        # 处理训练集
        if col in X_train_transformed.columns:
            # 检查数据类型是否为数值型，以防万一
            if pd.api.types.is_numeric_dtype(X_train_transformed[col]):
                X_train_transformed[col_log] = np.log1p(X_train_transformed[col])
                X_train_transformed = X_train_transformed.drop(columns=[col], errors='ignore')
                transformed_in_train.append(col)

        # 处理测试集
        if col in X_test_transformed.columns:
             # 检查数据类型是否为数值型
            if pd.api.types.is_numeric_dtype(X_test_transformed[col]):
                X_test_transformed[col_log] = np.log1p(X_test_transformed[col])
                X_test_transformed = X_test_transformed.drop(columns=[col], errors='ignore')
                transformed_in_test.append(col)

                if col not in transformed_in_train and (col + " (train, 非数值)") not in skipped:
                     skipped.append(col + " (test, 非数值)")


        # 记录那些根本不存在于任一集合的列（或只存在于一个集合中）
        if col not in transformed_in_train and col not in transformed_in_test and \
           (col + " (train, 非数值)") not in skipped and (col + " (test, 非数值)") not in skipped:
             if col not in X_train.columns and col not in X_test.columns:
                  skipped.append(col + " (train&test 均不存在)")
             elif col not in X_train.columns:
                  skipped.append(col + " (train 不存在)")
             elif col not in X_test.columns:
                  skipped.append(col + " (test 不存在)")

    return X_train_transformed, X_test_transformed

skewed_cols = ['容 积 率', '套内面积', '建筑面积', '燃气费', '楼栋总数',
               '供热费', '停车位', '房屋总数', '物 业 费','停车费用']

X_train_price, X_test_price = log_transform(X_train_price, X_test_price, skewed_cols)
X_train_rent, X_test_rent = log_transform(X_train_rent, X_test_rent, skewed_cols)

#### b.分箱

In [51]:
def bin_and_encode(X_train, X_test, feature='房龄'):
    """
    对 '房龄' 特征进行分箱和独热编码
    """
    # 检查特征是否同时存在于训练集和测试集中
    if feature not in X_train.columns or feature not in X_test.columns:
        return X_train, X_test
    
    # 创建 KBinsDiscretizer 实例
    binner = KBinsDiscretizer(n_bins=5,   # 分为 5 个箱
                              encode='ordinal', # 输出 0, 1, 2, 3, 4
                              strategy='kmeans', 
                              subsample=None) # 使用所有数据

    # 只在 X_train 上 .fit()
    binner.fit(X_train[[feature]])

    # 在 X_train 和 X_test 上 .transform()
    X_train[f'{feature}_分箱'] = binner.transform(X_train[[feature]])
    X_test[f'{feature}_分箱'] = binner.transform(X_test[[feature]])

    # 将新的分箱列转换为独热编码
    X_train = pd.get_dummies(X_train, columns=[f'{feature}_分箱'], prefix=f'{feature}段', drop_first=True)
    X_test = pd.get_dummies(X_test, columns=[f'{feature}_分箱'], prefix=f'{feature}段', drop_first=True)

    # 删除原始列
    X_train = X_train.drop(columns=[feature], errors='ignore')
    X_test = X_test.drop(columns=[feature], errors='ignore')

    # 确保 X_test 和 X_train 有完全相同的"房龄段"列
    X_train_cols = set(X_train.columns)
    X_test_cols = set(X_test.columns)

    missing_in_test = list(X_train_cols - X_test_cols)
    for col in missing_in_test:
        X_test[col] = 0 # 在 X_test 中添加缺失的哑变量列，并设为 0

    missing_in_train = list(X_test_cols - X_train_cols)
    for col in missing_in_train:
        X_train[col] = 0 # (以防万一)

    X_test = X_test[X_train.columns] # 保证顺序一致

    return X_train, X_test

feature_list = ['log_供热费', 'log_物 业 费', 'log_停车费用',
                'log_房屋总数', 'log_停车位', '交易年份','房龄']
for feature in feature_list:
    X_train_price, X_test_price = bin_and_encode(X_train_price, X_test_price, feature=feature)
    X_train_rent, X_test_rent = bin_and_encode(X_train_rent, X_test_rent, feature=feature)

In [52]:
X_train_price.to_csv('X_train_price_pre_featured.csv')
X_test_price.to_csv('X_test_price_pre_featured.csv')
y_train_ln_price.to_csv('y_train_price.csv')

X_train_rent.to_csv('X_train_rent_pre_featured.csv')
X_test_rent.to_csv('X_test_rent_pre_featured.csv')
y_train_ln_rent.to_csv('y_train_rent.csv')

### 3. 创建交互项

In [None]:
import pandas as pd
import numpy as np
from itertools import combinations, product
from pathlib import Path

# ---------------------------------------------------------------------------
# 1. 定义特征生成函数
# ---------------------------------------------------------------------------

def generate_nonlinear_features(df, verbose=True):
    """
    根据明确的规则生成非线性特征：
    1. 数值特征的平方项
    2. 数值特征之间的二元交互项
    3. 数值特征与二元(虚拟)特征的交互项
    
    Args:
        df (pd.DataFrame): 合并后的 (train + test) DataFrame.
        verbose (bool): 是否打印日志.

    Returns:
        pd.DataFrame: 增加了新特征的 DataFrame.
    """
    df_aug = df.copy()
    
    # --- 步骤 A: 自动识别列类型 ---
    # 假设二元/虚拟变量是那些只包含 0 和 1 的列
    binary_cols = [col for col in df.columns if set(df[col].unique()) <= {0, 1}]
    
    # 假设数值变量是所有 'number' 类型的列，且不是二元变量
    numeric_cols = [col for col in df.select_dtypes(include=np.number).columns if col not in binary_cols]
    
    if verbose:
        print(f"  自动识别: {len(numeric_cols)} 个数值特征, {len(binary_cols)} 个二元特征。")

    new_features_count = 0

    # --- 步骤 B: (要求 1) 创建数值类平方项 ---
    for col in numeric_cols:
        new_col_name = f'{col}_sq'
        if new_col_name not in df_aug.columns:
            df_aug[new_col_name] = df_aug[col] ** 2
            new_features_count += 1

    # --- 步骤 C: (要求 2) 创建数值类内部交互项 ---
    for col1, col2 in combinations(numeric_cols, 2):
        new_col_name = f'{col1}_x_{col2}'
        if new_col_name not in df_aug.columns:
            df_aug[new_col_name] = df_aug[col1] * df_aug[col2]
            new_features_count += 1
            
    if verbose:
        print(f"  成功生成 {new_features_count} 个新的非线性特征。")

    return df_aug

# ---------------------------------------------------------------------------
# 2. 执行特征工程 (Price 数据集)
# ---------------------------------------------------------------------------
try:
    print(f"Processing Price dataset: {X_train_price.shape}")
    # 合并训练/测试，保证交互特征一致性
    combined_price = pd.concat([X_train_price, X_test_price], axis=0)
    
    # 生成特征
    X_aug_combined_price = generate_nonlinear_features(combined_price)
    
    # 拆回训练/测试
    X_aug_price = X_aug_combined_price.loc[X_train_price.index]
    X_aug_test_price = X_aug_combined_price.loc[X_test_price.index]
    
    # 保存生成的特征文件 (使用你之前的 'pre_filtering' 命名)
    X_aug_price.to_parquet('X_train_price_pre_filtering.parquet', index=False)
    X_aug_test_price.to_parquet('X_test_price_pre_filtering.parquet', index=False)
    
    print(f"Saved augmented Price data (train shape={X_aug_price.shape}, test shape={X_aug_test_price.shape})")
    
    # 更新变量以供后续单元使用
    X_train_price = X_aug_price
    X_test_price = X_aug_test_price
    
except Exception as e:
    print(f"Error processing Price dataset augmentation: {e}")

# ---------------------------------------------------------------------------
# 3. 执行特征工程 (Rent 数据集)
# ---------------------------------------------------------------------------
try:
    print(f"\nProcessing Rent dataset: {X_train_rent.shape}")
    # 合并
    combined_rent = pd.concat([X_train_rent, X_test_rent], axis=0)
    
    # 生成特征
    X_aug_combined_rent = generate_nonlinear_features(combined_rent)
    
    # 拆分
    X_aug_rent = X_aug_combined_rent.loc[X_train_rent.index]
    X_aug_test_rent = X_aug_combined_rent.loc[X_test_rent.index]

    # 保存生成的特征文件
    X_aug_rent.to_parquet('X_train_rent_pre_filtering.parquet', index=False)
    X_aug_test_rent.to_parquet('X_test_rent_pre_filtering.parquet', index=False)
    
    print(f"Saved augmented Rent data (train shape={X_aug_rent.shape}, test shape={X_aug_test_rent.shape})")
    
    # 更新变量以供后续单元使用
    X_train_rent = X_aug_rent
    X_test_rent = X_aug_test_rent
    
except Exception as e:
    print(f"Error processing Rent dataset augmentation: {e}")

print("\n--- Feature generation complete. ---")

Processing Price dataset: (103871, 233)
  自动识别: 19 个数值特征, 214 个二元特征。
  成功生成 190 个新的非线性特征。
Saved augmented Price data (train shape=(103871, 423), test shape=(34017, 423))

Processing Rent dataset: (98899, 196)
  自动识别: 17 个数值特征, 179 个二元特征。
  成功生成 153 个新的非线性特征。
Saved augmented Rent data (train shape=(98899, 349), test shape=(9773, 349))

--- Feature generation complete. ---


## Part 4: 模型训练与评估

### 1. 导入Z-score 后的数据

#### a. 加载数据

In [None]:
def load_feature_data(suffix='final'):
    """
    加载特征数据
    """
    
    X_train_price = pd.read_parquet(f'X_train_price{suffix}.parquet')
    X_test_price = pd.read_parquet(f'X_test_price{suffix}.parquet')
    y_train_ln_price = pd.read_parquet('y_train_price.parquet')

    X_train_rent = pd.read_parquet(f'X_train_rent{suffix}.parquet')
    X_test_rent = pd.read_parquet(f'X_test_rent{suffix}.parquet')
    y_train_ln_rent = pd.read_parquet('y_train_rent.parquet')
    
    return X_train_price, X_test_price, y_train_ln_price, \
           X_train_rent, X_test_rent, y_train_ln_rent

# 加载数据
X_train_price, X_test_price, y_train_ln_price, \
X_train_rent, X_test_rent, y_train_ln_rent = load_feature_data('_pre_featured')

#### b. Z-score 标准化

In [None]:
def standardize_data(X_train, X_test):
    binary_cols = [col for col in X_train.columns 
                    if X_train[col].nunique(dropna=False) == 2 and 
                        X_train[col].min() == 0 and 
                        X_train[col].max() == 1]
    numeric_cols = [col for col in X_train.columns if col not in binary_cols]

    numeric_cols = [col for col in numeric_cols if col in X_train.columns]

    # 初始化缩放器
    scaler = StandardScaler()

    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
    return X_train, X_test

X_train_price, X_test_price = standardize_data(X_train_price, X_test_price)
X_train_rent, X_test_rent = standardize_data(X_train_rent, X_test_rent)

In [None]:
import pandas as pd
from pathlib import Path

# 复制当前的训练与测试特征，后续建模直接使用这些变量
train_price_features = X_train_price.copy()
test_price_features = X_test_price.copy()
train_rent_features = X_train_rent.copy()
test_rent_features = X_test_rent.copy()

# 确保测试集列顺序与训练集一致，缺失列用 0 填充
test_price_features = test_price_features.reindex(columns=train_price_features.columns, fill_value=0)
test_rent_features = test_rent_features.reindex(columns=train_rent_features.columns, fill_value=0)

# 将对数目标转换为 Series 形式
y_price_log = y_train_ln_price.squeeze()
if isinstance(y_price_log, pd.DataFrame):
    y_price_log = y_price_log.iloc[:, 0]
y_rent_log = y_train_ln_rent.squeeze()
if isinstance(y_rent_log, pd.DataFrame):
    y_rent_log = y_rent_log.iloc[:, 0]

# 重新加载测试集 ID，确保提交文件行顺序正确
test_ids_price = pd.read_csv('ruc_Class25Q2_test_price.csv')['ID']
test_ids_rent = pd.read_csv('ruc_Class25Q2_test_rent.csv')['ID']

# 建立容器以便后续记录模型表现和提交路径
price_metrics = {}
rent_metrics = {}
submission_registry = {}

#### c. 初始化 Pipline

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV, KFold, cross_validate, train_test_split
from sklearn.metrics import mean_absolute_error, make_scorer
# Removed mean_squared_error import if not needed elsewhere
import warnings
import time # Import time for optional timing info

warnings.filterwarnings('ignore', category=UserWarning)

#统一设置 6 折交叉验证
kf = KFold(n_splits=6, shuffle=True, random_state=111)

# --- Scoring Functions (MAE and RMAE, positive and negative versions) ---
def mae_original_scale_neg(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    # Handle potential overflows or NaNs after expm1, fallback to median/sensible value might be better if frequent
    y_pred = np.nan_to_num(y_pred, nan=0.0, posinf=np.finfo(np.float64).max, neginf=0.0)
    y_pred = np.clip(y_pred, 0, None) # Ensure non-negative predictions
    mae = mean_absolute_error(y_true, y_pred)
    return -mae

def mae_original_scale(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    y_pred = np.nan_to_num(y_pred, nan=0.0, posinf=np.finfo(np.float64).max, neginf=0.0)
    y_pred = np.clip(y_pred, 0, None)
    return mean_absolute_error(y_true, y_pred)

def rmae_original_scale_neg(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    y_pred = np.nan_to_num(y_pred, nan=0.0, posinf=np.finfo(np.float64).max, neginf=0.0)
    y_pred = np.clip(y_pred, 0, None)
    mae = mean_absolute_error(y_true, y_pred)
    rmae = np.sqrt(mae) if mae >= 0 else 0
    return -rmae

def rmae_original_scale(y_true_log, y_pred_log):
    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    y_pred = np.nan_to_num(y_pred, nan=0.0, posinf=np.finfo(np.float64).max, neginf=0.0)
    y_pred = np.clip(y_pred, 0, None)
    mae = mean_absolute_error(y_true, y_pred)
    return np.sqrt(mae) if mae >= 0 else 0

# Scorers for Scikit-learn (expect negative values as higher score = better)
mae_neg_scorer = make_scorer(mae_original_scale_neg, greater_is_better=True)
rmae_neg_scorer = make_scorer(rmae_original_scale_neg, greater_is_better=True)


# --- Speed-Optimized Evaluation Function ---
def evaluate_and_submit(model_pipeline, X, y_log, X_test, test_ids, target_tag, model_tag, param_grid=None, output_path=None):
    """
    (Speed-Optimized Version using MAE and RMAE)
    - Assumes model_pipeline includes preprocessing.
    - Runs GridSearchCV on full training data (X) for CV scores.
    - Trains a separate model on 80% split ONLY for IS/OOS metrics.
    - Uses GridSearchCV's refit estimator for final predictions.
    """
    start_time = time.time() # Start timing

    # 1. Train/Validation Split (ONLY for calculating non-CV In-Sample and Holdout metrics)
    X_train_split, X_valid, y_train_split, y_valid = train_test_split(
        X, y_log, test_size=0.2, random_state=111
    )

    base_pipeline = clone(model_pipeline)
    best_params = None
    final_estimator_trained_on_full_X = None # Will hold the model for submission
    cv_mae_mean, cv_mae_std = np.nan, np.nan
    cv_rmae_mean, cv_rmae_std = np.nan, np.nan

    if param_grid:
        print(f"--- Running GridSearchCV for {model_tag} ({target_tag}) on FULL training data ---")
        # 2a. GridSearch on the *entire* training dataset (X, y_log)
        # refit='mae' ensures the best estimator (by MAE) is trained on X
        search = GridSearchCV(
            estimator=base_pipeline,
            param_grid=param_grid,
            scoring={'mae': mae_neg_scorer, 'rmae': rmae_neg_scorer},
            refit='mae', # Refit the best model based on MAE score on the full data X
            cv=kf,
            n_jobs=8,
            verbose=0,
            return_train_score=False
        )
        search.fit(X, y_log) # Fit on the complete training data

        best_params = search.best_params_
        # This estimator is trained on FULL X with best params - ready for submission
        final_estimator_trained_on_full_X = search.best_estimator_

        # 3a. Extract CV scores directly from GridSearchCV results
        best_idx = search.best_index_
        # Scores are negative, take negative to get positive MAE/RMAE
        cv_mae_mean = -search.cv_results_['mean_test_mae'][best_idx]
        cv_mae_std = search.cv_results_['std_test_mae'][best_idx]
        cv_rmae_mean = -search.cv_results_['mean_test_rmae'][best_idx] # Changed key
        cv_rmae_std = search.cv_results_['std_test_rmae'][best_idx] # Changed key

        # 4a. Train a *separate* model on the 80% split JUST for IS/OOS metrics
        print(f"--- Fitting model on 80% split for IS/OOS metrics ({model_tag}, {target_tag}) ---")
        estimator_for_metrics = clone(base_pipeline)
        estimator_for_metrics.set_params(**best_params)
        estimator_for_metrics.fit(X_train_split, y_train_split)

    else:
        # No parameter tuning
        print(f"--- Running Cross-Validation on FULL data for {model_tag} ({target_tag}) ---")
        # 2b. Run cross_validate on the *full* training set to get CV scores
        cv_estimator = clone(base_pipeline)
        cv_results = cross_validate(
            cv_estimator,
            X,
            y_log,
            cv=kf,
            scoring={'mae': mae_neg_scorer, 'rmae': rmae_neg_scorer}, # Changed key
            n_jobs=8,
            return_train_score=False,
            verbose=1
        )
        # Scores are negative, take negative to get positive MAE/RMAE
        cv_mae_scores = -cv_results['test_mae']
        cv_rmae_scores = -cv_results['test_rmae'] # Changed key
        cv_mae_mean = cv_mae_scores.mean()
        cv_mae_std = cv_mae_scores.std()
        cv_rmae_mean = cv_rmae_scores.mean()
        cv_rmae_std = cv_rmae_scores.std()

        # 3b. Fit model on 80% split for IS/OOS metrics
        print(f"--- Fitting model on 80% split for IS/OOS metrics ({model_tag}, {target_tag}) ---")
        estimator_for_metrics = clone(base_pipeline)
        estimator_for_metrics.fit(X_train_split, y_train_split)

        # 4b. Fit final model on FULL data X for submission
        print(f"--- Fitting final model on FULL data for {model_tag} ({target_tag}) ---")
        final_estimator_trained_on_full_X = clone(base_pipeline)
        final_estimator_trained_on_full_X.fit(X, y_log)


    # 5. Calculate In-Sample and Holdout metrics using the model trained on the 80% split
    print(f"--- Calculating IS/OOS metrics for {model_tag} ({target_tag}) ---")
    y_train_pred_log = estimator_for_metrics.predict(X_train_split)
    y_valid_pred_log = estimator_for_metrics.predict(X_valid)

    metrics = {
        'in_sample_mae': mae_original_scale(y_train_split, y_train_pred_log),
        'in_sample_rmae': rmae_original_scale(y_train_split, y_train_pred_log), # Changed key
        'holdout_mae': mae_original_scale(y_valid, y_valid_pred_log),
        'holdout_rmae': rmae_original_scale(y_valid, y_valid_pred_log), # Changed key
        'cv_mae_mean': cv_mae_mean,
        'cv_mae_std': cv_mae_std,
        'cv_rmae_mean': cv_rmae_mean, # Changed key
        'cv_rmae_std': cv_rmae_std    # Changed key
    }

    # 6. Predict test set using the final model trained on FULL X
    print(f"--- Predicting on test data for {model_tag} ({target_tag}) ---")
    if final_estimator_trained_on_full_X is None:
         # This case should ideally not happen if logic is correct
         print("Error: Final estimator not trained on full data!")
         # Fallback: refit using best params if available, else base.
         final_estimator_trained_on_full_X = clone(base_pipeline)
         if best_params:
             final_estimator_trained_on_full_X.set_params(**best_params)
         final_estimator_trained_on_full_X.fit(X, y_log)

    test_pred_log = final_estimator_trained_on_full_X.predict(X_test)
    test_pred = np.expm1(test_pred_log)
    # Apply safety checks after exponentiation
    test_pred = np.nan_to_num(test_pred, nan=np.expm1(y_log.median()), posinf=np.finfo(np.float64).max, neginf=0.0) # Use median as fallback for NaN
    test_pred = np.clip(test_pred, 0, None) # Ensure non-negative predictions
    submission_df = pd.DataFrame({'ID': test_ids, 'Price': test_pred})

    # 7. Save submission file
    submission_path = None
    if output_path is not None:
        submission_path = Path(output_path)
        submission_path.parent.mkdir(parents=True, exist_ok=True)
        submission_df.to_csv(submission_path, index=False)
        print(f"Submission file saved to: {submission_path}")

    # 8. Store results and print summary
    metrics['best_params'] = best_params
    metrics['submission'] = str(submission_path) if submission_path else None

    end_time = time.time() # End timing
    elapsed_time = end_time - start_time

    print(f"\n=== {model_tag.upper()} - {target_tag.capitalize()} Final Results (Time: {elapsed_time:.2f}s) ===")
    if best_params:
        print(f"  Best Parameters: {best_params}")
    print(f"  In-Sample -> MAE: {metrics['in_sample_mae']:.4f}, RMAE: {metrics['in_sample_rmae']:.4f}")
    print(f"  Holdout   -> MAE: {metrics['holdout_mae']:.4f}, RMAE: {metrics['holdout_rmae']:.4f}")
    print(f"  6-Fold CV -> MAE: {metrics['cv_mae_mean']:.4f} ± {metrics['cv_mae_std']:.4f}")
    print(f"             RMAE: {metrics['cv_rmae_mean']:.4f} ± {metrics['cv_rmae_std']:.4f}")
    if submission_path:
        print(f"  Submission File: {submission_path}")
    print("=" * (len(model_tag) + len(target_tag) + 30)) # Adjust separator length

    # Return metrics, submission DataFrame, final trained model instance (on full X), best params
    return metrics, submission_df, final_estimator_trained_on_full_X, best_params

#### d. OLS模型

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

def build_ols_pipeline():
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('model', LinearRegression())
    ])

price_metrics['OLS'], price_submission_df, _, _ = evaluate_and_submit(
    model_pipeline=build_ols_pipeline(),
    X=train_price_features,
    y_log=y_price_log,
    X_test=test_price_features,
    test_ids=test_ids_price,
    target_tag='price',
    model_tag='ols'
 )

rent_metrics['OLS'], rent_submission_df, _, _ = evaluate_and_submit(
    model_pipeline=build_ols_pipeline(),
    X=train_rent_features,
    y_log=y_rent_log,
    X_test=test_rent_features,
    test_ids=test_ids_rent,
    target_tag='rent',
    model_tag='ols'
 )

submission_ols_df = pd.concat([price_submission_df, rent_submission_df], ignore_index=True)
submission_ols_path = Path('submission_ols_filtered.csv')
submission_ols_df.to_csv(submission_ols_path, index=False)

price_metrics['OLS']['submission'] = str(submission_ols_path)
rent_metrics['OLS']['submission'] = str(submission_ols_path)
submission_registry['OLS'] = str(submission_ols_path)

--- Running Cross-Validation on FULL data for ols (price) ---


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of   6 | elapsed:   49.9s finished


--- Fitting model on 80% split for IS/OOS metrics (ols, price) ---
--- Fitting final model on FULL data for ols (price) ---
--- Calculating IS/OOS metrics for ols (price) ---
--- Predicting on test data for ols (price) ---

=== OLS - Price Final Results (Time: 76.11s) ===
  In-Sample -> MAE: 349859.7959, RMAE: 591.4895
  Holdout   -> MAE: 347307.4272, RMAE: 589.3279
  6-Fold CV -> MAE: 353300.2521 ± 5359.2686
             RMAE: 594.3736 ± 4.5037
--- Running Cross-Validation on FULL data for ols (rent) ---


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of   6 | elapsed:   47.8s finished


--- Fitting model on 80% split for IS/OOS metrics (ols, rent) ---
--- Fitting final model on FULL data for ols (rent) ---
--- Calculating IS/OOS metrics for ols (rent) ---
--- Predicting on test data for ols (rent) ---

=== OLS - Rent Final Results (Time: 72.00s) ===
  In-Sample -> MAE: 89231.9096, RMAE: 298.7171
  Holdout   -> MAE: 90369.2374, RMAE: 300.6148
  6-Fold CV -> MAE: 90431.1436 ± 1406.2312
             RMAE: 300.7086 ± 2.3380


Exception ignored in: <function ResourceTracker.__del__ at 0x108785bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x107ef1bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


#### e. Ridge模型

In [None]:
from sklearn.linear_model import Ridge

def build_ridge_pipeline():
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', Ridge())  # 注意：Ridge 通常不需要 max_iter
    ])

# Ridge 的 alpha 参数网格，可以沿用 Lasso 的范围
ridge_param_grid = {
    'model__alpha': np.logspace(-4, 2, 10)
}

# --- 训练和评估 'price' 模型 ---
price_metrics['Ridge'], price_ridge_submission, _, _ = evaluate_and_submit(
    model_pipeline=build_ridge_pipeline(),
    X=train_price_features,
    y_log=y_price_log,
    X_test=test_price_features,
    test_ids=test_ids_price,
    target_tag='price',
    model_tag='ridge',  # 标签改为 'ridge'
    param_grid=ridge_param_grid
 )

# --- 训练和评估 'rent' 模型 ---
rent_metrics['Ridge'], rent_ridge_submission, _, _ = evaluate_and_submit(
    model_pipeline=build_ridge_pipeline(),
    X=train_rent_features,
    y_log=y_rent_log,
    X_test=test_rent_features,
    test_ids=test_ids_rent,
    target_tag='rent',
    model_tag='ridge',  # 标签改为 'ridge'
    param_grid=ridge_param_grid
 )

# --- 合并并保存 Ridge 的提交文件 ---
submission_ridge_df = pd.concat([price_ridge_submission, rent_ridge_submission], ignore_index=True)
submission_ridge_path = Path('submission_ridge_50K.csv') # 文件名
submission_ridge_df.to_csv(submission_ridge_path, index=False)

# --- 更新 metrics 和 registry ---
price_metrics['Ridge']['submission'] = str(submission_ridge_path)
rent_metrics['Ridge']['submission'] = str(submission_ridge_path)
submission_registry['Ridge'] = str(submission_ridge_path)

--- Running GridSearchCV for ridge (price) on FULL training data ---
--- Fitting model on 80% split for IS/OOS metrics (ridge, price) ---
--- Calculating IS/OOS metrics for ridge (price) ---
--- Predicting on test data for ridge (price) ---

=== RIDGE - Price Final Results (Time: 1936.95s) ===
  Best Parameters: {'model__alpha': np.float64(0.0001)}
  In-Sample -> MAE: 349859.7974, RMAE: 591.4895
  Holdout   -> MAE: 347221.3134, RMAE: 589.2549
  6-Fold CV -> MAE: 353359.2267 ± 5300.8601
             RMAE: 594.4236 ± 4.4550
--- Running GridSearchCV for ridge (rent) on FULL training data ---


Exception ignored in: <function ResourceTracker.__del__ at 0x105051bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


--- Fitting model on 80% split for IS/OOS metrics (ridge, rent) ---
--- Calculating IS/OOS metrics for ridge (rent) ---
--- Predicting on test data for ridge (rent) ---

=== RIDGE - Rent Final Results (Time: 1447.34s) ===
  Best Parameters: {'model__alpha': np.float64(0.0001)}
  In-Sample -> MAE: 89231.9100, RMAE: 298.7171
  Holdout   -> MAE: 90397.7409, RMAE: 300.6622
  6-Fold CV -> MAE: 90445.4053 ± 1407.7163
             RMAE: 300.7323 ± 2.3399


Exception ignored in: <function ResourceTracker.__del__ at 0x1036d1bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x106129bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10406dbc0>
Traceback (most recent call last

#### f. LASSO模型

In [None]:
from sklearn.linear_model import Lasso

def build_lasso_pipeline():
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', Lasso(max_iter=10000))
    ])

lasso_param_grid = {
    'model__alpha': np.logspace(-2, 3, 11)
}

price_metrics['Lasso'], price_lasso_submission, _, _ = evaluate_and_submit(
    model_pipeline=build_lasso_pipeline(),
    X=train_price_features,
    y_log=y_price_log,
    X_test=test_price_features,
    test_ids=test_ids_price,
    target_tag='price',
    model_tag='lasso',
    param_grid=lasso_param_grid
 )

rent_metrics['Lasso'], rent_lasso_submission, _, _ = evaluate_and_submit(
    model_pipeline=build_lasso_pipeline(),
    X=train_rent_features,
    y_log=y_rent_log,
    X_test=test_rent_features,
    test_ids=test_ids_rent,
    target_tag='rent',
    model_tag='lasso',
    param_grid=lasso_param_grid
 )

submission_lasso_df = pd.concat([price_lasso_submission, rent_lasso_submission], ignore_index=True)
submission_lasso_path = Path('submission_lasso_100K.csv')
submission_lasso_df.to_csv(submission_lasso_path, index=False)

price_metrics['Lasso']['submission'] = str(submission_lasso_path)
rent_metrics['Lasso']['submission'] = str(submission_lasso_path)

submission_registry['Lasso'] = str(submission_lasso_path)

--- Running GridSearchCV for lasso (price) on FULL training data ---


Exception ignored in: <function ResourceTracker.__del__ at 0x11006dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x102cf1bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


--- Fitting model on 80% split for IS/OOS metrics (lasso, price) ---
--- Calculating IS/OOS metrics for lasso (price) ---
--- Predicting on test data for lasso (price) ---

=== LASSO - Price Final Results (Time: 815.10s) ===
  Best Parameters: {'model__alpha': np.float64(0.01)}
  In-Sample -> MAE: 505194.8406, RMAE: 710.7706
  Holdout   -> MAE: 499899.4255, RMAE: 707.0357
  6-Fold CV -> MAE: 504794.9683 ± 7785.1124
             RMAE: 710.4682 ± 5.4695
--- Running GridSearchCV for lasso (rent) on FULL training data ---


Exception ignored in: <function ResourceTracker.__del__ at 0x10461dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


--- Fitting model on 80% split for IS/OOS metrics (lasso, rent) ---
--- Calculating IS/OOS metrics for lasso (rent) ---
--- Predicting on test data for lasso (rent) ---

=== LASSO - Rent Final Results (Time: 525.86s) ===
  Best Parameters: {'model__alpha': np.float64(0.01)}
  In-Sample -> MAE: 125104.5866, RMAE: 353.7013
  Holdout   -> MAE: 123339.3151, RMAE: 351.1970
  6-Fold CV -> MAE: 125154.9308 ± 2055.9627
             RMAE: 353.7605 ± 2.9015


Exception ignored in: <function ResourceTracker.__del__ at 0x1056d9bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x104fd9bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10716dbc0>
Traceback (most recent call last

In [None]:
from sklearn.linear_model import ElasticNet

def build_elasticnet_pipeline():
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', ElasticNet(max_iter=10000, l1_ratio=0.5))
    ])

elastic_param_grid = {
    'model__alpha': np.logspace(-2, 0, 5),
    'model__l1_ratio': np.linspace(0.1, 0.9, 9)
}

price_metrics['ElasticNet'], price_elastic_submission, _, _ = evaluate_and_submit(
    model_pipeline=build_elasticnet_pipeline(),
    X=train_price_features,
    y_log=y_price_log,
    X_test=test_price_features,
    test_ids=test_ids_price,
    target_tag='price',
    model_tag='elasticnet',
    param_grid=elastic_param_grid
 )

rent_metrics['ElasticNet'], rent_elastic_submission, _, _ = evaluate_and_submit(
    model_pipeline=build_elasticnet_pipeline(),
    X=train_rent_features,
    y_log=y_rent_log,
    X_test=test_rent_features,
    test_ids=test_ids_rent,
    target_tag='rent',
    model_tag='elasticnet',
    param_grid=elastic_param_grid
 )

submission_elastic_df = pd.concat([price_elastic_submission, rent_elastic_submission], ignore_index=True)
submission_elastic_path = Path('submission_elasticnet.csv')
submission_elastic_df.to_csv(submission_elastic_path, index=False)

price_metrics['ElasticNet']['submission'] = str(submission_elastic_path)
rent_metrics['ElasticNet']['submission'] = str(submission_elastic_path)
submission_registry['ElasticNet'] = str(submission_elastic_path)

--- Running GridSearchCV for elasticnet (price) on FULL training data ---
--- Fitting model on 80% split for IS/OOS metrics (elasticnet, price) ---
--- Calculating IS/OOS metrics for elasticnet (price) ---
--- Predicting on test data for elasticnet (price) ---

=== ELASTICNET - Price Final Results (Time: 163.88s) ===
  Best Parameters: {'model__alpha': np.float64(0.01), 'model__l1_ratio': np.float64(0.1)}
  In-Sample -> MAE: 498152.9856, RMAE: 705.7995
  Holdout   -> MAE: 492792.2497, RMAE: 701.9916
  6-Fold CV -> MAE: 497664.1799 ± 7110.0579
             RMAE: 705.4351 ± 5.0441
--- Running GridSearchCV for elasticnet (rent) on FULL training data ---


Exception ignored in: <function ResourceTracker.__del__ at 0x103155bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x104a09bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x104f29bc0>
Traceback (most recent call last

--- Fitting model on 80% split for IS/OOS metrics (elasticnet, rent) ---
--- Calculating IS/OOS metrics for elasticnet (rent) ---
--- Predicting on test data for elasticnet (rent) ---

=== ELASTICNET - Rent Final Results (Time: 105.71s) ===
  Best Parameters: {'model__alpha': np.float64(0.01), 'model__l1_ratio': np.float64(0.1)}
  In-Sample -> MAE: 113592.6136, RMAE: 337.0350
  Holdout   -> MAE: 113207.2482, RMAE: 336.4628
  6-Fold CV -> MAE: 113685.7270 ± 1678.8931
             RMAE: 337.1639 ± 2.4972


In [None]:
def summarize_metrics(metrics_map):
    df = pd.DataFrame(metrics_map).T
    ordered_cols = [
        'in_sample_mae',
        'in_sample_rmae',
        'holdout_mae',
        'holdout_rmae',
        'cv_mae_mean',
        'cv_mae_std',
        'cv_rmae_mean',
        'cv_rmae_std',
        'best_params',
        'submission'
    ]
    return df[ordered_cols]

price_summary = summarize_metrics(price_metrics)
rent_summary = summarize_metrics(rent_metrics)

print("Price 任务模型表现一览：")
display(price_summary)

print("Rent 任务模型表现一览：")
display(rent_summary)

Price 任务模型表现一览：


Unnamed: 0,in_sample_mae,in_sample_rmae,holdout_mae,holdout_rmae,cv_mae_mean,cv_mae_std,cv_rmae_mean,cv_rmae_std,best_params,submission
OLS,494872.751689,703.471927,488144.089816,698.673092,494495.675191,6838.155125,703.187028,4.866141,,submission_ols.csv
Ridge,494877.276222,703.475143,488191.199353,698.706805,494499.745206,6828.967145,703.189967,4.859563,{'model__alpha': 1.0},submission_ridge.csv
Lasso,532365.714446,729.633959,529917.315984,727.954199,531458.600456,7199.918495,728.995315,4.942732,{'model__alpha': 0.01},submission_lasso.csv
ElasticNet,498152.985647,705.799536,492792.24974,701.991631,497664.17994,7110.05791,705.43514,5.044126,"{'model__alpha': 0.01, 'model__l1_ratio': 0.1}",submission_elasticnet.csv


Rent 任务模型表现一览：


Unnamed: 0,in_sample_mae,in_sample_rmae,holdout_mae,holdout_rmae,cv_mae_mean,cv_mae_std,cv_rmae_mean,cv_rmae_std,best_params,submission
OLS,112642.558515,335.622643,112613.889557,335.57993,112801.227288,1432.736067,335.852124,2.139581,,submission_ols.csv
Ridge,112642.558516,335.622643,112613.889558,335.57993,112801.227289,1432.736069,335.852124,2.139581,{'model__alpha': 1e-05},submission_ridge.csv
Lasso,122182.783561,349.54654,120699.326698,347.418086,122076.931517,2026.485816,349.383017,2.904946,{'model__alpha': 0.01},submission_lasso.csv
ElasticNet,113592.613631,337.035033,113207.248171,336.462848,113685.72696,1678.893084,337.163894,2.497156,"{'model__alpha': 0.01, 'model__l1_ratio': 0.1}",submission_elasticnet.csv


#### g.非线性模型（附）

In [None]:
# --- Random Forest model cell ---
from sklearn.ensemble import RandomForestRegressor

def build_rf_pipeline():
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', RandomForestRegressor(random_state=111))
    ])

rf_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_leaf': [1, 2, 4]
}

# Run for price
price_metrics['RandomForest'], price_rf_submission, _, _ = evaluate_and_submit(
    model_pipeline=build_rf_pipeline(),
    X=train_price_features,
    y_log=y_price_log,
    X_test=test_price_features,
    test_ids=test_ids_price,
    target_tag='price',
    model_tag='random_forest',
    param_grid=rf_param_grid,
    output_path='submission_rf_price.csv'
)

# Run for rent
rent_metrics['RandomForest'], rent_rf_submission, _, _ = evaluate_and_submit(
    model_pipeline=build_rf_pipeline(),
    X=train_rent_features,
    y_log=y_rent_log,
    X_test=test_rent_features,
    test_ids=test_ids_rent,
    target_tag='rent',
    model_tag='random_forest',
    param_grid=rf_param_grid,
    output_path='submission_rf_rent.csv'
)

# Consolidate and save a single csv
submission_rf_df = pd.concat([price_rf_submission, rent_rf_submission], ignore_index=True)
submission_rf_path = Path('submission_rf.csv')
submission_rf_df.to_csv(submission_rf_path, index=False)

price_metrics['RandomForest']['submission'] = str(submission_rf_path)
rent_metrics['RandomForest']['submission'] = str(submission_rf_path)
submission_registry['RandomForest'] = str(submission_rf_path)

print('RandomForest done — saved to', submission_rf_path)


--- Running GridSearchCV for random_forest (price) on FULL training data ---


Exception ignored in: <function ResourceTracker.__del__ at 0x104f2dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x111a6dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x111a6dbc0>
Traceback (most recent call last

In [None]:
# --- LightGBM model cell ---
# This cell checks for lightgbm and, if available, builds a pipeline and runs evaluate_and_submit_faster
try:
    from lightgbm import LGBMRegressor
except Exception as _e:
    LGBMRegressor = None
    print('LightGBM not available in this environment:', _e)

if LGBMRegressor is None:
    print('\n跳过 LightGBM：未检测到 lightgbm 包。要启用，请运行 `pip install lightgbm` 并重启 kernel，然后重新运行此单元。')
else:
    def build_lgb_pipeline():
        return Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('model', LGBMRegressor(random_state=111, n_jobs=8))
        ])

    lgb_param_grid = {
        'model__n_estimators': [100, 300],
        'model__num_leaves': [31, 64],
        'model__learning_rate': [0.05, 0.1]
    }

    price_metrics['LightGBM'], price_lgb_submission, _, _ = evaluate_and_submit(
        model_pipeline=build_lgb_pipeline(),
        X=train_price_features,
        y_log=y_price_log,
        X_test=test_price_features,
        test_ids=test_ids_price,
        target_tag='price',
        model_tag='lightgbm',
        param_grid=lgb_param_grid,
        output_path='submission_lgb_price.csv'
    )

    rent_metrics['LightGBM'], rent_lgb_submission, _, _ = evaluate_and_submit(
        model_pipeline=build_lgb_pipeline(),
        X=train_rent_features,
        y_log=y_rent_log,
        X_test=test_rent_features,
        test_ids=test_ids_rent,
        target_tag='rent',
        model_tag='lightgbm',
        param_grid=lgb_param_grid,
        output_path='submission_lgb_rent.csv'
    )

    submission_lgb_df = pd.concat([price_lgb_submission, rent_lgb_submission], ignore_index=True)
    submission_lgb_path = Path('submission_lightgbm.csv')
    submission_lgb_df.to_csv(submission_lgb_path, index=False)

    price_metrics['LightGBM']['submission'] = str(submission_lgb_path)
    rent_metrics['LightGBM']['submission'] = str(submission_lgb_path)
    submission_registry['LightGBM'] = str(submission_lgb_path)

    print('LightGBM done — saved to', submission_lgb_path)


--- Running GridSearchCV for lightgbm (price) on FULL training data ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.199796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3521
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.260149
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.173410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3524
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.264765
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.141573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory 



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.188087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3534
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.261743
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3526
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.262384
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.114685 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3528
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.263147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.079040 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3534
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.261743




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3526
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.262384
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.088116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3529
[LightGBM] [Info] Number of data points in the train set: 86560, number of used features: 218
[LightGBM] [Info] Start training from score 14.264708
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3521
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.260149




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3528
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.263147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074705 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3534
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.261743




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3529
[LightGBM] [Info] Number of data points in the train set: 86560, number of used features: 218
[LightGBM] [Info] Start training from score 14.264708
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082906 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3526
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.262384




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062699 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3524
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.264765
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063754 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3521
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.260149


Exception ignored in: <function ResourceTracker.__del__ at 0x11060dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3528
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.263147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.118123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3534
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.261743




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3526
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.262384
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3529
[LightGBM] [Info] Number of data points in the train set: 86560, number of used features: 218
[LightGBM] [Info] Start training from score 14.264708




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.109899 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3524
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.264765
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3521
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.260149




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082489 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3528
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.263147




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.077843 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3534
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.261743




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.089893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3526
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.262384
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3529
[LightGBM] [Info] Number of data points in the train set: 86560, number of used features: 218
[LightGBM] [Info] Start training from score 14.264708
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.079918 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3528
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.263147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3534
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.261743




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071812 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3526
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.262384




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055898 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3529
[LightGBM] [Info] Number of data points in the train set: 86560, number of used features: 218
[LightGBM] [Info] Start training from score 14.264708
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.087678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3524
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.264765
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3528
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 217
[LightGBM] [Info] Start training from score 14.263147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3534
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.261743




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066824 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3526
[LightGBM] [Info] Number of data points in the train set: 86559, number of used features: 218
[LightGBM] [Info] Start training from score 14.262384
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061002 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3529
[LightGBM] [Info] Number of data points in the train set: 86560, number of used features: 218
[LightGBM] [Info] Start training from score 14.264708




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012436 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3541
[LightGBM] [Info] Number of data points in the train set: 103871, number of used features: 219
[LightGBM] [Info] Start training from score 14.262816
--- Fitting model on 80% split for IS/OOS metrics (lightgbm, price) ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3527
[LightGBM] [Info] Number of data points in the train set: 83096, number of used features: 217
[LightGBM] [Info] Start training from score 14.264576
--- Calculating IS/OOS metrics for lightgbm (price) ---
--- Predicting on test data for lightgbm (price) ---
Su



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2875
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.957929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069983 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2881
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 189
[LightGBM] [Info] Start training from score 12.960619
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.132568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081158 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2875
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.957929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085489 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2881
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 189
[LightGBM] [Info] Start training from score 12.960619




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081501 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2876
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.958028
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071330 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2871
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.959197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.092756 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2872
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.960063




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2875
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.957929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085992 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2881
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 189
[LightGBM] [Info] Start training from score 12.960619




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2876
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.958028
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2871
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.959197




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2879
[LightGBM] [Info] Number of data points in the train set: 82415, number of used features: 188
[LightGBM] [Info] Start training from score 12.958814
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2872
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.960063


Exception ignored in: <function ResourceTracker.__del__ at 0x1050bdbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x106ec1bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060798 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2875
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.957929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2881
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 189
[LightGBM] [Info] Start training from score 12.960619




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2876
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.958028




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2871
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.959197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2879
[LightGBM] [Info] Number of data points in the train set: 82415, number of used features: 188
[LightGBM] [Info] Start training from score 12.958814




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2872
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.960063




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2875
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.957929




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056476 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2881
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 189
[LightGBM] [Info] Start training from score 12.960619
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024462 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2876
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.958028




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2871
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.959197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2879
[LightGBM] [Info] Number of data points in the train set: 82415, number of used features: 188
[LightGBM] [Info] Start training from score 12.958814
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062572 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2875
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.957929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2881
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 189
[LightGBM] [Info] Start training from score 12.960619




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2876
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.958028




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.109445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2871
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.959197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2879
[LightGBM] [Info] Number of data points in the train set: 82415, number of used features: 188
[LightGBM] [Info] Start training from score 12.958814




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2872
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.960063




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2875
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.957929
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2881
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 189
[LightGBM] [Info] Start training from score 12.960619




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2876
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.958028




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055736 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2871
[LightGBM] [Info] Number of data points in the train set: 82416, number of used features: 188
[LightGBM] [Info] Start training from score 12.959197


Exception ignored in: <function ResourceTracker.__del__ at 0x1058d9bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x1069b5bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2889
[LightGBM] [Info] Number of data points in the train set: 98899, number of used features: 189
[LightGBM] [Info] Start training from score 12.959108
--- Fitting model on 80% split for IS/OOS metrics (lightgbm, rent) ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007931 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2876
[LightGBM] [Info] Number of data points in the train set: 79119, number of used features: 188
[LightGBM] [Info] Start training from score 12.958874
--- Calculating IS/OOS metrics for lightgbm (rent) ---
--- Predicting on test data for lightgbm (rent) ---
Submis

Exception ignored in: <function ResourceTracker.__del__ at 0x107981bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x109259bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x1040d9bc0>
Traceback (most recent call last