In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from fancyimpute import SoftImpute
import warnings
warnings.filterwarnings('ignore')

In [3]:
# 数据路径配置
DATA_PATHS = {
    'train_price': 'ruc_Class25Q2_train_price.csv',
    'train_rent': 'ruc_Class25Q2_train_rent.csv',
    'test_price': 'ruc_Class25Q2_test_price.csv',
    'test_rent': 'ruc_Class25Q2_test_rent.csv'
}

def relative_mae(y_true, y_pred):
    """计算相对平均绝对误差 (rMAE)"""
    return np.mean(np.abs(y_true - y_pred)) / np.mean(np.abs(y_true))

def extract_numeric(text):
    """从文本中提取数值"""
    if pd.isna(text):
        return np.nan
    try:
        text = str(text)
        numeric_part = ''.join(filter(lambda x: x.isdigit() or x == '.', text))
        return float(numeric_part) if numeric_part else np.nan
    except:
        return np.nan

def extract_percent(text):
    """从诸如 '绿 化 率' 这类百分比文本中提取数值(0-1之间)"""
    if pd.isna(text):
        return np.nan
    s = str(text)
    try:
        if '%' in s:
            val = ''.join([c for c in s if c.isdigit() or c == '.'])
            return float(val) / 100.0 if val else np.nan
        return extract_numeric(s)
    except:
        return np.nan

def parse_year_from_range(text):
    """解析 '建筑年代' 字段，如 '1997-2008年' 或 '2005年' -> 返回年份数值(范围取中值)"""
    if pd.isna(text):
        return np.nan
    s = str(text)
    try:
        # 去掉非数字和连字符及点
        clean = ''.join([c for c in s if c.isdigit() or c == '-' or c == '.'])
        if '-' in clean:
            parts = clean.split('-')
            nums = [extract_numeric(p) for p in parts if p]
            nums = [n for n in nums if n is not None and not np.isnan(n)]
            if len(nums) >= 2:
                return float(np.mean(nums[:2]))
        # 单一年份
        val = extract_numeric(clean)
        return val
    except:
        return np.nan

def chinese_num_to_int(ch):
    """将常见中文数字(一二三四五六七八九十)转换为整数"""
    mapping = {
        '零':0,'〇':0,'一':1,'二':2,'两':2,'三':3,'四':4,'五':5,'六':6,'七':7,'八':8,'九':9,'十':10
    }
    if pd.isna(ch):
        return np.nan
    s = str(ch)
    # 处理如 '十一' '十二' 等
    if len(s) >= 2 and s[0] == '十':
        base = 10
        tail = mapping.get(s[1], 0)
        return base + tail
    return mapping.get(s, np.nan)

def parse_layout(text):
    """解析户型字符串，如 '4室2厅1厨2卫' 或 '1房间1卫' -> 返回(室, 厅, 厨, 卫) 数值"""
    rooms = 0
    halls = 0
    kitchens = 0
    baths = 0
    if pd.isna(text):
        return rooms, halls, kitchens, baths
    s = str(text)
    try:
        # 提取阿拉伯数字或中文数字
        def _num_from_match(m):
            val = m.group(1)
            if val.isdigit():
                return int(val)
            else:
                ci = chinese_num_to_int(val)
                try:
                    return int(ci) if not np.isnan(ci) else None
                except:
                    return None
        # 室
        m = re.search(r'(\d+|[一二两三四五六七八九十]+)\s*室', s)
        if m:
            rooms = _num_from_match(m)
        else:
            m2 = re.search(r'(\d+|[一二两三四五六七八九十]+)\s*房间', s)
            if m2:
                rooms = _num_from_match(m2)
        # 厅
        m = re.search(r'(\d+|[一二两三四五六七八九十]+)\s*厅', s)
        if m:
            halls = _num_from_match(m)
        # 厨
        m = re.search(r'(\d+|[一二两三四五六七八九十]+)\s*厨', s)
        if m:
            kitchens = _num_from_match(m)
        # 卫
        m = re.search(r'(\d+|[一二两三四五六七八九十]+)\s*卫', s)
        if m:
            baths = _num_from_match(m)
    except:
        pass
    return rooms, halls, kitchens, baths
    
def parse_ladder_ratio(text):
    """解析 '梯户比例' 如 '一梯三户' -> (梯数, 每梯户数, 梯户比)"""
    if pd.isna(text):
        return np.nan, np.nan, np.nan
    s = str(text)
    try:
        # 识别形如 X梯Y户
        if '梯' in s and '户' in s:
            parts = s.split('梯')
            left = parts[0]
            right = parts[1].split('户')[0]
            ladders = chinese_num_to_int(left) if not left.isdigit() else int(left)
            per_ladder = chinese_num_to_int(right) if not right.isdigit() else int(right)
            if not np.isnan(ladders) and not np.isnan(per_ladder) and per_ladder != 0:
                return ladders, per_ladder, ladders / per_ladder
    except:
        pass
    return np.nan, np.nan, np.nan

def extract_floor_comprehensive(floor_str):
    """全面楼层解析函数"""
    if pd.isna(floor_str):
        return np.nan, np.nan
    
    s = str(floor_str).strip()
    
    try:# 1. 处理地下室情况 - 处理如"地下室 (共0层)", "地下室/0层", "地下1层"等格式
        if '地下室' in s or '地下' in s:
            # 提取地下层数，如"地下1层" -> -1
            underground_match = re.search(r'地下(\d+)层', s)
            if underground_match:
                current_floor = -int(underground_match.group(1))
            else:
                current_floor = -1  # 默认地下1层
            
            # 提取总楼层数
            total_match = re.search(r'共(\d+)层', s)
            if total_match:
                total_floor = int(total_match.group(1))
            else:
                total_floor = np.nan
            
            return current_floor, total_floor
        
        # 2. 处理"共X层"格式（带文字描述）- 处理如"高楼层(共6层)", "中楼层(共12层)"
        if '共' in s and '层' in s:
            # 提取当前楼层描述部分
            if '(' in s and ')' in s:
                current_floor_desc = s.split('(')[0].strip()
                total_floor_str = s.split('共')[1].split('层')[0].strip()
            else:
                # 处理没有括号的情况，如"低楼层/28层"
                parts = s.split('/')
                if len(parts) == 2 and '层' in parts[1]:
                    current_floor_desc = parts[0].strip()
                    total_floor_str = parts[1].replace('层', '').strip()
                else:
                    current_floor_desc = s.split('共')[0].strip()
                    total_floor_str = s.split('共')[1].split('层')[0].strip()
            
            # 转换总楼层
            total_floor = int(total_floor_str) if total_floor_str.isdigit() else 0
            
            # 处理当前楼层描述
            if '底层' in current_floor_desc:
                current_floor = 1
            elif '顶层' in current_floor_desc:
                current_floor = total_floor
            elif '高楼层' in current_floor_desc:
                current_floor = int(total_floor * 0.8) if total_floor > 0 else np.nan
            elif '中楼层' in current_floor_desc:
                current_floor = int(total_floor * 0.5) if total_floor > 0 else np.nan
            elif '低楼层' in current_floor_desc:
                current_floor = int(total_floor * 0.2) if total_floor > 0 else np.nan
            elif '地下室' in current_floor_desc or '地下' in current_floor_desc:
                current_floor = -1  # 地下室情况
            else:
                # 尝试提取数字，如"3/6层"中的3
                num_match = re.search(r'(\d+)', current_floor_desc)
                if num_match:
                    current_floor = int(num_match.group(1))
                else:
                    current_floor = np.nan
                    
            return current_floor, total_floor
        
        # 3. 处理"X/Y层"格式，支持文字描述
        if '/' in s and '层' in s:
            parts = s.split('/')
            if len(parts) == 2:
                current_str = parts[0].strip()
                total_str = parts[1].replace('层', '').strip()
                
                # 处理当前楼层 - 增强文字描述支持
                if current_str.isdigit():
                    current_floor = int(current_str)
                elif current_str in ['低楼层', '中楼层', '高楼层', '顶层', '底层']:
                    #根据总楼层估算当前楼层
                    if total_str.isdigit():
                        total_floor_temp = int(total_str)
                        # 使用与"共X层"格式相同的估算逻辑
                        if current_str == '底层':
                            current_floor = 1
                        elif current_str == '顶层':
                            current_floor = total_floor_temp
                        elif current_str == '高楼层':
                            current_floor = int(total_floor_temp * 0.8)
                        elif current_str == '中楼层':
                            current_floor = int(total_floor_temp * 0.5)
                        elif current_str == '低楼层':
                            current_floor = int(total_floor_temp * 0.2)
                    else:
                        current_floor = np.nan  # 总楼层未知，无法估算
                else:
                    # 尝试提取数字
                    num_match = re.search(r'(\d+)', current_str)
                    current_floor = int(num_match.group(1)) if num_match else np.nan
                
                # 处理总楼层
                if total_str.isdigit():
                    total_floor = int(total_str)
                else:
                    total_floor = 0
                
                return current_floor, total_floor
        
        # 4. 处理纯数字或简单描述
        if '层' in s:
            # 提取数字
            num_match = re.search(r'(\d+)', s)
            if num_match:
                floor_num = int(num_match.group(1))
                if '地下' in s:
                    current_floor = -floor_num
                    total_floor = 0  # 纯地下层，总楼层为0
                else:
                    current_floor = floor_num
                    total_floor = np.nan  # 无法确定总楼层
                return current_floor, total_floor
        
        # 5. 处理纯文字描述（没有数字）
        if s in ['底层', '低楼层', '中楼层', '高楼层', '顶层', '地下室']:
            # 只有描述没有具体楼层信息，无法准确确定
            return np.nan, np.nan
            
    except Exception as e:
        print(f"楼层解析错误: {s}, 错误: {e}")
    
    return np.nan, np.nan

In [4]:
def process_price_features(df):
    """房价数据特征工程"""
    df = df.copy()
    
    if '物业办公电话' in df.columns:
        df = df.drop(columns=['物业办公电话'])
    
    # 处理面积特征
    area_columns = [col for col in df.columns if '面积' in col]
    for col in area_columns:
        if col in df.columns:
            df[f'{col}_数值'] = df[col].apply(lambda x: extract_numeric(str(x).split('㎡')[0]) if pd.notna(x) else np.nan)
        
    if '所在楼层' in df.columns:
        floor_data = df['所在楼层'].apply(extract_floor_comprehensive).tolist()
        df[['当前楼层', '总楼层']] = pd.DataFrame(floor_data, index=df.index)
        df['楼层比例'] = df['当前楼层'] / df['总楼层']
    
    # 处理朝向特征
    def process_orientation(orientation_str):
        if pd.isna(orientation_str):
            return 0, 0, 0, 0
        
        orientation_str = str(orientation_str)
        east = 1 if '东' in orientation_str else 0
        south = 1 if '南' in orientation_str else 0
        west = 1 if '西' in orientation_str else 0
        north = 1 if '北' in orientation_str else 0
        
        return east, south, west, north
    
    if '房屋朝向' in df.columns:
        orientation_data = df['房屋朝向'].apply(process_orientation).tolist()
        df[['朝东', '朝南', '朝西', '朝北']] = pd.DataFrame(orientation_data, index=df.index)
    
    # 处理装修特征
    def process_decoration(decoration_str):
        if pd.isna(decoration_str):
            return 0, 0, 0, 0
        
        decoration_str = str(decoration_str)
        rough = 1 if '毛坯' in decoration_str else 0
        simple = 1 if '简装' in decoration_str else 0
        medium = 1 if '中装' in decoration_str else 0
        deluxe = 1 if '精装' in decoration_str else 0
        
        return rough, simple, medium, deluxe
    
    if '装修情况' in df.columns:
        decoration_data = df['装修情况'].apply(process_decoration).tolist()
        df[['毛坯', '简装', '中装', '精装']] = pd.DataFrame(decoration_data, index=df.index)

    # 户型拆解：房屋户型 -> 数值特征
    if '房屋户型' in df.columns:
        layout_data = df['房屋户型'].apply(parse_layout).tolist()
        df[['室数', '厅数', '厨数', '卫数']] = pd.DataFrame(layout_data, index=df.index)
    
    # 建筑年代 -> 数值年份
    if '建筑年代' in df.columns:
        df['建筑年代_年'] = df['建筑年代'].apply(parse_year_from_range)

    # 交易时间 -> 年/月/季度
    if '交易时间' in df.columns:
        try:
            dt = pd.to_datetime(df['交易时间'], errors='coerce')
            df['交易年'] = dt.dt.year
            df['交易月'] = dt.dt.month
            df['交易季'] = dt.dt.quarter
        except:
            pass

    # 别墅类型 -> 有值指示（有值视为别墅）
    if '别墅类型' in df.columns:
        df['别墅类型_有值'] = df['别墅类型'].apply(lambda x: 0 if pd.isna(x) or str(x).strip()=='' else 1)

    # 城市/区域/板块 -> 类别独热编码（严格基于源数据）
    for col in ['城市', '区域', '板块']:
        if col in df.columns:
            df[col] = df[col].astype(str)
            dummies = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, dummies], axis=1)
    
    # 绿化率/容积率/物业费数值化（若存在）
    for col in ['绿 化 率', '容 积 率', '物 业 费']:
        if col in df.columns:
            if col == '绿 化 率':
                df['绿化率_数值'] = df[col].apply(extract_percent)
            elif col == '容 积 率':
                df['容积率_数值'] = df[col].apply(extract_numeric)
            elif col == '物 业 费':
                df['物业费_数值'] = df[col].apply(extract_numeric)

    # 电梯/配备电梯 -> 二值
    for col in ['配备电梯', '电梯']:
        if col in df.columns:
            df[f'{col}_有电梯'] = df[col].apply(lambda x: 1 if str(x).strip() in ['有', '是', '有电梯', '有电梯房'] else 0)

    # 梯户比例解析
    if '梯户比例' in df.columns:
        ladders = df['梯户比例'].apply(parse_ladder_ratio).tolist()
        df[['梯数', '每梯户数', '梯户比']] = pd.DataFrame(ladders, index=df.index)

    return df

In [5]:
def process_rent_features(df):
    """租金数据特征工程"""
    df = df.copy()

    if '物业办公电话' in df.columns:
        df = df.drop(columns=['物业办公电话'])
    
    # 处理面积特征
    if '建筑面积' in df.columns:
        df['建筑面积_数值'] = df['建筑面积'].apply(extract_numeric)
    
    if '套内面积' in df.columns:
        df['套内面积_数值'] = df['套内面积'].apply(extract_numeric)
    
    if '楼层' in df.columns:
        floor_data = df['楼层'].apply(extract_floor_comprehensive).tolist()
        df[['当前楼层', '总楼层']] = pd.DataFrame(floor_data, index=df.index)
        df['楼层比例'] = df['当前楼层'] / df['总楼层']
    
    # 处理朝向特征
    if '朝向' in df.columns:
        def process_orientation_rent(orientation_str):
            if pd.isna(orientation_str):
                return 0, 0, 0, 0
            s = str(orientation_str)
            east = 1 if '东' in s else 0
            south = 1 if '南' in s else 0
            west = 1 if '西' in s else 0
            north = 1 if '北' in s else 0
            return east, south, west, north
        orientation_data = df['朝向'].apply(process_orientation_rent).tolist()
        df[['朝东', '朝南', '朝西', '朝北']] = pd.DataFrame(orientation_data, index=df.index)
    
    # 装修变量（租金）：有值即精装修（二值）
    if '装修' in df.columns:
        df['装修_精装'] = df['装修'].apply(lambda x: 1 if pd.notna(x) and str(x).strip()!='' else 0)
    elif '装修情况' in df.columns:
        # 备用处理
        df['装修_精装'] = df['装修情况'].apply(lambda x: 1 if pd.notna(x) and ('精' in str(x)) else 0)

    # 建筑年代
    if '建筑年代' in df.columns:
        df['建筑年代_年'] = df['建筑年代'].apply(parse_year_from_range)

    # 交易时间 -> 年/月/季度
    if '交易时间' in df.columns:
        try:
            dt = pd.to_datetime(df['交易时间'], errors='coerce')
            df['交易年'] = dt.dt.year
            df['交易月'] = dt.dt.month
            df['交易季'] = dt.dt.quarter
        except:
            pass

    # 电梯/配备电梯 -> 二值（若存在）
    for col in ['配备电梯', '电梯']:
        if col in df.columns:
            df[f'{col}_有电梯'] = df[col].apply(lambda x: 1 if str(x).strip() in ['有', '是', '有电梯', '有电梯房'] else 0)

    # 梯户比例解析（若存在）
    if '梯户比例' in df.columns:
        ladders = df['梯户比例'].apply(parse_ladder_ratio).tolist()
        df[['梯数', '每梯户数', '梯户比']] = pd.DataFrame(ladders, index=df.index)

    # 户型拆解：租金户型 -> 数值特征
    if '户型' in df.columns:
        layout_data = df['户型'].apply(parse_layout).tolist()
        df[['室数', '厅数', '厨数', '卫数']] = pd.DataFrame(layout_data, index=df.index)

    # 城市/区域/板块 -> 类别独热编码
    for col in ['城市', '区域', '板块']:
        if col in df.columns:
            df[col] = df[col].astype(str)
            dummies = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, dummies], axis=1)
            
    return df

In [6]:
def remove_outliers_iqr(df, target_col):
    """使用IQR方法移除异常值"""
    Q1 = df[target_col].quantile(0.25)
    Q3 = df[target_col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    return df[(df[target_col] >= lower_bound) & (df[target_col] <= upper_bound)]

def fancy_impute_numeric(df, numeric_cols):
    """使用SoftImpute进行高级缺失值填充；失败或不可用时回退为KNN/中位数填充"""
    df = df.copy()
    # 将inf和-inf替换为NaN
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

    FANCYIMPUTE_AVAILABLE = True
    if FANCYIMPUTE_AVAILABLE:
        try:
            train_matrix = df[numeric_cols].values
            filled_matrix = SoftImpute(max_iters=150, convergence_threshold=0.001).fit_transform(train_matrix)
            df[numeric_cols] = filled_matrix
            return df
        except Exception as e:
            print(f"SoftImpute failed: {e}, fallback to KNNImputer")

    # 回退：KNNImputer；若KNN失败，再回退到中位数
    try:
        imputer = KNNImputer(n_neighbors=5, weights='distance')
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    except Exception as e:
        print(f"KNNImputer failed: {e}, fallback to median imputation")
        for col in numeric_cols:
            if col in df.columns:
                imp = SimpleImputer(strategy='median')
                df[col] = imp.fit_transform(df[[col]]).flatten()
    return df

def evaluate_and_predict(X_train, y_train, X_test, models, task_name):
    """评估模型并生成预测"""
    results = {}
    predictions = {}
    
    # 只选择数值型特征进行特征选择
    numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
    
    # 特征选择
    k = min(35, len(numeric_features))
    selector = SelectKBest(f_regression, k=k)
    X_train_selected = selector.fit_transform(X_train[numeric_features], y_train)
    selected_indices = selector.get_support(indices=True)
    selected_features = [numeric_features[i] for i in selected_indices]
    
    print(f"{task_name} - Selected features: {len(selected_features)}")
    
    # 准备测试集特征
    X_test_selected = X_test[selected_features]
    
    # 分割训练集和验证集
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train[selected_features], y_train, test_size=0.2, random_state=42
    )
    
    for model_name, model in models.items():
        print(f"\n{task_name} - Training {model_name}...")
        
        if model_name == 'XGBoost':
            # XGBoost可以处理缺失值，不需要标准化
            # In-sample评估（引入early stopping）
            # 某些版本的xgboost不支持early_stopping_rounds参数，直接拟合
            model.fit(X_train_split, y_train_split)
            y_train_pred = model.predict(X_train_split)
            in_sample_rmae = relative_mae(y_train_split, y_train_pred)
            
            # Out-of-sample评估
            y_val_pred = model.predict(X_val_split)
            out_sample_rmae = relative_mae(y_val_split, y_val_pred)
            
            # 交叉验证
            kf = KFold(n_splits=6, shuffle=True, random_state=42)
            cv_scores = []
            
            for train_idx, val_idx in kf.split(X_train[selected_features]):
                X_cv_train = X_train[selected_features].iloc[train_idx]
                X_cv_val = X_train[selected_features].iloc[val_idx]
                y_cv_train = y_train.iloc[train_idx]
                y_cv_val = y_train.iloc[val_idx]
                
                model.fit(X_cv_train, y_cv_train)
                y_cv_pred = model.predict(X_cv_val)
                cv_rmae = relative_mae(y_cv_val, y_cv_pred)
                cv_scores.append(cv_rmae)
            
            cv_rmae = np.mean(cv_scores)
            
            # 使用最佳树数在完整训练集上重新训练并预测测试集
            # 使用既定树数在完整训练集上重新拟合
            best_iters = getattr(model, 'n_estimators', 1000)
            model.set_params(n_estimators=int(best_iters))
            model.fit(X_train[selected_features], y_train)
            test_pred = model.predict(X_test_selected)
            
        else:
            # 其他模型需要标准化
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_split)
            X_val_scaled = scaler.transform(X_val_split)
            
            # In-sample评估（对目标取log1p以提升线性模型稳定性）
            y_train_log = np.log1p(y_train_split)
            model.fit(X_train_scaled, y_train_log)
            y_train_pred = np.expm1(model.predict(X_train_scaled))
            in_sample_rmae = relative_mae(y_train_split, y_train_pred)
            
            # Out-of-sample评估
            y_val_pred = np.expm1(model.predict(X_val_scaled))
            out_sample_rmae = relative_mae(y_val_split, y_val_pred)
            
            # 交叉验证
            kf = KFold(n_splits=6, shuffle=True, random_state=42)
            cv_scores = []
            
            for train_idx, val_idx in kf.split(X_train[selected_features]):
                X_cv_train = X_train[selected_features].iloc[train_idx]
                X_cv_val = X_train[selected_features].iloc[val_idx]
                y_cv_train = y_train.iloc[train_idx]
                y_cv_val = y_train.iloc[val_idx]
                
                # 在每次交叉验证中重新拟合scaler
                cv_scaler = StandardScaler()
                X_cv_train_scaled = cv_scaler.fit_transform(X_cv_train)
                X_cv_val_scaled = cv_scaler.transform(X_cv_val)
                
                model.fit(X_cv_train_scaled, np.log1p(y_cv_train))
                y_cv_pred = np.expm1(model.predict(X_cv_val_scaled))
                cv_rmae = relative_mae(y_cv_val, y_cv_pred)
                cv_scores.append(cv_rmae)
            
            cv_rmae = np.mean(cv_scores)
            
            # 使用完整训练集重新训练并预测测试集
            full_scaler = StandardScaler()
            X_train_full_scaled = full_scaler.fit_transform(X_train[selected_features])
            X_test_scaled = full_scaler.transform(X_test_selected)
            
            model.fit(X_train_full_scaled, np.log1p(y_train))
            test_pred = np.expm1(model.predict(X_test_scaled))
        
        results[model_name] = {
            'in_sample_rmae': in_sample_rmae,
            'out_sample_rmae': out_sample_rmae,
            'cv_rmae': cv_rmae
        }
        
        predictions[model_name] = test_pred
        
        print(f"{model_name} Results:")
        print(f"  In-sample rMAE: {in_sample_rmae:.4f}")
        print(f"  Out-of-sample rMAE: {out_sample_rmae:.4f}")
        print(f"  6-fold CV rMAE: {cv_rmae:.4f}")
    
    return results, predictions

In [7]:
def main():
    print("=== 改进的整合模型训练系统 ===")
    
    # 1. 加载数据
    print("\n1. 加载数据...")
    train_price = pd.read_csv(DATA_PATHS['train_price'])
    train_rent = pd.read_csv(DATA_PATHS['train_rent'])
    test_price = pd.read_csv(DATA_PATHS['test_price'])
    test_rent = pd.read_csv(DATA_PATHS['test_rent'])
    
    print(f"房价训练集: {train_price.shape}")
    print(f"租金训练集: {train_rent.shape}")
    print(f"房价测试集: {test_price.shape}")
    print(f"租金测试集: {test_rent.shape}")
    
    # 检查测试集是否有ID列
    if 'ID' not in test_price.columns:
        raise ValueError("测试集缺少ID列")
    if 'ID' not in test_rent.columns:
        raise ValueError("测试集缺少ID列")
    
    # 2. 数据清洗和特征工程
    print("\n2. 房价数据处理...")
    
    # 房价数据处理
    train_price_cleaned = remove_outliers_iqr(train_price, 'Price')
    print(f"房价异常值移除后: {len(train_price_cleaned)} 行")
    
    train_price_processed = process_price_features(train_price_cleaned)
    test_price_processed = process_price_features(test_price)
    
    # 房价数据缺失值填充
    price_numeric_cols = train_price_processed.select_dtypes(include=[np.number]).columns.tolist()
    price_numeric_cols = [col for col in price_numeric_cols if col not in ['Price']]
    
    train_price_processed = fancy_impute_numeric(train_price_processed, price_numeric_cols)
    test_price_processed = fancy_impute_numeric(test_price_processed, price_numeric_cols)
    
    # 处理剩余NaN值
    train_price_processed = train_price_processed.fillna(0)
    test_price_processed = test_price_processed.fillna(0)
    
    print("\n3. 租金数据处理...")
    
    # 租金数据处理
    train_rent_cleaned = remove_outliers_iqr(train_rent, 'Price')
    print(f"租金异常值移除后: {len(train_rent_cleaned)} 行")
    
    train_rent_processed = process_rent_features(train_rent_cleaned)
    test_rent_processed = process_rent_features(test_rent)
    
    # 租金数据缺失值填充（使用KNN/SoftImpute）
    rent_numeric_cols = train_rent_processed.select_dtypes(include=[np.number]).columns.tolist()
    rent_numeric_cols = [col for col in rent_numeric_cols if col not in ['Price', 'ID']]
    
    train_rent_processed = fancy_impute_numeric(train_rent_processed, rent_numeric_cols)
    test_rent_processed = fancy_impute_numeric(test_rent_processed, rent_numeric_cols)
    
    # 处理剩余NaN值
    train_rent_processed = train_rent_processed.fillna(0)
    test_rent_processed = test_rent_processed.fillna(0)
    
    # 3. 定义模型
    print("\n4. 定义模型...")
    models = {
        'Linear Regression': LinearRegression(),
        'Lasso': Lasso(alpha=0.01, max_iter=5000),
        'Ridge': Ridge(alpha=3.0),
        'Elastic Net': ElasticNet(alpha=0.01, l1_ratio=0.3, max_iter=10000),
        'Linear (MAE)': SGDRegressor(
        loss='epsilon_insensitive',
        epsilon=0,  # 设置为0即退化为MAE
        penalty=None,
        max_iter=1000,
        random_state=42
    )
    }
    
    models['XGBoost'] = xgb.XGBRegressor(
            objective='reg:squarederror',
            tree_method='hist',
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=6,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.0,
            reg_lambda=1.0,
            gamma=0,
            random_state=42
        )
    
    # 4. 房价模型训练和评估
    print("\n5. 房价模型训练和评估...")
    
    # 准备房价数据
    price_feature_cols = [col for col in train_price_processed.columns 
                         if col not in ['Price'] and col in test_price_processed.columns]
    
    X_train_price = train_price_processed[price_feature_cols]
    y_train_price = train_price_processed['Price']
    X_test_price = test_price_processed[price_feature_cols]
    
    price_results, price_predictions = evaluate_and_predict(
        X_train_price, y_train_price, X_test_price, models, "房价"
    )
    
    # 5. 租金模型训练和评估
    print("\n6. 租金模型训练和评估...")
    
    # 准备租金数据
    rent_feature_cols = [col for col in train_rent_processed.columns 
                        if col not in ['Price', 'ID'] and col in test_rent_processed.columns]
    
    X_train_rent = train_rent_processed[rent_feature_cols]
    y_train_rent = train_rent_processed['Price']
    X_test_rent = test_rent_processed[rent_feature_cols]
    
    rent_results, rent_predictions = evaluate_and_predict(
        X_train_rent, y_train_rent, X_test_rent, models, "租金"
    )
    
    # 6. 生成每个模型的独立提交文件
    print("\n7. 生成提交文件...")
    
    for model_name in models.keys():
        # 房价预测
        price_submission = pd.DataFrame({
            'ID': test_price['ID'],
            'Price': price_predictions[model_name]
        })
        
        # 租金预测
        rent_submission = pd.DataFrame({
            'ID': test_rent['ID'],
            'Price': rent_predictions[model_name]
        })
        
        # 合并提交
        combined_submission = pd.concat([price_submission, rent_submission])
        
        # 保存文件
        filename = f'submission_{model_name.replace(" ", "_").lower()}.csv'
        combined_submission.to_csv(filename, index=False)
        print(f"已保存: {filename} ({len(combined_submission)} 行)")
        
       
    # 7. 生成性能比较表格
    print("\n8. 生成性能比较表格...")
    
    # 房价模型性能表格
    print("\n房价模型性能比较:")
    print("=" * 80)
    print(f"{'Model':<15} {'In-sample rMAE':<15} {'Out-of-sample rMAE':<18} {'Cross-validation rMAE':<20}")
    print("-" * 80)
    
    price_performance_data = []
    for model_name, results in price_results.items():
        print(f"{model_name:<15} {results['in_sample_rmae']:<15.4f} {results['out_sample_rmae']:<18.4f} {results['cv_rmae']:<20.4f}")
        price_performance_data.append({
            'Model': model_name,
            'In-sample': results['in_sample_rmae'],
            'Out-of-sample': results['out_sample_rmae'],
            'Cross-validation': results['cv_rmae']
        })
    
    # 保存房价性能表格
    price_performance_df = pd.DataFrame(price_performance_data)
    price_performance_df.to_csv('price_model_performance.csv', index=False)
    print(f"\n房价模型性能表格已保存: price_model_performance.csv")
    
    # 租金模型性能表格
    print("\n租金模型性能比较:")
    print("=" * 80)
    print(f"{'Model':<15} {'In-sample rMAE':<15} {'Out-of-sample rMAE':<18} {'Cross-validation rMAE':<20}")
    print("-" * 80)
    
    rent_performance_data = []
    for model_name, results in rent_results.items():
        print(f"{model_name:<15} {results['in_sample_rmae']:<15.4f} {results['out_sample_rmae']:<18.4f} {results['cv_rmae']:<20.4f}")
        rent_performance_data.append({
            'Model': model_name,
            'In-sample': results['in_sample_rmae'],
            'Out-of-sample': results['out_sample_rmae'],
            'Cross-validation': results['cv_rmae']
        })
    
    # 保存租金性能表格
    rent_performance_df = pd.DataFrame(rent_performance_data)
    rent_performance_df.to_csv('rent_model_performance.csv', index=False)
    print(f"\n租金模型性能表格已保存: rent_model_performance.csv")
    
    print("\n=== 所有任务完成 ===")
    print(f"生成的文件:")
    print(f"- {len(models)} 个模型的合并提交文件")
    print(f"- 2 个性能比较表格")

In [8]:
if __name__ == "__main__":
    main()

=== 改进的整合模型训练系统 ===

1. 加载数据...
房价训练集: (103871, 55)
租金训练集: (98899, 46)
房价测试集: (34017, 55)
租金测试集: (9773, 46)

2. 房价数据处理...
房价异常值移除后: 96048 行
[SoftImpute] Max Singular Value of X_init = 1035418.227861
[SoftImpute] Iter 1: observed MAE=10.074207 rank=4
[SoftImpute] Iter 2: observed MAE=9.742688 rank=4
[SoftImpute] Iter 3: observed MAE=9.652334 rank=4
[SoftImpute] Iter 4: observed MAE=9.623315 rank=4
[SoftImpute] Iter 5: observed MAE=9.615518 rank=4
[SoftImpute] Iter 6: observed MAE=9.615231 rank=4
[SoftImpute] Iter 7: observed MAE=9.617692 rank=4
[SoftImpute] Iter 8: observed MAE=9.620453 rank=4
[SoftImpute] Iter 9: observed MAE=9.622549 rank=4
[SoftImpute] Iter 10: observed MAE=9.623738 rank=4
[SoftImpute] Iter 11: observed MAE=9.624087 rank=4
[SoftImpute] Iter 12: observed MAE=9.623719 rank=4
[SoftImpute] Iter 13: observed MAE=9.622730 rank=4
[SoftImpute] Iter 14: observed MAE=9.621242 rank=4
[SoftImpute] Iter 15: observed MAE=9.619299 rank=4
[SoftImpute] Iter 16: observed MAE=9.616957 