In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import re
import os
from pathlib import Path

# 机器学习库
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold


In [3]:
"""
房屋定价模型 - 针对实际数据格式的优化版本
"""
class HousePriceModel:
    """房屋定价模型 - 针对实际数据格式优化"""
    
    def __init__(self):
        """Set up configuration, the output directory and empty model containers.

        Side effect: creates ``OUTPUT_DIR`` on disk when it does not exist yet.
        """
        # Input / output locations
        self.DATA_FILE, self.TEST_FILE = (
            "ruc_Class25Q2_train_price.csv",
            "ruc_Class25Q2_test_price.csv",
        )
        self.OUTPUT_DIR = "output_final"

        # Modelling configuration
        self.TARGET = "Price"
        self.RANDOM_STATE = 111
        self.TEST_SIZE = 0.2
        self.CV_FOLDS = 6

        # Make sure the output directory exists before anything is written
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

        # Containers filled during cleaning / training
        self.models = dict()
        self.scaler = StandardScaler()
        self.label_encoders = dict()
        
    # ---------- 小工具函数 ----------
    def extract_numeric_value(self, x):
        """Extract a single float from a free-text value that may carry a unit.

        Rules are tried in order and the first one that matches wins:

        1. Percentages (``"35%"``) are returned as a fraction (``0.35``).
        2. Areas written with ``㎡`` or ``平方米`` return the number directly
           in front of the unit.
        3. Amounts containing ``元``: a range such as ``"1.3-1.65元/月/㎡"``
           returns the mean of its first two numbers, otherwise the number
           directly in front of ``元``.
        4. Fallback: the first number found anywhere in the text.

        Args:
            x: Raw cell value (string, number, ``None`` or NaN).

        Returns:
            The extracted value as ``float``, or ``np.nan`` when ``x`` is
            missing/empty or contains no digits.
        """
        if pd.isna(x) or x == '':
            return np.nan

        x_str = str(x).strip()

        # Percentage -> fraction
        if '%' in x_str:
            match = re.search(r'(\d+\.?\d*)%', x_str)
            if match:
                return float(match.group(1)) / 100

        # Area. The guard accepts both unit spellings, so the regex must too;
        # otherwise "3室 100平方米" falls through to the generic rule and yields 3.
        if '㎡' in x_str or '平方米' in x_str:
            match = re.search(r'(\d+\.?\d*)\s*(?:㎡|平方米)', x_str)
            if match:
                return float(match.group(1))

        # Monetary amounts
        if '元' in x_str:
            # Range such as "1.3-1.65元/月/㎡" -> midpoint of the first two numbers
            if '-' in x_str:
                numbers = re.findall(r'(\d+\.?\d*)', x_str)
                if len(numbers) >= 2:
                    return (float(numbers[0]) + float(numbers[1])) / 2
            # Single amount
            match = re.search(r'(\d+\.?\d*)\s*元', x_str)
            if match:
                return float(match.group(1))

        # Plain number anywhere in the text
        match = re.search(r'(\d+\.?\d*)', x_str)
        if match:
            return float(match.group(1))

        return np.nan

    def parse_area(self, x):
        """解析面积值"""
        return self.extract_numeric_value(x)

    def parse_floor(self, x):
        """Split a floor description into level score, total floors and level label.

        Args:
            x: Text such as ``"高楼层 (共18层)"``, or a missing value.

        Returns:
            A 3-tuple ``(current_floor, total_floors, floor_type)``:

            * ``current_floor`` - ordinal score of the first level keyword found
              (低/底 -> 1, 中 -> 3, 高 -> 5, 顶 -> 7), else ``np.nan``.
            * ``total_floors`` - integer captured by ``共N层``, else ``np.nan``.
            * ``floor_type`` - the matched keyword itself, else ``np.nan``.
        """
        if pd.isna(x):
            return np.nan, np.nan, np.nan

        text = str(x).strip()

        # Ordered (keyword, score) pairs: the first keyword present in the text wins.
        level_scores = (("低", 1), ("中", 3), ("高", 5), ("顶", 7), ("底", 1))
        floor_type, current_floor = next(
            ((label, score) for label, score in level_scores if label in text),
            (np.nan, np.nan),
        )

        # Total number of floors in the building
        total_match = re.search(r'共(\d+)层', text)
        total_floors = int(total_match.group(1)) if total_match else np.nan

        return current_floor, total_floors, floor_type

    def direction_score(self, x):
        """Score a house orientation string; higher means more desirable.

        Each compass character present contributes its base points
        (南 3, 东 2, 西 1, 北 0.5). The adjacent pair ``南北`` adds 2 and
        ``东南``/``南东`` adds 1. The total is capped at 8.

        Args:
            x: Orientation text, or a missing value.

        Returns:
            The capped score; ``0`` for missing input.
        """
        if pd.isna(x):
            return 0

        text = str(x)
        base_points = {'南': 3, '东': 2, '西': 1, '北': 0.5}
        total = sum(points for direction, points in base_points.items() if direction in text)

        # Bonuses for favoured combinations
        total += 2 if '南北' in text else 0
        total += 1 if ('东南' in text or '南东' in text) else 0

        return min(total, 8)

    # ---------- 数据处理函数 ----------
    def remove_long_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop free-text columns whose cells average more than 100 characters.

        A text (object-dtype) column is a removal candidate only when none of
        its first 10 non-null values contains a digit, i.e. it looks like pure
        prose rather than a value with a unit. Cell length is measured on the
        string form of every cell, missing cells included.

        Args:
            df: Input frame; it is not modified.

        Returns:
            A new frame without the long text columns, or ``df`` itself when
            nothing qualifies for removal.
        """
        digit_regex = r'\d+\.?\d*'
        long_text_cols = []

        for col in df.select_dtypes(include=['object']).columns:
            # Columns whose sampled values carry digits are kept for numeric parsing
            preview = df[col].dropna().head(10)
            looks_numeric = False
            for val in preview:
                if re.search(digit_regex, str(val)):
                    looks_numeric = True
                    break
            if looks_numeric:
                continue

            avg_length = df[col].astype(str).str.len().mean()
            if avg_length > 100:
                long_text_cols.append(col)
                print(f"删除过长纯文本列: {col} (平均长度: {avg_length:.2f})")

        if not long_text_cols:
            return df

        trimmed = df.drop(columns=long_text_cols)
        print(f"共删除 {len(long_text_cols)} 个过长纯文本列")
        return trimmed

    def convert_numeric_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """将包含数值信息的文本列转换为数值列"""
        df = df.copy()
        
        # 需要转换的列模式
        numeric_patterns = [
            # 费用相关
            r'.*费.*', r'.*价.*', r'.*金额.*', r'.*费用.*',
            # 面积相关
            r'.*面积.*', r'.*积.*率.*', 
            # 数量相关
            r'.*数.*', r'.*量.*', r'.*户.*', r'.*栋.*', r'.*车位.*',
            # 百分比相关
            r'.*率.*', r'.*比.*',
            # 其他数值
            r'.*年.*', r'.*层.*', r'.*高.*', r'.*宽.*', r'.*长.*'
        ]
        
        text_cols = df.select_dtypes(include=['object']).columns
        converted_cols = []
        
        for col in text_cols:
            # 检查列名是否匹配数值模式
            is_numeric_col = any(re.search(pattern, col, re.IGNORECASE) for pattern in numeric_patterns)
            
            if is_numeric_col:
                # 尝试转换为数值
                numeric_values = df[col].apply(self.extract_numeric_value)
                # 如果成功提取到足够多的数值，则替换原列
                if numeric_values.notna().sum() > len(df) * 0.1:  # 至少10%的数据成功转换
                    df[col] = numeric_values
                    converted_cols.append(col)
                    print(f"转换文本列为数值列: {col}")
        
        print(f"共转换 {len(converted_cols)} 个文本列为数值列")
        return df

    def group_fill_numeric_missing(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fill numeric gaps with location-group medians, then a global fallback.

        Rows are grouped by whichever of ``城市`` / ``区域`` / ``板块`` exist.
        Missing values in those grouping columns are first replaced with
        ``"未知"`` so that every row belongs to a group. Each numeric column
        with gaps is then filled with its group median; values still missing
        get the column's global median, or ``0`` when the column holds no
        data at all. When none of the grouping columns exist, only the global
        fallback is applied.

        The ``0`` fallback is applied on every path, so an all-NaN column is
        treated the same whether or not grouping columns are present.

        Args:
            df: Input frame; it is not modified.

        Returns:
            A copy of ``df`` without missing values in its numeric columns.
        """
        df = df.copy()

        def global_fill_value(series):
            """Median of the column, or 0 when the column has no data."""
            median_val = series.median()
            return median_val if pd.notna(median_val) else 0

        group_cols = [col for col in ['城市', '区域', '板块'] if col in df.columns]

        if not group_cols:
            print("未找到城市、区域、板块列，使用全局中位数填充")
            for col in df.select_dtypes(include=[np.number]).columns:
                if df[col].isnull().any():
                    df[col] = df[col].fillna(global_fill_value(df[col]))
            return df

        print(f"使用 {group_cols} 进行分组填充")

        # Rows with a missing group key would otherwise fall out of every group
        for col in group_cols:
            if df[col].isnull().any():
                df[col] = df[col].fillna("未知")

        # Determined after the key fill: a numeric key column that just received
        # "未知" is no longer numeric and must not be median-filled.
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            if not df[col].isnull().any():
                continue

            fallback = global_fill_value(df[col])
            try:
                group_medians = df.groupby(group_cols)[col].transform('median')
                # Groups without any data yield NaN medians -> global fallback
                df[col] = df[col].fillna(group_medians).fillna(fallback)
                print(f"列 {col}: 使用分组填充 + 全局中位数填充")
            except Exception as e:
                # Best effort: a failing group-by must not abort the whole cleaning run
                print(f"列 {col} 分组填充失败: {e}, 使用全局中位数填充")
                df[col] = df[col].fillna(fallback)

        return df

    def fill_text_missing(self, df: pd.DataFrame) -> pd.DataFrame:
        """Replace missing values in text (object-dtype) columns with ``"未知"``.

        Works on a copy, like the other cleaning helpers of this class, so the
        caller's frame is left untouched.

        Args:
            df: Input frame; it is not modified.

        Returns:
            A copy of ``df`` whose text columns contain no missing values.
        """
        df = df.copy()

        for col in df.select_dtypes(include=['object']).columns:
            if df[col].isnull().any():
                df[col] = df[col].fillna("未知")
                print(f"文本列 {col}: 缺失值填充为'未知'")

        return df

    def encode_categorical_variables(self, df: pd.DataFrame) -> pd.DataFrame:
        """对分类变量进行编码用于回归"""
        df = df.copy()
        categorical_cols = df.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            # 跳过ID列和过高的基数列（超过50个不同值）
            if col == 'ID' or df[col].nunique() > 50:
                continue
                
            # 创建标签编码器
            le = LabelEncoder()
            # 处理未知类别
            unique_vals = df[col].fillna("未知").unique()
            le.fit(unique_vals)
            
            # 转换列
            df[col] = le.transform(df[col].fillna("未知"))
            
            # 保存编码器
            self.label_encoders[col] = le
            print(f"分类变量编码: {col} ({len(unique_vals)}个类别)")
        
        return df

    # ---------- 主要处理函数 ----------
    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """数据清洗主函数"""
        print("开始数据清洗...")
        df = df.copy()
        
        # 1. 删除过长纯文本列
        df = self.remove_long_text_columns(df)
        
        # 2. 重命名列（去除空格）
        df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)
        
        # 3. 删除数据泄露特征
        leak = ['开发商', '物业公司', '物业办公电话', 'coord_x', 'coord_y']
        df = df.drop(columns=[c for c in leak if c in df.columns])
        
        # 4. 转换包含数值信息的文本列为数值列
        df = self.convert_numeric_text_columns(df)
        
        # 5. 处理特殊列
        # 建筑面积
        if "建筑面积" in df.columns:
            df["建筑面积"] = df["建筑面积"].apply(self.parse_area)
        
        # 套内面积
        if "套内面积" in df.columns:
            df["套内面积"] = df["套内面积"].apply(self.parse_area)
        
        # 所在楼层
        if "所在楼层" in df.columns:
            df["所在楼层"] = df["所在楼层"].astype(str)  # 添加这行
            cur, tot, tp = zip(*df["所在楼层"].apply(self.parse_floor))
            df["当前楼层"] = cur
            df["总楼层"] = tot
            df["楼层类型"] = tp
            df["楼层比例"] = np.array(cur) / np.clip(np.array(tot), 1, None)
        
        # 房屋朝向
        if "房屋朝向" in df.columns:
            df["房屋朝向"] = df["房屋朝向"].astype(str)  # 添加这行
            df["朝向得分"] = df["房屋朝向"].apply(self.direction_score)
            df["是否南北通透"] = df["房屋朝向"].str.contains("南.*北|北.*南", na=False).astype(int)
        
        # 房屋户型
        if "房屋户型" in df.columns:
            # 确保房屋户型列是字符串类型
            df["房屋户型"] = df["房屋户型"].astype(str)
            df["室"] = df["房屋户型"].str.extract(r"(\d+)室").fillna(0).astype(int)
            df["厅"] = df["房屋户型"].str.extract(r"(\d+)厅").fillna(0).astype(int)
            df["卫"] = df["房屋户型"].str.extract(r"(\d+)卫").fillna(0).astype(int)
            df["厨"] = df["房屋户型"].str.extract(r"(\d+)厨").fillna(0).astype(int)
        
        # 装修情况
        if "装修情况" in df.columns:
            decoration_map = {"精装": 3, "简装": 2, "毛坯": 1, "其他": 2}
            df["装修等级"] = df["装修情况"].map(decoration_map).fillna(2)
        
        # 配备电梯
        if "配备电梯" in df.columns:
            df["电梯有无"] = df["配备电梯"].map({"有": 1, "无": 0}).fillna(0)
        
        # 交易时间
        if "交易时间" in df.columns:
            df["交易时间"] = pd.to_datetime(df["交易时间"], errors='coerce')
            df["交易年份"] = df["交易时间"].dt.year
            df["交易月份"] = df["交易时间"].dt.month
        
        # 建筑年代
        if "建筑年代" in df.columns:
            df["建筑年代"] = df["建筑年代"].astype(str).str.extract(r"(\d{4})").astype(float)
        
        # 6. 分组填充数值型缺失值
        df = self.group_fill_numeric_missing(df)
        
        # 7. 文本列缺失值填充
        df = self.fill_text_missing(df)
        
        # 8. 分类变量编码
        df = self.encode_categorical_variables(df)
        
        print(f"数据清洗完成，最终形状: {df.shape}")
        return df

    def engineer(self, df: pd.DataFrame) -> pd.DataFrame:
        """Append derived features; each group is added only if its inputs exist.

        Groups: area ratios and bins, room counts and layout flags, building
        age flags, relative-floor flags, orientation flags, and fee totals.

        Fee totals are computed over *numeric* columns whose name contains
        费 / 价 / 金额. Text columns with such a name (for example an unparsed
        fee description) are ignored; including them made the row-wise
        sum/mean fail on mixed dtypes.

        Args:
            df: Cleaned frame; it is not modified.

        Returns:
            A copy of ``df`` with the derived columns appended.
        """
        df = df.copy()
        print("开始特征工程...")

        # Area features
        if "建筑面积" in df.columns and "室" in df.columns:
            df["面积室比"] = df["建筑面积"] / (df["室"] + 1)
            df["建筑面积2"] = df["建筑面积"] ** 2
            # Quintile index of the area within this frame
            df["面积分箱"] = pd.qcut(df["建筑面积"], q=5, labels=False, duplicates="drop")

        # Layout features
        if {"室", "厅", "卫", "厨"}.issubset(df.columns):
            df["房间总数"] = df["室"] + df["厅"] + df["卫"] + df["厨"]
            # +1 in the denominator avoids division by zero for 0-bedroom rows
            df["卫室比"] = df["卫"] / (df["室"] + 1)
            df["厅室比"] = df["厅"] / (df["室"] + 1)

            # Layout category flags
            df["是否一居室"] = (df["室"] == 1).astype(int)
            df["是否二居室"] = (df["室"] == 2).astype(int)
            df["是否三居室"] = (df["室"] == 3).astype(int)
            df["是否大户型"] = (df["室"] >= 4).astype(int)

        # Building age features
        if {"交易年份", "建筑年代"}.issubset(df.columns):
            df["房龄"] = df["交易年份"] - df["建筑年代"]
            df["是否新房"] = (df["房龄"] <= 5).astype(int)
            df["是否老房"] = (df["房龄"] >= 30).astype(int)

        # Relative floor features
        if "楼层比例" in df.columns:
            df["是否高楼层"] = (df["楼层比例"] >= 0.8).astype(int)
            df["是否低楼层"] = (df["楼层比例"] <= 0.2).astype(int)
            df["是否中层"] = ((df["楼层比例"] > 0.2) & (df["楼层比例"] < 0.8)).astype(int)

        # Orientation features
        if "朝向得分" in df.columns:
            df["朝向是否优秀"] = (df["朝向得分"] >= 5).astype(int)
            df["朝向是否良好"] = ((df["朝向得分"] >= 3) & (df["朝向得分"] < 5)).astype(int)

        # Fee features: numeric fee-like columns only
        fee_cols = [col for col in df.columns if any(word in col for word in ['费', '价', '金额'])]
        numeric_fee_cols = df[fee_cols].select_dtypes(include=[np.number]).columns.tolist()
        if numeric_fee_cols:
            df["总费用"] = df[numeric_fee_cols].sum(axis=1, skipna=True).fillna(0)
            df["平均费用"] = df[numeric_fee_cols].mean(axis=1, skipna=True).fillna(0)

        print(f"特征工程完成，最终形状: {df.shape}")
        return df

    def remove_outlier(self, df: pd.DataFrame) -> pd.DataFrame:
        """异常值处理"""
        if self.TARGET not in df.columns:
            return df
            
        cap = 0.995
        q = df[self.TARGET].quantile(cap)
        original_len = len(df)
        df = df[df[self.TARGET] <= q].copy()
        removed_count = original_len - len(df)
        print(f"[Outlier] 保留 {len(df)} 行，移除 {removed_count} 个异常值")
        return df

    def select_features(self, df: pd.DataFrame):
        """Pick up to 60 numeric features by variance filter and univariate F-test.

        Only numeric columns are considered. Infinite values are treated as
        missing, gaps are filled with the column medians, and values are
        clipped to +/-1e15 before a ``VarianceThreshold(1e-3)`` and a
        ``SelectKBest(f_regression)`` are applied in sequence.

        Args:
            df: Frame containing the ``self.TARGET`` column.

        Returns:
            ``(frame, selected)`` where ``selected`` is the list of chosen
            feature names and ``frame`` is ``df`` restricted to those columns
            plus the target.
        """
        print("开始特征选择...")

        target = df[self.TARGET]
        candidates = df.drop(columns=[self.TARGET])

        # Numeric features only
        numeric = candidates.select_dtypes("number")
        # Medians are taken before infinities are masked, matching the fill below
        medians = numeric.median()
        numeric = (
            numeric.replace([np.inf, -np.inf], np.nan)
            .fillna(medians)
            .clip(-1e15, 1e15)
        )

        # Remove near-constant columns
        variance_filter = VarianceThreshold(threshold=1e-3)
        filtered_values = variance_filter.fit_transform(numeric)
        surviving_cols = numeric.columns[variance_filter.get_support()].tolist()
        filtered = pd.DataFrame(filtered_values, columns=surviving_cols, index=numeric.index)

        # Keep the k columns with the strongest univariate relation to the target
        k_best = SelectKBest(f_regression, k=min(60, filtered.shape[1]))
        k_best.fit(filtered, target)
        selected = filtered.columns[k_best.get_support()].tolist()

        print(f"[Select] {len(selected)} features kept")
        return df[selected + [self.TARGET]], selected

    # ---------- 模型训练部分 ----------
    def train_linear_models(self, X_train, y_train, X_test, y_test):
        """Fit OLS, Lasso, Ridge and ElasticNet on standardised features.

        Missing feature values are replaced with 0, ``self.scaler`` is fitted
        on the training features, and every fitted estimator is stored in
        ``self.models`` under its name.

        Args:
            X_train, y_train: Training features and target.
            X_test, y_test: Hold-out features and target.

        Returns:
            A DataFrame with one row of metrics per model (see
            :meth:`_calculate_model_metrics`).
        """
        print("开始训练线性模型...")

        # No NaN may reach the scaler or the estimators
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)

        # Standardise using training statistics only
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        seed = self.RANDOM_STATE
        # (model name, progress message, unfitted estimator), trained in this order
        candidates = [
            ('OLS', "训练OLS线性回归...", LinearRegression()),
            ('Lasso', "训练Lasso回归...", Lasso(alpha=0.01, random_state=seed, max_iter=1000)),
            ('Ridge', "训练Ridge回归...", Ridge(alpha=1.0, random_state=seed)),
            ('ElasticNet', "训练Elastic Net回归...",
             ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=seed, max_iter=1000)),
        ]

        results = []
        for name, message, estimator in candidates:
            print(message)
            estimator.fit(X_train_scaled, y_train)
            self.models[name] = estimator
            results.append(
                self._calculate_model_metrics(
                    estimator, X_train_scaled, y_train, X_test_scaled, y_test, name
                )
            )

        # Report the model with the lowest hold-out MAE
        results_df = pd.DataFrame(results)
        best_model_name = results_df.loc[results_df['Test_MAE'].idxmin(), 'Model']

        print(f"最佳模型: {best_model_name}")
        return results_df
    
    def _calculate_model_metrics(self, model, X_train, y_train, X_test, y_test, model_name):
        """Compute in-sample, out-of-sample and cross-validated metrics for a model.

        Args:
            model: A fitted estimator exposing ``predict``.
            X_train, y_train: Training features and target.
            X_test, y_test: Hold-out features and target.
            model_name: Label stored in the ``'Model'`` field.

        Returns:
            A dict with MAE, RMSE, R² and relative MAE for both splits, plus
            ``'CV_MAE'``, the mean MAE over ``self.CV_FOLDS`` cross-validation
            folds on the training data.
        """
        fitted_train = model.predict(X_train)
        fitted_test = model.predict(X_test)

        def rmse(actual, predicted):
            return np.sqrt(mean_squared_error(actual, predicted))

        def relative_mae(actual, predicted):
            # Mean absolute error relative to the true value
            return (np.abs(actual - predicted) / actual).mean()

        # Cross-validated MAE on the training data
        fold_mae = -cross_val_score(
            model, X_train, y_train, cv=self.CV_FOLDS, scoring='neg_mean_absolute_error'
        )

        return {
            'Model': model_name,
            'Train_MAE': mean_absolute_error(y_train, fitted_train),
            'Test_MAE': mean_absolute_error(y_test, fitted_test),
            'Train_RMSE': rmse(y_train, fitted_train),
            'Test_RMSE': rmse(y_test, fitted_test),
            'Train_R²': r2_score(y_train, fitted_train),
            'Test_R²': r2_score(y_test, fitted_test),
            'Train_RMAE': relative_mae(y_train, fitted_train),
            'Test_RMAE': relative_mae(y_test, fitted_test),
            'CV_MAE': fold_mae.mean()
        }
    
    def create_performance_table(self, results_df):
        """创建性能表格"""
        print("\n==== Metrics Table ====")
        
    
        # 1. 按指定顺序取出真实存在的模型
        want_order = ['OLS', 'Lasso', 'Ridge']          # 基础模型
        exist_models = [m for m in want_order if m in results_df['Model'].values]
    
        # 2. BestLinear 放第三
        best_model_row = results_df.loc[results_df['Test_MAE'].idxmin()]
        final_order = ['OLS', 'Lasso', 'BestLinear', 'Ridge']
    
        # 3. 逐列抽数据
        def pull(col):
            vals = []
            for m in final_order:
                if m == 'BestLinear':
                    vals.append(best_model_row[col])
                else:
                    vals.append(results_df.loc[results_df['Model'] == m, col].values[0])
            return vals
    
        metrics_table = pd.DataFrame({
            'Model': final_order,
            'In-sample MAE': pull('Train_MAE'),
            'In-sample RMAE': pull('Train_RMAE'),
            'Out-sample MAE': pull('Test_MAE'),
            'Out-sample RMAE': pull('Test_RMAE'),
            'CV MAE': pull('CV_MAE'),
            'KaggleScore': [0, 0, 0, 0]
        })
    
        print(metrics_table)
        metrics_table.to_csv(f'{self.OUTPUT_DIR}/metrics_table.csv', index=False)
        metrics_table.to_excel(f'{self.OUTPUT_DIR}/metrics.xlsx', index=False)
        return metrics_table
        

    
    def predict_test_set(self, selected_features):
        """预测测试集并生成submission.csv"""
        print("正在预测测试集...")
        
        try:
            # 加载测试数据
            test_df = pd.read_csv(self.TEST_FILE)
            
            # 保存原始ID
            original_ids = test_df['ID'].copy()
            
            # 应用相同的处理流程
            test_clean = self.clean(test_df)
            test_engineered = self.engineer(test_clean)
            
            # 确保包含所有需要的特征
            missing_features = [f for f in selected_features if f not in test_engineered.columns]
            if missing_features:
                print(f"添加缺失特征: {len(missing_features)} 个")
                for feat in missing_features:
                    test_engineered[feat] = 0
            
            X_test = test_engineered[selected_features].fillna(0)
            X_test_scaled = self.scaler.transform(X_test)
            
            # 使用最佳模型预测
            best_model_name = self.results_df.loc[self.results_df['Test_MAE'].idxmin(), 'Model']
            best_model = self.models[best_model_name]
            predictions = best_model.predict(X_test_scaled)
            
            # 创建提交文件 - 直接输出到 submission.csv
            submission = pd.DataFrame({
                'ID': original_ids,
                'Price': predictions
            })
            
            # 保存提交文件到当前目录
            submission.to_csv('submission.csv', index=False)
            print(f"[Submit] 房屋价格预测完成，保存到 submission.csv，共 {len(submission)} 条记录")
            print(f"预测价格范围: {predictions.min():.2f} - {predictions.max():.2f}")
            
            # 同时在输出目录保存一份备份
            submission.to_csv(f'{self.OUTPUT_DIR}/submission_price_backup.csv', index=False)
            
            return submission
            
        except Exception as e:
            print(f"测试集预测失败: {e}")
            import traceback
            traceback.print_exc()
            return None
    
    def main_workflow(self):
        """Run the whole pipeline: load, clean, engineer, select, train, report, predict.

        Side effects: stores the per-model metrics in ``self.results_df``,
        writes the metrics table and, when the test file exists, the
        submission files.

        Returns:
            ``(results_df, metrics_table, selected_features)`` on success, or
            ``(None, None, None)`` when any step raises; the traceback is
            printed in that case.
        """
        banner = "=" * 50
        print(banner)
        print("开始房屋定价模型训练")
        print(banner)

        try:
            # 1. Load
            print("\n步骤1: 加载数据...")
            raw_train = pd.read_csv(self.DATA_FILE)
            print(f"原始数据形状: {raw_train.shape}")

            # 2. Clean
            print("\n步骤2: 数据清洗...")
            cleaned = self.clean(raw_train)

            # 3. Feature engineering
            print("\n步骤3: 特征工程...")
            engineered = self.engineer(cleaned)

            # 4. Outlier removal
            print("\n步骤4: 异常值处理...")
            trimmed = self.remove_outlier(engineered)

            # 5. Feature selection
            print("\n步骤5: 特征选择...")
            train_selected, selected_features = self.select_features(trimmed)

            # Split the selected frame into target and features
            y = train_selected[self.TARGET]
            X = train_selected.drop(columns=[self.TARGET])

            # 6. Train / hold-out split
            print("\n步骤6: 数据分割...")
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=self.TEST_SIZE, random_state=self.RANDOM_STATE
            )
            print(f"训练集: {X_train.shape}, 测试集: {X_test.shape}")

            # 7. Training
            print("\n步骤7: 模型训练...")
            self.results_df = self.train_linear_models(X_train, y_train, X_test, y_test)

            # 8. Metrics table
            print("\n步骤8: 生成性能表格...")
            metrics_table = self.create_performance_table(self.results_df)

            # 9. Test-set prediction, only when the test file is available
            print("\n步骤9: 预测测试集...")
            if Path(self.TEST_FILE).exists():
                self.predict_test_set(selected_features)

            print("\n" + banner)
            print("模型训练完成！")
            print(banner)

            return self.results_df, metrics_table, selected_features

        except Exception as e:
            print(f"主工作流失败: {e}")
            import traceback
            traceback.print_exc()
            return None, None, None
    def predict_all_models_separate(self, selected_features):
        """使用所有模型分别预测测试集并保存单独的文件"""
        print("正在使用所有模型分别预测测试集...")
        
        try:
            # 加载测试数据
            test_df = pd.read_csv(self.TEST_FILE)
            
            # 保存原始ID
            original_ids = test_df['ID'].copy()
            
            # 应用相同的处理流程
            test_clean = self.clean(test_df)
            test_engineered = self.engineer(test_clean)
            
            # 确保包含所有需要的特征
            missing_features = [f for f in selected_features if f not in test_engineered.columns]
            if missing_features:
                for feat in missing_features:
                    test_engineered[feat] = 0
            
            X_test = test_engineered[selected_features].fillna(0)
            X_test_scaled = self.scaler.transform(X_test)
            
            # 使用所有模型进行预测
            predictions = {}
            for model_name, model in self.models.items():
                pred = model.predict(X_test_scaled)
                pred = np.maximum(pred, 50000)
                pred = np.minimum(pred, 100000000)
                predictions[model_name] = pred
            
            # 确定最佳模型
            best_model_name = self.results_df.loc[self.results_df['Test_MAE'].idxmin(), 'Model']
            predictions['BestLinear'] = predictions[best_model_name]
            
            # 为每个模型创建单独的预测结果
            for model_name in ['OLS', 'Lasso', 'BestLinear', 'Ridge']:
                if model_name in predictions:
                    model_predictions = predictions[model_name]
                    submission = pd.DataFrame({
                        'ID': original_ids,
                        'Price': model_predictions
                    })
                    
                    # 保存为单独的文件
                    filename = f'submission_{model_name}.csv'
                    submission.to_csv(filename, index=False)
                    print(f"[{model_name}] 房价预测完成，保存到 {filename}，共 {len(submission)} 条记录")
                    print(f"[{model_name}] 预测价格范围: {model_predictions.min():.2f} - {model_predictions.max():.2f}")
            
            return predictions
            
        except Exception as e:
            print(f"测试集预测失败: {e}")
            import traceback
            traceback.print_exc()
            return None



In [4]:
"""
租金预测模型 - 修复版本
"""
class RentPriceModel:
    """租金预测模型 - 修复版本"""
    
    def __init__(self):
        """Set the run configuration, create the output folder and reset model state."""
        # Input/output locations, relative to the working directory.
        self.DATA_FILE, self.TEST_FILE = (
            "ruc_Class25Q2_train_rent.csv",
            "ruc_Class25Q2_test_rent.csv",
        )
        self.OUTPUT_DIR = "output_rent_final"
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)

        # Modelling configuration: target column, seed, hold-out share, CV folds.
        self.TARGET = "Price"
        self.RANDOM_STATE, self.TEST_SIZE, self.CV_FOLDS = 111, 0.2, 6

        # Fitted artefacts; populated while training.
        self.models = dict()
        self.scaler = StandardScaler()
        self.label_encoders = dict()
        
    # ---------- 数据处理函数 ----------
    def extract_numeric_value(self, x):
        """Pull the first meaningful number out of a free-text cell.

        Rules are tried in this order and the first one that yields a number
        wins: percentage (returned as a fraction), area suffixed with ㎡, fee
        suffixed with 元 (a text containing "-" is averaged over its first two
        numbers), and finally any bare number.

        Args:
            x: Cell value of any type; it is converted with ``str``.

        Returns:
            The parsed float, or ``np.nan`` for missing/empty input or text
            without digits.
        """
        if pd.isna(x) or x == '':
            return np.nan

        text = str(x).strip()

        def first(pattern):
            # Float of the first capture group, or None when nothing matches.
            found = re.search(pattern, text)
            return float(found.group(1)) if found else None

        # Percentages become fractions.
        if '%' in text:
            pct = first(r'(\d+\.?\d*)%')
            if pct is not None:
                return pct / 100

        # Areas: the branch is entered for 平方米 too, but only ㎡ is matched.
        if '㎡' in text or '平方米' in text:
            area = first(r'(\d+\.?\d*)\s*㎡')
            if area is not None:
                return area

        # Fees: a dash means a range such as "1.3-1.65元/月/㎡".
        if '元' in text:
            if '-' in text:
                found_numbers = re.findall(r'(\d+\.?\d*)', text)
                if len(found_numbers) >= 2:
                    return (float(found_numbers[0]) + float(found_numbers[1])) / 2
            fee = first(r'(\d+\.?\d*)\s*元')
            if fee is not None:
                return fee

        # Anything else: the first number in the text, if there is one.
        plain = first(r'(\d+\.?\d*)')
        return np.nan if plain is None else plain

    def parse_floor(self, x):
        """Split a floor description into (level code, total floors, level label).

        The level is the first of 低/中/高/顶/底 (checked in that order) found
        in the text, coded 1/3/5/7/1. The total comes from a "共N层" fragment.

        Args:
            x: Floor description, e.g. "高楼层 (共18层)".

        Returns:
            A 3-tuple ``(code, total_floors, label)``; any part that cannot be
            found is ``np.nan``. Missing input yields three ``np.nan`` values.
        """
        if pd.isna(x):
            return np.nan, np.nan, np.nan

        text = str(x).strip()

        # Ordered pairs: the first keyword present in the text decides the level.
        level_codes = (("低", 1), ("中", 3), ("高", 5), ("顶", 7), ("底", 1))
        label, code = next(
            ((word, value) for word, value in level_codes if word in text),
            (np.nan, np.nan),
        )

        hit = re.search(r'共(\d+)层', text)
        total = int(hit.group(1)) if hit else np.nan

        return code, total, label

    def direction_score(self, x):
        """Score an orientation string; larger means more favourable.

        Each of 南/东/西/北 present in the text adds 3/2/1/0.5. A contiguous
        "南北" adds 2 more and "东南" or "南东" adds 1 more. The total is
        capped at 8.

        Args:
            x: Orientation text; missing values score 0.

        Returns:
            The capped score (int or float).
        """
        if pd.isna(x):
            return 0

        text = str(x)
        weights = (('南', 3), ('东', 2), ('西', 1), ('北', 0.5))
        total = sum(points for facing, points in weights if facing in text)

        # Bonuses for the combined orientations.
        total += 2 if '南北' in text else 0
        total += 1 if ('东南' in text or '南东' in text) else 0

        return min(total, 8)

    def convert_numeric_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """将包含数值信息的文本列转换为数值列"""
        df = df.copy()
        
        numeric_patterns = [
            r'.*费.*', r'.*价.*', r'.*金额.*', r'.*费用.*',
            r'.*面积.*', r'.*积.*率.*', 
            r'.*数.*', r'.*量.*', r'.*户.*', r'.*栋.*', r'.*车位.*',
            r'.*率.*', r'.*比.*',
            r'.*年.*', r'.*层.*', r'.*高.*', r'.*宽.*', r'.*长.*'
        ]
        
        text_cols = df.select_dtypes(include=['object']).columns
        converted_cols = []
        
        for col in text_cols:
            is_numeric_col = any(re.search(pattern, col, re.IGNORECASE) for pattern in numeric_patterns)
            
            if is_numeric_col:
                numeric_values = df[col].apply(self.extract_numeric_value)
                if numeric_values.notna().sum() > len(df) * 0.1:
                    df[col] = numeric_values
                    converted_cols.append(col)
        
        print(f"转换 {len(converted_cols)} 个文本列为数值列")
        return df

    def group_fill_numeric_missing(self, df: pd.DataFrame) -> pd.DataFrame:
        """数值型缺失值分组填充"""
        df = df.copy()
        
        group_cols = []
        for col in ['城市', '区县', '板块']:
            if col in df.columns:
                group_cols.append(col)
        
        if not group_cols:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                if df[col].isnull().any():
                    median_val = df[col].median()
                    if pd.notna(median_val):
                        df[col] = df[col].fillna(median_val)
                    else:
                        df[col] = df[col].fillna(0)
            return df
        
        print(f"使用 {group_cols} 进行分组填充")
        
        for col in group_cols:
            if df[col].isnull().any():
                df[col] = df[col].fillna("未知")
        
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        
        for col in numeric_cols:
            if df[col].isnull().any():
                try:
                    group_means = df.groupby(group_cols)[col].transform('median')
                    global_median = df[col].median()
                    df[col] = df[col].fillna(group_means).fillna(global_median)
                except Exception:
                    df[col] = df[col].fillna(df[col].median())
        
        return df

    def encode_categorical_variables(self, df: pd.DataFrame) -> pd.DataFrame:
        """对分类变量进行编码"""
        df = df.copy()
        categorical_cols = df.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            if col == 'ID' or df[col].nunique() > 50:
                continue
                
            le = LabelEncoder()
            unique_vals = df[col].fillna("未知").unique()
            le.fit(unique_vals)
            
            df[col] = le.transform(df[col].fillna("未知"))
            self.label_encoders[col] = le
        
        return df

    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """数据清洗主函数"""
        print("开始数据清洗...")
        df = df.copy()
        
        # 重命名列（去除空格）
        df.rename(columns=lambda x: x.replace(" ", ""), inplace=True)
        
        # 删除数据泄露特征
        leak = ['开发商', '物业公司', '物业办公电话', 'coord_x', 'coord_y', 'lon', 'lat']
        df = df.drop(columns=[c for c in leak if c in df.columns])
        
        # 转换包含数值信息的文本列为数值列
        df = self.convert_numeric_text_columns(df)
        
        # 处理特殊列
        # 面积
        if "面积" in df.columns:
            df["面积"] = df["面积"].apply(self.extract_numeric_value)
        
        # 楼层
        if "楼层" in df.columns:
            df["楼层"] = df["楼层"].astype(str)
            cur, tot, tp = zip(*df["楼层"].apply(self.parse_floor))
            df["当前楼层"] = cur
            df["总楼层"] = tot
            df["楼层类型"] = tp
            df["楼层比例"] = np.array(cur) / np.clip(np.array(tot), 1, None)
        
        # 朝向
        if "朝向" in df.columns:
            df["朝向"] = df["朝向"].astype(str)
            df["朝向得分"] = df["朝向"].apply(self.direction_score)
            df["是否南北通透"] = df["朝向"].str.contains("南.*北|北.*南", na=False).astype(int)
        
        # 户型
        if "户型" in df.columns:
            df["户型"] = df["户型"].astype(str)
            df["室"] = df["户型"].str.extract(r"(\d+)室").fillna(0).astype(int)
            df["厅"] = df["户型"].str.extract(r"(\d+)厅").fillna(0).astype(int)
            df["卫"] = df["户型"].str.extract(r"(\d+)卫").fillna(0).astype(int)
        
        # 装修情况
        if "装修" in df.columns:
            decoration_map = {"精装": 3, "精装修": 3, "简装": 2, "毛坯": 1, "其他": 2}
            df["装修等级"] = df["装修"].map(decoration_map).fillna(2)
        
        # 电梯
        if "电梯" in df.columns:
            df["电梯有无"] = df["电梯"].map({"有": 1, "无": 0}).fillna(0)
        
        # 交易时间
        if "交易时间" in df.columns:
            df["交易时间"] = pd.to_datetime(df["交易时间"], errors='coerce')
            df["交易年份"] = df["交易时间"].dt.year
            df["交易月份"] = df["交易时间"].dt.month
        
        # 建筑年代
        if "建筑年代" in df.columns:
            df["建筑年代"] = df["建筑年代"].astype(str).str.extract(r"(\d{4})").astype(float)
        
        # 分组填充数值型缺失值
        df = self.group_fill_numeric_missing(df)
        
        # 文本列缺失值填充
        text_cols = df.select_dtypes(include=['object']).columns
        for col in text_cols:
            if df[col].isnull().any():
                df[col] = df[col].fillna("未知")
        
        print(f"数据清洗完成，最终形状: {df.shape}")
        return df

    def engineer(self, df: pd.DataFrame) -> pd.DataFrame:
        """特征工程 - 修复版本"""
        df = df.copy()
        print("开始特征工程...")
        
        # 面积相关特征
        if "面积" in df.columns and "室" in df.columns:
            df["面积室比"] = df["面积"] / (df["室"] + 1)
            df["面积2"] = df["面积"] ** 2
        
        # 户型相关特征
        if {"室", "厅", "卫"}.issubset(df.columns):
            df["房间总数"] = df["室"] + df["厅"] + df["卫"]
            df["卫室比"] = df["卫"] / (df["室"] + 1)
            df["厅室比"] = df["厅"] / (df["室"] + 1)
            
            df["是否一居室"] = (df["室"] == 1).astype(int)
            df["是否二居室"] = (df["室"] == 2).astype(int)
            df["是否三居室"] = (df["室"] == 3).astype(int)
            df["是否大户型"] = (df["室"] >= 4).astype(int)
        
        # 房龄特征
        if {"交易年份", "建筑年代"}.issubset(df.columns):
            df["房龄"] = df["交易年份"] - df["建筑年代"]
            df["是否新房"] = (df["房龄"] <= 5).astype(int)
            df["是否老房"] = (df["房龄"] >= 30).astype(int)
        
        # 楼层特征
        if "楼层比例" in df.columns:
            df["是否高楼层"] = (df["楼层比例"] >= 0.8).astype(int)
            df["是否低楼层"] = (df["楼层比例"] <= 0.2).astype(int)
        
        # 朝向特征
        if "朝向得分" in df.columns:
            df["朝向是否优秀"] = (df["朝向得分"] >= 5).astype(int)
            df["朝向是否良好"] = ((df["朝向得分"] >= 3) & (df["朝向得分"] < 5)).astype(int)
        
        # 租赁特有特征 - 修复：确保列是字符串类型
        if "租赁方式" in df.columns:
            df["租赁方式"] = df["租赁方式"].astype(str)
            df["是否整租"] = (df["租赁方式"] == "整租").astype(int)
        
        if "付款方式" in df.columns:
            df["付款方式"] = df["付款方式"].astype(str)
            df["是否季付"] = (df["付款方式"].str.contains("季付", na=False)).astype(int)
        
        if "配套设施" in df.columns:
            df["配套设施"] = df["配套设施"].astype(str)
            df["配套设施数量"] = df["配套设施"].str.split('、').str.len().fillna(0)
        
        # 分类变量编码（在特征工程之后进行）
        df = self.encode_categorical_variables(df)
        
        print(f"特征工程完成，最终形状: {df.shape}")
        return df

    def remove_outlier(self, df: pd.DataFrame) -> pd.DataFrame:
        """异常值处理"""
        if self.TARGET not in df.columns:
            return df
            
        cap = 0.995
        q = df[self.TARGET].quantile(cap)
        original_len = len(df)
        df = df[df[self.TARGET] <= q].copy()
        removed_count = original_len - len(df)
        print(f"[Outlier] 保留 {len(df)} 行，移除 {removed_count} 个异常值")
        return df

    def select_features(self, df: pd.DataFrame, k: int = 60, variance_threshold: float = 1e-3):
        """Pick the numeric features most associated with the target.

        Candidates are the numeric columns other than the target and the
        row identifier "ID". Infinite values are treated as missing, gaps
        are filled with the column median (0 when a column has no finite
        value), values are clipped to ±1e15, near-constant columns are
        removed with ``VarianceThreshold`` and the best ``k`` of the rest
        are kept with ``SelectKBest(f_regression)``.

        Args:
            df: Engineered training frame containing ``self.TARGET``.
            k: Maximum number of features to keep.
            variance_threshold: Columns with variance at or below this value
                are discarded before scoring.

        Returns:
            ``(frame, names)`` where ``names`` is the list of selected column
            names and ``frame`` is ``df`` restricted to them plus the target.
        """
        print("开始特征选择...")

        y = df[self.TARGET]
        # "ID" identifies a row; it is not a property of the listing.
        X = df.drop(columns=[self.TARGET, 'ID'], errors='ignore')

        # Numeric candidates only; medians are taken after ±inf is removed.
        X_num = X.select_dtypes("number").replace([np.inf, -np.inf], np.nan)
        X_num = X_num.fillna(X_num.median()).fillna(0).clip(-1e15, 1e15)

        # Low-variance filter.
        vt = VarianceThreshold(threshold=variance_threshold)
        vt.fit(X_num)
        vt_cols = X_num.columns[vt.get_support()].tolist()
        X_vt_df = X_num[vt_cols]

        # Univariate ranking against the target.
        skb = SelectKBest(f_regression, k=min(k, len(vt_cols)))
        skb.fit(X_vt_df, y)
        selected = X_vt_df.columns[skb.get_support()].tolist()

        print(f"[Select] {len(selected)} features kept")
        return df[selected + [self.TARGET]], selected

    # ---------- 模型训练部分 ----------
    def train_linear_models(self, X_train, y_train, X_test, y_test):
        """Fit OLS, Lasso, Ridge and Elastic Net on standardised features.

        Missing feature values are replaced by 0, ``self.scaler`` is fitted
        on the training part and applied to both parts, and every fitted
        estimator is stored in ``self.models`` under its name.

        Args:
            X_train, y_train: Training features and target.
            X_test, y_test: Hold-out features and target.

        Returns:
            A DataFrame with one metrics row per model, in training order.
        """
        print("开始训练线性模型...")

        # Standardise; the scaler only ever learns from the training part.
        train_matrix = self.scaler.fit_transform(X_train.fillna(0))
        test_matrix = self.scaler.transform(X_test.fillna(0))

        seed = self.RANDOM_STATE
        # (progress message, storage key, unfitted estimator)
        lineup = (
            ("训练OLS线性回归...", 'OLS', LinearRegression()),
            ("训练Lasso回归...", 'Lasso',
             Lasso(alpha=0.01, random_state=seed, max_iter=1000)),
            ("训练Ridge回归...", 'Ridge',
             Ridge(alpha=1.0, random_state=seed)),
            ("训练Elastic Net回归...", 'ElasticNet',
             ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=seed, max_iter=1000)),
        )

        rows = []
        for message, key, estimator in lineup:
            print(message)
            estimator.fit(train_matrix, y_train)
            self.models[key] = estimator
            rows.append(
                self._calculate_model_metrics(
                    estimator, train_matrix, y_train, test_matrix, y_test, key
                )
            )

        # Report the model with the lowest hold-out MAE.
        results_df = pd.DataFrame(rows)
        winner = results_df.loc[results_df['Test_MAE'].idxmin(), 'Model']

        print(f"最佳模型: {winner}")
        return results_df
    
    def _calculate_model_metrics(self, model, X_train, y_train, X_test, y_test, model_name):
        """Collect in-sample, hold-out and cross-validated metrics for one model.

        Args:
            model: A fitted estimator exposing ``predict``.
            X_train, y_train: Training features and target.
            X_test, y_test: Hold-out features and target.
            model_name: Label stored under the 'Model' key.

        Returns:
            A dict with MAE, RMSE, R² and relative MAE for both parts, plus
            the mean MAE over ``self.CV_FOLDS`` cross-validation folds run
            on the training part.
        """
        fitted_train = model.predict(X_train)
        fitted_test = model.predict(X_test)

        def relative_mae(actual, predicted):
            # Mean absolute error expressed as a share of the actual value.
            return (np.abs(actual - predicted) / actual).mean()

        def rmse(actual, predicted):
            return np.sqrt(mean_squared_error(actual, predicted))

        fold_mae = -cross_val_score(
            model, X_train, y_train,
            cv=self.CV_FOLDS, scoring='neg_mean_absolute_error',
        )

        summary = {'Model': model_name}
        summary['Train_MAE'] = mean_absolute_error(y_train, fitted_train)
        summary['Test_MAE'] = mean_absolute_error(y_test, fitted_test)
        summary['Train_RMSE'] = rmse(y_train, fitted_train)
        summary['Test_RMSE'] = rmse(y_test, fitted_test)
        summary['Train_R²'] = r2_score(y_train, fitted_train)
        summary['Test_R²'] = r2_score(y_test, fitted_test)
        summary['Train_RMAE'] = relative_mae(y_train, fitted_train)
        summary['Test_RMAE'] = relative_mae(y_test, fitted_test)
        summary['CV_MAE'] = fold_mae.mean()
        return summary
    
    def create_performance_table(self, results_df):
        print("\n==== Metrics Table ====")
    
        # 1. 按指定顺序取出真实存在的模型
        want_order = ['OLS', 'Lasso', 'Ridge']          # 基础模型
        exist_models = [m for m in want_order if m in results_df['Model'].values]
    
        # 2. BestLinear 放第三
        best_model_row = results_df.loc[results_df['Test_MAE'].idxmin()]
        final_order = ['OLS', 'Lasso', 'BestLinear', 'Ridge']
    
        # 3. 逐列抽数据
        def pull(col):
            vals = []
            for m in final_order:
                if m == 'BestLinear':
                    vals.append(best_model_row[col])
                else:
                    vals.append(results_df.loc[results_df['Model'] == m, col].values[0])
            return vals
    
        metrics_table = pd.DataFrame({
            'Model': final_order,
            'In-sample MAE': pull('Train_MAE'),
            'In-sample RMAE': pull('Train_RMAE'),
            'Out-sample MAE': pull('Test_MAE'),
            'Out-sample RMAE': pull('Test_RMAE'),
            'CV MAE': pull('CV_MAE'),
            'KaggleScore': [0, 0, 0, 0]
        })
    
        print(metrics_table)
        metrics_table.to_csv(f'{self.OUTPUT_DIR}/metrics_table.csv', index=False)
        metrics_table.to_excel(f'{self.OUTPUT_DIR}/metrics.xlsx', index=False)
        return metrics_table
    
    def predict_test_set(self, selected_features):
        """预测测试集并生成submission.csv"""
        print("正在预测测试集...")
        
        try:
            test_df = pd.read_csv(self.TEST_FILE)
            original_ids = test_df['ID'].copy()
            
            test_clean = self.clean(test_df)
            test_engineered = self.engineer(test_clean)
            
            missing_features = [f for f in selected_features if f not in test_engineered.columns]
            if missing_features:
                for feat in missing_features:
                    test_engineered[feat] = 0
            
            X_test = test_engineered[selected_features].fillna(0)
            X_test_scaled = self.scaler.transform(X_test)
            
            best_model_name = self.results_df.loc[self.results_df['Test_MAE'].idxmin(), 'Model']
            best_model = self.models[best_model_name]
            predictions = best_model.predict(X_test_scaled)
            
            predictions = np.maximum(predictions, 50000)
            predictions = np.minimum(predictions, 100000000)
            
            # 创建当前预测
            current_submission = pd.DataFrame({
                'ID': original_ids,
                'Price': predictions
            })
            
            # 检查是否已存在submission.csv（由房价模型创建）
            if os.path.exists('submission.csv'):
                # 读取现有文件并追加
                existing = pd.read_csv('submission.csv')
                combined = pd.concat([existing, current_submission], ignore_index=True)
                combined.to_csv('submission.csv', index=False)
                print(f"[Append] 租金预测追加到 submission.csv，总记录数: {len(combined)}")
            else:
                # 如果房价模型还没运行，创建新文件（但这种情况不应该发生）
                current_submission.to_csv('submission.csv', index=False)
                print(f"[Create] 创建 submission.csv，记录数: {len(current_submission)}")
            
            # 备份
            current_submission.to_csv(f'{self.OUTPUT_DIR}/submission_rent_backup.csv', index=False)
            
            print(f"租金预测完成，预测价格范围: {predictions.min():.2f} - {predictions.max():.2f}")
            
            return current_submission
            
        except Exception as e:
            print(f"测试集预测失败: {e}")
            import traceback
            traceback.print_exc()
            return None
    
    def main_workflow(self):
        """Run the rent pipeline end to end.

        Loads the training file, cleans it, engineers features, trims target
        outliers, selects features, splits into train and hold-out parts,
        trains the linear models, writes the metrics table and, when the test
        file exists, predicts it.

        Returns:
            ``(results_df, metrics_table, selected_features)``, or
            ``(None, None, None)`` when any step raised (the traceback is
            printed).
        """
        rule = "=" * 50
        print(rule)
        print("开始租金预测模型训练")
        print(rule)

        try:
            # 1. Load.
            print("\n步骤1: 加载数据...")
            raw = pd.read_csv(self.DATA_FILE)
            print(f"原始数据形状: {raw.shape}")

            # 2. Clean.
            print("\n步骤2: 数据清洗...")
            cleaned = self.clean(raw)

            # 3. Engineer.
            print("\n步骤3: 特征工程...")
            engineered = self.engineer(cleaned)

            # 4. Trim target outliers.
            print("\n步骤4: 异常值处理...")
            trimmed = self.remove_outlier(engineered)

            # 5. Select features.
            print("\n步骤5: 特征选择...")
            modelling_frame, selected_features = self.select_features(trimmed)

            features = modelling_frame.drop(columns=[self.TARGET])
            target = modelling_frame[self.TARGET]

            # 6. Train / hold-out split.
            print("\n步骤6: 数据分割...")
            X_train, X_test, y_train, y_test = train_test_split(
                features,
                target,
                test_size=self.TEST_SIZE,
                random_state=self.RANDOM_STATE,
            )
            print(f"训练集: {X_train.shape}, 测试集: {X_test.shape}")

            # 7. Train.
            print("\n步骤7: 模型训练...")
            self.results_df = self.train_linear_models(X_train, y_train, X_test, y_test)

            # 8. Summarise.
            print("\n步骤8: 生成性能表格...")
            metrics_table = self.create_performance_table(self.results_df)

            # 9. Predict the test file when it is available.
            print("\n步骤9: 预测测试集...")
            if Path(self.TEST_FILE).exists():
                self.predict_test_set(selected_features)

            print("\n" + rule)
            print("租金模型训练完成！")
            print(rule)

            return self.results_df, metrics_table, selected_features

        except Exception as e:
            print(f"主工作流失败: {e}")
            import traceback
            traceback.print_exc()
            return None, None, None
    def predict_all_models_separate(self, selected_features):
        """使用所有模型分别预测测试集并保存单独的文件"""
        print("正在使用所有模型分别预测测试集...")
        
        try:
            # 加载测试数据
            test_df = pd.read_csv(self.TEST_FILE)
            
            # 保存原始ID
            original_ids = test_df['ID'].copy()
            
            # 应用相同的处理流程
            test_clean = self.clean(test_df)
            test_engineered = self.engineer(test_clean)
            
            # 确保包含所有需要的特征
            missing_features = [f for f in selected_features if f not in test_engineered.columns]
            if missing_features:
                for feat in missing_features:
                    test_engineered[feat] = 0
            
            X_test = test_engineered[selected_features].fillna(0)
            X_test_scaled = self.scaler.transform(X_test)
            
            # 使用所有模型进行预测
            predictions = {}
            for model_name, model in self.models.items():
                pred = model.predict(X_test_scaled)
                pred = np.maximum(pred, 50000)
                pred = np.minimum(pred, 100000000)
                predictions[model_name] = pred
            
            # 确定最佳模型
            best_model_name = self.results_df.loc[self.results_df['Test_MAE'].idxmin(), 'Model']
            predictions['BestLinear'] = predictions[best_model_name]
            
            # 为每个模型创建单独的预测结果
            for model_name in ['OLS', 'Lasso', 'BestLinear', 'Ridge']:
                if model_name in predictions:
                    model_predictions = predictions[model_name]
                    submission = pd.DataFrame({
                        'ID': original_ids,
                        'Price': model_predictions
                    })
                    
                    # 保存为单独的文件
                    filename = f'submission_{model_name}.csv'
                    submission.to_csv(filename, index=False)
                    print(f"[{model_name}] 租金预测完成，保存到 {filename}，共 {len(submission)} 条记录")
                    print(f"[{model_name}] 预测价格范围: {model_predictions.min():.2f} - {model_predictions.max():.2f}")
            
            return predictions
            
        except Exception as e:
            print(f"测试集预测失败: {e}")
            import traceback
            traceback.print_exc()
            return None
            

In [5]:
def main():
    """Train both models and write one combined submission file per model.

    The house-price model and the rent model are trained in turn and each
    predicts its own test file with every linear model. For OLS, Lasso,
    BestLinear and Ridge the two prediction sets are stacked (house rows
    first, rent rows after) and written to ``submission_<name>.csv``,
    overwriting the per-model files produced during prediction.

    Prediction is skipped for a model whose training failed, and each test
    file is read once rather than once per model name.

    Returns:
        An 8-tuple: ``(price_model, price_results_df, price_metrics_table,
        price_selected_features, rent_model, rent_results_df,
        rent_metrics_table, rent_selected_features)``. The results, table and
        features of a model are ``None`` when its training failed.
    """

    def _report_best(title, results_df):
        # Print the row with the lowest hold-out MAE of one results table.
        if results_df is None:
            return
        best = results_df.loc[results_df['Test_MAE'].idxmin()]
        print(f"\n=== {title}最终训练结果 ===")
        print(f"最佳模型: {best['Model']}")
        print(f"最佳测试集MAE: {best['Test_MAE']:.2f}")
        print(f"最佳测试集R²: {best['Test_R²']:.4f}")

    # House-price model.
    print("="*50)
    print("开始房屋定价模型训练")
    print("="*50)
    price_model = HousePriceModel()
    price_results_df, price_metrics_table, price_selected_features = price_model.main_workflow()

    price_predictions = None
    if price_selected_features is not None:
        price_predictions = price_model.predict_all_models_separate(price_selected_features)

    # Rent model.
    print("\n" + "="*50)
    print("开始租金预测模型训练") 
    print("="*50)
    rent_model = RentPriceModel()
    rent_results_df, rent_metrics_table, rent_selected_features = rent_model.main_workflow()

    rent_predictions = None
    if rent_selected_features is not None:
        rent_predictions = rent_model.predict_all_models_separate(rent_selected_features)

    # Combine house and rent predictions into one file per model.
    if price_predictions is not None and rent_predictions is not None:
        price_ids = pd.read_csv(price_model.TEST_FILE)['ID']
        rent_ids = pd.read_csv(rent_model.TEST_FILE)['ID']

        for model_name in ['OLS', 'Lasso', 'BestLinear', 'Ridge']:
            if model_name not in price_predictions or model_name not in rent_predictions:
                continue

            price_df = pd.DataFrame({
                'ID': price_ids,
                'Price': price_predictions[model_name]
            })
            rent_df = pd.DataFrame({
                'ID': rent_ids,
                'Price': rent_predictions[model_name]
            })

            # Rent rows follow the house rows.
            combined_df = pd.concat([price_df, rent_df], ignore_index=True)

            filename = f'submission_{model_name}.csv'
            combined_df.to_csv(filename, index=False)
            print(f"\n[Final] {model_name}模型预测结果已保存到 {filename}")
            print(f"[Final] 总记录数: {len(combined_df)} (Price: {len(price_df)}, Rent: {len(rent_df)})")
            print(f"[Final] 预测价格范围: {combined_df['Price'].min():.2f} - {combined_df['Price'].max():.2f}")

    # Final summaries.
    _report_best("房价模型", price_results_df)
    _report_best("租金模型", rent_results_df)

    return price_model, price_results_df, price_metrics_table, price_selected_features, \
           rent_model, rent_results_df, rent_metrics_table, rent_selected_features

In [6]:
if __name__ == "__main__":
    # Run both pipelines and keep every result in the notebook namespace.
    (
        price_model, price_results_df, price_metrics_table, price_selected_features,
        rent_model, rent_results_df, rent_metrics_table, rent_selected_features,
    ) = main()

开始房屋定价模型训练
开始房屋定价模型训练

步骤1: 加载数据...
原始数据形状: (103871, 55)

步骤2: 数据清洗...
开始数据清洗...
转换文本列为数值列: 房屋户型
转换文本列为数值列: 所在楼层
转换文本列为数值列: 建筑面积
转换文本列为数值列: 套内面积
转换文本列为数值列: 户型介绍
转换文本列为数值列: 建筑年代
转换文本列为数值列: 房屋总数
转换文本列为数值列: 楼栋总数
转换文本列为数值列: 绿化率
转换文本列为数值列: 物业费
转换文本列为数值列: 燃气费
转换文本列为数值列: 供热费
转换文本列为数值列: 停车费用
共转换 13 个文本列为数值列
使用 ['城市', '区域', '板块'] 进行分组填充
列 套内面积: 使用分组填充 + 全局中位数填充
列 抵押信息: 使用分组填充 + 全局中位数填充
列 户型介绍: 使用分组填充 + 全局中位数填充
列 区县: 使用分组填充 + 全局中位数填充
列 板块_comm: 使用分组填充 + 全局中位数填充
列 建筑年代: 使用分组填充 + 全局中位数填充
列 房屋总数: 使用分组填充 + 全局中位数填充
列 楼栋总数: 使用分组填充 + 全局中位数填充
列 绿化率: 使用分组填充 + 全局中位数填充
列 容积率: 使用分组填充 + 全局中位数填充
列 物业费: 使用分组填充 + 全局中位数填充
列 燃气费: 使用分组填充 + 全局中位数填充
列 供热费: 使用分组填充 + 全局中位数填充
列 停车位: 使用分组填充 + 全局中位数填充
列 停车费用: 使用分组填充 + 全局中位数填充
列 当前楼层: 使用分组填充 + 全局中位数填充
列 总楼层: 使用分组填充 + 全局中位数填充
列 楼层类型: 使用分组填充 + 全局中位数填充
列 楼层比例: 使用分组填充 + 全局中位数填充
文本列 环线: 缺失值填充为'未知'
文本列 建筑结构: 缺失值填充为'未知'
文本列 装修情况: 缺失值填充为'未知'
文本列 梯户比例: 缺失值填充为'未知'
文本列 配备电梯: 缺失值填充为'未知'
文本列 别墅类型: 缺失值填充为'未知'
文本列 上次交易: 缺失值填充为'未知'
文本列 房屋用途: 缺失值填充为'未知'
文本列 房屋年限: 缺失值填充为'未知'
文本列 房屋优势: 缺失值填