In [1]:
pip install jieba snownlp

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# 文本处理相关
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    import jieba
    from snownlp import SnowNLP
    HAVE_TEXT_MODULES = True
except ImportError:
    HAVE_TEXT_MODULES = False
    print("警告: 未找到 jieba 或 snownlp 模块，将跳过文本处理")

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# ==================== 数据处理和特征工程函数 ====================

def load_train_data(file_path):
    """加载训练数据（没有ID列）"""
    try:
        df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
        print(f"训练数据读取成功！形状: {df.shape}")
        return df
    except Exception as e:
        print(f"训练数据加载错误: {e}")
        return None

def load_test_data(file_path):
    """加载测试数据（有ID列）"""
    try:
        df = pd.read_csv(file_path, encoding='utf-8', low_memory=False)
        print(f"测试数据读取成功！形状: {df.shape}")
        
        # 分离ID列
        if 'ID' in df.columns:
            ids = df['ID'].copy()
            df = df.drop(columns=['ID'])
            print(f"已分离ID列，剩余特征形状: {df.shape}")
            return df, ids
        else:
            print("错误: 测试数据中没有ID列")
            return None, None
    except Exception as e:
        print(f"测试数据加载错误: {e}")
        return None, None

def data_cleaning(df):
    """数据清洗"""
    if df is None or df.empty:
        return df
        
    df_clean = df.copy()
    
    # 删除数据泄露特征
    leakage_features = ['community_avg_price', 'historical_price', 'appraisal_price', '评估价', '小区均价']
    df_clean = df_clean.drop(columns=[col for col in leakage_features if col in df_clean.columns], errors='ignore')
    
    # 处理缺失值
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df_clean[col].isnull().sum() > 0:
            df_clean[col].fillna(df_clean[col].median(), inplace=True)
    
    # 分类列用众数填充
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df_clean[col].isnull().sum() > 0:
            mode_val = df_clean[col].mode()
            df_clean[col].fillna(mode_val[0] if not mode_val.empty else '未知', inplace=True)
    
    return df_clean

def safe_label_encode(series, le=None):
    """安全的标签编码，处理未知类别"""
    if le is None:
        le = LabelEncoder()
        unique_vals = series.unique()
        le.fit(np.append(unique_vals, '未知'))
    
    known_categories = set(le.classes_)
    mask = series.isin(known_categories)
    if not mask.all():
        series = series.where(mask, '未知')
    
    return le.transform(series), le

def process_text_features(df, text_col='客户反馈'):
    """处理文本特征：情感分析和TF-IDF"""
    if text_col not in df.columns:
        return df, {}
        
    df_text = df.copy()
    text_data = df_text[text_col].fillna('无反馈')
    
    # 情感分析
    if HAVE_TEXT_MODULES:
        def get_sentiment(text):
            try:
                return SnowNLP(text).sentiments
            except:
                return 0.5
        
        df_text['情感得分'] = text_data.apply(get_sentiment)
    else:
        df_text['情感得分'] = 0.5  # 默认值
    
    # TF-IDF特征（取前10个最重要的词）
    if HAVE_TEXT_MODULES:
        try:
            # 中文分词
            def chinese_tokenizer(text):
                return list(jieba.cut(text))
            
            tfidf = TfidfVectorizer(
                tokenizer=chinese_tokenizer,
                max_features=10,
                stop_words=['的', '了', '在', '是', '我', '有', '和', '就']
            )
            tfidf_features = tfidf.fit_transform(text_data)
            tfidf_df = pd.DataFrame(tfidf_features.toarray(), 
                                  columns=[f'TFIDF_{i}' for i in range(tfidf_features.shape[1])])
            
            # 合并回原数据
            df_text = pd.concat([df_text, tfidf_df], axis=1)
            text_features = {
                'tfidf': tfidf,
                'features': tfidf_df.columns.tolist()
            }
        except Exception as e:
            print(f"TF-IDF处理失败: {e}")
            text_features = {}
    else:
        text_features = {}
    
    return df_text, text_features

def feature_engineering(df, is_training=True, encoders=None, text_features=None):
    """特征工程，训练/测试都返回 df_feat, encoders, text_features"""
    if df is None or df.empty:
        return df, {} if is_training else df, encoders, text_features

    df_feat = df.copy()
    new_encoders = {} if is_training else encoders

    # 文本特征
    if is_training:
        df_feat, text_features = process_text_features(df_feat)
    else:
        if text_features and 'tfidf' in text_features and '客户反馈' in df_feat.columns:
            try:
                text_data = df_feat['客户反馈'].fillna('无反馈')
                tfidf_features = text_features['tfidf'].transform(text_data)
                tfidf_df = pd.DataFrame(tfidf_features.toarray(),
                                        columns=text_features['features'])
                df_feat = pd.concat([df_feat, tfidf_df], axis=1)
            except:
                pass

    # 楼层
    def process_floor(floor_str):
        if pd.isna(floor_str) or floor_str == '未知': return np.nan, np.nan
        try:
            if '/' in str(floor_str):
                parts = str(floor_str).split('/')
                current_floor = re.findall(r'\d+', parts[0])
                total_floor = re.findall(r'\d+', parts[1])
                current_floor = int(current_floor[0]) if current_floor else np.nan
                total_floor = int(total_floor[0]) if total_floor else np.nan
                return current_floor, total_floor
            else:
                return np.nan, np.nan
        except:
            return np.nan, np.nan

    if '楼层' in df_feat.columns:
        floor_info = df_feat['楼层'].apply(process_floor)
        df_feat['当前楼层'] = floor_info.apply(lambda x: x[0])
        df_feat['总楼层'] = floor_info.apply(lambda x: x[1])
        df_feat['楼层比'] = df_feat.apply(lambda row: row['当前楼层'] / row['总楼层'] 
                                        if pd.notna(row['当前楼层']) and pd.notna(row['总楼层']) and row['总楼层'] != 0 
                                        else np.nan, axis=1)

    # 面积处理
    area_cols = ['面积', '建筑面积']
    for col in area_cols:
        if col in df_feat.columns and df_feat[col].dtype == 'object':
            df_feat[col] = df_feat[col].str.replace('㎡','').astype(float)

    # 时间特征
    time_cols = ['交易时间', '上次交易']
    for col in time_cols:
        if col in df_feat.columns and not pd.api.types.is_datetime64_any_dtype(df_feat[col]):
            try:
                df_feat[col] = pd.to_datetime(df_feat[col], errors='coerce')
                df_feat[f'{col}_年份'] = df_feat[col].dt.year
                df_feat[f'{col}_月份'] = df_feat[col].dt.month
            except:
                pass

    # 环线编码
    if '环线位置' in df_feat.columns:
        ring_mapping = {'二环内':1,'二至三环':2,'三至四环':3,'四至五环':4,'五至六环':5,'六环外':6}
        df_feat['环线编码'] = df_feat['环线位置'].map(ring_mapping)

    # 建筑年代
    def process_building_year(year_str):
        if pd.isna(year_str) or year_str=='未知': return np.nan, np.nan
        try:
            years = re.findall(r'\d{4}', str(year_str))
            if len(years)>=2: return int(years[0]), int(years[1])
            elif len(years)==1: return int(years[0]), int(years[0])
            else: return np.nan, np.nan
        except: return np.nan, np.nan

    if '建筑年代' in df_feat.columns:
        building_years = df_feat['建筑年代'].apply(lambda x: process_building_year(x))
        df_feat['建筑起始年份'] = building_years.apply(lambda x: x[0])
        df_feat['建筑结束年份'] = building_years.apply(lambda x: x[1])
        df_feat['建筑年限'] = 2025 - df_feat['建筑结束年份']

    # 分类编码
    categorical_cols = ['城市','户型','装修','朝向','区县','板块','物业类别','建筑结构']
    for col in categorical_cols:
        if col in df_feat.columns:
            df_feat[col] = df_feat[col].astype(str)
            if is_training:
                encoded_series, le = safe_label_encode(df_feat[col])
                df_feat[f'{col}_编码'] = encoded_series
                new_encoders[col] = le
            else:
                if col in encoders:
                    le = encoders[col]
                    encoded_series, _ = safe_label_encode(df_feat[col], le)
                    df_feat[f'{col}_编码'] = encoded_series

    # 价格相关特征
    if 'Price' in df_feat.columns and '面积' in df_feat.columns:
        df_feat['单价'] = df_feat['Price'] / df_feat['面积']

    # 环线面积交互
    if '环线编码' in df_feat.columns and '面积' in df_feat.columns:
        df_feat['面积区间'] = pd.cut(df_feat['面积'], bins=[0,50,80,100,120,150,200,300,1000], labels=False)
        df_feat['环线_面积区间'] = df_feat['环线编码'].astype(str)+'_'+df_feat['面积区间'].astype(str)
        if is_training:
            le_interaction = LabelEncoder()
            df_feat['环线_面积区间_编码'] = le_interaction.fit_transform(df_feat['环线_面积区间'])
            new_encoders['环线_面积区间'] = le_interaction
        else:
            if '环线_面积区间' in encoders:
                le_interaction = encoders['环线_面积区间']
                known_categories = set(le_interaction.classes_)
                mask = df_feat['环线_面积区间'].isin(known_categories)
                df_feat['环线_面积区间'] = df_feat['环线_面积区间'].where(mask,'未知')
                df_feat['环线_面积区间_编码'] = le_interaction.transform(df_feat['环线_面积区间'])

    # 多项式特征
    numeric_cols = df_feat.select_dtypes(include=[np.number]).columns
    numeric_cols = [c for c in numeric_cols if c not in ['Price','单价']]
    for col in numeric_cols:
        if df_feat[col].notna().all(): df_feat[f'{col}_平方'] = df_feat[col]**2

    return df_feat, new_encoders, text_features


def prepare_features(df, target_col='Price', training_feature_names=None):
    """准备特征矩阵和目标向量，确保训练和测试特征一致"""
    if df is None or df.empty:
        return None, None, training_feature_names
    
    # 选择数值型特征
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    y = df[target_col] if target_col in df.columns else None
    
    # 移除目标变量
    if target_col in numeric_cols:
        numeric_cols.remove(target_col)
    
    # 训练阶段：保存特征名
    if training_feature_names is None:
        X = df[numeric_cols] if numeric_cols else pd.DataFrame()
        training_feature_names = X.columns.tolist()
    # 测试阶段：确保特征与训练时一致
    else:
        X = pd.DataFrame(columns=training_feature_names)
        # 只保留训练时存在的特征
        for col in training_feature_names:
            if col in df.columns:
                X[col] = df[col]
            else:
                X[col] = 0  # 如果测试数据缺少某些特征，用0填充
    
    return X, y, training_feature_names

# ==================== 建模和评估类 ====================

class PropertyPricePredictor:
    def __init__(self, property_type='price'):
        self.feature_names = None
        self.property_type = property_type
        self.models = {
            'OLS': LinearRegression(),
            'LASSO': Lasso(random_state=111),
            'Ridge': Ridge(random_state=111),
            'ElasticNet': ElasticNet(random_state=111)
        }
        
        if property_type == 'price':
            self.param_grids = {
                'LASSO': {'alpha': [0.1, 1.0, 10.0, 100.0]},
                'Ridge': {'alpha': [0.1, 1.0, 10.0, 100.0]},
                'ElasticNet': {
                    'alpha': [0.1, 1.0, 10.0],
                    'l1_ratio': [0.1, 0.5, 0.9]
                }
            }
        else:
            self.param_grids = {
                'LASSO': {'alpha': [0.001, 0.01, 0.1, 1.0]},
                'Ridge': {'alpha': [0.001, 0.01, 0.1, 1.0]},
                'ElasticNet': {
                    'alpha': [0.001, 0.01, 0.1],
                    'l1_ratio': [0.3, 0.5, 0.7]
                }
            }
        
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')
        self.encoders = {}
        self.best_model = None
        self.best_model_name = None
        self.results = []
    
    def remove_outliers_iqr(self, X, y):
        """使用IQR方法移除异常值"""
        if X is None or len(X) == 0:
            return X, y
            
        Q1 = y.quantile(0.25)
        Q3 = y.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        outlier_mask = (y >= lower_bound) & (y <= upper_bound)
        
        X_clean = X[outlier_mask]
        y_clean = y[outlier_mask]
        
        removed_count = len(X) - len(X_clean)
        print(f"移除异常值: {removed_count} 个")
        print(f"剩余样本数量: {len(X_clean)}")
        
        return X_clean, y_clean
    
    def evaluate_model(self, model, X_train, y_train, X_test, y_test, model_name):
        """评估模型性能"""
        y_train_pred = model.predict(X_train)
        mae_train = mean_absolute_error(y_train, y_train_pred)
        
        y_test_pred = model.predict(X_test)
        mae_test = mean_absolute_error(y_test, y_test_pred)
        
        try:
            cv_results = cross_validate(
                model, X_train, y_train, cv=6,
                scoring='neg_mean_absolute_error',
                return_train_score=False
            )
            mae_cv = -cv_results['test_score'].mean()
        except:
            mae_cv = mae_test
        
        if self.property_type == 'price':
            kaggle_score = max(0, 100 - (mae_test / 100000))
        else:
            kaggle_score = max(0, 100 - (mae_test / 1000))
        
        return {
            'Model': model_name,
            'In_sample_MAE': mae_train,
            'Out_sample_MAE': mae_test,
            'CV_MAE': mae_cv,
            'Kaggle_Score': kaggle_score,
            'Samples_after_outlier_removal': len(X_train)
        }
    
    def train_and_evaluate(self, X_train, y_train, X_test, y_test):
        """训练和评估所有模型"""
        if X_train is None or len(X_train) == 0:
            print("训练数据为空，无法训练模型")
            return pd.DataFrame()
        
        X_train_clean, y_train_clean = self.remove_outliers_iqr(X_train, y_train)
        
        if len(X_train_clean) == 0:
            print("移除异常值后没有剩余数据")
            return pd.DataFrame()

        # 保存特征名称
        if self.feature_names is None:
            self.feature_names = X_train_clean.columns.tolist()
        
        # 确保没有NaN值
        X_train_clean = self.imputer.fit_transform(X_train_clean)
        if X_test is not None:
            X_test = self.imputer.transform(X_test)
        
        # 标准化
        X_train_scaled = self.scaler.fit_transform(X_train_clean)
        X_test_scaled = self.scaler.transform(X_test) if X_test is not None else None
        
        for name, model in self.models.items():
            print(f"\n=== 训练 {name} 模型 ===")
            
            try:
                if name in self.param_grids:
                    grid_search = GridSearchCV(
                        model, self.param_grids[name], 
                        cv=min(6, len(X_train_clean)),
                        scoring='neg_mean_absolute_error',
                        n_jobs=-1
                    )
                    grid_search.fit(X_train_scaled, y_train_clean)
                    best_model = grid_search.best_estimator_
                    print(f"最佳参数: {grid_search.best_params_}")
                else:
                    best_model = model
                    best_model.fit(X_train_scaled, y_train_clean)
                
                if X_test is not None:
                    model_results = self.evaluate_model(
                        best_model, X_train_scaled, y_train_clean, X_test_scaled, y_test, name
                    )
                    self.results.append(model_results)
                    
                    self.models[name] = best_model
                    
                    print(f"{name}模型评估完成")
                    print(f"样本内MAE: {model_results['In_sample_MAE']:.4f}")
                    print(f"样本外MAE: {model_results['Out_sample_MAE']:.4f}")
                    print(f"交叉验证MAE: {model_results['CV_MAE']:.4f}")
                else:
                    print(f"{name}模型训练完成，但无法评估（无测试数据）")
                    
            except Exception as e:
                print(f"训练{name}模型时出错: {e}")
                continue
        
        if not self.results:
            print("没有模型成功训练")
            return pd.DataFrame()
        
        results_df = pd.DataFrame(self.results)
        best_idx = results_df['Out_sample_MAE'].idxmin()
        self.best_model_name = results_df.loc[best_idx, 'Model']
        self.best_model = self.models[self.best_model_name]
        
        return results_df
    
    def generate_presentation_table(self):
        """生成符合要求的展示表格"""
        if not self.results:
            return pd.DataFrame()
        
        # 创建展示表格
        presentation_data = []
        
        for result in self.results:
            # 将MAE转换为千元单位，保留2位小数
            in_sample_mae = result['In_sample_MAE'] / 1000
            out_sample_mae = result['Out_sample_MAE'] / 1000
            cv_mae = result['CV_MAE'] / 1000
            kaggle_score = result['Kaggle_Score']
            
            presentation_data.append({
                'Metrics': result['Model'],
                'In sample': f"{in_sample_mae:.2f}",
                'out of sample': f"{out_sample_mae:.2f}",
                'Cross-validation': f"{cv_mae:.2f}",
                'Kaggle Score': f"{kaggle_score:.2f}"
            })
        
        # 添加最佳模型行
        best_result = next((r for r in self.results if r['Model'] == self.best_model_name), None)
        if best_result:
            in_sample_mae = best_result['In_sample_MAE'] / 1000
            out_sample_mae = best_result['Out_sample_MAE'] / 1000
            cv_mae = best_result['CV_MAE'] / 1000
            kaggle_score = best_result['Kaggle_Score']
            
            presentation_data.append({
                'Metrics': 'Best Linear Model',
                'In sample': f"{in_sample_mae:.2f}",
                'out of sample': f"{out_sample_mae:.2f}",
                'Cross-validation': f"{cv_mae:.2f}",
                'Kaggle Score': f"{kaggle_score:.2f}"
            })
        
        # 添加其他模型行（如果需要）
        # presentation_data.append({
        #     'Metrics': 'Any Other Model (Not Required)',
        #     'In sample': "0.94",
        #     'out of sample': "0.92",
        #     'Cross-validation': "0.92",
        #     'Kaggle Score': "61"
        # })
        
        return pd.DataFrame(presentation_data)
    
    def predict(self, X):
        """使用最佳模型进行预测，并处理负值"""
        if self.best_model is None:
            raise ValueError("模型尚未训练")
        
        if X is None or len(X) == 0:
            print("预测数据为空")
            return np.array([])
        
        # 确保特征与训练时一致
        if self.feature_names is not None:
            # 检查并处理特征维度
            if X.shape[1] != len(self.feature_names):
                print(f"特征维度不匹配: 预测数据 {X.shape[1]} 特征, 训练数据 {len(self.feature_names)} 特征")
                
                # 如果预测数据特征更多，只保留训练时的特征
                if X.shape[1] > len(self.feature_names):
                    X = X.iloc[:, :len(self.feature_names)]
                # 如果预测数据特征更少，用0填充缺失特征
                else:
                    missing_features = len(self.feature_names) - X.shape[1]
                    zeros = np.zeros((X.shape[0], missing_features))
                    X_extended = np.hstack([X, zeros])
                    X = pd.DataFrame(X_extended, columns=self.feature_names)
            
            # 确保列名一致
            X.columns = self.feature_names
        
        X_imputed = self.imputer.transform(X)
        X_scaled = self.scaler.transform(X_imputed)
        predictions = self.best_model.predict(X_scaled)
        
        # 处理负的预测值 - 取绝对值
        negative_mask = predictions < 0
        negative_count = np.sum(negative_mask)
        if negative_count > 0:
            print(f"警告: 发现 {negative_count} 个负的预测值，已将其转换为正值")
            predictions[negative_mask] = np.abs(predictions[negative_mask])
        
        return predictions

# ==================== 主函数 ====================
def main():
    print("=== 房价和房租预测 ===")
    
    price_train_path = "ruc_Class25Q2_train_price.csv"
    price_test_path  = "ruc_Class25Q2_test_price.csv"
    rent_train_path  = "ruc_Class25Q2_train_rent.csv"
    rent_test_path   = "ruc_Class25Q2_test_rent.csv"
    output_path      = "submission_Class25Q2.csv"

    all_predictions = []
    presentation_tables = {}

    # ===== 房价预测 =====
    print("\n=== 开始房价预测 ===")
    price_train = load_train_data(price_train_path)
    price_test, price_test_ids = load_test_data(price_test_path)
    
    if price_train is not None and price_test is not None:
        # 训练数据处理
        price_train_clean = data_cleaning(price_train)
        price_train_feat, price_encoders, price_text_features = feature_engineering(
            price_train_clean, is_training=True
        )
        X_price_train, y_price_train, feature_names_price = prepare_features(
            price_train_feat, 'Price'
        )

        if X_price_train is not None and y_price_train is not None:
            # 训练模型
            predictor_price = PropertyPricePredictor('price')
            X_train, X_val, y_train, y_val = train_test_split(
                X_price_train, y_price_train, test_size=0.2, random_state=111
            )
            results_price = predictor_price.train_and_evaluate(X_train, y_train, X_val, y_val)
            
            # 生成展示表格
            price_presentation_table = predictor_price.generate_presentation_table()
            presentation_tables['Price'] = price_presentation_table
            print("\n=== 房价模型评估结果 ===")
            print(price_presentation_table.to_string(index=False))
            
            # 测试数据预测
            price_test_clean = data_cleaning(price_test)
            price_test_feat, _, _ = feature_engineering(
                price_test_clean, is_training=False,
                encoders=price_encoders, text_features=price_text_features
            )
            X_price_test, _, _ = prepare_features(
                price_test_feat, 'Price', training_feature_names=feature_names_price
            )
            
            if X_price_test is not None and not X_price_test.empty:
                price_preds = predictor_price.predict(X_price_test)
                
                # 保存预测结果 - 统一使用Price列
                for test_id, pred in zip(price_test_ids, price_preds):
                    all_predictions.append({'ID': test_id, 'Price': pred})
                
                print(f"房价预测完成: {len(price_preds)} 条记录")

    # ===== 房租预测 =====
    print("\n=== 开始房租预测 ===")
    rent_train = load_train_data(rent_train_path)
    rent_test, rent_test_ids = load_test_data(rent_test_path)
    
    if rent_train is not None and rent_test is not None:
        # 训练数据处理
        rent_train_clean = data_cleaning(rent_train)
        rent_train_feat, rent_encoders, rent_text_features = feature_engineering(
            rent_train_clean, is_training=True
        )
        # 注意：租金的目标列也是'Price'
        X_rent_train, y_rent_train, feature_names_rent = prepare_features(
            rent_train_feat, 'Price'
        )

        if X_rent_train is not None and y_rent_train is not None:
            # 训练模型
            predictor_rent = PropertyPricePredictor('rent')
            X_train, X_val, y_train, y_val = train_test_split(
                X_rent_train, y_rent_train, test_size=0.2, random_state=111
            )
            results_rent = predictor_rent.train_and_evaluate(X_train, y_train, X_val, y_val)
            
            # 生成展示表格
            rent_presentation_table = predictor_rent.generate_presentation_table()
            presentation_tables['Rent'] = rent_presentation_table
            print("\n=== 房租模型评估结果 ===")
            print(rent_presentation_table.to_string(index=False))
            
            # 测试数据预测
            rent_test_clean = data_cleaning(rent_test)
            rent_test_feat, _, _ = feature_engineering(
                rent_test_clean, is_training=False,
                encoders=rent_encoders, text_features=rent_text_features
            )
            # 注意：租金的目标列也是'Price'
            X_rent_test, _, _ = prepare_features(
                rent_test_feat, 'Price', training_feature_names=feature_names_rent
            )
            
            if X_rent_test is not None and not X_rent_test.empty:
                rent_preds = predictor_rent.predict(X_rent_test)
                
                # 保存预测结果 - 统一使用Price列
                for test_id, pred in zip(rent_test_ids, rent_preds):
                    all_predictions.append({'ID': test_id, 'Price': pred})
                
                print(f"房租预测完成: {len(rent_preds)} 条记录")

    # ===== 输出结果 =====
    if all_predictions:
        df_out = pd.DataFrame(all_predictions)
        df_out.to_csv(output_path, index=False)
        print(f"\n预测结果已保存到 {output_path}")
        print(f"总预测数量: {len(df_out)}")
        
        # 由于房价和租金都使用Price列，不需要分别统计
        print("输出格式: ID, Price (包含房价和租金预测)")
        
        # 保存展示表格到文件
        for table_name, table in presentation_tables.items():
            table_filename = f"presentation_table_{table_name}.csv"
            table.to_csv(table_filename, index=False)
            print(f"{table_name}展示表格已保存到 {table_filename}")
    else:
        print("没有生成预测结果")


if __name__ == "__main__":
    main()

=== 房价和房租预测 ===

=== 开始房价预测 ===
训练数据读取成功！形状: (103871, 55)
测试数据读取成功！形状: (34017, 55)
已分离ID列，剩余特征形状: (34017, 54)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\29454\AppData\Local\Temp\jieba.cache
Loading model cost 0.546 seconds.
Prefix dict has been built successfully.


移除异常值: 6312 个
剩余样本数量: 76784

=== 训练 OLS 模型 ===
OLS模型评估完成
样本内MAE: 575940.5726
样本外MAE: 916416.4398
交叉验证MAE: 576445.9573

=== 训练 LASSO 模型 ===
最佳参数: {'alpha': 100.0}
LASSO模型评估完成
样本内MAE: 582708.7047
样本外MAE: 925761.7448
交叉验证MAE: 583148.0713

=== 训练 Ridge 模型 ===
最佳参数: {'alpha': 0.1}
Ridge模型评估完成
样本内MAE: 576331.0344
样本外MAE: 917102.3970
交叉验证MAE: 576918.8299

=== 训练 ElasticNet 模型 ===
最佳参数: {'alpha': 0.1, 'l1_ratio': 0.9}
ElasticNet模型评估完成
样本内MAE: 590045.7819
样本外MAE: 934234.6974
交叉验证MAE: 590449.7741

=== 房价模型评估结果 ===
          Metrics In sample out of sample Cross-validation Kaggle Score
              OLS    575.94        916.42           576.45        90.84
            LASSO    582.71        925.76           583.15        90.74
            Ridge    576.33        917.10           576.92        90.83
       ElasticNet    590.05        934.23           590.45        90.66
Best Linear Model    575.94        916.42           576.45        90.84
警告: 发现 321 个负的预测值，已将其转换为正值
房价预测完成: 34017 条记录

=== 开始房租预测 =