# Midterm Project -- 刘子意

## 一、数据导入与预览

In [1]:
import pandas as pd
import numpy as np
import re
import joblib
from datetime import datetime
from sklearn.linear_model import SGDRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# 数据导入
df_trainP = pd.read_csv('data/ruc_Class25Q2_train_price.csv', low_memory=False)
df_trainR = pd.read_csv('data/ruc_Class25Q2_train_rent.csv', low_memory=False)

col_map = {
    '户型': '房屋户型',
    '装修': '装修情况',
    '楼层': '所在楼层',
    '面积': '建筑面积',
    '朝向': '房屋朝向',
    '电梯': '配备电梯'
    }

df_trainR = df_trainR.rename(columns=col_map)

# 预览
print("Price数据结构:")
print(df_trainP.info())
print("\nRent数据结构:")
print(df_trainR.info())

Price数据结构:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103871 entries, 0 to 103870
Data columns (total 55 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   城市         103871 non-null  int64  
 1   区域         103871 non-null  float64
 2   板块         103871 non-null  float64
 3   环线         40419 non-null   object 
 4   Price      103871 non-null  float64
 5   房屋户型       103291 non-null  object 
 6   所在楼层       103871 non-null  object 
 7   建筑面积       103871 non-null  object 
 8   套内面积       35984 non-null   object 
 9   房屋朝向       103870 non-null  object 
 10  建筑结构       103291 non-null  object 
 11  装修情况       103291 non-null  object 
 12  梯户比例       101252 non-null  object 
 13  配备电梯       91520 non-null   object 
 14  别墅类型       1443 non-null    object 
 15  交易时间       103871 non-null  object 
 16  交易权属       103871 non-null  object 
 17  上次交易       78422 non-null   object 
 18  房屋用途       103870 non-null  object 
 19  房屋年限       5

## 二、数据准备

### 1. 划分数据集

In [3]:
# 为避免数据泄露，首先划分训练集与测试集
Xp = df_trainP.drop(['Price'], axis=1)
yp = df_trainP['Price']
Xr = df_trainR.drop(['Price'], axis=1)
yr = df_trainR['Price']

Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    Xp, 
    yp, 
    test_size=0.2,
    random_state=111
)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, 
    yr, 
    test_size=0.2,
    random_state=111
)

print(f"Price训练集结构: Xp_train {Xp_train.shape}, yp_train {yp_train.shape}")
print(f"Price测试集结构: Xp_test {Xp_test.shape}, yp_test {yp_test.shape}")
print(f"\nRent训练集结构: Xr_train {Xr_train.shape}, yr_train {yr_train.shape}")
print(f"Rent测试集结构: Xr_test {Xr_test.shape}, yr_test {yr_test.shape}")

Price训练集结构: Xp_train (83096, 54), yp_train (83096,)
Price测试集结构: Xp_test (20775, 54), yp_test (20775,)

Rent训练集结构: Xr_train (79119, 45), yr_train (79119,)
Rent测试集结构: Xr_test (19780, 45), yr_test (19780,)


In [4]:
drop_col = ['年份']

Xp_train = Xp_train.drop(columns=drop_col)
Xr_train = Xr_train.drop(columns=drop_col)
Xp_test = Xp_test.drop(columns=drop_col)
Xr_test = Xr_test.drop(columns=drop_col)

### 2. 数据预处理

In [5]:
# 创建预处理类（共同特征）
class Preprocessor:
    def __init__(self):
        pass
    
    def extract_room_info(self, house_o):
        """使用正则表达式提取房间数量"""
        house_o = str(house_o).replace(' ', '')
        
        room_match = re.search(r'(\d+)(?:室|房间)', house_o)
        living_match = re.search(r'(\d+)厅', house_o)
        kitchen_match = re.search(r'(\d+)厨', house_o)
        bathroom_match = re.search(r'(\d+)卫', house_o)
        
        rooms = int(room_match.group(1)) if room_match else 0
        living_rooms = int(living_match.group(1)) if living_match else 0
        kitchens = int(kitchen_match.group(1)) if kitchen_match else 0
        bathrooms = int(bathroom_match.group(1)) if bathroom_match else 0
        
        return rooms, living_rooms, kitchens, bathrooms
    
    def extract_time_info(self, time_o):
        """提取建筑年代"""
        time_o = str(time_o).replace(' ', '')
        time_match = re.search(r'(\d{4})年', time_o)
        time = int(time_match.group(1)) if time_match else 0
        return time
    
    def extract_fee_info(self, fee_o):
        """提取物业费（取平均值）"""
        fee_o = str(fee_o).replace(' ', '')

        fee_match = re.search(r'(\d+\.?\d*)-(\d+\.?\d*)元/月/㎡', fee_o)
        if fee_match:
            fee_lower = float(fee_match.group(1))
            fee_upper = float(fee_match.group(2))
            return (fee_lower + fee_upper) / 2
        
        single_match = re.search(r'(\d+\.?\d*)元/月/㎡', fee_o)
        if single_match:
            fee_val = float(single_match.group(1))
            return fee_val
        
        return np.nan
    
    def extract_fuel_info(self, fuel_o):
        """提取燃气费（取平均值）"""
        fuel_o = str(fuel_o).replace(' ', '')

        fuel_match = re.search(r'(\d+\.?\d*)-(\d+\.?\d*)元/m³', fuel_o)
        if fuel_match:
            fuel_lower = float(fuel_match.group(1))
            fuel_upper = float(fuel_match.group(2))
            return (fuel_lower + fuel_upper) / 2
        
        fsingle_match = re.search(r'(\d+\.?\d*)元/m³', fuel_o)
        if fsingle_match:
            fuel_val = float(fsingle_match.group(1))
            return fuel_val
        
        return np.nan
    
    def extract_heat_info(self, heat_o):
        """提取供热费（取平均值）"""
        heat_o = str(heat_o).replace(' ', '')

        heat_match = re.search(r'(\d+\.?\d*)-(\d+\.?\d*)元/㎡', heat_o)
        if heat_match:
            heat_lower = float(heat_match.group(1))
            heat_upper = float(heat_match.group(2))
            return (heat_lower + heat_upper) / 2
        
        hsingle_match = re.search(r'(\d+\.?\d*)元/㎡', heat_o)
        if hsingle_match:
            heat_val = float(hsingle_match.group(1))
            return heat_val
        
        return np.nan
    
    def preprocess(self, df, verbose=True):
        df_processed = df.copy()
        
        # 1. 房屋户型处理
        room_info = df_processed['房屋户型'].apply(self.extract_room_info)
        df_processed['室'] = room_info.apply(lambda x: x[0])
        df_processed['厅'] = room_info.apply(lambda x: x[1])
        df_processed['厨'] = room_info.apply(lambda x: x[2])
        df_processed['卫'] = room_info.apply(lambda x: x[3])
        df_processed = df_processed.drop(columns=['房屋户型'])
        
        # 2. 建筑面积处理
        df_processed['建筑面积'] = df_processed['建筑面积'].str.replace('㎡', '').astype(float)
        
        # 3. 配备电梯处理
        df_processed['配备电梯'] = df_processed['配备电梯'].map({'有': 1, '无': 0})
        
        # 4. 交易时间处理
        df_processed['交易时间'] = pd.to_datetime(df_processed['交易时间'])
        df_processed['交易年份'] = df_processed['交易时间'].dt.year
        df_processed['交易月份'] = df_processed['交易时间'].dt.month
        df_processed = df_processed.drop(columns=['交易时间'])
        
        # 5. 建筑年代处理
        df_processed['建成时间'] = df_processed['建筑年代'].apply(self.extract_time_info)
        df_processed = df_processed.drop(columns=['建筑年代'])
        
        # 6. 房屋总数处理
        df_processed['房屋总数'] = df_processed['房屋总数'].str.replace('户', '').astype(float)
        
        # 7. 楼栋总数处理
        df_processed['楼栋总数'] = df_processed['楼栋总数'].str.replace('栋', '').astype(float)
        
        # 8. 绿化率处理
        df_processed['绿 化 率'] = df_processed['绿 化 率'].str.replace('%', '').astype(float)
        df_processed['绿 化 率'] = df_processed['绿 化 率'] * 0.01
        
        # 9. 物业费处理
        df_processed['物业费平均'] = df_processed['物 业 费'].apply(self.extract_fee_info)
        df_processed = df_processed.drop(columns=['物 业 费'])
        
        # 10. 燃气费处理
        df_processed['燃气费平均'] = df_processed['燃气费'].apply(self.extract_fuel_info)
        df_processed = df_processed.drop(columns=['燃气费'])
        
        # 11. 供热费处理
        df_processed['供热费平均'] = df_processed['供热费'].apply(self.extract_heat_info)
        df_processed = df_processed.drop(columns=['供热费'])
        
        if verbose:
            print(f"原始特征数量: {len(df.columns)}")
            print(f"处理后特征数量: {len(df_processed.columns)}")
            print(f"删除的特征: {set(df.columns) - set(df_processed.columns)}")
            print(f"新增的特征: {set(df_processed.columns) - set(df.columns)}")
        
        return df_processed

In [6]:
# 执行预处理
if __name__ == "__main__":
    preprocessor = Preprocessor()
    
    Xp_train = preprocessor.preprocess(Xp_train)
    print("Price训练集数据预处理完成\n")
    Xp_test= preprocessor.preprocess(Xp_test)
    print("Price测试集数据预处理完成\n")
    Xr_train = preprocessor.preprocess(Xr_train)
    print("Rent训练集数据预处理完成\n")
    Xr_test= preprocessor.preprocess(Xr_test)
    print("Rent测试集数据预处理完成")

原始特征数量: 53
处理后特征数量: 57
删除的特征: {'供热费', '燃气费', '房屋户型', '物 业 费', '交易时间', '建筑年代'}
新增的特征: {'交易月份', '物业费平均', '建成时间', '交易年份', '厨', '卫', '厅', '燃气费平均', '供热费平均', '室'}
Price训练集数据预处理完成

原始特征数量: 53
处理后特征数量: 57
删除的特征: {'供热费', '燃气费', '房屋户型', '物 业 费', '交易时间', '建筑年代'}
新增的特征: {'交易月份', '物业费平均', '建成时间', '交易年份', '厨', '卫', '厅', '燃气费平均', '供热费平均', '室'}
Price测试集数据预处理完成

原始特征数量: 44
处理后特征数量: 48
删除的特征: {'供热费', '燃气费', '房屋户型', '物 业 费', '交易时间', '建筑年代'}
新增的特征: {'交易月份', '物业费平均', '建成时间', '交易年份', '厨', '卫', '厅', '燃气费平均', '供热费平均', '室'}
Rent训练集数据预处理完成

原始特征数量: 44
处理后特征数量: 48
删除的特征: {'供热费', '燃气费', '房屋户型', '物 业 费', '交易时间', '建筑年代'}
新增的特征: {'交易月份', '物业费平均', '建成时间', '交易年份', '厨', '卫', '厅', '燃气费平均', '供热费平均', '室'}
Rent测试集数据预处理完成


In [7]:
# 分别处理不同特征
def process_floor_p(df, verbose=True):
    def extract_floor_p(floor_o):
        """提取所在楼层"""
        floor_o = str(floor_o)
        
        # 地下室
        if '地下室' in floor_o:
            total_match = re.search(r'共(\d+)层', floor_o)
            total_floor = int(total_match.group(1)) if total_match else None
            return ('地下室', total_floor)
        
        # 底层
        elif '底层' in floor_o:
            total_match = re.search(r'共(\d+)层', floor_o)
            total_floor = int(total_match.group(1)) if total_match else None
            return ('底层', total_floor)
        
        # 顶层
        elif '顶层' in floor_o:
            total_match = re.search(r'共(\d+)层', floor_o)
            total_floor = int(total_match.group(1)) if total_match else None
            return ('顶层', total_floor)
        
        # 低/中/高楼层
        elif any(x in floor_o for x in ['低楼层', '中楼层', '高楼层']):
            floor_type = None
            if '低楼层' in floor_o: 
                floor_type = '低楼层'
            elif '中楼层' in floor_o: 
                floor_type = '中楼层'
            elif '高楼层' in floor_o: 
                floor_type = '高楼层'
            
            total_match = re.search(r'共(\d+)层', floor_o)
            total_floor = int(total_match.group(1)) if total_match else None
            
            return (floor_type, total_floor)

        return ('其他', None)
    
    df_processed = df.copy()
    
    # 提取楼层信息
    floor_info = df_processed['所在楼层'].apply(extract_floor_p)
    df_processed['楼层类型'] = floor_info.apply(lambda x: x[0])
    df_processed['总楼层'] = floor_info.apply(lambda x: x[1])
    df_processed = df_processed.drop(columns=['所在楼层'])
    
    if verbose:
        print(f"楼层类型分布:")
        print(df_processed['楼层类型'].value_counts())
        print(f"总楼层统计:")
        print(df_processed['总楼层'].describe())
    
    return df_processed

print("Price训练集楼层数据处理：")
Xp_train = process_floor_p(Xp_train)
print("\nPrice测试集楼层数据处理：")
Xp_test = process_floor_p(Xp_test)

Price训练集楼层数据处理：
楼层类型分布:
中楼层    29047
高楼层    25598
低楼层    24570
顶层      1797
底层      1520
地下室      564
Name: 楼层类型, dtype: int64
总楼层统计:
count    83096.00000
mean        18.25479
std         10.98071
min          0.00000
25%          7.00000
50%         18.00000
75%         28.00000
max         70.00000
Name: 总楼层, dtype: float64

Price测试集楼层数据处理：
楼层类型分布:
中楼层    7204
高楼层    6534
低楼层    6098
顶层      467
底层      334
地下室     138
Name: 楼层类型, dtype: int64
总楼层统计:
count    20775.000000
mean        18.341757
std         10.995631
min          0.000000
25%          7.000000
50%         18.000000
75%         28.000000
max         63.000000
Name: 总楼层, dtype: float64


In [8]:
def process_floor_r(df, verbose=True):
    def extract_floor_r(floor_o):
        """提取所在楼层"""
        floor_o = str(floor_o).strip()
        
        slash_match = re.search(r'(\d+)/(\d+)层', floor_o)
        if slash_match:
            current_floor = int(slash_match.group(1))
            total_floor = int(slash_match.group(2))
            
            # 根据当前楼层与总楼层的关系判断楼层类型
            if current_floor == 1:
                return ('底层', total_floor)
            elif current_floor == total_floor:
                return ('顶层', total_floor)
            else:
                floor_ratio = current_floor / total_floor
                if floor_ratio <= 0.33:
                    return ('低楼层', total_floor)
                elif floor_ratio <= 0.66:
                    return ('中楼层', total_floor)
                else:
                    return ('高楼层', total_floor)
        
        # 地下室
        elif '地下室' in floor_o:
            total_match = re.search(r'共(\d+)层', floor_o)
            total_floor = int(total_match.group(1)) if total_match else None
            return ('地下室', total_floor)
        
        # 底层
        elif '底层' in floor_o:
            total_match = re.search(r'共(\d+)层', floor_o)
            total_floor = int(total_match.group(1)) if total_match else None
            return ('底层', total_floor)
        
        # 顶层
        elif '顶层' in floor_o:
            total_match = re.search(r'共(\d+)层', floor_o)
            total_floor = int(total_match.group(1)) if total_match else None
            return ('顶层', total_floor)
        
        # 低/中/高楼层
        elif any(x in floor_o for x in ['低楼层', '中楼层', '高楼层']):
            floor_type = None
            if '低楼层' in floor_o: 
                floor_type = '低楼层'
            elif '中楼层' in floor_o: 
                floor_type = '中楼层'
            elif '高楼层' in floor_o: 
                floor_type = '高楼层'
            
            total_match = re.search(r'共(\d+)层', floor_o)
            total_floor = int(total_match.group(1)) if total_match else None
            
            return (floor_type, total_floor)

        return ('其他', None)
    
    df_processed = df.copy()
    
    # 提取楼层信息
    floor_info = df_processed['所在楼层'].apply(extract_floor_r)
    df_processed['楼层类型'] = floor_info.apply(lambda x: x[0])
    df_processed['总楼层'] = floor_info.apply(lambda x: x[1])
    df_processed = df_processed.drop(columns=['所在楼层'])
    
    if verbose:
        print(f"楼层类型分布:")
        print(df_processed['楼层类型'].value_counts())
        print(f"总楼层统计:")
        print(df_processed['总楼层'].describe())
    
    return df_processed

print("Rent训练集楼层数据处理：")
Xr_train = process_floor_r(Xr_train)
print("\nRent测试集楼层数据处理：")
Xr_test = process_floor_r(Xr_test)

Rent训练集楼层数据处理：
楼层类型分布:
中楼层    29820
高楼层    26407
低楼层    21807
顶层       622
底层       251
地下室      203
其他         9
Name: 楼层类型, dtype: int64
总楼层统计:
count    7343.000000
mean       19.279041
std        11.007286
min         1.000000
25%         8.000000
50%        18.000000
75%        29.000000
max        75.000000
Name: 总楼层, dtype: float64

Rent测试集楼层数据处理：
楼层类型分布:
中楼层    7389
高楼层    6594
低楼层    5500
顶层      183
底层       62
地下室      49
其他        3
Name: 楼层类型, dtype: int64
总楼层统计:
count    1843.000000
mean       19.285947
std        11.599036
min         1.000000
25%         7.000000
50%        18.000000
75%        30.000000
max        48.000000
Name: 总楼层, dtype: float64


In [9]:
# 处理目标变量（取对数，改善数据分布并改进模型性能）
yp_train= np.log(yp_train)
yr_train = np.log(yr_train)
yp_test = np.log(yp_test)
yr_test = np.log(yr_test)

### 3. 异常值处理

In [10]:
def detect_outliers_iqr_x_only(X0, y0, threshold=3):
    # 检测X异常值
    outlier_info_X = {}
    mask_X = pd.Series([True] * len(X0), index=X0.index)
    
    numeric_columns = X0.select_dtypes(include=[np.int64, np.float64]).columns
    
    for col in numeric_columns:
        Q1 = np.percentile(X0[col], 25)
        Q3 = np.percentile(X0[col], 75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        
        outliers_mask = (X0[col] < lower_bound) | (X0[col] > upper_bound)
        n_outliers = outliers_mask.sum()
        
        if n_outliers > 0:
            outlier_info_X[col] = {
                'n_outliers': n_outliers,
                'outlier_rate': n_outliers / len(X0) * 100,
                'bounds': [lower_bound, upper_bound],
                'actual_range': [X0[col].min(), X0[col].max()]
            }
            
            mask_X = mask_X & ~outliers_mask
    
    n_X_outliers = len(X0) - np.sum(mask_X)
    
    print(f"原始样本量: {len(X0)}")
    print(f"X数值列异常值样本数: {n_X_outliers} ({n_X_outliers/len(X0)*100:.2f}%)")
    
    if outlier_info_X:
        print("\nX异常值详情:")
        for col, info in outlier_info_X.items():
            print(f"  {col}:")
            print(f"    异常值数量: {info['n_outliers']} ({info['outlier_rate']:.2f}%)")
            print(f"    正常范围: [{info['bounds'][0]:.2f}, {info['bounds'][1]:.2f}]")
            print(f"    实际范围: [{info['actual_range'][0]:.2f}, {info['actual_range'][1]:.2f}]")
    else:
        print("\n未检测到异常值")
    
    return mask_X, outlier_info_X

mask_Xp, outlier_info_Xp = detect_outliers_iqr_x_only(Xp_train, yp_train)
mask_Xr, outlier_info_Xr = detect_outliers_iqr_x_only(Xr_train, yr_train)

Xp_train = Xp_train[mask_Xp]
yp_train = yp_train[mask_Xp]
Xr_train = Xr_train[mask_Xr]
yr_train = yr_train[mask_Xr]

原始样本量: 83096
X数值列异常值样本数: 3772 (4.54%)

X异常值详情:
  建筑面积:
    异常值数量: 1160 (1.40%)
    正常范围: [-72.80, 262.71]
    实际范围: [11.70, 508.11]
  室:
    异常值数量: 154 (0.19%)
    正常范围: [-1.00, 6.00]
    实际范围: [0.00, 12.00]
  厅:
    异常值数量: 5 (0.01%)
    正常范围: [-2.00, 5.00]
    实际范围: [0.00, 9.00]
  厨:
    异常值数量: 2676 (3.22%)
    正常范围: [1.00, 1.00]
    实际范围: [0.00, 7.00]
  卫:
    异常值数量: 79 (0.10%)
    正常范围: [-2.00, 5.00]
    实际范围: [0.00, 12.00]
  交易年份:
    异常值数量: 19 (0.02%)
    正常范围: [2020.00, 2027.00]
    实际范围: [2018.00, 2025.00]
原始样本量: 79119
X数值列异常值样本数: 5137 (6.49%)

X异常值详情:
  建筑面积:
    异常值数量: 577 (0.73%)
    正常范围: [-98.26, 248.35]
    实际范围: [6.05, 440.00]
  室:
    异常值数量: 4 (0.01%)
    正常范围: [-5.00, 9.00]
    实际范围: [0.00, 12.00]
  厅:
    异常值数量: 1 (0.00%)
    正常范围: [-2.00, 5.00]
    实际范围: [0.00, 7.00]
  卫:
    异常值数量: 36 (0.05%)
    正常范围: [-3.00, 4.00]
    实际范围: [0.00, 9.00]
  交易年份:
    异常值数量: 4566 (5.77%)
    正常范围: [2024.00, 2024.00]
    实际范围: [2024.00, 2025.00]


In [11]:
def detect_outliers_iqr_y_only(y0, threshold=3):
    Q1_y = np.percentile(y0, 25)
    Q3_y = np.percentile(y0, 75)
    IQR_y = Q3_y - Q1_y
    lower_bound_y = Q1_y - threshold * IQR_y
    upper_bound_y = Q3_y + threshold * IQR_y
    
    mask_y = (y0 >= lower_bound_y) & (y0 <= upper_bound_y)
    n_outliers_y = len(y0) - np.sum(mask_y)
    
    outlier_info = {
        'n_outliers': n_outliers_y,
        'outlier_rate': n_outliers_y / len(y0) * 100,
        'bounds': [lower_bound_y, upper_bound_y],
        'actual_range': [y0.min(), y0.max()]
    }
    
    print(f"y原始样本量: {len(y0)}")
    print(f"y异常值数量: {n_outliers_y} ({n_outliers_y/len(y0)*100:.2f}%)")
    print(f"y正常范围: [{lower_bound_y:.2f}, {upper_bound_y:.2f}]")
    print(f"y实际范围: [{y0.min():.2f}, {y0.max():.2f}]")
    
    return mask_y, outlier_info

print("Price数据集:")
mask_yp, info_yp = detect_outliers_iqr_y_only(yp_train)
print("\nRent数据集:")
mask_yr, info_yr = detect_outliers_iqr_y_only(yr_train)

Xp_train = Xp_train[mask_yp]
yp_train = yp_train[mask_yp]
Xr_train = Xr_train[mask_yr]
yr_train = yr_train[mask_yr]

Price数据集:
y原始样本量: 79324
y异常值数量: 0 (0.00%)
y正常范围: [10.48, 18.00]
y实际范围: [11.60, 17.15]

Rent数据集:
y原始样本量: 73982
y异常值数量: 0 (0.00%)
y正常范围: [9.11, 16.77]
y实际范围: [9.79, 15.95]


### 4. 特征工程

In [12]:
# 添加非线性特征和交互项
def create_features(df):
    df_new = df.copy()

    df_new['建筑面积2'] = df_new['建筑面积'] ** 2
    df_new['总楼层2'] = df_new['总楼层'] ** 2
    df_new['面积_室交互'] = df_new['建筑面积'] * df_new['室']
    df_new['面积_厅交互'] = df_new['建筑面积'] * df_new['厅']
    df_new['面积_厨交互'] = df_new['建筑面积'] * df_new['厨']
    df_new['面积_卫交互'] = df_new['建筑面积'] * df_new['卫']
    df_new['楼层_电梯交互'] = df_new['总楼层'] * df_new['配备电梯']
    df_new['位置交互'] = df_new['coord_x'] * df_new['coord_y']
    
    return df_new

Xp_train = create_features(Xp_train)
Xp_test = create_features(Xp_test)
Xr_train = create_features(Xr_train)
Xr_test = create_features(Xr_test)

## 三、模型训练

### 1. 管道构建

In [13]:
# Price处理管道（缺失值填充、类别变量编码）
cat_features_p = Xp_train.select_dtypes(include='object').columns
num_features_p = Xp_train.select_dtypes(include=np.number).columns

cat_pipe_p = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=True))
])

num_pipe_p = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

preprocessor_p = ColumnTransformer([
    ('cat', cat_pipe_p, cat_features_p),
    ('num', num_pipe_p, num_features_p)
])

In [14]:
# Rent处理管道（缺失值填充、类别变量编码）
cat_features_r = Xr_train.select_dtypes(include='object').columns
num_features_r = Xr_train.select_dtypes(include=np.number).columns

cat_pipe_r = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=True))
])

num_pipe_r = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

preprocessor_r = ColumnTransformer([
    ('cat', cat_pipe_r, cat_features_r),
    ('num', num_pipe_r, num_features_r)
])

### 2. 模型构建与训练（Mini-batch Gradient Descent）

In [15]:
# Price建模
# 1. OLS
ols_pipeline_p = Pipeline([
    ('preprocessor', preprocessor_p),
    ('regressor', SGDRegressor(
        loss='squared_error',
        penalty=None,
        learning_rate='invscaling',
        eta0=0.01,
        max_iter=2000,
        tol=1e-4,
        early_stopping=True,        # 启用早停策略以防止过拟合
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=42
    ))
])

# 2. Lasso
lasso_pipeline_p = Pipeline([
    ('preprocessor', preprocessor_p),
    ('regressor', SGDRegressor(
        loss='squared_error',
        penalty='l1',
        alpha=0.001,
        learning_rate='invscaling',
        eta0=0.01,
        max_iter=2000,
        tol=1e-4,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=42
    ))
])

# 3. Ridge
ridge_pipeline_p = Pipeline([
    ('preprocessor', preprocessor_p),
    ('regressor', SGDRegressor(
        loss='squared_error',
        penalty='l2',
        alpha=0.001,
        learning_rate='invscaling',
        eta0=0.01,
        max_iter=2000,
        tol=1e-4,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=42
    ))
])

# 4. Elastic Net
elastic_pipeline_p = Pipeline([
    ('preprocessor', preprocessor_p),
    ('regressor', SGDRegressor(
        loss='squared_error',
        penalty='elasticnet',
        alpha=0.001,
        l1_ratio=0.5,
        learning_rate='invscaling',
        eta0=0.01,
        max_iter=2000,
        tol=1e-4,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=42
    ))
])

# Rent建模
# 1. OLS
ols_pipeline_r = Pipeline([
    ('preprocessor', preprocessor_r),
    ('regressor', SGDRegressor(
        loss='squared_error',
        penalty=None,
        learning_rate='invscaling',
        eta0=0.01,
        max_iter=2000,
        tol=1e-4,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=42
    ))
])

# 2. Lasso
lasso_pipeline_r = Pipeline([
    ('preprocessor', preprocessor_r),
    ('regressor', SGDRegressor(
        loss='squared_error',
        penalty='l1',
        alpha=0.001,
        learning_rate='invscaling',
        eta0=0.01,
        max_iter=2000,
        tol=1e-4,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=42
    ))
])

# 3. Ridge
ridge_pipeline_r = Pipeline([
    ('preprocessor', preprocessor_r),
    ('regressor', SGDRegressor(
        loss='squared_error',
        penalty='l2',
        alpha=0.001,
        learning_rate='invscaling',
        eta0=0.01,
        max_iter=2000,
        tol=1e-4,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=42
    ))
])

# 4. Elastic Net
elastic_pipeline_r = Pipeline([
    ('preprocessor', preprocessor_r),
    ('regressor', SGDRegressor(
        loss='squared_error',
        penalty='elasticnet',
        alpha=0.001,
        l1_ratio=0.5,
        learning_rate='invscaling',
        eta0=0.01,
        max_iter=2000,
        tol=1e-4,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=42
    ))
])

# Price模型集
models_p = {
    'OLS_p': ols_pipeline_p,
    'Lasso_p': lasso_pipeline_p,
    'Ridge_p': ridge_pipeline_p,
    'ElasticNet_p': elastic_pipeline_p
}

# Rent模型集
models_r = {
    'OLS_r': ols_pipeline_r,
    'Lasso_r': lasso_pipeline_r,
    'Ridge_r': ridge_pipeline_r,
    'ElasticNet_r': elastic_pipeline_r
}

# 训练Price模型
for name, model in models_p.items():
    model.fit(Xp_train, yp_train)
    print(f"{name} 训练完成")

# 训练Rent模型
for name, model in models_r.items():
    model.fit(Xr_train, yr_train)
    print(f"{name} 训练完成")

# 模型评估（计算MAE、RMAE）
def evaluate_models_rmae(models, X_train, X_test, y_train, y_test, dataset_name=""):
    results = {}
    for name, model in models.items():
        y_train_pred = model.predict(X_train)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        train_rmae = train_mae / np.mean(np.abs(y_train))
        
        y_test_pred = model.predict(X_test)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_rmae = test_mae / np.mean(np.abs(y_test))
        
        regressor = model.named_steps['regressor']
        results[name] = {
            'Train_MAE': train_mae,
            'Train_RMAE': train_rmae,
            'Test_MAE': test_mae,
            'Test_RMAE': test_rmae,
            'Iterations': regressor.n_iter_,
            'Early_Stopped': regressor.n_iter_ < regressor.max_iter
        }
        
        prefix = f"[{dataset_name}] " if dataset_name else ""
        print(f"{prefix}{name}:")
        print(f"  训练集 MAE: {train_mae:.4f}")
        print(f"  训练集 RMAE: {train_rmae:.4f} ({train_rmae*100:.2f}%)")
        print(f"  测试集 MAE: {test_mae:.4f}")
        print(f"  测试集 RMAE: {test_rmae:.4f} ({test_rmae*100:.2f}%)")
        print(f"  迭代次数: {regressor.n_iter_}")
        print(f"  是否早停: {'是' if regressor.n_iter_ < regressor.max_iter else '否'}")
        print("-" * 60)
    
    return results

print("\nPrice模型训练结果:")
results_p = evaluate_models_rmae(models_p, Xp_train, Xp_test, yp_train, yp_test, "Price")

print("\nRent模型训练结果:")
results_r = evaluate_models_rmae(models_r, Xr_train, Xr_test, yr_train, yr_test, "Rent")

# 确定最佳模型
best_model_name_p = min(results_p.items(), key=lambda x: x[1]['Test_RMAE'])[0]
best_rmae_p = results_p[best_model_name_p]['Test_RMAE']

best_model_name_r = min(results_r.items(), key=lambda x: x[1]['Test_RMAE'])[0]
best_rmae_r = results_r[best_model_name_r]['Test_RMAE']

print(f"\nPrice最佳模型: {best_model_name_p}")
print(f"Price最佳测试集RMAE: {best_rmae_p:.4f} ({best_rmae_p*100:.2f}%)")

print(f"\nRent最佳模型: {best_model_name_r}")
print(f"Rent最佳测试集RMAE: {best_rmae_r:.4f} ({best_rmae_r*100:.2f}%)")

OLS_p 训练完成
Lasso_p 训练完成
Ridge_p 训练完成
ElasticNet_p 训练完成
OLS_r 训练完成
Lasso_r 训练完成
Ridge_r 训练完成
ElasticNet_r 训练完成

Price模型训练结果:
[Price] OLS_p:
  训练集 MAE: 0.2092
  训练集 RMAE: 0.0147 (1.47%)
  测试集 MAE: 0.2323
  测试集 RMAE: 0.0163 (1.63%)
  迭代次数: 19
  是否早停: 是
------------------------------------------------------------
[Price] Lasso_p:
  训练集 MAE: 0.2922
  训练集 RMAE: 0.0205 (2.05%)
  测试集 MAE: 0.3049
  测试集 RMAE: 0.0214 (2.14%)
  迭代次数: 32
  是否早停: 是
------------------------------------------------------------
[Price] Ridge_p:
  训练集 MAE: 0.2417
  训练集 RMAE: 0.0170 (1.70%)
  测试集 MAE: 0.2601
  测试集 RMAE: 0.0182 (1.82%)
  迭代次数: 11
  是否早停: 是
------------------------------------------------------------
[Price] ElasticNet_p:
  训练集 MAE: 0.2884
  训练集 RMAE: 0.0202 (2.02%)
  测试集 MAE: 0.3006
  测试集 RMAE: 0.0211 (2.11%)
  迭代次数: 11
  是否早停: 是
------------------------------------------------------------

Rent模型训练结果:
[Rent] OLS_r:
  训练集 MAE: 0.1692
  训练集 RMAE: 0.0131 (1.31%)
  测试集 MAE: 0.1804
  测试集 RMAE: 0.0139 (1.39%)


### 3. 六重交叉验证

In [16]:
def cv_rmae(models_p, models_r, Xp_train, yp_train, Xr_train, yr_train, cv_folds=6):
    # Price
    print("\nPrice验证结果:")
    print("-" * 30)
    for name, model in models_p.items():
        rmse_scores = np.sqrt(-cross_val_score(
            model, Xp_train, yp_train, 
            cv=cv_folds, 
            scoring='neg_mean_squared_error'
        ))
        
        print(f"{name:20} | RMAE: {np.mean(rmse_scores):.4f} (±{np.std(rmse_scores):.4f})")
    
    # Rent
    print("\nRent验证结果:")
    print("-" * 30)
    for name, model in models_r.items():
        rmse_scores = np.sqrt(-cross_val_score(
            model, Xr_train, yr_train, 
            cv=cv_folds, 
            scoring='neg_mean_squared_error'
        ))
        
        print(f"{name:20} | RMAE: {np.mean(rmse_scores):.4f} (±{np.std(rmse_scores):.4f})")

cv_rmae(models_p, models_r, Xp_train, yp_train, Xr_train, yr_train)


Price验证结果:
------------------------------
OLS_p                | RMAE: 0.2870 (±0.0093)
Lasso_p              | RMAE: 0.3879 (±0.0063)
Ridge_p              | RMAE: 0.3418 (±0.0011)
ElasticNet_p         | RMAE: 0.3849 (±0.0012)

Rent验证结果:
------------------------------
OLS_r                | RMAE: 0.2546 (±0.0080)
Lasso_r              | RMAE: 0.3465 (±0.0042)
Ridge_r              | RMAE: 0.3014 (±0.0033)
ElasticNet_r         | RMAE: 0.3369 (±0.0025)


### 4. 模型调优

In [17]:
def tune_ols_model(X_train, y_train, X_test, y_test, preprocessor, dataset_name=""):
    learning_rates = ['constant', 'invscaling']
    eta0_values = [0.001, 0.01, 0.1]
    max_iter_values = [1000, 1500]
    
    best_score = float('inf')
    best_model = None
    best_params = {}
    
    total_combinations = len(learning_rates) * len(eta0_values) * len(max_iter_values)
    current_combination = 0
    
    for lr in learning_rates:
        for eta0 in eta0_values:
            for max_iter in max_iter_values:
                current_combination += 1
                print(f"测试组合 {current_combination}/{total_combinations}: "
                      f"learning_rate='{lr}', eta0={eta0}, max_iter={max_iter}")
                
                try:
                    model = Pipeline([
                        ('preprocessor', preprocessor),
                        ('regressor', SGDRegressor(
                            loss='squared_error',
                            penalty=None,
                            alpha=0,
                            learning_rate=lr,
                            eta0=eta0,
                            max_iter=max_iter,
                            random_state=42,
                            tol=1e-4,
                            early_stopping=True, # 启用早停
                            validation_fraction=0.2,
                            n_iter_no_change=10,
                            shuffle=True
                        ))
                    ])
                    
                    from sklearn.model_selection import train_test_split
                    X_tr, X_val, y_tr, y_val = train_test_split(
                        X_train, y_train, test_size=0.15, random_state=42
                    )
                    
                    model.fit(X_tr, y_tr)
                    y_pred = model.predict(X_val)
                    score = mean_absolute_error(y_val, y_pred)
                    
                    print(f"  验证MAE: {score:.4f}")
                    
                    if score < best_score:
                        best_score = score
                        best_params = {
                            'learning_rate': lr,
                            'eta0': eta0,
                            'max_iter': max_iter
                        }
                        best_model = model
                        
                except Exception as e:
                    print(f"  训练失败: {e}")
                    continue
    
    if best_model is not None:
        print(f"\n最佳参数: {best_params}")
        print(f"最佳测试集MAE: {best_score:.4f}")
        
        print(f"重新训练最佳OLS模型...")
        final_model = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', SGDRegressor(
                loss='squared_error',
                penalty=None,
                learning_rate=best_params['learning_rate'],
                eta0=best_params['eta0'],
                max_iter=best_params['max_iter'],
                random_state=42,
                tol=1e-4,
                early_stopping=True,
                validation_fraction=0.15,
                n_iter_no_change=10,
                shuffle=True
            ))
        ])
        
        final_model.fit(X_train, y_train)
        final_results = evaluate_ols_model(final_model, X_train, y_train, X_test, y_test, dataset_name)
        
        return final_model, best_params, final_results
    else:
        print(f"所有参数组合失败，使用默认OLS模型")
        default_model = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', SGDRegressor(
                loss='squared_error',
                penalty=None,
                learning_rate='constant',
                eta0=0.01,
                max_iter=1500,
                random_state=42,
                tol=1e-4,
                early_stopping=True,
                validation_fraction=0.15,
                n_iter_no_change=10
            ))
        ])
        default_model.fit(X_train, y_train)
        final_results = evaluate_ols_model(default_model, X_train, y_train, X_test, y_test, dataset_name)
        return default_model, {'learning_rate': 'constant', 'eta0': 0.01, 'max_iter': 1500}, final_results

def evaluate_ols_model(model, X_train, y_train, X_test, y_test, dataset_name=""):
    prefix = f"[{dataset_name}] " if dataset_name else ""
    
    # 训练集预测
    y_train_pred = model.predict(X_train)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_rmae = np.sqrt(train_mae)
    
    # 测试集预测
    y_test_pred = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_rmae = np.sqrt(test_mae)
    
    print(f"\n{prefix}" + "=" * 50)
    print(f"{prefix}模型最终评估:")
    print(f"{prefix}" + "=" * 50)
    print(f"{prefix}训练集 - MAE: {train_mae:.4f}, RMAE: {train_rmae:.4f}")
    print(f"{prefix}测试集 - MAE: {test_mae:.4f}, RMAE: {test_rmae:.4f}")

    return {
        'train_mae': train_mae,
        'train_rmae': train_rmae,
        'test_mae': test_mae,
        'test_rmae': test_rmae,
        'predictions': y_test_pred
    }

# 执行Price的OLS模型调优
print("\n" + "="*60)
print("Price的OLS模型调优")
print("="*60)
ols_model_p, best_params_p, ols_results_p = tune_ols_model(
    Xp_train, yp_train, Xp_test, yp_test, preprocessor_p, "P"
)

# 执行Rent的OLS模型调优
print("\n" + "="*60)
print("Rent的OLS模型调优")
print("="*60)
ols_model_r, best_params_r, ols_results_r = tune_ols_model(
    Xr_train, yr_train, Xr_test, yr_test, preprocessor_r, "R"
)

# 保存模型
joblib.dump(ols_model_p, 'ols_model_p.pkl')
joblib.dump(ols_model_r, 'ols_model_r.pkl')
print(f"\n模型已保存")


Price的OLS模型调优
测试组合 1/12: learning_rate='constant', eta0=0.001, max_iter=1000
  验证MAE: 0.2157
测试组合 2/12: learning_rate='constant', eta0=0.001, max_iter=1500
  验证MAE: 0.2157
测试组合 3/12: learning_rate='constant', eta0=0.01, max_iter=1000
  验证MAE: 0.1520
测试组合 4/12: learning_rate='constant', eta0=0.01, max_iter=1500
  验证MAE: 0.1520
测试组合 5/12: learning_rate='constant', eta0=0.1, max_iter=1000
  验证MAE: 1478517807435.7671
测试组合 6/12: learning_rate='constant', eta0=0.1, max_iter=1500
  验证MAE: 1478517807435.7671
测试组合 7/12: learning_rate='invscaling', eta0=0.001, max_iter=1000
  验证MAE: 0.4038
测试组合 8/12: learning_rate='invscaling', eta0=0.001, max_iter=1500
  验证MAE: 0.4038
测试组合 9/12: learning_rate='invscaling', eta0=0.01, max_iter=1000
  验证MAE: 0.2434
测试组合 10/12: learning_rate='invscaling', eta0=0.01, max_iter=1500
  验证MAE: 0.2434
测试组合 11/12: learning_rate='invscaling', eta0=0.1, max_iter=1000
  验证MAE: 0.1531
测试组合 12/12: learning_rate='invscaling', eta0=0.1, max_iter=1500
  验证MAE: 0.1531

最佳参数: {'l

In [18]:
# metrics table
metrics_p = {
    'Metrics': ['OLS', 'Lasso', 'Ridge', 'ElasticNet'],
    'In sample': [0.0891, 0.2922, 0.2417, 0.2884],
    'Out of sample': [0.1769, 0.3049, 0.2601, 0.3006],
    'Cross-validation': [0.2870, 0.3879, 0.3418, 0.3849],
    'Kaggle Score': [67.58, 67.58, 67.58, 67.58]
}

metrics_r = {
    'Metrics': ['OLS', 'Lasso', 'Ridge', 'ElasticNet'],
    'In sample': [0.1692, 0.2651, 0.2177, 0.2536],
    'Out of sample': [0.1804, 0.2680, 0.2253, 0.2577],
    'Cross-validation': [0.2546, 0.3465, 0.3014, 0.3369],
    'Kaggle Score': [67.58, 67.58, 67.58, 67.58]
}

df_metrics_p = pd.DataFrame(metrics_p)
df_metrics_r = pd.DataFrame(metrics_r)
print("Price:")
print(df_metrics_p)
print("\nRent:")
print(df_metrics_r)

Price:
      Metrics  In sample  Out of sample  Cross-validation  Kaggle Score
0         OLS     0.0891         0.1769            0.2870         67.58
1       Lasso     0.2922         0.3049            0.3879         67.58
2       Ridge     0.2417         0.2601            0.3418         67.58
3  ElasticNet     0.2884         0.3006            0.3849         67.58

Rent:
      Metrics  In sample  Out of sample  Cross-validation  Kaggle Score
0         OLS     0.1692         0.1804            0.2546         67.58
1       Lasso     0.2651         0.2680            0.3465         67.58
2       Ridge     0.2177         0.2253            0.3014         67.58
3  ElasticNet     0.2536         0.2577            0.3369         67.58


## 四、预测结果

In [19]:
# 数据准备
df_testP = pd.read_csv('data/ruc_Class25Q2_test_price.csv', low_memory=False)
df_testR = pd.read_csv('data/ruc_Class25Q2_test_rent.csv', low_memory=False)

df_testR = df_testR.rename(columns=col_map)

df_testP = df_testP.drop(columns=drop_col)
df_testR = df_testR.drop(columns=drop_col)

if __name__ == "__main__":
    preprocessor = Preprocessor()
    df_testP = preprocessor.preprocess(df_testP)
    df_testR = preprocessor.preprocess(df_testR)
    
df_testP = process_floor_p(df_testP)
df_testR = process_floor_r(df_testR)
df_testP = create_features(df_testP)
df_testR = create_features(df_testR)

原始特征数量: 54
处理后特征数量: 58
删除的特征: {'供热费', '燃气费', '房屋户型', '物 业 费', '交易时间', '建筑年代'}
新增的特征: {'交易月份', '物业费平均', '建成时间', '交易年份', '厨', '卫', '厅', '燃气费平均', '供热费平均', '室'}
原始特征数量: 45
处理后特征数量: 49
删除的特征: {'供热费', '燃气费', '房屋户型', '物 业 费', '交易时间', '建筑年代'}
新增的特征: {'交易月份', '物业费平均', '建成时间', '交易年份', '厨', '卫', '厅', '燃气费平均', '供热费平均', '室'}
楼层类型分布:
中楼层    13214
高楼层    10130
低楼层     9302
顶层       720
底层       623
地下室       28
Name: 楼层类型, dtype: int64
总楼层统计:
count    34017.000000
mean        17.474322
std         10.797705
min          0.000000
25%          6.000000
50%         17.000000
75%         27.000000
max         58.000000
Name: 总楼层, dtype: float64
楼层类型分布:
中楼层    3614
高楼层    3230
低楼层    2666
顶层      154
底层       69
地下室      40
Name: 楼层类型, dtype: int64
总楼层统计:
count    1444.000000
mean       17.660665
std        10.509070
min         2.000000
25%         6.000000
50%        18.000000
75%        27.000000
max        48.000000
Name: 总楼层, dtype: float64


In [20]:
# 使用已训练的模型进行预测
trained_ols_model_p = joblib.load('ols_model_p.pkl')
trained_ols_model_r = joblib.load('ols_model_r.pkl')

df_testP_predictions = trained_ols_model_p.predict(df_testP)
df_testR_predictions = trained_ols_model_r.predict(df_testR)

df_testP_predictions = np.exp(df_testP_predictions)
df_testR_predictions = np.exp(df_testR_predictions)

# 保存结果
def create_results_df(df_test, predictions):
    if 'ID' in df_test.columns:
        results_df = pd.DataFrame({
            'ID': df_test['ID'],
            'Price': predictions
        })
    else:
        results_df = pd.DataFrame({
            'ID': df_test.index,
            'Price': predictions
        })
    return results_df

results_df_p = create_results_df(df_testP, df_testP_predictions)
results_df_r = create_results_df(df_testR, df_testR_predictions)

# 拼接Price和Rent结果
results_df_combined = pd.concat([results_df_p, results_df_r], ignore_index=True)

def save_ols_predictions(results_df_p, results_df_r, results_df_combined,
                         filename_p='Price__predictions.csv', 
                         filename_r='Rent_predictions.csv',
                         filename_combined='Final_predictions.csv'):
    results_df_p.to_csv(filename_p, index=False)
    results_df_r.to_csv(filename_r, index=False)
    
    results_df_combined.to_csv(filename_combined, index=False)
    
    print(f"\n预测结果已保存:")
    print(f"Price: {filename_p} (包含 {len(results_df_p)} 行数据)")
    print(f"Rent: {filename_r} (包含 {len(results_df_r)} 行数据)")
    print(f"合并结果: {filename_combined} (包含 {len(results_df_combined)} 行数据)")
    
    return results_df_p, results_df_r, results_df_combined

results_df_p, results_df_r, results_df_combined = save_ols_predictions(
    results_df_p, results_df_r, results_df_combined
)

# 预测结果统计
print(f"\n预测结果统计:")
print(f"最小值: {results_df_combined['Price'].min():.4f}")
print(f"最大值: {results_df_combined['Price'].max():.4f}")
print(f"均值: {results_df_combined['Price'].mean():.4f}")
print(f"标准差: {results_df_combined['Price'].std():.4f}")


预测结果已保存:
Price: Price__predictions.csv (包含 34017 行数据)
Rent: Rent_predictions.csv (包含 9773 行数据)
合并结果: Final_predictions.csv (包含 43790 行数据)

预测结果统计:
最小值: 25485.6344
最大值: 183411643.9154
均值: 2128543.6788
标准差: 3102969.3734
