In [6]:
import pandas as pd
import numpy as np
import re

In [7]:
df_raw = pd.read_csv('ruc_Class25Q2_train_price.csv',low_memory=False)
df_raw

Unnamed: 0,城市,区域,板块,环线,Price,房屋户型,所在楼层,建筑面积,套内面积,房屋朝向,...,供水,供暖,供电,燃气费,供热费,停车位,停车费用,coord_x,coord_y,客户反馈
0,0,109.0,150.0,二至三环,6.194049e+06,2室1厅1厨1卫,中楼层 (共5层),52.3㎡,,南 北,...,民水,集中供暖,民电,2.61元/m³,30元/㎡,300.0,暂无,117.424278,40.975752,听说，设施老旧，停车费高
1,0,65.0,299.0,五至六环,4.354153e+06,3室1厅1厨1卫,顶层 (共6层),127.44㎡,123.7㎡,南 北,...,商水/民水,自采暖,商电/民电,2.61元/m³,,1550.0,150,117.389228,41.091295,整体印象，网速快，面积适中
2,0,62.0,911.0,五至六环,3.321992e+06,3室2厅1厨2卫,低楼层 (共6层),118.02㎡,101.95㎡,东南,...,商水/民水,集中供暖/自采暖,商电/民电,2.61元/m³,30元/㎡,324.0,150,117.200934,40.747919,地段一般，停车划线清晰，说白了，居住体验佳
3,0,123.0,1102.0,六环外,7.895656e+06,6室3厅1厨3卫,底层 (共2层),293.23㎡,293.23㎡,东 南 西 北,...,民水,自采暖,民电,2.61-2.63元/m³,,500.0,暂无,117.767308,41.228803,暖气好，平均水准，厨房设备新，不过话说，气味中性
4,0,81.0,295.0,三至四环,1.902960e+06,1房间1卫,中楼层 (共10层),39.85㎡,29.94㎡,南,...,商水/民水,集中供暖/自采暖,商电/民电,2.61-2.63元/m³,30-45元/㎡,1800.0,1200,117.334530,40.952530,可以说，通风一般，空气清新，换个角度看，气味刺鼻
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103866,11,84.0,716.0,,7.903242e+05,2室2厅1厨1卫,中楼层 (共6层),93.6㎡,,南 北,...,民水,集中供暖/自采暖,民电,2.68元/m³,18元/㎡,420.0,,117.062762,40.513568,说句实话，家具完好，严格意义上讲，空气流通好
103867,11,84.0,716.0,,1.113952e+06,3室1厅1厨1卫,低楼层 (共9层),132㎡,,南,...,民水,自采暖,民电,2.68元/m³,,,300,117.075303,40.500796,窗户大小适中，某种意义上，陷阱条款
103868,11,84.0,716.0,,7.432028e+05,3室1厅1厨1卫,中楼层 (共6层),92㎡,,南 北,...,民水,集中供暖,民电,2.68元/m³,18元/㎡,,,117.029481,40.498825,空调噪音大，总体看，公共照明到位
103869,11,84.0,716.0,,1.290376e+06,3室1厅1厨1卫,中楼层 (共17层),123㎡,,南 北,...,民水,自采暖,民电,2.68元/m³,,,300,117.075373,40.500866,个人觉得，房屋保养好，总体上说，通勤时间中等，阳台通风佳


In [60]:
df_processed = df_raw.copy()
def process_dataframe(df_in):
    """
    接收原始DataFrame，返回一个经过全面清洗、解析和特征工程的新DataFrame。
    """
    df = df_in.copy()

    # --- 1. 目标变量 (价格) ---
    print("处理 [价格]...")
    # 对价格进行对数变换，使其分布更平滑，有利于线性模型
    df['Price_log'] = np.log1p(df['Price'])

    # --- 2. 日期和年份相关特征 ---
    print("处理日期和年份...")
    # 从'交易时间'提取年份和月份
    df['交易时间'] = pd.to_datetime(df['交易时间'], errors='coerce')
    df['交易年份'] = df['交易时间'].dt.year
    
    # 定义一个健壮的函数来解析'建筑年代'列
    def parse_build_year(year_str):
        year_str = str(year_str) # 强制转换为字符串
        years = re.findall(r'\d{4}', year_str)
        if years:
            # 如果是年份范围 (如 '1995-2000')，取平均值
            return np.mean([int(y) for y in years])
        return np.nan # 无法解析则返回空值

    df['建筑年份'] = df['建筑年代'].apply(parse_build_year)
    
    # 创建核心特征 '房龄'
    df['房龄'] = df['交易年份'] - df['建筑年份']

    # --- 3. 数值和复合文本的解析 ---
    print("解析面积、户型、楼层等复合信息...")
    # 面积 (清洗单位'm²'并转为数值)
    df['建筑面积'] = pd.to_numeric(df['建筑面积'].astype(str).str.replace(r'[^\d.]', '', regex=True), errors='coerce')
    df['套内面积'] = pd.to_numeric(df['套内面积'].astype(str).str.replace(r'[^\d.]', '', regex=True), errors='coerce')
    df['套内面积'] = df['套内面积'].fillna(df['建筑面积'])

    ring_road_mapping = {
        '一至二环': 1,
        '二至三环': 2,
        '三至四环': 3,
        '四至五环': 4,
        '五至六环': 5,
        '六环外': 6,
        # 您可以根据需要添加更多映射
        # 例如，如果数据中有'一环内'
        '一环内': 0 
    }
    
    # 2. 应用映射，创建一个新的数值列 '环线编码'
    # .map() 方法会根据字典进行替换。不在字典中的值会变成NaN。
    df['环线编码'] = df['环线'].map(ring_road_mapping)
    
    # 3. 处理映射后可能出现的NaN (即原始数据中存在字典里没有的环线值)
    # 我们可以用一个不显眼的、中性的值（比如中位数）来填充这些NaN
    ring_road_median = df['环线编码'].median()
    df['环线编码'] = df['环线编码'].fillna(ring_road_median)

    # 房屋户型 (提取室、厅、卫)
    df['卧室数'] = df['房屋户型'].astype(str).str.extract(r'(\d+)[室房]').fillna(0).astype(int)
    df['客厅数'] = df['房屋户型'].astype(str).str.extract(r'(\d+)厅').fillna(0).astype(int)
    df['卫生间数'] = df['房屋户型'].astype(str).str.extract(r'(\d+)卫').fillna(0).astype(int)

    # 我们用 (\d+|[一二三四五六七八九十]+) 来匹配阿拉伯数字或中文数字
    temp_df = df['梯户比例'].astype(str).str.extract(r'(\d+|[一二三四五六七八九十]+)梯(\d+|[一二三四五六七八九十]+)户')
    temp_df.columns = ['梯', '户'] # 重命名提取出的列

    # 1. 定义一个强大、正确的中文数字转换函数
    def chinese_to_arabic(cn_str):
        cn_str = str(cn_str)
        cn_num_map = {'零': 0, '一': 1, '二': 2,'两':2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
        cn_unit_map = {'十': 10, '百': 100, '千': 1000, '万': 10000}
        
        # 直接尝试转为整数，如果成功说明是阿拉伯数字，直接返回
        try:
            return int(cn_str)
        except ValueError:
            pass # 如果失败，说明是中文数字，继续处理

        # 处理中文数字
        if not cn_str:
            return np.nan
            
        # 特殊处理 "十" 开头的情况，例如 "十一"
        if cn_str.startswith('十'):
            cn_str = '一' + cn_str

        res = 0
        sec_res = 0
        unit = 1
        
        for char in reversed(cn_str):
            if char in cn_unit_map:
                unit = cn_unit_map[char]
                if unit > sec_res: # 处理 "万" 等大单位
                    res += sec_res
                    sec_res = 0
            elif char in cn_num_map:
                sec_res += cn_num_map[char] * unit
        
        res += sec_res
        return res

    # 2. 定义一个包装函数来提取和计算比例
    def get_ratio(text):
        text = str(text)
        # 匹配中文或阿拉伯数字
        parts = re.findall(r'(\d+|[一二三四五六七八九十百]+)', text)
        if len(parts) == 2:
            try:
                num_elevators = chinese_to_arabic(parts[0])
                num_households = chinese_to_arabic(parts[1])
                if num_elevators > 0:
                    return num_households / num_elevators
            except:
                return np.nan
        return np.nan

    # 3. 应用函数，创建新特征'户梯比'
    df['户梯比'] = df['梯户比例'].apply(get_ratio)
    
    # 4. 根据您的要求，将空值填充为1
    df['户梯比'] = df['户梯比'].fillna(1)

    # 所在楼层 (提取总楼层和楼层类型)
    df['总楼层'] = pd.to_numeric(df['所在楼层'].astype(str).str.extract(r'共(\d+)层').squeeze(), errors='coerce')
    df['楼层类型'] = df['所在楼层'].astype(str).str.extract(r'^(\S{1,3})(?:楼层|层)').fillna('未知').squeeze()

    # 小区信息 (清洗单位'%')
    df['绿 化 率'] = pd.to_numeric(df['绿 化 率'].astype(str).str.replace('%', ''), errors='coerce') / 100
    df['容 积 率'] = pd.to_numeric(df['容 积 率'], errors='coerce')
    df['社区绿化密度'] = df['容 积 率'] / (df['绿 化 率'] + 0.01)

    # --- 4. 类别和布尔（是/否）特征转换 ---
    print("转换类别和布尔特征...")
    # 将'有'/'无'转换为1/0
    df['配备电梯'] = df['配备电梯'].apply(lambda x: 1 if str(x) == '有' else 0)

    # 从可能为空的列创建二进制特征 (1表示有信息, 0表示无)
    df['是否别墅'] = df['别墅类型'].notna().astype(int)
    df['有抵押信息'] = df['抵押信息'].notna().astype(int)

    # 将数值型ID (如区域、板块) 转换为字符串，以确保它们被当作类别处理
    df['区域'] = df['区域'].astype(str)
    df = pd.get_dummies(df, columns=['区域'], drop_first=True)  
    
    # --- 5. 创建衍生特征 ---
    print("创建衍生特征 (得房率等)...")
    # 得房率
    df['得房率'] = df['套内面积'] / df['建筑面积'].replace(0, np.nan) # 避免除以0


    def extract_and_average_numeric(series):
        # 强制转换为字符串
        s = series.astype(str)
        
        # 使用正则表达式查找所有数字（包括整数和小数）
        # findall 会为每个字符串返回一个数字列表，例如 '30-45元' -> ['30', '45']
        numeric_parts = s.str.findall(r'(\d+\.?\d*)')
        
        # 定义一个函数来处理提取出的列表
        def get_average(num_list):
            # 如果列表为空 (例如原始值是'暂无'或空)，返回NaN
            if not num_list:
                return np.nan
            
            # 将列表中的所有字符串数字转为浮点数
            float_list = [float(n) for n in num_list]
            
            # 计算平均值
            return np.mean(float_list)
            
        # 将 get_average 函数应用到每个数字列表上
        return numeric_parts.apply(get_average)

    # 2. 应用该函数到需要处理的列
    df['燃气费'] = extract_and_average_numeric(df['燃气费'])
    df['供热费'] = extract_and_average_numeric(df['供热费'])
    df['停车位'] = extract_and_average_numeric(df['停车位'])
    df['停车费用'] = extract_and_average_numeric(df['停车费用'])
    df['物业费'] = extract_and_average_numeric(df['物 业 费'])

    df['房屋总数'] = df['房屋总数'].astype(str).str.replace(r'[^\d]', '', regex=True)
    df['楼栋总数'] = df['楼栋总数'].astype(str).str.replace(r'[^\d]', '', regex=True)

    # 3. 使用 pd.to_numeric() 将清理后的字符串列安全地转换为整数类型
    # errors='coerce' 会将无法转换的空值等变为NaN，然后我们用.fillna(0)填充
    df['房屋总数'] = pd.to_numeric(df['房屋总数'], errors='coerce').fillna(0).astype(int)
    df['楼栋总数'] = pd.to_numeric(df['楼栋总数'], errors='coerce').fillna(0).astype(int)
    df['楼栋总数'] = pd.to_numeric(df['楼栋总数'], errors='coerce').fillna(0).astype(int)

     
    df['房屋年限'] = df['房屋年限'].fillna('未知')
    df['房屋年限'] = df['房屋年限'].astype(str)
    df['房屋总数'] = df['房屋总数'].replace(0, 1)
    df['停车位'] = df['停车位'].replace(0, 1)
  
    # 每户平均车位数
    df['每户停车位'] = df['停车位'] / df['房屋总数']
    
    # 1. 合并文本列
    df['description_combined'] = df['核心卖点'].fillna('') + df['户型介绍'].fillna('') + df['周边配套'].fillna('')

    # 2. 定义关键词库
    objective_keywords = [
       '户型方正', '人车分流', '学区',
       '地铁', '医院', '商场', '超市', '公园', '菜市场'
        ]

    # 3. 循环创建特征列
    for keyword in objective_keywords:
        df[f'Desc_{keyword}'] = df['description_combined'].str.contains(keyword, na=False).astype(int)

    # 1. 定义正负面词典
    positive_keywords = ['体验佳', '干净', '安静', '方便', '采光好', '物业好', '安全']
    negative_keywords = ['老旧', '费高', '噪音', '通风差', '潮湿', '漏水', '老化', '乱']

    # 2. 计算分数
    df['积极反馈'] = df['客户反馈'].fillna('').apply(lambda x: sum(1 for word in positive_keywords if word in x))
    df['消极反馈'] = df['客户反馈'].fillna('').apply(lambda x: sum(1 for word in negative_keywords if word in x))

    # （可选）创建一个综合得分
    df['综合反馈'] = df['积极反馈'] - df['消极反馈']
    
    # --- 7. 清理不再需要的原始列 ---
    print("清理冗余的原始列...")
    # 定义所有已被处理、不再需要的列
    columns_to_drop = [
        '交易时间', '建筑年代', '房屋户型', '所在楼层', '环线位置','交易权属','板块','板块_comm',
        '别墅类型', '抵押信息', '房屋优势', '核心卖点', '户型介绍','房屋用途','区县','城市',
        '周边配套', '交通出行', '房屋朝向', '小区名称', '物业类别','建筑结构_comm',
        '建筑结构', '装修情况', '上次交易', '梯户比例','环线' ,'开发商','物业公司',
        '物业办公电话','物 业 费','客户反馈','积极反馈','消极反馈','description_combined',
        '交易年份','建筑年份','产权描述'# 这些信息已经合并到 full_description 中
    ]
    # 过滤掉可能不存在的列名，以避免出错
    existing_cols_to_drop = [col for col in columns_to_drop if col in df.columns]
    df = df.drop(columns=existing_cols_to_drop)

    # --- 8. 统一处理缺失值 (NaN) ---
    print("使用中位数填充剩余的数值型缺失值...")
    # 找出所有的数值型列
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()


    # 对每一个数值型列，如果存在NaN，就用该列的中位数填充
    for col in numeric_cols:
        if df[col].isnull().any():
          median_value = df[col].median()
          # 【修正】使用赋值语句替代 inplace=True
          df[col] = df[col].fillna(median_value)

    return df

# =============================================================================
# 执行处理函数并检查结果
# =============================================================================
# 调用主函数，开始处理
df_processed = process_dataframe(df_processed)
df_processed

处理 [价格]...
处理日期和年份...
解析面积、户型、楼层等复合信息...
转换类别和布尔特征...
创建衍生特征 (得房率等)...
清理冗余的原始列...
使用中位数填充剩余的数值型缺失值...


Unnamed: 0,Price,建筑面积,套内面积,配备电梯,房屋年限,产权所属,lon,lat,年份,房屋总数,...,Desc_户型方正,Desc_人车分流,Desc_学区,Desc_地铁,Desc_医院,Desc_商场,Desc_超市,Desc_公园,Desc_菜市场,综合反馈
0,6.194049e+06,52.30,52.30,0,满五年,非共有,117.424278,40.975752,2018.0,1317,...,1,0,0,0,1,0,1,1,0,-2
1,4.354153e+06,127.44,123.70,0,满五年,非共有,117.389228,41.091295,2017.0,2317,...,1,0,0,0,1,0,0,1,0,0
2,3.321992e+06,118.02,101.95,1,满五年,非共有,117.200934,40.747919,2018.0,1554,...,1,0,0,0,0,0,0,0,0,1
3,7.895656e+06,293.23,293.23,0,满五年,非共有,117.767308,41.228803,2020.0,66,...,0,0,0,0,1,0,0,0,1,0
4,1.902960e+06,39.85,29.94,1,满五年,非共有,117.334530,40.952530,2017.0,1685,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103866,7.903242e+05,93.60,93.60,0,未知,非共有,117.062762,40.513568,2022.0,1745,...,0,0,0,0,0,0,0,0,0,0
103867,1.113952e+06,132.00,132.00,1,未知,非共有,117.075303,40.500796,2022.0,1285,...,0,0,0,0,0,0,0,0,0,0
103868,7.432028e+05,92.00,92.00,0,未知,非共有,117.029481,40.498825,2022.0,146,...,0,0,0,0,1,0,0,0,0,-1
103869,1.290376e+06,123.00,123.00,1,未知,共有,117.075373,40.500866,2022.0,1285,...,0,0,0,0,0,0,0,0,0,0


In [61]:
df_processed.to_csv('清洗后数据.csv', index=False)

In [62]:
import statsmodels.api as sm
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline

In [63]:
df_no_outliers = df_processed.copy()

cols_to_check_outliers = ['Price', '建筑面积'] 
# 确保这些列存在
cols_to_check_outliers = [col for col in cols_to_check_outliers if col in df_no_outliers.columns]

# b) 循环处理每一列的离群值
for col in cols_to_check_outliers:
    print(f"正在处理列 '{col}' 的离群值...")
    
    # 1. 计算第一四分位数 (Q1) 和第三四分位数 (Q3)
    Q1 = df_no_outliers[col].quantile(0.25)
    Q3 = df_no_outliers[col].quantile(0.75)
    
    # 2. 计算四分位距 (IQR)
    IQR = Q3 - Q1
    
    # 3. 定义离群值的边界
    # 通常使用 1.5 * IQR 作为标准，您可以根据需要调整这个系数
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # 记录处理前的行数
    rows_before = df_no_outliers.shape[0]
    
    # 4. 识别并过滤掉离群值
    df_no_outliers = df_no_outliers[
        (df_no_outliers[col] >= lower_bound) & 
        (df_no_outliers[col] <= upper_bound)
    ]
    
    # 记录处理后的行数
    rows_after = df_no_outliers.shape[0]
    
    print(f"列 '{col}': 边界为 [{lower_bound:,.2f}, {upper_bound:,.2f}]。删除了 {rows_before - rows_after} 个离群样本。")

print(f"\n离群值处理完成。原始数据集行数: {df_processed.shape[0]}, 处理后数据集行数: {df_no_outliers.shape[0]}")
df_no_outliers

正在处理列 'Price' 的离群值...
列 'Price': 边界为 [-1,793,407.31, 5,365,255.64]。删除了 7823 个离群样本。
正在处理列 '建筑面积' 的离群值...
列 '建筑面积': 边界为 [1.21, 182.62]。删除了 2806 个离群样本。

离群值处理完成。原始数据集行数: 103871, 处理后数据集行数: 93242


Unnamed: 0,Price,建筑面积,套内面积,配备电梯,房屋年限,产权所属,lon,lat,年份,房屋总数,...,Desc_户型方正,Desc_人车分流,Desc_学区,Desc_地铁,Desc_医院,Desc_商场,Desc_超市,Desc_公园,Desc_菜市场,综合反馈
1,4.354153e+06,127.44,123.70,0,满五年,非共有,117.389228,41.091295,2017.0,2317,...,1,0,0,0,1,0,0,1,0,0
2,3.321992e+06,118.02,101.95,1,满五年,非共有,117.200934,40.747919,2018.0,1554,...,1,0,0,0,0,0,0,0,0,1
4,1.902960e+06,39.85,29.94,1,满五年,非共有,117.334530,40.952530,2017.0,1685,...,0,0,0,1,0,0,0,0,0,0
6,3.631467e+06,134.10,108.16,1,满五年,非共有,117.578940,40.759199,2018.0,1100,...,0,0,0,0,0,0,1,0,0,0
7,2.098983e+05,12.74,12.74,0,满五年,非共有,117.578952,40.759211,2017.0,1100,...,0,0,0,0,0,0,1,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103866,7.903242e+05,93.60,93.60,0,未知,非共有,117.062762,40.513568,2022.0,1745,...,0,0,0,0,0,0,0,0,0,0
103867,1.113952e+06,132.00,132.00,1,未知,非共有,117.075303,40.500796,2022.0,1285,...,0,0,0,0,0,0,0,0,0,0
103868,7.432028e+05,92.00,92.00,0,未知,非共有,117.029481,40.498825,2022.0,146,...,0,0,0,0,1,0,0,0,0,-1
103869,1.290376e+06,123.00,123.00,1,未知,共有,117.075373,40.500866,2022.0,1285,...,0,0,0,0,0,0,0,0,0,0


In [64]:
poly_features_cols = ['建筑面积', '房龄']
# 检查这些列是否存在
poly_features_cols = [col for col in poly_features_cols if col in df_no_outliers.columns]


if poly_features_cols:
    # 1. 实例化 PolynomialFeatures
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    
    # 2. 对指定的列进行拟合和转换
    poly_transformed = poly.fit_transform(df_no_outliers[poly_features_cols])
    
    # 3. 获取新特征的名称
    poly_feature_names = poly.get_feature_names_out(poly_features_cols)
    
    # 4. 将转换后的数组转换为DataFrame
    df_poly = pd.DataFrame(poly_transformed, columns=poly_feature_names)
    
    # 5. 从原始数据中删除将被替换的旧列
    df_without_originals = df_no_outliers.drop(columns=poly_features_cols)
    
    # 6. 将处理过的数据与新生成的多项式特征合并
    # 确保索引一致
    df_final = pd.concat([df_without_originals.reset_index(drop=True), df_poly.reset_index(drop=True)], axis=1)
    
else:
    # 如果没有要处理的列，直接赋值
    df_final = df_no_outliers.copy()

df_final    

Unnamed: 0,Price,套内面积,配备电梯,房屋年限,产权所属,lon,lat,年份,房屋总数,楼栋总数,...,Desc_商场,Desc_超市,Desc_公园,Desc_菜市场,综合反馈,建筑面积,房龄,建筑面积^2,建筑面积 房龄,房龄^2
0,4.354153e+06,123.70,0,满五年,非共有,117.389228,41.091295,2017.0,2317,40,...,0,0,1,0,0,127.44,15.0,16240.9536,1911.600,225.00
1,3.321992e+06,101.95,1,满五年,非共有,117.200934,40.747919,2018.0,1554,20,...,0,0,0,0,1,118.02,8.5,13928.7204,1003.170,72.25
2,1.902960e+06,29.94,1,满五年,非共有,117.334530,40.952530,2017.0,1685,19,...,0,0,0,0,0,39.85,8.5,1588.0225,338.725,72.25
3,3.631467e+06,108.16,1,满五年,非共有,117.578940,40.759199,2018.0,1100,10,...,0,1,0,0,0,134.10,8.5,17982.8100,1139.850,72.25
4,2.098983e+05,12.74,0,满五年,非共有,117.578952,40.759211,2017.0,1100,10,...,0,1,0,0,-1,12.74,7.5,162.3076,95.550,56.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93237,7.903242e+05,93.60,0,未知,非共有,117.062762,40.513568,2022.0,1745,29,...,0,0,0,0,0,93.60,15.0,8760.9600,1404.000,225.00
93238,1.113952e+06,132.00,1,未知,非共有,117.075303,40.500796,2022.0,1285,12,...,0,0,0,0,0,132.00,13.5,17424.0000,1782.000,182.25
93239,7.432028e+05,92.00,0,未知,非共有,117.029481,40.498825,2022.0,146,6,...,0,0,0,0,-1,92.00,26.0,8464.0000,2392.000,676.00
93240,1.290376e+06,123.00,1,未知,共有,117.075373,40.500866,2022.0,1285,12,...,0,0,0,0,0,123.00,13.5,15129.0000,1660.500,182.25


In [65]:
categorical_cols = df_final.select_dtypes(include=['object']).columns

if not categorical_cols.empty:
    print(f"找到以下分类变量: {categorical_cols.tolist()}")
    # 使用 pd.get_dummies 进行独热编码
    # drop_first=True 用于避免多重共线性，这是一个好习惯
    df_encoded = pd.get_dummies(df_final, columns=categorical_cols, drop_first=True)
    print("独热编码完成。")
else:
    print("未找到字符串类型的列，跳过独热编码。")
    df_encoded = df_final.copy()

df_encoded

找到以下分类变量: ['房屋年限', '产权所属', '供水', '供暖', '供电', '楼层类型']
独热编码完成。


Unnamed: 0,Price,套内面积,配备电梯,lon,lat,年份,房屋总数,楼栋总数,绿 化 率,容 积 率,...,供暖_集中供暖,供暖_集中供暖/自采暖,供暖_集中供暖/自采暖/无供暖,供电_商电/民电,供电_民电,楼层类型_低楼,楼层类型_底,楼层类型_未知,楼层类型_顶,楼层类型_高楼
0,4.354153e+06,123.70,0,117.389228,41.091295,2017.0,2317,40,0.30,1.73,...,False,False,False,True,False,False,False,False,True,False
1,3.321992e+06,101.95,1,117.200934,40.747919,2018.0,1554,20,0.30,1.70,...,False,True,False,True,False,True,False,False,False,False
2,1.902960e+06,29.94,1,117.334530,40.952530,2017.0,1685,19,0.60,1.58,...,False,True,False,True,False,False,False,False,False,False
3,3.631467e+06,108.16,1,117.578940,40.759199,2018.0,1100,10,0.35,1.50,...,True,False,False,True,False,False,False,False,False,True
4,2.098983e+05,12.74,0,117.578952,40.759211,2017.0,1100,10,0.35,1.50,...,True,False,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93237,7.903242e+05,93.60,0,117.062762,40.513568,2022.0,1745,29,0.30,2.00,...,False,True,False,False,True,False,False,False,False,False
93238,1.113952e+06,132.00,1,117.075303,40.500796,2022.0,1285,12,0.30,2.50,...,False,False,False,False,True,True,False,False,False,False
93239,7.432028e+05,92.00,0,117.029481,40.498825,2022.0,146,6,0.25,1.20,...,True,False,False,False,True,False,False,False,False,False
93240,1.290376e+06,123.00,1,117.075373,40.500866,2022.0,1285,12,0.30,2.50,...,False,False,False,False,True,False,False,False,False,False


In [66]:
df_encoded.to_csv('处理后数据.csv', index=False)

In [67]:
y_full = df_encoded['Price_log']
X_full = df_encoded.drop(['Price', 'Price_log'], axis=1)
    
# 作为一个安全检查，再次确认所有列都为数值类型
X_full = X_full.select_dtypes(include=[np.number, bool])
print(f"最终用于建模的特征数量: {X_full.shape[1]}")

# --- 新增步骤 3a: 移除零方差特征 ---
print("步骤3a: 移除方差为零的特征...")
# 计算所有列的方差
variances = X_full.var()
# 找出方差为0的列
zero_variance_cols = variances[variances == 0].index

if not zero_variance_cols.empty:
    print(f"发现并移除以下 {len(zero_variance_cols)} 个零方差列: {zero_variance_cols.tolist()}")
    # 从 X_full 中删除这些列
    X_full = X_full.drop(columns=zero_variance_cols)
else:
    print("未发现方差为零的列。")



最终用于建模的特征数量: 176
步骤3a: 移除方差为零的特征...
发现并移除以下 1 个零方差列: ['有抵押信息']


In [68]:
ols_stabilized_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=1e-8)) 
])

# 对于Ridge和Lasso，我们可以直接使用内置的交叉验证版本
# 注意：Scikit-learn的RidgeCV/LassoCV在内部处理数据时不会像Pipeline那样对每次折叠都重新标准化，
# 但对于最终alpha的选择影响不大。为保持流程一致和最严格，我们也可以将它们放入Pipeline。
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RidgeCV(alphas=np.logspace(-3, 3, 100)))
])

lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LassoCV(cv=5, max_iter=10000, random_state=42))
])

models = {'OLS':ols_stabilized_pipeline, 'Lasso': lasso_pipeline, 'Ridge': ridge_pipeline}

# --- 3. 评估流程 ---
print("\n步骤3: 开始进行模型评估...")
results_list = []

for name, model_pipeline in models.items():
    print(f"\n--- 正在评估模型: {name} ---")
    
    # a) 样本内 (In-sample) 性能
    # 我们需要在整个数据集上训练一次来获取样本内性能
    # 注意：这里我们手动进行标准化，以模拟Pipeline在fit时的操作
    scaler = StandardScaler()
    X_full_scaled = scaler.fit_transform(X_full)
    
    # 提取pipeline中的模型部分进行训练
    # 对于OLS，就是LinearRegression()
    # 对于RidgeCV/LassoCV，它们本身就是模型
    if name == 'OLS':
        model_instance = model_pipeline.named_steps['model']
    else: # RidgeCV and LassoCV
        model_instance = model_pipeline.named_steps['model']
        
    model_instance.fit(X_full_scaled, y_full)
    
    y_pred_in_sample_log = model_instance.predict(X_full_scaled)
    y_pred_in_sample = np.exp(y_pred_in_sample_log)
    y_true_in_sample = np.exp(y_full)
    
    in_sample_mae = mean_absolute_error(y_true_in_sample, y_pred_in_sample)
    in_sample_rmse = np.sqrt(mean_squared_error(y_true_in_sample, y_pred_in_sample))

    # b) 6折交叉验证 (Cross-validation) 性能
    # cross_val_score 会自动处理Pipeline，在每一折内先fit_transform scaler，再fit model
    # 我们需要一个自定义的 scorer 来处理对数转换
    def rmse_scorer(estimator, X, y):
        y_pred_log = estimator.predict(X)
        y_pred = np.exp(y_pred_log)
        y_true = np.exp(y)
        return np.sqrt(mean_squared_error(y_true, y_pred))

    def mae_scorer(estimator, X, y):
        y_pred_log = estimator.predict(X)
        y_pred = np.exp(y_pred_log)
        y_true = np.exp(y)
        return mean_absolute_error(y_true, y_pred)
        
    cv_rmse_scores = cross_val_score(model_pipeline, X_full, y_full, cv=6, scoring=rmse_scorer)
    cv_mae_scores = cross_val_score(model_pipeline, X_full, y_full, cv=6, scoring=mae_scorer)
    
    cv_rmse = cv_rmse_scores.mean()
    cv_mae = cv_mae_scores.mean()
    
    # 存储结果
    results_list.append({
        'Model': name,
        'In-sample MAE': in_sample_mae,
        'Cross-validation MAE': cv_mae,
        'In-sample RMSE': in_sample_rmse,
        'Cross-validation RMSE': cv_rmse
    })



步骤3: 开始进行模型评估...

--- 正在评估模型: OLS ---

--- 正在评估模型: Lasso ---

--- 正在评估模型: Ridge ---


In [69]:

# --- 4. 结果汇总与展示 ---
print("\n步骤4: 汇总所有评估结果...")
results_df = pd.DataFrame(results_list)

best_model_name = results_df.loc[results_df['Cross-validation MAE'].idxmin()]['Model']
best_model_row = results_df[results_df['Model'] == best_model_name].copy()
best_model_row['Model'] = 'Best Linear Model'

# 使用 pd.concat 来替代 append
results_df = pd.concat([results_df, best_model_row], ignore_index=True)

# 按照要求的格式展示 (以MAE为例)
print("\n--- 最终性能评估表 (使用 MAE) ---")
final_table_mae = results_df[['Model', 'In-sample MAE', 'Cross-validation MAE']].copy()
final_table_mae.set_index('Model', inplace=True)
final_table_mae.columns = ['In sample', 'Cross-validation']
print(final_table_mae.to_string(float_format="%.2f"))

# 按照要求的格式展示 (以RMSE为例)
print("\n--- 最终性能评估表 (使用 RMSE) ---")
final_table_rmse = results_df[['Model', 'In-sample RMSE', 'Cross-validation RMSE']].copy()
final_table_rmse.set_index('Model', inplace=True)
final_table_rmse.columns = ['In sample', 'Cross-validation']
print(final_table_rmse.to_string(float_format="%.2f"))


步骤4: 汇总所有评估结果...

--- 最终性能评估表 (使用 MAE) ---
                   In sample  Cross-validation
Model                                         
OLS                320891.05        3344636.83
Lasso              710060.94         852408.57
Ridge              321564.29        1313767.06
Best Linear Model  710060.94         852408.57

--- 最终性能评估表 (使用 RMSE) ---
                   In sample  Cross-validation
Model                                         
OLS                494152.04        4701017.86
Lasso             1021895.59        1110117.71
Ridge              495520.32        1674516.83
Best Linear Model 1021895.59        1110117.71


In [70]:
df_test = pd.read_csv('ruc_Class25Q2_test_price.csv',low_memory=False)
df_test

Unnamed: 0,ID,城市,区域,板块,环线,房屋户型,所在楼层,建筑面积,套内面积,房屋朝向,...,供水,供暖,供电,燃气费,供热费,停车位,停车费用,coord_x,coord_y,客户反馈
0,1000000,0,109.0,367.0,二至三环,3室2厅1厨2卫,中楼层 (共23层),282.02㎡,,南 北,...,民水,自采暖,民电,2.61元/m³,,280.0,地上150元/月/位，地下2元/时/位，地下固定车位450元/月/位,117.389491,40.901030,卫生差，阳光充足
1,1000001,0,28.0,606.0,五至六环,2室1厅1厨1卫,中楼层 (共17层),88.42㎡,71.78㎡,南 北,...,商水/民水,集中供暖,商电/民电,2.61元/m³,30元/㎡,1340.0,地上150元，地上机械180，地下300,117.376625,40.767478,卫生间整洁，通风死角多
2,1000002,0,123.0,1110.0,五至六环,3室1厅1厨2卫,高楼层 (共12层),175.52㎡,139.86㎡,西北,...,民水,集中供暖,民电,2.61元/m³,30元/㎡,300.0,150,117.631276,41.063635,反过来看，室内采光均衡，听说，监控覆盖
3,1000003,0,65.0,555.0,六环外,2室1厅1厨1卫,中楼层 (共5层),106.13㎡,,南 北,...,民水,集中供暖/自采暖,民电,2.61元/m³,30元/㎡,500.0,暂无,117.186216,41.163738,话又说回来，电力稳定，总体趋势上，网速快，门窗紧实
4,1000004,0,109.0,990.0,二环内,3室2厅1厨2卫,顶层 (共5层),116.8㎡,,南 北,...,商水/民水,集中供暖,民电,2.61-2.63元/m³,24-30元/㎡,80.0,150,117.400114,40.959679,总体状况一般，门窗紧实，个人觉得，物业服务好
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34012,1034012,11,87.0,873.0,,3室2厅1厨2卫,低楼层 (共27层),132㎡,,南 北,...,,,,,,,,116.505828,39.966772,光照不足，目前来看，卫生差
34013,1034013,11,84.0,716.0,,2室1厅1厨1卫,中楼层 (共6层),69.3㎡,,南 北,...,民水,集中供暖,民电,2.4元/m³,18元/㎡,300.0,地上0,117.050163,40.511461,楼层高，总体趋势上，陷阱条款
34014,1034014,11,106.0,942.0,,2室1厅1厨1卫,中楼层 (共6层),88.1㎡,,南 北,...,民水,集中供暖,民电,2.5-2.68元/m³,20元/㎡,500.0,,116.517498,39.853890,配套贴心，综合看来，天花高度正常，周边交通规划明确
34015,1034015,11,106.0,942.0,,2室1厅1厨1卫,低楼层 (共6层),88㎡,,南 北,...,民水,集中供暖,民电,2.5-2.68元/m³,20元/㎡,500.0,,116.517373,39.853765,仔细一看，出行方便，简单点讲，储物少


In [71]:
df_processed2 = df_test.copy()
def process_dataframe(df_in):
    """
    接收原始DataFrame，返回一个经过全面清洗、解析和特征工程的新DataFrame。
    """
    df = df_in.copy()

    # --- 2. 日期和年份相关特征 ---
    print("处理日期和年份...")
    # 从'交易时间'提取年份和月份
    df['交易时间'] = pd.to_datetime(df['交易时间'], errors='coerce')
    df['交易年份'] = df['交易时间'].dt.year
    
    # 定义一个健壮的函数来解析'建筑年代'列
    def parse_build_year(year_str):
        year_str = str(year_str) # 强制转换为字符串
        years = re.findall(r'\d{4}', year_str)
        if years:
            # 如果是年份范围 (如 '1995-2000')，取平均值
            return np.mean([int(y) for y in years])
        return np.nan # 无法解析则返回空值

    df['建筑年份'] = df['建筑年代'].apply(parse_build_year)
    
    # 创建核心特征 '房龄'
    df['房龄'] = df['交易年份'] - df['建筑年份']

    # --- 3. 数值和复合文本的解析 ---
    print("解析面积、户型、楼层等复合信息...")
    # 面积 (清洗单位'm²'并转为数值)
    df['建筑面积'] = pd.to_numeric(df['建筑面积'].astype(str).str.replace(r'[^\d.]', '', regex=True), errors='coerce')
    df['套内面积'] = pd.to_numeric(df['套内面积'].astype(str).str.replace(r'[^\d.]', '', regex=True), errors='coerce')
    df['套内面积'] = df['套内面积'].fillna(df['建筑面积'])

    ring_road_mapping = {
        '一至二环': 1,
        '二至三环': 2,
        '三至四环': 3,
        '四至五环': 4,
        '五至六环': 5,
        '六环外': 6,
        # 您可以根据需要添加更多映射
        # 例如，如果数据中有'一环内'
        '一环内': 0 
    }
    
    # 2. 应用映射，创建一个新的数值列 '环线编码'
    # .map() 方法会根据字典进行替换。不在字典中的值会变成NaN。
    df['环线编码'] = df['环线'].map(ring_road_mapping)
    
    # 3. 处理映射后可能出现的NaN (即原始数据中存在字典里没有的环线值)
    # 我们可以用一个不显眼的、中性的值（比如中位数）来填充这些NaN
    ring_road_median = df['环线编码'].median()
    df['环线编码'] = df['环线编码'].fillna(ring_road_median)

    # 房屋户型 (提取室、厅、卫)
    df['卧室数'] = df['房屋户型'].astype(str).str.extract(r'(\d+)[室房]').fillna(0).astype(int)
    df['客厅数'] = df['房屋户型'].astype(str).str.extract(r'(\d+)厅').fillna(0).astype(int)
    df['卫生间数'] = df['房屋户型'].astype(str).str.extract(r'(\d+)卫').fillna(0).astype(int)

    # 我们用 (\d+|[一二三四五六七八九十]+) 来匹配阿拉伯数字或中文数字
    temp_df = df['梯户比例'].astype(str).str.extract(r'(\d+|[一二三四五六七八九十]+)梯(\d+|[一二三四五六七八九十]+)户')
    temp_df.columns = ['梯', '户'] # 重命名提取出的列

    # 1. 定义一个强大、正确的中文数字转换函数
    def chinese_to_arabic(cn_str):
        cn_str = str(cn_str)
        cn_num_map = {'零': 0, '一': 1, '二': 2,'两':2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
        cn_unit_map = {'十': 10, '百': 100, '千': 1000, '万': 10000}
        
        # 直接尝试转为整数，如果成功说明是阿拉伯数字，直接返回
        try:
            return int(cn_str)
        except ValueError:
            pass # 如果失败，说明是中文数字，继续处理

        # 处理中文数字
        if not cn_str:
            return np.nan
            
        # 特殊处理 "十" 开头的情况，例如 "十一"
        if cn_str.startswith('十'):
            cn_str = '一' + cn_str

        res = 0
        sec_res = 0
        unit = 1
        
        for char in reversed(cn_str):
            if char in cn_unit_map:
                unit = cn_unit_map[char]
                if unit > sec_res: # 处理 "万" 等大单位
                    res += sec_res
                    sec_res = 0
            elif char in cn_num_map:
                sec_res += cn_num_map[char] * unit
        
        res += sec_res
        return res

    # 2. 定义一个包装函数来提取和计算比例
    def get_ratio(text):
        text = str(text)
        # 匹配中文或阿拉伯数字
        parts = re.findall(r'(\d+|[一二三四五六七八九十百]+)', text)
        if len(parts) == 2:
            try:
                num_elevators = chinese_to_arabic(parts[0])
                num_households = chinese_to_arabic(parts[1])
                if num_elevators > 0:
                    return num_households / num_elevators
            except:
                return np.nan
        return np.nan

    # 3. 应用函数，创建新特征'户梯比'
    df['户梯比'] = df['梯户比例'].apply(get_ratio)
    
    # 4. 根据您的要求，将空值填充为1
    df['户梯比'] = df['户梯比'].fillna(1)

    # 所在楼层 (提取总楼层和楼层类型)
    df['总楼层'] = pd.to_numeric(df['所在楼层'].astype(str).str.extract(r'共(\d+)层').squeeze(), errors='coerce')
    df['楼层类型'] = df['所在楼层'].astype(str).str.extract(r'^(\S{1,3})(?:楼层|层)').fillna('未知').squeeze()

    # 小区信息 (清洗单位'%')
    df['绿 化 率'] = pd.to_numeric(df['绿 化 率'].astype(str).str.replace('%', ''), errors='coerce') / 100
    df['容 积 率'] = pd.to_numeric(df['容 积 率'], errors='coerce')
    df['社区绿化密度'] = df['容 积 率'] / (df['绿 化 率'] + 0.01)

    # --- 4. 类别和布尔（是/否）特征转换 ---
    print("转换类别和布尔特征...")
    # 将'有'/'无'转换为1/0
    df['配备电梯'] = df['配备电梯'].apply(lambda x: 1 if str(x) == '有' else 0)

    # 从可能为空的列创建二进制特征 (1表示有信息, 0表示无)
    df['是否别墅'] = df['别墅类型'].notna().astype(int)
    df['有抵押信息'] = df['抵押信息'].notna().astype(int)

    # 将数值型ID (如区域、板块) 转换为字符串，以确保它们被当作类别处理
    df['区域'] = df['区域'].astype(str)
    df = pd.get_dummies(df, columns=['区域'], drop_first=True) 
    
    # --- 5. 创建衍生特征 ---
    print("创建衍生特征 (得房率等)...")
    # 得房率
    df['得房率'] = df['套内面积'] / df['建筑面积'].replace(0, np.nan) # 避免除以0


    def extract_and_average_numeric(series):
        # 强制转换为字符串
        s = series.astype(str)
        
        # 使用正则表达式查找所有数字（包括整数和小数）
        # findall 会为每个字符串返回一个数字列表，例如 '30-45元' -> ['30', '45']
        numeric_parts = s.str.findall(r'(\d+\.?\d*)')
        
        # 定义一个函数来处理提取出的列表
        def get_average(num_list):
            # 如果列表为空 (例如原始值是'暂无'或空)，返回NaN
            if not num_list:
                return np.nan
            
            # 将列表中的所有字符串数字转为浮点数
            float_list = [float(n) for n in num_list]
            
            # 计算平均值
            return np.mean(float_list)
            
        # 将 get_average 函数应用到每个数字列表上
        return numeric_parts.apply(get_average)

    # 2. 应用该函数到需要处理的列
    df['燃气费'] = extract_and_average_numeric(df['燃气费'])
    df['供热费'] = extract_and_average_numeric(df['供热费'])
    df['停车位'] = extract_and_average_numeric(df['停车位'])
    df['停车费用'] = extract_and_average_numeric(df['停车费用'])
    df['物业费'] = extract_and_average_numeric(df['物 业 费'])

    df['房屋总数'] = df['房屋总数'].astype(str).str.replace(r'[^\d]', '', regex=True)
    df['楼栋总数'] = df['楼栋总数'].astype(str).str.replace(r'[^\d]', '', regex=True)

    # 3. 使用 pd.to_numeric() 将清理后的字符串列安全地转换为整数类型
    # errors='coerce' 会将无法转换的空值等变为NaN，然后我们用.fillna(0)填充
    df['房屋总数'] = pd.to_numeric(df['房屋总数'], errors='coerce').fillna(0).astype(int)
    df['楼栋总数'] = pd.to_numeric(df['楼栋总数'], errors='coerce').fillna(0).astype(int)
    df['楼栋总数'] = pd.to_numeric(df['楼栋总数'], errors='coerce').fillna(0).astype(int)

     
    df['房屋年限'] = df['房屋年限'].fillna('未知')
    df['房屋年限'] = df['房屋年限'].astype(str)
    df['房屋总数'] = df['房屋总数'].replace(0, 1)
    df['停车位'] = df['停车位'].replace(0, 1)
  
    # 每户平均车位数
    df['每户停车位'] = df['停车位'] / df['房屋总数']
    
    # 1. 合并文本列
    df['description_combined'] = df['核心卖点'].fillna('') + df['户型介绍'].fillna('') + df['周边配套'].fillna('')

    # 2. 定义关键词库
    objective_keywords = [
       '户型方正', '人车分流', '学区',
       '地铁', '医院', '商场', '超市', '公园', '菜市场'
        ]

    # 3. 循环创建特征列
    for keyword in objective_keywords:
        df[f'Desc_{keyword}'] = df['description_combined'].str.contains(keyword, na=False).astype(int)

    # 1. 定义正负面词典
    positive_keywords = ['体验佳', '干净', '安静', '方便', '采光好', '物业好', '安全']
    negative_keywords = ['老旧', '费高', '噪音', '通风差', '潮湿', '漏水', '老化', '乱']

    # 2. 计算分数
    df['积极反馈'] = df['客户反馈'].fillna('').apply(lambda x: sum(1 for word in positive_keywords if word in x))
    df['消极反馈'] = df['客户反馈'].fillna('').apply(lambda x: sum(1 for word in negative_keywords if word in x))

    # （可选）创建一个综合得分
    df['综合反馈'] = df['积极反馈'] - df['消极反馈']
    
    # --- 7. 清理不再需要的原始列 ---
    print("清理冗余的原始列...")
    # 定义所有已被处理、不再需要的列
    columns_to_drop = [
        '交易时间', '建筑年代', '房屋户型', '所在楼层', '环线位置','交易权属','板块','板块_comm',
        '别墅类型', '抵押信息', '房屋优势', '核心卖点', '户型介绍','房屋用途','区县','城市',
        '周边配套', '交通出行', '房屋朝向', '小区名称', '物业类别','建筑结构_comm',
        '建筑结构', '装修情况', '上次交易', '梯户比例','环线' ,'开发商','物业公司',
        '物业办公电话','物 业 费','客户反馈','积极反馈','消极反馈','description_combined',
        '交易年份','建筑年份','产权描述'# 这些信息已经合并到 full_description 中
    ]
    # 过滤掉可能不存在的列名，以避免出错
    existing_cols_to_drop = [col for col in columns_to_drop if col in df.columns]
    df = df.drop(columns=existing_cols_to_drop)

    # --- 8. 统一处理缺失值 (NaN) ---
    print("使用中位数填充剩余的数值型缺失值...")
    # 找出所有的数值型列
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()


    # 对每一个数值型列，如果存在NaN，就用该列的中位数填充
    for col in numeric_cols:
        if df[col].isnull().any():
          median_value = df[col].median()
          # 【修正】使用赋值语句替代 inplace=True
          df[col] = df[col].fillna(median_value)

    return df

# =============================================================================
# 执行处理函数并检查结果
# =============================================================================
# 调用主函数，开始处理
df_processed2 = process_dataframe(df_processed2)
df_processed2

处理日期和年份...
解析面积、户型、楼层等复合信息...
转换类别和布尔特征...
创建衍生特征 (得房率等)...
清理冗余的原始列...
使用中位数填充剩余的数值型缺失值...


Unnamed: 0,ID,建筑面积,套内面积,配备电梯,房屋年限,产权所属,lon,lat,年份,房屋总数,...,Desc_户型方正,Desc_人车分流,Desc_学区,Desc_地铁,Desc_医院,Desc_商场,Desc_超市,Desc_公园,Desc_菜市场,综合反馈
0,1000000,282.02,282.02,1,满五年,非共有,117.389491,40.901030,2022.0,458,...,0,0,0,0,0,0,0,0,0,0
1,1000001,88.42,71.78,1,满五年,非共有,117.376625,40.767478,2022.0,3465,...,0,0,0,0,0,0,0,0,0,0
2,1000002,175.52,139.86,1,满五年,非共有,117.631276,41.063635,2022.0,144,...,0,0,0,1,0,0,0,1,0,0
3,1000003,106.13,106.13,0,满五年,非共有,117.186216,41.163738,2022.0,320,...,0,0,0,0,0,0,0,0,0,0
4,1000004,116.80,116.80,0,满五年,非共有,117.400114,40.959679,2022.0,340,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34012,1034012,132.00,132.00,0,未知,非共有,116.504991,39.966476,2022.0,221,...,0,0,0,0,0,0,0,0,0,0
34013,1034013,69.30,69.30,0,满两年,非共有,117.050163,40.511461,2022.0,479,...,0,0,0,0,0,0,0,0,0,0
34014,1034014,88.10,88.10,0,未知,非共有,116.517498,39.853890,2023.0,482,...,0,0,0,0,0,0,0,0,0,0
34015,1034015,88.00,88.00,0,未知,非共有,116.517373,39.853765,2023.0,482,...,0,0,0,0,0,0,0,0,0,1


In [74]:
poly_features_cols = ['建筑面积', '房龄']

if poly_features_cols:
    # 1. 实例化 PolynomialFeatures
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    
    # 2. 对指定的列进行拟合和转换
    poly_transformed = poly.fit_transform(df_processed2[poly_features_cols])
    
    # 3. 获取新特征的名称
    poly_feature_names = poly.get_feature_names_out(poly_features_cols)
    
    # 4. 将转换后的数组转换为DataFrame
    df_poly2 = pd.DataFrame(poly_transformed, columns=poly_feature_names)
    
    # 5. 从原始数据中删除将被替换的旧列
    df_without_originals2 = df_processed2.drop(columns=poly_features_cols)
    
    # 6. 将处理过的数据与新生成的多项式特征合并
    # 确保索引一致
    df_final2 = pd.concat([df_without_originals2.reset_index(drop=True), df_poly2.reset_index(drop=True)], axis=1)
    
else:
    # 如果没有要处理的列，直接赋值
    df_final2 = df_processed2.copy()

df_final2   


Unnamed: 0,ID,套内面积,配备电梯,房屋年限,产权所属,lon,lat,年份,房屋总数,楼栋总数,...,Desc_商场,Desc_超市,Desc_公园,Desc_菜市场,综合反馈,建筑面积,房龄,建筑面积^2,建筑面积 房龄,房龄^2
0,1000000,282.02,1,满五年,非共有,117.389491,40.901030,2022.0,458,3,...,0,0,0,0,0,282.02,21.0,79535.2804,5922.42,441.00
1,1000001,71.78,1,满五年,非共有,117.376625,40.767478,2022.0,3465,23,...,0,0,0,0,0,88.42,17.0,7818.0964,1503.14,289.00
2,1000002,139.86,1,满五年,非共有,117.631276,41.063635,2022.0,144,2,...,0,0,1,0,0,175.52,25.0,30807.2704,4388.00,625.00
3,1000003,106.13,0,满五年,非共有,117.186216,41.163738,2022.0,320,9,...,0,0,0,0,0,106.13,19.0,11263.5769,2016.47,361.00
4,1000004,116.80,0,满五年,非共有,117.400114,40.959679,2022.0,340,9,...,1,0,0,1,0,116.80,30.0,13642.2400,3504.00,900.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34012,1034012,132.00,0,未知,非共有,116.504991,39.966476,2022.0,221,3,...,0,0,0,0,0,132.00,18.0,17424.0000,2376.00,324.00
34013,1034013,69.30,0,满两年,非共有,117.050163,40.511461,2022.0,479,16,...,0,0,0,0,0,69.30,22.5,4802.4900,1559.25,506.25
34014,1034014,88.10,0,未知,非共有,116.517498,39.853890,2023.0,482,9,...,0,0,0,0,0,88.10,23.5,7761.6100,2070.35,552.25
34015,1034015,88.00,0,未知,非共有,116.517373,39.853765,2023.0,482,9,...,0,0,0,0,1,88.00,23.5,7744.0000,2068.00,552.25


In [75]:


categorical_cols = df_final2.select_dtypes(include=['object']).columns

if not categorical_cols.empty:
    print(f"找到以下分类变量: {categorical_cols.tolist()}")
    # 使用 pd.get_dummies 进行独热编码
    # drop_first=True 用于避免多重共线性，这是一个好习惯
    df_encoded2 = pd.get_dummies(df_final2, columns=categorical_cols, drop_first=True)
    print("独热编码完成。")
else:
    print("未找到字符串类型的列，跳过独热编码。")
    df_encoded2 = df_final2.copy()

df_encoded2

找到以下分类变量: ['房屋年限', '产权所属', '供水', '供暖', '供电', '楼层类型']
独热编码完成。


Unnamed: 0,ID,套内面积,配备电梯,lon,lat,年份,房屋总数,楼栋总数,绿 化 率,容 积 率,...,供暖_集中供暖,供暖_集中供暖/自采暖,供暖_集中供暖/自采暖/无供暖,供电_商电/民电,供电_民电,楼层类型_低楼,楼层类型_底,楼层类型_未知,楼层类型_顶,楼层类型_高楼
0,1000000,282.02,1,117.389491,40.901030,2022.0,458,3,0.1499,4.30,...,False,False,False,False,True,False,False,False,False,False
1,1000001,71.78,1,117.376625,40.767478,2022.0,3465,23,0.3000,2.37,...,True,False,False,True,False,False,False,False,False,False
2,1000002,139.86,1,117.631276,41.063635,2022.0,144,2,0.3500,1.50,...,True,False,False,False,True,False,False,False,False,True
3,1000003,106.13,0,117.186216,41.163738,2022.0,320,9,0.3000,1.60,...,False,True,False,False,True,False,False,False,False,False
4,1000004,116.80,0,117.400114,40.959679,2022.0,340,9,0.3000,2.50,...,True,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34012,1034012,132.00,0,116.504991,39.966476,2022.0,221,3,0.3500,2.30,...,False,False,False,False,False,True,False,False,False,False
34013,1034013,69.30,0,117.050163,40.511461,2022.0,479,16,0.2000,1.20,...,True,False,False,False,True,False,False,False,False,False
34014,1034014,88.10,0,116.517498,39.853890,2023.0,482,9,0.3000,2.30,...,True,False,False,False,True,False,False,False,False,False
34015,1034015,88.00,0,116.517373,39.853765,2023.0,482,9,0.3000,2.30,...,True,False,False,False,True,True,False,False,False,False


In [76]:
df_encoded2.to_csv('测试数据.csv', index=False)

In [78]:

# --- 步骤A: 使用全部训练数据，正式训练您的三个模型 ---
print("\n步骤A: 正在使用全部数据正式训练三个模型...")
for name, model in models.items():
    model.fit(X_full, y_full)
    print(f"模型 '{name}' 已正式训练完成。")

print(f"\n步骤B: 加载了 {df_encoded2.shape[0]} 条新数据 (df_encoded2) 进行预测 ---")
assert 'Price' not in df_encoded2.columns
assert 'Price_log' not in df_encoded2.columns

# --- 步骤C: 确保测试集与训练集特征对齐 (缺失列填充为0) ---
train_cols = X_full.columns  # 训练集的特征列
test_cols = df_encoded2.columns  # 测试集当前的特征列

# 找出训练集有但测试集没有的列
missing_cols = [col for col in train_cols if col not in test_cols]
print(f"测试集缺失的特征列共 {len(missing_cols)} 个，将填充为0。")

# 为测试集添加缺失列，并填充0
for col in missing_cols:
    df_encoded2[col] = 0  # 新增列并赋值0

# 确保测试集的列顺序与训练集完全一致（关键：模型预测依赖特征顺序）
X_test_aligned = df_encoded2[train_cols]

# 验证对齐结果（可选）
print(f"训练集特征数: {len(train_cols)}, 测试集对齐后特征数: {X_test_aligned.shape[1]}")
assert len(train_cols) == X_test_aligned.shape[1], "特征对齐失败！训练集与测试集特征数不一致。"


# --- 步骤D: 分别进行预测并汇总结果 ---
print("\n步骤D: 开始对 df_encoded2 进行价格预测... ---")
predictions_df = pd.DataFrame(index=X_test_aligned.index)

for name, model in models.items():
    # 使用正式训练好的 Pipeline 进行预测
    predicted_prices_log = model.predict(X_test_aligned)
    # 转换回原始价格
    predicted_prices = np.exp(predicted_prices_log)
    # 将结果存入DataFrame
    predictions_df[f'Predicted_Price_{name}'] = predicted_prices
    print(f"模型 '{name}' 预测完成。")

# --- 步骤E: 展示最终的预测结果 ---
print("\n步骤E: 最终预测结果对比 ---")
print(predictions_df.head().to_string(index=False)) 


步骤A: 正在使用全部数据正式训练三个模型...
模型 'OLS' 已正式训练完成。
模型 'Lasso' 已正式训练完成。
模型 'Ridge' 已正式训练完成。

步骤B: 加载了 34017 条新数据 (df_encoded2) 进行预测 ---
测试集缺失的特征列共 2 个，将填充为0。
训练集特征数: 175, 测试集对齐后特征数: 175

步骤D: 开始对 df_encoded2 进行价格预测... ---
模型 'OLS' 预测完成。
模型 'Lasso' 预测完成。
模型 'Ridge' 预测完成。

步骤E: 最终预测结果对比 ---
 Predicted_Price_OLS  Predicted_Price_Lasso  Predicted_Price_Ridge
        7.673559e+06           5.501631e+06           7.581519e+06
        2.713519e+06           1.521350e+06           2.688937e+06
        4.643228e+06           3.159596e+06           4.629840e+06
        3.311964e+06           1.736829e+06           3.191876e+06
        8.753749e+06           2.245724e+06           8.813965e+06


In [79]:
final_submission_df = pd.DataFrame({
    'Price': predictions_df['Predicted_Price_Ridge']
})

# 2. 在第一列添加 'ID'
# 计算起始 ID
start_id = 1000000
# 计算结束 ID
end_id = start_id + len(final_submission_df) - 1
# 生成 ID 序列
id_column = np.arange(start_id, end_id + 1)

# 使用 .insert() 方法将 ID 列插入到第 0 个位置 (即第一列)
final_submission_df.insert(0, 'ID', id_column)

# 3. 查看最终结果的前几行
print("最终生成的提交文件预览:")
print(final_submission_df.head().to_string(index=False))

最终生成的提交文件预览:
     ID        Price
1000000 7.581519e+06
1000001 2.688937e+06
1000002 4.629840e+06
1000003 3.191876e+06
1000004 8.813965e+06


In [80]:
final_submission_df
final_submission_df.to_csv('price prediction.csv', index=False)