In [2]:
import os

current_path = os.getcwd()
print("当前工作路径:", current_path)

import numpy as np
import pandas as pd
import re
import cn2an

frame_patterns = {
    'room': r'(\d+)室',     
    'hall': r'(\d+)厅',      
    'kitchen': r'(\d+)厨',   
    'bathroom': r'(\d+)卫',
    'apartment': r'(\d+)房间' 
}

def get_lift_ratio(s):
    if pd.isna(s):
        return None
    match = re.search(r'([^梯]+)梯([^户]+)户', s)
    if match:
        try:
            lift_num = cn2an.cn2an(match.group(1), 'normal')
            household_num = cn2an.cn2an(match.group(2), 'normal')
            return lift_num / household_num
        except Exception as e:
            print(f"转换错误：{e}")
            return None
    else:
        return None
    
def get_relative_height(s):
    if pd.isna(s):
        return None
    match = re.search(r'([^()]+)', s)
    if match:
        return match.group(1).strip() 
    else:
        return None

def get_total_floor(s):
    if pd.isna(s):
        return None
    match = re.search(r'\s*\((共(\d+)层)\)', s)
    if match:
        return int(match.group(2))
    else:
        return None
    
direction_mapping = {
    '东': 'east',
    '西': 'west',
    '南': 'south',
    '北': 'north',
    '东南': 'south_east',
    '东北': 'north_east',
    '西南': 'south_west',
    '西北': 'north_west'
}

def process_directions(direction_str):
    directions = direction_str.split()
    processed_directions = []
    for direction in directions:
        if direction in direction_mapping:
            processed_directions.append(direction_mapping[direction])
    return processed_directions
    

当前工作路径: c:\Users\lenovo\Desktop\2025\KE\Quent\Quant_RUC-main\Exam


In [3]:
# 读取新数据集
train_df = pd.read_csv('ruc_Class25Q2_train_price.csv')
test_df = pd.read_csv('ruc_Class25Q2_test_price.csv')

print(f"训练集大小: {train_df.shape}")
print(f"测试集大小: {test_df.shape}")

# 保存原始location1用于可视化
train_location1 = train_df['城市'].copy()
test_location1 = test_df['城市'].copy()

def preprocess(df):
    '''
    数据预处理函数
    '''
    # ============ 第一部分：重命名列 ============
    rename_mapping = {
        '城市': 'location1',
        '区域': 'location2',
        '板块': 'location3',  # 板块作为主要位置变量
        '环线': 'ring',
        'Price': 'price',
        '建筑面积': 'area_gross',
        '套内面积': 'area_net',
        '房屋户型': 'frame',
        '所在楼层': 'floor',
        '房屋朝向': 'directions',
        '建筑结构': 'structure',
        '装修情况': 'decoration',
        '配备电梯': 'lift_ornot',
        '交易权属': 'transaction_ownership',
        '房屋用途': 'purpose',
        '房屋年限': 'age',
        '产权所属': 'property_ownership',
        '房屋优势': 'advantage',
        '周边配套': 'near',
        '交通出行': 'transport',
        '建筑年代': 'build_year',
        '房屋总数': 'total_units',
        '停车位': 'parking_spaces',
        '绿化率': 'green_rate',
        '容积率': 'floor_area_ratio',
        '建筑结构_comm': 'structure_comm',
        '供水': 'water_supply',
        '供暖': 'heating',
        '供电': 'power_supply',
        '客户反馈': 'customer_feedback',
        'lon': 'lon',
        'lat': 'lat',
    }
    
    df.rename(columns=rename_mapping, inplace=True)
    
    # ============ 第二部分：处理数值型变量 ============
    
    # 1. 建筑年代处理（支持"xxxx年"和"xxxx-xxxx年"）
    def extract_build_year(s):
        if pd.isna(s):
            return None
        s = str(s)
        # 提取年份范围，如"2000-2005年"返回均值
        match = re.search(r'(\d{4})\s*-\s*(\d{4})', s)
        if match:
            year1 = int(match.group(1))
            year2 = int(match.group(2))
            return (year1 + year2) / 2
        # 单个年份
        match = re.search(r'(\d{4})', s)
        if match:
            return int(match.group(1))
        return None
    
    if 'build_year' in df.columns:
        df['build_year_num'] = df['build_year'].apply(extract_build_year)
    
    # 2. 建筑面积和套内面积
    if 'area_gross' in df.columns:
        df['area_gross'] = df['area_gross'].astype(str).str.replace('㎡', '').str.strip()
        df['area_gross'] = pd.to_numeric(df['area_gross'], errors='coerce')
    
    if 'area_net' in df.columns:
        df['area_net'] = df['area_net'].astype(str).str.replace('㎡', '').str.strip()
        df['area_net'] = pd.to_numeric(df['area_net'], errors='coerce')
    
    # 3. 提取房屋户型（从frame中提取房间数）
    if 'frame' in df.columns:
        for key, pattern in frame_patterns.items():
            df[key] = df['frame'].astype(str).str.extract(pattern, expand=False).fillna(0).astype(int)
    
    # 4. 梯户比例提取和计算
    # 梯户比例列重命名为lift_ratio_str（保存原始字符串）
    # 然后从其中提取数字计算lift_ratio
    if '梯户比例' in df.columns:
        df['lift_ratio_str'] = df['梯户比例'].fillna('')
        df['lift_ratio'] = df['lift_ratio_str'].apply(get_lift_ratio)
    elif 'lift_ratio' in df.columns:
        df['lift_ratio_str'] = df['lift_ratio'].fillna('')
        df['lift_ratio'] = df['lift_ratio_str'].apply(get_lift_ratio)
    
    # 5. 计算房龄（使用2025作为当前年份）
    # 注意：城市3、5、9没有建筑年代数据
    if 'location1' in df.columns and 'build_year_num' in df.columns:
        # 只对非3、5、9城市计算房龄
        df['house_age'] = np.nan
        mask = (~df['location1'].isin(['3', '5', '9'])) & (~df['build_year_num'].isna())
        df.loc[mask, 'house_age'] = 2025 - df.loc[mask, 'build_year_num']
    
    # 6. 处理停车位和房屋总数（提取数字）
    def extract_number(s):
        if pd.isna(s):
            return np.nan
        s = str(s)
        match = re.search(r'(\d+)', s)
        return float(match.group(1)) if match else np.nan
    
    if 'parking_spaces' in df.columns and 'total_units' in df.columns:
        df['parking_spaces_num'] = df['parking_spaces'].apply(extract_number)
        df['total_units_num'] = df['total_units'].apply(extract_number)
        # 计算每户停车位
        df['parking_per_unit'] = df['parking_spaces_num'] / df['total_units_num']
        df['parking_per_unit'] = df['parking_per_unit'].replace([np.inf, -np.inf], np.nan)
    
    # 7. 绿化率和容积率（纯数值，只需转换为数值型）
    # 注意：已经通过rename_mapping重命名为green_rate和floor_area_ratio
    if 'green_rate' in df.columns:
        df['green_rate'] = pd.to_numeric(df['green_rate'], errors='coerce')
    if 'floor_area_ratio' in df.columns:
        df['floor_area_ratio'] = pd.to_numeric(df['floor_area_ratio'], errors='coerce')
    
    # 8. 经纬度
    if 'lon' in df.columns:
        df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
    if 'lat' in df.columns:
        df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
    
    # 9. 楼层信息提取
    if 'floor' in df.columns:
        df['relative_height'] = df['floor'].apply(get_relative_height)
        df['total_floor'] = df['floor'].apply(get_total_floor)
    
    # ============ 第三部分：基于地理层级的缺失值填充 ============
    
    def fill_missing_by_hierarchy(df, col_name, ratio_base_col=None):
        """
        基于地理层级填充缺失值（板块->区域->城市）
        
        Args:
            df: 数据框
            col_name: 需要填充的列名
            ratio_base_col: 如果指定，则计算比率填充（如area_net基于area_gross的比率）
        """
        if col_name not in df.columns:
            return df
        
        # 检查是否包含地理信息
        if 'location3' not in df.columns:
            return df
        
        # 确定缺失值的判断条件（NaN或编码为0）
        if ratio_base_col and ratio_base_col in df.columns:
            # 对于比率填充的情况（如area_net）
            missing_mask = df[col_name].isna()
        else:
            # 对于其他情况，包括编码为0的情况
            missing_mask = (df[col_name].isna()) | (df[col_name] == 0)
        
        missing_indices = df[missing_mask].index
        
        if len(missing_indices) == 0:
            return df
        
        # 如果是指定比率填充的情况
        if ratio_base_col and ratio_base_col in df.columns:
            # 计算比率：先板块，再区域，再城市
            # 1. 计算板块级别的比率
            df_temp = df[~df[col_name].isna() & ~df[ratio_base_col].isna()].copy()
            if len(df_temp) > 0:
                df_temp['ratio'] = df_temp[col_name] / df_temp[ratio_base_col]
                
                # 板块级别
                location3_ratios = df_temp.groupby('location3')['ratio'].mean().to_dict()
                for idx in missing_indices:
                    if idx in df.index:
                        location3 = df.loc[idx, 'location3']
                        if pd.notna(location3) and location3 in location3_ratios:
                            if pd.notna(df.loc[idx, ratio_base_col]):
                                df.loc[idx, col_name] = df.loc[idx, ratio_base_col] * location3_ratios[location3]
                                continue
                
                # 区域级别（对于板块级别无法填充的）
                location2_ratios = df_temp.groupby('location2')['ratio'].mean().to_dict()
                for idx in missing_indices:
                    if idx in df.index and (pd.isna(df.loc[idx, col_name]) or df.loc[idx, col_name] is pd.NA):
                        location2 = df.loc[idx, 'location2']
                        if pd.notna(location2) and location2 in location2_ratios:
                            if pd.notna(df.loc[idx, ratio_base_col]):
                                df.loc[idx, col_name] = df.loc[idx, ratio_base_col] * location2_ratios[location2]
                                continue
                
                # 城市级别
                location1_ratios = df_temp.groupby('location1')['ratio'].mean().to_dict()
                for idx in missing_indices:
                    if idx in df.index and (pd.isna(df.loc[idx, col_name]) or df.loc[idx, col_name] is pd.NA):
                        location1 = df.loc[idx, 'location1']
                        if pd.notna(location1) and location1 in location1_ratios:
                            if pd.notna(df.loc[idx, ratio_base_col]):
                                df.loc[idx, col_name] = df.loc[idx, ratio_base_col] * location1_ratios[location1]
        else:
            # 普通数值填充（计算平均值）
            # 先计算板块级别的平均值
            df_valid = df[~missing_mask].copy()
            if len(df_valid) > 0:
                # 板块级别
                location3_means = df_valid.groupby('location3')[col_name].mean().to_dict()
                for idx in missing_indices:
                    if idx in df.index:
                        location3 = df.loc[idx, 'location3']
                        if pd.notna(location3) and location3 in location3_means:
                            if pd.notna(location3_means[location3]) and location3_means[location3] != 0:
                                df.loc[idx, col_name] = location3_means[location3]
                                continue
                
                # 区域级别
                location2_means = df_valid.groupby('location2')[col_name].mean().to_dict()
                for idx in missing_indices:
                    if idx in df.index and (df.loc[idx, col_name] == 0 or pd.isna(df.loc[idx, col_name])):
                        location2 = df.loc[idx, 'location2']
                        if pd.notna(location2) and location2 in location2_means:
                            if pd.notna(location2_means[location2]) and location2_means[location2] != 0:
                                df.loc[idx, col_name] = location2_means[location2]
                                continue
                
                # 城市级别
                location1_means = df_valid.groupby('location1')[col_name].mean().to_dict()
                for idx in missing_indices:
                    if idx in df.index and (df.loc[idx, col_name] == 0 or pd.isna(df.loc[idx, col_name])):
                        location1 = df.loc[idx, 'location1']
                        if pd.notna(location1) and location1 in location1_means:
                            if pd.notna(location1_means[location1]) and location1_means[location1] != 0:
                                df.loc[idx, col_name] = location1_means[location1]
        
        return df
    
    # 对数值列进行分层级填充
    # 1. 特殊处理：套内面积基于建筑面积的比率填充
    if 'area_net' in df.columns and 'area_gross' in df.columns:
        df = fill_missing_by_hierarchy(df, 'area_net', ratio_base_col='area_gross')
    
    # 2. 其他数值列的普通填充
    numeric_cols_for_imputation = ['parking_spaces_num', 'total_units_num', 'green_rate', 
                                     'floor_area_ratio', 'build_year_num', 'house_age', 'parking_per_unit']
    for col in numeric_cols_for_imputation:
        if col in df.columns:
            df = fill_missing_by_hierarchy(df, col)
    
    # ============ 第四部分：处理类别型变量 ============
    
    # 1. 环线（ring）编码：提取第一个数字汉字或内中外
    if 'ring' in df.columns:
        def encode_ring(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            # 提取数字汉字：一二三四五六七八九十
            num_chinese = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
            for i, char in enumerate(num_chinese, 1):
                if char in s:
                    return i
            # 提取内中外
            if '内' in s:
                return 1
            elif '中' in s:
                return 2
            elif '外' in s:
                return 3
            return 0
        df['ring_encoded'] = df['ring'].apply(encode_ring)
        df['ring'] = df['ring'].fillna('环线_未知')
    
    # 2. 建筑结构（structure）编码：框架=3，钢混=2，其他=1
    if 'structure' in df.columns:
        def encode_structure(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '框架' in s:
                return 3
            elif '钢混' in s:
                return 2
            else:
                return 1
        df['structure_encoded'] = df['structure'].apply(encode_structure)
        df['structure'] = df['structure'].fillna('建筑结构_未知')
    
    # 3. 装修情况（decoration）编码：精装=4，简装=3，毛坯=2，其他=1
    if 'decoration' in df.columns:
        def encode_decoration(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '精装' in s:
                return 4
            elif '简装' in s:
                return 3
            elif '毛坯' in s:
                return 2
            else:
                return 1
        df['decoration_encoded'] = df['decoration'].apply(encode_decoration)
        df['decoration'] = df['decoration'].fillna('装修情况_未知')
    
    # 4. 交易权属（transaction_ownership）编码：私产=5，商品房=4，二类经济适用房=3，动迁安置房=已购公房=2，其他=1
    if 'transaction_ownership' in df.columns:
        def encode_transaction(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '私产' in s:
                return 5
            elif '商品房' in s and '二类' not in s:
                return 4
            elif '二类经济适用房' in s:
                return 3
            elif '已购公房' in s or '动迁安置房' in s or '定向安置房' in s:
                return 2
            else:
                return 1
        df['transaction_ownership_encoded'] = df['transaction_ownership'].apply(encode_transaction)
        df['transaction_ownership'] = df['transaction_ownership'].fillna('交易权属_未知')
    
    # 5. 房屋用途（purpose）编码：普通住宅/别墅=3，公寓/商住两用=2，车库/商业办公=1
    if 'purpose' in df.columns:
        def encode_purpose(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '别墅' in s or '普通住宅' in s:
                return 3
            elif '公寓' in s or '商住两用' in s:
                return 2
            elif '车库' in s or '商业办公' in s or '酒店式' in s:
                return 1
            else:
                return 1
        df['purpose_encoded'] = df['purpose'].apply(encode_purpose)
        df['purpose'] = df['purpose'].fillna('房屋用途_未知')
    
    # 6. 产权所属（property_ownership）编码：非共有=2，共有=1
    if 'property_ownership' in df.columns:
        def encode_property(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '非共有' in s:
                return 2
            elif '共有' in s:
                return 1
            else:
                return 1
        df['property_ownership_encoded'] = df['property_ownership'].apply(encode_property)
        df['property_ownership'] = df['property_ownership'].fillna('产权所属_未知')
    
    # 8. 建筑结构_comm（structure_comm）编码：板楼=4，塔板结合=3，平房=2，塔楼=1
    if 'structure_comm' in df.columns:
        def encode_structure_comm(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '板楼' in s and '塔板' not in s:
                return 4
            elif '塔板结合' in s or ('塔板' in s and '结合' in s):
                return 3
            elif '平房' in s:
                return 2
            elif '塔楼' in s:
                return 1
            else:
                return 2
        df['structure_comm_encoded'] = df['structure_comm'].apply(encode_structure_comm)
        df['structure_comm'] = df['structure_comm'].fillna('建筑结构_comm_未知')
    
    # 9. 供水（water_supply）编码：民水=2，商水=1
    if 'water_supply' in df.columns:
        def encode_water(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '民水' in s:
                return 2
            elif '商水' in s:
                return 1
            else:
                return 2
        df['water_supply_encoded'] = df['water_supply'].apply(encode_water)
        df['water_supply'] = df['water_supply'].fillna('供水_未知')
    
    # 10. 供暖（heating）编码：集中供暖=2，自采暖=1
    if 'heating' in df.columns:
        def encode_heating(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '集中供暖' in s:
                return 2
            elif '自采暖' in s:
                return 1
            else:
                return 2
        df['heating_encoded'] = df['heating'].apply(encode_heating)
        df['heating'] = df['heating'].fillna('供暖_未知')
    
    # 11. 供电（power_supply）编码：民电=2，商电=1
    if 'power_supply' in df.columns:
        def encode_power(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '民电' in s:
                return 2
            elif '商电' in s:
                return 1
            else:
                return 2
        df['power_supply_encoded'] = df['power_supply'].apply(encode_power)
        df['power_supply'] = df['power_supply'].fillna('供电_未知')
    
    # 12. 配备电梯（lift_ornot）编码：有=2，无=1
    if 'lift_ornot' in df.columns:
        def encode_lift(s):
            if pd.isna(s) or s == '':
                return 0
            s = str(s)
            if '有' in s and '无' not in s:
                return 2
            elif '无' in s:
                return 1
            else:
                return 1
        df['lift_ornot_encoded'] = df['lift_ornot'].apply(encode_lift)
        df['lift_ornot'] = df['lift_ornot'].fillna('配备电梯_未知')
    
    # 13. 相对楼层高度（relative_height）编码：中楼层=4，高楼层=3，低楼层=2，其他=1
    if 'relative_height' in df.columns:
        def encode_relative_height(s):
            if pd.isna(s) or s == '' or str(s) == '未知楼层':
                return 0
            s = str(s)
            if '中楼层' in s:
                return 4
            elif '高楼层' in s:
                return 3
            elif '低楼层' in s:
                return 2
            elif '顶层' in s or '底层' in s or '地下室' in s:
                return 1
            else:
                return 1
        df['relative_height_encoded'] = df['relative_height'].apply(encode_relative_height)
        df['relative_height'] = df['relative_height'].fillna('未知楼层')
    
    # 处理ring_encoded：对非0值取倒数，然后对0、2、3、4城市的0值用分层级填充
    if 'ring_encoded' in df.columns:
        # 取倒数处理（ring_encoded为0的不变）
        df['ring_encoded'] = df['ring_encoded'].apply(lambda x: 1/x if x != 0 else 0)
        
        # 对城市0、2、3、4中ring_encoded=0的进行分层级填充
        if 'location1' in df.columns:
            # 先保存原始数据，只对指定城市进行填充
            original_ring = df['ring_encoded'].copy()
            # 对0、2、3、4城市的数据临时填充
            mask_zero_cities = df['location1'].astype(str).isin(['0', '2', '3', '4'])
            df_temp = df[mask_zero_cities].copy()
            if len(df_temp) > 0:
                df_temp = fill_missing_by_hierarchy(df_temp, 'ring_encoded')
                df.loc[df_temp.index, 'ring_encoded'] = df_temp['ring_encoded']
    
    # 填充编码变量的缺失值（使用分层级填充）
    encoded_cols_for_imputation = ['ring_encoded', 'structure_encoded', 'decoration_encoded',
                                     'transaction_ownership_encoded', 'purpose_encoded',
                                     'property_ownership_encoded', 'structure_comm_encoded', 
                                     'water_supply_encoded', 'heating_encoded', 'power_supply_encoded',
                                     'lift_ornot_encoded', 'relative_height_encoded']
    
    # 对3、9城市的特定变量跳过填充
    skip_fill_for_city_3_9 = ['structure_comm_encoded', 'water_supply_encoded', 
                               'heating_encoded', 'power_supply_encoded']
    
    for col in encoded_cols_for_imputation:
        if col in df.columns:
            if col in skip_fill_for_city_3_9:
                # 对3、9城市的数据不填充这些变量
                if 'location1' in df.columns:
                    mask_not_city_3_9 = ~df['location1'].astype(str).isin(['3', '9'])
                    df_temp = df[mask_not_city_3_9].copy()
                    if len(df_temp) > 0:
                        df_temp = fill_missing_by_hierarchy(df_temp, col)
                        df.loc[df_temp.index, col] = df_temp[col]
                else:
                    # 如果没有location1，所有数据都填充
                    df = fill_missing_by_hierarchy(df, col)
            else:
                # 其他变量对所有缺失值（编码为0）进行填充
                df = fill_missing_by_hierarchy(df, col)
    
    # 2. 朝向处理（转换为8个方向标志）
    if 'directions' in df.columns:
        df['directions'] = df['directions'].fillna('未知朝向')
        df['directions'] = df['directions'].apply(process_directions)
        dummies = df['directions'].apply(lambda x: pd.Series([1 if d in x else 0 for d in direction_mapping.values()]))
        dummies.columns = direction_mapping.values()
        df = pd.concat([df, dummies], axis=1)
    
    # ============ 第五部分：处理半结构化文本 ============
    
    # 1. 房屋优势打分规则
    if 'advantage' in df.columns:
        def calculate_advantage_score(s):
            if pd.isna(s):
                return 0
            s = str(s)
            score = 0
            if '地铁' in s:
                score += 1
            if '房本满两年' in s:
                score += 0.4
            if '房本满五年' in s:
                score += 1
            return score
        df['advantage_score'] = df['advantage'].apply(calculate_advantage_score)
    
    # 2. 周边配套和交通出行转为存在性标记
    for var in ['near', 'transport']:
        if var in df.columns:
            df[f'{var}_exists'] = np.where(df[var].isna(), 0, 1)
    
    # ============ 第六部分：处理纯自然语言 ============
    
    # 只有客户反馈是真正的自然语言
    if 'customer_feedback' in df.columns:
        df['customer_feedback_exists'] = np.where(df['customer_feedback'].isna(), 0, 1)
    
    # ============ 第七部分：城市3和9的特殊标记 ============
    
    if 'location1' in df.columns:
        df['is_city3_or_city9'] = (df['location1'].astype(str) == '3') | (df['location1'].astype(str) == '9')
    
    # ============ 第八部分：清理不需要的列 ============
    
    # 处理绿化和容积率列名（可能有空格的问题）
    if '绿 化 率' in df.columns:
        df.rename(columns={'绿 化 率': 'green_rate'}, inplace=True)
    if '容 积 率' in df.columns:
        df.rename(columns={'容 积 率': 'floor_area_ratio'}, inplace=True)
    if '绿化率' in df.columns and 'green_rate' not in df.columns:
        df.rename(columns={'绿化率': 'green_rate'}, inplace=True)
    if '容积率' in df.columns and 'floor_area_ratio' not in df.columns:
        df.rename(columns={'容积率': 'floor_area_ratio'}, inplace=True)
    
    drop_columns = ['frame', 'floor', 'directions', 'lift_ratio_str', 
                    'build_year', 'build_year_num', 'advantage', 
                    'near', 'transport', 'customer_feedback',
                    'district',  # district不再使用
                    'parking_spaces_num', 'total_units_num',  # 只保留parking_per_unit
                    'total_floor',  # 有了relative_height就不需要total_floor
                    'parking_spaces',  # 有了parking_per_unit就不需要parking_spaces
                    'total_units',  # 有了parking_per_unit就不需要total_units
                    'apartment',  # 不需要apartment
                    '梯户比例']  # 有了lift_ratio就不需要梯户比例
    
    drop_columns = [col for col in drop_columns if col in df.columns]
    df = df.drop(drop_columns, axis=1)
    
    return df

# 处理数据
train_df = preprocess(train_df)
test_df = preprocess(test_df)

# 检查预处理后的列名
print("\n预处理后的列名:")
print(train_df.columns.tolist()[:20])

# 检查关键列是否存在
key_cols = ['area_gross', 'lift_ratio', 'parking_per_unit', 'house_age', 'lon', 'lat']
for col in key_cols:
    exists = col in train_df.columns
    print(f"{col}: {'存在' if exists else '不存在'}")

print("\n数据处理完成！")
print(f"训练集处理后大小: {train_df.shape}")
print(f"测试集处理后大小: {test_df.shape}")

# 定义列名中文说明
column_descriptions = {
    'location1': '城市',
    'location2': '区域',
    'location3': '板块',
    'ring': '环线',
    'price': '价格',
    'area_gross': '建筑面积',
    'area_net': '套内面积',
    'room': '房间数（室）',
    'hall': '客厅数（厅）',
    'kitchen': '厨房数（厨）',
    'bathroom': '卫生间数（卫）',
    'total_floor': '总楼层数',
    'relative_height': '相对楼层高度',
    'lift_ratio': '梯户比例（数值）',
    'structure': '建筑结构',
    'structure_comm': '建筑结构_comm',
    'decoration': '装修情况',
    'lift_ornot': '是否配备电梯',
    'transaction_ownership': '交易权属',
    'purpose': '房屋用途',
    'age': '房屋年限',
    'property_ownership': '产权所属',
    'green_rate': '绿化率',
    'floor_area_ratio': '容积率',
    'parking_per_unit': '每户停车位',
    'build_year_num': '建筑年代（数值）',
    'house_age': '房龄（2025-建筑年代）',
    'lon': '经度',
    'lat': '纬度',
    'is_city3_or_city9': '是否为城市3或9',
    'east': '朝向-东',
    'west': '朝向-西',
    'south': '朝向-南',
    'north': '朝向-北',
    'south_east': '朝向-东南',
    'north_east': '朝向-东北',
    'south_west': '朝向-西南',
    'north_west': '朝向-西北',
    'water_supply': '供水',
    'heating': '供暖',
    'power_supply': '供电',
    'advantage_score': '房屋优势得分',
    'near_exists': '周边配套是否存在',
    'transport_exists': '交通出行是否存在',
    'customer_feedback_exists': '客户反馈是否存在'
}

# 输出处理后的所有列名
print("\n" + "="*80)
print("处理后训练集的所有列名及含义")
print("="*80)
print(f"\n共 {len(train_df.columns)} 列\n")

columns_after = train_df.columns.tolist()
for i, col in enumerate(columns_after, 1):
    desc = column_descriptions.get(col, '未知')
    print(f"  {i:2d}. {col:30s} - {desc}")

# 保存处理后的数据为CSV文件
print("\n" + "="*80)
print("保存处理后的数据...")
print("="*80)

train_df.to_csv('train_processed.csv', index=False, encoding='utf-8-sig')
test_df.to_csv('test_processed.csv', index=False, encoding='utf-8-sig')

print("\n已保存以下文件:")
print("  - train_processed.csv (处理后的训练集)")
print("  - test_processed.csv (处理后的测试集)")


训练集大小: (103871, 33)
测试集大小: (34017, 33)

预处理后的列名:
['location1', 'location2', 'location3', 'ring', 'price', 'area_gross', 'area_net', 'structure', 'decoration', 'lift_ornot', 'transaction_ownership', 'purpose', 'age', 'property_ownership', 'lon', 'lat', 'green_rate', 'floor_area_ratio', 'structure_comm', 'water_supply']
area_gross: 存在
lift_ratio: 存在
parking_per_unit: 存在
house_age: 存在
lon: 存在
lat: 存在

数据处理完成！
训练集处理后大小: (103871, 55)
测试集处理后大小: (34017, 55)

处理后训练集的所有列名及含义

共 55 列

   1. location1                      - 城市
   2. location2                      - 区域
   3. location3                      - 板块
   4. ring                           - 环线
   5. price                          - 价格
   6. area_gross                     - 建筑面积
   7. area_net                       - 套内面积
   8. structure                      - 建筑结构
   9. decoration                     - 装修情况
  10. lift_ornot                     - 是否配备电梯
  11. transaction_ownership          - 交易权属
  12. purpose                        - 房屋用途


In [4]:
col_na_count=train_df.isna().sum()
col_na_count=col_na_count[col_na_count!=0]
print('train_df_na is: \n',col_na_count)

col_na_count=test_df.isna().sum()
col_na_count=col_na_count[col_na_count!=0]
print('test_df_na is: \n',col_na_count)


train_df_na is: 
 age                 44510
green_rate          32883
floor_area_ratio    33154
lift_ratio           2619
house_age           26361
parking_per_unit     1323
dtype: int64
test_df_na is: 
 age                 11296
green_rate           9180
floor_area_ratio     9303
lift_ratio            635
house_age            6193
parking_per_unit     5136
dtype: int64


In [6]:
def get_location_relation_df(location_index, train, test_df):
    """
    基于分层级方法建立地理层级关系表（与fill_missing_by_hierarchy逻辑一致）
    """
    if location_index == 1:
        raise ValueError("城市数据齐全，不需要进行同级估计。")
    if location_index > 3:
        raise ValueError("请选择正确的地理区分度！")
    
    location_upper = 'location' + str(location_index - 1)
    location_lower = 'location' + str(location_index)
    
    # 合并训练集和测试集
    locations_df = pd.concat([train[[location_upper, location_lower]], 
                               test_df[[location_upper, location_lower]]], 
                              axis=0, ignore_index=True)
    
    # 去重并建立关系（使用pandas的去重，更高效）
    location_relation_df = locations_df.drop_duplicates(subset=[location_lower])
    location_relation_df = location_relation_df.sort_values(location_lower).reset_index(drop=True)
    
    # 标记是否在训练集中
    train_location_lower_set = set(train[location_lower].unique())
    location_relation_df[location_lower + '_isin_train'] = location_relation_df[location_lower].isin(train_location_lower_set)
    
    return location_relation_df

In [None]:
area_class='area_gross'
location_index=3  # 使用板块级别（location3）
location_relation_df=get_location_relation_df(location_index,train_df,test_df)
neighbor_num=10
IS_UPPER=True

def get_neighbor_location_list(this_location, location_index=None, location_relation_df=None, train_df=None, neighbor_num=None):
    """
    基于分层级方法获取邻居位置列表
    """
    if IS_UPPER:
        location_upper = 'location' + str(location_index - 1)
        location_lower = 'location' + str(location_index)
        
        
        # 从关系表中获取上级位置
        location_info = location_relation_df[location_relation_df[location_lower] == this_location]
        if location_info.empty:
            return []
        
        this_location_upper = location_info[location_upper].iloc[0]
        
        # 获取同一上级下的所有邻居（且在训练集中）
        neighbors = location_relation_df[
            (location_relation_df[location_upper] == this_location_upper) &
            (location_relation_df[location_lower] != this_location) &
            (location_relation_df[location_lower + '_isin_train'])
        ][location_lower].tolist()
        
        return neighbors
    else:
        train_location_list = np.sort(np.unique(train_df['location' + str(location_index)]).astype(int))
        sorted_indices = np.argsort(np.abs(train_location_list - this_location))
        neighbor_location_list = train_location_list[sorted_indices[range(neighbor_num)]]
        return neighbor_location_list.astype(str)

In [8]:
# 定义所有特征变量列表（与预处理后的变量对应）

# 线性变量（纯数值型 + 已编码的类别型变量 + 0/1变量）
linear_variable_name_list = [
    # 数值型变量
    'area_gross', 'area_net',  # 建筑面积
    'room', 'hall', 'kitchen', 'bathroom',  # 房间数（从frame提取）
    'lift_ratio',  # 梯户比例
    'parking_per_unit',  # 每户停车位
    'green_rate', 'floor_area_ratio',  # 绿化率、容积率
    'house_age',  # 房龄（2025 - build_year_num）
    'lon', 'lat',  # 经纬度
    # 编码后的类别型变量（数值型编码）
    'ring_encoded', 'structure_encoded', 'decoration_encoded',
    'transaction_ownership_encoded', 'purpose_encoded',
    'property_ownership_encoded', 'structure_comm_encoded', 
    'water_supply_encoded', 'heating_encoded', 'power_supply_encoded',
    'lift_ornot_encoded', 'relative_height_encoded',
    # 0/1变量（已经是数值型，不需要dummy）
    'east', 'west', 'south', 'north',  # 朝向
    'south_east', 'north_east', 'south_west', 'north_west',  # 朝向
    'advantage_score',  # 房屋优势得分（数值型）
    'near_exists', 'transport_exists', 'customer_feedback_exists',  # 存在性标记（0/1）
    'is_city3_or_city9',  # 城市特殊标记（boolean，但可以作为0/1使用）
]

# 非线性变量（适合进行多项式和对数变换的连续数值变量）
nonlinear_variable_name_list = [
    'area_gross', 'area_net',  # 建筑面积 - 价格与面积通常有非线性关系
    'house_age',  # 房龄 - 房龄对价格的影响可能是非线性的
    'parking_per_unit',  # 每户停车位 - 可能影响价格
    # 注意：编码后的变量不适合做非线性变换
]

# 虚拟变量（类别型变量，需要get_dummies处理）
# 注意：这些是原始类别型变量，会通过get_dummies转换为0/1虚拟变量
dummy_variable_name_list = [
    'location1', 'location2', 'location3',  # 位置信息
    'ring',  # 环线（原始类别型）
    'structure', 'structure_comm',  # 建筑结构
    'decoration',  # 装修情况
    'lift_ornot',  # 是否配备电梯
    'transaction_ownership',  # 交易权属
    'purpose',  # 房屋用途
    'property_ownership',  # 产权所属
    'water_supply', 'heating', 'power_supply',  # 基础设施
    'relative_height',  # 相对楼层高度（原始类别型）
]

print("特征变量列表定义完成")
print(f"线性变量: {len(linear_variable_name_list)} 个")
print(f"非线性变量: {len(nonlinear_variable_name_list)} 个")
print(f"虚拟变量: {len(dummy_variable_name_list)} 个")


特征变量列表定义完成
线性变量: 38 个
非线性变量: 4 个
虚拟变量: 15 个


In [None]:
# 特征矩阵生成函数定义
# 注意：特征变量列表在Cell 6中已定义

def get_Fmatrix_linear_part(df,linear_variable_name_list):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("输入 df 必须是 pandas DataFrame")
    if not isinstance(linear_variable_name_list, list) or not all(isinstance(var, str) for var in linear_variable_name_list):
        raise ValueError("linear_variable_name_list 必须是一个字符串列表")
    if not all(var in df.columns for var in linear_variable_name_list):
        raise ValueError("linear_variable_name_list 中的变量名必须存在于 df 的列中")
    # 复制原始 DataFrame，避免修改原始数据
    result_df = pd.DataFrame()
    for var in linear_variable_name_list:
        result_df[var] = df[var]
    return result_df


def get_Fmatrix_nonlinear_part(df,nonlinear_variable_name_list):
    # 检查输入是否有效
    if not isinstance(df, pd.DataFrame):
        raise ValueError("输入 df 必须是 pandas DataFrame")
    if not isinstance(nonlinear_variable_name_list, list) or not all(isinstance(var, str) for var in nonlinear_variable_name_list):
        raise ValueError("nonlinear_variable_name_list 必须是一个字符串列表")
    if not all(var in df.columns for var in nonlinear_variable_name_list):
        raise ValueError("nonlinear_variable_name_list 中的变量名必须存在于 df 的列中")

    # 复制原始 DataFrame，避免修改原始数据
    result_df = pd.DataFrame()

    # 为每个变量生成非线性项
    for var in nonlinear_variable_name_list:
        result_df[var] = df[var]
        # 负一次项
        result_df[f"({var}+1)^-1"] = 1/(df[var]+1) 
        # 二次项
        result_df[f"{var}^2"] = df[var] ** 2
        # 三次项
        result_df[f"{var}^3"] = df[var] ** 3
        # 对数项
        result_df[f"log({var}+1)"] = np.log(df[var]+1)
        # 对数的平方
        result_df[f"log({var}+1)^2"] = (np.log(df[var]+1)) ** 2
        # 对数的立方
        result_df[f"log({var}+1)^3"] = (np.log(df[var]+1)) ** 3

    return result_df

def get_Fmatrix_dummy_part(df,dummy_variable_name_list,is_test=False,train_df=None):
    if is_test:
        if train_df is None:
            raise ValueError("为训练集生成虚拟变量时必须输入测试集，以保证测试集的虚拟变量与测试集完全重合！")
    
    result_df = pd.DataFrame()
    X_temp = pd.get_dummies(df, columns=dummy_variable_name_list, prefix=dummy_variable_name_list, drop_first=False)
    
    # 安全地过滤虚拟变量列
    filtered_cols = X_temp.filter(regex='^(' + '|'.join(dummy_variable_name_list) + ')')
    
    # 逐个列处理，使用int8节省内存（虚拟变量只有0/1）
    for col in filtered_cols.columns:
        try:
            # 尝试转换为浮点数再转int8（虚拟变量只需要0或1，int8足够）
            result_df[col] = pd.to_numeric(filtered_cols[col], errors='coerce').fillna(0).astype(np.int8)
        except (ValueError, TypeError):
            # 如果转换失败，尝试其他方法
            try:
                result_df[col] = (filtered_cols[col] == filtered_cols[col].iloc[0]).astype(np.int8)
            except:
                # 最后手段：设为0
                result_df[col] = 0
                result_df[col] = result_df[col].astype(np.int8)
    
    if is_test:
        X_train_dummy_part = get_Fmatrix_dummy_part(train_df, dummy_variable_name_list)
        train_exclusive_location_list = np.setdiff1d(train_df['location'+str(location_index)], df['location'+str(location_index)])
        test_exclusive_location_list = np.setdiff1d(df['location'+str(location_index)], train_df['location'+str(location_index)])
        train_exclusive_list = np.setdiff1d(X_train_dummy_part.columns, result_df.columns)
        test_exclusive_list = np.setdiff1d(result_df.columns, X_train_dummy_part.columns)
        
        for train_exclusive in train_exclusive_list:
            result_df[train_exclusive] = 0
        
        # 处理年份
        if 'year_2022' in result_df.columns and 'year_2023' in result_df.columns:
            result_df['year_2022'] += result_df['year_2023']
            result_df.drop('year_2023', axis=1, inplace=True)
        
        if IS_UPPER:
            for test_location in test_exclusive_location_list:
                # 确保test_location是字符串类型
                test_location = str(test_location)
                neighbor_location_list = get_neighbor_location_list(this_location=test_location, location_index=location_index, location_relation_df=location_relation_df)
                neighbor_num = len(neighbor_location_list)
                if neighbor_num != 0:
                    for neighbor_location in neighbor_location_list:
                        # 确保neighbor_location是字符串类型
                        neighbor_location = str(neighbor_location)
                        result_df['location'+str(location_index)+'_'+neighbor_location] += 1/neighbor_num * result_df['location'+str(location_index)+'_'+test_location]
                    result_df.drop('location'+str(location_index)+'_'+test_location, axis=1, inplace=True)
        else:
            for test_location in test_exclusive_location_list:
                # 确保test_location是字符串类型
                test_location = str(test_location)
                neighbor_location_list = get_neighbor_location_list(this_location=test_location, train_df=train_df, neighbor_num=neighbor_num)
                for neighbor_location in neighbor_location_list:
                    # 确保neighbor_location是字符串类型
                    neighbor_location = str(neighbor_location)
                    result_df['location'+str(location_index)+'_'+neighbor_location] += 1/neighbor_num * result_df['location'+str(location_index)+'_'+test_location]
                result_df.drop('location'+str(location_index)+'_'+test_location, axis=1, inplace=True)
        
        train_exclusive_list = np.setdiff1d(X_train_dummy_part.columns, result_df.columns)
        test_exclusive_list = np.setdiff1d(result_df.columns, X_train_dummy_part.columns)
        
        if len(train_exclusive_list) != 0:
            raise ValueError("X_train_dummy_part仍然有独有变量！")
        
        if len(test_exclusive_list) != 0:
            for var in test_exclusive_list:
                result_df.drop(var, axis=1, inplace=True)
    
    return result_df


In [10]:
# 生成训练集和测试集的特征矩阵
X_train_part_list = []
X_test_part_list = []

print("开始生成特征矩阵...")

# 线性部分
X_train_part_list.append(get_Fmatrix_linear_part(train_df, linear_variable_name_list))
X_test_part_list.append(get_Fmatrix_linear_part(test_df, linear_variable_name_list))
print(f"线性部分完成")

# 非线性部分
X_train_part_list.append(get_Fmatrix_nonlinear_part(train_df, nonlinear_variable_name_list))
X_test_part_list.append(get_Fmatrix_nonlinear_part(test_df, nonlinear_variable_name_list))
print(f"非线性部分完成")

# 虚拟变量部分
X_train_part_list.append(get_Fmatrix_dummy_part(train_df, dummy_variable_name_list))
X_test_part_list.append(get_Fmatrix_dummy_part(test_df, dummy_variable_name_list, is_test=True, train_df=train_df))
print(f"虚拟变量部分完成")

print(f"训练集部分数: {len(X_train_part_list)}")
print(f"测试集部分数: {len(X_test_part_list)}")


开始生成特征矩阵...
线性部分完成
非线性部分完成
虚拟变量部分完成
训练集部分数: 3
测试集部分数: 3


In [None]:
# 优化内存使用：使用numpy数组直接合并，避免pandas concat的开销
print("开始合并训练集特征矩阵（使用numpy数组合并以节省内存）...")

# 收集所有列名和数组，保存索引
all_columns = []
all_arrays = []
saved_index = None

for i, part in enumerate(X_train_part_list):
    print(f"  处理第 {i+1}/{len(X_train_part_list)} 部分...")
    # 确保part不是None
    if part is None:
        print(f"    警告：第 {i+1} 部分为None，跳过")
        continue
    # 保存第一个部分的索引
    if i == 0:
        saved_index = part.index
    # 转换为numpy数组并收集列名
    part_array = part.values
    all_arrays.append(part_array)
    all_columns.extend(part.columns.tolist())
    # 释放原始DataFrame（但在循环中不设为None，而是等待循环结束后统一处理）

print("  合并numpy数组...")
# 使用numpy的hstack直接合并数组（更高效）
combined_array = np.hstack(all_arrays)
del all_arrays  # 释放原始数组

print("  创建最终DataFrame...")
# 一次性创建DataFrame
X_train_without_interaction = pd.DataFrame(combined_array, 
                                           index=saved_index,
                                           columns=all_columns)
del combined_array, all_columns, saved_index
print("训练集合并完成")

# 清理列表（循环结束后再清理）
for i in range(len(X_train_part_list)):
    X_train_part_list[i] = None
del X_train_part_list

# 检查测试集特征矩阵
print(f"\nX_test_part_list长度: {len(X_test_part_list)}")
if len(X_test_part_list) == 0:
    raise ValueError("X_test_part_list为空，请检查前面的特征生成步骤")
for idx, part in enumerate(X_test_part_list):
    print(f"X_test_part_list[{idx}] 形状: {part.shape}")

print("\n开始合并测试集特征矩阵（使用numpy数组合并以节省内存）...")

# 收集所有列名和数组，保存索引
all_columns_test = []
all_arrays_test = []
saved_index_test = None

for i, part in enumerate(X_test_part_list):
    print(f"  处理第 {i+1}/{len(X_test_part_list)} 部分...")
    # 确保part不是None
    if part is None:
        print(f"    警告：第 {i+1} 部分为None，跳过")
        continue
    # 保存第一个部分的索引
    if i == 0:
        saved_index_test = part.index
    # 转换为numpy数组并收集列名
    part_array = part.values
    all_arrays_test.append(part_array)
    all_columns_test.extend(part.columns.tolist())
    # 释放原始DataFrame（但在循环中不设为None，而是等待循环结束后统一处理）

print("  合并numpy数组...")
# 使用numpy的hstack直接合并数组（更高效）
combined_array_test = np.hstack(all_arrays_test)
del all_arrays_test  # 释放原始数组

print("  创建最终DataFrame...")
# 一次性创建DataFrame
X_test_without_interaction = pd.DataFrame(combined_array_test,
                                          index=saved_index_test,
                                          columns=all_columns_test)
del combined_array_test, all_columns_test, saved_index_test
print("测试集合并完成")

# 清理列表（循环结束后再清理）
for i in range(len(X_test_part_list)):
    X_test_part_list[i] = None
del X_test_part_list

interaction_variable_pair_list=[['location1','ring']]

def get_Fmatrix_with_interaction(df, interaction_variable_pair_list):
    """
    生成交互项特征
    注意：如果DataFrame有重复列名，会先处理重复列
    """
    # 检查并处理重复的列名
    if df.columns.duplicated().any():
        print(f"警告：发现重复列名，正在处理...")
        # 重命名重复的列，使用序号区分
        cols = df.columns.tolist()
        seen = {}
        new_cols = []
        for col in cols:
            if cols.count(col) > 1:  # 如果是重复列
                if col not in seen:
                    seen[col] = 0
                    new_cols.append(col)
                else:
                    seen[col] += 1
                    new_cols.append(f"{col}_dup{seen[col]}")
            else:
                new_cols.append(col)
        df.columns = new_cols
        print(f"已处理重复列名，新列数: {len(df.columns)}")
    
    result_df = pd.DataFrame(index=df.index)
    name_list_without_interaction = df.columns
    
    for variable_pair in interaction_variable_pair_list:
        variable_former_list = [name for name in name_list_without_interaction if name.split('_')[0] == variable_pair[0]]
        variable_later_list = [name for name in name_list_without_interaction if name.split('_')[0] == variable_pair[1]]
        
        for variable_former in variable_former_list:
            for variable_later in variable_later_list:
                # 使用.values避免对齐问题，或者确保使用Series
                if variable_former in df.columns and variable_later in df.columns:
                    # 直接使用Series进行乘法，避免DataFrame对齐问题
                    interaction_name = f"{variable_former}*{variable_later}"
                    result_df[interaction_name] = df[variable_former].values * df[variable_later].values
    
    # 合并原始数据和交互项
    result_df = pd.concat([df, result_df], axis=1)
    return result_df

X_train=get_Fmatrix_with_interaction(X_train_without_interaction,interaction_variable_pair_list)
X_test=get_Fmatrix_with_interaction(X_test_without_interaction,interaction_variable_pair_list)


开始合并训练集特征矩阵（使用numpy数组合并以节省内存）...
  处理第 1/3 部分...
    警告：第 1 部分为None，跳过
  处理第 2/3 部分...
    警告：第 2 部分为None，跳过
  处理第 3/3 部分...
  合并numpy数组...


In [None]:
X_train=X_train[sorted(X_train.columns)]
X_test=X_test[X_train.columns]


print('X_train.shape=',X_train.shape)
print('X_test.shape=',X_test.shape)

col_na_count=X_train.isna().sum()
col_na_count=col_na_count[col_na_count!=0]
print('train_x_na:',col_na_count)

col_na_count=X_test.isna().sum()
col_na_count=col_na_count[col_na_count!=0]
print('test_x_na:',col_na_count)

# 只检查数值列中的无穷值（isinf只能用于数值类型）
numeric_cols_train = X_train.select_dtypes(include=[np.number]).columns
col_na_count = pd.Series([np.isinf(X_train[col]).sum() for col in numeric_cols_train], index=numeric_cols_train)
col_na_count = col_na_count[col_na_count != 0]
print('train_x_if:', col_na_count)

numeric_cols_test = X_test.select_dtypes(include=[np.number]).columns
col_na_count = pd.Series([np.isinf(X_test[col]).sum() for col in numeric_cols_test], index=numeric_cols_test)
col_na_count = col_na_count[col_na_count != 0]
print('test_x_inf:', col_na_count)

train_exclusive_list=np.setdiff1d(X_train.columns.tolist(),X_test.columns.tolist())
test_exclusive_list=np.setdiff1d(X_test.columns.tolist(),X_train.columns.tolist())

print('train_exclusive_list=',train_exclusive_list)
print('test_exclusive_list=',test_exclusive_list)


print('train_test_col_is_not_incident=',np.sum(X_train.columns!=X_test.columns))

use_colname_list=np.array(range(len(X_train.columns))).astype(str)
X_train_use=X_train.copy()
X_train_use.columns=use_colname_list
X_test_use=X_test.copy()
X_test_use.columns=use_colname_list

X_train.shape= (103871, 1455)
X_test.shape= (34017, 1455)
train_x_na: (house_age+1)^-1             26361
(parking_per_unit+1)^-1       1323
floor_area_ratio             33154
green_rate                   32883
house_age                    26361
house_age^2                  26361
house_age^3                  26361
house_age_dup1               26361
lift_ratio                    2619
log(house_age+1)             26361
log(house_age+1)^2           26361
log(house_age+1)^3           26361
log(parking_per_unit+1)       1323
log(parking_per_unit+1)^2     1323
log(parking_per_unit+1)^3     1323
parking_per_unit              1323
parking_per_unit^2            1323
parking_per_unit^3            1323
parking_per_unit_dup1         1323
dtype: int64
test_x_na: (house_age+1)^-1             6193
(parking_per_unit+1)^-1      5136
floor_area_ratio             9303
green_rate                   9180
house_age                    6193
house_age^2                  6193
house_age^3                  6193
hou

In [None]:
from sklearn.linear_model import LinearRegression

y=np.log(train_df['price'])
OLS_md = LinearRegression()
# 检查并处理缺失值
print("检查缺失值...")
nan_counts_train = X_train_use.isna().sum()
nan_columns_train = nan_counts_train[nan_counts_train > 0]
print(f"训练集缺失值列: {len(nan_columns_train)}列")
print(nan_columns_train)

nan_counts_test = X_test_use.isna().sum()
nan_columns_test = nan_counts_test[nan_counts_test > 0]
print(f"测试集缺失值列: {len(nan_columns_test)}列")
print(nan_columns_test)

# 填充缺失值
from sklearn.impute import SimpleImputer

# 分离数值列和非数值列
numeric_cols = X_train_use.select_dtypes(include=[np.number]).columns.tolist()
non_numeric_cols = X_train_use.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"数值列数量: {len(numeric_cols)}")
print(f"非数值列数量: {len(non_numeric_cols)}")

# 初始化结果DataFrame
X_train_use_imputed = X_train_use.copy()
X_test_use_imputed = X_test_use.copy()

# 对数值列使用mean策略填充
if len(numeric_cols) > 0:
    imputer_numeric = SimpleImputer(strategy="mean")
    X_train_use_imputed[numeric_cols] = pd.DataFrame(
        imputer_numeric.fit_transform(X_train_use[numeric_cols]), 
        columns=numeric_cols, 
        index=X_train_use.index
    )
    X_test_use_imputed[numeric_cols] = pd.DataFrame(
        imputer_numeric.transform(X_test_use[numeric_cols]), 
        columns=numeric_cols, 
        index=X_test_use.index
    )

# 对非数值列使用most_frequent策略填充（如果有缺失值）
if len(non_numeric_cols) > 0:
    # 检查非数值列是否有缺失值
    non_numeric_missing_train = X_train_use[non_numeric_cols].isna().any()
    non_numeric_missing_test = X_test_use[non_numeric_cols].isna().any()
    cols_with_missing = [col for col in non_numeric_cols if non_numeric_missing_train[col] or non_numeric_missing_test[col]]
    
    if len(cols_with_missing) > 0:
        imputer_non_numeric = SimpleImputer(strategy="most_frequent")
        X_train_use_imputed[cols_with_missing] = pd.DataFrame(
            imputer_non_numeric.fit_transform(X_train_use[cols_with_missing]), 
            columns=cols_with_missing, 
            index=X_train_use.index
        )
        X_test_use_imputed[cols_with_missing] = pd.DataFrame(
            imputer_non_numeric.transform(X_test_use[cols_with_missing]), 
            columns=cols_with_missing, 
            index=X_test_use.index
        )

print("缺失值填充完成")

# 使用填充后的数据
X_train_use = X_train_use_imputed
X_test_use = X_test_use_imputed

# 处理百分数字符串（如'30%'转为0.3或30）
def convert_percentage_to_numeric(series):
    """
    将百分数字符串转换为数值
    例如：'30%' -> 30 或 0.3（根据实际情况决定）
    """
    if series.dtype == 'object':
        # 检查是否是百分数字符串格式
        return series.apply(lambda x: 
            float(str(x).replace('%', '')) / 100 if isinstance(x, str) and '%' in str(x)
            else pd.to_numeric(x, errors='coerce') if not pd.isna(x)
            else x
        )
    else:
        return series

# 检查并转换所有可能包含百分数的列
for col in X_train_use.columns:
    # 检查列中是否有百分数字符串
    if X_train_use[col].dtype == 'object':
        has_percentage = X_train_use[col].apply(lambda x: isinstance(x, str) and '%' in str(x)).any()
        if has_percentage:
            print(f"转换列 {col} 中的百分数为数值...")
            X_train_use[col] = convert_percentage_to_numeric(X_train_use[col])
            X_test_use[col] = convert_percentage_to_numeric(X_test_use[col])

# 确保所有列都是数值类型（除了已经是数值的）
for col in X_train_use.columns:
    if X_train_use[col].dtype == 'object':
        # 尝试转换为数值
        X_train_use[col] = pd.to_numeric(X_train_use[col], errors='coerce')
        X_test_use[col] = pd.to_numeric(X_test_use[col], errors='coerce')

OLS_md.fit(X_train_use,y)

output_df=pd.DataFrame({
    'ID':range(len(test_df)),
    'Price':np.exp(OLS_md.predict(X_test_use))
})

output_df.to_csv('submission_2025_5_22_y=log(p).csv',index=False)

检查缺失值...
训练集缺失值列: 19列
2       26361
3        1323
23      33154
24      32883
35      26361
36      26361
37      26361
38      26361
47       2619
1328    26361
1329    26361
1330    26361
1331     1323
1332     1323
1333     1323
1339     1323
1340     1323
1341     1323
1342     1323
dtype: int64
测试集缺失值列: 19列
2       6193
3       5136
23      9303
24      9180
35      6193
36      6193
37      6193
38      6193
47       635
1328    6193
1329    6193
1330    6193
1331    5136
1332    5136
1333    5136
1339    5136
1340    5136
1341    5136
1342    5136
dtype: int64
数值列数量: 1453
非数值列数量: 2
缺失值填充完成
转换列 24 中的百分数为数值...


In [None]:
X_test.to_csv('X_test.csv',index=False)
X_train.to_csv('X_train.csv', index=False)
pd.DataFrame(y, columns=['target']).to_csv('y.csv', index=False)

In [None]:
# ============================================================================
# 最终输出：不同线性模型的性能评估表格
# ============================================================================
# 注意：所有评估指标都基于原始房价水平值的MAE

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# 尝试导入display函数（用于Jupyter notebook）
try:
    from IPython.display import display, HTML
except ImportError:
    display = print
    HTML = lambda x: print(x)

print("="*80)
print("开始训练和评估多个线性模型...")
print("="*80)

# 准备数据：使用原始价格（不是log转换后的）
y_train_original = train_df['price'].values

# 检查测试集是否有价格列
has_test_price = 'price' in test_df.columns
if has_test_price:
    y_test_original = test_df['price'].values
    print("测试集包含价格列，将使用测试集计算样本外MAE")
else:
    print("测试集不包含价格列，将使用训练集的20%作为验证集计算样本外MAE")
    from sklearn.model_selection import train_test_split
    # 使用训练集的20%作为验证集
    train_indices, val_indices = train_test_split(
        np.arange(len(train_df)), 
        test_size=0.2, 
        random_state=42
    )
    X_train_split = X_train_use.iloc[train_indices]
    X_val_split = X_train_use.iloc[val_indices]
    y_train_split_original = y_train_original[train_indices]
    y_test_original = y_train_original[val_indices]
    # 更新用于训练的索引
    train_split_mask = np.zeros(len(train_df), dtype=bool)
    train_split_mask[train_indices] = True

# 用于训练的目标变量（log转换）
y_train_log = np.log(y_train_original)

# 存储所有模型及其结果
models = {}
results = {}

# 1. OLS模型
print("\n[1/2] 训练OLS模型...")
ols_model = LinearRegression()
ols_model.fit(X_train_use, y_train_log)
models['OLS'] = ols_model
print("   ✓ OLS模型训练完成")

# 2. Ridge模型（使用网格搜索选择最佳alpha）
print("\n[2/2] 训练Ridge模型...")
# 将DataFrame转换为numpy数组，避免序列化问题
X_train_array = X_train_use.values if isinstance(X_train_use, pd.DataFrame) else X_train_use
y_train_array = y_train_log.values if isinstance(y_train_log, pd.Series) else y_train_log

# 使用网格搜索寻找最佳alpha
alphas_ridge = np.logspace(-4, 4, 20)
ridge_grid = GridSearchCV(Ridge(max_iter=2000), 
                          param_grid={'alpha': alphas_ridge},
                          cv=5, 
                          scoring='neg_mean_absolute_error',
                          n_jobs=1,
                          verbose=False)
ridge_grid.fit(X_train_array, y_train_array)
best_alpha_ridge = ridge_grid.best_params_['alpha']
ridge_model = Ridge(alpha=best_alpha_ridge, max_iter=2000)
ridge_model.fit(X_train_use, y_train_log)
models['Ridge'] = ridge_model
print(f"   ✓ Ridge模型训练完成 (最优alpha={best_alpha_ridge:.6f})")

print("\n" + "="*80)
print("所有模型训练完成！")
print("="*80)

# 定义评估函数
def evaluate_model(model, X_train, X_test, y_train_original, y_test_original, y_train_log):
    """
    评估模型，返回样本内、样本外和交叉验证的MAE（基于原始价格水平值）
    """
    # 样本内预测
    y_train_pred_log = model.predict(X_train)
    y_train_pred = np.exp(y_train_pred_log)
    mae_in_sample = mean_absolute_error(y_train_original, y_train_pred)
    
    # 样本外预测
    y_test_pred_log = model.predict(X_test)
    y_test_pred = np.exp(y_test_pred_log)
    mae_out_sample = mean_absolute_error(y_test_original, y_test_pred)
    
    # 交叉验证（使用5折）
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    X_train_array = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
    for train_idx, val_idx in kfold.split(X_train_array):
        X_train_fold = X_train_array[train_idx]
        X_val_fold = X_train_array[val_idx]
        y_train_fold_log = y_train_log[train_idx]
        y_val_fold_original = y_train_original[val_idx]
        
        # 训练模型
        model_class = type(model)
        model_params = model.get_params()
        temp_model = model_class(**model_params)
        temp_model.fit(X_train_fold, y_train_fold_log)
        
        # 预测并计算MAE
        y_val_pred_log = temp_model.predict(X_val_fold)
        y_val_pred = np.exp(y_val_pred_log)
        mae_cv = mean_absolute_error(y_val_fold_original, y_val_pred)
        cv_scores.append(mae_cv)
    
    mae_cross_validation = np.mean(cv_scores)
    
    return mae_in_sample, mae_out_sample, mae_cross_validation

# 评估所有模型
print("\n开始评估模型性能...")
for model_name, model in models.items():
    print(f"  评估{model_name}模型...")
    if has_test_price:
        # 使用原始的训练集和测试集
        results[model_name] = evaluate_model(model, X_train_use, X_test_use, 
                                              y_train_original, y_test_original, y_train_log)
    else:
        # 使用划分后的训练集和验证集
        # 注意：对于样本内评估，我们仍然使用完整的训练集（这是模型实际训练的）
        # 对于样本外评估，使用验证集
        results[model_name] = evaluate_model(model, X_train_use, X_val_split, 
                                              y_train_original, y_test_original, y_train_log)

# 找出最佳线性模型（基于样本外MAE最小）
best_model_name = min(results.keys(), key=lambda x: results[x][1])
print(f"\n最佳线性模型（基于样本外MAE）: {best_model_name}")

# 确定"其他模型"（从OLS和Ridge中选择不是最佳的那个）
other_model_name = 'Ridge' if best_model_name == 'OLS' else 'OLS'

# 构建结果表格（格式与图片中一致）
table_data = {
    '指标': ['OLS', 'Ridge', '最佳线性模型', '其他模型（非必需）'],
    '样本内': [
        round(results['OLS'][0], 2),
        round(results['Ridge'][0], 2),
        round(results[best_model_name][0], 2),
        round(results[other_model_name][0], 2)
    ],
    '样本外': [
        round(results['OLS'][1], 2),
        round(results['Ridge'][1], 2),
        round(results[best_model_name][1], 2),
        round(results[other_model_name][1], 2)
    ],
    '交叉验证': [
        round(results['OLS'][2], 2),
        round(results['Ridge'][2], 2),
        round(results[best_model_name][2], 2),
        round(results[other_model_name][2], 2)
    ]
}

results_df = pd.DataFrame(table_data)

# 设置pandas显示选项
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# 打印表格（格式化的输出）
print("\n" + "="*80)
print("模型性能评估表（基于原始房价水平值的MAE）")
print("="*80)
print("\n" + results_df.to_string(index=False))
print("\n备注: 度量应基于原始的房价或租金\"水平值\"的 MAE。")
print("="*80)

# 显示表格（在Jupyter中更美观）
print("\n")
display(results_df)

# 保存表格为CSV
results_df.to_csv('模型性能评估表.csv', index=False, encoding='utf-8-sig')
print("\n✓ 表格已保存为: 模型性能评估表.csv")

# 生成更美观的HTML格式表格
results_df_html = results_df.to_html(index=False, classes='table table-striped', table_id='performance_table')
html_content = f'''
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>模型性能评估表</title>
    <style>
        body {{
            font-family: "Microsoft YaHei", "SimHei", Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }}
        .container {{
            max-width: 800px;
            margin: 0 auto;
            background-color: white;
            padding: 30px;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }}
        h2 {{
            text-align: center;
            color: #333;
            margin-bottom: 30px;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px auto;
            font-size: 14px;
        }}
        th, td {{
            border: 1px solid #ddd;
            padding: 12px;
            text-align: center;
        }}
        th {{
            background-color: #4CAF50;
            color: white;
            font-weight: bold;
        }}
        tr:nth-child(even) {{
            background-color: #f9f9f9;
        }}
        tr:hover {{
            background-color: #f5f5f5;
        }}
        .note {{
            margin-top: 30px;
            font-style: italic;
            color: #666;
            text-align: left;
            padding: 15px;
            background-color: #f9f9f9;
            border-left: 4px solid #4CAF50;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h2>模型性能评估表</h2>
        {results_df_html}
        <div class="note">
            <p><strong>备注:</strong> 度量应基于原始的房价或租金"水平值"的 MAE。</p>
        </div>
    </div>
</body>
</html>
'''
with open('模型性能评估表.html', 'w', encoding='utf-8') as f:
    f.write(html_content)
print("✓ 表格已保存为HTML格式: 模型性能评估表.html")

# 打印详细的模型信息
print("\n" + "="*80)
print("详细模型信息:")
print("="*80)
print(f"最佳线性模型: {best_model_name}")
print(f"  样本内 MAE: {results[best_model_name][0]:.2f}")
print(f"  样本外 MAE: {results[best_model_name][1]:.2f}")
print(f"  交叉验证 MAE: {results[best_model_name][2]:.2f}")
print("\n所有模型性能对比:")
for model_name in ['OLS', 'Ridge']:
    if model_name in results:
        print(f"  {model_name:12s}: 样本内={results[model_name][0]:8.2f}, "
              f"样本外={results[model_name][1]:8.2f}, CV={results[model_name][2]:8.2f}")
print("="*80)


开始训练和评估多个线性模型...


NameError: name 'train_df' is not defined

In [None]:
# ============================================================================
# 输出Ridge模型的预测结果（输出文件名保持不变）
# ============================================================================

print("="*80)
print("生成Ridge模型预测结果...")
print("="*80)

# 确保Ridge模型已经训练
if 'Ridge' not in models:
    print("错误：Ridge模型尚未训练，请先运行Cell 13")
else:
    # 使用Ridge模型进行预测（注意：目标变量是log转换后的）
    y_test_pred_log = models['Ridge'].predict(X_test_use)
    
    # 将预测结果转换回原始价格（指数变换）
    y_test_pred = np.exp(y_test_pred_log)
    
    # 创建预测结果DataFrame
    lasso_output_df = pd.DataFrame({
        'ID': range(len(test_df)),
        'Price': y_test_pred
    })
    
    # 保存为CSV文件（保持原有输出文件名不变）
    output_filename = 'lasso_predictions.csv'
    lasso_output_df.to_csv(output_filename, index=False, encoding='utf-8-sig')
    
    print(f"\n✓ Ridge模型预测完成")
    print(f"  预测样本数: {len(lasso_output_df)}")
    print(f"  预测价格范围: [{lasso_output_df['Price'].min():.2f}, {lasso_output_df['Price'].max():.2f}]")
    print(f"  预测价格均值: {lasso_output_df['Price'].mean():.2f}")
    print(f"\n✓ 预测结果已保存为: {output_filename}")
    print("\n" + "="*80)
    
    # 显示前几行预览
    print("\n预测结果预览（前10行）:")
    print(lasso_output_df.head(10).to_string(index=False))
    print("="*80)


生成LASSO模型预测结果...

✓ LASSO模型预测完成
  预测样本数: 34017
  预测价格范围: [1177294.79, 57289476.77]
  预测价格均值: 1576859.40

✓ 预测结果已保存为: lasso_predictions.csv


预测结果预览（前10行）:
 ID        Price
  0 5.182409e+06
  1 1.431601e+06
  2 1.998190e+06
  3 1.489825e+06
  4 1.701608e+06
  5 1.492562e+06
  6 2.299132e+06
  7 1.452270e+06
  8 1.460930e+06
  9 1.620778e+06
