In [1]:
import sys

assert sys.version_info >= (3, 7)

In [2]:
from packaging import version
import sklearn

assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

# 获取数据

下载训练集数据

In [7]:
import pandas as pd
import requests
from io import BytesIO
import chardet

File_url = {
    'train price': 'https://media.githubusercontent.com/media/Xieanon/my-Homework/refs/heads/master/ruc_Class25Q2_train_price.csv',
    'train rent': 'https://media.githubusercontent.com/media/Xieanon/my-Homework/refs/heads/master/ruc_Class25Q2_train_rent.csv',
    'test price': 'https://media.githubusercontent.com/media/Xieanon/my-Homework/refs/heads/master/ruc_Class25Q2_test_price.csv',
    'test rent': 'https://media.githubusercontent.com/media/Xieanon/my-Homework/refs/heads/master/ruc_Class25Q2_test_rent.csv'
}

def read_csv_auto_encoding(url):
    """自动检测编码并读取CSV"""
    response = requests.get(url)
    response.raise_for_status()
    
    # 检测编码
    encoding_detected = chardet.detect(response.content)
    encoding = encoding_detected['encoding']
    confidence = encoding_detected['confidence']
    
    print(f"检测到编码: {encoding} (置信度: {confidence:.2f})")
    
    # 使用检测到的编码读取
    data = BytesIO(response.content)
    df = pd.read_csv(data, encoding=encoding)
    return df

# 安装chardet: pip install chardet
try:
    train_price = read_csv_auto_encoding(File_url['train price'])
    train_rent=read_csv_auto_encoding(File_url['train rent'])
    test_price=read_csv_auto_encoding(File_url['test price'])
    test_rent=read_csv_auto_encoding(File_url['test rent'])
except Exception as e:
    print(f"读取失败: {e}")

检测到编码: UTF-8-SIG (置信度: 1.00)


  df = pd.read_csv(data, encoding=encoding)


检测到编码: UTF-8-SIG (置信度: 1.00)


  df = pd.read_csv(data, encoding=encoding)


检测到编码: UTF-8-SIG (置信度: 1.00)


  df = pd.read_csv(data, encoding=encoding)


检测到编码: UTF-8-SIG (置信度: 1.00)


检查训练集数据

train_price数据异常值清理

In [8]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns


In [9]:
train_price['Price']=pd.to_numeric(train_price['Price'],errors='coerce')
train_price['建筑面积']=train_price['建筑面积'].str.replace('㎡','')
train_price['建筑面积']=pd.to_numeric(train_price['建筑面积'],errors='coerce')
train_price['套内面积']=train_price['套内面积'].str.replace('㎡','')
train_price['套内面积']=pd.to_numeric(train_price['套内面积'],errors='coerce')
train_price['lon']=pd.to_numeric(train_price['lon'],errors='coerce')
train_price['lat']=pd.to_numeric(train_price['lat'],errors='coerce')


In [10]:
def clean_housing_outliers(df,show_analysis=True):
    print(f'原始数据量：{len(df)}')
    df_clean=df.copy()
    outlier_mask=pd.Series([False]*len(df_clean))

    # 1. 价格异常值清理
    if 'Price' in df_clean.columns:
        print("\n=== 处理价格异常值 ===")

        #使用对数变换检测异常值
        price_log=np.log1p(df_clean['Price'])
        z_scores=np.abs(stats.zscore(price_log))
        price_outliers=z_scores>3

        # 业务逻辑：价格在合理范围内
        price_business_outliers=(df_clean['Price']<100000)|(df_clean['Price']>100000000)

        price_outlier_mask=price_outliers|price_business_outliers
        print(f"价格异常值数量: {price_outlier_mask.sum()}")

        outlier_mask=outlier_mask|price_outlier_mask
        
    # 2. 建筑面积异常值清理
    if '建筑面积' in df_clean.columns:
        print("\n=== 处理建筑面积异常值 ===")
        area_data=df_clean['建筑面积']
        
        if len(area_data)>0:
            Q1=area_data.quantile(0.25)
            Q3=area_data.quantile(0.75)
            IQR=Q3-Q1

            if IQR>0:
                lower_bound=max(0,Q1-1.5*IQR)
                upper_bound=Q3+1.5*IQR

                area_outliers=(df_clean['建筑面积']<lower_bound)|(df_clean['建筑面积']>upper_bound)

                area_business_outliers=(df_clean['建筑面积']<10)|(df_clean['建筑面积']>1000)

                area_outlier_mask=area_outliers|area_business_outliers
                print(f'建筑面积异常值数量：{area_outlier_mask.sum()}')

                outlier_mask=outlier_mask|area_outlier_mask

    #3. 经纬度异常值清理（中国大致范围）
    if all(col in df_clean.columns for col in ['lon','lat']):
        print("\n=== 处理经纬度异常值 ===")

        #中国大致范围
        lon_outliers=(df_clean['lon']<73)|(df_clean['lon']>135)
        lat_outliers=(df_clean['lat']<18)|(df_clean['lat']>54)

        coord_outliers=lon_outliers|lat_outliers
        print(f'经纬度异常值数量：{coord_outliers.sum()}')

        outlier_mask=outlier_mask|coord_outliers
    
    # 4. 套内面积逻辑检查
    if all(col in df_clean.columns for col in ['套内面积', '建筑面积']):
        print("\n=== 处理逻辑矛盾异常值 ===")

        #套内面积不能大于建筑面积
        logic_outliers=df_clean['套内面积']>df_clean['建筑面积']
        print(f'套内面积>建筑面积的异常值：{logic_outliers.sum()}')

        outlier_mask=outlier_mask|logic_outliers

    # 5. 价格-面积关系异常值
    if all(col in df_clean.columns for col in ['Price', '建筑面积']):
        print("\n=== 处理价格-面积关系异常值 ===")

        #计算单价
        unit_price=df_clean['Price']/df_clean['建筑面积']

        #使用对数变换检测异常值
        unit_price_log=np.log1p(unit_price)
        z_scores_unit=np.abs(stats.zscore(unit_price_log))

        unit_price_outliers=z_scores_unit>3

        #业务逻辑：单价在合理范围内
        unit_business_outliers=(unit_price<1000)|(unit_price>200000)

        unit_outlier_mask=unit_price_outliers|unit_business_outliers
        print(f'单价异常值数量：{unit_outlier_mask.sum()}')

        outlier_mask=outlier_mask|unit_outlier_mask

    total_outliers=outlier_mask.sum()
    df_cleaned=df_clean[~outlier_mask]
        
    print(f"\n=== 清理完成 ===")
    print(f"原始数据量: {len(df)}")
    print(f"最终数据量: {len(df_cleaned)}")
    print(f"删除记录数: {total_outliers}")
    print(f"删除比例: {total_outliers/len(df)*100:.2f}%")
        
    return df_cleaned
        
        



        
        

In [11]:
cleaned_TP=clean_housing_outliers(train_price)

原始数据量：103871

=== 处理价格异常值 ===
价格异常值数量: 466

=== 处理建筑面积异常值 ===
建筑面积异常值数量：4126

=== 处理经纬度异常值 ===
经纬度异常值数量：0

=== 处理逻辑矛盾异常值 ===
套内面积>建筑面积的异常值：0

=== 处理价格-面积关系异常值 ===
单价异常值数量：9

=== 清理完成 ===
原始数据量: 103871
最终数据量: 99596
删除记录数: 4275
删除比例: 4.12%


# 房价预测模型

首先给原训练集再分出测试集和训练集

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.cluster import KMeans

创建数据处理管道

In [13]:
#城市，建筑结构，装修情况，配备电梯，房屋用途，交易权属，产权所属，年份，供水，供电OneHotEncoder
cat_pipeline=make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

In [14]:
#对区域、板块、产权描述、物业类别频率编码
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """频率编码器 - 用于处理高基数分类变量"""
    
    def __init__(self, normalize=True, handle_unknown='median'):
        self.normalize = normalize
        self.handle_unknown = handle_unknown
        self.frequency_maps = {}
        self.default_values = {}
        self.cat_columns = None
        self.feature_names_in_ = None
        
    def fit(self, X, y=None):
        # 将输入转换为 DataFrame（如果是数组）
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
            # 如果没有列名，创建默认列名
            if hasattr(X, 'columns') and all(isinstance(col, (int, np.integer)) for col in X.columns):
                X.columns = [f'feature_{i}' for i in range(X.shape[1])]
        
        # 保存特征名称
        self.feature_names_in_ = X.columns.tolist()
        
        # 自动识别分类列
        self.cat_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
        
        # 如果没有找到分类列，使用所有列
        if not self.cat_columns:
            self.cat_columns = X.columns.tolist()
            print(f"警告：未找到分类列，将对所有列进行频率编码: {self.cat_columns}")
        
        for col in self.cat_columns:
            if self.normalize:
                # 计算频率（归一化）
                frequencies = X[col].value_counts(normalize=True)
                self.frequency_maps[col] = frequencies.to_dict()
            else:
                # 计算计数
                counts = X[col].value_counts()
                self.frequency_maps[col] = counts.to_dict()
            
            # 设置默认值用于处理未知类别
            if self.handle_unknown == 'median':
                self.default_values[col] = np.median(list(self.frequency_maps[col].values()))
            elif self.handle_unknown == 'min':
                self.default_values[col] = min(self.frequency_maps[col].values())
            else:  # zero
                self.default_values[col] = 0
        
        return self
    
    def transform(self, X):
        # 将输入转换为 DataFrame（如果是数组）
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_names_in_)
        
        X_copy = X.copy()
        
        for col in self.cat_columns:
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].map(self.frequency_maps[col])
                X_copy[col] = X_copy[col].fillna(self.default_values[col])
        
        return X_copy

    def get_feature_names_out(self, input_features=None):
        """返回输出特征名称"""
        if input_features is None:
            if self.feature_names_in_ is not None:
                return np.array(self.feature_names_in_)
            elif self.cat_columns is not None:
                return np.array(self.cat_columns)
            else:
                return np.array([])
        else:
            return np.array(input_features)

cat_frequency=make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    FrequencyEncoder(normalize=True,handle_unknown='median')
)



In [15]:
#经纬度聚类管道
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=11, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, n_init=10,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
    
    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

cluster_simil=ClusterSimilarity(n_clusters=11,gamma=1.,random_state=42)

In [16]:
#房屋户型处理管道
import re
class HouseTypeEncoder(BaseEstimator, TransformerMixin):
    """房屋户型编码器 - 将'3室1厅1厨1卫'格式编码为四维数值特征"""
    
    def __init__(self):
        self.most_frequent = None
        self.pattern = re.compile(r'(\d+)室(\d+)厅(\d+)厨(\d+)卫')
        
    def fit(self, X, y=None):
        # 找到所有符合格式的户型
        valid_house_types = []
        for house_type in X.iloc[:, 0] if hasattr(X, 'iloc') else X:
            if pd.isna(house_type):
                continue
                
            match = self.pattern.match(str(house_type))
            if match:
                rooms = list(map(int, match.groups()))
                valid_house_types.append(rooms)
        
        # 计算每个维度的众数
        if valid_house_types:
            valid_array = np.array(valid_house_types)
            self.most_frequent = [
                np.bincount(valid_array[:, i]).argmax() for i in range(4)
            ]
        else:
            # 如果没有有效数据，使用默认值
            self.most_frequent = [2, 1, 1, 1]  # 最常见的户型: 2室1厅1厨1卫
        
        return self
    
    def transform(self, X):
        # 确保输入是DataFrame或可以迭代的形式
        if hasattr(X, 'iloc'):
            house_types = X.iloc[:, 0]
        else:
            house_types = X
            
        result = []
        
        for house_type in house_types:
            if pd.isna(house_type):
                # 缺失值用众数填充
                result.append(self.most_frequent)
                continue
                
            house_type_str = str(house_type)
            match = self.pattern.match(house_type_str)
            
            if match:
                # 提取四个数值
                rooms = list(map(int, match.groups()))
                result.append(rooms)
            else:
                # 不符合格式的用众数填充
                result.append(self.most_frequent)
        
        # 转换为DataFrame，列名为室、厅、厨、卫
        result_df = pd.DataFrame(result, columns=['室', '厅', '厨', '卫'])
        
        # 如果输入是DataFrame，保持相同的索引
        if hasattr(X, 'index'):
            result_df.index = X.index
            
        return result_df
    
    def get_feature_names_out(self, input_features=None):
        """返回输出特征名称"""
        return np.array(['室', '厅', '厨', '卫'])

house_type_pipeline=make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    HouseTypeEncoder()
)

In [17]:
#所在楼层管道
def floor_level_extractor(X):
    """
    提取楼层信息括号外的内容
    例如: "中楼层 (共5层)" -> "中楼层"
    """
    # 确保输入是DataFrame
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    
    # 获取列名
    if X.shape[1] == 1:
        col_name = X.columns[0]
    else:
        # 如果有多个列，使用第一个列
        col_name = X.columns[0]
    
    result = []
    
    for floor_info in X[col_name]:
        if pd.isna(floor_info):
            # 缺失值保留为None
            result.append(None)
            continue
            
        floor_str = str(floor_info)
        
        # 使用正则表达式提取括号前的内容
        match = re.match(r'^([^(]+)', floor_str)
        
        if match:
            # 提取括号前的楼层信息并去除首尾空格
            floor_level = match.group(1).strip()
            result.append(floor_level)
        else:
            # 如果不符合格式，保留原始值
            result.append(floor_str)
    
    # 返回DataFrame，保持相同的列名
    return pd.DataFrame(result, columns=[col_name], index=X.index)

floor_level_pipeline=make_pipeline(
    FunctionTransformer(floor_level_extractor,feature_names_out='one-to-one'),
    OneHotEncoder(handle_unknown='ignore')
)

In [18]:
#房屋朝向处理管道
def direction_ext(X):
    """
    房屋朝向编码函数
    将房屋朝向字符串编码为四维向量 [东, 南, 西, 北]
    每有一个方向字符就记为1，例如"东南" -> [1,1,0,0]
    """
    # 确保输入是DataFrame
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    
    # 获取列名
    if X.shape[1] == 1:
        col_name = X.columns[0]
    else:
        col_name = X.columns[0]
    
    result = []
    
    for direction_str in X[col_name]:
        if pd.isna(direction_str):
            # 缺失值用[0,0,0,0]填充
            result.append([0, 0, 0, 0])
            continue
            
        # 转换为字符串并去除空格
        direction_str = str(direction_str).replace(" ", "")
        
        # 初始化四个方向的标志
        east = 0
        south = 0
        west = 0
        north = 0
        
        # 检查每个方向是否存在
        if '东' in direction_str:
            east = 1
        if '南' in direction_str:
            south = 1
        if '西' in direction_str:
            west = 1
        if '北' in direction_str:
            north = 1
            
        result.append([east, south, west, north])
    
    # 返回DataFrame，列名为四个方向
    return pd.DataFrame(result, columns=['东', '南', '西', '北'], index=X.index)

def direction_name(transformer=None,input_features=None):
    """
    返回房屋朝向编码后的特征名称
    """
    return np.array(['东', '南', '西', '北'])

direction_pipeline=make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    FunctionTransformer(direction_ext,feature_names_out=direction_name)
)

In [19]:
#房屋优势特征管道
def house_advantage_ext(X):
    """
    房屋优势特征提取函数
    将房屋优势字符串编码为四维向量 [装修, 房本满五年, 地铁, 房本满两年]
    每有一个特征就记为1，例如"装修、房本满五年" -> [1,1,0,0]
    """
    # 确保输入是DataFrame
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    
    # 获取列名
    if X.shape[1] == 1:
        col_name = X.columns[0]
    else:
        col_name = X.columns[0]
    
    # 定义要提取的特征
    target_features = ['装修', '房本满五年', '地铁', '房本满两年']
    
    result = []
    
    for advantage_str in X[col_name]:
        if pd.isna(advantage_str):
            # 缺失值用[0,0,0,0]填充
            result.append([0, 0, 0, 0])
            continue
            
        # 转换为字符串
        advantage_str = str(advantage_str)
        
        # 初始化四个特征的标志
        renovation = 0          # 装修
        five_years = 0          # 房本满五年
        subway = 0              # 地铁
        two_years = 0           # 房本满两年
        
        # 检查每个特征是否存在
        if '装修' in advantage_str:
            renovation = 1
        if '房本满五年' in advantage_str:
            five_years = 1
        if '地铁' in advantage_str:
            subway = 1
        if '房本满两年' in advantage_str:
            two_years = 1
            
        result.append([renovation, five_years, subway, two_years])
    
    # 返回DataFrame，列名为四个特征
    return pd.DataFrame(result, columns=target_features, index=X.index)

def house_advantage_name(transformer=None,input_features=None):
    """
    返回房屋优势编码后的特征名称
    """
    return np.array(['装修', '房本满五年', '地铁', '房本满两年'])

house_advantage_pipeline=make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    FunctionTransformer(house_advantage_ext,feature_names_out=house_advantage_name)
)


In [20]:
def green_per_ext(X):
    """
    提取绿化率数值的函数
    将30%转换为0.3，将字符串转换为浮点数
    """
    # 确保输入是DataFrame
    X_df = pd.DataFrame(X) if not isinstance(X, pd.DataFrame) else X
    
    # 找到绿化率列（可能列名为"绿 化 率"）
    green_col = None
    for col in X_df.columns:
        if '绿化' in col or '绿 化' in col:
            green_col = col
            break
    
    if green_col is None:
        # 如果没有找到绿化率列，返回原始数据
        return X
    
    # 复制数据避免修改原始数据
    result = X_df.copy()
    
    def extract_green_value(x):
        """提取单个绿化率值的函数"""
        if pd.isna(x):
            return np.nan
        
        # 转换为字符串处理
        x_str = str(x).strip()
        
        # 如果是百分比形式，提取数字并除以100
        if '%' in x_str:
            # 提取数字部分
            num_str = ''.join(filter(lambda c: c.isdigit() or c == '.', x_str))
            if num_str:
                return float(num_str) / 100
            else:
                return np.nan
        else:
            # 尝试直接转换为浮点数
            try:
                return float(x_str)
            except (ValueError, TypeError):
                return np.nan
    
    # 应用转换函数
    result[green_col] = result[green_col].apply(extract_green_value)
    
    return result

# 创建pipeline
green_per_pipeline = make_pipeline(
    FunctionTransformer(
        green_per_ext, 
        feature_names_out='one-to-one'
    ),
    SimpleImputer(strategy='median')
)

In [21]:
#容积率管道
from sklearn.preprocessing import RobustScaler
volume_ratio_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),  # 用中位数填充缺失值
    RobustScaler()  # 使用四分位数范围进行缩放
)


In [22]:
#物业费管道
def pp_mgmt_ext(X):
    """
    提取物业费列中每格出现的第一个物业费数值
    
    参数:
    X: 输入数据，可以是DataFrame、Series或数组
    
    返回:
    处理后的数据，包含提取的物业费数值
    """
    # 确保输入是DataFrame
    if not isinstance(X, pd.DataFrame):
        # 如果是Series，转换为DataFrame
        if hasattr(X, 'name'):
            X = pd.DataFrame(X)
        else:
            # 如果是数组，假设列名为'mgmt_fee'
            X = pd.DataFrame(X, columns=['mgmt_fee'])
    
    # 找到物业费列（可能列名为"物业费"、"物 业 费"、"物业费"等）
    mgmt_fee_col = None
    possible_names = ['物业费', '物 业 费', 'mgmt_fee', '物业费用']
    
    for col in X.columns:
        for possible_name in possible_names:
            if possible_name in col:
                mgmt_fee_col = col
                break
        if mgmt_fee_col:
            break
    
    if mgmt_fee_col is None:
        # 如果没有找到物业费列，返回原始数据的第一列
        mgmt_fee_col = X.columns[0]
        print(f"警告：未找到物业费列，使用第一列 '{mgmt_fee_col}'")
    
    # 复制数据避免修改原始数据
    result = X.copy()
    
    def extract_mgmt_fee(value):
        """提取单个物业费数值的函数"""
        if pd.isna(value):
            return np.nan
        
        # 转换为字符串处理
        value_str = str(value).strip()
        
        # 如果已经是数字，直接返回
        try:
            return float(value_str)
        except (ValueError, TypeError):
            pass
        
        # 使用正则表达式匹配第一个数字（包括小数）
        # 匹配模式：数字开头，可能包含小数点
        pattern = r'(\d+\.?\d*)'
        match = re.search(pattern, value_str)
        
        if match:
            try:
                return float(match.group(1))
            except (ValueError, TypeError):
                return np.nan
        else:
            return np.nan
    
    # 应用转换函数
    result[mgmt_fee_col] = result[mgmt_fee_col].apply(extract_mgmt_fee)
    
    return result

# 创建pipeline
mgmt_fee_pipeline = make_pipeline(
    FunctionTransformer(
        pp_mgmt_ext, 
        feature_names_out='one-to-one'
    ),
    SimpleImputer(strategy='median'),
    StandardScaler()
)

In [23]:
log_fee_pipeline=make_pipeline(
    FunctionTransformer(pp_mgmt_ext,feature_names_out='one-to-one'),
    SimpleImputer(strategy='median'),
    FunctionTransformer(np.log1p,feature_names_out='one-to-one'),
    StandardScaler()
)

In [24]:
#燃气管道
def gas_ext(X):
    """
    提取燃气费列中每格出现的第一个燃气费数值
    
    参数:
    X: 输入数据，可以是DataFrame、Series或数组
    
    返回:
    处理后的数据，包含提取的燃气费数值
    """
    # 确保输入是DataFrame
    if not isinstance(X, pd.DataFrame):
        # 如果是Series，转换为DataFrame
        if hasattr(X, 'name'):
            X = pd.DataFrame(X)
        else:
            # 如果是数组，假设列名为'gas_fee'
            X = pd.DataFrame(X, columns=['gas_fee'])
    
    # 找到燃气费列（可能列名为"燃气费"、"燃气"、"gas"等）
    gas_fee_col = None
    possible_names = ['燃气费', '燃气', 'gas', '燃气费用']
    
    for col in X.columns:
        for possible_name in possible_names:
            if possible_name in col:
                gas_fee_col = col
                break
        if gas_fee_col:
            break
    
    if gas_fee_col is None:
        # 如果没有找到燃气费列，返回原始数据的第一列
        gas_fee_col = X.columns[0]
        print(f"警告：未找到燃气费列，使用第一列 '{gas_fee_col}'")
    
    # 复制数据避免修改原始数据
    result = X.copy()
    
    def extract_gas_fee(value):
        """提取单个燃气费数值的函数"""
        if pd.isna(value):
            return np.nan
        
        # 转换为字符串处理
        value_str = str(value).strip()
        
        # 如果已经是数字，直接返回
        try:
            return float(value_str)
        except (ValueError, TypeError):
            pass
        
        # 使用正则表达式匹配第一个数字（包括小数）
        # 匹配模式：数字开头，可能包含小数点
        pattern = r'(\d+\.?\d*)'
        match = re.search(pattern, value_str)
        
        if match:
            try:
                return float(match.group(1))
            except (ValueError, TypeError):
                return np.nan
        else:
            return np.nan
    
    # 应用转换函数
    result[gas_fee_col] = result[gas_fee_col].apply(extract_gas_fee)
    
    return result

# 创建pipeline
gas_fee_pipeline = make_pipeline(
    FunctionTransformer(
        gas_ext, 
        feature_names_out='one-to-one'
    ),
    SimpleImputer(strategy='median'),
    StandardScaler()
)

In [25]:
log_gas_fee_pipeline = make_pipeline(
    FunctionTransformer(
        gas_ext, 
        feature_names_out='one-to-one'
    ),
    SimpleImputer(strategy='median'),
    FunctionTransformer(np.log1p,feature_names_out='one-to-one'),
    StandardScaler()
)

In [26]:
#处理面积
num_pipeline=make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

In [27]:
#面积log管道

log_area_pipeline=make_pipeline(
    SimpleImputer(strategy='median'),
    FunctionTransformer(np.log,feature_names_out='one-to-one'),
    StandardScaler()
)


In [28]:
#建立集成管道
from sklearn.compose import ColumnTransformer

train_price_preprocessing=ColumnTransformer([
    ('area',num_pipeline,['建筑面积']),
    ('log_area',log_area_pipeline,['建筑面积']),
    ('绿化率',green_per_pipeline,['绿 化 率']),
    ('容积率',volume_ratio_pipeline,['容 积 率']),
    ('物业费',mgmt_fee_pipeline,['物 业 费']),
    ('log物业费',log_fee_pipeline,['物 业 费']),
    ('燃气费',gas_fee_pipeline,['燃气费']),
    ('log燃气费',log_gas_fee_pipeline,['燃气费']),
    ('cat',cat_pipeline,['城市','建筑结构','装修情况','配备电梯','房屋用途','交易权属','产权所属','供水','供电']),
    ('fq',cat_frequency,['区域','板块','产权描述','物业类别']),
    ('geo',cluster_simil,['lon','lat']),
    ('type',house_type_pipeline,['房屋户型']),
    ('cat_floor',floor_level_pipeline,['所在楼层']),
    ('direction',direction_pipeline,['房屋朝向']),
    ('advantage',house_advantage_pipeline,['房屋优势'])
])

In [75]:
train_price_preprocessing.fit(X_train,y_train)
train_price_preprocessing.get_feature_names_out()

array(['area__建筑面积', 'log_area__建筑面积', '绿化率__绿 化 率', '容积率__容 积 率',
       '物业费__物 业 费', 'log物业费__物 业 费', '燃气费__燃气费', 'log燃气费__燃气费',
       'cat__城市_0', 'cat__城市_1', 'cat__城市_2', 'cat__城市_3', 'cat__城市_4',
       'cat__城市_5', 'cat__城市_6', 'cat__城市_7', 'cat__城市_8', 'cat__城市_9',
       'cat__城市_10', 'cat__城市_11', 'cat__建筑结构_未知结构', 'cat__建筑结构_框架结构',
       'cat__建筑结构_混合结构', 'cat__建筑结构_砖木结构', 'cat__建筑结构_砖混结构',
       'cat__建筑结构_钢混结构', 'cat__建筑结构_钢结构', 'cat__装修情况_其他', 'cat__装修情况_毛坯',
       'cat__装修情况_简装', 'cat__装修情况_精装', 'cat__配备电梯_无', 'cat__配备电梯_有',
       'cat__房屋用途_公寓', 'cat__房屋用途_公寓/住宅', 'cat__房屋用途_公寓/公寓',
       'cat__房屋用途_公寓（住宅）', 'cat__房屋用途_写字楼', 'cat__房屋用途_别墅',
       'cat__房屋用途_商业', 'cat__房屋用途_商业办公类', 'cat__房屋用途_商住两用',
       'cat__房屋用途_商务公寓', 'cat__房屋用途_商务型公寓', 'cat__房屋用途_四合院',
       'cat__房屋用途_底商', 'cat__房屋用途_新式里弄', 'cat__房屋用途_普通住宅',
       'cat__房屋用途_老公寓', 'cat__房屋用途_车库', 'cat__房屋用途_酒店式公寓',
       'cat__交易权属_一类经济适用房', 'cat__交易权属_二类经济适用房', 'cat__交易权属_动迁安置房',
       'cat__交易权属_售后公

In [29]:
#ols_poly_feature_selection

from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

numeric_indices = [0,1,2,3,4,5,6,7]

poly_pipeline = ColumnTransformer([
    ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True), numeric_indices)
], remainder='passthrough')

ols_poly = Pipeline([
    ('preprocessing', train_price_preprocessing),
    ('poly', poly_pipeline),
    ('scaler', StandardScaler()),  # 重新标准化
    ('ols', LinearRegression())
])

In [30]:
#最普通的ols
simple_ols=Pipeline([
        ('preprocessing', train_price_preprocessing),
        ('ols', LinearRegression())
])

In [34]:
from sklearn.feature_selection import RFE, VarianceThreshold,SelectKBest,f_regression

# 更精细的特征选择策略
ols_advanced = Pipeline([
    ('preprocessing', train_price_preprocessing),
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    
    # 第一步：移除低方差特征
    ('variance_threshold', VarianceThreshold(threshold=0.01)),
    
    # 第二步：基于统计测试的特征选择
    ('univariate_selection', SelectKBest(score_func=f_regression, k=50)),
    
    # 第三步：基于模型的特征选择
    ('model_based_selection', SelectFromModel(
        LassoCV(cv=5, random_state=111),
        threshold='1.25*median'  # 更宽松的阈值
    )),
    
    ('ols', LinearRegression())
])

评估各种ols模型

In [98]:

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer

def evaluate_ols_model(model, X_train, X_test, y_train, y_test, model_name):
    """评估OLS模型"""
    
    # 训练集预测和评估
    train_pred = model.predict(X_train)
    train_mae = mean_absolute_error(np.exp(y_train), np.exp(train_pred))
    
    # 测试集预测和评估
    test_pred = model.predict(X_test)
    test_mae = mean_absolute_error(np.exp(y_test), np.exp(test_pred))

    def mae_original_scale(y_true_log, y_pred_log):
        """在原始价格尺度计算 MAE 的评分函数"""
        y_true_original = np.exp(y_true_log)
        y_pred_original = np.exp(y_pred_log)
        return mean_absolute_error(y_true_original, y_pred_original)
    
    # 6. 使用正确评分器进行交叉验证
    mae_scorer = make_scorer(mae_original_scale, greater_is_better=False)

    
    # 6折交叉验证
    cv_results = cross_validate(
        model, X_train, y_train,
        cv=6,
        scoring=mae_scorer,
        return_train_score=True,
        error_score='raise'
    )
    
    cv_train_mae = -cv_results['train_score'].mean()
    cv_test_mae = -cv_results['test_score'].mean()
    
    # 特征数量
    try:
        if hasattr(model, 'named_steps'):
            if 'feature_selection' in model.named_steps:
                n_features = model.named_steps['feature_selection'].transform(X_train).shape[1]
            else:
                n_features = X_train.shape[1]
        else:
            n_features = "N/A"
    except:
        n_features = "N/A"
    
    return {
        'Model': model_name,
        'In_sample_MAE': train_mae,
        'Out_of_sample_MAE': test_mae,
        'CV_MAE': cv_test_mae,
        'Number_of_Features': n_features
    }

# 准备数据
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_TP.drop('Price', axis=1),
    np.log(cleaned_TP['Price']),
    test_size=0.2,
    random_state=111
)

# 评估不同的OLS变体
models_to_evaluate = {
    'OLS_Basic': simple_ols,
    'OLS_Poly': ols_poly,
    'OLS_Advanced':ols_advanced
}

results = []
for name, model in models_to_evaluate.items():
    model.fit(X_train, y_train)
    result = evaluate_ols_model(model, X_train, X_test, y_train, y_test, name)
    results.append(result)

# 创建结果表格
results_df = pd.DataFrame(results)
print(results_df)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

          Model  In_sample_MAE  Out_of_sample_MAE         CV_MAE  \
0     OLS_Basic  550159.038026      556233.897415  550940.206710   
1      OLS_Poly  539620.142563      546830.372996  540624.875071   
2  OLS_Advanced  779643.660403      781670.751748  786201.319585   

   Number_of_Features  
0                  54  
1                  54  
2                  54  


         Model  In_sample_MAE  Out_of_sample_MAE         CV_MAE  \
  0     OLS_Basic  550159.038026      556233.897415  550940.206710   
  1      OLS_Poly  539620.142563      546830.372996  540624.875071   
  2  OLS_Advanced  779643.660403      781670.751748  786201.319585   

   Number_of_Features  
0                  54  
1                  54  
2                  54 
所以最好的ols是第二种ols_poly 

#### 接下来考虑lasso和ridge模型

In [52]:
from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV

poly_pipeline = ColumnTransformer([
    ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True), numeric_indices)
], remainder='passthrough')

# 1. Lasso 模型
lasso_model = Pipeline([
    ('preprocessing', train_price_preprocessing),
    ('poly', poly_pipeline),
    ('scaler', StandardScaler()),
    ('lasso', Lasso(alpha=0.1, random_state=111, max_iter=10000))
])

# 2. Ridge 模型
ridge_model = Pipeline([
    ('preprocessing', train_price_preprocessing),
    ('poly', poly_pipeline),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0, random_state=111))
])



In [53]:
from sklearn.model_selection import GridSearchCV

# Lasso 超参数调优
lasso_param_grid = {
    'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

lasso_grid = GridSearchCV(
    lasso_model,
    lasso_param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

# Ridge 超参数调优
ridge_param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

ridge_grid = GridSearchCV(
    ridge_model,
    ridge_param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

In [100]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """评估模型性能"""
    
    # 训练模型
    print(f"训练 {model_name}...")
    model.fit(X_train, y_train)
    
    # 预测
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    # 计算MAE（原始价格尺度）
    train_mae = mean_absolute_error(np.exp(y_train), np.exp(train_pred))
    test_mae = mean_absolute_error(np.exp(y_test), np.exp(test_pred))

    def mae_original_scale(y_true_log, y_pred_log):
        """在原始价格尺度计算 MAE 的评分函数"""
        y_true_original = np.exp(y_true_log)
        y_pred_original = np.exp(y_pred_log)
        return mean_absolute_error(y_true_original, y_pred_original)
    
    # 6. 使用正确评分器进行交叉验证
    mae_scorer = make_scorer(mae_original_scale, greater_is_better=False)

    
    # 交叉验证
    print(f"进行 {model_name} 交叉验证...")
    cv_results = cross_validate(
        model, X_train, y_train,
        cv=6,
        scoring=mae_scorer,
        return_train_score=True
    )
    
    cv_train_mae = -cv_results['train_score'].mean()
    cv_test_mae = -cv_results['test_score'].mean()
    
    # 获取最佳参数（如果是调优模型）
    best_params = {}
    if hasattr(model, 'best_params_'):
        best_params = model.best_params_
    elif hasattr(model, 'named_steps'):
        for step_name, step in model.named_steps.items():
            if hasattr(step, 'alpha_'):
                best_params[f'{step_name}_alpha'] = step.alpha_
    
    # 特征数量
    try:
        if hasattr(model, 'named_steps'):
            X_processed = model.named_steps['preprocessing'].transform(X_train)
            X_poly = model.named_steps['poly'].transform(X_processed)
            n_features = X_poly.shape[1]
        else:
            n_features = X_train.shape[1]
    except:
        n_features = "N/A"
    
    return {
        'Model': model_name,
        'In_sample_MAE': train_mae,
        'Out_of_sample_MAE': test_mae,
        'CV_MAE': cv_test_mae,
        'Number_of_Features': n_features,
        'Best_Params': best_params
    }

# 评估所有模型
models_to_evaluate = {
    'LASSO': lasso_model,
    'RIDGE': ridge_model,
    'LASSO_Grid': lasso_grid,
    'RIDGE_Grid': ridge_grid,
}

results = []
for name, model in models_to_evaluate.items():
    result = evaluate_model(model, X_train, X_test, y_train, y_test, name)
    results.append(result)

# 创建结果表格
results_df = pd.DataFrame(results)
print(results_df)

训练 LASSO...




进行 LASSO 交叉验证...




训练 RIDGE...




进行 RIDGE 交叉验证...




训练 LASSO_Grid...
Fitting 5 folds for each of 6 candidates, totalling 30 fits




进行 LASSO_Grid 交叉验证...
Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




训练 RIDGE_Grid...
Fitting 5 folds for each of 7 candidates, totalling 35 fits




进行 RIDGE_Grid 交叉验证...
Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




        Model  In_sample_MAE  Out_of_sample_MAE         CV_MAE  \
0       LASSO  829531.314368      836935.642861  829642.938084   
1       RIDGE  539559.827728      546928.487785  540535.015624   
2  LASSO_Grid  549402.801752      555203.915686  550135.688535   
3  RIDGE_Grid  539619.969950      546830.423981  540624.599136   

   Number_of_Features              Best_Params  
0                 135                       {}  
1                 135                       {}  
2                  54  {'lasso__alpha': 0.001}  
3                  54  {'ridge__alpha': 0.001}  


        Model  In_sample_MAE  Out_of_sample_MAE         CV_MAE  \
0       LASSO  829531.314368      836935.642861  829642.938084   
1       RIDGE  539559.827728      546928.487785  540535.015624   
2  LASSO_Grid  549402.801752      555203.915686  550135.688535   
3  RIDGE_Grid  539619.969950      546830.423981  540624.599136   

   Number_of_Features              Best_Params  
0                 135                       {}  
1                 135                       {}  
2                  54  {'lasso__alpha': 0.001}  
3                  54  {'ridge__alpha': 0.001} 

所以最好的模型应该是Ridge_grid

### 现用cleaned_TP训练ridge_grid，来预测test_price

In [58]:
#test_price基础数据处理
test_price['建筑面积']=test_price['建筑面积'].str.replace('㎡','')
test_price['建筑面积']=pd.to_numeric(test_price['建筑面积'],errors='coerce')
test_price['套内面积']=test_price['套内面积'].str.replace('㎡','')
test_price['套内面积']=pd.to_numeric(test_price['套内面积'],errors='coerce')
test_price['lon']=pd.to_numeric(test_price['lon'],errors='coerce')
test_price['lat']=pd.to_numeric(test_price['lat'],errors='coerce')

In [59]:
best_Ridge_grid=Pipeline([
    ('preprocessing', train_price_preprocessing),
    ('poly', poly_pipeline),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=0.001, random_state=111))
])

In [88]:
X_train=cleaned_TP.copy()
y_train=np.log(cleaned_TP['Price'])
best_Ridge_grid.fit(X_train,y_train)
Price_predictions=best_Ridge_grid.predict(test_price)



In [89]:
np.exp(Price_predictions)

array([15237635.85815096,  4320678.04267724,  6412575.79026097, ...,
         233252.82933455,   234386.81387931,   224251.10859739])

In [91]:
Price_pre_df=pd.DataFrame(np.exp(Price_predictions),columns=['Predicted_Price'])

In [93]:
Price_pre_df.head()

Unnamed: 0,Predicted_Price
0,15237640.0
1,4320678.0
2,6412576.0
3,3012618.0
4,6488572.0


In [96]:
Price_pre_df.to_csv('Predicted_Price.csv')

# 租金模型

处理租金数据内容

In [55]:
train_rent['面积']=train_rent['面积'].str.replace('㎡','')
train_rent['面积']=pd.to_numeric(train_rent['面积'],errors='coerce')

AttributeError: Can only use .str accessor with string values!

In [None]:
#异常值清理
from scipy import stats

def remove_outliers_advanced(df):
    # 创建副本，避免修改原始数据
    df_clean = df.copy()
    
    # 记录原始数据量
    original_count = len(df_clean)
    
    # 1. 使用IQR方法检测价格和面积异常值
    def detect_outliers_iqr(series):
        Q1 = series.quantile(0.25)
        Q3 = series.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        return (series >= lower_bound) & (series <= upper_bound)
    
    # 2. 使用Z-score方法检测价格和面积异常值
    def detect_outliers_zscore(series, threshold=3):
        z_scores = np.abs(stats.zscore(series.dropna()))
        # 创建一个与series相同大小的布尔数组，NaN值设为True（不过滤）
        mask = pd.Series(True, index=series.index)
        mask[series.dropna().index] = z_scores < threshold
        return mask
    
    # 3. 经纬度异常值清理（中国大致范围）
    # 中国经度范围：约73°E - 135°E
    # 中国纬度范围：约18°N - 54°N
    lon_lower = 73   # 最西端
    lon_upper = 135  # 最东端
    lat_lower = 18   # 最南端
    lat_upper = 54   # 最北端
    
    # 应用IQR方法
    price_iqr_condition = detect_outliers_iqr(df_clean['Price'])
    area_iqr_condition = detect_outliers_iqr(df_clean['面积'])
    
    # 应用Z-score方法
    price_zscore_condition = detect_outliers_zscore(df_clean['Price'])
    area_zscore_condition = detect_outliers_zscore(df_clean['面积'])
    
    # 经纬度条件
    lon_condition = (df_clean['lon'] >= lon_lower) & (df_clean['lon'] <= lon_upper)
    lat_condition = (df_clean['lat'] >= lat_lower) & (df_clean['lat'] <= lat_upper)
    coord_condition = lon_condition & lat_condition
    
    # 组合所有条件 - 可以选择使用IQR或Z-score方法
    # 这里我们使用IQR方法，您可以根据需要更改为Z-score
    method_choice = 'iqr'  # 可以选择 'iqr' 或 'zscore'
    
    if method_choice == 'iqr':
        all_conditions = price_iqr_condition & area_iqr_condition & coord_condition
        method_name = "IQR"
    else:
        all_conditions = price_zscore_condition & area_zscore_condition & coord_condition
        method_name = "Z-score"
    
    # 过滤数据
    df_cleaned = df_clean[all_conditions]
    
    # 输出清理结果
    print(f"\n使用{method_name}方法的异常值清理结果:")
    print(f"原始数据量: {original_count}")
    print(f"清理后数据量: {len(df_cleaned)}")
    print(f"移除的记录数: {original_count - len(df_cleaned)}")
    
    # 分别统计各类异常值
    print(f"\n各类异常值统计:")
    print(f"价格异常值 (IQR): {len(df_clean) - price_iqr_condition.sum()} 条")
    print(f"面积异常值 (IQR): {len(df_clean) - area_iqr_condition.sum()} 条")
    print(f"价格异常值 (Z-score): {len(df_clean) - price_zscore_condition.sum()} 条")
    print(f"面积异常值 (Z-score): {len(df_clean) - area_zscore_condition.sum()} 条")
    print(f"经纬度异常值: {len(df_clean) - coord_condition.sum()} 条")
    
    return df_cleaned

cleaned_rent=remove_outliers_advanced(train_rent)

In [37]:
urls_to_remove = [
    'https://image1.ljcdn.com/rent-',
    'https://img.ljcdn.com/usercent'
]

# 方法一：精确匹配清除
def remove_url_rows_exact(df, url_list):
    """精确匹配URL并删除包含这些URL的行"""
    df_clean = df.copy()
    
    # 记录原始数据量
    original_count = len(df_clean)
    
    # 创建匹配条件
    condition = df_clean['付款方式'].isin(url_list)
    
    # 统计要删除的行数
    rows_to_remove = condition.sum()
    
    # 删除匹配的行
    df_clean = df_clean[~condition]
    
    print(f"精确匹配清除结果:")
    print(f"原始数据量: {original_count}")
    print(f"删除行数: {rows_to_remove}")
    print(f"剩余数据量: {len(df_clean)}")
    
    return df_clean
cleaned_rent=remove_url_rows_exact(cleaned_rent,urls_to_remove)

精确匹配清除结果:
原始数据量: 92252
删除行数: 5
剩余数据量: 92247


In [15]:
cleaned_rent.head(3)

Unnamed: 0,城市,户型,装修,Price,楼层,面积,朝向,交易时间,付款方式,租赁方式,...,供水,供暖,供电,燃气费,供热费,停车位,停车费用,coord_x,coord_y,客户反馈
0,0,1室1厅1卫,精装修,654646.481811,4/6层,36.42,西,2024-11-28,季付价,整租,...,民水,集中供暖,民电,2.61元/m³,24-30元/㎡,450.0,150,117.339283,40.930007,潮气重，仔细一看，房屋保养好
1,0,1室1厅1卫,精装修,665412.057415,4/6层,41.0,南,2024-10-30,季付价,整租,...,民水,集中供暖,民电,2.61元/m³,30元/㎡,150.0,150,117.446526,40.876743,服务响应中等，看起来，管线老化，消防设施齐全
2,0,1室1厅1卫,精装修,778222.820548,1/18层,37.36,北,2024-11-12,季付价,整租,...,商水/民水,集中供暖,商电/民电,2.61-2.63元/m³,30-46元/㎡,965.0,500,117.518524,40.905357,差不多这样，电梯新，总的来说，宽敞，性价比高


### 处理数据

In [38]:
#数据转型
#提取绿化率
def convert_green_rate_simple(value):
    if pd.isna(value):
        return np.nan
    try:
        value_str = str(value).strip().replace('%', '')
        return float(value_str) / 100
    except:
        return np.nan
cleaned_rent['绿化率_小数'] = cleaned_rent['绿 化 率'].apply(convert_green_rate_simple)


In [39]:
#提取物业费
import re
def extract_first_property_fee(fee_str):
    """
    从物业费字符串中提取第一个数字
    例如: "2.38-2.98元/月/㎡" -> 2.38
    """
    if pd.isna(fee_str):
        return np.nan
    
    # 转换为字符串
    fee_str = str(fee_str).strip()
    
    # 如果为空字符串，返回NaN
    if not fee_str:
        return np.nan
    
    # 尝试多种匹配模式
    patterns = [
        # 匹配数字开头，后面可能跟着-或到结尾
        r'^(\d+\.?\d*)(?:\s*[-~至]\s*\d+\.?\d*)?',
        # 匹配数字开头，后面跟着单位
        r'^(\d+\.?\d*)\s*元',
        # 匹配任何浮点数或整数
        r'(\d+\.?\d*)'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, fee_str)
        if match:
            try:
                return float(match.group(1))
            except (ValueError, TypeError):
                continue
    
    # 如果所有模式都匹配失败，返回NaN
    return np.nan

# 应用提取函数
cleaned_rent['物业费_提取'] = cleaned_rent['物 业 费'].apply(extract_first_property_fee)


In [40]:
#提取燃气费
def extract_gas_fee(fee_str):
    """
    从燃气费字符串中提取第一个数字
    例如: "3.5元/m³" -> 3.5
    """
    if pd.isna(fee_str):
        return np.nan
    
    # 转换为字符串
    fee_str = str(fee_str).strip()
    
    # 如果为空字符串，返回NaN
    if not fee_str:
        return np.nan
    
    # 处理特殊值
    special_cases = {
        '暂无': np.nan,
        '无': np.nan,
        '免费': 0,
        '0元': 0,
        '待定': np.nan,
        '未知': np.nan,
        '—': np.nan,
        '-': np.nan,
        '有': np.nan,  # 有些数据可能只标注"有"而不是具体价格
        '无': np.nan   # 有些数据可能只标注"无"
    }
    
    if fee_str in special_cases:
        return special_cases[fee_str]
    
    # 移除常见单位
    cleaned_str = fee_str
    units = ['元/m³', '元/立方', '元/立方米', '元', 'm³', '/m³', '元/m3']
    for unit in units:
        cleaned_str = cleaned_str.replace(unit, '')
    
    # 尝试多种匹配模式
    patterns = [
        # 匹配范围格式: 数字-数字
        r'^(\d+\.?\d*)\s*[-~至]\s*\d+\.?\d*',
        # 匹配单个数字开头
        r'^(\d+\.?\d*)',
        # 匹配任何位置的数字
        r'(\d+\.?\d*)'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, cleaned_str)
        if match:
            try:
                return float(match.group(1))
            except (ValueError, TypeError):
                continue
    
    return np.nan

# 应用提取函数
cleaned_rent['燃气费_提取'] = cleaned_rent['燃气费'].apply(extract_gas_fee)

In [41]:
cleaned_rent['log_面积']=np.log(cleaned_rent['面积'])
cleaned_rent['log_绿化率']=np.log(cleaned_rent['绿化率_小数'])
cleaned_rent['log_物业费']=np.log(cleaned_rent['物业费_提取'])
cleaned_rent['log_燃气费']=np.log(cleaned_rent['燃气费_提取'])

建立处理管道

In [42]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

FACILITY_FEATURES = ['冰箱', '天然', '天然气', '宽带', '床', '暖气', '洗衣机', '热水器', '电视', '空调', '衣柜']

def facility_ext(X):
    """
    从配套设施字符串中提取特征向量
    
    参数:
    X: 输入数据，可以是Series、DataFrame或二维数组
    
    返回:
    numpy数组，形状为(n_samples, n_features)，其中n_features=11
    """
    # 处理二维输入（来自ColumnTransformer）
    if hasattr(X, 'shape') and len(X.shape) == 2:
        if X.shape[1] == 1:
            # 如果是单列二维数组，转换为Series
            facilities_series = pd.Series(X.flatten())
        else:
            # 如果是多列，取第一列
            facilities_series = pd.Series(X[:, 0])
    elif hasattr(X, 'iloc'):
        # 已经是Series或DataFrame
        if len(X.shape) > 1 and X.shape[1] > 1:
            # 如果是DataFrame，取第一列
            facilities_series = X.iloc[:, 0]
        else:
            facilities_series = X
    else:
        # 转换为Series
        facilities_series = pd.Series(X)
    
    # 初始化结果数组
    result = np.zeros((len(facilities_series), len(FACILITY_FEATURES)), dtype=int)
    
    for i, facility_str in enumerate(facilities_series):
        if pd.isna(facility_str):
            # 如果是空值，全部设为0
            result[i] = np.zeros(len(FACILITY_FEATURES), dtype=int)
            continue
        
        # 转换为字符串
        facility_str = str(facility_str)
        
        # 使用顿号分割
        facilities = facility_str.split('、')
        
        # 初始化特征向量
        feature_vector = np.zeros(len(FACILITY_FEATURES), dtype=int)
        
        # 检查每个设施是否存在
        for j, feature in enumerate(FACILITY_FEATURES):
            if feature in facilities:
                feature_vector[j] = 1
        
        result[i] = feature_vector
    
    return result

def facility_name():
    """
    返回设施特征的名称列表
    
    返回:
    list: 设施特征名称列表
    """
    return [f'has_{feature}' for feature in FACILITY_FEATURES]

# 创建管道
facility_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    FunctionTransformer(
        facility_ext, 
        validate=False,
        feature_names_out=lambda transformer, input_features: facility_name()
    )
)

In [43]:
from sklearn.preprocessing import PolynomialFeatures

poly_pipeline2=make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    PolynomialFeatures(degree=2,interaction_only=True),
    StandardScaler()
)

In [44]:
#房屋户型处理管道
import re
class HouseTypeEncoder2(BaseEstimator, TransformerMixin):
    """房屋户型编码器 - 将'3室1厅1厨1卫'格式编码为四维数值特征"""
    
    def __init__(self):
        self.most_frequent = None
        self.pattern = re.compile(r'(\d+)室(\d+)厅(\d+)厨(\d+)卫')
        
    def fit(self, X, y=None):
        # 找到所有符合格式的户型
        valid_house_types = []
        for house_type in X.iloc[:, 0] if hasattr(X, 'iloc') else X:
            if pd.isna(house_type):
                continue
                
            match = self.pattern.match(str(house_type))
            if match:
                rooms = list(map(int, match.groups()))
                valid_house_types.append(rooms)
        
        # 计算每个维度的众数
        if valid_house_types:
            valid_array = np.array(valid_house_types)
            self.most_frequent = [
                np.bincount(valid_array[:, i]).argmax() for i in range(4)
            ]
        else:
            # 如果没有有效数据，使用默认值
            self.most_frequent = [2, 1, 0, 0]  # 最常见的户型: 2室1厅1厨1卫
        
        return self
    
    def transform(self, X):
        # 确保输入是DataFrame或可以迭代的形式
        if hasattr(X, 'iloc'):
            house_types = X.iloc[:, 0]
        else:
            house_types = X
            
        result = []
        
        for house_type in house_types:
            if pd.isna(house_type):
                # 缺失值用众数填充
                result.append(self.most_frequent)
                continue
                
            house_type_str = str(house_type)
            match = self.pattern.match(house_type_str)
            
            if match:
                # 提取四个数值
                rooms = list(map(int, match.groups()))
                result.append(rooms)
            else:
                # 不符合格式的用众数填充
                result.append(self.most_frequent)
        
        # 转换为DataFrame，列名为室、厅、厨、卫
        result_df = pd.DataFrame(result, columns=['室', '厅', '厨', '卫'])
        
        # 如果输入是DataFrame，保持相同的索引
        if hasattr(X, 'index'):
            result_df.index = X.index
            
        return result_df
    
    def get_feature_names_out(self, input_features=None):
        """返回输出特征名称"""
        return np.array(['室', '厅', '厨', '卫'])

house_type_pipeline2=make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    HouseTypeEncoder2()
)

In [45]:
def floor_level_ext(X):
    """
    提取楼层高度并分类
    
    参数:
    X: numpy array, 包含楼层信息的列
    
    返回:
    numpy array, 包含分类后的楼层高度
    """
    import pandas as pd
    import numpy as np
    
    # 将输入转换为Series
    series = pd.Series(X.ravel())
    
    def classify_floor(floor_str):
        if pd.isna(floor_str):
            return '未知'
        
        # 如果已经是中文分类，直接返回
        if '低楼层' in str(floor_str):
            return '低楼层'
        elif '中楼层' in str(floor_str):
            return '中楼层'
        elif '高楼层' in str(floor_str):
            return '高楼层'
        
        # 处理 x/y层 格式
        if '/' in str(floor_str) and '层' in str(floor_str):
            try:
                parts = str(floor_str).split('/')
                if len(parts) == 2:
                    current_floor = float(parts[0])
                    total_floors = float(parts[1].replace('层', ''))
                    
                    if total_floors > 0:
                        ratio = current_floor / total_floors
                        if ratio < 0.33:
                            return '低楼层'
                        elif ratio > 0.66:
                            return '高楼层'
                        else:
                            return '中楼层'
            except (ValueError, ZeroDivisionError):
                pass
        
        return '未知'
    
    result = series.apply(classify_floor)
    # 将结果转换回numpy数组，并确保是2D数组（一列）
    return result.values.reshape(-1, 1)
    
floor_pipeline2 = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    FunctionTransformer(floor_level_ext, feature_names_out='one-to-one'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)

In [46]:
from sklearn.compose import ColumnTransformer

rent_preprocessing=ColumnTransformer([
    ('poly',poly_pipeline2,['面积','绿化率_小数','物业费_提取','燃气费_提取','log_面积','log_绿化率','log_物业费','log_燃气费']),
    ('cat',cat_pipeline,['城市','付款方式','租赁方式','电梯','用水','用电','燃气','建筑结构','供水','供电']),
    ('fq',cat_frequency,['产权描述','租期','区县','板块','物业类别']),
    ('facility',facility_pipeline,['配套设施']),
    ('geo',cluster_simil,['lon','lat']),
    ('type',house_type_pipeline2,['户型']),
    ('floor',floor_pipeline2,['楼层']),
    ('direction',direction_pipeline,['朝向'])
])

In [25]:
rent_preprocessing.fit_transform(cleaned_rent)
for name_i in rent_preprocessing.get_feature_names_out():
    print(name_i)

[WinError 2] 系统找不到指定的文件。
  File "D:\Programs\Anaconda\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "D:\Programs\Anaconda\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Programs\Anaconda\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Programs\Anaconda\Lib\su

poly__1
poly__面积
poly__绿化率_小数
poly__物业费_提取
poly__燃气费_提取
poly__log_面积
poly__log_绿化率
poly__log_物业费
poly__log_燃气费
poly__面积 绿化率_小数
poly__面积 物业费_提取
poly__面积 燃气费_提取
poly__面积 log_面积
poly__面积 log_绿化率
poly__面积 log_物业费
poly__面积 log_燃气费
poly__绿化率_小数 物业费_提取
poly__绿化率_小数 燃气费_提取
poly__绿化率_小数 log_面积
poly__绿化率_小数 log_绿化率
poly__绿化率_小数 log_物业费
poly__绿化率_小数 log_燃气费
poly__物业费_提取 燃气费_提取
poly__物业费_提取 log_面积
poly__物业费_提取 log_绿化率
poly__物业费_提取 log_物业费
poly__物业费_提取 log_燃气费
poly__燃气费_提取 log_面积
poly__燃气费_提取 log_绿化率
poly__燃气费_提取 log_物业费
poly__燃气费_提取 log_燃气费
poly__log_面积 log_绿化率
poly__log_面积 log_物业费
poly__log_面积 log_燃气费
poly__log_绿化率 log_物业费
poly__log_绿化率 log_燃气费
poly__log_物业费 log_燃气费
cat__城市_0
cat__城市_1
cat__城市_2
cat__城市_3
cat__城市_4
cat__城市_5
cat__城市_6
cat__城市_7
cat__城市_8
cat__城市_9
cat__城市_10
cat__城市_11
cat__付款方式_半年付价
cat__付款方式_双月付价
cat__付款方式_季付价
cat__付款方式_年付价
cat__付款方式_月付价
cat__租赁方式_合租
cat__租赁方式_整租
cat__电梯_无
cat__电梯_有
cat__用水_商水
cat__用水_民水
cat__用电_商电
cat__用电_民电
cat__燃气_无
cat__燃气_有
cat__建筑结构_塔板结合
cat__建筑结构_塔板结合/平房

### 建立模型

In [47]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import RFE, VarianceThreshold,SelectKBest,f_regression
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV

rent_ols=Pipeline([
    ('preprocessing',rent_preprocessing),
    ('scaler',StandardScaler()),
    ('ols',LinearRegression())
])

rent_advanced_ols=Pipeline([
    ('preprocessing', rent_preprocessing),
    ('scaler',StandardScaler()),
    
    # 第一步：移除低方差特征
    ('variance_threshold', VarianceThreshold(threshold=0.01)),
    
    # 第二步：基于统计测试的特征选择
    ('univariate_selection', SelectKBest(score_func=f_regression, k=50)),
    
    # 第三步：基于模型的特征选择
    ('model_based_selection', SelectFromModel(
        LassoCV(cv=5, random_state=111),
        threshold='1.25*median'  # 更宽松的阈值
    )),
    
    ('ols', LinearRegression())
])

In [48]:
# 1. Lasso 模型
rent_lasso_model = Pipeline([
    ('preprocessing', rent_preprocessing),
    ('scaler',StandardScaler()),
    ('lasso', Lasso(alpha=0.1, random_state=111, max_iter=10000))
])

# 2. Ridge 模型
rent_ridge_model = Pipeline([
    ('preprocessing', rent_preprocessing),
    ('scaler',StandardScaler()),
    ('ridge', Ridge(alpha=1.0, random_state=111))
])

In [49]:
# Lasso 超参数调优
rent_lasso_param_grid = {
    'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

rent_lasso_grid = GridSearchCV(
    rent_lasso_model,
    rent_lasso_param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

# Ridge 超参数调优
rent_ridge_param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

rent_ridge_grid = GridSearchCV(
    rent_ridge_model,
    rent_ridge_param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

### 评估模型

In [70]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """评估模型性能"""
    
    # 训练模型
    print(f"训练 {model_name}...")
    model.fit(X_train, y_train)
    
    # 预测
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    # 计算MAE（原始价格尺度）
    train_mae = mean_absolute_error(np.exp(y_train), np.exp(train_pred))
    test_mae = mean_absolute_error(np.exp(y_test), np.exp(test_pred))

    def mae_original_scale(y_true_log, y_pred_log):
        """在原始价格尺度计算 MAE 的评分函数"""
        y_true_original = np.exp(y_true_log)
        y_pred_original = np.exp(y_pred_log)
        return mean_absolute_error(y_true_original, y_pred_original)
    
    # 6. 使用正确评分器进行交叉验证
    mae_scorer = make_scorer(mae_original_scale, greater_is_better=False)

    
    # 交叉验证
    print(f"进行 {model_name} 交叉验证...")
    cv_results = cross_validate(
        model, X_train, y_train,
        cv=6,
        scoring=mae_scorer,
        return_train_score=True
    )
    
    cv_train_mae = -cv_results['train_score'].mean()
    cv_test_mae = -cv_results['test_score'].mean()
    
    # 获取最佳参数（如果是调优模型）
    best_params = {}
    if hasattr(model, 'best_params_'):
        best_params = model.best_params_
    elif hasattr(model, 'named_steps'):
        for step_name, step in model.named_steps.items():
            if hasattr(step, 'alpha_'):
                best_params[f'{step_name}_alpha'] = step.alpha_
    
    # 特征数量
    try:
        if hasattr(model, 'named_steps'):
            X_processed = model.named_steps['preprocessing'].transform(X_train)
            X_poly = model.named_steps['poly'].transform(X_processed)
            n_features = X_poly.shape[1]
        else:
            n_features = X_train.shape[1]
    except:
        n_features = "N/A"
    
    return {
        'Model': model_name,
        'In_sample_MAE': train_mae,
        'Out_of_sample_MAE': test_mae,
        'CV_MAE': cv_test_mae,
        'Number_of_Features': n_features,
        'Best_Params': best_params
    }

In [71]:
from sklearn.model_selection import train_test_split
models_to_evaluate = {
    'OLS':rent_ols,
    'ADVANCED_OLS':rent_advanced_ols,
    'LASSO': rent_lasso_model,
    'RIDGE': rent_ridge_model,
    'LASSO_Grid': rent_lasso_grid,
    'RIDGE_Grid': rent_ridge_grid,
}

X_train, X_test, y_train, y_test = train_test_split(
    cleaned_rent.drop('Price', axis=1),
    np.log(cleaned_rent['Price']),
    test_size=0.2,
    random_state=111
)

results = []
for name, model in models_to_evaluate.items():
    result = evaluate_model(model, X_train, X_test, y_train, y_test, name)
    results.append(result)

# 创建结果表格
results_df = pd.DataFrame(results)
print(results_df)

训练 OLS...




进行 OLS 交叉验证...




训练 ADVANCED_OLS...


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

进行 ADVANCED_OLS 交叉验证...


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

训练 LASSO...




进行 LASSO 交叉验证...




训练 RIDGE...




进行 RIDGE 交叉验证...




训练 LASSO_Grid...
Fitting 5 folds for each of 6 candidates, totalling 30 fits




进行 LASSO_Grid 交叉验证...
Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




训练 RIDGE_Grid...
Fitting 5 folds for each of 7 candidates, totalling 35 fits




进行 RIDGE_Grid 交叉验证...
Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




          Model  In_sample_MAE  Out_of_sample_MAE         CV_MAE  \
0           OLS  107418.673886      107258.464867  107593.132438   
1  ADVANCED_OLS  119528.401294      119964.811566  118812.250821   
2         LASSO  163246.736792      164276.453256  163265.515307   
3         RIDGE  107454.382477      107300.322635  107634.684713   
4    LASSO_Grid  110589.834687      110584.779240  110745.086543   
5    RIDGE_Grid  107418.690796      107258.502144  107593.121321   

  Number_of_Features              Best_Params  
0                N/A                       {}  
1                N/A                       {}  
2                N/A                       {}  
3                N/A                       {}  
4                 52  {'lasso__alpha': 0.001}  
5                 52  {'ridge__alpha': 0.001}  


          Model  In_sample_MAE  Out_of_sample_MAE         CV_MAE  \
0           OLS  107418.673886      107258.464867  107593.132438   
1  ADVANCED_OLS  119528.401294      119964.811566  118812.250821   
2         LASSO  163246.736792      164276.453256  163265.515307   
3         RIDGE  107454.382477      107300.322635  107634.684713   
4    LASSO_Grid  110589.834687      110584.779240  110745.086543   
5    RIDGE_Grid  107418.690796      107258.502144  107593.121321   

  Number_of_Features              Best_Params  
0                N/A                       {}  
1                N/A                       {}  
2                N/A                       {}  
3                N/A                       {}  
4                 52  {'lasso__alpha': 0.001}  
5                 52  {'ridge__alpha': 0.001}  
所以应选择Ridge_grid

### 预测租金

In [56]:
test_rent['面积']=test_rent['面积'].str.replace('㎡','')
test_rent['面积']=pd.to_numeric(test_rent['面积'],errors='coerce')
test_rent['绿化率_小数'] = test_rent['绿 化 率'].apply(convert_green_rate_simple)
test_rent['物业费_提取'] = test_rent['物 业 费'].apply(extract_first_property_fee)
test_rent['燃气费_提取'] = test_rent['燃气费'].apply(extract_gas_fee)
test_rent['log_面积']=np.log(test_rent['面积'])
test_rent['log_绿化率']=np.log(test_rent['绿化率_小数'])
test_rent['log_物业费']=np.log(test_rent['物业费_提取'])
test_rent['log_燃气费']=np.log(test_rent['燃气费_提取'])

AttributeError: Can only use .str accessor with string values!

In [29]:
best_rent_ridge_model = Pipeline([
    ('preprocessing', rent_preprocessing),
    ('scaler',StandardScaler()),
    ('ridge', Ridge(alpha=0.001, random_state=111))
])

In [30]:
y_train=np.log(cleaned_rent['Price'])
X_train=cleaned_rent.drop('Price',axis=1)
best_rent_ridge_model.fit(X_train,y_train)
Rent_Predictions=best_rent_ridge_model.predict(test_rent)



In [31]:
Rent_Predictions

array([12.22501191, 12.88942738, 12.95394061, ..., 12.44481748,
       13.44845218, 13.8690944 ])

In [33]:
pd.DataFrame(np.exp(Rent_Predictions),columns=['Price']).to_csv('Rent_predictions.csv',encoding='utf-8-sig')

In [34]:
df1 = pd.read_csv('Predicted_Price.csv')
df2 = pd.read_csv('Rent_predictions.csv')

# 垂直拼接两个表的指定列
combined_column = pd.concat([df1['Predicted_Price'], df2['Price']], axis=0, ignore_index=True)

# 创建新的DataFrame或添加到现有表
result = pd.DataFrame({
    'ID': list(range(1000000, 1000000 + len(df1['Predicted_Price'])))+list(range(2000000,2000000+len(df2['Price']))),
    'Price': combined_column
})
result.to_csv('prediction.csv', index=False)

In [63]:
price_model=[ols_poly,lasso_grid,ridge_grid]
rent_model=[rent_ols,rent_lasso_grid,rent_ridge_grid]
prediction_names=['ols','lasso_grid','ridge_grid']

for model_i,model_j,name_k in zip(price_model,rent_model,prediction_names):
    X_price=cleaned_TP.drop('Price',axis=1)
    y_price=np.log(cleaned_TP['Price'])
    model_i.fit(X_price,y_price)
    price_prediction=model_i.predict(test_price)
    df1=pd.DataFrame(np.exp(price_prediction),columns=['Price'])

    X_rent=cleaned_rent.drop('Price',axis=1)
    y_rent=np.log(cleaned_rent['Price'])
    model_j.fit(X_rent,y_rent)
    rent_prediction=model_j.predict(test_rent)
    df2=pd.DataFrame(np.exp(rent_prediction),columns=['Price'])

    combined_column=pd.concat([df1['Price'],df2['Price']],axis=0,ignore_index=True)

    result=pd.DataFrame({
        'ID':list(range(1000000, 1000000 + len(df1['Price'])))+list(range(2000000,2000000+len(df2['Price']))),
        'Price':combined_column
    })

    result.to_csv(f'{name_k}.csv',index=False)



    



Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits




Fitting 5 folds for each of 7 candidates, totalling 35 fits


