# Data Processing

## Download the Data

In [1]:
# Kaggle Notebook 数据读取模板
from pathlib import Path
import pandas as pd

def load_housing_data():
    data_path = Path("/kaggle/input/housing-data/housing_data") 
    dfs = {
        "train_price": pd.read_csv(data_path / "train_price.csv"),
        "train_rent": pd.read_csv(data_path / "train_rent.csv"),
        "test_price": pd.read_csv(data_path / "test_price.csv"),
        "test_rent": pd.read_csv(data_path / "test_rent.csv")
    }

    return dfs

housing = load_housing_data()

  "train_price": pd.read_csv(data_path / "train_price.csv"),
  "train_rent": pd.read_csv(data_path / "train_rent.csv"),
  "test_price": pd.read_csv(data_path / "test_price.csv"),


## Data Cleaning

### train_price

In [2]:
train_price = housing['train_price'].copy()
train_price = train_price[train_price['别墅类型'].isna()]
train_price.drop(columns=['梯户比例','套内面积','环线位置','房屋朝向','别墅类型','交易时间','交易权属','上次交易','房屋用途','产权所属','抵押信息','房屋优势','核心卖点','户型介绍','周边配套','交通出行','年份','区县','板块_comm','物业类别','建筑年代','开发商','房屋总数','楼栋总数','物业公司','建筑结构_comm','装修情况','物业办公电话','产权描述','供水','供暖','供电','coord_x','coord_y','客户反馈'], inplace=True)

In [3]:
train_price.drop_duplicates(inplace=True)
train_price.dropna(subset=['配备电梯'],inplace=True)
train_price['环线'] = train_price['环线'].fillna('未知')

In [4]:
train_price.rename(columns={
    '建筑面积': '建筑面积（㎡）',
    '绿 化 率': '绿化率（%）',
    '容 积 率': '容积率（倍）',
    '物 业 费': '物业费（元/月/㎡）',
    '燃气费': '燃气费（元/m³）',
    '供热费': '供热费（元/㎡）',
    '停车位': '停车位（个）',
    '停车费用': '停车费用（元）',
}, inplace = True)
train_price['绿化率（%）'] = train_price['绿化率（%）'].astype(str).str.replace('%', '', regex=False)
train_price['绿化率（%）'] = pd.to_numeric(train_price['绿化率（%）'], errors='coerce')
train_price['建筑面积（㎡）'] = train_price['建筑面积（㎡）'].astype(str).str.replace('㎡', '', regex=False)
train_price['建筑面积（㎡）'] = pd.to_numeric(train_price['建筑面积（㎡）'], errors='coerce')
train_price['燃气费（元/m³）'] = train_price['燃气费（元/m³）'].astype(str).str.replace('元/m³', '', regex=False)
train_price['供热费（元/㎡）'] = train_price['供热费（元/㎡）'].astype(str).str.replace('元/㎡', '', regex=False)
train_price['物业费（元/月/㎡）'] = train_price['物业费（元/月/㎡）'].astype(str).str.replace(r'[^\d\.-]', '', regex=True)
# [^\d\.-]匹配所有不是数字、不是小数点、不是减号的字符

In [5]:
# 将区间数据用平均值代替
import pandas as pd
import numpy as np

cols = ['物业费（元/月/㎡）', '燃气费（元/m³）', '供热费（元/㎡）']
for col in cols:
    train_price[col] = train_price[col].astype(str).str.replace(r'[^\d\.-]', '', regex=True)
    train_price[col] = train_price[col].replace('', np.nan)
  
    def parse_range(x):
        if pd.isna(x):
            return np.nan
        parts = x.split('-')
        return (float(parts[0]) + float(parts[1])) / 2 if len(parts) > 1 else float(parts[0])
    
    train_price[col] = train_price[col].apply(parse_range)

train_price[cols].head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,物业费（元/月/㎡）,燃气费（元/m³）,供热费（元/㎡）
0,1.475,2.61,30.0
1,0.65,2.61,
2,2.48,2.61,30.0
4,5.15,2.62,37.5
5,7.0,2.61,


In [6]:
cols_to_fix = ['绿化率（%）', '容积率（倍）', '物业费（元/月/㎡）', '燃气费（元/m³）', '供热费（元/㎡）', '停车位（个）', '停车费用（元）']

for col in cols_to_fix:
    train_price[col] = pd.to_numeric(train_price[col], errors='coerce')
    
# 填补缺失值
for col in cols_to_fix:
    median_value = train_price[col].median()
    train_price[col] = train_price[col].fillna(median_value)
    print(f"{col} 缺失值已用中位数 {median_value:.2f} 填补完成")

train_price[cols_to_fix].describe()


绿化率（%） 缺失值已用中位数 33.00 填补完成
容积率（倍） 缺失值已用中位数 2.50 填补完成
物业费（元/月/㎡） 缺失值已用中位数 1.88 填补完成
燃气费（元/m³） 缺失值已用中位数 2.61 填补完成
供热费（元/㎡） 缺失值已用中位数 25.00 填补完成
停车位（个） 缺失值已用中位数 756.00 填补完成
停车费用（元） 缺失值已用中位数 300.00 填补完成


Unnamed: 0,绿化率（%）,容积率（倍）,物业费（元/月/㎡）,燃气费（元/m³）,供热费（元/㎡）,停车位（个）,停车费用（元）
count,91520.0,91520.0,91520.0,91520.0,91520.0,91520.0,91520.0
mean,38.452334,2.739625,2.375949,2.663097,23.119174,1025.8112,322.316625
std,239.779474,1.369385,3.446521,0.517296,7.860311,1073.882953,188.724116
min,0.01,0.02,0.02,0.4,0.01,1.0,1.0
25%,30.0,2.1,1.475,2.46,25.0,500.0,300.0
50%,33.0,2.5,1.875,2.61,25.0,756.0,300.0
75%,35.0,3.0,2.41,2.95,25.0,1194.0,350.0
max,10500.0,30.0,76.45,5.0,50.0,8700.0,2300.0


In [7]:
# 替换异常值（IQR 方法）
for col in cols_to_fix:
    Q1 = train_price[col].quantile(0.25)
    Q3 = train_price[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    median_value = train_price[col].median()
 
    train_price[col] = train_price[col].mask((train_price[col] < lower_bound) | (train_price[col] > upper_bound), median_value)

train_price[cols_to_fix].describe()

Unnamed: 0,绿化率（%）,容积率（倍）,物业费（元/月/㎡）,燃气费（元/m³）,供热费（元/㎡）,停车位（个）,停车费用（元）
count,91520.0,91520.0,91520.0,91520.0,91520.0,91520.0,91520.0
mean,32.948525,2.449292,1.859527,2.63161,25.0,786.443477,310.471287
std,2.907591,0.635627,0.679447,0.456645,0.0,476.428844,31.540344
min,23.0,0.79,0.2,1.73,25.0,1.0,230.0
25%,30.2,2.13,1.475,2.46,25.0,500.0,300.0
50%,33.0,2.5,1.875,2.61,25.0,756.0,300.0
75%,35.0,2.5,2.075,2.62,25.0,885.0,300.0
max,42.3,4.33,3.81,3.675,25.0,2200.0,420.0


### train_rent

In [8]:
train_rent = housing['train_rent'].copy()
train_rent.drop(columns=['朝向','交易时间','车位','用水','用电','采暖','租期','配套设施','年份','物业类别','建筑年代','开发商','房屋总数','楼栋总数','物业公司','建筑结构','物业办公电话','产权描述','供水','供暖','供电','coord_x','coord_y','客户反馈','停车费用'], inplace=True)
train_rent.shape


(98899, 21)

In [9]:
train_rent.isnull().sum()

城市           0
户型           1
装修       73489
Price        0
楼层           5
面积           0
付款方式     18423
租赁方式         0
电梯           4
燃气        4582
lon          0
lat          0
区县        4677
板块        5144
环线位置     69663
绿 化 率    24402
容 积 率    24080
物 业 费    22159
燃气费      25057
供热费      70064
停车位      25479
dtype: int64

In [10]:
train_rent.dropna(subset=['电梯','区县','板块'],inplace=True)
train_rent['环线位置'] = train_rent['环线位置'].fillna('未知')
train_rent['装修'] = train_rent['装修'].fillna('非精装修')
train_rent['付款方式'] = train_rent['付款方式'].fillna('未知')
train_rent['燃气'] = train_rent['燃气'].fillna('未知')
train_rent.shape

(93751, 21)

In [11]:
train_rent.rename(columns={
    '面积': '面积（㎡）',
    '绿 化 率': '绿化率（%）',
    '容 积 率': '容积率（倍）',
    '物 业 费': '物业费（元/月/㎡）',
    '燃气费': '燃气费（元/m³）',
    '供热费': '供热费（元/㎡）',
    '停车位': '停车位（个）'
}, inplace = True)

train_rent['绿化率（%）'] = train_rent['绿化率（%）'].astype(str).str.replace('%', '', regex=False)
train_rent['绿化率（%）'] = pd.to_numeric(train_price['绿化率（%）'], errors='coerce')
train_rent['面积（㎡）'] = train_rent['面积（㎡）'].astype(str).str.replace('㎡', '', regex=False)
train_rent['燃气费（元/m³）'] = train_rent['燃气费（元/m³）'].astype(str).str.replace('元/m³', '', regex=False)
train_rent['供热费（元/㎡）'] = train_rent['供热费（元/㎡）'].astype(str).str.replace('元/㎡', '', regex=False)
train_rent['物业费（元/月/㎡）'] = train_rent['物业费（元/月/㎡）'].astype(str).str.replace(r'[^\d\.-]', '', regex=True)

train_rent.sample(5)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,城市,户型,装修,Price,楼层,面积（㎡）,付款方式,租赁方式,电梯,燃气,...,lat,区县,板块,环线位置,绿化率（%）,容积率（倍）,物业费（元/月/㎡）,燃气费（元/m³）,供热费（元/㎡）,停车位（个）
30817,2,3室2厅1卫,精装修,230834.5,中楼层/27层,86.0,季付价,整租,有,有,...,30.599512,78.0,349.0,内环至外环,33.0,0.7,2.4-6,1.96-1.98,1.96,1722.0
48011,3,2室1厅1卫,精装修,427217.3,高楼层/23层,63.0,季付价,整租,有,有,...,32.293799,35.0,906.0,未知,33.0,,,,,
13675,0,3室1厅2卫,精装修,1711731.0,低楼层/14层,137.0,季付价,整租,有,有,...,40.883823,68.0,637.0,四至五环,30.0,2.35,2.6,2.61,30.0,1400.0
86952,10,3室2厅,非精装修,679472.0,低楼层/33层,95.0,月付价,整租,无,有,...,24.12804,105.0,737.0,未知,33.0,3.0,2.6-3.2,3.45,,1729.0
14866,1,3室1厅,非精装修,164797.7,高楼层/11层,83.0,季付价,整租,有,有,...,40.475754,24.0,303.0,未知,30.0,2.5,1.9-2.5,2.15-2.46,22.0,2200.0


In [12]:
# 将区间数据用平均值代替
cols = ['物业费（元/月/㎡）', '燃气费（元/m³）', '供热费（元/㎡）']
for col in cols:
    train_rent[col] = train_rent[col].astype(str).str.replace(r'[^\d\.-]', '', regex=True)
    train_rent[col] = train_rent[col].replace('', np.nan)
  
    def parse_range(x):
        if pd.isna(x):
            return np.nan
        parts = x.split('-')
        return (float(parts[0]) + float(parts[1])) / 2 if len(parts) > 1 else float(parts[0])
    
    train_rent[col] = train_rent[col].apply(parse_range)

train_rent[cols].head()

Unnamed: 0,物业费（元/月/㎡）,燃气费（元/m³）,供热费（元/㎡）
0,1.475,2.61,27.0
1,0.875,2.61,30.0
2,2.33,2.62,38.0
3,3.2,2.61,37.0
4,1.0,2.61,30.0


In [13]:
cols_to_fix1 = ['绿化率（%）', '容积率（倍）', '物业费（元/月/㎡）', '燃气费（元/m³）', '供热费（元/㎡）', '停车位（个）']

for col in cols_to_fix1:
    train_rent[col] = pd.to_numeric(train_rent[col], errors='coerce')
    
# 填补缺失值
for col in cols_to_fix1:
    median_value = train_rent[col].median()
    train_rent[col] = train_rent[col].fillna(median_value)
    print(f"{col} 缺失值已用中位数 {median_value:.2f} 填补完成")

train_rent[cols_to_fix1].describe()


绿化率（%） 缺失值已用中位数 33.00 填补完成
容积率（倍） 缺失值已用中位数 2.70 填补完成
物业费（元/月/㎡） 缺失值已用中位数 2.20 填补完成
燃气费（元/m³） 缺失值已用中位数 2.95 填补完成
供热费（元/㎡） 缺失值已用中位数 25.00 填补完成
停车位（个） 缺失值已用中位数 760.00 填补完成


Unnamed: 0,绿化率（%）,容积率（倍）,物业费（元/月/㎡）,燃气费（元/m³）,供热费（元/㎡）,停车位（个）
count,93751.0,93751.0,93751.0,93751.0,93751.0,93751.0
mean,32.937556,2.991245,2.717554,2.899832,23.34167,1164.250216
std,2.708426,1.569622,3.512976,0.52153,7.696434,1409.706026
min,23.0,0.02,0.02,0.4,0.01,1.0
25%,33.0,2.03,1.5,2.61,25.0,420.0
50%,33.0,2.7,2.2,2.95,25.0,760.0
75%,33.0,3.2,2.9,3.445,25.0,1300.0
max,42.3,30.0,76.45,5.0,50.0,8700.0


In [14]:
# 替换异常值（IQR 方法）
cols_to_fix2 = ['容积率（倍）', '物业费（元/月/㎡）', '燃气费（元/m³）',  '停车位（个）']
for col in cols_to_fix2:
    Q1 = train_rent[col].quantile(0.25)
    Q3 = train_rent[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    median_value = train_rent[col].median()
 
    train_rent[col] = train_rent[col].mask((train_rent[col] < lower_bound) | (train_rent[col] > upper_bound), median_value)

train_rent[cols_to_fix1].describe()

Unnamed: 0,绿化率（%）,容积率（倍）,物业费（元/月/㎡）,燃气费（元/m³）,供热费（元/㎡）,停车位（个）
count,93751.0,93751.0,93751.0,93751.0,93751.0,93751.0
mean,32.937556,2.616508,2.162704,2.890875,23.34167,796.436593
std,2.708426,0.823558,0.933765,0.497423,7.696434,533.511525
min,23.0,0.3,0.02,1.5,0.01,1.0
25%,33.0,2.05,1.5,2.61,25.0,420.0
50%,33.0,2.7,2.2,2.95,25.0,760.0
75%,33.0,3.0,2.68,3.3625,25.0,1000.0
max,42.3,4.95,5.0,4.5,50.0,2604.0


In [15]:
train_rent.drop_duplicates(inplace=True)
train_rent.shape

(93751, 21)

##  Handling Text and Categorical Attributes

### train_price

In [16]:
# 环线编码
ring_map = {
    '内环内': 1,
    '内环至中环': 2,
    '中环至外环': 3,
    '内环至外环': 3,  # 内环至外环和中环至外环视为同一层级
    '二环内': 1,
    '二至三环': 2,
    '三至四环': 3,
    '四至五环': 4,
    '五至六环': 5,
    '六环外': 6,
    '外环外': 6,
    '未知': 0
}

train_price['环线编码'] = train_price['环线'].map(ring_map)
train_price['环线编码'].fillna(0, inplace=True)
train_price[['环线', '环线编码']].head(10)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_price['环线编码'].fillna(0, inplace=True)


Unnamed: 0,环线,环线编码
0,二至三环,2
1,五至六环,5
2,五至六环,5
4,三至四环,3
5,五至六环,5
6,六环外,6
8,三至四环,3
9,四至五环,4
10,五至六环,5
11,四至五环,4


In [17]:
# 建筑结构编码
train_price['建筑结构'].fillna('未知结构', inplace=True)
structure_map = {
    '未知结构': 0,
    '钢混结构': 1,
    '钢结构': 2,
    '混合结构': 3,
    '框架结构': 4,  
    '砖混结构': 5,
    '砖木结构': 6,
}
train_price['建筑结构编码'] = train_price['建筑结构'].map(structure_map)
train_price[['建筑结构', '建筑结构编码']].head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_price['建筑结构'].fillna('未知结构', inplace=True)


Unnamed: 0,建筑结构,建筑结构编码
0,混合结构,3
1,混合结构,3
2,钢混结构,1
4,钢混结构,1
5,钢混结构,1
6,钢混结构,1
8,钢混结构,1
9,钢混结构,1
10,钢混结构,1
11,混合结构,3


In [18]:
# 房屋年限编码
# 使用众数填补缺失值
most_frequent_value = train_price['房屋年限'].mode()[0]
train_price['房屋年限'].fillna(most_frequent_value, inplace=True)
# 使用 pandas 进行独热编码
train_price = pd.get_dummies(train_price, columns=['房屋年限'], drop_first=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_price['房屋年限'].fillna(most_frequent_value, inplace=True)


In [19]:
# 楼层处理
train_price['所在楼层'] = train_price['所在楼层'].str.replace(r'\(.*\)', '', regex=True)
train_price['所在楼层'] = train_price['所在楼层'].str.strip()
floor_map = {
    '地下室': 0,
    '底层': 1,
    '低楼层': 2,
    '中楼层': 3,
    '高楼层': 4,
    '顶层': 5
}

train_price['楼层编码'] = train_price['所在楼层'].map(floor_map)
train_price['楼层编码'].head(10)

0     3
1     5
2     2
4     3
5     0
6     4
8     4
9     1
10    5
11    5
Name: 楼层编码, dtype: int64

In [20]:
import pandas as pd
import re

# 创建一个处理房屋户型的函数
def parse_layout_v2(layout):
    layout = str(layout)  # 确保传入的是字符串类型
    rooms = {'室': 0, '厅': 0, '厨': 0, '卫': 0, '房间': 0}  # 初始化各个房间数量
    match = re.findall(r'(\d+)(室|厅|厨|卫|房间)', layout) # 匹配不同类型的房间

    for num, type_ in match:
        rooms[type_] = int(num)  # 将房间数赋给对应类型

    return rooms['房间'], rooms['室'], rooms['厅'], rooms['厨'], rooms['卫']

# 确保 '房屋户型' 列的数据为字符串格式，并填充缺失值
train_price['房屋户型'] = train_price['房屋户型'].fillna('未知').astype(str)

# 应用到数据
train_price[['房间数', '室数', '厅数', '厨数', '卫数']] = train_price['房屋户型'].apply(lambda x: pd.Series(parse_layout_v2(x)))

# 检查处理后的数据
print(train_price[['房屋户型', '房间数', '室数', '厅数', '厨数', '卫数']].head())


       房屋户型  房间数  室数  厅数  厨数  卫数
0  2室1厅1厨1卫    0   2   1   1   1
1  3室1厅1厨1卫    0   3   1   1   1
2  3室2厅1厨2卫    0   3   2   1   2
4     1房间1卫    1   0   0   0   1
5  5室2厅1厨4卫    0   5   2   1   4


In [21]:
# 配备电梯处理
train_price['有电梯'] = train_price['配备电梯'].map({'无': 0, '有': 1})
train_price['有电梯'].head()

0    0
1    0
2    1
4    1
5    0
Name: 有电梯, dtype: int64

In [22]:
train_price.drop(columns=['环线','所在楼层','配备电梯','建筑结构','房屋户型'], inplace=True)

### train_rent

In [23]:
# 装修
train_rent['精装修'] = train_rent['装修'].map({'非精装修': 0, '精装修': 1})

# 付款方式
print(train_rent['付款方式'].unique())
train_rent['付款方式'] = train_rent['付款方式'].str.replace(r'http\S+', '未知', regex=True)
print(train_rent['付款方式'].unique())
train_rent = pd.get_dummies(train_rent, columns=['付款方式'], drop_first=True)
train_rent.columns

['季付价' '未知' '年付价' '半年付价' '月付价' '双月付价' 'https://image1.ljcdn.com/rent-'
 'https://img.ljcdn.com/usercent']
['季付价' '未知' '年付价' '半年付价' '月付价' '双月付价']


Index(['城市', '户型', '装修', 'Price', '楼层', '面积（㎡）', '租赁方式', '电梯', '燃气', 'lon',
       'lat', '区县', '板块', '环线位置', '绿化率（%）', '容积率（倍）', '物业费（元/月/㎡）',
       '燃气费（元/m³）', '供热费（元/㎡）', '停车位（个）', '精装修', '付款方式_双月付价', '付款方式_季付价',
       '付款方式_年付价', '付款方式_月付价', '付款方式_未知'],
      dtype='object')

In [24]:
# 租赁方式
train_rent['整租'] = train_rent['租赁方式'].map({'合租': 0, '整租': 1})
# 电梯
train_rent['有电梯'] = train_rent['电梯'].map({'无': 0, '有': 1})
# 燃气
train_rent['有燃气'] = train_rent['燃气'].map({'无': 0,'未知': 0, '有': 1})
# 环线编码
ring_map = {
    '内环内': 1,
    '内环至中环': 2,
    '中环至外环': 3,
    '内环至外环': 3,  
    '二环内': 1,
    '二至三环': 2,
    '三至四环': 3,
    '四至五环': 4,
    '五至六环': 5,
    '六环外': 6,
    '外环外': 6,
    '未知': 0
}
train_rent['环线编码'] = train_rent['环线位置'].map(ring_map)
train_rent['环线编码'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_rent['环线编码'].fillna(0, inplace=True)


In [25]:
# 户型
# 确保 '房屋户型' 列的数据为字符串格式，并填充缺失值
train_rent['户型'] = train_rent['户型'].fillna('未知').astype(str)

# 应用到数据
train_rent[['房间数', '室数', '厅数', '厨数', '卫数']] = train_rent['户型'].apply(lambda x: pd.Series(parse_layout_v2(x)))

# 检查处理后的数据
print(train_rent[['户型', '房间数', '室数', '厅数', '厨数', '卫数']].head())


       户型  房间数  室数  厅数  厨数  卫数
0  1室1厅1卫    0   1   1   0   1
1  1室1厅1卫    0   1   1   0   1
2  1室1厅1卫    0   1   1   0   1
3  3室1厅2卫    0   3   1   0   2
4  1室1厅1卫    0   1   1   0   1


In [26]:
# 楼层
# 定义楼层区间
def convert_floor(floor_data):
    floor_data = str(floor_data).strip()
    # 如果是地下室
    if '地下' in str(floor_data):
        return '地下室'
    # 处理 'x/y层' 格式的楼层数据
    match = re.match(r'(\d+)/(\d+)层', str(floor_data))
    if match:
        floor_num = int(match.group(1))
        total_floors = int(match.group(2))
        # 判断低楼层、中楼层和高楼层
        if floor_num <= total_floors * 0.3:  # 低楼层
            return '低楼层'
        elif floor_num <= total_floors * 0.7:  # 中楼层
            return '中楼层'
        else:  # 高楼层
            return '高楼层'
    # 3. 处理 '低楼层/x楼'、'中楼层/x楼'、'高楼层/x楼' 格式的数据
    match = re.match(r'(低楼层|中楼层|高楼层)/(\d+)楼', floor_data)
    if match:
        return match.group(1)  # 返回对应的楼层类型
    
    
    # 对于其他情况，返回 '未知'
    return '未知'

# 应用转换函数
train_rent['楼层类型'] = train_rent['楼层'].apply(convert_floor)

# 将楼层类型转化为编码
floor_mapping = {'低楼层': 1, '中楼层': 2, '高楼层': 3, '地下室': 4, '未知': 5}
train_rent['楼层类型编码'] = train_rent['楼层类型'].map(floor_mapping)

# 查看结果
train_rent['楼层类型编码'].head()

0    2
1    2
2    1
3    1
4    3
Name: 楼层类型编码, dtype: int64

In [27]:
train_rent.drop(columns=['装修','租赁方式','电梯','燃气','环线位置','户型', '楼层类型','楼层'], inplace=True)
train_rent['城市'] = pd.to_numeric(train_rent['城市'], errors='coerce')
train_rent['面积（㎡）'] = pd.to_numeric(train_rent['面积（㎡）'], errors='coerce')
train_rent[['付款方式_双月付价','付款方式_季付价','付款方式_年付价','付款方式_月付价','付款方式_未知']] = train_rent[['付款方式_双月付价','付款方式_季付价','付款方式_年付价','付款方式_月付价','付款方式_未知']].astype(int)

train_rent.dtypes

城市              int64
Price         float64
面积（㎡）         float64
lon           float64
lat           float64
区县            float64
板块            float64
绿化率（%）        float64
容积率（倍）        float64
物业费（元/月/㎡）    float64
燃气费（元/m³）     float64
供热费（元/㎡）      float64
停车位（个）        float64
精装修             int64
付款方式_双月付价       int64
付款方式_季付价        int64
付款方式_年付价        int64
付款方式_月付价        int64
付款方式_未知         int64
整租              int64
有电梯             int64
有燃气             int64
环线编码            int64
房间数             int64
室数              int64
厅数              int64
厨数              int64
卫数              int64
楼层类型编码          int64
dtype: object

# Feature Engineering

## Skewness

In [28]:
# 计算偏度
skewed_features = train_price.select_dtypes(include=[np.number]).skew()

# 查看偏度较大的特征（绝对值大于1）
skewed_features = skewed_features[skewed_features.abs() > 1]
print(skewed_features)

Price       4.093032
建筑面积（㎡）     1.846964
停车费用（元）     1.839502
环线编码        1.418632
建筑结构编码      2.392151
房间数        15.412717
厨数         -1.998520
卫数          1.591101
有电梯        -1.309793
dtype: float64


In [29]:
cols = ['Price', '建筑面积（㎡）','环线编码', '建筑结构编码']
train_price['log_price'] = np.log1p(train_price['Price'])
train_price['log_area'] = np.log1p(train_price['建筑面积（㎡）'])
train_price['log_ringcode'] = np.log1p(train_price['环线编码'])
train_price['log_stru_code'] = np.log1p(train_price['建筑结构编码'])
for col in ['log_price', 'log_area', 'log_ringcode','log_stru_code']:
    print(col, '偏度 =', train_price[col].skew())



log_price 偏度 = 0.28872458021975933
log_area 偏度 = -0.1268566205136206
log_ringcode 偏度 = 0.853491320319867
log_stru_code 偏度 = 1.6417965577491822


In [30]:
cols = ['Price', '面积（㎡）','卫数']
for col in cols:
    print(col, '偏度 =', train_rent[col].skew())

Price 偏度 = 5.215102536424168
面积（㎡） 偏度 = 1.633344967567743
卫数 偏度 = 1.6031481509854535


In [31]:
train_rent['log_price'] = np.log1p(train_rent['Price'])
train_rent['log_area'] = np.log1p(train_rent['面积（㎡）'])
train_rent['log_bathroom'] = np.log1p(train_rent['卫数'])

for col in ['log_price', 'log_area', 'log_bathroom']:
    print(col, '偏度 =', train_rent[col].skew())

log_price 偏度 = 0.3709961800417537
log_area 偏度 = -0.9612559334777454
log_bathroom 偏度 = 0.9619349746222539


## interaction

In [32]:
train_price.columns

Index(['城市', '区域', '板块', 'Price', '建筑面积（㎡）', 'lon', 'lat', '绿化率（%）', '容积率（倍）',
       '物业费（元/月/㎡）', '燃气费（元/m³）', '供热费（元/㎡）', '停车位（个）', '停车费用（元）', '环线编码',
       '建筑结构编码', '房屋年限_满两年', '房屋年限_满五年', '楼层编码', '房间数', '室数', '厅数', '厨数', '卫数',
       '有电梯', 'log_price', 'log_area', 'log_ringcode', 'log_stru_code'],
      dtype='object')

In [33]:
train_rent.columns

Index(['城市', 'Price', '面积（㎡）', 'lon', 'lat', '区县', '板块', '绿化率（%）', '容积率（倍）',
       '物业费（元/月/㎡）', '燃气费（元/m³）', '供热费（元/㎡）', '停车位（个）', '精装修', '付款方式_双月付价',
       '付款方式_季付价', '付款方式_年付价', '付款方式_月付价', '付款方式_未知', '整租', '有电梯', '有燃气',
       '环线编码', '房间数', '室数', '厅数', '厨数', '卫数', '楼层类型编码', 'log_price',
       'log_area', 'log_bathroom'],
      dtype='object')

In [34]:
train_price['建筑面积_绿化率'] = train_price['建筑面积（㎡）'] * train_price['绿化率（%）']
train_price['城市_建筑面积'] = train_price['城市'] * train_price['建筑面积（㎡）']
train_price['板块_容积率'] = train_price['板块'] * train_price['容积率（倍）']
train_price['log_area_房间数'] = train_price['log_area'] * train_price['房间数'] 


In [35]:
train_rent['城市_面积'] = train_rent['城市'] * train_rent['面积（㎡）']
train_rent['整租_房间数'] = train_rent['整租'] * train_rent['房间数']
train_rent['有电梯_楼层类型'] = train_rent['有电梯'] * train_rent['楼层类型编码']

## Binning

In [36]:
df1 = train_price.copy()

# 自定义分箱区间
bins = [0, 300, 400, df1['停车费用（元）'].max()]  # 分成三个区间：低、中、高
labels = ['低停车费', '中停车费', '高停车费']  # 对应标签
# 创建分箱列
df1['停车费用_bin'] = pd.cut(df1['停车费用（元）'], bins=bins, labels=labels, include_lowest=True)
# 查看结果
print(df1[['停车费用（元）', '停车费用_bin']].sample(5))
# One-Hot 编码
df1_dummies = pd.get_dummies(df1['停车费用_bin'], prefix='停车费用', drop_first=True)
df1 = pd.concat([df1, df1_dummies], axis=1)
# 查看编码后的结果
print(df1.sample(5))

        停车费用（元） 停车费用_bin
85665     400.0     中停车费
95471     300.0     低停车费
16267     400.0     中停车费
102194    400.0     中停车费
96390     300.0     低停车费
       城市    区域      板块         Price  建筑面积（㎡）         lon        lat  绿化率（%）  \
8924    0  68.0   189.0  2.546418e+06    47.96  117.465441  41.057513    33.0   
83231   8  38.0   445.0  3.824157e+05    41.00  103.759701  25.997100    33.0   
67908   3  49.0   731.0  5.894325e+06   151.00  121.773417  32.322982    33.0   
7179    0  28.0  1154.0  4.945845e+06   144.56  117.384111  40.754021    30.0   
27466   2  86.0   756.0  8.991671e+05    53.04  107.569775  30.660238    33.0   

       容积率（倍）  物业费（元/月/㎡）  ...  log_area  log_ringcode  log_stru_code  \
8924     2.32       1.750  ...  3.891004      1.791759       0.693147   
83231    1.17       1.600  ...  3.737670      0.000000       0.693147   
67908    2.50       1.875  ...  5.023881      0.693147       1.609438   
7179     2.50       1.795  ...  4.980588      1.791759       0.693147  

In [37]:
df2 = train_rent.copy()
# 自定义分箱
bins = [0, 20, 30, df2['供热费（元/㎡）'].max()]
labels = ['低供热费', '中供热费', '高供热费']

df2['供热费_bin'] = pd.cut(df2['供热费（元/㎡）'], bins=bins, labels=labels, include_lowest=True)

# One-Hot 编码
df2_dummies = pd.get_dummies(df2['供热费_bin'], prefix='供热费', drop_first=True)
df2 = pd.concat([df2, df2_dummies], axis=1)

# 查看结果
print(df2[['供热费（元/㎡）', '供热费_bin', '供热费_中供热费', '供热费_高供热费']].head())


   供热费（元/㎡） 供热费_bin  供热费_中供热费  供热费_高供热费
0      27.0    中供热费      True     False
1      30.0    中供热费      True     False
2      38.0    高供热费     False      True
3      37.0    高供热费     False      True
4      30.0    中供热费      True     False


# Feature Selection

In [38]:
df1.drop(columns=['Price', '建筑面积（㎡）','停车费用（元）','停车费用_bin','供热费（元/㎡）'],inplace=True)
df1['停车费用_中停车费'] = df1['停车费用_中停车费'].astype(int)  # 转换为整数 0 或 1
df1['停车费用_高停车费'] = df1['停车费用_高停车费'].astype(int) 

In [39]:
df2.drop(columns=['Price', '面积（㎡）','供热费（元/㎡）','供热费_bin'],inplace=True)
df2['供热费_中供热费'] = df2['供热费_中供热费'].astype(int)  # 转换为整数 0 或 1
df2['供热费_高供热费'] = df2['供热费_高供热费'].astype(int) 

In [40]:
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# 特征和目标
X1 = train_price.drop(columns=['log_price', 'Price'])  
y1 = train_price['log_price']

# 标准化数据
scaler = StandardScaler()
X1_scaled = scaler.fit_transform(X1)

# 切分数据集
X1_train, X1_test, y1_train, y1_test = train_test_split(X1_scaled, y1, test_size=0.2, random_state=111)

# LassoCV 自动选择 alpha
lasso_cv = LassoCV(alphas=[0.1, 1, 10, 100], cv=5)  # 使用5折交叉验证
lasso_cv.fit(X1_train, y1_train)

# 打印最优的 alpha 和系数
print("最佳 alpha:", lasso_cv.alpha_)
print("LASSO 回归系数：", lasso_cv.coef_)

# 获取系数不为零的特征
selected_features = X1.columns[lasso_cv.coef_ != 0]
print("Selected features by LASSO:", selected_features)

# 在测试集上进行预测
y1_pred = lasso_cv.predict(X1_test)

# 计算 MAE（Mean Absolute Error）
mae = mean_absolute_error(y1_test, y1_pred)
print("Mean Absolute Error (MAE):", mae)

最佳 alpha: 0.1
LASSO 回归系数： [-0.          0.         -0.          0.18141386  0.12405533  0.
 -0.          0.          0.          0.13556117  0.         -0.
 -0.          0.          0.          0.          0.         -0.
 -0.          0.         -0.          0.          0.          0.
  0.09510693  0.19403403  0.          0.         -0.         -0.
 -0.        ]
Selected features by LASSO: Index(['建筑面积（㎡）', 'lon', '燃气费（元/m³）', 'log_area', 'log_ringcode'], dtype='object')
Mean Absolute Error (MAE): 0.4530943382818381


In [41]:
# 特征和目标
X2 = train_rent.drop(columns=['log_price','Price'])  
y2 = train_rent['log_price']

# 标准化数据
scaler = StandardScaler()
X2_scaled = scaler.fit_transform(X2)

# 切分数据集
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_scaled, y2, test_size=0.2, random_state=111)

# LassoCV 自动选择 alpha
lasso_cv = LassoCV(alphas=[0.1, 1, 10, 100], cv=5)  # 使用5折交叉验证
lasso_cv.fit(X2_train, y2_train)

# 打印最优的 alpha 和系数
print("最佳 alpha:", lasso_cv.alpha_)
print("LASSO 回归系数：", lasso_cv.coef_)

# 获取系数不为零的特征
selected_features = X2.columns[lasso_cv.coef_ != 0]
print("Selected features by LASSO:", selected_features)

# 在测试集上进行预测
y2_pred = lasso_cv.predict(X2_test)

# 计算 MAE（Mean Absolute Error）
mae = mean_absolute_error(y2_test, y2_pred)
print("Mean Absolute Error (MAE):", mae)

最佳 alpha: 0.1
LASSO 回归系数： [-0.          0.22445207  0.07582406  0.          0.         -0.
  0.          0.          0.          0.07534714  0.00581295 -0.
  0.         -0.          0.         -0.          0.         -0.06016162
  0.01955306  0.          0.          0.17186073  0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.        ]
Selected features by LASSO: Index(['面积（㎡）', 'lon', '燃气费（元/m³）', '供热费（元/㎡）', '付款方式_未知', '整租', '环线编码'], dtype='object')
Mean Absolute Error (MAE): 0.45897802282961264


# Modeling

In [45]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd
import joblib  # 用于保存和加载模型

# 设置 pandas 显示选项，避免科学计数法
pd.set_option('display.float_format', '{:,.2f}'.format)

# 1. 准备数据
X2 = train_rent[['面积（㎡）', 'lon', '燃气费（元/m³）', '供热费（元/㎡）', '付款方式_未知', '整租', '环线编码']]  # 使用Lasso筛选后的特征
y2 = df2['log_price']

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=111)

# 2. 定义模型与参数
models = {
    "OLS": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet()
}

param_grids = {
    "Ridge": {"alpha": [0.01, 0.1, 1, 10, 100]},
    "Lasso": {"alpha": [0.001, 0.01, 0.1, 1, 10]},
    "ElasticNet": {"alpha": [0.01, 0.1, 1, 10], "l1_ratio": [0.2, 0.5, 0.8]}
}

# 3. 建模与评估
results = []
best_model_OLS = None
best_model_Lasso = None
best_model_Ridge = None
best_model_ElasticNet = None  # 定义ElasticNet模型变量

# 训练和评估每个模型
for name, model in models.items():
    if name in param_grids:
        grid = GridSearchCV(model, param_grids[name], cv=6, scoring='neg_mean_absolute_error')
        grid.fit(X2_train, y2_train)
        best_model = grid.best_estimator_
    else:
        best_model = model.fit(X2_train, y2_train)
    
    # 保存模型（OLS, Lasso, Ridge, ElasticNet）
    if name == 'OLS':
        best_model_OLS = best_model
    elif name == 'Lasso':
        best_model_Lasso = best_model
    elif name == 'Ridge':
        best_model_Ridge = best_model
    elif name == 'ElasticNet':
        best_model_ElasticNet = best_model  # 保存ElasticNet模型
    
    # 预测
    y2_train_pred = best_model.predict(X2_train)
    y2_test_pred = best_model.predict(X2_test)
    
    # MAE（对数值）
    mae_train = mean_absolute_error(y2_train, y2_train_pred)
    mae_test = mean_absolute_error(y2_test, y2_test_pred)
    mae_cv = -cross_validate(best_model, X2_train, y2_train, cv=6, scoring='neg_mean_absolute_error')['test_score'].mean()
    
    # 将对数价格的 MAE 添加到结果中
    results.append({
        "Model": name,
        "Train MAE (Log)": mae_train,
        "Test MAE (Log)": mae_test,
        "CV MAE (Log)": mae_cv
    })

# 4. 将对数变换值反转回原始值，并计算原始房价的 MAE
# 反转 y2_train 和 y2_test 的对数值
y2_train_original = np.exp(y2_train)
y2_test_original = np.exp(y2_test)

# 对每个模型进行预测后，将预测的对数值反转回原始值
y2_train_pred_original = np.exp(y2_train_pred)
y2_test_pred_original = np.exp(y2_test_pred)

# 计算原始值上的 MAE
mae_train_original = mean_absolute_error(y2_train_original, y2_train_pred_original)
mae_test_original = mean_absolute_error(y2_test_original, y2_test_pred_original)

# 5. 计算原始值的 CV MAE
y2_train_original_cv = np.exp(y2_train)
mae_cv_original = -cross_validate(best_model, X2_train, y2_train_original_cv, cv=6, scoring='neg_mean_absolute_error')['test_score'].mean()

# 将原始 MAE 添加到结果中
for result in results:
    result["Train MAE (Original)"] = mae_train_original
    result["Test MAE (Original)"] = mae_test_original
    result["CV MAE (Original)"] = mae_cv_original

# 6. 打印最终的结果表
results_df = pd.DataFrame(results)
print(results_df)

# 7. 保存训练好的模型到文件（使用joblib）
joblib.dump(best_model_OLS, 'model_OLS_rent.pkl')  # 保存OLS模型
joblib.dump(best_model_Lasso, 'model_Lasso_rent.pkl')  # 保存Lasso模型
joblib.dump(best_model_Ridge, 'model_Ridge_rent.pkl')  # 保存Ridge模型
joblib.dump(best_model_ElasticNet, 'model_ElasticNet_rent.pkl')  # 保存ElasticNet模型


        Model  Train MAE (Log)  Test MAE (Log)  CV MAE (Log)  \
0         OLS             0.43            0.42          0.43   
1       Ridge             0.43            0.42          0.43   
2       Lasso             0.43            0.42          0.43   
3  ElasticNet             0.43            0.42          0.43   

   Train MAE (Original)  Test MAE (Original)  CV MAE (Original)  
0            268,133.88           261,372.76         310,126.70  
1            268,133.88           261,372.76         310,126.70  
2            268,133.88           261,372.76         310,126.70  
3            268,133.88           261,372.76         310,126.70  


['model_ElasticNet_rent.pkl']

In [46]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error
import numpy as np
import pandas as pd
import joblib  # 用于保存和加载模型

# 设置 pandas 显示选项，避免科学计数法
pd.set_option('display.float_format', '{:,.2f}'.format)

# 1. 准备数据
X2 = train_rent[['面积（㎡）', 'lon', '燃气费（元/m³）', '供热费（元/㎡）', '付款方式_未知', '整租', '环线编码']]  # 使用Lasso筛选后的特征
y2 = df2['log_price']

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=111)

# 2. 定义模型与参数
models = {
    "OLS": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet()
}

param_grids = {
    "Ridge": {"alpha": [0.01, 0.1, 1, 10, 100]},
    "Lasso": {"alpha": [0.001, 0.01, 0.1, 1, 10]},
    "ElasticNet": {"alpha": [0.01, 0.1, 1, 10], "l1_ratio": [0.2, 0.5, 0.8]}
}

# 3. 建模与评估
results = []
best_model_OLS = None
best_model_Lasso = None
best_model_Ridge = None
best_model_ElasticNet = None  # 定义ElasticNet模型变量

# 训练和评估每个模型
for name, model in models.items():
    if name in param_grids:
        grid = GridSearchCV(model, param_grids[name], cv=6, scoring='neg_mean_absolute_error')
        grid.fit(X2_train, y2_train)
        best_model = grid.best_estimator_
    else:
        best_model = model.fit(X2_train, y2_train)
    
    # 保存模型（OLS, Lasso, Ridge, ElasticNet）
    if name == 'OLS':
        best_model_OLS = best_model
    elif name == 'Lasso':
        best_model_Lasso = best_model
    elif name == 'Ridge':
        best_model_Ridge = best_model
    elif name == 'ElasticNet':
        best_model_ElasticNet = best_model  # 保存ElasticNet模型
    
    # 预测
    y2_train_pred = best_model.predict(X2_train)
    y2_test_pred = best_model.predict(X2_test)
    
    # MAE（对数值）
    mae_train = mean_absolute_error(y2_train, y2_train_pred)
    mae_test = mean_absolute_error(y2_test, y2_test_pred)
    mae_cv = -cross_validate(best_model, X2_train, y2_train, cv=6, scoring='neg_mean_absolute_error')['test_score'].mean()
    
    # 将对数价格的 MAE 添加到结果中
    results.append({
        "Model": name,
        "Train MAE (Log)": mae_train,
        "Test MAE (Log)": mae_test,
        "CV MAE (Log)": mae_cv
    })

# 4. 将对数变换值反转回原始值，并计算原始房价的 MAE
# 反转 y2_train 和 y2_test 的对数值
y2_train_original = np.exp(y2_train)
y2_test_original = np.exp(y2_test)

# 对每个模型进行预测后，将预测的对数值反转回原始值
y2_train_pred_original = np.exp(y2_train_pred)
y2_test_pred_original = np.exp(y2_test_pred)

# 计算原始值上的 MAE
mae_train_original = mean_absolute_error(y2_train_original, y2_train_pred_original)
mae_test_original = mean_absolute_error(y2_test_original, y2_test_pred_original)

# 5. 将原始 MAE 添加到结果中
for result in results:
    result["Train MAE (Original)"] = mae_train_original
    result["Test MAE (Original)"] = mae_test_original
    result["CV MAE (Original)"] = mae_cv_original

# 6. 打印最终的结果表
results_df = pd.DataFrame(results)
print(results_df)

# 7. 保存训练好的模型到文件（使用joblib）
joblib.dump(best_model_OLS, 'model_OLS_rent.pkl')  # 保存OLS模型
joblib.dump(best_model_Lasso, 'model_Lasso_rent.pkl')  # 保存Lasso模型
joblib.dump(best_model_Ridge, 'model_Ridge_rent.pkl')  # 保存Ridge模型
joblib.dump(best_model_ElasticNet, 'model_ElasticNet_rent.pkl')  # 保存ElasticNet模型


        Model  Train MAE (Log)  Test MAE (Log)  CV MAE (Log)  \
0         OLS             0.43            0.42          0.43   
1       Ridge             0.43            0.42          0.43   
2       Lasso             0.43            0.42          0.43   
3  ElasticNet             0.43            0.42          0.43   

   Train MAE (Original)  Test MAE (Original)  CV MAE (Original)  
0            268,133.88           261,372.76         310,126.70  
1            268,133.88           261,372.76         310,126.70  
2            268,133.88           261,372.76         310,126.70  
3            268,133.88           261,372.76         310,126.70  


['model_ElasticNet_rent.pkl']