In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
# 加载数据
df = pd.read_csv('zhengzhou_house_data.csv',encoding='UTF-8-SIG')

# 查看数据的前几行
df.head()

Unnamed: 0,name,size_m2,features,orientation,floor,bedroom,living_room,bathroom,Apartment Layout Classification,price_10k_rmb,unit_price_10k_rmb_per_m2
0,中间好楼层 冯庄地铁口 人车分流小区 装修保持好拎包入住,88.55,"['南', '优势户型', '房东急售', '随时可看']",南,33,2,2,1,大户型,79.9,0.9024
1,亚星城市山水 交通便捷 双卫生间 临地铁 拎包入住,131.62,"['南', '满五年', '近地铁', '车位充足']",南,28,3,2,2,大户型,118.0,0.8966
2,2024年交付 121平精装修 双卫三房 拎包入住 看房随时,121.56,"['南', '热门小区', '南北通透', '满五']",南,20,3,2,2,大户型,128.0,1.053
3,铁道陇海家园 近地铁 精装可拎包入住 商圈成熟 南北通透,124.86,"['南', '满五年', '近地铁', '采光较好']",南,26,3,2,2,大户型,150.0,1.2014
4,必看好房 业主二次装修 拎包入住 近 出门就是地铁口,94.41,"['南', '热门小区', '满五', '房东急售']",南,33,3,2,1,大户型,96.0,1.0169


In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 查看数据的前几行，确认列的内容
print(df.head())

# 确定分类变量
categorical_columns = ['name', 'features', 'orientation', 'Apartment Layout Classification']

# 使用LabelEncoder对分类变量进行编码
for col in categorical_columns:
    # 检查列是否为分类变量
    if df[col].dtype == 'object':
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        print(f"Encoded {col} with {len(encoder.classes_)} unique classes")
    else:
        print(f"Column {col} is not of object type and may not need encoding.")

# 查看处理后的数据
print(df.head())

# 如果需要保存处理后的数据
df.to_csv('processed_data.csv', index=False, encoding='UTF-8-SIG')

                             name  size_m2                       features  \
0    中间好楼层 冯庄地铁口 人车分流小区 装修保持好拎包入住    88.55  ['南', '优势户型', '房东急售', '随时可看']   
1       亚星城市山水 交通便捷 双卫生间 临地铁 拎包入住   131.62    ['南', '满五年', '近地铁', '车位充足']   
2  2024年交付 121平精装修 双卫三房 拎包入住 看房随时   121.56    ['南', '热门小区', '南北通透', '满五']   
3    铁道陇海家园 近地铁 精装可拎包入住 商圈成熟 南北通透   124.86    ['南', '满五年', '近地铁', '采光较好']   
4    必看好房 业主二次装修 拎包入住 近   出门就是地铁口    94.41    ['南', '热门小区', '满五', '房东急售']   

  orientation  floor  bedroom  living_room  bathroom  \
0           南     33        2            2         1   
1           南     28        3            2         2   
2           南     20        3            2         2   
3           南     26        3            2         2   
4           南     33        3            2         1   

  Apartment Layout Classification  price_10k_rmb  unit_price_10k_rmb_per_m2  
0                             大户型           79.9                     0.9024  
1                             大户型          1

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler


# 1. 确定需要缩放的数值型列
numerical_columns = [
    'size_m2', 'price_10k_rmb', 'unit_price_10k_rmb_per_m2', 
    'floor', 'bedroom', 'living_room', 'bathroom'
]

# 2. 特征缩放
# 标准化（Standardization）
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
print("Standardized numerical columns.")

# 查看处理后的数据
print(df.head())

# 如果需要保存处理后的数据
df.to_csv('scaled_data.csv', index=False, encoding='UTF-8-SIG')

Standardized numerical columns.
   name   size_m2  features  orientation     floor   bedroom  living_room  \
0   637 -0.649752        73            2  0.715613 -1.290805     0.297985   
1  1020  1.374394       140            2  0.184037  0.437742     0.297985   
2    45  0.901608       152            2 -0.666486  0.437742     0.297985   
3  6158  1.056697       141            2 -0.028594  0.437742     0.297985   
4  3055 -0.374352       156            2  0.715613  0.437742     0.297985   

   bathroom  Apartment Layout Classification  price_10k_rmb  \
0 -0.567369                                0      -1.026410   
1  1.577812                                0      -0.360426   
2  1.577812                                0      -0.185628   
3  1.577812                                0       0.198930   
4 -0.567369                                0      -0.744984   

   unit_price_10k_rmb_per_m2  
0                  -1.141577  
1                  -1.156287  
2                  -0.759639  
3 

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy import sparse  # 导入 scipy.sparse 模块

# 1. 确定目标变量和特征集
target_column = 'price_10k_rmb'
y = df[target_column]  # 目标变量

features_to_remove = [target_column, 'unit_price_10k_rmb_per_m2', 'name']  # 移除目标变量和可能不需要的列
X = df.drop(columns=features_to_remove)  # 特征集

# 2. 处理分类变量（使用 OneHotEncoder）
categorical_columns = ['orientation', 'features', 'Apartment Layout Classification']
numerical_columns = X.columns.drop(categorical_columns).tolist()

# 3. 构建预处理管道
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# 4. 构建完整的处理管道（包括预处理和模型）
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# 5. 应用预处理
X_processed = pipeline.fit_transform(X)

# 如果 X_processed 是稀疏矩阵，先转换为密集矩阵
if isinstance(X_processed, sparse.csr_matrix):  # 修复这里
    X_processed = X_processed.toarray()

# 转换为 DataFrame
feature_names = numerical_columns + list(preprocessor.named_transformers_['cat'].get_feature_names_out())
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_processed_df, y, test_size=0.2, random_state=42)

# 查看划分后的数据集大小
print(f"训练集特征集大小: {X_train.shape}")
print(f"测试集特征集大小: {X_test.shape}")
print(f"训练集目标变量大小: {y_train.shape}")
print(f"测试集目标变量大小: {y_test.shape}")

训练集特征集大小: (5550, 395)
测试集特征集大小: (1388, 395)
训练集目标变量大小: (5550,)
测试集目标变量大小: (1388,)


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split


# 1. 确定目标变量和特征集
target_column = 'price_10k_rmb'
y = df[target_column]  # 目标变量

# 移除目标变量和可能不需要的列
features_to_remove = [target_column, 'unit_price_10k_rmb_per_m2', 'name']
X = df.drop(columns=features_to_remove)  # 特征集

# 2. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. 将训练集和测试集保存为CSV文件
# 保存训练集特征集
X_train.to_csv('train_features.csv', index=False, encoding='UTF-8-SIG')
# 保存训练集目标变量
y_train.to_csv('train_target.csv', index=False, encoding='UTF-8-SIG', header=['price_10k_rmb'])

# 保存测试集特征集
X_test.to_csv('test_features.csv', index=False, encoding='UTF-8-SIG')
# 保存测试集目标变量
y_test.to_csv('test_target.csv', index=False, encoding='UTF-8-SIG', header=['price_10k_rmb'])

print("训练集和测试集已成功保存为CSV文件！")

训练集和测试集已成功保存为CSV文件！
