In [17]:
import re
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ParameterGrid

# 读取训练数据
df = pd.read_csv('../dataset/train.csv')
# 查看数据集信息（列名、数据类型、缺失值等）
df.info()
# 随机抽样3条数据预览
df.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47439 entries, 0 to 47438
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Id                           47439 non-null  int64  
 1   Address                      47439 non-null  object 
 2   Sold Price                   47439 non-null  float64
 3   Summary                      47085 non-null  object 
 4   Type                         47439 non-null  object 
 5   Year built                   46394 non-null  float64
 6   Heating                      40587 non-null  object 
 7   Cooling                      26745 non-null  object 
 8   Parking                      46065 non-null  object 
 9   Lot                          33258 non-null  float64
 10  Bedrooms                     44567 non-null  object 
 11  Bathrooms                    43974 non-null  float64
 12  Full bathrooms               39574 non-null  float64
 13  Total interior l

Unnamed: 0,Id,Address,Sold Price,Summary,Type,Year built,Heating,Cooling,Parking,Lot,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
41482,41482,14602 Wiley St,981000.0,Former Model Home! Open Floor Plan w/Volume C...,SingleFamily,1998.0,"Central, Electric, Forced Air",Central Air,Attached,5227.2,...,Attached,695230.0,8991.0,2020-11-11,918000.0,2002-05-08,525000.0,San Leandro,94579,CA
35785,35785,12400 Bakers Creek Rd,950000.0,Wake up to the sound of birds and sunlight fil...,SingleFamily,2005.0,"Forced air, Stove, Propane / Butane, Solar, Wo...",,"Garage - Detached, Covered",8494200.0,...,"Garage - Detached, Covered",735021.0,9004.0,2020-03-05,1199500.0,2012-03-19,640000.0,Redwood Valley,95470,CA
23932,23932,431 Fernwood Dr,1725000.0,Perfectly remodeled San Bruno home with an abu...,SingleFamily,1956.0,Forced Air,Central AC,"Garage, Garage - Attached",5601.0,...,"Garage, Garage - Attached",1684020.0,17303.0,2020-07-17,1750000.0,2018-06-22,1651000.0,San Bruno,94066,CA


In [18]:
# ================= 特征工程 =================
df_train = df.copy()
# 读取测试集数据
df_test = pd.read_csv('../dataset/test.csv')
# 将日期字段转换为datetime类型
for field in ['Listed On', 'Last Sold On']:
    df_train[field] = pd.to_datetime(df_train[field])
    df_test[field] = pd.to_datetime(df_test[field])

In [19]:
# 自动识别特征类型
cate_cols = []  # 分类特征
num_cols = []   # 数值特征
date_cols = []  # 日期特征
dtypes = df_train.dtypes
for col, dtype in dtypes.items():
    if dtype == 'object':
        cate_cols.append(col)
    elif dtype.name.startswith('datetime'):
        date_cols.append(col)
    else:
        num_cols.append(col)

In [20]:
# 定义ID列和目标列
id_col = 'Id'
target_col = 'Sold Price'

# 从数值特征中移除ID和目标列
for col in [id_col, target_col]:
    num_cols.remove(col)

# 打印特征分类结果
print(f'分类特征：{cate_cols}\n数值特征：{num_cols}\n日期特征：{date_cols}')

分类特征：['Address', 'Summary', 'Type', 'Heating', 'Cooling', 'Parking', 'Bedrooms', 'Region', 'Elementary School', 'Middle School', 'High School', 'Flooring', 'Heating features', 'Cooling features', 'Appliances included', 'Laundry features', 'Parking features', 'City', 'State']
数值特征：['Year built', 'Lot', 'Bathrooms', 'Full bathrooms', 'Total interior livable area', 'Total spaces', 'Garage spaces', 'Elementary School Score', 'Elementary School Distance', 'Middle School Score', 'Middle School Distance', 'High School Score', 'High School Distance', 'Tax assessed value', 'Annual tax amount', 'Listed Price', 'Last Sold Price', 'Zip']
日期特征：['Listed On', 'Last Sold On']


In [21]:
# ================= 数值特征处理器 =================
class Num_Features(BaseEstimator, TransformerMixin):
    def __init__(self, cols = [], fillna = False, addna = False):
        self.fillna = fillna      # 是否填充缺失值
        self.cols = cols          # 处理的列名
        self.addna = addna        # 是否添加缺失值指示列
        self.na_cols = []         # 存在缺失值的列
        self.imputers = {}        # 存储每列的填充值
        
    def fit(self, X, y=None):
        for col in self.cols:
            if self.fillna:
                # 用中位数填充缺失值
                self.imputers[col] = X[col].median()
            if self.addna and X[col].isnull().sum():
                # 记录存在缺失值的列
                self.na_cols.append(col)
        print(f'存在缺失值的列及填充值：{self.imputers}')
        return self
    
    def transform(self, X, y=None):
        df = X.loc[:, self.cols]
        for col in self.imputers:
            # 执行缺失值填充
            df[col].fillna(self.imputers[col], inplace=True)
        for col in self.na_cols:
            # 添加缺失值指示列（True表示原值为缺失）
            df[col+'_na'] = pd.isnull(df[col])
        return df

In [22]:
# ================= 通用填充器 =================
class Imputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy, fill_value):
        self.strategy = strategy   # 填充策略
        self.fill_value = fill_value  # 填充值
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        for col, content in X.items():
            # 用指定值填充缺失
            X[col].fillna(self.fill_value, inplace=True)
        return X

In [23]:
# 数值特征处理流水线
num_pipeline = Pipeline([
    ('select_num', Num_Features(cols=num_cols, fillna='median', addna=True)),
])
# 应用数值处理
X_num = num_pipeline.fit_transform(df_train)

存在缺失值的列及填充值：{'Year built': 1967.0, 'Lot': 6502.0, 'Bathrooms': 2.0, 'Full bathrooms': 2.0, 'Total interior livable area': 1566.0, 'Total spaces': 1.0, 'Garage spaces': 1.0, 'Elementary School Score': 6.0, 'Elementary School Distance': 0.5, 'Middle School Score': 5.0, 'Middle School Distance': 1.0, 'High School Score': 6.0, 'High School Distance': 1.3, 'Tax assessed value': 547524.0, 'Annual tax amount': 7129.0, 'Listed Price': 949000.0, 'Last Sold Price': 598000.0, 'Zip': 94114.0}


In [24]:
# ================= 分类特征编码器 =================
class CatEncoder(BaseEstimator, TransformerMixin):
    def __init__(self,cols, max_n_cat=7, onehot_cols=[], orders={}):
        self.cols = cols          # 待处理列
        self.onehot_cols=onehot_cols  # 需要one-hot编码的列
        self.cats = {}            # 存储分类信息
        self.max_n_cat = max_n_cat  # 最大分类数阈值
        self.orders = orders      # 自定义分类顺序
        
    def fit(self, X, y=None):
        df_cat =  X.loc[:, self.cols]
        for n,c in df_cat.items():
            # 填充缺失并转换为有序分类
            df_cat[n].fillna('NAN', inplace=True)
            df_cat[n] = c.astype('category').cat.as_ordered()
            # 应用预定义的分类顺序
            if n in self.orders:
                df_cat[n].cat.set_categories(self.orders[n], ordered=True, inplace=True)
            # 根据分类数量决定编码方式
            cats_count = len(df_cat[n].cat.categories)
            if cats_count<=2 or cats_count>self.max_n_cat:
                self.cats[n] = df_cat[n].cat.categories
                if n in self.onehot_cols:
                    self.onehot_cols.remove(n)
            elif n not in self.onehot_cols:
                self.onehot_cols.append(n)
        print(self.onehot_cols)
        return self
    
    def transform(self, df, y=None):
        X = df.loc[:, self.cols]
        for col in self.cats:
            X[col].fillna('NAN', inplace=True)
            # 转换为有序分类并编码为数字
            cat_data = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
            X.loc[:, col] = cat_data.codes
        
        # 对指定列进行one-hot编码
        if len(self.onehot_cols):
            df_1h = pd.get_dummies(X[self.onehot_cols], dummy_na=True)
            df_drop=X.drop(self.onehot_cols,axis=1)
            return pd.concat([df_drop, df_1h], axis=1)
        return X

In [25]:
# 分类特征处理流水线
cat_pipeline = Pipeline([
    ('cat_encoder', CatEncoder(cols=cate_cols))
])
# 应用分类处理
X_cate = cat_pipeline.fit_transform(df_train)

[]


In [26]:
# ================= 日期特征处理器 =================
def add_datepart(df, field_name, prefix=None, drop=True, time=False):
    """将日期字段分解为多个时间特征"""
    field = df[field_name]
    prefix = prefix or re.sub('[Dd]ate$', '', field_name)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 
            'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: 
        attr += ['Hour', 'Minute', 'Second']
    
    # 处理周数（兼容不同Pandas版本）
    week = field.dt.isocalendar().week.astype(field.dt.day.dtype) if hasattr(field.dt, 'isocalendar') else field.dt.week
    
    # 提取日期属性
    for n in attr: 
        df[prefix + n] = getattr(field.dt, n.lower()) if n != 'Week' else week
    
    # 添加时间戳特征
    mask = ~field.isna()
    df[prefix + 'Elapsed'] = np.where(mask, field.values.astype(np.int64) // 10**9, np.nan)
    
    if drop: 
        df.drop(field_name, axis=1, inplace=True)
    return df

In [27]:
class Datepart(BaseEstimator, TransformerMixin):
    """日期特征转换器"""
    def __init__(self, cols, time=False):
        self.cols = cols   # 日期列名
        self.time = time   # 是否包含时间部分
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        df_dates = X.loc[:, self.cols]
        for col in self.cols:
            # 应用日期分解
            add_datepart(df_dates, col, time=False)
        return df_dates

# 日期处理流水线
date_pipeline = Pipeline([
    ('datepart', Datepart(cols=date_cols)),  # 日期分解
    ('imputer', Imputer(strategy="constant", fill_value=-1)),  # 缺失值填充
])

In [28]:
# 应用日期处理
X_date = date_pipeline.fit_transform(df_train)

In [29]:
# ================= 数据整合 =================
# 对数转换目标变量（解决偏态分布）
y_train = np.log(df_train[target_col])
# 合并所有处理后的特征
X_train = pd.concat([X_num, X_cate, X_date], axis=1)
X_train.shape, y_train.shape

((47439, 79), (47439,))

In [32]:
# ================= 超参数调优 =================
model = RandomForestRegressor(oob_score=True, random_state=3, n_jobs=-1)
# 参数网格
params ={
    'n_estimators': [200, 500, 700],  # 树的数量
    'min_samples_leaf': [1, 3, 10],  # 叶节点最小样本数
    'max_features': [0.5, 'sqrt'],  # 最大特征比例
    'max_depth': [8, 13, 20],  # 树的最大深度
    'min_samples_split': [2, 4]   # 节点分裂最小样本数
}

# 网格搜索寻找最优参数
best_score = 0
for g in ParameterGrid(params):
    model.set_params(**g)
    model.fit(X_train, y_train)
    # 使用OOB分数评估
    if model.oob_score_ > best_score:
        best_score = model.oob_score_
        best_grid = g
        print('oob:', best_score, best_grid)

oob: 0.9341630951318693 {'max_depth': 8, 'max_features': 0.5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
oob: 0.9345716937526515 {'max_depth': 8, 'max_features': 0.5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
oob: 0.9346658642987922 {'max_depth': 8, 'max_features': 0.5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 700}
oob: 0.9347558828116947 {'max_depth': 8, 'max_features': 0.5, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 700}
oob: 0.9347966569327794 {'max_depth': 8, 'max_features': 0.5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 200}
oob: 0.9350193430268144 {'max_depth': 8, 'max_features': 0.5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 500}
oob: 0.9350929090706385 {'max_depth': 8, 'max_features': 0.5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 700}
oob: 0.9406633717454664 {'max_depth': 13, 'max_features': 0.5, 'min_samples_leaf': 1, 'min

In [33]:
# 用选定参数训练模型
m = RandomForestRegressor(
    n_jobs=-1, 
    n_estimators=700, 
    oob_score=True, 
    max_depth=20, 
    min_samples_leaf=3, 
    min_samples_split=2, 
    max_features=0.5
)
m.fit(X_train, y_train)
m.oob_score_  # 查看OOB分数

0.9419816304101775

In [34]:
# ================= 特征选择 =================
def rf_feat_importance(m, df):
    """获取随机森林特征重要性"""
    return pd.DataFrame(
        {'cols':df.columns, 'imp':m.feature_importances_}
    ).sort_values('imp', ascending=False)

# 获取特征重要性排序
fi = rf_feat_importance(m, X_train)
fi[:30]  # 查看前30重要特征

Unnamed: 0,cols,imp
15,Listed Price,0.68621
13,Tax assessed value,0.121664
14,Annual tax amount,0.072796
16,Last Sold Price,0.024973
4,Total interior livable area,0.017034
17,Zip,0.016497
65,Listed OnElapsed,0.007866
7,Elementary School Score,0.004764
2,Bathrooms,0.003951
78,Last Sold OnElapsed,0.003589


In [35]:
# 人工指定删除/保留的特征
del_cols = [  # 需手动删除的特征列表
    # 'Annual tax amount', 'Bathrooms', 'Last Sold OnYear', 'Summary', 'Address', 
    # 'Last Sold OnDayofyear','City','Heating features','Last Sold OnDay', 
    # 'Listed OnDayofyear', 'Elementary School', 'Listed OnDay', 'Type', 'Full bathrooms',
    # 'High School', 'Middle School', 'Parking', 'Listed OnYear', 'Last Sold OnYear', 'Region'
]  
keep_cols = [  # 强制保留的特征
    'Listed Price', 'Tax assessed value', 
    'Annual tax amount', 'Last Sold Price', 
    'Total interior livable area', 'Zip'
]

# 基于重要性阈值初筛特征
Threshold = 0.0009
to_keep = fi[fi.imp > Threshold].cols
to_keep = [col for _, col in to_keep.items()]

In [36]:
# 应用人工规则调整特征
for col in del_cols:
    if col in to_keep:
        to_keep.remove(col)
for col in keep_cols:
    if col not in to_keep:
        to_keep.append(col)
print(to_keep)

# 构建特征选择后的数据集
df_keep = X_train[to_keep].copy()

['Listed Price', 'Tax assessed value', 'Annual tax amount', 'Last Sold Price', 'Total interior livable area', 'Zip', 'Listed OnElapsed', 'Elementary School Score', 'Bathrooms', 'Last Sold OnElapsed', 'Listed OnYear', 'Year built', 'Full bathrooms', 'Lot', 'Middle School Score', 'Parking', 'High School Distance', 'Last Sold OnYear', 'Summary', 'High School', 'Address', 'Type', 'Bedrooms', 'Elementary School Distance', 'Elementary School', 'Last Sold OnDayofyear', 'Listed OnDayofyear', 'City']


In [37]:
# 在新特征子集上训练模型
m1 = RandomForestRegressor(
    n_jobs=-1, 
    random_state=3, 
    n_estimators=700, 
    oob_score=True, 
    max_depth=20, 
    min_samples_leaf=3, 
    min_samples_split=2, 
    max_features=0.5
)
m1.fit(df_keep, y_train)
print(m1.oob_score_)  # 输出OOB分数

0.9417614451580819


In [38]:
# 递归特征消除（通过OOB分数评估）
cols = to_keep
scores = []
feats = []
for col in cols:
    tmp = to_keep.copy()
    if col in keep_cols:  # 跳过强制保留的特征
        continue
    tmp.remove(col)
    df_tmp = X_train[tmp].copy()
    # 快速评估特征重要性
    m1 = RandomForestRegressor(
        n_jobs=-1, 
        random_state=3, 
        n_estimators=30,  # 少量树加速计算
        oob_score=True, 
        max_depth=20, 
        min_samples_leaf=3, 
        min_samples_split=2, 
        max_features=0.5
    )
    m1.fit(df_tmp, y_train)
    scores.append(m1.oob_score_)
    feats.append(col)

# 按OOB分数排序待删除特征
to_del = sorted(zip(scores, feats), reverse=True)
to_del

[(0.9382947404826335, 'Elementary School'),
 (0.9379410746105055, 'Last Sold OnYear'),
 (0.9379386301290793, 'Elementary School Score'),
 (0.9378898426392626, 'Last Sold OnElapsed'),
 (0.9376467316278406, 'Summary'),
 (0.9376445314297753, 'Bedrooms'),
 (0.9376001806461817, 'High School Distance'),
 (0.9375940841592412, 'Lot'),
 (0.937584357319157, 'City'),
 (0.9375244960763983, 'Listed OnDayofyear'),
 (0.937488527684766, 'Parking'),
 (0.9374287980472045, 'Middle School Score'),
 (0.9373436859305093, 'Year built'),
 (0.9373193170880935, 'High School'),
 (0.9372585495784047, 'Last Sold OnDayofyear'),
 (0.9372268264128717, 'Full bathrooms'),
 (0.9372255263790797, 'Address'),
 (0.9371165445359102, 'Bathrooms'),
 (0.9370941870093856, 'Elementary School Distance'),
 (0.9370892158125483, 'Listed OnYear'),
 (0.9370744387484525, 'Type'),
 (0.9358671596897381, 'Listed OnElapsed')]

In [39]:
# ================= 最终特征子集 =================
to_keep_final = [
    'Listed Price', 'Tax assessed value', 'Last Sold Price', 'Zip', 
    'Total interior livable area', 'Elementary School Score', 
    'Listed OnElapsed', 'Last Sold OnElapsed', 'Full bathrooms', 
    'Year built', 'Listed OnYear', 'Lot', 'Parking', 'Type', 
    'Middle School Score', 'High School Distance', 
    'Elementary School Distance', 'Bedrooms'
]
X_train_final = X_train[to_keep_final].copy()

In [40]:
# ================= 最终模型调优 =================
model = RandomForestRegressor(
    oob_score=True, 
    random_state=3, 
    n_jobs=-1, 
    max_features=0.5
)
# 精调参数网格
params ={
    'n_estimators': [700],      # 增加树的数量
    'min_samples_leaf': [3],
    'max_features': [0.5],
    'max_depth': [20],          # 调整深度
    'min_samples_split': [2]
}

# 执行网格搜索
best_score = 0
for g in ParameterGrid(params):
    model.set_params(**g)
    model.fit(X_train_final, y_train)
    if model.oob_score_ > best_score:
        best_score = model.oob_score_
        best_grid = g
        print('best oob:', best_score, best_grid)

best oob: 0.9423924509180475 {'max_depth': 20, 'max_features': 0.5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 700}


In [41]:
# 用最优参数训练最终模型
model_final = RandomForestRegressor(
    n_jobs=-1, 
    n_estimators=700, 
    max_depth=20, 
    min_samples_leaf=3, 
    min_samples_split=2, 
    max_features=0.5
)
model_final.fit(X_train_final, y_train)

In [42]:
# ================= 测试集预测 =================
# 预处理测试集数据
X_test_num = num_pipeline.transform(df_test)
X_test_cate = cat_pipeline.transform(df_test)
X_test_date = date_pipeline.transform(df_test)
# 合并测试集特征
df_t = pd.concat([X_test_num, X_test_cate, X_test_date], axis=1)
# 选择最终特征子集
df_t = df_t[to_keep_final]

In [43]:
# 预测并还原对数转换
pred = model_final.predict(df_t)
# 生成提交数据
df_pred = pd.DataFrame({
    'Id': df_test['Id'],
    'Sold Price': np.exp(pred)  # 指数转换还原原始量纲
})
print(df_pred.head())
# 保存预测结果
df_pred.to_csv('../dataset/submission_RandomForest.csv', index=False)

      Id    Sold Price
0  47439  8.060174e+05
1  47440  5.190542e+05
2  47441  8.409972e+05
3  47442  7.840630e+05
4  47443  1.097328e+06
