In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the raw dataset and normalise column types: the 'Date' column is
# converted to a datetime dtype as part of the same expression.
data = pd.read_csv(r'./total_data.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Show the resulting dtypes as the cell output.
data.dtypes

Date                                     datetime64[ns]
Cumulate_day                                      int64
Month                                             int64
Weekday                                           int64
Day                                               int64
Hour                                              int64
Temperature                                     float64
weekdays                                          int64
Power_price(CNY/kWh)                            float64
Heating_or_cooling_consumption(kwh)             float64
Electrical_equipment_consumption(kwh)           float64
Lighting_Power_Consumption(kwh)                 float64
Total_consumption(kWh)                          float64
dtype: object

In [3]:
# Build the feature set (X) and the multi-column target set (Y).
feature_columns = [
    'Cumulate_day', 'Month', 'Weekday', 'Day', 'Hour', 'Temperature', 'weekdays',
]
target_columns = [
    'Heating_or_cooling_consumption(kwh)',
    'Electrical_equipment_consumption(kwh)',
    'Lighting_Power_Consumption(kwh)',
]
X = data[feature_columns]
Y = data[target_columns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the rows look like a dated/hourly series (Date, Cumulate_day,
# Hour). A random split may leak temporal information between train and test —
# confirm whether a chronological split would be more appropriate.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [4]:
# Hyper-parameter grid for the search: every combination of the values below
# is evaluated (2 x 2 = 4 candidates).
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10],
)

In [5]:
# Create the base random-forest regressor that the grid search will tune.
# The fixed random_state keeps the fitted forests reproducible.
rfr = RandomForestRegressor(random_state=42)

In [6]:
# Use GridSearchCV to find the best hyper-parameter combination for `rfr`.
grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # candidates are ranked by (negated) MSE
    cv=3,                              # 3-fold cross-validation per candidate
    n_jobs=-1,                         # -1 runs the fits in parallel on the CPU
    verbose=2,                         # 2 prints progress for each cross-validation fit
)

# Run the search on the training split; as the last expression of the cell,
# the fitted search object is also displayed.
grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [7]:
# Print the best hyper-parameter combination found by the grid search.
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'max_depth': None, 'n_estimators': 50}


In [8]:
# Take the best estimator selected by the grid search and use it to predict
# the targets for the held-out test features.
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)

In [9]:
# Evaluate the model: R^2 between the test targets and the predictions.
# NOTE(review): Y has three target columns, so this is a single aggregated
# score; r2_score's default multi-output setting is understood to average the
# per-target scores uniformly — confirm that is the intended summary. The grid
# search above ranked candidates by MSE, not by R^2.
r2 = r2_score(Y_test, Y_pred)
print(f'R^2 Score: {r2}')

R^2 Score: 0.9900844376018153


In [10]:
# Identify the most influential features. In the recorded output, Hour ranks
# first, followed by Temperature and weekdays.
importances = best_model.feature_importances_

# Pair each training column with its importance and order the table from the
# most to the least important feature.
feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Display the ranked table as the cell output.
feature_importance

Unnamed: 0,Feature,Importance
4,Hour,0.489601
5,Temperature,0.212055
6,weekdays,0.168218
0,Cumulate_day,0.079775
3,Day,0.024592
1,Month,0.022084
2,Weekday,0.003674


In [11]:
#####