In [1]:
from sklearn.preprocessing import StandardScaler
from function import metrics_to_dataframe, plot_actual_vs_predicted
import pandas as pd
from function import split_data
import optuna
from sklearn.model_selection import cross_val_score

# Load the dataset and split it into train/test parts.
# split_data is a project helper; 'Cs' is presumably the target column — TODO confirm.
data = pd.read_csv("../data/dataset_reduced.csv")
X_train, X_test, y_train, y_test = split_data(data, 'Cs')

# Standardize the features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics are used for scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [2]:
# Linear Regression (fitted on the unscaled features)
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)
y_train_pred_lr, y_test_pred_lr = lr.predict(X_train), lr.predict(X_test)
lr_metrics = metrics_to_dataframe(
    y_train, y_train_pred_lr,
    y_test, y_test_pred_lr,
    'Linear Regression',
)
lr_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,Linear Regression,0.421558,22.900748,34.682054,30.511129,0.547166,19.912334,37.042903,27.922848


In [3]:
# Support Vector Regression (fitted on the standardized features)
from sklearn.svm import SVR

# Hand-set hyperparameters for a degree-6 polynomial kernel.
svr_params = dict(
    kernel='poly',
    degree=6,
    gamma='scale',
    coef0=3,
    epsilon=10,
    verbose=True,  # makes the solver print its own log line(s) while fitting
    C=0.5,
)

svr = SVR(**svr_params).fit(X_train_scaled, y_train)
y_train_pred_svr, y_test_pred_svr = svr.predict(X_train_scaled), svr.predict(X_test_scaled)
svr_metrics = metrics_to_dataframe(
    y_train, y_train_pred_svr,
    y_test, y_test_pred_svr,
    'Support Vector Regression',
)
svr_metrics

[LibSVM]

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,Support Vector Regression,0.852376,10.862432,14.656757,15.413714,0.868775,11.107893,17.472105,15.031371


In [4]:
# 随机森林 Random Forest
from sklearn.ensemble import RandomForestRegressor

# Optuna objective for the random forest search
def objective(trial):
    """Score one random-forest hyperparameter candidate.

    Samples a hyperparameter set from ``trial``, builds a
    ``RandomForestRegressor`` from it (with ``random_state=21``) and evaluates
    it with 5-fold cross-validation on the notebook-level ``X_train`` /
    ``y_train``.

    Parameters
    ----------
    trial
        Object providing ``suggest_int`` / ``suggest_float`` (the Optuna trial
        passed in by ``study.optimize``).

    Returns
    -------
    The mean cross-validated RMSE (lower is better).
    """
    # Dict entries are evaluated top to bottom, so the suggest_* calls run in
    # exactly the order listed here.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }
    candidate = RandomForestRegressor(random_state=21, **sampled)

    # 5-fold cross-validation; the scorer reports the *negated* RMSE.
    fold_scores = cross_val_score(
        candidate, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error',
    )
    # Flip the sign back so the study minimizes a plain RMSE.
    rmse = -fold_scores.mean()
    return rmse

# Create the Optuna study and run the random forest search.
# The sampler is seeded explicitly: an unseeded sampler proposes a different
# sequence of trials on every run, so the "best" parameters change each time the
# cell is executed and the notebook cannot be reproduced with Restart & Run All.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=21),
)
study.optimize(objective, n_trials=200)

# Report the best hyperparameters found by the search
best_rf_params = study.best_params
print("Best RF_parameters: ", best_rf_params)

[I 2024-12-02 23:59:04,158] A new study created in memory with name: no-name-d4215f07-7e3e-48f4-9113-5c6fdba32915
[I 2024-12-02 23:59:04,590] Trial 0 finished with value: 20.643193911380017 and parameters: {'n_estimators': 92, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 0.929944334271702}. Best is trial 0 with value: 20.643193911380017.
[I 2024-12-02 23:59:05,120] Trial 1 finished with value: 25.267882157499553 and parameters: {'n_estimators': 143, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 0.8331120035433768}. Best is trial 0 with value: 20.643193911380017.
[I 2024-12-02 23:59:05,923] Trial 2 finished with value: 16.292569893487517 and parameters: {'n_estimators': 192, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 0.5236945375503068}. Best is trial 2 with value: 16.292569893487517.
[I 2024-12-02 23:59:06,495] Trial 3 finished with value: 13.234892077277731 and parameters: {'n_esti

Best RF_parameters:  {'n_estimators': 179, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5070465917404345}


In [11]:
# Final random forest, trained with hand-fixed hyperparameters.
# NOTE(review): these values differ from the best parameters recorded in the
# saved Optuna output of this notebook (n_estimators=179, max_depth=15,
# max_features≈0.507) — confirm that 60 / 12 / 0.46 are intentional.
rf_params = dict(
    n_estimators=60,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=0.46,
    random_state=21,
)
rf = RandomForestRegressor(**rf_params).fit(X_train, y_train)
y_train_pred_rf, y_test_pred_rf = rf.predict(X_train), rf.predict(X_test)
rf_metrics = metrics_to_dataframe(
    y_train, y_train_pred_rf,
    y_test, y_test_pred_rf,
    'Random Forest',
)
rf_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,Random Forest,0.987531,2.983281,4.116053,4.479728,0.947483,6.802076,10.183421,9.509109


In [6]:
# 极端梯度提升回归 XGBoost
from xgboost import XGBRegressor
def objective(trial):
    """Score one XGBoost hyperparameter candidate.

    Samples a hyperparameter set from ``trial``, builds an ``XGBRegressor``
    from it (with ``random_state=21``) and evaluates it with 5-fold
    cross-validation on the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial
        Object providing ``suggest_int`` / ``suggest_float`` (the Optuna trial
        passed in by ``study.optimize``).

    Returns
    -------
    The mean cross-validated RMSE (lower is better).
    """
    # Dict entries are evaluated top to bottom, so the suggest_* calls run in
    # exactly the order listed here.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
    }
    candidate = XGBRegressor(random_state=21, **sampled)

    # 5-fold cross-validation; the scorer reports the *negated* RMSE.
    fold_scores = cross_val_score(
        candidate, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error',
    )
    # Flip the sign back so the study minimizes a plain RMSE.
    rmse = -fold_scores.mean()
    return rmse

# Create the Optuna study and run the XGBoost search.
# The sampler is seeded explicitly: an unseeded sampler proposes a different
# sequence of trials on every run, so the "best" parameters change each time the
# cell is executed and the notebook cannot be reproduced with Restart & Run All.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=21),
)
study.optimize(objective, n_trials=500)

# Report the best hyperparameters found by the search
best_xgb_params = study.best_params
print("Best XGB_parameters: ", best_xgb_params)

[I 2024-12-03 00:01:40,885] A new study created in memory with name: no-name-723d9255-7a84-4fca-94be-05d8715a6191
[I 2024-12-03 00:01:41,190] Trial 0 finished with value: 13.007460077839104 and parameters: {'n_estimators': 140, 'learning_rate': 0.24577848574293748, 'subsample': 0.1940697730435056, 'gamma': 0.1182560239839654, 'max_depth': 11, 'min_child_weight': 8, 'reg_alpha': 0.26464020226092977, 'reg_lambda': 5.185972189617651, 'colsample_bytree': 0.46231492619113235, 'colsample_bylevel': 0.6112423582903251, 'colsample_bynode': 0.5120014602397008}. Best is trial 0 with value: 13.007460077839104.
[I 2024-12-03 00:01:41,370] Trial 1 finished with value: 33.17062906563083 and parameters: {'n_estimators': 55, 'learning_rate': 0.01481010045395242, 'subsample': 0.6972459723100667, 'gamma': 0.41811178059368515, 'max_depth': 13, 'min_child_weight': 4, 'reg_alpha': 0.3788450201266905, 'reg_lambda': 5.808615788726278, 'colsample_bytree': 0.3312480559586331, 'colsample_bylevel': 0.196244220884

Best XGB_parameters:  {'n_estimators': 293, 'learning_rate': 0.24134908727438506, 'subsample': 0.5919708363574159, 'gamma': 0.07712636682267804, 'max_depth': 18, 'min_child_weight': 2, 'reg_alpha': 0.16869955875388754, 'reg_lambda': 4.866179315921636, 'colsample_bytree': 0.46927709470994383, 'colsample_bylevel': 0.4854611626141926, 'colsample_bynode': 0.7031665655760044}


In [12]:
# Final XGBoost model, trained with hand-fixed hyperparameters.
# The values appear to be rounded from the best parameters recorded in the
# saved Optuna output of this notebook (e.g. n_estimators 293 -> 300).
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.24,
    subsample=0.6,
    gamma=0.07,
    max_depth=18,
    min_child_weight=2,
    reg_alpha=0.17,
    reg_lambda=4.87,
    colsample_bytree=0.47,
    colsample_bylevel=0.49,
    colsample_bynode=0.70,
    random_state=21,
)
xgb = XGBRegressor(**xgb_params).fit(X_train, y_train)
y_train_pred_xgb, y_test_pred_xgb = xgb.predict(X_train), xgb.predict(X_test)
xgb_metrics = metrics_to_dataframe(
    y_train, y_train_pred_xgb,
    y_test, y_test_pred_xgb,
    'XGBoost',
)
xgb_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,XGBoost,0.99756,1.326169,1.79382,1.981496,0.977479,4.336434,6.457724,6.22702


In [8]:
# LightGBM
from lightgbm import LGBMRegressor

def objective(trial):
    """Score one LightGBM hyperparameter candidate.

    Samples a hyperparameter set from ``trial``, builds an ``LGBMRegressor``
    from it (with ``verbose=-1`` and ``random_state=21``) and evaluates it with
    5-fold cross-validation on the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial
        Object providing ``suggest_int`` / ``suggest_float`` (the Optuna trial
        passed in by ``study.optimize``).

    Returns
    -------
    The mean cross-validated RMSE (lower is better).
    """
    # Dict entries are evaluated top to bottom, so the suggest_* calls run in
    # exactly the order listed here.
    # NOTE(review): LightGBM documents that row subsampling only takes effect
    # when the bagging frequency (`subsample_freq`) is non-zero; it is not set
    # here, so the sampled `subsample` value likely has no effect — confirm.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }
    candidate = LGBMRegressor(verbose=-1, random_state=21, **sampled)

    # 5-fold cross-validation; the scorer reports the *negated* RMSE.
    fold_scores = cross_val_score(
        candidate, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error',
    )
    # Flip the sign back so the study minimizes a plain RMSE.
    rmse = -fold_scores.mean()
    return rmse

# Create the Optuna study and run the LightGBM search.
# The sampler is seeded explicitly: an unseeded sampler proposes a different
# sequence of trials on every run, so the "best" parameters change each time the
# cell is executed and the notebook cannot be reproduced with Restart & Run All.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=21),
)
study.optimize(objective, n_trials=500)

# Report the best hyperparameters found by the search
best_lgbm_params = study.best_params
print("Best LGBM_parameters: ", best_lgbm_params)

[I 2024-12-03 00:07:16,201] A new study created in memory with name: no-name-36123ab8-6ef7-4002-9d06-7480a616f398
[I 2024-12-03 00:07:16,496] Trial 0 finished with value: 10.266097009479166 and parameters: {'n_estimators': 272, 'learning_rate': 0.19958255077938547, 'subsample': 0.2555523634884629, 'max_depth': 22, 'colsample_bytree': 0.818349008297414, 'min_child_samples': 17, 'num_leaves': 79, 'reg_alpha': 0.9347415259377686, 'reg_lambda': 0.5984680446986921}. Best is trial 0 with value: 10.266097009479166.
[I 2024-12-03 00:07:16,639] Trial 1 finished with value: 9.532346042060215 and parameters: {'n_estimators': 95, 'learning_rate': 0.2220182781471903, 'subsample': 0.9995501309657335, 'max_depth': 15, 'colsample_bytree': 0.3154461914364857, 'min_child_samples': 7, 'num_leaves': 60, 'reg_alpha': 0.5670178684039395, 'reg_lambda': 0.3030566652921637}. Best is trial 1 with value: 9.532346042060215.
[I 2024-12-03 00:07:16,753] Trial 2 finished with value: 10.585238171386576 and parameters

Best LGBM_parameters:  {'n_estimators': 246, 'learning_rate': 0.28418110827577264, 'subsample': 0.5125240691075271, 'max_depth': 20, 'colsample_bytree': 0.31054172055463847, 'min_child_samples': 10, 'num_leaves': 14, 'reg_alpha': 0.17756031599844185, 'reg_lambda': 0.6220799728565203}


In [13]:
# Final LightGBM model, trained with hand-fixed hyperparameters.
# The values appear to be rounded from the best parameters recorded in the
# saved Optuna output of this notebook (e.g. n_estimators 246 -> 250).
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# the bagging frequency (`subsample_freq`) is non-zero; it is not set here, so
# `subsample` likely has no effect — confirm.
lgbm_params = dict(
    colsample_bytree=0.31,
    learning_rate=0.28,
    max_depth=20,
    min_child_samples=10,
    n_estimators=250,
    num_leaves=14,
    reg_alpha=0.18,
    reg_lambda=0.62,
    subsample=0.51,
    verbose=-1,
    random_state=21,
)

lgbm = LGBMRegressor(**lgbm_params).fit(X_train, y_train)
y_train_pred_lgbm, y_test_pred_lgbm = lgbm.predict(X_train), lgbm.predict(X_test)
lgbm_metrics = metrics_to_dataframe(
    y_train, y_train_pred_lgbm,
    y_test, y_test_pred_lgbm,
    'LightGBM',
)
lgbm_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,LightGBM,0.995408,1.916616,2.6502,2.718406,0.978632,4.373229,6.785443,6.065547


In [15]:
# Stack the per-model metric rows into one comparison table.
# ignore_index=True renumbers the rows 0..n-1. Without it every row keeps the
# index label 0 from its single-row source frame, so label-based lookups such as
# metrics.loc[0] would return all models instead of one.
metrics = pd.concat(
    [lr_metrics, svr_metrics, rf_metrics, xgb_metrics, lgbm_metrics],
    ignore_index=True,
)
# Round only the displayed copy; `metrics` keeps full precision.
metrics_rounded = metrics.round(3)
metrics_rounded

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,Linear Regression,0.422,22.901,34.682,30.511,0.547,19.912,37.043,27.923
0,Support Vector Regression,0.852,10.862,14.657,15.414,0.869,11.108,17.472,15.031
0,Random Forest,0.988,2.983,4.116,4.48,0.947,6.802,10.183,9.509
0,XGBoost,0.998,1.326,1.794,1.981,0.977,4.336,6.458,6.227
0,LightGBM,0.995,1.917,2.65,2.718,0.979,4.373,6.785,6.066
