# 模型评估

## 读取数据

In [4]:
from src.models import load_data_dopants, dataset_split_10class, metrics_to_dataframe

# 读取数据
filepath = '../data/processed/data_dopants.csv'
data = load_data_dopants(filepath)

# 按10个等级分割数据集，同时标准化数据
X_train_scaled, X_test_scaled, y_train, y_test = dataset_split_10class(data)

## 1. 核岭回归（Kernel Ridge Regression）

In [5]:
from sklearn.kernel_ridge import KernelRidge

# 初始化模型
krr = KernelRidge(alpha=1.04,
                  gamma=1.02,
                  kernel='polynomial',
                  degree=1,
                  coef0=1.52)

# 训练模型
krr.fit(X_train_scaled, y_train)
y_train_pred = krr.predict(X_train_scaled)
y_test_pred = krr.predict(X_test_scaled)

krr_df = metrics_to_dataframe(y_train, y_train_pred, y_test, y_test_pred, 'KRR')
krr_df

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,KRR,0.435135,22.894784,31.818375,30.408537,0.44676,23.182953,31.754049,30.803639


## 2. 支持向量回归（Support Vector Regression）

In [6]:
from sklearn.svm import SVR

# 初始化模型，这里使用支持向量回归
svr = SVR(C=0.8, 
          kernel='poly', 
          degree=3, 
          gamma='scale', 
          coef0=7.7, 
          epsilon=0.1)

# 训练模型
svr.fit(X_train_scaled, y_train)
y_train_pred = svr.predict(X_train_scaled)
y_test_pred = svr.predict(X_test_scaled)

svr_df = metrics_to_dataframe(y_train, y_train_pred, y_test, y_test_pred, 'SVR')
svr_df

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,SVR,0.672478,15.401887,18.041004,23.154925,0.64308,17.949116,22.530406,24.741786


## 3. 随机森林回归（Random Forest Regression）

In [7]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(n_estimators=140,
                            max_depth=12,
                            min_samples_leaf=1,
                            min_samples_split=2,
                            random_state=21)

# 训练模型
rfr.fit(X_train_scaled, y_train)
y_train_pred = rfr.predict(X_train_scaled)
y_test_pred = rfr.predict(X_test_scaled)

rfr_df = metrics_to_dataframe(y_train, y_train_pred, y_test, y_test_pred, 'RF')
rfr_df

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,RF,0.97968,3.836267,4.787475,5.76751,0.844773,10.142849,13.554456,16.316589


## 4. 梯度提升回归（Gradient Boosting Regression）

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

# 初始化模型
gbr = GradientBoostingRegressor(n_estimators=200,
                                alpha=0.07,
                                learning_rate=0.14,
                                max_depth=9,
                                max_features=0.2,
                                min_samples_leaf=3,
                                min_samples_split=7,
                                subsample=0.8,
                                random_state=21)

# 训练模型
gbr.fit(X_train_scaled, y_train)
y_train_pred = gbr.predict(X_train_scaled)
y_test_pred = gbr.predict(X_test_scaled)

gbr_df = metrics_to_dataframe(y_train, y_train_pred, y_test, y_test_pred, 'GBR')
gbr_df

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,GBR,0.998413,1.072222,1.302167,1.611991,0.918238,6.5527,8.510915,11.841927


## 5. XGBoost回归（XGBoost Regression）

In [9]:
from xgboost import XGBRegressor

# 初始化模型，这里使用XGBoost回归器
xgb = XGBRegressor(n_estimators=190,
                             learning_rate=0.15,
                             subsample=0.5,
                             gamma=0.1,
                             max_depth=8,
                             min_child_weight=2,
                             reg_alpha=0.34,
                             colsample_bytree=1.0,
                             colsample_bylevel=0.3,
                             colsample_bynode=0.7,
                             random_state=21)

# 训练模型
xgb.fit(X_train_scaled, y_train)
y_train_pred = xgb.predict(X_train_scaled)
y_test_pred = xgb.predict(X_test_scaled)

xgb_df = metrics_to_dataframe(y_train, y_train_pred, y_test, y_test_pred, 'XGB')
xgb_df

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,XGB,0.995994,1.843803,2.196843,2.560729,0.928432,6.58921,8.734536,11.07911


## 6. ANN回归（Artificial Neural Network Regression）

In [10]:
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping

# 初始化ANN模型
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1)  # 输出层：一个神经元，无激活函数，用于回归任务
])

# 编译模型，指定优化器、损失函数和评价指标
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# 初始化早停回调
early_stopper = EarlyStopping(
    monitor='val_loss',     # 监控验证集损失
    min_delta=0.01,        # 表示监控指标至少需要改善 0.001
    patience=50,            # 如果30个epoch内验证集损失没有改善，则提前停止训练
    verbose=1,              # 输出早停信息
    mode='min',             # 监控的指标是损失，应该减小
    restore_best_weights=True  # 训练结束后，模型权重回滚到最佳状态
)

# 训练模型
history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.3,  # 使用20%的数据作为验证集
    epochs=500,  # 最大训练轮数
    callbacks=[early_stopper],  # 使用早停机制
    verbose=1  # 输出训练信息
)

# 预测训练集和测试集
y_train_pred = model.predict(X_train_scaled).flatten()
y_test_pred = model.predict(X_test_scaled).flatten()

# 计算评价指标
ann_df = metrics_to_dataframe(y_train, y_train_pred, y_test, y_test_pred, 'ANN')
ann_df

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 9989.6631 - mae: 91.5390 - val_loss: 11415.2344 - val_mae: 98.9351
Epoch 2/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 10037.3936 - mae: 91.6908 - val_loss: 11138.6309 - val_mae: 97.5502
Epoch 3/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 10014.0840 - mae: 90.9228 - val_loss: 10567.7051 - val_mae: 94.6396
Epoch 4/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8758.9492 - mae: 85.5328 - val_loss: 9426.6191 - val_mae: 88.5556
Epoch 5/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 7771.7437 - mae: 79.0597 - val_loss: 7291.6582 - val_mae: 76.0934
Epoch 6/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5866.1230 - mae: 64.7689 - val_loss: 4142.9971 - val_mae: 52.6977
Epoch 7/500
[1m10/10[0m [32m━━━━━━━━━

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,ANN,0.899677,8.716485,9.577816,12.815142,0.800707,13.407557,16.246555,18.488069


## 7. 模型性能汇总

In [11]:
import pandas as pd

# 模型评估
models_df = pd.concat([krr_df, svr_df, rfr_df, gbr_df, xgb_df, ann_df], axis=0)
models_df

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,KRR,0.435135,22.894784,31.818375,30.408537,0.44676,23.182953,31.754049,30.803639
0,SVR,0.672478,15.401887,18.041004,23.154925,0.64308,17.949116,22.530406,24.741786
0,RF,0.97968,3.836267,4.787475,5.76751,0.844773,10.142849,13.554456,16.316589
0,GBR,0.998413,1.072222,1.302167,1.611991,0.918238,6.5527,8.510915,11.841927
0,XGB,0.995994,1.843803,2.196843,2.560729,0.928432,6.58921,8.734536,11.07911
0,ANN,0.899677,8.716485,9.577816,12.815142,0.800707,13.407557,16.246555,18.488069
