In [8]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# 将当前工作目录添加到 Python 路径
# 获取当前工作目录
current_working_dir = os.getcwd()

# 将根目录添加到 sys.path
sys.path.append(os.path.abspath(os.path.join(current_working_dir, '../..')))

from function import metrics_to_dataframe, plot_actual_vs_predicted

In [9]:
# 读取数据
df = pd.read_csv('../../data/dataset.csv')

# 对指定列进行独热编码
columns_to_encode = ['CM_type', 'CM_morph', 'MS2_morph', 'CP_morph', 'Cation', 'Anion']
df_encoded = pd.get_dummies(df, columns=columns_to_encode)

# # 对目标值进行对数变换
# df_encoded['Cs_log'] = df_encoded['Cs'].apply(lambda x: np.log(x))

X = df_encoded.drop(['Cs'], axis=1)
y = df_encoded['Cs']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

In [10]:
# optuna 自动搜索超参数
import optuna
def objective(trial):
    param = {
        'n_estimators': 300,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
    }
    
    model = XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=500)

print('Best trial:')
trial = study.best_trial
print('  Value: {}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


[I 2025-03-05 14:06:38,068] A new study created in memory with name: no-name-a92e88f7-545f-40a3-9bd9-c74f4d39239d
[I 2025-03-05 14:06:38,420] Trial 0 finished with value: 68.7957273474337 and parameters: {'learning_rate': 0.11653346968008813, 'subsample': 0.987113132860646, 'gamma': 0.3817780745001286, 'max_depth': 15, 'min_child_weight': 2, 'reg_alpha': 0.22077504966405714, 'colsample_bytree': 0.560325451648684, 'colsample_bylevel': 0.9131923495665482, 'colsample_bynode': 0.5570088397602624}. Best is trial 0 with value: 68.7957273474337.
[I 2025-03-05 14:06:38,687] Trial 1 finished with value: 72.99044588506298 and parameters: {'learning_rate': 0.12907314288575336, 'subsample': 0.927253663826916, 'gamma': 0.982901371015894, 'max_depth': 17, 'min_child_weight': 9, 'reg_alpha': 0.815158830210682, 'colsample_bytree': 0.5324181712118328, 'colsample_bylevel': 0.6567907250061596, 'colsample_bynode': 0.6253233534758368}. Best is trial 0 with value: 68.7957273474337.
[I 2025-03-05 14:06:38,96

Best trial:
  Value: 53.39514740616011
  Params: 
    learning_rate: 0.19029567682962328
    subsample: 0.9985545953310594
    gamma: 0.9237782509970361
    max_depth: 8
    min_child_weight: 5
    reg_alpha: 0.41399194965508074
    colsample_bytree: 0.8732858503886761
    colsample_bylevel: 0.7434770246023662
    colsample_bynode: 0.6620653193121468


In [11]:
# 模型训练
xgb_params = {
    'n_estimators': 300,
    'learning_rate': trial.params['learning_rate'],
    'subsample': trial.params['subsample'],
    'gamma': trial.params['gamma'],
    'max_depth': trial.params['max_depth'],
    'min_child_weight': trial.params['min_child_weight'],
    'reg_alpha': trial.params['reg_alpha'],
    'colsample_bytree': trial.params['colsample_bytree'],
    'colsample_bylevel': trial.params['colsample_bylevel'],
    'colsample_bynode': trial.params['colsample_bynode'],    
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# 预测
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# 将预测值转换为原始值
# y_train_pred = np.exp(y_train_pred)
# y_test_pred = np.exp(y_test_pred)
# y_train = np.exp(y_train)
# y_test = np.exp(y_test)

# plot_actual_vs_predicted(y_train, y_train_pred, y_test, y_test_pred, 'XGBoost', figpath='xgb_actual_vs_predicted.png')
xgb_metrics = metrics_to_dataframe(y_train, y_train_pred, y_test, y_test_pred, 'XGBoost')
xgb_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,XGBoost,0.999534,5.729477,1.86116,9.738938,0.983928,36.469242,15.493889,53.395149
