In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb
from xgboost import XGBRegressor

from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, make_scorer
from sklearn.feature_selection import RFECV

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import seaborn as sns
import time
from itertools import product
from tqdm import tqdm
from time import sleep
from tqdm.auto import tqdm

In [2]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions.

    The root is taken explicitly instead of passing ``squared=False`` to
    ``mean_squared_error``: that keyword was deprecated in scikit-learn 1.4
    and removed in 1.6, so relying on it breaks on current releases.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        The RMSE. For multi-output targets this is the unweighted mean of the
        per-output RMSEs, which is what ``squared=False`` used to return.
    """
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    return float(np.mean(np.sqrt(per_output_mse)))


# Scorer wrapper for model selection; greater_is_better=False makes
# scikit-learn negate the value so that larger scores are still "better".
rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [3]:
# Mean-absolute-error scorer; greater_is_better=False tells scikit-learn to
# negate the metric so that larger scores are still "better".
mae_scorer = make_scorer(
    score_func=mean_absolute_error,
    greater_is_better=False,
)

In [4]:
# Table read from SCouping_product_smile.csv. Its second-to-last column is
# used as the regression target when the train/test split is built later in
# this notebook.
SHyper = pd.read_csv('SCouping_product_smile.csv')

In [5]:
# Table from CatalystSC.csv: keep only the columns that contain no missing
# values, discard the leading column, and tag the remaining column names
# with "-1" so they stay unique after the tables are concatenated.
S_CmordredC = pd.read_csv('CatalystSC.csv')
S_CmordredC_fixes = S_CmordredC.dropna(axis=1, how='any')
S_CmordredC_fix = (
    S_CmordredC_fixes
    .drop(columns=S_CmordredC_fixes.columns[0])
    .add_suffix("-1")
)

In [6]:
# Table from ImineSC.csv: keep only the columns that contain no missing
# values, discard the leading column, and tag the remaining column names
# with "-2" so they stay unique after the tables are concatenated.
S_CmordredI = pd.read_csv('ImineSC.csv')
S_CmordredI_fixes = S_CmordredI.dropna(axis=1, how='any')
S_CmordredI_fix = (
    S_CmordredI_fixes
    .drop(columns=S_CmordredI_fixes.columns[0])
    .add_suffix("-2")
)

In [7]:
# Table from ThiolSC.csv: keep only the columns that contain no missing
# values, discard the leading column, and tag the remaining column names
# with "-3" so they stay unique after the tables are concatenated.
S_CmordredT = pd.read_csv('ThiolSC.csv')
S_CmordredT_fixes = S_CmordredT.dropna(axis=1, how='any')
S_CmordredT_fix = (
    S_CmordredT_fixes
    .drop(columns=S_CmordredT_fixes.columns[0])
    .add_suffix("-3")
)

In [8]:
# Table from ProductSC.csv: keep only the columns that contain no missing
# values, discard the leading column, and tag the remaining column names
# with "-4" so they stay unique after the tables are concatenated.
S_CmordredP = pd.read_csv('ProductSC.csv')
S_CmordredP_fixes = S_CmordredP.dropna(axis=1, how='any')
S_CmordredP_fix = (
    S_CmordredP_fixes
    .drop(columns=S_CmordredP_fixes.columns[0])
    .add_suffix("-4")
)

In [9]:
# Column-wise concatenation of the four suffixed tables ("-1" .. "-4").
# pd.concat aligns rows on the index, so the four frames are expected to
# share the same row index.
MordredPP = pd.concat(
    [S_CmordredC_fix, S_CmordredI_fix, S_CmordredT_fix, S_CmordredP_fix],
    axis="columns",
)

In [10]:
# Features: every column of the combined table, as a NumPy array.
# Target: the second-to-last column of SHyper.
# test_size=475 is an absolute row count, not a fraction.
X_train, X_test, y_train, y_test = train_test_split(
    MordredPP.to_numpy(),
    SHyper.iloc[:, -2].to_numpy(),
    test_size=475,
    random_state=42,
)

In [11]:
# 10-fold splitter with shuffling and a fixed seed.
# NOTE(review): the grid search in this notebook is configured with cv=5 and
# does not reference this object in the visible cells - confirm where it is
# meant to be used.
kfold = KFold(n_splits=10, shuffle=True, random_state=81)

In [12]:
# 1. Hyperparameter grid: number of boosting rounds x learning rate
#    (19 x 5 = 95 candidates).
param_grid = {
    'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3]
}

# 2. Base XGBoost regressor.
#    n_jobs=1 on purpose: GridSearchCV below already runs candidates/folds in
#    parallel with n_jobs=-1. Letting every worker process also start one
#    thread per core oversubscribes the CPU; the XGBoost documentation warns
#    that this thread contention slows both levels down.
#    NOTE(review): scores may differ in the last decimals from a run made
#    with multi-threaded training - confirm this is acceptable.
model = XGBRegressor(
    random_state=49,
    objective='reg:squarederror',
    n_jobs=1
)

# 3. Exhaustive grid search: 5-fold CV, model selection on R^2,
#    parallelised across candidate/fold fits.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

# 4. Fit on the full training split (all features). verbose=1 already reports
#    progress, so no extra progress bar is wrapped around this single call.
grid_search.fit(X_train, y_train)

# 5. Report the best parameter combination.
print("Best parameters found: ", grid_search.best_params_)

# 6. Keep the best estimator under the name XGB_best_model_MD.
XGB_best_model_MD = grid_search.best_estimator_

# 7. Predict on the held-out test split.
y_pred = XGB_best_model_MD.predict(X_test)

# 8. Evaluate on the test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error on Test Set: %.4f" % mse)
print("R^2 Score on Test Set: %.4f" % r2)

Grid search:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 95 candidates, totalling 475 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 475 out of 475 | elapsed: 25.2min finished


Best parameters found:  {'learning_rate': 0.2, 'n_estimators': 20}
Mean Squared Error on Test Set: 0.0568
R^2 Score on Test Set: 0.8824


In [13]:
# Fetch the underlying Booster once and query the four importance flavours.
# Each result is a dict keyed by XGBoost's feature identifiers.
xgb_booster = XGB_best_model_MD.get_booster()

importance_gain = xgb_booster.get_score(importance_type='gain')
importance_cover = xgb_booster.get_score(importance_type='cover')
importance_tgain = xgb_booster.get_score(importance_type='total_gain')
importance_tcover = xgb_booster.get_score(importance_type='total_cover')

In [14]:
# Column labels of the combined table, in column order. The position in this
# list is what the f0, f1, ... keys of the importance dicts are mapped to.
MDfeature_names = MordredPP.columns.tolist()

In [15]:
# Ranking by the 'gain' importance scores.
# The model was fitted on a plain NumPy array, so the score dict is keyed by
# positional ids (f0, f1, ...); translate those back to column names.
feature_map = {f'f{i}': feature for i, feature in enumerate(MDfeature_names)}

importance_df = pd.DataFrame(list(importance_gain.items()), columns=['feature', 'importance'])

# Fall back to the raw key when it has no entry in the map, instead of
# silently turning the label into NaN.
importance_df['feature'] = importance_df['feature'].map(lambda key: feature_map.get(key, key))

# Sort by importance, largest first.
importance_df = importance_df.sort_values('importance', ascending=False).reset_index(drop=True)

print(importance_df.head(30))

          feature  importance
0      SMR_VSA9-1   19.334607
1     nAromAtom-3    8.295526
2      BCUTZ-1l-1    4.964406
3       SpMax_A-1    4.135267
4        ATSC5c-1    2.311661
5        ATSC8Z-1    2.219353
6         ATS4v-1    1.948305
7           IC4-4    1.467553
8     BCUTdv-1l-4    1.056111
9         VE2_A-1    0.994283
10      AATSC4p-4    0.978743
11      AATSC7Z-1    0.680620
12          IC4-1    0.647356
13        ATS4p-4    0.591251
14       ATSC4v-1    0.577841
15        VE3_A-1    0.549103
16       AATS7d-4    0.382063
17     SMR_VSA7-4    0.367586
18      ATSC3dv-1    0.344476
19        ATS2d-1    0.342187
20       ATSC0c-1    0.336394
21        ATS7i-1    0.328341
22     AATS8are-4    0.304884
23       AATS7d-1    0.289661
24        MID_O-1    0.277779
25      ATSC7pe-4    0.265179
26       MATS5c-1    0.249560
27      AATS8se-1    0.249341
28  EState_VSA3-1    0.234887
29       MATS4i-1    0.227781


In [16]:
# Ranking by the 'cover' importance scores.
# The model was fitted on a plain NumPy array, so the score dict is keyed by
# positional ids (f0, f1, ...); translate those back to column names.
feature_map = {f'f{i}': feature for i, feature in enumerate(MDfeature_names)}

importance_df = pd.DataFrame(list(importance_cover.items()), columns=['feature', 'importance'])

# Fall back to the raw key when it has no entry in the map, instead of
# silently turning the label into NaN.
importance_df['feature'] = importance_df['feature'].map(lambda key: feature_map.get(key, key))

# Sort by importance, largest first.
importance_df = importance_df.sort_values('importance', ascending=False).reset_index(drop=True)

print(importance_df.head(30))

          feature  importance
0      SMR_VSA9-1  600.000000
1        GATS2v-1  497.000000
2       AATSC7Z-1  439.000000
3      BCUTZ-1l-1  439.000000
4       MDEC-33-1  380.000000
5        ATSC1s-4  368.000000
6        ATS5dv-4  368.000000
7         ATS1s-4  368.000000
8        ATSC4Z-1  313.000000
9     nAromAtom-3  271.842105
10      Xpc-5dv-1  256.000000
11      ATSC2se-4  253.000000
12          IC5-1  251.000000
13    BCUTdv-1l-4  222.333333
14       GATS1d-1  221.500000
15       ATSC8Z-1  218.428571
16       ATSC6Z-1  214.000000
17       ATSC6c-1  213.000000
18      GATS1dv-1  196.000000
19      AATS8se-4  185.000000
20       AATS7p-1  182.000000
21  ETA_shape_y-1  177.200000
22     BCUTZ-1l-4  176.000000
23      AATS6dv-4  172.333333
24         MIC3-4  172.000000
25       AATS8p-4  167.000000
26      MATS1dv-4  161.000000
27       ATSC4Z-4  160.000000
28       AATS1d-1  158.000000
29       ATSC5c-1  157.461538


In [17]:
# Ranking by the 'total_cover' importance scores.
# The model was fitted on a plain NumPy array, so the score dict is keyed by
# positional ids (f0, f1, ...); translate those back to column names.
feature_map = {f'f{i}': feature for i, feature in enumerate(MDfeature_names)}

importance_df = pd.DataFrame(list(importance_tcover.items()), columns=['feature', 'importance'])

# Fall back to the raw key when it has no entry in the map, instead of
# silently turning the label into NaN.
importance_df['feature'] = importance_df['feature'].map(lambda key: feature_map.get(key, key))

# Sort by importance, largest first.
importance_df = importance_df.sort_values('importance', ascending=False).reset_index(drop=True)

print(importance_df.head(30))

          feature  importance
0      SMR_VSA9-1     11400.0
1     nAromAtom-3      5165.0
2       AATSC7Z-1      3951.0
3      BCUTZ-1l-1      2195.0
4        ATSC5c-1      2047.0
5       SpAbs_A-1      1861.0
6       SpMax_A-1      1598.0
7        ATSC8Z-1      1529.0
8        ATSC4v-1      1495.0
9     BCUTdv-1l-4      1334.0
10     BCUTs-1l-4      1162.0
11       AATS2d-4      1149.0
12      SpMAD_A-1       938.0
13       MATS5c-1       924.0
14  ETA_shape_y-1       886.0
15      MDEC-33-1       760.0
16        MID_O-1       629.0
17       AATS4p-1       544.0
18      AATS6dv-4       517.0
19       MATS7i-4       515.0
20      Xpc-5dv-1       512.0
21       GATS2v-1       497.0
22      AATS6pe-4       497.0
23      SpAbs_A-2       474.0
24      SpMAD_A-4       471.0
25       AATS2s-1       463.0
26       GATS1d-1       443.0
27       ATSC1s-4       368.0
28       ATS5dv-4       368.0
29        ATS1s-4       368.0


In [18]:
# Ranking by the 'total_gain' importance scores.
# The model was fitted on a plain NumPy array, so the score dict is keyed by
# positional ids (f0, f1, ...); translate those back to column names.
feature_map = {f'f{i}': feature for i, feature in enumerate(MDfeature_names)}

importance_df = pd.DataFrame(list(importance_tgain.items()), columns=['feature', 'importance'])

# Fall back to the raw key when it has no entry in the map, instead of
# silently turning the label into NaN.
importance_df['feature'] = importance_df['feature'].map(lambda key: feature_map.get(key, key))

# Sort by importance, largest first.
importance_df = importance_df.sort_values('importance', ascending=False).reset_index(drop=True)

print(importance_df.head(30))

          feature  importance
0      SMR_VSA9-1  367.357539
1     nAromAtom-3  157.614989
2       SpMax_A-1   95.111148
3        ATSC5c-1   30.051589
4      BCUTZ-1l-1   24.822032
5        ATSC8Z-1   15.535474
6         VE2_A-1   11.931392
7        ATSC4v-1    9.823292
8           IC4-4    7.337764
9     BCUTdv-1l-4    6.336663
10      AATSC7Z-1    6.125582
11      SpAbs_A-1    5.243820
12      SpMAD_A-1    2.063790
13       MATS5c-1    1.996479
14        ATS7i-1    1.970046
15      AATSC4p-4    1.957485
16        ATS4v-1    1.948305
17        VE3_A-1    1.647309
18          IC4-1    1.294712
19           nH-1    1.207322
20        MID_O-1    1.111117
21        ATS0s-4    1.075724
22  ETA_shape_y-1    1.031843
23       ATSC3c-1    0.978531
24      SpAbs_A-4    0.933024
25       ATS0dv-1    0.895259
26       AATS2d-4    0.890026
27        VE1_A-2    0.775904
28      SpAbs_A-2    0.770924
29      SpMAD_A-4    0.733886
