In [2]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from xgboost import XGBRegressor

from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, make_scorer
from sklearn.feature_selection import RFECV

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import product
from tqdm import tqdm
from time import sleep

In [3]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions.

    Implemented with numpy rather than
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and removed in 1.6, so the previous form
    raises ``TypeError`` on current releases.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground-truth and predicted values. 1-D inputs are treated as a single
        output column, so ``(n,)`` and ``(n, 1)`` inputs can be mixed without
        accidental broadcasting.

    Returns
    -------
    float
        RMSE; for 2-D targets, the per-output RMSE averaged uniformly.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes do not match.
    """
    truth = np.asarray(y_true, dtype=float)
    pred = np.asarray(y_pred, dtype=float)

    # Normalise both to (n_samples, n_outputs) so that (n,) - (n, 1) can never
    # silently broadcast into an (n, n) matrix.
    if truth.ndim == 1:
        truth = truth.reshape(-1, 1)
    if pred.ndim == 1:
        pred = pred.reshape(-1, 1)

    if truth.shape != pred.shape:
        raise ValueError(
            f"y_true and y_pred have mismatched shapes: {truth.shape} vs {pred.shape}"
        )
    if truth.shape[0] == 0:
        raise ValueError("rmse requires at least one sample")

    per_output = np.sqrt(np.mean(np.square(truth - pred), axis=0))
    return float(np.mean(per_output))

# Scorer wrapper around `rmse`. greater_is_better=False makes scikit-learn
# negate the value, so reported scores are -RMSE (higher is better).
rmse_scorer = make_scorer(
    rmse,
    greater_is_better=False,
)

In [4]:
# Scorer wrapper around mean_absolute_error. greater_is_better=False makes
# scikit-learn negate the value, so reported scores are -MAE.
mae_scorer = make_scorer(
    mean_absolute_error,
    greater_is_better=False,
)

In [5]:
# Reaction table. Later cells read the 'Catalyst', 'Imine', 'Thiol' and
# 'Product' columns as SMILES strings and take the second-to-last column as
# the regression target.
SHyper = pd.read_csv(
    'SCouping_product_smile.csv',
)

In [6]:
# Hyper-parameter search space for each estimator, keyed by the short labels
# listed in `model_names`. Evenly spaced integer ladders are written as ranges.
param_grid = {
    # Random forest: 1000..1600 in steps of 100, then 1800 and 2000 trees.
    'RF': {
        'n_estimators': [*range(1000, 1700, 100), 1800, 2000],
        'max_features': ['sqrt', 'log2', 0.2, 0.3, 0.4, 0.5],
    },
    # Extra trees: 100..900 trees in steps of 100.
    'ET': {
        'n_estimators': list(range(100, 1000, 100)),
        'max_features': ['sqrt', 'log2', 0.2, 0.3, 0.4, 0.5],
    },
    # Decision tree: odd depths 3..15 plus None (no depth limit).
    'DT': {
        'max_depth': [*range(3, 16, 2), None],
        'min_samples_split': list(range(2, 9)),
    },
    # k-nearest neighbours: k = 2..11, Minkowski power p = 1..5.
    'KNN': {
        'n_neighbors': list(range(2, 12)),
        'p': list(range(1, 6)),
    },
    # Kernel ridge: regularisation strength and kernel width.
    'KRR': {
        'alpha': [1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 0.5, 1.0, 5.0],
        'gamma': [1e-4, 5e-4, 1e-3, 5e-3, 1e-2],
    },
    # XGBoost: 10..100 boosting rounds in steps of 10.
    'XGB': {
        'n_estimators': list(range(10, 101, 10)),
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    },
}

In [7]:
# Each estimator is written next to its short label (the key used in
# `param_grid`) and the two parallel lists are derived from that pairing, so
# a label and its estimator cannot drift out of alignment.
model_names, models = map(list, zip(
    ('RF', RandomForestRegressor(n_jobs=-1, random_state=42)),
    ('ET', ExtraTreesRegressor(n_jobs=-1, random_state=42)),
    ('KNN', KNeighborsRegressor(n_jobs=-1)),
    ('DT', DecisionTreeRegressor(random_state=42)),
    ('KRR', KernelRidge(kernel='rbf')),
    ('XGB', xgb.XGBRegressor(n_jobs=-1, random_state=42)),
))

In [8]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [9]:
def _mols_from_column(column):
    """Parse every SMILES string in ``SHyper[column]`` into an RDKit molecule.

    ``Chem.MolFromSmiles`` returns ``None`` (instead of raising) when a string
    cannot be parsed; such entries would only fail later, inside fingerprint
    generation, with an error that does not identify the offending row. Fail
    here instead, naming the column and the positions involved.

    Raises
    ------
    ValueError
        If any entry of the column cannot be parsed.
    """
    parsed = [Chem.MolFromSmiles(smi) for smi in SHyper[column]]
    failed = [pos for pos, mol in enumerate(parsed) if mol is None]
    if failed:
        raise ValueError(
            f"Column {column!r}: {len(failed)} SMILES could not be parsed "
            f"(row positions, first 10 shown: {failed[:10]})"
        )
    return parsed


# One molecule list per reaction component, in the row order of SHyper.
mols1 = _mols_from_column('Catalyst')
mols2 = _mols_from_column('Imine')
mols3 = _mols_from_column('Thiol')
mols4 = _mols_from_column('Product')

In [10]:
from rdkit.Avalon import pyAvalonTools

def _avalon_block(mols, prefix, n_bits=2048):
    """Compute Avalon fingerprints for ``mols`` and tabulate them.

    Parameters
    ----------
    mols : list of RDKit molecules
    prefix : int or str
        Component tag; columns are named ``'<prefix>_<bit index>'`` so the
        per-component frames can be concatenated without name clashes.
    n_bits : int, default 2048
        Fingerprint length passed to ``GetAvalonFP``.

    Returns
    -------
    (list, pandas.DataFrame)
        The raw fingerprint objects and a frame with one row per molecule and
        one column per bit.
    """
    fingerprints = [pyAvalonTools.GetAvalonFP(mol, nBits=n_bits) for mol in mols]
    frame = pd.DataFrame(np.array(fingerprints))
    frame.columns = [f'{prefix}_{col}' for col in frame.columns]
    return fingerprints, frame


# Same featurisation for each reaction component; the numeric prefix matches
# the mols1..mols4 numbering.
Avs1, df_Avs1 = _avalon_block(mols1, 1)
Avs2, df_Avs2 = _avalon_block(mols2, 2)
Avs3, df_Avs3 = _avalon_block(mols3, 3)
Avs4, df_Avs4 = _avalon_block(mols4, 4)

In [11]:
# Place the four per-component fingerprint frames side by side, giving one
# wide feature row per reaction.
Avs_CSSP = pd.concat(
    (df_Avs1, df_Avs2, df_Avs3, df_Avs4),
    axis='columns',
)

In [12]:
# Hold-out split. An integer test_size is an absolute sample count, so 475
# rows go to the test set. The target is taken positionally as the
# second-to-last column of SHyper.
# NOTE(review): selecting the target by name would be safer -- confirm which column this is.
X_train, X_test, y_train, y_test = train_test_split(
    Avs_CSSP.values,
    SHyper.iloc[:, -2].values,
    test_size=475,
    random_state=42,
)

In [13]:
# Shuffled 10-fold splitter with a fixed seed; it is passed as `cv` to every
# grid search so all models are scored on the same folds.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [69]:
# Grid-search every model on the training split, then evaluate the refitted
# best estimator once on the held-out test split.
#
# NOTE(review): the output recorded under this cell reports best parameters
# that are not in the current `param_grid` (RF n_estimators=150, ET
# n_estimators=450, an XGB 'max_depth'), so it comes from an earlier grid.
# Re-run from a fresh kernel before relying on those numbers.

# zip() silently truncates to the shorter list; fail loudly instead.
if len(model_names) != len(models):
    raise ValueError(
        f"model_names ({len(model_names)}) and models ({len(models)}) differ in length"
    )

# tqdm cannot infer the length of a zip object; without `total` it prints only
# a running counter ("1it [...]") with no progress bar or ETA.
for model_name, model in tqdm(zip(model_names, models), total=len(models), desc="AvalonPEMF"):
    GS = GridSearchCV(
        model,
        param_grid[model_name],
        cv=kfold,
        n_jobs=-1,
        scoring={'r2': 'r2', 'mae': mae_scorer, 'rmse': rmse_scorer},
        refit='r2'  # select and refit the configuration with the best mean CV R²
    )
    GS.fit(X_train, y_train)

    best_param = GS.best_params_
    best_score = GS.best_score_  # mean CV R² of the selected configuration
    # The MAE/RMSE scorers are negated (greater_is_better=False); flip the
    # sign back to report ordinary error magnitudes.
    best_rmse = -GS.cv_results_['mean_test_rmse'][GS.best_index_]
    best_mae = -GS.cv_results_['mean_test_mae'][GS.best_index_]

    # With refit enabled, predict() uses the best estimator refitted on all of X_train.
    y_pred = GS.predict(X_test)
    mae_test = mean_absolute_error(y_test, y_pred)
    r2_test = r2_score(y_test, y_pred)

    print('Model: %4s, Best CV R²: %.4f, Best CV MAE: %.4f, Best CV RMSE: %.4f' % 
          (model_name, best_score, best_mae, best_rmse))
    print('Best Params:', best_param)
    print(f'Test Set Performance MAE: {mae_test:.4f}, R²: {r2_test:.4f}\n')

AvalonPEMF: 1it [11:19, 679.58s/it]

Model:   RF, Best CV R²: 0.8900, Best CV MAE: 0.1591, Best CV RMSE: 0.2269
Best Params: {'max_features': 0.2, 'n_estimators': 150}
Test Set Performance MAE: 0.1437, R²: 0.9064



AvalonPEMF: 2it [30:16, 948.63s/it]

Model:   ET, Best CV R²: 0.8751, Best CV MAE: 0.1652, Best CV RMSE: 0.2406
Best Params: {'max_features': 0.2, 'n_estimators': 450}
Test Set Performance MAE: 0.1522, R²: 0.8812



AvalonPEMF: 3it [32:07, 566.17s/it]

Model:  KNN, Best CV R²: 0.7003, Best CV MAE: 0.2770, Best CV RMSE: 0.3775
Best Params: {'n_neighbors': 2, 'p': 1}
Test Set Performance MAE: 0.2674, R²: 0.6969



AvalonPEMF: 4it [32:20, 347.56s/it]

Model:   DT, Best CV R²: 0.8620, Best CV MAE: 0.1736, Best CV RMSE: 0.2554
Best Params: {'max_depth': 9, 'min_samples_split': 8}
Test Set Performance MAE: 0.1559, R²: 0.8727



AvalonPEMF: 5it [32:49, 232.73s/it]

Model:  KRR, Best CV R²: 0.8858, Best CV MAE: 0.1685, Best CV RMSE: 0.2314
Best Params: {'alpha': 0.005, 'gamma': 0.001}
Test Set Performance MAE: 0.1578, R²: 0.8965



AvalonPEMF: 6it [52:06, 521.11s/it]

Model:  XGB, Best CV R²: 0.8827, Best CV MAE: 0.1700, Best CV RMSE: 0.2358
Best Params: {'max_depth': 2, 'n_estimators': 50}
Test Set Performance MAE: 0.1554, R²: 0.8996




