In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from xgboost import XGBRegressor

from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, make_scorer
from sklearn.feature_selection import RFECV

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import product
from tqdm import tqdm
from time import sleep

In [2]:
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [3]:
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [4]:
SHyper = pd.read_csv('SCouping_product_smile.csv')

In [44]:
param_grid = {
    'RF': {
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000],
    },
    'ET': {
        'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000],
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [25, 30, 35],
        'p': [1, 2],
    },
    'DT': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 4, 6],
    },
    'KRR': {
        'gamma': [None, 0.01, 0.001, 0.0001],
    },
    'XGB': {
        'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190],
    }
}

In [45]:
models = [
    RandomForestRegressor(n_jobs=-1, random_state=42),
    ExtraTreesRegressor(n_jobs=-1, random_state=42),
    KNeighborsRegressor(n_jobs=-1),
    DecisionTreeRegressor(random_state=42),
    KernelRidge(),
    xgb.XGBRegressor(n_jobs=-1, random_state=42)
]

model_names = ['RF', 'ET', 'KNN', 'DT', 'KRR', 'XGB']

In [31]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [32]:
mols1 = [Chem.MolFromSmiles(smi) for smi in SHyper['Catalyst']]
mols2 = [Chem.MolFromSmiles(smi) for smi in SHyper['Imine']]
mols3 = [Chem.MolFromSmiles(smi) for smi in SHyper['Thiol']]
mols4 = [Chem.MolFromSmiles(smi) for smi in SHyper['Product']]

In [33]:
from rdkit.Avalon import pyAvalonTools

Avs1 = [pyAvalonTools.GetAvalonFP(mol, nBits=2048) for mol in mols1]
df_Avs1 = pd.DataFrame(np.array(Avs1))
df_Avs1.columns = [f'1_{col}' for col in df_Avs1.columns]

Avs2 = [pyAvalonTools.GetAvalonFP(mol, nBits=2048) for mol in mols2]
df_Avs2 = pd.DataFrame(np.array(Avs2))
df_Avs2.columns = [f'2_{col}' for col in df_Avs2.columns]

Avs3 = [pyAvalonTools.GetAvalonFP(mol, nBits=2048) for mol in mols3]
df_Avs3 = pd.DataFrame(np.array(Avs3))
df_Avs3.columns = [f'3_{col}' for col in df_Avs3.columns]

Avs4 = [pyAvalonTools.GetAvalonFP(mol, nBits=2048) for mol in mols4]
df_Avs4 = pd.DataFrame(np.array(Avs4))
df_Avs4.columns = [f'4_{col}' for col in df_Avs4.columns]

In [46]:
Avs_CSSP = pd.concat([df_Avs1, df_Avs2, df_Avs3, df_Avs4], axis = 1)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(Avs_CSSP.values,SHyper.iloc[:,-2].values, test_size=475, random_state=42)

In [48]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [49]:
for model_name, model in tqdm(zip(model_names, models), desc="AvalonPEMF"):
    GS = GridSearchCV(
        model,
        param_grid[model_name],
        cv=kfold,
        n_jobs=-1,
        scoring={'r2': 'r2', 'mae': mae_scorer, 'rmse': rmse_scorer},
        refit='r2'
    )
    GS.fit(X_train, y_train)
    
    best_param = GS.best_params_
    best_score = GS.best_score_ 
    best_rmse = -GS.cv_results_['mean_test_rmse'][GS.best_index_]
    best_mae = -GS.cv_results_['mean_test_mae'][GS.best_index_] 
    
    
    y_pred = GS.predict(X_test)
    mae_test = mean_absolute_error(y_test, y_pred)
    r2_test = r2_score(y_test, y_pred)
    
    print('Model: %4s, Best CV R²: %.4f, Best CV MAE: %.4f, Best CV RMSE: %.4f' % 
          (model_name, best_score, best_mae, best_rmse))
    print('Best Params:', best_param)
    print(f'Test Set Performance MAE: {mae_test:.4f}, R²: {r2_test:.4f}\n')

AvalonPEMF: 1it [30:34, 1834.39s/it]

Model:   RF, Best CV R²: 0.8886, Best CV MAE: 0.1570, Best CV RMSE: 0.2275
Best Params: {'n_estimators': 1500}
Test Set Performance MAE: 0.1425, R²: 0.9077



AvalonPEMF: 2it [1:23:58, 2640.10s/it]

Model:   ET, Best CV R²: 0.8361, Best CV MAE: 0.1883, Best CV RMSE: 0.2746
Best Params: {'n_estimators': 600}
Test Set Performance MAE: 0.1703, R²: 0.8475



AvalonPEMF: 3it [1:27:20, 1527.03s/it]

Model:  KNN, Best CV R²: 0.7112, Best CV MAE: 0.2735, Best CV RMSE: 0.3720
Best Params: {'algorithm': 'ball_tree', 'leaf_size': 35, 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
Test Set Performance MAE: 0.2587, R²: 0.7215



AvalonPEMF: 4it [1:27:23, 925.10s/it] 

Model:   DT, Best CV R²: 0.8434, Best CV MAE: 0.1838, Best CV RMSE: 0.2726
Best Params: {'max_depth': 10, 'min_samples_split': 6}
Test Set Performance MAE: 0.1578, R²: 0.8715



AvalonPEMF: 5it [1:27:25, 592.31s/it]

Model:  KRR, Best CV R²: 0.8537, Best CV MAE: 0.1917, Best CV RMSE: 0.2625
Best Params: {'gamma': None}
Test Set Performance MAE: 0.1853, R²: 0.8605



AvalonPEMF: 6it [1:33:27, 934.58s/it]

Model:  XGB, Best CV R²: 0.8784, Best CV MAE: 0.1635, Best CV RMSE: 0.2403
Best Params: {'n_estimators': 10}
Test Set Performance MAE: 0.1474, R²: 0.8969




