In [31]:
import pandas as pd
df = pd.read_csv('MACCSkeys_descriptors.csv')
df2 = df.iloc[:,13:].dropna()
y = df2.iloc[:,0]
X = df2.iloc[:,1:]

In [2]:
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },    
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10, 15],
        'randomforestregressor__min_samples_leaf': [1, 2, 5],
    },
    "SVR": {
        'svr__C': [0.01, 0.1, 1, 10, 100],
        'svr__epsilon': [0.5, 1.0, 2.0],
        'svr__kernel': ['rbf', 'linear'],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [2, 5, 10],
        'xgbregressor__n_estimators': [50, 100, 200],
        'xgbregressor__subsample': [0.6, 0.8, 1.0],
        'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        'lgbmregressor__n_estimators': [50, 100, 200],
        'lgbmregressor__max_depth': [3, 5, 7],
        'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
        'lgbmregressor__subsample': [0.6, 0.8, 1.0],
        'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1, 0.2],
        "histgradientboostingregressor__max_depth": [1, 3, 5, 7],
        "histgradientboostingregressor__max_iter": [100, 200],
        "histgradientboostingregressor__l2_regularization": [0.0, 0.1, 1.0],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__metric': ['euclidean', 'manhattan'],
        'kneighborsregressor__p': [1, 2]
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 20)],
        'mlpregressor__alpha': [1e-4, 1e-3, 1e-2],
        'mlpregressor__learning_rate_init': [0.0001, 0.001, 0.01],
    }
}

# モデルの設定
models = {
    "Linear": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(max_iter=10000),
    "PolyRidge": make_pipeline( PCA(n_components=20), PolynomialFeatures(), StandardScaler(), Ridge()),
    "RandomForest": RandomForestRegressor(),
    "SVR": SVR(),
    "XGBoost": XGBRegressor(verbosity=0, n_jobs=-1),
    "LightGBM": LGBMRegressor(verbose=-1, n_jobs=-1),
    "HistGB": HistGradientBoostingRegressor(early_stopping=True, n_jobs=-1),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1, n_jobs=-1)
}

n_repeats = 10
outer_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)
    if name in param_grids:
        grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
        nested_scores = cross_val_score(grid, X, y, cv=outer_cv, scoring='r2', n_jobs=-1)
        print(f"{name} - nested CV R2(test, average): {nested_scores.mean():.3f}")
        grid.fit(X, y)
        print(f"{name} - Best Params used by all data: {grid.best_params_}")
        
    else:
        # case of Linear
        scores = cross_val_score(pipeline, X, y, cv=outer_cv, scoring='r2', n_jobs=-1)
        print(f"{name} - outer-CV R2-score(test, average): {scores.mean():.3f}")

Linear - outer-CV R2-score(test, average): 0.506
Lasso - nested CV R2(test, average): 0.475
Lasso - Best Params used by all data: {'lasso__alpha': 0.01}
Ridge - nested CV R2(test, average): 0.487
Ridge - Best Params used by all data: {'ridge__alpha': 10}
ElasticNet - nested CV R2(test, average): 0.481
ElasticNet - Best Params used by all data: {'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 1}
PolyRidge - nested CV R2(test, average): 0.337
PolyRidge - Best Params used by all data: {'polynomialfeatures__degree': 2, 'ridge__alpha': 100}
RandomForest - nested CV R2(test, average): 0.477
RandomForest - Best Params used by all data: {'randomforestregressor__max_depth': 15, 'randomforestregressor__min_samples_leaf': 2, 'randomforestregressor__n_estimators': 100}
SVR - nested CV R2(test, average): 0.460
SVR - Best Params used by all data: {'svr__C': 10, 'svr__epsilon': 0.5, 'svr__gamma': 'auto', 'svr__kernel': 'rbf'}
XGBoost - nested CV R2(test, average): 0.497
XGBoost - Best Params used 

In [97]:
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },    
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10, 15],
        'randomforestregressor__min_samples_leaf': [1, 2, 5],
    },
    "SVR": {
        'svr__C': [0.01, 0.1, 1, 10, 100],
        'svr__epsilon': [0.5, 1.0, 2.0],
        'svr__kernel': ['rbf', 'linear'],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [2, 5, 10],
        'xgbregressor__n_estimators': [50, 100, 200],
        'xgbregressor__subsample': [0.6, 0.8, 1.0],
        'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        'lgbmregressor__n_estimators': [50, 100, 200],
        'lgbmregressor__max_depth': [3, 5, 7],
        'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
        'lgbmregressor__subsample': [0.6, 0.8, 1.0],
        'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1, 0.2],
        "histgradientboostingregressor__max_depth": [1, 3, 5, 7],
        "histgradientboostingregressor__max_iter": [100, 200],
        "histgradientboostingregressor__l2_regularization": [0.0, 0.1, 1.0],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__metric': ['euclidean', 'manhattan'],
        'kneighborsregressor__p': [1, 2]
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 20)],
        'mlpregressor__alpha': [1e-4, 1e-3, 1e-2],
        'mlpregressor__learning_rate_init': [0.0001, 0.001, 0.01],
    }
}

# モデルの設定
models = {
    "Linear": LinearRegression(),
    "Lasso": Lasso(max_iter=10000),
    "Ridge": Ridge(max_iter=10000),
    "ElasticNet": ElasticNet(max_iter=50000),
    "PolyRidge": make_pipeline( PCA(n_components=20), PolynomialFeatures(), StandardScaler(), Ridge()),
    "RandomForest": RandomForestRegressor(),
    "SVR": SVR(),
    "XGBoost": XGBRegressor(verbosity=0, n_jobs=-1),
    "LightGBM": LGBMRegressor(verbose=-1, n_jobs=-1),
    "HistGB": HistGradientBoostingRegressor(early_stopping=True),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1)
}

n_repeats = 10
outer_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    print( '----------', name, '----------\nparams, R2(train), R2(test)')
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)

    tot_train = []
    tot_test = []
    
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        if name in param_grids:
            grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
            grid.fit(X_train, y_train)
            
            y_train_pred = grid.predict(X_train)
            y_test_pred  = grid.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"inner CV: R2(test): {grid.best_score_:.3f}, params: {grid.best_params_}, outer CV R2(train):{train_r2:.3f}, R2(test):{test_r2:.3f}" )
        
        else:
            pipeline.fit(X_train, y_train)
            y_train_pred = pipeline.predict(X_train)
            y_test_pred  = pipeline.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
            
        tot_train.append(train_r2)
        tot_test.append(test_r2)
    
    print(f"[{name}] total result: R2(train) {np.mean(tot_train):.3f}, R2(test) {np.mean(tot_test):.3f}")

---------- Linear ----------
params, R2(train), R2(test)
outer CV R2(train): 0.727, R2(test): 0.593
outer CV R2(train): 0.747, R2(test): 0.512
outer CV R2(train): 0.739, R2(test): 0.536
outer CV R2(train): 0.723, R2(test): 0.550
outer CV R2(train): 0.745, R2(test): 0.518
outer CV R2(train): 0.752, R2(test): 0.487
outer CV R2(train): 0.698, R2(test): 0.641
outer CV R2(train): 0.757, R2(test): 0.503
outer CV R2(train): 0.756, R2(test): 0.469
outer CV R2(train): 0.706, R2(test): 0.600
outer CV R2(train): 0.740, R2(test): 0.507
outer CV R2(train): 0.791, R2(test): 0.389
outer CV R2(train): 0.753, R2(test): 0.520
outer CV R2(train): 0.716, R2(test): 0.568
outer CV R2(train): 0.767, R2(test): 0.287
outer CV R2(train): 0.693, R2(test): 0.536
outer CV R2(train): 0.734, R2(test): 0.570
outer CV R2(train): 0.782, R2(test): 0.312
outer CV R2(train): 0.786, R2(test): 0.378
outer CV R2(train): 0.708, R2(test): 0.630
outer CV R2(train): 0.741, R2(test): 0.498
outer CV R2(train): 0.741, R2(test): 0.5

  model = cd_fast.enet_coordinate_descent(


inner CV: R2(test, mean): 0.389, params: {'elasticnet__alpha': 1, 'elasticnet__l1_ratio': 0}, outer CV R2(train):0.635, R2(test):0.456
inner CV: R2(test, mean): 0.529, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 1}, outer CV R2(train):0.681, R2(test):0.221


  model = cd_fast.enet_coordinate_descent(


inner CV: R2(test, mean): 0.290, params: {'elasticnet__alpha': 1, 'elasticnet__l1_ratio': 0}, outer CV R2(train):0.582, R2(test):0.501
inner CV: R2(test, mean): 0.514, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.6}, outer CV R2(train):0.681, R2(test):0.477
inner CV: R2(test, mean): 0.533, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.2}, outer CV R2(train):0.754, R2(test):0.351
inner CV: R2(test, mean): 0.452, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.2}, outer CV R2(train):0.761, R2(test):0.408


  model = cd_fast.enet_coordinate_descent(


inner CV: R2(test, mean): 0.397, params: {'elasticnet__alpha': 1, 'elasticnet__l1_ratio': 0}, outer CV R2(train):0.640, R2(test):0.433
inner CV: R2(test, mean): 0.473, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.6}, outer CV R2(train):0.671, R2(test):0.487
inner CV: R2(test, mean): 0.439, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.6}, outer CV R2(train):0.657, R2(test):0.536
inner CV: R2(test, mean): 0.419, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.6}, outer CV R2(train):0.651, R2(test):0.584


  model = cd_fast.enet_coordinate_descent(


inner CV: R2(test, mean): 0.437, params: {'elasticnet__alpha': 1, 'elasticnet__l1_ratio': 0}, outer CV R2(train):0.671, R2(test):0.468
inner CV: R2(test, mean): 0.411, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.5}, outer CV R2(train):0.667, R2(test):0.500
inner CV: R2(test, mean): 0.477, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.4}, outer CV R2(train):0.717, R2(test):0.345
inner CV: R2(test, mean): 0.447, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.4}, outer CV R2(train):0.696, R2(test):0.464


  model = cd_fast.enet_coordinate_descent(


inner CV: R2(test, mean): 0.310, params: {'elasticnet__alpha': 1, 'elasticnet__l1_ratio': 0}, outer CV R2(train):0.617, R2(test):0.575
inner CV: R2(test, mean): 0.450, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 1}, outer CV R2(train):0.624, R2(test):0.477
inner CV: R2(test, mean): 0.495, params: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.5}, outer CV R2(train):0.692, R2(test):0.507
[ElasticNet] total result: R2(train) 0.674, R2(test) 0.481
---------- PolyRidge ----------
params, R2(train), R2(test)
inner CV: R2(test, mean): 0.304, params: {'polynomialfeatures__degree': 2, 'ridge__alpha': 100}, outer CV R2(train):0.874, R2(test):0.507
inner CV: R2(test, mean): 0.310, params: {'polynomialfeatures__degree': 2, 'ridge__alpha': 100}, outer CV R2(train):0.883, R2(test):0.318
inner CV: R2(test, mean): 0.332, params: {'polynomialfeatures__degree': 2, 'ridge__alpha': 100}, outer CV R2(train):0.883, R2(test):0.122
inner CV: R2(test, mean): 0.287, params: {'polynomialfea



inner CV: R2(test, mean): 0.178, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.725, R2(test):0.565




inner CV: R2(test, mean): 0.268, params: {'lgbmregressor__colsample_bytree': 1.0, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.760, R2(test):0.535




inner CV: R2(test, mean): 0.143, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 100, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.704, R2(test):0.376




inner CV: R2(test, mean): 0.207, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.767, R2(test):0.573




inner CV: R2(test, mean): 0.164, params: {'lgbmregressor__colsample_bytree': 1.0, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.762, R2(test):0.517




inner CV: R2(test, mean): 0.298, params: {'lgbmregressor__colsample_bytree': 1.0, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.790, R2(test):0.362




inner CV: R2(test, mean): 0.065, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.755, R2(test):0.628




inner CV: R2(test, mean): 0.202, params: {'lgbmregressor__colsample_bytree': 0.8, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 100, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.751, R2(test):0.421




inner CV: R2(test, mean): 0.177, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 100, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.701, R2(test):0.435




inner CV: R2(test, mean): 0.217, params: {'lgbmregressor__colsample_bytree': 0.8, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.713, R2(test):0.412




inner CV: R2(test, mean): 0.203, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 100, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.669, R2(test):0.591




inner CV: R2(test, mean): 0.236, params: {'lgbmregressor__colsample_bytree': 0.8, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.675, R2(test):0.282




inner CV: R2(test, mean): 0.185, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 100, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.687, R2(test):0.392




inner CV: R2(test, mean): 0.184, params: {'lgbmregressor__colsample_bytree': 1.0, 'lgbmregressor__learning_rate': 0.05, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.683, R2(test):0.514




inner CV: R2(test, mean): 0.210, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.794, R2(test):0.304




inner CV: R2(test, mean): 0.111, params: {'lgbmregressor__colsample_bytree': 0.8, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.616, R2(test):0.239




inner CV: R2(test, mean): 0.264, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.777, R2(test):0.418




inner CV: R2(test, mean): 0.219, params: {'lgbmregressor__colsample_bytree': 1.0, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.844, R2(test):0.271




inner CV: R2(test, mean): 0.159, params: {'lgbmregressor__colsample_bytree': 0.8, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.749, R2(test):0.551




inner CV: R2(test, mean): 0.266, params: {'lgbmregressor__colsample_bytree': 0.8, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 100, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.671, R2(test):0.459




inner CV: R2(test, mean): 0.184, params: {'lgbmregressor__colsample_bytree': 1.0, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.783, R2(test):0.469




inner CV: R2(test, mean): 0.189, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.05, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.708, R2(test):0.434




inner CV: R2(test, mean): 0.126, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.777, R2(test):0.575




inner CV: R2(test, mean): 0.251, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 100, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.741, R2(test):0.466




inner CV: R2(test, mean): 0.231, params: {'lgbmregressor__colsample_bytree': 0.8, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 50, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.644, R2(test):0.430




inner CV: R2(test, mean): 0.136, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.743, R2(test):0.480




inner CV: R2(test, mean): 0.200, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.05, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.724, R2(test):0.451




inner CV: R2(test, mean): 0.118, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.05, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.649, R2(test):0.634




inner CV: R2(test, mean): 0.220, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.793, R2(test):0.476




inner CV: R2(test, mean): 0.224, params: {'lgbmregressor__colsample_bytree': 0.6, 'lgbmregressor__learning_rate': 0.05, 'lgbmregressor__max_depth': 3, 'lgbmregressor__n_estimators': 200, 'lgbmregressor__subsample': 0.6}, outer CV R2(train):0.692, R2(test):0.476
[LightGBM] total result: R2(train) 0.728, R2(test) 0.458
---------- HistGB ----------
params, R2(train), R2(test)
inner CV: R2(test, mean): 0.129, params: {'histgradientboostingregressor__l2_regularization': 0.1, 'histgradientboostingregressor__learning_rate': 0.2, 'histgradientboostingregressor__max_depth': 5, 'histgradientboostingregressor__max_iter': 100}, outer CV R2(train):0.694, R2(test):0.513
inner CV: R2(test, mean): 0.211, params: {'histgradientboostingregressor__l2_regularization': 0.1, 'histgradientboostingregressor__learning_rate': 0.2, 'histgradientboostingregressor__max_depth': 1, 'histgradientboostingregressor__max_iter': 100}, outer CV R2(train):0.322, R2(test):0.130
inner CV: R2(test, mean): 0.154, params: {'his

In [5]:
import pandas as pd
df = pd.read_csv('Mordred_descriptors_revise.csv')
df2 = df.iloc[:,13:].dropna()
y = df2.iloc[:,0]
X = df2.iloc[:,1:]

import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=Warning)

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },    
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10, 15],
        'randomforestregressor__min_samples_leaf': [1, 2, 5],
    },
    "SVR": {
        'svr__C': [0.01, 0.1, 1, 10, 100],
        'svr__epsilon': [0.5, 1.0, 2.0],
        'svr__kernel': ['rbf', 'linear'],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [2, 5, 10],
        'xgbregressor__n_estimators': [50, 100, 200],
        'xgbregressor__subsample': [0.6, 0.8, 1.0],
        'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        'lgbmregressor__n_estimators': [50, 100, 200],
        'lgbmregressor__max_depth': [3, 5, 7],
        'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
        'lgbmregressor__subsample': [0.6, 0.8, 1.0],
        'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1, 0.2],
        "histgradientboostingregressor__max_depth": [1, 3, 5, 7],
        "histgradientboostingregressor__max_iter": [100, 200],
        "histgradientboostingregressor__l2_regularization": [0.0, 0.1, 1.0],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__metric': ['euclidean', 'manhattan'],
        'kneighborsregressor__p': [1, 2]
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 20)],
        'mlpregressor__alpha': [1e-4, 1e-3, 1e-2],
        'mlpregressor__learning_rate_init': [0.0001, 0.001, 0.01],
    }
}

# モデルの設定
models = {
    "Linear": LinearRegression(),
    "Lasso": Lasso(max_iter=10000),
    "Ridge": Ridge(max_iter=10000),
    "ElasticNet": ElasticNet(max_iter=50000),
    "PolyRidge": make_pipeline( PCA(n_components=20), PolynomialFeatures(), StandardScaler(), Ridge()),
    "RandomForest": RandomForestRegressor(),
    "SVR": SVR(),
    "XGBoost": XGBRegressor(verbosity=0, n_jobs=-1),
    "LightGBM": LGBMRegressor(verbose=-1, n_jobs=-1),
    "HistGB": HistGradientBoostingRegressor(early_stopping=True),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1)
}

n_repeats = 10
outer_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    print( '----------', name, '----------\nparams, R2(train), R2(test)')
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)

    tot_train = []
    tot_test = []
    
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        if name in param_grids:
            grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
            grid.fit(X_train, y_train)
            
            y_train_pred = grid.predict(X_train)
            y_test_pred  = grid.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"inner CV: R2(test): {grid.best_score_:.3f}, params: {grid.best_params_}, outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
        
        else:
            pipeline.fit(X_train, y_train)
            y_train_pred = pipeline.predict(X_train)
            y_test_pred  = pipeline.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
            
        tot_train.append(train_r2)
        tot_test.append(test_r2)
    
    print(f"[{name}] total result = R2(train): {np.mean(tot_train):.3f}, R2(test): {np.mean(tot_test):.3f}")

---------- Linear ----------
params, R2(train), R2(test)
outer CV R2(train): 0.792, R2(test): 0.532
outer CV R2(train): 0.814, R2(test): 0.438
outer CV R2(train): 0.770, R2(test): 0.631
outer CV R2(train): 0.747, R2(test): 0.679
outer CV R2(train): 0.805, R2(test): 0.534
outer CV R2(train): 0.805, R2(test): 0.522
outer CV R2(train): 0.767, R2(test): 0.591
outer CV R2(train): 0.795, R2(test): 0.536
outer CV R2(train): 0.809, R2(test): 0.444
outer CV R2(train): 0.766, R2(test): 0.530
outer CV R2(train): 0.795, R2(test): 0.478
outer CV R2(train): 0.826, R2(test): 0.474
outer CV R2(train): 0.816, R2(test): 0.453
outer CV R2(train): 0.794, R2(test): 0.464
outer CV R2(train): 0.803, R2(test): 0.232
outer CV R2(train): 0.779, R2(test): 0.515
outer CV R2(train): 0.784, R2(test): 0.475
outer CV R2(train): 0.809, R2(test): 0.449
outer CV R2(train): 0.819, R2(test): 0.435
outer CV R2(train): 0.766, R2(test): 0.569
outer CV R2(train): 0.792, R2(test): 0.503
outer CV R2(train): 0.776, R2(test): 0.6

In [6]:
import pandas as pd
df = pd.read_csv('Morgan_descriptors_revise.csv')
df2 = df.iloc[:,13:].dropna()
y = df2.iloc[:,0]
X = df2.iloc[:,1:]

import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=Warning)

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },    
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10, 15],
        'randomforestregressor__min_samples_leaf': [1, 2, 5],
    },
    "SVR": {
        'svr__C': [0.01, 0.1, 1, 10, 100],
        'svr__epsilon': [0.5, 1.0, 2.0],
        'svr__kernel': ['rbf', 'linear'],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [2, 5, 10],
        'xgbregressor__n_estimators': [50, 100, 200],
        'xgbregressor__subsample': [0.6, 0.8, 1.0],
        'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        'lgbmregressor__n_estimators': [50, 100, 200],
        'lgbmregressor__max_depth': [3, 5, 7],
        'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
        'lgbmregressor__subsample': [0.6, 0.8, 1.0],
        'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1, 0.2],
        "histgradientboostingregressor__max_depth": [1, 3, 5, 7],
        "histgradientboostingregressor__max_iter": [100, 200],
        "histgradientboostingregressor__l2_regularization": [0.0, 0.1, 1.0],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__metric': ['euclidean', 'manhattan'],
        'kneighborsregressor__p': [1, 2]
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 20)],
        'mlpregressor__alpha': [1e-4, 1e-3, 1e-2],
        'mlpregressor__learning_rate_init': [0.0001, 0.001, 0.01],
    }
}

# モデルの設定
models = {
    "Linear": LinearRegression(),
    "Lasso": Lasso(max_iter=10000),
    "Ridge": Ridge(max_iter=10000),
    "ElasticNet": ElasticNet(max_iter=50000),
    "PolyRidge": make_pipeline( PCA(n_components=20), PolynomialFeatures(), StandardScaler(), Ridge()),
    "RandomForest": RandomForestRegressor(),
    "SVR": SVR(),
    "XGBoost": XGBRegressor(verbosity=0, n_jobs=-1),
    "LightGBM": LGBMRegressor(verbose=-1, n_jobs=-1),
    "HistGB": HistGradientBoostingRegressor(early_stopping=True),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1)
}

n_repeats = 10
outer_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    print( '----------', name, '----------\nparams, R2(train), R2(test)')
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)

    tot_train = []
    tot_test = []
    
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        if name in param_grids:
            grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
            grid.fit(X_train, y_train)
            
            y_train_pred = grid.predict(X_train)
            y_test_pred  = grid.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"inner CV: R2(test): {grid.best_score_:.3f}, params: {grid.best_params_}, outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
        
        else:
            pipeline.fit(X_train, y_train)
            y_train_pred = pipeline.predict(X_train)
            y_test_pred  = pipeline.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
            
        tot_train.append(train_r2)
        tot_test.append(test_r2)
    
    print(f"[{name}] total result = R2(train): {np.mean(tot_train):.3f}, R2(test): {np.mean(tot_test):.3f}")

---------- Linear ----------
params, R2(train), R2(test)
outer CV R2(train): 0.790, R2(test): 0.544
outer CV R2(train): 0.814, R2(test): 0.438
outer CV R2(train): 0.768, R2(test): 0.633
outer CV R2(train): 0.747, R2(test): 0.679
outer CV R2(train): 0.804, R2(test): 0.536
outer CV R2(train): 0.804, R2(test): 0.523
outer CV R2(train): 0.759, R2(test): 0.631
outer CV R2(train): 0.794, R2(test): 0.538
outer CV R2(train): 0.809, R2(test): 0.445
outer CV R2(train): 0.763, R2(test): 0.565
outer CV R2(train): 0.792, R2(test): 0.503
outer CV R2(train): 0.826, R2(test): 0.473
outer CV R2(train): 0.815, R2(test): 0.460
outer CV R2(train): 0.792, R2(test): 0.475
outer CV R2(train): 0.803, R2(test): 0.237
outer CV R2(train): 0.779, R2(test): 0.515
outer CV R2(train): 0.780, R2(test): 0.511
outer CV R2(train): 0.809, R2(test): 0.452
outer CV R2(train): 0.819, R2(test): 0.439
outer CV R2(train): 0.765, R2(test): 0.584
outer CV R2(train): 0.787, R2(test): 0.522
outer CV R2(train): 0.774, R2(test): 0.6

In [7]:
import pandas as pd
df = pd.read_csv('RDKit_descriptors_revise.csv')
df2 = df.iloc[:,13:].dropna()
y = df2.iloc[:,0]
X = df2.iloc[:,1:]

import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=Warning)

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },    
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10, 15],
        'randomforestregressor__min_samples_leaf': [1, 2, 5],
    },
    "SVR": {
        'svr__C': [0.01, 0.1, 1, 10, 100],
        'svr__epsilon': [0.5, 1.0, 2.0],
        'svr__kernel': ['rbf', 'linear'],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [2, 5, 10],
        'xgbregressor__n_estimators': [50, 100, 200],
        'xgbregressor__subsample': [0.6, 0.8, 1.0],
        'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        'lgbmregressor__n_estimators': [50, 100, 200],
        'lgbmregressor__max_depth': [3, 5, 7],
        'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
        'lgbmregressor__subsample': [0.6, 0.8, 1.0],
        'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1, 0.2],
        "histgradientboostingregressor__max_depth": [1, 3, 5, 7],
        "histgradientboostingregressor__max_iter": [100, 200],
        "histgradientboostingregressor__l2_regularization": [0.0, 0.1, 1.0],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__metric': ['euclidean', 'manhattan'],
        'kneighborsregressor__p': [1, 2]
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 20)],
        'mlpregressor__alpha': [1e-4, 1e-3, 1e-2],
        'mlpregressor__learning_rate_init': [0.0001, 0.001, 0.01],
    }
}

# モデルの設定
models = {
    "Linear": LinearRegression(),
    "Lasso": Lasso(max_iter=10000),
    "Ridge": Ridge(max_iter=10000),
    "ElasticNet": ElasticNet(max_iter=50000),
    "PolyRidge": make_pipeline( PCA(n_components=20), PolynomialFeatures(), StandardScaler(), Ridge()),
    "RandomForest": RandomForestRegressor(),
    "SVR": SVR(),
    "XGBoost": XGBRegressor(verbosity=0, n_jobs=-1),
    "LightGBM": LGBMRegressor(verbose=-1, n_jobs=-1),
    "HistGB": HistGradientBoostingRegressor(early_stopping=True),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1)
}

n_repeats = 10
outer_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    print( '----------', name, '----------\nparams, R2(train), R2(test)')
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)

    tot_train = []
    tot_test = []
    
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        if name in param_grids:
            grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
            grid.fit(X_train, y_train)
            
            y_train_pred = grid.predict(X_train)
            y_test_pred  = grid.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"inner CV: R2(test): {grid.best_score_:.3f}, params: {grid.best_params_}, outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
        
        else:
            pipeline.fit(X_train, y_train)
            y_train_pred = pipeline.predict(X_train)
            y_test_pred  = pipeline.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
            
        tot_train.append(train_r2)
        tot_test.append(test_r2)
    
    print(f"[{name}] total result = R2(train): {np.mean(tot_train):.3f}, R2(test): {np.mean(tot_test):.3f}")

---------- Linear ----------
params, R2(train), R2(test)
outer CV R2(train): 0.783, R2(test): 0.572
outer CV R2(train): 0.798, R2(test): 0.498
outer CV R2(train): 0.766, R2(test): 0.660
outer CV R2(train): 0.740, R2(test): 0.698
outer CV R2(train): 0.799, R2(test): 0.550
outer CV R2(train): 0.800, R2(test): 0.527
outer CV R2(train): 0.748, R2(test): 0.659
outer CV R2(train): 0.787, R2(test): 0.572
outer CV R2(train): 0.800, R2(test): 0.500
outer CV R2(train): 0.749, R2(test): 0.605
outer CV R2(train): 0.788, R2(test): 0.500
outer CV R2(train): 0.821, R2(test): 0.500
outer CV R2(train): 0.808, R2(test): 0.463
outer CV R2(train): 0.781, R2(test): 0.511
outer CV R2(train): 0.797, R2(test): 0.303
outer CV R2(train): 0.773, R2(test): 0.515
outer CV R2(train): 0.766, R2(test): 0.587
outer CV R2(train): 0.808, R2(test): 0.458
outer CV R2(train): 0.813, R2(test): 0.475
outer CV R2(train): 0.760, R2(test): 0.601
outer CV R2(train): 0.776, R2(test): 0.575
outer CV R2(train): 0.768, R2(test): 0.6

KeyboardInterrupt: 

In [11]:
import pandas as pd
df = pd.read_csv('RDKit_descriptors_revise.csv')
df2 = df.iloc[:,13:].dropna()
y = df2.iloc[:,0]
X = df2.iloc[:,1:]

import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=Warning)

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },    
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10, 15],
        'randomforestregressor__min_samples_leaf': [1, 2, 5],
    },
    "SVR": {
        'svr__C': [0.01, 0.1, 1, 10, 100],
        'svr__epsilon': [0.5, 1.0, 2.0],
        'svr__kernel': ['rbf', 'linear'],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [2, 5, 10],
        'xgbregressor__n_estimators': [50, 100, 200],
        'xgbregressor__subsample': [0.6, 0.8, 1.0],
        'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        'lgbmregressor__n_estimators': [50, 100, 200],
        'lgbmregressor__max_depth': [3, 5, 7],
        'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
        'lgbmregressor__subsample': [0.6, 0.8, 1.0],
        'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1, 0.2],
        "histgradientboostingregressor__max_depth": [1, 3, 5, 7],
        "histgradientboostingregressor__max_iter": [100, 200],
        "histgradientboostingregressor__l2_regularization": [0.0, 0.1, 1.0],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__metric': ['euclidean', 'manhattan'],
        'kneighborsregressor__p': [1, 2]
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 20)],
        'mlpregressor__alpha': [1e-4, 1e-3, 1e-2],
        'mlpregressor__learning_rate_init': [0.0001, 0.001, 0.01],
    }
}

# モデルの設定
models = {
    #"Linear": LinearRegression(),
    #"Lasso": Lasso(max_iter=10000),
    #"Ridge": Ridge(max_iter=10000),
    #"ElasticNet": ElasticNet(max_iter=50000),
    #"PolyRidge": make_pipeline( PCA(n_components=20), StandardScaler(), PolynomialFeatures(), Ridge()),
    #"RandomForest": RandomForestRegressor(),
    #"SVR": SVR(),
    #"XGBoost": XGBRegressor(verbosity=0, n_jobs=-1),
    #"LightGBM": LGBMRegressor(verbose=-1, n_jobs=-1),
    #"HistGB": HistGradientBoostingRegressor(early_stopping=True),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1)
}

n_repeats = 10
outer_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    print( '----------', name, '----------\nparams, R2(train), R2(test)')
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)

    tot_train = []
    tot_test = []
    
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        if name in param_grids:
            grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
            grid.fit(X_train, y_train)
            
            y_train_pred = grid.predict(X_train)
            y_test_pred  = grid.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"inner CV: R2(test): {grid.best_score_:.3f}, params: {grid.best_params_}, outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
        
        else:
            pipeline.fit(X_train, y_train)
            y_train_pred = pipeline.predict(X_train)
            y_test_pred  = pipeline.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
            
        tot_train.append(train_r2)
        tot_test.append(test_r2)
    
    print(f"[{name}] total result = R2(train): {np.mean(tot_train):.3f}, R2(test): {np.mean(tot_test):.3f}")

---------- KNN ----------
params, R2(train), R2(test)
inner CV: R2(test): 0.324, params: {'kneighborsregressor__metric': 'euclidean', 'kneighborsregressor__n_neighbors': 3, 'kneighborsregressor__p': 1, 'kneighborsregressor__weights': 'distance'}, outer CV R2(train): 0.949, R2(test): 0.545
inner CV: R2(test): 0.315, params: {'kneighborsregressor__metric': 'euclidean', 'kneighborsregressor__n_neighbors': 3, 'kneighborsregressor__p': 1, 'kneighborsregressor__weights': 'uniform'}, outer CV R2(train): 0.697, R2(test): 0.468
inner CV: R2(test): 0.365, params: {'kneighborsregressor__metric': 'euclidean', 'kneighborsregressor__n_neighbors': 5, 'kneighborsregressor__p': 1, 'kneighborsregressor__weights': 'distance'}, outer CV R2(train): 0.975, R2(test): 0.287
inner CV: R2(test): 0.283, params: {'kneighborsregressor__metric': 'euclidean', 'kneighborsregressor__n_neighbors': 7, 'kneighborsregressor__p': 1, 'kneighborsregressor__weights': 'distance'}, outer CV R2(train): 0.964, R2(test): 0.466
inn

In [18]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },
    "GPR": {
        'pca__n_components': [10, 20, 30],
        'gaussianprocessregressor__alpha': [1e-6, 1e-5, 1e-4],
        'gaussianprocessregressor__kernel': [RBF(length_scale=l) for l in [0.1, 1.0, 10.0]]
    },
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10, 15],
        'randomforestregressor__min_samples_leaf': [1, 2, 5],
    },
    "SVR": {
        'svr__C': [0.01, 0.1, 1, 10, 100],
        'svr__epsilon': [0.5, 1.0, 2.0],
        'svr__kernel': ['rbf', 'linear'],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [2, 5, 10],
        'xgbregressor__n_estimators': [50, 100, 200],
        'xgbregressor__subsample': [0.6, 0.8, 1.0],
        'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        'lgbmregressor__n_estimators': [50, 100, 200],
        'lgbmregressor__max_depth': [3, 5, 7],
        'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
        'lgbmregressor__subsample': [0.6, 0.8, 1.0],
        'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1, 0.2],
        "histgradientboostingregressor__max_depth": [1, 3, 5, 7],
        "histgradientboostingregressor__max_iter": [100, 200],
        "histgradientboostingregressor__l2_regularization": [0.0, 0.1, 1.0],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__metric': ['euclidean', 'manhattan'],
        'kneighborsregressor__p': [1, 2]
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 20)],
        'mlpregressor__alpha': [1e-4, 1e-3, 1e-2],
        'mlpregressor__learning_rate_init': [0.0001, 0.001, 0.01],
    }
}

# モデルの設定
models = {
    #"Linear": LinearRegression(),
    #"Lasso": Lasso(max_iter=10000),
    #"Ridge": Ridge(max_iter=10000),
    #"ElasticNet": ElasticNet(max_iter=50000),
    #"PolyRidge": make_pipeline(PCA(n_components=20), StandardScaler(), PolynomialFeatures(), Ridge()),
    "GPR": make_pipeline(PCA(), StandardScaler(), GaussianProcessRegressor(n_restarts_optimizer=5)),
    #"RandomForest": RandomForestRegressor(),
    #"SVR": SVR(),
    #"XGBoost": XGBRegressor(verbosity=0, n_jobs=-1),
    #"LightGBM": LGBMRegressor(verbose=-1, n_jobs=-1),
    #"HistGB": HistGradientBoostingRegressor(early_stopping=True),
    #"KNN": KNeighborsRegressor(),
    #"MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1)
}

n_repeats = 10
outer_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    print( '----------', name, '----------\nparams, R2(train), R2(test)')
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)

    tot_train = []
    tot_test = []
    
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        if name in param_grids:
            grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
            grid.fit(X_train, y_train)
            
            y_train_pred = grid.predict(X_train)
            y_test_pred  = grid.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"inner CV: R2(test): {grid.best_score_:.3f}, params: {grid.best_params_}, outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
        
        else:
            pipeline.fit(X_train, y_train)
            y_train_pred = pipeline.predict(X_train)
            y_test_pred  = pipeline.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
            
        tot_train.append(train_r2)
        tot_test.append(test_r2)
    
    print(f"[{name}] total result = R2(train): {np.mean(tot_train):.3f}, R2(test): {np.mean(tot_test):.3f}")

---------- GPR ----------
params, R2(train), R2(test)
inner CV: R2(test): -0.122, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(length_scale=1), 'pca__n_components': 20}, outer CV R2(train): 0.949, R2(test): 0.337
inner CV: R2(test): 0.152, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(length_scale=1), 'pca__n_components': 20}, outer CV R2(train): 0.960, R2(test): 0.153
inner CV: R2(test): 0.034, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(length_scale=1), 'pca__n_components': 20}, outer CV R2(train): 0.975, R2(test): 0.199
inner CV: R2(test): 0.016, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(length_scale=1), 'pca__n_components': 20}, outer CV R2(train): 0.964, R2(test): 0.229
inner CV: R2(test): 0.223, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(length

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

import pandas as pd
df = pd.read_csv('Mordred_descriptors_revise.csv')
df2 = df.iloc[:,13:].dropna()
y = df2.iloc[:,0]
X = df2.iloc[:,1:]

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0.2, 0.5, 0.8,],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },
    "GPR": {
        'pca__n_components': [10, 20, 30],
        'gaussianprocessregressor__alpha': [1e-6, 1e-5, 1e-4],
        'gaussianprocessregressor__kernel': [RBF(length_scale=l) for l in [0.1, 1.0, 10.0]]
    },
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10],
    },
    "SVR": {
        'svr__C': [0.1, 1, 10],
        'svr__epsilon': [0.5, 1.0],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [5, 10],
        'xgbregressor__n_estimators': [100, 200],
    },
    "LightGBM": {
        'lgbmregressor__learning_rate': [0.05, 0.1],
        'lgbmregressor__max_depth': [5, 10],
        'lgbmregressor__n_estimators': [100, 200],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1],
        "histgradientboostingregressor__max_depth": [5, 10],
        "histgradientboostingregressor__max_iter": [100, 200],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7],
        'kneighborsregressor__weights': ['uniform', 'distance']
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,)],
        'mlpregressor__alpha': [1e-4, 1e-3],
    }
}

# モデルの設定
models = {
    #"Linear": LinearRegression(),
    #"Lasso": Lasso(max_iter=10000),
    #"Ridge": Ridge(max_iter=10000),
    #"ElasticNet": ElasticNet(max_iter=50000),
    #"PolyRidge": make_pipeline(PCA(n_components=20), StandardScaler(), PolynomialFeatures(), Ridge()),
    #"GPR": make_pipeline(PCA(), StandardScaler(), GaussianProcessRegressor(n_restarts_optimizer=5)),
    #"RandomForest": RandomForestRegressor(),
    #"SVR": SVR(),
    #"XGBoost": XGBRegressor(verbosity=0, colsample_bytree=0.8, subsample=0.6, n_jobs=-1),
    #"LightGBM": LGBMRegressor(verbose=-1, colsample_bytree=0.8, subsample=0.6, n_jobs=-1),
    #"HistGB": HistGradientBoostingRegressor(early_stopping=True),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1)
}

n_repeats = 10
outer_n_splits = 3
inner_n_splits = 3
outer_cv = RepeatedKFold(n_splits=outer_n_splits, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=inner_n_splits, n_repeats=n_repeats, random_state=1)

exe = True

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)

    print( '----------', name, '----------' )
    total_combi = 1
    for params, p_range in param_grids[name].items():
        print('#', params.split('__')[1], p_range)
        total_combi *= len(p_range)
    print( 'total points =', total_combi, '(grid search) x', n_repeats, '(n_repeats) x', inner_n_splits, '(inner-CV) x', outer_n_splits, '(outer-CV) =', total_combi * n_repeats * outer_n_splits * inner_n_splits, '(points)' )
    
    print( '----- params, R2(train), R2(test)')
    tot_train = []
    tot_test = []
    i = 1
    if exe == True:
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
            if name in param_grids:
                grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
                grid.fit(X_train, y_train)
                
                y_train_pred = grid.predict(X_train)
                y_test_pred  = grid.predict(X_test)
                train_r2 = r2_score(y_train, y_train_pred)
                test_r2  = r2_score(y_test, y_test_pred)
                print( f"{i}, inner CV R2(test): {grid.best_score_:.3f}, params: {grid.best_params_}, outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
                i += 1
            
            else:
                pipeline.fit(X_train, y_train)
                y_train_pred = pipeline.predict(X_train)
                y_test_pred  = pipeline.predict(X_test)
                train_r2 = r2_score(y_train, y_train_pred)
                test_r2  = r2_score(y_test, y_test_pred)
                print( f"outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
                
            tot_train.append(train_r2)
            tot_test.append(test_r2)
    
    print(f"[{name}] total result = R2(train): {np.mean(tot_train):.3f}, R2(test): {np.mean(tot_test):.3f}\n")

In [28]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

import pandas as pd
df = pd.read_csv('Morgan_descriptors_revise.csv')
df2 = df.iloc[:,13:].dropna()
y = df2.iloc[:,0]
X = df2.iloc[:,1:]

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },
    "GPR": {
        'pca__n_components': [5, 10, 20, 30],
        'gaussianprocessregressor__alpha': [1e-6, 1e-5, 1e-4],
        'gaussianprocessregressor__kernel': [RBF(length_scale=l) for l in [0.1, 1.0, 10.0]]
    },
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10, 15],
        'randomforestregressor__min_samples_leaf': [1, 2, 5],
    },
    "SVR": {
        'svr__C': [0.01, 0.1, 1, 10, 100],
        'svr__epsilon': [0.5, 1.0, 2.0],
        'svr__kernel': ['rbf', 'linear'],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [2, 5, 10],
        'xgbregressor__n_estimators': [50, 100, 200],
        'xgbregressor__subsample': [0.6, 0.8, 1.0],
        'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        'lgbmregressor__n_estimators': [50, 100, 200],
        'lgbmregressor__max_depth': [3, 5, 7],
        'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
        'lgbmregressor__subsample': [0.6, 0.8, 1.0],
        'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1, 0.2],
        "histgradientboostingregressor__max_depth": [1, 3, 5, 7],
        "histgradientboostingregressor__max_iter": [100, 200],
        "histgradientboostingregressor__l2_regularization": [0.0, 0.1, 1.0],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__metric': ['euclidean', 'manhattan'],
        'kneighborsregressor__p': [1, 2]
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 20)],
        'mlpregressor__alpha': [1e-4, 1e-3, 1e-2],
        'mlpregressor__learning_rate_init': [0.0001, 0.001, 0.01],
    }
}

# モデルの設定
models = {
    #"Linear": LinearRegression(),
    #"Lasso": Lasso(max_iter=10000),
    #"Ridge": Ridge(max_iter=10000),
    #"ElasticNet": ElasticNet(max_iter=50000),
    #"PolyRidge": make_pipeline(PCA(n_components=20), StandardScaler(), PolynomialFeatures(), Ridge()),
    #"GPR": make_pipeline(PCA(), StandardScaler(), GaussianProcessRegressor(n_restarts_optimizer=5)),
    "GPR": make_pipeline(StandardScaler(), PCA(), GaussianProcessRegressor(n_restarts_optimizer=5)),
    #"RandomForest": RandomForestRegressor(),
    #"SVR": SVR(),
    #"XGBoost": XGBRegressor(verbosity=0, n_jobs=-1),
    #"LightGBM": LGBMRegressor(verbose=-1, n_jobs=-1),
    #"HistGB": HistGradientBoostingRegressor(early_stopping=True),
    #"KNN": KNeighborsRegressor(),
    #"MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1)
}

n_repeats = 10
outer_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    print( '----------', name, '----------\nparams, R2(train), R2(test)')
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)

    tot_train = []
    tot_test = []
    
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        if name in param_grids:
            grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
            grid.fit(X_train, y_train)
            
            y_train_pred = grid.predict(X_train)
            y_test_pred  = grid.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"inner CV: R2(test): {grid.best_score_:.3f}, params: {grid.best_params_}, outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
        
        else:
            pipeline.fit(X_train, y_train)
            y_train_pred = pipeline.predict(X_train)
            y_test_pred  = pipeline.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
            
        tot_train.append(train_r2)
        tot_test.append(test_r2)
    
    print(f"[{name}] total result = R2(train): {np.mean(tot_train):.3f}, R2(test): {np.mean(tot_test):.3f}")

---------- GPR ----------
params, R2(train), R2(test)
inner CV: R2(test): -0.211, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(length_scale=1), 'pca__n_components': 30}, outer CV R2(train): 1.000, R2(test): 0.352
inner CV: R2(test): 0.042, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(length_scale=1), 'pca__n_components': 30}, outer CV R2(train): 1.000, R2(test): -0.216
inner CV: R2(test): 0.145, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(length_scale=10), 'pca__n_components': 30}, outer CV R2(train): 1.000, R2(test): 0.077
inner CV: R2(test): -0.126, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(length_scale=10), 'pca__n_components': 30}, outer CV R2(train): 1.000, R2(test): 0.056
inner CV: R2(test): 0.219, params: {'gaussianprocessregressor__alpha': 0.0001, 'gaussianprocessregressor__kernel': RBF(le

In [45]:
import pandas as pd
df = pd.read_csv('Morgan_descriptors_revise.csv')
df2 = df.iloc[:,5:].dropna()
X = df2.iloc[:,:8]
y = df2.iloc[:,8]

# モデルとハイパーパラメータの設定
param_grids = {
    "Lasso": {
        'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "Ridge": {
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    "ElasticNet": {
        'elasticnet__alpha': [0.001, 0.01, 0.1, 1, 10],
        'elasticnet__l1_ratio': [0, 0.2, 0.4, 0.5, 0.6, 0.8, 1],
    },
    "PolyRidge": {
        'polynomialfeatures__degree': [2, 3],
        'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    },
    "GPR": {
        #'pca__n_components': [5, 10, 20, 30],
        'gaussianprocessregressor__alpha': [1e-6, 1e-5, 1e-4],
        'gaussianprocessregressor__kernel': [RBF(length_scale=l) for l in [0.1, 1.0, 10.0]]
    },
    "RandomForest": {
        'randomforestregressor__n_estimators': [50, 75, 100],
        'randomforestregressor__max_depth': [5, 10, 15],
        'randomforestregressor__min_samples_leaf': [1, 2, 5],
    },
    "SVR": {
        'svr__C': [0.01, 0.1, 1, 10, 100],
        'svr__epsilon': [0.5, 1.0, 2.0],
        'svr__kernel': ['rbf', 'linear'],
        'svr__gamma': ['scale','auto']
    },
    "XGBoost": {
        'xgbregressor__learning_rate': [0.05, 0.1],
        'xgbregressor__max_depth': [2, 5, 10],
        'xgbregressor__n_estimators': [50, 100, 200],
        'xgbregressor__subsample': [0.6, 0.8, 1.0],
        'xgbregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        'lgbmregressor__n_estimators': [50, 100, 200],
        'lgbmregressor__max_depth': [3, 5, 7],
        'lgbmregressor__learning_rate': [0.01, 0.05, 0.1],
        'lgbmregressor__subsample': [0.6, 0.8, 1.0],
        'lgbmregressor__colsample_bytree': [0.6, 0.8, 1.0],
    },
    "HistGB": {
        "histgradientboostingregressor__learning_rate": [0.05, 0.1, 0.2],
        "histgradientboostingregressor__max_depth": [1, 3, 5, 7],
        "histgradientboostingregressor__max_iter": [100, 200],
        "histgradientboostingregressor__l2_regularization": [0.0, 0.1, 1.0],
    },
    "KNN": {
        'kneighborsregressor__n_neighbors': [3, 5, 7, 10],
        'kneighborsregressor__weights': ['uniform', 'distance'],
        'kneighborsregressor__metric': ['euclidean', 'manhattan'],
        'kneighborsregressor__p': [1, 2]
    },
    "MLP": {
        'mlpregressor__hidden_layer_sizes': [(50,), (100,), (50, 20)],
        'mlpregressor__alpha': [1e-4, 1e-3, 1e-2],
        'mlpregressor__learning_rate_init': [0.0001, 0.001, 0.01],
    }
}

# モデルの設定
models = {
    "Linear": LinearRegression(),
    "Lasso": Lasso(max_iter=10000),
    "Ridge": Ridge(max_iter=10000),
    "ElasticNet": ElasticNet(max_iter=50000),
    "PolyRidge": make_pipeline(PolynomialFeatures(), Ridge()),
    "GPR": make_pipeline(GaussianProcessRegressor(n_restarts_optimizer=5)),
    #"GPR": make_pipeline(StandardScaler(), PCA(), GaussianProcessRegressor(n_restarts_optimizer=5)),
    "RandomForest": RandomForestRegressor(),
    "SVR": SVR(),
    "XGBoost": XGBRegressor(verbosity=0, n_jobs=-1),
    "LightGBM": LGBMRegressor(verbose=-1, n_jobs=-1),
    "HistGB": HistGradientBoostingRegressor(early_stopping=True),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=200, early_stopping=True, validation_fraction=0.1)
}

n_repeats = 10
outer_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)
inner_cv = RepeatedKFold(n_splits=3, n_repeats=n_repeats, random_state=1)

# ネストされた交差検証を使ってモデルを評価
for name, model in models.items():
    print( '----------', name, '----------\nparams, R2(train), R2(test)')
    pipeline = model if isinstance(model, Pipeline) else make_pipeline(StandardScaler(), model)

    tot_train = []
    tot_test = []
    
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        if name in param_grids:
            grid = GridSearchCV(pipeline, param_grids[name], cv=inner_cv, scoring='r2', n_jobs=-1)
            grid.fit(X_train, y_train)
            
            y_train_pred = grid.predict(X_train)
            y_test_pred  = grid.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"inner CV: R2(test): {grid.best_score_:.3f}, params: {grid.best_params_}, outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
        
        else:
            pipeline.fit(X_train, y_train)
            y_train_pred = pipeline.predict(X_train)
            y_test_pred  = pipeline.predict(X_test)
            train_r2 = r2_score(y_train, y_train_pred)
            test_r2  = r2_score(y_test, y_test_pred)
            print( f"outer CV R2(train): {train_r2:.3f}, R2(test): {test_r2:.3f}" )
            
        tot_train.append(train_r2)
        tot_test.append(test_r2)
    
    print(f"[{name}] total result = R2(train): {np.mean(tot_train):.3f}, R2(test): {np.mean(tot_test):.3f}")

---------- Linear ----------
params, R2(train), R2(test)
outer CV R2(train): 0.399, R2(test): 0.166
outer CV R2(train): 0.425, R2(test): 0.102
outer CV R2(train): 0.357, R2(test): 0.268
outer CV R2(train): 0.297, R2(test): 0.453
outer CV R2(train): 0.322, R2(test): 0.330
outer CV R2(train): 0.502, R2(test): -0.140
outer CV R2(train): 0.273, R2(test): 0.461
outer CV R2(train): 0.474, R2(test): 0.067
outer CV R2(train): 0.357, R2(test): 0.337
outer CV R2(train): 0.442, R2(test): 0.120
outer CV R2(train): 0.408, R2(test): 0.090
outer CV R2(train): 0.327, R2(test): 0.364
outer CV R2(train): 0.364, R2(test): 0.279
outer CV R2(train): 0.412, R2(test): 0.225
outer CV R2(train): 0.369, R2(test): 0.251
outer CV R2(train): 0.390, R2(test): 0.066
outer CV R2(train): 0.400, R2(test): 0.212
outer CV R2(train): 0.363, R2(test): 0.186
outer CV R2(train): 0.274, R2(test): 0.389
outer CV R2(train): 0.410, R2(test): 0.142
outer CV R2(train): 0.400, R2(test): 0.205
outer CV R2(train): 0.391, R2(test): 0.

In [None]:
from screening import *

df = pd.read_csv('Morgan_descriptors_revise.csv')
df2 = df.iloc[:,5:].dropna()
X = df2.iloc[:,:8]
y = df2.iloc[:,8]

In [7]:
y = df2.iloc[:,8]
X = df2.iloc[:,9:]

In [8]:
exe = True
run_screening(X, y)

---------- Linear ----------
# no-parameters
total points = 1 (grid search) x 10 (n_repeats) x 3 (inner-CV) x 3 (outer-CV) = 90 (points)
----- 3 x 3 cross validation
outer CV R2(train): 0.790, R2(test): 0.544
outer CV R2(train): 0.814, R2(test): 0.438
outer CV R2(train): 0.768, R2(test): 0.633
outer CV R2(train): 0.747, R2(test): 0.679
outer CV R2(train): 0.804, R2(test): 0.536
outer CV R2(train): 0.804, R2(test): 0.523
outer CV R2(train): 0.759, R2(test): 0.631
outer CV R2(train): 0.794, R2(test): 0.538
outer CV R2(train): 0.809, R2(test): 0.445
outer CV R2(train): 0.763, R2(test): 0.565
outer CV R2(train): 0.792, R2(test): 0.503
outer CV R2(train): 0.826, R2(test): 0.473
outer CV R2(train): 0.815, R2(test): 0.460
outer CV R2(train): 0.792, R2(test): 0.475
outer CV R2(train): 0.803, R2(test): 0.237
outer CV R2(train): 0.779, R2(test): 0.515
outer CV R2(train): 0.780, R2(test): 0.511
outer CV R2(train): 0.809, R2(test): 0.452
outer CV R2(train): 0.819, R2(test): 0.439
outer CV R2(train