Hyperparameter tuning of a random forest regressor with scikit-optimize (skopt): https://scikit-optimize.github.io/stable/
    

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, power_transform, quantile_transform
import scipy.stats as st
from sklearn.ensemble import RandomForestRegressor

from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from collections import defaultdict

In [2]:
# Load the dataset; the 'DateTime' column is parsed as dates and becomes the index
df = pd.read_csv(
    'rnn_data_prajwal.csv',
    index_col='DateTime',
    parse_dates=['DateTime'],
)

In [3]:
# Names of the predictor columns fed to the model
Xvar = [
    'Ta', 'Ws', 'Fg', 'VPD',
    'Fn', 'q', 'Ts', 'Sws',
]
# Name of the target column
yvar = 'Fc'

In [4]:
# Positional hold-out split: the first n_train rows form the training set and
# the remaining rows the test set; each part is then separated into the
# input columns (X) and the output column (y).
n_train = 8500

train_df, test_df = df.iloc[:n_train], df.iloc[n_train:]
print(train_df.shape, test_df.shape)

X_train = train_df[Xvar]
y_train = train_df[yvar]
X_test = test_df[Xvar]
y_test = test_df[yvar]

print(X_train.shape, y_train.shape)
print(X_train.columns)

(8500, 10) (515, 10)
(8500, 8) (8500,)
Index(['Ta', 'Ws', 'Fg', 'VPD', 'Fn', 'q', 'Ts', 'Sws'], dtype='object')


In [5]:
# Search space for the optimiser: one skopt dimension per RandomForestRegressor
# keyword argument. The order of the list defines the order of the values in
# each sampled point, so it must not be changed independently of its consumers.
# NOTE(review): oob_score only toggles an out-of-bag estimate and should not
# affect predictions, so searching over it likely wastes a dimension - confirm.
# NOTE(review): n_estimators can only take the values 1000 or 1001, i.e. it is
# effectively fixed.
params_space = [
    Integer(3, 17, name='max_depth'),
    Categorical([True, False], name='oob_score'),
    Integer(1, len(Xvar), name='max_features'),
    Integer(2, 100, name='min_samples_split'),
    Integer(1, 100, name='min_samples_leaf'),
    Integer(1000, 1001, name='n_estimators'),
]

In [6]:
### Custom cross-validation objective: mean per-fold RMSE of a random forest

@use_named_args(params_space)
def objective2(**params):
    """
    Objective function to minimize: 5-fold cross-validated RMSE.

    The decorator maps a sampled point onto keyword arguments named after the
    dimensions of ``params_space``; these are forwarded to
    ``RandomForestRegressor`` together with ``n_jobs=-1``.

    The module-level ``X_train`` / ``y_train`` are split with a shuffled
    5-fold ``KFold``. The fold assignment and the forest are both seeded, so
    every candidate is scored on the same folds and repeated calls with the
    same parameters give the same score.

    Side effects: prints a separator line, the parameters used and the score.

    Returns
    -------
    float
        Mean over the folds of sqrt(MSE) on the held-out part of each fold.
    """
    # Same value as the random_state passed to gp_minimize in the optimisation cell
    seed = 0

    # Add the fixed (non-searched) estimator settings to the sampled ones
    params = {**params, 'n_jobs': -1}

    cls_ = RandomForestRegressor(random_state=seed, **params)
    print("------ Sampling new data point ------")

    # Fixed random_state: identical folds for every sampled point
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
    fold_mse = []

    for train_index, test_index in kfold.split(X_train, y_train):
        X_train_, X_test_ = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
        y_train_, y_test_ = y_train.iloc[train_index], y_train.iloc[test_index]

        # Train regressor on current fold and score it on the held-out part
        cls_.fit(X_train_, y_train_)
        y_test_predicted = cls_.predict(X_test_)

        fold_mse.append(mean_squared_error(y_test_, y_test_predicted))

    # RMSE is computed per fold first, then averaged across folds
    score = np.mean(np.sqrt(fold_mse))
    print("Params:",params)
    print("Score:", score)

    return score

In [7]:
# Run the Gaussian-process based minimisation of the objective over params_space

from skopt import gp_minimize

res_gp = gp_minimize(
    func=objective2,
    dimensions=params_space,
    n_calls=100,
    random_state=0,
)

# Last expression of the cell, shown as the cell output
f"Best score={res_gp.fun:.4f}"

------ Sampling new data point ------
Params: {'max_depth': 11, 'oob_score': False, 'max_features': 7, 'min_samples_split': 85, 'min_samples_leaf': 63, 'n_estimators': 1000, 'n_jobs': -1}
Score: 3.132340250819464
------ Sampling new data point ------
Params: {'max_depth': 7, 'oob_score': True, 'max_features': 3, 'min_samples_split': 49, 'min_samples_leaf': 81, 'n_estimators': 1000, 'n_jobs': -1}
Score: 3.1450229532130756
------ Sampling new data point ------
Params: {'max_depth': 8, 'oob_score': False, 'max_features': 3, 'min_samples_split': 66, 'min_samples_leaf': 37, 'n_estimators': 1001, 'n_jobs': -1}
Score: 3.1266063138028874
------ Sampling new data point ------
Params: {'max_depth': 5, 'oob_score': False, 'max_features': 4, 'min_samples_split': 80, 'min_samples_leaf': 53, 'n_estimators': 1001, 'n_jobs': -1}
Score: 3.16964325878399
------ Sampling new data point ------
Params: {'max_depth': 13, 'oob_score': False, 'max_features': 5, 'min_samples_split': 76, 'min_samples_leaf': 11,

------ Sampling new data point ------
Params: {'max_depth': 10, 'oob_score': False, 'max_features': 3, 'min_samples_split': 2, 'min_samples_leaf': 63, 'n_estimators': 1000, 'n_jobs': -1}
Score: 3.1289005336687676
------ Sampling new data point ------
Params: {'max_depth': 17, 'oob_score': False, 'max_features': 1, 'min_samples_split': 2, 'min_samples_leaf': 7, 'n_estimators': 1000, 'n_jobs': -1}
Score: 3.096651745151091
------ Sampling new data point ------
Params: {'max_depth': 17, 'oob_score': True, 'max_features': 1, 'min_samples_split': 2, 'min_samples_leaf': 16, 'n_estimators': 1000, 'n_jobs': -1}
Score: 3.1323283710053147
------ Sampling new data point ------
Params: {'max_depth': 17, 'oob_score': False, 'max_features': 1, 'min_samples_split': 100, 'min_samples_leaf': 20, 'n_estimators': 1000, 'n_jobs': -1}
Score: 3.1863393101927935
------ Sampling new data point ------
Params: {'max_depth': 15, 'oob_score': False, 'max_features': 5, 'min_samples_split': 2, 'min_samples_leaf': 30

------ Sampling new data point ------
Params: {'max_depth': 14, 'oob_score': False, 'max_features': 2, 'min_samples_split': 2, 'min_samples_leaf': 9, 'n_estimators': 1001, 'n_jobs': -1}
Score: 3.073649365943934
------ Sampling new data point ------
Params: {'max_depth': 15, 'oob_score': False, 'max_features': 1, 'min_samples_split': 2, 'min_samples_leaf': 1, 'n_estimators': 1001, 'n_jobs': -1}
Score: 3.1348191532125638
------ Sampling new data point ------
Params: {'max_depth': 13, 'oob_score': False, 'max_features': 3, 'min_samples_split': 2, 'min_samples_leaf': 15, 'n_estimators': 1001, 'n_jobs': -1}
Score: 3.0767196537647323
------ Sampling new data point ------
Params: {'max_depth': 15, 'oob_score': False, 'max_features': 3, 'min_samples_split': 2, 'min_samples_leaf': 14, 'n_estimators': 1001, 'n_jobs': -1}
Score: 3.0808284973919897
------ Sampling new data point ------
Params: {'max_depth': 11, 'oob_score': False, 'max_features': 3, 'min_samples_split': 2, 'min_samples_leaf': 16, 

'Best score=3.0703'

In [8]:
# Report the best point found, pairing each dimension name with its value
print("Optimal parameters")
dimension_names = [dimension.name for dimension in params_space]
for name, best_value in zip(dimension_names, res_gp.x):
    print(f"Param: {name}, value: {best_value}")

Optimal parameters
Param: max_depth, value: 17
Param: oob_score, value: False
Param: max_features, value: 3
Param: min_samples_split, value: 2
Param: min_samples_leaf, value: 13
Param: n_estimators, value: 1001


In [9]:
from skopt.plots import plot_convergence

In [10]:
# Clear the current figure, then draw the convergence trace of the optimisation.
# The trailing semicolon keeps the returned Axes repr out of the cell output.
plt.clf()
plot_convergence(res_gp);

<matplotlib.axes._subplots.AxesSubplot at 0x109557748>