Nested-Cross Val

In [70]:
# grid-search cross-validation for an ElasticNet regressor on the logP dataset, scored on a held-out test file
from numpy import mean
from numpy import std
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

# Load the training and hold-out test feature tables; 'logP' is the target column.
train = pd.read_csv('/home/redman/blackthorn/enamine/f/data/train_logP_v4_features_2.2.csv')
# Drop fold_id so it is not fed to the model as a feature (every remaining
# non-target column ends up in X_train below).
train = train.drop(columns=['fold_id'])
train = train.dropna(axis=0)
test = pd.read_csv('/home/redman/blackthorn/enamine/f/data/test_logP_v4_features_2.2.csv')
test = test.dropna(axis=0)

# Inner CV splitter. StratifiedKFoldContinuous is defined in a separate cell
# of this notebook, which has to be executed before this one.
cv_inner = StratifiedKFoldContinuous(n_splits=2, shuffle=True, random_state=1)

# Split both tables into features and target.
X_train = train.drop(columns=['logP'])
y_train = train['logP']
X_test = test.drop(columns=['logP'])
y_test = test['logP']

# Define the model.
model = ElasticNet(random_state=1)

# Define the search space. l1_ratio is spelled out explicitly: np.arange with
# a fractional step can gain or lose the last element through floating-point
# rounding, so the grid size would not be guaranteed.
param_grid = {
    'alpha': [0.1, 1, 10, 0.01],
    'l1_ratio': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'tol': [0.0001, 0.001],
}

# Define the search: three metrics are recorded, and refit='r2' makes R^2 the
# one used to pick and refit the best estimator.
search = GridSearchCV(
    model,
    param_grid,
    scoring=['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'],
    cv=cv_inner,
    refit='r2',
    return_train_score=True,
)
# Execute the search.
result = search.fit(X_train, y_train)
# Best-performing model, refit on the whole training set.
best_model = result.best_estimator_
# Predictions on the hold-out dataset.
yhat = best_model.predict(X_test)
# evaluate the model
def metrics(test, feature, regr=best_model):
    """Print R^2, MAE and MSE of a fitted regressor on one data frame.

    Parameters
    ----------
    test : pandas.DataFrame
        Table holding the model inputs plus the target column ``feature``.
        Every column other than ``feature`` is passed to ``regr.predict``.
    feature : str
        Name of the target column inside ``test``.
    regr : object with a ``predict`` method
        Fitted model to evaluate. The default is whatever ``best_model``
        referred to at the moment this ``def`` statement was executed.

    Returns
    -------
    None
        The three scores are only printed, each rounded to 3 decimals.
    """
    predictions = regr.predict(test.drop([feature], axis=1))
    # Look the target column up once and reuse it for every metric.
    observed = test[feature]
    r2_value = r2_score(y_true=observed, y_pred=predictions)

    print('=========%s=========' % (feature))
    print('R^2 = ' + str(round(r2_value, 3)))
    mae_value = round(mae(y_true=observed, y_pred=predictions), 3)
    print('MAE = ', mae_value)
    mse_value = round(mse(y_true=observed, y_pred=predictions), 3)
    print('MSE = ', mse_value)
# Report the scores on the hold-out test table first, then on the training
# table, both with 'logP' as the target column and the default regressor.
metrics(test, 'logP')
metrics(train, 'logP')



R^2 = 0.858
MAE =  0.15
MSE =  0.031
R^2 = 0.798
MAE =  0.178
MSE =  0.063


In [105]:
# Display the 'mean_test_r2' array of the grid search: the cross-validated
# test R^2 averaged over the folds, one value per parameter combination tried.
result.cv_results_['mean_test_r2']

array([0.58776168, 0.5869133 , 0.56956858, 0.56866354, 0.54475645,
       0.54380843, 0.52856372, 0.52708934, 0.52203393, 0.52091415,
       0.5191257 , 0.51796504, 0.51810282, 0.51772172, 0.52243634,
       0.52231281, 0.52483844, 0.52486331, 0.52630363, 0.52633979,
       0.52715509, 0.52714213, 0.5277299 , 0.52772084, 0.3886759 ,
       0.38875796, 0.3260629 , 0.32615315, 0.27954997, 0.27955867,
       0.25001265, 0.25001904, 0.21183987, 0.21184468, 0.16685822,
       0.16685818, 0.52719798, 0.5273059 , 0.52149305, 0.52169414,
       0.51968528, 0.52007   , 0.51111664, 0.51133001, 0.51089772,
       0.51117299, 0.51764063, 0.51783681])

In [98]:
# Positions of the candidate(s) whose mean test neg-RMSE equals the maximum,
# i.e. the parameter combination(s) with the smallest cross-validated RMSE.
np.nonzero(
    result.cv_results_['mean_test_neg_root_mean_squared_error']
    == max(result.cv_results_['mean_test_neg_root_mean_squared_error'])
)


(array([0]),)

In [72]:
# List the names of all arrays stored in the grid-search results dictionary.
result.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_alpha', 'param_l1_ratio', 'param_tol', 'params', 'split0_test_r2', 'split1_test_r2', 'mean_test_r2', 'std_test_r2', 'rank_test_r2', 'split0_train_r2', 'split1_train_r2', 'mean_train_r2', 'std_train_r2', 'split0_test_neg_mean_absolute_error', 'split1_test_neg_mean_absolute_error', 'mean_test_neg_mean_absolute_error', 'std_test_neg_mean_absolute_error', 'rank_test_neg_mean_absolute_error', 'split0_train_neg_mean_absolute_error', 'split1_train_neg_mean_absolute_error', 'mean_train_neg_mean_absolute_error', 'std_train_neg_mean_absolute_error', 'split0_test_neg_root_mean_squared_error', 'split1_test_neg_root_mean_squared_error', 'mean_test_neg_root_mean_squared_error', 'std_test_neg_root_mean_squared_error', 'rank_test_neg_root_mean_squared_error', 'split0_train_neg_root_mean_squared_error', 'split1_train_neg_root_mean_squared_error', 'mean_train_neg_root_mean_squared_error', 'std_train_neg_root_mean_squ

In [42]:
import warnings

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.utils import check_random_state
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_array, column_or_1d

class StratifiedKFoldContinuous(StratifiedKFold):
    """Stratified K-Folds cross-validator for continuous targets.

    Provides train/test indices to split data in train/test sets.

    This cross-validation object is a variation of KFold that returns
    stratified folds. A continuous target is first cut into ``n_bins``
    equal-width bins; the folds are then made by preserving the percentage
    of samples for each bin. Binary and multiclass targets are stratified
    on their labels directly, without binning.

    For visualisation of cross-validation behaviour and
    comparison between common scikit-learn split methods
    refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

        .. versionchanged:: 0.22
            ``n_splits`` default value changed from 3 to 5.

    shuffle : bool, default=False
        Whether to shuffle each bin's samples before splitting into batches.
        Note that the samples within each split will not be shuffled.

    random_state : int, RandomState instance or None, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold for each bin.
        Otherwise, leave `random_state` as `None`.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_bins : int, default=10
        Number of bins to make classes from the continuum. Must be at least 2.
        Only used when the target is detected as continuous.

    Examples
    --------
    >>> import numpy as np
    >>> X = np.zeros((20, 1))
    >>> y = np.linspace(0.0, 1.0, 20)
    >>> cv = StratifiedKFoldContinuous(n_splits=2, n_bins=2)
    >>> for train_index, test_index in cv.split(X, y):
    ...     print(test_index)
    [ 0  1  2  3  4 10 11 12 13 14]
    [ 5  6  7  8  9 15 16 17 18 19]

    Notes
    -----
    The implementation is designed to:

    * Generate test sets such that all contain the same distribution of
      bins, or as close as possible.
    * Be invariant to bins label.
    * Preserve order dependencies in the dataset ordering, when
      ``shuffle=False``: all samples from class k in some test set were
      contiguous in y, or separated in y by samples from classes other than k.
    * Generate test sets where the smallest and largest differ by at most one
      sample.

    .. versionchanged:: 0.22
        The previous implementation did not follow the last constraint.

    Bins are equal-width (``strategy='uniform'``), so sparsely populated
    regions of the target can yield bins with fewer than ``n_splits``
    members; a ``UserWarning`` is emitted in that case.

    See Also
    --------
    StratifiedKFold : The label-based splitter this class extends.
    RepeatedStratifiedKFold : Repeats Stratified K-Fold n times.
    """

    def __init__(self, n_splits=5, *, shuffle=False, random_state=None, n_bins=10):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
        # Keep the bin count on the instance so _make_test_folds can use it.
        self.n_bins = n_bins

    def _make_test_folds(self, X, y=None):
        """Assign every sample to the fold in which it is a test sample.

        Parameters
        ----------
        X : array-like
            Not used here; present for signature compatibility.
        y : array-like of shape (n_samples,)
            Target values. A continuous target is discretised into
            ``self.n_bins`` equal-width bins before stratification.

        Returns
        -------
        test_folds : ndarray of shape (n_samples,)
            Integer fold index (``0 .. n_splits - 1``) for each sample.

        Raises
        ------
        ValueError
            If the target type is not binary, multiclass or continuous, or
            if ``n_splits`` exceeds the number of members of every class.
        """
        rng = check_random_state(self.random_state)
        y = np.asarray(y)
        type_of_target_y = type_of_target(y)
        allowed_target_types = ("binary", "multiclass", "continuous")
        if type_of_target_y not in allowed_target_types:
            raise ValueError(
                "Supported target types are: {}. Got {!r} instead.".format(
                    allowed_target_types, type_of_target_y
                )
            )
        if type_of_target_y == "continuous":
            # Turn the continuum into ordinal bin labels that can be
            # stratified exactly like class labels.
            y = KBinsDiscretizer(
                n_bins=self.n_bins,
                encode='ordinal',
                strategy='uniform',
            ).fit_transform(y.reshape(-1, 1))

        # The discretiser returns a column vector; flatten it.
        y = column_or_1d(y)

        _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
        # y_inv encodes y according to lexicographic order. We invert y_idx to
        # map the classes so that they are encoded by order of appearance:
        # 0 represents the first label appearing in y, 1 the second, etc.
        _, class_perm = np.unique(y_idx, return_inverse=True)
        y_encoded = class_perm[y_inv]

        n_classes = len(y_idx)
        y_counts = np.bincount(y_encoded)
        min_groups = np.min(y_counts)
        if np.all(self.n_splits > y_counts):
            raise ValueError(
                "n_splits=%d cannot be greater than the"
                " number of members in each class." % (self.n_splits)
            )
        if self.n_splits > min_groups:
            warnings.warn(
                "The least populated class in y has only %d"
                " members, which is less than n_splits=%d."
                % (min_groups, self.n_splits),
                UserWarning,
            )

        # Determine the optimal number of samples from each class in each fold,
        # using round robin over the sorted y. (This can be done direct from
        # counts, but that code is unreadable.)
        y_order = np.sort(y_encoded)
        allocation = np.asarray(
            [
                np.bincount(y_order[i :: self.n_splits], minlength=n_classes)
                for i in range(self.n_splits)
            ]
        )

        # To maintain the data order dependencies as best as possible within
        # the stratification constraint, we assign samples from each class in
        # blocks (and then mess that up when shuffle=True).
        test_folds = np.empty(len(y), dtype="i")
        for k in range(n_classes):
            # since the kth column of allocation stores the number of samples
            # of class k in each test set, this generates blocks of fold
            # indices corresponding to the allocation for class k.
            folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])
            if self.shuffle:
                rng.shuffle(folds_for_class)
            test_folds[y_encoded == k] = folds_for_class
        return test_folds

    def _iter_test_masks(self, X, y=None, groups=None):
        """Yield one boolean test mask per fold, in fold order."""
        test_folds = self._make_test_folds(X, y)
        for i in range(self.n_splits):
            yield test_folds == i

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.

        y : array-like of shape (n_samples,)
            The target variable for supervised learning problems.
            Stratification is done based on the y labels, or on the bins of
            y when it is continuous.

        groups : object
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting `random_state`
        to an integer.
        """
        y = check_array(y, input_name="y", ensure_2d=False, dtype=None)
        return super().split(X, y, groups)