In [1]:
import math

import numpy as np
import pandas as pd

import lightgbm as lgb
import catboost as cat
from catboost import Pool

import itertools
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, BaseShuffleSplit, _validate_shuffle_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import log_loss

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

import eli5
from IPython.display import display
from eli5.permutation_importance import get_score_importances
from eli5.sklearn import PermutationImportance

import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import seaborn as sns

import optuna

import warnings
warnings.filterwarnings('ignore')


from colorama import Style, Fore

palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
           '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']

blk = Style.BRIGHT + Fore.BLACK
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
res = Style.RESET_ALL


class CFG:
    feature_sel = False
    n_feature_sel_folds = 5
    
    lgbm_optimize = False
    cb_optimize = False
    n_optimize_folds = 3
    n_optimize_repeats = 5
    
    n_stacking_folds = 10
    

# Load Data

In [2]:
# COMP_PATH = "/kaggle/input/icr-identify-age-related-conditions"
COMP_PATH = "icr-identify-age-related-conditions"

train_df = pd.read_csv(f'{COMP_PATH}//train.csv')
test_df = pd.read_csv(f'{COMP_PATH}/test.csv')
greeks = pd.read_csv(f"{COMP_PATH}/greeks.csv")
sample_submission = pd.read_csv(f"{COMP_PATH}/sample_submission.csv")

train_df['EJ'] = train_df['EJ'].replace({'A': 0, 'B': 1})
test_df['EJ'] = test_df['EJ'].replace({'A': 0, 'B': 1})

train_df.columns = train_df.columns.str.replace(' ', '')
test_df.columns = test_df.columns.str.replace(' ', '')

# train_df.drop('Id',axis=1, inplace=True)
# train_df.fillna(train_df.median(), inplace=True)

# Add distance features

In [3]:
# from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

# features = [fe for fe in train_df.columns if fe not in ['Id','Class']].copy()

# # cosine distance
# knn = NearestNeighbors(n_neighbors=21, metric='cosine', n_jobs=-1)
# knn.fit(train_df[features].fillna(0))

# # train
# dists, nears = knn.kneighbors(train_df[features].fillna(0), return_distance=True)
# dists, nears = dists[:,1:], nears[:,1:]

# classes = np.array([train_df.loc[n, 'Class'] for n in nears])
# train_df['class_cos'] = np.array(classes[i].mean() for i in range(len(nears)))
# train_df['class_cos'] = train_df['class_cos'].astype(float)

# train_df['dists_cos_0'] = np.array([dists[i][classes[i] == 0].mean() for i in range(len(nears))])
# train_df['dists_cos_1'] = np.array([dists[i][classes[i] == 1].mean() for i in range(len(nears))])
# train_df['dists_cos_0'], train_df['dists_cos_1'] = train_df['dists_cos_0'].fillna(10), train_df['dists_cos_1'].fillna(10)

# # test
# dists, nears = knn.kneighbors(test_df[features].fillna(0), return_distance=True)
# dists, nears = dists[:,1:], nears[:,1:]

# classes = np.array([train_df.loc[n, 'Class'] for n in nears])
# test_df['class_cos'] = np.array(classes[i].mean() for i in range(len(nears)))
# test_df['class_cos'] = test_df['class_cos'].astype(float)

# test_df['dists_cos_0'] = np.array([dists[i][classes[i] == 0].mean() for i in range(len(nears))])
# test_df['dists_cos_1'] = np.array([dists[i][classes[i] == 1].mean() for i in range(len(nears))])
# test_df['dists_cos_0'], test_df['dists_cos_1'] = test_df['dists_cos_0'].fillna(10), test_df['dists_cos_1'].fillna(10)

# # euclidean distance
# knn = NearestNeighbors(n_neighbors=21, metric='euclidean', n_jobs=-1)
# knn.fit(train_df[features].fillna(0))

# # train
# dists, nears = knn.kneighbors(train_df[features].fillna(0), return_distance=True)
# dists, nears = dists[:,1:], nears[:,1:]

# classes = np.array([train_df.loc[n, 'Class'] for n in nears])
# train_df['class_euc'] = np.array(classes[i].mean() for i in range(len(nears)))
# train_df['class_euc'] = train_df['class_euc'].astype(float)

# train_df['dists_euc_0'] = np.array([dists[i][classes[i] == 0].mean() for i in range(len(nears))])
# train_df['dists_euc_1'] = np.array([dists[i][classes[i] == 1].mean() for i in range(len(nears))])
# train_df['dists_euc_0'], train_df['dists_euc_1'] = train_df['dists_euc_0'].fillna(10), train_df['dists_euc_1'].fillna(10)

# # test
# dists, nears = knn.kneighbors(test_df[features].fillna(0), return_distance=True)
# dists, nears = dists[:,1:], nears[:,1:]

# classes = np.array([train_df.loc[n, 'Class'] for n in nears])
# test_df['class_euc'] = np.array(classes[i].mean() for i in range(len(nears)))
# test_df['class_euc'] = test_df['class_euc'].astype(float)

# test_df['dists_euc_0'] = np.array([dists[i][classes[i] == 0].mean() for i in range(len(nears))])
# test_df['dists_euc_1'] = np.array([dists[i][classes[i] == 1].mean() for i in range(len(nears))])
# test_df['dists_euc_0'], test_df['dists_euc_1'] = test_df['dists_euc_0'].fillna(10), test_df['dists_euc_1'].fillna(10)

# # manhattan distance
# knn = NearestNeighbors(n_neighbors=21, metric='manhattan', n_jobs=-1)
# knn.fit(train_df[features].fillna(0))

# # train
# dists, nears = knn.kneighbors(train_df[features].fillna(0), return_distance=True)
# dists, nears = dists[:,1:], nears[:,1:]

# classes = np.array([train_df.loc[n, 'Class'] for n in nears])
# train_df['class_man'] = np.array(classes[i].mean() for i in range(len(nears)))
# train_df['class_man'] = train_df['class_man'].astype(float)

# train_df['dists_man_0'] = np.array([dists[i][classes[i] == 0].mean() for i in range(len(nears))])
# train_df['dists_man_1'] = np.array([dists[i][classes[i] == 1].mean() for i in range(len(nears))])
# train_df['dists_man_0'], train_df['dists_man_1'] = train_df['dists_man_0'].fillna(10), train_df['dists_man_1'].fillna(10)

# # test
# dists, nears = knn.kneighbors(test_df[features].fillna(0), return_distance=True)
# dists, nears = dists[:,1:], nears[:,1:]

# classes = np.array([train_df.loc[n, 'Class'] for n in nears])
# test_df['class_man'] = np.array(classes[i].mean() for i in range(len(nears)))
# test_df['class_man'] = test_df['class_man'].astype(float)

# test_df['dists_man_0'] = np.array([dists[i][classes[i] == 0].mean() for i in range(len(nears))])
# test_df['dists_man_1'] = np.array([dists[i][classes[i] == 1].mean() for i in range(len(nears))])
# test_df['dists_man_0'], test_df['dists_man_1'] = test_df['dists_man_0'].fillna(10), test_df['dists_man_1'].fillna(10)

# Standard Scaler

In [4]:
# scaler = StandardScaler()
# new_num_cols = train_df.select_dtypes(include=['float64']).columns

# train_df[new_num_cols] = scaler.fit_transform(train_df[new_num_cols])
# test_df[new_num_cols] = scaler.transform(test_df[new_num_cols])

# Brute Force Feature Generation

Combine features in all possible ways.

In [5]:
# fi = pd.read_csv('feature_importances.csv', index_col = 'Unnamed: 0')
# fi_cols = set(fi['Feature'].head(100).values)

# perm = pd.read_csv('perm_df.csv', index_col = 'Unnamed: 0')
# perm_cols = set(perm['importance'].head(100).index)

# important_col = list(perm_cols.intersection(fi_cols))
# print(important_col)

# Denoising

In [6]:
# features = [fe for fe in train_df.columns if fe not in ['Id','CF', 'CB', 'DV', 'BR', 'DF', 'AR', 'GI', 'AY', 'GB',
#                                                         'AH', 'CW', 'CL', 'Class', 'BP']]

# for f in features:
#     train_df[f] = np.floor(train_df[f]*1000)/1000 # quality decreases no significant result for LGBM

# Log features (preserve sign)

In [7]:
# for f in features:
#     train_df[f] = np.sign(train_df[f]) * np.log1p(np.abs(train_df[f])) # no significant result for LGBM

# Feature generation

In [8]:
# features = train_df.drop(['Class', 'Id'], axis=1).columns

features = [fe for fe in train_df.columns if fe not in ['Id','Class']]

# features = [fe for fe in train_df.columns if fe not in ['Id','CF', 'CB', 'DV', 'BR', 'DF', 'AR', 'GI', 'AY', 'GB',
#                                                         'AH', 'CW', 'CL', 'Class', 'BP', 'AX', 'AZ', 'BD', 'BZ', 'EG', 
#                                                         'EH', 'EJ', 'FC', 'FD', 'FE', 'FS', 'GF', 'GH']]

# ['AX', 'AZ', 'BD', 'BZ', 'EG', 'EH', 'EJ', 'FC', 'FD', 'FE', 'FS', 'GF', 'GH']

# selected = ['CR+DU', 'EH-FI', 'CD-DL', 'AB-FI', 'FL+GL', 'DU-EP', 'BQ+EP', 'FR+GL', 'CD-EP', 'AF-EG', 
#             'CU-DU', 'AB-CR', 'BQ+CD', 'BQ-GF', 'BQ+DI', 'CR+DH', 'CD+EL', 'CC+EH', 'DA-DU', 'AF+BQ', 
#             'CR+EH', 'BQ-DN', 'DN-EL', 'EJ+FI', 'BQ-EP', 'EE*EP', 'BQ/CU', 'AB*FR', 'EJ/FL', 'CD/CH', 
#             'CD/EP', 'CR*DU', 'AF/EG', 'CC/CD', 'AB/CH', 'BQ/CH', 'CH/DI', 'AB/DN', 'DU/EP', 'DU/EJ', 
#             'DU/GL', 'DY/EE', 'CR*CS', 'DH/DU', 'EJ*GL', 'CD/DL', 'DH/DI', 'DU*FR', 'CR*DH']

# def gen_features(features, df):
#     generated_features = pd.DataFrame()

#     for fe_a, fe_b in tqdm(itertools.combinations(features, 2), total=sum([1 for i in itertools.combinations(features, 2)])):
#         generated_features[f'{fe_a}+{fe_b}']   = df[fe_a] + df[fe_b]
#         generated_features[f'{fe_a}-{fe_b}']   = df[fe_a] - df[fe_b] 
#         generated_features[f'{fe_a}*{fe_b}']   = df[fe_a] * df[fe_b]
#         generated_features[f'{fe_a}/{fe_b}']   = df[fe_a] / df[fe_b]

# #         generated_features[f'{fe_a}_2']        = df[fe_a].pow(2)
# #         generated_features[f'{fe_b}_2']        = df[fe_b].pow(2)
# #         generated_features[f'{fe_a}*{fe_b}_2'] = df[fe_a] * df[fe_b].pow(2)
# #         generated_features[f'{fe_a}_2*{fe_b}'] = df[fe_a].pow(2) * df[fe_b]

# #         generated_features[f'{fe_a}_05'] = df[fe_a].pow(0.5)
# #         generated_features[f'{fe_b}_05'] = df[fe_b].pow(0.5)
# #         generated_features[f'{fe_a}*{fe_b}_05'] = df[fe_a] * df[fe_b].pow(0.5)
# #         generated_features[f'{fe_a}_05*{fe_b}'] = df[fe_a].pow(0.5) * df[fe_b]

# #         generated_features[f'{fe_a}_log'] = np.log(df[fe_a])
# #         generated_features[f'{fe_b}_log'] = np.log(df[fe_b])
# #         generated_features[f'{fe_a}*{fe_b}_log'] = df[fe_a] * np.log(df[fe_b])
# #         generated_features[f'{fe_a}_log*{fe_b}'] = np.log(df[fe_a]) * df[fe_b]
        
#     generated_features = generated_features[selected]
#     generated_features = pd.concat([generated_features, df[features]], axis=1)
    
#     # prevent inf
#     for g in generated_features.columns:
#         generated_features[g] = np.minimum(np.maximum(generated_features[g], -1e9), 1e9)
    
#     return generated_features

# generated_features_train = gen_features(features, train_df)
# generated_features_test = gen_features(features, test_df)

# Multilabel Stratification

In [9]:
def IterativeStratification(labels, r, random_state):
    """This function implements the Iterative Stratification algorithm described
    in the following paper:
    Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of
    Multi-Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M.
    (eds) Machine Learning and Knowledge Discovery in Databases. ECML PKDD
    2011. Lecture Notes in Computer Science, vol 6913. Springer, Berlin,
    Heidelberg.
    """

    n_samples = labels.shape[0]
    test_folds = np.zeros(n_samples, dtype=int)

    # Calculate the desired number of examples at each subset
    c_folds = r * n_samples

    # Calculate the desired number of examples of each label at each subset
    c_folds_labels = np.outer(r, labels.sum(axis=0))

    labels_not_processed_mask = np.ones(n_samples, dtype=bool)

    while np.any(labels_not_processed_mask):
        # Find the label with the fewest (but at least one) remaining examples,
        # breaking ties randomly
        num_labels = labels[labels_not_processed_mask].sum(axis=0)

        # Handle case where only all-zero labels are left by distributing
        # across all folds as evenly as possible (not in original algorithm but
        # mentioned in the text). (By handling this case separately, some
        # code redundancy is introduced; however, this approach allows for
        # decreased execution time when there are a relatively large number
        # of all-zero labels.)
        if num_labels.sum() == 0:
            sample_idxs = np.where(labels_not_processed_mask)[0]

            for sample_idx in sample_idxs:
                fold_idx = np.where(c_folds == c_folds.max())[0]

                if fold_idx.shape[0] > 1:
                    fold_idx = fold_idx[random_state.choice(fold_idx.shape[0])]

                test_folds[sample_idx] = fold_idx
                c_folds[fold_idx] -= 1

            break

        label_idx = np.where(num_labels == num_labels[np.nonzero(num_labels)].min())[0]
        if label_idx.shape[0] > 1:
            label_idx = label_idx[random_state.choice(label_idx.shape[0])]

        sample_idxs = np.where(np.logical_and(labels[:, label_idx].flatten(), labels_not_processed_mask))[0]

        for sample_idx in sample_idxs:
            # Find the subset(s) with the largest number of desired examples
            # for this label, breaking ties by considering the largest number
            # of desired examples, breaking further ties randomly
            label_folds = c_folds_labels[:, label_idx]
            fold_idx = np.where(label_folds == label_folds.max())[0]

            if fold_idx.shape[0] > 1:
                temp_fold_idx = np.where(c_folds[fold_idx] ==
                                         c_folds[fold_idx].max())[0]
                fold_idx = fold_idx[temp_fold_idx]

                if temp_fold_idx.shape[0] > 1:
                    fold_idx = fold_idx[random_state.choice(temp_fold_idx.shape[0])]

            test_folds[sample_idx] = fold_idx
            labels_not_processed_mask[sample_idx] = False

            # Update desired number of examples
            c_folds_labels[fold_idx, labels[sample_idx]] -= 1
            c_folds[fold_idx] -= 1

    return test_folds


class MultilabelStratifiedKFold(_BaseKFold):
    """Multilabel stratified K-Folds cross-validator
    Provides train/test indices to split multilabel data into train/test sets.
    This cross-validation object is a variation of KFold that returns
    stratified folds for multilabel data. The folds are made by preserving
    the percentage of samples for each label.
    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.
    shuffle : boolean, optional
        Whether to shuffle each stratification of the data before splitting
        into batches.
    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Unlike StratifiedKFold that only uses random_state
        when ``shuffle`` == True, this multilabel implementation
        always uses the random_state since the iterative stratification
        algorithm breaks ties randomly.
    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0)
    >>> mskf.get_n_splits(X, y)
    2
    >>> print(mskf)  # doctest: +NORMALIZE_WHITESPACE
    MultilabelStratifiedKFold(n_splits=2, random_state=0, shuffle=False)
    >>> for train_index, test_index in mskf.split(X, y):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    Notes
    -----
    Train and test sizes may be slightly different in each fold.
    See also
    --------
    RepeatedMultilabelStratifiedKFold: Repeats Multilabel Stratified K-Fold
    n times.
    """

    def __init__(self, n_splits=3, *, shuffle=False, random_state=None):
        super(MultilabelStratifiedKFold, self).__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def _make_test_folds(self, X, y):
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)

        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(type_of_target_y))

        num_samples = y.shape[0]

        rng = check_random_state(self.random_state)
        indices = np.arange(num_samples)

        if self.shuffle:
            rng.shuffle(indices)
            y = y[indices]

        r = np.asarray([1 / self.n_splits] * self.n_splits)

        test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

        return test_folds[np.argsort(indices)]

    def _iter_test_masks(self, X=None, y=None, groups=None):
        test_folds = self._make_test_folds(X, y)
        for i in range(self.n_splits):
            yield test_folds == i

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.
        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.
        groups : object
            Always ignored, exists for compatibility.
        Returns
        -------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(MultilabelStratifiedKFold, self).split(X, y, groups)


class RepeatedMultilabelStratifiedKFold(_RepeatedSplits):
    """Repeated Multilabel Stratified K-Fold cross validator.
    Repeats Mulilabel Stratified K-Fold n times with different randomization
    in each repetition.
    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.
    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.
    random_state : None, int or RandomState, default=None
        Random state to be used to generate random state for each
        repetition as well as randomly breaking ties within the iterative
        stratification algorithm.
    Examples
    --------
    >>> from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> rmskf = RepeatedMultilabelStratifiedKFold(n_splits=2, n_repeats=2,
    ...     random_state=0)
    >>> for train_index, test_index in rmskf.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    ...
    TRAIN: [0 3 4 6] TEST: [1 2 5 7]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [0 1 4 5] TEST: [2 3 6 7]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]
    See also
    --------
    RepeatedStratifiedKFold: Repeats (Non-multilabel) Stratified K-Fold
    n times.
    """
    def __init__(self, n_splits=5, *, n_repeats=10, random_state=None):
        super(RepeatedMultilabelStratifiedKFold, self).__init__(
            MultilabelStratifiedKFold, n_repeats=n_repeats, random_state=random_state,
            n_splits=n_splits)


class MultilabelStratifiedShuffleSplit(BaseShuffleSplit):
    """Multilabel Stratified ShuffleSplit cross-validator
    Provides train/test indices to split data into train/test sets.
    This cross-validation object is a merge of MultilabelStratifiedKFold and
    ShuffleSplit, which returns stratified randomized folds for multilabel
    data. The folds are made by preserving the percentage of each label.
    Note: like the ShuffleSplit strategy, multilabel stratified random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.
    Parameters
    ----------
    n_splits : int, default 10
        Number of re-shuffling & splitting iterations.
    test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. By default, the value is set to 0.1.
        The default will change in version 0.21. It will remain 0.1 only
        if ``train_size`` is unspecified, otherwise it will complement
        the specified ``train_size``.
    train_size : float, int, or None, default is None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Unlike StratifiedShuffleSplit that only uses
        random_state when ``shuffle`` == True, this multilabel implementation
        always uses the random_state since the iterative stratification
        algorithm breaks ties randomly.
    Examples
    --------
    >>> from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5,
    ...    random_state=0)
    >>> msss.get_n_splits(X, y)
    3
    >>> print(mss)       # doctest: +ELLIPSIS
    MultilabelStratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,
                                     train_size=None)
    >>> for train_index, test_index in msss.split(X, y):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 2 5 7] TEST: [0 3 4 6]
    TRAIN: [2 3 6 7] TEST: [0 1 4 5]
    TRAIN: [1 2 5 6] TEST: [0 3 4 7]
    Notes
    -----
    Train and test sizes may be slightly different from desired due to the
    preference of stratification over perfectly sized folds.
    """

    def __init__(self, n_splits=10, *, test_size="default", train_size=None,
                 random_state=None):
        super(MultilabelStratifiedShuffleSplit, self).__init__(
            n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state)

    def _iter_indices(self, X, y, groups=None):
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)

        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                    type_of_target_y))

        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                                  self.train_size)

        n_samples = y.shape[0]
        rng = check_random_state(self.random_state)
        y_orig = y.copy()

        r = np.array([n_train, n_test]) / (n_train + n_test)

        for _ in range(self.n_splits):
            indices = np.arange(n_samples)
            rng.shuffle(indices)
            y = y_orig[indices]

            test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

            test_idx = test_folds[np.argsort(indices)] == 1
            test = np.where(test_idx)[0]
            train = np.where(~test_idx)[0]

            yield train, test

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.
        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.
        groups : object
            Always ignored, exists for compatibility.
        Returns
        -------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(MultilabelStratifiedShuffleSplit, self).split(X, y, groups)

# LGBM feature selection

In [10]:
from shaphypetune import BoostBoruta

params = {
        'boosting_type':'goss',
        'learning_rate': 0.06733232950390658, 
        'n_estimators': 50000, 
        'early_stopping_round' : 100, 
        'subsample' : 0.6970532011679706, # bagging_fraction
        'colsample_bytree': 0.6055755840633003, # feature_fraction
        'num_leaves': 6,
        'class_weight': 'balanced',
        'metric': 'none', 
        'is_unbalance': True, 
        'random_state': 8062023,
        'feature_fraction_seed': 8062023,
        'bagging_seed': 8062023,
        'max_depth': 8,
        'reg_alpha': 0.08866046540248787,  
        'reg_lambda': 1.0245261859148395e-06,
        'importance_type': 'gain'
        }

def balanced_log_loss(y_true, y_pred):

    # Nc is the number of observations
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    # In order to avoid the extremes of the log function, each predicted probability 𝑝 is replaced with max(min(𝑝,1−10−15),10−15)
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # balanced logarithmic loss
    loss_numerator = - (1/N_0) * np.sum((1 - y_true) * np.log(1 - y_pred)) - (1/N_1) * np.sum(y_true * np.log(y_pred))

    return loss_numerator / 2

def bll_metric(y_true, y_pred):
    return 'balanced_log_loss', balanced_log_loss(y_true, y_pred), False

def calc_log_loss_weight(y_true): 
    '''w0, w1 assign different weights to individual data points during training.'''
    nc = np.bincount(y_true)
    w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])
    return w0, w1

def lgbm_tuning(features, permut=False, boruta=False):
    metric = balanced_log_loss
    eval_results_ = {}

    cv_scores = [] # store all cv scores of outer loop inference

    perm_df_ = pd.DataFrame()
    feature_importances_ = pd.DataFrame()
    boruta_df_ = pd.DataFrame()
    
    for i in range(CFG.n_optimize_repeats):
        print(f'Repeat {blu}#{i+1}')
        
        # Create an oof array for inner loop
        oof = np.zeros(train_df.shape[0])
        
        kf = MultilabelStratifiedKFold(n_splits=CFG.n_feature_sel_folds, shuffle=True, random_state=8062023+i)

        # Stratify based on Class and Alpha (3 types of conditions)
        for fold, (train_idx, val_idx) in enumerate(kf.split(X=train_df[features], y=greeks.iloc[:,1:3]), start = 1): 
            X, y = train_df[features], train_df.Class
#             X, y = generated_features_train, train_df.Class
            
            # Split the dataset according to the fold indexes.
            X_train = X.iloc[train_idx]
            X_val = X.iloc[val_idx]
            y_train = y.iloc[train_idx]
            y_val = y.iloc[val_idx]


            X_train = X_train.reset_index(drop=True)
            y_train = y_train.reset_index(drop=True)

            # Store models here
            models_ = [] 

            eval_results_[fold]= {}

            clf = lgb.LGBMClassifier(**params)
            clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], 
                    eval_metric=bll_metric, # eval_sample_weight=w_val, 
                    early_stopping_rounds=300, verbose=-1)

            models_.append(clf)

            val_preds = clf.predict_proba(X_val)[:,1]
            oof[val_idx] = val_preds

            val_score = metric(y_val, val_preds)
            best_iter = clf.best_iteration_

            print(f'Fold: {blu}{fold:>3}{res}| {metric.__name__}: {blu}{val_score:.5f}{res}'
                  f' | Best iteration: {blu}{best_iter:>4}{res}')

            # permutation importance
            if permut:
                perm = PermutationImportance(clf, scoring=None, n_iter=1, 
                                             random_state=42, cv=None, refit=False).fit(X_val, y_val)

                perm_importance_df = pd.DataFrame({'importance': perm.feature_importances_}, 
                                                   index=X_val.columns).sort_index()

                if perm_df_.shape[0] == 0:
                    perm_df_ = perm_importance_df.copy()
                else:
                    perm_df_ += perm_importance_df

            # tree feature importance
            f_i = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns), 
                                              reverse=True, key=lambda x: x[1]), 
                               columns=['Value','Feature'])

            if feature_importances_.shape[0] == 0:
                feature_importances_ = f_i.copy()
            else:

                feature_importances_['Value'] += f_i['Value']

            # Boruta SHAP importance
            if boruta:
                model = BoostBoruta(clf, importance_type='shap_importances', train_importance=False)
                model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], 
                          eval_metric=bll_metric, early_stopping_rounds=300, verbose=-1)

                boruta_importance_df = pd.DataFrame({'importance': model.ranking_}, 
                                                     index=X_train.columns).sort_index()
                if boruta_df_.shape[0] == 0:
                    boruta_df_ = boruta_importance_df.copy()
                else:
                    boruta_df_ += boruta_importance_df

        fold_cv_score = metric(y, oof)
        print(f'{red} CV score: {res} {metric.__name__}: {red}{fold_cv_score:.5f}{res}')
        print(f'{"*" * 50}\n')
        cv_scores.append(fold_cv_score)


    print(f'{red} Avg score {CFG.n_feature_sel_folds}-fold: {res} {metric.__name__}: {red}{np.mean(cv_scores):.5f}{res}')
    print(f'{"*" * 50}\n')
    
    if permut:
        perm_df_ = perm_df_.sort_values('importance', ascending=False)
        
    if boruta:
        boruta_df_ = boruta_df_.sort_values('importance')
                                    
    feature_importances_ = feature_importances_.sort_values('Value', ascending=False)
    
    return perm_df_, feature_importances_, boruta_df_, np.mean(cv_scores)

if CFG.feature_sel:
    perm_df_, feature_importances_, boruta_df_, cv_scores = lgbm_tuning(features, permut=False, boruta=False)

Inner CV avg score:  balanced_log_loss: 0.20305
**************************************************

 Outer Holdout avg score:  balanced_log_loss: 0.21093
**************************************************

# Analyze permutation feature importance

In [11]:
# perm_df_.to_csv('perm_df.csv')
# perm_df_
# perm_cols = set(perm_df_.index[-20:])
# perm_cols

# Analyze tree gain feature importance

In [12]:
# feature_importances_.to_csv('feature_importances.csv')
# feature_importances_
# fi_cols = set(feature_importances_['Feature'].values[-20:])
# fi_cols

# Analyze BORUTA importance

In [13]:
# boruta_df_.to_csv('perm_df.csv')
# boruta_df_
# boruta_cols = set(boruta_df_.index[-20:])
# boruta_cols

# LGBM Optuna optimization

In [14]:
def balanced_log_loss(y_true, y_pred):
    # Nc is the number of observations
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    # In order to avoid the extremes of the log function, each predicted probability 𝑝 is replaced with max(min(𝑝,1−10−15),10−15)
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # balanced logarithmic loss
    loss_numerator = - (1/N_0) * np.sum((1 - y_true) * np.log(1 - y_pred)) - (1/N_1) * np.sum(y_true * np.log(y_pred))

    return loss_numerator / 2

def bll_metric(y_pred, y_true):
    y_true = y_true.get_label()
    return 'balanced_log_loss', balanced_log_loss(y_true, y_pred), False

def calc_log_loss_weight(y_true): 
    '''w0, w1 assign different weights to individual data points during training.'''
    nc = np.bincount(y_true)
    w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])
    return w0, w1

def lgbm_opt(features, boosting_type, n_trials):
    X, y = train_df[features], train_df.Class
#     X, y = generated_features_train, train_df.Class
    
    def objective(trial):
        bll_list = list()
        
        param = {
            # Main parameters
#                     'device': 'gpu',
#                     'gpu_platform_id': 0,
#                     'gpu_device_id': 0,
            'objective': 'binary',
            'metric': 'none',
            'is_unbalance': True,
            'boosting_type': boosting_type, # trial.suggest_categorical('boosting_type', ['goss', 'gbdt', 'dart']),   
            # Hyperparamters (in order of importance decreasing)
            'n_estimators': 3000, # trial.suggest_int('n_estimators', 500, 1500),  # max number of trees in model
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 3e-1),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True), # L1,  alias: reg_alpha
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True), # L2, alias: reg_lambda
             # decrease to deal with overfit
            'max_depth': trial.suggest_int('max_depth', 4, 10),   # tree max depth 
             # decrease to deal with overfit
            'num_leaves': trial.suggest_int('num_leaves', 4, 128),  # Max number of leaves in one tree
                                                                   # should be ~ 2**(max_depth-1)
            'bagging_fraction': None, # Randomly select part of data without 
                                      # resampling if bagging_fraction < 1.0
                                      # alias: subsample
            'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 0.7), # Randomly select a subset of features 
                                                                       # if feature_fraction < 1.0
                                                                       # alias: colsample_bytree
            # decrease to deal with overfit
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100), # Minimal number of data in one leaf
                                                                               # aliases: min_child_samples, 
#             # decrease to deat with overfit
#             'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 1e-1), # Stop trying to split 
#                                                                                                    # leave if sum of it's
#                                                                                                    # hessian less than k
#                                                                                                    # alias: min_child_weight
            
            # increase for accuracy, decrease to deal with overfit
            'max_bin': trial.suggest_int('max_bin', 32, 255), # Max number of bins that feature values will be bucketed in
            # increase to deal with overfit
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7), # Perform bagging at every k iteration
            'early_stopping_round': 100, 
            
#           'subsample_for_bin': 200000, # Number of data that sampled to construct feature discrete bins; setting this 
                                         # to larger value will give better training result but may increase train time 
#           'cat_smooth': trial.suggest_float('cat_smooth', 10.0, 100.0),  # this can reduce the effect of noises in 
                                                                           # categorical features, especially for 
                                                                           # categories with few data
            'verbose': -1
        }

        if boosting_type != 'goss':
            param['bagging_fraction'] = trial.suggest_float('bagging_fraction', 0.3, 0.7)
        
        for i in range(CFG.n_optimize_repeats):
            print(f'Repeat {blu}#{i+1}')

            kf = MultilabelStratifiedKFold(n_splits=CFG.n_optimize_folds, shuffle=True, random_state=10062023+i)

            # Stratify based on Class and Alpha (3 types of conditions)
            for fold, (train_idx, val_idx) in enumerate(kf.split(X=train_df[features], y=greeks.iloc[:,1:3]), start = 1): 

                # Split the dataset according to the fold indexes.
                X_train = X.iloc[train_idx]
                X_val = X.iloc[val_idx]
                y_train = y.iloc[train_idx]
                y_val = y.iloc[val_idx]

                dtrain = lgb.Dataset(X_train, label=y_train)
                dvalid = lgb.Dataset(X_val, label=y_val)
                
                # Add a callback for pruning
                pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'balanced_log_loss')
                
                gbm = lgb.train(
                    param, dtrain, valid_sets=[dvalid], callbacks=[pruning_callback], 
                    feval=bll_metric
                )

                preds = gbm.predict(X_val)
                bll = balanced_log_loss(y_val, preds)
                bll_list.append(bll)
        
        return np.mean(bll_list)
            
    study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=100), direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    df = study.trials_dataframe().sort_values('value')
    df.to_csv(f'optuna_lgbm_{boosting_type}.csv')
            
if CFG.lgbm_optimize:
    for bt in ['goss', 'gbdt']:
        lgbm_opt(features, bt, n_trials=1000)

# CatBoost Optuna optimization

In [15]:
from optuna.integration import CatBoostPruningCallback

def catboost_opt(features, n_trials):
    X, y = train_df[features], train_df.Class
    
    def objective(trial):
        bll_list = list()
        
        # Parameters
        params = {
            'auto_class_weights': 'Balanced',
#             'task_type': 'GPU',
            'eval_metric': 'Logloss',
            'loss_function': 'Logloss', 
            'random_seed': 10062023,
            'od_type': 'Iter', # Type of overfitting detector - stop after k iteraions
            'od_wait': 100, # Overfitting detector - stop training after k iterations without metric improvement
#             'metric_period': 100, # Show metric each k iterations
            'grow_policy':trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
             # Hyperparamters (in order of importance decreasing)
            'iterations' : 5000, # trial.suggest_int('iterations', 300, 1200),        
            'learning_rate' : trial.suggest_loguniform('learning_rate', 1e-3, 3e-1),    
            'l2_leaf_reg': trial.suggest_loguniform("l2_leaf_reg", 1e-8, 100),
            'depth' : trial.suggest_int('depth', 4, 10),  # Max tree depth         
             # decrease to deal with overfit
            'subsample': trial.suggest_float('subsample', 0.3, 1), # randomly select part of data without return
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1), # the percentage of features to use at each 
                                                                                   # split selection
                                                                                   # alias: rsm
             # increase to deal with overfit
            'random_strength': trial.suggest_float('random_strength', 0, 100), # The amount of randomness to use 
                                                                               # for scoring splits when the tree structure
                                                                               # is selected. Helps to avoid overfitting
            'bagging_temperature' : trial.suggest_float('bagging_temperature', 0, 100),     # Assigns random 
                                                                                            # weights to objects
            # this feature value can be increased to 1024 for important features:
            # per_float_feature_quantization='0:border_count=1024'
            'border_count': 254, # trial.suggest_categorical('border_count', [128, 254]), # The number of splits for numerical features
                                                                                          # bigger is better but slowly
                                                                                          # alias: max_bin
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100), # Minimal number of data in one leaf
                                                                               # aliases: min_child_samples, 
        }
        
        if params['grow_policy'] == 'SymmetricTree': 
            params['boosting_type']= trial.suggest_categorical('boosting_type', ['Ordered', 'Plain'])
        else:
            params['boosting_type'] = 'Plain'
            
        if params['grow_policy'] == 'Lossguide': 
            params['max_leaves'] = trial.suggest_int('max_leaves', 4, 128) # Max number of leaves in one tree
        else:                                                              # decrease to deal with overfit
            params['max_leaves'] = None
        
        for i in range(CFG.n_optimize_repeats):
            print(f'{blu}Repeat #{i+1}')

            # Create an oof array for inner loop
            oof = np.zeros(train_df.shape[0])

            kf = MultilabelStratifiedKFold(n_splits=CFG.n_optimize_folds, shuffle=True, random_state=8062023+i)

            # Stratify based on Class and Alpha (3 types of conditions)
            for fold, (train_idx, val_idx) in enumerate(kf.split(X=train_df[features], y=greeks.iloc[:,1:3]), start = 1): 

                # Split the dataset according to the fold indexes.
                X_train = X.iloc[train_idx]
                X_val = X.iloc[val_idx]
                y_train = y.iloc[train_idx]
                y_val = y.iloc[val_idx]
          
                train_pool = Pool(X_train, y_train, cat_features=['EJ'])
                val_pool = Pool(X_val, y_val, cat_features=['EJ'])

                # Learning
                model = cat.CatBoostClassifier(**params)     
                # Add a callback for pruning
#                 pruning_callback = optuna.integration.CatBoostPruningCallback(trial, "Logloss")
                model.fit(train_pool, eval_set=val_pool, verbose=0)#, callbacks=[pruning_callback])
                # Evoke pruning manually
#                 pruning_callback.check_pruned()
                # Predict
                preds = model.predict_proba(val_pool)[:,1]
                # Evaluation
                bll = balanced_log_loss(y_val, preds)
                bll_list.append(bll)
                
        
        return np.mean(bll_list)
            
#     study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=100), direction="minimize")
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    df = study.trials_dataframe()
    df.sort_values('value').iloc[:, [1] + list(range(5, 14))]
    df.to_csv(f'optuna_catboost_fold_.csv')
            
if CFG.cb_optimize:
    catboost_opt(features, n_trials=1000)

# Load LGBM parameters

In [16]:
import glob

param_list = glob.glob("optuna_lgbm*.csv")
models = list()
best_lgbm_params = list()

lgbm_params = pd.DataFrame()

for f in param_list:
    gb_type = [f.split('_')][0][2][:-4]
    tmp = pd.read_csv(f, index_col='Unnamed: 0')
    tmp['params_boosting_type'] = gb_type
    if lgbm_params.shape[0] == 0:
        lgbm_params = tmp
    else:
        lgbm_params = pd.concat([lgbm_params, tmp])
        
lgbm_params = lgbm_params.sort_values('value').head(20)
param_cols = [c for c in lgbm_params.columns if c.startswith('params_')]
lgbm_params = lgbm_params[param_cols]

for idx, row in lgbm_params.iterrows():
    row_dict = {k[7:]: v for k, v in row.items()}
    row_dict['objective'] = 'binary'
    row_dict['metric'] = 'none'
#     row_dict['subsample_for_bin'] = 300000
    row_dict['force_col_wise'] = False
    row_dict['early_stopping_rounds'] = 50
    row_dict['verbose'] = -1
    row_dict['max_bin'] = 255
    row_dict['bagging_freq'] = int(row_dict['bagging_freq'])
    if row_dict['bagging_fraction'] != row_dict['bagging_fraction']:
        row_dict['bagging_fraction'] = None
    row_dict['min_child_samples'] = int(row_dict['min_child_samples'])
    row_dict['n_estimators'] = 3000 # int(row_dict['n_estimators'])
    
    row_dict['learning_rate'] = 0.06733232950390658
    row_dict['num_leaves'] = int(row_dict['num_leaves'])
    row_dict['max_depth'] = int(row_dict['max_depth'])
    row_dict['is_unbalance'] = True
    row_dict['class_weight'] = 'balanced'
    row_dict['verbose'] = -1
    
    best_lgbm_params.append(row_dict)

# LGBM stacking

In [17]:
def bll_metric(y_true, y_pred):
    return 'balanced_log_loss', balanced_log_loss(y_true, y_pred), False

def lgbm_training():
    models_ = list()
    bll_list = list()
    weights_ = list()
    
    X, y = train_df[features], train_df.Class
#     X, y = generated_features_train, train_df.Class
     
    kf = MultilabelStratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=8062023+20)
    metric = balanced_log_loss
    eval_results_ = {}     # used to store evaluation results for each fold

    oof_level2 = np.zeros([y.shape[0], len(best_lgbm_params) + 1])
    oof_level2[:, len(best_lgbm_params)] = y

    print(f"Training with {blu}{X.shape[1]}{res} features")

    for fold, (fit_idx, val_idx) in tqdm(enumerate(kf.split(X=train_df, y=greeks.iloc[:,1:3]), start = 1),
                                         total=CFG.n_stacking_folds):
        
        # Split the dataset according to the fold indexes.
        X_train = X.iloc[fit_idx]
        X_val = X.iloc[val_idx]
        y_train = y.iloc[fit_idx]
        y_val = y.iloc[val_idx]

        for i, params in enumerate(best_lgbm_params):
            
            clf = lgb.LGBMClassifier(**params)
            clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                    eval_metric=bll_metric, verbose=-1)
            models_.append(clf)

            val_preds = clf.predict_proba(X_val)[:,1]
            oof_level2[val_idx, i] = val_preds

            val_score = balanced_log_loss(y_val, val_preds)
            best_iter = clf.best_iteration_

            print(clf.best_iteration_)
            
            print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
                  f' | Best iteration: {blu}{best_iter:>4}{res}')
        
    return oof_level2, models_

oof_level2_lgbm, models_lgbm = lgbm_training()

Training with [1m[34m56[0m features


  0%|          | 0/10 [00:00<?, ?it/s]

322
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.09302[0m | Best iteration: [1m[34m 322[0m
291
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.07349[0m | Best iteration: [1m[34m 291[0m
203
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.07743[0m | Best iteration: [1m[34m 203[0m
309
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.05975[0m | Best iteration: [1m[34m 309[0m
224
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.07209[0m | Best iteration: [1m[34m 224[0m
300
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.08065[0m | Best iteration: [1m[34m 300[0m
299
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.03538[0m | Best iteration: [1m[34m 299[0m
195
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.07314[0m | Best iteration: [1m[34m 195[0m
204
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.08876[0m | Best iteration: [1m[34m 204[0m
392
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.05104[0m | Best iteration: [1m[34m 392[0m
268
Fold: [1m[34m  1[0m| bl

273
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.05437[0m | Best iteration: [1m[34m 273[0m
278
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.06293[0m | Best iteration: [1m[34m 278[0m
294
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.03529[0m | Best iteration: [1m[34m 294[0m
433
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.03387[0m | Best iteration: [1m[34m 433[0m
601
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.03116[0m | Best iteration: [1m[34m 601[0m
448
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.06609[0m | Best iteration: [1m[34m 448[0m
208
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.07634[0m | Best iteration: [1m[34m 208[0m
528
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.01879[0m | Best iteration: [1m[34m 528[0m
415
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.04711[0m | Best iteration: [1m[34m 415[0m
390
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.16776[0m | Best iteration: [1m[34m 390[0m
334
Fold: [1m[34m  2[0m| bl

323
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.11317[0m | Best iteration: [1m[34m 323[0m
231
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.15085[0m | Best iteration: [1m[34m 231[0m
361
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.12430[0m | Best iteration: [1m[34m 361[0m
343
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.14168[0m | Best iteration: [1m[34m 343[0m
431
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.13063[0m | Best iteration: [1m[34m 431[0m
285
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.15499[0m | Best iteration: [1m[34m 285[0m
266
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.13380[0m | Best iteration: [1m[34m 266[0m
384
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.11908[0m | Best iteration: [1m[34m 384[0m
363
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.13490[0m | Best iteration: [1m[34m 363[0m
342
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.14630[0m | Best iteration: [1m[34m 342[0m
394
Fold: [1m[34m  2[0m| bl

485
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.12527[0m | Best iteration: [1m[34m 485[0m
317
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.12595[0m | Best iteration: [1m[34m 317[0m
300
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.12122[0m | Best iteration: [1m[34m 300[0m
380
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.13220[0m | Best iteration: [1m[34m 380[0m
241
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.14094[0m | Best iteration: [1m[34m 241[0m
340
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.12514[0m | Best iteration: [1m[34m 340[0m
193
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.19387[0m | Best iteration: [1m[34m 193[0m
111
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.27135[0m | Best iteration: [1m[34m 111[0m
118
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.22722[0m | Best iteration: [1m[34m 118[0m
131
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.22688[0m | Best iteration: [1m[34m 131[0m
108
Fold: [1m[34m  3[0m| bl

120
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.21766[0m | Best iteration: [1m[34m 120[0m
129
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.20830[0m | Best iteration: [1m[34m 129[0m
170
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.19508[0m | Best iteration: [1m[34m 170[0m
141
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.22502[0m | Best iteration: [1m[34m 141[0m
124
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.22810[0m | Best iteration: [1m[34m 124[0m
104
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.21804[0m | Best iteration: [1m[34m 104[0m
134
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.21541[0m | Best iteration: [1m[34m 134[0m
96
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.18665[0m | Best iteration: [1m[34m  96[0m
91
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.21588[0m | Best iteration: [1m[34m  91[0m
115
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.19554[0m | Best iteration: [1m[34m 115[0m
124
Fold: [1m[34m  3[0m| bll_

127
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.21345[0m | Best iteration: [1m[34m 127[0m
120
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.21721[0m | Best iteration: [1m[34m 120[0m
114
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.20679[0m | Best iteration: [1m[34m 114[0m
107
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.20584[0m | Best iteration: [1m[34m 107[0m
99
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.18558[0m | Best iteration: [1m[34m  99[0m
76
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.18779[0m | Best iteration: [1m[34m  76[0m
92
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.19985[0m | Best iteration: [1m[34m  92[0m
68
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.20928[0m | Best iteration: [1m[34m  68[0m
76
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.20941[0m | Best iteration: [1m[34m  76[0m
85
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.20712[0m | Best iteration: [1m[34m  85[0m
85
Fold: [1m[34m  4[0m| bll_metri

81
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.19977[0m | Best iteration: [1m[34m  81[0m
86
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.19642[0m | Best iteration: [1m[34m  86[0m
81
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.22335[0m | Best iteration: [1m[34m  81[0m
85
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.18480[0m | Best iteration: [1m[34m  85[0m
90
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.17645[0m | Best iteration: [1m[34m  90[0m
68
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.23666[0m | Best iteration: [1m[34m  68[0m
94
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.22022[0m | Best iteration: [1m[34m  94[0m
100
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.15413[0m | Best iteration: [1m[34m 100[0m
93
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.18663[0m | Best iteration: [1m[34m  93[0m
104
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.17922[0m | Best iteration: [1m[34m 104[0m
78
Fold: [1m[34m  4[0m| bll_metric:

241
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.11621[0m | Best iteration: [1m[34m 241[0m
279
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.10341[0m | Best iteration: [1m[34m 279[0m
131
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.14425[0m | Best iteration: [1m[34m 131[0m
177
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.14706[0m | Best iteration: [1m[34m 177[0m
171
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.19555[0m | Best iteration: [1m[34m 171[0m
248
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.11392[0m | Best iteration: [1m[34m 248[0m
153
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.12010[0m | Best iteration: [1m[34m 153[0m
173
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.13148[0m | Best iteration: [1m[34m 173[0m
197
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.12766[0m | Best iteration: [1m[34m 197[0m
152
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.14019[0m | Best iteration: [1m[34m 152[0m
180
Fold: [1m[34m  5[0m| bl

175
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.11755[0m | Best iteration: [1m[34m 175[0m
163
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.14799[0m | Best iteration: [1m[34m 163[0m
218
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.11625[0m | Best iteration: [1m[34m 218[0m
390
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.09669[0m | Best iteration: [1m[34m 390[0m
129
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.14239[0m | Best iteration: [1m[34m 129[0m
244
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.12264[0m | Best iteration: [1m[34m 244[0m
201
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.14987[0m | Best iteration: [1m[34m 201[0m
215
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.10658[0m | Best iteration: [1m[34m 215[0m
219
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.13705[0m | Best iteration: [1m[34m 219[0m
174
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.13188[0m | Best iteration: [1m[34m 174[0m
211
Fold: [1m[34m  6[0m| bl

221
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.09608[0m | Best iteration: [1m[34m 221[0m
199
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.12564[0m | Best iteration: [1m[34m 199[0m
174
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.12152[0m | Best iteration: [1m[34m 174[0m
116
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.13147[0m | Best iteration: [1m[34m 116[0m
128
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.12552[0m | Best iteration: [1m[34m 128[0m
111
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.12530[0m | Best iteration: [1m[34m 111[0m
186
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.13636[0m | Best iteration: [1m[34m 186[0m
135
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.12496[0m | Best iteration: [1m[34m 135[0m
136
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.15344[0m | Best iteration: [1m[34m 136[0m
182
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.15869[0m | Best iteration: [1m[34m 182[0m
106
Fold: [1m[34m  6[0m| bl

127
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.15246[0m | Best iteration: [1m[34m 127[0m
133
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.14421[0m | Best iteration: [1m[34m 133[0m
191
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.11273[0m | Best iteration: [1m[34m 191[0m
173
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.15674[0m | Best iteration: [1m[34m 173[0m
151
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.12039[0m | Best iteration: [1m[34m 151[0m
158
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.13053[0m | Best iteration: [1m[34m 158[0m
34
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.46726[0m | Best iteration: [1m[34m  34[0m
150
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.36302[0m | Best iteration: [1m[34m 150[0m
142
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.45261[0m | Best iteration: [1m[34m 142[0m
149
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.34964[0m | Best iteration: [1m[34m 149[0m
129
Fold: [1m[34m  7[0m| bll

26
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.45787[0m | Best iteration: [1m[34m  26[0m
246
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.38608[0m | Best iteration: [1m[34m 246[0m
152
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.38498[0m | Best iteration: [1m[34m 152[0m
191
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.36388[0m | Best iteration: [1m[34m 191[0m
181
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.38893[0m | Best iteration: [1m[34m 181[0m
162
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.36816[0m | Best iteration: [1m[34m 162[0m
160
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.40781[0m | Best iteration: [1m[34m 160[0m
179
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.39069[0m | Best iteration: [1m[34m 179[0m
163
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.42278[0m | Best iteration: [1m[34m 163[0m
179
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.36189[0m | Best iteration: [1m[34m 179[0m
183
Fold: [1m[34m  7[0m| bll

145
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.38720[0m | Best iteration: [1m[34m 145[0m
350
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.11808[0m | Best iteration: [1m[34m 350[0m
584
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.05870[0m | Best iteration: [1m[34m 584[0m
224
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.11564[0m | Best iteration: [1m[34m 224[0m
807
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.03437[0m | Best iteration: [1m[34m 807[0m
569
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.04653[0m | Best iteration: [1m[34m 569[0m
448
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.06593[0m | Best iteration: [1m[34m 448[0m
533
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.04902[0m | Best iteration: [1m[34m 533[0m
600
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.04253[0m | Best iteration: [1m[34m 600[0m
558
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.05212[0m | Best iteration: [1m[34m 558[0m
539
Fold: [1m[34m  8[0m| bl

546
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.04507[0m | Best iteration: [1m[34m 546[0m
617
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.07063[0m | Best iteration: [1m[34m 617[0m
472
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.06728[0m | Best iteration: [1m[34m 472[0m
688
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.02854[0m | Best iteration: [1m[34m 688[0m
408
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.07395[0m | Best iteration: [1m[34m 408[0m
689
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.06125[0m | Best iteration: [1m[34m 689[0m
559
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.06221[0m | Best iteration: [1m[34m 559[0m
553
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.03033[0m | Best iteration: [1m[34m 553[0m
538
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.05051[0m | Best iteration: [1m[34m 538[0m
373
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.09148[0m | Best iteration: [1m[34m 373[0m
211
Fold: [1m[34m  9[0m| bl

184
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.15327[0m | Best iteration: [1m[34m 184[0m
163
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.15821[0m | Best iteration: [1m[34m 163[0m
153
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.18957[0m | Best iteration: [1m[34m 153[0m
141
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.19867[0m | Best iteration: [1m[34m 141[0m
155
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.20083[0m | Best iteration: [1m[34m 155[0m
175
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.20239[0m | Best iteration: [1m[34m 175[0m
171
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.14039[0m | Best iteration: [1m[34m 171[0m
160
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.13990[0m | Best iteration: [1m[34m 160[0m
177
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.17469[0m | Best iteration: [1m[34m 177[0m
157
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.19662[0m | Best iteration: [1m[34m 157[0m
171
Fold: [1m[34m  9[0m| bl

109
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.17616[0m | Best iteration: [1m[34m 109[0m
237
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.17801[0m | Best iteration: [1m[34m 237[0m
180
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.15524[0m | Best iteration: [1m[34m 180[0m
213
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.12695[0m | Best iteration: [1m[34m 213[0m
122
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.22836[0m | Best iteration: [1m[34m 122[0m
57
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.42841[0m | Best iteration: [1m[34m  57[0m
37
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.43369[0m | Best iteration: [1m[34m  37[0m
44
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.42406[0m | Best iteration: [1m[34m  44[0m
52
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.42712[0m | Best iteration: [1m[34m  52[0m
56
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.45933[0m | Best iteration: [1m[34m  56[0m
70
Fold: [1m[34m 10[0m| bll_metr

53
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.47603[0m | Best iteration: [1m[34m  53[0m
44
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.43261[0m | Best iteration: [1m[34m  44[0m
42
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.41722[0m | Best iteration: [1m[34m  42[0m
49
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.48006[0m | Best iteration: [1m[34m  49[0m
52
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.47818[0m | Best iteration: [1m[34m  52[0m
57
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.43859[0m | Best iteration: [1m[34m  57[0m
104
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.42257[0m | Best iteration: [1m[34m 104[0m
44
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.46387[0m | Best iteration: [1m[34m  44[0m
68
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.44809[0m | Best iteration: [1m[34m  68[0m
65
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.48039[0m | Best iteration: [1m[34m  65[0m
57
Fold: [1m[34m 10[0m| bll_metric: 

# Load CatBoost parameters

In [26]:
import glob

param_list = glob.glob("optuna_catboost*.csv")
models = list()
best_cb_params = list()

cb_params = pd.DataFrame()

for f in param_list:
    gb_type = [f.split('_')][0][2][:-4]
    tmp = pd.read_csv(f, index_col='Unnamed: 0')
    if cb_params.shape[0] == 0:
        cb_params = tmp
    else:
        cb_params = pd.concat([cb_params, tmp])
        
cb_params = cb_params.sort_values('value').head(10)
param_cols = [c for c in cb_params.columns if c.startswith('params_')]
cb_params = cb_params[param_cols]


for idx, row in cb_params.iterrows():
    row_dict = {k[7:]: v for k, v in row.items()}
    row_dict['auto_class_weights'] = 'Balanced'
    row_dict['eval_metric'] = 'Logloss'
    row_dict['loss_function'] = 'Logloss'
    row_dict['random_seed'] = 13062023
    row_dict['verbose'] = 0
    row_dict['od_type'] = 'Iter'
    row_dict['od_wait'] = 100
    row_dict['border_count'] = 254
    row_dict['iterations'] = 10000
    
    row_dict['learning_rate'] = float(row_dict['learning_rate'])
    row_dict['l2_leaf_reg'] = float(row_dict['l2_leaf_reg'])
    row_dict['depth'] = int(row_dict['depth'])
    row_dict['subsample'] = 0.7 # float(row_dict['subsample'])
    row_dict['colsample_bylevel'] = 0.6 # float(row_dict['colsample_bylevel'])
    row_dict['random_strength'] = float(row_dict['random_strength'])
    row_dict['bagging_temperature'] = float(row_dict['bagging_temperature'])
    row_dict['min_data_in_leaf'] = int(row_dict['min_data_in_leaf'])
    if row_dict['grow_policy'] == 'Lossguide':
        row_dict['max_leaves'] = int(row_dict['max_leaves'])
    else:
        del row_dict['max_leaves']
    if row_dict['grow_policy'] == 'SymmetricTree':
        row_dict['boosting_type'] = 'Plain'
    else:
        del row_dict['boosting_type']
    best_cb_params.append(row_dict)

In [31]:
cb_params

Unnamed: 0,params_bagging_temperature,params_boosting_type,params_colsample_bylevel,params_depth,params_grow_policy,params_l2_leaf_reg,params_learning_rate,params_max_leaves,params_min_data_in_leaf,params_random_strength,params_subsample
396,16.924565,,0.62421,7,Depthwise,98.992431,0.019815,,40,52.263689,0.701717
481,28.824499,,0.626792,7,Depthwise,47.924482,0.018408,,48,41.631713,0.763397
394,16.455069,,0.707206,7,Depthwise,98.541614,0.022903,,52,51.869465,0.71552
370,13.088738,,0.687809,7,Depthwise,41.046913,0.028732,,78,67.964056,0.655998
390,18.837716,,0.626973,7,Depthwise,60.975839,0.023156,,45,56.234068,0.675789
449,31.240715,,0.60514,7,Depthwise,50.982881,0.016109,,37,45.490921,0.779972
466,28.818879,,0.622859,7,Depthwise,99.863997,0.017545,,41,41.019326,0.685292
467,29.181086,,0.627586,7,Depthwise,99.522737,0.016224,,42,40.595238,0.684444
498,60.471417,,0.61306,7,Depthwise,62.874617,0.015994,,44,43.275402,0.695487
484,29.059347,,0.639709,7,Depthwise,40.758941,0.018064,,48,41.449227,0.744676


# CatBoost Stacking

In [28]:
def bll_metric(y_true, y_pred):
    return 'balanced_log_loss', balanced_log_loss(y_true, y_pred), False

def cb_training():
    models_ = list()
    bll_list = list()
    weights_ = list()
    
    X, y = train_df[features], train_df.Class
#     X, y = generated_features_train, train_df.Class
     
    kf = MultilabelStratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=8062023+20)
    metric = balanced_log_loss
    eval_results_ = {}     # used to store evaluation results for each fold

    oof_level2 = np.zeros([y.shape[0], len(best_cb_params) + 1])
    oof_level2[:, len(best_cb_params)] = y

    print(f"Training with {blu}{X.shape[1]}{res} features")

    for fold, (fit_idx, val_idx) in tqdm(enumerate(kf.split(X=train_df, y=greeks.iloc[:,1:3]), start = 1),
                                         total=CFG.n_stacking_folds):
        
        # Split the dataset according to the fold indexes.
        X_train = X.iloc[fit_idx]
        X_val = X.iloc[val_idx]
        y_train = y.iloc[fit_idx]
        y_val = y.iloc[val_idx]

        train_pool = Pool(X_train, y_train, cat_features=['EJ'])
        val_pool = Pool(X_val, y_val, cat_features=['EJ'])
        
        for i, params in enumerate(best_cb_params):
            
            model = cat.CatBoostClassifier(**params)
            model.fit(train_pool, eval_set=val_pool, verbose=0)
            models_.append(model)

            val_preds = model.predict_proba(val_pool)[:,1]
            oof_level2[val_idx, i] = val_preds

            val_score = balanced_log_loss(y_val, val_preds)
            best_iter = model.best_iteration_

            print(model.best_iteration_)
            
            print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
                  f' | Best iteration: {blu}{best_iter:>4}{res}')
        
    return oof_level2, models_

oof_level2_cb, models_cb = cb_training()

Training with [1m[34m56[0m features


  0%|          | 0/10 [00:00<?, ?it/s]

6908
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.09011[0m | Best iteration: [1m[34m6908[0m
2384
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.08756[0m | Best iteration: [1m[34m2384[0m
3617
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.09357[0m | Best iteration: [1m[34m3617[0m
5311
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.06641[0m | Best iteration: [1m[34m5311[0m
4726
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.08850[0m | Best iteration: [1m[34m4726[0m
2263
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.09716[0m | Best iteration: [1m[34m2263[0m
5514
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.09097[0m | Best iteration: [1m[34m5514[0m
9997
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.08402[0m | Best iteration: [1m[34m9997[0m
2911
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.09060[0m | Best iteration: [1m[34m2911[0m
2441
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.08658[0m | Best iteration: [1m[34m2441[0m
7892
Fold: [1m[34m

1336
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.26196[0m | Best iteration: [1m[34m1336[0m
1299
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.26428[0m | Best iteration: [1m[34m1299[0m
1315
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.28709[0m | Best iteration: [1m[34m1315[0m
1701
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.25630[0m | Best iteration: [1m[34m1701[0m
1450
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.25538[0m | Best iteration: [1m[34m1450[0m
1151
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.25115[0m | Best iteration: [1m[34m1151[0m
632
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.49774[0m | Best iteration: [1m[34m 632[0m
639
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.47966[0m | Best iteration: [1m[34m 639[0m
559
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.52741[0m | Best iteration: [1m[34m 559[0m
387
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.50852[0m | Best iteration: [1m[34m 387[0m
538
Fold: [1m[34m 10[

# Stacking with Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

oof_level2 = np.concatenate([oof_level2_lgbm[:,:-1] , oof_level2_cb[:,:-1] ], axis=1)
X = oof_level2
y = oof_level2_lgbm[:,-1]

# mean bll
print(balanced_log_loss(y, np.mean(X, axis=1)))

lr = LogisticRegression(class_weight='balanced')
lr.fit(X, y)

pred = lr.predict_proba(X)[:,1]

# lr bll
print(balanced_log_loss(y, pred))

weights = lr.coef_[0]

0.19535033662509985
0.18179915769320437


# Models evaluation

In [22]:
# ## Model Evaluation
# metric_score_folds = pd.DataFrame.from_dict(all_eval_results_)
# fit_logloss = []
# val_logloss = []

# for seed in CFG.seeds:
#     for fold in range(1,CFG.n_folds+1):
#         fit_logloss.append(metric_score_folds[seed][fold]['training']['balanced_log_loss'])
#         val_logloss.append(metric_score_folds[seed][fold]['valid_1']['balanced_log_loss'])

# fig, axes = plt.subplots(math.ceil(CFG.n_folds*len(CFG.seeds)/CFG.n_folds), CFG.n_folds, figsize=(20, 20), dpi=150)
# ax = axes.flatten()
# for i, (f, v, m) in enumerate(zip(fit_logloss, val_logloss, models_), start = 1): 
#     sns.lineplot(f, color='#B90000', ax=ax[i-1], label='fit')
#     sns.lineplot(v, color='#048BA8', ax=ax[i-1], label='val')
#     ax[i-1].legend()
#     ax[i-1].spines['top'].set_visible(False);
#     ax[i-1].spines['right'].set_visible(False)
#     ax[i-1].set_title(f'Seed {CFG.seeds[(i-1)//CFG.n_folds]} Fold {CFG.n_folds if i%CFG.n_folds==0 else i%CFG.n_folds}', fontdict={'fontweight': 'bold'})

#     color =  ['#048BA8', palette[-3]]
#     best_iter = m.best_iteration_
#     span_range = [[0, best_iter], [best_iter + 10, best_iter + CFG.num_boost_round]]

#     for idx, sub_title in enumerate([f'Best\nIteration: {best_iter}', f'Early\n Stopping: 2000']):
#         ax[i-1].annotate(sub_title,
#                     xy=(sum(span_range[idx])/2 , 0.5),
#                     xytext=(0,0), textcoords='offset points',
#                     va="center", ha="center",
#                     color="w", fontsize=16, fontweight='bold',
#                     bbox=dict(boxstyle='round4', pad=0.4, color=color[idx], alpha=0.6))
#         ax[i-1].axvspan(span_range[idx][0]-0.4,span_range[idx][1]+0.4,  color=color[idx], alpha=0.07)

#     ax[i-1].set_xlim(0, best_iter + 20 + 2000)
#     ax[i-1].legend(bbox_to_anchor=(0.95, 1), loc='upper right', title='logloss')

# plt.tight_layout();

# Predict test

In [38]:
models = models_lgbm + models_cb

def predict(X):
    y = np.zeros(len(X))
    for i, model in enumerate(models):
#         y += weights[i] * model.predict_proba(X)[:,1]
        y += model.predict_proba(X)[:,1]
#     return y / sum(weights)
    return y / len(models)

predictions = predict(test_df[features])
# predictions = predict(generated_features_test)

test_df['class_1'] = predictions
test_df['class_0'] = 1 - predictions

sample_submission[['class_0', 'class_1']] = test_df[['class_0', 'class_1']]
sample_submission.to_csv(r"submission.csv", index=False)
sample_submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.739544,0.260456
1,010ebe33f668,0.739544,0.260456
2,02fa521e1838,0.739544,0.260456
3,040e15f562a2,0.739544,0.260456
4,046e85c7cc7f,0.739544,0.260456


You have a lot of resulting features. I have already identified a few important once. 