In [16]:
import math

import numpy as np
import pandas as pd

import lightgbm as lgb
import catboost as cat
from catboost import Pool
import xgboost as xgb

import itertools
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, BaseShuffleSplit, _validate_shuffle_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import log_loss

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

from scipy import stats

import eli5
from IPython.display import display
from eli5.permutation_importance import get_score_importances
from eli5.sklearn import PermutationImportance

import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import seaborn as sns

import optuna

import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices emitted by the libraries imported above).
warnings.filterwarnings('ignore')

# Render up to 100 rows / 100 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


from colorama import Style, Fore

# Hex colour palette for plots.
palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
           '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']

# ANSI style prefixes used inside print() f-strings; `res` resets the styling
# and should follow every coloured fragment.
blk = Style.BRIGHT + Fore.BLACK
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
res = Style.RESET_ALL


class CFG:
    # Notebook-wide switches and sizes. Attributes are read directly from the
    # class (e.g. `CFG.undersample`); the class is never instantiated here.

    # lgbm_tuning: when True, classes are rebalanced with RandomUnderSampler
    # before cross-validation.
    undersample = True

    # Gates the feature-selection cells (the lgbm_tuning call and the
    # importance/correlation reports that follow it).
    feature_sel = False
    # Number of CV folds used inside lgbm_tuning.
    n_feature_sel_folds = 5

    # Per-model hyperparameter-search switches and search budget.
    # NOTE(review): presumably consumed by the Optuna cells further down — confirm.
    lgbm_optimize = False
    cb_optimize = False
    xgb_optimize = False
    n_trials = 500
    n_optimize_folds = 5
    # lgbm_tuning repeats its whole cross-validation this many times.
    n_optimize_repeats = 5

    # Stacking switches; not referenced in the visible part of the notebook.
    stacking = True
    n_stacking_folds = 5
    

# Load Data

In [2]:
# COMP_PATH = "/kaggle/input/icr-identify-age-related-conditions"
COMP_PATH = "icr-identify-age-related-conditions"

# Competition files. All four reads use the same single-slash join
# (train.csv previously had a stray double slash).
train_df = pd.read_csv(f'{COMP_PATH}/train.csv')
test_df = pd.read_csv(f'{COMP_PATH}/test.csv')
greeks = pd.read_csv(f"{COMP_PATH}/greeks.csv")
sample_submission = pd.read_csv(f"{COMP_PATH}/sample_submission.csv")

# 'EJ' holds the letters A/B; encode it as 0/1 so every feature is numeric.
train_df['EJ'] = train_df['EJ'].replace({'A': 0, 'B': 1})
test_df['EJ'] = test_df['EJ'].replace({'A': 0, 'B': 1})

# Strip spaces from column names so train and test headers are addressed uniformly.
train_df.columns = train_df.columns.str.replace(' ', '')
test_df.columns = test_df.columns.str.replace(' ', '')

# train_df.drop('Id',axis=1, inplace=True)
# train_df.fillna(train_df.median(), inplace=True)

# Standard Scaler

In [3]:
# scaler = StandardScaler()
# new_num_cols = train_df.select_dtypes(include=['float64']).columns

# train_df[new_num_cols] = scaler.fit_transform(train_df[new_num_cols])
# test_df[new_num_cols] = scaler.transform(test_df[new_num_cols])

# Brute Force Feature Generation

Combine features in all possible ways.

In [4]:
# fi = pd.read_csv('feature_importances.csv', index_col = 'Unnamed: 0')
# fi_cols = set(fi['Feature'].head(100).values)

# perm = pd.read_csv('perm_df.csv', index_col = 'Unnamed: 0')
# perm_cols = set(perm['importance'].head(100).index)

# important_col = list(perm_cols.intersection(fi_cols))
# print(important_col)

# Denoising

In [5]:
# features = [fe for fe in train_df.columns if fe not in ['Id','CF', 'CB', 'DV', 'BR', 'DF', 'AR', 'GI', 'AY', 'GB',
#                                                         'AH', 'CW', 'CL', 'Class', 'BP']]

# for f in features:
#     train_df[f] = np.floor(train_df[f]*1000)/1000 # quality decreases no significant result for LGBM

# Log features (preserve sign)

In [6]:
# for f in features:
#     train_df[f] = np.sign(train_df[f]) * np.log1p(np.abs(train_df[f])) # no significant result for LGBM

# Remove outliers

In [7]:
# Columns that are capped: everything except the identifiers/target and the
# features listed below, which are left untouched.
features_with_outliers = [
    fe for fe in train_df.columns
    if fe not in ('BN', 'BQ', 'CW', 'EL', 'GH', 'GI', 'GL', 'Id', 'Class', 'EJ')
]

# Cap the upper tail of each selected training column at its own 99th percentile.
# NOTE(review): only train_df is capped; test_df keeps its raw values — confirm intended.
for f in features_with_outliers:
    upper_cap = train_df[f].quantile(0.99)
    train_df[f] = train_df[f].clip(upper=upper_cap)

# NaN imputing

In [8]:
# from sklearn.impute import KNNImputer
# from sklearn.metrics import pairwise_distances


# train_df['BQ'] = train_df['BQ'].fillna(train_df['BQ'].min())
# train_df['EL'] = train_df['EL'].fillna(train_df['EL'].mode()[0])


# features = [fe for fe in train_df.columns if fe not in ['Id','Class']]

# def cosine_dist(X, Y, metric='cosine', missing_values=np.nan, **kwargs):
#     X[np.isnan(X)]=0
#     Y[np.isnan(Y)]=0
#     return pairwise_distances(X=X.reshape(-1, 1), 
#                               Y=Y.reshape(-1, 1), 
#                               metric='cosine').sum()

# imputer = KNNImputer(n_neighbors=5, metric=cosine_dist)
# imputer.fit_transform(train_df[features])

# Feature generation

In [9]:
# Model inputs: every training column except the target and the row identifier.
features = train_df.columns.drop(['Class', 'Id'])

# features = [fe for fe in train_df.columns if fe not in ['CF', 'CB', 'DV', 'BR', 'DF', 'GB', 'AH', 
#                                                         'CW', 'CL', 'BP', 'BD', 'FC', 'GE', 'GF',
#                                                         'AR', 'GI', 'Id', 'Class', 'AX']]

# def gen_features(features, df):
#     generated_features = pd.DataFrame()

#     for fe_a, fe_b in tqdm(itertools.combinations(features, 2), total=sum([1 for i in itertools.combinations(features, 2)])):

# #         generated_features[f'{fe_a}_2']        = df[fe_a].pow(2)
# #         generated_features[f'{fe_b}_2']        = df[fe_b].pow(2)
# #         generated_features[f'{fe_a}*{fe_b}_2'] = df[fe_a] * df[fe_b].pow(2)
# #         generated_features[f'{fe_a}_2*{fe_b}'] = df[fe_a].pow(2) * df[fe_b]

# #         generated_features[f'{fe_a}_05'] = df[fe_a].pow(0.5)
# #         generated_features[f'{fe_b}_05'] = df[fe_b].pow(0.5)
# #         generated_features[f'{fe_a}*{fe_b}_05'] = df[fe_a] * df[fe_b].pow(0.5)
# #         generated_features[f'{fe_a}_05*{fe_b}'] = df[fe_a].pow(0.5) * df[fe_b]

# #         generated_features[f'{fe_a}_log'] = np.log(df[fe_a])
# #         generated_features[f'{fe_b}_log'] = np.log(df[fe_b])
# #         generated_features[f'{fe_a}*{fe_b}_log'] = df[fe_a] * np.log(df[fe_b])
# #         generated_features[f'{fe_a}_log*{fe_b}'] = np.log(df[fe_a]) * df[fe_b]
        
#     generated_features = generated_features[selected]
#     generated_features = pd.concat([generated_features, df[features]], axis=1)
    
#     # prevent inf
#     for g in generated_features.columns:
#         generated_features[g] = np.minimum(np.maximum(generated_features[g], -1e9), 1e9)
    
#     return generated_features

# generated_features_train = gen_features(features, train_df)
# generated_features_test = gen_features(features, test_df)

# features = generated_features_train.columns

# Add distance features

In [10]:
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

# Ratio of negatives to positives, used to rescale the neighbour label average.
class_imbalance = train_df[train_df['Class'] == 0].shape[0] / train_df[train_df['Class'] == 1].shape[0]

# Average label of the 20 nearest training neighbours (cosine distance).
# 21 neighbours are requested because a training row's closest match is itself.
knn = NearestNeighbors(n_neighbors=21, metric='cosine', n_jobs=-1)
knn.fit(train_df[features].fillna(0))

# kneighbors() returns positional row numbers of the fitted matrix, so labels
# are looked up positionally rather than through the DataFrame index.
train_labels = train_df['Class'].to_numpy()

# train: drop the first neighbour (the row itself)
dists, nears = knn.kneighbors(train_df[features].fillna(0), return_distance=True)
dists, nears = dists[:, 1:], nears[:, 1:]

classes = train_labels[nears]
# Vectorised mean per row; the previous np.array(<generator>) produced a 0-d
# object array wrapping the generator instead of one value per row.
train_df['class_cos'] = classes.mean(axis=1) * class_imbalance

# test: test rows are not part of the fitted index, so the first neighbour is a
# genuine training neighbour; keep the 20 closest and drop the farthest instead.
dists, nears = knn.kneighbors(test_df[features].fillna(0), return_distance=True)
dists, nears = dists[:, :-1], nears[:, :-1]

classes = train_labels[nears]
test_df['class_cos'] = classes.mean(axis=1) * class_imbalance


# features = [fe for fe in train_df.columns if fe not in ['CF', 'CB', 'DV', 'BR', 'DF', 'GB', 'AH', 
#                                                         'CW', 'CL', 'BP', 'BD', 'FC', 'GE', 'GF',
#                                                         'AR', 'GI', 'Id', 'Class', 'AX']]

# Multilabel Stratification

In [None]:
def IterativeStratification(labels, r, random_state):
    """Assign every sample to a fold with the Iterative Stratification algorithm.

    Implements the procedure described in:
    Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of
    Multi-Label Data. In: Machine Learning and Knowledge Discovery in
    Databases. ECML PKDD 2011. Lecture Notes in Computer Science, vol 6913.

    Parameters
    ----------
    labels : array-like of shape (n_samples, n_labels)
        Binary indicator matrix. It is coerced to a boolean ndarray so that
        row lookups act as masks even when 0/1 integers are passed.
    r : array-like of shape (n_folds,)
        Desired proportion of the samples for each fold.
    random_state : numpy.random.RandomState
        Generator whose ``choice`` method is used to break ties.

    Returns
    -------
    test_folds : ndarray of int, shape (n_samples,)
        Fold index assigned to each sample.
    """
    labels = np.asarray(labels, dtype=bool)
    r = np.asarray(r, dtype=float)

    def _pick(candidates):
        # Reduce an index array to one plain int, drawing from the RNG only
        # when there is an actual tie. Returning a scalar avoids assigning a
        # size-1 array into a scalar slot, which NumPy >= 1.25 deprecates.
        if candidates.shape[0] > 1:
            return int(candidates[random_state.choice(candidates.shape[0])])
        return int(candidates[0])

    n_samples = labels.shape[0]
    test_folds = np.zeros(n_samples, dtype=int)

    # Desired number of examples in each fold
    c_folds = r * n_samples

    # Desired number of examples of each label in each fold
    c_folds_labels = np.outer(r, labels.sum(axis=0))

    labels_not_processed_mask = np.ones(n_samples, dtype=bool)

    while np.any(labels_not_processed_mask):
        # Remaining positives per label among unprocessed samples
        num_labels = labels[labels_not_processed_mask].sum(axis=0)

        # Only all-zero rows are left: spread them over the folds that still
        # want the most samples (not in the original algorithm listing, but
        # mentioned in the paper's text), then stop.
        if num_labels.sum() == 0:
            for sample_idx in np.where(labels_not_processed_mask)[0]:
                fold_idx = _pick(np.where(c_folds == c_folds.max())[0])
                test_folds[sample_idx] = fold_idx
                c_folds[fold_idx] -= 1
            break

        # Label with the fewest (but at least one) remaining examples,
        # ties broken randomly
        rarest = np.where(num_labels == num_labels[np.nonzero(num_labels)].min())[0]
        label_idx = _pick(rarest)

        sample_idxs = np.where(np.logical_and(labels[:, label_idx], labels_not_processed_mask))[0]

        for sample_idx in sample_idxs:
            # Fold(s) that still want the most examples of this label; ties go
            # to the fold wanting the most examples overall, then to chance.
            label_folds = c_folds_labels[:, label_idx]
            candidates = np.where(label_folds == label_folds.max())[0]

            if candidates.shape[0] > 1:
                largest = np.where(c_folds[candidates] == c_folds[candidates].max())[0]
                candidates = candidates[largest]

            fold_idx = _pick(candidates)

            test_folds[sample_idx] = fold_idx
            labels_not_processed_mask[sample_idx] = False

            # Update the desired counts for every label this sample carries
            c_folds_labels[fold_idx, labels[sample_idx]] -= 1
            c_folds[fold_idx] -= 1

    return test_folds


class MultilabelStratifiedKFold(_BaseKFold):
    """K-fold cross-validator that stratifies on a multilabel indicator target.

    Fold membership is computed by ``IterativeStratification`` with an equal
    proportion ``1 / n_splits`` requested for every fold, so the positives of
    each label are spread across the folds as evenly as that algorithm allows.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds.
    shuffle : bool, default=False
        If True, the sample order is permuted before stratification; the fold
        ids are mapped back to the original order before being returned.
    random_state : int, RandomState instance or None, default=None
        Passed through ``check_random_state``. The resulting generator drives
        the optional shuffle and the random tie-breaking of the stratification
        algorithm, so it is consulted even when ``shuffle`` is False.

    Notes
    -----
    ``y`` is cast to ``bool`` and must then be reported as
    ``'multilabel-indicator'`` by ``type_of_target``; any other target type
    raises ``ValueError``. Fold sizes may differ slightly from one another.
    """

    def __init__(self, n_splits=3, *, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def _make_test_folds(self, X, y):
        """Return an integer array giving the test-fold id of every sample."""
        y = np.asarray(y, dtype=bool)
        target_kind = type_of_target(y)
        if target_kind != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(target_kind))

        rng = check_random_state(self.random_state)
        order = np.arange(y.shape[0])
        if self.shuffle:
            rng.shuffle(order)
            y = y[order]

        # Every fold asks for the same share of the data.
        fold_ratios = np.full(self.n_splits, 1 / self.n_splits)
        folds_in_shuffled_order = IterativeStratification(labels=y, r=fold_ratios, random_state=rng)

        # Undo the permutation so fold ids line up with the caller's row order.
        return folds_in_shuffled_order[np.argsort(order)]

    def _iter_test_masks(self, X=None, y=None, groups=None):
        """Yield one boolean test mask per fold."""
        fold_of_sample = self._make_test_folds(X, y)
        yield from (fold_of_sample == fold_id for fold_id in range(self.n_splits))

    def split(self, X, y, groups=None):
        """Generate (train, test) index arrays for each fold.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data. Only ``y`` influences fold assignment in this
            class, so a placeholder with the right number of rows is enough.
        y : array-like, shape (n_samples, n_labels)
            Multilabel indicator target used for stratification.
        groups : object
            Not used by this class; forwarded to the parent ``split``.

        Returns
        -------
        Whatever the parent ``split`` produces for (X, validated y, groups).
        """
        validated_y = check_array(y, ensure_2d=False, dtype=None)
        return super().split(X, validated_y, groups)


class RepeatedMultilabelStratifiedKFold(_RepeatedSplits):
    """Repeated variant of ``MultilabelStratifiedKFold``.

    Thin wrapper that hands ``MultilabelStratifiedKFold`` to the
    ``_RepeatedSplits`` base class together with the repeat count, the random
    state and the number of folds; the repetition logic itself lives in the
    base class.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds, forwarded to ``MultilabelStratifiedKFold``.
    n_repeats : int, default=10
        Number of times the cross-validator is repeated.
    random_state : None, int or RandomState, default=None
        Forwarded to ``_RepeatedSplits``.
    """

    def __init__(self, n_splits=5, *, n_repeats=10, random_state=None):
        super().__init__(
            MultilabelStratifiedKFold,
            n_repeats=n_repeats,
            random_state=random_state,
            n_splits=n_splits,
        )


class MultilabelStratifiedShuffleSplit(BaseShuffleSplit):
    """Multilabel Stratified ShuffleSplit cross-validator

    Provides train/test indices to split data into train/test sets. Each of
    the ``n_splits`` iterations shuffles the samples and runs
    ``IterativeStratification`` with two target proportions (train, test), so
    the percentage of each label is preserved as far as the algorithm allows.

    Note: like the ShuffleSplit strategy, multilabel stratified random splits
    do not guarantee that all folds will be different.

    Parameters
    ----------
    n_splits : int, default 10
        Number of re-shuffling & splitting iterations.
    test_size : float, int, None or "default"
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. ``None`` and the legacy sentinel
        ``"default"`` are equivalent: the test size becomes the complement of
        ``train_size`` when that is given, and 0.1 otherwise.
    train_size : float, int, or None, default is None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    random_state : int, RandomState instance or None, optional (default=None)
        Passed through ``check_random_state``; the generator drives both the
        per-iteration shuffle and the random tie-breaking of the iterative
        stratification algorithm.

    Examples
    --------
    >>> import numpy as np
    >>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
    >>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
    >>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5,
    ...    random_state=0)
    >>> msss.get_n_splits(X, y)
    3
    >>> for train_index, test_index in msss.split(X, y):
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]

    Notes
    -----
    Train and test sizes may be slightly different from desired due to the
    preference of stratification over perfectly sized folds.
    """

    def __init__(self, n_splits=10, *, test_size="default", train_size=None,
                 random_state=None):
        super(MultilabelStratifiedShuffleSplit, self).__init__(
            n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state)

    def _iter_indices(self, X, y, groups=None):
        """Yield ``n_splits`` pairs of (train indices, test indices).

        Raises
        ------
        ValueError
            If ``y`` (cast to bool) is not a multilabel indicator matrix, or
            if the requested sizes are rejected by ``_validate_shuffle_split``.
        """
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)

        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                    type_of_target_y))

        # The "default" sentinel is not a size the validator understands (it
        # only accepts ints, floats or None), so map it to None and supply the
        # fallback explicitly.
        test_size = self.test_size
        if isinstance(test_size, str) and test_size == "default":
            test_size = None

        n_train, n_test = _validate_shuffle_split(
            n_samples, test_size, self.train_size,
            default_test_size=getattr(self, "_default_test_size", 0.1))

        n_samples = y.shape[0]
        rng = check_random_state(self.random_state)
        y_orig = y.copy()

        # Target proportions handed to the stratifier: fold 0 = train, fold 1 = test.
        r = np.array([n_train, n_test]) / (n_train + n_test)

        for _ in range(self.n_splits):
            indices = np.arange(n_samples)
            rng.shuffle(indices)
            y = y_orig[indices]

            test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

            # Map fold ids back to the original row order before splitting.
            test_idx = test_folds[np.argsort(indices)] == 1
            test = np.where(test_idx)[0]
            train = np.where(~test_idx)[0]

            yield train, test

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data. Its number of rows is used to validate the
            requested sizes; fold assignment itself depends only on ``y``.
        y : array-like, shape (n_samples, n_labels)
            The target variable for supervised learning problems.
            Multilabel stratification is done based on the y labels.
        groups : object
            Not used by this class; forwarded to the parent ``split``.

        Returns
        -------
        Whatever the parent ``split`` produces for (X, validated y, groups).

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(MultilabelStratifiedShuffleSplit, self).split(X, y, groups)

# LGBM feature selection

In [None]:
from shaphypetune import BoostBoruta

# LightGBM hyperparameters used by lgbm_tuning (unpacked into lgb.LGBMClassifier).
# NOTE(review): 'class_weight': 'balanced' and 'is_unbalance': True both
# re-weight the classes — confirm that applying both is intended.
# NOTE(review): 'early_stopping_round' is also passed to fit() as
# early_stopping_rounds=100 inside lgbm_tuning — one of the two is redundant.
params = {
        'boosting_type':'goss',
        'learning_rate': 0.06733232950390658, 
        'n_estimators': 5000, 
        'early_stopping_round' : 100, 
        'subsample' : 0.7, # bagging_fraction
        'colsample_bytree': 0.6, # feature_fraction
        'num_leaves': 33,
        'class_weight': 'balanced',
        'metric': 'none',  # no built-in metric; bll_metric is supplied via eval_metric in fit()
        'is_unbalance': True, 
        'random_state': 8062023,
        'feature_fraction_seed': 8062023,
        'bagging_seed': 8062023,
        'max_depth': 6,
        'reg_alpha': 2.025436e-04,  
        'reg_lambda': 2.290193e-07,
#         'bagging_freq': 6,
        'max_bin': 198,
        'min_child_samples': 32,
        'importance_type': 'gain'  # feature_importances_ reports gain, as used by lgbm_tuning
        }

def balanced_log_loss(y_true, y_pred):
    """Class-balanced binary log loss.

    Each class contributes the mean negative log-likelihood of its own
    samples, and the two class terms are averaged, so both classes carry the
    same weight regardless of how many samples they have.

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    y_pred : array-like of predicted probabilities for class 1

    Returns
    -------
    float
        The balanced log loss (lower is better).
    """
    # Number of observations per class
    n_pos = np.sum(y_true == 1, axis=0)
    n_neg = np.sum(y_true == 0, axis=0)

    # Keep probabilities inside [1e-15, 1 - 1e-15] so the logarithms stay finite
    proba = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # Per-class average log-likelihood
    neg_term = (1/n_neg) * np.sum((1 - y_true) * np.log(1 - proba))
    pos_term = (1/n_pos) * np.sum(y_true * np.log(proba))

    return (-neg_term - pos_term) / 2

def bll_metric(y_true, y_pred):
    """Wrap balanced_log_loss as a custom eval metric for LGBMClassifier.fit.

    Returns the tuple (metric name, metric value, flag); the final ``False``
    is the higher-is-better flag, i.e. smaller values are better.
    """
    score = balanced_log_loss(y_true, y_pred)
    return 'balanced_log_loss', score, False

def calc_log_loss_weight(y_true):
    """Return inverse-frequency weights (w0, w1) for classes 0 and 1.

    Each weight is the reciprocal of the class's share of ``y_true``, so the
    rarer class receives the larger weight.
    """
    counts = np.bincount(y_true)
    total = y_true.shape[0]
    neg_share = counts[0] / total
    pos_share = counts[1] / total
    return 1/neg_share, 1/pos_share

def lgbm_tuning(features, permut=False, boruta=False):
    """Repeated multilabel-stratified CV of the LightGBM model with optional importances.

    For each of ``CFG.n_optimize_repeats`` repeats the training data is
    (optionally) under-sampled, split into ``CFG.n_feature_sel_folds`` folds,
    and an ``lgb.LGBMClassifier(**params)`` is fitted per fold. Out-of-fold
    predictions are scored with ``balanced_log_loss``. Importances are summed
    (not averaged) over all folds and repeats.

    Reads the notebook globals ``train_df``, ``greeks``, ``params``, ``CFG``
    and ``bll_metric``.

    Parameters
    ----------
    features : sequence of str
        Columns of ``train_df`` used as model inputs.
    permut : bool, default=False
        Also accumulate eli5 permutation importances on each validation fold.
    boruta : bool, default=False
        Also accumulate BoostBoruta SHAP rankings for each fold.

    Returns
    -------
    perm_df_ : pd.DataFrame
        Summed permutation importances, most important first (empty unless ``permut``).
    feature_importances_ : pd.DataFrame
        Columns 'Value' (summed tree importance) and 'Feature', highest value first.
    boruta_df_ : pd.DataFrame
        Summed Boruta rankings sorted ascending (empty unless ``boruta``).
    float
        Mean out-of-fold score over the repeats.
    """
    metric = balanced_log_loss
    greek_cols = list(greeks.columns[1:4])

    cv_scores = []  # one out-of-fold score per repeat

    perm_df_ = pd.DataFrame()
    feature_importances_ = pd.DataFrame()
    boruta_df_ = pd.DataFrame()

    for i in range(CFG.n_optimize_repeats):
        print(f'Repeat {blu}#{i+1}{res}')

        X_re, y_re = pd.concat([train_df[features], greeks.iloc[:, 1:4]], axis=1), train_df['Class']

        if CFG.undersample:
            # Random under-sampling to balance the classes
            positive_count_train = train_df['Class'].value_counts()[1]
            sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train,
                                                            1: positive_count_train},
                                         random_state=15062023+i,
                                         replacement=True)
            X_re, y_re = sampler.fit_resample(X_re, y_re)

        # Stratification target: Class plus one-hot encoded greek columns.
        # The splitter casts its target to bool, so passing the raw string
        # columns would turn every entry into True and disable stratification.
        strat_labels = np.column_stack([
            y_re.to_numpy(),
            pd.get_dummies(X_re[greek_cols].astype(str)).to_numpy(),
        ]).astype(int)

        # Stratified multilabel k-fold scheme
        kf = MultilabelStratifiedKFold(n_splits=CFG.n_feature_sel_folds, shuffle=True, random_state=8062023+i)

        X, y = X_re[features], y_re

        # Out-of-fold predictions for this repeat
        oof = np.zeros(X_re.shape[0])

        for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=strat_labels), start=1):
            # Split the dataset according to the fold indexes.
            X_train = X.iloc[train_idx].reset_index(drop=True)
            y_train = y.iloc[train_idx].reset_index(drop=True)
            X_val = X.iloc[val_idx]
            y_val = y.iloc[val_idx]

            clf = lgb.LGBMClassifier(**params)
            # NOTE(review): the `early_stopping_rounds` / `verbose` fit keywords were
            # removed in LightGBM 4.0 — confirm the installed version or move to callbacks.
            clf.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                    eval_metric=bll_metric, # eval_sample_weight=w_val,
                    early_stopping_rounds=100, verbose=1)

            val_preds = clf.predict_proba(X_val)[:,1]
            oof[val_idx] = val_preds

            val_score = metric(y_val, val_preds)
            best_iter = clf.best_iteration_

            print(f'Fold: {blu}{fold:>3}{res}| {metric.__name__}: {blu}{val_score:.5f}{res}'
                  f' | Best iteration: {blu}{best_iter:>4}{res}')

            # permutation importance
            if permut:
                perm = PermutationImportance(clf, scoring=None, n_iter=1,
                                             random_state=42, cv=None, refit=False).fit(X_val, y_val)

                # Sorted by feature name so the running sum aligns across folds
                perm_importance_df = pd.DataFrame({'importance': perm.feature_importances_},
                                                   index=X_val.columns).sort_index()

                if perm_df_.shape[0] == 0:
                    perm_df_ = perm_importance_df.copy()
                else:
                    perm_df_ += perm_importance_df

            # tree feature importance; rows are ordered by feature name so the
            # positional sum below adds matching features together
            f_i = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns),
                                              reverse=True, key=lambda x: x[1]),
                               columns=['Value','Feature'])

            if feature_importances_.shape[0] == 0:
                feature_importances_ = f_i.copy()
            else:
                feature_importances_['Value'] += f_i['Value']

            # Boruta SHAP importance
            if boruta:
                model = BoostBoruta(clf, importance_type='shap_importances', train_importance=False)
                model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                          eval_metric=bll_metric, early_stopping_rounds=300, verbose=-1)

                boruta_importance_df = pd.DataFrame({'importance': model.ranking_},
                                                     index=X_train.columns).sort_index()
                if boruta_df_.shape[0] == 0:
                    boruta_df_ = boruta_importance_df.copy()
                else:
                    boruta_df_ += boruta_importance_df

        fold_cv_score = metric(y_re, oof)
        print(f'{red} CV score: {res} {metric.__name__}: {red}{fold_cv_score:.5f}{res}')
        print(f'{"*" * 50}\n')
        cv_scores.append(fold_cv_score)


    print(f'{red} Avg score {CFG.n_feature_sel_folds}-fold: {res} {metric.__name__}: {red}{np.mean(cv_scores):.5f}{res}')
    print(f'{"*" * 50}\n')

    if permut:
        perm_df_ = perm_df_.sort_values('importance', ascending=False)

    if boruta:
        boruta_df_ = boruta_df_.sort_values('importance')

    feature_importances_ = feature_importances_.sort_values('Value', ascending=False)

    return perm_df_, feature_importances_, boruta_df_, np.mean(cv_scores)

# Run the feature-selection CV only when enabled in CFG; permutation and
# Boruta importances are switched off, so only tree importances are filled.
if CFG.feature_sel:
    (perm_df_,
     feature_importances_,
     boruta_df_,
     cv_scores) = lgbm_tuning(features, permut=False, boruta=False)

# Check features correlation

In [None]:
if CFG.feature_sel:
    col = 'BZ'
    # Keep rows where `col` is at or below its 99th percentile, then rank the
    # other features by absolute correlation with it.
    x = train_df.loc[train_df[col] <= train_df[col].quantile(0.99)]
    cm = x.drop(columns=['Id', 'Class'], errors='ignore').corr()
    # The first entry of the sorted series is skipped (expected to be `col` itself).
    display(cm[col].abs().sort_values(ascending=False).iloc[1:])

# Analyze permutation feature importance

In [None]:
if CFG.feature_sel:
    perm_df_.to_csv('perm_df.csv')
    # A bare expression inside an `if` block is never rendered by Jupyter,
    # so the table is shown explicitly.
    display(perm_df_)
    # lgbm_tuning sorts perm_df_ by descending importance, so the last 35 rows
    # are the features with the lowest permutation importance.
    perm_cols = set(perm_df_.index[-35:])
    display(perm_cols)

# Analyze tree gain feature importance

In [None]:
if CFG.feature_sel:
    feature_importances_.to_csv('feature_importances.csv')
    # A bare expression inside an `if` block is never rendered by Jupyter,
    # so the table is shown explicitly.
    display(feature_importances_)
    # lgbm_tuning sorts by descending 'Value', so the last 23 rows are the
    # features with the lowest summed tree importance.
    fi_cols = set(feature_importances_['Feature'].values[-23:])
    display(fi_cols)

# Analyze BORUTA importance

In [None]:
if CFG.feature_sel:
    boruta_df_.to_csv('boruta_df_.csv')
    # A bare expression inside an `if` block is never rendered by Jupyter,
    # so the table is shown explicitly.
    display(boruta_df_)
    # lgbm_tuning sorts boruta_df_ by ascending 'importance' (summed rankings),
    # so the last 35 rows carry the largest summed ranking values.
    boruta_cols = set(boruta_df_.index[-35:])
    display(boruta_cols)

# LGBM Optuna optimization

In [None]:
def balanced_log_loss(y_true, y_pred):
    """Class-balanced binary log loss.

    The negative log-likelihood is averaged within each class first and the
    two class averages are then averaged, giving both classes equal weight.

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    y_pred : array-like of predicted probabilities for class 1

    Returns
    -------
    float
        The balanced log loss (lower is better).
    """
    # Number of observations per class
    n_pos = np.sum(y_true == 1, axis=0)
    n_neg = np.sum(y_true == 0, axis=0)

    # Clip probabilities to [1e-15, 1 - 1e-15] to avoid log(0)
    proba = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # Per-class average log-likelihood
    neg_term = (1/n_neg) * np.sum((1 - y_true) * np.log(1 - proba))
    pos_term = (1/n_pos) * np.sum(y_true * np.log(proba))

    return (-neg_term - pos_term) / 2

def bll_metric(y_pred, y_true):
    """Balanced log loss as a (name, value, flag) custom-metric tuple.

    Despite its name, ``y_true`` is an object exposing ``get_label()``
    (presumably an ``lgb.Dataset`` — confirm against the caller); the labels
    are pulled from it before scoring ``y_pred``. The final ``False`` is the
    higher-is-better flag.
    """
    labels = y_true.get_label()
    score = balanced_log_loss(labels, y_pred)
    return 'balanced_log_loss', score, False

def calc_log_loss_weight(y_true):
    '''Return inverse class-frequency weights `(w0, w1)` for binary labels.

    `y_true` must be an array of non-negative integer labels containing both
    class 0 and class 1; each weight is 1 / (class count / total count).
    '''
    class_counts = np.bincount(y_true)
    total = y_true.shape[0]
    inverse_freq = [1 / (class_counts[label] / total) for label in (0, 1)]
    return inverse_freq[0], inverse_freq[1]

# Module-level feature matrix and target taken from the full training frame.
X = train_df[features]
y = train_df['Class']
    
def objective(trial):
    """Optuna objective for LightGBM: mean out-of-fold balanced log loss.

    For each of `CFG.n_optimize_repeats` repeats the training data is
    (optionally) randomly under-sampled, split with MultilabelStratifiedKFold
    into `CFG.n_optimize_folds` folds, and one booster per fold is trained with
    early stopping on that fold's validation part. The out-of-fold predictions
    of a repeat are scored with `balanced_log_loss`.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-repeat out-of-fold balanced log losses (lower is better).
    """
    param = {
        # Main parameters
        'objective': 'binary',
        'metric': 'none',  # evaluation relies on the custom `feval` passed to lgb.train
        'boosting_type': trial.suggest_categorical('boosting_type', ['goss', 'gbdt', 'dart']),
        # Hyperparameters (in decreasing order of importance)
        'n_estimators': 3000,  # max number of trees in the model
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 3e-1, log=True),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),  # L1, alias: reg_alpha
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),  # L2, alias: reg_lambda
        # decrease to deal with overfit
        'max_depth': trial.suggest_int('max_depth', 4, 10),  # tree max depth
        # decrease to deal with overfit
        'num_leaves': trial.suggest_int('num_leaves', 4, 128),  # max number of leaves in one tree,
                                                                # should be ~ 2**(max_depth-1)
        # Randomly select part of the data without resampling if < 1.0
        # (alias: subsample); only sampled below for non-GOSS boosting.
        'bagging_fraction': None,
        # Randomly select a subset of features if < 1.0 (alias: colsample_bytree)
        'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 0.7),
        # decrease to deal with overfit
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),  # minimal number of data in one leaf
                                                                            # alias: min_child_samples
        # increase for accuracy, decrease to deal with overfit
        'max_bin': trial.suggest_int('max_bin', 32, 255),  # max number of bins feature values are bucketed in
        # increase to deal with overfit
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),  # perform bagging at every k-th iteration
        'early_stopping_round': 100,
        'verbose': -1
    }

    if param['boosting_type'] != 'goss':
        param['bagging_fraction'] = trial.suggest_float('bagging_fraction', 0.3, 0.7)

    bll_list = list()

    # Loop-invariant: number of positive rows, used as the per-class target size.
    positive_count_train = train_df['Class'].value_counts()[1]

    for i in range(CFG.n_optimize_repeats):
        print(f'Repeat {blu}#{i+1}')

        # Random under-sampling to balance classes (different seed per repeat)
        sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train,
                                                        1: positive_count_train},
                                     random_state=15062023+i,
                                     replacement=True)

        # Columns 1..3 of `greeks` are appended so that they go through the same
        # resampling and can be used as stratification targets afterwards.
        X_re, y_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class']

        if CFG.undersample:
            X_re, y_re = sampler.fit_resample(X_re, y_re)

        # Multilabel stratified k-fold; uses the fold count configured for
        # optimization (previously read from the feature-selection setting).
        kf = MultilabelStratifiedKFold(n_splits=CFG.n_optimize_folds, shuffle=True, random_state=10062023+i)

        # Out-of-fold predictions for this repeat
        oof = np.zeros(X_re.shape[0])

        # Model inputs exclude the appended greek columns.
        X, y = X_re[features], y_re

        # Stratify on the three appended greek columns (last three of X_re)
        for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=X_re.iloc[:,-3:]), start=1):
            # Split the dataset according to the fold indexes.
            X_train = X.iloc[train_idx]
            X_val = X.iloc[val_idx]
            y_train = y.iloc[train_idx]
            y_val = y.iloc[val_idx]

            dtrain = lgb.Dataset(X_train, label=y_train)
            dvalid = lgb.Dataset(X_val, label=y_val)

            # NOTE(review): the validation fold drives early stopping and is also
            # the fold being scored.
            gbm = lgb.train(
                param, dtrain, valid_sets=[dvalid],
                feval=bll_metric, verbose_eval=0
            )

            val_preds = gbm.predict(X_val)
            oof[val_idx] = val_preds
        bll_list.append(balanced_log_loss(y_re, oof))

    return np.mean(bll_list)
            

if CFG.lgbm_optimize:
    # Pruning is currently disabled:
    #     study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=100), direction="minimize")
    study = optuna.create_study(direction="minimize")
    # This study runs twice the configured trial budget.
    study.optimize(objective, n_trials=CFG.n_trials*2)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Store all trials, lowest objective value first, for the parameter-loading cell.
    df = study.trials_dataframe().sort_values('value')
    df.to_csv('optuna_lgbm.csv')

# Load LGBM parameters

In [None]:
import glob

# Collect the saved Optuna trial tables (currently a single fixed file name).
param_list = glob.glob("optuna_lgbm.csv")
models = []
best_lgbm_params = []

lgbm_params = pd.DataFrame()

for f in param_list:
    tmp = pd.read_csv(f, index_col='Unnamed: 0')
    # The first table replaces the empty placeholder; later ones are appended.
    lgbm_params = tmp if lgbm_params.shape[0] == 0 else pd.concat([lgbm_params, tmp])

# Keep the 20 trials with the lowest objective value, hyperparameter columns only.
lgbm_params = lgbm_params.sort_values('value').head(20)
param_cols = [c for c in lgbm_params.columns if c.startswith('params_')]
lgbm_params = lgbm_params[param_cols]

for idx, row in lgbm_params.iterrows():
    # Strip the 'params_' prefix from the trial-table column names.
    row_dict = {k[len('params_'):]: v for k, v in row.items()}

    # Fixed settings; these override any tuned value stored under the same key
    # (note that max_bin is forced to 255 here).
    row_dict.update({
        'objective': 'binary',
        'metric': 'none',
        'force_col_wise': False,
        'early_stopping_rounds': 50,
        'verbose': -1,
        'max_bin': 255,
        'n_estimators': 3000,
        'is_unbalance': True,
        'class_weight': 'balanced',
    })

    # Values come back from CSV as generic numbers; restore the expected types.
    for int_key in ('bagging_freq', 'min_data_in_leaf', 'num_leaves', 'max_depth'):
        row_dict[int_key] = int(row_dict[int_key])
    row_dict['learning_rate'] = float(row_dict['learning_rate'])

    # bagging_fraction was not tuned for GOSS trials, so it is reset to None.
    if row_dict['boosting_type'] == 'goss':
        row_dict['bagging_fraction'] = None

    best_lgbm_params.append(row_dict)

# LGBM train

In [None]:
def bll_metric(y_true, y_pred):
    """Custom eval metric in the `(y_true, y_pred)` form used with `LGBMClassifier.fit`.

    Returns a tuple `(metric_name, metric_value, is_higher_better)`; the last
    item is False because a lower balanced log loss is better.
    """
    score = balanced_log_loss(y_true, y_pred)
    return 'balanced_log_loss', score, False

def lgbm_training():
    """Train every configuration in `best_lgbm_params` with K-fold CV for stacking.

    The training data is optionally under-sampled, split into
    `CFG.n_stacking_folds` multilabel-stratified folds, and each parameter set
    is fitted once per fold with the validation fold as `eval_set`.

    Returns:
        tuple `(oof_level2, oof_level2_test)`:
          - oof_level2: array of shape (n_rows, n_models + 1); column `i` holds
            the out-of-fold class-1 probabilities of model `i`, the last column
            holds the labels.
          - oof_level2_test: array of shape (n_test_rows, n_models) with the
            test class-1 probabilities averaged over the folds.
    """
    # Make random under-sampling to balance classes
    positive_count_train = train_df['Class'].value_counts()[1]
    sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train,
                                                    1: positive_count_train},
                                 random_state=150620231,
                                 replacement=True)

    # Columns 1..3 of `greeks` are appended so they go through the same
    # resampling and can be used as stratification targets.
    X_re, y_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class']

    if CFG.undersample:
        X_re, y_re = sampler.fit_resample(X_re, y_re)

    kf = MultilabelStratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=8062023+20)

    n_models = len(best_lgbm_params)
    oof_level2 = np.zeros([y_re.shape[0], n_models + 1])
    oof_level2[:, n_models] = y_re
    oof_level2_test = np.zeros([test_df.shape[0], n_models])

    # Model inputs exclude the appended greek columns (fold-invariant, so built once).
    X, y, test = X_re[features], y_re, test_df[features]

    # Report the number of columns the models actually see; X_re additionally
    # carries the three stratification columns.
    print(f"Training with {blu}{X.shape[1]}{res} features")

    for fold, (fit_idx, val_idx) in tqdm(enumerate(kf.split(X=X_re, y=X_re.iloc[:,-3:]), start = 1),
                                         total=CFG.n_stacking_folds):
        # Split the dataset according to the fold indexes.
        X_train = X.iloc[fit_idx]
        X_val = X.iloc[val_idx]
        y_train = y.iloc[fit_idx]
        y_val = y.iloc[val_idx]

        for i, params in enumerate(best_lgbm_params):

            clf = lgb.LGBMClassifier(**params)
            clf.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                    eval_metric=bll_metric, verbose=-1)

            val_preds = clf.predict_proba(X_val)[:,1]
            oof_level2[val_idx, i] = val_preds

            val_score = balanced_log_loss(y_val, val_preds)
            best_iter = clf.best_iteration_

            print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
                  f' | Best iteration: {blu}{best_iter:>4}{res}')

            # Sum test predictions over folds; averaged on return.
            oof_level2_test[:, i] += clf.predict_proba(test)[:,1]

    return oof_level2, oof_level2_test / CFG.n_stacking_folds

if CFG.stacking:
    # Level-1 LightGBM outputs: OOF matrix (labels in the last column) and test matrix.
    lgbm_stack_outputs = lgbm_training()
    oof_level2_lgbm, oof_level2_test_lgbm = lgbm_stack_outputs

# XGBoost Optuna optimization

In [None]:
# Module-level feature matrix and target taken from the full training frame.
X = train_df[features]
y = train_df['Class']

def objective(trial):
    """Optuna objective for XGBoost: mean out-of-fold balanced log loss.

    For each of `CFG.n_optimize_repeats` repeats the training data is
    (optionally) randomly under-sampled, split with MultilabelStratifiedKFold
    into `CFG.n_optimize_folds` folds, and one `XGBClassifier` per fold is
    fitted with the validation fold as `eval_set`.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-repeat out-of-fold balanced log losses (lower is better).
    """
    bll_list = list()

    params = {
        "n_estimators": 3000,
        "random_state": 14062023,
        "early_stopping_rounds": 100,
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster; only gbtree is searched at the moment.
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
    }

    if params["booster"] in ["gbtree", "dart"]:
        params["learning_rate"] = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True) # alias eta
        # maximum depth of the tree, signifies complexity of the tree.
        params["max_depth"] = trial.suggest_int("max_depth", 3, 10)
        # minimum child weight, larger the term more conservative the tree.
        params["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        # defines how selective algorithm is.
        params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if params["booster"] == "dart":
        params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Loop-invariant: number of positive rows, used as the per-class target size.
    positive_count_train = train_df['Class'].value_counts()[1]

    for i in range(CFG.n_optimize_repeats):
        print(f'Repeat {blu}#{i+1}')

        # Random under-sampling to balance classes (different seed per repeat)
        sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train,
                                                        1: positive_count_train},
                                     random_state=15062023+i,
                                     replacement=True)

        # Columns 1..3 of `greeks` are appended so that they go through the same
        # resampling and can be used as stratification targets afterwards.
        X_re, y_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class']

        if CFG.undersample:
            X_re, y_re = sampler.fit_resample(X_re, y_re)

        # Multilabel stratified k-fold; uses the fold count configured for
        # optimization (previously read from the feature-selection setting).
        kf = MultilabelStratifiedKFold(n_splits=CFG.n_optimize_folds, shuffle=True, random_state=10062023+i)

        # Out-of-fold predictions for this repeat
        oof = np.zeros(X_re.shape[0])

        # Model inputs exclude the appended greek columns.
        X, y = X_re[features], y_re

        # Stratify on the three appended greek columns (last three of X_re)
        for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=X_re.iloc[:,-3:]), start=1):
            # Split the dataset according to the fold indexes.
            X_train = X.iloc[train_idx]
            X_val = X.iloc[val_idx]
            y_train = y.iloc[train_idx]
            y_val = y.iloc[val_idx]

            # Learning
            model = xgb.XGBClassifier(**params)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=1000)
            # Predict
            val_preds = model.predict_proba(X_val)[:,1]
            oof[val_idx] = val_preds

        bll_list.append(balanced_log_loss(y_re, oof))

    return np.mean(bll_list)

if CFG.xgb_optimize:
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=CFG.n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    df = study.trials_dataframe()
    # Show the trials ordered by objective value, restricted to column 1 and
    # columns 5..13 of the trials table. The expression used to be left bare
    # inside the `if` block, where the notebook never renders it.
    display(df.sort_values('value').iloc[:, [1] + list(range(5, 14))])
    # The full (unsorted) table is saved; the loading cell sorts it again.
    df.to_csv('optuna_xgb.csv')

# Load XGBoost parameters

In [None]:
import glob

# Collect the saved Optuna trial tables (currently a single fixed file name).
param_list = glob.glob("optuna_xgb.csv")
models = list()
best_xb_params = list()

xb_params = pd.DataFrame()

for f in param_list:
    tmp = pd.read_csv(f, index_col='Unnamed: 0')
    if xb_params.shape[0] == 0:
        xb_params = tmp
    else:
        # Append to the XGBoost table itself. This used to concatenate onto
        # `cb_params`, which belongs to the CatBoost cell and is not defined yet
        # at this point of a top-to-bottom run.
        xb_params = pd.concat([xb_params, tmp])

# Keep the 10 trials with the lowest objective value, hyperparameter columns only.
xb_params = xb_params.sort_values('value').head(10)
param_cols = [c for c in xb_params.columns if c.startswith('params_')]
xb_params = xb_params[param_cols]

for idx, row in xb_params.iterrows():
    # Strip the 'params_' prefix from the trial-table column names.
    row_dict = {k[7:]: v for k, v in row.items()}

    # Fixed settings; these override any tuned value stored under the same key.
    row_dict['n_estimators'] = 3000
    row_dict['random_state'] = 14062023
    row_dict['early_stopping_rounds'] = 100
    row_dict['verbosity'] = 0
    # NOTE(review): hard-coded class ratio; it is applied in addition to the
    # optional under-sampling done at training time - confirm this is intended.
    row_dict['scale_pos_weight'] = 4.71
    row_dict['objective'] = "binary:logistic"
    row_dict['eval_metric'] = "logloss"
    row_dict['tree_method'] = "exact"
    row_dict['booster'] = "gbtree"

    # Tree-booster parameters: restore the numeric types lost in the CSV round trip.
    if row_dict["booster"] in ["gbtree", "dart"]:
        row_dict["learning_rate"] = float(row_dict['learning_rate'])
        row_dict["max_depth"] = int(row_dict['max_depth'])
        row_dict["min_child_weight"] = float(row_dict['min_child_weight'])
        row_dict["gamma"] = float(row_dict['gamma'])
    else:
        row_dict["learning_rate"] = None
        row_dict["max_depth"] = None
        row_dict["min_child_weight"] = None
        row_dict["gamma"] = None
        row_dict["grow_policy"] = None

    # DART-only parameters are cleared for every other booster.
    if row_dict["booster"] == "dart":
        row_dict["rate_drop"] = float(row_dict['rate_drop'])
        row_dict["skip_drop"] = float(row_dict['skip_drop'])
    else:
        row_dict["sample_type"] = None
        row_dict["normalize_type"] = None
        row_dict["rate_drop"] = None
        row_dict["skip_drop"] = None

    best_xb_params.append(row_dict)

# XGBoost train

In [None]:
def bll_metric(y_true, y_pred):
    """Balanced log loss as a `(name, value, is_higher_better)` tuple.

    The last item is False because a lower balanced log loss is better.
    """
    score = balanced_log_loss(y_true, y_pred)
    return 'balanced_log_loss', score, False

def xgboost_training():
    """Train every configuration in `best_xb_params` with K-fold CV for stacking.

    The training data is optionally under-sampled, split into
    `CFG.n_stacking_folds` multilabel-stratified folds, and each parameter set
    is fitted once per fold with the validation fold as `eval_set`.

    Returns:
        tuple `(oof_level2, oof_level2_test)`:
          - oof_level2: array of shape (n_rows, n_models + 1); column `i` holds
            the out-of-fold class-1 probabilities of model `i`, the last column
            holds the labels.
          - oof_level2_test: array of shape (n_test_rows, n_models) with the
            test class-1 probabilities averaged over the folds.
    """
    # Make random under-sampling to balance classes
    positive_count_train = train_df['Class'].value_counts()[1]
    sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train,
                                                    1: positive_count_train},
                                 random_state=150620232,
                                 replacement=True)

    # Columns 1..3 of `greeks` are appended so they go through the same
    # resampling and can be used as stratification targets.
    X_re, y_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class']

    if CFG.undersample:
        X_re, y_re = sampler.fit_resample(X_re, y_re)

    kf = MultilabelStratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=8062023+20)

    n_models = len(best_xb_params)
    oof_level2 = np.zeros([y_re.shape[0], n_models + 1])
    oof_level2[:, n_models] = y_re
    oof_level2_test = np.zeros([test_df.shape[0], n_models])

    # Model inputs exclude the appended greek columns (fold-invariant, so built once).
    X, y, test = X_re[features], y_re, test_df[features]

    # Report the number of columns the models actually see; X_re additionally
    # carries the three stratification columns.
    print(f"Training with {blu}{X.shape[1]}{res} features")

    for fold, (fit_idx, val_idx) in tqdm(enumerate(kf.split(X=X_re, y=X_re.iloc[:,-3:]), start = 1),
                                         total=CFG.n_stacking_folds):
        # Split the dataset according to the fold indexes.
        X_train = X.iloc[fit_idx]
        X_val = X.iloc[val_idx]
        y_train = y.iloc[fit_idx]
        y_val = y.iloc[val_idx]

        for i, params in enumerate(best_xb_params):
            clf = xgb.XGBClassifier(**params)

            clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=500)

            val_preds = clf.predict_proba(X_val)[:,1]
            oof_level2[val_idx, i] = val_preds

            val_score = balanced_log_loss(y_val, val_preds)
            # XGBoost exposes the early-stopping result as `best_iteration`
            # (no trailing underscore). Fall back to 0, the value that used to
            # be hard-coded here, when the attribute is unavailable.
            best_iter = getattr(clf, 'best_iteration', 0)

            print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
                  f' | Best iteration: {blu}{best_iter:>4}{res}')

            # Sum test predictions over folds; averaged on return.
            oof_level2_test[:, i] += clf.predict_proba(test)[:,1]

    return oof_level2, oof_level2_test / CFG.n_stacking_folds

if CFG.stacking:
    # Level-1 XGBoost outputs: OOF matrix (labels in the last column) and test matrix.
    xgb_stack_outputs = xgboost_training()
    oof_level2_xgb, oof_level2_test_xgb = xgb_stack_outputs

# CatBoost Optuna optimization

In [None]:
from optuna.integration import CatBoostPruningCallback

# Module-level feature matrix and target taken from the full training frame.
X = train_df[features]
y = train_df['Class']

def objective(trial):
    """Optuna objective for CatBoost: mean out-of-fold balanced log loss.

    For each of `CFG.n_optimize_repeats` repeats the training data is
    (optionally) randomly under-sampled, split with MultilabelStratifiedKFold
    into `CFG.n_optimize_folds` folds, and one `CatBoostClassifier` per fold is
    fitted with the validation fold as `eval_set` ('EJ' is passed as a
    categorical feature).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-repeat out-of-fold balanced log losses (lower is better).
    """
    bll_list = list()

    # Parameters
    params = {
        'task_type': 'CPU',
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'random_seed': 10062023,
        'od_type': 'Iter',  # type of overfitting detector - stop after k iterations
        'od_wait': 100,  # stop training after k iterations without metric improvement
        # Hyperparameters (in decreasing order of importance)
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'iterations' : 3000,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'learning_rate' : trial.suggest_float('learning_rate', 1e-4, 3e-1, log=True),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'depth' : trial.suggest_int('depth', 4, 10),  # max tree depth
        # increase to deal with overfit
        'random_strength': trial.suggest_float('random_strength', 0, 100),  # amount of randomness used for
                                                                            # scoring splits when the tree
                                                                            # structure is selected; helps to
                                                                            # avoid overfitting (CPU only)
        'border_count': 254,  # number of splits for numerical features;
                              # bigger is better but slower (alias: max_bin)
        # NOTE(review): CatBoost documents min_data_in_leaf for the Depthwise and
        # Lossguide grow policies only - confirm it is accepted with SymmetricTree.
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),  # minimal number of data in one leaf
                                                                            # alias: min_child_samples
    }

    if params["bootstrap_type"] == "Bayesian":
        # Assigns random weights to objects; works only with Bayesian bootstrap
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 100)

    if params["bootstrap_type"] == "Bernoulli":
        # Sample rate for bagging (share of objects drawn)
        params["subsample"] = trial.suggest_float("subsample", 0.3, 1)

    if params['grow_policy'] == 'Lossguide':
        # Max number of leaves in one tree; decrease to deal with overfit
        params['max_leaves'] = trial.suggest_int('max_leaves', 4, 128)

    # Ordered boosting is only searched together with symmetric trees.
    if params['grow_policy'] == 'SymmetricTree':
        params['boosting_type'] = trial.suggest_categorical('boosting_type', ['Ordered', 'Plain'])
    else:
        params['boosting_type'] = 'Plain'

    # Loop-invariant: number of positive rows, used as the per-class target size.
    positive_count_train = train_df['Class'].value_counts()[1]

    for i in range(CFG.n_optimize_repeats):
        print(f'Repeat {blu}#{i+1}')

        # Random under-sampling to balance classes (different seed per repeat)
        sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train,
                                                        1: positive_count_train},
                                     random_state=15062023+i,
                                     replacement=True)

        # Columns 1..3 of `greeks` are appended so that they go through the same
        # resampling and can be used as stratification targets afterwards.
        X_re, y_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class']

        if CFG.undersample:
            X_re, y_re = sampler.fit_resample(X_re, y_re)

        # Multilabel stratified k-fold; uses the fold count configured for
        # optimization (previously read from the feature-selection setting).
        kf = MultilabelStratifiedKFold(n_splits=CFG.n_optimize_folds, shuffle=True, random_state=10062023+i)

        # Out-of-fold predictions for this repeat
        oof = np.zeros(X_re.shape[0])

        # Model inputs exclude the appended greek columns.
        X, y = X_re[features], y_re

        # Stratify on the three appended greek columns (last three of X_re)
        for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=X_re.iloc[:,-3:]), start=1):
            # Split the dataset according to the fold indexes.
            X_train = X.iloc[train_idx]
            X_val = X.iloc[val_idx]
            y_train = y.iloc[train_idx]
            y_val = y.iloc[val_idx]

            train_pool = Pool(X_train, y_train, cat_features=['EJ'])
            val_pool = Pool(X_val, y_val, cat_features=['EJ'])

            # Learning (pruning callback is currently disabled)
            model = cat.CatBoostClassifier(**params)
            model.fit(train_pool, eval_set=val_pool, verbose=0)
            # Predict
            val_preds = model.predict_proba(val_pool)[:,1]
            oof[val_idx] = val_preds

        bll_list.append(balanced_log_loss(y_re, oof))

    return np.mean(bll_list)

if CFG.cb_optimize:
    # Pruning is currently disabled:
    #     study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=100), direction="minimize")
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=CFG.n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    df = study.trials_dataframe()
    # Show the trials ordered by objective value, restricted to column 1 and
    # columns 5..13 of the trials table. The expression used to be left bare
    # inside the `if` block, where the notebook never renders it.
    display(df.sort_values('value').iloc[:, [1] + list(range(5, 14))])
    # The full (unsorted) table is saved; the loading cell sorts it again.
    df.to_csv('optuna_catboost.csv')

# Load CatBoost parameters

In [None]:
import glob

# Collect the saved Optuna trial tables (currently a single fixed file name).
param_list = glob.glob("optuna_catboost.csv")
models = []
best_cb_params = []

cb_params = pd.DataFrame()

for f in param_list:
    tmp = pd.read_csv(f, index_col='Unnamed: 0')
    # The first table replaces the empty placeholder; later ones are appended.
    cb_params = tmp if cb_params.shape[0] == 0 else pd.concat([cb_params, tmp])

# Keep the 10 trials with the lowest objective value, hyperparameter columns only.
cb_params = cb_params.sort_values('value').head(10)
param_cols = [c for c in cb_params.columns if c.startswith('params_')]
cb_params = cb_params[param_cols]


for idx, row in cb_params.iterrows():
    # Strip the 'params_' prefix from the trial-table column names.
    row_dict = {k[len('params_'):]: v for k, v in row.items()}

    # Fixed settings; these override any tuned value stored under the same key.
    row_dict.update({
        'task_type': 'CPU',
        'auto_class_weights': 'Balanced',
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'random_seed': 13062023,
        'verbose': 0,
        'od_type': 'Iter',
        'od_wait': 100,
        'border_count': 254,
        'iterations': 3000,
    })

    # Values come back from CSV as generic numbers; restore the expected types.
    for float_key in ('learning_rate', 'l2_leaf_reg'):
        row_dict[float_key] = float(row_dict[float_key])
    row_dict['depth'] = int(row_dict['depth'])
    row_dict['random_strength'] = float(row_dict['random_strength'])
    row_dict['min_data_in_leaf'] = int(row_dict['min_data_in_leaf'])

    # Conditional hyperparameters are kept only for the setting they were
    # sampled under and reset to None otherwise.
    bootstrap = row_dict["bootstrap_type"]
    row_dict['bagging_temperature'] = (
        float(row_dict['bagging_temperature']) if bootstrap == "Bayesian" else None
    )
    row_dict['subsample'] = (
        float(row_dict['subsample']) if bootstrap == "Bernoulli" else None
    )
    row_dict['max_leaves'] = (
        int(row_dict['max_leaves']) if row_dict['grow_policy'] == 'Lossguide' else None
    )

    # Non-symmetric trees always use plain boosting.
    if row_dict['grow_policy'] != 'SymmetricTree':
        row_dict['boosting_type'] = 'Plain'

    best_cb_params.append(row_dict)

# CatBoost train

In [None]:
def bll_metric(y_true, y_pred):
    """Balanced log loss as a `(name, value, is_higher_better)` tuple.

    The last item is False because a lower balanced log loss is better.
    """
    score = balanced_log_loss(y_true, y_pred)
    return 'balanced_log_loss', score, False

def cb_training():
    """Train every configuration in `best_cb_params` with K-fold CV for stacking.

    The training data is optionally under-sampled, split into
    `CFG.n_stacking_folds` multilabel-stratified folds, and each parameter set
    is fitted once per fold with the validation fold as `eval_set` ('EJ' is
    passed as a categorical feature).

    Returns:
        tuple `(oof_level2, oof_level2_test)`:
          - oof_level2: array of shape (n_rows, n_models + 1); column `i` holds
            the out-of-fold class-1 probabilities of model `i`, the last column
            holds the labels.
          - oof_level2_test: array of shape (n_test_rows, n_models) with the
            test class-1 probabilities averaged over the folds.
    """
    # Make random under-sampling to balance classes
    positive_count_train = train_df['Class'].value_counts()[1]
    sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train,
                                                    1: positive_count_train},
                                 random_state=150620233,
                                 replacement=True)

    # Columns 1..3 of `greeks` are appended so they go through the same
    # resampling and can be used as stratification targets.
    X_re, y_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class']

    if CFG.undersample:
        X_re, y_re = sampler.fit_resample(X_re, y_re)

    kf = MultilabelStratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=8062023+20)

    n_models = len(best_cb_params)
    oof_level2 = np.zeros([y_re.shape[0], n_models + 1])
    oof_level2[:, n_models] = y_re
    oof_level2_test = np.zeros([test_df.shape[0], n_models])

    # Model inputs exclude the appended greek columns (fold-invariant, so built once).
    X, y, test = X_re[features], y_re, test_df[features]

    # Report the number of columns the models actually see; X_re additionally
    # carries the three stratification columns.
    print(f"Training with {blu}{X.shape[1]}{res} features")

    for fold, (fit_idx, val_idx) in tqdm(enumerate(kf.split(X=X_re, y=X_re.iloc[:,-3:]), start = 1),
                                         total=CFG.n_stacking_folds):
        # Split the dataset according to the fold indexes.
        X_train = X.iloc[fit_idx]
        X_val = X.iloc[val_idx]
        y_train = y.iloc[fit_idx]
        y_val = y.iloc[val_idx]

        # Pools are shared by all parameter sets of this fold.
        train_pool = Pool(X_train, y_train, cat_features=['EJ'])
        val_pool = Pool(X_val, y_val, cat_features=['EJ'])

        for i, params in enumerate(best_cb_params):

            model = cat.CatBoostClassifier(**params)
            model.fit(train_pool, eval_set=val_pool, verbose=0)

            val_preds = model.predict_proba(val_pool)[:,1]
            oof_level2[val_idx, i] = val_preds

            val_score = balanced_log_loss(y_val, val_preds)
            best_iter = model.best_iteration_

            print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
                  f' | Best iteration: {blu}{best_iter:>4}{res}')

            # Sum test predictions over folds; averaged on return.
            oof_level2_test[:, i] += model.predict_proba(test)[:,1]

    return oof_level2, oof_level2_test / CFG.n_stacking_folds

if CFG.stacking:
    # Level-1 CatBoost outputs: OOF matrix (labels in the last column) and test matrix.
    cb_stack_outputs = cb_training()
    oof_level2_cb, oof_level2_test_cb = cb_stack_outputs

# Stacking with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Level-2 design matrices: one column per base model, ordered LGBM, CatBoost,
# XGBoost. The label column (last) of each OOF matrix is dropped.
# NOTE(review): the three training functions under-sample with different
# sampler seeds, so when CFG.undersample is on, row k of each OOF block may not
# refer to the same training sample - confirm the rows are aligned.
oof_level2 = np.hstack([oof_level2_lgbm[:, :-1], oof_level2_cb[:, :-1], oof_level2_xgb[:, :-1]])
oof_level2_test = np.hstack([oof_level2_test_lgbm, oof_level2_test_cb, oof_level2_test_xgb])

X = oof_level2
# Labels are taken from the LightGBM OOF matrix.
y = oof_level2_lgbm[:, -1]

# mean bll: plain average of all base-model predictions
print(balanced_log_loss(y, X.mean(axis=1)))

# Meta-model fitted on the OOF predictions
lr = LogisticRegression(class_weight='balanced').fit(X, y)

pred = lr.predict_proba(X)[:, 1]

# lr bll: scored on the same rows the meta-model was fitted on
print(balanced_log_loss(y, pred))

# Coefficients are reused as blending weights in the prediction cell.
weights = lr.coef_[0]

# Models evaluation

In [None]:
# ## Model Evaluation
# metric_score_folds = pd.DataFrame.from_dict(all_eval_results_)
# fit_logloss = []
# val_logloss = []

# for seed in CFG.seeds:
#     for fold in range(1,CFG.n_folds+1):
#         fit_logloss.append(metric_score_folds[seed][fold]['training']['balanced_log_loss'])
#         val_logloss.append(metric_score_folds[seed][fold]['valid_1']['balanced_log_loss'])

# fig, axes = plt.subplots(math.ceil(CFG.n_folds*len(CFG.seeds)/CFG.n_folds), CFG.n_folds, figsize=(20, 20), dpi=150)
# ax = axes.flatten()
# for i, (f, v, m) in enumerate(zip(fit_logloss, val_logloss, models_), start = 1): 
#     sns.lineplot(f, color='#B90000', ax=ax[i-1], label='fit')
#     sns.lineplot(v, color='#048BA8', ax=ax[i-1], label='val')
#     ax[i-1].legend()
#     ax[i-1].spines['top'].set_visible(False);
#     ax[i-1].spines['right'].set_visible(False)
#     ax[i-1].set_title(f'Seed {CFG.seeds[(i-1)//CFG.n_folds]} Fold {CFG.n_folds if i%CFG.n_folds==0 else i%CFG.n_folds}', fontdict={'fontweight': 'bold'})

#     color =  ['#048BA8', palette[-3]]
#     best_iter = m.best_iteration_
#     span_range = [[0, best_iter], [best_iter + 10, best_iter + CFG.num_boost_round]]

#     for idx, sub_title in enumerate([f'Best\nIteration: {best_iter}', f'Early\n Stopping: 2000']):
#         ax[i-1].annotate(sub_title,
#                     xy=(sum(span_range[idx])/2 , 0.5),
#                     xytext=(0,0), textcoords='offset points',
#                     va="center", ha="center",
#                     color="w", fontsize=16, fontweight='bold',
#                     bbox=dict(boxstyle='round4', pad=0.4, color=color[idx], alpha=0.6))
#         ax[i-1].axvspan(span_range[idx][0]-0.4,span_range[idx][1]+0.4,  color=color[idx], alpha=0.07)

#     ax[i-1].set_xlim(0, best_iter + 20 + 2000)
#     ax[i-1].legend(bbox_to_anchor=(0.95, 1), loc='upper right', title='logloss')

# plt.tight_layout();

# Predict test

In [None]:
def predict(X):
    """Blend the level-1 test predictions into a single class-1 probability.

    Columns of the module-level `oof_level2_test` are combined as a weighted
    average using the module-level `weights`.

    Args:
        X: only its length is used; it must equal the number of rows of
            `oof_level2_test`.

    Returns:
        1-D array of blended class-1 probabilities, limited to [0, 1].
    """
    y = np.zeros(len(X))
    for i in range(oof_level2_test.shape[1]):
        y += weights[i] * oof_level2_test[:,i]
    blended = y / sum(weights)
    # The weights are not constrained to be non-negative, so the normalized
    # blend can leave [0, 1]; clip to keep class_1 and 1 - class_1 valid
    # probabilities. This is a no-op when every weight is positive.
    return np.clip(blended, 0.0, 1.0)

# Blended class-1 probability for every test row.
predictions = predict(test_df[features])
# predictions = predict(generated_features_test)

proba_cols = ['class_0', 'class_1']
test_df['class_1'] = predictions
test_df['class_0'] = 1 - predictions

# Copy both probability columns into the submission template (aligned on the
# row index) and write the file.
sample_submission[proba_cols] = test_df[proba_cols]
sample_submission.to_csv("submission.csv", index=False)
sample_submission

You have a lot of resulting features. I have already identified a few important ones. 