In [36]:
# !pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
# !mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
# !cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [37]:
import sys
sys.path.append('/kaggle/input/iter-strat/iter_strat')

import math

import numpy as np
import pandas as pd

import lightgbm as lgb
import catboost as cat
from catboost import Pool
import xgboost as xgb
from tabpfn import TabPFNClassifier

import itertools
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, BaseShuffleSplit, _validate_shuffle_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import log_loss
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

from scipy import stats

import eli5
from IPython.display import display
from eli5.permutation_importance import get_score_importances
from eli5.sklearn import PermutationImportance

import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import seaborn as sns

import optuna

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


from colorama import Style, Fore

# Hex colour list; presumably meant for plots (not referenced in the visible cells).
palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
           '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']

# colorama style prefixes interpolated into the f-string progress prints below;
# `res` resets the terminal style after a coloured fragment.
blk = Style.BRIGHT + Fore.BLACK
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
res = Style.RESET_ALL


class CFG:
    """Notebook-wide switches and hyper-parameters, read everywhere as plain class attributes."""

    # main
    kaggle = False  # True: read data / Optuna CSVs from /kaggle/input; False: from local relative paths
    test = False  # True: replace the Optuna-derived parameter lists with one fixed parameter set
    
    # features
    fe_drop = True  # True: exclude a hand-picked list of columns when building `features`
    
    del_errs = False  # True: drop a fixed list of row labels from train_df and greeks
    del_outliers = True  # True: clip selected columns at their 0.99 training quantile
    
    feature_sel = False  # True: run lgbm_tuning and the importance-inspection cells
    n_feature_sel_repeats = 5  # number of repeats in lgbm_tuning
    n_feature_sel_folds = 5  # CV folds per repeat in lgbm_tuning
    
    undersample = False  # True: randomly under-sample the data before CV splitting
    oversample = False  # True: randomly over-sample each training fold in the Optuna objectives
    
    nan_impute = False  # True: copy the median-imputed numeric columns back into train_df / test_df
    standard_scale = False  # True: copy the standard-scaled numeric columns back into train_df / test_df
    log = False  # True: apply a log1p transform to the numeric columns
    
    # optimization
    n_estimators = 3000  # upper bound on boosting rounds (LGBM/XGB n_estimators, CatBoost iterations)
    early_stopping_rounds = 100  # early-stopping patience passed to each model
    
    lgbm_optimize = False  # True: run the LightGBM Optuna study
    xgb_optimize = False  # True: run the XGBoost Optuna study
    cb_optimize = False  # presumably gates the CatBoost Optuna study (its cell is not fully visible here)
    
    n_trials = 500  # NOTE: the visible studies call optimize() with n_trials * 2
    n_optimize_folds = 10  # CV folds per repeat inside the Optuna objectives
    n_optimize_repeats = 2  # CV repeats inside the Optuna objectives
    
    # train
    # NOTE(review): the four *_train flags are not read in the visible cells; presumably they
    # select which model families are trained later in the notebook.
    lgbm_train = True
    xgb_train = False
    cb_train = False
    tabpfn_train = False

    # inference
    n_stacking_folds = 20  # not read in the visible cells; presumably folds used for stacking
    n_stacking_models_lgbm = 40  # number of best LGBM Optuna trials kept when loading parameters
    n_stacking_models_xgb = 10  # number of best XGB Optuna trials kept when loading parameters
    n_stacking_models_cb = 20  # presumably the CatBoost equivalent (usage not visible)
    n_stacking_models_tabpfn = 20  # presumably the TabPFN equivalent (usage not visible)

    adjust_class_threshold = False  # not read in the visible cells -- TODO confirm meaning
    

# Load Data

In [38]:
# Competition data lives in the Kaggle input mount or in a local folder of the same name.
COMP_PATH = (
    "/kaggle/input/icr-identify-age-related-conditions"
    if CFG.kaggle
    else "icr-identify-age-related-conditions"
)

train_df = pd.read_csv(f'{COMP_PATH}//train.csv')
test_df = pd.read_csv(f'{COMP_PATH}/test.csv')
greeks = pd.read_csv(f"{COMP_PATH}/greeks.csv")
sample_submission = pd.read_csv(f"{COMP_PATH}/sample_submission.csv")

# Same clean-up for both frames: encode the categorical EJ column as 0/1 and
# strip spaces out of the column names.
for _frame in (train_df, test_df):
    _frame['EJ'] = _frame['EJ'].replace({'A': 0, 'B': 1})
    _frame.columns = _frame.columns.str.replace(' ', '')
del _frame

# Clip outliers (cap values at the 99th percentile)

In [39]:
# Columns that are never clipped: the identifier, the target, the encoded categorical EJ
# and a hand-picked set of features.
features_with_outliers = [fe for fe in train_df.columns if fe not in ['BN', 'BQ', 'CW', 'EL', 'GH', 
                                                                      'GI', 'GL', 'Id', 'Class', 'EJ']]

if CFG.del_outliers:
    for f in features_with_outliers:
        # The cap is estimated on the training data only and then applied to BOTH frames,
        # so train and test go through the same preprocessing (previously only train_df
        # was clipped, leaving raw extremes in test_df and its scaled copy).
        upper_cap = train_df[f].quantile(0.99)
        train_df[f] = train_df[f].clip(upper=upper_cap)
        if f in test_df.columns:
            test_df[f] = test_df[f].clip(upper=upper_cap)

# Delete erroneous objects

In [40]:
if CFG.del_errs:
    # Row labels to discard; the same labels are removed from train_df and greeks
    # so the two frames stay row-aligned after re-indexing.
    err_objs = [292, 102, 509, 367, 313, 462, 556]
    train_df = train_df[~train_df.index.isin(err_objs)].reset_index(drop=True)
    greeks = greeks[~greeks.index.isin(err_objs)].reset_index(drop=True)

# Fixed positive-class weight used as scale_pos_weight by the boosters below.
class_imbalance = 3 # train_df[train_df['Class'] == 0].shape[0] / train_df[train_df['Class'] == 1].shape[0]

# Add distance features

In [41]:
# from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

# features = train_df.drop(['Class', 'Id'], axis=1).columns

# # average label of the nearest neighbours (n_neighbors=10, first neighbour dropped; cosine distance)
# knn = NearestNeighbors(n_neighbors=10, metric='cosine', n_jobs=-1)
# knn.fit(train_df[features].fillna(0))

# # train
# dists, nears = knn.kneighbors(train_df[features].fillna(0), return_distance=True)
# dists, nears = dists[:,1:], nears[:,1:]

# classes = np.array([train_df.loc[n, 'Class'] for n in nears])
# train_df['class_cos'] = np.array(classes[i].mean() for i in range(len(nears)))
# train_df['class_cos'] = train_df['class_cos'].astype(float)

# # test
# dists, nears = knn.kneighbors(test_df[features].fillna(0), return_distance=True)
# dists, nears = dists[:,1:], nears[:,1:]

# classes = np.array([train_df.loc[n, 'Class'] for n in nears])
# test_df['class_cos'] = np.array(classes[i].mean() for i in range(len(nears)))
# test_df['class_cos'] = test_df['class_cos'].astype(float)

# Drop unnecessary features

In [42]:
# Columns that must not be used as model inputs; with fe_drop a hand-picked
# list of features is excluded on top of the identifier and the target.
excluded_cols = (
    ['CF', 'CB', 'DV', 'BR', 'DF', 'GB', 'AH',
     'CW', 'CL', 'BP', 'BD', 'FC', 'GE', 'GF',
     'AR', 'GI', 'Id', 'Class', 'AX', 'DA']
    if CFG.fe_drop
    else ['Id', 'Class', 'EJ']
)
features = [fe for fe in train_df.columns if fe not in excluded_cols]

# float64 columns only: these are the ones imputed / scaled / log-transformed below.
num_cols = train_df.select_dtypes(include=['float64']).columns

len(train_df.columns), len(features), len(num_cols)

(58, 38, 55)

# NaN impute

In [43]:
from datetime import datetime

# Median imputer, fitted on the training numeric columns only.
imp = SimpleImputer(missing_values=np.nan, strategy='median')

# The *_tabpfn copies are always imputed; the main frames only when CFG.nan_impute is set.
train_df_tabpfn, test_df_tabpfn = train_df.copy(), test_df.copy()

imp.fit(train_df_tabpfn[num_cols])
train_df_tabpfn[num_cols] = imp.transform(train_df_tabpfn[num_cols])
test_df_tabpfn[num_cols] = imp.transform(test_df_tabpfn[num_cols])

if CFG.nan_impute:
    train_df[num_cols], test_df[num_cols] = train_df_tabpfn[num_cols], test_df_tabpfn[num_cols]

# Standard Scale

In [44]:
# Standardise the numeric columns of the TabPFN copies; statistics come from train only.
sc = StandardScaler()
sc.fit(train_df_tabpfn[num_cols])
train_df_tabpfn[num_cols] = sc.transform(train_df_tabpfn[num_cols])
test_df_tabpfn[num_cols] = sc.transform(test_df_tabpfn[num_cols])

# Carry the 0/1-encoded EJ column over from the main frames.
train_df_tabpfn['EJ'], test_df_tabpfn['EJ'] = train_df['EJ'], test_df['EJ']

# Optionally expose the scaled values to the main frames as well.
if CFG.standard_scale:
    train_df[num_cols], test_df[num_cols] = train_df_tabpfn[num_cols], test_df_tabpfn[num_cols]

# Log features (preserve sign)

In [45]:
if CFG.log:
    # Sign-preserving log transform: sign(x) * log1p(|x|).
    # For non-negative inputs this equals np.log1p(x); unlike plain log1p it stays finite
    # for values <= -1 (e.g. when CFG.standard_scale has already centred the columns),
    # where log1p would silently produce NaN / -inf because warnings are suppressed.
    for _frame in (train_df, test_df):
        _values = _frame[num_cols]
        _frame[num_cols] = np.sign(_values) * np.log1p(np.abs(_values))
    del _frame, _values

# LGBM feature selection

In [46]:
if not CFG.kaggle:

    from shaphypetune import BoostBoruta

    # Fixed LightGBM settings used by lgbm_tuning (feature selection only).
    # NOTE(review): both 'is_unbalance' and 'class_weight' are set; these are alternative
    # ways of re-weighting classes -- confirm that applying both is intended.
    # NOTE(review): LightGBM documents the binary metric as 'binary_logloss'; verify that
    # 'logloss' is accepted by the installed version.
    params = {
                'n_estimators': CFG.n_estimators,  # upper bound on boosting rounds
                'early_stopping_round': CFG.early_stopping_rounds,  # patience on the eval set
                'objective': 'binary',
                'metric': 'logloss', 
                'n_jobs': -1,
                'is_unbalance':True, 
                'class_weight':'balanced', 
                'verbose': -1,
                'seed': 19062023,
            }

    def balanced_log_loss(y_true, y_pred):
        """Balanced logarithmic loss for binary labels.

        Each class contributes the mean negative log-likelihood of its own observations,
        and the two class terms are averaged; a class with no observations contributes 0.

        Args:
            y_true: array-like of 0/1 labels (list, ndarray or pandas Series).
            y_pred: array-like of predicted probabilities for class 1, same length.

        Returns:
            The balanced log loss as a float.
        """
        # Work on flat positional arrays so that lists are accepted and two pandas objects
        # with different indexes are not silently re-aligned into NaNs.
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()

        # Number of observations per class
        N_1 = np.sum(y_true == 1, axis=0)
        N_0 = np.sum(y_true == 0, axis=0)

        N_inv_0 = 1/N_0 if N_0 > 0 else 0
        N_inv_1 = 1/N_1 if N_1 > 0 else 0

        # Avoid the extremes of the log function: p is replaced with
        # max(min(p, 1 - 1e-15), 1e-15)
        y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

        # Per-class mean negative log-likelihood, summed over both classes
        loss_numerator = - N_inv_0 * np.sum((1 - y_true) * np.log(1 - y_pred)) - N_inv_1 * np.sum(y_true * np.log(y_pred))

        return loss_numerator / 2

    def bll_metric(y_true, y_pred):
        """Custom eval metric for LightGBM: (name, value, is_higher_better)."""
        score = balanced_log_loss(y_true, y_pred)
        return ('balanced_log_loss', score, False)

    def lgbm_tuning(features, permut=False, boruta=False):
        """Estimate feature importances with LightGBM under repeated stratified CV.

        For each of ``CFG.n_feature_sel_repeats`` repeats the training data is split into
        ``CFG.n_feature_sel_folds`` folds with ``MultilabelStratifiedKFold``, an
        ``LGBMClassifier`` (module-level ``params``) is fitted per fold, and the per-fold
        importances are summed over all folds and repeats.

        Args:
            features: column names of ``train_df`` to train on.
            permut: also accumulate eli5 permutation importances measured on each
                validation fold.
            boruta: also accumulate ``BoostBoruta`` rankings (SHAP importances) per fold.

        Returns:
            Tuple ``(perm_df_, feature_importances_, boruta_df_, mean_cv_score)``:
            summed permutation importances (sorted descending), summed tree importances
            (sorted descending), summed Boruta rankings (sorted ascending) and the mean
            out-of-fold balanced log loss over the repeats. The optional frames are empty
            when their flag is off.
        """
        metric = balanced_log_loss

        cv_scores = [] # one out-of-fold score per repeat

        perm_df_ = pd.DataFrame()
        feature_importances_ = pd.DataFrame()
        boruta_df_ = pd.DataFrame()

        for i in range(CFG.n_feature_sel_repeats):
            print(f'Repeat {blu}#{i+1}')

            # Features plus three greeks columns (positions 1..3); the greeks columns are
            # used only as stratification labels, never as model inputs.
            X_re, y_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class']

            if CFG.undersample:
                # Random under-sampling to 3 negatives per positive; the sampler is only
                # built when it is actually used.
                positive_count_train = train_df['Class'].value_counts()[1]
                sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train * 3, 
                                                                1: positive_count_train}, 
                                            random_state=15062023+i, 
                                            replacement=True)
                X_re, y_re = sampler.fit_resample(X_re, y_re)

            # Stratified multilabel k-fold scheme, reseeded on every repeat
            kf = MultilabelStratifiedKFold(n_splits=CFG.n_feature_sel_folds, shuffle=True, random_state=8062023+i)

            # Out-of-fold predictions for this repeat
            oof = np.zeros(X_re.shape[0])

            # Model inputs / target are the same for every fold of this repeat
            X, y = X_re[features], y_re

            # Stratify on the three appended greeks columns
            for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=X_re.iloc[:,-3:]), start = 1): 
                # Split the dataset according to the fold indexes.
                X_train = X.iloc[train_idx].reset_index(drop=True)
                y_train = y.iloc[train_idx].reset_index(drop=True)
                X_val = X.iloc[val_idx]
                y_val = y.iloc[val_idx]

                clf = lgb.LGBMClassifier(**params)
                clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=bll_metric, verbose=0)

                val_preds = clf.predict_proba(X_val)[:,1]
                oof[val_idx] = val_preds

                val_score = metric(y_val, val_preds)
                best_iter = clf.best_iteration_

                print(f'Fold: {blu}{fold:>3}{res}| {metric.__name__}: {blu}{val_score:.5f}{res}'
                    f' | Best iteration: {blu}{best_iter:>4}{res}')

                # permutation importance (rows sorted by feature name so that folds align)
                if permut:
                    perm = PermutationImportance(clf, scoring=None, n_iter=1, 
                                                random_state=42, cv=None, refit=False).fit(X_val, y_val)

                    perm_importance_df = pd.DataFrame({'importance': perm.feature_importances_}, 
                                                    index=X_val.columns).sort_index()

                    if perm_df_.shape[0] == 0:
                        perm_df_ = perm_importance_df.copy()
                    else:
                        perm_df_ += perm_importance_df

                # tree feature importance; sorting by feature name keeps the row order
                # identical across folds so the positional '+=' below adds like with like
                f_i = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns), 
                                                reverse=True, key=lambda x: x[1]), 
                                columns=['Value','Feature'])

                if feature_importances_.shape[0] == 0:
                    feature_importances_ = f_i.copy()
                else:
                    feature_importances_['Value'] += f_i['Value']

                # Boruta SHAP importance
                if boruta:
                    model = BoostBoruta(clf, importance_type='shap_importances', train_importance=False)
                    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=bll_metric, verbose=0)

                    boruta_importance_df = pd.DataFrame({'importance': model.ranking_}, index=X_train.columns).sort_index()
                    if boruta_df_.shape[0] == 0:
                        boruta_df_ = boruta_importance_df.copy()
                    else:
                        boruta_df_ += boruta_importance_df

            fold_cv_score = metric(y_re, oof)
            print(f'{red} CV score: {res} {metric.__name__}: {red}{fold_cv_score:.5f}{res}')
            print(f'{"*" * 50}\n')
            cv_scores.append(fold_cv_score)


        print(f'{red} Avg score {CFG.n_feature_sel_folds}-fold: {res} {metric.__name__}: {red}{np.mean(cv_scores):.5f}{res}')
        print(f'{"*" * 50}\n')

        if permut:
            perm_df_ = perm_df_.sort_values('importance', ascending=False)

        if boruta:
            boruta_df_ = boruta_df_.sort_values('importance')

        feature_importances_ = feature_importances_.sort_values('Value', ascending=False)

        return perm_df_, feature_importances_, boruta_df_, np.mean(cv_scores)

    if CFG.feature_sel:
        perm_df_, feature_importances_, boruta_df_, cv_scores = lgbm_tuning(features, permut=True, boruta=True)

# Check features correlation

In [47]:
if CFG.feature_sel:
    # Absolute correlation of every feature with `col`, computed on the rows where `col`
    # does not exceed its 0.99 quantile; the first (largest) entry is skipped.
    col = 'DA'
    x = train_df.loc[train_df[col].le(train_df[col].quantile(0.99))]
    cm = x.drop(columns=['Id', 'Class'], errors='ignore').corr()
    display(cm[col].abs().sort_values(ascending=False).iloc[1:])

# Analyze permutation feature importance

In [48]:
if CFG.feature_sel:
    # Persist the permutation importances, then show the last 35 index labels of the frame.
    perm_df_.to_csv('perm_df.csv')
    perm_cols = set(perm_df_.tail(35).index)
    display(perm_cols)

In [49]:
# x = pd.read_csv('perm_df.csv', index_col='Unnamed: 0')
# x['feature'] = x.index.copy()
# x = x.reset_index(drop=True)
# x['rank'] = x['importance'].rank()
# x = x[['feature', 'rank']]

# Analyze tree gain feature importance

In [50]:
if CFG.feature_sel:
    # Persist the tree importances, then show the last 23 feature names of the frame.
    feature_importances_.to_csv('feature_importances.csv')
    fi_cols = set(feature_importances_['Feature'].tail(23))
    display(fi_cols)

In [51]:
# y = pd.read_csv('feature_importances.csv')
# y['feature'] = y['Feature']
# y = y.reset_index(drop=True)
# y['rank'] = y['Value'].rank()
# y = y[['feature', 'rank']]

# Analyze BORUTA importance

In [52]:
if CFG.feature_sel:
    # Persist the Boruta rankings, then show the last 35 index labels of the frame.
    boruta_df_.to_csv('boruta_df_.csv')
    boruta_cols = set(boruta_df_.tail(35).index)
    display(boruta_cols)

In [53]:
# z = pd.read_csv('boruta_df_.csv', index_col='Unnamed: 0')
# z['feature'] = z.index.copy()
# z = z.reset_index(drop=True)
# z['rank'] = z['importance'].rank(ascending=False)
# z = z[['feature', 'rank']]

# Sort all features according to their importance

In [54]:
# a = pd.concat([x, y, z])
# a = a[['feature', 'rank']]
# res = a.groupby('feature')['rank'].sum().sort_values(ascending=False)#.index.to_list()
# res

# LGBM Optuna optimization

In [55]:
def balanced_log_loss(y_true, y_pred):
    """Balanced logarithmic loss for binary labels.

    Defined at the top level of this cell so that the metric and the Optuna objectives
    also work when CFG.kaggle is True; the only other definition sits inside the
    ``if not CFG.kaggle:`` feature-selection cell.

    Args:
        y_true: array-like of 0/1 labels.
        y_pred: array-like of predicted probabilities for class 1, same length.

    Returns:
        Average of the two per-class mean negative log-likelihoods; a class with no
        observations contributes 0.
    """
    # Flat positional arrays: accepts lists and avoids pandas index re-alignment.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    N_inv_0 = 1/N_0 if N_0 > 0 else 0
    N_inv_1 = 1/N_1 if N_1 > 0 else 0

    # Keep probabilities away from 0 and 1 so the logs stay finite.
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    loss_numerator = - N_inv_0 * np.sum((1 - y_true) * np.log(1 - y_pred)) - N_inv_1 * np.sum(y_true * np.log(y_pred))

    return loss_numerator / 2


def bll_metric(y_true, y_pred):
    """Custom eval metric for LightGBM: (name, value, is_higher_better)."""
    return 'balanced_log_loss', balanced_log_loss(y_true, y_pred), False

X, y = train_df[features], train_df['Class'] 
    
def objective(trial):
    """Optuna objective for LightGBM: mean out-of-fold balanced log loss (lower is better).

    Samples one hyper-parameter set from ``trial``, then runs ``CFG.n_optimize_repeats``
    repeats of ``CFG.n_optimize_folds``-fold ``MultilabelStratifiedKFold`` CV. Each repeat
    is scored with ``balanced_log_loss`` on its out-of-fold predictions.

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters.

    Returns:
        Mean of the per-repeat balanced log losses.
    """
    param = {
        # Main parameters
        'objective': 'binary',
        'metric': 'none',  # built-in metrics off; the custom eval_metric passed to fit() is used
        'boosting_type': trial.suggest_categorical('boosting_type', ['goss', 'gbdt']),#, 'dart']),
        # Hyperparameters (in order of decreasing importance)
        'n_estimators': CFG.n_estimators,  # max number of trees in the model
        'early_stopping_round': CFG.early_stopping_rounds,
        # suggest_float(log=True) replaces the deprecated suggest_loguniform (same
        # distribution) and matches the other log-scaled parameters in this file.
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 3e-1, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True), # L1, alias: lambda_l1
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True), # L2, alias: lambda_l2
        # decrease to deal with overfit
        'max_depth': trial.suggest_int('max_depth', 4, 10),   # tree max depth
        # decrease to deal with overfit
        'num_leaves': trial.suggest_int('num_leaves', 4, 128),  # max number of leaves in one tree
        'subsample': None,  # only sampled below for non-GOSS boosting; alias: bagging_fraction
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.7), # alias: feature_fraction
        # decrease to deal with overfit
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100), # alias: min_data_in_leaf
        # increase for accuracy, decrease to deal with overfit
        'max_bin': trial.suggest_int('max_bin', 32, 255), # max number of bins for feature values
        # increase to deal with overfit
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7), # alias: bagging_freq
        'verbose': -1
    }

    # Re-weight the positive class only when no resampling is rebalancing the data.
    if not CFG.oversample and not CFG.undersample:
        param['scale_pos_weight'] = class_imbalance

    if param['boosting_type'] != 'goss':
        param['subsample'] = trial.suggest_float('subsample', 0.3, 0.7)

    bll_list = list()

    for i in range(CFG.n_optimize_repeats):
        print(f'Repeat {blu}#{i+1}')

        # Features plus three greeks columns (positions 1..3) used only for stratification.
        X_re, y_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class']

        # Under-sampling acts on the whole data set before splitting ...
        if CFG.undersample:
            positive_count_train = train_df['Class'].value_counts()[1]
            under_sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train * 3, 
                                                                  1: positive_count_train}, 
                                               random_state=15062023+i, 
                                               replacement=True)
            X_re, y_re = under_sampler.fit_resample(X_re, y_re)

        # ... while over-sampling acts on each training fold. The two samplers are kept
        # separate so that enabling both flags no longer applies the under-sampler to the folds.
        over_sampler = RandomOverSampler(random_state=2306020231) if CFG.oversample else None

        # Stratified multilabel k-fold scheme, reseeded on every repeat
        kf = MultilabelStratifiedKFold(n_splits=CFG.n_optimize_folds, shuffle=True, random_state=10062023+i)

        # Out-of-fold predictions for this repeat
        oof = np.zeros(X_re.shape[0])

        # Model inputs / target are the same for every fold of this repeat
        X, y = X_re[features], y_re

        # Stratify on the three appended greeks columns
        for train_idx, val_idx in kf.split(X=X, y=X_re.iloc[:,-3:]):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if over_sampler is not None:
                X_train, y_train = over_sampler.fit_resample(X_train, y_train)

            clf = lgb.LGBMClassifier(**param)
            clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                    eval_metric=bll_metric, verbose=0)

            oof[val_idx] = clf.predict_proba(X_val)[:,1]

        bll_list.append(balanced_log_loss(y_re, oof))

    return np.mean(bll_list)
            

if CFG.lgbm_optimize:
    # Minimise the CV balanced log loss returned by `objective`.
    # (A MedianPruner with n_warmup_steps=100 was tried here and left disabled.)
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=CFG.n_trials * 2)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Persist every trial, best first; the parameter-loading cell reads this file back.
    df = study.trials_dataframe().sort_values('value')
    df.to_csv('optuna_lgbm.csv')

    display(df.head(10))

# Load LGBM parameters

In [56]:
import glob

# Locate the Optuna trial dumps written by the LGBM optimisation cell.
if CFG.kaggle:
    param_list = glob.glob("/kaggle/input/icr-optuna-no-da/optuna_lgbm.csv")
else:
    param_list = glob.glob("optuna_lgbm.csv")

models = list()
best_lgbm_params = list()

# Read every dump once and concatenate in a single call.
lgbm_param_frames = [pd.read_csv(f, index_col='Unnamed: 0') for f in param_list]

if lgbm_param_frames:
    # Keep the hyper-parameter columns of the best (lowest-value) trials.
    lgbm_params = pd.concat(lgbm_param_frames).sort_values('value').head(CFG.n_stacking_models_lgbm)
    param_cols = [c for c in lgbm_params.columns if c.startswith('params_')]
    lgbm_params = lgbm_params[param_cols]
else:
    # No dump found: fail with a clear message only when the parameters are actually
    # needed, instead of an opaque KeyError from sort_values('value').
    if CFG.lgbm_train and not CFG.test:
        raise FileNotFoundError(
            "No optuna_lgbm.csv found; run the LGBM Optuna optimisation first or set CFG.test = True"
        )
    param_cols = []
    lgbm_params = pd.DataFrame()

for idx, row in lgbm_params.iterrows():
    # Strip the 'params_' prefix that Optuna's trials_dataframe() adds.
    row_dict = {k[len('params_'):]: v for k, v in row.items()}
    row_dict['objective'] = 'binary'
    row_dict['metric'] = 'none'
#     row_dict['subsample_for_bin'] = 300000
    row_dict['force_col_wise'] = False
    row_dict['n_estimators'] = CFG.n_estimators
    row_dict['early_stopping_round'] = CFG.early_stopping_rounds
    row_dict['verbose'] = -1
    # NOTE(review): this overrides the max_bin value tuned by Optuna -- confirm it is intended.
    row_dict['max_bin'] = 255

    # Values read back from CSV come as floats; restore the integer-typed parameters.
    for int_key in ('num_leaves', 'max_depth', 'min_child_samples', 'subsample_freq'):
        row_dict[int_key] = int(row_dict[int_key])
    row_dict['learning_rate'] = float(row_dict['learning_rate'])

    # if not CFG.oversample and not CFG.undersample:
        # row_dict['is_unbalance'] = True
    row_dict['scale_pos_weight'] = class_imbalance

    # 'subsample' is not sampled for GOSS trials (NaN in the CSV), so reset it.
    if row_dict['boosting_type'] == 'goss':
        row_dict['subsample'] = None

    best_lgbm_params.append(row_dict)

# Test mode: ignore the Optuna results and use one fixed parameter set.
if CFG.test:
    best_lgbm_params = [{
            'boosting_type': 'goss',
            'n_estimators': 50000,
            'early_stopping_round': 300,
            'max_depth': 8,
            'learning_rate': 0.06733232950390658,
            'subsample': 0.6970532011679706,
            'colsample_bytree': 0.6055755840633003,
            'is_unbalance': True, 
            'class_weight': 'balanced',
            'metric':'none',
            'verbose': -1,
            'random_state': 42,
        }
    ]

                         

# XGBoost Optuna optimization

In [57]:
X, y = train_df[features], train_df['Class']

def objective(trial):
    """Optuna objective for XGBoost: mean out-of-fold balanced log loss over repeated CV."""
    repeat_scores = []

    xgb_kwargs = {
        "n_estimators": CFG.n_estimators, # trial.suggest_int('n_estimators', 100, 1000, step=100),
        "early_stopping_rounds": CFG.early_stopping_rounds,
        "verbosity": 0,
        "random_state": 14062023,
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        # exact split finding is used for this small dataset
        "tree_method": "exact",
        # booster type; only tree boosting is currently searched
        "booster": trial.suggest_categorical("booster", ["gbtree"]),# "dart", "gblinear"]), 
        # L1 regularization weight
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # L2 regularization weight
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # row sampling ratio
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        # column sampling ratio per tree
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
    }

    # if not CFG.oversample and not CFG.undersample:
    xgb_kwargs["scale_pos_weight"] = class_imbalance

    chosen_booster = xgb_kwargs["booster"]

    # Tree-specific parameters (shared by gbtree and dart)
    if chosen_booster in ("gbtree", "dart"):
        xgb_kwargs.update(
            learning_rate=trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),  # alias eta
            max_depth=trial.suggest_int("max_depth", 3, 10),
            min_child_weight=trial.suggest_int("min_child_weight", 2, 10),
            gamma=trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            grow_policy=trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        )

    # Dropout parameters, only relevant for dart
    if chosen_booster == "dart":
        xgb_kwargs.update(
            sample_type=trial.suggest_categorical("sample_type", ["uniform", "weighted"]),
            normalize_type=trial.suggest_categorical("normalize_type", ["tree", "forest"]),
            rate_drop=trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
            skip_drop=trial.suggest_float("skip_drop", 1e-8, 1.0, log=True),
        )

    for rep in range(CFG.n_optimize_repeats):
        print(f'Repeat {blu}#{rep+1}')

        # Pick the resampler that matches the configuration (under-sampling wins if both are set)
        positive_count_train = train_df['Class'].value_counts()[1]
        if CFG.undersample:
            sampler = RandomUnderSampler(
                sampling_strategy={0: positive_count_train * 3, 1: positive_count_train},
                random_state=15062023+rep,
                replacement=True,
            )
        elif CFG.oversample:
            sampler = RandomOverSampler(random_state=2306020231)

        # Features plus three greeks columns (positions 1..3) that serve as stratification labels
        X_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1)
        y_re = train_df['Class']

        if CFG.undersample:
            X_re, y_re = sampler.fit_resample(X_re, y_re)

        splitter = MultilabelStratifiedKFold(n_splits=CFG.n_optimize_folds, shuffle=True,
                                             random_state=10062023+rep)

        # Out-of-fold predictions for this repeat
        oof = np.zeros(X_re.shape[0])

        X_all, y_all = X_re[features], y_re

        for train_idx, val_idx in splitter.split(X=X_re[features], y=X_re.iloc[:,-3:]):
            X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
            X_val, y_val = X_all.iloc[val_idx], y_all.iloc[val_idx]

            # Over-sampling touches the training part of the fold only
            if CFG.oversample:
                X_train, y_train = sampler.fit_resample(X_train, y_train)

            model = xgb.XGBClassifier(**xgb_kwargs)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

            oof[val_idx] = model.predict_proba(X_val)[:,1]

        repeat_scores.append(balanced_log_loss(y_re, oof))

    return np.mean(repeat_scores)

if CFG.xgb_optimize:
    # Minimise the CV balanced log loss returned by the XGBoost `objective`.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=CFG.n_trials * 2)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Persist every trial, best first; the parameter-loading cell reads this file back.
    df = study.trials_dataframe().sort_values('value')
    df.to_csv('optuna_xgb.csv')

    display(df.head(10))

# Load XGBoost parameters

In [58]:
import glob

# Locate the Optuna trial dumps written by the XGBoost optimisation cell.
if CFG.kaggle:
    param_list = glob.glob("/kaggle/input/icr-optuna-no-da/optuna_xgb.csv")
else:
    param_list = glob.glob("optuna_xgb.csv")

models = list()
best_xgb_params = list()

# Read every dump once and concatenate in a single call.
xgb_param_frames = [pd.read_csv(f, index_col='Unnamed: 0') for f in param_list]

if xgb_param_frames:
    # Keep the hyper-parameter columns of the best (lowest-value) trials.
    xgb_params = pd.concat(xgb_param_frames).sort_values('value').head(CFG.n_stacking_models_xgb)
    param_cols = [c for c in xgb_params.columns if c.startswith('params_')]
    xgb_params = xgb_params[param_cols]
else:
    # No dump found: fail with a clear message only when the parameters are actually
    # needed (XGBoost training is optional), instead of an opaque KeyError from
    # sort_values('value').
    if CFG.xgb_train and not CFG.test:
        raise FileNotFoundError(
            "No optuna_xgb.csv found; run the XGBoost Optuna optimisation first or set CFG.test = True"
        )
    param_cols = []
    xgb_params = pd.DataFrame()

for idx, row in xgb_params.iterrows():
    # Strip the 'params_' prefix that Optuna's trials_dataframe() adds.
    row_dict = {k[len('params_'):]: v for k, v in row.items()}
    row_dict['n_estimators'] = CFG.n_estimators
    row_dict['early_stopping_rounds'] = CFG.early_stopping_rounds
    row_dict['random_state'] = 14062023
    row_dict['verbosity'] = 0
    row_dict['objective'] = "binary:logistic"
    row_dict['eval_metric'] = "logloss"
    row_dict['tree_method'] = "exact"
    # The booster is always forced to gbtree, so only the tree-booster handling is needed;
    # the former non-tree 'else' branch could never run and has been removed.
    row_dict['booster'] = "gbtree"

    # if not CFG.oversample and not CFG.undersample:
    row_dict['scale_pos_weight'] = class_imbalance

    # Values read back from CSV come as floats; restore the integer-typed parameters.
    row_dict["max_depth"] = int(row_dict["max_depth"])
    row_dict["min_child_weight"] = int(row_dict["min_child_weight"])

    # dart-only parameters are explicitly unset for gbtree
    for dart_key in ("sample_type", "normalize_type", "rate_drop", "skip_drop"):
        row_dict[dart_key] = None

    best_xgb_params.append(row_dict)

# Test mode: ignore the Optuna results and use one fixed parameter set.
if CFG.test:
    best_xgb_params = [{
            'n_estimators': CFG.n_estimators,
            'early_stopping_rounds': CFG.early_stopping_rounds,
            'objective': "binary:logistic",
            'scale_pos_weight': class_imbalance, 
            'verbosity': 0,
            'random_state': 19062023,
        }
    ]

# CatBoost Optuna optimization

In [59]:
X, y = train_df[features], train_df['Class']

def objective(trial):
    """Optuna objective for CatBoost: mean out-of-fold balanced log loss.

    For each of ``CFG.n_optimize_repeats`` repeats the training data is
    (optionally) resampled, split with ``MultilabelStratifiedKFold`` on the
    last three columns of the combined frame (the ``greeks`` columns), and a
    ``CatBoostClassifier`` is fitted per fold. One ``balanced_log_loss`` value
    is computed per repeat from the out-of-fold predictions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-repeat balanced log losses (lower is better).
    """
    bll_list = list()

    # Parameters
    params = {
        'task_type': 'CPU', # GPU
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'random_seed': 19062023,
        'od_type': 'Iter', # Type of overfitting detector - stop after k iterations
        'iterations': CFG.n_estimators, # trial.suggest_int('iterations', 300, 1200),
        'od_wait': CFG.early_stopping_rounds, # Overfitting detector - stop training after k iterations without metric improvement
        # 'metric_period': 100, # Show metric each k iterations
        # Hyperparameters (in order of decreasing importance)
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS', 'No']),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 3e-1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100, log=True),
        'depth': trial.suggest_int('depth', 4, 10),  # Max tree depth
        # increase to deal with overfit
        'random_strength': trial.suggest_float('random_strength', 0, 100), # The amount of randomness to use
                                                                           # for scoring splits when the tree structure
                                                                           # is selected. Helps to avoid overfitting
                                                                           # CPU only
        # per_float_feature_quantization='0:border_count=1024'
        'border_count': 254, # trial.suggest_categorical('border_count', [128, 254]), # The number of splits for numerical features
                                                                                      # bigger is better but slower
                                                                                      # alias: max_bin
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100), # Minimal number of data in one leaf
                                                                           # aliases: min_child_samples,
    }

    if not CFG.oversample and not CFG.undersample:
        # params['auto_class_weights'] = 'Balanced'
        params['scale_pos_weight'] = class_imbalance

    if params["bootstrap_type"] == "Bayesian":
        # Assigns random weights to objects; works only with Bayesian bootstrap
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 100)
    if params["bootstrap_type"] in ["Poisson", "Bernoulli", "MVS"]:
        # Percentage of objects to use at each split
        params["subsample"] = trial.suggest_float("subsample", 0.3, 1)

    if params['task_type'] == 'CPU' and params['bootstrap_type'] != 'Bayesian':
        # Percentage of features to use at each split;
        # with Bayesian bootstrap and Lossguide grow policy it leads to an error (CatBoost bug)
        params["colsample_bylevel"] = trial.suggest_float("colsample_bylevel", 0.3, 1)
    else:
        params["colsample_bylevel"] = None

    if params['grow_policy'] == 'Lossguide':
        # Max number of leaves in one tree; decrease to deal with overfit
        params['max_leaves'] = trial.suggest_int('max_leaves', 4, 128)

    if params['grow_policy'] == 'SymmetricTree':
        params['boosting_type'] = trial.suggest_categorical('boosting_type', ['Ordered', 'Plain'])
    else:
        params['boosting_type'] = 'Plain'

    # Does not change between repeats, so compute it once.
    positive_count_train = train_df['Class'].value_counts()[1]

    for i in range(CFG.n_optimize_repeats):
        # Reset the colour after the repeat number so it does not leak into later output.
        print(f'Repeat {blu}#{i+1}{res}')

        # Make random under- or oversampling to balance classes
        if CFG.undersample:
            sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train * 3,
                                                            1: positive_count_train},
                                         random_state=15062023+i,
                                         replacement=True)
        elif CFG.oversample:
            sampler = RandomOverSampler(random_state=2306020231)

        # Features plus the three greeks columns (the latter are used only for stratification).
        X_re, y_re = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class']

        # Undersampling is applied to the whole set, before the split.
        if CFG.undersample:
            X_re, y_re = sampler.fit_resample(X_re, y_re)

        # Create Stratified Multilabel k-Fold scheme
        kf = MultilabelStratifiedKFold(n_splits=CFG.n_optimize_folds, shuffle=True, random_state=10062023+i)

        # Create an oof array for inner loop
        oof = np.zeros(X_re.shape[0])

        # Model inputs are the same for every fold of this repeat.
        X_model, y_model = X_re[features], y_re

        # Stratify on the last three columns of X_re (the greeks columns)
        for train_idx, val_idx in kf.split(X=X_model, y=X_re.iloc[:,-3:]):
            X_train = X_model.iloc[train_idx]
            X_val = X_model.iloc[val_idx]
            y_train = y_model.iloc[train_idx]
            y_val = y_model.iloc[val_idx]

            # Oversampling is applied to the training part of the fold only.
            if CFG.oversample:
                X_train, y_train = sampler.fit_resample(X_train, y_train)

            train_pool = Pool(X_train, y_train, cat_features=['EJ'])
            val_pool = Pool(X_val, y_val, cat_features=['EJ'])

            # Learning (the validation fold drives the overfitting detector)
            model = cat.CatBoostClassifier(**params)
            model.fit(train_pool, eval_set=val_pool, verbose=0)

            # Predict
            oof[val_idx] = model.predict_proba(val_pool)[:,1]

        bll_list.append(balanced_log_loss(y_re, oof))

    return np.mean(bll_list)

if CFG.cb_optimize:
    # Run the CatBoost hyperparameter search (no pruner is attached) and
    # report the best trial.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=CFG.n_trials)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for param_name, param_value in trial.params.items():
        print(f"    {param_name}: {param_value}")

    # Persist every trial, best first, so the loading cell can pick the top-N.
    df = study.trials_dataframe().sort_values('value')
    df.to_csv('optuna_catboost.csv')

    display(df.head(10))

# Load CatBoost parameters

In [60]:
import glob

# Locate the Optuna trial export(s) for CatBoost.
if CFG.kaggle:
    param_list = glob.glob("/kaggle/input/icr-optuna-no-da/optuna_catboost.csv")
else:
    param_list = glob.glob("optuna_catboost.csv")

models = list()
best_cb_params = list()

# Read every export once and concatenate in a single call.
trial_frames = [pd.read_csv(f, index_col='Unnamed: 0') for f in param_list]
cb_params = pd.concat(trial_frames) if trial_frames else pd.DataFrame()

# Keep the top-N trials (lowest objective value) and only their 'params_*' columns.
cb_params = cb_params.sort_values('value').head(CFG.n_stacking_models_cb)
param_cols = [c for c in cb_params.columns if c.startswith('params_')]
cb_params = cb_params[param_cols]


for idx, row in cb_params.iterrows():
    # Strip the 'params_' prefix so keys match CatBoostClassifier argument names.
    row_dict = {k[7:]: v for k, v in row.items()}
    row_dict['task_type'] = 'CPU'
    row_dict['eval_metric'] = 'Logloss'
    row_dict['loss_function'] = 'Logloss'
    row_dict['random_seed'] = 13062023
    row_dict['verbose'] = 0
    row_dict['od_type'] = 'Iter'
    row_dict['iterations'] = CFG.n_estimators * 4
    row_dict['od_wait'] = CFG.early_stopping_rounds
    row_dict['border_count'] = 254

    if not CFG.oversample and not CFG.undersample:
        # row_dict['auto_class_weights'] = 'Balanced'
        row_dict['scale_pos_weight'] = class_imbalance

    # The Optuna objective tunes colsample_bylevel for CPU runs with a
    # non-Bayesian bootstrap and sets it to None otherwise. The previous check
    # (`task_type != "GPU"`) was always true here, because task_type is fixed
    # to 'CPU' above, so the tuned value was always discarded. Mirror the
    # objective instead; a missing/NaN value also falls back to None.
    if (row_dict["task_type"] == "GPU"
            or row_dict["bootstrap_type"] == "Bayesian"
            or pd.isna(row_dict.get('colsample_bylevel'))):
        row_dict['colsample_bylevel'] = None

    # Parameters that were not sampled for a trial come back as NaN from the
    # CSV; replace them with None so they are not passed as NaN.
    if row_dict["bootstrap_type"] != "Bayesian":
        row_dict['bagging_temperature'] = None

    if row_dict["bootstrap_type"] not in ["Poisson", "Bernoulli", "MVS"]:
        row_dict['subsample'] = None

    if row_dict['grow_policy'] == 'Lossguide':
        row_dict['max_leaves'] = int(row_dict['max_leaves'])
    else:
        row_dict['max_leaves'] = None

    if row_dict['grow_policy'] != 'SymmetricTree':
        row_dict['boosting_type'] = 'Plain'

    best_cb_params.append(row_dict)

if CFG.test:
    # Test mode: replace the tuned configurations with one default-ish model.
    best_cb_params = [{
            'iterations': CFG.n_estimators,
            'od_type': 'Iter',
            'od_wait': CFG.early_stopping_rounds,
            'eval_metric': "Logloss",
            'loss_function': "Logloss",
            'auto_class_weights': 'Balanced',
            'verbose': 0,
            'random_seed': 19062023,
        }
    ]

# Models train

In [61]:
def bll_metric(y_true, y_pred):
    """Custom eval metric passed to LGBMClassifier.fit via `eval_metric`.

    Returns a (name, value, flag) triple; the trailing False is presumably
    LightGBM's "is higher better" flag (lower loss is better) - TODO confirm.
    """
    score = balanced_log_loss(y_true, y_pred)
    return 'balanced_log_loss', score, False

def pp_prob(p):
    """Rebalance class probabilities by total predicted mass and return P(not class 0).

    Column 0 is divided by the sum of column 0 over all rows, every other
    column by the sum of all remaining columns; rows are then renormalised to
    sum to one and the non-zero-class columns are summed per row.
    """
    n_classes = p.shape[1]
    class0_mass = p[:, 0].sum()
    other_mass = p[:, 1:].sum()

    # One scaling factor per column, shaped (1, n_classes) for broadcasting.
    inv_mass = np.array([[1 / class0_mass] + [1 / other_mass for _ in range(1, n_classes)]])

    rescaled = p * inv_mass
    rescaled = rescaled / rescaled.sum(axis=1, keepdims=True)
    return rescaled[:, 1:].sum(axis=1)

def model_train(how, best_params):
    """Train one level-1 model per parameter set with multilabel-stratified K-fold CV.

    Parameters
    ----------
    how : str
        Model family: 'lgbm', 'xgboost', 'catboost' or 'tabpfn'. Any other
        value makes the function return ``(None, None)``.
    best_params : list
        One entry per model to train. Entries are passed as keyword arguments
        to the model constructor, except for 'tabpfn', where the entries are
        ignored and only their count matters.

    Returns
    -------
    (oof_level2, oof_level2_test)
        ``oof_level2`` has one column of out-of-fold predictions per entry of
        ``best_params`` plus a final column holding the target.
        ``oof_level2_test`` has one column per entry with the test
        predictions averaged over ``CFG.n_stacking_folds`` folds.
    """
    # Make random under-sampling to balance classes
    positive_count_train = train_df['Class'].value_counts()[1]
    if CFG.undersample:
        sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train * 3,
                                                        1: positive_count_train},
                                     random_state=150620231,
                                     replacement=True)
    elif CFG.oversample:
        sampler = RandomOverSampler(random_state=2306020231)

    # TabPFN is trained on `num_cols` of its own frames; every other model uses
    # `features`. The same column set must be used for fitting and for the
    # test matrix, so remember which one applies.
    if how == 'tabpfn':
        feature_cols = num_cols
        X_re, y_re, test = pd.concat([train_df_tabpfn[num_cols], greeks.iloc[:,1:4]], axis=1), train_df['Class'], test_df_tabpfn[num_cols]
    else:
        feature_cols = features
        X_re, y_re, test = pd.concat([train_df[features], greeks.iloc[:,1:4]], axis=1), train_df['Class'], test_df[features]

    # Undersampling is applied once to the whole set, before any split.
    if CFG.undersample:
        X_re, y_re = sampler.fit_resample(X_re, y_re)

    # Last column of oof_level2 carries the (possibly resampled) target.
    oof_level2 = np.zeros([y_re.shape[0], len(best_params) + 1])
    oof_level2[:, len(best_params)] = y_re
    oof_level2_test = np.zeros([test.shape[0], len(best_params)])

    # Model inputs are identical for every model and fold.
    X, y = X_re[feature_cols], y_re

    for i, params in tqdm(enumerate(best_params), total=len(best_params)):

        # A different split seed per model.
        kf = MultilabelStratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=80620231+i)

        print(f"Training with {blu}{len(feature_cols)}{res} features")

        # Stratify on the last three columns of X_re (the greeks columns).
        for fold, (fit_idx, val_idx) in enumerate(kf.split(X=X_re, y=X_re.iloc[:,-3:]), start = 1):
            # Split the dataset according to the fold indexes.
            X_train = X.iloc[fit_idx]
            X_val = X.iloc[val_idx]
            y_train = y.iloc[fit_idx]
            y_val = y.iloc[val_idx]

            # Oversampling is applied to the training part of the fold only.
            if CFG.oversample:
                X_train, y_train = sampler.fit_resample(X_train, y_train)

            if how == 'lgbm':
                model = lgb.LGBMClassifier(**params)
                model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=bll_metric, verbose=0)
                best_iter = model.best_iteration_
            elif how == 'xgboost':
                model = xgb.XGBClassifier(**params)
                model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
                best_iter = model.get_booster().best_iteration
            elif how == 'catboost':
                train_pool = Pool(X_train, y_train, cat_features=['EJ'])
                val_pool = Pool(X_val, y_val, cat_features=['EJ'])
                model = cat.CatBoostClassifier(**params)
                model.fit(train_pool, eval_set=val_pool, verbose=0)
                best_iter = model.best_iteration_
            elif how == 'tabpfn':
                model = TabPFNClassifier(N_ensemble_configurations=64, device='cuda:0')
                model.fit(X_train, y_train, overwrite_warning=True)
                best_iter = 0
            else:
                return None, None

            if how == 'tabpfn':
                # TabPFN probabilities are rebalanced with pp_prob.
                val_preds = pp_prob(model.predict_proba(X_val))
                oof_level2_test[:, i] += pp_prob(model.predict_proba(test))
            else:
                val_preds = model.predict_proba(X_val)[:,1]
                oof_level2_test[:, i] += model.predict_proba(test)[:,1]

            oof_level2[val_idx, i] = val_preds

            val_score = balanced_log_loss(y_val, val_preds)

            print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
                    f' | Best iteration: {blu}{best_iter:>4}{res}')

    # Test predictions were summed over folds; return their average.
    return oof_level2, oof_level2_test / CFG.n_stacking_folds


# Level-1 outputs collected per model family, in training order.
oof_train_list, oof_test_list = [], []


def _register_level1(oof_with_target, test_preds):
    """Store one family's OOF/test predictions and return its target column."""
    oof_train_list.append(oof_with_target[:, :-1])
    oof_test_list.append(test_preds)
    return oof_with_target[:, -1]


# `y` ends up being the target column returned by the last family trained.
if CFG.lgbm_train:
    oof_level2_lgbm, oof_level2_test_lgbm = model_train('lgbm', best_lgbm_params)
    y = _register_level1(oof_level2_lgbm, oof_level2_test_lgbm)

if CFG.xgb_train:
    oof_level2_xgb, oof_level2_test_xgb = model_train('xgboost', best_xgb_params)
    y = _register_level1(oof_level2_xgb, oof_level2_test_xgb)

if CFG.cb_train:
    oof_level2_cb, oof_level2_test_cb = model_train('catboost', best_cb_params)
    y = _register_level1(oof_level2_cb, oof_level2_test_cb)

if CFG.tabpfn_train:
    # TabPFN takes no tuned parameters; the list only sets how many models to train.
    oof_level2_tabpfn, oof_level2_test_tabpfn = model_train(
        'tabpfn', list(range(CFG.n_stacking_models_tabpfn)))
    y = _register_level1(oof_level2_tabpfn, oof_level2_test_tabpfn)

  0%|          | 0/40 [00:00<?, ?it/s]

Training with [1m[34m38[0m features
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.12301[0m | Best iteration: [1m[34m 182[0m
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.00278[0m | Best iteration: [1m[34m 494[0m
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.54705[0m | Best iteration: [1m[34m  73[0m
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.17000[0m | Best iteration: [1m[34m 138[0m
Fold: [1m[34m  5[0m| bll_metric: [1m[34m0.09815[0m | Best iteration: [1m[34m 263[0m
Fold: [1m[34m  6[0m| bll_metric: [1m[34m0.03804[0m | Best iteration: [1m[34m 670[0m
Fold: [1m[34m  7[0m| bll_metric: [1m[34m0.16580[0m | Best iteration: [1m[34m 430[0m
Fold: [1m[34m  8[0m| bll_metric: [1m[34m0.32528[0m | Best iteration: [1m[34m 193[0m
Fold: [1m[34m  9[0m| bll_metric: [1m[34m0.37580[0m | Best iteration: [1m[34m  66[0m
Fold: [1m[34m 10[0m| bll_metric: [1m[34m0.34359[0m | Best iteration: [1m[34m 179[0m
Fold: [1m[34m 11[0m| bll_met

In [62]:
# def model_train(how, best_params):

#     oof_level2 = np.zeros([train_df['Class'].shape[0], len(best_params) + 1])
#     oof_level2[:, len(best_params)] = train_df['Class']
#     oof_level2_test = np.zeros([test_df.shape[0], len(best_params)])
    
#     for i, params in tqdm(enumerate(best_params), total=len(best_params)):
        
#         if how == 'tabpfn':
#             X, y, test = train_df_tabpfn[num_cols], train_df['Class'], test_df_tabpfn[num_cols]
#         else:
#             X, y, test = train_df[features], train_df['Class'], test_df[features]
    
#         kf = MultilabelStratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=80620231+i)

#         print(f"Training with {blu}{len(features)}{res} features")

#         for fold, (fit_idx, val_idx) in enumerate(kf.split(X=X, y=greeks.iloc[:,1:4]), start=1):
#             # Split the dataset according to the fold indexes.
#             X_train = X.iloc[fit_idx]
#             X_val = X.iloc[val_idx]
#             y_train = y.iloc[fit_idx]
#             y_val = y.iloc[val_idx]

#             # undersample / oversample
#             if CFG.undersample or CFG.oversample:
#                 if CFG.undersample:
#                     positive_count_train = y_train.value_counts()[1]
#                     sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train * 3, 
#                                                             1: positive_count_train}, 
#                                                 random_state=150620231 + i, 
#                                                 replacement=True)
#                 elif CFG.oversample:
#                     sampler = RandomOverSampler(random_state=2306020231)

#                 X_train, y_train = sampler.fit_resample(X_train, y_train)
            
#             if how == 'catboost':
#                 train_pool = Pool(X_train, y_train, cat_features=['EJ'])
#                 val_pool = Pool(X_val, y_val, cat_features=['EJ'])           
            
#             if how == 'lgbm':
#                 model = lgb.LGBMClassifier(**params)
#                 model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=bll_metric, verbose=0)
#                 best_iter = model.best_iteration_
#             elif how == 'xgboost':
#                 model = xgb.XGBClassifier(**params)
#                 model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
#                 best_iter = model.get_booster().best_iteration
#             elif how == 'catboost':
#                 model = cat.CatBoostClassifier(**params)
#                 model.fit(train_pool, eval_set=val_pool, verbose=0)
#                 best_iter = model.best_iteration_
#             elif how == 'tabpfn':
#                 model = TabPFNClassifier(N_ensemble_configurations=64, device='cuda:0')
#                 model.fit(X_train, y_train, overwrite_warning=True)
#                 best_iter = 0
#             else:
#                 return None, None
                
#             if how == 'tabpfn':
#                 val_preds = pp_prob(model.predict_proba(X_val))
#                 oof_level2_test[:, i] += pp_prob(model.predict_proba(test))
#             else:
#                 val_preds = model.predict_proba(X_val)[:,1]
#                 oof_level2_test[:, i] += model.predict_proba(test)[:,1]
            
#             oof_level2[val_idx, i] = val_preds

#             val_score = balanced_log_loss(y_val, val_preds)
            
#             print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
#                     f' | Best iteration: {blu}{best_iter:>4}{res}') 
        
#     return oof_level2, oof_level2_test / CFG.n_stacking_folds

# oof_train_list = list()
# oof_test_list = list()

# if CFG.lgbm_train:
#     oof_level2_lgbm, oof_level2_test_lgbm = model_train('lgbm', best_lgbm_params)
#     oof_train_list.append(oof_level2_lgbm[:,:-1])
#     oof_test_list.append(oof_level2_test_lgbm)
#     y = oof_level2_lgbm[:,-1]

# if CFG.xgb_train:
#     oof_level2_xgb, oof_level2_test_xgb = model_train('xgboost', best_xgb_params)
#     oof_train_list.append(oof_level2_xgb[:,:-1])
#     oof_test_list.append(oof_level2_test_xgb)
#     y = oof_level2_xgb[:,-1]

# if CFG.cb_train:
#     oof_level2_cb, oof_level2_test_cb = model_train('catboost', best_cb_params)
#     oof_train_list.append(oof_level2_cb[:,:-1])
#     oof_test_list.append(oof_level2_test_cb)
#     y = oof_level2_cb[:,-1]

# if CFG.tabpfn_train:
#     oof_level2_tabpfn, oof_level2_test_tabpfn = model_train('tabpfn', [i for i in range(CFG.n_stacking_models_tabpfn)])
#     oof_train_list.append(oof_level2_tabpfn[:,:-1])
#     oof_test_list.append(oof_level2_test_tabpfn)
#     y = oof_level2_tabpfn[:,-1]

# Stacking with Logistic Regression

In [63]:
from sklearn.linear_model import LogisticRegression

# Level-2 matrices: one column per level-1 model, all families side by side.
oof_level2 = np.hstack(oof_train_list)
oof_level2_test = np.hstack(oof_test_list)

X = oof_level2

# Baseline: balanced log loss of the plain average of all level-1 models.
print(balanced_log_loss(y, X.mean(axis=1)))

# Stacker: class-balanced logistic regression on the OOF predictions.
lr = LogisticRegression(class_weight='balanced').fit(X, y)

# NOTE: scored on the same rows the stacker was fitted on (in-sample).
pred = lr.predict_proba(X)[:, 1]
print(balanced_log_loss(y, pred))

# Per-model coefficients of the stacker.
weights = lr.coef_[0]

0.18130449603081458
0.13249114243596113


In [64]:
# full dataset with class balance 3
# 0.18130449603081458
# 0.13249114243596113

# undersample with class balance 3
# 0.11174849200860162
# 0.09973545010826133

# undersample with class balance 3 and different undersampling for every model (validation was not undersampled)
# 0.20137094391536337
# 0.11640872638477567


In [65]:
# LGBM + CatBoost + XGBoost + TabPFN
# 0.17472710249039772
# 0.09683636947851168

# LGBM oversampled 10 folds
# 0.1899300127485599
# 0.1321074079639491

# LGBM + CatBoost + XGBoost + TabPFN + different k-fold for every model
# 0.17616782811158316
# 0.09637187593530008

# LGBM + CatBoost + XGBoost + TabPFN + different k-fold for every model + 20 folds
# 0.16113327662498975
# 0.08182251667662065

# LGBM 20 folds
# 0.13372873188516746
# 0.09363332625161021

# Which objects are the most erroneous?

In [66]:
# Absolute error of the averaged level-1 prediction, largest first;
# show the row positions in the top 1% of errors.
preds = X.mean(axis=1)
errors = pd.Series(np.abs(y - preds)).sort_values(ascending=False)
errors.loc[errors >= errors.quantile(0.99)].index.tolist()


[509, 292, 313, 102, 408, 337, 380]

# Find the best class threshold

In [67]:
def pp_prob3(_oof, _p, num=1.5):
    """Apply the same monotone rescaling to OOF and test probabilities.

    Each value v is mapped to num * v / ((num - 1) * v + 1): num > 1 pushes
    predictions up, num < 1 pushes them down, num == 1 leaves them unchanged.
    Returns the pair (rescaled _oof, rescaled _p).
    """
    def _rescale(prob):
        return num * prob / ((num - 1) * prob + 1)

    return _rescale(_oof), _rescale(_p)

def inflate_preds(_y, _oof, _p):
    """Grid-search the pp_prob3 multiplier that minimises balanced log loss on OOF.

    Tries 100 evenly spaced multipliers in [0.05, 5], scores the rescaled
    out-of-fold predictions against `_y`, prints the winner and returns the
    rescaled (oof, test) predictions for it. The first best multiplier wins ties.
    """
    best_score, best_num = np.inf, None
    best_oof = best_p = None

    for num in np.linspace(0.05, 5, 100):
        trial_oof, trial_p = pp_prob3(_oof, _p, num)
        trial_score = balanced_log_loss(_y, trial_oof)
        # Only a strict improvement replaces the current best.
        if not trial_score < best_score:
            continue
        best_score, best_num = trial_score, num
        best_oof, best_p = trial_oof, trial_p

    print('best num:', round(best_num, 2), '/ best score:', best_score)
    return best_oof, best_p

# Predict test

In [68]:
def predict(X):
    """Return the unweighted average of all level-1 test predictions.

    `X` is used only for its length; the predictions themselves come from the
    module-level `oof_level2_test` matrix (one column per level-1 model).
    """
    total = np.zeros(len(X))
    n_models = oof_level2_test.shape[1]
    # Accumulate column by column (the transpose iterates over model columns).
    for model_preds in oof_level2_test.T:
        total += model_preds
    return total / n_models

# Averaged level-1 prediction for the test set.
predictions = predict(test_df[features])

# Optionally rescale with the multiplier that is best on the OOF average.
if CFG.adjust_class_threshold:
    _, predictions = inflate_preds(y, X.mean(axis=1), predictions)

# Store both class probabilities on test_df (class_1 first, then its complement).
for column, values in (('class_1', predictions), ('class_0', 1 - predictions)):
    test_df[column] = values

submission_cols = ['class_0', 'class_1']
sample_submission[submission_cols] = test_df[submission_cols]
sample_submission.to_csv("submission.csv", index=False)
sample_submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.892822,0.107178
1,010ebe33f668,0.892822,0.107178
2,02fa521e1838,0.892822,0.107178
3,040e15f562a2,0.892822,0.107178
4,046e85c7cc7f,0.892822,0.107178
