## LGBM load parameters

In [None]:
import glob

# Locate the CSV export of the Optuna study that tuned LightGBM.
if CFG.kaggle:
    param_pattern = "/kaggle/input/icr-lgbm-optuna/optuna_lgbm.csv"
else:
    param_pattern = "optuna_lgbm.csv"
param_list = glob.glob(param_pattern)

# Fail fast with a readable message: without any file the frame below would be
# empty and `sort_values('value')` would die with an opaque KeyError instead.
if not param_list:
    raise FileNotFoundError(
        f"No LightGBM Optuna parameter file matches {param_pattern!r}"
    )

models = list()
best_lgbm_params = list()

# One concat over every matching file instead of growing a frame in a loop.
lgbm_params = pd.concat(
    [pd.read_csv(f, index_col='Unnamed: 0') for f in param_list]
)

# Keep the trials with the smallest `value` (presumably a minimised loss --
# confirm against the study direction) and only their `params_*` columns.
lgbm_params = lgbm_params.sort_values('value').head(CFG.n_stacking_models)
param_cols = [c for c in lgbm_params.columns if c.startswith('params_')]
lgbm_params = lgbm_params[param_cols]

for idx, row in lgbm_params.iterrows():
    # Strip the 'params_' prefix so the keys become estimator keyword names.
    row_dict = {k[len('params_'):]: v for k, v in row.items()}

    # Fixed settings that override whatever the trial recorded.
    row_dict.update({
        'objective': 'binary',
        'metric': 'none',
        # 'subsample_for_bin': 300000,
        'force_col_wise': False,
        'n_estimators': CFG.n_estimators,
        'early_stopping_round': CFG.early_stopping_rounds,
        'boosting_type': 'goss',
        'verbose': -1,
        'max_bin': 255,
    })

    # Values read back from the CSV row are not guaranteed to be Python ints,
    # so the integer-valued parameters are cast explicitly.
    for int_key in ('num_leaves', 'max_depth', 'min_data_in_leaf', 'bagging_freq'):
        row_dict[int_key] = int(row_dict[int_key])

    # The tuned learning rate is deliberately replaced by one hard-coded value
    # (original expression: float(row_dict['learning_rate'])).
    row_dict['learning_rate'] = 0.06433232950390658

    if not CFG.undersample:
        # No resampling requested, so class balancing is delegated to the model.
        row_dict['is_unbalance'] = True
        row_dict['class_weight'] = 'balanced'
        # row_dict['scale_pos_weight'] = class_imbalance

    if row_dict['boosting_type'] == 'goss':
        # Row subsampling is switched off for GOSS boosting.
        row_dict['subsample'] = None

    best_lgbm_params.append(row_dict)

## CatBoost load parameters

In [None]:
import glob

# Locate the CSV export of the Optuna study that tuned CatBoost.
if CFG.kaggle:
    param_pattern = "/kaggle/input/icr-lgbm-optuna/optuna_catboost.csv"
else:
    param_pattern = "optuna_catboost.csv"
param_list = glob.glob(param_pattern)

# Fail fast with a readable message: without any file the frame below would be
# empty and `sort_values('value')` would die with an opaque KeyError instead.
if not param_list:
    raise FileNotFoundError(
        f"No CatBoost Optuna parameter file matches {param_pattern!r}"
    )

models = list()
best_cb_params = list()

# One concat over every matching file instead of growing a frame in a loop.
cb_params = pd.concat(
    [pd.read_csv(f, index_col='Unnamed: 0') for f in param_list]
)

# Keep the trials with the smallest `value` (presumably a minimised loss --
# confirm against the study direction) and only their `params_*` columns.
cb_params = cb_params.sort_values('value').head(CFG.n_stacking_models)
param_cols = [c for c in cb_params.columns if c.startswith('params_')]
cb_params = cb_params[param_cols]


for idx, row in cb_params.iterrows():
    # Strip the 'params_' prefix so the keys become estimator keyword names.
    row_dict = {k[len('params_'):]: v for k, v in row.items()}

    # Fixed settings that override whatever the trial recorded.
    row_dict.update({
        'task_type': 'CPU',
        'eval_metric': 'Logloss',
        'loss_function': 'Logloss',
        'random_seed': 13062023,
        'verbose': 0,
        'od_type': 'Iter',
        'iterations': 10000,  # CFG.n_estimators
        'od_wait': CFG.early_stopping_rounds,
        'border_count': 254,
    })

    if not CFG.undersample:
        # No resampling requested, so class balancing is delegated to the model.
        row_dict['auto_class_weights'] = 'Balanced'
        # row_dict['scale_pos_weight'] = class_imbalance

    # The remaining rules blank out parameters that do not apply to the chosen
    # configuration (presumably mirroring the conditional search space of the
    # Optuna objective -- confirm against the tuning notebook).
    if row_dict["task_type"] != "GPU":
        row_dict['colsample_bylevel'] = None

    if row_dict["bootstrap_type"] != "Bayesian":
        row_dict['bagging_temperature'] = None

    if row_dict["bootstrap_type"] not in ["Poisson", "Bernoulli", "MVS"]:
        row_dict['subsample'] = None

    if row_dict['grow_policy'] == 'Lossguide':
        row_dict['max_leaves'] = int(row_dict['max_leaves'])
    else:
        row_dict['max_leaves'] = None

    if row_dict['grow_policy'] != 'SymmetricTree':
        row_dict['boosting_type'] = 'Plain'

    best_cb_params.append(row_dict)

## XGBoost load parameters

In [None]:
import glob

# Locate the CSV export of the Optuna study that tuned XGBoost.
if CFG.kaggle:
    param_pattern = "/kaggle/input/icr-lgbm-optuna/optuna_xgb.csv"
else:
    param_pattern = "optuna_xgb.csv"
param_list = glob.glob(param_pattern)

# Fail fast with a readable message: without any file the frame below would be
# empty and `sort_values('value')` would die with an opaque KeyError instead.
if not param_list:
    raise FileNotFoundError(
        f"No XGBoost Optuna parameter file matches {param_pattern!r}"
    )

models = list()
best_xb_params = list()

# One concat over every matching file instead of growing a frame in a loop.
xb_params = pd.concat(
    [pd.read_csv(f, index_col='Unnamed: 0') for f in param_list]
)

# Keep the trials with the smallest `value` (presumably a minimised loss --
# confirm against the study direction) and only their `params_*` columns.
xb_params = xb_params.sort_values('value').head(CFG.n_stacking_models)
param_cols = [c for c in xb_params.columns if c.startswith('params_')]
xb_params = xb_params[param_cols]

for idx, row in xb_params.iterrows():
    # Strip the 'params_' prefix so the keys become estimator keyword names.
    row_dict = {k[len('params_'):]: v for k, v in row.items()}

    # Fixed settings that override whatever the trial recorded.
    row_dict.update({
        'n_estimators': CFG.n_estimators,
        'early_stopping_rounds': CFG.early_stopping_rounds,
        'random_state': 14062023,
        'verbosity': 0,
        'objective': "binary:logistic",
        'eval_metric': "logloss",
        'tree_method': "exact",
        'booster': "gbtree",
    })

    if not CFG.undersample:
        # No resampling requested, so the positive class is re-weighted instead.
        row_dict['scale_pos_weight'] = class_imbalance

    # `booster` is pinned to "gbtree" above, so only the first branch runs
    # today; the other branches are kept for the day the booster is tuned.
    if row_dict["booster"] in ["gbtree", "dart"]:
        # Tree boosters need integer-valued depth / child weight.
        row_dict["max_depth"] = int(row_dict["max_depth"])
        row_dict["min_child_weight"] = int(row_dict["min_child_weight"])
    else:
        # Tree-only parameters are blanked out for any other booster.
        for tree_only_key in ("learning_rate", "max_depth", "min_child_weight",
                              "gamma", "grow_policy"):
            row_dict[tree_only_key] = None

    if row_dict["booster"] != "dart":
        # Dropout-specific parameters only make sense for the dart booster.
        for dart_only_key in ("sample_type", "normalize_type", "rate_drop", "skip_drop"):
            row_dict[dart_only_key] = None

    best_xb_params.append(row_dict)

## Train any model with cross-validation

In [None]:
def bll_metric(y_true, y_pred):
    """Custom evaluation metric wrapping `balanced_log_loss`.

    Returns the triple ``(name, value, is_higher_better)``; the last element
    is ``False``, i.e. smaller values are better.
    """
    score = balanced_log_loss(y_true, y_pred)
    return 'balanced_log_loss', score, False

def pp_prob(p):
    """Rebalance class probabilities and return the non-class-0 probability.

    Column 0 of ``p`` is divided by the total mass of column 0 and every other
    column by the total mass of columns 1 and up; each row is then normalised
    to sum to one.

    Args:
        p: 2-D array of per-row class probabilities.

    Returns:
        1-D array with, per row, the summed rebalanced probability of all
        columns except the first.
    """
    n_classes = p.shape[1]
    class0_mass = p[:, 0].sum()
    other_mass = p[:, 1:].sum()

    # One divisor per column: class 0 gets its own, the rest share one.
    divisors = np.array([[class0_mass] + [other_mass] * (n_classes - 1)])
    rebalanced = p * (1 / divisors)

    # Make every row a probability distribution again.
    rebalanced = rebalanced / rebalanced.sum(axis=1, keepdims=True)
    return rebalanced[:, 1:].sum(axis=1)

def model_train(how, best_params):
    """Train one model per entry of `best_params` and collect level-2 features.

    Args:
        how: Model family selector: 'lgbm', 'xgboost', 'catboost' or 'tabpfn'.
        best_params: Sequence with one entry per model. For the boosted
            families each entry is unpacked as estimator keyword arguments;
            for 'tabpfn' the entries themselves are not used, only their
            count and position.

    Returns:
        A tuple ``(oof_level2, oof_level2_test)``:
        * ``oof_level2`` has one row per row of ``train_df`` and
          ``len(best_params) + 1`` columns: the validation-fold predictions
          of each model, followed by ``train_df['Class']`` in the last column.
        * ``oof_level2_test`` has one row per row of ``test_df`` and one
          column per model: test predictions summed over folds and divided
          by ``max(CFG.n_stacking_folds, 1)``.
        ``(None, None)`` is returned as soon as an unknown `how` is seen.

    Reads the notebook-level names ``train_df``, ``test_df``,
    ``train_df_tabpfn``, ``test_df_tabpfn``, ``num_cols``, ``features``,
    ``greeks``, ``class_imbalance``, ``CFG``, ``blu`` and ``res``.
    """

    # Last column carries the target so callers can split features / labels.
    oof_level2 = np.zeros([train_df['Class'].shape[0], len(best_params) + 1])
    oof_level2[:, len(best_params)] = train_df['Class']
    oof_level2_test = np.zeros([test_df.shape[0], len(best_params)])
    
    for i, params in tqdm(enumerate(best_params), total=len(best_params)):
        
        # TabPFN is fed its own frames restricted to `num_cols`.
        if how == 'tabpfn':
            X, y, test = train_df_tabpfn[num_cols], train_df['Class'], test_df_tabpfn[num_cols]
        else:
            X, y, test = train_df[features], train_df['Class'], test_df[features]
    
        if CFG.n_stacking_folds > 0:
            # The splitter seed is offset by the model index, so every model
            # is trained on a different fold assignment.
            if CFG.k_fold:
                kf = KFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=80620231+i)
                y_fold = y
            elif CFG.strat_k_fold:
                kf = StratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=80620231+i)
                y_fold = y
            else:
                # Stratify on columns 1..3 of `greeks` instead of the target.
                kf = MultilabelStratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=80620231+i)
                y_fold = greeks.iloc[:,1:4]
            
            # NOTE(review): this reports len(features) even when how == 'tabpfn',
            # where the model is actually fed `num_cols`.
            print(f"Training with {blu}{len(features)}{res} features")

            for fold, (fit_idx, val_idx) in enumerate(kf.split(X=X, y=y_fold)):
                # Split the dataset according to the fold indexes.
                X_train = X.iloc[fit_idx]
                X_val = X.iloc[val_idx]
                y_train = y.iloc[fit_idx]
                y_val = y.iloc[val_idx]

                # Make random under- or oversampling to balance classes
                # (only the fitting part is resampled; validation is untouched).
                if CFG.undersample or CFG.oversample:
                    if CFG.undersample:
                        # Keep every positive and `class_imbalance` times as
                        # many negatives.
                        positive_count_train = y_train.value_counts()[1]
                        sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train * class_imbalance, 
                                                                        1: positive_count_train}, 
                                                    random_state=15062023+i, 
                                                    replacement=True)
                    elif CFG.oversample:
                        # Keep every negative and raise the positives to
                        # negatives // class_imbalance.
                        negative_count_train = y_train.value_counts()[0]
                        sampler = RandomOverSampler(sampling_strategy={0: negative_count_train, 
                                                                    1: negative_count_train // class_imbalance}, 
                                                    random_state=2306020231+i)

                    X_train, y_train = sampler.fit_resample(X_train, y_train)
                
                # Fit the requested model family; `best_iter` is only used
                # for the progress print below.
                if how == 'lgbm':
                    model = lgb.LGBMClassifier(**params)
                    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=bll_metric, verbose=0)
                    best_iter = model.best_iteration_
                elif how == 'xgboost':
                    model = xgb.XGBClassifier(**params)
                    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
                    best_iter = model.get_booster().best_iteration
                elif how == 'catboost':
                    # 'EJ' is passed to CatBoost as a categorical feature.
                    train_pool = Pool(X_train, y_train, cat_features=['EJ'])
                    val_pool = Pool(X_val, y_val, cat_features=['EJ'])   
                    model = cat.CatBoostClassifier(**params)
                    model.fit(train_pool, eval_set=val_pool, verbose=0)
                    best_iter = model.best_iteration_
                elif how == 'tabpfn':
                    model = TabPFNClassifier(N_ensemble_configurations=64, device='cuda:0')
                    model.fit(X_train, y_train, overwrite_warning=True)
                    best_iter = 0
                else:
                    return None, None
                    
                # TabPFN probabilities go through `pp_prob`; the other
                # families use the raw positive-class column.
                if how == 'tabpfn':
                    val_preds = pp_prob(model.predict_proba(X_val))
                    oof_level2_test[:, i] += pp_prob(model.predict_proba(test))
                else:
                    val_preds = model.predict_proba(X_val)[:,1]
                    oof_level2_test[:, i] += model.predict_proba(test)[:,1]
                
                # Out-of-fold predictions land in the rows of this fold.
                oof_level2[val_idx, i] = val_preds

                val_score = balanced_log_loss(y_val, val_preds)
                
                print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
                        f' | Best iteration: {blu}{best_iter:>4}{res}')  
        else:
            # No cross-validation: fit once on the full training data. The
            # out-of-fold columns of `oof_level2` stay at zero in this mode.
            if how == 'lgbm':
                model = lgb.LGBMClassifier(**params)
                model.fit(X, y, verbose=0)
            elif how == 'xgboost':
                model = xgb.XGBClassifier(**params)
                model.fit(X, y, verbose=0)
            elif how == 'catboost':
                train_pool = Pool(X, y, cat_features=['EJ'])
                model = cat.CatBoostClassifier(**params)
                model.fit(train_pool, verbose=0)
            elif how == 'tabpfn':
                model = TabPFNClassifier(N_ensemble_configurations=64, device='cuda:0')
                model.fit(X, y, overwrite_warning=True)
            else:
                return None, None

            # NOTE(review): unlike the cross-validated path, TabPFN output is
            # not passed through `pp_prob` here -- confirm this is intended.
            oof_level2_test[:, i] += model.predict_proba(test)[:,1]
        
    # Test predictions were summed over folds; turn the sum into a mean.
    return oof_level2, oof_level2_test / max(CFG.n_stacking_folds, 1)

# Level-1 training driver: for every enabled model family, collect the
# out-of-fold train predictions (without the trailing target column) and the
# averaged test predictions. `y` ends up holding the target column returned by
# the last family that was trained.
oof_train_list = list()
oof_test_list = list()

if CFG.lgbm_train:
    oof_level2_lgbm, oof_level2_test_lgbm = model_train('lgbm', best_lgbm_params)
    oof_train_list.append(oof_level2_lgbm[:,:-1])
    oof_test_list.append(oof_level2_test_lgbm)
    y = oof_level2_lgbm[:,-1]

if CFG.xgb_train:
    # The XGBoost loading cell stores its parameter sets in `best_xb_params`;
    # the previously referenced `best_xgb_params` is not defined there.
    oof_level2_xgb, oof_level2_test_xgb = model_train('xgboost', best_xb_params)
    oof_train_list.append(oof_level2_xgb[:,:-1])
    oof_test_list.append(oof_level2_test_xgb)
    y = oof_level2_xgb[:,-1]

if CFG.cb_train:
    oof_level2_cb, oof_level2_test_cb = model_train('catboost', best_cb_params)
    oof_train_list.append(oof_level2_cb[:,:-1])
    oof_test_list.append(oof_level2_test_cb)
    y = oof_level2_cb[:,-1]

if CFG.tabpfn_train:
    # TabPFN takes no tuned parameters; the list only fixes how many models
    # are trained and the index used to offset the seeds.
    oof_level2_tabpfn, oof_level2_test_tabpfn = model_train('tabpfn', list(range(CFG.n_stacking_models_tabpfn)))
    oof_train_list.append(oof_level2_tabpfn[:,:-1])
    oof_test_list.append(oof_level2_test_tabpfn)
    y = oof_level2_tabpfn[:,-1]

# Stacking LGBM + CatBoost + XGBoost with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Level-2 matrices: the per-family prediction blocks placed side by side,
# one column per level-1 model.
oof_level2 = np.concatenate(oof_train_list, axis=1)
oof_level2_test = np.concatenate(oof_test_list, axis=1)
X = oof_level2

# Baseline score: balanced log loss of the plain average of all models.
print(balanced_log_loss(y, X.mean(axis=1)))

# Meta-learner: class-balanced logistic regression on the stacked predictions.
lr = LogisticRegression(class_weight='balanced')
lr.fit(X, y)
pred = lr.predict_proba(X)[:, 1]

# Balanced log loss of the meta-learner, measured on the rows it was fit on.
print(balanced_log_loss(y, pred))

# One coefficient per level-1 model.
weights = lr.coef_[0]

# LGBM + TSS + Stacking

## Create metafeatures for the train set

In [None]:
# Date blocks whose rows make up the second-level training set.
dates_level2 = [28, 29, 30, 31, 32, 33]

# Date block of every training row.
dates_train = X_train.date_block_num
# dates_train_level2 = dates_train[dates_train.isin(dates)]

# Target of the second-level dataset: the rows that fall in `dates_level2`.
y_train_level2 = y_train[dates_train.isin(dates_level2)]

# Second-level feature matrix, initialised with zeros: one column per
# parameter set plus a final column holding the ground truth.
X_train_level2 = np.zeros([y_train_level2.shape[0], len(best_params)+1])
X_train_level2[:, len(best_params)] = y_train_level2

# Row window of `X_train_level2` that belongs to the current date block.
# NOTE(review): filling block after block only lines up with the ground-truth
# column if the rows of `X_train` are ordered by date block -- confirm.
meta_index_begin = 0
meta_index_end = 0

# Fill `X_train_level2` with metafeatures: for each block, train on all
# earlier blocks and predict the block itself.
for cur_block_num in tqdm_notebook(dates_level2):

    # split data
    train_index = X_train.loc[dates_train <  cur_block_num].index
    test_index  = X_train.loc[dates_train == cur_block_num].index

    X_train_l2 = X_train.loc[train_index, :]
    X_test_l2 =  X_train.loc[test_index, :]

    y_train_l2 = y_train[train_index]
    y_test_l2 =  y_train[test_index]

    meta_index_end += y_test_l2.shape[0]

    # predict metafeatures for each of the LGBM regressors
    for i, params in enumerate(tqdm_notebook(best_params)):
        # Deliberately not named `lgb`: that name is the LightGBM module alias
        # used by `model_train`, and rebinding then deleting it here would
        # break every later `lgb.LGBMClassifier(...)` call.
        regressor = LGBMRegressor(**params)
        regressor.fit(X_train_l2, y_train_l2)
        pred = regressor.predict(X_test_l2)
        X_train_level2[meta_index_begin:meta_index_end, i] = pred

        # Release the fitted model before training the next one.
        del regressor, pred
        gc.collect()

    meta_index_begin = meta_index_end

X_train_level2 = pd.DataFrame(X_train_level2, columns=[f'lgbr_{i+1}' for i in range(len(best_params))]+['gt'])
X_train_level2.to_pickle('LGBM_X_train_level2.pkl')

## Create metafeatures for the test set

In [None]:
# Second-level test matrix: one column of predictions per parameter set.
X_test_level2 = np.zeros([X_test.shape[0], len(best_params)])

for i, params in enumerate(tqdm_notebook(best_params)):
    # Deliberately not named `lgb`: that name is the LightGBM module alias
    # used by `model_train`, and rebinding then deleting it here would break
    # every later `lgb.LGBMClassifier(...)` call.
    regressor = LGBMRegressor(**params)
    # Fit on the complete training set, then predict the test set.
    regressor.fit(X_train, y_train)
    pred = regressor.predict(X_test)
    X_test_level2[:, i] = pred

    # Release the fitted model before training the next one.
    del regressor, pred
    gc.collect()

X_test_level2 = pd.DataFrame(X_test_level2, columns=[f'lgbr_{i+1}' for i in range(len(best_params))])
X_test_level2.to_pickle('LGBM_X_test_level2.pkl')

## Stacking with Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Split the second-level frame: every column except 'gt' is a metafeature,
# 'gt' is the ground truth.
X = X_train_level2.loc[:, X_train_level2.columns != 'gt']
y = X_train_level2['gt']

# Fit the linear meta-model and predict the test metafeatures.
lr = LinearRegression()
lr.fit(X, y)
pred = lr.predict(X_test_level2)