# Permutation feature selection + LGBM


In [None]:
import math

import numpy as np
import pandas as pd

import lightgbm as lgb

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, BaseShuffleSplit, _validate_shuffle_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import log_loss

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

import eli5
from IPython.display import display
from eli5.permutation_importance import get_score_importances
from eli5.sklearn import PermutationImportance

from shaphypetune import BoostBoruta

import matplotlib.pyplot as plt
import seaborn as sns

import itertools
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings('ignore')

from colorama import Style, Fore

# Hex colour codes intended for plots.
palette = [
    '#302c36', '#037d97', '#E4591E', '#C09741',
    '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2',
]

# Terminal colour prefixes (bright black / red / blue) used inside f-strings,
# plus the sequence that resets the terminal style afterwards.
blk, red, blu = (
    Style.BRIGHT + shade for shade in (Fore.BLACK, Fore.RED, Fore.BLUE)
)
res = Style.RESET_ALL

# Generate all pairwise feature combinations (brute force)

In [2]:
# Brute-force feature generation: for every unordered pair of base columns
# (everything in train_df except 'f_1' and 'target') derive arithmetic,
# squared, square-root and log combinations.
features = train_df.drop(['f_1', 'target'], axis=1).columns

# Using every base feature at once usually takes too long; restrict `features`
# (or comment out some of the transformations below) and select in batches.
#
# Derived columns are collected in a dict and the frame is built once at the
# end; inserting columns one at a time into a DataFrame fragments it and gets
# slow as the column count grows. A key written again for a later pair simply
# overwrites the identical earlier value and keeps its original position.
#
# NOTE(review): division, sqrt and log are applied unconditionally, so columns
# containing zeros or negative values will produce inf/NaN entries here.
derived = {}

for fe_a, fe_b in itertools.combinations(features, 2):
    col_a, col_b = train_df[fe_a], train_df[fe_b]

    # Basic arithmetic between the two columns.
    derived[f'{fe_a}+{fe_b}'] = col_a + col_b
    derived[f'{fe_a}-{fe_b}'] = col_a - col_b
    derived[f'{fe_a}*{fe_b}'] = col_a * col_b
    derived[f'{fe_a}/{fe_b}'] = col_a / col_b

    # Squares, alone and crossed with the other column.
    sq_a, sq_b = col_a.pow(2), col_b.pow(2)
    derived[f'{fe_a}*{fe_b}_2'] = col_a * sq_b
    derived[f'{fe_a}_2*{fe_b}'] = sq_a * col_b
    derived[f'{fe_a}_2'] = sq_a  # original read the undefined name `rain_df` here
    derived[f'{fe_b}_2'] = sq_b

    # Square roots, alone and crossed with the other column.
    sqrt_a, sqrt_b = col_a.pow(0.5), col_b.pow(0.5)
    derived[f'{fe_a}_05'] = sqrt_a
    derived[f'{fe_b}_05'] = sqrt_b
    derived[f'{fe_a}*{fe_b}_05'] = col_a * sqrt_b
    derived[f'{fe_a}_05*{fe_b}'] = sqrt_a * col_b

    # Natural logs, alone and crossed with the other column.
    log_a, log_b = np.log(col_a), np.log(col_b)
    derived[f'{fe_a}_log'] = log_a
    derived[f'{fe_b}_log'] = log_b
    derived[f'{fe_a}*{fe_b}_log'] = col_a * log_b
    derived[f'{fe_a}_log*{fe_b}'] = log_a * col_b

generated_features = pd.DataFrame(derived)

# Nested CV + LGBM importance + Permutation importance + Boruta SHAP

In [None]:
class CFG:
    """Cross-validation settings shared by the notebook."""

    # Number of splits used by each K-fold splitter.
    n_folds: int = 5
    # Number of times the whole cross-validation procedure is repeated.
    n_repeats: int = 4

# Keyword arguments unpacked into lgb.LGBMClassifier(**params).
# NOTE(review): 'early_stopping_round' is set here while the fit() calls in this
# notebook also pass early_stopping_rounds=300 -- confirm which one is intended.
params = dict(
    boosting_type='goss',
    learning_rate=0.06733232950390658,
    n_estimators=50000,
    early_stopping_round=100,
    subsample=0.6970532011679706,
    colsample_bytree=0.6055755840633003,
    num_leaves=6,
    class_weight='balanced',
    metric='none',
    is_unbalance=True,
    random_state=8062023,
    feature_fraction_seed=8062023,
    bagging_seed=8062023,
    max_depth=8,
    reg_alpha=0.08866046540248787,
    reg_lambda=1.0245261859148395e-06,
    importance_type='gain',
)

def lgbm_tuning(features, permut=False, boruta=False):
    """Run repeated nested cross-validation with LightGBM and sum feature importances.

    For each of ``CFG.n_repeats`` repeats the data is split into
    ``CFG.n_folds`` outer folds; each outer validation fold is used as a
    hold-out set. On the remaining rows an inner ``CFG.n_folds``-fold
    StratifiedKFold trains one LGBMClassifier per fold. The inner models are
    scored out-of-fold and their averaged probabilities are scored on the
    hold-out set. Importances are summed (not averaged) over every inner fit.

    Relies on names defined outside this function: ``train_df``, ``greeks``,
    ``CFG``, ``params``, ``balanced_log_loss``, ``bll_metric`` and
    ``MultilabelStratifiedKFold``.

    Args:
        features: Column labels of ``train_df`` to use as model inputs.
        permut: If True, also accumulate eli5 permutation importances computed
            on each inner validation fold.
        boruta: If True, also fit a BoostBoruta selector on each inner fold and
            accumulate its ``ranking_``.

    Returns:
        Tuple ``(perm_df_, feature_importances_, boruta_df_, inner_mean, outer_mean)``:
        the summed permutation importances (empty frame when ``permut`` is
        False), the summed tree importances with columns ``Value`` and
        ``Feature``, the summed Boruta rankings (empty frame when ``boruta``
        is False), and the mean inner out-of-fold / outer hold-out scores.
    """
    metric = balanced_log_loss
    base_seed = 8062023

    outer_cv_score = []  # hold-out score of every outer fold
    inner_cv_score = []  # out-of-fold score of every inner CV run

    perm_df_ = pd.DataFrame()
    feature_importances_ = pd.DataFrame()
    boruta_df_ = pd.DataFrame()

    # The design matrix and target are the same for every repeat and fold.
    X, y = train_df[features], train_df.Class

    for i in range(CFG.n_repeats):
        # One seed per repeat, shared by the outer and inner splitters.
        seed = base_seed + i
        print(f'Repeat {blu}#{i+1}{res}')

        kf = MultilabelStratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=seed)

        # Outer split is stratified on columns 1 and 2 of `greeks`
        # (the original note says "Class and Alpha" -- TODO confirm).
        outer_splits = kf.split(X=X, y=greeks.iloc[:, 1:3])
        for outer_fold, (outer_train_idx, holdout_idx) in enumerate(outer_splits, start=1):
            # Rows outside the hold-out fold, re-indexed from 0 so the inner
            # positional indices line up with `oof_inner`.
            X_outer = X.iloc[outer_train_idx].reset_index(drop=True)
            y_outer = y.iloc[outer_train_idx].reset_index(drop=True)

            # The outer validation fold is the hold-out set.
            X_holdout, y_holdout = X.iloc[holdout_idx], y.iloc[holdout_idx]

            # Out-of-fold predictions for the inner loop.
            oof_inner = np.zeros(len(X_outer))

            cv = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=seed)

            models_ = []  # models trained in the inner loop

            print(f"Outer Loop fold {outer_fold}, Inner Loop Training with "
                  f"{blu}{X_outer.shape[0]}{res} samples, "
                  f"{blu}{X_outer.shape[1]}{res} features, seed = {blu}{seed}{res}")

            for fold, (train_idx, val_idx) in enumerate(cv.split(X=X_outer, y=y_outer), start=1):
                X_train, X_val = X_outer.iloc[train_idx], X_outer.iloc[val_idx]
                y_train, y_val = y_outer.iloc[train_idx], y_outer.iloc[val_idx]

                # NOTE(review): `params` also carries 'early_stopping_round';
                # confirm which patience value LightGBM actually applies.
                clf = lgb.LGBMClassifier(**params)
                clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
                        eval_metric='logloss',
                        early_stopping_rounds=300, verbose=-1)

                models_.append(clf)

                val_preds = clf.predict_proba(X_val)[:, 1]
                oof_inner[val_idx] = val_preds

                val_score = metric(y_val, val_preds)
                best_iter = clf.best_iteration_

                print(f'Fold: {blu}{fold:>3}{res}| {metric.__name__}: {blu}{val_score:.5f}{res}'
                      f' | Best iteration: {blu}{best_iter:>4}{res}')

                # Permutation importance on the inner validation fold.
                if permut:
                    perm = PermutationImportance(clf, scoring=None, n_iter=1,
                                                 random_state=42, cv=None, refit=False).fit(X_val, y_val)

                    perm_importance_df = pd.DataFrame({'importance': perm.feature_importances_},
                                                      index=X_val.columns).sort_index()

                    if perm_df_.empty:
                        perm_df_ = perm_importance_df.copy()
                    else:
                        perm_df_ += perm_importance_df

                # Tree feature importance. Rows are ordered by feature name
                # (descending) so every fold yields the same row order and the
                # positional `+=` below adds like with like.
                f_i = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns),
                                          reverse=True, key=lambda pair: pair[1]),
                                   columns=['Value', 'Feature'])

                if feature_importances_.empty:
                    feature_importances_ = f_i.copy()
                else:
                    feature_importances_['Value'] += f_i['Value']

                # Boruta (SHAP-based) ranking.
                if boruta:
                    boruta_selector = BoostBoruta(clf, importance_type='shap_importances',
                                                  train_importance=False)
                    boruta_selector.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
                                        eval_metric=bll_metric, early_stopping_rounds=300, verbose=-1)

                    boruta_importance_df = pd.DataFrame({'importance': boruta_selector.ranking_},
                                                        index=X_train.columns).sort_index()
                    if boruta_df_.empty:
                        boruta_df_ = boruta_importance_df.copy()
                    else:
                        boruta_df_ += boruta_importance_df

            mean_cv_score = metric(y_outer, oof_inner)
            print(f'{red} Inner CV score: {res} {metric.__name__}: {red}{mean_cv_score:.5f}{res}')
            print(f'{"*" * 50}\n')
            inner_cv_score.append(mean_cv_score)

            # Score the hold-out set with the average of the inner-fold models.
            preds = np.zeros(len(X_holdout))
            for model in models_:
                preds += model.predict_proba(X_holdout)[:, 1]
            preds = preds / len(models_)
            cv_score = metric(y_holdout, preds)
            print(f'{red} Outer Holdout score: {res} {metric.__name__}: {red}{cv_score:.5f}{res}')
            print(f'{"*" * 50}\n')
            outer_cv_score.append(cv_score)

    print(f'{red} Inner CV avg score: {res} {metric.__name__}: {red}{np.mean(inner_cv_score):.5f}{res}')
    print(f'{"*" * 50}\n')

    print(f'{red} Outer Holdout avg score: {res} {metric.__name__}: {red}{np.mean(outer_cv_score):.5f}{res}')
    print(f'{"*" * 50}\n')

    if permut:
        perm_df_ = perm_df_.sort_values('importance', ascending=False)

    if boruta:
        boruta_df_ = boruta_df_.sort_values('importance')

    feature_importances_ = feature_importances_.sort_values('Value', ascending=False)

    return perm_df_, feature_importances_, boruta_df_, np.mean(inner_cv_score), np.mean(outer_cv_score)

# Run the nested-CV selection with Boruta enabled and permutation importance
# disabled, then unpack the three importance tables and the two mean scores.
(
    perm_df_,
    feature_importances_,
    boruta_df_,
    inner_cv_score,
    outer_cv_score,
) = lgbm_tuning(features, permut=False, boruta=True)