# Permutation feature selection + LGBM


In [None]:
import numpy as np
import pandas as pd

import lightgbm as lgb

from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

import eli5
from IPython.display import display
from eli5.permutation_importance import get_score_importances
from eli5.sklearn import PermutationImportance

import matplotlib.pyplot as plt

import itertools

import warnings
warnings.filterwarnings('ignore')

## Set feature combinations

In [48]:
import pandas as pd

# Load the four saved permutation-importance tables in one pass.
# Judging by the file names these are the square-root, log, square and
# plus/minus feature sets; each CSV is bound to x, y, z, zz in that order.
x, y, z, zz = (
    pd.read_csv(csv_path)
    for csv_path in (
        'perm_df_pow_05.csv',
        'perm_df_log.csv',
        'perm_df_pow_2.csv',
        'perm_df_plus_minus.csv',
    )
)

In [50]:
# list_x = list(x[x['importance'] > 0.002].values[:,0])
# list_y = list(y[y['importance'] > 0.002].values[:,0])
# list_z = list(z[z['importance'] > 0.002].values[:,0])
# list_zz = list(zz[zz['importance'] > 0.002].values[:,0])

['DU+FR',
 'DU*FR',
 'DU/GL',
 'DU-EP',
 'BQ/DA',
 'DH/DU',
 'AF/DL',
 'AB+DU',
 'AH-DU']

In [49]:
# Bare expression so the notebook renders the table read from 'perm_df_plus_minus.csv'.
zz

Unnamed: 0.1,Unnamed: 0,importance
0,DU+FR,0.00775
1,DU*FR,0.00525
2,DU/GL,0.00475
3,DU-EP,0.00350
4,BQ/DA,0.00325
...,...,...
4507,BQ/CR,-0.00075
4508,BC-DA,-0.00075
4509,DL/EB,-0.00100
4510,BQ*CF,-0.00100


In [2]:
# Base columns that are combined pairwise into new candidate features.
# NOTE(review): this drops 'f_1' and 'target', while the permutation-importance
# cell reads the label from `train_df.Class`. Confirm the actual column names;
# if 'Class' is a column of train_df it would end up inside the features here.
features = train_df.drop(['f_1', 'target'], axis=1).columns

# Columns are collected in a dict and turned into a DataFrame once at the end.
# Inserting thousands of columns one by one into a DataFrame fragments it
# (pandas PerformanceWarning). A dict keeps first-insertion order and
# re-assigning a key keeps its position, so the resulting column order is the
# same as with repeated `df[name] = ...` assignments.
generated_columns = {}

# Generating every family below for every pair usually takes too long, so
# comment out some of the groups and select features for each group separately.
# Single-column transforms (e.g. f'{fe_a}_2') are re-assigned for every pair
# the column takes part in; the value is always the same.
# NOTE: division, pow(0.5) and log produce inf / NaN for zero or negative inputs.
for fe_a, fe_b in itertools.combinations(features, 2):
    col_a = train_df[fe_a]
    col_b = train_df[fe_b]

    # Basic arithmetic combinations.
    generated_columns[f'{fe_a}+{fe_b}'] = col_a + col_b
    generated_columns[f'{fe_a}-{fe_b}'] = col_a - col_b
    generated_columns[f'{fe_a}*{fe_b}'] = col_a * col_b
    generated_columns[f'{fe_a}/{fe_b}'] = col_a / col_b

    # Squares (the original line for f'{fe_a}_2' referenced the undefined
    # name `rain_df`, which raised NameError on the first iteration).
    a_squared = col_a.pow(2)
    b_squared = col_b.pow(2)
    generated_columns[f'{fe_a}*{fe_b}_2'] = col_a * b_squared
    generated_columns[f'{fe_a}_2*{fe_b}'] = a_squared * col_b
    generated_columns[f'{fe_a}_2'] = a_squared
    generated_columns[f'{fe_b}_2'] = b_squared

    # Square roots.
    a_sqrt = col_a.pow(0.5)
    b_sqrt = col_b.pow(0.5)
    generated_columns[f'{fe_a}_05'] = a_sqrt
    generated_columns[f'{fe_b}_05'] = b_sqrt
    generated_columns[f'{fe_a}*{fe_b}_05'] = col_a * b_sqrt
    generated_columns[f'{fe_a}_05*{fe_b}'] = a_sqrt * col_b

    # Natural logarithms.
    a_log = np.log(col_a)
    b_log = np.log(col_b)
    generated_columns[f'{fe_a}_log'] = a_log
    generated_columns[f'{fe_b}_log'] = b_log
    generated_columns[f'{fe_a}*{fe_b}_log'] = col_a * b_log
    generated_columns[f'{fe_a}_log*{fe_b}'] = a_log * col_b

# An empty dict (fewer than two base columns) yields an empty DataFrame.
generated_features = pd.DataFrame(generated_columns)

# Permutation importance

In [None]:
# LGBM params
params = {
    'boosting_type': 'GBDT',
    'objective': "binary",
    'metric': 'binary_logloss',
    'random_state': 6052023,
    'verbose': -1
}

# Running sum of per-fold permutation importances; averaged after the loops.
perm_df = pd.DataFrame()
n_splits = 5
n_rounds = 5

# Balance & downsample: keep 200 rows of class 0 and 100 rows of class 1.
sampler = RandomUnderSampler(sampling_strategy={0: 200, 1: 100}, random_state=0, replacement=False)
# x_train_bal, y_train_bal = sampler.fit_resample(train_df.drop(['Class'], axis=1), train_df.Class)
x_train_bal, y_train_bal = sampler.fit_resample(generated_features, train_df.Class)

# Storage for the OOF score of the current resampled dataset.
# Every round refills all rows, so only the last round's predictions remain;
# oof_score is not read again in this cell.
oof_score = pd.DataFrame(index=x_train_bal.index, columns=['preds'])
preds_col = oof_score.columns.get_loc('preds')

# Split data: n_rounds repetitions of stratified K-fold, each with its own seed.
for i in range(n_rounds):

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5062023 + i)
    for fold, (train_index, val_index) in enumerate(skf.split(x_train_bal, y_train_bal)):
        print(f'round - {i}, fold - {fold}')

        # skf.split yields positional row numbers, so select with .iloc on
        # both X and y. Label-based `y[idx]` / `.loc[idx]` is only correct when
        # the resampled objects happen to carry a fresh 0..n-1 index.
        X_train, Y_train = x_train_bal.iloc[train_index], y_train_bal.iloc[train_index]
        X_val, Y_val = x_train_bal.iloc[val_index], y_train_bal.iloc[val_index]

        # NOTE(review): `early_stopping_rounds` and `verbose` as fit() keyword
        # arguments were removed in LightGBM 4.0; on newer versions pass
        # callbacks=[lgb.early_stopping(30), lgb.log_evaluation(50)] instead.
        clf = lgb.LGBMClassifier(**params, n_estimators=1000)
        clf.fit(X_train, Y_train, eval_set=[(X_train, Y_train), (X_val, Y_val)],
                early_stopping_rounds=30, eval_metric='logloss', verbose=50)

        # Probability of the positive class for the held-out rows.
        preds = clf.predict_proba(X_val)[:, 1]
        oof_score.iloc[val_index, preds_col] = preds

        # Permutation importance of the already-fitted model (cv=None means
        # the estimator is not refitted), computed on the validation fold.
        perm = PermutationImportance(clf, scoring=None, n_iter=1,
                                     random_state=42, cv=None, refit=False).fit(X_val, Y_val)

        # sort_index gives every fold the same row order before summing.
        perm_importance_df = pd.DataFrame({'importance': perm.feature_importances_},
                                          index=X_val.columns).sort_index()

        if perm_df.empty:
            perm_df = perm_importance_df.copy()
        else:
            perm_df += perm_importance_df

        print('\n')

# The importances of all folds were summed above; average them over the
# total number of fits (folds * rounds).
perm_df /= n_splits * n_rounds

perm_df = perm_df.sort_values('importance', ascending=False)
perm_df.to_csv('perm_df.csv')