# BorutaPy

https://www.kaggle.com/tilii7/boruta-feature-elimination#306039 

```'Hmm… Why should I trust this Boruta? I think it is clear by now that ps_calc features are useless since they have no correlation with the target. '…Tree based models/feature selection methods exploit NON-LINEAR relationships of independent variables with the target. Splitting criterion of trees is based on some impurity/entropy metric. Concepts like correlation apply to LINEAR models where the data is LINEARLY SEPARABLE! Why should you trust it? Well, because if your data is not linearly separable then this is one of the better options to do feature selection by, as Boruta introduces shadow features into the model. By shadow features it means shuffling data within certain columns and reintroducing them into the dataset. Why do this? Because a variable is selected over n-iterations only after comparison with its shadow feature, ensuring that if it is selected, it is not as a result of randomness! This is my go-to method as opposed to evaluating feature importances from say a standalone RandomForest or GBM/XGBOOST/LGBM.```

Duration : ~ 10 hours

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to the project code are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
# Blanket "ignore" filter: every warning category from every library is
# suppressed for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings; consider narrowing it.
warnings.filterwarnings("ignore")

In [51]:
from src.dataset.data import Dataset
from src.features.build_features import *
from src.features.process_fold import *
from src.model.train import *

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import lightgbm as lgb
from boruta import BorutaPy

import matplotlib.pyplot as plt

pd.options.display.max_rows = None
pd.options.display.max_columns = None

%matplotlib inline

In [4]:
# Create the project Dataset wrapper and load its data.
ds = Dataset()
ds.load_dataset()
# Run the project's feature-building step on `ds`; the return value is discarded,
# so this presumably updates `ds` in place — confirm against build_features.
build_processed_dataset(ds)

In [5]:
# reset_index() (without drop=True) turns the previous index into a regular column
# and gives each frame a fresh 0..n-1 index.
# NOTE(review): 'TransactionID' shows up as a rank-1 feature in the Boruta table further
# down; if it is the former index, an identifier is being used as a feature — confirm this is intended.
X = ds.X_train.reset_index()
# Keep only the target column as a Series.
y = ds.y_train.reset_index()['isFraud']
X_test = ds.X_test.reset_index()

In [6]:
# Five contiguous, unshuffled folds. `random_state` is deliberately not passed:
# it has no effect when shuffle=False, and scikit-learn >= 0.24 raises a
# ValueError if it is set without shuffling.
folds = KFold(n_splits=5, shuffle=False)

# Only the first split is used for feature selection, so take it lazily
# instead of materialising all five index pairs.
train_index, valid_index = next(folds.split(X))
X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

# Bundle the split with the test set and categorical column list, then run the
# project's per-fold processing on it (the return value is not used).
fold = Fold(X_train, y_train, X_valid, y_valid, X_test, ds.categorical_cols)
process_fold(fold)

# Processed training frame with the target attached as an extra column.
# Note: this assignment writes the 'isFraud' column into fold.X_train itself.
train = fold.X_train
labels = fold.y_train
train['isFraud'] = labels

Use negative downsampling (keep every fraud row, but only a random fraction of the non-fraud rows) to speed up Boruta: https://www.kaggle.com/c/ieee-fraud-detection/discussion/108616#latest-628955

In [15]:
# Negative downsampling: keep all fraud rows, draw a `frac` share of the
# non-fraud rows, then shuffle the combined frame (both draws are seeded).
frac = 0.2
fraud_rows = train[train['isFraud']==1]
non_fraud_sample = train[train['isFraud']==0].sample(frac=frac, random_state=0)
t_ = pd.concat([fraud_rows, non_fraud_sample]).sample(frac=1, random_state=0)

# Split the target off before imputing so it is not treated as a feature.
labels = t_['isFraud']
t_ = t_.drop(columns=['isFraud'])

# Mean-impute missing values. fit_transform returns a NumPy array, so `t_`
# no longer carries column names after this line.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
t_ = mean_imputer.fit_transform(t_)

In [16]:
# (rows, columns) of the full processed training fold, target column included.
train.shape

(472432, 426)

In [17]:
# (rows, columns) of the down-sampled, imputed feature matrix (target column removed).
t_.shape

(108756, 425)

In [37]:
# Base estimator for Boruta: a shallow (max_depth=5) random forest running on a
# single core (n_jobs=1). class_weight='balanced' weights the classes inversely
# to their frequency in y rather than resampling the data.
# NOTE: n_estimators=50 does not survive the fit — BorutaPy is created with
# n_estimators='auto', and the fitted estimator repr in the fit output reports 523 trees.
rf = RandomForestClassifier(n_jobs=1, n_estimators=50, class_weight='balanced', max_depth=5)

In [38]:
# Define the Boruta feature selector around the random forest.
# n_estimators='auto' lets Boruta choose the number of trees itself; verbose=2 prints
# the Confirmed/Tentative/Rejected counts on every iteration; random_state=1 seeds the run.
# alpha and max_iter are left at their defaults (0.05 and 100 according to the fitted repr).
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

In [39]:
# Find all relevant features: fit Boruta on the imputed, down-sampled matrix `t_`
# against `labels`. Presumably this is where most of the ~10 hours quoted at the
# top of the notebook is spent — confirm with %%time.
feat_selector.fit(t_, labels)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	425
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	425
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	425
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	425
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	425
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	425
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	425
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	302
Tentative: 	57
Rejected: 	66
Iteration: 	9 / 100
Confirmed: 	302
Tentative: 	57
Rejected: 	66
Iteration: 	10 / 100
Confirmed: 	302
Tentative: 	57
Rejected: 	66
Iteration: 	11 / 100
Confirmed: 	302
Tentative: 	57
Rejected: 	66
Iteration: 	12 / 100
Confirmed: 	310
Tentative: 	49
Rejected: 	66
Iteration: 	13 / 100
Confirmed: 	310
Tentative: 	49
Rejected: 	66
Iteration: 	14 / 100
Confirmed: 	310
Tentative: 	49
Rejected: 	66
Iteration: 	15 / 100
Confirmed: 	310
Tentative: 	45
Rejected: 	70
Iteration: 	16 / 100
Confirmed: 	

BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True,
                                          class_weight='balanced',
                                          criterion='gini', max_depth=5,
                                          max_features='auto',
                                          max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=523, n_jobs=1,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x365B3E570,
                                          verbose=0, warm_start=False),
         max_iter=100, n_esti

In [40]:
# Number of features Boruta ended up selecting.
print(feat_selector.n_features_)

321


In [41]:
# Check selected features: boolean mask with one entry per column of `t_`,
# True where the feature was selected.
print(feat_selector.support_)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True False  True  True  True
  True  True False False False  True  True  True  True  True  True False
 False  True  True  True  True  True  True  True  True  True  True  True
  True False False False False False False False  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True False
 False  True  True  True  True  True False False False False False False
 False False False False  True  True False False False False False False
 False False  True  True  True False False  True  True  True  True False
  True  True  True  True False  True  True  True  True  True False  True
 False False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  T

In [42]:
# Boolean mask of the tentative features: those Boruta neither confirmed nor
# rejected before it stopped iterating.
print(feat_selector.support_weak_)

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False  True False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False  True False False False False False
 False False False False False False  True False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False False False Fa

In [43]:
# Check ranking of features: one integer per column of `t_`; rank 1 marks selected
# features, rank 2 tentative ones, and larger ranks rejected features.
print(feat_selector.ranking_)

[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  3  1  1  1
  1  1 17 17  2  1  1  1  1  1  1 42 37  1  1  1  1  1  1  1  1  1  1  1
  1 36 73 39 30 45 26 43  1  1  1  1  1  1  1  1  1  1 27  1  1  1  1 73
 73  1  1  1  1  1  2 10 40 14 73 45 14 41 38 48  1  1  2 32 14 32 23 24
 34 35  1  1  1 48 45  1  1  1  1 22  1  1  1  1 73  1  1  1  1  1 73  1
 73  2  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  6  6 21 25  1  6 14  2 19 32  2 10 18  6 29 28  2  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1

In [49]:
# Report how many features Boruta selected, then list that many rows of the
# rank-sorted feature table.
n_selected = feat_selector.n_features_
print('\n Number of selected features:')
print(n_selected)

# Pair every feature column of `train` (target excluded) with its Boruta rank.
feature_names = train.drop(['isFraud'], axis=1).columns.tolist()
feature_df = (
    pd.DataFrame({'features': feature_names, 'rank': feat_selector.ranking_})
    .sort_values('rank', ascending=True)
    .reset_index(drop=True)
)
print('\n Top %d features:' % n_selected)
print(feature_df.head(n_selected))


 Number of selected features:
321

 Top 321 features:
                                      features  rank
0                                TransactionID     1
1                                   Vgroup_1_1     1
2                                   Vgroup_1_0     1
3                                   Vgroup_0_1     1
4                                   Vgroup_0_0     1
..                                         ...   ...
316                    groupby_card1_id_02_std     1
317  groupby_bank_type_TransactionAmt_std_norm     1
318                                   cos_hour     1
319               groupby_card1_id_02_std_norm     1
320                                 sinus_hour     1

[321 rows x 2 columns]


In [52]:
# Display the whole ranked feature table; pd.options.display.max_rows is None,
# so every row is rendered.
feature_df

Unnamed: 0,features,rank
0,TransactionID,1
1,Vgroup_1_1,1
2,Vgroup_1_0,1
3,Vgroup_0_1,1
4,Vgroup_0_0,1
5,card3_DT_M_month_day_dist_best,1
6,card3_DT_D_hour_dist,1
7,C11__C13,1
8,addr1__card1,1
9,Vgroup_1_2,1


In [None]:
# Persist the Boruta-selected feature subset for the processed fold.
# The earlier version of this cell referenced `tr_ids`, `te_ids` and `test`, none of
# which are defined in this notebook, so it could not run. It now uses only objects
# created above.
selected = train.drop(['isFraud'], axis=1).columns[feat_selector.support_]

# Build new frames instead of overwriting `train`, so the cell can be re-run safely.
train_filtered = train[selected].copy()
train_filtered['target'] = train['isFraud']
# The frame's existing row index is written under the 'id' header.
# NOTE(review): if TransactionID is meant to be the id, set it as the index first.
train_filtered.to_csv('train_boruta_filtered.csv', index_label='id')

# NOTE(review): assumes the Fold object exposes its processed test set as
# `fold.X_test` (mirroring `fold.X_train`) with the same columns — confirm.
test_filtered = fold.X_test[selected]
test_filtered.to_csv('test_boruta_filtered.csv', index_label='id')