# Boruta Feature Elimination (BFE)
Aggregated data + user behavior patterns (time)

1. Random Forest Classifier
2. Extra Trees Classifier
3. AdaBoost Classifier

In [1]:
import numpy  as np
import pandas as pd
from sklearn.feature_selection import RFE

### Read data

In [2]:
# Load the training table, the test table and the submission template, in that order.
df_trn, df_tst, df_sbm = map(pd.read_csv, (
    '../data/train-v4-cut.csv',
    '../data/test-v4-cut.csv',
    '../data/sample_submission.csv',
))

In [3]:
# Target column is 'FraudResult'; every other training column is used as a feature.
y_trn = df_trn['FraudResult']
X_trn = df_trn.drop(columns='FraudResult')

# The test frame is used as-is (same object, no copy).
X_tst = df_tst

In [32]:
# Sanity check: (rows, columns) of the training frame, target column included.
df_trn.shape

(95662, 277)

In [4]:
from boruta import BorutaPy

### Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Random forest used as Boruta's base estimator: n_jobs=-1 uses all cores and
# class_weight='balanced' weights classes inversely to their frequency in y.
# n_estimators is overridden once BorutaPy runs with n_estimators='auto'
# (the fitted repr reports a different tree count).
rfc = RandomForestClassifier(
    n_estimators=250,
    class_weight='balanced',
    n_jobs=-1,
)

In [8]:
# Boruta selector wrapped around the forest; the tree count is chosen
# automatically and the seed is fixed for repeatable runs.
feature_selector = BorutaPy(
    estimator=rfc,
    n_estimators='auto',
    verbose=2,
    random_state=24,
)

In [10]:
%time
# find all relevant features - 5 features should be selected
feature_selector.fit(X_trn.values, y_trn.values)

CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 15.3 µs
Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	69
Tentative: 	108
Rejected: 	99
Iteration: 	9 / 100
Confirmed: 	69
Tentative: 	108
Rejected: 	99
Iteration: 	10 / 100
Confirmed: 	69
Tentative: 	108
Rejected: 	99
Iteration: 	11 / 100
Confirmed: 	69
Tentative: 	108
Rejected: 	99
Iteration: 	12 / 100
Confirmed: 	76
Tentative: 	86
Rejected: 	114
Iteration: 	13 / 100
Confirmed: 	76
Tentative: 	86
Rejected: 	114
Iteration: 	14 / 100
Confirmed: 	76
Tentative: 	86
Rejected: 	114
Iteration: 	15 / 100
Confirmed: 	76

BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True,
                                          class_weight='balanced',
                                          criterion='gini', max_depth=None,
                                          max_features='auto',
                                          max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=151, n_jobs=-1,
                                          oob_score=False,
                                          random_state=<mtrand.RandomState object at 0x7f21f7234ee8>,
                                          verbose=0, warm_start=False),
         max_it

In [23]:
# Number of features Boruta confirmed as relevant.
feature_selector.n_features_

86

In [22]:
# Boolean mask over the columns of X_trn: True marks a feature Boruta confirmed.
feature_selector.support_

array([ True, False,  True, False,  True, False, False,  True,  True,
       False,  True,  True, False, False,  True, False,  True, False,
       False, False, False, False,  True, False,  True,  True,  True,
        True, False, False, False,  True, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False,  True, False,  True,  True, False,  True,  True,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False,  True, False,
       False, False,  True, False,  True, False, False, False,  True,
       False, False,  True,  True, False,  True,  True, False,  True,
        True, False,

In [24]:
# Boolean mask of features left tentative (neither confirmed nor rejected) at the end of the run.
feature_selector.support_weak_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [16]:
# Feature ranking: 1 = confirmed, 2 = tentative, larger values = rejected.
feature_selector.ranking_

array([  1,  38,   1, 170,   1,  59,  85,   1,   1, 133,   1,   1,  62,
        10,   1,  19,   1,  68,  82,  12,  65,   7,   1,  73,   1,   1,
         1,   1, 177,  77,  51,   1, 103, 164, 105,  69,  44,  97,   1,
       106,   1,   2,  42, 158, 177,  28, 148,  48, 145, 166, 130,  96,
         1,  50, 162,   2,   1, 168,   1,   1, 117,   1,   1, 173,   2,
       154,  47,   1,  95, 107, 167,  90,  89, 164, 101, 161, 171, 156,
       150, 174, 144, 153, 116, 104,  17,  80,   7,  81,   1,   1,  45,
       140, 175, 119, 124,  65, 123, 159,  98,  28,  22,  24, 132,  34,
         1, 164,   1,  73, 169,  68,   1,  55,   1, 138, 146,  87,   1,
        57,  99,   1,   1, 120,   1,   1,  26,   1,   1,  16,   1, 111,
       137,  14,   1,   2, 108,   1,   1, 109,   1,   1,  40,   1,   1,
        12,   1, 113,  83,  10,   2,  47, 101,   1,   1, 118,   1,   7,
       113,  28,   1,  85,   1,  77,  42,  31,   3,  71, 135,   1,   1,
       160,   1,  19, 147,  19,  34, 142,   4,  93,  89,  70,   

In [18]:
# call transform() on X to filter it down to selected features
# X_trn_filtered = feature_selector.transform(X_trn.values)
# X_tst_filtered = feature_selector.transform(X_tst.values)

In [28]:
# Slice the DataFrames with Boruta's boolean mask so that the confirmed columns
# keep their names in both the train and the test frame.
selected_mask = feature_selector.support_
X_trn_filtered = X_trn.iloc[:, selected_mask]
X_tst_filtered = X_tst.iloc[:, selected_mask]

X_trn_filtered.head()

Unnamed: 0,Value,AmountPositive,AccountId_count,AccountId_sum_value,AccountId_avg_amount_positive,AccountId_avg_value,AccountId_min_amount_positive,AccountId_max_amount_positive,AccountId_max_value,ProductId_sum_amount_negative,...,AccountId-ProviderId-ChannelId_max_amount_positive,AccountId-ProviderId-ChannelId_max_value,AccountId-ProviderId-PricingStrategy_count,AccountId-ProviderId-PricingStrategy_sum_value,AccountId-ProviderId-PricingStrategy_avg_amount_positive,AccountId-ProviderId-PricingStrategy_avg_value,AccountId-ProviderId-PricingStrategy_min_amount_positive,AccountId-ProviderId-PricingStrategy_min_value,AccountId-ProviderId-PricingStrategy_max_amount_positive,AccountId-ProviderId-PricingStrategy_max_value
0,1000,1000.0,1,0,0.0,0.0,10000000000.0,-1.0,-1,0.0,...,-1.0,-1,1,0,0.0,0.0,10000000000.0,10000000000.0,-1.0,-1
1,20,0.0,1,0,0.0,0.0,10000000000.0,-1.0,-1,0.0,...,-1.0,-1,1,0,0.0,0.0,10000000000.0,10000000000.0,-1.0,-1
2,500,500.0,1,0,0.0,0.0,10000000000.0,-1.0,-1,0.0,...,-1.0,-1,1,0,0.0,0.0,10000000000.0,10000000000.0,-1.0,-1
3,21800,20000.0,1,0,0.0,0.0,10000000000.0,-1.0,-1,0.0,...,-1.0,-1,1,0,0.0,0.0,10000000000.0,10000000000.0,-1.0,-1
4,644,0.0,2,20,0.0,10.0,0.0,0.0,20,20.0,...,0.0,20,2,20,0.0,10.0,0.0,20.0,0.0,20


### Submit

In [30]:
import os
from collections import Counter
from sklearn.ensemble import BaggingClassifier

In [31]:
# Fit a bagging ensemble on the Boruta-selected columns and predict the test set.
bagging = BaggingClassifier(n_estimators=1000, n_jobs=-1, random_state=24)
predict = bagging.fit(X_trn_filtered, y_trn).predict(X_tst_filtered)
print('Resilts:', Counter(predict))
df_sbm['FraudResult'] = predict

# Determine whether exactly the same result has been produced before: a submission
# is identified by the set of row positions predicted as fraud.
current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())

# Look through the files in the `submitted` folder. Only *.csv entries are read,
# so stray entries (checkpoint folders, notes, ...) cannot crash pd.read_csv.
is_exist = False
files = [f for f in os.listdir('../submitted') if f.endswith('.csv')]
files.sort()
for f in files:
    f_csv = pd.read_csv('../submitted/' + f)
    if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
        is_exist = True
        print('It is the same as in: ' + f)

# Write a new submission file only when no earlier file matches.
if not is_exist:
    print('New result! Write it')
    df_sbm.to_csv('../submitted/AlBo0812_Boruta86_Bagging.csv', encoding='utf-8', index=False)

Resilts: Counter({0.0: 44956, 1.0: 63})
New result! Write it


**Result:** `0.690909090909091`

### Stricter significance level (alpha=0.001)

In [39]:
# Second Boruta run with a stricter threshold (alpha=0.001 instead of the
# alpha=0.05 shown in the first run's repr). This rebinds `feature_selector`,
# replacing the selector fitted above.
feature_selector = BorutaPy(
    estimator=rfc,
    n_estimators='auto',
    perc=100,
    alpha=0.001,
    verbose=2,
    random_state=24,
)

In [40]:
# Refit on the same training arrays with the stricter selector defined in the previous cell.
feature_selector.fit(X_trn.values, y_trn.values)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	276
Rejected: 	0
Iteration: 	14 / 100
Confirmed: 	70
Tentative: 	137
Rejected: 	69
Iteration: 	15 / 100
Confirmed: 	70
Tentative: 	137
Rejected: 	69
Iteration: 	16 / 100
Confirmed: 	70
Tentative

BorutaPy(alpha=0.001,
         estimator=RandomForestClassifier(bootstrap=True,
                                          class_weight='balanced',
                                          criterion='gini', max_depth=None,
                                          max_features='auto',
                                          max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=177, n_jobs=-1,
                                          oob_score=False,
                                          random_state=<mtrand.RandomState object at 0x7f21d0306e58>,
                                          verbose=0, warm_start=False),
         max_i

In [41]:
# Number of features confirmed by the alpha=0.001 run.
feature_selector.n_features_

101

# 

### Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# NOTE(review): `rfe_etc` is not defined in any visible cell, so this cell raised
# NameError. The fit below mirrors the AdaBoost RFE cell further down; the
# ExtraTrees hyper-parameters are an assumption -- TODO confirm the intended settings.
etc     = ExtraTreesClassifier(n_estimators=250, class_weight='balanced', n_jobs=-1, random_state=24)
rfe_etc = RFE(estimator=etc, n_features_to_select=1, step=1)
rfe_etc = rfe_etc.fit(X_trn, y_trn)

# `ranking_` holds ranks, not importances: 1 is the best rank, so ascending
# order lists the strongest features first.
feature_importances_etc = pd.DataFrame(rfe_etc.ranking_,
                                       index = X_trn.columns,
                                       columns=['importance']
                                      ).sort_values('importance', ascending=True)
feature_importances_etc

In [None]:
# Names of the first 100 rows of the sorted ranking table.
top100etc = feature_importances_etc.index[:100].tolist()

### AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# AdaBoost over depth-3 decision trees, ranked by recursive feature elimination:
# one feature is dropped per step until a single feature remains, which gives
# every column its own rank. The seed is fixed (24, as for the other models in
# this notebook) so the ranking is repeatable.
abc     = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=250, random_state=24)
rfe_abc = RFE(estimator=abc, n_features_to_select=1, step=1)
rfe_abc = rfe_abc.fit(X_trn, y_trn)

In [None]:
# Tabulate each column's RFE rank (stored under the name 'importance') and
# list the lowest rank numbers first.
ranking_abc = pd.Series(rfe_abc.ranking_, index=X_trn.columns, name='importance')
feature_importances_abc = ranking_abc.to_frame().sort_values(by='importance')
feature_importances_abc