# Recursive Feature Elimination (RFE)
0. Example
1. Extra Trees Classifier
2. Random Forest Classifier
3. Ada Boost Classifier

In [1]:
import numpy  as np
import pandas as pd

In [2]:
from sklearn.feature_selection import RFE

### Example

In [3]:
from sklearn.datasets import make_friedman1
from sklearn.svm import SVR
# Small synthetic regression problem (50 samples, 10 features) used only
# for the RFE example below; the seed is fixed so the output is repeatable.
X, y = make_friedman1(
    n_samples=50,
    n_features=10,
    random_state=0,
)

In [4]:
# Minimal RFE example: recursively drop one feature per iteration (step=1)
# until 5 remain, using the coefficients of a linear SVR as the criterion.
estimator = SVR(kernel="linear")

# n_features_to_select must be passed by keyword: passing it positionally
# was deprecated in scikit-learn 0.23 and raises TypeError from 1.0 onwards.
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y)

# support_ is the boolean mask of kept features; ranking_ gives each
# feature's rank (1 = selected).
print(selector.support_)
print(selector.ranking_)

[ True  True  True  True  True False False False False False]
[1 1 1 1 1 6 4 3 2 5]


### Read data

In [5]:
# Load the training table, the test table and the sample submission
# from the ../data directory (paths are relative to the notebook).
df_trn, df_tst, df_sbm = (
    pd.read_csv('../data/train-agg-cut.csv'),
    pd.read_csv('../data/test-agg-cut.csv'),
    pd.read_csv('../data/sample_submission.csv'),
)

In [6]:
# Target column and the remaining columns used as features.
y_trn = df_trn['FraudResult']
X_trn = df_trn.drop(columns=['FraudResult'])

# The test frame is used as-is (X_tst is the same object as df_tst, not a copy).
X_tst = df_tst

### Extra Trees Classifier

In [7]:
from sklearn.ensemble import ExtraTreesClassifier

In [8]:
# Feature ranking with RFE driven by an Extra Trees classifier.
# random_state is fixed because extra-trees are randomized: without a seed
# the resulting ranking can change from one run of the notebook to the next.
etc     = ExtraTreesClassifier(n_estimators=250, n_jobs=-1, random_state=0)

# n_features_to_select=1 with step=1 removes one feature per iteration until
# a single one is left, so every feature receives its own rank.
rfe_etc = RFE(estimator=etc, n_features_to_select=1, step=1)
rfe_etc = rfe_etc.fit(X_trn, y_trn)

# rfe_etc.support_ holds the selection mask and rfe_etc.ranking_ the ranks.

In [11]:
# Rank table for the Extra Trees RFE run: one row per training column,
# sorted from rank 1 upwards. Note that the 'importance' column holds
# RFE ranks (rfe_etc.ranking_), not importance scores.
feature_importances_etc = (
    pd.DataFrame({'importance': rfe_etc.ranking_}, index=X_trn.columns)
    .sort_values(by='importance')
)
feature_importances_etc

Unnamed: 0,importance
Value,1
AmountPositive,2
product_category_transactions__AmountNegative_global_count,3
account_provider_transactions__AmountPositive_global_avg,4
provider_transactions__Value_global_avg,5
account_product_transactions__AmountPositive_global_sum,6
product_category_transactions__AmountPositive_global_count,7
account_pricing_strategy_transactions__Value_global_avg,8
product_category_transactions__AmountNegative_week_count,9
provider_transactions__AmountPositive_week_count,10


### Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Feature ranking with RFE driven by a Random Forest classifier.
# random_state is fixed because the forest is randomized (bootstrap samples
# and feature sub-sampling): without a seed the ranking is not reproducible.
rfc     = RandomForestClassifier(n_estimators=250, n_jobs=-1, random_state=0)

# n_features_to_select=1 with step=1 removes one feature per iteration until
# a single one is left, so every feature receives its own rank.
rfe_rfc = RFE(estimator=rfc, n_features_to_select=1, step=1)
rfe_rfc = rfe_rfc.fit(X_trn, y_trn)

In [14]:
# Rank table for the Random Forest RFE run: one row per training column,
# sorted from rank 1 upwards. Note that the 'importance' column holds
# RFE ranks (rfe_rfc.ranking_), not importance scores.
feature_importances_rfc = (
    pd.DataFrame({'importance': rfe_rfc.ranking_}, index=X_trn.columns)
    .sort_values(by='importance')
)
feature_importances_rfc

Unnamed: 0,importance
AmountPositive,1
Value,2
product_category_transactions__AmountPositive_global_count,3
account_provider_transactions__Value_global_avg,4
account_product_transactions__AmountPositive_global_avg,5
product_category_transactions__AmountPositive_global_avg,6
provider_transactions__AmountPositive_global_avg,7
account_provider_transactions__AmountPositive_global_avg,8
pricing_strategy_transactions__AmountPositive_week_sum,9
account_product_transactions__Value_global_avg,10


### Ada Boost Classifier

In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Feature ranking with RFE driven by AdaBoost over depth-3 decision trees.
# random_state is fixed so that repeated runs produce the same ranking.
# NOTE(review): the last recorded run of this cell failed with
# "ValueError: Input contains NaN, infinity or a value too large for
# dtype('float64')". The same X_trn was fitted by the tree ensembles above,
# so the non-finite values presumably arise inside the boosting procedure
# rather than in the input data - TODO confirm and resolve before relying
# on rfe_abc.
abc     = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=250, random_state=0)

# n_features_to_select=1 with step=1 removes one feature per iteration until
# a single one is left, so every feature receives its own rank.
rfe_abc = RFE(estimator=abc, n_features_to_select=1, step=1)
rfe_abc = rfe_abc.fit(X_trn, y_trn)

  return self.tree_.compute_feature_importances()


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Rank table for the AdaBoost RFE run: one row per training column,
# sorted from rank 1 upwards. Note that the 'importance' column holds
# RFE ranks (rfe_abc.ranking_), not importance scores.
feature_importances_abc = (
    pd.DataFrame({'importance': rfe_abc.ranking_}, index=X_trn.columns)
    .sort_values(by='importance')
)
feature_importances_abc

# Submit

In [None]:
# Show the column labels of the Extra Trees rank table.
# NOTE(review): this only inspects the table; no submission file is written
# in the visible part of the notebook.
feature_importances_etc.columns