# Recursive Feature Elimination (RFE)
Aggregated Data + user behavior patterns(Time)

1. Extra Trees Classifier
2. Random Forest Classifier
3. Ada Boost Classifier

In [1]:
import numpy  as np
import pandas as pd

In [2]:
from sklearn.feature_selection import RFE

### Read data

In [3]:
# Load the train/test feature tables and the submission template in one pass.
df_trn, df_tst, df_sbm = map(pd.read_csv, (
    '../data/train-v4.csv',
    '../data/test-v4.csv',
    '../data/sample_submission.csv',
))

In [4]:
# Peek at the first 20 column names of the training frame.
df_trn.columns[:20]

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Value', 'TransactionStartTime',
       'PricingStrategy', 'FraudResult', 'AmountNegative', 'AmountPositive',
       'business_day', 'AccountId_count', 'AccountId_sum_amount_positive'],
      dtype='object')

In [5]:
# Columns that are dropped from both the train and the test frame before modelling.
columns4drop = [
    'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
    'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
    'ProductCategory', 'ChannelId', 'TransactionStartTime', 'PricingStrategy',
]

In [6]:
# Remove the columns listed in columns4drop from both frames, in place.
# NOTE: because the frames are mutated, running this cell a second time raises
# KeyError (the columns are already gone).
for frame in (df_trn, df_tst):
    frame.drop(columns=columns4drop, inplace=True)

In [7]:
# Persist the reduced train/test frames as CSV (no index column).
for frame, path in ((df_trn, '../data/train-v4-cut.csv'),
                    (df_tst, '../data/test-v4-cut.csv')):
    frame.to_csv(path, encoding='utf-8', index=False)

In [8]:
# Target vector first, then the feature matrix (every column except the target).
y_trn = df_trn['FraudResult']
X_trn = df_trn.drop(columns=['FraudResult'])

# The test frame is used as the feature matrix directly (same object, not a copy).
X_tst = df_tst

### Extra Trees Classifier

In [9]:
from sklearn.ensemble import ExtraTreesClassifier

In [10]:
# Feature extraction
%%time
etc     = ExtraTreesClassifier(n_estimators=250, n_jobs=-1, random_state=24)
rfe_etc = RFE(estimator=etc, n_features_to_select=1, step=1)
rfe_etc = rfe_etc.fit(X_trn, y_trn)

In [11]:
# RFE rank of every feature, best first.
# NOTE: the column is named 'importance' but holds RFE ranks, so smaller is better.
feature_importances_etc = (
    pd.DataFrame({'importance': rfe_etc.ranking_}, index=X_trn.columns)
      .sort_values(by='importance')
)
feature_importances_etc

Unnamed: 0,importance
AmountPositive,1
Value,2
ProductCategory_count,3
AccountId-PricingStrategy_avg_value,4
ProviderId_avg_value,5
AccountId-ProductId-ChannelId_max_amount_positive,6
ProductId_sum_amount_positive,7
AccountId-ProviderId-PricingStrategy_avg_amount_positive,8
ProductId_count,9
PricingStrategy_hour_to_prev,10


In [15]:
# Names of the (up to) 100 best-ranked features according to the Extra Trees RFE.
top100etc = feature_importances_etc.index[:100].tolist()

### Submit

In [30]:
import os
from collections import Counter
from sklearn.ensemble import BaggingClassifier

In [31]:
classifier = BaggingClassifier(n_estimators=1000, n_jobs=-1, random_state=24)

# Row positions flagged as fraud in every submission already on disk.
# Loaded once up front instead of re-reading the whole folder for every k.
# Only *.csv entries are read, so stray files or folders (e.g. .ipynb_checkpoints)
# no longer break pd.read_csv.
submitted_dir = '../submitted'
known_frauds = {}
for f in sorted(os.listdir(submitted_dir)):
    if not f.endswith('.csv'):
        continue
    f_csv = pd.read_csv(os.path.join(submitted_dir, f))
    known_frauds[f] = set(f_csv[f_csv['FraudResult'] == 1].index.tolist())

for k in range(1, 20):
    # prepare dataset on the k best-ranked columns
    X_trn_drp = X_trn[top100etc[:k]]
    X_tst_drp = X_tst[top100etc[:k]]

    predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
    print(k, ':', Counter(predict))
    df_sbm['FraudResult'] = predict

    # check whether exactly the same result has been produced before
    current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
    # report every stored submission (in file-name order) with the same fraud rows
    duplicates = [f for f in sorted(known_frauds) if known_frauds[f] == current_subm_set]
    for f in duplicates:
        print('It is the same as in: ' + f)
    if not duplicates:
        print('New result! Write it')
        fname = 'AlBo0812_top' + str(k) + 'RFE_ETC_Bagging.csv'
        df_sbm.to_csv(os.path.join(submitted_dir, fname), encoding='utf-8', index=False)
        # remember the new file so later values of k are compared against it too
        known_frauds[fname] = current_subm_set

1 : Counter({0.0: 44959, 1.0: 60})
It is the same as in: AlBo0726_top1chi2_BaggingClassifier.csv
2 : Counter({0.0: 44958, 1.0: 61})
It is the same as in: AlBo0726_top2chi2_BaggingClassifier.csv
3 : Counter({0.0: 44945, 1.0: 74})
New result! Write it
4 : Counter({0.0: 44950, 1.0: 69})
New result! Write it
5 : Counter({0.0: 44950, 1.0: 69})
New result! Write it
6 : Counter({0.0: 44950, 1.0: 69})
It is the same as in: AlBo0812_top5RFE_ETC_Bagging.csv
7 : Counter({0.0: 44950, 1.0: 69})
It is the same as in: AlBo0812_top5RFE_ETC_Bagging.csv
8 : Counter({0.0: 44952, 1.0: 67})
New result! Write it
9 : Counter({0.0: 44952, 1.0: 67})
It is the same as in: AlBo0812_top8RFE_ETC_Bagging.csv
10 : Counter({0.0: 44950, 1.0: 69})
It is the same as in: AlBo0812_top5RFE_ETC_Bagging.csv
11 : Counter({0.0: 44950, 1.0: 69})
It is the same as in: AlBo0812_top5RFE_ETC_Bagging.csv
12 : Counter({0.0: 44950, 1.0: 69})
It is the same as in: AlBo0812_top5RFE_ETC_Bagging.csv
13 : Counter({0.0: 44950, 1.0: 69})
It 

**Results:**

`k frauds F1-score`

`01  60  0.679245283018868`

`02  61  0.666666666666667`

`03  74  0.7`

`04  69  0.714285714285714`

`05  69  0.689655172413793`

`06  69  0.689655172413793` (as k=05)

`07  69  0.689655172413793` (as k=05)

`08  67  0.714285714285714`

`09  67  0.714285714285714` (as k=08)

`10  69  0.689655172413793` (as k=05)

`11  69  0.689655172413793` (as k=05)

`12  69  0.689655172413793` (as k=05)

`13  69  0.689655172413793` (as k=05)

`14  69  0.689655172413793` (as k=05)

`15  69  0.689655172413793` (as k=05)

`16  69  0.689655172413793` (as k=05)

`17  67  0.666666666666667`

`18  63  0.666666666666667`

`19  65  0.678571428571428`

### Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Feature extraction
rfc     = RandomForestClassifier(n_estimators=250, n_jobs=-1)
rfe_rfc = RFE(estimator=rfc, n_features_to_select=1, step=1)
%time
rfe_rfc = rfe_rfc.fit(X_trn, y_trn)

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 15.7 µs


In [20]:
# RFE rank of every feature, best first.
# NOTE: the column is named 'importance' but holds RFE ranks, so smaller is better.
feature_importances_rfc = (
    pd.DataFrame({'importance': rfe_rfc.ranking_}, index=X_trn.columns)
      .sort_values(by='importance')
)
feature_importances_rfc

Unnamed: 0,importance
Value,1
AmountPositive,2
ProductId_sum_amount_positive,3
AccountId-ProviderId_avg_value,4
ProviderId_avg_value,5
AccountId-ProviderId-ChannelId_avg_amount_positive,6
PricingStrategy_count,7
ProductCategory_avg_amount_positive,8
AccountId-ProviderId-ChannelId_avg_value,9
ProviderId_avg_amount_positive,10


In [33]:
# Names of the (up to) 100 best-ranked features according to the Random Forest RFE.
top100rfc = feature_importances_rfc.index[:100].tolist()

In [34]:
classifier = BaggingClassifier(n_estimators=1000, n_jobs=-1, random_state=24)

# Row positions flagged as fraud in every submission already on disk.
# Loaded once up front instead of re-reading the whole folder for every k.
# Only *.csv entries are read, so stray files or folders (e.g. .ipynb_checkpoints)
# no longer break pd.read_csv.
submitted_dir = '../submitted'
known_frauds = {}
for f in sorted(os.listdir(submitted_dir)):
    if not f.endswith('.csv'):
        continue
    f_csv = pd.read_csv(os.path.join(submitted_dir, f))
    known_frauds[f] = set(f_csv[f_csv['FraudResult'] == 1].index.tolist())

for k in range(1, 20):
    # prepare dataset on the k best-ranked columns
    X_trn_drp = X_trn[top100rfc[:k]]
    X_tst_drp = X_tst[top100rfc[:k]]

    predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
    print(k, ':', Counter(predict))
    df_sbm['FraudResult'] = predict

    # check whether exactly the same result has been produced before
    current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
    # report every stored submission (in file-name order) with the same fraud rows
    duplicates = [f for f in sorted(known_frauds) if known_frauds[f] == current_subm_set]
    for f in duplicates:
        print('It is the same as in: ' + f)
    if not duplicates:
        print('New result! Write it')
        fname = 'AlBo0812_top' + str(k) + 'RFE_RFC_Bagging.csv'
        df_sbm.to_csv(os.path.join(submitted_dir, fname), encoding='utf-8', index=False)
        # remember the new file so later values of k are compared against it too
        known_frauds[fname] = current_subm_set

1 : Counter({0.0: 44957, 1.0: 62})
New result! Write it
2 : Counter({0.0: 44958, 1.0: 61})
It is the same as in: AlBo0726_top2chi2_BaggingClassifier.csv
3 : Counter({0.0: 44946, 1.0: 73})
New result! Write it
4 : Counter({0.0: 44952, 1.0: 67})
New result! Write it
5 : Counter({0.0: 44948, 1.0: 71})
New result! Write it
6 : Counter({0.0: 44949, 1.0: 70})
New result! Write it
7 : Counter({0.0: 44950, 1.0: 69})
New result! Write it
8 : Counter({0.0: 44946, 1.0: 73})
New result! Write it
9 : Counter({0.0: 44946, 1.0: 73})
It is the same as in: AlBo0812_top8RFE_RFC_Bagging.csv
10 : Counter({0.0: 44946, 1.0: 73})
It is the same as in: AlBo0812_top8RFE_RFC_Bagging.csv
11 : Counter({0.0: 44942, 1.0: 77})
New result! Write it
12 : Counter({0.0: 44942, 1.0: 77})
New result! Write it
13 : Counter({0.0: 44941, 1.0: 78})
New result! Write it
14 : Counter({0.0: 44948, 1.0: 71})
New result! Write it
15 : Counter({0.0: 44948, 1.0: 71})
It is the same as in: AlBo0812_top14RFE_RFC_Bagging.csv
16 : Count

**Results:**

`k frauds F1-score`

`01  62`

`02  61  0.666666666666667`

`03  73  0.7`

`04  67  0.690909090909091`

`05  71  0.689655172413793`

`06  70  0.701754385964912`

`07  69  0.701754385964912`

`08  73  0.758620689655172` compare with 06 and 07

`09  73  0.758620689655172` (as k=08)

`10  73  0.758620689655172` (as k=08)

`11  77  0.709677419354839` 

`12  77  0.688524590163934` 

`13  78  0.709677419354839` compare with 11 and 12

`14  71  0.645161290322581` 

`15  71  0.645161290322581` (as k=14) 

`16  78  0.65625` 

`17  76  0.666666666666667`

`18  78  0.666666666666667` (as k=16)

`19  73  0.645161290322581`

### Analysis

In [42]:
def compare_submitted_files(fname_A, fname_B, submitted_dir='../submitted/'):
    """Print how the fraud predictions of two submission files differ.

    Both files are read as CSV from ``submitted_dir``. The TransactionId values
    of the rows with ``FraudResult == 1`` form set A and set B respectively.
    The function prints the size of each set, both one-sided differences and
    the symmetric difference (each sorted).

    Parameters
    ----------
    fname_A, fname_B : str
        File names of the two submissions, relative to ``submitted_dir``.
    submitted_dir : str, optional
        Folder that holds the submission files (default ``'../submitted/'``).

    Returns
    -------
    None
        The comparison is only printed.
    """
    import os

    def fraud_ids(fname):
        # TransactionIds of the rows predicted as fraud in one submission file
        df = pd.read_csv(os.path.join(submitted_dir, fname))
        return set(df['TransactionId'][df['FraudResult'] == 1].tolist())

    set_A = fraud_ids(fname_A)
    set_B = fraud_ids(fname_B)

    print('Number of frauds in set A =', len(set_A))
    print('Number of frauds in set B =', len(set_B))
    print('Difference of sets A and B (set_A - set_B) =', np.sort(list(set_A - set_B)))
    # this line used to be labelled "set_A - set_B" although it prints set_B - set_A
    print('Difference of sets B and A (set_B - set_A) =', np.sort(list(set_B - set_A)))
    print('Symmetric Difference of sets A and B (set_A ^ set_B) = ', np.sort(list(set_A  ^ set_B)))

In [43]:
# Compare the fraud sets of the top-7 and top-8 RFE/RFC bagging submissions.
compare_submitted_files('AlBo0812_top7RFE_RFC_Bagging.csv', 'AlBo0812_top8RFE_RFC_Bagging.csv')

Number of frauds in set A = 69
Number of frauds in set B = 73
Difference of sets A and B (set_A - set_B) = ['TransactionId_79898' 'TransactionId_88604']
Difference of sets A and B (set_A - set_B) = ['TransactionId_103156' 'TransactionId_114219' 'TransactionId_133930'
 'TransactionId_13904' 'TransactionId_59289' 'TransactionId_9176']
Symmetric Difference of sets A and B (set_A ^ set_B) =  ['TransactionId_103156' 'TransactionId_114219' 'TransactionId_133930'
 'TransactionId_13904' 'TransactionId_59289' 'TransactionId_79898'
 'TransactionId_88604' 'TransactionId_9176']


### Ada Boost Classifier

In [35]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Feature extraction with AdaBoost over depth-3 decision trees.
# random_state is fixed (same seed as the Extra Trees cell) so that the RFE
# ranking can be reproduced on a re-run.
abc     = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=250, random_state=24)
rfe_abc = RFE(estimator=abc, n_features_to_select=1, step=1)
rfe_abc = rfe_abc.fit(X_trn, y_trn)

In [None]:
# RFE rank of every feature, best first.
# NOTE: the column is named 'importance' but holds RFE ranks, so smaller is better.
feature_importances_abc = (
    pd.DataFrame({'importance': rfe_abc.ranking_}, index=X_trn.columns)
      .sort_values(by='importance')
)
feature_importances_abc