# Bagging Classifier on top k chi2

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_trn = pd.read_csv('../data/train-agg-cut.csv')

## Univariate Selection $\chi^2$

### Scaling

In [3]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
def scaleColumns(data, cols_to_scale, scaler):
    for col in cols_to_scale:
        data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data[col])))

In [5]:
scaleColumns(df_trn, list(df_trn.columns), MinMaxScaler())

In [6]:
X_trn = df_trn.drop(columns=['FraudResult'], axis=1)
y_trn = df_trn['FraudResult']

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [8]:
#apply SelectKBest class to extract top 20 best features
k = 20
bestfeatures = SelectKBest(score_func=chi2, k=k)
fit = bestfeatures.fit(X_trn, y_trn)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_trn.columns)

#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Features', 'Score']  # naming the dataframe columns
featureScores.nlargest(k, 'Score')             # print best features

Unnamed: 0,Features,Score
496,AmountPositive,5597.219242
0,Value,4762.582809
86,account_product_transactions__AmountPositive_g...,883.548964
390,account_provider_transactions__AmountPositive_...,739.340503
311,account_product_category_transactions__AmountP...,691.669685
75,account_provider_transactions__Value_global_avg,570.979859
386,account_provider_transactions__AmountPositive_...,462.555515
296,account_channel_transactions__AmountPositive_g...,384.409342
90,account_product_transactions__AmountPositive_g...,383.886192
281,account_transactions__AmountPositive_global_sum,381.275567


In [9]:
columns_top20_chi2 = list(featureScores.nlargest(k, 'Score').values[:, 0])

In [14]:
columns_top20_chi2

['AmountPositive',
 'Value',
 'account_product_transactions__AmountPositive_global_sum',
 'account_provider_transactions__AmountPositive_global_avg',
 'account_product_category_transactions__AmountPositive_global_sum',
 'account_provider_transactions__Value_global_avg',
 'account_provider_transactions__AmountPositive_global_sum',
 'account_channel_transactions__AmountPositive_global_sum',
 'account_product_transactions__AmountPositive_global_avg',
 'account_transactions__AmountPositive_global_sum',
 'account_pricing_strategy_transactions__AmountPositive_global_sum',
 'account_product_category_transactions__AmountPositive_global_avg',
 'account_product_transactions__AmountPositive_week_sum',
 'account_provider_transactions__AmountPositive_week_avg',
 'account_pricing_strategy_transactions__AmountPositive_global_avg',
 'account_product_transactions__Value_global_avg',
 'account_product_category_transactions__Value_global_avg',
 'account_channel_transactions__AmountPositive_global_avg',
 

## Подбор такого k, при котором есть отличия от уже submitted

In [10]:
df_trn = pd.read_csv('../data/train-agg-cut.csv')
df_tst = pd.read_csv('../data/test-agg-cut.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [11]:
X_trn = df_trn.drop(columns=['FraudResult'], axis=1)
y_trn = df_trn['FraudResult']

X_tst = df_tst

In [12]:
import os
from collections import Counter
from sklearn.ensemble import BaggingClassifier

In [13]:
classifier = BaggingClassifier(n_estimators=100, n_jobs=-1)
for k in range(1, 20):
    # prepare dataset on k columns
    X_trn_drp = X_trn[columns_top20_chi2[:k]]
    X_tst_drp = X_tst[columns_top20_chi2[:k]]
    
    predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
    print(k, ':', Counter(predict))
    df_sbm['FraudResult'] = predict
    
    # определяем был ли ранее точно такой же результат
    current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
    # просматриваем все файлы в папке submitted
    is_exist = False
    files = os.listdir('../submitted')
    files.sort()
    for f in files:
        f_csv = pd.read_csv('../submitted/' + f)
        if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
            is_exist = True    
    if is_exist:
        print('It is the same as in: ' + f)
    else:
        print('New result! Write it')
        df_sbm.to_csv('../submitted/AlBo0726_top' + str(k) + 'chi2_BaggingClassifier.csv', encoding='utf-8', index=False)

1 : Counter({0: 44959, 1: 60})
New result! Write it
2 : Counter({0: 44958, 1: 61})
New result! Write it
3 : Counter({0: 44943, 1: 76})
New result! Write it
4 : Counter({0: 44944, 1: 75})
New result! Write it
5 : Counter({0: 44943, 1: 76})
New result! Write it
6 : Counter({0: 44943, 1: 76})
New result! Write it
7 : Counter({0: 44944, 1: 75})
New result! Write it
8 : Counter({0: 44945, 1: 74})
New result! Write it
9 : Counter({0: 44946, 1: 73})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
10 : Counter({0: 44945, 1: 74})
New result! Write it
11 : Counter({0: 44946, 1: 73})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
12 : Counter({0: 44946, 1: 73})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
13 : Counter({0: 44944, 1: 75})
New result! Write it
14 : Counter({0: 44946, 1: 73})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
15 : Counter({0: 44944, 1: 75})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
16 :


`k   F1-score on zindi`

`01  0.679245283018868`

`02  0.666666666666667`

`03  0.766666666666667`

`04  0.8`

`05  0.766666666666667`

`06  0.8`

`07  0.8`

`08  0.8`

`10  0.8`

`13  0.813559322033898`

`18  0.827586206896552`

`20  0.813559322033898`

`25  0.793103448275862`

`26  0.793103448275862`

`27  0.793103448275862`

`28  0.793103448275862`

`29  0.696969696969697`

`30  0.73015873015873`

`31  0.793103448275862`

`32  0.754098360655738`

`35  0.741935483870968`

`38  0.721311475409836`

`39  0.754098360655738`

`40  0.676470588235294`

`43  0.741935483870968`

`46  0.733333333333333`

`53  0.754098360655738`

`54  0.741935483870968`

`62  0.741935483870968`

`63  0.779661016949153`

`71  0.733333333333333`



In [None]:
for n_frauds in range(70, 100):
    for n_frauds_pred in range(70, 100):
        


In [None]:
def f1_score