# Bagging Classifier on top k chi2

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_trn = pd.read_csv('../data/train-agg-cut.csv')

## Univariate Selection $\chi^2$

### Scaling

In [3]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
def scaleColumns(data, cols_to_scale, scaler):
    for col in cols_to_scale:
        data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data[col])))

In [5]:
scaleColumns(df_trn, list(df_trn.columns), MinMaxScaler())

In [6]:
X_trn = df_trn.drop(columns=['FraudResult'], axis=1)
y_trn = df_trn['FraudResult']

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [8]:
#apply SelectKBest class to extract top 400 best features
k = 400
bestfeatures = SelectKBest(score_func=chi2, k=k)
fit = bestfeatures.fit(X_trn, y_trn)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_trn.columns)

#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Features', 'Score']  # naming the dataframe columns
featureScores.nlargest(k, 'Score')             # print best features

Unnamed: 0,Features,Score
496,AmountPositive,5597.219242
0,Value,4762.582809
86,account_product_transactions__AmountPositive_g...,883.548964
390,account_provider_transactions__AmountPositive_...,739.340503
311,account_product_category_transactions__AmountP...,691.669685
75,account_provider_transactions__Value_global_avg,570.979859
386,account_provider_transactions__AmountPositive_...,462.555515
296,account_channel_transactions__AmountPositive_g...,384.409342
90,account_product_transactions__AmountPositive_g...,383.886192
281,account_transactions__AmountPositive_global_sum,381.275567


In [9]:
columns_top400_chi2 = list(featureScores.nlargest(k, 'Score').values[:, 0])

## Подбор такого k, при котором есть отличия от уже submitted

In [10]:
df_trn = pd.read_csv('../data/train-agg-cut.csv')
df_tst = pd.read_csv('../data/test-agg-cut.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [11]:
X_trn = df_trn.drop(columns=['FraudResult'], axis=1)
y_trn = df_trn['FraudResult']

X_tst = df_tst

In [18]:
import os
from collections import Counter
from sklearn.ensemble import BaggingClassifier

In [21]:
classifier = BaggingClassifier(n_estimators=100, n_jobs=-1)
for k in range(21, 401):
    # prepare dataset on k columns
    X_trn_drp = X_trn[columns_top400_chi2[:k]]
    X_tst_drp = X_tst[columns_top400_chi2[:k]]
    
    predict = classifier.fit(X_trn_drp, y_trn).predict(X_tst_drp)
    print(k, ':', Counter(predict))
    df_sbm['FraudResult'] = predict
    
    # определяем был ли ранее точно такой же результат
    current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())
    # просматриваем все файлы в папке submitted
    is_exist = False
    files = os.listdir('../submitted')
    files.sort()
    for f in files:
        f_csv = pd.read_csv('../submitted/' + f)
        if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
            is_exist = True    
    if is_exist:
        print('It is the same as in: ' + f)
    else:
        print('New result! Write it')
        df_sbm.to_csv('../submitted/AlBo0726_top' + str(k) + 'chi2_BaggingClassifier.csv', encoding='utf-8', index=False)

21 : Counter({0: 44946, 1: 73})
It is the same as in: AlBo0724_top20chi2_BaggingClassifier.csv
22 : Counter({0: 44946, 1: 73})
It is the same as in: AlBo0724_top20chi2_BaggingClassifier.csv
23 : Counter({0: 44946, 1: 73})
It is the same as in: AlBo0724_top20chi2_BaggingClassifier.csv
24 : Counter({0: 44946, 1: 73})
It is the same as in: AlBo0724_top20chi2_BaggingClassifier.csv
25 : Counter({0: 44947, 1: 72})
New result! Write it
26 : Counter({0: 44944, 1: 75})
New result! Write it
27 : Counter({0: 44945, 1: 74})
New result! Write it
28 : Counter({0: 44946, 1: 73})
New result! Write it
29 : Counter({0: 44933, 1: 86})
New result! Write it
30 : Counter({0: 44938, 1: 81})
New result! Write it
31 : Counter({0: 44946, 1: 73})
New result! Write it
32 : Counter({0: 44942, 1: 77})
New result! Write it
33 : Counter({0: 44938, 1: 81})
It is the same as in: AlBo0726_top32chi2_BaggingClassifier.csv
34 : Counter({0: 44942, 1: 77})
It is the same as in: AlBo0726_top32chi2_BaggingClassifier.csv
35 : C



69 : Counter({0: 44942, 1: 77})
It is the same as in: AlBo0726_top63chi2_BaggingClassifier.csv
70 : Counter({0: 44942, 1: 77})
It is the same as in: AlBo0726_top63chi2_BaggingClassifier.csv
71 : Counter({0: 44943, 1: 76})
New result! Write it
72 : Counter({0: 44942, 1: 77})
It is the same as in: AlBo0726_top71chi2_BaggingClassifier.csv
73 : Counter({0: 44942, 1: 77})
It is the same as in: AlBo0726_top71chi2_BaggingClassifier.csv
74 : Counter({0: 44946, 1: 73})
New result! Write it
75 : Counter({0: 44942, 1: 77})
It is the same as in: AlBo0726_top74chi2_BaggingClassifier.csv
76 : Counter({0: 44942, 1: 77})
It is the same as in: AlBo0726_top74chi2_BaggingClassifier.csv
77 : Counter({0: 44942, 1: 77})
It is the same as in: AlBo0726_top74chi2_BaggingClassifier.csv
78 : Counter({0: 44937, 1: 82})
New result! Write it
79 : Counter({0: 44943, 1: 76})
It is the same as in: AlBo0726_top78chi2_BaggingClassifier.csv
80 : Counter({0: 44942, 1: 77})
It is the same as in: AlBo0726_top78chi2_BaggingC

163 : Counter({0: 44933, 1: 86})
New result! Write it
164 : Counter({0: 44940, 1: 79})
New result! Write it
165 : Counter({0: 44932, 1: 87})
New result! Write it
166 : Counter({0: 44932, 1: 87})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
167 : Counter({0: 44926, 1: 93})
New result! Write it
168 : Counter({0: 44943, 1: 76})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
169 : Counter({0: 44943, 1: 76})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
170 : Counter({0: 44922, 1: 97})
New result! Write it
171 : Counter({0: 44941, 1: 78})
New result! Write it
172 : Counter({0: 44938, 1: 81})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
173 : Counter({0: 44933, 1: 86})
New result! Write it
174 : Counter({0: 44943, 1: 76})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
175 : Counter({0: 44937, 1: 82})
New result! Write it
176 : Counter({0: 44939, 1: 80})
It is the same as in: AlBo0726_top97chi2_BaggingClassif

It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
290 : Counter({0: 44938, 1: 81})
New result! Write it
291 : Counter({0: 44931, 1: 88})
New result! Write it
292 : Counter({0: 44946, 1: 73})
New result! Write it
293 : Counter({0: 44941, 1: 78})
New result! Write it
294 : Counter({0: 44944, 1: 75})
New result! Write it
295 : Counter({0: 44936, 1: 83})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
296 : Counter({0: 44941, 1: 78})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
297 : Counter({0: 44941, 1: 78})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
298 : Counter({0: 44943, 1: 76})
New result! Write it
299 : Counter({0: 44938, 1: 81})
New result! Write it
300 : Counter({0: 44943, 1: 76})
New result! Write it
301 : Counter({0: 44939, 1: 80})
It is the same as in: AlBo0726_top97chi2_BaggingClassifier.csv
302 : Counter({0: 44933, 1: 86})
New result! Write it
303 : Counter({0: 44937, 1: 82})
New result! Write it
304 : Counter


`k   F1-score on zindi`

`20  0.813559322033898`

`25  0.793103448275862`

`26  0.793103448275862`

`27  0.793103448275862`

`28  0.793103448275862`

`29  0.696969696969697`

`30  0.73015873015873`

`31  0.793103448275862`

`32  0.754098360655738`

`35  0.741935483870968`

`38  0.721311475409836`

`39  0.754098360655738`

`40  0.676470588235294`

`43  0.741935483870968`

`46  0.733333333333333`

`53  0.754098360655738`

`54  0.741935483870968`

`62  0.741935483870968`

`63  0.779661016949153`

`71  0.733333333333333`



In [None]:
for n_frauds in range(70, 100):
    for n_frauds_pred in range(70, 100):
        


In [None]:
def f1_score