# Bagging Classifier on the top-k $\chi^2$ features

In [1]:
import numpy  as np
import pandas as pd

In [2]:
# Read the training CSV into a DataFrame (path is relative to the working directory).
df_trn = pd.read_csv(filepath_or_buffer='../data/train-agg-cut.csv')

## Univariate Selection $\chi^2$

### Scaling

In [3]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
def scaleColumns(data, cols_to_scale, scaler):
    """Rescale the given columns of ``data`` in place, one column at a time.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are overwritten with their scaled values.
    cols_to_scale : iterable
        Labels of the columns to rescale.
    scaler : object
        Any object exposing ``fit_transform(X)`` that accepts a 2-D,
        single-column input. It is re-fitted on every column, so each
        column is scaled independently of the others.

    Returns
    -------
    None
        ``data`` is modified in place.
    """
    for col in cols_to_scale:
        # Pass a one-column frame so the scaler receives a 2-D input.
        scaled = scaler.fit_transform(data[[col]])
        # Assign a flat array (positional) instead of a freshly built
        # DataFrame: a new frame carries a 0..n-1 index, and pandas would
        # align it against data.index, producing NaN wherever the labels
        # do not match (e.g. after filtering or with a custom index).
        data[col] = np.asarray(scaled).reshape(-1)

In [5]:
# Min-max scale every feature column in place. The 'FraudResult' label is
# deliberately excluded: it is the prediction target, not a feature, and
# should not pass through the feature scaler.
scaleColumns(
    df_trn,
    [col for col in df_trn.columns if col != 'FraudResult'],
    MinMaxScaler(),
)

In [6]:
# Separate the label column from the remaining columns.
y_trn = df_trn['FraudResult']
X_trn = df_trn.drop(columns='FraudResult')

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [8]:
# Score every column of X_trn against y_trn with the chi-squared test and
# configure the selector to keep the k highest-scoring ones.
k = 200
bestfeatures = SelectKBest(k=k, score_func=chi2)
fit = bestfeatures.fit(X=X_trn, y=y_trn)

# One-column frames holding the feature names and their chi2 scores, in the
# same (column) order so they can be placed side by side.
dfcolumns = pd.DataFrame(X_trn.columns)
dfscores = pd.DataFrame(fit.scores_)

# Side-by-side table of feature name and score.
featureScores = pd.concat(objs=[dfcolumns, dfscores], axis='columns')
featureScores.columns = ['Features', 'Score']

# Display the k best-scoring features, highest score first.
featureScores.nlargest(n=k, columns='Score')

Unnamed: 0,Features,Score
496,AmountPositive,5597.219242
0,Value,4762.582809
86,account_product_transactions__AmountPositive_g...,883.548964
390,account_provider_transactions__AmountPositive_...,739.340503
311,account_product_category_transactions__AmountP...,691.669685
75,account_provider_transactions__Value_global_avg,570.979859
386,account_provider_transactions__AmountPositive_...,462.555515
296,account_channel_transactions__AmountPositive_g...,384.409342
90,account_product_transactions__AmountPositive_g...,383.886192
281,account_transactions__AmountPositive_global_sum,381.275567


In [9]:
# Names of the k highest-scoring features as a plain list, best score first.
columns_top200_chi2 = featureScores.nlargest(n=k, columns='Score')['Features'].tolist()

In [10]:
# Display the selected feature names (ordered by descending chi2 score).
columns_top200_chi2

['AmountPositive',
 'Value',
 'account_product_transactions__AmountPositive_global_sum',
 'account_provider_transactions__AmountPositive_global_avg',
 'account_product_category_transactions__AmountPositive_global_sum',
 'account_provider_transactions__Value_global_avg',
 'account_provider_transactions__AmountPositive_global_sum',
 'account_channel_transactions__AmountPositive_global_sum',
 'account_product_transactions__AmountPositive_global_avg',
 'account_transactions__AmountPositive_global_sum',
 'account_pricing_strategy_transactions__AmountPositive_global_sum',
 'account_product_category_transactions__AmountPositive_global_avg',
 'account_product_transactions__AmountPositive_week_sum',
 'account_provider_transactions__AmountPositive_week_avg',
 'account_pricing_strategy_transactions__AmountPositive_global_avg',
 'account_product_transactions__Value_global_avg',
 'account_product_category_transactions__Value_global_avg',
 'account_channel_transactions__AmountPositive_global_avg',
 