## Random Forest
top 20 features

In [1]:
import numpy  as np
import pandas as pd

In [2]:
df_trn = pd.read_csv('../data/train-agg.csv')
df_tst = pd.read_csv('../data/test-agg.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [3]:
df_trn['AmountPositive'] = df_trn['Amount'].apply(lambda x: x if x > 0 else 0)
df_trn['AmountNegative'] = df_trn['Amount'].apply(lambda x: x if x < 0 else 0)

df_tst['AmountPositive'] = df_tst['Amount'].apply(lambda x: x if x > 0 else 0)
df_tst['AmountNegative'] = df_tst['Amount'].apply(lambda x: x if x < 0 else 0)

In [4]:
columns4drop = [
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'CurrencyCode',
    'CountryCode',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'Amount',
    'TransactionStartTime',
    'PricingStrategy'
]

In [5]:
df_trn_cut = df_trn.drop(columns=columns4drop, axis=1)
df_tst_cut = df_tst.drop(columns=columns4drop, axis=1)

In [6]:
X_trn_cut = df_trn_cut.drop(columns=['FraudResult'], axis=1)
X_tst_cut = df_tst_cut

y_trn_cut = df_trn_cut['FraudResult']

## Univariate Selection $\chi^2$

### Scaling

In [7]:
from sklearn.preprocessing import MinMaxScaler

In [8]:
def scaleColumns(data, cols_to_scale, scaler):
    for col in cols_to_scale:
        data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data[col])))
    return data

In [9]:
df_trn_cut_sc = scaleColumns(df_trn_cut, list(df_trn_cut.columns), MinMaxScaler())

In [10]:
X_trn_cut_sc = df_trn_cut_sc.drop(columns=['FraudResult'], axis=1)
y_trn_cut_sc = df_trn_cut_sc['FraudResult']

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [12]:
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X_trn_cut_sc, y_trn_cut_sc)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_trn_cut_sc.columns)

In [13]:
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  #naming the dataframe columns
featureScores.nlargest(20, 'Score')  #print 10 best features

Unnamed: 0,Specs,Score
496,AmountPositive,5597.219242
0,Value,4762.582809
86,account_product_transactions__AmountPositive_g...,883.548964
390,account_provider_transactions__AmountPositive_...,739.340503
311,account_product_category_transactions__AmountP...,691.669685
75,account_provider_transactions__Value_global_avg,570.979859
386,account_provider_transactions__AmountPositive_...,462.555515
296,account_channel_transactions__AmountPositive_g...,384.409342
90,account_product_transactions__AmountPositive_g...,383.886192
281,account_transactions__AmountPositive_global_sum,381.275567


In [14]:
list(featureScores.nlargest(20, 'Score').values[:, 0])

['AmountPositive',
 'Value',
 'account_product_transactions__AmountPositive_global_sum',
 'account_provider_transactions__AmountPositive_global_avg',
 'account_product_category_transactions__AmountPositive_global_sum',
 'account_provider_transactions__Value_global_avg',
 'account_provider_transactions__AmountPositive_global_sum',
 'account_channel_transactions__AmountPositive_global_sum',
 'account_product_transactions__AmountPositive_global_avg',
 'account_transactions__AmountPositive_global_sum',
 'account_pricing_strategy_transactions__AmountPositive_global_sum',
 'account_product_category_transactions__AmountPositive_global_avg',
 'account_product_transactions__AmountPositive_week_sum',
 'account_provider_transactions__AmountPositive_week_avg',
 'account_pricing_strategy_transactions__AmountPositive_global_avg',
 'account_product_transactions__Value_global_avg',
 'account_product_category_transactions__Value_global_avg',
 'account_channel_transactions__AmountPositive_global_avg',
 

In [15]:
columns_top20_chi2 = list(featureScores.nlargest(20, 'Score').values[:, 0])

### Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from collections import Counter

In [18]:
X_trn = X_trn_cut[columns_top20_chi2]
X_tst = X_tst_cut[columns_top20_chi2]

y_trn = y_trn_cut

x = X_trn.values
y = y_trn.values

In [29]:
# save DataFrames top20chi2
df_trn_top20chi2 = X_trn
df_trn_top20chi2['FraudResult'] = y_trn
# df_trn_top20chi2.head()
df_trn_top20chi2.to_csv('../data/train_top20chi2.csv', encoding='utf-8', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [31]:
df_tst_top20chi2 = X_tst
# df_tst_top20chi2.head()
df_tst_top20chi2.to_csv('../data/test_top20chi2.csv', encoding='utf-8', index=False)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Create training and validation sets
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [21]:
RF = RandomForestClassifier(n_estimators=100)

In [22]:
RF.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
y_test_pred = RF.predict(x_test)

In [24]:
from sklearn.metrics import f1_score
f1_score(y_test_pred, y_test)

0.8607594936708861

**Submission**

In [25]:
y_tst_pred = RF.predict(X_tst)

In [26]:
Counter(y_tst_pred)

Counter({0: 44945, 1: 74})

In [27]:
df_sbm['FraudResult'] = y_tst_pred

In [None]:
df_sbm.to_csv('../submitted/AlBo0724_top20chi2_RF.csv', encoding='utf-8', index=False)

**Result:** `0.786885245901639`