# LightGMB and RF

Top10 aggregative feutures

In [1]:
import numpy  as np
import pandas as pd

import lightgbm

from collections import Counter
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
df_trn = pd.read_csv('../data/train-agg.csv')
df_tst = pd.read_csv('../data/test-agg.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [3]:
df_trn.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,...,pricing_strategy_transactions__AmountPositive_month_sum,pricing_strategy_transactions__AmountPositive_month_count,pricing_strategy_transactions__AmountPositive_month_min,pricing_strategy_transactions__AmountPositive_month_max,pricing_strategy_transactions__AmountPositive_month_avg,pricing_strategy_transactions__AmountPositive_global_sum,pricing_strategy_transactions__AmountPositive_global_count,pricing_strategy_transactions__AmountPositive_global_min,pricing_strategy_transactions__AmountPositive_global_max,pricing_strategy_transactions__AmountPositive_global_avg
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,...,0.0,0,1000000000.0,-1000000000.0,0.0,0.0,0,1000000000.0,-1000000000.0,0.0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,...,0.0,0,1000000000.0,-1000000000.0,0.0,1000.0,1,1000.0,1000.0,1000.0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,...,0.0,0,1000000000.0,-1000000000.0,0.0,1000.0,2,0.0,1000.0,500.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,...,0.0,0,1000000000.0,-1000000000.0,0.0,1500.0,3,0.0,1000.0,500.0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,...,0.0,0,1000000000.0,-1000000000.0,0.0,21500.0,4,0.0,20000.0,5375.0


## Univariate Selection $\chi^2$

In [48]:
columns_top10_chi2 = [
    'Value',
    'account_product_transactions__AmountPositive_global_sum',
    'account_provider_transactions__AmountPositive_global_avg',
    'account_product_category_transactions__AmountPositive_global_sum',
    'account_provider_transactions__Value_global_avg',
    'account_provider_transactions__AmountPositive_global_sum',
    'account_channel_transactions__AmountPositive_global_sum',
    'account_product_transactions__AmountPositive_global_avg',
    'account_transactions__AmountPositive_global_sum',
    'account_pricing_strategy_transactions__AmountPositive_global_sum'
]

In [49]:
X_trn_chi2 = df_trn[columns_top10_chi2]
X_tst_chi2 = df_tst[columns_top10_chi2]

In [50]:
X_trn_chi2.head()

Unnamed: 0,Value,account_product_transactions__AmountPositive_global_sum,account_provider_transactions__AmountPositive_global_avg,account_product_category_transactions__AmountPositive_global_sum,account_provider_transactions__Value_global_avg,account_provider_transactions__AmountPositive_global_sum,account_channel_transactions__AmountPositive_global_sum,account_product_transactions__AmountPositive_global_avg,account_transactions__AmountPositive_global_sum,account_pricing_strategy_transactions__AmountPositive_global_sum
0,1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,644,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# get the labels
y_trn_chi2 = df_trn['FraudResult']

In [52]:
x = X_trn_chi2.values
y = y_trn_chi2.values

In [53]:
# Create training and validation sets
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [54]:
# train_data = lightgbm.Dataset(x, label=y, categorical_feature=[0, 1, 2, 3, 4, 7])
train_data = lightgbm.Dataset(x, label=y)
test_data  = lightgbm.Dataset(x_test, label=y_test)

### Train the model

In [55]:
parameters = {
    'application' : 'binary',
    'objective'   : 'binary',
    'metric'      : 'auc',
#     'is_unbalance': 'true',
    'boosting'    : 'rf',
    'num_leaves'  : 31,
    'n_estimators': 1000,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq'    : 20,
#     'learning_rate': 0.05,
#     'verbose': 0,
    'n_jobs': -1
}

In [56]:
model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=test_data,
                       num_boost_round=5000,
                       early_stopping_rounds=100)



[1]	valid_0's auc: 0.986748
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.999568
[3]	valid_0's auc: 0.999601
[4]	valid_0's auc: 0.999562
[5]	valid_0's auc: 0.998371
[6]	valid_0's auc: 0.99837
[7]	valid_0's auc: 0.99836
[8]	valid_0's auc: 0.998239
[9]	valid_0's auc: 0.998244
[10]	valid_0's auc: 0.997741
[11]	valid_0's auc: 0.987498
[12]	valid_0's auc: 0.987435
[13]	valid_0's auc: 0.987412
[14]	valid_0's auc: 0.98742
[15]	valid_0's auc: 0.9873
[16]	valid_0's auc: 0.987417
[17]	valid_0's auc: 0.9873
[18]	valid_0's auc: 0.987296
[19]	valid_0's auc: 0.9873
[20]	valid_0's auc: 0.9873
[21]	valid_0's auc: 0.975151
[22]	valid_0's auc: 0.97494
[23]	valid_0's auc: 0.998269
[24]	valid_0's auc: 0.998223
[25]	valid_0's auc: 0.999295
[26]	valid_0's auc: 0.999368
[27]	valid_0's auc: 0.99935
[28]	valid_0's auc: 0.999325
[29]	valid_0's auc: 0.999303
[30]	valid_0's auc: 0.999402
[31]	valid_0's auc: 0.999496
[32]	valid_0's auc: 0.999468
[33]	valid_0's auc: 0.999515
[3

In [57]:
y_test_pred = model.predict(x_test)
y_test_pred = [ 1 if i > 0.99999 else 0 for i in y_test_pred ]

In [58]:
Counter(y_test_pred)

Counter({0: 19046, 1: 87})

In [59]:
f1_score(y_test_pred, y_test) 

0.5873015873015872

### Random Forest

In [60]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
RF = RandomForestClassifier(n_estimators=1000)

In [62]:
RF.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [63]:
y_test_pred = RF.predict(x_test)

In [64]:
f1_score(y_test_pred, y_test) 

0.8536585365853658

### Create a submission (LGBM)

In [None]:
df_tst_cut.head()

In [None]:
x_tst = df_tst_cut.values
x_tst[:5]

In [None]:
y_pred = model.predict(x_tst)
y_pred = [ 1 if i > 0.99999 else 0 for i in y_pred ]
Counter(y_pred)
# y_pred

In [None]:
y_pred

In [None]:
df_sbm['FraudResult'] = y_pred
df_sbm.to_csv('../submitted/AlBo0723_LGBM_le.csv', encoding='utf-8', index=False)

### Create a submission (RF)

In [65]:
y_tst_pred = RF.predict(X_tst_chi2)

In [66]:
Counter(y_tst_pred)

Counter({0: 44944, 1: 75})

In [67]:
df_sbm['FraudResult'] = y_tst_pred
df_sbm.to_csv('../submitted/AlBo0724_top10chi2_RF.csv', encoding='utf-8', index=False)

**Result:** `0.774193548387097`

## Feature Importance with Tree Based Classifiers

In [68]:
columns_top10_tree = [
    'Amount', 
    'Value',
    'pricing_strategy_transactions__AmountPositive_global_avg',
    'account_product_transactions__AmountPositive_global_avg',
    'account_channel_transactions__AmountPositive_global_sum',
    'provider_transactions__AmountPositive_global_sum',
    'provider_transactions__Value_week_avg',
    'account_product_category_transactions__AmountPositive_global_min',
    'product_category_transactions__Value_global_count',
    'account_provider_transactions__AmountPositive_global_avg'
]

In [71]:
X_trn_tree = df_trn[columns_top10_tree]
X_tst_tree = df_tst[columns_top10_tree]

y_trn_tree = df_trn['FraudResult']

x = X_trn_tree.values
y = y_trn_tree.values

In [72]:
# Create training and validation sets
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

### Random Forest

In [73]:
from sklearn.ensemble import RandomForestClassifier

In [78]:
RF = RandomForestClassifier(n_estimators=100)

In [79]:
RF.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [80]:
y_test_pred = RF.predict(x_test)

In [81]:
f1_score(y_test_pred, y_test) 

0.8831168831168831

### Create a submission (RF)

In [82]:
y_tst_pred = RF.predict(X_tst_tree)

In [83]:
Counter(y_tst_pred)

Counter({0: 44945, 1: 74})

In [None]:
df_sbm['FraudResult'] = y_tst_pred

In [87]:
import os 

current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())

files = os.listdir('../submitted')
files.sort()

for f in files:
    f_csv = pd.read_csv('../submitted/'+f)
    if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
        print('It is the same as in: ' + f)

It is the same as in: AlBo0724_top10tree_RF.csv


In [84]:
df_sbm.to_csv('../submitted/AlBo0724_top10tree_RF.csv', encoding='utf-8', index=False)

**Result:** `0.766666666666667` - worse than top10-chi2

## Feature Importance with Correlation Matrix

In [91]:
columns_top10_corr = [
    'Value', 
    'Amount', 
    'account_provider_transactions__AmountPositive_global_avg', 
    'account_provider_transactions__Value_global_avg', 
    'account_product_transactions__AmountPositive_global_avg', 
    'account_pricing_strategy_transactions__AmountPositive_global_avg',
    'account_product_transactions__Value_global_avg', 
    'account_product_category_transactions__AmountPositive_global_avg',
    'account_pricing_strategy_transactions__Value_global_avg',
    'account_product_category_transactions__Value_global_avg'
]

In [92]:
X_trn_corr = df_trn[columns_top10_corr]
X_tst_corr = df_tst[columns_top10_corr]

y_trn_corr = df_trn['FraudResult']

x = X_trn_corr.values
y = y_trn_corr.values

In [93]:
# Create training and validation sets
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

### Random Forest

In [94]:
from sklearn.ensemble import RandomForestClassifier

In [95]:
RF = RandomForestClassifier(n_estimators=100)

In [96]:
RF.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [97]:
y_test_pred = RF.predict(x_test)

In [98]:
f1_score(y_test_pred, y_test) 

0.8799999999999999

### Create a submission (RF)

In [99]:
y_tst_pred = RF.predict(X_tst_tree)

In [100]:
Counter(y_tst_pred)

Counter({0: 44963, 1: 56})

In [101]:
df_sbm['FraudResult'] = y_tst_pred

In [102]:
import os 

current_subm_set = set(df_sbm[df_sbm['FraudResult'] == 1].index.tolist())

files = os.listdir('../submitted')
files.sort()

for f in files:
    f_csv = pd.read_csv('../submitted/'+f)
    if set(f_csv[f_csv['FraudResult'] == 1].index.tolist()) == current_subm_set:
        print('It is the same as in: ' + f)

In [103]:
df_sbm.to_csv('../submitted/AlBo0724_top10corr_RF.csv', encoding='utf-8', index=False)

**Result:** `0.576923076923077` - worse than top10-chi2 and top10-tree