# LightGBM

In [1]:
import numpy  as np
import pandas as pd

import lightgbm

from collections import Counter
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the label-encoded training/test sets and the sample submission template.
df_trn, df_tst, df_sbm = map(pd.read_csv, (
    '../data/training_le.csv',
    '../data/test_le.csv',
    '../data/sample_submission.csv',
))

In [3]:
# Preview the first rows of the raw training frame.
df_trn.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,36122,3956,886,4405,5,9,0,2,1000.0,1000,2018-11-15 02:18:49,2,0
1,15641,4840,3828,4405,3,5,2,1,-20.0,20,2018-11-15 02:19:08,2,0
2,53940,4228,221,4682,5,0,0,2,500.0,500,2018-11-15 02:44:21,2,0
3,102362,647,2184,987,0,20,9,2,20000.0,21800,2018-11-15 03:32:55,2,0
4,38779,4840,3828,987,3,5,2,1,-644.0,644,2018-11-15 03:34:21,2,0


In [4]:
# Columns removed from both the training and the test frame before modelling.
# The features that remain are ProviderId, ProductId, Amount, Value and
# PricingStrategy.
columns4drop = [
    'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
    'ProductCategory', 'ChannelId',
    'TransactionStartTime',
]

In [5]:
# Apply the same column removal to train and test so their feature layouts match.
# (`columns=` already selects the axis, so no separate `axis` argument is needed.)
df_trn_cut, df_tst_cut = (
    frame.drop(columns=columns4drop) for frame in (df_trn, df_tst)
)

In [6]:
# Preview the training frame after the column removal.
df_trn_cut.head()

Unnamed: 0,ProviderId,ProductId,Amount,Value,PricingStrategy,FraudResult
0,5,9,1000.0,1000,2,0
1,3,5,-20.0,20,2,0
2,5,0,500.0,500,2,0
3,0,20,20000.0,21800,2,0
4,3,5,-644.0,644,2,0


In [7]:
# Separate the target column (FraudResult) from the feature columns.
y_trn_cut = df_trn_cut['FraudResult']
X_trn_cut = df_trn_cut.drop(columns=['FraudResult'])

In [8]:
# Preview the feature frame (target column removed).
X_trn_cut.head()

Unnamed: 0,ProviderId,ProductId,Amount,Value,PricingStrategy
0,5,9,1000.0,1000,2
1,3,5,-20.0,20,2
2,5,0,500.0,500,2
3,0,20,20000.0,21800,2
4,3,5,-644.0,644,2


In [9]:
# Plain NumPy views of the features and labels for the split/training steps.
x, y = X_trn_cut.values, y_trn_cut.values

In [10]:
# Create training and validation sets (80/20, stratified on the fraud label).
#
# The split reads from X_trn_cut / y_trn_cut instead of from `x` / `y`
# themselves: the original form overwrote its own inputs, so re-running the
# cell split the already-reduced training subset again. Reading from the
# untouched frames keeps the cell idempotent; the first-run result is the same.
x, x_test, y, y_test = train_test_split(
    X_trn_cut.values,
    y_trn_cut.values,
    test_size=0.2,
    random_state=42,
    stratify=y_trn_cut.values,
)

In [11]:
# Wrap the training split and the held-out split as LightGBM datasets.
# All features are passed as plain numeric columns (no categorical_feature).
train_data = lightgbm.Dataset(data=x, label=y)
test_data = lightgbm.Dataset(data=x_test, label=y_test)

### Train the model

In [12]:
# LightGBM training parameters (random-forest mode, binary target, AUC metric).
#
# 'application' was removed: it is an alias of 'objective', and supplying both
# only produces a warning while one of them is ignored.
parameters = {
    'objective'   : 'binary',
    'metric'      : 'auc',
    'boosting'    : 'rf',
    'num_leaves'  : 31,
    # Alias of num_iterations: when present in the params dict it is used
    # instead of the `num_boost_round` argument of lightgbm.train().
    'n_estimators': 100,
    # Row/feature subsampling settings used by the random-forest mode.
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq'    : 20,
    'n_jobs': -1
}

In [13]:
# Train on the training split and monitor AUC on the held-out split.
#
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lightgbm.train() was removed in
# LightGBM 4.0, whereas the callback works on both older and newer releases.
#
# NOTE(review): `parameters` contains 'n_estimators', an alias of
# num_iterations that LightGBM uses in preference to `num_boost_round`, so the
# 5000 below is presumably not the effective round count -- confirm which of
# the two is intended.
model = lightgbm.train(
    parameters,
    train_data,
    num_boost_round=5000,
    valid_sets=[test_data],
    callbacks=[lightgbm.early_stopping(stopping_rounds=100)],
)



[1]	valid_0's auc: 0.986558
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.986579
[3]	valid_0's auc: 0.986423
[4]	valid_0's auc: 0.986346
[5]	valid_0's auc: 0.997269
[6]	valid_0's auc: 0.997248
[7]	valid_0's auc: 0.998588
[8]	valid_0's auc: 0.998654
[9]	valid_0's auc: 0.998577
[10]	valid_0's auc: 0.998524
[11]	valid_0's auc: 0.998527
[12]	valid_0's auc: 0.99847
[13]	valid_0's auc: 0.998468
[14]	valid_0's auc: 0.998468
[15]	valid_0's auc: 0.998472
[16]	valid_0's auc: 0.998462
[17]	valid_0's auc: 0.998462
[18]	valid_0's auc: 0.998454
[19]	valid_0's auc: 0.998452
[20]	valid_0's auc: 0.998429
[21]	valid_0's auc: 0.998421
[22]	valid_0's auc: 0.998517
[23]	valid_0's auc: 0.998542
[24]	valid_0's auc: 0.998575
[25]	valid_0's auc: 0.998595
[26]	valid_0's auc: 0.998593
[27]	valid_0's auc: 0.99853
[28]	valid_0's auc: 0.998485
[29]	valid_0's auc: 0.998479
[30]	valid_0's auc: 0.99843
[31]	valid_0's auc: 0.998513
[32]	valid_0's auc: 0.998469
[33]	valid_0's auc: 0

In [14]:
# Score the held-out split and turn the scores into hard 0/1 labels:
# only scores strictly above 0.99999 are labelled as fraud (1).
y_test_pred = [int(score > 0.99999) for score in model.predict(x_test)]

In [15]:
# Count how many held-out rows were labelled 0 vs 1.
Counter(y_test_pred)

Counter({0: 19047, 1: 86})

In [16]:
# F1 on the held-out split. scikit-learn's signature is (y_true, y_pred); the
# arguments were previously swapped. Binary F1 is symmetric in the two, so the
# value is unchanged, but the correct order matters for any other metric.
f1_score(y_test, y_test_pred)

0.576

### Create a submission

In [17]:
# Preview the test frame after the column removal.
df_tst_cut.head()

Unnamed: 0,ProviderId,ProductId,Amount,Value,PricingStrategy
0,4,2,1000.0,1000,3
1,4,14,2000.0,2000,2
2,3,5,-50.0,50,2
3,4,9,3000.0,3000,3
4,3,5,-60.0,60,2


In [18]:
# Convert the reduced test frame to a NumPy array and show its first five rows.
x_tst = df_tst_cut.values
x_tst[:5]

array([[ 4.0e+00,  2.0e+00,  1.0e+03,  1.0e+03,  3.0e+00],
       [ 4.0e+00,  1.4e+01,  2.0e+03,  2.0e+03,  2.0e+00],
       [ 3.0e+00,  5.0e+00, -5.0e+01,  5.0e+01,  2.0e+00],
       [ 4.0e+00,  9.0e+00,  3.0e+03,  3.0e+03,  3.0e+00],
       [ 3.0e+00,  5.0e+00, -6.0e+01,  6.0e+01,  2.0e+00]])

In [22]:
# Score the competition test set and apply the same strict cut-off as on the
# held-out split: scores strictly above 0.99999 become 1, everything else 0.
y_pred = [int(score > 0.99999) for score in model.predict(x_tst)]
Counter(y_pred)

Counter({0: 44806, 1: 213})

In [21]:
# Show the shape of the test matrix rather than echoing the full prediction
# list (one entry per test row), which would flood the notebook output.
x_tst.shape

(45019, 5)

In [23]:
# Put the hard 0/1 labels into the sample-submission frame and write it out
# without the index column.
df_sbm['FraudResult'] = y_pred
df_sbm.to_csv(
    '../submitted/AlBo0723_LGBM_le.csv',
    index=False,
    encoding='utf-8',
)