# LightGBM

In [1]:
from collections import Counter
from pathlib import Path

import lightgbm
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the training set, the test set and the sample submission, in that
# order, from the shared data directory.
df_trn, df_tst, df_sbm = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/training_pe.csv',
        '../data/test_pe.csv',
        '../data/sample_submission.csv',
    )
)

In [3]:
# Preview the first rows of the raw training frame.
df_trn.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,36122,0.0,0.0,0.0,8.8e-05,0.00039,0.0004,0.003232,1000.0,1000,2018-11-15 02:18:49,0.001741,0
1,15641,3.2e-05,3.1e-05,0.0,0.000131,3.1e-05,0.003546,0.000135,-20.0,20,2018-11-15 02:19:08,0.001741,0
2,53940,0.0,0.0,0.0,8.8e-05,0.0,0.0004,0.003232,500.0,500,2018-11-15 02:44:21,0.001741,0
3,102362,0.0,0.0,0.0,0.010101,0.002646,0.00625,0.003232,20000.0,21800,2018-11-15 03:32:55,0.001741,0
4,38779,3.2e-05,3.1e-05,0.0,0.000131,3.1e-05,0.003546,0.000135,-644.0,644,2018-11-15 03:34:21,0.001741,0


In [4]:
# Columns removed from both the train and the test frame before modelling.
# Not dropped here (they were commented out of this list): TransactionId,
# CurrencyCode, CountryCode, ProviderId, ProductId, Amount, Value,
# PricingStrategy.
columns4drop = [
    'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
    'ProductCategory', 'ChannelId',
    'TransactionStartTime',
]

In [5]:
# Strip the unwanted columns from train and test alike; `columns=` already
# names the axis, so no separate axis argument is needed.
df_trn_cut, df_tst_cut = (
    frame.drop(columns=columns4drop) for frame in (df_trn, df_tst)
)

In [6]:
# Preview the training frame after the column drop.
df_trn_cut.head()

Unnamed: 0,ProviderId,ProductId,Amount,Value,PricingStrategy,FraudResult
0,8.8e-05,0.00039,1000.0,1000,0.001741,0
1,0.000131,3.1e-05,-20.0,20,0.001741,0
2,8.8e-05,0.0,500.0,500,0.001741,0
3,0.010101,0.002646,20000.0,21800,0.001741,0
4,0.000131,3.1e-05,-644.0,644,0.001741,0


In [7]:
# Columns that are shifted by the training fraud fraction further down.
# ProductCategory and ChannelId are left out (they were commented out of
# this list).
columns4scaling = ['ProviderId', 'ProductId', 'PricingStrategy']

In [8]:
# Share of training transactions labelled as fraud (FraudResult == 1).
fraud_label_counts = df_trn['FraudResult'].value_counts()
num_transactions = len(df_trn)
num_otliers = fraud_label_counts[1]
num_inliers = fraud_label_counts[0]
outliers_fraction = num_otliers / num_transactions
print('Train outliers fraction:', round(outliers_fraction, 3))

Train outliers fraction: 0.002


In [10]:
# Shift the encoded columns by the training fraud fraction,
# coef = n_frauds / (n_frauds + n_nofrauds).
#
# The shift is applied to the test frame as well: the model is fitted on the
# shifted training values, so the rows it later scores must be on the same
# scale. Each column is recomputed from the untouched source frame instead of
# being decremented in place, so re-running this cell does not stack the shift.
for clm in columns4scaling:
    df_trn_cut[clm] = df_trn[clm] - outliers_fraction
    df_tst_cut[clm] = df_tst[clm] - outliers_fraction

In [11]:
# Separate the target column from the feature columns.
y_trn_cut = df_trn_cut['FraudResult']
X_trn_cut = df_trn_cut.drop(columns='FraudResult')

In [12]:
# Preview the feature frame (target column removed).
X_trn_cut.head()

Unnamed: 0,ProviderId,ProductId,Amount,Value,PricingStrategy
0,-0.00193,-0.001628,1000.0,1000,-0.000277
1,-0.001887,-0.001987,-20.0,20,-0.000277
2,-0.00193,-0.002018,500.0,500,-0.000277
3,0.008083,0.000628,20000.0,21800,-0.000277
4,-0.001887,-0.001987,-644.0,644,-0.000277


In [13]:
# Plain NumPy views of the features and the target for the splitter below.
x, y = X_trn_cut.values, y_trn_cut.values

In [14]:
# Create training and validation sets (80/20, stratified on the fraud label).
#
# The split reads straight from X_trn_cut / y_trn_cut rather than from x / y:
# this cell assigns x and y itself, so splitting those would shrink the
# training set a little further every time the cell is re-run.
x, x_test, y, y_test = train_test_split(
    X_trn_cut.values,
    y_trn_cut.values,
    test_size=0.2,
    random_state=42,
    stratify=y_trn_cut.values,
)

In [15]:
# Wrap the two splits as LightGBM datasets.
# The validation set is tied to the training set through `reference`, which
# is how LightGBM expects validation data to be constructed.
train_data = lightgbm.Dataset(x, label=y)
test_data  = lightgbm.Dataset(x_test, label=y_test, reference=train_data)

### Train the model

In [16]:
# LightGBM settings: binary objective evaluated with AUC, using the 'rf'
# boosting mode with row and feature subsampling.
# 'application' used to be listed next to 'objective'; the two are aliases of
# the same setting, so only 'objective' is kept.
parameters = {
    'objective'   : 'binary',
    'metric'      : 'auc',
#     'is_unbalance': 'true',
    'boosting'    : 'rf',
    'num_leaves'  : 31,
    # NOTE(review): n_estimators is an alias of the boosting-round count and
    # may compete with the num_boost_round passed to lightgbm.train — confirm
    # which limit is intended.
    'n_estimators': 100,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq'    : 20,
#     'learning_rate': 0.05,
#     'verbose': 0,
    'n_jobs': -1
}

In [17]:
# Fit the model, stopping once the validation metric has not improved for
# 100 consecutive rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword is no longer accepted by recent LightGBM
# releases, while the callback works on old and new versions alike.
# NOTE(review): `parameters` also carries `n_estimators`, an alias of the
# round count, which may take precedence over num_boost_round — confirm.
model = lightgbm.train(
    parameters,
    train_data,
    num_boost_round=5000,
    valid_sets=[test_data],
    callbacks=[lightgbm.early_stopping(stopping_rounds=100)],
)



[1]	valid_0's auc: 0.986507
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.986567
[3]	valid_0's auc: 0.986401
[4]	valid_0's auc: 0.986327
[5]	valid_0's auc: 0.997284
[6]	valid_0's auc: 0.997273
[7]	valid_0's auc: 0.998608
[8]	valid_0's auc: 0.998666
[9]	valid_0's auc: 0.998592
[10]	valid_0's auc: 0.998545
[11]	valid_0's auc: 0.998545
[12]	valid_0's auc: 0.998515
[13]	valid_0's auc: 0.998515
[14]	valid_0's auc: 0.998515
[15]	valid_0's auc: 0.998519
[16]	valid_0's auc: 0.99851
[17]	valid_0's auc: 0.99851
[18]	valid_0's auc: 0.998503
[19]	valid_0's auc: 0.998499
[20]	valid_0's auc: 0.998491
[21]	valid_0's auc: 0.998443
[22]	valid_0's auc: 0.998547
[23]	valid_0's auc: 0.998573
[24]	valid_0's auc: 0.998606
[25]	valid_0's auc: 0.998624
[26]	valid_0's auc: 0.998621
[27]	valid_0's auc: 0.998519
[28]	valid_0's auc: 0.998468
[29]	valid_0's auc: 0.998462
[30]	valid_0's auc: 0.998444
[31]	valid_0's auc: 0.998479
[32]	valid_0's auc: 0.998443
[33]	valid_0's auc: 

In [18]:
# Score the hold-out rows, then label as fraud (1) only those whose score
# exceeds 0.99999; every other row is labelled 0.
holdout_scores = model.predict(x_test)
y_test_pred = [int(score > 0.99999) for score in holdout_scores]

In [19]:
# Count how many hold-out rows were labelled 0 and how many 1.
Counter(y_test_pred)

Counter({0: 19047, 1: 86})

In [20]:
# F1 on the hold-out split. scikit-learn expects the ground truth first and
# the predictions second; binary F1 is symmetric in the two, so the reported
# score is unaffected by the corrected order.
f1_score(y_test, y_test_pred)

0.576

### Create a submission

In [None]:
# Preview the test frame after the column drop.
df_tst_cut.head()

In [None]:
# Build the test matrix from exactly the columns the model was trained on,
# in the same order. The model receives a bare array and matches features by
# position, so a missing, extra or reordered column should fail here rather
# than silently mis-score every row.
x_tst = df_tst_cut[X_trn_cut.columns].values
x_tst[:5]

In [None]:
# Raw model scores for the test rows.
# The scores are continuous, so counting distinct values would print close to
# one entry per row; summary statistics show the distribution instead.
y_pred = model.predict(x_tst)
pd.Series(y_pred).describe()

In [None]:
# Turn the test scores into 0/1 labels with the 0.99999 cut-off and show how
# many rows fall into each class.
y_pred = [int(score > 0.99999) for score in y_pred]
Counter(y_pred)

In [None]:
# Peek at the first few labels only; echoing the full list would flood the
# notebook output.
y_pred[:20]

In [None]:
# Fill the sample submission with the predicted labels and write it out.
# The output directory is created first so the export does not fail on a
# fresh checkout where '../submitted' does not exist yet.
submission_path = Path('../submitted/AlBo0723_LGBM_pe.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)

df_sbm['FraudResult'] = y_pred
df_sbm.to_csv(submission_path, encoding='utf-8', index=False)