# LightGBM

- Fields: `Amount` и `Value`
- Robust Scaler

In [1]:
import numpy  as np
import pandas as pd

In [2]:
df_trn    = pd.read_csv('../data/training_le.csv')
df_tst    = pd.read_csv('../data/test_le.csv')
df_sbm    = pd.read_csv('../data/sample_submission.csv')

In [9]:
df_trn.head()

Unnamed: 0,Amount,Value,FraudResult
0,1000.0,1000,0
1,-20.0,20,0
2,500.0,500,0
3,20000.0,21800,0
4,-644.0,644,0


In [6]:
list(df_trn.columns)

['BatchId',
 'AccountId',
 'SubscriptionId',
 'CustomerId',
 'ProviderId',
 'ProductId',
 'ProductCategory',
 'ChannelId',
 'Amount',
 'Value',
 'TransactionStartTime',
 'PricingStrategy',
 'FraudResult']

In [7]:
columns4drop = [
    'BatchId', 
    'AccountId', 
    'SubscriptionId', 
    'CustomerId', 
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'TransactionStartTime',
    'PricingStrategy']

In [8]:
df_trn = df_trn.drop(columns4drop, axis=1)
df_tst = df_tst.drop(columns4drop, axis=1)

**Scaling**

In [10]:
from sklearn.preprocessing import RobustScaler

In [11]:
def scaleColumns(data, cols_to_scale, scaler):
    for col in cols_to_scale:
        data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data[col])))
    return data

In [12]:
df_trn = scaleColumns(df_trn, ['Amount', 'Value'], RobustScaler())
df_tst = scaleColumns(df_tst, ['Amount', 'Value'], RobustScaler())

In [13]:
df_trn.head()

Unnamed: 0,Amount,Value,FraudResult
0,0.0,0.0,0
1,-0.357895,-0.207407,0
2,-0.175439,-0.10582,0
3,6.666667,4.402116,0
4,-0.576842,-0.075344,0


In [14]:
X = df_trn.drop('FraudResult', axis=1)
y = df_trn['FraudResult']

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24, shuffle=y)

In [17]:
from collections import Counter

In [18]:
print('Train Label Distribution: {}'.format(Counter(y_train)))
print('Test  Label Distribution: {}'.format(Counter(y_test)))

Train Label Distribution: Counter({0: 76380, 1: 149})
Test  Label Distribution: Counter({0: 19089, 1: 44})


In [20]:
# Modeling
import lightgbm as lgb

In [21]:
# Evaluation of the model
from sklearn.model_selection import KFold

In [22]:
MAX_EVALS = 500
N_FOLDS = 10

In [23]:
# Model with default hyperparameters
model = lgb.LGBMClassifier()
model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [29]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer

In [26]:
start = timer()
model.fit(X_train, y_train)
train_time = timer() - start

In [31]:
predict   = model.predict(X_test)
predict_p = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, predict_p)
f1  = f1_score(y_test, predict)

print('The baseline F1-score on the test set is {:.4f}.'.format(f1))
print('The baseline ROC-AUC-score on the test set is {:.4f}.'.format(auc))
print('The baseline training time is {:.4f} seconds'.format(train_time))

The baseline F1-score on the test set is 0.0460.
The baseline ROC-AUC-score on the test set is 0.0501.
The baseline training time is 0.1921 seconds


In [1]:
from hpsklearn import HyperoptEstimator

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely
