## libs

In [1]:
%%time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import lightgbm as lgbm

%matplotlib inline

Wall time: 28.8 s


## get data

In [2]:
%%time
PATH_TO_TRAIN = '../data/assignment_2_train.csv'
PATH_TO_TEST = '../data/assignment_2_test.csv'

train = pd.read_csv(PATH_TO_TRAIN)
test = pd.read_csv(PATH_TO_TEST)

Wall time: 8.36 s


# Задание 4: для числовых признаков обучить модель LightGBM. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

## get dims

In [3]:
def only_dims(df):
    return df[[col for col in df.columns if df[col].dtypes != 'object']]

train_dims = only_dims(train)
test_dims = only_dims(test)

train_dims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Columns: 380 entries, TransactionID to V339
dtypes: float64(376), int64(4)
memory usage: 521.9 MB


## split on XY

In [4]:
def split_Xy(df, target):
    return df.drop(target, axis=1), df[target]

target = 'isFraud'

X_train_dims, y_train_dims = split_Xy(train_dims, target)
X_test_dims, y_test_dims = split_Xy(test_dims, target)

X_train_dims.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,86400,68.5,13926,,150.0,142.0,315.0,87.0,19.0,...,,,,,,,,,,
1,2987001,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,,...,,,,,,,,,,
2,2987002,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,...,,,,,,,,,,
3,2987003,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,,...,,,,,,,,,,
4,2987004,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
y_train_dims.head()

0    0
1    0
2    0
3    0
4    0
Name: isFraud, dtype: int64

## learn LGBM

In [6]:
dtrain_dims = lgbm.Dataset(X_train_dims, y_train_dims)
dtest_dims = lgbm.Dataset(X_test_dims, y_test_dims)

In [7]:
%%time

params = {
    'device_type': 'gpu',
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'max_depth': 1,
    'n_estimators': 200,
    'seed': 27
}

lgbm_model = lgbm.train(
    params=params,
    train_set=dtrain_dims,
    num_boost_round=256,
    valid_sets=[dtrain_dims, dtest_dims],
    early_stopping_rounds=15,
    verbose_eval=10
)

print('training is over')



[LightGBM] [Info] Number of positive: 5141, number of negative: 174859
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32181
[LightGBM] [Info] Number of data points in the train set: 180000, number of used features: 377
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1050 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 204 dense feature groups (35.02 MB) transferred to GPU in 0.036272 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028561 -> initscore=-3.526732
[LightGBM] [Info] Start training from score -3.526732
Training until validation scores don't improve for 15 rounds
[10]	training's auc: 0.809558	valid_1's auc: 0.777755
[20]	training's auc: 0.81557	valid_1's auc: 0.783451
[30]	training's auc: 0.839704	valid_1's auc: 0.819893
[40]	training's auc: 0.843236	valid_1'

## cross validation

In [8]:
%%time
lgbm_cv = lgbm.cv(
    params=params,
    train_set=dtrain_dims,
    num_boost_round=200,
    stratified=True,
    metrics="auc",
    shuffle=True,
    nfold=3, 
)
lgbm_cv = pd.DataFrame(lgbm_cv)
lgbm_cv.to_csv('./lgbm_cv_dim.csv', index=False)
lgbm_cv.tail()



[LightGBM] [Info] Number of positive: 3428, number of negative: 116572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32181
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 377
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1050 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 204 dense feature groups (23.35 MB) transferred to GPU in 0.025395 secs. 1 sparse feature groups
[LightGBM] [Info] Number of positive: 3427, number of negative: 116573
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32181
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 377
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1050 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...

Unnamed: 0,auc-mean,auc-stdv
195,0.86679,0.003855
196,0.866807,0.003856
197,0.86687,0.003898
198,0.866888,0.003881
199,0.866995,0.004003


# Задание 5: обработать категориальные признаки любым способом (который вы знаете) и добавить их к данным. Выполнить задание 4.

## recategorical

In [9]:
recat_train = train.copy()
recat_test = test.copy()

def recat(df, column, values):
    return df[column].map({v: i for i, v in enumerate(values)})

cat_columns = [col for col in train.columns if train[col].dtypes == 'object']
for col in cat_columns:
    values = train[col].unique()
    
    recat_train[col] = recat(recat_train, col, values)
    recat_test[col] = recat(recat_test, col, values)
    
recat_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(18)
memory usage: 541.1 MB


## split on XY

In [10]:
X_train_cat, y_train_cat = split_Xy(recat_train, target)
X_test_cat, y_test_cat = split_Xy(recat_test, target)

X_train_cat.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,86400,68.5,0,13926,,150.0,0,142.0,0,...,,,,,,,,,,
1,2987001,86401,29.0,0,2755,404.0,150.0,1,102.0,0,...,,,,,,,,,,
2,2987002,86469,59.0,0,4663,490.0,150.0,2,166.0,1,...,,,,,,,,,,
3,2987003,86499,50.0,0,18132,567.0,150.0,1,117.0,1,...,,,,,,,,,,
4,2987004,86506,50.0,1,4497,514.0,150.0,1,102.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## learn LGBM

In [11]:
dtrain_cat = lgbm.Dataset(X_train_cat, y_train_cat)
dtest_cat = lgbm.Dataset(X_test_cat, y_test_cat)

In [12]:
%%time

params = {
    'device_type': 'gpu',
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'max_depth': 1,
    'n_estimators': 200,
    'seed': 27
}

lgbm_model = lgbm.train(
    params=params,
    train_set=dtrain_cat,
    num_boost_round=256,
    valid_sets=[dtrain_cat, dtest_cat],
    early_stopping_rounds=15,
    verbose_eval=10
)

print('training is over')



[LightGBM] [Info] Number of positive: 5141, number of negative: 174859
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32345
[LightGBM] [Info] Number of data points in the train set: 180000, number of used features: 391
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1050 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 216 dense feature groups (37.08 MB) transferred to GPU in 0.038296 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028561 -> initscore=-3.526732
[LightGBM] [Info] Start training from score -3.526732
Training until validation scores don't improve for 15 rounds
[10]	training's auc: 0.809558	valid_1's auc: 0.777755
[20]	training's auc: 0.831962	valid_1's auc: 0.806346
[30]	training's auc: 0.840355	valid_1's auc: 0.821026
[40]	training's auc: 0.846322	valid_1

## cross validation

In [13]:
%%time
lgbm_cv = lgbm.cv(
    params=params,
    train_set=dtrain_cat,
    num_boost_round=200,
    stratified=True,
    metrics="auc",
    shuffle=True,
    nfold=3, 
)
lgbm_cv = pd.DataFrame(lgbm_cv)
lgbm_cv.to_csv('./lgbm_cv_cat.csv', index=False)
lgbm_cv.tail()



[LightGBM] [Info] Number of positive: 3428, number of negative: 116572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32345
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 391
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1050 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 216 dense feature groups (24.72 MB) transferred to GPU in 0.025570 secs. 1 sparse feature groups
[LightGBM] [Info] Number of positive: 3427, number of negative: 116573
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32345
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 391
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1050 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...

Unnamed: 0,auc-mean,auc-stdv
195,0.873141,0.005768
196,0.873201,0.005677
197,0.873331,0.005624
198,0.873314,0.005596
199,0.873331,0.005706


# Задание 6: обработать категориальные признаки встроенным методом в LightGBM. Выполнить задание 4. Сделать выводы о качестве работы алгоритма, по сравнению с пунктом 5.

## learn LGBM

In [14]:
dtrain_cat = lgbm.Dataset(X_train_cat, y_train_cat, categorical_feature=cat_columns)
dtest_cat = lgbm.Dataset(X_test_cat, y_test_cat, categorical_feature=cat_columns)

In [15]:
%%time

params = {
    'device_type': 'gpu',
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
    'max_depth': 1,
    'n_estimators': 400,
    'seed': 27
}

lgbm_model = lgbm.train(
    params=params,
    train_set=dtrain_cat,
    num_boost_round=256,
    valid_sets=[dtrain_cat, dtest_cat],
    early_stopping_rounds=15,
    categorical_feature=cat_columns,
    verbose_eval=20
)

print('training is over')



[LightGBM] [Info] Number of positive: 5141, number of negative: 174859
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32354
[LightGBM] [Info] Number of data points in the train set: 180000, number of used features: 391
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1050 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 216 dense feature groups (37.08 MB) transferred to GPU in 0.038147 secs. 1 sparse feature groups




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028561 -> initscore=-3.526732
[LightGBM] [Info] Start training from score -3.526732
Training until validation scores don't improve for 15 rounds
[20]	training's auc: 0.833094	valid_1's auc: 0.804783
[40]	training's auc: 0.850192	valid_1's auc: 0.825076
[60]	training's auc: 0.858872	valid_1's auc: 0.831772
[80]	training's auc: 0.866464	valid_1's auc: 0.838658
[100]	training's auc: 0.870458	valid_1's auc: 0.843
[120]	training's auc: 0.873937	valid_1's auc: 0.845384
[140]	training's auc: 0.876416	valid_1's auc: 0.846993
[160]	training's auc: 0.878346	valid_1's auc: 0.847465
[180]	training's auc: 0.879482	valid_1's auc: 0.84831
[200]	training's auc: 0.88121	valid_1's auc: 0.849867
[220]	training's auc: 0.882646	valid_1's auc: 0.85024
[240]	training's auc: 0.884091	valid_1's auc: 0.851519
[260]	training's auc: 0.885317	valid_1's auc: 0.852258
[280]	training's auc: 0.886235	valid_1's auc: 0.852198
[300]	training's auc: 0.887113	valid_1's auc:

## cross validation

In [16]:
%%time
lgbm_cv = lgbm.cv(
    params=params,
    train_set=dtrain_cat,
    categorical_feature=cat_columns,
    num_boost_round=200,
    stratified=True,
    metrics="auc",
    shuffle=True,
    nfold=3, 
)
lgbm_cv = pd.DataFrame(lgbm_cv)
lgbm_cv.to_csv('./lgbm_cv_cat_def.csv', index=False)
lgbm_cv.tail()



[LightGBM] [Info] Number of positive: 3428, number of negative: 116572
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32354
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 391
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1050 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 216 dense feature groups (24.72 MB) transferred to GPU in 0.026313 secs. 1 sparse feature groups
[LightGBM] [Info] Number of positive: 3427, number of negative: 116573
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32354
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 391
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1050 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...

Unnamed: 0,auc-mean,auc-stdv
395,0.884764,0.00476
396,0.884842,0.004787
397,0.884897,0.004775
398,0.884927,0.004754
399,0.884904,0.004731
