## libs

In [1]:
%%time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import xgboost as xgb

%matplotlib inline

Wall time: 9.65 s


## get data

In [2]:
%%time
# Locations of the assignment CSV files, relative to the notebook directory.
PATH_TO_TRAIN = '../data/assignment_2_train.csv'
PATH_TO_TEST = '../data/assignment_2_test.csv'

# Read both files with pandas' default CSV settings (train first, then test).
# `train` and `test` are the raw frames that every later cell starts from.
train, test = (pd.read_csv(csv_path) for csv_path in (PATH_TO_TRAIN, PATH_TO_TEST))

Wall time: 8.3 s


# Задание 1: отобрать только числовые признаки и обучить модель XGBoost с параметром booster = gbtree. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

## select numeric features

In [3]:
def only_dims(df):
    """Return only the numeric (and boolean) columns of ``df``.

    The previous implementation kept every column whose dtype was not
    ``object``.  That also let through datetime, categorical and pandas
    string-dtype columns, none of which are numeric features, and it failed
    on frames with duplicated column labels.  ``select_dtypes`` picks columns
    by dtype family instead; for a frame made only of float/int/object
    columns the result is the same set of columns in the same order.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the numeric and boolean columns of ``df`` in
        their original order.
    """
    # 'bool' is listed explicitly because 'number' alone does not cover it,
    # while the old "not object" filter did keep boolean columns.
    return df.select_dtypes(include=['number', 'bool'])

# Reduce both raw frames to the columns `only_dims` keeps, then print a
# summary of the training subset as a quick sanity check.
train_dims, test_dims = map(only_dims, (train, test))

train_dims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Columns: 380 entries, TransactionID to V339
dtypes: float64(376), int64(4)
memory usage: 521.9 MB


## split on XY

In [4]:
def split_Xy(df, target):
    """Split a frame into a feature matrix and a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the ``target`` column; it is left untouched.
    target : str
        Label of the column to separate out.

    Returns
    -------
    tuple
        ``(features, labels)``: ``df`` without the ``target`` column, and
        the ``target`` column itself.
    """
    features = df.drop(columns=target)
    labels = df[target]
    return features, labels

# Name of the label column; it is dropped from the features and used as y.
target = 'isFraud'

# Feature matrix / label vector for the numeric-only train and test frames.
# NOTE(review): split_Xy removes only `target`, so identifier-like columns
# such as TransactionID remain in the features; confirm that is intended.
(X_train_dims, y_train_dims), (X_test_dims, y_test_dims) = (
    split_Xy(frame, target) for frame in (train_dims, test_dims)
)

X_train_dims.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,86400,68.5,13926,,150.0,142.0,315.0,87.0,19.0,...,,,,,,,,,,
1,2987001,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,,...,,,,,,,,,,
2,2987002,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,...,,,,,,,,,,
3,2987003,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,,...,,,,,,,,,,
4,2987004,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Preview the first few target labels of the training split.
y_train_dims.head()

0    0
1    0
2    0
3    0
4    0
Name: isFraud, dtype: int64

## learn XGBoost

In [6]:
# Wrap the numeric-only features and labels in XGBoost's DMatrix container,
# which is the input type that xgb.train / xgb.cv are given below.
dtrain_dims = xgb.DMatrix(data=X_train_dims, label=y_train_dims)
dtest_dims = xgb.DMatrix(data=X_test_dims, label=y_test_dims)

In [7]:
%%time
# Booster configuration for the numeric-only model.
# `nthread` and `gpu_id` are not set, so the library defaults apply.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; newer XGBoost
# releases reportedly replace it with tree_method='hist' plus device='cuda'.
# Confirm against the installed version.
params = dict(
    booster='gbtree',
    objective='binary:logistic',  # binary cross-entropy (log loss)
    eval_metric='auc',  # ROC AUC
    learning_rate=0.1,
    num_parallel_tree=1,
    reg_lambda=100,
    max_depth=5,
    gamma=10,
    seed=27,
    tree_method='gpu_hist',
)

# Evaluation sets reported during training. Per the XGBoost docs, the last
# entry ('val') is the one early stopping monitors.
# NOTE(review): 'val' is built from the test CSV, so early stopping is tuned
# on the same data that serves as the hold-out; confirm this is intended.
evals_dims = [(dtrain_dims, 'train'), (dtest_dims, 'val')]

# Train for at most 256 rounds, stopping once the monitored AUC has not
# improved for 15 consecutive rounds; metrics are logged every 10 rounds.
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain_dims,
    evals=evals_dims,
    num_boost_round=256,
    early_stopping_rounds=15,
    maximize=True,
    verbose_eval=10,
)

print('training is over')

[0]	train-auc:0.65490	val-auc:0.62750
[10]	train-auc:0.80375	val-auc:0.76306
[20]	train-auc:0.84216	val-auc:0.81407
[30]	train-auc:0.86807	val-auc:0.84586
[40]	train-auc:0.88013	val-auc:0.85114
[50]	train-auc:0.88825	val-auc:0.85238
[60]	train-auc:0.89368	val-auc:0.85492
[70]	train-auc:0.89820	val-auc:0.85532
[80]	train-auc:0.90108	val-auc:0.85465
[90]	train-auc:0.90425	val-auc:0.85577
[100]	train-auc:0.90633	val-auc:0.85753
[110]	train-auc:0.90819	val-auc:0.85775
[120]	train-auc:0.90819	val-auc:0.85775
training is over
Wall time: 6.25 s


## cross validation

In [8]:
%%time
# 3-fold stratified, shuffled cross-validation on the numeric-only training
# data, scored with AUC and reusing the booster `params` defined above.
# NOTE(review): no early_stopping_rounds is passed here, so all 200 rounds
# are evaluated even after the metric stops changing.
xgb_cv = xgb.cv(
    params=params,
    dtrain=dtrain_dims,
    nfold=3,
    stratified=True,
    shuffle=True,
    num_boost_round=200,
    metrics="auc",
    maximize=True,
)

# Persist the per-round CV metrics, then show the final rounds.
xgb_cv.to_csv('./xgb_cv_dim.csv', index=False)
xgb_cv.tail()

Wall time: 17.7 s


Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
195,0.902551,0.001605,0.891821,0.001126
196,0.902551,0.001605,0.891821,0.001126
197,0.902551,0.001605,0.891821,0.001126
198,0.902551,0.001605,0.891821,0.001126
199,0.902551,0.001605,0.891821,0.001126


# Задание 2: обработать категориальные признаки любым способом (который вы знаете) и добавить их к данным. Выполнить задание 1.

## encode categorical features

In [9]:
# Work on copies so the raw `train` / `test` frames stay unchanged while the
# categorical columns are re-encoded below.
recat_train, recat_test = (frame.copy() for frame in (train, test))

def recat(df, column, values):
    """Encode one column as integer codes taken from ``values``.

    Each entry of ``values`` is assigned its position in the sequence
    (0, 1, 2, ...); if an entry repeats, its last position wins.  Column
    entries that are not found in the mapping come back as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``; it is not modified.
    column : str
        Label of the column to encode.
    values : iterable
        Categories, in the order that defines their codes.

    Returns
    -------
    pandas.Series
        The encoded column, aligned with ``df``'s index.
    """
    series = df[column]
    codes = {}
    for position, value in enumerate(values):
        codes[value] = position
    return series.map(codes)

# Columns stored with the generic `object` dtype are treated as categorical.
cat_columns = train.dtypes[train.dtypes == 'object'].index.tolist()

for col in cat_columns:
    # The code book always comes from the training data, so train and test
    # share one encoding. Test values absent from it are not in the mapping.
    values = train[col].unique()
    for frame in (recat_train, recat_test):
        frame[col] = recat(frame, col, values)

recat_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(18)
memory usage: 541.1 MB


## split on XY

In [10]:
# Feature matrix / label vector for the frames that now include the encoded
# categorical columns.
(X_train_cat, y_train_cat), (X_test_cat, y_test_cat) = (
    split_Xy(frame, target) for frame in (recat_train, recat_test)
)

X_train_cat.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,86400,68.5,0,13926,,150.0,0,142.0,0,...,,,,,,,,,,
1,2987001,86401,29.0,0,2755,404.0,150.0,1,102.0,0,...,,,,,,,,,,
2,2987002,86469,59.0,0,4663,490.0,150.0,2,166.0,1,...,,,,,,,,,,
3,2987003,86499,50.0,0,18132,567.0,150.0,1,117.0,1,...,,,,,,,,,,
4,2987004,86506,50.0,1,4497,514.0,150.0,1,102.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## learn XGBoost

In [11]:
# DMatrix containers for the numeric plus encoded-categorical feature set.
dtrain_cat = xgb.DMatrix(data=X_train_cat, label=y_train_cat)
dtest_cat = xgb.DMatrix(data=X_test_cat, label=y_test_cat)

In [12]:
%%time
# Booster configuration for the model that also sees the encoded categorical
# columns. Compared with the numeric-only run, only max_depth differs
# (3 here instead of 5).
# `nthread` and `gpu_id` are not set, so the library defaults apply.
# NOTE(review): this rebinds `params`; re-running an earlier cell that reads
# `params` afterwards would pick up these values.
params = dict(
    booster='gbtree',
    objective='binary:logistic',  # binary cross-entropy (log loss)
    eval_metric='auc',  # ROC AUC
    learning_rate=0.1,
    num_parallel_tree=1,
    reg_lambda=100,
    max_depth=3,
    gamma=10,
    seed=27,
    tree_method='gpu_hist',
)

# Evaluation sets reported during training. Per the XGBoost docs, the last
# entry ('val') is the one early stopping monitors.
# NOTE(review): 'val' is built from the test CSV; confirm this is intended.
evals_cat = [(dtrain_cat, 'train'), (dtest_cat, 'val')]

# Train for at most 512 rounds, stopping once the monitored AUC has not
# improved for 15 consecutive rounds; metrics are logged every 10 rounds.
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain_cat,
    evals=evals_cat,
    num_boost_round=512,
    early_stopping_rounds=15,
    maximize=True,
    verbose_eval=10,
)

print('training is over')

[0]	train-auc:0.60979	val-auc:0.60329
[10]	train-auc:0.75591	val-auc:0.71480
[20]	train-auc:0.81699	val-auc:0.77844
[30]	train-auc:0.85326	val-auc:0.83522
[40]	train-auc:0.86211	val-auc:0.84177
[50]	train-auc:0.87218	val-auc:0.85146
[60]	train-auc:0.88121	val-auc:0.85673
[70]	train-auc:0.88666	val-auc:0.85879
[80]	train-auc:0.89014	val-auc:0.85954
[90]	train-auc:0.89380	val-auc:0.86013
[100]	train-auc:0.89657	val-auc:0.86088
[110]	train-auc:0.89935	val-auc:0.86110
[120]	train-auc:0.90124	val-auc:0.86157
[130]	train-auc:0.90273	val-auc:0.86250
[140]	train-auc:0.90423	val-auc:0.86316
[150]	train-auc:0.90571	val-auc:0.86324
[157]	train-auc:0.90637	val-auc:0.86339
training is over
Wall time: 6.03 s


## cross validation

In [13]:
%%time
# 3-fold stratified, shuffled cross-validation on the training data that
# includes the encoded categorical columns, scored with AUC and using the
# current `params`.
# NOTE(review): no early_stopping_rounds is passed here, so all 200 rounds
# are evaluated even after the metric stops changing.
xgb_cv = xgb.cv(
    params=params,
    dtrain=dtrain_cat,
    nfold=3,
    stratified=True,
    shuffle=True,
    num_boost_round=200,
    metrics="auc",
    maximize=True,
)

# Persist the per-round CV metrics, then show the final rounds.
xgb_cv.to_csv('./xgb_cv_cat.csv', index=False)
xgb_cv.tail()

Wall time: 17.6 s


Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
195,0.904044,0.001763,0.894988,0.002394
196,0.904044,0.001763,0.894988,0.002394
197,0.904044,0.001763,0.894988,0.002394
198,0.904044,0.001763,0.894988,0.002394
199,0.904044,0.001763,0.894988,0.002394
