## libs

In [1]:
%%time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import catboost as cb

%matplotlib inline

Wall time: 24.1 s


## get data

In [2]:
%%time
PATH_TO_TRAIN = '../data/assignment_2_train.csv'
PATH_TO_TEST = '../data/assignment_2_test.csv'

train = pd.read_csv(PATH_TO_TRAIN)
test = pd.read_csv(PATH_TO_TEST)

Wall time: 8.4 s


# Задание 7: для числовых признаков обучить модель CatBoost. Обучать алгоритм до тех пор, пока метрика качества не перестанет улучшаться на валидационной выборке в течение определенного числа итераций (выбрать значение самостоятельно).

## get dims

In [3]:
def only_dims(df):
    return df[[col for col in df.columns if df[col].dtypes != 'object']]

train_dims = only_dims(train)
test_dims = only_dims(test)

train_dims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Columns: 380 entries, TransactionID to V339
dtypes: float64(376), int64(4)
memory usage: 521.9 MB


## split on XY

In [4]:
def split_Xy(df, target):
    return df.drop(target, axis=1), df[target]

target = 'isFraud'

X_train_dims, y_train_dims = split_Xy(train_dims, target)
X_test_dims, y_test_dims = split_Xy(test_dims, target)

X_train_dims.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,86400,68.5,13926,,150.0,142.0,315.0,87.0,19.0,...,,,,,,,,,,
1,2987001,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,,...,,,,,,,,,,
2,2987002,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,...,,,,,,,,,,
3,2987003,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,,...,,,,,,,,,,
4,2987004,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
y_train_dims.head()

0    0
1    0
2    0
3    0
4    0
Name: isFraud, dtype: int64

## learn CatBoost

In [6]:
dtrain_dims = cb.Pool(X_train_dims, y_train_dims)
dtest_dims = cb.Pool(X_test_dims, y_test_dims)

In [7]:
%%time
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "GPU",
    "n_estimators": 400,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 15,
    "verbose": 20,
    "random_seed": 42
}

cb_model = cb.CatBoostClassifier(**params)
cb_model.fit(dtrain_dims, eval_set=dtest_dims)

0:	learn: 0.7130944	test: 0.6615220	best: 0.6615220 (0)	total: 39.9ms	remaining: 15.9s
20:	learn: 0.8309112	test: 0.7857408	best: 0.7962039 (19)	total: 553ms	remaining: 9.98s
40:	learn: 0.8447917	test: 0.8086889	best: 0.8086889 (40)	total: 1.06s	remaining: 9.29s
60:	learn: 0.8546170	test: 0.8130772	best: 0.8157296 (56)	total: 1.58s	remaining: 8.78s
80:	learn: 0.8588886	test: 0.8158995	best: 0.8164986 (78)	total: 2.09s	remaining: 8.23s
100:	learn: 0.8626809	test: 0.8235016	best: 0.8235016 (100)	total: 2.66s	remaining: 7.87s
120:	learn: 0.8688363	test: 0.8312745	best: 0.8313245 (118)	total: 3.25s	remaining: 7.49s
140:	learn: 0.8728369	test: 0.8388864	best: 0.8389271 (139)	total: 3.81s	remaining: 6.99s
160:	learn: 0.8763913	test: 0.8409466	best: 0.8410220 (157)	total: 4.39s	remaining: 6.52s
180:	learn: 0.8789871	test: 0.8433670	best: 0.8436124 (171)	total: 4.89s	remaining: 5.92s
200:	learn: 0.8816954	test: 0.8465962	best: 0.8465962 (200)	total: 5.44s	remaining: 5.39s
220:	learn: 0.8829079

<catboost.core.CatBoostClassifier at 0x24512b44df0>

## cross validation

In [8]:
cv_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "n_estimators": 400,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "thread_count": 4,
    "verbose": 20,
    "random_seed": 42
}

cb_cv = cb.cv(
    pool=dtrain_dims,
    params=cv_params,
    num_boost_round=200,
    stratified=True,
    shuffle=True,
    nfold=3,
    verbose=50,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]
0:	test: 0.6298123	best: 0.6298123 (0)	total: 220ms	remaining: 43.8s
50:	test: 0.8511488	best: 0.8513536 (47)	total: 5.27s	remaining: 15.4s
100:	test: 0.8653577	best: 0.8654726 (99)	total: 10.4s	remaining: 10.2s
150:	test: 0.8739284	best: 0.8739284 (150)	total: 15.7s	remaining: 5.09s
199:	test: 0.8803966	best: 0.8803966 (199)	total: 21.1s	remaining: 0us

bestTest = 0.8803966127
bestIteration = 199

Training on fold [1/3]
0:	test: 0.6206148	best: 0.6206148 (0)	total: 89.4ms	remaining: 17.8s
50:	test: 0.8462812	best: 0.8464393 (49)	total: 5.02s	remaining: 14.7s
100:	test: 0.8549029	best: 0.8549029 (100)	total: 10.1s	remaining: 9.9s
150:	test: 0.8636858	best: 0.8636858 (150)	total: 15.4s	remaining: 5s
199:	test: 0.8697674	best: 0.8697674 (199)	total: 20.6s	remaining: 0us

bestTest = 0.8697674027
bestIteration = 199

Training on fold [2/3]
0:	test: 0.6362556	best: 0.6362556 (0)	total: 106ms	remaining: 21.1s
50:	test: 0.8481309	best: 0.8487363 (49)	total: 5.15s	remain

In [9]:
cb_cv = pd.DataFrame(cb_cv)
cb_cv.to_csv('./cb_cv_dim.csv', index=False)
cb_cv.tail()

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
195,195,0.874372,0.005485,0.084062,0.001629,0.082706,0.000198
196,196,0.874464,0.005367,0.084021,0.001595,0.082662,0.000209
197,197,0.874517,0.005389,0.083973,0.00158,0.082606,0.00022
198,198,0.874797,0.005347,0.083898,0.001583,0.082528,0.000222
199,199,0.874916,0.005322,0.083852,0.001582,0.082475,0.000236


# Задание 8: обработать категориальные признаки любым способом (который вы знаете) и добавить их к данным. Выполнить задание 7.

## recategorical

In [10]:
recat_train = train.copy()
recat_test = test.copy()

def recat(df, column, values):
    return df[column].map({v: i for i, v in enumerate(values)})

cat_columns = [col for col in train.columns if train[col].dtypes == 'object']
for col in cat_columns:
    values = train[col].unique()
    
    recat_train[col] = recat(recat_train, col, values)
    recat_test[col] = recat(recat_test, col, values)
    
recat_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(18)
memory usage: 541.1 MB


## split on XY

In [11]:
X_train_cat, y_train_cat = split_Xy(recat_train, target)
X_test_cat, y_test_cat = split_Xy(recat_test, target)

X_train_cat.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,86400,68.5,0,13926,,150.0,0,142.0,0,...,,,,,,,,,,
1,2987001,86401,29.0,0,2755,404.0,150.0,1,102.0,0,...,,,,,,,,,,
2,2987002,86469,59.0,0,4663,490.0,150.0,2,166.0,1,...,,,,,,,,,,
3,2987003,86499,50.0,0,18132,567.0,150.0,1,117.0,1,...,,,,,,,,,,
4,2987004,86506,50.0,1,4497,514.0,150.0,1,102.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## learn CatBoost

In [12]:
dtrain_cat = cb.Pool(X_train_cat, y_train_cat)
dtest_cat = cb.Pool(X_test_cat, y_test_cat)

In [13]:
%%time
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "GPU",
    "n_estimators": 400,
    "max_depth": 4,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 15,
    "verbose": 20,
    "random_seed": 42
}

cb_model = cb.CatBoostClassifier(**params)
cb_model.fit(dtrain_cat, eval_set=dtest_cat)

0:	learn: 0.6903871	test: 0.6704021	best: 0.6704021 (0)	total: 20.7ms	remaining: 8.26s
20:	learn: 0.8333461	test: 0.7797557	best: 0.7886033 (17)	total: 444ms	remaining: 8.02s
40:	learn: 0.8460597	test: 0.8126695	best: 0.8126695 (40)	total: 952ms	remaining: 8.34s
60:	learn: 0.8499425	test: 0.8162133	best: 0.8165749 (54)	total: 1.39s	remaining: 7.72s
80:	learn: 0.8529387	test: 0.8238353	best: 0.8242200 (78)	total: 1.81s	remaining: 7.13s
100:	learn: 0.8545697	test: 0.8271117	best: 0.8271117 (100)	total: 2.23s	remaining: 6.62s
120:	learn: 0.8605835	test: 0.8344786	best: 0.8344786 (120)	total: 2.66s	remaining: 6.13s
140:	learn: 0.8645236	test: 0.8384119	best: 0.8384119 (140)	total: 3.09s	remaining: 5.68s
160:	learn: 0.8682649	test: 0.8419050	best: 0.8419050 (160)	total: 3.56s	remaining: 5.29s
180:	learn: 0.8719263	test: 0.8452503	best: 0.8452503 (180)	total: 4.04s	remaining: 4.88s
200:	learn: 0.8743590	test: 0.8464442	best: 0.8465344 (199)	total: 4.48s	remaining: 4.43s
220:	learn: 0.8761168

<catboost.core.CatBoostClassifier at 0x24513973b50>

## cross validation

In [14]:
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "n_estimators": 400,
    "max_depth": 4,
    "l2_leaf_reg": 100,
    "thread_count": 4,
    "early_stopping_rounds": 15,
    "verbose": 20,
    "random_seed": 42
}

cb_cv = cb.cv(
    pool=dtrain_cat,
    params=cv_params,
    num_boost_round=200,
    early_stopping_rounds=15,
    stratified=True,
    shuffle=True,
    nfold=3,
    verbose=50,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]
0:	test: 0.6709044	best: 0.6709044 (0)	total: 124ms	remaining: 24.7s
50:	test: 0.8597469	best: 0.8597469 (50)	total: 5.58s	remaining: 16.3s
100:	test: 0.8713740	best: 0.8713740 (100)	total: 11s	remaining: 10.8s
150:	test: 0.8792266	best: 0.8792266 (150)	total: 16.5s	remaining: 5.35s
199:	test: 0.8853150	best: 0.8853150 (199)	total: 22s	remaining: 0us

bestTest = 0.8853149934
bestIteration = 199

Training on fold [1/3]
0:	test: 0.7024415	best: 0.7024415 (0)	total: 103ms	remaining: 20.5s
50:	test: 0.8474275	best: 0.8475183 (48)	total: 5.04s	remaining: 14.7s
100:	test: 0.8571049	best: 0.8572394 (99)	total: 10.4s	remaining: 10.2s
150:	test: 0.8663951	best: 0.8663951 (150)	total: 15.9s	remaining: 5.16s
199:	test: 0.8746225	best: 0.8746225 (199)	total: 21.4s	remaining: 0us

bestTest = 0.8746225258
bestIteration = 199

Training on fold [2/3]
0:	test: 0.7202396	best: 0.7202396 (0)	total: 108ms	remaining: 21.4s
50:	test: 0.8451825	best: 0.8451825 (50)	total: 5.29s	remaini

In [15]:
cb_cv = pd.DataFrame(cb_cv)
cb_cv.to_csv('./cb_cv_cat.csv', index=False)
cb_cv.tail()

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
195,195,0.879562,0.005474,0.082927,0.001522,0.081444,0.000201
196,196,0.879566,0.005474,0.082902,0.001524,0.081416,0.000211
197,197,0.879728,0.005471,0.082809,0.001509,0.081318,0.000207
198,198,0.879946,0.005407,0.08272,0.001499,0.081226,0.000199
199,199,0.88014,0.005354,0.082665,0.001501,0.081162,0.000213


# Задание 9: обработать категориальные признаки встроенным методом в CatBoost. Выполнить задание 7. Сделать выводы о качестве работы алгоритма, по сравнению с пунктом 8.

## split on XY

In [16]:
X_train, y_train = split_Xy(train, target)
X_test, y_test = split_Xy(test, target)

X_train.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,2987001,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,2987002,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,2987003,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,2987004,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## learn CatBoost

In [17]:
dtrain_cat = cb.Pool(X_train_cat, y_train_cat, cat_features=cat_columns)
dtest_cat = cb.Pool(X_test_cat, y_test_cat, cat_features=cat_columns)

In [18]:
%%time
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "GPU",
    "n_estimators": 600,
    "max_depth": 5,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 15,
    "verbose": 20,
    "random_seed": 42
}

cb_model = cb.CatBoostClassifier(**params)
cb_model.fit(dtrain_cat, eval_set=dtest_cat)

0:	learn: 0.7300735	test: 0.7142833	best: 0.7142833 (0)	total: 201ms	remaining: 2m
20:	learn: 0.8248771	test: 0.7855597	best: 0.7857865 (16)	total: 2.04s	remaining: 56.3s
40:	learn: 0.8471214	test: 0.8217251	best: 0.8217251 (40)	total: 3.93s	remaining: 53.5s
60:	learn: 0.8548233	test: 0.8283790	best: 0.8285746 (59)	total: 5.76s	remaining: 50.9s
80:	learn: 0.8625514	test: 0.8333373	best: 0.8333373 (80)	total: 7.62s	remaining: 48.8s
100:	learn: 0.8655766	test: 0.8359495	best: 0.8359635 (99)	total: 9.47s	remaining: 46.8s
120:	learn: 0.8693514	test: 0.8383961	best: 0.8383961 (120)	total: 11.3s	remaining: 44.8s
140:	learn: 0.8732454	test: 0.8417272	best: 0.8417662 (139)	total: 13.1s	remaining: 42.8s
160:	learn: 0.8766034	test: 0.8443830	best: 0.8443953 (159)	total: 14.9s	remaining: 40.8s
180:	learn: 0.8795185	test: 0.8476391	best: 0.8476391 (180)	total: 16.8s	remaining: 38.8s
200:	learn: 0.8814993	test: 0.8492598	best: 0.8492598 (200)	total: 18.6s	remaining: 37s
220:	learn: 0.8842929	test: 

<catboost.core.CatBoostClassifier at 0x245149de100>

## cross validation

In [19]:
params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "n_estimators": 600,
    "max_depth": 5,
    "l2_leaf_reg": 100,
    "thread_count": 4,
    "early_stopping_rounds": 15,
    "verbose": 20,
    "random_seed": 42
}

cb_cv = cb.cv(
    pool=dtrain_cat,
    params=cv_params,
    num_boost_round=200,
    early_stopping_rounds=15,
    stratified=True,
    shuffle=True,
    nfold=3,
    verbose=50,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]
0:	test: 0.5852800	best: 0.5852800 (0)	total: 484ms	remaining: 1m 36s
50:	test: 0.8534595	best: 0.8540414 (49)	total: 26.6s	remaining: 1m 17s
100:	test: 0.8643172	best: 0.8643172 (100)	total: 49.4s	remaining: 48.4s
150:	test: 0.8771657	best: 0.8771666 (149)	total: 1m 12s	remaining: 23.4s
199:	test: 0.8856186	best: 0.8856186 (199)	total: 1m 33s	remaining: 0us

bestTest = 0.8856185951
bestIteration = 199

Training on fold [1/3]
0:	test: 0.6577547	best: 0.6577547 (0)	total: 808ms	remaining: 2m 40s
50:	test: 0.8414096	best: 0.8416302 (48)	total: 23s	remaining: 1m 7s
100:	test: 0.8576713	best: 0.8576713 (100)	total: 47.2s	remaining: 46.2s
150:	test: 0.8659892	best: 0.8659892 (150)	total: 1m 9s	remaining: 22.5s
199:	test: 0.8730492	best: 0.8730492 (199)	total: 1m 31s	remaining: 0us

bestTest = 0.8730491672
bestIteration = 199

Training on fold [2/3]
0:	test: 0.6682285	best: 0.6682285 (0)	total: 428ms	remaining: 1m 25s
50:	test: 0.8464366	best: 0.8464366 (50)	total: 23s

In [20]:
cb_cv = pd.DataFrame(cb_cv)
cb_cv.to_csv('./cb_cv_cat_def.csv', index=False)
cb_cv.tail()

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
195,195,0.878976,0.006338,0.083236,0.001386,0.081862,0.000263
196,196,0.879107,0.006352,0.08319,0.001395,0.081809,0.000255
197,197,0.879171,0.006348,0.083142,0.001402,0.081756,0.000257
198,198,0.879389,0.006355,0.083067,0.001381,0.081676,0.000274
199,199,0.879498,0.006291,0.083008,0.001375,0.081613,0.000277
