## libs

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score

import xgboost as xgb

## load data

In [2]:
%%time
train = pd.read_csv('../data/assignment_2_train.csv')
leaderboard = pd.read_csv('../data/assignment_2_test.csv')

Wall time: 8.6 s


In [3]:
train

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179995,3166995,0,3958217,39.00,W,1877,310.0,150.0,mastercard,224.0,...,,,,,,,,,,
179996,3166996,0,3958237,59.95,W,10075,514.0,150.0,mastercard,224.0,...,,,,,,,,,,
179997,3166997,0,3958241,34.00,W,6053,122.0,150.0,mastercard,195.0,...,,,,,,,,,,
179998,3166998,0,3958260,59.00,W,7726,555.0,150.0,visa,226.0,...,,,,,,,,,,


In [4]:
leaderboard

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3287000,1,7415038,226.000,W,12473,555.0,150.0,visa,226.0,...,,,,,,,,,,
1,3287001,0,7415054,3072.000,W,15651,417.0,150.0,visa,226.0,...,,,,,,,,,,
2,3287002,0,7415081,319.950,W,13844,583.0,150.0,visa,226.0,...,,,,,,,,,,
3,3287003,0,7415111,171.000,W,11556,309.0,150.0,visa,226.0,...,,,,,,,,,,
4,3287004,0,7415112,107.950,W,10985,555.0,150.0,visa,226.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,3386996,0,10091528,368.990,W,13964,496.0,150.0,mastercard,224.0,...,,,,,,,,,,
99997,3386997,0,10091533,445.330,W,10616,583.0,150.0,visa,226.0,...,,,,,,,,,,
99998,3386998,0,10091544,15.226,C,9803,583.0,150.0,visa,226.0,...,,,,,,,,,,
99999,3386999,0,10091549,34.742,C,16062,500.0,185.0,mastercard,137.0,...,,,,,,,,,,


## recategorical

In [5]:
def recat(series, vals):
    return series.map({v: i for i, v in enumerate(vals)})

cat_cols = [col for col, val in (train.dtypes == 'object').items() if val]
for col in cat_cols:
    vals = train[col].unique()
    
    train[col] = recat(train[col], vals)
    leaderboard[col] = recat(leaderboard[col], vals)

In [6]:
train

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.50,0,13926,,150.0,0,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,0,2755,404.0,150.0,1,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,0,4663,490.0,150.0,2,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,0,18132,567.0,150.0,1,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,1,4497,514.0,150.0,1,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179995,3166995,0,3958217,39.00,0,1877,310.0,150.0,1,224.0,...,,,,,,,,,,
179996,3166996,0,3958237,59.95,0,10075,514.0,150.0,1,224.0,...,,,,,,,,,,
179997,3166997,0,3958241,34.00,0,6053,122.0,150.0,1,195.0,...,,,,,,,,,,
179998,3166998,0,3958260,59.00,0,7726,555.0,150.0,2,226.0,...,,,,,,,,,,


In [7]:
leaderboard

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3287000,1,7415038,226.000,0,12473,555.0,150.0,2,226.0,...,,,,,,,,,,
1,3287001,0,7415054,3072.000,0,15651,417.0,150.0,2,226.0,...,,,,,,,,,,
2,3287002,0,7415081,319.950,0,13844,583.0,150.0,2,226.0,...,,,,,,,,,,
3,3287003,0,7415111,171.000,0,11556,309.0,150.0,2,226.0,...,,,,,,,,,,
4,3287004,0,7415112,107.950,0,10985,555.0,150.0,2,226.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,3386996,0,10091528,368.990,0,13964,496.0,150.0,1,224.0,...,,,,,,,,,,
99997,3386997,0,10091533,445.330,0,10616,583.0,150.0,2,226.0,...,,,,,,,,,,
99998,3386998,0,10091544,15.226,2,9803,583.0,150.0,2,226.0,...,,,,,,,,,,
99999,3386999,0,10091549,34.742,2,16062,500.0,185.0,1,137.0,...,,,,,,,,,,


## split LB on x&y

In [8]:
def split_on_Xy(df):
    return df.drop(['isFraud'], axis=1), df['isFraud']

X_leaderboard, y_leaderboard = split_on_Xy(leaderboard)
X_leaderboard.shape, y_leaderboard.shape

((100001, 393), (100001,))

### Задание 1: сделать Hold-Out валидацию с разбиением, размер которого будет адеквтаным, по вашему мнению; разбиение проводить по id-транзакции (TransactionID), обучать модель градиетного бустинга любой реализации с подбором числа деревьев по early_stopping критерию до достижения сходимости. Оценить качество модели на валидационной выборке, оценить расхождение по сравнению с качеством на обучающей выборке и валидационной выборке. Оценить качество на ЛБ, сравнить с качеством на обучении и валидации. Сделать выводы.

## split train on x&y

In [9]:
# если я правильно понял, что значит разбиение по TransactionID, то значит нужно по ней отсортировать df, 
# и на сплите не перемешивать...
train.sort_values(by=['TransactionID'], inplace=True)

X, y = split_on_Xy(train)
X.shape, y.shape

((180000, 393), (180000,))

## split on train & val

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.6, shuffle=False)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((108000, 393), (72000, 393), (108000,), (72000,))

## training model

In [11]:
def get_auc_score(y_true, y_pred):
    return round(roc_auc_score(y_true, y_pred), 4)

In [12]:
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_valid, y_valid)

In [13]:
%%time
model = xgb.train(
    params={
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'gpu_hist',
        
        'max_depth': 7,
        'learning_rate': 0.1,
        'num_parallel_tree': 1,
        'reg_lambda': 40,
        'gamma': 9
    },
    
    dtrain=dtrain,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    verbose_eval=10,
    
    num_boost_round=1000,
    early_stopping_rounds=15,
    maximize=True
)

y_train_pred = model.predict(xgb.DMatrix(X_train))
y_valid_pred = model.predict(xgb.DMatrix(X_valid))

print(f'Overfitting: {round(get_auc_score(y_train, y_train_pred) - get_auc_score(y_valid, y_valid_pred), 2)}')

[0]	train-auc:0.67339	valid-auc:0.65580
[10]	train-auc:0.81202	valid-auc:0.81280
[20]	train-auc:0.86550	valid-auc:0.84761
[30]	train-auc:0.88219	valid-auc:0.85564
[40]	train-auc:0.89860	valid-auc:0.86528
[50]	train-auc:0.90948	valid-auc:0.86975
[60]	train-auc:0.91733	valid-auc:0.87420
[70]	train-auc:0.92284	valid-auc:0.87870
[80]	train-auc:0.92634	valid-auc:0.88094
[90]	train-auc:0.92646	valid-auc:0.88138
[100]	train-auc:0.92646	valid-auc:0.88138
Overfitting: 0.05
Wall time: 9.84 s


## evaluate model

In [14]:
print('train =', get_auc_score(y_train, y_train_pred))
print('val =', get_auc_score(y_valid, y_valid_pred))

train = 0.9265
val = 0.8814


## check LB

In [15]:
y_leaderboard_pred = model.predict(xgb.DMatrix(X_leaderboard))

print('LB =', get_auc_score(y_leaderboard, y_leaderboard_pred))

LB = 0.867


## Вывод:
- не смотря на небольшое переобучение, лидерборд ниже валидационных выборок на 4-6%

### Задание 2: сделать Hold-Out валидацию с разбиением на 3 выборки, разбиение проводить по id-транзакции (TransactionID), размер каждой выборки подобрать самостоятельно. Повторить процедуру из п.1. для каждой выборки.

## split on valid & test

In [16]:
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, train_size=0.6, shuffle=True)
X_valid.shape, X_test.shape, y_valid.shape, y_test.shape

((43200, 393), (28800, 393), (43200,), (28800,))

## training model

In [17]:
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_valid, y_valid)

In [18]:
%%time
model = xgb.train(
    params={
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'gpu_hist',
        
        'max_depth': 6,
        'learning_rate': 0.1,
        'num_parallel_tree': 1,
        'reg_lambda': 20,
        'gamma': 4
    },
    
    dtrain=dtrain,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    verbose_eval=10,
    
    num_boost_round=1000,
    early_stopping_rounds=15,
    maximize=True
)

y_train_pred = model.predict(xgb.DMatrix(X_train))
y_valid_pred = model.predict(xgb.DMatrix(X_valid))

print(f'Overfitting: {round(get_auc_score(y_train, y_train_pred) - get_auc_score(y_valid, y_valid_pred), 2)}')

[0]	train-auc:0.67413	valid-auc:0.65270
[10]	train-auc:0.83181	valid-auc:0.82824
[20]	train-auc:0.86886	valid-auc:0.84907
[30]	train-auc:0.88455	valid-auc:0.85675
[40]	train-auc:0.90571	valid-auc:0.86686
[50]	train-auc:0.91643	valid-auc:0.87016
[60]	train-auc:0.92627	valid-auc:0.87545
[70]	train-auc:0.93162	valid-auc:0.87980
[80]	train-auc:0.93465	valid-auc:0.88191
[90]	train-auc:0.93778	valid-auc:0.88427
[100]	train-auc:0.94069	valid-auc:0.88572
[110]	train-auc:0.94374	valid-auc:0.88679
[120]	train-auc:0.94374	valid-auc:0.88679
[124]	train-auc:0.94374	valid-auc:0.88679
Overfitting: 0.06
Wall time: 8.15 s


In [19]:
y_test_pred = model.predict(xgb.DMatrix(X_test))

print('train =', get_auc_score(y_train, y_train_pred))
print('val =', get_auc_score(y_valid, y_valid_pred))
print('test =', get_auc_score(y_test, y_test_pred))

train = 0.9437
val = 0.8868
test = 0.8835


## check LB

In [20]:
y_leaderboard_pred = model.predict(xgb.DMatrix(X_leaderboard))

print('LB =', get_auc_score(y_leaderboard, y_leaderboard_pred))

LB = 0.8679


## Вывод:
- симуляция leaderboard при помощи test выборки - интересная идея, которая позволяет поставить границу между гиперпараметрами и проверочной выборкой, что в теории должно повысить стабильность модели
- були уменьшены такие показатели гиперпараметров как: глубина дерева, l2 и gamma
- итоговая точность модели почти не изменилась

### Задание 3: построить доверительный интервал на данных из п.2 на основе бутстреп выборок, оценить качество модели на ЛБ относительно полученного доверительного интервала. Сделать выводы.

In [21]:
def get_bootstrap_scores(y_true, y_pred, count=1000):
    y_true = y_true if isinstance(y_true, np.ndarray) else y_true.values
    bootstrap_idx = np.random.randint(0, len(y_true), size=(count, len(y_true)))
    return [roc_auc_score(y_true[idx], y_pred[idx]) for idx in bootstrap_idx]

def confidence_interval(scores, conf_interval=0.95):
    left_bound = np.percentile(
        scores, ((1 - conf_interval) / 2) * 100
    )
    right_bound = np.percentile(
        scores, (conf_interval + ((1 - conf_interval) / 2)) * 100
    )
    return left_bound, right_bound

In [22]:
%%time
scores = get_bootstrap_scores(y_train, model.predict(xgb.DMatrix(X_train)))
print(f'train CI:', confidence_interval(scores))

scores = get_bootstrap_scores(y_valid, model.predict(xgb.DMatrix(X_valid)))
print(f'valid CI:', confidence_interval(scores))

scores = get_bootstrap_scores(y_test, model.predict(xgb.DMatrix(X_test)))
print(f'test CI:', confidence_interval(scores))

train CI: (0.9381543382715085, 0.9487726644758472)
valid CI: (0.8760810326189118, 0.8976632869195118)
test CI: (0.8699288015539203, 0.8960274082584037)
Wall time: 49.5 s


## Вывод:
- результат LB лежит ниже левой граници CI во всех выборках, при чем, разница от 1 - 7% - валидация не очень хорошая

### Задание 4: выполнить Adversarial Validation, подобрать объекты из обучающей выборки, которые сильно похожи на объекты из assignment_2_test.csv, и использовать их в качестве валидационного набора. Оценить качество модели на ЛБ, сделать выводы о полученных результатах.

In [23]:
X_adv = pd.concat([X, X_leaderboard], axis=0)
y_adv = np.hstack((
    np.zeros(len(X)), 
    np.ones(len(X_leaderboard))
))

In [24]:
%%time
model_adv = xgb.train(
    params={
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'gpu_hist'
    },
    dtrain=xgb.DMatrix(X_adv, y_adv),
)

Wall time: 5.39 s


In [25]:
y_adv_pred = model_adv.predict(xgb.DMatrix(X_adv))
roc_auc_score(y_adv, y_adv_pred)

0.9999986820131798

In [26]:
y_pred = model_adv.predict(xgb.DMatrix(X_train))

pd.cut(
    y_pred, bins=np.arange(0, 1.01, 0.1)
).value_counts().sort_index()

(0.0, 0.1]    108000
(0.1, 0.2]         0
(0.2, 0.3]         0
(0.3, 0.4]         0
(0.4, 0.5]         0
(0.5, 0.6]         0
(0.6, 0.7]         0
(0.7, 0.8]         0
(0.8, 0.9]         0
(0.9, 1.0]         0
dtype: int64

## Вывод:
- из тестовой выборки нет похожих наблюдений на LB выборку

### Задание 5: сделать KFold / StratifiedKFold валидацию (на ваше усмотрение), оценить получаемые качество и разброс по метрике качества. Сделать выводы об устойчивости кросс-валидации, сходимости оценки на кросс-валидации и отложенном наборе данных; Оценить качество на ЛБ, сделать выводы.

In [27]:
models, fold_train_score, fold_val_score = [], [], []

for train_idx, val_idx in KFold(n_splits=5).split(X, y):
    
    X_train, X_val = X.loc[train_idx], X.loc[val_idx]
    y_train, y_val = y.loc[train_idx], y.loc[val_idx]
    
    dtrain = xgb.DMatrix(X_train, y_train)
    dval = xgb.DMatrix(X_val, y_val)
    
    model = xgb.train(
        params={'booster': 'gbtree',
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'tree_method': 'gpu_hist',
                'max_depth': 3,
                'learning_rate': 0.1,
                'num_parallel_tree': 1,
                'reg_lambda': 100,
                'gamma': 10},
        dtrain=dtrain,
        evals=[(dtrain, 'train'), (dval, 'valid')],
        verbose_eval=50,

        num_boost_round=1000,
        early_stopping_rounds=15,
        maximize=True
    )
    y_train_pred = model.predict(dtrain)
    y_val_pred = model.predict(dval)
    
    score_train = roc_auc_score(y_train, y_train_pred)
    score_val = roc_auc_score(y_val, y_val_pred)
    
    fold_train_score.append(score_train)
    fold_val_score.append(score_val)
    models.append(model)

mean_train_score = np.mean(fold_train_score)
mean_val_score = np.mean(fold_val_score)

print(f'train: {mean_train_score}', 
      f'val: {mean_val_score}',
      f'diff: {mean_train_score - mean_val_score}',
      sep='\n')

[0]	train-auc:0.61928	valid-auc:0.57191
[50]	train-auc:0.87930	valid-auc:0.85258
[100]	train-auc:0.89947	valid-auc:0.87212
[150]	train-auc:0.90727	valid-auc:0.87956
[165]	train-auc:0.90747	valid-auc:0.87953
[0]	train-auc:0.63620	valid-auc:0.66988
[50]	train-auc:0.87312	valid-auc:0.86752
[100]	train-auc:0.89795	valid-auc:0.88578
[150]	train-auc:0.90592	valid-auc:0.89160
[180]	train-auc:0.90766	valid-auc:0.89301
[0]	train-auc:0.61965	valid-auc:0.62534
[50]	train-auc:0.87432	valid-auc:0.87477
[100]	train-auc:0.89415	valid-auc:0.88931
[150]	train-auc:0.90311	valid-auc:0.89938
[173]	train-auc:0.90384	valid-auc:0.90012
[0]	train-auc:0.60888	valid-auc:0.61320
[50]	train-auc:0.87256	valid-auc:0.86257
[93]	train-auc:0.89376	valid-auc:0.87115
[0]	train-auc:0.60356	valid-auc:0.61861
[50]	train-auc:0.87536	valid-auc:0.84840
[100]	train-auc:0.89571	valid-auc:0.87514
[150]	train-auc:0.90393	valid-auc:0.88204
[174]	train-auc:0.90499	valid-auc:0.88250
train: 0.9035415598501985
val: 0.8852617140907526


In [28]:
mean_lb_score = []
for model in models:
    y_leaderboard_pred = model.predict(xgb.DMatrix(X_leaderboard))
    score = get_auc_score(y_leaderboard, y_leaderboard_pred)
    mean_lb_score.append(score)
    print('LB =', get_auc_score(y_leaderboard, y_leaderboard_pred))
    
print('mean LB =', round(np.mean(mean_lb_score), 4))

LB = 0.8606
LB = 0.8616
LB = 0.8617
LB = 0.8592
LB = 0.8635
mean LB = 0.8613
