In [1]:
!pip install catboost

Collecting catboost
  Downloading https://files.pythonhosted.org/packages/03/f6/e179ca7e5a1d2a1b53fa4ea682281e7f815b00481f49b3ec410625779ad7/catboost-0.24.1-cp37-none-win_amd64.whl (65.3MB)
Collecting graphviz (from catboost)
  Downloading https://files.pythonhosted.org/packages/62/dc/9dd6a6b9b8977248e165e075b109eea6e8eac71faa28ca378c3d98e54fbe/graphviz-0.14.1-py2.py3-none-any.whl
Collecting pandas>=0.24.0 (from catboost)
  Downloading https://files.pythonhosted.org/packages/c5/16/07da3435a161ae411eef63d6c5edcf9fd11a8a11e94f60d259693b7e0804/pandas-1.1.2-cp37-cp37m-win_amd64.whl (9.4MB)
Collecting plotly (from catboost)
  Downloading https://files.pythonhosted.org/packages/04/20/c2d77eef33dbd40c5e3263a9a8763ffca610a4c3d2b2da21c5601e5fc5d8/plotly-4.10.0-py2.py3-none-any.whl (13.0MB)
Collecting retrying>=1.3.3 (from plotly->catboost)
  Using cached https://files.pythonhosted.org/packages/44/ef/beae4b4ef80902f22e3af073397f079c96969c69b2c7d52a57ea9ae61c9d/retrying-1.3.3.tar.gz
Building whee

In [125]:
import os
from copy import deepcopy
from typing import List, Tuple

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import probplot

import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold

In [126]:
train = pd.read_csv("assignment2_data/assignment_train.csv")
test = pd.read_csv("assignment2_data/assignment_test.csv")
train.head(n=2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,


## Основное задание

Даны выборки для обучения и для тестирования. Задание заключается в том, чтобы попробовать разные способы валидации, проанализировать плюсы / минусы каждой и сделать выводы о том, какой способ валидации наиболее устойчивый в данной задаче.


##  Hold-Out валидация на 2 выборки

__Задание 1:__ сделать Hold-Out валидацию с разбиением, размер которого будет адеквтаным, по вашему мнению; разбиение проводить по id-транзакции (`TransactionID`), обучать модель градиетного бустинга любой реализации с подбором числа деревьев по early_stopping критерию до достижения сходимости. Оценить качество модели на тестовой выборке, оценить расхождение по сравнению с качеством на обучающей выборке и тестовой выборке.

In [None]:
train -> x_train / x_valid / x_test
# test - имитация public LB -> public_lb

In [127]:
def fit_catboost(x_train, y_train, model_params, categorical, *args):
    """
    Обучение модели CatBoostClassifier.

    Parameters
    ----------
    x_train: pandas.core.frame.DataFrame
        Матрица признаков для обучения модели.

    y_valid: pandas.core.frame.Series
        Вектор целевой переменной для валидации модели.

    model_params: dict
        Словарь со значением гиперпараметров модели.

    categorical: List[str]
        Список с названием категориальных признаков.

    Returns
    -------
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.

    """
    eval_set = [(x_train, y_train)]

    if args == 2:
        eval_set.append((args[0], args[1]))

    model = cb.CatBoostClassifier(**model_params)
    model.fit(
        X=x_train,
        y=y_train,
        eval_set=eval_set,
        cat_features=categorical
    )

    return model

def evaluate_model(model, *args):
    """
    Оценка качества модели.

    Parameters
    ----------
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.

    args: pandas.core.frame.DataFrame
        Пары из обучающей выборки и вектора истинных ответов.
        Опциональные параметры.

    """
    eval_data = [(args[i], args[i+1]) for i in range(0, len(args), 2)]
    if eval_data:
        for sample, target in eval_data:
            y_pred = model.predict_proba(sample)[:, 1]
            score = roc_auc_score(target, y_pred)
            print(f"score = {round(score, 6)}")

def prepare_data(X, categorical, to_drop):
    """
    Преобразование данных для передачи в модель.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков для передачи в модель.

    categorical: List[str]
        Список с названием категориальных признаков.

    drop_features: List[str]
        Список с названием признаков, которые не должны
        участвовать в обучении.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Матрица признаков, подготовленная для передачи в модель.

    """
    X_transformed = X.copy()
    to_drop = set(X.columns) & set(to_drop)

    if to_drop:
        X_transformed = X_transformed.drop(to_drop, axis=1)

    X_transformed[categorical] = X_transformed[categorical].astype(str)
    return X_transformed

In [128]:
to_drop = [
    "TransactionID",
    "TransactionDT",
    "isFraud",
]

categorical = train.select_dtypes(include=["object"]).columns.tolist()

In [129]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)



In [130]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 35000 rows, 391 cols
x_valid.shape = 15001 rows, 391 cols
x_test.shape = 75000 rows, 391 cols


In [131]:
cb_params_1000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 27
}

model = fit_catboost(
    x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

0:	test: 0.6253433	best: 0.6253433 (0)	total: 1.36s	remaining: 22m 37s
10:	test: 0.7847311	best: 0.7847311 (10)	total: 3.02s	remaining: 4m 31s
20:	test: 0.7994185	best: 0.8016245 (19)	total: 4.24s	remaining: 3m 17s
30:	test: 0.7988680	best: 0.8016245 (19)	total: 5.46s	remaining: 2m 50s
40:	test: 0.8046661	best: 0.8047039 (39)	total: 6.58s	remaining: 2m 33s
50:	test: 0.8127584	best: 0.8128247 (49)	total: 7.81s	remaining: 2m 25s
60:	test: 0.8159951	best: 0.8163395 (58)	total: 8.93s	remaining: 2m 17s
70:	test: 0.8209795	best: 0.8209795 (70)	total: 10s	remaining: 2m 11s
80:	test: 0.8249503	best: 0.8250362 (75)	total: 11.2s	remaining: 2m 7s
90:	test: 0.8261782	best: 0.8267259 (88)	total: 12.4s	remaining: 2m 3s
100:	test: 0.8324016	best: 0.8325887 (99)	total: 13.5s	remaining: 1m 59s
110:	test: 0.8349103	best: 0.8349103 (110)	total: 14.6s	remaining: 1m 57s
120:	test: 0.8399842	best: 0.8403415 (116)	total: 15.8s	remaining: 1m 54s
130:	test: 0.8434055	best: 0.8434055 (130)	total: 16.9s	remainin

In [132]:
evaluate_model(model, x_train, y_train, x_valid, y_valid, x_public_lb, test["isFraud"])

score = 0.896412
score = 0.868581
score = 0.865602


Задание 1: признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.


In [9]:
from datetime import datetime, timedelta

data = train

start_date = datetime(2017, 12, 1)

data["TransactionDateTime"] = data["TransactionDT"].apply(lambda x: timedelta(seconds=x) + start_date)
data["year"] = data["TransactionDateTime"].dt.year
data["month_of_year"] = data["TransactionDateTime"].dt.month
data["day_of_week"] = data["TransactionDateTime"].dt.weekday
data["day"] = data["TransactionDateTime"].dt.day
data["hour"] = data["TransactionDateTime"].dt.hour

data.head(n=2)

data = test

start_date = datetime(2017, 12, 1)

data["TransactionDateTime"] = data["TransactionDT"].apply(lambda x: timedelta(seconds=x) + start_date)
data["year"] = data["TransactionDateTime"].dt.year
data["month_of_year"] = data["TransactionDateTime"].dt.month
data["day_of_week"] = data["TransactionDateTime"].dt.weekday
data["day"] = data["TransactionDateTime"].dt.day
data["hour"] = data["TransactionDateTime"].dt.hour
data.head(n=2)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V336,V337,V338,V339,TransactionDateTime,year,month_of_year,day_of_week,day,hour
0,3062000,0,1643492,41.0,W,9500,321.0,150.0,visa,226.0,...,,,,,2017-12-20 00:31:32,2017,12,2,20,0
1,3062001,1,1643513,161.0,W,6668,390.0,150.0,mastercard,224.0,...,,,,,2017-12-20 00:31:53,2017,12,2,20,0


In [10]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)




In [11]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 35000 rows, 397 cols
x_valid.shape = 15001 rows, 397 cols
x_test.shape = 75000 rows, 397 cols


In [12]:
cb_params_1000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 27
}


model = fit_catboost(
    x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

0:	test: 0.6731757	best: 0.6731757 (0)	total: 133ms	remaining: 2m 12s
10:	test: 0.8001497	best: 0.8017052 (7)	total: 1.35s	remaining: 2m 1s
20:	test: 0.8022797	best: 0.8025527 (16)	total: 2.51s	remaining: 1m 56s
30:	test: 0.8025375	best: 0.8042660 (25)	total: 3.65s	remaining: 1m 54s
40:	test: 0.8094565	best: 0.8094565 (40)	total: 4.77s	remaining: 1m 51s
50:	test: 0.8112428	best: 0.8114043 (49)	total: 5.93s	remaining: 1m 50s
60:	test: 0.8114176	best: 0.8124757 (55)	total: 7.09s	remaining: 1m 49s
70:	test: 0.8163746	best: 0.8163746 (70)	total: 8.22s	remaining: 1m 47s
80:	test: 0.8202837	best: 0.8202837 (80)	total: 9.34s	remaining: 1m 46s
90:	test: 0.8278992	best: 0.8278992 (90)	total: 10.5s	remaining: 1m 45s
100:	test: 0.8339391	best: 0.8340826 (98)	total: 11.7s	remaining: 1m 43s
110:	test: 0.8414247	best: 0.8414247 (110)	total: 12.8s	remaining: 1m 42s
120:	test: 0.8461018	best: 0.8461018 (120)	total: 14s	remaining: 1m 41s
130:	test: 0.8520714	best: 0.8520714 (130)	total: 15.1s	remaining

In [13]:
evaluate_model(model, x_train, y_train, x_valid, y_valid, x_public_lb, test["isFraud"])

score = 0.896267
score = 0.867839
score = 0.865789


Задание 2: сгруппировать данные по card1 и посчитать среднюю сумму транзакции. Добавить в качестве признака в набор данных. Посчитать разницу между суммой транзакцией пользователя и средней суммой транзакции по данному типу card1. Построить отношение этих признаков. Повторить процедуру для всех card.


In [14]:
data = train
data["card6"].head(n=3)

0    credit
1    credit
2     debit
Name: card6, dtype: object

In [15]:
from typing import List, Optional

def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """
    Построение агрегаций для числовых признаков.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    aggs: dict
        Словарь с названием признака и списка функций.
        Ключ словаря - название признака, который используется для
        вычисления агрегаций, значение словаря - список с названием
        функций для вычисления агрегаций.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".upper() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

In [16]:
aggs = {
    "TransactionAmt": [np.mean]
}

stats = create_numerical_aggs(
    data, groupby_id="card1", aggs=aggs, suffix="_BY_CARD1"
)
stats.head(n=2)

Unnamed: 0,card1,TRANSACTIONAMT_MEAN_BY_CARD1
0,1001,183.0
1,1004,90.0


In [17]:
data[data['card1'] == 1004]

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V336,V337,V338,V339,TransactionDateTime,year,month_of_year,day_of_week,day,hour
33767,3020767,0,842821,150.0,R,1004,583.0,150.0,visa,226.0,...,0.0,0.0,0.0,0.0,2017-12-10 18:07:01,2017,12,6,10,18
41973,3028973,0,1022173,30.0,H,1004,583.0,150.0,visa,226.0,...,0.0,0.0,0.0,0.0,2017-12-12 19:56:13,2017,12,1,12,19


In [18]:
data = data.merge(
    stats, how="left", on="card1"
)
data.head(n=2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V337,V338,V339,TransactionDateTime,year,month_of_year,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,2017-12-02 00:00:00,2017,12,5,2,0,92.125
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,2017-12-02 00:00:01,2017,12,5,2,0,274.0708


In [19]:
data["TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1"] = data["TransactionAmt"] / data["TRANSACTIONAMT_MEAN_BY_CARD1"]
data["DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1"] = data["TransactionAmt"] - data["TRANSACTIONAMT_MEAN_BY_CARD1"]
data.head(n=3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V339,TransactionDateTime,year,month_of_year,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,2017-12-02 00:00:00,2017,12,5,2,0,92.125,0.743555,-23.625
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,2017-12-02 00:00:01,2017,12,5,2,0,274.0708,0.105812,-245.0708
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,2017-12-02 00:01:09,2017,12,5,2,0,94.948551,0.621389,-35.948551


In [20]:
aggs = {
    "TransactionAmt": [np.mean]
}

stats = create_numerical_aggs(
    data, groupby_id="card4", aggs=aggs, suffix="_BY_CARD4"
)
stats

Unnamed: 0,card4,TRANSACTIONAMT_MEAN_BY_CARD4
0,american express,167.022392
1,discover,211.559924
2,mastercard,124.734717
3,visa,126.812814


In [21]:
data = data.merge(
    stats, how="left", on="card4"
)
data.head(n=2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,TransactionDateTime,year,month_of_year,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD4
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,2017-12-02 00:00:00,2017,12,5,2,0,92.125,0.743555,-23.625,211.559924
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,2017-12-02 00:00:01,2017,12,5,2,0,274.0708,0.105812,-245.0708,124.734717


In [22]:
data["TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4"] = data["TransactionAmt"] / data["TRANSACTIONAMT_MEAN_BY_CARD4"]
data["DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4"] = data["TransactionAmt"] - data["TRANSACTIONAMT_MEAN_BY_CARD4"]
data.head(n=3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,month_of_year,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD4,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,12,5,2,0,92.125,0.743555,-23.625,211.559924,0.323785,-143.059924
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,12,5,2,0,274.0708,0.105812,-245.0708,124.734717,0.232493,-95.734717
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,12,5,2,0,94.948551,0.621389,-35.948551,126.812814,0.465253,-67.812814


In [23]:
stats = create_numerical_aggs(
    data, groupby_id="card6", aggs=aggs, suffix="_BY_CARD6"
)
stats

Unnamed: 0,card6,TRANSACTIONAMT_MEAN_BY_CARD6
0,charge card,32.956667
1,credit,165.657609
2,debit,110.833585
3,debit or credit,47.313333


In [24]:
data = data.merge(
    stats, how="left", on="card6"
)
data.head(n=2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD4,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,TRANSACTIONAMT_MEAN_BY_CARD6
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,5,2,0,92.125,0.743555,-23.625,211.559924,0.323785,-143.059924,165.657609
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,5,2,0,274.0708,0.105812,-245.0708,124.734717,0.232493,-95.734717,165.657609


In [25]:
data["TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD6"] = data["TransactionAmt"] / data["TRANSACTIONAMT_MEAN_BY_CARD6"]
data["DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD6"] = data["TransactionAmt"] - data["TRANSACTIONAMT_MEAN_BY_CARD6"]
data.head(n=3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD4,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,TRANSACTIONAMT_MEAN_BY_CARD6,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD6,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD6
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0,92.125,0.743555,-23.625,211.559924,0.323785,-143.059924,165.657609,0.413503,-97.157609
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0,274.0708,0.105812,-245.0708,124.734717,0.232493,-95.734717,165.657609,0.17506,-136.657609
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0,94.948551,0.621389,-35.948551,126.812814,0.465253,-67.812814,110.833585,0.53233,-51.833585


In [26]:
data = test

In [27]:
aggs = {
    "TransactionAmt": [np.mean]
}

stats = create_numerical_aggs(
    data, groupby_id="card1", aggs=aggs, suffix="_BY_CARD1"
)
stats.head(n=2)

Unnamed: 0,card1,TRANSACTIONAMT_MEAN_BY_CARD1
0,1006,150.0
1,1009,50.0


In [28]:
data[data['card1'] == 1004]

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V336,V337,V338,V339,TransactionDateTime,year,month_of_year,day_of_week,day,hour


In [29]:
data = data.merge(
    stats, how="left", on="card1"
)
data.head(n=2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V337,V338,V339,TransactionDateTime,year,month_of_year,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1
0,3062000,0,1643492,41.0,W,9500,321.0,150.0,visa,226.0,...,,,,2017-12-20 00:31:32,2017,12,2,20,0,112.174826
1,3062001,1,1643513,161.0,W,6668,390.0,150.0,mastercard,224.0,...,,,,2017-12-20 00:31:53,2017,12,2,20,0,155.495


In [30]:
data["TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1"] = data["TransactionAmt"] / data["TRANSACTIONAMT_MEAN_BY_CARD1"]
data["DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1"] = data["TransactionAmt"] - data["TRANSACTIONAMT_MEAN_BY_CARD1"]
data.head(n=3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V339,TransactionDateTime,year,month_of_year,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1
0,3062000,0,1643492,41.0,W,9500,321.0,150.0,visa,226.0,...,,2017-12-20 00:31:32,2017,12,2,20,0,112.174826,0.365501,-71.174826
1,3062001,1,1643513,161.0,W,6668,390.0,150.0,mastercard,224.0,...,,2017-12-20 00:31:53,2017,12,2,20,0,155.495,1.035403,5.505
2,3062002,0,1643519,49.0,W,14935,543.0,150.0,mastercard,224.0,...,,2017-12-20 00:31:59,2017,12,2,20,0,49.0,1.0,0.0


In [31]:
aggs = {
    "TransactionAmt": [np.mean]
}

stats = create_numerical_aggs(
    data, groupby_id="card4", aggs=aggs, suffix="_BY_CARD4"
)
stats

Unnamed: 0,card4,TRANSACTIONAMT_MEAN_BY_CARD4
0,american express,196.453411
1,discover,204.291208
2,mastercard,122.38674
3,visa,125.377968


In [32]:
data = data.merge(
    stats, how="left", on="card4"
)
data.head(n=2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,TransactionDateTime,year,month_of_year,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD4
0,3062000,0,1643492,41.0,W,9500,321.0,150.0,visa,226.0,...,2017-12-20 00:31:32,2017,12,2,20,0,112.174826,0.365501,-71.174826,125.377968
1,3062001,1,1643513,161.0,W,6668,390.0,150.0,mastercard,224.0,...,2017-12-20 00:31:53,2017,12,2,20,0,155.495,1.035403,5.505,122.38674


In [33]:
data["TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4"] = data["TransactionAmt"] / data["TRANSACTIONAMT_MEAN_BY_CARD4"]
data["DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4"] = data["TransactionAmt"] - data["TRANSACTIONAMT_MEAN_BY_CARD4"]
data.head(n=3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,month_of_year,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD4,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4
0,3062000,0,1643492,41.0,W,9500,321.0,150.0,visa,226.0,...,12,2,20,0,112.174826,0.365501,-71.174826,125.377968,0.327011,-84.377968
1,3062001,1,1643513,161.0,W,6668,390.0,150.0,mastercard,224.0,...,12,2,20,0,155.495,1.035403,5.505,122.38674,1.315502,38.61326
2,3062002,0,1643519,49.0,W,14935,543.0,150.0,mastercard,224.0,...,12,2,20,0,49.0,1.0,0.0,122.38674,0.40037,-73.38674


In [34]:
stats = create_numerical_aggs(
    data, groupby_id="card6", aggs=aggs, suffix="_BY_CARD6"
)
stats

Unnamed: 0,card6,TRANSACTIONAMT_MEAN_BY_CARD6
0,charge card,21.4545
1,credit,165.392191
2,debit,109.075197
3,debit or credit,62.07


In [35]:
data = data.merge(
    stats, how="left", on="card6"
)
data.head(n=2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,day_of_week,day,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD4,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,TRANSACTIONAMT_MEAN_BY_CARD6
0,3062000,0,1643492,41.0,W,9500,321.0,150.0,visa,226.0,...,2,20,0,112.174826,0.365501,-71.174826,125.377968,0.327011,-84.377968,109.075197
1,3062001,1,1643513,161.0,W,6668,390.0,150.0,mastercard,224.0,...,2,20,0,155.495,1.035403,5.505,122.38674,1.315502,38.61326,109.075197


In [36]:
data["TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD6"] = data["TransactionAmt"] / data["TRANSACTIONAMT_MEAN_BY_CARD6"]
data["DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD6"] = data["TransactionAmt"] - data["TRANSACTIONAMT_MEAN_BY_CARD6"]
data.head(n=3)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,hour,TRANSACTIONAMT_MEAN_BY_CARD1,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD1,TRANSACTIONAMT_MEAN_BY_CARD4,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD4,TRANSACTIONAMT_MEAN_BY_CARD6,TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD6,DELTA_TransactionAmt_TO_TRANSACTIONAMT_MEAN_BY_CARD6
0,3062000,0,1643492,41.0,W,9500,321.0,150.0,visa,226.0,...,0,112.174826,0.365501,-71.174826,125.377968,0.327011,-84.377968,109.075197,0.375887,-68.075197
1,3062001,1,1643513,161.0,W,6668,390.0,150.0,mastercard,224.0,...,0,155.495,1.035403,5.505,122.38674,1.315502,38.61326,109.075197,1.476046,51.924803
2,3062002,0,1643519,49.0,W,14935,543.0,150.0,mastercard,224.0,...,0,49.0,1.0,0.0,122.38674,0.40037,-73.38674,109.075197,0.449231,-60.075197


In [59]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)




In [60]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 35000 rows, 397 cols
x_valid.shape = 15001 rows, 397 cols
x_test.shape = 75000 rows, 397 cols


In [61]:
cb_params_1000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 27
}


model = fit_catboost(
    x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

0:	test: 0.6731757	best: 0.6731757 (0)	total: 267ms	remaining: 4m 26s
10:	test: 0.8001497	best: 0.8017052 (7)	total: 1.44s	remaining: 2m 9s
20:	test: 0.8022797	best: 0.8025527 (16)	total: 2.59s	remaining: 2m
30:	test: 0.8025375	best: 0.8042660 (25)	total: 3.73s	remaining: 1m 56s
40:	test: 0.8094565	best: 0.8094565 (40)	total: 4.84s	remaining: 1m 53s
50:	test: 0.8112428	best: 0.8114043 (49)	total: 5.97s	remaining: 1m 51s
60:	test: 0.8114176	best: 0.8124757 (55)	total: 7.08s	remaining: 1m 49s
70:	test: 0.8163746	best: 0.8163746 (70)	total: 8.2s	remaining: 1m 47s
80:	test: 0.8202837	best: 0.8202837 (80)	total: 9.29s	remaining: 1m 45s
90:	test: 0.8278992	best: 0.8278992 (90)	total: 10.4s	remaining: 1m 44s
100:	test: 0.8339391	best: 0.8340826 (98)	total: 11.5s	remaining: 1m 42s
110:	test: 0.8414247	best: 0.8414247 (110)	total: 12.6s	remaining: 1m 41s
120:	test: 0.8461018	best: 0.8461018 (120)	total: 13.7s	remaining: 1m 39s
130:	test: 0.8520714	best: 0.8520714 (130)	total: 14.9s	remaining: 1

In [34]:
evaluate_model(model, x_train, y_train, x_valid, y_valid, x_public_lb, test["isFraud"])

score = 0.896267
score = 0.867839
score = 0.865789


Задание 3: преобразовать признаки card_1 - card_6 с помощью Frequency Encoding;


In [37]:
data = train

In [38]:
freq_encoder = data["card1"].value_counts(normalize=True)
data["card1_freq_enc"] = data["card1"].map(freq_encoder)
data[["card1", "card1_freq_enc"]].head(10)

Unnamed: 0,card1,card1_freq_enc
0,13926,8e-05
1,2755,0.0015
2,4663,0.00138
3,18132,0.00726
4,4497,2e-05
5,5937,4e-05
6,12308,0.00034
7,12695,0.01022
8,2803,0.00884
9,17399,0.00262


In [39]:
freq_encoder = data["card4"].value_counts(normalize=True)
data["card4_freq_enc"] = data["card4"].map(freq_encoder)
data[["card4", "card4_freq_enc"]].head(10)

Unnamed: 0,card4,card4_freq_enc
0,discover,0.013081
1,mastercard,0.313951
2,visa,0.651305
3,mastercard,0.313951
4,mastercard,0.313951
5,visa,0.651305
6,visa,0.651305
7,visa,0.651305
8,visa,0.651305
9,mastercard,0.313951


In [40]:
freq_encoder = data["card6"].value_counts(normalize=True)
data["card6_freq_enc"] = data["card6"].map(freq_encoder)
data[["card6", "card6_freq_enc"]].head(10)

Unnamed: 0,card6,card6_freq_enc
0,credit,0.315733
1,credit,0.315733
2,debit,0.684147
3,debit,0.684147
4,credit,0.315733
5,debit,0.684147
6,debit,0.684147
7,debit,0.684147
8,debit,0.684147
9,debit,0.684147


In [41]:
data = test

In [42]:
freq_encoder = data["card1"].value_counts(normalize=True)
data["card1_freq_enc"] = data["card1"].map(freq_encoder)
data[["card1", "card1_freq_enc"]].head(10)

Unnamed: 0,card1,card1_freq_enc
0,9500,0.020067
1,6668,2.7e-05
2,14935,1.3e-05
3,8394,0.001067
4,9500,0.020067
5,6019,0.020627
6,8358,2.7e-05
7,5700,0.001093
8,18227,0.00232
9,8058,0.000227


In [43]:
freq_encoder = data["card4"].value_counts(normalize=True)
data["card4_freq_enc"] = data["card4"].map(freq_encoder)
data[["card4", "card4_freq_enc"]].head(10)

Unnamed: 0,card4,card4_freq_enc
0,visa,0.656471
1,mastercard,0.295275
2,mastercard,0.295275
3,visa,0.656471
4,visa,0.656471
5,visa,0.656471
6,visa,0.656471
7,mastercard,0.295275
8,visa,0.656471
9,visa,0.656471


In [44]:
freq_encoder = data["card6"].value_counts(normalize=True)
data["card6_freq_enc"] = data["card6"].map(freq_encoder)
data[["card6", "card6_freq_enc"]].head(10)

Unnamed: 0,card6,card6_freq_enc
0,debit,0.663235
1,debit,0.663235
2,debit,0.663235
3,debit,0.663235
4,debit,0.663235
5,credit,0.336685
6,debit,0.663235
7,debit,0.663235
8,credit,0.336685
9,debit,0.663235


In [74]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)




In [75]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 35000 rows, 400 cols
x_valid.shape = 15001 rows, 400 cols
x_test.shape = 75000 rows, 400 cols


In [76]:

model = fit_catboost(
    x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

0:	test: 0.5563693	best: 0.5563693 (0)	total: 157ms	remaining: 2m 36s
10:	test: 0.7683229	best: 0.7683229 (10)	total: 1.23s	remaining: 1m 50s
20:	test: 0.7961048	best: 0.7961048 (20)	total: 2.31s	remaining: 1m 47s
30:	test: 0.7950917	best: 0.7972417 (26)	total: 3.39s	remaining: 1m 45s
40:	test: 0.7992484	best: 0.8014545 (38)	total: 4.42s	remaining: 1m 43s
50:	test: 0.8096182	best: 0.8096182 (50)	total: 5.49s	remaining: 1m 42s
60:	test: 0.8169722	best: 0.8169722 (60)	total: 6.57s	remaining: 1m 41s
70:	test: 0.8242967	best: 0.8244824 (67)	total: 7.68s	remaining: 1m 40s
80:	test: 0.8273846	best: 0.8273846 (80)	total: 8.75s	remaining: 1m 39s
90:	test: 0.8347649	best: 0.8347649 (90)	total: 9.83s	remaining: 1m 38s
100:	test: 0.8416862	best: 0.8416862 (100)	total: 11s	remaining: 1m 37s
110:	test: 0.8445587	best: 0.8446030 (109)	total: 12.1s	remaining: 1m 36s
120:	test: 0.8494243	best: 0.8494243 (120)	total: 13.3s	remaining: 1m 36s
130:	test: 0.8528898	best: 0.8528898 (130)	total: 14.5s	remain

In [77]:
evaluate_model(model, x_train, y_train, x_valid, y_valid, x_public_lb, test["isFraud"])

score = 0.897957
score = 0.869628
score = 0.866417


Задание 4: преобразовать признак TransactionAmt в логариф признака, выделить дробную часть и целую часть в отдельные признаки.


In [45]:
data = train

In [46]:
import math 
data["TransactionAmt1"] = data["TransactionAmt"].apply(lambda x: math.trunc(np.log(x)))
data["TransactionAmt2"] = data["TransactionAmt"].apply(lambda x: np.log(x) - math.trunc(np.log(x)))

In [47]:
data["TransactionAmt2"].head(10)

0    0.226834
1    0.367296
2    0.077537
3    0.912023
4    0.912023
5    0.891820
6    0.068904
7    0.046189
8    0.708050
9    0.762174
Name: TransactionAmt2, dtype: float64

In [48]:
data = test

In [49]:
import math 
data["TransactionAmt1"] = data["TransactionAmt"].apply(lambda x: math.trunc(np.log(x)))
data["TransactionAmt2"] = data["TransactionAmt"].apply(lambda x: np.log(x) - math.trunc(np.log(x)))

In [50]:
data["TransactionAmt2"].head(10)

0    0.713572
1    0.081404
2    0.891820
3    0.605170
4    0.574711
5    0.605170
6    0.681668
7    0.449988
8    0.605170
9    0.720312
Name: TransactionAmt2, dtype: float64

In [84]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)




In [85]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 35000 rows, 402 cols
x_valid.shape = 15001 rows, 402 cols
x_test.shape = 75000 rows, 402 cols


In [86]:

model = fit_catboost(
    x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

0:	test: 0.5755027	best: 0.5755027 (0)	total: 126ms	remaining: 2m 6s
10:	test: 0.7696169	best: 0.7751424 (9)	total: 1.27s	remaining: 1m 54s
20:	test: 0.7951340	best: 0.7955779 (18)	total: 2.39s	remaining: 1m 51s
30:	test: 0.8166982	best: 0.8166982 (30)	total: 3.5s	remaining: 1m 49s
40:	test: 0.8176049	best: 0.8176049 (40)	total: 4.62s	remaining: 1m 48s
50:	test: 0.8189826	best: 0.8192191 (46)	total: 5.74s	remaining: 1m 46s
60:	test: 0.8206574	best: 0.8206574 (60)	total: 6.85s	remaining: 1m 45s
70:	test: 0.8196922	best: 0.8210960 (63)	total: 7.97s	remaining: 1m 44s
80:	test: 0.8269190	best: 0.8269190 (80)	total: 9.09s	remaining: 1m 43s
90:	test: 0.8325014	best: 0.8331570 (89)	total: 10.2s	remaining: 1m 42s
100:	test: 0.8380460	best: 0.8380460 (100)	total: 11.4s	remaining: 1m 41s
110:	test: 0.8437438	best: 0.8439837 (109)	total: 12.6s	remaining: 1m 40s
120:	test: 0.8494611	best: 0.8494611 (120)	total: 13.7s	remaining: 1m 39s
130:	test: 0.8535305	best: 0.8535305 (130)	total: 14.8s	remaini

In [87]:
evaluate_model(model, x_train, y_train, x_valid, y_valid, x_public_lb, test["isFraud"])

score = 0.899518
score = 0.870664
score = 0.866083


Задание 5: для числовых признаков построить PCA-признаки, добавить их к основной части датасета.


In [142]:
numerical_features = x_train.select_dtypes(exclude=["object"])
numerical_features = numerical_features.columns.tolist()
numerical_features

['TransactionAmt',
 'card1',
 'card2',
 'card3',
 'card5',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',


In [143]:
x_train = x_train[numerical_features]

In [149]:
# Для начала отмасштабируем выборку
X_ = x_train.astype(float)

rows, cols = X_.shape

# центрирование - вычитание из каждого значения среднего по строке
#means = X_.mean(0)
#for i in range(rows):
#    for j in range(cols):
#        X_[i, j] -= means[j]

# деление каждого значения на стандартное отклонение
#std = np.std(X_, axis=0)
#for i in range(cols):
#    for j in range(rows):
#        X_[j][i] /= std[i]

In [150]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 10)
XPCAreduced = pca.fit_transform(np.transpose(X_))
XPCAreduced

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Задание 6: использовать критерий отбора признаков на основе перестановок для отбора признаков, которые положительно влияют на перформанс модели. Переобучить модель и сделать выводы о полученном качестве алгоритма.


In [177]:
from tqdm import tqdm

def calculate_permutation_importance(estimator, 
                                     metric: callable,
                                     x_valid: pd.DataFrame,
                                     y_valid: pd.Series) -> pd.Series:
    """
    Расчет пермутированной важности признаков.
    """
    scores = {}
    y_pred = estimator.predict(x_valid)
    base_score = metric(y_valid, y_pred)

    for feature in tqdm(x_valid.columns):
        x_valid_copy = x_valid.copy()
        x_valid_copy[feature] = np.random.permutation(x_valid_copy[feature])

        y_pred = estimator.predict(x_valid_copy)
        score = metric(y_valid, y_pred)
        scores[feature] = base_score - score

    scores = pd.Series(scores)
    scores = scores.sort_values(ascending=False)

    return scores

In [172]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)


In [173]:
to_drop = [
    "TransactionID",
    "TransactionDT",
    "isFraud",
#    "TransactionDateTime"
]

categorical = train.select_dtypes(include=["object"]).columns.tolist()
categorical

['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9']

In [174]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 35000 rows, 391 cols
x_valid.shape = 15001 rows, 391 cols
x_test.shape = 75000 rows, 391 cols


In [166]:
#ProductCD, card4, card6, P_emaildomain, R_emaildomain, M1, M2, M3, M4, M5, M6, M7, M8, M9, TransactionDateTime

#numerical_features = x_train.select_dtypes(exclude=["object"])
#numerical_features = numerical_features.columns.tolist()
#numerical_features

['TransactionAmt',
 'card1',
 'card2',
 'card3',
 'card5',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',


In [167]:
#x_valid = x_valid[numerical_features]

In [175]:
X_ = x_valid.astype(str)

In [178]:
perm_importance = calculate_permutation_importance(
    estimator=model, metric=roc_auc_score, x_valid=X_, y_valid=y_valid
)

100%|████████████████████████████████████████| 391/391 [09:28<00:00,  1.45s/it]


In [179]:
perm_importance

C1               0.022738
V317             0.010076
R_emaildomain    0.010076
V45              0.010041
C12              0.008816
                   ...   
V253            -0.000034
V255            -0.000034
V257            -0.000034
V189            -0.000034
V266            -0.001259
Length: 391, dtype: float64