# Решение задачи контеста "Уроки настоящего от Сбербанка"

## Импорт необходимых библиотек

In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch
import pytorch_lightning as pl
from ptls.preprocessing import PandasDataPreprocessor

## Импорт необходимых файлов

In [2]:
train_target = pd.read_csv("train_target.csv")
train_target.head()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [3]:
transactions_train = pd.read_csv("transactions_train.csv")
transactions_train.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [4]:
len(transactions_train["small_group"].unique())

202

## Data Preprocessing & Feature Engineering

In [5]:
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="trans_date",
    event_time_transformation="none",
    cols_category=["small_group"],
    cols_numerical=["amount_rur"],
    return_records=True,
)

In [6]:
dataset = preprocessor.fit_transform(transactions_train)

In [7]:
dataset = sorted(dataset, key=lambda x: x['client_id'])

In [8]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount_rur": "identity"},
    embeddings={
        "trans_date": {"in" : 800, "out":16},
        "small_group": {"in" : 250, "out":16}
    },
)

seq_econderr = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type="gru",
)

model = CoLESModule(
    seq_encoder=seq_econderr,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [9]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=dataset,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=16,
    train_batch_size=256,
)

In [10]:
trainer = pl.Trainer(
    gpus=1,
    max_epochs=15,
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model, train_dl)

In [None]:
train_dl = inference_data_loader(dataset, num_workers=1, batch_size=256)
train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in dataset]
train_df = train_df.merge(df_target, how='left', on='client_id')

train_df.to_csv("ReadyData.csv")

In [2]:
train = pd.read_csv("ReadyData.csv")

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,...,embed_248,embed_249,embed_250,embed_251,embed_252,embed_253,embed_254,embed_255,client_id,target
0,0,0.550283,-0.743802,0.258525,0.196788,-0.704841,-0.575365,-0.448614,-0.458896,0.185826,...,0.93609,0.420376,-0.442848,0.029085,0.142505,-0.023947,0.529169,0.098901,4,1
1,1,0.467701,-0.81288,0.296666,0.24608,-0.160042,-0.671525,-0.419226,-0.421998,0.274254,...,0.954387,0.352779,-0.333821,0.123616,-0.015209,-0.156397,-0.88195,0.101844,6,1
2,2,0.229802,-0.601992,-0.315547,0.246109,-0.091438,-0.820852,-0.405385,-0.215355,0.081721,...,0.980902,0.162857,-0.409549,-0.378416,0.318191,0.380631,-0.486881,0.441895,7,0
3,3,0.381037,-0.663101,0.17654,0.224918,0.579784,-0.696203,-0.403781,-0.243248,0.213346,...,0.9497,0.193698,-0.378885,-0.167125,0.040214,0.085506,0.293191,0.238585,10,3
4,4,0.461295,-0.781123,0.087376,0.350959,-0.253747,-0.75578,-0.400728,-0.543225,0.278211,...,0.96311,0.017376,-0.419453,-0.190946,0.066893,-0.185198,-0.742718,0.149033,11,3


## Разделение на X/y и train/test

In [5]:
embed_columns = [x for x in train.columns if x.startswith('embed')]
X, y = train[embed_columns], train['target']

In [6]:
X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=0.2, random_state=42)

## Нахождение лучших параметров модели

In [11]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=100,
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 7),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        verbose=False,
        loss_function='MultiClass',
        random_seed=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_true, y_pred)

sampler = TPESampler(seed=42)
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=50)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-05-03 18:56:53,855] A new study created in memory with name: catboost
[I 2024-05-03 18:56:58,619] Trial 0 finished with value: 0.5721666666666667 and parameters: {'learning_rate': 0.005611516415334507, 'depth': 7, 'l2_leaf_reg': 0.20890047049266686, 'random_strength': 0.0024430162614261413, 'bagging_temperature': 1.5601864044243652, 'od_type': 'IncToDec', 'od_wait': 45}. Best is trial 0 with value: 0.5721666666666667.
[I 2024-05-03 18:57:01,028] Trial 1 finished with value: 0.5815 and parameters: {'learning_rate': 0.015930522616241012, 'depth': 6, 'l2_leaf_reg': 1.6063676259174453e-08, 'random_strength': 5.360294728728285, 'bagging_temperature': 8.324426408004218, 'od_type': 'IncToDec', 'od_wait': 17}. Best is trial 1 with value: 0.5815.
[I 2024-05-03 18:57:03,453] Trial 2 finished with value: 0.5693333333333334 and parameters: {'learning_rate': 0.0040596116104843075, 'depth': 6, 'l2_leaf_reg': 0.00020866527711063722, 'random_strength': 4.17890272377219e-06, 'bagging_temperatur

[I 2024-05-03 18:57:45,562] Trial 24 finished with value: 0.5838333333333333 and parameters: {'learning_rate': 0.09720432081137208, 'depth': 4, 'l2_leaf_reg': 0.0024437364251969533, 'random_strength': 1.7269547587376856e-06, 'bagging_temperature': 8.017772177273924, 'od_type': 'IncToDec', 'od_wait': 39}. Best is trial 12 with value: 0.5918333333333333.
[I 2024-05-03 18:57:46,998] Trial 25 finished with value: 0.573 and parameters: {'learning_rate': 0.030342292102888278, 'depth': 5, 'l2_leaf_reg': 4.127661140979095e-06, 'random_strength': 5.328962903395931e-08, 'bagging_temperature': 8.982706946943985, 'od_type': 'IncToDec', 'od_wait': 30}. Best is trial 12 with value: 0.5918333333333333.
[I 2024-05-03 18:57:49,494] Trial 26 finished with value: 0.5873333333333334 and parameters: {'learning_rate': 0.04491046470104459, 'depth': 6, 'l2_leaf_reg': 4.502474458269614e-05, 'random_strength': 6.70961262232542e-05, 'bagging_temperature': 9.193473144453646, 'od_type': 'Iter', 'od_wait': 34}. Bes

[I 2024-05-03 18:59:05,787] Trial 48 finished with value: 0.6015 and parameters: {'learning_rate': 0.07967998187204604, 'depth': 7, 'l2_leaf_reg': 12.686065250379805, 'random_strength': 2.4926243061488633e-07, 'bagging_temperature': 0.1512934447547737, 'od_type': 'Iter', 'od_wait': 26}. Best is trial 45 with value: 0.604.
[I 2024-05-03 18:59:10,636] Trial 49 finished with value: 0.5915 and parameters: {'learning_rate': 0.04174945494964402, 'depth': 7, 'l2_leaf_reg': 12.303736013340204, 'random_strength': 6.104310175619004e-07, 'bagging_temperature': 0.0812160533380291, 'od_type': 'Iter', 'od_wait': 19}. Best is trial 45 with value: 0.604.


Number of finished trials:  50
Best trial:
  Value:  0.604
  Params: 
    learning_rate: 0.074043090340053
    depth: 7
    l2_leaf_reg: 0.40118576669611555
    random_strength: 2.8533244473872057e-07
    bagging_temperature: 0.04069232418913327
    od_type: Iter
    od_wait: 28


## Обучение и тест модели

In [36]:
model = CatBoostClassifier(verbose=False,
                           iterations=1000,
                           depth=7,
                           loss_function='MultiClass',
                           od_wait=28,
                           od_type="Iter",
                           learning_rate=0.074043090340053,
                           l2_leaf_reg=0.40118576669611555,
                           bagging_temperature=0.04069232418913327,
                           random_strength=2.8533244473872057e-07,
                           custom_metric=['Accuracy'], 
                           random_seed=42)
                            
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x1f91da8ed10>

In [37]:
y_pred = model.predict(X_test)

In [38]:
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.6091666666666666
Precision: 0.601000004218956
Recall: 0.6047757840587127
F1 Score: 0.6025219503147262


## Обучение модели на полных данных и сохранение в файл

In [39]:
model = CatBoostClassifier(verbose=False,
                           iterations=1000,
                           depth=7,
                           loss_function='MultiClass',
                           od_wait=28,
                           od_type="Iter",
                           learning_rate=0.074043090340053,
                           l2_leaf_reg=0.40118576669611555,
                           bagging_temperature=0.04069232418913327,
                           random_strength=2.8533244473872057e-07,
                           custom_metric=['Accuracy'], 
                           random_seed=42)
model.fit(X, y)
model.save_model("model_with_lifestream", format="cbm", export_parameters=None, pool=None)