# Решение задачи контеста "Уроки настоящего от Сбербанка"

## Импорт необходимых библиотек

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## Импорт необходимых файлов

In [2]:
train_target = pd.read_csv("train_target.csv")
train_target.head()

Unnamed: 0,client_id,bins
0,24662,2
1,1046,0
2,34089,2
3,34848,1
4,47076,3


In [3]:
transactions_train = pd.read_csv("transactions_train.csv")
transactions_train.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


## Data Preprocessing & Feature Engineering

In [4]:
agg_features=transactions_train.groupby('client_id')['amount_rur'].agg(['sum','mean','std','min','max']).reset_index()

In [5]:
counter_df_train=transactions_train.groupby(['client_id','small_group'])['amount_rur'].count()

In [6]:
cat_counts_train=counter_df_train.reset_index().pivot(index='client_id', \
                                                      columns='small_group',values='amount_rur')
cat_counts_train=cat_counts_train.fillna(0)
cat_counts_train.columns=['small_group_'+str(i) for i in cat_counts_train.columns]

In [7]:
cat_counts_train.head()

Unnamed: 0_level_0,small_group_0,small_group_1,small_group_2,small_group_3,small_group_4,small_group_5,small_group_6,small_group_7,small_group_8,small_group_9,...,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_199,small_group_200,small_group_202,small_group_203
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,447.0,1.0,44.0,93.0,0.0,0.0,0.0,1.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.0,397.0,0.0,172.0,10.0,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2.0,79.0,5.0,27.0,19.0,1.0,0.0,2.0,1.0,39.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,12.0,309.0,1.0,71.0,65.0,0.0,0.0,0.0,3.0,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,2.0,423.0,0.0,59.0,23.0,3.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
train=pd.merge(train_target,agg_features,on='client_id')
train=pd.merge(train,cat_counts_train.reset_index(),on='client_id')

In [9]:
train.head()

Unnamed: 0,client_id,bins,sum,mean,std,min,max,small_group_0,small_group_1,small_group_2,...,small_group_192,small_group_193,small_group_195,small_group_196,small_group_197,small_group_198,small_group_199,small_group_200,small_group_202,small_group_203
0,24662,2,30254.011,34.774725,72.037354,0.074,1227.314,0.0,174.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1046,0,42548.57,52.015367,106.540962,0.55,1210.506,1.0,187.0,61.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34089,2,26842.816,34.325852,59.92745,0.043,782.641,0.0,372.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,34848,1,15773.126,16.16099,14.224936,0.043,109.59,0.0,359.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,47076,3,12488.375,15.92905,35.473591,0.432,541.165,0.0,378.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
train.shape

(30000, 209)

In [11]:
train.drop(columns=["client_id"], inplace=True)

## Разделение на X/y и train/test

In [12]:
train_df = train.drop(columns=["bins"])
target_df = train["bins"]

In [13]:
X_train, X_test, y_train, y_true = train_test_split(train_df, target_df, test_size=0.2, random_state=42)

## Нахождение лучших параметров модели

In [15]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=100,
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 7),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        verbose=False,
        loss_function='MultiClass',
        random_seed=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_true, y_pred)

sampler = TPESampler(seed=42)
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=50)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-04-26 18:34:53,821] A new study created in memory with name: catboost
[I 2024-04-26 18:34:55,232] Trial 0 finished with value: 0.5531666666666667 and parameters: {'learning_rate': 0.005611516415334507, 'depth': 7, 'l2_leaf_reg': 0.20890047049266686, 'random_strength': 0.0024430162614261413, 'bagging_temperature': 1.5601864044243652, 'od_type': 'IncToDec', 'od_wait': 45}. Best is trial 0 with value: 0.5531666666666667.
[I 2024-04-26 18:34:55,999] Trial 1 finished with value: 0.5548333333333333 and parameters: {'learning_rate': 0.015930522616241012, 'depth': 6, 'l2_leaf_reg': 1.6063676259174453e-08, 'random_strength': 5.360294728728285, 'bagging_temperature': 8.324426408004218, 'od_type': 'IncToDec', 'od_wait': 17}. Best is trial 1 with value: 0.5548333333333333.
[I 2024-04-26 18:34:56,780] Trial 2 finished with value: 0.5453333333333333 and parameters: {'learning_rate': 0.0040596116104843075, 'depth': 6, 'l2_leaf_reg': 0.00020866527711063722, 'random_strength': 4.17890272377219e

[I 2024-04-26 18:35:10,691] Trial 24 finished with value: 0.5871666666666666 and parameters: {'learning_rate': 0.05466372241450119, 'depth': 4, 'l2_leaf_reg': 1.5230928748938897, 'random_strength': 0.002362664605880475, 'bagging_temperature': 0.12616255489944275, 'od_type': 'Iter', 'od_wait': 41}. Best is trial 18 with value: 0.5998333333333333.
[I 2024-04-26 18:35:11,139] Trial 25 finished with value: 0.5906666666666667 and parameters: {'learning_rate': 0.09677044908347383, 'depth': 4, 'l2_leaf_reg': 6.113220570810954, 'random_strength': 1.3922750306184197e-05, 'bagging_temperature': 2.3407036528775897, 'od_type': 'Iter', 'od_wait': 50}. Best is trial 18 with value: 0.5998333333333333.
[I 2024-04-26 18:35:11,612] Trial 26 finished with value: 0.593 and parameters: {'learning_rate': 0.06903495367371465, 'depth': 4, 'l2_leaf_reg': 0.015266321235581247, 'random_strength': 3.081045923896347e-06, 'bagging_temperature': 0.7064311434205797, 'od_type': 'IncToDec', 'od_wait': 34}. Best is tria

[I 2024-04-26 18:35:26,272] Trial 49 finished with value: 0.5578333333333333 and parameters: {'learning_rate': 0.011598643800248986, 'depth': 6, 'l2_leaf_reg': 0.07403496071799653, 'random_strength': 5.4077048495857723e-08, 'bagging_temperature': 0.082518378465511, 'od_type': 'IncToDec', 'od_wait': 19}. Best is trial 48 with value: 0.6026666666666667.


Number of finished trials:  50
Best trial:
  Value:  0.6026666666666667
  Params: 
    learning_rate: 0.0827196096870518
    depth: 6
    l2_leaf_reg: 0.11254623978683652
    random_strength: 5.4316853049253635e-08
    bagging_temperature: 0.0313696359368941
    od_type: IncToDec
    od_wait: 16


## Обучение и тест модели

In [24]:
model = CatBoostClassifier(verbose=False,
                           iterations=1000,
                           depth=7,
                           loss_function='MultiClass',
                           od_wait=40,
                           od_type="Iter",
                           learning_rate=0.09,
                           l2_leaf_reg=2.7361329693854755e-07,
                           bootstrap_type="Bayesian",
                           custom_metric=['Accuracy'], 
                           random_seed=42)
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x255006d1690>

In [25]:
y_pred = model.predict(X_test)

In [26]:
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.6276666666666667
Precision: 0.6229172645645557
Recall: 0.62785297792898
F1 Score: 0.6249898983176844


## Обучение модели на полных данных и сохранение в файл

In [27]:
model = CatBoostClassifier(verbose=False,
                           iterations=1000,
                           depth=7,
                           loss_function='MultiClass',
                           od_wait=40,
                           od_type="Iter",
                           learning_rate=0.09,
                           l2_leaf_reg=2.7361329693854755e-07,
                           bootstrap_type="Bayesian",
                           custom_metric=['Accuracy'], 
                           random_seed=42)
model.fit(train_df, target_df)
model.save_model("model", format="cbm", export_parameters=None, pool=None)