# CatBoost base model

In [1]:
import os

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, sum_models, to_classifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm.notebook import tqdm

In [2]:
# Load the nb_black IPython extension for this kernel session
# (presumably used to auto-format cells with Black — confirm against the nb_black docs)
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
# Directory holding the train_group_data_*.csv files; the path is relative to
# the notebook's working directory.
DATA_DIR = "../../data/amex-default-prediction/train"

<IPython.core.display.Javascript object>

# Train loop

In [4]:
# 5-fold stratified splitter; do_work reads this module-level object to build
# its train/validation folds for every data file.
skf = StratifiedKFold(n_splits=5)

<IPython.core.display.Javascript object>

In [5]:
# List the files available in DATA_DIR (shell escape; $DATA_DIR is expanded
# from the Python variable of the same name)
!ls $DATA_DIR

train_group_data_10.csv  train_group_data_15.csv  train_group_data_5.csv
train_group_data_11.csv  train_group_data_1.csv   train_group_data_6.csv
train_group_data_12.csv  train_group_data_2.csv   train_group_data_7.csv
train_group_data_13.csv  train_group_data_3.csv   train_group_data_8.csv
train_group_data_14.csv  train_group_data_4.csv   train_group_data_9.csv


<IPython.core.display.Javascript object>

In [6]:
def do_work(step, file_name):
    """Train a cross-validated CatBoost ensemble on one data file and blend it.

    Reads ``file_name`` from ``DATA_DIR``, indexes the frame by ``customer_ID``
    and separates the ``target`` column from the remaining feature columns.
    For every fold produced by the module-level ``skf`` splitter, a
    ``CatBoostClassifier`` with default parameters is fitted on the training
    part with the held-out part passed as ``eval_set``. The fold models are
    then combined with ``sum_models`` using equal weights.

    Args:
        step: Label shown in the progress-bar description.
        file_name: Name of a CSV file located inside ``DATA_DIR``.

    Returns:
        The equally weighted blend of the fold models, converted with
        ``to_classifier``.
    """
    df = pd.read_csv(os.path.join(DATA_DIR, file_name)).set_index("customer_ID")

    y = df[["target"]]
    X = df.drop(columns="target")

    ensemble = []

    # Take the progress-bar length from the splitter itself so it stays
    # correct if the number of folds configured on ``skf`` is changed.
    for train_index, val_index in tqdm(
        skf.split(X, y), total=skf.get_n_splits(), desc="Step: {}".format(step)
    ):
        X_sub_train, X_sub_valid = X.iloc[train_index], X.iloc[val_index]
        y_sub_train, y_sub_valid = y.iloc[train_index], y.iloc[val_index]

        train_pool = Pool(X_sub_train, y_sub_train)
        valid_pool = Pool(X_sub_valid, y_sub_valid)

        model = CatBoostClassifier()
        model.fit(train_pool, eval_set=valid_pool, verbose=False)

        ensemble.append(model)

    # Equal weights that sum to 1, one per fold model.
    models_avrg = sum_models(ensemble, weights=[1.0 / len(ensemble)] * len(ensemble))
    return to_classifier(models_avrg)

<IPython.core.display.Javascript object>

In [7]:
models = []

# os.listdir returns every directory entry in arbitrary order, so keep only
# the CSV files and sort them: step numbers become reproducible and stray
# entries (hidden files, checkpoint folders) are never handed to do_work.
data_files = sorted(
    entry for entry in os.listdir(DATA_DIR) if entry.endswith(".csv")
)

for step, file_name in enumerate(data_files, 1):
    sub_model = do_work(step, file_name)
    models.append(sub_model)

Step: 1:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 2:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 3:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 4:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 5:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 6:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 7:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 8:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 9:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 10:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 11:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 12:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 13:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 14:   0%|          | 0/5 [00:00<?, ?it/s]

Step: 15:   0%|          | 0/5 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

In [8]:
# Show the per-file models collected by the training loop (one entry per file)
models

[<catboost.core.CatBoostClassifier at 0x7fe5f6c4a700>,
 <catboost.core.CatBoostClassifier at 0x7fe5f726d8e0>,
 <catboost.core.CatBoostClassifier at 0x7fe5f727b040>,
 <catboost.core.CatBoostClassifier at 0x7fe5f7208fd0>,
 <catboost.core.CatBoostClassifier at 0x7fe5f6c2bc70>,
 <catboost.core.CatBoostClassifier at 0x7fe5f726dbe0>,
 <catboost.core.CatBoostClassifier at 0x7fe5f6c3d040>,
 <catboost.core.CatBoostClassifier at 0x7fe5f6c3d6d0>,
 <catboost.core.CatBoostClassifier at 0x7fe5f72c10d0>,
 <catboost.core.CatBoostClassifier at 0x7fe5f6c4ae80>,
 <catboost.core.CatBoostClassifier at 0x7fe5f6c4aac0>,
 <catboost.core.CatBoostClassifier at 0x7fe5f6bea580>,
 <catboost.core.CatBoostClassifier at 0x7fe5f6beaa30>,
 <catboost.core.CatBoostClassifier at 0x7fe5f6bea2b0>,
 <catboost.core.CatBoostClassifier at 0x7fe5f6fe0340>]

<IPython.core.display.Javascript object>

In [9]:
# Blend the per-file models: each one gets the same weight, 1 / len(models),
# and the summed model is converted back into a classifier.
equal_weights = [1.0 / len(models) for _ in models]
models_avrg = to_classifier(sum_models(models, weights=equal_weights))
models_avrg

<catboost.core.CatBoostClassifier at 0x7fe5f6c30760>

<IPython.core.display.Javascript object>

In [11]:
model_save_path = os.path.join(
    os.path.dirname(DATA_DIR), "models", "catboost-model_v1.cbm"
)
# The "models" folder sits next to DATA_DIR and nothing earlier in the
# notebook creates it, so make sure it exists before writing the file.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
models_avrg.save_model(model_save_path)

<IPython.core.display.Javascript object>