In [1]:
import time
import numpy as np
import pandas as pd
import catboost as cb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", 100)

## Useful Functions

In [56]:
def get_input(data_path: str) -> pd.DataFrame:
    """
    Считывание данных и вывод основной информации о наборе данных.

    Parameters
    ----------
    data_path: str
        Название файла.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Загруженный набор данных в pandas.DataFrame

    """
    base_path = "geekbrains-competitive-data-analysis"
    data = pd.read_csv(f"{base_path}/{data_path}")
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")

    return data


def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


def catboost_hold_out_validation(params, X, y, split_params = [0.7, 0.2, 0.1], categorical = None):
    """
    Hold-Out валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    split_params: List[float], optional, default = [0.7, 0.2, 0.1]
        Параметры (доли) разбиения выборки.
        Опциональный параметр, по умолчанию, равен [0.7, 0.2, 0.1].
    
    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimator: catboost.core.CatBoostClassifier
        Обученный классификатор catboost.

    test_prediction: np.array, optional
        Вектор прогнозов для тестовой выборки.
        Опциональный объект, возвращается только, если split_params
        содержит 3 значения.

    """
    
    x_train, x_valid = train_test_split(
        X, train_size=split_params[0], random_state=42
    )
    y_train, y_valid = train_test_split(
        y, train_size=split_params[0], random_state=42
    )

    if len(split_params) == 3:
        test_size = int(split_params[2] * X.shape[0])

        x_valid, x_test = train_test_split(
            x_valid, test_size=test_size, random_state=42
        )
        y_valid, y_test = train_test_split(
            y_valid, test_size=test_size, random_state=42
        )

    model = cb.CatBoostClassifier(**params)
    model.fit(
        x_train, y_train, categorical,
        eval_set=[(x_train, y_train), (x_valid, y_valid)]
    )

    print("="*80)
    valid_score = roc_auc_score(y_valid, model.predict_proba(x_valid)[:, 1])
    print(f"Valid Score = {round(valid_score, 4)}")

    if len(split_params) == 3:

        test_prediction = model.predict_proba(x_test)[:, 1]
        test_score = roc_auc_score(y_test, test_prediction)
        print(f"Test Score = {round(test_score, 4)}")

        return estimator, test_prediction

    else:
        return estimator

In [3]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Создание признаков на основе профиля клиентов.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков с исходным профилем клиента.

    copy: bool, optional, default = True
        Флаг использования копии датафрейма X.
        Опциональный параметр, по умолчанию, равен True.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Расширенная матрица признаков с профилем клиентов.

    """
    if copy:
        X = X.copy()

    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    X["external_scoring_prod"] = X["external_scoring_rating_1"] * X["external_scoring_rating_2"] * X["external_scoring_rating_3"]
    X["external_scoring_weighted"] = X.external_scoring_rating_1 * 2 + X.external_scoring_rating_2 * 1 + X.external_scoring_rating_3 * 3

    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = eval("np.{}".format(function_name))(
            X[["external_scoring_rating_1", "external_scoring_rating_2", "external_scoring_rating_3"]], axis=1
        )

    # Отношение между основными фин. показателями
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']
    X["ratio_annuity_to_salary"] = X['amount_annuity'] / X['total_salary']
    X["payment_rate"] = X['amount_annuity'] / X['amount_credit']
    #X["total_salary_net"] = X["total_salary"] - X["amount_annuity"]

    # Отношение фин. показателей к возрасту и временным фичам
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Отношение врменных признаков
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Произведение фин. показателей кредита на вероятность дефолта
    # Такая штука называется математическим ожиданием дефолта или ожидаемыми потерями
    X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    X["external_scoring_rating_1_plus_2"] = np.nansum(
        X[["external_scoring_rating_1", "external_scoring_rating_2"]], axis=1
    )
    X["external_scoring_rating_1_plus_3"] = np.nansum(
        X[["external_scoring_rating_1", "external_scoring_rating_3"]], axis=1
    )
    X["external_scoring_rating_2_plus_3"] = np.nansum(
        X[["external_scoring_rating_2", "external_scoring_rating_3"]], axis=1
    )
    X["external_scoring_rating_1_is_nan"] = np.isnan(
        X["external_scoring_rating_1"]
    )
    X["external_scoring_rating_2_is_nan"] = np.isnan(
        X["external_scoring_rating_2"]
    )
    X["external_scoring_rating_3_is_nan"] = np.isnan(
        X["external_scoring_rating_3"]
    )

    # дополнительные признаки
    X["ratio_credit_per_family_size"] = X["amount_credit"] / X["family_size"]
    X["ratio_credit_per_childrens"] = X["amount_credit"] / (1 + X["childrens"])
    X["ratio_children_to_family_size"] = X["childrens"] / X["family_size"]
    X["ratio_salary_per_family_size"] = X["total_salary"] / X["family_size"]
    X["ratio_salary_per_child"] = X["total_salary"] / X["childrens"]
    X["non_child"] = X["family_size"] - X["childrens"]
    X["ratio_child_to_non_child"] = X["childrens"] / X["non_child"]
    X["ratio_salary_per_non_child"] = X["total_salary"] / X["non_child"]
    X["ratio_credit_per_non_child"] = X["amount_credit"] / X["non_child"]

    return X

In [4]:
def create_payments_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Создание признаков на основе платежей.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков с исходными платежами.

    copy: bool, optional, default = True
        Флаг использования копии датафрейма X.
        Опциональный параметр, по умолчанию, равен True.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Расширенная матрица признаков с группированными платежами.

    """
    if copy:
        X = X.copy()

    return X

## Base Tables

In [5]:
train = get_input("train.csv")
test = get_input("test.csv")

data = pd.concat([train, test], axis=0)
data = data.reset_index(drop=True)
data.head(n=2)

train.csv: shape = 110093 rows, 3 cols
test.csv: shape = 165141 rows, 2 cols


Unnamed: 0,application_number,target,name_contract_type
0,123687442,0.0,Cash
1,123597908,1.0,Cash


## client_profile

In [6]:
client_profile = get_input("client_profile.csv")
client_profile = create_client_profile_features(client_profile)
client_profile.head(n=2)

client_profile.csv: shape = 250000 rows, 24 cols


  overwrite_input=overwrite_input)


Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,payment_rate,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,external_scoring_rating_1_plus_2,external_scoring_rating_1_plus_3,external_scoring_rating_2_plus_3,external_scoring_rating_1_is_nan,external_scoring_rating_2_is_nan,external_scoring_rating_3_is_nan,ratio_credit_per_family_size,ratio_credit_per_childrens,ratio_children_to_family_size,ratio_salary_per_family_size,ratio_salary_per_child,non_child,ratio_child_to_non_child,ratio_salary_per_non_child,ratio_credit_per_non_child
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549.0,,1,0,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0,3.0,1.428571,0.052832,2.930959,0.236315,0.678568,0.414784,0.329471,0.036237,20.0,0.085714,0.05,1.714286,1.577103,31.542056,18.399533,101.678502,174.306004,8.7153,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797,0.565786,1.008039,0.914882,False,False,False,135000.0,270000.0,0.0,78750.0,inf,2.0,0.0,78750.0,135000.0
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,,,0,0,2.0,,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0,2.0,-1.875,,,0.442295,0.802745,0.62252,0.62252,0.032481,18.86105,0.105433,0.053019,1.988583,1.227714,23.155971,11.644456,,,,,5466.42,,,,237475.743779,431008.094056,,12590.802122,22851.755462,0.442295,0.802745,1.24504,True,False,False,268458.75,536917.5,0.0,135000.0,inf,2.0,0.0,135000.0,268458.75


In [7]:
data = data.merge(
    client_profile, how="left", on="application_number"
)

In [8]:
data.head(n=2)

Unnamed: 0,application_number,target,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,payment_rate,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,external_scoring_rating_1_plus_2,external_scoring_rating_1_plus_3,external_scoring_rating_2_plus_3,external_scoring_rating_1_is_nan,external_scoring_rating_2_is_nan,external_scoring_rating_3_is_nan,ratio_credit_per_family_size,ratio_credit_per_childrens,ratio_children_to_family_size,ratio_salary_per_family_size,ratio_salary_per_child,non_child,ratio_child_to_non_child,ratio_salary_per_non_child,ratio_credit_per_non_child
0,123687442,0.0,Cash,M,1.0,157500.0,855000.0,25128.0,Secondary / secondary special,Married,0.019101,15728.0,1719.0,11.0,0.0,0.0,3.0,0.700784,0.645914,0.71657,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,0.324353,4.197193,0.645914,0.71657,0.687756,0.700784,0.000917,34.025788,0.159543,0.029389,5.428571,1.59766,54.361648,10.013988,91.623037,497.382199,14.617801,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111,1.346698,1.417355,1.362484,False,False,False,285000.0,427500.0,0.333333,52500.0,157500.0,2.0,0.5,78750.0,427500.0
1,123597908,1.0,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
data.shape

(275234, 70)

## payments

In [None]:
payments = get_input("payments.csv")
payments = create_payments_features(payments)
payments.head(n=2)

In [None]:
data = data.merge(
    payments, how="left", on="application_number"
)

In [None]:
data.head(n=2)

In [None]:
data.shape

## applications_history

In [None]:
applications_history = get_input("applications_history.csv")
# applications_history = create_applications_history_features(applications_history)
applications_history.head(n=2)

In [None]:
data = data.merge(
    applications_history, how="left", on="application_number"
)

In [None]:
data.head(n=2)

In [None]:
data.shape

## bki

In [None]:
bki = get_input("bki.csv")
# bki = create_bki_features(bki)
bki.head(n=2)

In [None]:
data = data.merge(
    bki, how="left", on="application_number"
)

In [None]:
data.head(n=2)

In [None]:
data.shape

## baseline + payments

In [22]:
mask = data["target"].isnull()
features_to_drop = ["application_number", "target"]

train, test = data.loc[~mask], data.loc[mask]

target, test_id = train["target"], test["application_number"]
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)

categorial = train.dtypes[train.dtypes == "object"].index
numerical = list(set(train.columns) - set(categorial))

train = train.replace(np.inf, np.nan)
train = train.replace(-np.inf, np.nan)

In [23]:
train.head(n=2)

Unnamed: 0,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,payment_rate,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,external_scoring_rating_1_plus_2,external_scoring_rating_1_plus_3,external_scoring_rating_2_plus_3,external_scoring_rating_1_is_nan,external_scoring_rating_2_is_nan,external_scoring_rating_3_is_nan,ratio_credit_per_family_size,ratio_credit_per_childrens,ratio_children_to_family_size,ratio_salary_per_family_size,ratio_salary_per_child,non_child,ratio_child_to_non_child,ratio_salary_per_non_child,ratio_credit_per_non_child
0,Cash,M,1.0,157500.0,855000.0,25128.0,Secondary / secondary special,Married,0.019101,15728.0,1719.0,11.0,0.0,0.0,3.0,0.700784,0.645914,0.71657,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,0.324353,4.197193,0.645914,0.71657,0.687756,0.700784,0.000917,34.025788,0.159543,0.029389,5.428571,1.59766,54.361648,10.013988,91.623037,497.382199,14.617801,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111,1.346698,1.417355,1.362484,False,False,False,285000.0,427500.0,0.333333,52500.0,157500.0,2.0,0.5,78750.0,427500.0
1,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [24]:
test.head(n=2)

Unnamed: 0,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,payment_rate,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,external_scoring_rating_1_plus_2,external_scoring_rating_1_plus_3,external_scoring_rating_2_plus_3,external_scoring_rating_1_is_nan,external_scoring_rating_2_is_nan,external_scoring_rating_3_is_nan,ratio_credit_per_family_size,ratio_credit_per_childrens,ratio_children_to_family_size,ratio_salary_per_family_size,ratio_salary_per_child,non_child,ratio_child_to_non_child,ratio_salary_per_non_child,ratio_credit_per_non_child
110093,Cash,M,0.0,117000.0,1125000.0,32895.0,Secondary / secondary special,Married,0.028663,16007.0,2646.0,20.0,0.0,0.0,2.0,,0.628266,,0.0,0.0,0.0,0.0,1.0,4.0,5.0,4.639906,,,0.628266,0.628266,0.628266,0.628266,0.0,34.199726,0.281154,0.02924,9.615385,2.055038,70.281752,7.309302,44.217687,425.170068,12.431973,6.049509,3353.571,0.007559,0.001249,,706799.427892,,,20666.815272,,0.628266,0.0,0.628266,True,False,True,562500.0,1125000.0,0.0,58500.0,inf,2.0,0.0,58500.0,562500.0
110094,Cash,F,2.0,81000.0,312768.0,17095.5,Secondary / secondary special,Married,0.019689,10315.0,459.0,,0.0,0.0,4.0,,0.578161,0.18849,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,,,0.18849,0.578161,0.383325,0.383325,0.037961,18.295341,0.211056,0.054659,3.861333,1.657344,30.321667,7.852642,176.470588,681.411765,37.245098,22.472767,1594.809,,,,180830.265914,58953.494506,,9883.951718,3222.322825,0.578161,0.18849,0.766651,True,False,False,78192.0,104256.0,0.5,20250.0,40500.0,2.0,1.0,40500.0,156384.0


## KFold

In [76]:
cb_params = {
    "n_estimators": 5000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
}

In [77]:
cv = KFold(n_splits=5, random_state=42, shuffle=True)

estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=train, y=target, cv=cv, categorical=categorial
)

Thu Oct  8 16:23:10 2020, Cross-Validation, 110093 rows, 68 cols
0:	test: 0.6565994	test1: 0.6638073	best: 0.6638073 (0)	total: 65.4ms	remaining: 5m 26s
10:	test: 0.6977448	test1: 0.7088725	best: 0.7088725 (10)	total: 655ms	remaining: 4m 57s
20:	test: 0.7008015	test1: 0.7134957	best: 0.7135489 (19)	total: 1.28s	remaining: 5m 2s
30:	test: 0.7028288	test1: 0.7169384	best: 0.7169384 (30)	total: 1.96s	remaining: 5m 13s
40:	test: 0.7027172	test1: 0.7172026	best: 0.7176881 (38)	total: 2.64s	remaining: 5m 19s
50:	test: 0.7028723	test1: 0.7182142	best: 0.7182142 (50)	total: 3.37s	remaining: 5m 27s
60:	test: 0.7035161	test1: 0.7185784	best: 0.7185784 (60)	total: 3.95s	remaining: 5m 20s
70:	test: 0.7045574	test1: 0.7182716	best: 0.7187004 (61)	total: 4.54s	remaining: 5m 15s
80:	test: 0.7056763	test1: 0.7191394	best: 0.7194094 (79)	total: 5.17s	remaining: 5m 14s
90:	test: 0.7059638	test1: 0.7195411	best: 0.7195411 (90)	total: 5.76s	remaining: 5m 10s
100:	test: 0.7066545	test1: 0.7198150	best: 0.7

0:	test: 0.6349044	test1: 0.6261488	best: 0.6261488 (0)	total: 92.7ms	remaining: 7m 43s
10:	test: 0.7004393	test1: 0.6965883	best: 0.6965883 (10)	total: 745ms	remaining: 5m 37s
20:	test: 0.7048944	test1: 0.6984208	best: 0.6984208 (20)	total: 1.39s	remaining: 5m 30s
30:	test: 0.7068013	test1: 0.6998469	best: 0.6998469 (30)	total: 1.95s	remaining: 5m 13s
40:	test: 0.7069735	test1: 0.6987356	best: 0.6998469 (30)	total: 2.53s	remaining: 5m 5s
50:	test: 0.7080987	test1: 0.7001478	best: 0.7001478 (50)	total: 3.09s	remaining: 4m 59s
60:	test: 0.7082226	test1: 0.7006529	best: 0.7006529 (60)	total: 3.72s	remaining: 5m 1s
70:	test: 0.7089791	test1: 0.7011174	best: 0.7020134 (68)	total: 4.31s	remaining: 4m 59s
80:	test: 0.7092325	test1: 0.7014424	best: 0.7020134 (68)	total: 4.9s	remaining: 4m 57s
90:	test: 0.7098040	test1: 0.7014968	best: 0.7022566 (88)	total: 5.49s	remaining: 4m 55s
100:	test: 0.7102780	test1: 0.7020861	best: 0.7023638 (97)	total: 6.08s	remaining: 4m 54s
110:	test: 0.7103895	tes

910:	test: 0.7444128	test1: 0.7213902	best: 0.7214736 (896)	total: 53.4s	remaining: 3m 59s
920:	test: 0.7447596	test1: 0.7212407	best: 0.7214928 (913)	total: 54s	remaining: 3m 59s
930:	test: 0.7449292	test1: 0.7213129	best: 0.7214928 (913)	total: 54.6s	remaining: 3m 58s
940:	test: 0.7452006	test1: 0.7214055	best: 0.7214928 (913)	total: 55.2s	remaining: 3m 58s
950:	test: 0.7454482	test1: 0.7214177	best: 0.7214928 (913)	total: 56s	remaining: 3m 58s
960:	test: 0.7455947	test1: 0.7214337	best: 0.7214928 (913)	total: 56.5s	remaining: 3m 57s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7214927819
bestIteration = 913

Shrink model to first 914 iterations.
Fold 2, Valid score = 0.72149
0:	test: 0.6336174	test1: 0.6340081	best: 0.6340081 (0)	total: 63.2ms	remaining: 5m 15s
10:	test: 0.7017254	test1: 0.7026158	best: 0.7030648 (9)	total: 716ms	remaining: 5m 24s
20:	test: 0.7048065	test1: 0.7048871	best: 0.7052267 (19)	total: 1.35s	remaining: 5m 20s
30:	test: 0.7056357	test1

90:	test: 0.7084701	test1: 0.7067348	best: 0.7067537 (88)	total: 5.67s	remaining: 5m 6s
100:	test: 0.7093025	test1: 0.7071178	best: 0.7074091 (99)	total: 6.23s	remaining: 5m 2s
110:	test: 0.7100646	test1: 0.7075744	best: 0.7075744 (110)	total: 6.84s	remaining: 5m 1s
120:	test: 0.7105569	test1: 0.7080978	best: 0.7080978 (120)	total: 7.41s	remaining: 4m 58s
130:	test: 0.7106607	test1: 0.7080453	best: 0.7082389 (121)	total: 7.99s	remaining: 4m 57s
140:	test: 0.7109145	test1: 0.7079828	best: 0.7082389 (121)	total: 8.61s	remaining: 4m 56s
150:	test: 0.7114929	test1: 0.7085896	best: 0.7085896 (150)	total: 9.21s	remaining: 4m 55s
160:	test: 0.7118316	test1: 0.7094474	best: 0.7094474 (160)	total: 9.8s	remaining: 4m 54s
170:	test: 0.7120645	test1: 0.7097304	best: 0.7097304 (170)	total: 10.4s	remaining: 4m 53s
180:	test: 0.7132503	test1: 0.7101715	best: 0.7102641 (178)	total: 10.9s	remaining: 4m 50s
190:	test: 0.7141671	test1: 0.7107211	best: 0.7107211 (190)	total: 11.5s	remaining: 4m 50s
200:	t

1000:	test: 0.7433401	test1: 0.7267840	best: 0.7267840 (1000)	total: 59.8s	remaining: 3m 58s
1010:	test: 0.7435567	test1: 0.7267573	best: 0.7268513 (1007)	total: 1m	remaining: 3m 58s
1020:	test: 0.7437403	test1: 0.7267658	best: 0.7268513 (1007)	total: 1m	remaining: 3m 57s
1030:	test: 0.7439420	test1: 0.7268390	best: 0.7268619 (1028)	total: 1m 1s	remaining: 3m 56s
1040:	test: 0.7442707	test1: 0.7268729	best: 0.7269064 (1038)	total: 1m 2s	remaining: 3m 56s
1050:	test: 0.7444805	test1: 0.7269639	best: 0.7269639 (1050)	total: 1m 2s	remaining: 3m 55s
1060:	test: 0.7446559	test1: 0.7269235	best: 0.7269639 (1050)	total: 1m 3s	remaining: 3m 54s
1070:	test: 0.7449067	test1: 0.7269278	best: 0.7269639 (1050)	total: 1m 3s	remaining: 3m 54s
1080:	test: 0.7449543	test1: 0.7269227	best: 0.7269639 (1050)	total: 1m 4s	remaining: 3m 54s
1090:	test: 0.7451713	test1: 0.7269416	best: 0.7270164 (1083)	total: 1m 5s	remaining: 3m 54s
1100:	test: 0.7453373	test1: 0.7271062	best: 0.7271207 (1099)	total: 1m 6s	r

660:	test: 0.7387819	test1: 0.7106185	best: 0.7106253 (659)	total: 39.6s	remaining: 4m 19s
670:	test: 0.7390260	test1: 0.7107987	best: 0.7107987 (670)	total: 40.2s	remaining: 4m 19s
680:	test: 0.7393110	test1: 0.7107616	best: 0.7107987 (670)	total: 40.8s	remaining: 4m 18s
690:	test: 0.7395433	test1: 0.7108383	best: 0.7108919 (684)	total: 41.4s	remaining: 4m 18s
700:	test: 0.7398313	test1: 0.7108382	best: 0.7108919 (684)	total: 42s	remaining: 4m 17s
710:	test: 0.7401572	test1: 0.7108227	best: 0.7109674 (703)	total: 42.7s	remaining: 4m 17s
720:	test: 0.7404560	test1: 0.7108736	best: 0.7109674 (703)	total: 43.3s	remaining: 4m 16s
730:	test: 0.7407717	test1: 0.7107477	best: 0.7109674 (703)	total: 43.9s	remaining: 4m 16s
740:	test: 0.7409619	test1: 0.7109064	best: 0.7109674 (703)	total: 44.5s	remaining: 4m 15s
750:	test: 0.7411155	test1: 0.7110295	best: 0.7110436 (747)	total: 45s	remaining: 4m 14s
760:	test: 0.7412802	test1: 0.7110075	best: 0.7110436 (747)	total: 45.6s	remaining: 4m 13s
770

In [78]:
oof_score = roc_auc_score(
    target, oof_preds
)
print(f"OOF-score = {round(oof_score, 5)}")

OOF-score = 0.724


In [None]:
# estimator, test_prediction = catboost_hold_out_validation(params=cb_params, X=train, y=target, split_params = [0.7, 0.2, 0.1], categorical=categorial)


## Подготовка прогноза

In [79]:
test[numerical] = test[numerical].astype(float)
test[categorial] = test[categorial].astype(str)

In [80]:
y_pred = np.zeros(test.shape[0])
for estimator in estimators:
    y_pred += estimator.predict_proba(test)[:, 1]

In [None]:
# y_pred = estimator.predict_proba(test)[:, 1]

In [81]:
 y_pred = pd.DataFrame({
    "APPLICATION_NUMBER": test_id,
    "TARGET": y_pred / cv.n_splits
})
y_pred.to_csv("./geekbrains-competitive-data-analysis/5000_eval_cv_submit.csv", index=False)