In [1]:
import numpy as np
import pandas as pd
from scipy import stats

import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GroupShuffleSplit, StratifiedGroupKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score,\
    recall_score, f1_score, log_loss
from sklearn.inspection import permutation_importance
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import seaborn as sns
import matplotlib.pyplot as plt
import shap

from numpy.typing import ArrayLike
from typing import TypeAlias

from metrics import get_metrics, check_overfitting

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

RAND = 10

In [2]:
# Directory prefix prepended to the parquet file names when the data is loaded.
ROOT = "../"

# Seed passed as `random_state` to the splitters and models in this notebook.
RAND = 10
# NOTE(review): despite the name, this value is passed as `test_size` of the
# first group split, i.e. it controls the size of the held-out TEST part.
N_SPLIT_TRAIN = 0.18
# Passed as `test_size` of the second group split, which carves the validation
# part out of the training part.
N_SPLIT_VAL = 0.15
# NOTE(review): not referenced in the visible cells — presumably meant for
# cross-validation; confirm or remove.
N_FOLDS = 5

# Overview

Данные взяты из соревнования на Kaggle. Основная информация по данным представлена здесь: https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data

Цель - предсказать, какие клиенты с большей вероятностью не смогут выплатить свои кредиты. Оценка должна быть стабильна с течением времени

# Evaluation

Результаты оцениваются с использованием показателя стабильности Джини. Показатель Джини рассчитывается для предсказаний, соответствующих каждой неделе WEEK_NUM:
$$\text{gini} = 2 \cdot AUC - 1$$

Далее на еженедельных значениях коэффициента Джини обучается линейная регрессия $a \cdot x + b$, и falling_rate рассчитывается как $\min(0, a)$. Это значение используется для штрафования моделей, качество которых падает со временем.
   
В итоге, вариабельность предсказаний рассчитывается путем взятия стандартного отклонения остатков (residuals) из приведенной выше линейной регрессии с применением штрафа к изменчивости модели.

Итоговая метрика рассчитывается следующим образом:
$$\text{stability metric} = \text{mean}(\text{gini}) + 88.0 \cdot \min(0, a) - 0.5 \cdot \text{std}(\text{residuals})$$

# Prepare data

In [4]:
# Load the train and test tables from the parquet files located under ROOT.
df_train = pd.read_parquet(f"{ROOT}train4.parquet")
df_test = pd.read_parquet(f"{ROOT}test4.parquet")

In [5]:
# Every object-dtype column of the training frame is treated as categorical;
# the same column list is then cast in the test frame.
# NOTE(review): each frame infers its own category set, so the category levels
# of train and test may differ — confirm the downstream models tolerate that.
cat_columns = list(df_train.select_dtypes("object").columns)
df_train[cat_columns] = df_train[cat_columns].astype("category")
df_test[cat_columns] = df_test[cat_columns].astype("category")

# Names of all categorical columns of the training frame (used later as
# CatBoost's cat_features).
category_features = list(df_train.select_dtypes("category").columns)

In [6]:
# Target vector and the week number of every row (the latter is used as the
# grouping key for the splits and is passed to get_metrics).
y = df_train["target"]
weeks = df_train["WEEK_NUM"]
# Feature matrix: everything except the target, the row identifier and the week.
X = df_train.drop(["target", "case_id", "WEEK_NUM"], axis=1)

Т.к. метрика стабильности gini рассчитывается для каждого значения WEEK_NUM, будем делить датасет в разрезе WEEK_NUM, чтобы значения WEEK_NUM не пересекались в тестовом и тренировочном датасетах

In [7]:
def train_test_split_by_groups(X: pd.DataFrame,
                               y: pd.Series,
                               groups: pd.Series,
                               test_size=None,
                               train_size=None,
                               random_state=None,
                               shuffle=True):
    """
    Split X, y and groups into train and test parts.

    Parameters
    ----------
    X, y, groups:
        Row-aligned features, target and group labels.
    test_size, train_size:
        Forwarded unchanged to the underlying scikit-learn splitter.
    random_state:
        Seed forwarded to the underlying scikit-learn splitter.
    shuffle:
        If True (default), the split is produced by ``GroupShuffleSplit``, so
        a group label never appears in both parts. If False, the rows are cut
        sequentially by ``train_test_split(shuffle=False)``; that path is NOT
        group-aware, so a group may straddle the cut unless the rows are
        already ordered by group.

    Returns
    -------
    X_train, X_test, y_train, y_test, groups_train, groups_test
    """
    if not shuffle:
        # Forward the caller's random_state rather than a notebook-level
        # global, so the function does not depend on hidden state.
        return train_test_split(X,
                                y,
                                groups,
                                test_size=test_size,
                                train_size=train_size,
                                shuffle=False,
                                random_state=random_state)

    splitter = GroupShuffleSplit(n_splits=1,
                                 test_size=test_size,
                                 train_size=train_size,
                                 random_state=random_state)
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_idx, test_idx = next(splitter.split(X, y, groups))
    return (X.iloc[train_idx], X.iloc[test_idx],
            y.iloc[train_idx], y.iloc[test_idx],
            groups.iloc[train_idx], groups.iloc[test_idx])

In [8]:
# First split: hold out a test part, grouped by week number.
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split_by_groups(
    X, y, weeks, test_size=N_SPLIT_TRAIN, random_state=RAND)

# Second split: carve a validation part out of the training part.
X_train_, X_val, y_train_, y_val, w_train_, w_val = train_test_split_by_groups(
    X_train, y_train, w_train, test_size=N_SPLIT_VAL, random_state=RAND)

# Report each part's share of all rows, in percent.
n_rows_total = X.shape[0]
for part_name, part in [("All train", X_train), ("train", X_train_),
                        ("val", X_val), ("test", X_test)]:
    print(f"{part_name}: {round(part.shape[0] / n_rows_total * 100, 2)}%")

All train: 79.67%
train: 65.35%
val: 14.31%
test: 20.33%


In [9]:
# One-hot encoded copy of the feature matrix; it is used by the RandomForest
# baseline further down.
X_bin = pd.get_dummies(X)

# Same grouped split (same test_size and seed) applied to the encoded matrix.
X_train_bin, X_test_bin, y_train_bin, y_test_bin, w_train_bin, w_test_bin = (
    train_test_split_by_groups(
        X_bin, y, weeks, test_size=N_SPLIT_TRAIN, random_state=RAND))

# Report the sizes of the parts created in THIS cell (not the X_train / X_test
# variables of the earlier split).
print(f"All train: {round(X_train_bin.shape[0] / X_bin.shape[0] * 100, 2)}%")
print(f"test: {round(X_test_bin.shape[0] / X_bin.shape[0] * 100, 2)}%")
print(X_bin.shape)

All train: 79.67%
test: 20.33%
(1526659, 661)


# Training

## LightGBM Baseline

In [10]:
%%time
ratio = float(np.sum(y_train_ == 0)) / np.sum(y_train_ == 1)
eval_set = [(X_val, y_val)]

lgbm_clf = lgb.LGBMClassifier(scale_pos_weight=ratio, random_state=RAND)

lgbm_clf.fit(X_train_,
             y_train_,
             eval_metric="auc",
             eval_set=eval_set,
             callbacks=[lgb.log_evaluation(100),
                        lgb.early_stopping(100)])

[LightGBM] [Info] Number of positive: 31039, number of negative: 966698
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.288582 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 29278
[LightGBM] [Info] Number of data points in the train set: 997737, number of used features: 241
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031109 -> initscore=-3.438642
[LightGBM] [Info] Start training from score -3.438642
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.833605	valid_0's binary_logloss: 0.478789
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.833605	valid_0's binary_logloss: 0.478789
CPU times: total: 1min 24s
Wall time: 16 s


In [11]:
%%time
# check_overfitting is a project helper imported from `metrics`; per the
# recorded output it reports the metric on both sets and their delta.
# Note that X_train is the full training part, i.e. it also contains the
# validation rows (X_val) that were used as eval_set while fitting.
check_overfitting(model=lgbm_clf,
                  metric_fun=roc_auc_score,
                  X_train=X_train,
                  y_train=y_train,
                  X_test=X_test,
                  y_test=y_test)

roc_auc_score train: 0.857
roc_auc_score test: 0.834
delta = 2.7 %
CPU times: total: 21.3 s
Wall time: 5.72 s


In [25]:
# Start the comparison table with the LightGBM baseline scored on the test part.
# (Train-part metrics can be appended the same way by passing
# y_train / X_train / w_train.)
metrics = get_metrics(
    y_test,
    lgbm_clf.predict(X_test),
    lgbm_clf.predict_proba(X_test),
    w_test,
    'LGBMClassifier_baseline_test',
)

metrics.set_index('model').round(5)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,gini_stability
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LGBMClassifier_baseline_test,0.72993,0.83436,0.09183,0.78556,0.16444,0.65266


## Catboost Baseline

In [18]:
%%time
ratio = float(np.sum(y_train_ == 0)) / np.sum(y_train_ == 1)
eval_set = [(X_val, y_val)]

cat_boost_clf = CatBoostClassifier(scale_pos_weight=ratio,
                                   eval_metric="AUC",
                                   cat_features=category_features,
                                   thread_count = 3,
                                   random_state=RAND)

cat_boost_clf.fit(X_train_,
                  y_train_,
                  eval_set=eval_set,
                  early_stopping_rounds=100,
                  verbose=False)

CPU times: total: 2h 30min 13s
Wall time: 51min 22s


<catboost.core.CatBoostClassifier at 0x1698ac7e8f0>

In [19]:
%%time
# Same train-vs-test ROC AUC comparison as for LightGBM, now for CatBoost.
# X_train is the full training part, so it includes the validation rows that
# served as eval_set during fitting.
check_overfitting(model=cat_boost_clf,
                  metric_fun=roc_auc_score,
                  X_train=X_train,
                  y_train=y_train,
                  X_test=X_test,
                  y_test=y_test)

roc_auc_score train: 0.885
roc_auc_score test: 0.842
delta = 5.1 %
CPU times: total: 38.9 s
Wall time: 32.4 s


In [26]:
# Append the CatBoost baseline (scored on the test part) to the comparison table.
# NOTE(review): re-running this cell appends the same row again.
# (Train-part metrics can be appended the same way by passing
# y_train / X_train / w_train.)
catboost_test_metrics = get_metrics(
    y_test,
    cat_boost_clf.predict(X_test),
    cat_boost_clf.predict_proba(X_test),
    w_test,
    'CatBoostClassifier_baseline_test',
)
metrics = pd.concat([metrics, catboost_test_metrics])

metrics.set_index('model').round(5)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,gini_stability
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LGBMClassifier_baseline_test,0.72993,0.83436,0.09183,0.78556,0.16444,0.65266
CatBoostClassifier_baseline_test,0.77309,0.84244,0.10396,0.7491,0.18258,0.66924


## RandomForest Baseline

In [21]:
%%time
# Fit a RandomForestClassifier with default hyper-parameters (only the seed is
# set) on the one-hot encoded training part.
# NOTE(review): unlike the boosting baselines, no class weighting is passed
# here, and the recorded metrics show recall of about 0.0003 for this model —
# confirm this is intended for the baseline.
rf_clf = RandomForestClassifier(random_state=RAND)
rf_clf.fit(X_train_bin, y_train_bin)

CPU times: total: 19min 12s
Wall time: 19min 14s


In [22]:
%%time
# Train-vs-test ROC AUC comparison for the random forest, using the one-hot
# encoded parts it was trained on.
check_overfitting(model=rf_clf,
                  metric_fun=roc_auc_score,
                  X_train=X_train_bin,
                  y_train=y_train_bin,
                  X_test=X_test_bin,
                  y_test=y_test_bin)

roc_auc_score train: 1.000
roc_auc_score test: 0.790
delta = 26.5 %
CPU times: total: 1min 4s
Wall time: 1min 4s


In [27]:
# Append the RandomForest baseline (scored on the one-hot encoded test part)
# to the comparison table.
# NOTE(review): re-running this cell appends the same row again.
# (Train-part metrics can be appended the same way by passing
# y_train_bin / X_train_bin / w_train_bin.)
rf_test_metrics = get_metrics(
    y_test_bin,
    rf_clf.predict(X_test_bin),
    rf_clf.predict_proba(X_test_bin),
    w_test_bin,
    'RandomForestClassifier_baseline_test',
)
metrics = pd.concat([metrics, rf_test_metrics])

# Highlight the best value in every metric column.
metrics.set_index('model').style.highlight_max(axis=0, color='lightblue')

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,gini_stability
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LGBMClassifier_baseline_test,0.72993,0.834365,0.091831,0.785565,0.164439,0.652661
CatBoostClassifier_baseline_test,0.773091,0.842445,0.103959,0.749095,0.18258,0.669241
RandomForestClassifier_baseline_test,0.966177,0.79039,0.75,0.000286,0.000571,0.560007


**Выводы**
- лучший результат из бейзлайнов показал CatBoostClassifier, но он долго обучается на текущем датасете
- LGBMClassifier обучается намного быстрее, но результат чуть ниже

In [31]:
# Persist the baseline comparison table next to the notebook.
metrics.to_csv("baseline_metrics.csv", index=False)

# Submission

In [16]:
def submit(df_test, model, path="submission.csv"):
    """
    Score ``df_test`` with ``model`` and write the submission file.

    Parameters
    ----------
    df_test:
        Frame with a ``case_id`` column plus the model's feature columns.
        ``WEEK_NUM`` and ``target`` are dropped if present, since they are not
        model features.
    model:
        Fitted classifier exposing ``predict_proba``; the second column of its
        output is written as the score.
    path:
        Destination of the CSV file (defaults to ``submission.csv`` in the
        working directory).

    The file has two columns, ``case_id`` and ``score``. A null check on the
    scores is printed before writing. Returns None.
    """
    # errors="ignore": an input that lacks (or additionally carries) one of the
    # non-feature columns is still scored on the feature columns only.
    features = (df_test
                .drop(columns=["WEEK_NUM", "target"], errors="ignore")
                .set_index("case_id"))

    # The index of `features` is named "case_id", so it becomes the first CSV
    # column with that header.
    df_submission = pd.DataFrame(
        {"score": model.predict_proba(features)[:, 1]},
        index=features.index,
    )

    print("Check null: ", df_submission["score"].isnull().any())

    df_submission.to_csv(path)

In [17]:
# Score the test frame with the LightGBM baseline and write the submission file.
submit(df_test, lgbm_clf)

Check null:  False
