In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import optuna
from optuna.pruners import SuccessiveHalvingPruner
from statsmodels.stats.outliers_influence import variance_inflation_factor
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
# чтение данных
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,"Макс. ПДЗ за Y-1 год, дней","Сред. ПДЗ за Y-1 год, дней","Кол-во просрочек свыше 5-ти дней за Y-1 год, шт.","Общая сумма ПДЗ свыше 5-ти дней за Y-1 год, руб.","Кол-во раз ПДЗ за Y-1 год, шт.",Факт 1,Факт 2,Факт 3,Факт 4,Факт 5,...,"Y-1, Прибыль (убыток) до налогообложения , RUB","Y-4, Прибыль (убыток) от продажи, RUB","Y-3, Прибыль (убыток) от продажи, RUB","Y-2, Прибыль (убыток) от продажи, RUB","Y-1, Прибыль (убыток) от продажи, RUB",Факт просрочки,Просрочка более 30 дней,Просрочка 0-30,"Оценка потенциала контрагента 1, руб.","Оценка потенциала контрагента 2, руб."
0,0,0.0,0,0.0,0,-10,-10.0,-10.0,-10,-10,...,3603784000.0,3280355000.0,6200120000.0,871619100.0,3658634000.0,1,0,1,-1.0,-1.0
1,0,0.0,0,0.0,0,-10,-10.0,-10.0,-10,-10,...,87475160.0,16300640.0,11091720.0,51357320.0,94110190.0,1,0,1,-1.0,-1.0
2,7,5.5,1,132825.299363,2,-10,-10.0,-10.0,-10,-10,...,-645643900.0,414858600.0,161131800.0,-92989810.0,-120721000.0,1,0,1,-1.0,-1.0
3,0,0.0,0,0.0,0,-10,-10.0,-10.0,-10,-10,...,3999298000.0,4903117000.0,5186553000.0,7869977000.0,4029232000.0,1,0,1,-1.0,-1.0
4,2,2.0,0,0.0,2,-10,-10.0,-10.0,-10,-10,...,49604080000.0,23389120000.0,37279840000.0,53075240000.0,56221220000.0,1,0,1,-1.0,-1.0


In [3]:
df.shape

(853, 135)

In [4]:
print('Факт просрочки')
print(df['Факт просрочки'].value_counts())
print('\nПросрочка 0-30')
print(df['Просрочка 0-30'].value_counts())
print('\nПросрочка более 30 дней')
print(df['Просрочка более 30 дней'].value_counts())

Факт просрочки
1    471
0    382
Name: Факт просрочки, dtype: int64

Просрочка 0-30
1    503
0    350
Name: Просрочка 0-30, dtype: int64

Просрочка более 30 дней
0    696
1    157
Name: Просрочка более 30 дней, dtype: int64


In [5]:
corr_matrix = df.corr()[['Факт просрочки', 
                         'Просрочка 0-30', 
                         'Просрочка более 30 дней']].abs().sort_values(by = ['Факт просрочки', 
                         'Просрочка 0-30', 
                         'Просрочка более 30 дней'], ascending = False).drop(['Факт просрочки', 
                         'Просрочка 0-30', 
                         'Просрочка более 30 дней'])
corr_matrix.style.format("{:.3}").background_gradient(cmap = 'coolwarm')

Unnamed: 0,Факт просрочки,Просрочка 0-30,Просрочка более 30 дней
Факт 47,0.343,0.653,0.0887
Факт 4,0.339,0.663,0.0919
Факт 8,0.339,0.663,0.0919
Факт 11,0.339,0.663,0.0919
Факт 17,0.339,0.663,0.0919
Факт 18,0.339,0.663,0.0919
Факт 19,0.339,0.663,0.0919
Факт 21,0.339,0.663,0.0919
Факт 22,0.339,0.663,0.0919
Факт 34,0.339,0.663,0.0919


## Исследование
* Проведем исследование подхода к моделированию на таргете "Факт просрочки"

### Выбор модели
* Выберем и исследуем два типа моделей:
    * Логистическая регрессия
    * Градиентный бустинг (catboost)

### Предобработка данных
* Предварительно нормализуем значения фичей для ускорения моделирования (отметим, что это не обязательный шаг, но так процесс схождения к минимуму выполнится быстрее) - требуется для логистической регрессии
* Так как в датасете присутствуют три таргета - для каждого требуется построить отдельную модель

### Подбор параметров
* Подберем паракметры для с помощью оптимизатора optuna
* Отметим, что так как у нас несбалансированная выборка, мы будем использовать стратифицированную кросс-валидацию и метрику roc_auc_score для проверки качества

### Факт просрочки

In [6]:
def objective(trial):
    classifier_name = trial.suggest_categorical('classifier', ['LogisticRegression', 'CatBoostClassifier'])
    
    y = df['Факт просрочки']
    X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)

    if classifier_name == 'LogisticRegression':
        
        # набор гиперпараметров
        lr_c = trial.suggest_loguniform('c', 1e-10, 1e10)
        lr_class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
        lr_solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
        
        # классификатор
        classifier_obj = LogisticRegression(max_iter=10000, 
                                            random_state=42,
                                            C=lr_c, 
                                            class_weight=lr_class_weight, 
                                            solver=lr_solver)
        
        # нормализация данных
        scaler = StandardScaler()
        transformer = ColumnTransformer([("st_scaler", 
                                     scaler, 
                                     X.columns)],
                                     remainder="passthrough")
        X = transformer.fit_transform(X)
    else:
        
        # набор гиперпараметров
        cb_l2_leaf_reg = trial.suggest_int('l2_leaf_reg', 1, 9)
        cb_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
        cb_depth = trial.suggest_int('depth', 3, 10)
        
        # классификатор
        classifier_obj = CatBoostClassifier(depth=cb_depth, 
                                            learning_rate=cb_learning_rate, 
                                            l2_leaf_reg=cb_l2_leaf_reg, 
                                            verbose=0)
        
        X = np.array(X)

    
    # обучение на кросс-валидации
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = []
    for train_index, test_index in kf.split(X,y):
        
        #split train/test
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        
        # fit and predict
        clf = classifier_obj.fit(X_train, y_train)
        score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        cv_score.append(score)

    return np.mean(cv_score)

In [7]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[32m[I 2022-03-21 15:14:40,872][0m A new study created in memory with name: no-name-44e6aab5-494d-4f55-8537-1fd21051d600[0m
[32m[I 2022-03-21 15:18:43,740][0m Trial 0 finished with value: 0.7361166557848352 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 3, 'learning_rate': 0.034062781830547395, 'depth': 10}. Best is trial 0 with value: 0.7361166557848352.[0m
[32m[I 2022-03-21 15:19:00,549][0m Trial 1 finished with value: 0.7261443068630551 and parameters: {'classifier': 'LogisticRegression', 'c': 82610638.4617017, 'class_weight': None, 'solver': 'sag'}. Best is trial 0 with value: 0.7361166557848352.[0m
[32m[I 2022-03-21 15:19:07,594][0m Trial 2 finished with value: 0.7500436791357402 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 9, 'learning_rate': 0.01833739893477881, 'depth': 3}. Best is trial 2 with value: 0.7500436791357402.[0m
[32m[I 2022-03-21 15:19:08,946][0m Trial 3 finished with value: 0.7345657077642697 and parameters:

[32m[I 2022-03-21 15:33:21,249][0m Trial 29 finished with value: 0.7602702878544216 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 8, 'learning_rate': 0.008934141874867154, 'depth': 4}. Best is trial 29 with value: 0.7602702878544216.[0m
[32m[I 2022-03-21 15:33:31,393][0m Trial 30 finished with value: 0.7454997248287165 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 6, 'learning_rate': 0.04253863442768653, 'depth': 4}. Best is trial 29 with value: 0.7602702878544216.[0m
[32m[I 2022-03-21 15:33:41,387][0m Trial 31 finished with value: 0.7576029442944631 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 8, 'learning_rate': 0.010657971908990826, 'depth': 4}. Best is trial 29 with value: 0.7602702878544216.[0m
[32m[I 2022-03-21 15:33:51,321][0m Trial 32 finished with value: 0.7611616552834801 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 9, 'learning_rate': 0.010064189137659423, 'depth': 4}. Best i

[32m[I 2022-03-21 15:44:43,064][0m Trial 61 finished with value: 0.7586709112187983 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 7, 'learning_rate': 0.00978794435365877, 'depth': 6}. Best is trial 32 with value: 0.7611616552834801.[0m
[32m[I 2022-03-21 15:45:13,021][0m Trial 62 finished with value: 0.7573773038419116 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 6, 'learning_rate': 0.005001798845514404, 'depth': 6}. Best is trial 32 with value: 0.7611616552834801.[0m
[32m[I 2022-03-21 15:45:59,396][0m Trial 63 finished with value: 0.7573881729162568 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 8, 'learning_rate': 0.01193986787887996, 'depth': 7}. Best is trial 32 with value: 0.7611616552834801.[0m
[32m[I 2022-03-21 15:46:18,133][0m Trial 64 finished with value: 0.7557973537631107 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 7, 'learning_rate': 0.0031637353911735926, 'depth': 5}. Best i

[32m[I 2022-03-21 16:05:13,658][0m Trial 93 finished with value: 0.7591794461211565 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 6, 'learning_rate': 0.005426145465697411, 'depth': 6}. Best is trial 32 with value: 0.7611616552834801.[0m
[32m[I 2022-03-21 16:05:43,694][0m Trial 94 finished with value: 0.7574811953532411 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 5, 'learning_rate': 0.011916687076389244, 'depth': 6}. Best is trial 32 with value: 0.7611616552834801.[0m
[32m[I 2022-03-21 16:06:14,706][0m Trial 95 finished with value: 0.7531871225962699 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 6, 'learning_rate': 0.01502536842276244, 'depth': 6}. Best is trial 32 with value: 0.7611616552834801.[0m
[32m[I 2022-03-21 16:06:33,530][0m Trial 96 finished with value: 0.7498798622842354 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 5, 'learning_rate': 0.01820297353991721, 'depth': 5}. Best is

Построим модель, используя лучшие параметры, подобранные оптимизатором:

In [8]:
study.best_params, study.best_value

({'classifier': 'CatBoostClassifier',
  'l2_leaf_reg': 9,
  'learning_rate': 0.010064189137659423,
  'depth': 4},
 0.7611616552834801)

In [9]:
y = df['Факт просрочки']
X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)


if study.best_params['classifier'] == 'LogisticRegression':
         
    # классификатор
    classifier_obj = LogisticRegression(max_iter=10000, 
                                        random_state=42,
                                        C=study.best_params['c'], 
                                        class_weight=study.best_params['class_weight'], 
                                        solver=study.best_params['solver'])
        
    # нормализация данных
    scaler = StandardScaler()
    transformer = ColumnTransformer([("st_scaler", 
                                 scaler, 
                                 X.columns)],
                                 remainder="passthrough")
    X = transformer.fit_transform(X)
else:
        
    # классификатор
    classifier_obj = CatBoostClassifier(depth=study.best_params['depth'], 
                                        learning_rate=study.best_params['learning_rate'],  
                                        l2_leaf_reg=study.best_params['l2_leaf_reg'],
                                        verbose=0)

    X = np.array(X)

    
# обучение на кросс-валидации
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_score = []
i = 1
for train_index, test_index in kf.split(X,y):
    print('{} из KFold {}'.format(i, kf.n_splits))
    #split train/test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    # fit and predict
    clf = classifier_obj.fit(X_train, y_train)
    score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print('ROC_AUC score:',score)
    i += 1
    cv_score.append(score)

print(f"\nСредний ROC_AUC score: {np.mean(cv_score)}")

1 из KFold 5
ROC_AUC score: 0.7518005540166205
2 из KFold 5
ROC_AUC score: 0.7420558165239016
3 из KFold 5
ROC_AUC score: 0.8022934512296215
4 из KFold 5
ROC_AUC score: 0.7802351623740202
5 из KFold 5
ROC_AUC score: 0.7294232922732363

Средний ROC_AUC score: 0.7611616552834801


### VIF score

Вычислим для каждой фичи коэффициент корреляции дисперсии (VIF). Известно, что чем больше увеличивается VIF, тем менее надежными будут результаты модели. В целом, значение VIF выше 10 указывает на высокую корреляцию и является поводом для беспокойства.

Оставим только фичи с коэффициентом VIF менее 10 и снова подберем гиперпараметры с помощью оптимизатора.

In [10]:
vif_data = pd.DataFrame()
tmp = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)
vif_data['feature'] = tmp.columns

vif_data['vif'] = [variance_inflation_factor(tmp.values, i) for i in range(len(tmp.columns))]
features_vif = vif_data[vif_data['vif'] < 10]['feature'].values
vif_data.head()

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,feature,vif
0,"Макс. ПДЗ за Y-1 год, дней",3.351972
1,"Сред. ПДЗ за Y-1 год, дней",2.616495
2,"Кол-во просрочек свыше 5-ти дней за Y-1 год, шт.",4.984048
3,"Общая сумма ПДЗ свыше 5-ти дней за Y-1 год, руб.",3.631964
4,"Кол-во раз ПДЗ за Y-1 год, шт.",4.36383


In [11]:
def objective_vif(trial):
    classifier_name = trial.suggest_categorical('classifier', ['LogisticRegression', 'CatBoostClassifier'])
    
    y = df['Факт просрочки']
    X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)
    X = X[features_vif]

    if classifier_name == 'LogisticRegression':
        
        # набор гиперпараметров
        lr_c = trial.suggest_loguniform('c', 1e-10, 1e10)
        lr_class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
        lr_solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
        
        # классификатор
        classifier_obj = LogisticRegression(max_iter=10000, 
                                            random_state=42,
                                            C=lr_c, 
                                            class_weight=lr_class_weight, 
                                            solver=lr_solver)
        
        # нормализация данных
        scaler = StandardScaler()
        transformer = ColumnTransformer([("st_scaler", 
                                     scaler, 
                                     X.columns)],
                                     remainder="passthrough")
        X = transformer.fit_transform(X)
    else:
        
        # набор гиперпараметров
        cb_l2_leaf_reg = trial.suggest_int('l2_leaf_reg', 1, 9)
        cb_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
        cb_depth = trial.suggest_int('depth', 3, 10)
        
        # классификатор
        classifier_obj = CatBoostClassifier(depth=cb_depth, 
                                            learning_rate=cb_learning_rate, 
                                            l2_leaf_reg=cb_l2_leaf_reg, 
                                            verbose=0)
        
        X = np.array(X)

    
    # обучение на кросс-валидации
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = []
    for train_index, test_index in kf.split(X,y):
        
        #split train/test
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        
        # fit and predict
        clf = classifier_obj.fit(X_train, y_train)
        score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        cv_score.append(score)

    return np.mean(cv_score)

In [12]:
study_vif = optuna.create_study(direction='maximize')
study_vif.optimize(objective_vif, n_trials=100)

[32m[I 2022-03-21 16:07:54,538][0m A new study created in memory with name: no-name-ac0a32e7-a08c-4568-9e2c-2d218880229d[0m
[32m[I 2022-03-21 16:07:54,597][0m Trial 0 finished with value: 0.679151456341148 and parameters: {'classifier': 'LogisticRegression', 'c': 3.3778323092697665e-08, 'class_weight': None, 'solver': 'sag'}. Best is trial 0 with value: 0.679151456341148.[0m
[32m[I 2022-03-21 16:07:54,999][0m Trial 1 finished with value: 0.7482657186333441 and parameters: {'classifier': 'LogisticRegression', 'c': 47242744.899834126, 'class_weight': 'balanced', 'solver': 'sag'}. Best is trial 1 with value: 0.7482657186333441.[0m
[32m[I 2022-03-21 16:07:55,576][0m Trial 2 finished with value: 0.7483196965997403 and parameters: {'classifier': 'LogisticRegression', 'c': 19.927885837780178, 'class_weight': None, 'solver': 'saga'}. Best is trial 2 with value: 0.7483196965997403.[0m
[32m[I 2022-03-21 16:08:01,323][0m Trial 3 finished with value: 0.757034229547196 and parameters:

[32m[I 2022-03-21 16:10:10,291][0m Trial 31 finished with value: 0.7802436567087064 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 6, 'learning_rate': 0.012621702379988812, 'depth': 4}. Best is trial 31 with value: 0.7802436567087064.[0m
[32m[I 2022-03-21 16:10:13,282][0m Trial 32 finished with value: 0.7814706718695345 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 7, 'learning_rate': 0.0098598398289645, 'depth': 4}. Best is trial 32 with value: 0.7814706718695345.[0m
[32m[I 2022-03-21 16:10:17,037][0m Trial 33 finished with value: 0.77920371209506 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 5, 'learning_rate': 0.010487213004184284, 'depth': 5}. Best is trial 32 with value: 0.7814706718695345.[0m
[32m[I 2022-03-21 16:10:17,103][0m Trial 34 finished with value: 0.6793444283364423 and parameters: {'classifier': 'LogisticRegression', 'c': 1.226879143537459e-10, 'class_weight': None, 'solver': 'newton-cg'}. Best 

[32m[I 2022-03-21 16:11:52,323][0m Trial 63 finished with value: 0.7781523779161842 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 5, 'learning_rate': 0.006457207212660208, 'depth': 4}. Best is trial 32 with value: 0.7814706718695345.[0m
[32m[I 2022-03-21 16:11:56,330][0m Trial 64 finished with value: 0.7737034285040709 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 5, 'learning_rate': 0.012870461062584194, 'depth': 5}. Best is trial 32 with value: 0.7814706718695345.[0m
[32m[I 2022-03-21 16:11:59,052][0m Trial 65 finished with value: 0.7726943612466981 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 7, 'learning_rate': 0.027607015668783392, 'depth': 3}. Best is trial 32 with value: 0.7814706718695345.[0m
[32m[I 2022-03-21 16:12:02,326][0m Trial 66 finished with value: 0.7772115638531328 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 5, 'learning_rate': 0.0042521655914780545, 'depth': 4}. Best

[32m[I 2022-03-21 16:13:51,565][0m Trial 95 finished with value: 0.7772428889846523 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 8, 'learning_rate': 0.01432167979950599, 'depth': 3}. Best is trial 83 with value: 0.7826727226036179.[0m
[32m[I 2022-03-21 16:13:51,614][0m Trial 96 finished with value: 0.6835689504990207 and parameters: {'classifier': 'LogisticRegression', 'c': 0.0003070366139695898, 'class_weight': None, 'solver': 'sag'}. Best is trial 83 with value: 0.7826727226036179.[0m
[32m[I 2022-03-21 16:13:55,453][0m Trial 97 finished with value: 0.7700263976902452 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 8, 'learning_rate': 0.0016567321648197006, 'depth': 5}. Best is trial 83 with value: 0.7826727226036179.[0m
[32m[I 2022-03-21 16:13:58,608][0m Trial 98 finished with value: 0.7827087627702056 and parameters: {'classifier': 'CatBoostClassifier', 'l2_leaf_reg': 7, 'learning_rate': 0.008798911093143354, 'depth': 4}. Best is

Построим модель, используя лучшие параметры, подобранные оптимизатором:

In [13]:
study_vif.best_params, study_vif.best_value

({'classifier': 'CatBoostClassifier',
  'l2_leaf_reg': 7,
  'learning_rate': 0.008798911093143354,
  'depth': 4},
 0.7827087627702056)

In [14]:
y = df['Факт просрочки']
X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)
X = X[features_vif]

if study_vif.best_params['classifier'] == 'LogisticRegression':
         
    # классификатор
    classifier_obj = LogisticRegression(max_iter=10000, 
                                        random_state=42,
                                        C=study_vif.best_params['c'], 
                                        class_weight=study_vif.best_params['class_weight'], 
                                        solver=study_vif.best_params['solver'])
        
    # нормализация данных
    scaler = StandardScaler()
    transformer = ColumnTransformer([("st_scaler", 
                                 scaler, 
                                 X.columns)],
                                 remainder="passthrough")
    X = transformer.fit_transform(X)
else:
        
    # классификатор
    classifier_obj = CatBoostClassifier(depth=study_vif.best_params['depth'], 
                                        learning_rate=study_vif.best_params['learning_rate'],  
                                        l2_leaf_reg=study_vif.best_params['l2_leaf_reg'],
                                        verbose=0)

    X = np.array(X)

    
# обучение на кросс-валидации
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_score = []
i = 1
for train_index, test_index in kf.split(X,y):
    print('{} из KFold {}'.format(i, kf.n_splits))
    #split train/test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    # fit and predict
    clf = classifier_obj.fit(X_train, y_train)
    score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print('ROC_AUC score:',score)
    i += 1
    cv_score.append(score)

print(f"\nСредний ROC_AUC score: {np.mean(cv_score)}")

1 из KFold 5
ROC_AUC score: 0.771191135734072
2 из KFold 5
ROC_AUC score: 0.7714838353136225
3 из KFold 5
ROC_AUC score: 0.8139679469466703
4 из KFold 5
ROC_AUC score: 0.7951427771556551
5 из KFold 5
ROC_AUC score: 0.7617581187010078

Средний ROC_AUC score: 0.7827087627702056


### ASHA

Один из методов многорукого бандита для подобора гиперпараметров называется ASHA (асинхронное последовательное деление пополам). Общая идея такова:

* запустить кучу конфигураций параметров в течение некоторого времени
* обрезать (половину) наименее перспективных запусков 
* запустите кучу конфигураций параметров еще на некоторое время
* обрезать (половину) наименее перспективных запусков 
* остановить, когда останется только одна конфигурация

Таким образом, поиск может сосредоточиться на более многообещающих запусках.

In [15]:
def objective_vif_pruner(trial):
    
    y = df['Факт просрочки']
    X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)
    X = X[features_vif]
 
    # набор гиперпараметров
    cb_l2_leaf_reg = trial.suggest_int('l2_leaf_reg', 1, 9)
    cb_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
    cb_depth = trial.suggest_int('depth', 3, 10)

    # классификатор
    classifier_obj = CatBoostClassifier(depth=cb_depth, 
                                        learning_rate=cb_learning_rate, 
                                        l2_leaf_reg=cb_l2_leaf_reg, 
                                        verbose=0)

    X = np.array(X)

    
    # обучение на кросс-валидации
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = []
    for train_index, test_index in kf.split(X,y):
        
        #split train/test
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        
        # fit and predict
        clf = classifier_obj.fit(X_train, y_train)
        score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        cv_score.append(score)

    return np.mean(cv_score)

In [16]:
study_vif_pruner = optuna.create_study(direction='maximize', pruner=SuccessiveHalvingPruner)
study_vif_pruner.optimize(objective_vif_pruner, n_trials=100)

[32m[I 2022-03-21 16:14:04,475][0m A new study created in memory with name: no-name-0c4589eb-8984-44af-99b2-aa02706515a2[0m
[32m[I 2022-03-21 16:14:11,277][0m Trial 0 finished with value: 0.7740425857221697 and parameters: {'l2_leaf_reg': 1, 'learning_rate': 0.007320763930920095, 'depth': 7}. Best is trial 0 with value: 0.7740425857221697.[0m
[32m[I 2022-03-21 16:14:15,148][0m Trial 1 finished with value: 0.7575702892321918 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.04238663082547672, 'depth': 5}. Best is trial 0 with value: 0.7740425857221697.[0m
[32m[I 2022-03-21 16:14:22,026][0m Trial 2 finished with value: 0.7766687990208647 and parameters: {'l2_leaf_reg': 1, 'learning_rate': 0.0058361511574492615, 'depth': 7}. Best is trial 2 with value: 0.7766687990208647.[0m
[32m[I 2022-03-21 16:14:25,177][0m Trial 3 finished with value: 0.7522577000120172 and parameters: {'l2_leaf_reg': 1, 'learning_rate': 0.08640681673232745, 'depth': 4}. Best is trial 2 with value: 0.

[32m[I 2022-03-21 16:18:39,468][0m Trial 36 finished with value: 0.7602516783917446 and parameters: {'l2_leaf_reg': 6, 'learning_rate': 0.04931440883410071, 'depth': 3}. Best is trial 26 with value: 0.7813864327162199.[0m
[32m[I 2022-03-21 16:18:44,350][0m Trial 37 finished with value: 0.780695580573137 and parameters: {'l2_leaf_reg': 9, 'learning_rate': 0.006601973230827783, 'depth': 5}. Best is trial 26 with value: 0.7813864327162199.[0m
[32m[I 2022-03-21 16:18:49,347][0m Trial 38 finished with value: 0.7786376897399766 and parameters: {'l2_leaf_reg': 9, 'learning_rate': 0.013251421279950206, 'depth': 5}. Best is trial 26 with value: 0.7813864327162199.[0m
[32m[I 2022-03-21 16:18:53,148][0m Trial 39 finished with value: 0.7578502291308032 and parameters: {'l2_leaf_reg': 9, 'learning_rate': 0.09616490772273315, 'depth': 4}. Best is trial 26 with value: 0.7813864327162199.[0m
[32m[I 2022-03-21 16:18:56,931][0m Trial 40 finished with value: 0.7652199628920616 and parameter

[32m[I 2022-03-21 16:20:56,258][0m Trial 73 finished with value: 0.7794144228789422 and parameters: {'l2_leaf_reg': 8, 'learning_rate': 0.009425669480015043, 'depth': 3}. Best is trial 64 with value: 0.782122931144414.[0m
[32m[I 2022-03-21 16:21:00,462][0m Trial 74 finished with value: 0.7768629382935094 and parameters: {'l2_leaf_reg': 9, 'learning_rate': 0.01429732607119633, 'depth': 5}. Best is trial 64 with value: 0.782122931144414.[0m
[32m[I 2022-03-21 16:21:04,140][0m Trial 75 finished with value: 0.7768766585863008 and parameters: {'l2_leaf_reg': 7, 'learning_rate': 0.00415716060349916, 'depth': 4}. Best is trial 64 with value: 0.782122931144414.[0m
[32m[I 2022-03-21 16:21:08,772][0m Trial 76 finished with value: 0.7820673993596432 and parameters: {'l2_leaf_reg': 8, 'learning_rate': 0.0071901669555621585, 'depth': 4}. Best is trial 64 with value: 0.782122931144414.[0m
[32m[I 2022-03-21 16:21:11,683][0m Trial 77 finished with value: 0.7783670612701967 and parameters:

Построим модель, используя лучшие параметры, подобранные оптимизатором:

In [17]:
study_vif_pruner.best_params, study_vif_pruner.best_value

({'l2_leaf_reg': 9, 'learning_rate': 0.012641768106244029, 'depth': 4},
 0.7828634442412659)

In [18]:
y = df['Факт просрочки']
X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)
X = X[features_vif]

        
# классификатор
classifier_obj = CatBoostClassifier(depth=study_vif_pruner.best_params['depth'], 
                                    learning_rate=study_vif_pruner.best_params['learning_rate'],  
                                    l2_leaf_reg=study_vif_pruner.best_params['l2_leaf_reg'],
                                    verbose=0)

X = np.array(X)

    
# обучение на кросс-валидации
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_score = []
i = 1
for train_index, test_index in kf.split(X,y):
    print('{} из KFold {}'.format(i, kf.n_splits))
    #split train/test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    # fit and predict
    clf = classifier_obj.fit(X_train, y_train)
    score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print('ROC_AUC score:',score)
    i += 1
    cv_score.append(score)

print(f"\nСредний ROC_AUC score: {np.mean(cv_score)}")

1 из KFold 5
ROC_AUC score: 0.7759002770083102
2 из KFold 5
ROC_AUC score: 0.7702403978999725
3 из KFold 5
ROC_AUC score: 0.8164548217739708
4 из KFold 5
ROC_AUC score: 0.7910834266517357
5 из KFold 5
ROC_AUC score: 0.7606382978723404

Средний ROC_AUC score: 0.7828634442412659


### Просрочка 0-30
* Проделаем последнюю итерацию предыдущего пайплайна для таргета "Просрочка 0-30"

In [19]:
def objective_vif_pruner_30(trial):
    
    y = df['Просрочка 0-30']
    X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)
    X = X[features_vif]
 
    # набор гиперпараметров
    cb_l2_leaf_reg = trial.suggest_int('l2_leaf_reg', 1, 9)
    cb_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
    cb_depth = trial.suggest_int('depth', 3, 10)

    # классификатор
    classifier_obj = CatBoostClassifier(depth=cb_depth, 
                                        learning_rate=cb_learning_rate, 
                                        l2_leaf_reg=cb_l2_leaf_reg, 
                                        verbose=0)

    X = np.array(X)

    
    # обучение на кросс-валидации
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = []
    for train_index, test_index in kf.split(X,y):
        
        #split train/test
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        
        # fit and predict
        clf = classifier_obj.fit(X_train, y_train)
        score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        cv_score.append(score)

    return np.mean(cv_score)

In [20]:
study_30 = optuna.create_study(direction='maximize', pruner=SuccessiveHalvingPruner)
study_30.optimize(objective_vif_pruner_30, n_trials=100)

[32m[I 2022-03-21 16:22:48,514][0m A new study created in memory with name: no-name-9a5f4bdf-bf0e-4093-b1e1-5c84d6ade52b[0m
[32m[I 2022-03-21 16:22:51,682][0m Trial 0 finished with value: 0.8573131541725602 and parameters: {'l2_leaf_reg': 9, 'learning_rate': 0.09374184286277845, 'depth': 4}. Best is trial 0 with value: 0.8573131541725602.[0m
[32m[I 2022-03-21 16:22:54,198][0m Trial 1 finished with value: 0.8713043847241867 and parameters: {'l2_leaf_reg': 6, 'learning_rate': 0.028789144839377896, 'depth': 3}. Best is trial 1 with value: 0.8713043847241867.[0m
[32m[I 2022-03-21 16:22:58,871][0m Trial 2 finished with value: 0.8539768033946252 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.09681820344575857, 'depth': 6}. Best is trial 1 with value: 0.8713043847241867.[0m
[32m[I 2022-03-21 16:23:05,759][0m Trial 3 finished with value: 0.8552427157001414 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.09514529151562796, 'depth': 7}. Best is trial 1 with value: 0.87

[32m[I 2022-03-21 16:26:25,739][0m Trial 36 finished with value: 0.8888868458274398 and parameters: {'l2_leaf_reg': 1, 'learning_rate': 0.006232310106639965, 'depth': 4}. Best is trial 31 with value: 0.8919404526166902.[0m
[32m[I 2022-03-21 16:26:29,671][0m Trial 37 finished with value: 0.869563224893918 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.024619199782960014, 'depth': 5}. Best is trial 31 with value: 0.8919404526166902.[0m
[32m[I 2022-03-21 16:26:37,067][0m Trial 38 finished with value: 0.8819321074964639 and parameters: {'l2_leaf_reg': 6, 'learning_rate': 0.011753984725588935, 'depth': 7}. Best is trial 31 with value: 0.8919404526166902.[0m
[32m[I 2022-03-21 16:26:40,114][0m Trial 39 finished with value: 0.8739069306930694 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.018487081044256518, 'depth': 3}. Best is trial 31 with value: 0.8919404526166902.[0m
[32m[I 2022-03-21 16:26:45,977][0m Trial 40 finished with value: 0.8919213578500708 and paramet

[32m[I 2022-03-21 16:29:39,475][0m Trial 73 finished with value: 0.8905711456859973 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.001214963380761474, 'depth': 5}. Best is trial 41 with value: 0.8920115983026873.[0m
[32m[I 2022-03-21 16:29:44,783][0m Trial 74 finished with value: 0.8825564356435643 and parameters: {'l2_leaf_reg': 4, 'learning_rate': 0.01053806973312919, 'depth': 6}. Best is trial 41 with value: 0.8920115983026873.[0m
[32m[I 2022-03-21 16:29:48,778][0m Trial 75 finished with value: 0.8628746817538898 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.03905938006581867, 'depth': 4}. Best is trial 41 with value: 0.8920115983026873.[0m
[32m[I 2022-03-21 16:29:53,486][0m Trial 76 finished with value: 0.8546130127298444 and parameters: {'l2_leaf_reg': 3, 'learning_rate': 0.09936834925930105, 'depth': 5}. Best is trial 41 with value: 0.8920115983026873.[0m
[32m[I 2022-03-21 16:30:00,103][0m Trial 77 finished with value: 0.8880144271570014 and parameter

In [21]:
study_30.best_params, study_30.best_value

({'l2_leaf_reg': 4, 'learning_rate': 0.003505717277174708, 'depth': 6},
 0.8923513437057992)

In [22]:
y = df['Просрочка 0-30']
X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)
X = X[features_vif]

        
# классификатор
classifier_obj = CatBoostClassifier(depth=study_30.best_params['depth'], 
                                    learning_rate=study_30.best_params['learning_rate'],  
                                    l2_leaf_reg=study_30.best_params['l2_leaf_reg'],
                                    verbose=0)

X = np.array(X)

    
# обучение на кросс-валидации
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_score = []
i = 1
for train_index, test_index in kf.split(X,y):
    print('{} из KFold {}'.format(i, kf.n_splits))
    #split train/test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    # fit and predict
    clf = classifier_obj.fit(X_train, y_train)
    score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print('ROC_AUC score:',score)
    i += 1
    cv_score.append(score)

print(f"\nСредний ROC_AUC score: {np.mean(cv_score)}")

1 из KFold 5
ROC_AUC score: 0.9305516265912305
2 из KFold 5
ROC_AUC score: 0.8823196605374823
3 из KFold 5
ROC_AUC score: 0.8900282885431401
4 из KFold 5
ROC_AUC score: 0.8792142857142857
5 из KFold 5
ROC_AUC score: 0.8796428571428571

Средний ROC_AUC score: 0.8923513437057992


### Просрочка более 30 дней

In [23]:
def objective_vif_pruner_more_30(trial):
    
    y = df['Просрочка более 30 дней']
    X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)
    X = X[features_vif]
 
    # набор гиперпараметров
    cb_l2_leaf_reg = trial.suggest_int('l2_leaf_reg', 1, 9)
    cb_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
    cb_depth = trial.suggest_int('depth', 3, 10)

    # классификатор
    classifier_obj = CatBoostClassifier(depth=cb_depth, 
                                        learning_rate=cb_learning_rate, 
                                        l2_leaf_reg=cb_l2_leaf_reg, 
                                        verbose=0)

    X = np.array(X)

    
    # обучение на кросс-валидации
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = []
    for train_index, test_index in kf.split(X,y):
        
        #split train/test
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        
        # fit and predict
        clf = classifier_obj.fit(X_train, y_train)
        score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        cv_score.append(score)

    return np.mean(cv_score)

In [24]:
study_more_30 = optuna.create_study(direction='maximize', pruner=SuccessiveHalvingPruner)
study_more_30.optimize(objective_vif_pruner_more_30, n_trials=100)

[32m[I 2022-03-21 16:32:11,955][0m A new study created in memory with name: no-name-546a02a8-8bf0-43d7-82cd-48620c02f660[0m
[32m[I 2022-03-21 16:32:14,815][0m Trial 0 finished with value: 0.7818715396346517 and parameters: {'l2_leaf_reg': 2, 'learning_rate': 0.011213002311697251, 'depth': 3}. Best is trial 0 with value: 0.7818715396346517.[0m
[32m[I 2022-03-21 16:32:19,014][0m Trial 1 finished with value: 0.7470366840168418 and parameters: {'l2_leaf_reg': 7, 'learning_rate': 0.0675459628029521, 'depth': 5}. Best is trial 0 with value: 0.7818715396346517.[0m
[32m[I 2022-03-21 16:32:23,328][0m Trial 2 finished with value: 0.7364275768325432 and parameters: {'l2_leaf_reg': 4, 'learning_rate': 0.06159923969920962, 'depth': 5}. Best is trial 0 with value: 0.7818715396346517.[0m
[32m[I 2022-03-21 16:32:27,464][0m Trial 3 finished with value: 0.7299558233597454 and parameters: {'l2_leaf_reg': 1, 'learning_rate': 0.046655803938096166, 'depth': 5}. Best is trial 0 with value: 0.78

[32m[I 2022-03-21 16:41:55,221][0m Trial 36 finished with value: 0.7868015863806651 and parameters: {'l2_leaf_reg': 8, 'learning_rate': 0.009732472915949543, 'depth': 9}. Best is trial 14 with value: 0.7942569953253986.[0m
[32m[I 2022-03-21 16:42:00,838][0m Trial 37 finished with value: 0.7491114942147665 and parameters: {'l2_leaf_reg': 5, 'learning_rate': 0.05138875517582074, 'depth': 6}. Best is trial 14 with value: 0.7942569953253986.[0m
[32m[I 2022-03-21 16:42:18,687][0m Trial 38 finished with value: 0.774070881543613 and parameters: {'l2_leaf_reg': 7, 'learning_rate': 0.022840291489577585, 'depth': 9}. Best is trial 14 with value: 0.7942569953253986.[0m
[32m[I 2022-03-21 16:42:41,222][0m Trial 39 finished with value: 0.7921367238006829 and parameters: {'l2_leaf_reg': 8, 'learning_rate': 0.006850822035567245, 'depth': 10}. Best is trial 14 with value: 0.7942569953253986.[0m
[32m[I 2022-03-21 16:43:09,485][0m Trial 40 finished with value: 0.7823370520173722 and paramet

[32m[I 2022-03-21 16:53:57,829][0m Trial 73 finished with value: 0.7863283244372244 and parameters: {'l2_leaf_reg': 7, 'learning_rate': 0.010363449924421751, 'depth': 8}. Best is trial 14 with value: 0.7942569953253986.[0m
[32m[I 2022-03-21 16:54:07,573][0m Trial 74 finished with value: 0.794968587342108 and parameters: {'l2_leaf_reg': 7, 'learning_rate': 0.005007193891429718, 'depth': 8}. Best is trial 74 with value: 0.794968587342108.[0m
[32m[I 2022-03-21 16:54:17,346][0m Trial 75 finished with value: 0.7963966614726652 and parameters: {'l2_leaf_reg': 7, 'learning_rate': 0.003852481683804885, 'depth': 8}. Best is trial 75 with value: 0.7963966614726652.[0m
[32m[I 2022-03-21 16:54:25,274][0m Trial 76 finished with value: 0.7957524533368698 and parameters: {'l2_leaf_reg': 8, 'learning_rate': 0.004322633173745245, 'depth': 7}. Best is trial 75 with value: 0.7963966614726652.[0m
[32m[I 2022-03-21 16:54:31,812][0m Trial 77 finished with value: 0.793516021284355 and parameter

In [25]:
study_more_30.best_params, study_more_30.best_value

({'l2_leaf_reg': 7, 'learning_rate': 0.003852481683804885, 'depth': 8},
 0.7963966614726652)

In [26]:
y = df['Просрочка более 30 дней']
X = df.drop(['Факт просрочки', 'Просрочка более 30 дней', 'Просрочка 0-30'], axis=1)
X = X[features_vif]

        
# классификатор
classifier_obj = CatBoostClassifier(depth=study_more_30.best_params['depth'], 
                                    learning_rate=study_more_30.best_params['learning_rate'],  
                                    l2_leaf_reg=study_more_30.best_params['l2_leaf_reg'],
                                    verbose=0)

X = np.array(X)

    
# обучение на кросс-валидации
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_score = []
i = 1
for train_index, test_index in kf.split(X,y):
    print('{} из KFold {}'.format(i, kf.n_splits))
    #split train/test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    # fit and predict
    clf = classifier_obj.fit(X_train, y_train)
    score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print('ROC_AUC score:',score)
    i += 1
    cv_score.append(score)

print(f"\nСредний ROC_AUC score: {np.mean(cv_score)}")

1 из KFold 5
ROC_AUC score: 0.8410138248847926
2 из KFold 5
ROC_AUC score: 0.845998201438849
3 из KFold 5
ROC_AUC score: 0.708408273381295
4 из KFold 5
ROC_AUC score: 0.7917150150847064
5 из KFold 5
ROC_AUC score: 0.794847992573683

Средний ROC_AUC score: 0.7963966614726652
