In [127]:
import json
import warnings
from typing import Callable, Any

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import yaml
from numpy.typing import ArrayLike
from optuna.visualization import plot_optimization_history
from optuna.visualization.matplotlib import plot_param_importances
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
    recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GroupShuffleSplit, StratifiedGroupKFold

warnings.filterwarnings('ignore')

In [117]:
config_path = '../config/params.yml'

# Read the YAML config inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left to the garbage collector.
# NOTE(review): yaml.safe_load is the more conservative choice if this file
# could ever come from a source that is not fully trusted.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Per-stage sections of the config used by the cells below.
preproc_config = config['preprocessing']
training_config = config['train']
prediction_config = config['prediction']

# Preprocessing data

In [118]:
# Load the dataset to score from the parquet file named in the prediction config.
predict_path = prediction_config["predict_path"]
df_test = pd.read_parquet(predict_path)
df_test.head()

Unnamed: 0,case_id,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,...,mode_collaterals_typeofguarante_669M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,mode_subjectroles_name_541M,month_decision,weekday_decision
0,57552,100,0,183992.0,6298.8003,12155.4,0,0,0,0,...,,,,,,,,,11,5
1,57633,100,0,,8273.0,0.0,0,0,0,0,...,c7a5ad39,0.0,0.0,0.0,0.0,1.0,1.0,ab3c25cf,1,2
2,57569,100,0,0.0,4682.6,0.0,1,0,0,0,...,a55475b1,,2328.0,,33346.402,,2.0,a55475b1,12,1
3,57634,100,0,39948.8,1165.8,0.0,0,0,0,0,...,,,,,,,,,1,3
4,57630,100,0,0.0,8905.0,0.0,0,0,0,0,...,a55475b1,0.0,0.0,0.0,0.0,0.0,6.0,a55475b1,3,2


In [119]:
def get_bins(data: (int, float), first_val: (int, float),
             second_val: (int, float)) -> str:
    """
    Assign a numeric value to one of three named bins.

    :param data: numeric value to classify
    :param first_val: inclusive upper bound of the "small" bin
    :param second_val: inclusive upper bound of the "medium" bin
    :return: "small", "medium" or "large"
    :raises AssertionError: if ``data`` is not an ``int`` or ``float`` instance
    """
    assert isinstance(data, (int, float)), "Проблема с типом данных в признаке"

    if data <= first_val:
        return "small"
    # The lower bound is re-checked on purpose: a NaN fails every comparison
    # and therefore falls through to "large", together with values above
    # second_val.
    if first_val < data <= second_val:
        return "medium"
    return "large"


def fill_nans(df_data: pd.DataFrame, value: Any,
              columns: list[str]) -> pd.DataFrame:
    """
    Fill missing values in the listed columns with a constant.

    The frame is modified in place and the same object is returned.

    :param df_data: dataset
    :param value: replacement for missing entries
    :param columns: names of the columns to process
    :return: the dataset that was passed in, with gaps filled
    """
    for name in columns:
        series = df_data[name]
        # Columns without any missing entry are left untouched.
        if not series.isnull().any():
            continue
        df_data[name] = series.fillna(value)
    return df_data


def merge_columns(df_data: pd.DataFrame, main_column: str, add_column: str,
                  new_column: str) -> pd.DataFrame:
    """
    Merge two duplicated features into a single one.

    For every row the value of ``main_column`` is used when it is present;
    otherwise the value of ``add_column`` is taken. The merged values are cast
    to the dtype of ``main_column``.

    The selection is done column-wise rather than with a row-wise ``apply``,
    so it also works on an empty frame. The input frame is not modified.
    If ``new_column`` has the same name as one of the source columns, that
    column is overwritten with the merged values instead of being dropped.

    :param df_data: dataset
    :param main_column: preferred source column
    :param add_column: fallback source column
    :param new_column: name of the merged column
    :return: new dataset with the merged column and without the source columns
    """
    main_values = df_data[main_column]

    # Select on object arrays so that mixing dtypes between the two source
    # columns cannot fail before the final cast to the main column's dtype.
    merged_values = np.where(
        main_values.isnull().to_numpy(),
        df_data[add_column].astype(object).to_numpy(),
        main_values.astype(object).to_numpy(),
    )
    merged = pd.Series(merged_values,
                       index=df_data.index).astype(main_values.dtype)

    # Never drop the column that is about to receive the merged values.
    obsolete = [
        column for column in (main_column, add_column) if column != new_column
    ]
    return df_data.drop(columns=obsolete).assign(**{new_column: merged})


def replace_rare_values(df_data: pd.DataFrame,
                        columns: dict,
                        value: Any = "Other",
                        debug: bool = False) -> pd.DataFrame:
    """
    Replace infrequent values of the given columns with a placeholder.

    A value counts as rare when its relative frequency among the non-missing
    entries of the column is strictly below the column's threshold. Missing
    entries are left as they are. The frame is modified in place and returned.

    :param df_data: dataset
    :param columns: mapping of column name -> frequency threshold below which
                    a value is replaced
    :param value: placeholder written instead of rare values
    :param debug: print the cardinality of each column before and after
    :return: dataset
    """
    for column, threshold in columns.items():
        nunique = df_data[column].nunique()

        # Compute the relative frequencies once and reuse them for the filter.
        frequencies = df_data[column].value_counts(dropna=True, normalize=True)
        rare_values = set(frequencies[frequencies < threshold].index)

        # Use the caller-supplied placeholder rather than a hard-coded one.
        df_data[column] = df_data[column].apply(
            lambda x: value if x in rare_values else x)

        if debug:
            print(
                f"{column} reduced nunique from {nunique} to {df_data[column].nunique()}"
            )
    return df_data


def replace_values_evaluate(df_data: pd.DataFrame,
                            columns: list[str],
                            unique_values_path: str,
                            value: Any = "Other") -> pd.DataFrame:
    """
    Replace values that are not listed for a column in the saved train file.

    For every listed column, each non-missing value that is absent from the
    values stored for that column in the JSON file is replaced by ``value``.
    Missing entries are kept. The frame is modified in place and returned.

    :param df_data: dataset
    :param columns: names of the columns to process (any iterable of names)
    :param unique_values_path: path to the JSON file with the train values
                               per column
    :param value: placeholder written instead of unseen values
    :return: dataset
    :raises KeyError: if a listed column has no entry in the JSON file
    """
    with open(unique_values_path) as json_file:
        unique_values = json.load(json_file)

    for column in columns:
        # A set gives constant-time membership checks for every row.
        train_values = set(unique_values[column])
        # Use the caller-supplied placeholder rather than a hard-coded one.
        df_data[column] = df_data[column].apply(
            lambda x: x if x in train_values or pd.isnull(x) else value)
    return df_data


def drop_columns_by_corr_matrix(df_data: pd.DataFrame,
                                except_columns: list[str],
                                threshold: float = 0.9,
                                debug=False) -> pd.DataFrame:
    """
    Drop features that are strongly correlated with an earlier feature.

    The absolute Spearman correlation is computed over the numeric columns
    that are not listed in ``except_columns``. Only the strict upper triangle
    of the matrix is inspected, so a column is dropped when its correlation
    with any column that precedes it exceeds ``threshold``.

    :param df_data: dataset
    :param except_columns: columns excluded from the correlation analysis
    :param threshold: correlation above which a feature is removed
    :param debug: print every dropped column with the correlations behind it
    :return: dataset without the dropped columns
    """
    candidates = df_data.drop(columns=except_columns)
    abs_corr = candidates.corr(method='spearman', numeric_only=True).abs()

    # Keep only the entries strictly above the diagonal; the rest become NaN.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    exceeds = upper > threshold
    to_drop = [
        name for name, flagged in exceeds.any(axis=0).items() if flagged
    ]

    if debug:
        for name in to_drop:
            column_corr = upper[name]
            print(name)
            print(column_corr[column_corr > threshold])
            print()
    return df_data.drop(columns=to_drop)


def transform_types(df_data: pd.DataFrame,
                    change_type_columns: dict) -> pd.DataFrame:
    """
    Cast the listed columns to the requested dtypes.

    :param df_data: dataset
    :param change_type_columns: mapping of column name -> target dtype
    :return: dataset with the converted columns; a failed conversion raises
    """
    converted = df_data.astype(dtype=change_type_columns, errors="raise")
    return converted


def save_unique_train_data(df_data: pd.DataFrame, drop_columns: list,
                           unique_values_path: str) -> None:
    """
    Save a per-feature summary of the train data as JSON.

    Columns of dtype object, bool or category are stored as the list of their
    unique values; every other column is stored as the values returned by
    ``describe()``. The key order follows the column order of the frame.

    :param df_data: dataset
    :param drop_columns: columns left out of the summary (absent names are
                         ignored)
    :param unique_values_path: path of the JSON file to write
    :return: None
    """
    features = df_data.drop(columns=drop_columns, errors="ignore")
    categorical = set(
        features.select_dtypes(["object", "bool", "category"]).columns)

    summary = {}
    for name in features.columns:
        series = features[name]
        if name in categorical:
            summary[name] = series.unique().tolist()
        else:
            summary[name] = series.describe().tolist()

    with open(unique_values_path, "w") as file:
        json.dump(summary, file)


def check_columns_evaluate(df_data: pd.DataFrame,
                           unique_values_path: str) -> pd.DataFrame:
    """
    Align the columns of an evaluation dataset with the saved train features.

    The keys of the JSON file define the expected features and their order.
    Columns that are not among those keys are left out of the result, and the
    remaining columns are returned in the saved order.

    :param df_data: evaluation dataset
    :param unique_values_path: path to the JSON file whose keys are the train
                               feature names
    :return: dataset restricted to, and ordered by, the train features
    :raises AssertionError: if any train feature is absent from ``df_data``
    """
    with open(unique_values_path) as json_file:
        unique_values = json.load(json_file)

    # Materialise the key view: a plain list is the regular way to select
    # several columns and keeps the order stored in the file.
    column_sequence = list(unique_values)

    missing_features = set(column_sequence).difference(df_data.columns)

    # Check that the set is empty; any() would look at the truthiness of the
    # feature names themselves and let a falsy name (e.g. "") slip through.
    assert not missing_features, f"Пропущены признаки: {missing_features}"
    return df_data[column_sequence]

In [128]:
def preprocess_data_pipeline(df_data: pd.DataFrame,
                             flg_evaluate: bool = True,
                             debug=False,
                             **kwargs):
    """
    Run the full preprocessing pipeline on a dataset.

    Steps, in order: drop configured columns, merge duplicated features, add
    bin features, handle rare / unseen categorical values, fill gaps, cast
    object columns to category. In train mode (``flg_evaluate=False``) highly
    correlated features are optionally dropped and the per-feature summary is
    written to ``unique_values_path``; in evaluate mode that file is read to
    replace unseen values and to align the columns with train.

    Keys read from ``kwargs``: ``drop_columns``, ``target_column``,
    ``groups_column``, ``map_merge_columns``, ``map_bins_columns``,
    ``map_replace_rare_values``, ``unique_values_path`` and, in train mode,
    ``corr_matrix_settings``.

    :param df_data: dataset
    :param flg_evaluate: True for evaluate mode, False for train mode
    :param debug: print extra information from the train-only steps
    :return: preprocessed dataset
    """
    frame = df_data.drop(kwargs['drop_columns'], axis=1, errors="ignore")

    protected_columns = [kwargs["target_column"], kwargs["groups_column"]]

    # Merge duplicated feature pairs: new name -> [main column, fallback column].
    merge_spec = kwargs["map_merge_columns"]
    if merge_spec:
        for merged_name, source_pair in merge_spec.items():
            frame = merge_columns(frame,
                                  main_column=source_pair[0],
                                  add_column=source_pair[1],
                                  new_column=merged_name)

    # Add "<feature>_bins" columns: feature -> [first threshold, second threshold].
    bins_spec = kwargs["map_bins_columns"]
    if bins_spec:
        for feature, thresholds in bins_spec.items():
            frame[f"{feature}_bins"] = frame[feature].apply(
                lambda x: get_bins(
                    x,
                    first_val=thresholds[0],
                    second_val=thresholds[1],
                ))

    rare_spec = kwargs["map_replace_rare_values"]
    unique_values_path = kwargs["unique_values_path"]

    if not flg_evaluate:
        # Train mode: collapse rare categorical values.
        if rare_spec:
            frame = replace_rare_values(frame, rare_spec, debug=debug)
    else:
        # Evaluate mode: map unseen values to the placeholder, then align the
        # columns with the saved train features.
        if rare_spec:
            frame = replace_values_evaluate(
                frame,
                rare_spec.keys(),
                unique_values_path=unique_values_path)

        frame = check_columns_evaluate(df_data=frame,
                                       unique_values_path=unique_values_path)

    # Fill gaps: "Unknown" for object columns, 0 for the remaining
    # non-bool, non-category columns.
    object_columns = frame.select_dtypes("object").columns.to_list()
    other_columns = frame.select_dtypes(
        exclude=["object", "bool", "category"]).columns.to_list()
    frame = fill_nans(frame, value="Unknown", columns=object_columns)
    frame = fill_nans(frame, value=0, columns=other_columns)

    # Cast every object column to category.
    to_category = dict.fromkeys(
        frame.select_dtypes(["object"]).columns, "category")
    frame = transform_types(df_data=frame, change_type_columns=to_category)

    if flg_evaluate:
        return frame

    # Train-only steps.
    corr_settings = kwargs["corr_matrix_settings"]
    if corr_settings and corr_settings["enable"]:
        frame = drop_columns_by_corr_matrix(
            frame,
            except_columns=protected_columns,
            threshold=corr_settings["threshold"],
            debug=debug)

    save_unique_train_data(
        df_data=frame,
        drop_columns=protected_columns,
        unique_values_path=unique_values_path,
    )

    return frame

In [134]:
# Display the scoring dataset as loaded from the parquet file.
df_test

Unnamed: 0,case_id,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,...,mode_collaterals_typeofguarante_669M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,mode_subjectroles_name_541M,month_decision,weekday_decision
0,57552,100,0,183992.0,6298.8003,12155.4,0,0,0,0,...,,,,,,,,,11,5
1,57633,100,0,,8273.0,0.0,0,0,0,0,...,c7a5ad39,0.0,0.0,0.0,0.0,1.0,1.0,ab3c25cf,1,2
2,57569,100,0,0.0,4682.6,0.0,1,0,0,0,...,a55475b1,,2328.0,,33346.402,,2.0,a55475b1,12,1
3,57634,100,0,39948.8,1165.8,0.0,0,0,0,0,...,,,,,,,,,1,3
4,57630,100,0,0.0,8905.0,0.0,0,0,0,0,...,a55475b1,0.0,0.0,0.0,0.0,0.0,6.0,a55475b1,3,2
5,57631,100,0,,2540.6,0.0,0,0,0,0,...,a55475b1,0.0,0.0,0.0,0.0,3.0,4.0,a55475b1,6,6
6,57549,100,0,129704.4,5742.6,3546.6,2,0,0,0,...,a55475b1,,,,,0.0,3.0,ab3c25cf,1,1
7,57551,100,0,71036.4,2844.6,0.0,1,0,0,0,...,a55475b1,,,,,0.0,3.0,ab3c25cf,11,5
8,57543,100,0,191767.36,3674.6,1218.2001,0,0,0,0,...,a55475b1,0.0,,0.0,,0.0,3.0,ab3c25cf,5,5
9,57632,100,0,63647.402,4732.0,0.0,0,0,0,0,...,a55475b1,0.0,,0.0,,3.0,,a55475b1,2,6


In [133]:
# Evaluate-mode preprocessing. This mode reads the unique-values file that a
# train-mode run (flg_evaluate=False) writes, so that file must already exist.
df_proc_test = preprocess_data_pipeline(
    df_data=df_test,
    flg_evaluate=True,
    **preproc_config,
)
df_proc_test

Unnamed: 0,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,...,mode_collaterals_typeofguarante_669M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,max_pmts_year_1139T,max_pmts_year_507T,mode_subjectroles_name_541M,month_decision,weekday_decision,age,age_bins
0,0,183992.0,6298.8003,12155.4,0,0,0,0,9,-9,...,Unknown,0,0,0,0,Unknown,11,5,65,large
1,0,0.0,8273.0,0.0,0,0,0,0,3,0,...,c7a5ad39,0,0,1,1,ab3c25cf,1,2,28,small
2,0,0.0,4682.6,0.0,1,0,0,0,6,2824,...,a55475b1,0,2328,0,2,a55475b1,12,1,72,large
3,0,39948.8,1165.8,0.0,0,0,0,0,0,-4,...,Unknown,0,0,0,0,Unknown,1,3,44,medium
4,0,0.0,8905.0,0.0,0,0,0,0,1,0,...,a55475b1,0,0,0,6,a55475b1,3,2,54,medium
5,0,0.0,2540.6,0.0,0,0,0,0,0,0,...,a55475b1,0,0,3,4,a55475b1,6,6,35,small
6,0,129704.4,5742.6,3546.6,2,0,0,0,10,0,...,a55475b1,0,0,0,3,ab3c25cf,1,1,62,large
7,0,71036.4,2844.6,0.0,1,0,0,0,2,-1,...,a55475b1,0,0,0,3,ab3c25cf,11,5,38,medium
8,0,191767.36,3674.6,1218.2001,0,0,0,0,9,1,...,a55475b1,0,0,0,3,ab3c25cf,5,5,40,medium
9,0,63647.402,4732.0,0.0,0,0,0,0,1,-7,...,a55475b1,0,0,3,0,a55475b1,2,6,63,large


Проверим на тренировочном датасете

In [131]:
# Load the training dataset from the path given in the preprocessing config.
train_path = preproc_config["train_path"]
df_train = pd.read_parquet(train_path)
print(df_train.shape)
df_train.head(5)

(100000, 322)


Unnamed: 0,case_id,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,...,mode_collaterals_typeofguarante_669M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,mode_subjectroles_name_541M,month_decision,weekday_decision
0,691811,19,0,,,12544.8,0.0,0,0,0,...,a55475b1,0.0,,0.0,,-1.0,,a55475b1,5,6
1,822442,39,0,,,1895.6,0.0,0,0,0,...,a55475b1,0.0,0.0,0.0,0.0,-1.0,0.0,a55475b1,10,7
2,876101,47,0,,,1768.0,0.0,0,0,0,...,a55475b1,0.0,0.0,0.0,22.896704,-1.0,0.0,a55475b1,11,4
3,945235,55,0,,,3010.6,0.0,0,0,0,...,,,,,,,,,1,6
4,2591718,26,0,0.0,66877.71,3548.4001,2486.6,0,0,0,...,a55475b1,0.0,0.0,121.853437,249.097379,-1.0,-1.0,a55475b1,7,2


In [132]:
# Train-mode preprocessing with debug output; this run also writes the
# unique-values file used by evaluate mode.
df_proc_train = preprocess_data_pipeline(
    df_data=df_train,
    flg_evaluate=False,
    debug=True,
    **preproc_config,
)
print(df_proc_train.shape)
df_proc_train.head(5)

lastapprcommoditycat_1041M reduced nunique from 37 to 18
lastcancelreason_561M reduced nunique from 59 to 18
lastrejectcommoditycat_161M reduced nunique from 36 to 15
mode_cancelreason_3545846M reduced nunique from 47 to 12
mode_classificationofcontr_400M reduced nunique from 44 to 13
mode_contractst_964M reduced nunique from 9 to 4
mode_financialinstitution_382M reduced nunique from 96 to 18
mode_financialinstitution_591M reduced nunique from 13 to 3
currdebt_22A
annuitynextmonth_57A    0.919336
Name: currdebt_22A, dtype: float64

disbursedcredamount_1113A
credamount_770A    0.908278
Name: disbursedcredamount_1113A, dtype: float64

interestrate_311L
eir_270L    1.0
Name: interestrate_311L, dtype: float64

lastapprdate_640D
lastactivateddate_801D    0.925234
Name: lastapprdate_640D, dtype: float64

maxdpdlast9m_1059P
maxdpdlast12m_727P    0.924516
Name: maxdpdlast9m_1059P, dtype: float64

maxdpdtolerance_374P
daysoverduetolerancedd_3976961L    0.929231
Name: maxdpdtolerance_374P, dtype

(100000, 245)


Unnamed: 0,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,...,mode_collaterals_typeofguarante_669M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,max_pmts_year_1139T,max_pmts_year_507T,mode_subjectroles_name_541M,month_decision,weekday_decision,age,age_bins
0,19,0,0,0.0,12544.8,0.0,0,0,0,0,...,a55475b1,0,0,-1,0,a55475b1,5,6,22,small
1,39,0,0,0.0,1895.6,0.0,0,0,0,0,...,a55475b1,0,0,-1,0,a55475b1,10,7,32,small
2,47,0,0,0.0,1768.0,0.0,0,0,0,0,...,a55475b1,0,0,-1,0,a55475b1,11,4,34,small
3,55,0,0,0.0,3010.6,0.0,0,0,0,0,...,Unknown,0,0,0,0,Unknown,1,6,21,small
4,26,0,0,66877.71,3548.4001,2486.6,0,0,0,0,...,a55475b1,0,0,-1,-1,a55475b1,7,2,58,large


# Prediction

In [135]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model artifacts from a trusted location.
model = joblib.load(training_config['model_path'])

# Predict on the feature columns only. Dropping a previously added 'predict'
# column keeps this cell re-runnable: without it, a second run would feed the
# earlier predictions back into the model as an extra feature.
features_test = df_proc_test.drop(columns=['predict'], errors='ignore')
df_proc_test['predict'] = model.predict(features_test)

In [136]:
# Display the preprocessed test frame, now including the 'predict' column
# assigned in the previous cell.
df_proc_test

Unnamed: 0,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,...,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,max_pmts_year_1139T,max_pmts_year_507T,mode_subjectroles_name_541M,month_decision,weekday_decision,age,age_bins,predict
0,0,183992.0,6298.8003,12155.4,0,0,0,0,9,-9,...,0,0,0,0,Unknown,11,5,65,large,0
1,0,0.0,8273.0,0.0,0,0,0,0,3,0,...,0,0,1,1,ab3c25cf,1,2,28,small,0
2,0,0.0,4682.6,0.0,1,0,0,0,6,2824,...,0,2328,0,2,a55475b1,12,1,72,large,1
3,0,39948.8,1165.8,0.0,0,0,0,0,0,-4,...,0,0,0,0,Unknown,1,3,44,medium,0
4,0,0.0,8905.0,0.0,0,0,0,0,1,0,...,0,0,0,6,a55475b1,3,2,54,medium,0
5,0,0.0,2540.6,0.0,0,0,0,0,0,0,...,0,0,3,4,a55475b1,6,6,35,small,0
6,0,129704.4,5742.6,3546.6,2,0,0,0,10,0,...,0,0,0,3,ab3c25cf,1,1,62,large,0
7,0,71036.4,2844.6,0.0,1,0,0,0,2,-1,...,0,0,0,3,ab3c25cf,11,5,38,medium,0
8,0,191767.36,3674.6,1218.2001,0,0,0,0,9,1,...,0,0,0,3,ab3c25cf,5,5,40,medium,0
9,0,63647.402,4732.0,0.0,0,0,0,0,1,-7,...,0,0,3,0,a55475b1,2,6,63,large,0
