In [57]:
import time
import numpy as np
import pandas as pd
import catboost as cb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from scipy.stats import rankdata

pd.set_option("display.max_columns", 30)

## Useful Functions

In [2]:
def get_input(data_path: str) -> pd.DataFrame:
    """
    Считывание данных и вывод основной информации о наборе данных.

    Parameters
    ----------
    data_path: str
        Название файла.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Загруженный набор данных в pandas.DataFrame

    """
    base_path = "geekbrains-competitive-data-analysis"
    data = pd.read_csv(f"{base_path}/{data_path}")
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")

    return data


def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


def catboost_hold_out_validation(params, X, y, split_params = [0.7, 0.2, 0.1], categorical = None):
    """
    Hold-Out валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    split_params: List[float], optional, default = [0.7, 0.2, 0.1]
        Параметры (доли) разбиения выборки.
        Опциональный параметр, по умолчанию, равен [0.7, 0.2, 0.1].
    
    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimator: catboost.core.CatBoostClassifier
        Обученный классификатор catboost.

    test_prediction: np.array, optional
        Вектор прогнозов для тестовой выборки.
        Опциональный объект, возвращается только, если split_params
        содержит 3 значения.

    """
    numeric = list(set(x_train.columns) - set(categorical))
    x_train, x_valid = train_test_split(
        X, train_size=split_params[0], random_state=27
    )
    y_train, y_valid = train_test_split(
        y, train_size=split_params[0], random_state=27
    )

    if len(split_params) == 3:
        test_size = int(split_params[2] * X.shape[0])

        x_valid, x_test = train_test_split(
            x_valid, test_size=test_size, random_state=72
        )
        y_valid, y_test = train_test_split(
            y_valid, test_size=test_size, random_state=72
        )

    model = cb.CatBoostClassifier(**params)
    model.fit(
        x_train, y_train, categorical,
        eval_set=[(x_train, y_train), (x_valid, y_valid)]
    )

    print("="*80)
    valid_score = roc_auc_score(y_valid, model.predict_proba(x_valid)[:, 1])
    print(f"Valid Score = {round(valid_score, 4)}")

    if len(split_params) == 3:

        test_prediction = model.predict_proba(x_test)[:, 1]
        test_score = roc_auc_score(y_test, test_prediction)
        print(f"Test Score = {round(test_score, 4)}")

        return estimator, test_prediction

    else:
        return estimator

In [3]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Создание признаков на основе профиля клиентов.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков с исходным профилем клиента.

    copy: bool, optional, default = True
        Флаг использования копии датафрейма X.
        Опциональный параметр, по умолчанию, равен True.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Расширенная матрица признаков с профилем клиентов.

    """
    if copy:
        X = X.copy()

    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    X["external_scoring_prod"] = X["external_scoring_rating_1"] * X["external_scoring_rating_2"] * X["external_scoring_rating_3"]
    X["external_scoring_weighted"] = X.external_scoring_rating_1 * 2 + X.external_scoring_rating_2 * 1 + X.external_scoring_rating_3 * 3

    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = eval("np.{}".format(function_name))(
            X[["external_scoring_rating_1", "external_scoring_rating_2", "external_scoring_rating_3"]], axis=1
        )

    # Отношение между основными фин. показателями
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']
    X["ratio_annuity_to_salary"] = X['amount_annuity'] / X['total_salary']
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']
    #X["total_salary_net"] = X["total_salary"] - X["amount_annuity"]

    # Отношение фин. показателей к возрасту и временным фичам
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Отношение врменных признаков
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Произведение фин. показателей кредита на вероятность дефолта
    # Такая штука называется математическим ожиданием дефолта или ожидаемыми потерями
    X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    return X

In [4]:
def xgboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
#    X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]
        
        #-----
        dtrain = xgb.DMatrix(x_train, y_train)
        dvalid = xgb.DMatrix(x_valid, y_valid)

        model = xgb.train(
          params=params,
          dtrain=dtrain,
          evals=[(dtrain, "dtrain"), (dvalid, "dvalid")],
          early_stopping_rounds=25,
          num_boost_round=1000,
          verbose_eval=10,
          maximize=True,
        )
        #____
        
        oof_preds[valid_idx] = model.predict(dvalid)
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


## Base Tables

In [5]:
train = get_input("train.csv")
test = get_input("test.csv")

data = pd.concat([train, test], axis=0)
data = data.reset_index(drop=True)
data.head(n=2)

train.csv: shape = 110093 rows, 3 cols
test.csv: shape = 165141 rows, 2 cols


Unnamed: 0,application_number,target,name_contract_type
0,123687442,0.0,Cash
1,123597908,1.0,Cash


## client_profile

In [6]:
client_profile = get_input("client_profile.csv")
client_profile = create_client_profile_features(client_profile)
client_profile.head(n=2)

client_profile.csv: shape = 250000 rows, 24 cols


  overwrite_input=overwrite_input)


Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,...,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549.0,,1,0,2.0,...,31.542056,18.399533,101.678502,174.306004,8.7153,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,,,0,0,2.0,...,23.155971,11.644456,,,,,5466.42,,,,237475.743779,431008.094056,,12590.802122,22851.755462


In [7]:
data = data.merge(
    client_profile, how="left", on="application_number"
)

In [8]:
pd.get_dummies(data["name_contract_type"]).head()

Unnamed: 0,Cash,Credit Card
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [9]:
pd.get_dummies(data["gender"]).head()

Unnamed: 0,F,M,XNA
0,0,1,0
1,0,0,0
2,1,0,0
3,0,1,0
4,0,0,0


In [10]:
pd.get_dummies(data["education_level"]).head()

Unnamed: 0,Academic degree,Higher education,Incomplete higher,Lower secondary,Secondary / secondary special
0,0,0,0,0,1
1,0,0,0,0,0
2,0,1,0,0,0
3,0,0,0,0,1
4,0,0,0,0,0


In [11]:
pd.get_dummies(data["family_status"]).head()

Unnamed: 0,Civil marriage,Married,Separated,Single / not married,Unknown,Widow
0,0,1,0,0,0,0
1,0,0,0,0,0,0
2,0,1,0,0,0,0
3,0,1,0,0,0,0
4,0,0,0,0,0,0


In [12]:
freq_encoder = data["education_level"].value_counts(normalize=True)
data["education_level_freq_enc"] = data["education_level"].map(freq_encoder)
data[["education_level", "education_level_freq_enc"]].head(2)

Unnamed: 0,education_level,education_level_freq_enc
0,Secondary / secondary special,0.710221
1,,


## baseline

In [13]:
mask = data["target"].isnull()
features_to_drop = ["application_number", "target"]

train, test = data.loc[~mask], data.loc[mask]

target, test_id = train["target"], test["application_number"]
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)

categorial = train.dtypes[train.dtypes == "object"].index
numerical = list(set(train.columns) - set(categorial))

train = train.replace(np.inf, np.nan)
train = train.replace(-np.inf, np.nan)

XGBoost

In [14]:
import xgboost as xgb
import BayesianOptimization
xgb_params = {
    "booster": "gbtree",
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "learning_rate": 0.01,
    "nthread": 6,
    "seed": 27
}


cv = KFold(n_splits=5, random_state=1234123, shuffle=True)
new_train = train[numerical]
estimators, oof_preds_xgb = xgboost_cross_validation(
    params=xgb_params, X=new_train, y=target, cv=cv, categorical=categorial
)

  import pandas.util.testing as tm


Wed Sep 30 19:58:42 2020, Cross-Validation, 110093 rows, 49 cols
[0]	dtrain-auc:0.70634	dvalid-auc:0.69114
Multiple eval metrics have been passed: 'dvalid-auc' will be used for early stopping.

Will train until dvalid-auc hasn't improved in 25 rounds.
[10]	dtrain-auc:0.71490	dvalid-auc:0.69528
[20]	dtrain-auc:0.72093	dvalid-auc:0.69807
[30]	dtrain-auc:0.72577	dvalid-auc:0.69773
[40]	dtrain-auc:0.72773	dvalid-auc:0.69887
[50]	dtrain-auc:0.72966	dvalid-auc:0.69930
[60]	dtrain-auc:0.73191	dvalid-auc:0.69958
[70]	dtrain-auc:0.73364	dvalid-auc:0.69956
[80]	dtrain-auc:0.73452	dvalid-auc:0.69941
Stopping. Best iteration:
[62]	dtrain-auc:0.73212	dvalid-auc:0.69993

Fold 1, Valid score = 0.69924
[0]	dtrain-auc:0.70376	dvalid-auc:0.69479
Multiple eval metrics have been passed: 'dvalid-auc' will be used for early stopping.

Will train until dvalid-auc hasn't improved in 25 rounds.
[10]	dtrain-auc:0.71684	dvalid-auc:0.70196
[20]	dtrain-auc:0.72148	dvalid-auc:0.70392
[30]	dtrain-auc:0.72440	dvalid-

In [15]:
oof_score = roc_auc_score(
    target, oof_preds_xgb
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.62182


In [16]:
y_pred_xgb = np.zeros(test.shape[0])
test[numerical] = test[numerical].astype(float)
#test[categorial] = test[categorial].astype(str)

new_test = test[numerical]
dtest = xgb.DMatrix(new_test)

for estimator in estimators:
    y_pred_xgb += estimator.predict(dtest)

In [17]:
y_pred_xgb = y_pred_xgb / cv.n_splits

LightGBM 

In [18]:
import lightgbm as lgb

In [19]:
def lgboost_cross_validation(params, X, y, cv, categorical = None):
#from sklearn.model_selection import KFold, StratifiedKFold
#N_FOLDS = 10
#folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random
    estimators = []
    oof = np.zeros(len(X))
    sub = np.zeros(len(test))
    scores = [0 for _ in range(cv.n_splits)]
    for fold_, (train_idx, val_idx) in enumerate(cv.split(X.values, y)):
        X_train, y_train = X.loc[train_idx], y.loc[train_idx]
        X_val, y_val = X.loc[val_idx], y.loc[val_idx]
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val)
        watchlist = [train_data, val_data]
        clf = lgb.train(params, train_set = train_data, valid_sets=watchlist)
        oof[val_idx] = clf.predict(X_val)
        sub += clf.predict(new_test)/cv.n_splits
        scores[fold_] = roc_auc_score(y[val_idx], oof[val_idx])
        print("Fold {}: {}".format(fold_+1, round(scores[fold_],5)))
        estimators.append(clf)
    
    print("CV score(auc): {:<8.5f}, (std: {:<8.5f})".format(roc_auc_score(y, oof), np.std(scores)))

    print("="*65)
    return estimators,oof


In [20]:
from sklearn.model_selection import KFold, StratifiedKFold
lgb_params = {'learning_rate': 0.3,
              'application': 'binary',
              'num_leaves': 31,
              'verbosity': -1,
              'metric': 'auc',
              'data_random_seed': 2,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.6,
              'nthread': 4,
              'lambda_l1': 1,
              'lambda_l2': 1}

N_FOLDS = 10
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
new_train = train[numerical]
new_test = test[numerical]
estimators, oof_preds_lgb = lgboost_cross_validation(
    params=lgb_params, X=new_train, y=target, cv=cv, categorical=categorial
)

[1]	training's auc: 0.705233	valid_1's auc: 0.696419
[2]	training's auc: 0.716252	valid_1's auc: 0.702777
[3]	training's auc: 0.721656	valid_1's auc: 0.707404
[4]	training's auc: 0.726994	valid_1's auc: 0.711742
[5]	training's auc: 0.731583	valid_1's auc: 0.714943
[6]	training's auc: 0.734744	valid_1's auc: 0.71639
[7]	training's auc: 0.737442	valid_1's auc: 0.717476
[8]	training's auc: 0.743179	valid_1's auc: 0.722673
[9]	training's auc: 0.745573	valid_1's auc: 0.723342
[10]	training's auc: 0.747584	valid_1's auc: 0.721867
[11]	training's auc: 0.750348	valid_1's auc: 0.723855
[12]	training's auc: 0.753263	valid_1's auc: 0.725308
[13]	training's auc: 0.7553	valid_1's auc: 0.725078
[14]	training's auc: 0.757943	valid_1's auc: 0.725768
[15]	training's auc: 0.760415	valid_1's auc: 0.728105
[16]	training's auc: 0.762784	valid_1's auc: 0.728191
[17]	training's auc: 0.765873	valid_1's auc: 0.728716
[18]	training's auc: 0.768086	valid_1's auc: 0.728383
[19]	training's auc: 0.771715	valid_1's 

[56]	training's auc: 0.830912	valid_1's auc: 0.716638
[57]	training's auc: 0.832164	valid_1's auc: 0.716334
[58]	training's auc: 0.833192	valid_1's auc: 0.715111
[59]	training's auc: 0.834671	valid_1's auc: 0.714506
[60]	training's auc: 0.835229	valid_1's auc: 0.714775
[61]	training's auc: 0.836416	valid_1's auc: 0.713065
[62]	training's auc: 0.837713	valid_1's auc: 0.713529
[63]	training's auc: 0.839108	valid_1's auc: 0.713185
[64]	training's auc: 0.840485	valid_1's auc: 0.712505
[65]	training's auc: 0.841392	valid_1's auc: 0.712047
[66]	training's auc: 0.843282	valid_1's auc: 0.71138
[67]	training's auc: 0.845006	valid_1's auc: 0.711013
[68]	training's auc: 0.846287	valid_1's auc: 0.711876
[69]	training's auc: 0.846754	valid_1's auc: 0.712335
[70]	training's auc: 0.847684	valid_1's auc: 0.712875
[71]	training's auc: 0.848709	valid_1's auc: 0.713645
[72]	training's auc: 0.849553	valid_1's auc: 0.712986
[73]	training's auc: 0.850458	valid_1's auc: 0.712818
[74]	training's auc: 0.851164

[11]	training's auc: 0.750421	valid_1's auc: 0.701829
[12]	training's auc: 0.755774	valid_1's auc: 0.709015
[13]	training's auc: 0.758031	valid_1's auc: 0.708815
[14]	training's auc: 0.760234	valid_1's auc: 0.708412
[15]	training's auc: 0.762984	valid_1's auc: 0.711639
[16]	training's auc: 0.765241	valid_1's auc: 0.713188
[17]	training's auc: 0.76731	valid_1's auc: 0.712297
[18]	training's auc: 0.769642	valid_1's auc: 0.712975
[19]	training's auc: 0.77139	valid_1's auc: 0.71261
[20]	training's auc: 0.773277	valid_1's auc: 0.712083
[21]	training's auc: 0.775786	valid_1's auc: 0.71313
[22]	training's auc: 0.776962	valid_1's auc: 0.712938
[23]	training's auc: 0.778792	valid_1's auc: 0.713065
[24]	training's auc: 0.78133	valid_1's auc: 0.712791
[25]	training's auc: 0.78354	valid_1's auc: 0.712985
[26]	training's auc: 0.785755	valid_1's auc: 0.712267
[27]	training's auc: 0.787723	valid_1's auc: 0.712284
[28]	training's auc: 0.789685	valid_1's auc: 0.712778
[29]	training's auc: 0.791599	vali

[71]	training's auc: 0.844704	valid_1's auc: 0.737588
[72]	training's auc: 0.845473	valid_1's auc: 0.73727
[73]	training's auc: 0.846804	valid_1's auc: 0.737637
[74]	training's auc: 0.847968	valid_1's auc: 0.737276
[75]	training's auc: 0.848809	valid_1's auc: 0.736802
[76]	training's auc: 0.850151	valid_1's auc: 0.736708
[77]	training's auc: 0.850801	valid_1's auc: 0.737085
[78]	training's auc: 0.851808	valid_1's auc: 0.736493
[79]	training's auc: 0.853159	valid_1's auc: 0.737832
[80]	training's auc: 0.854203	valid_1's auc: 0.737499
[81]	training's auc: 0.855266	valid_1's auc: 0.738077
[82]	training's auc: 0.856328	valid_1's auc: 0.737948
[83]	training's auc: 0.856637	valid_1's auc: 0.738293
[84]	training's auc: 0.857817	valid_1's auc: 0.738421
[85]	training's auc: 0.858704	valid_1's auc: 0.738808
[86]	training's auc: 0.859485	valid_1's auc: 0.738682
[87]	training's auc: 0.860282	valid_1's auc: 0.739017
[88]	training's auc: 0.861258	valid_1's auc: 0.738801
[89]	training's auc: 0.862045

[33]	training's auc: 0.798511	valid_1's auc: 0.723126
[34]	training's auc: 0.800909	valid_1's auc: 0.722544
[35]	training's auc: 0.802481	valid_1's auc: 0.722214
[36]	training's auc: 0.804269	valid_1's auc: 0.721205
[37]	training's auc: 0.805925	valid_1's auc: 0.72018
[38]	training's auc: 0.807213	valid_1's auc: 0.720333
[39]	training's auc: 0.808298	valid_1's auc: 0.720471
[40]	training's auc: 0.810307	valid_1's auc: 0.720351
[41]	training's auc: 0.811718	valid_1's auc: 0.719739
[42]	training's auc: 0.81315	valid_1's auc: 0.719438
[43]	training's auc: 0.81357	valid_1's auc: 0.719014
[44]	training's auc: 0.814976	valid_1's auc: 0.718856
[45]	training's auc: 0.81588	valid_1's auc: 0.718116
[46]	training's auc: 0.817535	valid_1's auc: 0.718309
[47]	training's auc: 0.818775	valid_1's auc: 0.718233
[48]	training's auc: 0.819817	valid_1's auc: 0.717664
[49]	training's auc: 0.820763	valid_1's auc: 0.717883
[50]	training's auc: 0.822726	valid_1's auc: 0.718191
[51]	training's auc: 0.824078	va

[96]	training's auc: 0.869412	valid_1's auc: 0.700885
[97]	training's auc: 0.870054	valid_1's auc: 0.70098
[98]	training's auc: 0.870844	valid_1's auc: 0.700746
[99]	training's auc: 0.87194	valid_1's auc: 0.70023
[100]	training's auc: 0.872934	valid_1's auc: 0.700281
Fold 8: 0.70028
[1]	training's auc: 0.702789	valid_1's auc: 0.69033
[2]	training's auc: 0.713226	valid_1's auc: 0.696706
[3]	training's auc: 0.719802	valid_1's auc: 0.70013
[4]	training's auc: 0.726405	valid_1's auc: 0.704773
[5]	training's auc: 0.729625	valid_1's auc: 0.70429
[6]	training's auc: 0.733092	valid_1's auc: 0.704187
[7]	training's auc: 0.735907	valid_1's auc: 0.704886
[8]	training's auc: 0.739047	valid_1's auc: 0.706513
[9]	training's auc: 0.742479	valid_1's auc: 0.707367
[10]	training's auc: 0.745029	valid_1's auc: 0.70599
[11]	training's auc: 0.747687	valid_1's auc: 0.70543
[12]	training's auc: 0.752886	valid_1's auc: 0.71059
[13]	training's auc: 0.756741	valid_1's auc: 0.710571
[14]	training's auc: 0.758838

[54]	training's auc: 0.8228	valid_1's auc: 0.709393
[55]	training's auc: 0.824926	valid_1's auc: 0.710267
[56]	training's auc: 0.826154	valid_1's auc: 0.710076
[57]	training's auc: 0.827378	valid_1's auc: 0.709309
[58]	training's auc: 0.828714	valid_1's auc: 0.708891
[59]	training's auc: 0.829862	valid_1's auc: 0.709016
[60]	training's auc: 0.830579	valid_1's auc: 0.70919
[61]	training's auc: 0.83164	valid_1's auc: 0.708211
[62]	training's auc: 0.83259	valid_1's auc: 0.707571
[63]	training's auc: 0.833955	valid_1's auc: 0.70727
[64]	training's auc: 0.835292	valid_1's auc: 0.707218
[65]	training's auc: 0.836604	valid_1's auc: 0.706771
[66]	training's auc: 0.837036	valid_1's auc: 0.706842
[67]	training's auc: 0.838543	valid_1's auc: 0.706028
[68]	training's auc: 0.839711	valid_1's auc: 0.705231
[69]	training's auc: 0.84087	valid_1's auc: 0.704969
[70]	training's auc: 0.841932	valid_1's auc: 0.705094
[71]	training's auc: 0.842638	valid_1's auc: 0.704831
[72]	training's auc: 0.843538	valid

In [21]:

oof_score = roc_auc_score(
    target, oof_preds_lgb
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.70652


Ансамбль нескольких моделей

оценить корреляцию прогнозов на обучающей выборке

In [22]:
np.corrcoef(
    x = oof_preds_lgb,
    y = oof_preds_xgb
)

array([[1.        , 0.36165103],
       [0.36165103, 1.        ]])

In [23]:
oof_train = []
oof_train.append(oof_preds_xgb)
oof_train.append(oof_preds_lgb)
#oof_train = np.array(oof_train)
oof_train

[array([0.02180408, 0.11500328, 0.24272412, ..., 0.27453855, 0.23320237,
        0.30034339]),
 array([0.01071466, 0.08306752, 0.06918635, ..., 0.08362073, 0.01843834,
        0.0997197 ])]

Применить модели на тестовую выборку и оценить корреляцию.


In [24]:
y_pred_lgb = np.zeros(test.shape[0])
test[numerical] = test[numerical].astype(float)
#test[categorial] = test[categorial].astype(str)
new_test = test[numerical]
for estimator in estimators:
    y_pred_lgb += estimator.predict(new_test)

In [25]:
y_pred_lgb = y_pred_lgb / cv.n_splits

In [27]:
y_pred_lgb

array([0.04265717, 0.28020678, 0.13450095, ..., 0.08217762, 0.01037823,
       0.02629846])

In [29]:
np.corrcoef(
    x = y_pred_lgb,
    y = y_pred_xgb
)

array([[1.        , 0.90685133],
       [0.90685133, 1.        ]])

In [30]:
oof_test = []
oof_test.append(y_pred_xgb)
oof_test.append(y_pred_lgb)
oof_test = np.array(oof_test)
oof_test

array([[0.19475852, 0.28085497, 0.28609303, ..., 0.20393237, 0.16206918,
        0.1767074 ],
       [0.04265717, 0.28020678, 0.13450095, ..., 0.08217762, 0.01037823,
        0.02629846]])

In [31]:
cor = pd.DataFrame(oof_train)
cor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,110078,110079,110080,110081,110082,110083,110084,110085,110086,110087,110088,110089,110090,110091,110092
0,0.021804,0.115003,0.242724,0.327103,0.25387,0.309703,0.274493,0.168009,0.123239,0.25796,0.274539,0.268973,0.025762,0.125615,0.320062,...,0.09501,0.057954,0.288158,0.098362,0.258538,0.084297,0.25387,0.115003,0.274539,0.084545,0.291705,0.260985,0.274539,0.233202,0.300343
1,0.010715,0.083068,0.069186,0.113912,0.083621,0.560826,0.025938,0.03251,0.10853,0.038389,0.082242,0.06188,0.007936,0.129061,0.237281,...,0.056113,0.222659,0.025226,0.107808,0.012001,0.052649,0.081648,0.08299,0.081648,0.081648,0.083621,0.038181,0.083621,0.018438,0.09972


In [32]:
np.mean(oof_train[0] - oof_train[1])

0.12429579497430172

In [33]:
np.max(oof_train[0] - oof_train[1])

0.5034274902407732

In [34]:
cor = pd.DataFrame(oof_test)
cor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,165126,165127,165128,165129,165130,165131,165132,165133,165134,165135,165136,165137,165138,165139,165140
0,0.194759,0.280855,0.286093,0.203932,0.167907,0.172871,0.203932,0.256291,0.157321,0.166737,0.203932,0.177878,0.17346,0.206655,0.169337,...,0.165647,0.213648,0.161938,0.160148,0.155443,0.203932,0.203932,0.203932,0.15734,0.203932,0.203932,0.169106,0.203932,0.162069,0.176707
1,0.042657,0.280207,0.134501,0.082178,0.018202,0.026691,0.082178,0.101307,0.002751,0.020221,0.082178,0.042439,0.029281,0.083732,0.020947,...,0.022487,0.062768,0.01345,0.009253,0.00399,0.082178,0.082178,0.082178,0.006549,0.082178,0.082178,0.040369,0.082178,0.010378,0.026298


In [35]:
np.mean(oof_test[0] - oof_test[1])

0.12411951150425493

Усреднить прогнозы с помощью арифмитического среднего, геометрического среднего

In [36]:
oof_preds_arifm = (oof_preds_xgb + oof_preds_lgb) / 2

In [37]:
oof_preds_geom = (oof_preds_xgb * oof_preds_lgb) ** 0.5

In [38]:
oof_score = roc_auc_score(
    target, oof_preds_arifm
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.66966


In [39]:
oof_score = roc_auc_score(
    target, oof_preds_geom
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.70013


catBoost

In [40]:
cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
}

cv = KFold(n_splits=5, random_state=1234123, shuffle=True)

estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=train, y=target, cv=cv, categorical=categorial
)

Wed Sep 30 20:09:28 2020, Cross-Validation, 110093 rows, 53 cols
0:	test: 0.6734332	test1: 0.6641824	best: 0.6641824 (0)	total: 378ms	remaining: 12m 35s
10:	test: 0.7007889	test1: 0.6938119	best: 0.6938119 (10)	total: 1.63s	remaining: 4m 55s
20:	test: 0.7029293	test1: 0.6963111	best: 0.6970105 (18)	total: 2.88s	remaining: 4m 31s
30:	test: 0.7058306	test1: 0.6981630	best: 0.6985139 (29)	total: 4.08s	remaining: 4m 19s
40:	test: 0.7068103	test1: 0.6994951	best: 0.6994951 (40)	total: 5.32s	remaining: 4m 14s
50:	test: 0.7073623	test1: 0.7009414	best: 0.7010275 (46)	total: 6.5s	remaining: 4m 8s
60:	test: 0.7083320	test1: 0.7028293	best: 0.7028614 (59)	total: 7.72s	remaining: 4m 5s
70:	test: 0.7087590	test1: 0.7025761	best: 0.7029574 (64)	total: 8.91s	remaining: 4m 2s
80:	test: 0.7094492	test1: 0.7035250	best: 0.7035250 (80)	total: 10.1s	remaining: 4m
90:	test: 0.7100618	test1: 0.7038065	best: 0.7038065 (90)	total: 11.4s	remaining: 3m 58s
100:	test: 0.7107377	test1: 0.7049387	best: 0.7049387 

120:	test: 0.7102942	test1: 0.7094887	best: 0.7094887 (120)	total: 14.4s	remaining: 3m 44s
130:	test: 0.7109416	test1: 0.7096544	best: 0.7097525 (125)	total: 15.6s	remaining: 3m 43s
140:	test: 0.7114409	test1: 0.7100903	best: 0.7101572 (138)	total: 16.8s	remaining: 3m 41s
150:	test: 0.7124348	test1: 0.7114451	best: 0.7114451 (150)	total: 18s	remaining: 3m 40s
160:	test: 0.7130634	test1: 0.7116085	best: 0.7116093 (156)	total: 19.2s	remaining: 3m 38s
170:	test: 0.7134697	test1: 0.7120329	best: 0.7122668 (166)	total: 20.4s	remaining: 3m 38s
180:	test: 0.7138800	test1: 0.7124840	best: 0.7125025 (179)	total: 21.6s	remaining: 3m 37s
190:	test: 0.7145701	test1: 0.7128818	best: 0.7128818 (190)	total: 22.8s	remaining: 3m 35s
200:	test: 0.7150727	test1: 0.7131927	best: 0.7131927 (200)	total: 23.9s	remaining: 3m 33s
210:	test: 0.7157641	test1: 0.7139536	best: 0.7139536 (210)	total: 25s	remaining: 3m 32s
220:	test: 0.7165304	test1: 0.7144560	best: 0.7144675 (218)	total: 26.2s	remaining: 3m 30s
230

200:	test: 0.7148753	test1: 0.7147597	best: 0.7150023 (198)	total: 24.1s	remaining: 3m 36s
210:	test: 0.7155343	test1: 0.7160551	best: 0.7160551 (210)	total: 25.4s	remaining: 3m 35s
220:	test: 0.7162537	test1: 0.7163370	best: 0.7163370 (220)	total: 26.7s	remaining: 3m 34s
230:	test: 0.7170317	test1: 0.7167893	best: 0.7168715 (229)	total: 28s	remaining: 3m 34s
240:	test: 0.7176536	test1: 0.7172015	best: 0.7172213 (239)	total: 29.3s	remaining: 3m 33s
250:	test: 0.7183309	test1: 0.7181379	best: 0.7181379 (250)	total: 30.5s	remaining: 3m 32s
260:	test: 0.7188662	test1: 0.7184569	best: 0.7184569 (260)	total: 31.8s	remaining: 3m 31s
270:	test: 0.7195983	test1: 0.7190029	best: 0.7190029 (270)	total: 33s	remaining: 3m 30s
280:	test: 0.7200566	test1: 0.7191380	best: 0.7191650 (279)	total: 34s	remaining: 3m 27s
290:	test: 0.7205093	test1: 0.7196423	best: 0.7198270 (289)	total: 35.2s	remaining: 3m 26s
300:	test: 0.7211720	test1: 0.7199142	best: 0.7200460 (299)	total: 36.4s	remaining: 3m 25s
310:	

250:	test: 0.7205258	test1: 0.7090452	best: 0.7093088 (238)	total: 28.8s	remaining: 3m 20s
260:	test: 0.7211952	test1: 0.7094150	best: 0.7094454 (258)	total: 29.9s	remaining: 3m 19s
270:	test: 0.7217562	test1: 0.7098355	best: 0.7098355 (270)	total: 31s	remaining: 3m 17s
280:	test: 0.7224329	test1: 0.7102272	best: 0.7102272 (280)	total: 32.1s	remaining: 3m 16s
290:	test: 0.7230225	test1: 0.7106736	best: 0.7107217 (289)	total: 33.2s	remaining: 3m 14s
300:	test: 0.7236039	test1: 0.7109276	best: 0.7110676 (292)	total: 34.2s	remaining: 3m 13s
310:	test: 0.7241218	test1: 0.7112946	best: 0.7114504 (309)	total: 35.4s	remaining: 3m 12s
320:	test: 0.7245276	test1: 0.7117339	best: 0.7117339 (320)	total: 36.5s	remaining: 3m 11s
330:	test: 0.7249405	test1: 0.7118951	best: 0.7118951 (330)	total: 37.7s	remaining: 3m 10s
340:	test: 0.7255719	test1: 0.7123632	best: 0.7123632 (340)	total: 38.9s	remaining: 3m 9s
350:	test: 0.7260102	test1: 0.7125803	best: 0.7127009 (347)	total: 40.1s	remaining: 3m 8s
360

1150:	test: 0.7476175	test1: 0.7213730	best: 0.7213793 (1149)	total: 2m 5s	remaining: 1m 32s
1160:	test: 0.7478340	test1: 0.7212903	best: 0.7213793 (1149)	total: 2m 6s	remaining: 1m 31s
1170:	test: 0.7479798	test1: 0.7213998	best: 0.7214150 (1168)	total: 2m 7s	remaining: 1m 30s
1180:	test: 0.7480866	test1: 0.7214179	best: 0.7214179 (1180)	total: 2m 8s	remaining: 1m 29s
1190:	test: 0.7482241	test1: 0.7213467	best: 0.7215495 (1184)	total: 2m 9s	remaining: 1m 28s
1200:	test: 0.7485157	test1: 0.7213675	best: 0.7215495 (1184)	total: 2m 10s	remaining: 1m 26s
1210:	test: 0.7486463	test1: 0.7213554	best: 0.7215495 (1184)	total: 2m 11s	remaining: 1m 25s
1220:	test: 0.7488918	test1: 0.7213513	best: 0.7215495 (1184)	total: 2m 12s	remaining: 1m 24s
1230:	test: 0.7490701	test1: 0.7214572	best: 0.7215495 (1184)	total: 2m 13s	remaining: 1m 23s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7215494853
bestIteration = 1184

Shrink model to first 1185 iterations.
Fold 4, Valid score

800:	test: 0.7393180	test1: 0.7236731	best: 0.7238894 (776)	total: 1m 23s	remaining: 2m 4s
810:	test: 0.7397406	test1: 0.7237608	best: 0.7238894 (776)	total: 1m 24s	remaining: 2m 3s
820:	test: 0.7400769	test1: 0.7240972	best: 0.7240972 (820)	total: 1m 25s	remaining: 2m 2s
830:	test: 0.7401985	test1: 0.7240538	best: 0.7241155 (821)	total: 1m 26s	remaining: 2m 1s
840:	test: 0.7403660	test1: 0.7241150	best: 0.7241158 (839)	total: 1m 27s	remaining: 2m
850:	test: 0.7405366	test1: 0.7241955	best: 0.7242106 (847)	total: 1m 28s	remaining: 1m 59s
860:	test: 0.7407310	test1: 0.7240821	best: 0.7242293 (854)	total: 1m 29s	remaining: 1m 57s
870:	test: 0.7409686	test1: 0.7239965	best: 0.7242293 (854)	total: 1m 30s	remaining: 1m 56s
880:	test: 0.7412699	test1: 0.7241222	best: 0.7242293 (854)	total: 1m 31s	remaining: 1m 55s
890:	test: 0.7414287	test1: 0.7240674	best: 0.7242293 (854)	total: 1m 32s	remaining: 1m 54s
900:	test: 0.7416672	test1: 0.7241307	best: 0.7242531 (898)	total: 1m 33s	remaining: 1m 

In [41]:
oof_preds_cb = oof_preds
oof_score = roc_auc_score(
    target, oof_preds_cb
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.72378


Ансамбль нескольких моделей

оценить корреляцию прогнозов на обучающей выборке

In [42]:
np.corrcoef(
    x = oof_preds_lgb,
    y = oof_preds_xgb
)

array([[1.        , 0.36165103],
       [0.36165103, 1.        ]])

In [43]:
oof_train = []
oof_train.append(oof_preds_lgb)
oof_train.append(oof_preds_cb)
#oof_train = np.array(oof_train)
oof_train

[array([0.01071466, 0.08306752, 0.06918635, ..., 0.08362073, 0.01843834,
        0.0997197 ]),
 array([0.02397243, 0.08568921, 0.06078126, ..., 0.08364594, 0.03411563,
        0.06130429])]

Применить модели на тестовую выборку и оценить корреляцию.


In [44]:
#test = test[numerical]
y_pred_cb = np.zeros(test.shape[0])
test[numerical] = test[numerical].astype(float)
test[categorial] = test[categorial].astype(str)

for estimator in estimators:
    y_pred_cb += estimator.predict_proba(test)[:, 1]

In [45]:
y_pred_cb = y_pred_cb / cv.n_splits
y_pred_cb

array([0.0579011 , 0.21880019, 0.19389816, ..., 0.08437272, 0.02094533,
       0.05025549])

In [46]:
np.corrcoef(
    x = y_pred_cb,
    y = y_pred_lgb
)

array([[1.       , 0.9035469],
       [0.9035469, 1.       ]])

In [47]:
oof_test = []
oof_test.append(y_pred_lgb)
oof_test.append(y_pred_cb)
oof_test = np.array(oof_test)
oof_test

array([[0.04265717, 0.28020678, 0.13450095, ..., 0.08217762, 0.01037823,
        0.02629846],
       [0.0579011 , 0.21880019, 0.19389816, ..., 0.08437272, 0.02094533,
        0.05025549]])

In [48]:
cor = pd.DataFrame(oof_train)
cor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,110078,110079,110080,110081,110082,110083,110084,110085,110086,110087,110088,110089,110090,110091,110092
0,0.010715,0.083068,0.069186,0.113912,0.083621,0.560826,0.025938,0.03251,0.10853,0.038389,0.082242,0.06188,0.007936,0.129061,0.237281,...,0.056113,0.222659,0.025226,0.107808,0.012001,0.052649,0.081648,0.08299,0.081648,0.081648,0.083621,0.038181,0.083621,0.018438,0.09972
1,0.023972,0.085689,0.060781,0.202907,0.082412,0.157971,0.036464,0.106675,0.108352,0.072403,0.083646,0.066941,0.021256,0.107796,0.246632,...,0.066901,0.044671,0.066687,0.099185,0.019749,0.040882,0.082412,0.085689,0.063469,0.084203,0.085914,0.05419,0.083646,0.034116,0.061304


In [49]:
np.mean(oof_train[0] - oof_train[1])

-0.0015146657931371735

In [50]:
np.max(oof_train[0] - oof_train[1])

0.6447537891184425

In [51]:
cor = pd.DataFrame(oof_test)
cor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,165126,165127,165128,165129,165130,165131,165132,165133,165134,165135,165136,165137,165138,165139,165140
0,0.042657,0.280207,0.134501,0.082178,0.018202,0.026691,0.082178,0.101307,0.002751,0.020221,0.082178,0.042439,0.029281,0.083732,0.020947,...,0.022487,0.062768,0.01345,0.009253,0.00399,0.082178,0.082178,0.082178,0.006549,0.082178,0.082178,0.040369,0.082178,0.010378,0.026298
1,0.057901,0.2188,0.193898,0.084373,0.021954,0.0276,0.062838,0.121554,0.013626,0.032082,0.084373,0.053428,0.041428,0.10459,0.035006,...,0.027869,0.093394,0.01265,0.021012,0.011818,0.084373,0.084373,0.084373,0.010979,0.084373,0.084373,0.045271,0.084373,0.020945,0.050255


In [52]:
np.mean(oof_test[0] - oof_test[1])

-0.001287118063018821

Усреднить прогнозы с помощью арифмитического среднего, геометрического среднего

In [53]:
oof_preds_arifm = (oof_preds_lgb + oof_preds_cb) / 2

In [54]:
oof_preds_geom = (oof_preds_lgb * oof_preds_cb) ** 0.5

In [55]:
oof_score = roc_auc_score(
    target, oof_preds_arifm
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.72234


In [56]:
oof_score = roc_auc_score(
    target, oof_preds_geom
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.72258


In [59]:
lgb_rank = rankdata(oof_preds_lgb) 
cb_rank = rankdata(oof_preds_cb)
amean_rank = (lgb_rank + cb_rank) / 2
amean_rank

array([11362.  , 78218.  , 53087.5 , ..., 75194.75, 21968.5 , 68548.  ])

In [60]:
oof_score = roc_auc_score(
    target, amean_rank
)
print(f"OOF-score = {round(oof_score, 5)}")

OOF-score = 0.72214


Вывод:

Лучше всего с геперпараметрами из коробки работает catBoost. Он показал результат 72%. LightGBM занял 2 место и показал результат 71%, но зато модель обучалась значительно быстрее. XGBoost показал плохой результат, но возможно требуется тюнинг гиперпараметров. Ансамбль моделей catBoost и lgb не привел к улучшению результата. 