In [4]:
import time
import numpy as np
import pandas as pd
import catboost as cb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", 30)

## Useful Functions

In [5]:
def get_input(data_path: str) -> pd.DataFrame:
    """
    Считывание данных и вывод основной информации о наборе данных.

    Parameters
    ----------
    data_path: str
        Название файла.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Загруженный набор данных в pandas.DataFrame

    """
    base_path = "geekbrains-competitive-data-analysis"
    data = pd.read_csv(f"{base_path}/{data_path}")
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")

    return data


def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


def catboost_hold_out_validation(params, X, y, split_params = [0.7, 0.2, 0.1], categorical = None):
    """
    Hold-Out валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    split_params: List[float], optional, default = [0.7, 0.2, 0.1]
        Параметры (доли) разбиения выборки.
        Опциональный параметр, по умолчанию, равен [0.7, 0.2, 0.1].
    
    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimator: catboost.core.CatBoostClassifier
        Обученный классификатор catboost.

    test_prediction: np.array, optional
        Вектор прогнозов для тестовой выборки.
        Опциональный объект, возвращается только, если split_params
        содержит 3 значения.

    """
    numeric = list(set(x_train.columns) - set(categorical))
    x_train, x_valid = train_test_split(
        X, train_size=split_params[0], random_state=27
    )
    y_train, y_valid = train_test_split(
        y, train_size=split_params[0], random_state=27
    )

    if len(split_params) == 3:
        test_size = int(split_params[2] * X.shape[0])

        x_valid, x_test = train_test_split(
            x_valid, test_size=test_size, random_state=72
        )
        y_valid, y_test = train_test_split(
            y_valid, test_size=test_size, random_state=72
        )

    model = cb.CatBoostClassifier(**params)
    model.fit(
        x_train, y_train, categorical,
        eval_set=[(x_train, y_train), (x_valid, y_valid)]
    )

    print("="*80)
    valid_score = roc_auc_score(y_valid, model.predict_proba(x_valid)[:, 1])
    print(f"Valid Score = {round(valid_score, 4)}")

    if len(split_params) == 3:

        test_prediction = model.predict_proba(x_test)[:, 1]
        test_score = roc_auc_score(y_test, test_prediction)
        print(f"Test Score = {round(test_score, 4)}")

        return estimator, test_prediction

    else:
        return estimator

In [6]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Создание признаков на основе профиля клиентов.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков с исходным профилем клиента.

    copy: bool, optional, default = True
        Флаг использования копии датафрейма X.
        Опциональный параметр, по умолчанию, равен True.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Расширенная матрица признаков с профилем клиентов.

    """
    if copy:
        X = X.copy()

    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    X["external_scoring_prod"] = X["external_scoring_rating_1"] * X["external_scoring_rating_2"] * X["external_scoring_rating_3"]
    X["external_scoring_weighted"] = X.external_scoring_rating_1 * 2 + X.external_scoring_rating_2 * 1 + X.external_scoring_rating_3 * 3

    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = eval("np.{}".format(function_name))(
            X[["external_scoring_rating_1", "external_scoring_rating_2", "external_scoring_rating_3"]], axis=1
        )

    # Отношение между основными фин. показателями
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']
    X["ratio_annuity_to_salary"] = X['amount_annuity'] / X['total_salary']
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']
    #X["total_salary_net"] = X["total_salary"] - X["amount_annuity"]

    # Отношение фин. показателей к возрасту и временным фичам
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Отношение врменных признаков
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Произведение фин. показателей кредита на вероятность дефолта
    # Такая штука называется математическим ожиданием дефолта или ожидаемыми потерями
    X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    return X

In [7]:
def xgboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
#    X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]
        
        #-----
        dtrain = xgb.DMatrix(x_train, y_train)
        dvalid = xgb.DMatrix(x_valid, y_valid)

        model = xgb.train(
          params=params,
          dtrain=dtrain,
          evals=[(dtrain, "dtrain"), (dvalid, "dvalid")],
          early_stopping_rounds=25,
          num_boost_round=1000,
          verbose_eval=10,
          maximize=True,
        )
        #____
        
        oof_preds[valid_idx] = model.predict(dvalid)
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


## Base Tables

In [71]:
train = get_input("train.csv")
test = get_input("test.csv")

data = pd.concat([train, test], axis=0)
data = data.reset_index(drop=True)
data.head(n=2)

train.csv: shape = 110093 rows, 3 cols
test.csv: shape = 165141 rows, 2 cols


Unnamed: 0,application_number,target,name_contract_type
0,123687442,0.0,Cash
1,123597908,1.0,Cash


## client_profile

In [72]:
client_profile = get_input("client_profile.csv")
client_profile = create_client_profile_features(client_profile)
client_profile.head(n=2)

client_profile.csv: shape = 250000 rows, 24 cols


  overwrite_input=overwrite_input)


Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,...,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549.0,,1,0,2.0,...,31.542056,18.399533,101.678502,174.306004,8.7153,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,,,0,0,2.0,...,23.155971,11.644456,,,,,5466.42,,,,237475.743779,431008.094056,,12590.802122,22851.755462


In [73]:
data = data.merge(
    client_profile, how="left", on="application_number"
)

In [74]:
pd.get_dummies(data["name_contract_type"]).head()

Unnamed: 0,Cash,Credit Card
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [75]:
pd.get_dummies(data["gender"]).head()

Unnamed: 0,F,M,XNA
0,0,1,0
1,0,0,0
2,1,0,0
3,0,1,0
4,0,0,0


In [76]:
pd.get_dummies(data["education_level"]).head()

Unnamed: 0,Academic degree,Higher education,Incomplete higher,Lower secondary,Secondary / secondary special
0,0,0,0,0,1
1,0,0,0,0,0
2,0,1,0,0,0
3,0,0,0,0,1
4,0,0,0,0,0


In [77]:
pd.get_dummies(data["family_status"]).head()

Unnamed: 0,Civil marriage,Married,Separated,Single / not married,Unknown,Widow
0,0,1,0,0,0,0
1,0,0,0,0,0,0
2,0,1,0,0,0,0
3,0,1,0,0,0,0
4,0,0,0,0,0,0


In [78]:
freq_encoder = data["education_level"].value_counts(normalize=True)
data["education_level_freq_enc"] = data["education_level"].map(freq_encoder)
data[["education_level", "education_level_freq_enc"]].head(2)

Unnamed: 0,education_level,education_level_freq_enc
0,Secondary / secondary special,0.710221
1,,


## baseline

In [79]:
mask = data["target"].isnull()
features_to_drop = ["application_number", "target"]

train, test = data.loc[~mask], data.loc[mask]

target, test_id = train["target"], test["application_number"]
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)

categorial = train.dtypes[train.dtypes == "object"].index
numerical = list(set(train.columns) - set(categorial))

train = train.replace(np.inf, np.nan)
train = train.replace(-np.inf, np.nan)

XGBoost

In [17]:
import xgboost as xgb
import BayesianOptimization
xgb_params = {
    "booster": "gbtree",
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "learning_rate": 0.01,
    "nthread": 6,
    "seed": 27
}


cv = KFold(n_splits=5, random_state=1234123, shuffle=True)
new_train = train[numerical]
estimators, oof_preds_xgb = xgboost_cross_validation(
    params=xgb_params, X=new_train, y=target, cv=cv, categorical=categorial
)

  import pandas.util.testing as tm


Tue Sep 29 17:26:28 2020, Cross-Validation, 110093 rows, 49 cols
[0]	dtrain-auc:0.70634	dvalid-auc:0.69116
Multiple eval metrics have been passed: 'dvalid-auc' will be used for early stopping.

Will train until dvalid-auc hasn't improved in 25 rounds.
[10]	dtrain-auc:0.71487	dvalid-auc:0.69530
[20]	dtrain-auc:0.72095	dvalid-auc:0.69808
[30]	dtrain-auc:0.72582	dvalid-auc:0.69770
[40]	dtrain-auc:0.72748	dvalid-auc:0.69812
[50]	dtrain-auc:0.72936	dvalid-auc:0.69905
[60]	dtrain-auc:0.73148	dvalid-auc:0.69939
[70]	dtrain-auc:0.73330	dvalid-auc:0.69959
[80]	dtrain-auc:0.73441	dvalid-auc:0.69940
[90]	dtrain-auc:0.73531	dvalid-auc:0.69947
Stopping. Best iteration:
[72]	dtrain-auc:0.73350	dvalid-auc:0.69971

Fold 1, Valid score = 0.69904
[0]	dtrain-auc:0.70376	dvalid-auc:0.69418
Multiple eval metrics have been passed: 'dvalid-auc' will be used for early stopping.

Will train until dvalid-auc hasn't improved in 25 rounds.
[10]	dtrain-auc:0.71684	dvalid-auc:0.70192
[20]	dtrain-auc:0.72142	dvalid-

In [18]:
oof_score = roc_auc_score(
    target, oof_preds_xgb
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.621


In [20]:
y_pred_xgb = np.zeros(test.shape[0])
test[numerical] = test[numerical].astype(float)
#test[categorial] = test[categorial].astype(str)

new_test = test[numerical]
dtest = xgb.DMatrix(new_test)

for estimator in estimators:
    y_pred_xgb += estimator.predict(dtest)

In [45]:
y_pred_xgb = y_pred_xgb / cv.n_splits

LightGBM 

In [21]:
import lightgbm as lgb

In [99]:
def lgboost_cross_validation(params, X, y, cv, categorical = None):
#from sklearn.model_selection import KFold, StratifiedKFold
#N_FOLDS = 10
#folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random
    estimators = []
    oof = np.zeros(len(X))
    sub = np.zeros(len(test))
    scores = [0 for _ in range(cv.n_splits)]
    for fold_, (train_idx, val_idx) in enumerate(cv.split(X.values, y)):
        X_train, y_train = X.loc[train_idx], y.loc[train_idx]
        X_val, y_val = X.loc[val_idx], y.loc[val_idx]
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val)
        watchlist = [train_data, val_data]
        clf = lgb.train(params, train_set = train_data, valid_sets=watchlist)
        oof[val_idx] = clf.predict(X_val)
        sub += clf.predict(new_test)/cv.n_splits
        scores[fold_] = roc_auc_score(y[val_idx], oof[val_idx])
        print("Fold {}: {}".format(fold_+1, round(scores[fold_],5)))
        estimators.append(clf)
    
    print("CV score(auc): {:<8.5f}, (std: {:<8.5f})".format(roc_auc_score(y, oof), np.std(scores)))

    print("="*65)
    return estimators,oof


In [100]:
from sklearn.model_selection import KFold, StratifiedKFold
lgb_params = {'learning_rate': 0.3,
              'application': 'binary',
              'num_leaves': 31,
              'verbosity': -1,
              'metric': 'auc',
              'data_random_seed': 2,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.6,
              'nthread': 4,
              'lambda_l1': 1,
              'lambda_l2': 1}

N_FOLDS = 10
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
new_train = train[numerical]
new_test = test[numerical]
estimators, oof_preds_lgb = lgboost_cross_validation(
    params=lgb_params, X=new_train, y=target, cv=cv, categorical=categorial
)

[1]	training's auc: 0.692479	valid_1's auc: 0.6879
[2]	training's auc: 0.714254	valid_1's auc: 0.706002
[3]	training's auc: 0.718834	valid_1's auc: 0.708941
[4]	training's auc: 0.723428	valid_1's auc: 0.711598
[5]	training's auc: 0.72835	valid_1's auc: 0.710765
[6]	training's auc: 0.732065	valid_1's auc: 0.71654
[7]	training's auc: 0.735002	valid_1's auc: 0.718728
[8]	training's auc: 0.739276	valid_1's auc: 0.723347
[9]	training's auc: 0.744997	valid_1's auc: 0.728337
[10]	training's auc: 0.747713	valid_1's auc: 0.729121
[11]	training's auc: 0.750703	valid_1's auc: 0.729655
[12]	training's auc: 0.754628	valid_1's auc: 0.73164
[13]	training's auc: 0.756518	valid_1's auc: 0.731583
[14]	training's auc: 0.758716	valid_1's auc: 0.730815
[15]	training's auc: 0.76073	valid_1's auc: 0.730998
[16]	training's auc: 0.763353	valid_1's auc: 0.729062
[17]	training's auc: 0.766178	valid_1's auc: 0.729275
[18]	training's auc: 0.768081	valid_1's auc: 0.728359
[19]	training's auc: 0.770743	valid_1's auc

[58]	training's auc: 0.829308	valid_1's auc: 0.714164
[59]	training's auc: 0.830881	valid_1's auc: 0.713951
[60]	training's auc: 0.832399	valid_1's auc: 0.714264
[61]	training's auc: 0.834014	valid_1's auc: 0.714489
[62]	training's auc: 0.83491	valid_1's auc: 0.713923
[63]	training's auc: 0.83589	valid_1's auc: 0.714548
[64]	training's auc: 0.836884	valid_1's auc: 0.714648
[65]	training's auc: 0.838013	valid_1's auc: 0.715195
[66]	training's auc: 0.839161	valid_1's auc: 0.715138
[67]	training's auc: 0.840069	valid_1's auc: 0.716144
[68]	training's auc: 0.840928	valid_1's auc: 0.715811
[69]	training's auc: 0.841757	valid_1's auc: 0.715895
[70]	training's auc: 0.84278	valid_1's auc: 0.715584
[71]	training's auc: 0.843876	valid_1's auc: 0.715166
[72]	training's auc: 0.845653	valid_1's auc: 0.715228
[73]	training's auc: 0.84644	valid_1's auc: 0.714947
[74]	training's auc: 0.847158	valid_1's auc: 0.714515
[75]	training's auc: 0.848453	valid_1's auc: 0.714233
[76]	training's auc: 0.849276	va

[11]	training's auc: 0.748717	valid_1's auc: 0.707625
[12]	training's auc: 0.751952	valid_1's auc: 0.709766
[13]	training's auc: 0.754768	valid_1's auc: 0.710129
[14]	training's auc: 0.756806	valid_1's auc: 0.709613
[15]	training's auc: 0.75883	valid_1's auc: 0.709914
[16]	training's auc: 0.761816	valid_1's auc: 0.71051
[17]	training's auc: 0.764515	valid_1's auc: 0.712027
[18]	training's auc: 0.766202	valid_1's auc: 0.711647
[19]	training's auc: 0.7684	valid_1's auc: 0.712446
[20]	training's auc: 0.770005	valid_1's auc: 0.712024
[21]	training's auc: 0.771739	valid_1's auc: 0.712299
[22]	training's auc: 0.773191	valid_1's auc: 0.712217
[23]	training's auc: 0.774968	valid_1's auc: 0.712337
[24]	training's auc: 0.777335	valid_1's auc: 0.711587
[25]	training's auc: 0.779384	valid_1's auc: 0.711773
[26]	training's auc: 0.781397	valid_1's auc: 0.711252
[27]	training's auc: 0.783373	valid_1's auc: 0.711274
[28]	training's auc: 0.785784	valid_1's auc: 0.711689
[29]	training's auc: 0.787785	va

[63]	training's auc: 0.836646	valid_1's auc: 0.739447
[64]	training's auc: 0.837589	valid_1's auc: 0.739379
[65]	training's auc: 0.83924	valid_1's auc: 0.739081
[66]	training's auc: 0.840979	valid_1's auc: 0.738679
[67]	training's auc: 0.841866	valid_1's auc: 0.736867
[68]	training's auc: 0.843143	valid_1's auc: 0.736311
[69]	training's auc: 0.84432	valid_1's auc: 0.735801
[70]	training's auc: 0.8454	valid_1's auc: 0.735208
[71]	training's auc: 0.846081	valid_1's auc: 0.734953
[72]	training's auc: 0.846911	valid_1's auc: 0.734606
[73]	training's auc: 0.847521	valid_1's auc: 0.733874
[74]	training's auc: 0.848701	valid_1's auc: 0.734624
[75]	training's auc: 0.849644	valid_1's auc: 0.734767
[76]	training's auc: 0.851185	valid_1's auc: 0.734426
[77]	training's auc: 0.852354	valid_1's auc: 0.733754
[78]	training's auc: 0.853096	valid_1's auc: 0.734931
[79]	training's auc: 0.855096	valid_1's auc: 0.734401
[80]	training's auc: 0.855854	valid_1's auc: 0.734255
[81]	training's auc: 0.85682	val

[19]	training's auc: 0.768008	valid_1's auc: 0.734911
[20]	training's auc: 0.769492	valid_1's auc: 0.734766
[21]	training's auc: 0.771177	valid_1's auc: 0.734657
[22]	training's auc: 0.77338	valid_1's auc: 0.735137
[23]	training's auc: 0.775062	valid_1's auc: 0.734651
[24]	training's auc: 0.777707	valid_1's auc: 0.734213
[25]	training's auc: 0.779805	valid_1's auc: 0.734022
[26]	training's auc: 0.781937	valid_1's auc: 0.73308
[27]	training's auc: 0.783492	valid_1's auc: 0.732207
[28]	training's auc: 0.785297	valid_1's auc: 0.732769
[29]	training's auc: 0.787334	valid_1's auc: 0.731795
[30]	training's auc: 0.789305	valid_1's auc: 0.730938
[31]	training's auc: 0.791088	valid_1's auc: 0.730374
[32]	training's auc: 0.793145	valid_1's auc: 0.730996
[33]	training's auc: 0.794453	valid_1's auc: 0.73067
[34]	training's auc: 0.795469	valid_1's auc: 0.729694
[35]	training's auc: 0.796938	valid_1's auc: 0.729678
[36]	training's auc: 0.797993	valid_1's auc: 0.729903
[37]	training's auc: 0.799334	v

[81]	training's auc: 0.856807	valid_1's auc: 0.701981
[82]	training's auc: 0.857773	valid_1's auc: 0.702373
[83]	training's auc: 0.858426	valid_1's auc: 0.702117
[84]	training's auc: 0.859655	valid_1's auc: 0.701178
[85]	training's auc: 0.860567	valid_1's auc: 0.701373
[86]	training's auc: 0.861679	valid_1's auc: 0.700842
[87]	training's auc: 0.862297	valid_1's auc: 0.700009
[88]	training's auc: 0.862726	valid_1's auc: 0.700258
[89]	training's auc: 0.864174	valid_1's auc: 0.699901
[90]	training's auc: 0.865153	valid_1's auc: 0.700309
[91]	training's auc: 0.865977	valid_1's auc: 0.70013
[92]	training's auc: 0.866691	valid_1's auc: 0.699873
[93]	training's auc: 0.867593	valid_1's auc: 0.699067
[94]	training's auc: 0.868614	valid_1's auc: 0.699593
[95]	training's auc: 0.869911	valid_1's auc: 0.699559
[96]	training's auc: 0.870708	valid_1's auc: 0.699677
[97]	training's auc: 0.870918	valid_1's auc: 0.699487
[98]	training's auc: 0.87172	valid_1's auc: 0.700151
[99]	training's auc: 0.872138	

[40]	training's auc: 0.807818	valid_1's auc: 0.713739
[41]	training's auc: 0.80957	valid_1's auc: 0.713691
[42]	training's auc: 0.810567	valid_1's auc: 0.714515
[43]	training's auc: 0.811734	valid_1's auc: 0.714552
[44]	training's auc: 0.812986	valid_1's auc: 0.714185
[45]	training's auc: 0.814121	valid_1's auc: 0.713897
[46]	training's auc: 0.815402	valid_1's auc: 0.714249
[47]	training's auc: 0.817084	valid_1's auc: 0.713412
[48]	training's auc: 0.81822	valid_1's auc: 0.713142
[49]	training's auc: 0.818693	valid_1's auc: 0.713
[50]	training's auc: 0.820739	valid_1's auc: 0.712734
[51]	training's auc: 0.822126	valid_1's auc: 0.71249
[52]	training's auc: 0.823696	valid_1's auc: 0.712448
[53]	training's auc: 0.824937	valid_1's auc: 0.71296
[54]	training's auc: 0.826314	valid_1's auc: 0.712843
[55]	training's auc: 0.827889	valid_1's auc: 0.712165
[56]	training's auc: 0.829268	valid_1's auc: 0.711968
[57]	training's auc: 0.83091	valid_1's auc: 0.711745
[58]	training's auc: 0.832411	valid_

In [101]:

oof_score = roc_auc_score(
    target, oof_preds_lgb
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.7104


Ансамбль нескольких моделей

оценить корреляцию прогнозов на обучающей выборке

In [102]:
oof_train = []
oof_train.append(oof_preds_xgb)
oof_train.append(oof_preds_lgb)
#oof_train = np.array(oof_train)
oof_train

[array([0.02022145, 0.0863369 , 0.22426425, ..., 0.29279172, 0.21572646,
        0.31558186]),
 array([0.05072209, 0.0828137 , 0.07258419, ..., 0.08374355, 0.01698941,
        0.08168347])]

Применить модели на тестовую выборку и оценить корреляцию.


In [103]:
y_pred_lgb = np.zeros(test.shape[0])
test[numerical] = test[numerical].astype(float)
#test[categorial] = test[categorial].astype(str)
new_test = test[numerical]
for estimator in estimators:
    y_pred_lgb += estimator.predict(new_test)

In [104]:
y_pred_lgb = y_pred_lgb / cv.n_splits

In [105]:
y_pred_lgb

array([0.07099927, 0.30240174, 0.14171314, ..., 0.082115  , 0.02035166,
       0.03268883])

In [106]:
oof_test = []
oof_test.append(y_pred_xgb)
oof_test.append(y_pred_lgb)
oof_test = np.array(oof_test)
oof_test

array([[0.0916783 , 0.14018704, 0.14139074, ..., 0.09751794, 0.07562688,
        0.08285133],
       [0.07099927, 0.30240174, 0.14171314, ..., 0.082115  , 0.02035166,
        0.03268883]])

In [107]:
cor = pd.DataFrame(oof_train)
cor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,110078,110079,110080,110081,110082,110083,110084,110085,110086,110087,110088,110089,110090,110091,110092
0,0.020221,0.086337,0.224264,0.341367,0.237372,0.325107,0.256032,0.143219,0.118276,0.242048,0.292792,0.250288,0.020357,0.116746,0.332793,...,0.068753,0.05201,0.271971,0.09791,0.239915,0.058776,0.237372,0.086337,0.292792,0.083044,0.275635,0.280509,0.292792,0.215726,0.315582
1,0.050722,0.082814,0.072584,0.143022,0.083744,0.207038,0.011906,0.046242,0.064347,0.069975,0.082045,0.024384,0.034073,0.095641,0.187914,...,0.063456,0.051148,0.025271,0.050759,0.011411,0.036821,0.081329,0.083016,0.081329,0.081329,0.083744,0.066289,0.083744,0.016989,0.081683


In [108]:
np.mean(oof_train[0] - oof_train[1])

0.11507584187832222

In [109]:
np.max(oof_train[0] - oof_train[1])

0.4941266823907058

In [110]:
cor = pd.DataFrame(oof_test)
cor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,165126,165127,165128,165129,165130,165131,165132,165133,165134,165135,165136,165137,165138,165139,165140
0,0.091678,0.140187,0.141391,0.097518,0.077565,0.080674,0.097518,0.123166,0.07324,0.078034,0.097518,0.084682,0.082137,0.100131,0.078583,...,0.077121,0.102227,0.074599,0.073807,0.072255,0.097518,0.097518,0.097518,0.072948,0.097518,0.097518,0.079203,0.097518,0.075627,0.082851
1,0.070999,0.302402,0.141713,0.082115,0.02086,0.017728,0.082115,0.105363,0.002564,0.018398,0.082115,0.043093,0.030792,0.096948,0.018875,...,0.022888,0.069734,0.008201,0.007681,0.004247,0.082115,0.082115,0.082115,0.0065,0.082115,0.082115,0.053758,0.082115,0.020352,0.032689


In [111]:
np.mean(oof_test[0] - oof_test[1])

0.017855433748181063

Усреднить прогнозы с помощью арифмитического среднего, геометрического среднего

In [112]:
oof_preds_arifm = (oof_preds_xgb + oof_preds_lgb) / 2

In [113]:
oof_preds_geom = (oof_preds_xgb * oof_preds_lgb) ** 0.5

In [114]:
oof_score = roc_auc_score(
    target, oof_preds_arifm
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.67081


In [115]:
oof_score = roc_auc_score(
    target, oof_preds_geom
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.70184


catBoost

In [120]:
cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
}

cv = KFold(n_splits=5, random_state=1234123, shuffle=True)

estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=train, y=target, cv=cv, categorical=categorial
)

Tue Sep 29 19:17:08 2020, Cross-Validation, 110093 rows, 53 cols
0:	test: 0.6734332	test1: 0.6641824	best: 0.6641824 (0)	total: 115ms	remaining: 3m 50s
10:	test: 0.7007889	test1: 0.6938119	best: 0.6938119 (10)	total: 1.17s	remaining: 3m 30s
20:	test: 0.7029293	test1: 0.6963111	best: 0.6970105 (18)	total: 2.19s	remaining: 3m 26s
30:	test: 0.7058306	test1: 0.6981630	best: 0.6985139 (29)	total: 3.23s	remaining: 3m 24s
40:	test: 0.7068103	test1: 0.6994951	best: 0.6994951 (40)	total: 4.29s	remaining: 3m 24s
50:	test: 0.7073623	test1: 0.7009414	best: 0.7010275 (46)	total: 5.3s	remaining: 3m 22s
60:	test: 0.7083320	test1: 0.7028293	best: 0.7028614 (59)	total: 6.3s	remaining: 3m 20s
70:	test: 0.7087590	test1: 0.7025761	best: 0.7029574 (64)	total: 7.32s	remaining: 3m 18s
80:	test: 0.7094492	test1: 0.7035250	best: 0.7035250 (80)	total: 8.35s	remaining: 3m 17s
90:	test: 0.7100618	test1: 0.7038065	best: 0.7038065 (90)	total: 9.37s	remaining: 3m 16s
100:	test: 0.7107377	test1: 0.7049387	best: 0.704

120:	test: 0.7102942	test1: 0.7094887	best: 0.7094887 (120)	total: 12.3s	remaining: 3m 11s
130:	test: 0.7109416	test1: 0.7096544	best: 0.7097525 (125)	total: 13.4s	remaining: 3m 10s
140:	test: 0.7114409	test1: 0.7100903	best: 0.7101572 (138)	total: 14.3s	remaining: 3m 8s
150:	test: 0.7124348	test1: 0.7114451	best: 0.7114451 (150)	total: 15.3s	remaining: 3m 7s
160:	test: 0.7130634	test1: 0.7116085	best: 0.7116093 (156)	total: 16.4s	remaining: 3m 6s
170:	test: 0.7134697	test1: 0.7120329	best: 0.7122668 (166)	total: 17.4s	remaining: 3m 6s
180:	test: 0.7138800	test1: 0.7124840	best: 0.7125025 (179)	total: 18.4s	remaining: 3m 5s
190:	test: 0.7145701	test1: 0.7128818	best: 0.7128818 (190)	total: 19.4s	remaining: 3m 4s
200:	test: 0.7150727	test1: 0.7131927	best: 0.7131927 (200)	total: 20.4s	remaining: 3m 2s
210:	test: 0.7157641	test1: 0.7139536	best: 0.7139536 (210)	total: 21.4s	remaining: 3m 1s
220:	test: 0.7165304	test1: 0.7144560	best: 0.7144675 (218)	total: 22.4s	remaining: 3m
230:	test: 

200:	test: 0.7148753	test1: 0.7147597	best: 0.7150023 (198)	total: 21.4s	remaining: 3m 11s
210:	test: 0.7155343	test1: 0.7160551	best: 0.7160551 (210)	total: 22.5s	remaining: 3m 10s
220:	test: 0.7162537	test1: 0.7163370	best: 0.7163370 (220)	total: 23.6s	remaining: 3m 10s
230:	test: 0.7170317	test1: 0.7167893	best: 0.7168715 (229)	total: 24.7s	remaining: 3m 8s
240:	test: 0.7176536	test1: 0.7172015	best: 0.7172213 (239)	total: 25.6s	remaining: 3m 7s
250:	test: 0.7183309	test1: 0.7181379	best: 0.7181379 (250)	total: 26.6s	remaining: 3m 5s
260:	test: 0.7188662	test1: 0.7184569	best: 0.7184569 (260)	total: 27.7s	remaining: 3m 4s
270:	test: 0.7195983	test1: 0.7190029	best: 0.7190029 (270)	total: 28.7s	remaining: 3m 3s
280:	test: 0.7200566	test1: 0.7191380	best: 0.7191650 (279)	total: 29.6s	remaining: 3m
290:	test: 0.7205093	test1: 0.7196423	best: 0.7198270 (289)	total: 30.6s	remaining: 2m 59s
300:	test: 0.7211720	test1: 0.7199142	best: 0.7200460 (299)	total: 31.7s	remaining: 2m 58s
310:	tes

250:	test: 0.7205258	test1: 0.7090452	best: 0.7093088 (238)	total: 26s	remaining: 3m 1s
260:	test: 0.7211952	test1: 0.7094150	best: 0.7094454 (258)	total: 27.1s	remaining: 3m
270:	test: 0.7217562	test1: 0.7098355	best: 0.7098355 (270)	total: 28.2s	remaining: 2m 59s
280:	test: 0.7224329	test1: 0.7102272	best: 0.7102272 (280)	total: 29.2s	remaining: 2m 58s
290:	test: 0.7230225	test1: 0.7106736	best: 0.7107217 (289)	total: 30.1s	remaining: 2m 56s
300:	test: 0.7236039	test1: 0.7109276	best: 0.7110676 (292)	total: 31.1s	remaining: 2m 55s
310:	test: 0.7241218	test1: 0.7112946	best: 0.7114504 (309)	total: 32.2s	remaining: 2m 54s
320:	test: 0.7245276	test1: 0.7117339	best: 0.7117339 (320)	total: 33.2s	remaining: 2m 53s
330:	test: 0.7249405	test1: 0.7118951	best: 0.7118951 (330)	total: 34.2s	remaining: 2m 52s
340:	test: 0.7255719	test1: 0.7123632	best: 0.7123632 (340)	total: 35.2s	remaining: 2m 51s
350:	test: 0.7260102	test1: 0.7125803	best: 0.7127009 (347)	total: 36.2s	remaining: 2m 50s
360:	t

1150:	test: 0.7476175	test1: 0.7213730	best: 0.7213793 (1149)	total: 1m 54s	remaining: 1m 24s
1160:	test: 0.7478340	test1: 0.7212903	best: 0.7213793 (1149)	total: 1m 55s	remaining: 1m 23s
1170:	test: 0.7479798	test1: 0.7213998	best: 0.7214150 (1168)	total: 1m 56s	remaining: 1m 22s
1180:	test: 0.7480866	test1: 0.7214179	best: 0.7214179 (1180)	total: 1m 57s	remaining: 1m 21s
1190:	test: 0.7482241	test1: 0.7213467	best: 0.7215495 (1184)	total: 1m 58s	remaining: 1m 20s
1200:	test: 0.7485157	test1: 0.7213675	best: 0.7215495 (1184)	total: 1m 59s	remaining: 1m 19s
1210:	test: 0.7486463	test1: 0.7213554	best: 0.7215495 (1184)	total: 2m	remaining: 1m 18s
1220:	test: 0.7488918	test1: 0.7213513	best: 0.7215495 (1184)	total: 2m 1s	remaining: 1m 17s
1230:	test: 0.7490701	test1: 0.7214572	best: 0.7215495 (1184)	total: 2m 2s	remaining: 1m 16s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7215494853
bestIteration = 1184

Shrink model to first 1185 iterations.
Fold 4, Valid score 

800:	test: 0.7393180	test1: 0.7236731	best: 0.7238894 (776)	total: 1m 21s	remaining: 2m 1s
810:	test: 0.7397406	test1: 0.7237608	best: 0.7238894 (776)	total: 1m 22s	remaining: 2m
820:	test: 0.7400769	test1: 0.7240972	best: 0.7240972 (820)	total: 1m 23s	remaining: 1m 59s
830:	test: 0.7401985	test1: 0.7240538	best: 0.7241155 (821)	total: 1m 24s	remaining: 1m 58s
840:	test: 0.7403660	test1: 0.7241150	best: 0.7241158 (839)	total: 1m 25s	remaining: 1m 57s
850:	test: 0.7405366	test1: 0.7241955	best: 0.7242106 (847)	total: 1m 26s	remaining: 1m 56s
860:	test: 0.7407310	test1: 0.7240821	best: 0.7242293 (854)	total: 1m 27s	remaining: 1m 55s
870:	test: 0.7409686	test1: 0.7239965	best: 0.7242293 (854)	total: 1m 28s	remaining: 1m 54s
880:	test: 0.7412699	test1: 0.7241222	best: 0.7242293 (854)	total: 1m 29s	remaining: 1m 53s
890:	test: 0.7414287	test1: 0.7240674	best: 0.7242293 (854)	total: 1m 30s	remaining: 1m 52s
900:	test: 0.7416672	test1: 0.7241307	best: 0.7242531 (898)	total: 1m 31s	remaining: 

In [121]:
oof_preds_cb = oof_preds
oof_score = roc_auc_score(
    target, oof_preds_cb
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.72378


Ансамбль нескольких моделей

оценить корреляцию прогнозов на обучающей выборке

In [122]:
oof_train = []
oof_train.append(oof_preds_lgb)
oof_train.append(oof_preds_cb)
#oof_train = np.array(oof_train)
oof_train

[array([0.05072209, 0.0828137 , 0.07258419, ..., 0.08374355, 0.01698941,
        0.08168347]),
 array([0.02397243, 0.08568921, 0.06078126, ..., 0.08364594, 0.03411563,
        0.06130429])]

Применить модели на тестовую выборку и оценить корреляцию.


In [123]:
#test = test[numerical]
y_pred_cb = np.zeros(test.shape[0])
test[numerical] = test[numerical].astype(float)
test[categorial] = test[categorial].astype(str)

for estimator in estimators:
    y_pred_cb += estimator.predict_proba(test)[:, 1]

In [124]:
y_pred_cb = y_pred_cb / cv.n_splits
y_pred_cb

array([0.0579011 , 0.21880019, 0.19389816, ..., 0.08437272, 0.02094533,
       0.05025549])

In [125]:
oof_test = []
oof_test.append(y_pred_lgb)
oof_test.append(y_pred_cb)
oof_test = np.array(oof_test)
oof_test

array([[0.07099927, 0.30240174, 0.14171314, ..., 0.082115  , 0.02035166,
        0.03268883],
       [0.0579011 , 0.21880019, 0.19389816, ..., 0.08437272, 0.02094533,
        0.05025549]])

In [126]:
cor = pd.DataFrame(oof_train)
cor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,110078,110079,110080,110081,110082,110083,110084,110085,110086,110087,110088,110089,110090,110091,110092
0,0.050722,0.082814,0.072584,0.143022,0.083744,0.207038,0.011906,0.046242,0.064347,0.069975,0.082045,0.024384,0.034073,0.095641,0.187914,...,0.063456,0.051148,0.025271,0.050759,0.011411,0.036821,0.081329,0.083016,0.081329,0.081329,0.083744,0.066289,0.083744,0.016989,0.081683
1,0.023972,0.085689,0.060781,0.202907,0.082412,0.157971,0.036464,0.106675,0.108352,0.072403,0.083646,0.066941,0.021256,0.107796,0.246632,...,0.066901,0.044671,0.066687,0.099185,0.019749,0.040882,0.082412,0.085689,0.063469,0.084203,0.085914,0.05419,0.083646,0.034116,0.061304


In [127]:
np.mean(oof_train[0] - oof_train[1])

-0.001524957304519215

In [128]:
np.max(oof_train[0] - oof_train[1])

0.6606265476139571

In [129]:
cor = pd.DataFrame(oof_test)
cor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,165126,165127,165128,165129,165130,165131,165132,165133,165134,165135,165136,165137,165138,165139,165140
0,0.070999,0.302402,0.141713,0.082115,0.02086,0.017728,0.082115,0.105363,0.002564,0.018398,0.082115,0.043093,0.030792,0.096948,0.018875,...,0.022888,0.069734,0.008201,0.007681,0.004247,0.082115,0.082115,0.082115,0.0065,0.082115,0.082115,0.053758,0.082115,0.020352,0.032689
1,0.057901,0.2188,0.193898,0.084373,0.021954,0.0276,0.062838,0.121554,0.013626,0.032082,0.084373,0.053428,0.041428,0.10459,0.035006,...,0.027869,0.093394,0.01265,0.021012,0.011818,0.084373,0.084373,0.084373,0.010979,0.084373,0.084373,0.045271,0.084373,0.020945,0.050255


In [130]:
np.mean(oof_test[0] - oof_test[1])

-0.0013118593506077378

Усреднить прогнозы с помощью арифмитического среднего, геометрического среднего

In [131]:
oof_preds_arifm = (oof_preds_lgb + oof_preds_cb) / 2

In [132]:
oof_preds_geom = (oof_preds_lgb * oof_preds_cb) ** 0.5

In [133]:
oof_score = roc_auc_score(
    target, oof_preds_arifm
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.72378


In [134]:
oof_score = roc_auc_score(
    target, oof_preds_geom
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.7243


Вывод:

Лучше всего с геперпараметрами из коробки работает catBoost. Он показал результат 72%. LightGBM занял 2 место и показал результат 71%, но зато модель обучалась значительно быстрее. XGBoost показал плохой результат, но возможно требуется тюнинг гиперпараметров. Ансамбль моделей catBoost и lgb не привел к улучшению результата. 