# Library

In [None]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold, KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.impute import SimpleImputer

import optuna
from optuna.integration import CatBoostPruningCallback
import warnings
warnings.filterwarnings("ignore")

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
train = pd.read_parquet('train_prep_tfidf.parquet')

In [None]:
[column for column in train.columns if 'city' in column]

['trans_city_most_frequent']

In [None]:
train.dtypes

client_id            object
neg_amount_count      int64
pos_amount_count      int64
pos_amount_sum      float64
neg_amount_sum      float64
                     ...   
trans_type_140      float64
trans_type_141      float64
trans_type_142      float64
trans_type_143      float64
gender                int64
Length: 168, dtype: object

In [None]:
X_train = train.drop(columns=["gender", "client_id", "term_id_most_frequent"])
y_train = train["gender"]

In [None]:
X_train.head()

Unnamed: 0,neg_amount_count,pos_amount_count,pos_amount_sum,neg_amount_sum,amount_mean,amount_std,amount_max,amount_min,amount_median,minutes_mean,...,trans_type_134,trans_type_135,trans_type_136,trans_type_137,trans_type_138,trans_type_139,trans_type_140,trans_type_141,trans_type_142,trans_type_143
0,472,32,590428.81,-888203.3,-590.822401,7291.653648,42665.12,-39772.06,-220.1,30.47619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,258,9,21284.88,-74627.84,-199.786367,1213.39734,15909.56,-9400.52,-117.71,27.925094,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,166,25,49533.81,-62930.8,-70.141309,1196.681764,5423.8,-5424.42,-125.14,30.764398,...,0.0,0.0,0.517735,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,665,301,310856.26,-494382.45,-189.985704,2109.594352,20247.5,-36518.54,-146.665,28.512422,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,167,39,48759.24,-232812.69,-893.46335,2853.228105,3616.52,-15907.79,-109.965,29.38835,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
cat_features = ["trans_city_most_frequent"]
num_features = [column for column in X_train.columns if column not in ["client_id", *cat_features]]

In [None]:
for column in X_train.columns:
    if column in cat_features:
        X_train[column] = LabelEncoder().fit_transform(X_train[column])

In [None]:
X_train

Unnamed: 0,neg_amount_count,pos_amount_count,pos_amount_sum,neg_amount_sum,amount_mean,amount_std,amount_max,amount_min,amount_median,minutes_mean,seconds_mean,weighted_mcc,term_id_most_frequent,trans_city_most_frequent
0,472,32,590428.81,-888203.30,-590.822401,7291.653648,42665.12,-39772.06,-220.100,30.476190,29.835317,0.434680,27,6
1,258,9,21284.88,-74627.84,-199.786367,1213.397340,15909.56,-9400.52,-117.710,27.925094,31.071161,0.454215,27,4
2,166,25,49533.81,-62930.80,-70.141309,1196.681764,5423.80,-5424.42,-125.140,30.764398,30.104712,0.462549,27,2
3,664,301,310856.26,-493948.66,-189.733057,2110.673634,20247.50,-36518.54,-146.620,28.483938,29.649741,0.512688,487,0
4,167,39,48759.24,-232812.69,-893.463350,2853.228105,3616.52,-15907.79,-109.965,29.388350,31.592233,0.431898,27,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7555,725,231,991319.74,-1523362.06,-556.529623,4746.078273,36157.09,-37944.73,-328.225,29.120293,30.669456,0.433318,27,4
7556,337,12,43458.65,-205199.47,-463.440745,1539.874884,10847.20,-10756.68,-217.550,28.318052,30.303725,0.447343,27,9
7557,162,13,55680.60,-313102.43,-1470.981886,5207.843362,7231.79,-36156.56,-145.330,29.565714,30.977143,0.444224,27,8
7558,82,34,120112.95,-68802.34,442.332845,4283.052479,25308.26,-23639.09,-65.930,31.767241,27.724138,0.449289,27,9


# Random Forest

In [None]:
def fit_rf(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    imputer = SimpleImputer(strategy="constant", fill_value=0)

    X_train = imputer.fit_transform(X_train)
    X_val = imputer.fit_transform(X_val)

    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 60),
    }

    model = RandomForestClassifier(**param, n_jobs=-1)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    return y_pred, model

In [None]:
def objective(trial, return_models=False):
    kf = KFold(n_splits=5)

    scores, models = [], []

    for train_idx, valid_idx in kf.split(X_train):
        train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]

        # Подаем trials для перебора
        y_pred, model = fit_rf(trial, train_data, valid_data) # Определили выше
        scores.append(roc_auc_score(valid_data[1], y_pred))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
        raise optuna.TrialPruned()

    if return_models:
        return result, models
    return result

In [None]:
study = optuna.create_study(direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=10, max_resource=5000, reduction_factor=10
    ),
)
study.optimize(objective,
    n_trials=20,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2023-11-19 12:19:01,714] A new study created in memory with name: no-name-b4bcb796-5f1e-4f84-a6d5-82776a69fb83


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2023-11-19 12:20:06,819] Trial 0 finished with value: 0.7088437660170742 and parameters: {'n_estimators': 344, 'max_depth': 4, 'min_samples_split': 92, 'min_samples_leaf': 40}. Best is trial 0 with value: 0.7088437660170742.
[I 2023-11-19 12:20:17,934] Trial 3 finished with value: 0.7176167729536489 and parameters: {'n_estimators': 653, 'max_depth': 44, 'min_samples_split': 91, 'min_samples_leaf': 27}. Best is trial 1 with value: 0.7202651984551381.
[I 2023-11-19 12:20:27,134] Trial 1 finished with value: 0.7154678921844129 and parameters: {'n_estimators': 327, 'max_depth': 36, 'min_samples_split': 47, 'min_samples_leaf': 47}. Best is trial 1 with value: 0.7154678921844129.
[I 2023-11-19 12:20:51,076] Trial 2 finished with value: 0.7183056838602223 and parameters: {'n_estimators': 921, 'max_depth': 20, 'min_samples_split': 75, 'min_samples_leaf': 15}. Best is trial 1 with value: 0.7202651984551381.
[I 2023-11-19 12:21:38,622] Trial 2 finished with value: 0.7186408048721279 and param

In [None]:
study.best_value, study.best_params

(0.7231896564804511,
 {'n_estimators': 998,
  'max_depth': 47,
  'min_samples_split': 13,
  'min_samples_leaf': 2})

In [None]:
rf_best_params = {
    'n_estimators': 998,
    'max_depth': 47,
    'min_samples_split': 13,
    'min_samples_leaf': 2
}

# Catboost

In [None]:
def fit_catboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    param = {
        "iterations": 1000, # Можно не перебирать, есть Early-Stopping
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["SqrtBalanced", "Balanced"]),
        "depth": trial.suggest_int("depth", 3, 9),

        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "eval_metric": "AUC", # Тоже стоит заранее определиться

        "objective": trial.suggest_categorical("objective", ["Logloss"]),
    }


    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)

    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)


    clf = CatBoostClassifier(
        **param,
        thread_count=-1,
        random_seed=42,
        cat_features=cat_features,
    )

    # pruning_callback = CatBoostPruningCallback(trial, "AUC")

    clf.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=0,
        plot=False,
        early_stopping_rounds=100,
        # callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    # pruning_callback.check_pruned()

    y_pred = clf.predict_proba(X_val)[:, 1]
    return clf, y_pred

In [None]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = KFold(n_splits=n_splits)

    scores, models = [], []

    for train_idx, valid_idx in kf.split(X_train):
        train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
        scores.append(roc_auc_score(valid_data[1], y_pred))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
        raise optuna.TrialPruned()

    if return_models:
        return result, models
    return result

In [None]:
study = optuna.create_study(direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=10, max_resource=5000, reduction_factor=10
    ),
)
study.optimize(objective,
    n_trials=50,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2023-11-19 13:42:30,201] A new study created in memory with name: no-name-988479e4-2d59-4e71-b0e8-572968a0046a


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2023-11-19 13:45:00,038] Trial 0 finished with value: 0.810410684946989 and parameters: {'learning_rate': 0.2662019732546946, 'l2_leaf_reg': 48, 'colsample_bylevel': 0.4637392219693674, 'min_data_in_leaf': 40, 'auto_class_weights': 'Balanced', 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'objective': 'Logloss', 'bagging_temperature': 7.842498459698131}. Best is trial 0 with value: 0.810410684946989.
[I 2023-11-19 13:45:32,178] Trial 1 finished with value: 0.8138362264548897 and parameters: {'learning_rate': 0.08476752277907851, 'l2_leaf_reg': 44, 'colsample_bylevel': 0.7246361801490298, 'min_data_in_leaf': 82, 'auto_class_weights': 'Balanced', 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'objective': 'Logloss', 'bagging_temperature': 6.163743020376646}. Best is trial 1 with value: 0.8138362264548897.
[I 2023-11-19 13:46:37,199] Trial 3 finished with value: 0.8047311573207656 and parameters: {'learning_rate': 0.003482127934485393, 'l2

KeyboardInterrupt: ignored

In [None]:
study.best_params

{'learning_rate': 0.045738501890857826,
 'l2_leaf_reg': 48,
 'colsample_bylevel': 0.5954013003287684,
 'min_data_in_leaf': 81,
 'auto_class_weights': 'Balanced',
 'depth': 9,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'objective': 'Logloss'}

# XGBoost

In [None]:
xgb_params = {
    'colsample_bylevel': 0.8926654200543236,
    'colsample_bytree': 0.8521003717531913,
    'colsample_bynode': 0.7175154529959948,
    'max_depth': 8,
    'max_bin': 104,
    'gamma': 3,
    'lambda': 75,
    'booster': 'dart',
}

In [None]:
X_train

Unnamed: 0,neg_amount_count,pos_amount_count,pos_amount_sum,neg_amount_sum,amount_mean,amount_std,amount_max,amount_min,amount_median,minutes_mean,seconds_mean,weighted_mcc,term_id_most_frequent,trans_city_most_frequent
0,472,32,590428.81,-888203.30,-590.822401,7291.653648,42665.12,-39772.06,-220.100,30.476190,29.835317,0.434680,27,6
1,258,9,21284.88,-74627.84,-199.786367,1213.397340,15909.56,-9400.52,-117.710,27.925094,31.071161,0.454215,27,4
2,166,25,49533.81,-62930.80,-70.141309,1196.681764,5423.80,-5424.42,-125.140,30.764398,30.104712,0.462549,27,2
3,664,301,310856.26,-493948.66,-189.733057,2110.673634,20247.50,-36518.54,-146.620,28.483938,29.649741,0.512688,487,0
4,167,39,48759.24,-232812.69,-893.463350,2853.228105,3616.52,-15907.79,-109.965,29.388350,31.592233,0.431898,27,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7555,725,231,991319.74,-1523362.06,-556.529623,4746.078273,36157.09,-37944.73,-328.225,29.120293,30.669456,0.433318,27,4
7556,337,12,43458.65,-205199.47,-463.440745,1539.874884,10847.20,-10756.68,-217.550,28.318052,30.303725,0.447343,27,9
7557,162,13,55680.60,-313102.43,-1470.981886,5207.843362,7231.79,-36156.56,-145.330,29.565714,30.977143,0.444224,27,8
7558,82,34,120112.95,-68802.34,442.332845,4283.052479,25308.26,-23639.09,-65.930,31.767241,27.724138,0.449289,27,9


In [None]:
kf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

scores, models = [], []

for train_idx, valid_idx in kf.split(X_train):
    train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
    valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]

    train_dataset = xgb.DMatrix(
        train_data[0],
        label=train_data[1],
        nthread=-1,
        enable_categorical=True,
    )

    eval_dataset = xgb.DMatrix(
        valid_data[0],
        label=valid_data[1],
        nthread=-1,
        enable_categorical=True,
    )

    model = xgb.train(
        params=xgb_params,
        num_boost_round=100,
        dtrain=train_dataset,
        evals=[(train_dataset, 'dtrain'), (eval_dataset, 'dtest')],
        maximize=True,
        verbose_eval=False,
        early_stopping_rounds=100
    )

    y_pred = model.predict_proba(eval_dataset)[:, 1]
    # print(y_pred)

    score = roc_auc_score(valid_data[1], y_pred)
    print(score)
    scores.append(score)
    models.append(model)
result = np.mean(scores) - np.std(scores)

0.7931480746327908
0.8148563269676958
0.8150625054637644
0.7979368932038835
0.8127000238307487
0.8057155244047833
0.8135207586132899
0.8151238384623538
0.8048346276185211
0.7930512337796889


In [None]:
print(result)
print(scores)

0.7979867560683549
[0.7931480746327908, 0.8148563269676958, 0.8150625054637644, 0.7979368932038835, 0.8127000238307487, 0.8057155244047833, 0.8135207586132899, 0.8151238384623538, 0.8048346276185211, 0.7930512337796889]


# LightGBM

In [None]:
lgbm_params = {
    'lambda_l1': 5.443870803445866,
    'lambda_l2': 9.63131390497079,
    'num_leaves': 15,
    'feature_fraction': 0.8828215533898842,
    'bagging_fraction': 0.7141695226697752,
    'bagging_freq': 3,
    'min_child_samples': 34,
    'learning_rate': 0.0068789842499822765
}

In [None]:
kf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

scores, models = [], []

for train_idx, valid_idx in kf.split(X_train):
    train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
    valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]
    # Подаем trials для перебора
    model = LGBMClassifier(**lgbm_params).fit(X_train, y_train)
    y_pred = model.predict(valid_data[0])[:, 1]
    score = roc_auc_score(y_pred, valid_data[1])
    print(score)
    scores.append(score)
    models.append(model)
result = np.mean(scores) - np.std(scores)

[LightGBM] [Info] Number of positive: 3361, number of negative: 4199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000747 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3325
[LightGBM] [Info] Number of data points in the train set: 7560, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444577 -> initscore=-0.222608
[LightGBM] [Info] Start training from score -0.222608
0.7267523775977457
[LightGBM] [Info] Number of positive: 3361, number of negative: 4199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3325
[LightGBM] [Info] Number of data points in the train set: 7560, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444577 -> initscore=-0.222608
[LightGBM] [Info] Start training from score -0.222608

In [None]:
print(scores)
print(result)

[0.7267523775977457, 0.752881347876714, 0.7514880952380953, 0.7391843161649642, 0.7463194872234742, 0.7486837355896522, 0.7463452128473251, 0.7419832839399289, 0.7405259141053216, 0.7381891118605406]
0.7359380722010305


# Catboost

In [None]:
catboost_features = {
    'learning_rate': 0.226604471656182,
    'l2_leaf_reg': 39,
    'colsample_bylevel': 0.16426127215861697,
    'min_data_in_leaf': 81,
    'auto_class_weights': 'SqrtBalanced',
    'depth': 5,
    'boosting_type': 'Plain',
    'bootstrap_type': 'MVS',
    'objective': 'Logloss',
    'random_seed': 42,
}

In [None]:
model = CatBoostClassifier(**catboost_features, cat_features=cat_features).fit(X_train, y_train)

0:	learn: 0.6529862	total: 10.3ms	remaining: 10.3s
1:	learn: 0.6220115	total: 17.7ms	remaining: 8.85s
2:	learn: 0.5879309	total: 27.7ms	remaining: 9.21s
3:	learn: 0.5684437	total: 35.9ms	remaining: 8.93s
4:	learn: 0.5590253	total: 44.5ms	remaining: 8.85s
5:	learn: 0.5501777	total: 52.7ms	remaining: 8.73s
6:	learn: 0.5441567	total: 62ms	remaining: 8.79s
7:	learn: 0.5390912	total: 69.5ms	remaining: 8.62s
8:	learn: 0.5351888	total: 77ms	remaining: 8.48s
9:	learn: 0.5330466	total: 86.8ms	remaining: 8.59s
10:	learn: 0.5314756	total: 94.6ms	remaining: 8.51s
11:	learn: 0.5300759	total: 103ms	remaining: 8.44s
12:	learn: 0.5286989	total: 110ms	remaining: 8.37s
13:	learn: 0.5282116	total: 118ms	remaining: 8.32s
14:	learn: 0.5255920	total: 129ms	remaining: 8.47s
15:	learn: 0.5239941	total: 137ms	remaining: 8.42s
16:	learn: 0.5220428	total: 145ms	remaining: 8.36s
17:	learn: 0.5195260	total: 152ms	remaining: 8.29s
18:	learn: 0.5182748	total: 166ms	remaining: 8.55s
19:	learn: 0.5173101	total: 174ms	

In [None]:
kf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=42)

scores, models = [], []

for train_idx, valid_idx in kf.split(X_train):
    train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
    valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]
    # Подаем trials для перебора
    y_pred = model.predict_proba(valid_data[0])[:, 1]
    score = roc_auc_score(valid_data[1], y_pred)
    print(score)
    scores.append(score)
    models.append(model)
result = np.mean(scores) - np.std(scores)

0.9913245997088792
0.9954256648224167
0.99619811207987
0.9960805204335064
0.9943547604127133


In [None]:
print(scores)
print(result)

[0.9861602928851837, 0.9902284490451544, 0.9897646968569751, 0.9887548684804697, 0.9894262085278775]
0.9874308701397762


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_tr, X_ev, y_tr, y_ev = train_test_split(X_train, y_train)

In [None]:
model = CatBoostClassifier(**catboost_features, cat_features=cat_features).fit(X_tr, y_tr)

0:	learn: 0.6856581	total: 13.9ms	remaining: 13.9s
1:	learn: 0.6793910	total: 20.9ms	remaining: 10.4s
2:	learn: 0.6374057	total: 26.4ms	remaining: 8.78s
3:	learn: 0.6010253	total: 34.1ms	remaining: 8.5s
4:	learn: 0.5735907	total: 39.7ms	remaining: 7.89s
5:	learn: 0.5577118	total: 47.4ms	remaining: 7.85s
6:	learn: 0.5505628	total: 52ms	remaining: 7.38s
7:	learn: 0.5446171	total: 58.4ms	remaining: 7.24s
8:	learn: 0.5434422	total: 65ms	remaining: 7.16s
9:	learn: 0.5371683	total: 72.9ms	remaining: 7.21s
10:	learn: 0.5353858	total: 77.6ms	remaining: 6.97s
11:	learn: 0.5342040	total: 84.3ms	remaining: 6.94s
12:	learn: 0.5298193	total: 90.5ms	remaining: 6.87s
13:	learn: 0.5264144	total: 106ms	remaining: 7.48s
14:	learn: 0.5251891	total: 111ms	remaining: 7.31s
15:	learn: 0.5228227	total: 124ms	remaining: 7.66s
16:	learn: 0.5210416	total: 131ms	remaining: 7.57s
17:	learn: 0.5204642	total: 138ms	remaining: 7.52s
18:	learn: 0.5192201	total: 147ms	remaining: 7.6s
19:	learn: 0.5181709	total: 174ms	

In [None]:
roc_auc_score(y_ev, model.predict_proba(X_ev)[:, 1])

0.7860740228138258

In [None]:
for feature_name, feature_import in zip(model.feature_importances_, model.feature_names_):
    if feature_name > 1:
        print(f"Feature {feature_import}, importance: {feature_name}")

Feature 2.9526995951389288, importance: neg_amount_count
Feature 3.618775529319729, importance: pos_amount_count
Feature 2.948722535280087, importance: pos_amount_sum
Feature 4.564079711311852, importance: neg_amount_sum
Feature 3.974683642071701, importance: amount_mean
Feature 4.117748620368381, importance: amount_std
Feature 4.310646755467562, importance: amount_max
Feature 4.352506579708487, importance: amount_min
Feature 4.497444307750111, importance: amount_median
Feature 4.274606958266739, importance: minutes_mean
Feature 4.958709981733363, importance: seconds_mean
Feature 19.08085898985538, importance: weighted_mcc
Feature 4.496961046967982, importance: weighted_type
Feature 5.427116031427781, importance: weighted_term_id
Feature 2.37093618123751, importance: tr_count
Feature 3.2331231098861504, importance: tr_weekend_count
Feature 4.5086651098595585, importance: tr_dinner_count
Feature 3.7741739771184752, importance: tr_evening_count
Feature 2.794277380728271, importance: tr_2

In [None]:
import pickle
pickle.dump(model, open("model.pkl", "wb"))

# ExtraTrees

In [None]:
extra_trees_params = {
    "n_estimators": 4380,
    "max_depth": 24,
    "min_samples_leaf": 3,
}

In [None]:
imputer = SimpleImputer(strategy="constant", fill_value=0)

X_train = imputer.fit_transform(X_train)

In [None]:
kf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

scores, models = [], []

for train_idx, valid_idx in kf.split(X_train):
    train_data = X_train[train_idx, :], y_train[train_idx]
    valid_data = X_train[valid_idx, :], y_train[valid_idx]
    # Подаем trials для перебора
    model = ExtraTreesClassifier(**extra_trees_params).fit(train_data[0], train_data[1])
    y_pred = model.predict_proba(valid_data[0])[:, 1]
    score = roc_auc_score(valid_data[1], y_pred)
    print(score)
    scores.append(score)
    models.append(model)
result = np.mean(scores) - np.std(scores)

0.7824198315028009
0.7952239871497413
0.8087477319802823
0.7865559381350193
0.7972656422386781


KeyboardInterrupt: 

# Pipeline

1. Pipeline для num_features
2. Pipeline для cat_features
3. Pipeline для Nan

1. CatBoost
2. XGBoost
3. LightGBM
4. ExtraTrees
5. RandomForest

meta - LogisticRegression

In [None]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Добавим визуализации
import sklearn
sklearn.set_config(display='diagram')

import warnings
warnings.filterwarnings("ignore")

In [None]:
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)),
    ("imputer", SimpleImputer(strategy='most_frequent')),

])
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler())
])

In [None]:
# соединим два предыдущих трансформера в один
preprocessor = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, num_features),
    ("categorical", categorical_transformer, cat_features)])
preprocessor.set_output(transform="pandas")
preprocessor

In [None]:
extra_trees_params = {
    "n_estimators": 4380,
    "max_depth": 24,
    "min_samples_leaf": 3,
}
# catboost_params = {
#     'learning_rate': 0.226604471656182,
#     'l2_leaf_reg': 39,
#     'colsample_bylevel': 0.16426127215861697,
#     'min_data_in_leaf': 81,
#     'auto_class_weights': 'SqrtBalanced',
#     'depth': 5,
#     'boosting_type': 'Plain',
#     'bootstrap_type': 'MVS',
#     'objective': 'Logloss',
#     'random_seed': 42,
# }
catboost_params = {
    'learning_rate': 0.008739564590258282,
    'l2_leaf_reg': 49,
    'colsample_bylevel': 0.6656708453336319,
    'min_data_in_leaf': 54,
    'auto_class_weights': 'SqrtBalanced',
    'depth': 4,
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bayesian',
    'objective': 'Logloss',
    'bagging_temperature': 0.11891332172234925
}
lgbm_params = {
    'lambda_l1': 2.749488771001674,
    'lambda_l2': 5.8737806902383455e-05,
    'num_leaves': 125,
    'feature_fraction': 0.6708654362970757,
    'bagging_fraction': 0.5659539966706881,
    'bagging_freq': 5,
    'min_child_samples': 58,
    'learning_rate': 0.004057973679075833,
    'min_data_in_leaf': 23,
    'path_smooth': 1.284238260658831,
    'n_estimators': 899
}
xgb_params = {
    'colsample_bylevel': 0.8926654200543236,
    'colsample_bytree': 0.8521003717531913,
    'colsample_bynode': 0.7175154529959948,
    'max_depth': 8,
    'max_bin': 104,
    'gamma': 3,
    'lambda': 75,
    'booster': 'dart',
}
rf_params = {
    'n_estimators': 998,
    'max_depth': 47,
    'min_samples_split': 13,
    'min_samples_leaf': 2
}
lg_params = {
    'penalty': 'l2',
    'C': 7.128139370296911,
    'tol': 0.08796358953092902,
    'l1_ratio': 0.11849176610029058,
    'solver': 'saga',
    'fit_intercept': True
}
rg_params = {
    'max_iter': 1121,
    'alpha': 1.6941317331050193e-05,
    'solver': 'auto'
}
ada_params = {
    'n_estimators': 1597,
    'algorithm': 'SAMME.R',
    'learning_rate': 0.029294240902587396
}

In [None]:
lgbm_model = LGBMClassifier(**lgbm_params)
cat_model = CatBoostClassifier(**catboost_params, cat_features=cat_features, verbose=0)

In [None]:
# список базовых моделей
estimators = [
    ("ExtraTrees",  make_pipeline(preprocessor, ExtraTreesClassifier(**extra_trees_params, n_jobs=-1,))),
    # ("Ridge", make_pipeline(preprocessor, RidgeClassifier(**rg_params))),

    ("XGBoost", make_pipeline(preprocessor, XGBClassifier(**xgb_params))),
    ("LightGBM", make_pipeline(preprocessor, lgbm_model)),
    # ("CatBoost", cat_model),

    ("AdaBoost", make_pipeline(preprocessor, AdaBoostClassifier(**ada_params)))

    # То, что не дало прироста в ансамбле
    # ("SVM", make_pipeline(preprocessor, LinearSVC(verbose=False))),
    # ("MLP", make_pipeline(preprocessor, MLPClassifier(verbose=False, hidden_layer_sizes=(100, 30, ), alpha=0.001,random_state=75, max_iter = 1300, ))),
    # ("Random_forest",  make_pipeline(preprocessor, RandomForestClassifier(**rf_params, n_jobs=-1))),
    # ("Logit", make_pipeline(preprocessor, LogisticRegression(**lg_params))),
]

# в качестве мета-модели будем использовать LogisticRegression
meta_model = VotingClassifier(
    estimators=estimators,
    voting='soft',
    n_jobs=-1,
    verbose=False,
)
meta_model

In [None]:
meta_model.fit(X_train, y_train)

In [None]:
list(X_train.columns)

['neg_amount_count',
 'pos_amount_count',
 'pos_amount_sum',
 'neg_amount_sum',
 'amount_mean',
 'amount_std',
 'amount_max',
 'amount_min',
 'amount_median',
 'minutes_mean',
 'seconds_mean',
 'weighted_mcc',
 'weighted_type',
 'weighted_term_id',
 'tr_count',
 'tr_weekend_count',
 'tr_dinner_count',
 'tr_evening_count',
 'tr_22h',
 'tr_2day',
 'trans_city_most_frequent',
 'trans_type_0',
 'trans_type_1',
 'trans_type_2',
 'trans_type_3',
 'trans_type_4',
 'trans_type_5',
 'trans_type_6',
 'trans_type_7',
 'trans_type_8',
 'trans_type_9',
 'trans_type_10',
 'trans_type_11',
 'trans_type_12',
 'trans_type_13',
 'trans_type_14',
 'trans_type_15',
 'trans_type_16',
 'trans_type_17',
 'trans_type_18',
 'trans_type_19',
 'trans_type_20',
 'trans_type_21',
 'trans_type_22',
 'trans_type_23',
 'trans_type_24',
 'trans_type_25',
 'trans_type_26',
 'trans_type_27',
 'trans_type_28',
 'trans_type_29',
 'trans_type_30',
 'trans_type_31',
 'trans_type_32',
 'trans_type_33',
 'trans_type_34',
 'tr

In [None]:
import pickle

In [None]:
pickle.dump(meta_model, open("model_voting.pkl", "wb"))

In [None]:
kf = KFold(n_splits=5)

scores = []

for train_idx, valid_idx in kf.split(X_train):
    train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
    valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]
    # Подаем trials для перебора
    y_pred = meta_model.predict_proba(valid_data[0])[:, 1];
    score = roc_auc_score(valid_data[1], y_pred)
    print(score)
    scores.append(score)
result = np.mean(scores) - np.std(scores)

In [None]:
print(result)
print(scores)

0.8974997352340396
[0.9093787149240223, 0.8942440194241194, 0.9010563203376207, 0.9058990536277601, 0.902328448807322]


In [None]:
y_train

Unnamed: 0,neg_amount_count,pos_amount_count,pos_amount_sum,neg_amount_sum,amount_mean,amount_std,amount_max,amount_min,amount_median,minutes_mean,seconds_mean,weighted_mcc,term_id_most_frequent,trans_city_most_frequent
3050,1896,574,2328279.20,-1896174.26,174.941271,3853.181754,25309.85,-43386.95,-143.810,29.652632,29.082186,0.457520,1,Kaliningrad
6721,122,1,3616.32,-357358.87,-2875.955691,5573.032654,3616.32,-36155.08,-722.250,31.447154,28.471545,0.463940,1,Vladimir
2370,52,21,119611.04,-182009.44,-854.772603,6742.940325,21694.66,-21693.31,-181.260,31.739726,32.287671,0.445110,1,Tver
723,540,205,207253.09,-387717.29,-242.233826,1721.596620,14462.42,-14607.24,-144.890,29.712752,29.828188,0.453442,1,Penza
651,444,58,155586.58,-126404.08,58.132470,1389.045804,7231.47,-7158.45,-40.410,29.191235,29.364542,0.454848,1,Khabarovsk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5986,14,0,0.00,-209834.35,-14988.167857,16642.477412,-361.01,-36157.08,-3615.505,26.785714,25.071429,0.444277,1,Saint Petersburg
971,599,30,117036.81,-226300.07,-173.709475,1280.396137,10846.99,-10845.80,-154.610,30.310016,29.934817,0.441675,1,Kazan
3104,614,45,181698.22,-1032372.66,-1290.856510,6103.796271,21693.84,-72312.54,-209.520,30.251897,29.620637,0.440503,1,Moscow
1480,331,60,114885.62,-104121.69,27.529233,1120.765841,7230.76,-5061.39,-142.510,27.849105,29.762148,0.448590,1,Kaliningrad


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)

In [None]:
stacking_classifier = meta_model
# stacking_classifier.fit(X_train, y_train)

In [None]:
pickle.dump(meta_model, open("model_voting.pkl", "wb"))

In [None]:
corr_df = pd.DataFrame()

for model, (name, _) in zip(stacking_classifier.estimators_, stacking_classifier.estimators):
    y_pred = model.fit(X_train, y_train).predict(X_val)
    print(name, 'roc_auc_score: ', round(roc_auc_score(y_val, y_pred), 4))

    corr_df[name] = model.predict(X_val)

ExtraTrees roc_auc_score:  0.64
XGBoost roc_auc_score:  0.7511
[LightGBM] [Info] Number of positive: 2521, number of negative: 3149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5069
[LightGBM] [Info] Number of data points in the train set: 5670, number of used features: 65
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444621 -> initscore=-0.222429
[LightGBM] [Info] Start training from score -0.222429
LightGBM roc_auc_score:  0.7512
AdaBoost roc_auc_score:  0.748


In [None]:
corr_df.corr().style.background_gradient(cmap="RdYlGn")

Unnamed: 0,ExtraTrees,XGBoost,LightGBM,AdaBoost
ExtraTrees,1.0,0.474431,0.490771,0.509379
XGBoost,0.474431,1.0,0.885425,0.839326
LightGBM,0.490771,0.885425,1.0,0.8371
AdaBoost,0.509379,0.839326,0.8371,1.0
