In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

from sklearn.model_selection import (train_test_split, cross_val_score, cross_val_predict, cross_validate,
                                    GridSearchCV, RandomizedSearchCV, KFold)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix


import lightgbm as lgb


import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling: seaborn theme plus the 'husl' colour palette.
# NOTE(review): sns.set() is called twice in a row; the second, argument-less
# call looks redundant -- confirm and keep only one.
sns.set(color_codes=True)
sns.set()
sns.set_palette('husl')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import warnings
# Silences every warning for the rest of the session; this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Raise the pandas display limits so frames with many columns (59 in the
# training data loaded below) are not truncated when displayed.
pd.options.display.max_rows=100
pd.options.display.max_columns=250

### Чтение и обзор данных

In [2]:
def read_data(path):
    """
    Read a CSV file into a DataFrame and print its shape.

    Parameters
    ----------
    path: str or file-like
        Location of the CSV; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    frame: pandas.DataFrame
        The parsed table.

    """
    frame = pd.read_csv(path)
    n_rows, n_cols = frame.shape
    print(f'Shape: {(n_rows, n_cols)}')
    return frame

In [3]:
# Load the training set (read_data prints its shape); the bare last
# expression displays the first three rows.
train_df = read_data('data/train.csv')
train_df.head(3)

Shape: (595212, 59)


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.71807,10,1,-1,0,1,4,1,0,0,1,12,2,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,11,1,-1,0,-1,11,1,1,2,1,19,3,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,-1.0,7,1,-1,0,-1,14,1,1,2,1,60,1,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0


In [4]:
# Load the test set (read_data prints its shape); the bare last
# expression displays the first three rows.
test_df = read_data('data/test.csv')
test_df.head(3)

Shape: (892816, 58)


Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,1,8,1,0,0,1,0,0,0,0,0,0,0,12,1,0,0,0.5,0.3,0.610328,7,1,-1,0,-1,1,1,1,2,1,65,1,0.316228,0.669556,0.352136,3.464102,0.1,0.8,0.6,1,1,6,3,6,2,9,1,1,1,12,0,1,1,0,0,1
1,1,4,2,5,1,0,0,0,0,1,0,0,0,0,0,5,1,0,0,0.9,0.5,0.771362,4,1,-1,0,0,11,1,1,0,1,103,1,0.316228,0.60632,0.358329,2.828427,0.4,0.5,0.4,3,3,8,4,10,2,7,2,0,3,10,0,0,1,1,0,1
2,2,5,1,3,0,0,0,0,0,1,0,0,0,0,0,10,0,0,0,0.4,0.0,0.916174,11,1,-1,0,-1,14,1,1,2,1,29,3,0.4,0.896239,0.398497,3.316625,0.6,0.6,0.6,2,3,7,4,6,3,12,4,0,2,4,0,0,0,0,0,0


### Списки признаков

In [5]:
# Feature groups are derived purely from fragments of the column names:
# 'cat' / 'bin' select categorical / binary columns, while 'ind', 'reg',
# 'car' and 'calc' select the feature family. Column order follows train_df.
category_features = [col for col in train_df.columns if 'cat' in col]
category_features_car = [col for col in category_features if 'car' in col]
category_features_ind = [col for col in category_features if 'ind' in col]

binary_features = [col for col in train_df.columns if 'bin' in col]
binary_features_ind = [col for col in binary_features if 'ind' in col]
binary_features_calc = [col for col in binary_features if 'calc' in col]

# Numeric = everything that is not categorical, binary, the id or the target.
numeric_features = [
    col for col in train_df.columns
    if col not in category_features + binary_features + ['id', 'target']
]

numeric_features_ind = [col for col in numeric_features if 'ind' in col]
numeric_features_reg = [col for col in numeric_features if 'reg' in col]
numeric_features_calc = [col for col in numeric_features if 'calc' in col]
numeric_features_car = [col for col in numeric_features if 'car' in col]

In [6]:
# Columns fed to the model: all feature groups except the two 'calc' ones
# (binary_features_calc and numeric_features_calc are not included).
features = (
    category_features_car
    + category_features_ind
    + binary_features_ind
    + numeric_features_ind
    + numeric_features_reg
    + numeric_features_car
)

### Кросс-валидация

In [7]:
def lightgbm_cross_validation(params, X, y, cv, categorical = None):
    """
    Cross-validate a LightGBM classifier and collect out-of-fold predictions.

    For every split produced by ``cv.split(X, y)`` a fresh
    ``lgb.LGBMClassifier(**params)`` is fitted on the training part, with the
    validation part passed as ``eval_set`` and ``"auc"`` as the eval metric.
    The class-1 probabilities predicted for the validation part are written
    into the OOF vector and scored with ``roc_auc_score``. Progress and the
    per-fold scores are printed.

    Parameters
    ----------
    params: dict
        Model hyperparameters, unpacked into ``lgb.LGBMClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training. Rows are selected by position,
        so the frame may carry any index.

    y: pandas.core.frame.Series
        Target vector, aligned row by row with ``X``. A plain array that
        supports integer-array indexing is accepted as well.

    cv: KFold or StratifiedKFold generator.
        KFold / StratifiedKFold object that defines the
        cross-validation strategy.

    categorical: list, optional, default = None
        Categorical features, forwarded to ``fit`` as
        ``categorical_feature``. Optional, not used by default.

    Returns
    -------
    estimators: list
        Fitted models, one per fold, in fold order.

    oof_preds: np.array
        Vector of out-of-fold predictions (one value per row of ``X``).

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    # LightGBM 4.0 removed the `verbose` argument of fit() in favour of the
    # log_evaluation callback; older releases only understand `verbose`.
    if hasattr(lgb, "log_evaluation"):
        logging_kwargs = {"callbacks": [lgb.log_evaluation(period=100)]}
    else:
        logging_kwargs = {"verbose": 100}

    y_is_pandas = hasattr(y, "iloc")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields *positional* indices, so rows must be taken with
        # iloc; label-based .loc / [] only coincides with positions when the
        # index is the default 0..n-1 range.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        if y_is_pandas:
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        else:
            y_train, y_valid = y[train_idx], y[valid_idx]

        # No early stopping is configured: every fold trains the full number
        # of boosting rounds given in `params`.
        model = lgb.LGBMClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="auc",
            categorical_feature=categorical,
            **logging_kwargs
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 3)}")
        folds_scores.append(round(score, 3))
        estimators.append(model)
        print("*"*65)
        print()

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

# Hyperparameters handed to lightgbm_cross_validation (unpacked there into
# lgb.LGBMClassifier).
# NOTE(review): LightGBM documents `subsample` (bagging_fraction) as taking
# effect only when `subsample_freq` > 0; no frequency is set here -- confirm
# whether bagging is actually intended.
lgbm_params = {
    'random_state': 42, 
    'n_jobs': -1, 
    'n_estimators': 7500, 
    'learning_rate': 0.001, 
    'min_child_weight': 5.325007906385521, 
    'num_leaves': 31, 
    'subsample': 0.9678104205595882
}



# 7 shuffled folds with a fixed seed.
# NOTE(review): plain KFold does not stratify on the target; if the classes
# are imbalanced, StratifiedKFold may give steadier fold scores -- confirm.
cv = KFold(n_splits=7, random_state=27, shuffle=True)

In [8]:
%%time
# Run the cross-validation on the selected feature columns; returns the
# per-fold models and the out-of-fold predictions for the training rows.
estimators, oof_preds = lightgbm_cross_validation(
    lgbm_params, train_df[features], train_df['target'], cv=cv
)

Sat Mar 13 14:28:16 2021, Cross-Validation, 595212 rows, 37 cols
[100]	valid_0's auc: 0.611669	valid_0's binary_logloss: 0.155394
[200]	valid_0's auc: 0.614155	valid_0's binary_logloss: 0.154905
[300]	valid_0's auc: 0.615989	valid_0's binary_logloss: 0.154518
[400]	valid_0's auc: 0.617553	valid_0's binary_logloss: 0.154199
[500]	valid_0's auc: 0.619011	valid_0's binary_logloss: 0.153931
[600]	valid_0's auc: 0.620128	valid_0's binary_logloss: 0.153693
[700]	valid_0's auc: 0.621383	valid_0's binary_logloss: 0.153474
[800]	valid_0's auc: 0.622611	valid_0's binary_logloss: 0.153277
[900]	valid_0's auc: 0.623731	valid_0's binary_logloss: 0.153105
[1000]	valid_0's auc: 0.62433	valid_0's binary_logloss: 0.152965
[1100]	valid_0's auc: 0.62457	valid_0's binary_logloss: 0.152848
[1200]	valid_0's auc: 0.624907	valid_0's binary_logloss: 0.152744
[1300]	valid_0's auc: 0.625246	valid_0's binary_logloss: 0.152654
[1400]	valid_0's auc: 0.625616	valid_0's binary_logloss: 0.15257
[1500]	valid_0's auc: 0

[4900]	valid_0's auc: 0.64392	valid_0's binary_logloss: 0.155666
[5000]	valid_0's auc: 0.643928	valid_0's binary_logloss: 0.155664
[5100]	valid_0's auc: 0.643962	valid_0's binary_logloss: 0.155659
[5200]	valid_0's auc: 0.64397	valid_0's binary_logloss: 0.155655
[5300]	valid_0's auc: 0.643987	valid_0's binary_logloss: 0.15565
[5400]	valid_0's auc: 0.644006	valid_0's binary_logloss: 0.155645
[5500]	valid_0's auc: 0.644014	valid_0's binary_logloss: 0.155642
[5600]	valid_0's auc: 0.644014	valid_0's binary_logloss: 0.155639
[5700]	valid_0's auc: 0.644008	valid_0's binary_logloss: 0.155637
[5800]	valid_0's auc: 0.644005	valid_0's binary_logloss: 0.155636
[5900]	valid_0's auc: 0.644029	valid_0's binary_logloss: 0.155633
[6000]	valid_0's auc: 0.644054	valid_0's binary_logloss: 0.15563
[6100]	valid_0's auc: 0.644076	valid_0's binary_logloss: 0.155628
[6200]	valid_0's auc: 0.644095	valid_0's binary_logloss: 0.155625
[6300]	valid_0's auc: 0.64413	valid_0's binary_logloss: 0.155622
[6400]	valid_0'

[2100]	valid_0's auc: 0.636192	valid_0's binary_logloss: 0.151127
[2200]	valid_0's auc: 0.636543	valid_0's binary_logloss: 0.151083
[2300]	valid_0's auc: 0.636875	valid_0's binary_logloss: 0.151041
[2400]	valid_0's auc: 0.637142	valid_0's binary_logloss: 0.151003
[2500]	valid_0's auc: 0.637586	valid_0's binary_logloss: 0.150958
[2600]	valid_0's auc: 0.637928	valid_0's binary_logloss: 0.150922
[2700]	valid_0's auc: 0.638227	valid_0's binary_logloss: 0.150889
[2800]	valid_0's auc: 0.638529	valid_0's binary_logloss: 0.150859
[2900]	valid_0's auc: 0.638839	valid_0's binary_logloss: 0.150829
[3000]	valid_0's auc: 0.639142	valid_0's binary_logloss: 0.150801
[3100]	valid_0's auc: 0.639371	valid_0's binary_logloss: 0.150778
[3200]	valid_0's auc: 0.639599	valid_0's binary_logloss: 0.150756
[3300]	valid_0's auc: 0.639795	valid_0's binary_logloss: 0.150736
[3400]	valid_0's auc: 0.640002	valid_0's binary_logloss: 0.150717
[3500]	valid_0's auc: 0.640243	valid_0's binary_logloss: 0.150697
[3600]	val

[7000]	valid_0's auc: 0.644396	valid_0's binary_logloss: 0.153189
[7100]	valid_0's auc: 0.644427	valid_0's binary_logloss: 0.153186
[7200]	valid_0's auc: 0.644458	valid_0's binary_logloss: 0.153183
[7300]	valid_0's auc: 0.644489	valid_0's binary_logloss: 0.15318
[7400]	valid_0's auc: 0.644528	valid_0's binary_logloss: 0.153177
[7500]	valid_0's auc: 0.644562	valid_0's binary_logloss: 0.153173
Fold 5, Valid score = 0.645
*****************************************************************

[100]	valid_0's auc: 0.616934	valid_0's binary_logloss: 0.154673
[200]	valid_0's auc: 0.618826	valid_0's binary_logloss: 0.154199
[300]	valid_0's auc: 0.620638	valid_0's binary_logloss: 0.153813
[400]	valid_0's auc: 0.621969	valid_0's binary_logloss: 0.153487
[500]	valid_0's auc: 0.622837	valid_0's binary_logloss: 0.153216
[600]	valid_0's auc: 0.623944	valid_0's binary_logloss: 0.15298
[700]	valid_0's auc: 0.624838	valid_0's binary_logloss: 0.152778
[800]	valid_0's auc: 0.626172	valid_0's binary_logloss: 

[4200]	valid_0's auc: 0.641087	valid_0's binary_logloss: 0.148192
[4300]	valid_0's auc: 0.641227	valid_0's binary_logloss: 0.148179
[4400]	valid_0's auc: 0.641326	valid_0's binary_logloss: 0.148168
[4500]	valid_0's auc: 0.641382	valid_0's binary_logloss: 0.14816
[4600]	valid_0's auc: 0.641421	valid_0's binary_logloss: 0.148153
[4700]	valid_0's auc: 0.641446	valid_0's binary_logloss: 0.148148
[4800]	valid_0's auc: 0.641541	valid_0's binary_logloss: 0.14814
[4900]	valid_0's auc: 0.641609	valid_0's binary_logloss: 0.148132
[5000]	valid_0's auc: 0.641646	valid_0's binary_logloss: 0.148127
[5100]	valid_0's auc: 0.641699	valid_0's binary_logloss: 0.148121
[5200]	valid_0's auc: 0.641731	valid_0's binary_logloss: 0.148116
[5300]	valid_0's auc: 0.641765	valid_0's binary_logloss: 0.14811
[5400]	valid_0's auc: 0.641771	valid_0's binary_logloss: 0.148107
[5500]	valid_0's auc: 0.641808	valid_0's binary_logloss: 0.148102
[5600]	valid_0's auc: 0.641862	valid_0's binary_logloss: 0.148096
[5700]	valid_

In [9]:
# ROC AUC computed once over all out-of-fold predictions together
# (not an average of the per-fold scores).
oof_score = roc_auc_score(
    train_df['target'], oof_preds
)
print(f"OOF-score = {round(oof_score, 3)}")

OOF-score = 0.641


In [10]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """
    Unnormalized Gini coefficient, used as a classifier quality metric.

    Rows are ordered by ``pred`` in descending order (ties keep their
    original order). The cumulative share of ``actual`` captured along that
    ordering is summed, the ``(n + 1) / 2`` baseline is subtracted and the
    result is divided by ``n``.

    Parameters
    ----------
    actual: array-like
        True target values.

    pred: array-like
        Predicted scores; must have the same length as ``actual``.

    cmpcol, sortcol: int
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    float
        The unnormalized Gini value. If ``actual`` sums to zero the division
        by the total yields nan.

    """
    assert( len(actual) == len(pred) )
    n_rows = len(actual)
    # Columns: actual value, prediction, original position.
    # The builtin `float` replaces the `np.float` alias, which was removed in
    # NumPy 1.24 and raises AttributeError there.
    table = np.asarray(np.c_[ actual, pred, np.arange(n_rows) ], dtype=float)
    # lexsort uses the LAST key as primary: prediction descending, then
    # original position to break ties deterministically.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    total_losses = table[:,0].sum()
    gini_sum = table[:,0].cumsum().sum() / total_losses

    gini_sum -= (n_rows + 1) / 2.
    return gini_sum / n_rows


def gini_normalized(a, p):
    """
    Gini of the predictions ``p`` relative to the Gini obtained when the
    true values ``a`` are used as their own predictions.
    """
    model_gini = gini(a, p)
    reference_gini = gini(a, a)
    return model_gini / reference_gini


# Normalized Gini of the out-of-fold predictions, rounded to 3 decimals.
np.round(gini_normalized(train_df['target'], oof_preds), 3)

0.283

In [11]:
# Score the test set with every fold model, keeping column 1 of
# predict_proba (the same column used for the OOF predictions).
test_preds = []
for estimator in tqdm(estimators):
    test_preds.append(estimator.predict_proba(test_df[features])[:,1])

# After the transpose there is one column per estimator; the bare last
# expression displays the resulting shape.
preds = np.array(test_preds).T
preds.shape

100%|███████████████████████████████████████████████████████████████████████████████████| 7/7 [12:20<00:00, 105.78s/it]


In [13]:
# Average the per-fold test predictions into one score per test row.
# The fold matrix is rebuilt from `test_preds` instead of reusing `preds`:
# the previous `preds = preds.mean(axis=1)` overwrote its own 2-D input, so
# running this cell a second time failed on the now 1-D array.
preds = np.array(test_preds).T.mean(axis=1)

test_df['target'] = preds

# Write only the id and the averaged prediction, without the row index.
test_df.loc[:, ['id', 'target']].to_csv('porto_seguro_predictions.csv', index=False)