In [None]:
!pip install -U lightautoml

Collecting lightautoml
  Downloading lightautoml-0.3.8.1-py3-none-any.whl (416 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m416.4/416.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autowoe>=1.2 (from lightautoml)
  Downloading AutoWoE-1.3.2-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.7/215.7 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting catboost>=0.26.1 (from lightautoml)
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes (from lightautoml)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting joblib<1.3.0 (from lightautoml)
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCo

In [None]:
# Standard python libraries
import os
import time
import requests


# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# import torch

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.boost_cb import BoostCB
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

In [None]:
# --- Run configuration -------------------------------------------------------
# Threads count for lgbm and linear models.
N_THREADS = 8
# Folds count for AutoML.
N_FOLDS = 5
# Fixed random state, reused wherever a seed is required.
RANDOM_STATE = 42
# Share of rows held out for the metric check.
TEST_SIZE = 0.2
# Name of the target column.
TARGET_NAME = 'TARGET'

# Seed numpy's global generator with the shared random state.
np.random.seed(RANDOM_STATE)

# --- Dataset location --------------------------------------------------------
# Local cache path (directory + file name) and the remote URL of the CSV.
DATASET_DIR = '../data/'
DATASET_NAME = 'sampled_app_train.csv'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)
DATASET_URL = 'https://raw.githubusercontent.com/sberbank-ai-lab/LightAutoML/master/examples/data/sampled_app_train.csv'

In [None]:
%%time

if not os.path.exists(DATASET_FULLNAME):
    os.makedirs(DATASET_DIR, exist_ok=True)

    dataset = requests.get(DATASET_URL).text
    with open(DATASET_FULLNAME, 'w') as output:
        output.write(dataset)

CPU times: user 42 µs, sys: 0 ns, total: 42 µs
Wall time: 80.6 µs


In [None]:
# Load the locally cached CSV into a DataFrame and preview its first rows.
data = pd.read_csv(DATASET_FULLNAME)
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


In [None]:
# Relative class frequencies of the target column. Uses the TARGET_NAME
# constant (as the split and scoring cells do) instead of a hard-coded column
# name, so changing the target in the config cell is enough.
data[TARGET_NAME].value_counts(normalize=True)

0    0.9201
1    0.0799
Name: TARGET, dtype: float64

In [None]:
# Print a structural summary of the loaded frame (rows, columns, dtypes, memory).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 9.3+ MB


In [None]:
# Hold out TEST_SIZE of the rows for the final metric check; the split is
# stratified on the target column and seeded with RANDOM_STATE.
train_data, test_data = train_test_split(
    data,
    stratify=data[TARGET_NAME],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [None]:
# Show the (rows, columns) shape of the train and hold-out partitions.
train_data.shape, test_data.shape

((8000, 122), (2000, 122))

In [None]:
# Pipeline #1

In [None]:
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.boost_cb import BoostCB
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

In [None]:
# Declare a 'binary' task and a pandas reader that uses N_FOLDS for `cv` and a
# fixed random state.
task = Task('binary')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

In [None]:
# Pipeline #1: a two-level stack. Level 1 holds three LightGBM models behind an
# importance-based feature selector; level 2 holds a single LightGBM model.

# --- Feature selector --------------------------------------------------------
# FIX: the selector used to reference `model0`, a name never defined in this
# notebook, so the cell raised NameError after "Restart Kernel -> Run All".
# The selector now gets its own model instance, defined right here, rather
# than sharing the `model0_lvl1` object with the level-1 pipeline below.
# NOTE(review): hyperparameters are copied from `model0_lvl1`; adjust if the
# selector was meant to use different ones.
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
selector_lvl1 = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)


# --- Level 1 models ----------------------------------------------------------
pipe1 = LGBSimpleFeatures()
model0_lvl1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
# Only model1_lvl1 is paired with an Optuna tuner (20 trials, timeout=30).
params_tuner1_lvl1 = OptunaTuner(n_trials=20, timeout=30)
model1_lvl1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
model2_lvl1 = BoostLGBM(
    default_params={'learning_rate': 0.02, 'num_leaves': 64, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)


#Pipeline - level 1
pipeline_lvl1 = MLPipeline([
    model0_lvl1,
    (model1_lvl1, params_tuner1_lvl1),
    model2_lvl1
], pre_selection=selector_lvl1, features_pipeline=pipe1, post_selection=None)


# --- Level 2 model -----------------------------------------------------------
pipe2 = LGBSimpleFeatures()
model_lvl2 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)

pipeline_lvl2 = MLPipeline([model_lvl2], pre_selection=None, features_pipeline=pipe2, post_selection=None)


# --- Assemble and fit ----------------------------------------------------------
# One inner list per level: [level 1 pipelines], [level 2 pipelines].
automl = AutoML(reader,
                [[pipeline_lvl1],
                [pipeline_lvl2]],
                skip_conn=False)

# Fit on the training partition; the returned predictions are scored as the
# "OOF score" in the next cell.
oof_pred = automl.fit_predict(train_data, roles={'target': TARGET_NAME})

INFO:lightautoml.reader.base:[1mTrain data shape: (8000, 122)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: ['REG_REGION_NOT_LIVE_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'FLAG_DOCUMENT_8']
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 9999999982.76 secs
INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve for 100 rounds
DEBUG:lightautoml.ml_algo.boost_lgbm:[100]	valid's auc: 0.728681
DEBUG:lightautoml.ml_algo.boost_lgbm:[200]	valid's auc: 0.726949
DEBUG:lightautoml.ml_algo.boost_lgbm:Early stopping, best iteration is:
[134]	valid's auc: 0.733604
INFO:lightautoml.ml_algo.base:[1mMod_0_LightGBM[0m fitting and predicting completed
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 128, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, '

In [None]:
# Predict on the hold-out partition, then report ROC AUC for both the
# training-time predictions (`oof_pred`) and the hold-out predictions.
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

print('Check scores...')
oof_auc = roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
print('OOF score: {}'.format(oof_auc))
test_auc = roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
print('TEST score: {}'.format(test_auc))


Prediction for test data:
array([[0.05845162],
       [0.08084685],
       [0.07027299],
       ...,
       [0.08002303],
       [0.09411833],
       [0.14898382]], dtype=float32)
Shape = (2000, 1)
Check scores...
OOF score: 0.6902522472303063
TEST score: 0.7206589673913044


In [None]:
# Feature scores of the first algorithm in the first pipeline of the first level.
automl.levels[0][0].ml_algos[0].get_features_score()

EXT_SOURCE_3                  1885.253801
EXT_SOURCE_2                  1785.728023
DAYS_BIRTH                    1130.155983
DAYS_REGISTRATION             1016.024376
DAYS_ID_PUBLISH                961.878135
                                 ...     
REG_REGION_NOT_WORK_REGION       6.302472
ord__NAME_CONTRACT_TYPE          5.599849
AMT_REQ_CREDIT_BUREAU_HOUR       4.337980
ord__EMERGENCYSTATE_MODE         2.951914
FLAG_DOCUMENT_6                  1.502534
Length: 96, dtype: float64

In [None]:
# Analyze fitted model
# Prints the feature scores of the pre-selection selector, of the top-level
# algorithm, and of the first two level-1 algorithms, separated by rulers.
#
# FIX: this cell used to call `selector.get_features_score()`, but no variable
# named `selector` is defined anywhere in this notebook (it only existed as
# leftover kernel state). The selector attached to the level-1 pipeline is
# `selector_lvl1`.
print('Feature importances of selector:\n{}'
              .format(selector_lvl1.get_features_score()))
print('=' * 70)

# levels[-1] is the last (top) level of the stack.
print('Feature importances of top level algorithm:\n{}'
              .format(automl.levels[-1][0].ml_algos[0].get_features_score()))
print('=' * 70)

# levels[0] is the first (lowest) level; ml_algos are indexed in pipeline order.
print('Feature importances of lowest level algorithm - model 0:\n{}'
              .format(automl.levels[0][0].ml_algos[0].get_features_score()))
print('=' * 70)

print('Feature importances of lowest level algorithm - model 1:\n{}'
              .format(automl.levels[0][0].ml_algos[1].get_features_score()))
print('=' * 70)

Feature importances of selector:
None
Feature importances of top level algorithm:
Lvl_0_Pipe_0_Mod_2_LightGBM_prediction_0    2278.610948
Lvl_0_Pipe_0_Mod_0_LightGBM_prediction_0    1189.739746
Lvl_0_Pipe_0_Mod_1_LightGBM_prediction_0       0.000000
dtype: float64
Feature importances of lowest level algorithm - model 0:
EXT_SOURCE_3                  1885.253801
EXT_SOURCE_2                  1785.728023
DAYS_BIRTH                    1130.155983
DAYS_REGISTRATION             1016.024376
DAYS_ID_PUBLISH                961.878135
                                 ...     
REG_REGION_NOT_WORK_REGION       6.302472
ord__NAME_CONTRACT_TYPE          5.599849
AMT_REQ_CREDIT_BUREAU_HOUR       4.337980
ord__EMERGENCYSTATE_MODE         2.951914
FLAG_DOCUMENT_6                  1.502534
Length: 96, dtype: float64
Feature importances of lowest level algorithm - model 1:
EXT_SOURCE_3                  1657.010943
EXT_SOURCE_2                  1549.253864
DAYS_BIRTH                     987.023480
DAYS_I

In [None]:
# Pipeline v2: Pipeline #1 plus Optuna tuning of the last-level model

In [None]:
# Pipeline v2: the same two-level stack as Pipeline #1, with an Optuna tuner
# attached to the level-2 model as well. Fits, predicts and scores in one cell.
task = Task('binary')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)


# --- Feature selector --------------------------------------------------------
# FIX: the selector used to reference `model0`, a name never defined in this
# notebook, so the cell raised NameError after "Restart Kernel -> Run All".
# The selector now gets its own model instance, defined right here, rather
# than sharing the `model0_lvl1` object with the level-1 pipeline below.
# NOTE(review): hyperparameters are copied from `model0_lvl1`; adjust if the
# selector was meant to use different ones.
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
selector_lvl1 = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)


# --- Level 1 models ----------------------------------------------------------
pipe1 = LGBSimpleFeatures()
model0_lvl1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
# Only model1_lvl1 is paired with an Optuna tuner on this level.
params_tuner1_lvl1 = OptunaTuner(n_trials=20, timeout=30)
model1_lvl1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
model2_lvl1 = BoostLGBM(
    default_params={'learning_rate': 0.02, 'num_leaves': 64, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)


#Pipeline - level 1
pipeline_lvl1 = MLPipeline([
    model0_lvl1,
    (model1_lvl1, params_tuner1_lvl1),
    model2_lvl1
], pre_selection=selector_lvl1, features_pipeline=pipe1, post_selection=None)


# --- Level 2 model (tuned) ---------------------------------------------------
pipe2 = LGBSimpleFeatures()
model_lvl2 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
params_tuner2_lvl2 = OptunaTuner(n_trials=20, timeout=30)
pipeline_lvl2 = MLPipeline([(model_lvl2, params_tuner2_lvl2)], pre_selection=None, features_pipeline=pipe2, post_selection=None)


# --- Assemble and fit ----------------------------------------------------------
# One inner list per level: [level 1 pipelines], [level 2 pipelines].
automl = AutoML(reader,
                [[pipeline_lvl1],
                [pipeline_lvl2]],
                skip_conn=False)

oof_pred = automl.fit_predict(train_data, roles={'target': TARGET_NAME})


# --- Predict on the hold-out partition and report ROC AUC ----------------------
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
print('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))


INFO:lightautoml.reader.base:[1mTrain data shape: (8000, 122)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: ['REG_REGION_NOT_LIVE_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'FLAG_DOCUMENT_8']
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 9999999984.67 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 128, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 1, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 8, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 100, 'random_state': 42, 'seed': 42}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
INFO3:lightautoml.ml_algo.boost_lgbm:Trainin

Prediction for test data:
array([[0.06539972],
       [0.07553179],
       [0.05480719],
       ...,
       [0.06585301],
       [0.06425362],
       [0.18560858]], dtype=float32)
Shape = (2000, 1)
Check scores...
OOF score: 0.7054033661735845
TEST score: 0.7167340353260869


In [None]:
# Pipeline v3: Pipeline v2 plus a CatBoost model on the last level

In [None]:
# Pipeline v3: the v2 stack with a CatBoost model added next to the tuned
# LightGBM model on level 2. Fits, predicts and scores in one cell.
task = Task('binary')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)


# --- Feature selector --------------------------------------------------------
# FIX: the selector used to reference `model0`, a name never defined in this
# notebook, so the cell raised NameError after "Restart Kernel -> Run All".
# The selector now gets its own model instance, defined right here, rather
# than sharing the `model0_lvl1` object with the level-1 pipeline below.
# NOTE(review): hyperparameters are copied from `model0_lvl1`; adjust if the
# selector was meant to use different ones.
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
selector_lvl1 = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)


# --- Level 1 models ----------------------------------------------------------
pipe1 = LGBSimpleFeatures()
model0_lvl1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
# Only model1_lvl1 is paired with an Optuna tuner on this level.
params_tuner1_lvl1 = OptunaTuner(n_trials=20, timeout=30)
model1_lvl1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
model2_lvl1 = BoostLGBM(
    default_params={'learning_rate': 0.02, 'num_leaves': 64, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)


#Pipeline - level 1
pipeline_lvl1 = MLPipeline([
    model0_lvl1,
    (model1_lvl1, params_tuner1_lvl1),
    model2_lvl1
], pre_selection=selector_lvl1, features_pipeline=pipe1, post_selection=None)


# --- Level 2 models: tuned LightGBM + CatBoost with default settings ----------
pipe2 = LGBSimpleFeatures()
model_lvl2 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': RANDOM_STATE, 'num_threads': N_THREADS}
)
params_tuner2_lvl2 = OptunaTuner(n_trials=20, timeout=30)

model_cb_lvl2 = BoostCB()

pipeline_lvl2 = MLPipeline(
    [(model_lvl2, params_tuner2_lvl2),
     model_cb_lvl2],
    pre_selection=None, features_pipeline=pipe2, post_selection=None)


# --- Assemble and fit ----------------------------------------------------------
# One inner list per level: [level 1 pipelines], [level 2 pipelines].
# NOTE(review): no `blender` is passed here although the last level now holds
# two models and `WeightedBlender` is imported at the top of the notebook; the
# scores recorded for this cell are identical to v2. Confirm how the two
# level-2 predictions are meant to be combined.
automl = AutoML(reader,
                [[pipeline_lvl1],
                [pipeline_lvl2]],
                skip_conn=False)

oof_pred = automl.fit_predict(train_data, roles={'target': TARGET_NAME})


# --- Predict on the hold-out partition and report ROC AUC ----------------------
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
print('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))

INFO:lightautoml.reader.base:[1mTrain data shape: (8000, 122)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: ['REG_REGION_NOT_LIVE_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'FLAG_DOCUMENT_8']
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 9999999997.74 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 128, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 1, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 8, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 100, 'random_state': 42, 'seed': 42}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
INFO3:lightautoml.ml_algo.boost_lgbm:Trainin

Prediction for test data:
array([[0.06539972],
       [0.07553179],
       [0.05480719],
       ...,
       [0.06585301],
       [0.06425362],
       [0.18560858]], dtype=float32)
Shape = (2000, 1)
Check scores...
OOF score: 0.7054033661735845
TEST score: 0.7167340353260869
