# Step 0.0. Install LightAutoML

Run the cell below if you have not cloned the repository with git (e.g. on Colab or Kaggle).

In [1]:
! pip install -U lightautoml

Collecting lightautoml
[?25l  Downloading https://files.pythonhosted.org/packages/01/b7/eddea00dbc08237ba75d0bff3926def73e3be81afc3d2e9f4652c24fd1e8/LightAutoML-0.2.14-py3-none-any.whl (250kB)
[K     |████████████████████████████████| 256kB 8.3MB/s 
[?25hCollecting importlib-metadata<2.0,>=1.0; python_version < "3.8"
  Downloading https://files.pythonhosted.org/packages/8e/58/cdea07eb51fc2b906db0968a94700866fc46249bdc75cac23f9d13168929/importlib_metadata-1.7.0-py2.py3-none-any.whl
Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/2b/21/d13081805e1e1afc71f5bb743ece324c8bd576237c51b899ecb38a717502/optuna-2.7.0-py3-none-any.whl (293kB)
[K     |████████████████████████████████| 296kB 48.2MB/s 
Collecting lightgbm<3.0,>=2.3
[?25l  Downloading https://files.pythonhosted.org/packages/0b/9d/ddcb2f43aca194987f1a99e27edf41cf9bc39ea750c3371c2a62698c509a/lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 22.7M

# Step 0.1. Import necessary libraries 

In [2]:
# Standard python libraries
import logging
import os
import time
import requests
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
from lightautoml.automl.blend import WeightedBlender

[2021-05-30 12:50:28,405] (INFO): 'pattern' package not found; tag filters are not available for English


# Step 0.2. Parameters 

In [3]:
# Threads count for lgbm and linear models
N_THREADS = 8
# Folds count for AutoML
N_FOLDS = 5
# Fixed random state for various reasons
RANDOM_STATE = 42
# Test size for metric check
TEST_SIZE = 0.2
# Target column name
TARGET_NAME = 'Is_Lead'

# Step 0.3. Fix torch number of threads and numpy seed 

In [4]:
# Cap the number of threads torch may use to the shared thread budget.
torch.set_num_threads(N_THREADS)
# Seed numpy's global random generator with the notebook-wide random state.
np.random.seed(RANDOM_STATE)

# Step 0.4. Change profiling decorators settings 

By default, profiling decorators are turned off to save time and memory. If you want to see the profiling report after using LAMA, turn the decorators on with the command below: 

In [5]:
# Profiler handle; it is reused in Step 8 to build the HTML report.
p = Profiler()
# Switch the profiling decorators on (they are off by default).
p.change_deco_settings(
    {'enabled': True},
)

In [6]:
# Colab-only step: mount Google Drive at /content/drive. The dataset paths used
# later in this notebook (DATASET_DIR, the test CSV) live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Step 0.5. Example data load 

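Load the dataset from the mounted Google Drive; the download cell below is only used when the file is not already there (e.g. if you have not cloned the repository with git).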

In [13]:
# Folder and file name of the training CSV on the mounted drive.
DATASET_DIR, DATASET_NAME = '/content/drive/MyDrive/', 'train_new.csv'
# Full path to the training CSV.
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)


In [14]:
%%time

# Download the dataset only when it is not already present at DATASET_FULLNAME.
if not os.path.exists(DATASET_FULLNAME):
    # DATASET_URL is not defined in this notebook's parameter cells, so look it up
    # defensively and fail with an actionable message instead of a NameError.
    dataset_url = globals().get('DATASET_URL')
    if dataset_url is None:
        raise FileNotFoundError(
            '{} does not exist and DATASET_URL is not defined; '
            'place the file there or define DATASET_URL before running this cell'
            .format(DATASET_FULLNAME)
        )

    os.makedirs(DATASET_DIR, exist_ok=True)

    response = requests.get(dataset_url)
    # Do not save an HTTP error page (404/500, ...) as if it were the dataset.
    response.raise_for_status()
    with open(DATASET_FULLNAME, 'w') as output:
        output.write(response.text)

CPU times: user 0 ns, sys: 669 µs, total: 669 µs
Wall time: 871 µs


In [16]:
%%time

# Read the training CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer=DATASET_FULLNAME)
data.head(n=5)

CPU times: user 150 ms, sys: 11.9 ms, total: 162 ms
Wall time: 173 ms


# Step 0.6. (Optional) Some user feature preparation 

The cell below shows some optional user feature preparation intended to make the task more difficult (this block can be omitted if you don't want to change the initial data):

In [61]:
# NOTE(review): disabled SMOTE oversampling experiment; every line is commented out,
# so running this cell does nothing and any output shown under it comes from an
# earlier run. `X` and `y` are not defined in any visible cell of this notebook;
# they must be created (features / target) before this code can be re-enabled.
# # import library
# from imblearn.over_sampling import SMOTE

# smote = SMOTE()

# # fit predictor and target variable
# x_smote, y_smote = smote.fit_resample(X, y)
# X_train = pd.DataFrame(x_smote, columns=X.columns)
# y_train = pd.DataFrame(y_smote, columns=['Is_Lead'])
# data = pd.concat([X_train,y_train],axis=1)
# data.head()


Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.




Unnamed: 0,Age,Vintage,Avg_Account_Balance,Gender_Male,Region_Code_RG251,Region_Code_RG252,Region_Code_RG253,Region_Code_RG254,Region_Code_RG255,Region_Code_RG256,Region_Code_RG257,Region_Code_RG258,Region_Code_RG259,Region_Code_RG260,Region_Code_RG261,Region_Code_RG262,Region_Code_RG263,Region_Code_RG264,Region_Code_RG265,Region_Code_RG266,Region_Code_RG267,Region_Code_RG268,Region_Code_RG269,Region_Code_RG270,Region_Code_RG271,Region_Code_RG272,Region_Code_RG273,Region_Code_RG274,Region_Code_RG275,Region_Code_RG276,Region_Code_RG277,Region_Code_RG278,Region_Code_RG279,Region_Code_RG280,Region_Code_RG281,Region_Code_RG282,Region_Code_RG283,Region_Code_RG284,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4,Credit_Product_Yes,Is_Active_Yes,Is_Lead
0,73,43,1045696,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0.0
1,30,32,581988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.0
2,56,26,1484315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0.0
3,34,19,470454,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.0
4,30,33,886787,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0.0


# Step 0.7. (Optional) Data splitting for train-test 

Block below can be omitted if you are going to train model only or you have specific train and test files:

In [17]:
%%time

# Hold out TEST_SIZE of the rows for the metric check; the split is stratified
# on the target column and fixed by RANDOM_STATE.
train_data, test_data = train_test_split(
    data,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=data[TARGET_NAME],
)
logging.info(
    'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
        train_data.shape, test_data.shape
    )
)

[2021-05-30 12:53:26,467] (INFO): Data splitted. Parts sizes: train_data = (196580, 10), test_data = (49145, 10)


CPU times: user 111 ms, sys: 398 µs, total: 112 ms
Wall time: 114 ms


In [18]:
# Number of rows in the training part.
train_data.shape[0]

196580

# ========= AutoML creation =========

![AutoML pipeline for this task](https://github.com/sberbank-ai-lab/LightAutoML/blob/master/imgs/tutorial_1_pipeline.png?raw=1)


## Step 1. Create Task and PandasReader

In [19]:
%%time

# Binary classification task definition.
task = Task('binary')
# Reader configured with N_FOLDS folds and the notebook-wide random state.
reader = PandasToPandasReader(
    task,
    random_state=RANDOM_STATE,
    cv=N_FOLDS,
)

CPU times: user 6.27 ms, sys: 0 ns, total: 6.27 ms
Wall time: 9.13 ms


## Step 2. Create feature selector (if necessary) 

In [20]:
%%time

# Feature pipeline and importance estimator used by the selector.
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
# LightGBM model whose importances drive the selection.
model0 = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 42,
        'num_threads': N_THREADS,
    }
)
# Importance-based selector with cutoff 0
# (presumably drops features whose importance does not exceed 0 -- TODO confirm).
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


CPU times: user 4.65 ms, sys: 99 µs, total: 4.75 ms
Wall time: 7.22 ms


## Step 3.1. Create 1st level ML pipeline for AutoML 

Our first level ML pipeline:
- Simple features for gradient boosting built on selected features (using step 2) 
- 2 different models:
    * LightGBM with params tuning (using OptunaTuner)
    * LightGBM with heuristic params


In [21]:
%%time 

# Simple LightGBM-oriented features for the first level.
pipe = LGBSimpleFeatures()

# Tuner budget: stop after 20 iterations or after 30 seconds.
params_tuner1 = OptunaTuner(timeout=30, n_trials=20)

# Model 1: LightGBM whose params are tuned by params_tuner1.
model1 = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': N_THREADS,
    }
)
# Model 2: LightGBM with fixed heuristic params (no tuner attached).
model2 = BoostLGBM(
    default_params={
        'learning_rate': 0.025,
        'num_leaves': 64,
        'seed': 2,
        'num_threads': N_THREADS,
    }
)

# First-level pipeline: selector from Step 2 first, then both models on `pipe` features.
pipeline_lvl1 = MLPipeline(
    [(model1, params_tuner1), model2],
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

CPU times: user 3.11 ms, sys: 0 ns, total: 3.11 ms
Wall time: 4.01 ms


## Step 3.2. Create 2nd level ML pipeline for AutoML 

Our second level ML pipeline:
- Using simple features as well, but now it will be Out-Of-Fold (OOF) predictions of algos from 1st level
- Only one LGBM model without params tuning
- Without feature selection on this stage because we want to use all OOFs here

In [22]:
%%time

# Simple features again; on this level the inputs are the first-level OOF predictions.
pipe1 = LGBSimpleFeatures()

# Single LightGBM model without params tuning; defaults are frozen.
model = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_bin': 1024,
        'seed': 3,
        'num_threads': N_THREADS,
    },
    freeze_defaults=True,
)

# No feature selection here because we want to use all OOFs.
pipeline_lvl2 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=pipe1,
    post_selection=None,
)

CPU times: user 1.37 ms, sys: 0 ns, total: 1.37 ms
Wall time: 1.38 ms


## Step 4. Create AutoML pipeline 

The AutoML pipeline consists of:
- Reader for data preparation
- First level ML pipeline (as built in step 3.1)
- Second level ML pipeline (as built in step 3.2)
- `skip_conn=False` here means "do not use the initial features in the second-level pipeline"

In [23]:
%%time 

# Two-level AutoML: level 1 -> level 2, without passing the initial features
# to the second level (skip_conn=False).
automl = AutoML(
    reader,
    [[pipeline_lvl1], [pipeline_lvl2]],
    skip_conn=False,
)

CPU times: user 1.25 ms, sys: 1.05 ms, total: 2.3 ms
Wall time: 2.32 ms


## Step 5. Train AutoML on loaded data 

In the cell below we train AutoML with the target column `TARGET_NAME` (`Is_Lead`) to obtain the fitted model and OOF predictions:

In [24]:
%%time 

# Fit the whole AutoML on the training part and keep its out-of-fold predictions.
oof_pred = automl.fit_predict(
    train_data,
    roles={'target': TARGET_NAME},
)
logging.info(
    'oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape)
)

Train data shape: (196580, 10)


[2021-05-30 12:54:06,584] (INFO): NumExpr defaulting to 2 threads.


Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 9999999982.866423 secs
Start fitting LightGBM ...

===== Start working with fold 0 for LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.802499
[200]	valid's auc: 0.801937
Early stopping, best iteration is:
[106]	valid's auc: 0.802588
LightGBM fitting and predicting completed
Optuna may run 6299999973.042978 secs


[2021-05-30 12:54:21,688] (INFO): A new study created in memory with name: no-name-71f05e08-8a26-4e70-81db-58fb32047881


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.800743
Early stopping, best iteration is:
[64]	valid's auc: 0.80122
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-05-30 12:54:36,554] (INFO): Trial 0 finished with value: 0.8012195519172629 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 244}. Best is trial 0 with value: 0.8012195519172629.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.800836
Early stopping, best iteration is:
[60]	valid's auc: 0.801773
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-05-30 12:54:47,583] (INFO): Trial 1 finished with value: 0.8017731069410011 and parameters: {'feature_fraction': 0.8659969709057025, 'num_leaves': 159}. Best is trial 1 with value: 0.8017731069410011.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.802143
[200]	valid's auc: 0.80215
Early stopping, best iteration is:
[146]	valid's auc: 0.802433
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-05-30 12:54:56,725] (INFO): Trial 2 finished with value: 0.8024332472822987 and parameters: {'feature_fraction': 0.5780093202212182, 'num_leaves': 53}. Best is trial 2 with value: 0.8024332472822987.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.802143
[200]	valid's auc: 0.80215
Early stopping, best iteration is:
[146]	valid's auc: 0.802433

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.797108
[200]	valid's auc: 0.797217
Early stopping, best iteration is:
[183]	valid's auc: 0.797348

===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.801325
[200]	valid's auc: 0.801099
Early stopping, best iteration is:
[114]	valid's auc: 0.801417

===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.79576
[200]	valid's auc: 0.795712
Early s

[2021-05-30 12:57:27,689] (INFO): oof_pred:
array([[0.2199113 ],
       [0.20649932],
       [0.30044973],
       ...,
       [0.23551995],
       [0.11141734],
       [0.09806565]], dtype=float32)
Shape = (196580, 1)


CPU times: user 5min 4s, sys: 51.5 s, total: 5min 55s
Wall time: 3min 31s


## Step 6. Analyze fitted model  

Below we analyze feature importances of different algos:

In [None]:
# (log template, object exposing get_features_score()) pairs, in reporting order.
importance_reports = [
    ('Feature importances of selector:\n{}',
     selector),
    ('Feature importances of top level algorithm:\n{}',
     automl.levels[-1][0].ml_algos[0]),
    ('Feature importances of lowest level algorithm - model 0:\n{}',
     automl.levels[0][0].ml_algos[0]),
    ('Feature importances of lowest level algorithm - model 1:\n{}',
     automl.levels[0][0].ml_algos[1]),
]

# Log each importance table followed by a separator line.
for report_template, scored_obj in importance_reports:
    logging.info(report_template.format(scored_obj.get_features_score()))
    logging.info('=' * 70)

## Step 7. Predict on test data and check scores

In [27]:
%%time

# Predict on the hold-out part.
test_pred = automl.predict(test_data)
logging.info(
    'Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape)
)

# ROC AUC on out-of-fold (train) and hold-out (test) predictions; column 0 of
# the prediction array holds the scores.
logging.info('Check scores...')
oof_auc = roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
logging.info('OOF score: {}'.format(oof_auc))
test_auc = roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
logging.info('TEST score: {}'.format(test_auc))

[2021-05-30 13:01:22,962] (INFO): Prediction for test data:
array([[0.15541112],
       [0.14936374],
       [0.7216174 ],
       ...,
       [0.1270724 ],
       [0.11239512],
       [0.2583373 ]], dtype=float32)
Shape = (49145, 1)
[2021-05-30 13:01:22,963] (INFO): Check scores...
[2021-05-30 13:01:23,031] (INFO): OOF score: 0.7853082015972583
[2021-05-30 13:01:23,049] (INFO): TEST score: 0.7991029890876837


CPU times: user 10.3 s, sys: 4.42 ms, total: 10.3 s
Wall time: 5.36 s


In [29]:

# Read the submission test set from the same folder as the training data
# (DATASET_DIR) instead of repeating the absolute path.
test_df = pd.read_csv(os.path.join(DATASET_DIR, 'test.csv'))
# Start the submission frame from an independent copy of the ID column, so that
# columns added to `result` later never touch `test_df`.
result = test_df[['ID']].copy()

In [30]:

# Predict on the submission test set.
# NOTE: this rebinds `test_pred`, which held the hold-out predictions from Step 7;
# re-running the Step 7 scoring lines after this cell would use the wrong predictions.
test_pred = automl.predict(test_df)

In [31]:

# The prediction array has shape (n, 1); take column 0 explicitly (as in Step 7)
# instead of relying on pandas to coerce a 2-D array into a single column.
# TARGET_NAME is 'Is_Lead', so the submission column name is unchanged.
result[TARGET_NAME] = test_pred.data[:, 0]


In [32]:
# Preview the first rows of the submission frame.
result.head(n=5)

Unnamed: 0,ID,Is_Lead
0,VBENBARO,0.110636
1,CCMEWNKY,0.287544
2,VK3KGA9M,0.131305
3,TT8RPZVC,0.104852
4,SHQZEYTZ,0.104746


In [33]:
# Write the submission file without the index column.
result.to_csv(
    path_or_buf='result11.csv',
    index=False,
)

## Step 8. Profiling AutoML 

To build the report here, the decorators must have been turned on in step 0.4. The report is interactive, and you can go as deep into the function call stack as you want:

In [None]:
%%time
# Build the HTML profiling report, then verify the file was actually written.
p.profile('my_report_profile.html')
if not os.path.exists('my_report_profile.html'):
    raise AssertionError('Profile report failed to build')

# Appendix. Profiling report screenshots 

After loading the HTML file with the profiling report, you will see a fully folded report (please wait for the green LOAD OK text, which indicates the load has finished). If you click on the triangle on the left, it unfolds and looks like this:  

<img src="https://github.com/sberbank-ai-lab/LightAutoML/blob/master/imgs/tutorial_1_initial_report.png?raw=1" alt="Initial profiling report" style="width: 500px;"/>

If we go even deeper, we get a view like this:

<img src="https://github.com/sberbank-ai-lab/LightAutoML/blob/master/imgs/tutorial_1_unfolded_report.png?raw=1" alt="Profiling report after several unfoldings on different levels" style="width: 500px;"/>
