# 1. Imports

In [137]:
# Standard python libraries
import logging
import os
import time
import requests
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
import torch



## 1.1 FEDOT imports

In [14]:
from fedot.api.main import Fedot

## 1.2 LAMA imports

In [15]:
# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

## 2. Data Preparation

In [153]:
N_THREADS = 8 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TARGET_NAME = 'TARGET' # Target column name

np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

DATASET_DIR = './example_data/test_data_files'
DATASET_NAME = 'sampled_app_train.csv'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)
DATASET_URL = 'https://raw.githubusercontent.com/sberbank-ai-lab/LightAutoML/master/example_data/test_data_files/sampled_app_train.csv'

In [154]:
%%time

if not os.path.exists(DATASET_FULLNAME):
    os.makedirs(DATASET_DIR, exist_ok=True)

    dataset = requests.get(DATASET_URL).text
    with open(DATASET_FULLNAME, 'w') as output:
        output.write(dataset)

CPU times: user 2.6 ms, sys: 186 µs, total: 2.78 ms
Wall time: 33.6 ms


In [155]:
data = pd.read_csv(DATASET_FULLNAME)
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


In [156]:
data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                    ).astype(str)

data['constant'] = 1
data['allnan'] = np.nan

data['report_dt'] = np.datetime64('2018-01-01')

data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

In [130]:
train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)
logging.info('Data splitted. Parts sizes: train_data = {}, test_data = {}'
              .format(train_data.shape, test_data.shape))

[2021-07-20 17:12:33,525] (INFO): Data splitted. Parts sizes: train_data = (8000, 125), test_data = (2000, 125)


# 3. Fit & Predict. Create different pipelines

## 3.1 LAMA

In [26]:
task = Task('binary')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'seed': 42, 'num_threads': N_THREADS}
)
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

pipe = LGBSimpleFeatures()

params_tuner1 = OptunaTuner(n_trials=20, timeout=30) # stop after 20 iterations or after 30 seconds 
model1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': 1, 'num_threads': N_THREADS}
)
model2 = BoostLGBM(
    default_params={'learning_rate': 0.025, 'num_leaves': 64, 'seed': 2, 'num_threads': N_THREADS}
)

pipeline_lvl1 = MLPipeline([
    (model1, params_tuner1),
    model2
], pre_selection=selector, features_pipeline=pipe, post_selection=None)

pipe1 = LGBSimpleFeatures()

model = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'max_bin': 1024, 'seed': 3, 'num_threads': N_THREADS},
    freeze_defaults=True
)

pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)

automl = AutoML(reader, [
    [pipeline_lvl1],
    [pipeline_lvl2],
], skip_conn=False)

In [27]:
%%time

oof_pred = automl.fit_predict(train_data, roles={'target': TARGET_NAME})
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))


Train data shape: (8000, 125)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 9999999993.387394 secs
Start fitting LightGBM ...

===== Start working with fold 0 for LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.716183
Early stopping, best iteration is:
[16]	valid's auc: 0.720694
LightGBM fitting and predicting completed
Optuna may run 6299999571.722917 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.721384
[100]	valid's auc: 0.736122
Early stopping, best iteration is:
[68]	valid's auc: 0.745995
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds

[2021-07-20 12:08:40,581] (INFO): A new study created in memory with name: no-name-0d423764-7712-40bd-9f1e-865afa0736a4
[2021-07-20 12:08:45,012] (INFO): Trial 2 finished with value: 0.7459948361851918 and parameters: {'feature_fraction': 0.5780093202212182, 'num_leaves': 53}. Best is trial 2 with value: 0.7459948361851918.
[2021-07-20 12:08:46,069] (INFO): Trial 3 finished with value: 0.7192990896504534 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 223}. Best is trial 2 with value: 0.7459948361851918.
[2021-07-20 12:08:47,084] (INFO): Trial 4 finished with value: 0.732246045619043 and parameters: {'feature_fraction': 0.8005575058716043, 'num_leaves': 185}. Best is trial 2 with value: 0.7459948361851918.
[2021-07-20 12:08:51,935] (INFO): Trial 8 finished with value: 0.7420284277092655 and parameters: {'feature_fraction': 0.6521211214797689, 'num_leaves': 141}. Best is trial 6 with value: 0.7493892693148592.
[2021-07-20 12:08:53,004] (INFO): Trial 9 finished wit

In [28]:
%%time

test_pred = automl.predict(test_data)
logging.info('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

logging.info('Check scores...')
logging.info('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
logging.info('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))


[2021-07-20 12:09:53,795] (INFO): Prediction for test data:
array([[0.0918165 ],
       [0.06319325],
       [0.05311057],
       ...,
       [0.05190825],
       [0.05113189],
       [0.26104575]], dtype=float32)
Shape = (2000, 1)
[2021-07-20 12:09:53,796] (INFO): Check scores...
[2021-07-20 12:09:53,800] (INFO): OOF score: 0.7059763219386357
[2021-07-20 12:09:53,803] (INFO): TEST score: 0.7197197690217392


CPU times: user 465 ms, sys: 128 µs, total: 465 ms
Wall time: 142 ms


## 3.2 FEDOT

In [157]:
numerical_columns = data._get_numeric_data().columns
date_columns = ['BIRTH_DATE', 'EMP_DATE']
categorycal_columns = list(set(data.columns) - set(numerical_columns) - set(date_columns))

for cat_col in categorycal_columns:
    le = OneHotEncoder(handle_unknown='error', drop='if_binary')
    a = le.fit_transform(data[[cat_col]]).toarray()
    shift = data.columns[-1]
    if not isinstance(shift, int):
        shift = 0
    end_col = pd.DataFrame(data=a, columns=np.arange(a.shape[1]) + shift + 1)
    data = data.join(end_col)
    data.drop(axis=1, inplace=True, columns=[cat_col])

In [158]:
basic_time = '2020-01-01'
basic_interval = 'D'

for date_col in date_columns:
    data[date_col] = pd.to_datetime(data[date_col])
    data[date_col] = ((data[date_col] - np.datetime64(basic_time)) / np.timedelta64(1, basic_interval)).values.astype(np.float32)

In [133]:
train_labels = data['TARGET']
df = data.copy()
df.drop(columns=['TARGET'], axis=1, inplace=True)
train = df
num_feats = len(train.columns)

lgbc=LGBMClassifier(learning_rate=0.05, num_leaves=64)

embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
embeded_lgb_selector.fit(train, train_labels)

embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = train.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

51 selected features


In [162]:
# columns_drop = ['NAME_EDUCATION_TYPE', 'BIRTH_DATE', 'EMP_DATE', 'report_dt', 'NAME_CONTRACT_TYPE']

# data_copy = data.copy()
# data_copy = data_copy[list(set(list(data_copy.columns[2:20]) + ['TARGET']))]

# for column in columns_drop:
#     try:
#         data_copy.drop(columns=column, axis=1, inplace=True)
#     except:
#         print(f'{column} not in columns')




data_copy = data.copy()
data_copy = data_copy[embeded_lgb_feature + ['TARGET']]

train_data, test_data = train_test_split(data_copy,
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)

train_target, test_target = train_data[TARGET_NAME], test_data[TARGET_NAME]


logging.info('Data splitted. Parts sizes: train_data = {}, test_data = {}'
              .format(train_data.shape, test_data.shape))

[2021-07-20 17:49:15,822] (INFO): Data splitted. Parts sizes: train_data = (8000, 52), test_data = (2000, 52)


In [163]:
learning_time = [5,10,20]
answers = {}

for time in learning_time:
    roc_auc_metrics = []
    f1_score_metrics = []

    for i in range(8):
        baseline_model = Fedot(problem='classification', learning_time=time, composer_params={'cv_folds': 4})
        baseline_model.fit(features=train_data, target=TARGET_NAME)

        #evaluate the prediction with test data
        oof_pred_fedot = baseline_model.predict_proba(features=train_data)
        test_pred_fedot = baseline_model.predict_proba(test_data)

        roc_auc_metrics.append([roc_auc_score(train_target.values, oof_pred_fedot), roc_auc_score(test_target.values, test_pred_fedot)])
        f1_score_metrics.append([f1_score(train_target.values, np.where(oof_pred_fedot > 0.5, 1, 0)), f1_score(test_target.values, np.where(test_pred_fedot > 0.5, 1, 0))])

        data_copy = data.copy()
        data_copy = data_copy[embeded_lgb_feature + ['TARGET']]

        train_data, test_data = train_test_split(data_copy,
                                                test_size=TEST_SIZE, 
                                                stratify=data[TARGET_NAME], 
                                                random_state=RANDOM_STATE)

        train_target, test_target = train_data[TARGET_NAME], test_data[TARGET_NAME]
    
    answers[time] = [roc_auc_metrics, f1_score_metrics]



Composition started. Parameters tuning: True. Set of candidate models: ['logit', 'lda', 'qda', 'dt', 'rf', 'knn', 'xgboost', 'bernb', 'scaling', 'normalization', 'simple_imputation', 'pca', 'poly_features', 'one_hot_encoding', 'rfe_lin_class', 'rfe_non_lin_class']. Composing time limit: 5 min
Model composition started
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actu

[2021-07-20 17:49:16,521] (INFO): Composition started. Parameters tuning: True. Set of candidate models: ['logit', 'lda', 'qda', 'dt', 'rf', 'knn', 'xgboost', 'bernb', 'scaling', 'normalization', 'simple_imputation', 'pca', 'poly_features', 'one_hot_encoding', 'rfe_lin_class', 'rfe_non_lin_class']. Composing time limit: 5 min
[2021-07-20 17:49:24,242] (INFO): Model composition started
[2021-07-20 17:49:27,705] (INFO): Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
[2021-07-20 17:49:31,151] (INFO): Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
[2021-07-20 17:49:34,572] (INFO): Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
[2021-07-20 17:49:41,272] (INFO): Trained operation cache is not actual because you are using new dataset for training. Par

In [181]:
print("AUC")
print("\n5 MINUTES")
print(f"train: {np.array(list(map(lambda x: x[0], answers[5][0]))).mean()}")
print(f"test: {np.array(list(map(lambda x: x[1], answers[5][0]))).mean()}")

print("\n10 MINUTES")
print(f"train: {np.array(list(map(lambda x: x[0], answers[10][0]))).mean()}")
print(f"test: {np.array(list(map(lambda x: x[1], answers[10][0]))).mean()}")

print("\n20 MINUTES")
print(f"train: {np.array(list(map(lambda x: x[0], answers[20][0]))).mean()}")
print(f"test: {np.array(list(map(lambda x: x[1], answers[20][0]))).mean()}")

print("F1")
print("\n5 MINUTES")
print(f"train: {np.array(list(map(lambda x: x[0], answers[5][1]))).mean()}")
print(f"test: {np.array(list(map(lambda x: x[1], answers[5][1]))).mean()}")

print("\n10 MINUTES")
print(f"train: {np.array(list(map(lambda x: x[0], answers[10][1]))).mean()}")
print(f"test: {np.array(list(map(lambda x: x[1], answers[10][1]))).mean()}")

print("\n20 MINUTES")
print(f"train: {np.array(list(map(lambda x: x[0], answers[20][1]))).mean()}")
print(f"test: {np.array(list(map(lambda x: x[1], answers[20][1]))).mean()}")

AUC

5 MINUTES
train: 0.7995216483735391
test: 0.7141597316576087

10 MINUTES
train: 0.8050606503972741
test: 0.7121297554347826

20 MINUTES
train: 0.7735015904571719
test: 0.723378269361413
F1

5 MINUTES
train: 0.05163309033792669
test: 0.026959744177977172

10 MINUTES
train: 0.07567242291166763
test: 0.024947205026959938

20 MINUTES
train: 0.02571195848617474
test: 0.010700132228949126


In [183]:
pd.DataFrame(answers)

Unnamed: 0,5,10,20
0,"[[0.8159289781466806, 0.7060427989130434], [0....","[[0.7784908366408507, 0.7238926630434782], [0....","[[0.7487007510504011, 0.7401664402173913], [0...."
1,"[[0.1340782122905028, 0.046242774566473986], [...","[[0.027565084226646247, 0.01234567901234568], ...","[[0.015105740181268881, 0.01234567901234568], ..."


In [136]:
#task selection, initialisation of the framework
baseline_model = Fedot(problem='classification', learning_time=5, composer_params={'cv_folds': 4})

#fit model without optimisation - single XGBoost node is used 
baseline_model.fit(features=train_data, target=TARGET_NAME)

#evaluate the prediction with test data
oof_pred_fedot = baseline_model.predict_proba(features=train_data)
test_pred_fedot = baseline_model.predict_proba(test_data)

#evaluate quality metric for the test sample
baseline_metrics = baseline_model.get_metrics()
print(baseline_metrics)

logging.info('Check scores...')
logging.info('OOF score: {}'.format(roc_auc_score(train_target.values, oof_pred_fedot)))
logging.info('TEST score: {}'.format(roc_auc_score(test_target.values, test_pred_fedot)))

Composition started. Parameters tuning: True. Set of candidate models: ['logit', 'lda', 'qda', 'dt', 'rf', 'knn', 'xgboost', 'bernb', 'scaling', 'normalization', 'simple_imputation', 'pca', 'poly_features', 'one_hot_encoding', 'rfe_lin_class', 'rfe_non_lin_class']. Composing time limit: 5 min
Model composition started
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
Trained operation cache is not actu

[2021-07-20 17:13:29,511] (INFO): Composition started. Parameters tuning: True. Set of candidate models: ['logit', 'lda', 'qda', 'dt', 'rf', 'knn', 'xgboost', 'bernb', 'scaling', 'normalization', 'simple_imputation', 'pca', 'poly_features', 'one_hot_encoding', 'rfe_lin_class', 'rfe_non_lin_class']. Composing time limit: 5 min
[2021-07-20 17:13:36,648] (INFO): Model composition started
[2021-07-20 17:13:40,158] (INFO): Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
[2021-07-20 17:13:43,447] (INFO): Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
[2021-07-20 17:13:46,533] (INFO): Trained operation cache is not actual because you are using new dataset for training. Parameter use_cache value changed to False
[2021-07-20 17:13:53,018] (INFO): Trained operation cache is not actual because you are using new dataset for training. Par

In [None]:
FEDOT
2 minutes
0.751291918781917
0.709128378239199

10 minutes
1.0
0.689672214673913

5 minutes, cv_folds=4
0.7730193748340394
0.7103872282608695

10 minutes, cv_folds=5
0.7574543245829488
0.7187805706521737

20 minutes, cv_folds=8
0.8105175757104172
0.7088213315217391

5 minutes, cv_folds=4, handle categorical and date preprocessing
0.7711868730838138
0.7255129076086958

10 minutes, cv_folds=4, handle categorical and date preprocessing
0.8072628893255683
0.7239809782608697

5 minutes, cv_folds=4, handle categorical and date preprocessing + importance
0.7961992091722245
0.7064266304347826


LAMA
40 seconds
0.7059763219386357
0.7197197690217392