In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install -U lightautoml

In [4]:
import logging
import os
import time
import requests
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
from lightautoml.automl.blend import WeightedBlender

[2021-07-05 17:10:14,562] (INFO): 'pattern' package not found; tag filters are not available for English


In [3]:
N_THREADS = 8 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TARGET_NAME = 'Price' # Target column name

In [5]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)
p = Profiler()
p.change_deco_settings({'enabled': True})

In [6]:
DATASET_DIR = '/content/drive/MyDrive/'
DATASET_NAME = 'train.csv'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)

In [7]:
%%time

if not os.path.exists(DATASET_FULLNAME):
    os.makedirs(DATASET_DIR, exist_ok=True)

    dataset = requests.get(DATASET_URL).text
    with open(DATASET_FULLNAME, 'w') as output:
        output.write(dataset)

CPU times: user 1.16 ms, sys: 26 µs, total: 1.18 ms
Wall time: 4.66 ms


In [8]:

%%time

data = pd.read_csv(DATASET_FULLNAME)
data.head()

CPU times: user 39 ms, sys: 19.4 ms, total: 58.4 ms
Wall time: 315 ms


In [9]:
%%time

train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)
logging.info('Data splitted. Parts sizes: train_data = {}, test_data = {}'
              .format(train_data.shape, test_data.shape))

ValueError: ignored

In [13]:
%%time

task = Task('reg')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

CPU times: user 5.03 ms, sys: 0 ns, total: 5.03 ms
Wall time: 5.25 ms


In [14]:
%%time

model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'seed': 42, 'num_threads': N_THREADS}
)
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


CPU times: user 5.14 ms, sys: 525 µs, total: 5.67 ms
Wall time: 10.9 ms


In [15]:
%%time 

pipe = LGBSimpleFeatures()

params_tuner1 = OptunaTuner(n_trials=20, timeout=30) # stop after 20 iterations or after 30 seconds 
model1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': 1, 'num_threads': N_THREADS}
)
model2 = BoostLGBM(
    default_params={'learning_rate': 0.025, 'num_leaves': 64, 'seed': 2, 'num_threads': N_THREADS}
)

pipeline_lvl1 = MLPipeline([
    (model1, params_tuner1),
    model2
], pre_selection=selector, features_pipeline=pipe, post_selection=None)

CPU times: user 1.5 ms, sys: 0 ns, total: 1.5 ms
Wall time: 1.51 ms


In [17]:
%%time

pipe1 = LGBSimpleFeatures()

model = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'max_bin': 1024, 'seed': 3, 'num_threads': N_THREADS},
    freeze_defaults=True
)

pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)

CPU times: user 2.42 ms, sys: 70 µs, total: 2.49 ms
Wall time: 6.29 ms


In [18]:
%%time 

automl = AutoML(reader, [
    [pipeline_lvl1],
    [pipeline_lvl2],
], skip_conn=False)

CPU times: user 1.5 ms, sys: 0 ns, total: 1.5 ms
Wall time: 1.5 ms


In [20]:
%%time 

oof_pred = automl.fit_predict(data, roles={'target': TARGET_NAME})
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

Train data shape: (19237, 18)


[2021-07-05 17:16:45,753] (INFO): NumExpr defaulting to 2 threads.


Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 9999999985.052814 secs
Start fitting LightGBM ...

===== Start working with fold 0 for LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.03352e+09
Early stopping, best iteration is:
[6]	valid's l2: 2.79388e+08
LightGBM fitting and predicting completed
Optuna may run 6299999924.63633 secs


[2021-07-05 17:16:49,442] (INFO): A new study created in memory with name: no-name-c8e46388-0b59-4f0b-b9c9-3017027a5c26


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.02111e+09
Early stopping, best iteration is:
[6]	valid's l2: 2.76213e+08
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-07-05 17:16:55,642] (INFO): Trial 0 finished with value: -276213249.4083571 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 244}. Best is trial 0 with value: -276213249.4083571.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.16717e+09
Early stopping, best iteration is:
[6]	valid's l2: 2.67517e+08
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-07-05 17:17:00,171] (INFO): Trial 1 finished with value: -267517008.13860056 and parameters: {'feature_fraction': 0.8659969709057025, 'num_leaves': 159}. Best is trial 1 with value: -267517008.13860056.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.19233e+09
Early stopping, best iteration is:
[10]	valid's l2: 2.77683e+08
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-07-05 17:17:01,897] (INFO): Trial 2 finished with value: -277682818.11794716 and parameters: {'feature_fraction': 0.5780093202212182, 'num_leaves': 53}. Best is trial 1 with value: -267517008.13860056.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.19775e+09
Early stopping, best iteration is:
[10]	valid's l2: 2.83361e+08
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-07-05 17:17:07,585] (INFO): Trial 3 finished with value: -283361319.05608135 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 223}. Best is trial 1 with value: -267517008.13860056.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 9.98774e+08
Early stopping, best iteration is:
[6]	valid's l2: 2.71069e+08
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-07-05 17:17:12,674] (INFO): Trial 4 finished with value: -271069142.8539897 and parameters: {'feature_fraction': 0.8005575058716043, 'num_leaves': 185}. Best is trial 1 with value: -267517008.13860056.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.16412e+09
Early stopping, best iteration is:
[10]	valid's l2: 2.83155e+08
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-07-05 17:17:18,945] (INFO): Trial 5 finished with value: -283154711.8320421 and parameters: {'feature_fraction': 0.5102922471479012, 'num_leaves': 248}. Best is trial 1 with value: -267517008.13860056.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.14475e+09
Early stopping, best iteration is:
[6]	valid's l2: 2.6202e+08
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-07-05 17:17:21,188] (INFO): Trial 6 finished with value: -262019646.9870119 and parameters: {'feature_fraction': 0.9162213204002109, 'num_leaves': 66}. Best is trial 6 with value: -262019646.9870119.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.14475e+09
Early stopping, best iteration is:
[6]	valid's l2: 2.6202e+08

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.56507e+09
Early stopping, best iteration is:
[4]	valid's l2: 3.88503e+08

===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 1.80083e+11
[200]	valid's l2: 1.80079e+11
Early stopping, best iteration is:
[177]	valid's l2: 1.80074e+11

===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 4.57496e+08
Early stopping, best iteration is:
[37]	valid's l2: 2.1154e+08

===== Start wo

[2021-07-05 17:18:07,477] (INFO): oof_pred:
array([[14421.67  ],
       [19000.137 ],
       [18672.191 ],
       ...,
       [20077.592 ],
       [14652.22  ],
       [13730.8545]], dtype=float32)
Shape = (19237, 1)


CPU times: user 1min 15s, sys: 55.1 s, total: 2min 10s
Wall time: 1min 35s


In [21]:

logging.info('Feature importances of selector:\n{}'
              .format(selector.get_features_score()))
logging.info('=' * 70)

logging.info('Feature importances of top level algorithm:\n{}'
              .format(automl.levels[-1][0].ml_algos[0].get_features_score()))
logging.info('=' * 70)

logging.info('Feature importances of lowest level algorithm - model 0:\n{}'
              .format(automl.levels[0][0].ml_algos[0].get_features_score()))
logging.info('=' * 70)

logging.info('Feature importances of lowest level algorithm - model 1:\n{}'
              .format(automl.levels[0][0].ml_algos[1].get_features_score()))
logging.info('=' * 70)

[2021-07-05 17:19:24,031] (INFO): Feature importances of selector:
ID                  3.117552e+13
Airbags             2.993246e+13
Model               2.306856e+13
Prod. year          2.090044e+13
Manufacturer        1.825860e+13
Category            3.163031e+12
Doors               1.287250e+12
Engine volume       1.033921e+12
Cylinders           7.403429e+11
Fuel type           6.837601e+11
Levy                6.792232e+11
Gear box type       5.075618e+11
Drive wheels        2.877067e+11
Leather interior    2.121734e+11
Color               1.081874e+11
Mileage             7.502334e+10
Wheel               2.906446e+10
dtype: float64
[2021-07-05 17:19:24,039] (INFO): Feature importances of top level algorithm:
Lvl_0_Pipe_0_Mod_1_LightGBM_prediction_0    3.216884e+14
Lvl_0_Pipe_0_Mod_0_LightGBM_prediction_0    3.206134e+14
dtype: float64
[2021-07-05 17:19:24,048] (INFO): Feature importances of lowest level algorithm - model 0:
ord__ID                  6.232138e+13
Airbags              

In [30]:
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')
test_pred = automl.predict(test_df)

result = pd.DataFrame(test_pred.data,columns=['Price'])

result.head()

Unnamed: 0,Price
0,27456.203125
1,18047.730469
2,13680.90332
3,15476.598633
4,19969.103516


In [31]:
result.to_csv('result9.csv',index=False)