In [1]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
import torch
import multiprocessing as mp

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.report.report_deco import ReportDeco

In [2]:
N_THREADS = mp.cpu_count()
N_FOLDS = 4
RANDOM_STATE = 42
TIMEOUT = 4400
TEST_SIZE = 0.2
TARGET_NAME = 'per_square_meter_price'
PATH = 'data'

In [3]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [4]:
train_data = pd.read_csv(os.path.join(PATH, 'train.csv.gz'))
test_data = pd.read_csv(os.path.join(PATH, 'test.csv.gz'))
submission = pd.read_csv(os.path.join(PATH, 'test_submission.csv'))

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279792 entries, 0 to 279791
Data columns (total 77 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   city                                 279792 non-null  object 
 1   floor                                103555 non-null  object 
 2   id                                   279792 non-null  object 
 3   lat                                  279792 non-null  float64
 4   lng                                  279792 non-null  float64
 5   osm_amenity_points_in_0.001          279792 non-null  int64  
 6   osm_amenity_points_in_0.005          279792 non-null  int64  
 7   osm_amenity_points_in_0.0075         279792 non-null  int64  
 8   osm_amenity_points_in_0.01           279792 non-null  int64  
 9   osm_building_points_in_0.001         279792 non-null  int64  
 10  osm_building_points_in_0.005         279792 non-null  int64  
 11  osm_building_

In [6]:
train = train_data[train_data.price_type == 0]
test = train_data[train_data.price_type == 1]

In [8]:
from raif_hack.metrics import deviation_metric, metrics_stat

In [9]:
task = Task('reg', metric=deviation_metric, greater_is_better=False)
roles = {'target': TARGET_NAME,
         'category': ['city', 'floor', 'region', 'street'],
         'drop': ['id', 'realty_type'],
         DatetimeRole(base_date=True, seasonality=('y', 'm', 'wd'), base_feats=False): ['date']
         }

In [10]:
automl = TabularAutoML(task=task,
                       memory_limit=8,
                       timeout=TIMEOUT,
                       cpu_limit=N_THREADS,
                       verbose=2,
                       general_params={'use_algos': [['lgb', 'cb'],
                                                     ['lgb_tuned', 'cb_tuned'],
                                                     ]},
                       reader_params={'cv': N_FOLDS,
                                      'random_state': RANDOM_STATE,
                                      'n_jobs': N_THREADS},
                       selection_params={  #'importance_type': 'permutation'
                           'cutoff': 1}
                       )

In [11]:
oof_pred = automl.fit_predict(train, roles=roles)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Start automl preset with listed constraints:
- time: 4400 seconds
- cpus: 8 cores
- memory: 8 gb

Train data shape: (275299, 77)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 4371.743892908096 secs
Start fitting Selector_LightGBM ...

===== Start working with fold 0 for Selector_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 7.77856e+09	valid's Opt metric: 3.95213
[200]	valid's l2: 6.88868e+09	valid's Opt metric: 3.73402
[300]	valid's l2: 6.49123e+09	valid's Opt metric: 3.64368
[400]	valid's l2: 6.26438e+09	valid's Opt metric: 3.58542
[500]	valid's l2: 6.07756e+09	valid's Opt metric: 3.53854
[600]	valid's l2: 5.9441e+09	valid's Opt metric: 3.49657
[700]	valid's l2: 5.8398e+09	valid's Opt metric: 3.46246
[800]	valid's l2: 5.74855e+09	valid's Opt metric: 3.43301
[900]	valid's l2: 5.67521e+09	valid's Opt metric: 3.40982
[1000]	valid's l2: 5.61215e+09	valid's Opt metric: 3.38495
[1100]	valid

Time limit exceeded after calculating fold 0


Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_0_Mod_1_CatBoost ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_1_CatBoost =====

0:	learn: 167776.0994196	test: 167402.6537983	best: 167402.6537983 (0)	total: 91.7ms	remaining: 3m 3s
100:	learn: 91590.8429952	test: 88366.9488897	best: 88366.9488897 (100)	total: 1.71s	remaining: 32.1s
200:	learn: 87916.4741644	test: 85859.7414031	best: 85859.7414031 (200)	total: 3.13s	remaining: 28s
300:	learn: 85498.9995406	test: 84435.3167305	best: 84435.3167305 (300)	total: 4.61s	remaining: 26s
400:	learn: 83713.5122343	test: 83652.2332853	best: 83652.2332853 (400)	total: 6.04s	remaining: 24.1s
500:	learn: 82279.9387585	test: 82964.1371826	best: 82964.1371826 (500)	total: 7.54s	remaining: 22.5s
600:	learn: 81022.2046339	test: 82464.8903399	best: 82462.5496876 (597)	total: 8.98s	remaining: 20.9s
700:	learn: 79957.4259357	test: 82041.1854370	best: 82041.1854370 (700)	total: 10.4s	remaining: 19.3s
800:	lear

Time limit exceeded in one of the tasks. AutoML will blend level 1 models.


Blending: Optimization starts with equal weights and score -3.3217908600981874
Blending, iter 0: score = -2.9566811292546316, weights = [1. 0.]
Blending, iter 1: score = -2.9566811292546316, weights = [1. 0.]
No score update. Terminated

Automl preset training completed in 1840.68 seconds.


In [12]:
pred = automl.predict(test)

In [13]:
metrics_stat(test[TARGET_NAME].values, pred.data[:, 0])

{'mape': 0.4098644804180296,
 'mdape': 0.2847763024498874,
 'rmse': 83554.44754987463,
 'r2': 0.12664516736389086,
 'raif_metric': 2.856257747879485}

In [24]:
deviation = ((pred.data[:, 0] - test[TARGET_NAME])/pred.data[:, 0]).median()

In [25]:
metrics_stat(test[TARGET_NAME].values, pred.data[:, 0]*(1 + deviation))

{'mape': 0.4791313475995533,
 'mdape': 0.32408673800074383,
 'rmse': 86741.99729685609,
 'r2': 0.05873823116139676,
 'raif_metric': 3.440474562195135}

In [19]:
prediction = automl.predict(test_data)

In [20]:
test_data[TARGET_NAME] = prediction.data[:,0]

In [21]:
test_data[['id',TARGET_NAME]].to_csv('sample_sabmission.csv',index=False)