## 0. Prerequisites

### 0.0. install LightAutoML

In [1]:
# %%capture
# !pip install lightautoml

# QUICK WORKAROUND FOR PROBLEM WITH PANDAS
# !pip install -U pandas

### 0.1. Import libraries
- LightAutoML modules: `TabularUtilizedAutoML` preset for AutoML model creation and the `Task` class to set up what kind of ML problem we solve (binary/multiclass classification or regression)

In [2]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

### 0.2. Constants

Here we setup the constants to use in the kernel:
- `N_THREADS` - number of vCPUs for LightAutoML model creation
- `N_FOLDS` - number of folds in LightAutoML inner CV
- `RANDOM_STATE` - random seed for better reproducibility
- `TEST_SIZE` - holdout data part size 
- `TIMEOUT` - limit in seconds for model to train
- `TARGET_NAME` - target column name in dataset

In [3]:
# Number of vCPUs LightAutoML and Torch are allowed to use
N_THREADS = 12
# Number of folds in the LightAutoML inner cross-validation
N_FOLDS = 5
# Seed shared by numpy, the train/holdout split and the LightAutoML reader
RANDOM_STATE = 42
# Fraction of the prepared training data held out for evaluation
TEST_SIZE = 0.2
TIMEOUT = 900 # equal to 15 minutes
# Name of the target column in the dataset
TARGET_NAME = 'final_price'

### 0.3. Imported models setup

For better reproducibility, fix the numpy random seed and limit the maximum number of threads for Torch (which usually tries to use all the threads on the server):

In [4]:
# Fix the global numpy seed so numpy-based randomness is repeatable
np.random.seed(RANDOM_STATE)
# Cap the number of threads Torch may use
torch.set_num_threads(N_THREADS)

### 0.4. Data loading
Let's check the data we have:

In [5]:
# Directory prefix of the competition CSV files (train, test and sample submission)
INPUT_DIR = './input/sf-dst-predict-car-price/'

In [6]:
# Load the training set; the row_ID column becomes the index
df = pd.read_csv(INPUT_DIR + 'train_data.csv', index_col='row_ID')
print(df.shape)
df.head()

(35000, 14)


Unnamed: 0_level_0,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type,final_price
row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,TOYOTA,Aqua s,Sedan,133000,2014,Automatic,4/5,Right-hand drive,Silver,Black,,0,For Sale,3650.0
1,MERCEDES-BENZ,C 220,Sedan,24500,2010,Manual,4/5,Left wheel,Silver,Black,,0,For Sale,6800.0
2,HYUNDAI,Veloster,Hatchback,31000,2016,Tiptronic,2/3,Left wheel,Silver,Black,KMHTC6AE3GU293912,1,For Sale,6300.0
3,HYUNDAI,Santa FE,Jeep,115459,2015,Automatic,4/5,Left wheel,Blue,Black,,1,For Sale,14488.0
4,TOYOTA,CHR,Jeep,18950,2019,Automatic,4/5,Left wheel,Black,,JTNKHMBX7K1030253,1,For Sale,5000.0


In [7]:
# Load the test set; the row_ID column becomes the index
dt = pd.read_csv(INPUT_DIR + 'test_data.csv', index_col='row_ID')
print(dt.shape)
dt.head()

(10697, 13)


Unnamed: 0_level_0,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type
row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
35000,TOYOTA,Prius,Hatchback,323733,2012,Automatic,4/5,Left wheel,Grey,Black,JTDKN3DU6C5439638,1,For Sale
35001,HYUNDAI,Elantra,Sedan,112000,2013,Tiptronic,4/5,Left wheel,Grey,Black,SURATSHIA,1,For Sale
35002,LEXUS,NX 300,Jeep,16920,2018,Automatic,,Left wheel,Brown,,JTJYARBZ5J2104521,1,For Sale
35003,LEXUS,CT 200h,Hatchback,302742,2012,Automatic,4/5,Left wheel,White,,JTHKD5BH4C2070945,1,For Sale
35004,TOYOTA,RAV 4,Jeep,1800,2002,Manual,4/5,Left wheel,Silver,Black,,0,For Sale


In [8]:
# Load the sample submission; no index column is set, so row_ID stays a regular column
submission = pd.read_csv(INPUT_DIR + 'sample_submission.csv')
print(submission.shape)
submission.head()

(10697, 2)


Unnamed: 0,row_ID,final_price
0,35000,0
1,35001,0
2,35002,0
3,35003,0
4,35004,0


### Step 0.5. Some user feature preparation

Be creative :)

In [9]:
# cast the target to an integer type
df['final_price'] = df['final_price'].astype(np.int32)

In [10]:
# Number of training rows before duplicates are removed
df.shape[0]

35000

In [11]:
# remove fully duplicated rows from the training set, keeping the first occurrence
df = df.drop_duplicates(keep='first')

In [12]:
# Number of training rows left after duplicates are removed
df.shape[0]

26928

In [13]:
# Manually set the target for three rows, then drop the outliers:
# prices above 200 000 and the placeholder price 111111
manual_price_fixes = {2856: 38, 3146: 13, 7123: 2000}
for row_id, price in manual_price_fixes.items():
    df.loc[row_id, 'final_price'] = price
keep_mask = (df['final_price'] <= 200000) & (df['final_price'] != 111111)
df = df[keep_mask]

In [29]:
# Discard the records whose target value is zero
df = df.loc[df['final_price'].ne(0)]
df.shape[0]

26921

In [31]:
# Fill the empty row of the test set (its prediction is set to 0 before submission);
# for the training set such rows are removed instead.
# fillna(0) is applied to every column of the rows whose vehicle_manufacturer is missing.
dt[dt['vehicle_manufacturer'].isnull()] \
    = dt[dt['vehicle_manufacturer'].isnull()].fillna(0)

In [32]:
# drop sale listings priced below 10 and every rental listing
cheap_sale = (df['deal_type'] == 'For Sale') & (df['final_price'] < 10)
is_rent = df['deal_type'] == 'For Rent'
df = df[~cheap_sale & ~is_rent]

In [33]:
# Number of training rows remaining after the price and deal-type filters
df.shape[0]

26272

In [34]:
# flag mileage above 500 000 and template-like values
def one_digit(s):
    """Tell whether the string `s` is one character repeated (e.g. '7' or '1111').

    Args:
        s: string to inspect.

    Returns:
        True if `s` is non-empty and all of its characters are equal,
        False otherwise (including for an empty string).
    """
    # A set collapses repeated characters, so a single distinct character means
    # the string is "templated"; this avoids recursion and string slicing.
    return len(s) > 0 and len(set(s)) == 1

def str_miles(m):
    """Convert a raw mileage value to its base-10 logarithm, zeroing suspicious values.

    Returns 0 when the mileage is 0, is at least 500000, or is above 1000 and
    its string form passes `one_digit`; otherwise returns np.log10(m).
    """
    suspicious = m >= 500000 or m == 0
    if not suspicious and m > 1000:
        suspicious = one_digit(str(m))
    return 0 if suspicious else np.log10(m)

# For both frames: keep the raw mileage as a string feature, then replace the
# numeric column with its cleaned log10 value
for frame in (df, dt):
    frame['str_mileage'] = frame['current_mileage'].apply(str)
    frame['current_mileage'] = frame['current_mileage'].apply(str_miles)

In [35]:
# Count of missing values per column in the training set
df.isna().sum()

vehicle_manufacturer          0
vehicle_model                 5
vehicle_category              0
current_mileage               0
vehicle_year                  0
vehicle_gearbox_type          0
doors_cnt                   369
wheels                        0
vehicle_color               145
vehicle_interior_color     1999
car_vin                   21397
car_leather_interior          0
deal_type                     0
final_price                   0
str_mileage                   0
dtype: int64

In [36]:
# Count of missing values per column in the test set
dt.isna().sum()

vehicle_manufacturer         0
vehicle_model                1
vehicle_category             0
current_mileage              0
vehicle_year                 0
vehicle_gearbox_type         0
doors_cnt                  222
wheels                       0
vehicle_color              133
vehicle_interior_color    2077
car_vin                   7130
car_leather_interior         0
deal_type                    0
str_mileage                  0
dtype: int64

In [37]:
# Using a combination of features, find the cars that only come with 2/3 doors
# and fill those gaps in the training set; start with the rows missing doors_cnt
otn = df.loc[df['doors_cnt'].isna()]
otn

Unnamed: 0_level_0,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type,final_price,str_mileage
row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10,MERCEDES-BENZ,GLE 350,Sedan,4.333931,2018,Automatic,,Left wheel,Black,,4JGDA5JB2JB095271,1,For Sale,20600,21574
72,MERCEDES-BENZ,GLA 250,Sedan,4.170027,2018,Automatic,,Left wheel,Silver,,WDCTG4EB7JJ507512,1,For Sale,1100,14792
84,MERCEDES-BENZ,E 400,Sedan,0.000000,2018,Automatic,,Left wheel,Green,,WDD1K6FB4JF033937,1,For Sale,8300,0
93,HYUNDAI,Elantra,Sedan,4.682813,2019,Automatic,,Left wheel,Grey,,,1,For Sale,900,48174
135,MERCEDES-BENZ,GLA 250,Sedan,4.479820,2018,Automatic,,Left wheel,Black,,WDCTG4EBXJJ417612,1,For Sale,150,30187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34342,TOYOTA,Camry,Sedan,4.279986,2019,Automatic,,Left wheel,Black,,4T1B61HK8KU213062,1,For Sale,11700,19054
34377,SUBARU,Impreza,Sedan,5.434794,2014,Automatic,,Left wheel,White,,JF1GPAA67E8224741,1,For Sale,2000,272141
34401,BMW,X1,Jeep,4.823566,2017,Automatic,,Left wheel,Black,,WBXHU7C39H5H35425,1,For Sale,1750,66614
34451,TOYOTA,CHR,Jeep,4.924155,2018,Automatic,,Left wheel,White,,NMTKHMBX1JR050115,1,For Sale,7900,83976


In [38]:
# Training rows whose doors_cnt equals ' 4/5' (the literal includes a leading space)
ot4 = df[df['doors_cnt'] == ' 4/5']
ot4

Unnamed: 0_level_0,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type,final_price,str_mileage
row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,TOYOTA,Aqua s,Sedan,5.123852,2014,Automatic,4/5,Right-hand drive,Silver,Black,,0,For Sale,3650,133000
1,MERCEDES-BENZ,C 220,Sedan,4.389166,2010,Manual,4/5,Left wheel,Silver,Black,,0,For Sale,6800,24500
3,HYUNDAI,Santa FE,Jeep,5.062428,2015,Automatic,4/5,Left wheel,Blue,Black,,1,For Sale,14488,115459
4,TOYOTA,CHR,Jeep,4.277609,2019,Automatic,4/5,Left wheel,Black,,JTNKHMBX7K1030253,1,For Sale,5000,18950
5,MITSUBISHI,Delica,Jeep,5.173186,2003,Automatic,4/5,Right-hand drive,Silver,Black,,0,For Sale,20,149000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34994,BUICK,Regal,Sedan,5.361118,2011,Automatic,4/5,Left wheel,Silver,Black,,1,For Sale,90,229677
34995,FORD,Fusion,Sedan,4.942400,2013,Automatic,4/5,Left wheel,Black,,3FA6P0K98DR308742,1,For Sale,6000,87579
34996,SSANGYONG,REXTON,Jeep,5.064802,2009,Automatic,4/5,Left wheel,Silver,Black,,1,For Sale,9642,116092
34997,BMW,635,Sedan,4.633468,2015,Tiptronic,4/5,Left wheel,Grey,Grey,,1,For Sale,23500,43000


In [39]:
# Composite key (manufacturer + model + category + gearbox) of every 4/5-door row
key_columns = ['vehicle_manufacturer', 'vehicle_model',
               'vehicle_category', 'vehicle_gearbox_type']
otall4 = ot4[key_columns[0]]
for column in key_columns[1:]:
    otall4 = otall4 + ot4[column]
otall4

row_ID
0           TOYOTAAqua sSedanAutomatic
1        MERCEDES-BENZC 220SedanManual
3         HYUNDAISanta FEJeepAutomatic
4               TOYOTACHRJeepAutomatic
5        MITSUBISHIDelicaJeepAutomatic
                     ...              
34994         BUICKRegalSedanAutomatic
34995         FORDFusionSedanAutomatic
34996     SSANGYONGREXTONJeepAutomatic
34997             BMW635SedanTiptronic
34999       TOYOTAAvalonSedanAutomatic
Length: 24208, dtype: object

In [40]:
# Rows with missing doors_cnt whose composite key also occurs among 4/5-door cars
otn_keys = otn['vehicle_manufacturer'] + otn['vehicle_model'] \
    + otn['vehicle_category'] + otn['vehicle_gearbox_type']
otn4 = otn[otn_keys.isin(otall4)]
otn4

Unnamed: 0_level_0,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type,final_price,str_mileage
row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10,MERCEDES-BENZ,GLE 350,Sedan,4.333931,2018,Automatic,,Left wheel,Black,,4JGDA5JB2JB095271,1,For Sale,20600,21574
72,MERCEDES-BENZ,GLA 250,Sedan,4.170027,2018,Automatic,,Left wheel,Silver,,WDCTG4EB7JJ507512,1,For Sale,1100,14792
93,HYUNDAI,Elantra,Sedan,4.682813,2019,Automatic,,Left wheel,Grey,,,1,For Sale,900,48174
135,MERCEDES-BENZ,GLA 250,Sedan,4.479820,2018,Automatic,,Left wheel,Black,,WDCTG4EBXJJ417612,1,For Sale,150,30187
143,LEXUS,GX 460,Jeep,4.492663,2018,Automatic,,Left wheel,Black,,JTJBM7FX4J5194947,1,For Sale,10100,31093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34139,BMW,X1,Jeep,3.694781,2019,Automatic,,Left wheel,Black,,WBXHU7C59K5L12186,1,For Sale,1850,4952
34342,TOYOTA,Camry,Sedan,4.279986,2019,Automatic,,Left wheel,Black,,4T1B61HK8KU213062,1,For Sale,11700,19054
34377,SUBARU,Impreza,Sedan,5.434794,2014,Automatic,,Left wheel,White,,JF1GPAA67E8224741,1,For Sale,2000,272141
34401,BMW,X1,Jeep,4.823566,2017,Automatic,,Left wheel,Black,,WBXHU7C39H5H35425,1,For Sale,1750,66614


In [41]:
# Training rows whose doors_cnt equals ' 2/3' (the literal includes a leading space)
ot2 = df[df['doors_cnt'] == ' 2/3']
ot2

Unnamed: 0_level_0,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type,final_price,str_mileage
row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2,HYUNDAI,Veloster,Hatchback,4.491362,2016,Tiptronic,2/3,Left wheel,Silver,Black,KMHTC6AE3GU293912,1,For Sale,6300,31000
45,MERCEDES-BENZ,Sprinter 316 CDI,Microbus,5.471292,2011,Manual,2/3,Left wheel,Blue,Black,,0,For Sale,8000,296000
47,MITSUBISHI,Pajero,Jeep,5.190332,2007,Tiptronic,2/3,Right-hand drive,Silver,Black,,0,For Sale,6000,155000
87,NISSAN,Micra,Sedan,5.394452,2004,Manual,2/3,Left wheel,Blue,Black,,0,For Sale,2550,248000
91,MAZDA,Mx-5 Miata,Cabriolet,4.556303,2016,Manual,2/3,Left wheel,White,Black,JM1NDAB74G0110909,0,For Sale,15400,36000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34903,MERCEDES-BENZ,CLK 230,Coupe,5.369216,1999,Manual,2/3,Left wheel,Blue,Black,,0,For Sale,1800,234000
34926,RENAULT,Twingo,Hatchback,5.156113,1996,Manual,2/3,Left wheel,Blue,Grey,,0,For Sale,650,143256
34929,TOYOTA,Vitz,Coupe,5.041393,2002,Automatic,2/3,Right-hand drive,Silver,Grey,,1,For Sale,2200,110000
34936,NISSAN,Micra,Sedan,5.301030,1997,Manual,2/3,Left wheel,Green,Grey,,0,For Sale,1250,200000


In [42]:
# Composite key (manufacturer + model + category + gearbox) of every 2/3-door row
key_columns = ['vehicle_manufacturer', 'vehicle_model',
               'vehicle_category', 'vehicle_gearbox_type']
otall2 = ot2[key_columns[0]]
for column in key_columns[1:]:
    otall2 = otall2 + ot2[column]
otall2

row_ID
2                  HYUNDAIVelosterHatchbackTiptronic
45       MERCEDES-BENZSprinter 316 CDIMicrobusManual
47                     MITSUBISHIPajeroJeepTiptronic
87                            NISSANMicraSedanManual
91                    MAZDAMx-5 MiataCabrioletManual
                            ...                     
34903                MERCEDES-BENZCLK 230CoupeManual
34926                   RENAULTTwingoHatchbackManual
34929                       TOYOTAVitzCoupeAutomatic
34936                         NISSANMicraSedanManual
34952                     NISSANMicraHatchbackManual
Length: 1422, dtype: object

In [43]:
# Rows with missing doors_cnt whose composite key also occurs among 2/3-door cars
otn_keys = otn['vehicle_manufacturer'] + otn['vehicle_model'] \
    + otn['vehicle_category'] + otn['vehicle_gearbox_type']
otn2 = otn[otn_keys.isin(otall2)]
otn2

Unnamed: 0_level_0,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type,final_price,str_mileage
row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
145,TOYOTA,Tundra,Sedan,4.816838,2017,Automatic,,Left wheel,Black,,5TFRM5F16HX119294,1,For Sale,8000,65590
272,LEXUS,RC F,Coupe,4.226471,2017,Automatic,,Left wheel,Black,,JTHHE5BC0H5016219,1,For Sale,2750,16845
678,TOYOTA,Tacoma,Pickup,5.558228,2011,Automatic,,Left wheel,White,,3TMMU4FN8BM028158,1,For Sale,6600,361600
753,CHEVROLET,Camaro,Sedan,5.060498,2015,Automatic,,Left wheel,Yellow,,2G1FK3DJ7F9296909,1,For Sale,3000,114947
861,PORSCHE,911,Coupe,4.564074,2013,Automatic,,Left wheel,Silver,,WP0CB2A94DS156004,1,For Sale,10800,36650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33166,DODGE,Challenger,Coupe,3.711132,2019,Automatic,,Left wheel,Black,,2C3CDZBT3KH500475,1,For Sale,5300,5142
33508,MERCEDES-BENZ,E 400,Cabriolet,0.000000,2016,Automatic,,Left wheel,White,,WDDKK6FF9GF335370,1,For Sale,7000,0
33646,TOYOTA,Camry,Sedan,4.703970,2018,Automatic,,Left wheel,White,,4T1B11HK1JU050901,1,For Sale,225,50579
34342,TOYOTA,Camry,Sedan,4.279986,2019,Automatic,,Left wheel,Black,,4T1B61HK8KU213062,1,For Sale,11700,19054


In [44]:
# Rows with missing doors_cnt that matched a 2/3-door key but no 4/5-door key
ind2 = otn2.index.difference(otn4.index)
# Assign those rows the ' 2/3' category
df.loc[ind2, 'doors_cnt'] = ' 2/3'

In [45]:
# fill the gaps in doors and the two colour features with the most frequent
# (mode) value of the training set, for both train and test
fill_columns = ['doors_cnt', 'vehicle_color', 'vehicle_interior_color']
train_modes = {column: df[column].mode()[0] for column in fill_columns}
df = df.fillna(train_modes)
dt = dt.fillna(train_modes)

In [46]:
# !!!
# drop the VIN column and fill missing models with the manufacturer name
df = df.drop(columns=['car_vin'])
dt = dt.drop(columns=['car_vin'])
df = df.fillna({'vehicle_model': df['vehicle_manufacturer']})
dt = dt.fillna({'vehicle_model': dt['vehicle_manufacturer']})

In [47]:
# build a dictionary of dictionaries that replaces string categories with numbers
def _ordinal_codes(column, aggregate):
    """Map each category of `column` to its rank by an aggregate of final_price.

    The training frame `df` is grouped by `column`, final_price is reduced with
    the groupby method named by `aggregate` ('count' or 'mean'), the result is
    sorted ascending, and categories are numbered 0, 1, 2, ... in that order.
    """
    grouped_target = df.groupby(by=[column])['final_price']
    ordered = getattr(grouped_target, aggregate)().sort_values().index
    return {category: code for code, category in enumerate(ordered)}

# ranked by how often the category occurs
rdeal = _ordinal_codes('deal_type', 'count')
rwheel = _ordinal_codes('wheels', 'count')

# ranked by the mean price of the category
rcat = _ordinal_codes('vehicle_category', 'mean')
rgear = _ordinal_codes('vehicle_gearbox_type', 'mean')
rdoor = _ordinal_codes('doors_cnt', 'mean')
rvcol = _ordinal_codes('vehicle_color', 'mean')
ricol = _ordinal_codes('vehicle_interior_color', 'mean')
rman = _ordinal_codes('vehicle_manufacturer', 'mean')
rmod = _ordinal_codes('vehicle_model', 'mean')

scale_rdic = {
    'vehicle_manufacturer': rman,
    'vehicle_model': rmod,
    'vehicle_category': rcat,
    'vehicle_gearbox_type': rgear,
    'doors_cnt': rdoor,
    'wheels': rwheel,
    'vehicle_color': rvcol,
    'vehicle_interior_color': ricol,
    'deal_type': rdeal,
}
# encoded copies of the train and test frames
dfn = df.replace(scale_rdic)
dft = dt.replace(scale_rdic)

In [48]:
# replace the test manufacturers that were not encoded with the mode of the
# encoded training manufacturers, then cast the column to int
not_encoded = pd.to_numeric(dft['vehicle_manufacturer'], errors='coerce').isnull()
dft.loc[not_encoded, 'vehicle_manufacturer'] = dfn['vehicle_manufacturer'].mode()[0]
dft['vehicle_manufacturer'] = dft['vehicle_manufacturer'].astype(int)

In [49]:
from scipy import stats

In [50]:
# for every encoded manufacturer, compute the mode of its encoded model values
replmod = dfn.groupby(
    by=['vehicle_manufacturer'], 
    as_index=False)['vehicle_model'].agg(lambda x: stats.mode(x, keepdims=True).mode[0])
# unwrap the value when the aggregation produced a numpy array instead of a scalar
replmod['vehicle_model'] = replmod['vehicle_model'].apply(
    lambda x: x[0] if isinstance(x, np.ndarray) else x)

In [51]:
# attach models from the lookup table wherever they are still strings
def get_model(man, mod):
    """Return the model code for a row, falling back to the manufacturer's mode.

    Args:
        man: encoded manufacturer of the row.
        mod: model value of the row; a string means it was not encoded.

    Returns:
        `mod` unchanged when it is not a string; otherwise the scalar
        `vehicle_model` value stored in `replmod` for manufacturer `man`.

    Raises:
        KeyError: if `mod` is a string and `replmod` has no row for `man`.
    """
    if not isinstance(mod, str):
        return mod
    candidates = replmod.loc[replmod['vehicle_manufacturer'] == man, 'vehicle_model']
    if candidates.empty:
        raise KeyError(f'no fallback model for manufacturer code {man!r}')
    # Return a scalar rather than a one-element Series so the result can be
    # cast with astype(int) without relying on implicit Series -> int conversion
    return candidates.iloc[0]

# Run get_model over every test row (manufacturer, model) and cast the result to int
dft['vehicle_model'] = dft.apply(lambda x: 
        get_model(x.vehicle_manufacturer, x.vehicle_model), axis=1).astype(int)

In [52]:
# drop the rows whose production year is earlier than 1947
df = df.loc[df['vehicle_year'].ge(1947)]

In [53]:
# replace the production year with the vehicle age relative to 2022
# (not idempotent: re-running this cell would transform the column again)
df['vehicle_year'] = 2022 - df['vehicle_year']
dt['vehicle_year'] = 2022 - dt['vehicle_year']

In [54]:
# derive features from the model name: its first, second and third
# whitespace-separated tokens
def _token_or_blank(position):
    """Build a function returning token number `position` of str(value), or '' if absent."""
    def extract(value):
        tokens = str(value).split()
        return tokens[position] if len(tokens) > position else ''
    return extract

for frame in (df, dt):
    frame['v_model'] = frame['vehicle_model'].apply(lambda value: str(value).split()[0])
    frame['v_model0'] = frame['vehicle_model'].apply(_token_or_blank(1))
    frame['v_model1'] = frame['vehicle_model'].apply(_token_or_blank(2))

In [55]:
# !!! удаляем несущественные признаки
# df.drop(['wheels', 'doors_cnt'], axis=1, inplace=True)

In [56]:
# Shape of the training frame after feature preparation
df.shape

(26268, 17)

In [57]:
# Missing values per column in the training frame after feature preparation
df.isna().sum()

vehicle_manufacturer      0
vehicle_model             0
vehicle_category          0
current_mileage           0
vehicle_year              0
vehicle_gearbox_type      0
doors_cnt                 0
wheels                    0
vehicle_color             0
vehicle_interior_color    0
car_leather_interior      0
deal_type                 0
final_price               0
str_mileage               0
v_model                   0
v_model0                  0
v_model1                  0
dtype: int64

In [58]:
# Missing values per column in the test frame after feature preparation
dt.isna().sum()

vehicle_manufacturer      0
vehicle_model             0
vehicle_category          0
current_mileage           0
vehicle_year              0
vehicle_gearbox_type      0
doors_cnt                 0
wheels                    0
vehicle_color             0
vehicle_interior_color    0
car_leather_interior      0
deal_type                 0
str_mileage               0
v_model                   0
v_model0                  0
v_model1                  0
dtype: int64

In [59]:
# Column dtypes and non-null counts of the training frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26268 entries, 0 to 34999
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   vehicle_manufacturer    26268 non-null  object 
 1   vehicle_model           26268 non-null  object 
 2   vehicle_category        26268 non-null  object 
 3   current_mileage         26268 non-null  float64
 4   vehicle_year            26268 non-null  int64  
 5   vehicle_gearbox_type    26268 non-null  object 
 6   doors_cnt               26268 non-null  object 
 7   wheels                  26268 non-null  object 
 8   vehicle_color           26268 non-null  object 
 9   vehicle_interior_color  26268 non-null  object 
 10  car_leather_interior    26268 non-null  int64  
 11  deal_type               26268 non-null  object 
 12  final_price             26268 non-null  int32  
 13  str_mileage             26268 non-null  object 
 14  v_model                 26268 non-null

In [60]:
# Column dtypes and non-null counts of the test frame
dt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10697 entries, 35000 to 45696
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   vehicle_manufacturer    10697 non-null  object 
 1   vehicle_model           10697 non-null  object 
 2   vehicle_category        10697 non-null  object 
 3   current_mileage         10697 non-null  float64
 4   vehicle_year            10697 non-null  int64  
 5   vehicle_gearbox_type    10697 non-null  object 
 6   doors_cnt               10697 non-null  object 
 7   wheels                  10697 non-null  object 
 8   vehicle_color           10697 non-null  object 
 9   vehicle_interior_color  10697 non-null  object 
 10  car_leather_interior    10697 non-null  int64  
 11  deal_type               10697 non-null  object 
 12  str_mileage             10697 non-null  object 
 13  v_model                 10697 non-null  object 
 14  v_model0                10697 non-

In [61]:
# switch back to the variable names used by the rest of the notebook
train_data = df
test_data = dt

dt.head()

Unnamed: 0_level_0,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_leather_interior,deal_type,str_mileage,v_model,v_model0,v_model1
row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
35000,TOYOTA,Prius,Hatchback,5.510187,10,Automatic,4/5,Left wheel,Grey,Black,1,For Sale,323733,Prius,,
35001,HYUNDAI,Elantra,Sedan,5.049218,9,Tiptronic,4/5,Left wheel,Grey,Black,1,For Sale,112000,Elantra,,
35002,LEXUS,NX 300,Jeep,4.2284,4,Automatic,4/5,Left wheel,Brown,Black,1,For Sale,16920,NX,300,
35003,LEXUS,CT 200h,Hatchback,5.481073,10,Automatic,4/5,Left wheel,White,Black,1,For Sale,302742,CT,200h,
35004,TOYOTA,RAV 4,Jeep,3.255273,20,Manual,4/5,Left wheel,Silver,Black,0,For Sale,1800,RAV,4,


### 0.6. Data splitting for train-holdout
As we have only one file with target values, we can split it into 80%-20% for holdout usage:

In [62]:
# Hold out a TEST_SIZE fraction of the prepared training frame for evaluation
tr_data, te_data = train_test_split(
    train_data, 
    test_size=TEST_SIZE, 
    random_state=RANDOM_STATE
)

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

tr_data.head()

Data splitted. Parts sizes: tr_data = (21014, 17), te_data = (5254, 17)


Unnamed: 0_level_0,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_leather_interior,deal_type,final_price,str_mileage,v_model,v_model0,v_model1
row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
13711,TOYOTA,Aqua S,Sedan,5.025306,10,Tiptronic,4/5,Right-hand drive,Black,Black,0,For Sale,3700,106000,Aqua,S,
2433,TOYOTA,Prius,Sedan,5.29446,15,Variator,4/5,Left wheel,White,Beige,1,For Sale,2000,196997,Prius,,
21736,TOYOTA,Aqua,Hatchback,5.189041,7,Automatic,4/5,Left wheel,Silver,Black,1,For Sale,2865,154540,Aqua,,
30115,SSANGYONG,REXTON,Jeep,4.991226,9,Automatic,4/5,Left wheel,Black,Black,1,For Sale,12832,98000,REXTON,,
17282,DODGE,Caliber,Hatchback,5.3024,15,Automatic,4/5,Left wheel,Grey,Black,1,For Sale,275,200632,Caliber,,


# 1. Task definition

### 1.1. Task type

On the cell below we create Task object - the class to setup what task LightAutoML model should solve with specific loss and metric if necessary (more info can be found [here](https://lightautoml.readthedocs.io/en/latest/pages/modules/generated/lightautoml.tasks.base.Task.html#lightautoml.tasks.base.Task) in our documentation):

In [63]:
# Regression task that uses MAE both as the loss and as the metric
task = Task('reg', loss = 'mae', metric = 'mae')

### 1.2. Feature roles setup

To solve the task, we need to setup columns roles. The **only role you must setup is target role**, everything else (drop, numeric, categorical, group, weights etc.) is up to user - LightAutoML models have automatic columns typization inside:

In [64]:
# Column roles for LightAutoML: the target column plus columns to drop.
# NOTE(review): row_ID was loaded as the index (index_col='row_ID'), so it may not
# exist as a column - confirm that the 'drop' entry has any effect.
roles = {
    'target': TARGET_NAME,
    'drop': ['row_ID']
}

### 1.3. LightAutoML model creation - TabularUtilizedAutoML preset

In [65]:
# AutoML preset limited to TIMEOUT seconds and N_THREADS cores; the data reader
# is configured with N_FOLDS-fold CV and the RANDOM_STATE seed
automl = TabularUtilizedAutoML(
    task = task, 
    timeout = TIMEOUT,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}
)

# 2. AutoML training

To run autoML training use fit_predict method:
- `train_data` - Dataset to train.
- `roles` - Roles dict.
- `verbose` - Controls the verbosity: the higher, the more messages.
        <1  : messages are not displayed;
        >=1 : the computation process for layers is displayed;
        >=2 : the information about folds processing is also displayed;
        >=3 : the hyperparameters optimization process is also displayed;
        >=4 : the training process for every algorithm is displayed;

Note: out-of-fold prediction is calculated during training and returned from the fit_predict method

In [66]:
%%time 
oof_pred = automl.fit_predict(train_data, roles = roles, verbose = 1)

[18:43:01] Start automl [1mutilizator[0m with listed constraints:
[18:43:01] - time: 900.00 seconds
[18:43:01] - CPU: 12 cores
[18:43:01] - memory: 16 GB

[18:43:01] [1mIf one preset completes earlier, next preset configuration will be started[0m

[18:43:01] Start 0 automl preset configuration:
[18:43:01] [1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[18:43:01] Stdout logging level is INFO.
[18:43:01] Task: reg

[18:43:01] Start automl preset with listed constraints:
[18:43:01] - time: 900.00 seconds
[18:43:01] - CPU: 12 cores
[18:43:01] - memory: 16 GB

[18:43:01] [1mTrain data shape: (26268, 17)[0m

[18:43:10] Layer [1m1[0m train process start. Time left 890.74 secs


  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))


[18:43:11] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...


  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))


[18:44:04] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m-2782.083248788606[0m
[18:44:04] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[18:44:04] Time left 836.90 secs

[18:44:05] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...


  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))


[18:45:08] Time limit exceeded after calculating fold 2

[18:45:08] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m-1996.9351875567486[0m
[18:45:08] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[18:45:08] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ... Time budget is 1.00 secs
[18:45:08] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[18:46:39] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[18:46:39] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...
[18:48:51] Time limit exceeded after calculating fold 1

[18:48:51] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m finished. score = [1m-1957.4119627371367[0m
[18:48:51] [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m fitting and predicting completed
[18:48:51] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
[18:49:24] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoos

In [None]:
# Print the textual description of the fitted AutoML model
print(automl.create_model_str_desc())

# 3. Prediction on holdout and model evaluation

We now have the trained model called `automl` and it's time to see what MAE score it can receive on train and holdout data:

In [None]:
%%time

# Predict on the holdout part and show the predictions with their shape
te_pred = automl.predict(te_data)
print(f'Prediction for te_data:\n{te_pred}\nShape = {te_pred.shape}')

In [None]:
# Target values of the full prepared training frame
train_data[TARGET_NAME].values

In [None]:
# First column of the out-of-fold predictions returned by fit_predict
oof_pred.data[:, 0]

In [None]:
# NOTE(review): the disabled line below pairs te_data targets with the out-of-fold
# predictions; out-of-fold predictions belong to the frame passed to fit_predict,
# so the targets of that same frame should be used if this line is re-enabled.
# print(f'TRAIN out-of-fold score: {mean_absolute_error(te_data[TARGET_NAME].values, oof_pred.data[:, 0])}')
# MAE of the model on the holdout part
print(f'HOLDOUT score: {mean_absolute_error(te_data[TARGET_NAME].values, te_pred.data[:, 0])}')

# 4. Feature importances calculation 

For feature importances calculation we have 2 different methods in LightAutoML:
- Fast (`fast`) - this method uses feature importances from the feature selector LGBM model inside LightAutoML. It works extremely fast and almost always (almost, because of situations when feature selection is turned off or the selector was removed from the final models together with all GBM models). There is no need to use new labelled data.
- Accurate (`accurate`) - this method calculates *features permutation importances* for the whole LightAutoML model based on the **new labelled data**. It always works but can take a lot of time to finish (depending on the model structure, new labelled dataset size etc.).

In [None]:
%%time

# Fast feature importances calculation
fast_fi = automl.get_feature_scores('fast')
# Bar chart of importance per feature
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 20), fontsize = 36, grid = True)

In [None]:
%%time

# Accurate feature importances calculation (Permutation importances) -  can take long time to calculate on bigger datasets
# The labelled holdout part te_data is used as the evaluation data
accurate_fi = automl.get_feature_scores('accurate', te_data, silent = False)

In [None]:
# Bar chart of the permutation importance per feature
accurate_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 20), fontsize = 36, grid = True)

# 5. Predict for test dataset

We are also ready to predict on the competition test dataset and create the submission file:

In [None]:
# Predict on the competition test set; the message names test_data because
# that is the frame being scored here (not the te_data holdout)
test_pred = automl.predict(test_data)
print(f'Prediction for test_data:\n{test_pred}\nShape = {test_pred.shape}')

In [None]:
# Write the first column of the test predictions into the submission target column
submission[TARGET_NAME] = test_pred.data[:, 0]

In [None]:
# put 0 into the empty row (row_ID 37254)
submission.loc[submission['row_ID'] == 37254, 'final_price'] = 0

In [None]:
# Every 'For Rent' listing of the test set gets the fixed price 19
rent_ids = dt[dt['deal_type'] == 'For Rent'].index
is_rent_row = submission['row_ID'].isin(rent_ids)
submission.loc[is_rent_row, 'final_price'] = 19

In [None]:
# Show the 'For Rent' rows whose predicted price is negative
submission[(submission['final_price'] < 0)
           &(submission['row_ID'].isin(dt[dt['deal_type'] == 'For Rent'].index))]

In [None]:
# Negative predictions for 'For Rent' listings are replaced with 19
negative_price = submission['final_price'] < 0
rent_ids = dt[dt['deal_type'] == 'For Rent'].index
is_rent_row = submission['row_ID'].isin(rent_ids)
submission.loc[negative_price & is_rent_row, 'final_price'] = 19

In [None]:
# Show the rows that are not 'For Rent' and whose predicted price is negative
submission[(submission['final_price'] < 0)
           &(submission['row_ID'].isin(dt[dt['deal_type'] != 'For Rent'].index))]

In [None]:
# Negative predictions for listings that are not 'For Rent' are replaced with 150
negative_price = submission['final_price'] < 0
non_rent_ids = dt[dt['deal_type'] != 'For Rent'].index
is_non_rent_row = submission['row_ID'].isin(non_rent_ids)
submission.loc[negative_price & is_non_rent_row, 'final_price'] = 150  # [150, 150, 150, 375]

In [None]:
# Fall back to a constant price wherever the prediction is missing. Only the
# final_price column is written: assigning through a bare boolean row mask would
# overwrite every column of those rows, including row_ID.
submission.loc[submission['final_price'].isnull(), 'final_price'] = 70

In [None]:
# Display the final submission frame
submission

In [None]:
# Write the submission to CSV without the DataFrame index
submission.to_csv('subm1516.csv', index = False)