<font color='steelblue' size=5><b>Вспомогательные материалы</b></font>

In [10]:
# помним про PEP-8
# импорты из стандартной библиотеки
import warnings

# импорты сторонних библиотек
import numpy as np
import pandas as pd

# импорты модулей текущего проекта
# длина строки до 78 символов
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (
    GridSearchCV, 
    RandomizedSearchCV,
    train_test_split
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler
)

# настройки
warnings.filterwarnings("ignore")

# константы заглавными буквами
RANDOM_STATE = 42

In [11]:
def get_data_info(data):
    display(data.sample(5))
    display(data.info())
    display(data.describe(include='all'))

In [12]:
df = pd.read_csv('autos.csv')
get_data_info(df)

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
174447,2016-03-12 21:41:29,2890,wagon,2004,manual,75,caddy,150000,3,petrol,volkswagen,no,2016-03-12 00:00:00,0,99610,2016-03-23 14:45:00
36382,2016-03-12 18:25:24,1090,small,1998,manual,75,micra,150000,12,petrol,nissan,no,2016-03-12 00:00:00,0,61191,2016-03-14 09:29:44
303009,2016-03-12 11:50:41,1325,,1985,manual,120,3er,150000,12,,bmw,,2016-03-12 00:00:00,0,16515,2016-03-28 05:46:57
213711,2016-03-14 16:43:47,2700,sedan,2001,manual,122,c_klasse,150000,1,petrol,mercedes_benz,no,2016-03-14 00:00:00,0,45884,2016-03-23 16:48:49
219925,2016-03-29 23:45:45,6000,convertible,2005,auto,82,roadster,125000,3,petrol,smart,no,2016-03-29 00:00:00,0,30419,2016-04-06 10:46:44


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        354369 non-null  object
 1   Price              354369 non-null  int64 
 2   VehicleType        316879 non-null  object
 3   RegistrationYear   354369 non-null  int64 
 4   Gearbox            334536 non-null  object
 5   Power              354369 non-null  int64 
 6   Model              334664 non-null  object
 7   Kilometer          354369 non-null  int64 
 8   RegistrationMonth  354369 non-null  int64 
 9   FuelType           321474 non-null  object
 10  Brand              354369 non-null  object
 11  Repaired           283215 non-null  object
 12  DateCreated        354369 non-null  object
 13  NumberOfPictures   354369 non-null  int64 
 14  PostalCode         354369 non-null  int64 
 15  LastSeen           354369 non-null  object
dtypes: int64(7), object(

None

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
count,354369,354369.0,316879,354369.0,334536,354369.0,334664,354369.0,354369.0,321474,354369,283215,354369,354369.0,354369.0,354369
unique,271174,,8,,2,,250,,,7,40,2,109,,,179150
top,2016-03-24 14:49:47,,sedan,,manual,,golf,,,petrol,volkswagen,no,2016-04-03 00:00:00,,,2016-04-06 13:45:54
freq,7,,91457,,268251,,29232,,,216352,77013,247161,13719,,,17
mean,,4416.656776,,2004.234448,,110.094337,,128211.172535,5.714645,,,,,0.0,50508.689087,
std,,4514.158514,,90.227958,,189.850405,,37905.34153,3.726421,,,,,0.0,25783.096248,
min,,0.0,,1000.0,,0.0,,5000.0,0.0,,,,,0.0,1067.0,
25%,,1050.0,,1999.0,,69.0,,125000.0,3.0,,,,,0.0,30165.0,
50%,,2700.0,,2003.0,,105.0,,150000.0,6.0,,,,,0.0,49413.0,
75%,,6400.0,,2008.0,,143.0,,150000.0,9.0,,,,,0.0,71083.0,


In [13]:
df.columns = [col.lower() for col in df.columns]

<font color='steelblue' size=3><b>Предположим, что датасет уже изучен. Далее для начала необходимо разделить датасет на тренировочную и тестовую выборки.</b></font>

In [14]:
features = df.drop(['price'], axis=1)
target = df['price']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=RANDOM_STATE
)


In [15]:
get_data_info(X_train)

Unnamed: 0,datecrawled,vehicletype,registrationyear,gearbox,power,model,kilometer,registrationmonth,fueltype,brand,repaired,datecreated,numberofpictures,postalcode,lastseen
46211,2016-03-07 13:47:14,bus,2004,manual,133,other,150000,8,lpg,mitsubishi,no,2016-03-07 00:00:00,0,33689,2016-03-14 21:15:46
280861,2016-03-12 17:48:22,sedan,2002,,125,vectra,150000,1,petrol,opel,,2016-03-12 00:00:00,0,75365,2016-04-06 20:18:56
267888,2016-03-12 10:48:45,small,2002,manual,103,stilo,150000,11,petrol,fiat,yes,2016-03-12 00:00:00,0,81669,2016-03-12 10:48:45
254844,2016-03-10 22:46:44,convertible,2007,manual,160,mx_reihe,150000,9,petrol,mazda,no,2016-03-10 00:00:00,0,27367,2016-04-07 04:15:27
116546,2016-03-31 15:38:01,convertible,2006,manual,0,slk,50000,11,petrol,mercedes_benz,,2016-03-31 00:00:00,0,84518,2016-04-07 12:15:55


<class 'pandas.core.frame.DataFrame'>
Int64Index: 265776 entries, 236946 to 121958
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   datecrawled        265776 non-null  object
 1   vehicletype        237668 non-null  object
 2   registrationyear   265776 non-null  int64 
 3   gearbox            250825 non-null  object
 4   power              265776 non-null  int64 
 5   model              250970 non-null  object
 6   kilometer          265776 non-null  int64 
 7   registrationmonth  265776 non-null  int64 
 8   fueltype           240988 non-null  object
 9   brand              265776 non-null  object
 10  repaired           212327 non-null  object
 11  datecreated        265776 non-null  object
 12  numberofpictures   265776 non-null  int64 
 13  postalcode         265776 non-null  int64 
 14  lastseen           265776 non-null  object
dtypes: int64(6), object(9)
memory usage: 32.4+ MB


None

Unnamed: 0,datecrawled,vehicletype,registrationyear,gearbox,power,model,kilometer,registrationmonth,fueltype,brand,repaired,datecreated,numberofpictures,postalcode,lastseen
count,265776,237668,265776.0,250825,265776.0,250970,265776.0,265776.0,240988,265776,212327,265776,265776.0,265776.0,265776
unique,218153,8,,2,,249,,,7,40,2,102,,,145279
top,2016-03-19 21:49:56,sedan,,manual,,golf,,,petrol,volkswagen,no,2016-04-03 00:00:00,,,2016-04-07 09:44:27
freq,6,68668,,201243,,21899,,,162208,57776,185416,10334,,,15
mean,,,2004.295079,,109.582099,,128225.686292,5.711095,,,,,0.0,50506.427499,
std,,,92.255413,,174.758595,,37891.094797,3.728278,,,,,0.0,25794.67892,
min,,,1000.0,,0.0,,5000.0,0.0,,,,,0.0,1067.0,
25%,,,1999.0,,68.0,,125000.0,3.0,,,,,0.0,30163.0,
50%,,,2003.0,,105.0,,150000.0,6.0,,,,,0.0,49377.0,
75%,,,2008.0,,141.0,,150000.0,9.0,,,,,0.0,71088.0,


In [16]:
get_data_info(X_test)

Unnamed: 0,datecrawled,vehicletype,registrationyear,gearbox,power,model,kilometer,registrationmonth,fueltype,brand,repaired,datecreated,numberofpictures,postalcode,lastseen
16600,2016-03-17 12:51:17,small,1998,manual,50,fiesta,80000,12,petrol,ford,no,2016-03-17 00:00:00,0,26624,2016-03-21 17:15:19
164776,2016-03-21 19:39:40,,2016,manual,116,3er,150000,5,petrol,bmw,no,2016-03-21 00:00:00,0,32049,2016-03-30 07:47:06
341156,2016-03-25 14:52:32,small,2003,manual,75,corsa,150000,5,petrol,opel,no,2016-03-25 00:00:00,0,34123,2016-03-25 14:52:32
116881,2016-03-29 12:50:49,sedan,2004,manual,170,5er,150000,10,petrol,bmw,no,2016-03-29 00:00:00,0,90542,2016-04-05 22:16:56
199107,2016-03-06 16:51:58,,2017,,60,ka,150000,7,,ford,no,2016-03-06 00:00:00,0,66640,2016-03-07 10:45:53


<class 'pandas.core.frame.DataFrame'>
Int64Index: 88593 entries, 294049 to 58749
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   datecrawled        88593 non-null  object
 1   vehicletype        79211 non-null  object
 2   registrationyear   88593 non-null  int64 
 3   gearbox            83711 non-null  object
 4   power              88593 non-null  int64 
 5   model              83694 non-null  object
 6   kilometer          88593 non-null  int64 
 7   registrationmonth  88593 non-null  int64 
 8   fueltype           80486 non-null  object
 9   brand              88593 non-null  object
 10  repaired           70888 non-null  object
 11  datecreated        88593 non-null  object
 12  numberofpictures   88593 non-null  int64 
 13  postalcode         88593 non-null  int64 
 14  lastseen           88593 non-null  object
dtypes: int64(6), object(9)
memory usage: 10.8+ MB


None

Unnamed: 0,datecrawled,vehicletype,registrationyear,gearbox,power,model,kilometer,registrationmonth,fueltype,brand,repaired,datecreated,numberofpictures,postalcode,lastseen
count,88593,79211,88593.0,83711,88593.0,83694,88593.0,88593.0,80486,88593,70888,88593,88593.0,88593.0,88593
unique,83181,8,,2,,250,,,7,40,2,79,,,62997
top,2016-03-16 15:49:20,sedan,,manual,,golf,,,petrol,volkswagen,no,2016-04-03 00:00:00,,,2016-04-06 09:44:22
freq,4,22789,,67008,,7333,,,54144,19237,61745,3385,,,9
mean,,,2004.052555,,111.631032,,128167.631754,5.725294,,,,,0.0,50515.473773,
std,,,83.852286,,229.233945,,37948.22986,3.720846,,,,,0.0,25748.461621,
min,,,1000.0,,0.0,,5000.0,0.0,,,,,0.0,1067.0,
25%,,,1999.0,,69.0,,125000.0,3.0,,,,,0.0,30165.0,
50%,,,2003.0,,105.0,,150000.0,6.0,,,,,0.0,49492.0,
75%,,,2008.0,,143.0,,150000.0,9.0,,,,,0.0,71069.0,


<font color='steelblue' size=3><b>Предположим, что выборки не требуют никакой обработки аномалий, выбросов и т. д., логично перейти к кодированию и масштабированию.<br>
Допустим, мы решили, что одной из моделей будет Ridge, для котрой будем использовать OHE кодирование категориальных признаков и StandardScaler для численных признаков. Второй моделью будет RandomForestRegressor, для которой будет использоваться порядковое кодирование признака model, имеющего большое количество уникальных значений, и OHE кодирование остальных категориальных признаков.</b></font>

In [17]:
#категориальные признаки для OHE Ridge
ohe_features_ridge = X_train.select_dtypes(include='object').columns.to_list()
print(ohe_features_ridge)

#категориальные признаки для OHE RandomForestRegressor
ohe_features_rf = ohe_features_ridge.copy()
ohe_features_rf.remove('model')
ohe_features_rf

['datecrawled', 'vehicletype', 'gearbox', 'model', 'fueltype', 'brand', 'repaired', 'datecreated', 'lastseen']


['datecrawled',
 'vehicletype',
 'gearbox',
 'fueltype',
 'brand',
 'repaired',
 'datecreated',
 'lastseen']

In [18]:
num_features

NameError: name 'num_features' is not defined

In [None]:
#численные признаки
#обратите внимание, что 'repaired' — категориальный бинарный признак.
num_features = X_train.select_dtypes(exclude='object').columns.to_list()
#num_features.remove('repaired')
num_features

['registrationyear',
 'power',
 'kilometer',
 'registrationmonth',
 'numberofpictures',
 'postalcode']

<font color='steelblue' size=3><b>
Начнём с линейной модели.<br><br>
Имеется нюанс: в процессе изучения данных было обнаружено, что и для тренировочной, и для тестовой выборок у признака 'model' 82 уникальных значения, но в тренировочной выборке имеется значения 'calibra', которого нет в тестовой выборке, а в тестовой выборке имеется значение 'up', которого нет в тренировочной. Это совершенно обычная для данных ситуация, которую можно легко пропустить, ведь количество признаков абсолютно одинаковое.</b></font>

In [None]:
models_train = set(X_train['model'].unique())
models_test = set(X_test['model'].unique())
num_models_train = len(models_train)
num_models_test = len(models_test)
print(f'''
Количество уникальных значений признка "model" 
в обеих выборках одинаковое: {num_models_train == num_models_test}
''')
print(f'''
Уникальные значения признка "model" 
в обеих выборках одинаковые: {models_train == models_test}
''')
print(f'''
Только в тренировочной выборке есть значения: {models_train - models_test}
''')
print(f'''
Только в тестовой выборке есть значения: {models_test - models_train}
''')


Количество уникальных значений признка "model" 
в обеих выборках одинаковое: False


Уникальные значения признка "model" 
в обеих выборках одинаковые: False


Только в тренировочной выборке есть значения: set()


Только в тестовой выборке есть значения: {'serie_1'}



<font color='steelblue' size=3><b>
Если говорить о прямом кодировании, использование get_dummies в такой ситуации пройдёт без падающего кода, вы просто преобразуете выборки, отдадите их модели, количество признаков одинаковое, а значит код нигде не упадёт. Но в кодировании будет ошибка, ведь в тренировочной и тестовой выборках уникальные значения признака не совпадают.<br><br>
Дело в том, что get_dummies не запоминает информацию о признаках, он скорее подходит для анализа. Конечно можно найти эту особенность, после кодирования удалить из тестовой выборки признак 'up', добавить в неё признак 'calibra' c 0 значениями, тогда всё будет корректно. Предполагается, что модель будет использоваться множество раз, для каждой новой тестовой выборки придётся так же смотреть на каждый признак и делать эту предобработку. Можно даже написать функцию, которая, автоматизирует эти действия. В этом случае можно будет поздравить автора, он изобрёл велосипед 😅 Все эти нюансы уже учтены в OneHotEncoder, давайте посмотрим на него поближе:</b></font>

In [None]:
X_train_ridge = X_train.copy()
X_test_ridge = X_test.copy()

In [None]:
# drop='first' удаляет первый признак из закодированных:
# таким образом обходим dummy-ловушку
# задаём handle_unknown='ignore':
# игнорируется ранее невстречающиеся значения признака (при transform)
encoder_ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False)

# обучаем энкодер на заданных категориальных признаках тренировочной выборки
encoder_ohe.fit(X_train_ridge[ohe_features_ridge])

# добавляем закодированные признаки в X_train_ohe
# encoder_ohe.get_feature_names_out() позволяет получить названия колонок
X_train_ridge[
    encoder_ohe.get_feature_names_out()
] = encoder_ohe.transform(X_train_ridge[ohe_features_ridge])

# удаляем незакодированные категориальные признаки (изначальные колонки)
X_train_ridge = X_train_ridge.drop(ohe_features_ridge, axis=1)

# создаём скелер
scaler = StandardScaler()

# обучаем его на численных признаках тренировочной выборки, трансформируем её же
X_train_ridge[num_features] = scaler.fit_transform(X_train_ridge[num_features])

# смотрим на результат
X_train_ridge.head()

NameError: name 'OneHotEncoder' is not defined

<font color='steelblue' size=3><b>
Если решите остановиться на этом способе и Ridge окажется лучшей моделью, будем трансформировать тестовую выборку так:</b></font>

In [None]:
# энкодером, который обучен на ТРЕНИРОВОЧНОЙ ВЫБОРКЕ, кодируем тестовую
X_test_ridge[
    encoder_ohe.get_feature_names_out()
] = encoder_ohe.transform(X_test_ridge[ohe_features_ridge])

X_test_ridge = X_test_ridge.drop(ohe_features_ridge, axis=1)

# скелером, который обучен на ТРЕНИРОВОЧНОЙ ВЫБОРКЕ, масштабируем тестовую
X_test_ridge[num_features] = scaler.transform(
    X_test_ridge[num_features]
)

# смотрим на результат
X_test_ridge.head()

Unnamed: 0,registration_year,kilometer,power,repaired,brand_bmw,brand_ford,brand_mercedes_benz,brand_opel,brand_volkswagen,model_3er,...,vehicle_type_convertible,vehicle_type_coupe,vehicle_type_other,vehicle_type_sedan,vehicle_type_small,vehicle_type_suv,vehicle_type_wagon,gearbox_manual,fuel_type_other,fuel_type_petrol
33553,-0.492958,0.519842,-0.370403,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
9427,0.061052,0.507666,1.607992,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
199,-1.323973,0.885306,1.333759,0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12447,-0.492958,0.857669,-0.958045,0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
39489,-0.492958,0.302326,-1.173513,0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [None]:
# получилось одинаковое количество признаков
X_train_ridge.shape, X_test_ridge.shape

((37500, 100), (12500, 100))

In [None]:
# получились одинаковые признаки в тренировочной и тестовой выборках
(X_train_ridge.columns != X_test_ridge.columns).sum()

0

<font color='steelblue' size=3><b>
Если более элегантное решения для кодирования и масштабирования, попробуем make_column_transformer:</b></font>

In [None]:
X_train_ridge = X_train.copy()
X_test_ridge = X_test.copy()

In [None]:
# признак repaired уже бинарный, его не будем кодировать/масштабировать
# добавляем remainder='passthrough, чтобы он не пропал 
col_transformer_ridge = make_column_transformer(
    (
        OneHotEncoder(drop='first', handle_unknown='ignore'),
        ohe_features_ridge
    ),
    (
        StandardScaler(), 
        num_features
    ),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# всё готово в пару строк кода
X_train_ridge = pd.DataFrame.sparse.from_spmatrix(
    col_transformer_ridge.fit_transform(X_train_ridge),
    columns=col_transformer_ridge.get_feature_names_out()
)

# смотрим на результат
X_train_ridge.head()

Unnamed: 0,brand_bmw,brand_ford,brand_mercedes_benz,brand_opel,brand_volkswagen,model_3er,model_5er,model_6er,model_7er,model_a1,...,vehicle_type_small,vehicle_type_suv,vehicle_type_wagon,gearbox_manual,fuel_type_other,fuel_type_petrol,registration_year,kilometer,power,repaired
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-1.600978,0.56939,0.001771,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.446077,0.778466,0.393532,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.338057,0.371868,-0.879692,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.892067,-1.627904,-0.135346,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.892067,0.954276,1.451288,0.0


<font color='steelblue' size=3><b>
Если решите остановиться на этом способе и Ridge окажется лучшей моделью, будем трансформировать тестовую выборку так:</b></font>

In [None]:
# трансформируем тестовую выборку
X_test_ridge = pd.DataFrame.sparse.from_spmatrix(
    col_transformer_ridge.transform(X_test_ridge),
    columns=col_transformer_ridge.get_feature_names_out()
)

# смотрим на результат
X_test_ridge.head()

Unnamed: 0,brand_bmw,brand_ford,brand_mercedes_benz,brand_opel,brand_volkswagen,model_3er,model_5er,model_6er,model_7er,model_a1,...,vehicle_type_small,vehicle_type_suv,vehicle_type_wagon,gearbox_manual,fuel_type_other,fuel_type_petrol,registration_year,kilometer,power,repaired
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,-0.492958,0.519842,-0.370403,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.061052,0.507666,1.607992,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,-1.323973,0.885306,1.333759,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.492958,0.857669,-0.958045,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,-0.492958,0.302326,-1.173513,0.0


<font color='steelblue' size=3><b>
Теперь подбираем гиперпараметры с помощью GridSearchCV, который имеет встроенную кросс-валидацию для модели Ridge. Лучший вариант — использовать пайплайны, это предотвратит возможность утечки на кросс-валидации. О них в самом конце, а пока рассморим вариант без него:</b></font>

In [None]:
# random_state не перебирается, задаём его прямо в модели
model_ridge = Ridge(random_state=RANDOM_STATE)

# словарь с гиперпараметрами и значениями, которые хотим перебрать
param_grid_ridge = {
    'alpha': np.arange(0, 0.21, 0.01),
}

gs_ridge = GridSearchCV(
    model_ridge, 
    param_grid=param_grid_ridge, 
    scoring='neg_root_mean_squared_error', 
    n_jobs=-1
)

gs_ridge.fit(X_train_ridge, y_train)

# лучшее значение RMSE на кросс-валидации
print(f'best_score: {gs_ridge.best_score_ * -1}')

# лучшие гиперпараметры
print(f'best_params: {gs_ridge.best_params_}')

best_score: 2260.558448452929
best_params: {'alpha': 0.01}


<font color='steelblue' size=3><b>
Ещё более элегантным решением будет использовать пайплайн вместе с кросс-валидацией. Это позволит кодировать и масштабировать отдельно каждую тренировочную и валидационную выборки внутри кросс-валидации (без «подглядывания»):</b></font>

In [None]:
X_train_ridge = X_train.copy()

In [None]:
pipeline_ridge = make_pipeline(col_transformer_ridge, model_ridge)

param_grid_ridge = {
    'ridge__alpha': np.arange(0, 0.21, 0.01),
}

gs_ridge_pl = GridSearchCV(
    pipeline_ridge, 
    param_grid=param_grid_ridge, 
    scoring='neg_root_mean_squared_error', 
    n_jobs=-1
)

gs_ridge_pl.fit(X_train_ridge, y_train)

gs_ridge_best_score = gs_ridge_pl.best_score_ * -1
gs_ridge_best_params = gs_ridge_pl.best_params_

# лучшее значение RMSE на кросс-валидации
print(f'best_score: {gs_ridge_best_score}')
# лучшие гиперпараметры
print(f'best_params: {gs_ridge_best_params}')

best_score: 2260.559847084961
best_params: {'ridge__alpha': 0.02}


<font color='steelblue' size=3><b>
Перейдём к RandomForestRegressor:</b></font>

In [None]:
X_train_rf = X_train.copy()
X_test_rf = X_test.copy()

In [None]:
col_transformer_rf= make_column_transformer(
    (
        OneHotEncoder(drop='first', handle_unknown='ignore'), 
        ohe_features_rf
    ),
    (
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 
        ['model']
    ),
    (
        StandardScaler(), 
        num_features
    ),
    remainder='passthrough'
)

model_rf = RandomForestRegressor(random_state=RANDOM_STATE)

pipline_rf = make_pipeline(col_transformer_rf, model_rf)

param_grid_rf = {
    'randomforestregressor__n_estimators': range(50, 251, 50),
    'randomforestregressor__max_depth': range(2, 15),
    'randomforestregressor__min_samples_split': (2, 3, 4),
    'randomforestregressor__min_samples_leaf': (1, 2, 3, 4)
}

# получается достаточно много комбинаций гиперпараметров при переборе
# будем использовать RandomizedSearchCV, он работает на много быстрее
gs_rf = RandomizedSearchCV(
    pipline_rf, 
    param_distributions=param_grid_rf, 
    scoring='neg_root_mean_squared_error', 
    n_jobs=-1, 
    random_state=RANDOM_STATE
)

gs_rf.fit(X_train_rf, y_train)

gs_rf_best_score = gs_rf.best_score_ * -1
gs_rf_best_params = gs_rf.best_params_
print(f'best_score: {gs_rf_best_score}')
print(f'best_params: {gs_rf_best_params}')

best_score: 1947.8392253783488
best_params: {'randomforestregressor__n_estimators': 250, 'randomforestregressor__min_samples_split': 4, 'randomforestregressor__min_samples_leaf': 3, 'randomforestregressor__max_depth': 12}


In [None]:
result = pd.DataFrame(
    [gs_ridge_best_score, gs_rf_best_score], 
    index=['Ridge', 'RandomForestRegressor'], 
    columns=['RMSE']
)
result

NameError: name 'gs_ridge_best_score' is not defined

<font color='steelblue' size=3><b>
Лучшей моделью оказалась RandomForestRegressor. Проверим её качесво на тестовой выборке:</b></font>

In [None]:
rf_prediction = gs_rf.predict(X_test_rf)
metric_test = mean_squared_error(y_test, rf_prediction, squared=False)
metric_test

1938.67987460236

<img src="https://www.storemypic.com/images/2016/11/22/keep-calm-and-good-luck-5f5ac.jpg">