In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [None]:
# Load the dataset; 'ford.csv' is a relative path, so it is resolved against the notebook's working directory.
df = pd.read_csv('ford.csv')

In [None]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0


In [None]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17965 entries, 0 to 17964
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17965 non-null  object 
 1   year          17965 non-null  int64  
 2   price         17965 non-null  int64  
 3   transmission  17965 non-null  object 
 4   mileage       17965 non-null  int64  
 5   fuelType      17965 non-null  object 
 6   tax           17965 non-null  int64  
 7   mpg           17965 non-null  float64
 8   engineSize    17965 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.2+ MB


In [None]:
def missing_value(x):
  """Return the fraction of missing (NaN/None) entries in ``x``.

  ``x`` is any pandas object exposing ``isna()``: for a DataFrame the result
  is a Series with one fraction per column, for a Series it is a single number.
  """
  na_mask = x.isna()
  # The mean of a boolean mask is the share of True values, i.e. the share of missing entries.
  return na_mask.mean()

# Per-column share of missing values for the whole dataset.
missing_value(df)

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

В нашем датасете имеется 17965 записей, пропусков нет

In [None]:
# Hold out 40% of the rows with a fixed seed; this hold-out is split into val/test in a later cell.
train, test = train_test_split(df,test_size=0.4,random_state=22)

Для обучения модели на основе CatBoost необходимо также наличие валидационной выборки. Создадим её путём разбиения тестовой выборки на две равные части

In [None]:
# Split the hold-out in half: one part for validation, the other for the final test.
# NOTE: this rebinds `test`, so the cell is not idempotent. Re-running it on its own
# splits the already-halved `test` again; re-run the previous split cell first.
val, test = train_test_split(test, test_size=0.5, random_state=22)

In [None]:
# Sanity check: report each split's size as a fraction of the full dataset.
print(f'Размер выборки train: {len(train) / len(df)}') 
print(f'Размер выборки test: {len(test) / len(df)}') 
print(f'Размер выборки val: {len(val) / len(df)}') 

Размер выборки train: 0.6
Размер выборки test: 0.2
Размер выборки val: 0.2


Создание модели с помощью CatBoostRegressor требует указания категориальных переменных, имеющихся в датасете. Для удобства создадим список таких признаков отдельно

In [None]:
# Names of the feature columns fed to the model (every dataset column except the target).
X = ['model', 'year', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize']

# Target column name, kept as a one-element list, so frame[y] selects a single-column DataFrame.
y = ['price']

# Columns CatBoost should treat as categorical (the object-dtype columns of the dataset).
cat_features = ['model', 'transmission', 'fuelType']

Для оценки точности построения модели будем использовать MAPE (mean absolute percentage error)

In [None]:
# Baseline CatBoost settings; learning_rate is not set here.
params = {
          'cat_features': cat_features,
          'eval_metric': 'MAPE',  # metric reported in the training log
          'random_seed':22,
          'verbose':100}  # print a log line every 100 iterations

In [None]:
# Baseline regressor built from the settings dict above.
model = CatBoostRegressor(**params)

In [None]:
# Fit on the train split; the val split is passed as eval_set, so the log also reports MAPE on it.
model.fit(train[X],train[y],eval_set=(val[X],val[y]))

Learning rate set to 0.073962
0:	learn: 0.3706158	test: 0.3453981	best: 0.3453981 (0)	total: 14.4ms	remaining: 14.4s
100:	learn: 0.0804583	test: 0.0781916	best: 0.0781916 (100)	total: 993ms	remaining: 8.84s
200:	learn: 0.0747701	test: 0.0735637	best: 0.0735637 (200)	total: 1.92s	remaining: 7.62s
300:	learn: 0.0713295	test: 0.0714240	best: 0.0714240 (300)	total: 2.86s	remaining: 6.65s
400:	learn: 0.0691345	test: 0.0700753	best: 0.0700743 (399)	total: 3.87s	remaining: 5.78s
500:	learn: 0.0674136	test: 0.0693248	best: 0.0693248 (500)	total: 4.92s	remaining: 4.89s
600:	learn: 0.0660123	test: 0.0685131	best: 0.0685131 (600)	total: 5.93s	remaining: 3.94s
700:	learn: 0.0651711	test: 0.0681597	best: 0.0681597 (700)	total: 6.98s	remaining: 2.98s
800:	learn: 0.0643557	test: 0.0677454	best: 0.0677396 (797)	total: 7.99s	remaining: 1.99s
900:	learn: 0.0636447	test: 0.0674690	best: 0.0674594 (898)	total: 8.95s	remaining: 983ms
999:	learn: 0.0628771	test: 0.0671218	best: 0.0671211 (997)	total: 9.93s	

<catboost.core.CatBoostRegressor at 0x7f91bb2be910>

Изменим скорость обучения (learning rate), чтобы попытаться уменьшить ошибку:

In [None]:
# Same settings as the baseline plus an explicit learning rate
# (the baseline run logged an automatically chosen rate of 0.073962).
params = {
          'cat_features': cat_features,
          'eval_metric': 'MAPE',
          'random_seed':22,
          'verbose':100,
          'learning_rate': 0.1}

In [None]:
# Rebinds `model`: from here on it is the regressor with learning_rate=0.1.
model = CatBoostRegressor(**params)

In [None]:
# Fit on the train split, again passing the val split as eval_set.
model.fit(train[X],train[y],eval_set=(val[X],val[y]))

0:	learn: 0.3631221	test: 0.3384537	best: 0.3384537 (0)	total: 12.3ms	remaining: 12.3s
100:	learn: 0.0774207	test: 0.0761450	best: 0.0761450 (100)	total: 957ms	remaining: 8.51s
200:	learn: 0.0718949	test: 0.0720437	best: 0.0720260 (199)	total: 1.92s	remaining: 7.62s
300:	learn: 0.0688527	test: 0.0702875	best: 0.0702875 (300)	total: 2.93s	remaining: 6.8s
400:	learn: 0.0669383	test: 0.0692637	best: 0.0692637 (400)	total: 3.87s	remaining: 5.78s
500:	learn: 0.0654111	test: 0.0685624	best: 0.0685624 (500)	total: 4.83s	remaining: 4.82s
600:	learn: 0.0641835	test: 0.0681729	best: 0.0681652 (595)	total: 5.93s	remaining: 3.94s
700:	learn: 0.0631663	test: 0.0678430	best: 0.0678430 (700)	total: 6.97s	remaining: 2.97s
800:	learn: 0.0621088	test: 0.0675035	best: 0.0675035 (800)	total: 8.04s	remaining: 2s
900:	learn: 0.0614379	test: 0.0673258	best: 0.0673195 (883)	total: 9.03s	remaining: 992ms
999:	learn: 0.0607562	test: 0.0671500	best: 0.0671500 (999)	total: 9.99s	remaining: 0us

bestTest = 0.06714

<catboost.core.CatBoostRegressor at 0x7f91bad03950>

Изменение скорости обучения практически никак не повлияло на ошибку на валидационной выборке. Теперь вычислим ошибку предсказаний на тестовой выборке:

In [None]:
# Attach the predictions with assign() instead of writing a column in place.
# `test` comes from train_test_split, so an in-place column write can raise pandas'
# SettingWithCopyWarning. assign() returns a new frame, and the cell stays idempotent on re-run.
# The new column is not listed in X, so it is never used as a feature.
test = test.assign(price_pred=model.predict(test[X]))

In [None]:
def error(y_true,y_pred):
  """Print two error metrics for predictions, one value per line.

  The first line is the mean absolute error (in the units of the target),
  the second is the mean absolute percentage error as a fraction (not multiplied by 100).
  Nothing is returned.
  """
  for metric in (mean_absolute_error, mean_absolute_percentage_error):
    print(metric(y_true, y_pred))

# Test-set MAE and MAPE for the model trained on the train split only.
error(test['price'],test['price_pred'])

808.8604429624969
0.06944714231960757


Попробуем улучшить результаты, обучив модель на всех данных (train + val). В параметрах указываем число итераций, соответствующее лучшей итерации предыдущей модели.

In [None]:
# Stack the train and val rows into one training frame; test rows are not included.
train_full = pd.concat([train,val])

In [None]:
# Settings for the model trained on train + val. There is no eval_set in that fit,
# so the number of trees is fixed from the previous model's best iteration on val.
params = {
          'cat_features': cat_features,
          'eval_metric': 'MAPE',
          'random_seed':22,
          'verbose':100,
          'learning_rate': 0.1,
          'iterations': model.best_iteration_ + 1}  # +1: the log counts iterations from 0

In [None]:
# Regressor to be trained on train + val with the fixed iteration count.
model_full = CatBoostRegressor(**params)

In [None]:
# No eval_set here (val is part of the training data), so the log shows only the learn metric.
model_full.fit(train_full[X], train_full[y])

0:	learn: 0.3571673	total: 12.6ms	remaining: 12.5s
100:	learn: 0.0764948	total: 1.06s	remaining: 9.46s
200:	learn: 0.0711291	total: 2.13s	remaining: 8.47s
300:	learn: 0.0682542	total: 3.22s	remaining: 7.49s
400:	learn: 0.0665775	total: 4.29s	remaining: 6.41s
500:	learn: 0.0653870	total: 5.38s	remaining: 5.36s
600:	learn: 0.0643657	total: 6.49s	remaining: 4.31s
700:	learn: 0.0634775	total: 7.64s	remaining: 3.26s
800:	learn: 0.0627155	total: 8.8s	remaining: 2.19s
900:	learn: 0.0620276	total: 9.91s	remaining: 1.09s
999:	learn: 0.0611643	total: 11s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f91b816a9d0>

In [None]:
# Attach the predictions with assign() instead of writing a column in place.
# `test` comes from train_test_split, so an in-place column write can raise pandas'
# SettingWithCopyWarning. assign() returns a new frame, and the cell stays idempotent on re-run.
test = test.assign(price_pred_full=model_full.predict(test[X]))

In [None]:
# Test-set MAE and MAPE for the model trained on train + val.
error(test['price'],test['price_pred_full'])

794.8811274976246
0.0680132414056946


Изменим функцию потерь на MAE (по умолчанию установлена RMSE)

In [None]:
# Settings for the MAE-loss model. 'iterations' is not set here,
# so CatBoost's default applies (the training log runs to iteration 999).
params = {
          'cat_features': cat_features,
          'eval_metric': 'MAPE',  # still the metric shown in the log
          'random_seed':22,
          'verbose':100,
          'learning_rate': 0.1,
          'loss_function': 'MAE'}  # objective optimized during training

In [None]:
# Regressor that optimizes MAE instead of the default loss.
model_mae = CatBoostRegressor(**params)

In [None]:
# Trained on train + val, like model_full, so the two differ only in the loss function.
model_mae.fit(train_full[X], train_full[y])

0:	learn: 0.3217110	total: 15ms	remaining: 15s
100:	learn: 0.0729539	total: 1.3s	remaining: 11.6s
200:	learn: 0.0672376	total: 2.58s	remaining: 10.2s
300:	learn: 0.0649762	total: 3.81s	remaining: 8.84s
400:	learn: 0.0636559	total: 5.08s	remaining: 7.58s
500:	learn: 0.0627583	total: 6.3s	remaining: 6.28s
600:	learn: 0.0620030	total: 7.51s	remaining: 4.99s
700:	learn: 0.0613583	total: 8.74s	remaining: 3.73s
800:	learn: 0.0609354	total: 9.97s	remaining: 2.48s
900:	learn: 0.0604218	total: 11.2s	remaining: 1.23s
999:	learn: 0.0600654	total: 12.3s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f91bad03d10>

In [None]:
# Attach the predictions with assign() instead of writing a column in place.
# `test` comes from train_test_split, so an in-place column write can raise pandas'
# SettingWithCopyWarning. assign() returns a new frame, and the cell stays idempotent on re-run.
test = test.assign(price_pred_full_MAE=model_mae.predict(test[X]))

In [None]:
# Test-set MAE and MAPE for the MAE-loss model trained on train + val.
error(test['price'],test['price_pred_full_MAE'])

788.1333002293322
0.06734412438868867


Вывод: увеличение обучающей выборки (train + val) и подбор функции потерь позволяют уменьшить ошибку на тестовых данных: MAPE снизилась с 6,94 % до 6,80 % и затем до 6,73 %.