# Introducing:

## Context:
The cleaned data set contains information of price, transmission, mileage, fuel type, road tax, miles per gallon (mpg), and engine size. I've removed duplicate listings and cleaned the columns, but have included a notebook showing the process and the original data for anyone who wants to check/improve my work.

## Task:
*Predict cars price*

## Used libraries:
- pandas
- numpy
- matplotlib
- sklearn
- xgboost
- catboost

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('dark_background')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb
from catboost import CatBoostRegressor

# Data preparation

## Read data

In [2]:
df = pd.read_csv('/home/antonius/Projects/DS_Projects/Data-analysis-with-data-science/1_uk_used_cars_price_prediction/bmw.csv')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0


## Format data

In [3]:
len(df)

10781

In [4]:
df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize'],
      dtype='object')

### Skips, Nans

In [5]:
df.isna().mean().sort_values(ascending=False)

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

In [6]:
df.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
dtype: object

In [7]:
df.select_dtypes(include='object')

Unnamed: 0,model,transmission,fuelType
0,5 Series,Automatic,Diesel
1,6 Series,Automatic,Petrol
2,5 Series,Automatic,Diesel
3,1 Series,Automatic,Diesel
4,7 Series,Automatic,Diesel
...,...,...,...
10776,X3,Automatic,Diesel
10777,5 Series,Automatic,Diesel
10778,3 Series,Manual,Petrol
10779,1 Series,Automatic,Diesel


*cat_features: model, transmission, fuelType*

In [8]:
df['model'].unique()

array([' 5 Series', ' 6 Series', ' 1 Series', ' 7 Series', ' 2 Series',
       ' 4 Series', ' X3', ' 3 Series', ' X5', ' X4', ' i3', ' X1', ' M4',
       ' X2', ' X6', ' 8 Series', ' Z4', ' X7', ' M5', ' i8', ' M2',
       ' M3', ' M6', ' Z3'], dtype=object)

In [9]:
df['transmission'].unique()

array(['Automatic', 'Manual', 'Semi-Auto'], dtype=object)

In [10]:
df['fuelType'].unique()

array(['Diesel', 'Petrol', 'Other', 'Hybrid', 'Electric'], dtype=object)

# Func for Checking model

In [11]:
def error(y_true,y_pred):
  print(mean_absolute_error(y_true,y_pred))
  print(mean_absolute_percentage_error(y_true,y_pred))

# Split dataset into train, validation, test, train_full

- train - 60%
- val - 20%
- test - 20%
- train_full - 80%

In [12]:
train, test = train_test_split(df, train_size=0.6, random_state=42)

In [13]:
len(train)

6468

In [14]:
len(test)

4313

In [15]:
len(train) / len(df)

0.5999443465355718

In [16]:
len(test) / len (df)

0.4000556534644282

In [17]:
val, test = train_test_split(test, train_size=0.5, random_state=42)

- train 60%
- val 20%
- test 20%

In [18]:
len(train) / len(df)

0.5999443465355718

In [19]:
len(val) / len(df)

0.19998144884519062

In [20]:
len(test) / len(df)

0.20007420461923756

- merge train and val
- train_full 80%
- test 20%

In [21]:
len(train)


6468

In [22]:
len(val)

2156

In [23]:
len(train) + len(val)

8624

In [24]:
train_full = pd.concat([train,val])

In [25]:
len(train_full) / len(df)

0.7999257953807625

In [26]:
len(test) / len(df)

0.20007420461923756

In [27]:
test.isna().mean().sort_values(ascending=False)

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

In [28]:
train.isna().mean().sort_values(ascending=False)

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

In [29]:
val.isna().mean().sort_values(ascending=False)

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

# One-Hot Encoding
*cat_features: model, transmission, fuelType*

In [30]:
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0
...,...,...,...,...,...,...,...,...,...
10776,X3,2016,19000,Automatic,40818,Diesel,150,54.3,2.0
10777,5 Series,2016,14600,Automatic,42947,Diesel,125,60.1,2.0
10778,3 Series,2017,13100,Manual,25468,Petrol,200,42.8,2.0
10779,1 Series,2014,9930,Automatic,45000,Diesel,30,64.2,2.0


In [31]:
df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize'],
      dtype='object')

## Perform One-Hot Encoding
*cat_features: model, transmission, fuelType*

### Perform transmission

In [36]:
#creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')

In [37]:
#perform one-hot encoding on cat_features: transmission
encoder_df = pd.DataFrame(encoder.fit_transform(df[['transmission']]).toarray())

In [38]:
#merge one-hot encoded columns back with original DataFrame
encoded_df = df.join(encoder_df)

In [39]:
encoded_df.head(1)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,0,1,2
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0


In [40]:
encoded_df.columns

Index([       'model',         'year',        'price', 'transmission',
            'mileage',     'fuelType',          'tax',          'mpg',
         'engineSize',              0,              1,              2],
      dtype='object')

In [41]:
df['transmission'].value_counts().reset_index()

Unnamed: 0,index,transmission
0,Semi-Auto,4666
1,Automatic,3588
2,Manual,2527


In [42]:
encoded_df[[0, 1, 2]].value_counts()

0    1    2  
0.0  0.0  1.0    4666
1.0  0.0  0.0    3588
0.0  1.0  0.0    2527
dtype: int64

#### Rename Columns

In [43]:
encoded_df.columns

Index([       'model',         'year',        'price', 'transmission',
            'mileage',     'fuelType',          'tax',          'mpg',
         'engineSize',              0,              1,              2],
      dtype='object')

In [44]:
encoded_df.columns = [
    'model', 'year', 'price', 'transmission',
    'mileage', 'fuelType', 'tax',
    'mpg', 'engineSize',
    'is_automatic','is_manual', 'is_semi_auto']
encoded_df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0,1.0,0.0,0.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0,1.0,0.0,0.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5,1.0,0.0,0.0
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10776,X3,2016,19000,Automatic,40818,Diesel,150,54.3,2.0,1.0,0.0,0.0
10777,5 Series,2016,14600,Automatic,42947,Diesel,125,60.1,2.0,1.0,0.0,0.0
10778,3 Series,2017,13100,Manual,25468,Petrol,200,42.8,2.0,0.0,1.0,0.0
10779,1 Series,2014,9930,Automatic,45000,Diesel,30,64.2,2.0,1.0,0.0,0.0


In [45]:
encoded_df['is_semi_auto'].value_counts().reset_index()

Unnamed: 0,index,is_semi_auto
0,0.0,6115
1,1.0,4666


#### Drop the Original Categuniquecal Variable

In [46]:
encoded_df.drop('transmission', axis=1, inplace=True)

In [47]:
encoded_df.head(1)

Unnamed: 0,model,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto
0,5 Series,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0


### Perform model

In [61]:
#perform one-hot encoding on cat_features: model
encoder_df = pd.DataFrame(encoder.fit_transform(encoded_df[['model']]).toarray())

In [62]:
#merge one-hot encoded columns back with original DataFrame
encoded_df2 = encoded_df.join(encoder_df)

In [63]:
encoded_df.head(1)

Unnamed: 0,model,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto
0,5 Series,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0


In [64]:
encoded_df2.head(1)

Unnamed: 0,model,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,...,14,15,16,17,18,19,20,21,22,23
0,5 Series,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
encoded_df2.columns

Index([       'model',         'year',        'price',      'mileage',
           'fuelType',          'tax',          'mpg',   'engineSize',
       'is_automatic',    'is_manual', 'is_semi_auto',              0,
                    1,              2,              3,              4,
                    5,              6,              7,              8,
                    9,             10,             11,             12,
                   13,             14,             15,             16,
                   17,             18,             19,             20,
                   21,             22,             23],
      dtype='object')

In [66]:
encoded_df2['model'].value_counts().reset_index()

Unnamed: 0,index,model
0,3 Series,2443
1,1 Series,1969
2,2 Series,1229
3,5 Series,1056
4,4 Series,995
5,X1,804
6,X3,551
7,X5,468
8,X2,288
9,X4,179


In [69]:
encoded_df2[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
            11, 12, 13, 14, 15, 16, 17, 18, 19, 
            20, 21, 22, 23]].value_counts()

0    1    2    3    4    5    6    7    8    9    10   11   12   13   14   15   16   17   18   19   20   21   22   23 
0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    2443
1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1969
0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1229
     0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1056
               1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     995
               0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     804
                                                                 0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 

#### Rename Columns

In [70]:
encoded_df2.columns

Index([       'model',         'year',        'price',      'mileage',
           'fuelType',          'tax',          'mpg',   'engineSize',
       'is_automatic',    'is_manual', 'is_semi_auto',              0,
                    1,              2,              3,              4,
                    5,              6,              7,              8,
                    9,             10,             11,             12,
                   13,             14,             15,             16,
                   17,             18,             19,             20,
                   21,             22,             23],
      dtype='object')

In [71]:
encoded_df2.columns = [
    'model','year','price','mileage',
    'fuelType','tax','mpg','engineSize',
    'is_automatic','is_manual','is_semi_auto',
    'Is_1_series','Is_2_series','Is_3_series','Is_4_series','Is_5_series',
    'Is_6_series','Is_7_series','Is_8_series','Is_M2','Is_M3','Is_M4','Is_M5','Is_M6',
    'Is_X1','Is_X2','Is_X3','Is_X4','Is_X5','Is_X6','Is_X7',
    'Is_Z3','Is_z4','Is_i3','Is_i8'
]
pd.set_option('display.max_columns', None)
encoded_df2[['model','Is_1_series','Is_2_series','Is_3_series','Is_4_series','Is_5_series',
    'Is_6_series','Is_7_series','Is_8_series','Is_M2','Is_M3','Is_M4','Is_M5','Is_M6',
    'Is_X1','Is_X2','Is_X3','Is_X4','Is_X5','Is_X6','Is_X7',
    'Is_Z3','Is_z4','Is_i3','Is_i8'
]]

Unnamed: 0,model,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8
0,5 Series,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6 Series,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5 Series,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1 Series,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7 Series,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,X3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10777,5 Series,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10778,3 Series,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10779,1 Series,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Drop the Original Categorical Variable

In [72]:
encoded_df2.drop('model', axis=1, inplace=True)

In [74]:
encoded_df2.head(1)

Unnamed: 0,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8
0,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Perform fuelType

In [75]:
#perform one-hot encoding on cat_features: fuelType
encoder_df = pd.DataFrame(encoder.fit_transform(encoded_df2[['fuelType']]).toarray())

In [76]:
#merge one-hot encoded columns back with original DataFrame
encoded_df3 = encoded_df2.join(encoder_df)

In [78]:
encoded_df3.head(1)

Unnamed: 0,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,0,1,2,3,4
0,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [79]:
encoded_df3.columns

Index([        'year',        'price',      'mileage',     'fuelType',
                'tax',          'mpg',   'engineSize', 'is_automatic',
          'is_manual', 'is_semi_auto',  'Is_1_series',  'Is_2_series',
        'Is_3_series',  'Is_4_series',  'Is_5_series',  'Is_6_series',
        'Is_7_series',  'Is_8_series',        'Is_M2',        'Is_M3',
              'Is_M4',        'Is_M5',        'Is_M6',        'Is_X1',
              'Is_X2',        'Is_X3',        'Is_X4',        'Is_X5',
              'Is_X6',        'Is_X7',        'Is_Z3',        'Is_z4',
              'Is_i3',        'Is_i8',              0,              1,
                    2,              3,              4],
      dtype='object')

In [80]:
encoded_df3['fuelType'].value_counts().reset_index()

Unnamed: 0,index,fuelType
0,Diesel,7027
1,Petrol,3417
2,Hybrid,298
3,Other,36
4,Electric,3


In [81]:
encoded_df3[[0, 1, 2, 3, 4]].value_counts()

0    1    2    3    4  
1.0  0.0  0.0  0.0  0.0    7027
0.0  0.0  0.0  0.0  1.0    3417
          1.0  0.0  0.0     298
          0.0  1.0  0.0      36
     1.0  0.0  0.0  0.0       3
dtype: int64

#### Rename Columns

In [82]:
encoded_df3.columns

Index([        'year',        'price',      'mileage',     'fuelType',
                'tax',          'mpg',   'engineSize', 'is_automatic',
          'is_manual', 'is_semi_auto',  'Is_1_series',  'Is_2_series',
        'Is_3_series',  'Is_4_series',  'Is_5_series',  'Is_6_series',
        'Is_7_series',  'Is_8_series',        'Is_M2',        'Is_M3',
              'Is_M4',        'Is_M5',        'Is_M6',        'Is_X1',
              'Is_X2',        'Is_X3',        'Is_X4',        'Is_X5',
              'Is_X6',        'Is_X7',        'Is_Z3',        'Is_z4',
              'Is_i3',        'Is_i8',              0,              1,
                    2,              3,              4],
      dtype='object')

In [83]:
encoded_df3.columns = [
    'year','price','mileage','fuelType',
    'tax','mpg','engineSize',
    'is_automatic','is_manual','is_semi_auto',
    'Is_1_series',  'Is_2_series','Is_3_series',  'Is_4_series',
    'Is_5_series','Is_6_series','Is_7_series',  'Is_8_series',
    'Is_M2','Is_M3','Is_M4','Is_M5','Is_M6',
    'Is_X1','Is_X2','Is_X3','Is_X4','Is_X5','Is_X6','Is_X7',
    'Is_Z3','Is_z4','Is_i3','Is_i8',
    'Is_diesel','Is_electric','Is_hybrid','Is_other_fuel','Is_petrol'
    
]
encoded_df3[[
     'fuelType','Is_diesel','Is_electric','Is_hybrid','Is_other_fuel','Is_petrol'
]]

Unnamed: 0,fuelType,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,Diesel,1.0,0.0,0.0,0.0,0.0
1,Petrol,0.0,0.0,0.0,0.0,1.0
2,Diesel,1.0,0.0,0.0,0.0,0.0
3,Diesel,1.0,0.0,0.0,0.0,0.0
4,Diesel,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
10776,Diesel,1.0,0.0,0.0,0.0,0.0
10777,Diesel,1.0,0.0,0.0,0.0,0.0
10778,Petrol,0.0,0.0,0.0,0.0,1.0
10779,Diesel,1.0,0.0,0.0,0.0,0.0


#### Drop the Original Categorical Variable

In [84]:
encoded_df3.drop('fuelType', axis=1, inplace=True)

In [85]:
encoded_df3

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2018,27000,14827,145,42.8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016,16000,62794,160,51.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,12750,26676,145,72.4,1.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,14500,39554,160,50.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,19000,40818,150,54.3,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10777,2016,14600,42947,125,60.1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10778,2017,13100,25468,200,42.8,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10779,2014,9930,45000,30,64.2,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


**The one-hot encoding is complete and we can now feed this pandas DataFrame into any machine learning algorithm**

## Final Encoded Dataframe

In [86]:
final_encoded_df = encoded_df3

In [87]:
final_encoded_df

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2018,27000,14827,145,42.8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016,16000,62794,160,51.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,12750,26676,145,72.4,1.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,14500,39554,160,50.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,19000,40818,150,54.3,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10777,2016,14600,42947,125,60.1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10778,2017,13100,25468,200,42.8,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10779,2014,9930,45000,30,64.2,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Split Encoded Dataset into train, validation, test, train_full

- train - 60%
- val - 20%
- test - 20%
- train_full - 80%

In [1]:
final_encoded_df.head()

NameError: name 'final_encoded_df' is not defined

In [89]:
train_encoded, test_encoded = train_test_split(
    final_encoded_df, train_size=0.6, random_state=42)

In [90]:
len(train_encoded)

6468

In [91]:
len(test_encoded)

4313

In [92]:
len(train_encoded) / len(final_encoded_df)

0.5999443465355718

In [93]:
len(test_encoded) / len (final_encoded_df)

0.4000556534644282

In [94]:
val_encoded, test_encoded = train_test_split(test_encoded, train_size=0.5, random_state=42)

- train 60%
- val 20%
- test 20%

In [95]:
len(train_encoded) / len(final_encoded_df)

0.5999443465355718

In [96]:
len(val_encoded) / len(final_encoded_df)

0.19998144884519062

In [97]:
len(test_encoded) / len(final_encoded_df)

0.20007420461923756

- merge train and val
- train_full 80%
- test 20%

In [98]:
len(train_encoded)

6468

In [99]:
len(val_encoded)

2156

In [100]:
len(train_encoded) + len(val_encoded)

8624

In [101]:
train_full_encoded = pd.concat([train_encoded,val_encoded])

In [102]:
len(train_full_encoded) / len(final_encoded_df)

0.7999257953807625

In [103]:
len(test_encoded) / len(final_encoded_df)

0.20007420461923756

In [104]:
test_encoded

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
4984,2019,52990,3086,145,34.9,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4880,2018,24081,13245,150,60.1,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9132,2016,12999,68949,200,43.5,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6451,2020,11995,10,150,34.5,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7199,2020,29875,150,145,42.2,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9627,2015,14999,78680,160,52.3,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6096,2019,29676,7365,145,41.5,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1747,2019,30570,3067,145,49.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4023,2016,14999,70054,0,148.7,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# CatBoost

*cat_features: model, transmission, fuelType*

In [136]:
X = ['model', 'year', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize']

cat_features = ['model', 'transmission', 'fuelType']

y = ['price']

## model_cbt_1_standart

In [137]:
parameters = {
    'cat_features': cat_features,
    'eval_metric': 'MAPE',
    'random_seed':42,
    'verbose':100
}

In [138]:
model_cbt_1_standart = CatBoostRegressor(**parameters)

In [139]:
model_cbt_1_standart.fit(
    train[X],
    train[y],
    eval_set=(val[X],val[y])
)

Learning rate set to 0.068263
0:	learn: 0.4607438	test: 0.4563750	best: 0.4563750 (0)	total: 10.5ms	remaining: 10.5s
100:	learn: 0.0904302	test: 0.0896388	best: 0.0896388 (100)	total: 378ms	remaining: 3.37s
200:	learn: 0.0781784	test: 0.0803125	best: 0.0803125 (200)	total: 687ms	remaining: 2.73s
300:	learn: 0.0729925	test: 0.0764506	best: 0.0764506 (300)	total: 985ms	remaining: 2.29s
400:	learn: 0.0695552	test: 0.0744676	best: 0.0744553 (399)	total: 1.27s	remaining: 1.9s
500:	learn: 0.0670683	test: 0.0732827	best: 0.0732827 (500)	total: 1.6s	remaining: 1.59s
600:	learn: 0.0652835	test: 0.0726170	best: 0.0726128 (597)	total: 1.89s	remaining: 1.26s
700:	learn: 0.0636320	test: 0.0720622	best: 0.0720500 (695)	total: 2.18s	remaining: 932ms
800:	learn: 0.0623408	test: 0.0714586	best: 0.0714329 (794)	total: 2.52s	remaining: 626ms
900:	learn: 0.0611487	test: 0.0711564	best: 0.0711321 (898)	total: 2.91s	remaining: 320ms
999:	learn: 0.0601407	test: 0.0708439	best: 0.0708403 (998)	total: 3.24s	re

<catboost.core.CatBoostRegressor at 0x7f12e6672b00>

In [140]:
model_cbt_1_standart.best_iteration_

998

In [141]:
model_cbt_1_standart.predict(test[X])

array([57417.53972205, 22003.56027391, 13461.40876567, ...,
       30553.87789981, 13818.32099473, 24849.36058457])

In [142]:
test['price_pred_cbt_1'] = model_cbt_1_standart.predict(test[X])

In [143]:
test[['price', 'price_pred_cbt_1']]

Unnamed: 0,price,price_pred_cbt_1
4984,52990,57417.539722
4880,24081,22003.560274
9132,12999,13461.408766
6451,11995,32149.002131
7199,29875,35100.139852
...,...,...
9627,14999,14271.263108
6096,29676,31555.069740
1747,30570,30553.877900
4023,14999,13818.320995


In [144]:
error(test['price'], test['price_pred_cbt_1'])

1556.5378327680894
0.07367840910148211


**Learning rate set to 0.068263**

**result on test set:**
- 1556.5378327680894
- 0.07367840910148211

**result on validation set:**
- bestTest = 0.07084025495
- bestIteration = 998

## model_cbt_2

In [38]:
parameters = {
    'cat_features': cat_features,
    'iterations':10000,
    'verbose':200,
    'od_type' : "Iter",
    'od_wait' : 500,
    'eval_metric': 'MAPE',
    'random_seed':42
}

In [39]:
model_cbt_2 = CatBoostRegressor(**parameters)

In [40]:
model_cbt_2.fit(
    train[X],
    train[y],
    eval_set=(val[X],val[y])
)

Learning rate set to 0.016757
0:	learn: 0.4798102	test: 0.4751976	best: 0.4751976 (0)	total: 4.08ms	remaining: 40.8s
200:	learn: 0.1153392	test: 0.1122297	best: 0.1122297 (200)	total: 605ms	remaining: 29.5s
400:	learn: 0.0898378	test: 0.0887062	best: 0.0887062 (400)	total: 1.14s	remaining: 27.3s
600:	learn: 0.0818951	test: 0.0818479	best: 0.0818475 (599)	total: 1.68s	remaining: 26.3s
800:	learn: 0.0777877	test: 0.0787110	best: 0.0787110 (800)	total: 2.24s	remaining: 25.7s
1000:	learn: 0.0747482	test: 0.0765541	best: 0.0765541 (1000)	total: 2.77s	remaining: 24.9s
1200:	learn: 0.0722384	test: 0.0747818	best: 0.0747796 (1198)	total: 3.32s	remaining: 24.3s
1400:	learn: 0.0704053	test: 0.0736951	best: 0.0736951 (1400)	total: 3.85s	remaining: 23.7s
1600:	learn: 0.0688236	test: 0.0728287	best: 0.0728287 (1600)	total: 4.39s	remaining: 23s
1800:	learn: 0.0675772	test: 0.0722127	best: 0.0722120 (1797)	total: 4.94s	remaining: 22.5s
2000:	learn: 0.0665632	test: 0.0718219	best: 0.0718206 (1998)	tot

<catboost.core.CatBoostRegressor at 0x7fb88ea3de40>

In [41]:
model_cbt_2.predict(test[X])

array([56879.30580034, 22490.28854615, 13622.82159033, ...,
       30547.50478572, 13784.8359602 , 24622.71410293])

In [42]:
test['price_pred_cbt_2'] = model_cbt_2.predict(test[X])

In [43]:
test[['price', 'price_pred_cbt_1', 'price_pred_cbt_2']]

Unnamed: 0,price,price_pred_cbt_1,price_pred_cbt_2
4984,52990,57417.539722,56879.305800
4880,24081,22003.560274,22490.288546
9132,12999,13461.408766,13622.821590
6451,11995,32149.002131,31830.457029
7199,29875,35100.139852,35229.436163
...,...,...,...
9627,14999,14271.263108,14254.926439
6096,29676,31555.069740,30628.121554
1747,30570,30553.877900,30547.504786
4023,14999,13818.320995,13784.835960


In [44]:
error(test['price'], test['price_pred_cbt_2'])

1491.640765115738
0.07087766181586952


**Learning rate set to 0.016757**

**result on test set:**
- 1491.640765115738
- 0.07087766181586952

**result on validation set:**
- bestTest = 0.0677493627
- bestIteration = 8414

## model_cbt_3_loss_funcMAE 

In [47]:
parameters = {
    'cat_features': cat_features,
    'iterations':10000,
    'verbose':200,
    'od_type' : "Iter",
    'od_wait' : 500,
    'eval_metric': 'MAPE',
    'loss_function': 'MAE',
    'random_seed':42
}

In [48]:
model_cbt_3_loss_funcMAE = CatBoostRegressor(**parameters)

In [49]:
model_cbt_3_loss_funcMAE.fit(
    train[X],
    train[y],
    eval_set=(val[X],val[y])
)

0:	learn: 0.4071564	test: 0.4015652	best: 0.4015652 (0)	total: 5.32ms	remaining: 53.1s
200:	learn: 0.0866586	test: 0.0861821	best: 0.0861821 (200)	total: 709ms	remaining: 34.6s
400:	learn: 0.0727190	test: 0.0743434	best: 0.0743434 (400)	total: 1.4s	remaining: 33.4s
600:	learn: 0.0671061	test: 0.0707259	best: 0.0707259 (600)	total: 2.08s	remaining: 32.5s
800:	learn: 0.0640010	test: 0.0691293	best: 0.0691034 (795)	total: 2.74s	remaining: 31.4s
1000:	learn: 0.0617751	test: 0.0682215	best: 0.0682201 (998)	total: 3.4s	remaining: 30.6s
1200:	learn: 0.0602295	test: 0.0677151	best: 0.0677151 (1200)	total: 4.07s	remaining: 29.8s
1400:	learn: 0.0591701	test: 0.0674592	best: 0.0674592 (1400)	total: 4.7s	remaining: 28.9s
1600:	learn: 0.0582648	test: 0.0673130	best: 0.0673029 (1557)	total: 5.32s	remaining: 27.9s
1800:	learn: 0.0573967	test: 0.0670691	best: 0.0670691 (1800)	total: 5.97s	remaining: 27.2s
2000:	learn: 0.0567510	test: 0.0670013	best: 0.0669923 (1984)	total: 6.6s	remaining: 26.4s
2200:	

<catboost.core.CatBoostRegressor at 0x7fb88ea3c7c0>

In [50]:
model_cbt_3_loss_funcMAE.best_iteration_

7990

In [51]:
model_cbt_3_loss_funcMAE.predict(test[X])

array([57987.14442974, 22499.89468956, 14227.03532131, ...,
       30591.67265619, 13827.77305477, 24469.94052382])

In [52]:
test['price_pred_cbt_3'] = model_cbt_3_loss_funcMAE.predict(test[X])

In [53]:
test.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize', 'price_pred_cbt_1', 'price_pred_cbt_2',
       'price_pred_cbt_3'],
      dtype='object')

In [54]:
test[['price', 'price_pred_cbt_1', 'price_pred_cbt_2','price_pred_cbt_3']]

Unnamed: 0,price,price_pred_cbt_1,price_pred_cbt_2,price_pred_cbt_3
4984,52990,57417.539722,56879.305800,57987.144430
4880,24081,22003.560274,22490.288546,22499.894690
9132,12999,13461.408766,13622.821590,14227.035321
6451,11995,32149.002131,31830.457029,32907.506246
7199,29875,35100.139852,35229.436163,32895.462147
...,...,...,...,...
9627,14999,14271.263108,14254.926439,13953.298965
6096,29676,31555.069740,30628.121554,31415.441989
1747,30570,30553.877900,30547.504786,30591.672656
4023,14999,13818.320995,13784.835960,13827.773055


In [55]:
error(test['price'], test['price_pred_cbt_3'])

1535.6125800104307
0.07122508414648335


**result on test set:**
- 1535.6125800104307
- 0.07122508414648335


**result on validation set:**
- bestTest = 0.06587125687
- bestIteration = 7990

## model 4. Learning With Complete Data.

In [56]:
model_cbt_1_standart.best_iteration_

998

In [57]:
parameters = {
    'iterations': model_cbt_1_standart.best_iteration_ + 1,
    'cat_features': cat_features,
    'eval_metric': 'MAPE',
    'random_seed':42,
    'verbose':100
}

In [58]:
model_cbt_4_train_full = CatBoostRegressor(**parameters)

In [59]:
model_cbt_4_train_full.fit(train_full[X],train_full[y])

Learning rate set to 0.057594
0:	learn: 0.4621041	total: 6.35ms	remaining: 6.34s
100:	learn: 0.0922432	total: 321ms	remaining: 2.85s
200:	learn: 0.0792846	total: 611ms	remaining: 2.42s
300:	learn: 0.0741324	total: 903ms	remaining: 2.09s
400:	learn: 0.0710318	total: 1.22s	remaining: 1.83s
500:	learn: 0.0688377	total: 1.55s	remaining: 1.54s
600:	learn: 0.0672268	total: 1.85s	remaining: 1.23s
700:	learn: 0.0655418	total: 2.15s	remaining: 915ms
800:	learn: 0.0644267	total: 2.44s	remaining: 604ms
900:	learn: 0.0634701	total: 2.74s	remaining: 299ms
998:	learn: 0.0625320	total: 3.04s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7fb88ea3c2e0>

In [60]:
model_cbt_4_train_full.predict(test[X])

array([56064.74518934, 22632.28528799, 13125.49279828, ...,
       30045.65548053, 13532.64129893, 24753.96323938])

In [61]:
test['price_pred_cbt_4'] = model_cbt_4_train_full.predict(test[X])

In [62]:
test[['price', 'price_pred_cbt_1', 'price_pred_cbt_2','price_pred_cbt_3', 'price_pred_cbt_4']]

Unnamed: 0,price,price_pred_cbt_1,price_pred_cbt_2,price_pred_cbt_3,price_pred_cbt_4
4984,52990,57417.539722,56879.305800,57987.144430,56064.745189
4880,24081,22003.560274,22490.288546,22499.894690,22632.285288
9132,12999,13461.408766,13622.821590,14227.035321,13125.492798
6451,11995,32149.002131,31830.457029,32907.506246,32602.863157
7199,29875,35100.139852,35229.436163,32895.462147,34307.308655
...,...,...,...,...,...
9627,14999,14271.263108,14254.926439,13953.298965,14322.965262
6096,29676,31555.069740,30628.121554,31415.441989,30564.298365
1747,30570,30553.877900,30547.504786,30591.672656,30045.655481
4023,14999,13818.320995,13784.835960,13827.773055,13532.641299


In [65]:
error(test['price'], test['price_pred_cbt_4'])

1526.4664351832776
0.07309286515179017


**result on validation set:**
- 1526.4664351832776
- 0.07309286515179017

## model 5. Learning With Complete Data and tunned params

In [66]:
model_cbt_3_loss_funcMAE.best_iteration_

7990

In [67]:
parameters = {
    'cat_features': cat_features,
    'iterations':model_cbt_3_loss_funcMAE.best_iteration_ + 1,
    'learning_rate': 0.01,
    'verbose':200,
    'od_type' : "Iter",
    'od_wait' : 500,
    'eval_metric': 'MAPE',
    'loss_function': 'MAE',
    'random_seed':42
}

In [68]:
model_cbt_5 = CatBoostRegressor(**parameters)

In [69]:
model_cbt_5.fit(train_full[X],train_full[y])

0:	learn: 0.4120386	total: 10.1ms	remaining: 1m 20s
200:	learn: 0.1426312	total: 799ms	remaining: 31s
400:	learn: 0.0976066	total: 1.54s	remaining: 29.3s
600:	learn: 0.0857664	total: 2.28s	remaining: 28s
800:	learn: 0.0801102	total: 3.02s	remaining: 27.1s
1000:	learn: 0.0758014	total: 3.85s	remaining: 26.9s
1200:	learn: 0.0726483	total: 4.66s	remaining: 26.4s
1400:	learn: 0.0704302	total: 5.42s	remaining: 25.5s
1600:	learn: 0.0686288	total: 6.21s	remaining: 24.8s
1800:	learn: 0.0671681	total: 7.01s	remaining: 24.1s
2000:	learn: 0.0659842	total: 7.8s	remaining: 23.3s
2200:	learn: 0.0649545	total: 8.55s	remaining: 22.5s
2400:	learn: 0.0641130	total: 9.35s	remaining: 21.8s
2600:	learn: 0.0633503	total: 10.1s	remaining: 21s
2800:	learn: 0.0627182	total: 10.9s	remaining: 20.2s
3000:	learn: 0.0621305	total: 11.6s	remaining: 19.4s
3200:	learn: 0.0616429	total: 12.4s	remaining: 18.6s
3400:	learn: 0.0611955	total: 13.2s	remaining: 17.8s
3600:	learn: 0.0607996	total: 13.9s	remaining: 16.9s
3800:

<catboost.core.CatBoostRegressor at 0x7fb88ea3cc10>

In [70]:
model_cbt_5.predict(test[X])

array([57711.79600066, 22412.98809048, 13952.64548113, ...,
       30366.23876779, 13481.99649177, 24719.7794678 ])

In [71]:
test['price_pred_cbt_5'] = model_cbt_5.predict(test[X])

In [72]:
test[['price', 'price_pred_cbt_1', 'price_pred_cbt_2','price_pred_cbt_3', 'price_pred_cbt_4', 'price_pred_cbt_5']]

Unnamed: 0,price,price_pred_cbt_1,price_pred_cbt_2,price_pred_cbt_3,price_pred_cbt_4,price_pred_cbt_5
4984,52990,57417.539722,56879.305800,57987.144430,56064.745189,57711.796001
4880,24081,22003.560274,22490.288546,22499.894690,22632.285288,22412.988090
9132,12999,13461.408766,13622.821590,14227.035321,13125.492798,13952.645481
6451,11995,32149.002131,31830.457029,32907.506246,32602.863157,32906.238745
7199,29875,35100.139852,35229.436163,32895.462147,34307.308655,32725.853408
...,...,...,...,...,...,...
9627,14999,14271.263108,14254.926439,13953.298965,14322.965262,14165.336200
6096,29676,31555.069740,30628.121554,31415.441989,30564.298365,31527.288622
1747,30570,30553.877900,30547.504786,30591.672656,30045.655481,30366.238768
4023,14999,13818.320995,13784.835960,13827.773055,13532.641299,13481.996492


In [73]:
error(test['price'], test['price_pred_cbt_5'])

1503.3763400675214
0.06983928086175052


**result on validation set:**
- 1503.3763400675214
- 0.06983928086175052


## model_cbt_6 Learning rate 0,01

In [30]:
parameters = {
    'cat_features': cat_features,
    'iterations':30000,
    'learning_rate':0.01,
    'verbose':300,
    'od_type' : "Iter",
    'od_wait' : 500,
    'eval_metric': 'MAPE',
    'random_seed':42
}

In [31]:
model_cbt_6 = CatBoostRegressor(**parameters)

In [32]:
model_cbt_6.fit(
    train[X],
    train[y],
    eval_set=(val[X],val[y])
)

0:	learn: 0.4823331	test: 0.4776837	best: 0.4776837 (0)	total: 57.7ms	remaining: 28m 51s
300:	learn: 0.1252025	test: 0.1207654	best: 0.1207654 (300)	total: 843ms	remaining: 1m 23s
600:	learn: 0.0930986	test: 0.0908750	best: 0.0908750 (600)	total: 1.58s	remaining: 1m 17s
900:	learn: 0.0848493	test: 0.0837730	best: 0.0837730 (900)	total: 2.37s	remaining: 1m 16s
1200:	learn: 0.0802353	test: 0.0803469	best: 0.0803469 (1200)	total: 3.14s	remaining: 1m 15s
1500:	learn: 0.0769307	test: 0.0779746	best: 0.0779746 (1500)	total: 3.91s	remaining: 1m 14s
1800:	learn: 0.0744078	test: 0.0762822	best: 0.0762822 (1800)	total: 4.7s	remaining: 1m 13s
2100:	learn: 0.0722998	test: 0.0747966	best: 0.0747966 (2100)	total: 5.58s	remaining: 1m 14s
2400:	learn: 0.0706193	test: 0.0737257	best: 0.0737252 (2399)	total: 6.35s	remaining: 1m 13s
2700:	learn: 0.0692409	test: 0.0729578	best: 0.0729578 (2700)	total: 7.13s	remaining: 1m 12s
3000:	learn: 0.0681979	test: 0.0724226	best: 0.0724226 (3000)	total: 7.9s	remaini

<catboost.core.CatBoostRegressor at 0x7ffadf8590f0>

In [33]:
model_cbt_6.predict(test[X])

array([56515.05741763, 22535.58211369, 13798.24014925, ...,
       30193.36113048, 13837.92509603, 24647.53104752])

In [35]:
test['price_pred_cbt_6'] = model_cbt_6.predict(test[X])

In [36]:
test

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,price_pred_cbt_6
4984,X6,2019,52990,Semi-Auto,3086,Diesel,145,34.9,3.0,56515.057418
4880,X1,2018,24081,Semi-Auto,13245,Diesel,150,60.1,2.0,22535.582114
9132,1 Series,2016,12999,Automatic,68949,Petrol,200,43.5,2.0,13798.240149
6451,1 Series,2020,11995,Semi-Auto,10,Petrol,150,34.5,2.0,31563.858455
7199,3 Series,2020,29875,Semi-Auto,150,Petrol,145,42.2,2.0,35437.049942
...,...,...,...,...,...,...,...,...,...,...
9627,3 Series,2015,14999,Automatic,78680,Diesel,160,52.3,3.0,14367.077070
6096,4 Series,2019,29676,Automatic,7365,Petrol,145,41.5,3.0,30698.054846
1747,5 Series,2019,30570,Automatic,3067,Hybrid,145,49.6,2.0,30193.361130
4023,3 Series,2016,14999,Automatic,70054,Hybrid,0,148.7,2.0,13837.925096


In [37]:
test[['price', 'price_pred_cbt_6']]

Unnamed: 0,price,price_pred_cbt_6
4984,52990,56515.057418
4880,24081,22535.582114
9132,12999,13798.240149
6451,11995,31563.858455
7199,29875,35437.049942
...,...,...
9627,14999,14367.077070
6096,29676,30698.054846
1747,30570,30193.361130
4023,14999,13837.925096


In [38]:
error(test['price'], test['price_pred_cbt_6'])

1502.050751730545
0.07143548043732158


**Learning rate set to 0.016757**

**result on test set:**
- bestTest = 0.068041882

**result on validation set:**
- 1502.050751730545
- 0.07143548043732158

# CatBoost with One-Hot Encoding

In [105]:
final_encoded_df

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2018,27000,14827,145,42.8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016,16000,62794,160,51.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,12750,26676,145,72.4,1.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,14500,39554,160,50.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,19000,40818,150,54.3,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10777,2016,14600,42947,125,60.1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10778,2017,13100,25468,200,42.8,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10779,2014,9930,45000,30,64.2,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [107]:
final_encoded_df.columns

Index(['year', 'price', 'mileage', 'tax', 'mpg', 'engineSize', 'is_automatic',
       'is_manual', 'is_semi_auto', 'Is_1_series', 'Is_2_series',
       'Is_3_series', 'Is_4_series', 'Is_5_series', 'Is_6_series',
       'Is_7_series', 'Is_8_series', 'Is_M2', 'Is_M3', 'Is_M4', 'Is_M5',
       'Is_M6', 'Is_X1', 'Is_X2', 'Is_X3', 'Is_X4', 'Is_X5', 'Is_X6', 'Is_X7',
       'Is_Z3', 'Is_z4', 'Is_i3', 'Is_i8', 'Is_diesel', 'Is_electric',
       'Is_hybrid', 'Is_other_fuel', 'Is_petrol'],
      dtype='object')

In [120]:
X_encoded = ['year', 'mileage', 'tax', 'mpg', 'engineSize', 'is_automatic',
       'is_manual', 'is_semi_auto', 'Is_1_series', 'Is_2_series',
       'Is_3_series', 'Is_4_series', 'Is_5_series', 'Is_6_series',
       'Is_7_series', 'Is_8_series', 'Is_M2', 'Is_M3', 'Is_M4', 'Is_M5',
       'Is_M6', 'Is_X1', 'Is_X2', 'Is_X3', 'Is_X4', 'Is_X5', 'Is_X6', 'Is_X7',
       'Is_Z3', 'Is_z4', 'Is_i3', 'Is_i8', 'Is_diesel', 'Is_electric',
       'Is_hybrid', 'Is_other_fuel', 'Is_petrol']

y_encoded = ['price']

In [121]:
X_encoded

['year',
 'mileage',
 'tax',
 'mpg',
 'engineSize',
 'is_automatic',
 'is_manual',
 'is_semi_auto',
 'Is_1_series',
 'Is_2_series',
 'Is_3_series',
 'Is_4_series',
 'Is_5_series',
 'Is_6_series',
 'Is_7_series',
 'Is_8_series',
 'Is_M2',
 'Is_M3',
 'Is_M4',
 'Is_M5',
 'Is_M6',
 'Is_X1',
 'Is_X2',
 'Is_X3',
 'Is_X4',
 'Is_X5',
 'Is_X6',
 'Is_X7',
 'Is_Z3',
 'Is_z4',
 'Is_i3',
 'Is_i8',
 'Is_diesel',
 'Is_electric',
 'Is_hybrid',
 'Is_other_fuel',
 'Is_petrol']

In [122]:
y_encoded

['price']

## model_cbt_7

In [148]:
parameters = {
    'eval_metric': 'MAPE',
    'random_seed':42,
    'verbose':100
}

In [157]:
model_cbt_7_standart.fit(
    train_encoded[X_encoded],
    train_encoded[y_encoded],
    eval_set=(val_encoded[X],val_encoded[y])
)

KeyError: "['model', 'transmission', 'fuelType'] not in index"

In [130]:
model_cbt_7_standart.best_iteration_

996

In [131]:
model_cbt_7_standart.predict(test_encoded[X])

array([56860.14859446, 22505.34556293, 13185.28940915, ...,
       31846.33707961, 13731.28670439, 24747.46094573])

In [132]:
test['price_pred_cbt_7'] = model_cbt_7_standart.predict(test_encoded[X])

In [133]:
test[['price', 'price_pred_cbt_1' 'price_pred_cbt_7']]

KeyError: "['price_pred_cbt_1price_pred_cbt_7'] not in index"

In [37]:
error(test['price'], test['price_pred_cbt_1'])

1556.5378327680894
0.07367840910148211


**Learning rate set to 0.068263**

**result on test set:**
- 1556.5378327680894
- 0.07367840910148211

**result on validation set:**
- bestTest = 0.07084025495
- bestIteration = 998

# XGBoost

# LightGBM

# Analysis: