# Introducing:

## Context:
The cleaned data set contains information of price, transmission, mileage, fuel type, road tax, miles per gallon (mpg), and engine size. I've removed duplicate listings and cleaned the columns, but have included a notebook showing the process and the original data for anyone who wants to check/improve my work.

## Task:
*Predict cars price*

## Used libraries:
- pandas
- numpy
- matplotlib
- sklearn
- xgboost
- catboost

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('dark_background')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_percentage_error

import xgboost as xgb
from catboost import CatBoostRegressor

# Data preparation

## Read data

In [4]:
df = pd.read_csv('bmw.csv')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0


## Format data

In [5]:
len(df)

10781

In [6]:
df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize'],
      dtype='object')

### Skips, Nans

In [7]:
df.isna().mean().sort_values(ascending=False)

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

In [8]:
df.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
dtype: object

In [9]:
df.select_dtypes(include='object')

Unnamed: 0,model,transmission,fuelType
0,5 Series,Automatic,Diesel
1,6 Series,Automatic,Petrol
2,5 Series,Automatic,Diesel
3,1 Series,Automatic,Diesel
4,7 Series,Automatic,Diesel
...,...,...,...
10776,X3,Automatic,Diesel
10777,5 Series,Automatic,Diesel
10778,3 Series,Manual,Petrol
10779,1 Series,Automatic,Diesel


*cat_features: model, transmission, fuelType*

In [10]:
df['model'].unique()

array([' 5 Series', ' 6 Series', ' 1 Series', ' 7 Series', ' 2 Series',
       ' 4 Series', ' X3', ' 3 Series', ' X5', ' X4', ' i3', ' X1', ' M4',
       ' X2', ' X6', ' 8 Series', ' Z4', ' X7', ' M5', ' i8', ' M2',
       ' M3', ' M6', ' Z3'], dtype=object)

In [11]:
df['transmission'].unique()

array(['Automatic', 'Manual', 'Semi-Auto'], dtype=object)

In [12]:
df['fuelType'].unique()

array(['Diesel', 'Petrol', 'Other', 'Hybrid', 'Electric'], dtype=object)

# Func for Checking model

In [13]:
def error(y_true,y_pred):
  print(mean_absolute_error(y_true,y_pred))
  print(mean_absolute_percentage_error(y_true,y_pred))

# One-Hot Encoding
*cat_features: model, transmission, fuelType*

In [14]:
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0
...,...,...,...,...,...,...,...,...,...
10776,X3,2016,19000,Automatic,40818,Diesel,150,54.3,2.0
10777,5 Series,2016,14600,Automatic,42947,Diesel,125,60.1,2.0
10778,3 Series,2017,13100,Manual,25468,Petrol,200,42.8,2.0
10779,1 Series,2014,9930,Automatic,45000,Diesel,30,64.2,2.0


In [15]:
df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize'],
      dtype='object')

## Perform One-Hot Encoding
*cat_features: model, transmission, fuelType*

### Perform transmission

In [16]:
#creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')

In [17]:
#perform one-hot encoding on cat_features: transmission
encoder_df = pd.DataFrame(encoder.fit_transform(df[['transmission']]).toarray())

In [18]:
#merge one-hot encoded columns back with original DataFrame
encoded_df = df.join(encoder_df)

In [19]:
encoded_df.head(1)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,0,1,2
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0


In [20]:
encoded_df.columns

Index([       'model',         'year',        'price', 'transmission',
            'mileage',     'fuelType',          'tax',          'mpg',
         'engineSize',              0,              1,              2],
      dtype='object')

In [21]:
df['transmission'].value_counts().reset_index()

Unnamed: 0,transmission,count
0,Semi-Auto,4666
1,Automatic,3588
2,Manual,2527


In [22]:
encoded_df[[0, 1, 2]].value_counts()

0    1    2  
0.0  0.0  1.0    4666
1.0  0.0  0.0    3588
0.0  1.0  0.0    2527
Name: count, dtype: int64

#### Rename Columns

In [23]:
encoded_df.columns

Index([       'model',         'year',        'price', 'transmission',
            'mileage',     'fuelType',          'tax',          'mpg',
         'engineSize',              0,              1,              2],
      dtype='object')

In [24]:
encoded_df.columns = [
    'model', 'year', 'price', 'transmission',
    'mileage', 'fuelType', 'tax',
    'mpg', 'engineSize',
    'is_automatic','is_manual', 'is_semi_auto']
encoded_df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0,1.0,0.0,0.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0,1.0,0.0,0.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5,1.0,0.0,0.0
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10776,X3,2016,19000,Automatic,40818,Diesel,150,54.3,2.0,1.0,0.0,0.0
10777,5 Series,2016,14600,Automatic,42947,Diesel,125,60.1,2.0,1.0,0.0,0.0
10778,3 Series,2017,13100,Manual,25468,Petrol,200,42.8,2.0,0.0,1.0,0.0
10779,1 Series,2014,9930,Automatic,45000,Diesel,30,64.2,2.0,1.0,0.0,0.0


In [25]:
encoded_df['is_semi_auto'].value_counts().reset_index()

Unnamed: 0,is_semi_auto,count
0,0.0,6115
1,1.0,4666


#### Drop the Original Categuniquecal Variable

In [26]:
encoded_df.drop('transmission', axis=1, inplace=True)

In [27]:
encoded_df.head(1)

Unnamed: 0,model,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto
0,5 Series,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0


### Perform model

In [28]:
#perform one-hot encoding on cat_features: model
encoder_df = pd.DataFrame(encoder.fit_transform(encoded_df[['model']]).toarray())

In [29]:
#merge one-hot encoded columns back with original DataFrame
encoded_df2 = encoded_df.join(encoder_df)

In [30]:
encoded_df.head(1)

Unnamed: 0,model,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto
0,5 Series,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0


In [31]:
encoded_df2.head(1)

Unnamed: 0,model,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,...,14,15,16,17,18,19,20,21,22,23
0,5 Series,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
encoded_df2.columns

Index([       'model',         'year',        'price',      'mileage',
           'fuelType',          'tax',          'mpg',   'engineSize',
       'is_automatic',    'is_manual', 'is_semi_auto',              0,
                    1,              2,              3,              4,
                    5,              6,              7,              8,
                    9,             10,             11,             12,
                   13,             14,             15,             16,
                   17,             18,             19,             20,
                   21,             22,             23],
      dtype='object')

In [33]:
encoded_df2['model'].value_counts().reset_index()

Unnamed: 0,model,count
0,3 Series,2443
1,1 Series,1969
2,2 Series,1229
3,5 Series,1056
4,4 Series,995
5,X1,804
6,X3,551
7,X5,468
8,X2,288
9,X4,179


In [34]:
encoded_df2[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
            11, 12, 13, 14, 15, 16, 17, 18, 19, 
            20, 21, 22, 23]].value_counts()

0    1    2    3    4    5    6    7    8    9    10   11   12   13   14   15   16   17   18   19   20   21   22   23 
0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    2443
1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1969
0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1229
     0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1056
               1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     995
               0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     804
                                                                 0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 

#### Rename Columns

In [35]:
encoded_df2.columns

Index([       'model',         'year',        'price',      'mileage',
           'fuelType',          'tax',          'mpg',   'engineSize',
       'is_automatic',    'is_manual', 'is_semi_auto',              0,
                    1,              2,              3,              4,
                    5,              6,              7,              8,
                    9,             10,             11,             12,
                   13,             14,             15,             16,
                   17,             18,             19,             20,
                   21,             22,             23],
      dtype='object')

In [36]:
encoded_df2.columns = [
    'model','year','price','mileage',
    'fuelType','tax','mpg','engineSize',
    'is_automatic','is_manual','is_semi_auto',
    'Is_1_series','Is_2_series','Is_3_series','Is_4_series','Is_5_series',
    'Is_6_series','Is_7_series','Is_8_series','Is_M2','Is_M3','Is_M4','Is_M5','Is_M6',
    'Is_X1','Is_X2','Is_X3','Is_X4','Is_X5','Is_X6','Is_X7',
    'Is_Z3','Is_z4','Is_i3','Is_i8'
]
pd.set_option('display.max_columns', None)
encoded_df2[['model','Is_1_series','Is_2_series','Is_3_series','Is_4_series','Is_5_series',
    'Is_6_series','Is_7_series','Is_8_series','Is_M2','Is_M3','Is_M4','Is_M5','Is_M6',
    'Is_X1','Is_X2','Is_X3','Is_X4','Is_X5','Is_X6','Is_X7',
    'Is_Z3','Is_z4','Is_i3','Is_i8'
]]

Unnamed: 0,model,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8
0,5 Series,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6 Series,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5 Series,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1 Series,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7 Series,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,X3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10777,5 Series,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10778,3 Series,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10779,1 Series,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Drop the Original Categorical Variable

In [37]:
encoded_df2.drop('model', axis=1, inplace=True)

In [38]:
encoded_df2.head(1)

Unnamed: 0,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8
0,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Perform fuelType

In [39]:
#perform one-hot encoding on cat_features: fuelType
encoder_df = pd.DataFrame(encoder.fit_transform(encoded_df2[['fuelType']]).toarray())

In [40]:
#merge one-hot encoded columns back with original DataFrame
encoded_df3 = encoded_df2.join(encoder_df)

In [41]:
encoded_df3.head(1)

Unnamed: 0,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,0,1,2,3,4
0,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [42]:
encoded_df3.columns

Index([        'year',        'price',      'mileage',     'fuelType',
                'tax',          'mpg',   'engineSize', 'is_automatic',
          'is_manual', 'is_semi_auto',  'Is_1_series',  'Is_2_series',
        'Is_3_series',  'Is_4_series',  'Is_5_series',  'Is_6_series',
        'Is_7_series',  'Is_8_series',        'Is_M2',        'Is_M3',
              'Is_M4',        'Is_M5',        'Is_M6',        'Is_X1',
              'Is_X2',        'Is_X3',        'Is_X4',        'Is_X5',
              'Is_X6',        'Is_X7',        'Is_Z3',        'Is_z4',
              'Is_i3',        'Is_i8',              0,              1,
                    2,              3,              4],
      dtype='object')

In [44]:
encoded_df3['fuelType'].value_counts().reset_index()

Unnamed: 0,fuelType,count
0,Diesel,7027
1,Petrol,3417
2,Hybrid,298
3,Other,36
4,Electric,3


In [45]:
encoded_df3[[0, 1, 2, 3, 4]].value_counts()

0    1    2    3    4  
1.0  0.0  0.0  0.0  0.0    7027
0.0  0.0  0.0  0.0  1.0    3417
          1.0  0.0  0.0     298
          0.0  1.0  0.0      36
     1.0  0.0  0.0  0.0       3
Name: count, dtype: int64

#### Rename Columns

In [46]:
encoded_df3.columns

Index([        'year',        'price',      'mileage',     'fuelType',
                'tax',          'mpg',   'engineSize', 'is_automatic',
          'is_manual', 'is_semi_auto',  'Is_1_series',  'Is_2_series',
        'Is_3_series',  'Is_4_series',  'Is_5_series',  'Is_6_series',
        'Is_7_series',  'Is_8_series',        'Is_M2',        'Is_M3',
              'Is_M4',        'Is_M5',        'Is_M6',        'Is_X1',
              'Is_X2',        'Is_X3',        'Is_X4',        'Is_X5',
              'Is_X6',        'Is_X7',        'Is_Z3',        'Is_z4',
              'Is_i3',        'Is_i8',              0,              1,
                    2,              3,              4],
      dtype='object')

In [47]:
encoded_df3.columns = [
    'year','price','mileage','fuelType',
    'tax','mpg','engineSize',
    'is_automatic','is_manual','is_semi_auto',
    'Is_1_series',  'Is_2_series','Is_3_series',  'Is_4_series',
    'Is_5_series','Is_6_series','Is_7_series',  'Is_8_series',
    'Is_M2','Is_M3','Is_M4','Is_M5','Is_M6',
    'Is_X1','Is_X2','Is_X3','Is_X4','Is_X5','Is_X6','Is_X7',
    'Is_Z3','Is_z4','Is_i3','Is_i8',
    'Is_diesel','Is_electric','Is_hybrid','Is_other_fuel','Is_petrol'
    
]
encoded_df3[[
     'fuelType','Is_diesel','Is_electric','Is_hybrid','Is_other_fuel','Is_petrol'
]]

Unnamed: 0,fuelType,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,Diesel,1.0,0.0,0.0,0.0,0.0
1,Petrol,0.0,0.0,0.0,0.0,1.0
2,Diesel,1.0,0.0,0.0,0.0,0.0
3,Diesel,1.0,0.0,0.0,0.0,0.0
4,Diesel,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
10776,Diesel,1.0,0.0,0.0,0.0,0.0
10777,Diesel,1.0,0.0,0.0,0.0,0.0
10778,Petrol,0.0,0.0,0.0,0.0,1.0
10779,Diesel,1.0,0.0,0.0,0.0,0.0


#### Drop the Original Categorical Variable

In [48]:
encoded_df3.drop('fuelType', axis=1, inplace=True)

In [49]:
encoded_df3

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2018,27000,14827,145,42.8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016,16000,62794,160,51.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,12750,26676,145,72.4,1.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,14500,39554,160,50.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,19000,40818,150,54.3,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10777,2016,14600,42947,125,60.1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10778,2017,13100,25468,200,42.8,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10779,2014,9930,45000,30,64.2,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


**The one-hot encoding is complete and we can now feed this pandas DataFrame into any machine learning algorithm**

## Final Encoded Dataframe

In [50]:
final_encoded_df = encoded_df3

In [51]:
final_encoded_df

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2018,27000,14827,145,42.8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016,16000,62794,160,51.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,12750,26676,145,72.4,1.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,14500,39554,160,50.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,19000,40818,150,54.3,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10777,2016,14600,42947,125,60.1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10778,2017,13100,25468,200,42.8,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10779,2014,9930,45000,30,64.2,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Split Encoded Dataset into train, validation, test, train_full

- train - 60%
- val - 20%
- test - 20%
- train_full - 80%

In [52]:
final_encoded_df.head(1)

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [53]:
train_encoded, test_encoded = train_test_split(final_encoded_df, train_size=0.6, random_state=42)

In [54]:
len(train_encoded)

6468

In [55]:
len(test_encoded)

4313

In [56]:
len(train_encoded) / len(final_encoded_df)

0.5999443465355718

In [57]:
len(test_encoded) / len (final_encoded_df)

0.4000556534644282

In [58]:
val_encoded, test_encoded = train_test_split(test_encoded, train_size=0.5, random_state=42)

- train 60%
- val 20%
- test 20%

In [59]:
len(train_encoded) / len(final_encoded_df)

0.5999443465355718

In [60]:
len(val_encoded) / len(final_encoded_df)

0.19998144884519062

In [61]:
len(test_encoded) / len(final_encoded_df)

0.20007420461923756

- merge train and val
- train_full 80%
- test 20%

In [62]:
len(train_encoded)

6468

In [63]:
len(val_encoded)

2156

In [64]:
len(train_encoded) + len(val_encoded)

8624

In [65]:
train_full_encoded = pd.concat([train_encoded,val_encoded])

In [66]:
len(train_full_encoded) / len(final_encoded_df)

0.7999257953807625

In [67]:
len(test_encoded) / len(final_encoded_df)

0.20007420461923756

In [68]:
test_encoded

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
4984,2019,52990,3086,145,34.9,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4880,2018,24081,13245,150,60.1,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9132,2016,12999,68949,200,43.5,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6451,2020,11995,10,150,34.5,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7199,2020,29875,150,145,42.2,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9627,2015,14999,78680,160,52.3,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6096,2019,29676,7365,145,41.5,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1747,2019,30570,3067,145,49.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4023,2016,14999,70054,0,148.7,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# CatBoost with One-Hot Encoding

In [69]:
final_encoded_df

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2018,27000,14827,145,42.8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016,16000,62794,160,51.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,12750,26676,145,72.4,1.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,14500,39554,160,50.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,19000,40818,150,54.3,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10777,2016,14600,42947,125,60.1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10778,2017,13100,25468,200,42.8,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10779,2014,9930,45000,30,64.2,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [70]:
final_encoded_df.columns

Index(['year', 'price', 'mileage', 'tax', 'mpg', 'engineSize', 'is_automatic',
       'is_manual', 'is_semi_auto', 'Is_1_series', 'Is_2_series',
       'Is_3_series', 'Is_4_series', 'Is_5_series', 'Is_6_series',
       'Is_7_series', 'Is_8_series', 'Is_M2', 'Is_M3', 'Is_M4', 'Is_M5',
       'Is_M6', 'Is_X1', 'Is_X2', 'Is_X3', 'Is_X4', 'Is_X5', 'Is_X6', 'Is_X7',
       'Is_Z3', 'Is_z4', 'Is_i3', 'Is_i8', 'Is_diesel', 'Is_electric',
       'Is_hybrid', 'Is_other_fuel', 'Is_petrol'],
      dtype='object')

In [71]:
X = ['year', 'mileage', 'tax', 'mpg', 'engineSize', 'is_automatic',
       'is_manual', 'is_semi_auto', 'Is_1_series', 'Is_2_series',
       'Is_3_series', 'Is_4_series', 'Is_5_series', 'Is_6_series',
       'Is_7_series', 'Is_8_series', 'Is_M2', 'Is_M3', 'Is_M4', 'Is_M5',
       'Is_M6', 'Is_X1', 'Is_X2', 'Is_X3', 'Is_X4', 'Is_X5', 'Is_X6', 'Is_X7',
       'Is_Z3', 'Is_z4', 'Is_i3', 'Is_i8', 'Is_diesel', 'Is_electric',
       'Is_hybrid', 'Is_other_fuel', 'Is_petrol']

y = ['price']

In [72]:
X

['year',
 'mileage',
 'tax',
 'mpg',
 'engineSize',
 'is_automatic',
 'is_manual',
 'is_semi_auto',
 'Is_1_series',
 'Is_2_series',
 'Is_3_series',
 'Is_4_series',
 'Is_5_series',
 'Is_6_series',
 'Is_7_series',
 'Is_8_series',
 'Is_M2',
 'Is_M3',
 'Is_M4',
 'Is_M5',
 'Is_M6',
 'Is_X1',
 'Is_X2',
 'Is_X3',
 'Is_X4',
 'Is_X5',
 'Is_X6',
 'Is_X7',
 'Is_Z3',
 'Is_z4',
 'Is_i3',
 'Is_i8',
 'Is_diesel',
 'Is_electric',
 'Is_hybrid',
 'Is_other_fuel',
 'Is_petrol']

In [73]:
y

['price']

## model_cbt_7

In [74]:
parameters = {
    'eval_metric': 'MAPE',
    'random_seed':42,
    'verbose':100
}

In [75]:
model_cbt_7 = CatBoostRegressor(**parameters)

In [76]:
%%time


model_cbt_7.fit(
    train_encoded[X],
    train_encoded[y],
    eval_set=(val_encoded[X],val_encoded[y])
)

Learning rate set to 0.068263
0:	learn: 0.4613405	test: 0.4567255	best: 0.4567255 (0)	total: 56.2ms	remaining: 56.2s
100:	learn: 0.0921419	test: 0.0906085	best: 0.0906085 (100)	total: 259ms	remaining: 2.31s
200:	learn: 0.0787665	test: 0.0787853	best: 0.0787853 (200)	total: 445ms	remaining: 1.77s
300:	learn: 0.0726477	test: 0.0748775	best: 0.0748775 (300)	total: 618ms	remaining: 1.44s
400:	learn: 0.0685846	test: 0.0723702	best: 0.0723702 (400)	total: 791ms	remaining: 1.18s
500:	learn: 0.0656730	test: 0.0708471	best: 0.0708471 (500)	total: 938ms	remaining: 935ms
600:	learn: 0.0632492	test: 0.0697501	best: 0.0697274 (594)	total: 1.11s	remaining: 736ms
700:	learn: 0.0613312	test: 0.0691393	best: 0.0691325 (699)	total: 1.31s	remaining: 559ms
800:	learn: 0.0596204	test: 0.0685382	best: 0.0685329 (799)	total: 1.47s	remaining: 366ms
900:	learn: 0.0579890	test: 0.0681711	best: 0.0681711 (900)	total: 1.63s	remaining: 180ms
999:	learn: 0.0567748	test: 0.0680246	best: 0.0680229 (996)	total: 1.78s	

<catboost.core.CatBoostRegressor at 0x7f2bac4d54b0>

In [77]:
model_cbt_7.best_iteration_

996

In [78]:
model_cbt_7.predict(test_encoded[X])

array([56860.14859446, 22505.34556293, 13185.28940915, ...,
       31846.33707961, 13731.28670439, 24747.46094573])

In [79]:
test_encoded['price_pred_cbt_7'] = model_cbt_7.predict(test_encoded[X])

In [80]:
test_encoded[['price',  'price_pred_cbt_7']].head(10).style.format({
    'price': '£{0:,.0f}',
    'price_pred_cbt_7': '£{0:,.0f}',
})

Unnamed: 0,price,price_pred_cbt_7
4984,"£52,990","£56,860"
4880,"£24,081","£22,505"
9132,"£12,999","£13,185"
6451,"£11,995","£32,690"
7199,"£29,875","£35,175"
2094,"£21,986","£26,271"
3592,"£20,490","£20,859"
7942,"£16,220","£15,758"
3284,"£20,980","£22,706"
4112,"£31,840","£33,543"


In [81]:
error(test_encoded['price'], test_encoded['price_pred_cbt_7'])

1483.436765714601
0.06997748525331794


**Learning rate set to 0.068263**

**result on test_encoded set:**
- 1483.436765714601
- 0.06997748525331794

**result on validation_encoded set:**
- bestTest = 0.06802288892


In [82]:
test_encoded['price_pred_7_err'] = test_encoded['price_pred_cbt_7']  - test_encoded['price']
test_encoded['price_pred_7_percent_err'] =  (( - (100 - ((test_encoded['price_pred_cbt_7']  / test_encoded['price']) * 100))) / 100)

In [83]:
test_encoded[['price','price_pred_cbt_7', 'price_pred_7_err','price_pred_7_percent_err']]\
.head(10)\
.reset_index().style.format({
    'price_pred_7_percent_err': '{:%}',

    'price_pred_cbt_7': '£{0:,.0f}',
    'price_pred_7_err': '£{0:,.0f}',
})

Unnamed: 0,index,price,price_pred_cbt_7,price_pred_7_err,price_pred_7_percent_err
0,4984,52990,"£56,860","£3,870",7.303545%
1,4880,24081,"£22,505","£-1,576",-6.543144%
2,9132,12999,"£13,185",£186,1.433106%
3,6451,11995,"£32,690","£20,695",172.532475%
4,7199,29875,"£35,175","£5,300",17.739902%
5,2094,21986,"£26,271","£4,285",19.489029%
6,3592,20490,"£20,859",£369,1.800992%
7,7942,16220,"£15,758",£-462,-2.847516%
8,3284,20980,"£22,706","£1,726",8.224682%
9,4112,31840,"£33,543","£1,703",5.348150%


## model_cbt_8

In [84]:
parameters = {
    'iterations':10000,
    'verbose':200,
    'od_type' : "Iter",
    'od_wait' : 500,
    'eval_metric': 'MAPE',
    'random_seed':42
}

In [85]:
model_cbt_8 = CatBoostRegressor(**parameters)

In [86]:
model_cbt_8.fit(
    train_encoded[X],
    train_encoded[y],
    eval_set=(val_encoded[X],val_encoded[y])
)

Learning rate set to 0.016757
0:	learn: 0.4799627	test: 0.4753019	best: 0.4753019 (0)	total: 2.72ms	remaining: 27.2s
200:	learn: 0.1216609	test: 0.1182153	best: 0.1182153 (200)	total: 377ms	remaining: 18.4s
400:	learn: 0.0930950	test: 0.0912627	best: 0.0912627 (400)	total: 669ms	remaining: 16s
600:	learn: 0.0838101	test: 0.0826813	best: 0.0826813 (600)	total: 972ms	remaining: 15.2s
800:	learn: 0.0783864	test: 0.0779886	best: 0.0779886 (800)	total: 1.42s	remaining: 16.3s
1000:	learn: 0.0751517	test: 0.0756776	best: 0.0756776 (1000)	total: 1.75s	remaining: 15.8s
1200:	learn: 0.0723894	test: 0.0739508	best: 0.0739482 (1198)	total: 2.06s	remaining: 15.1s
1400:	learn: 0.0704258	test: 0.0727219	best: 0.0727219 (1400)	total: 2.35s	remaining: 14.4s
1600:	learn: 0.0686779	test: 0.0716862	best: 0.0716857 (1595)	total: 2.64s	remaining: 13.9s
1800:	learn: 0.0669854	test: 0.0707638	best: 0.0707638 (1800)	total: 2.94s	remaining: 13.4s
2000:	learn: 0.0656255	test: 0.0701226	best: 0.0701226 (2000)	tot

<catboost.core.CatBoostRegressor at 0x7f2babe37640>

In [87]:
model_cbt_8.best_iteration_

7881

In [88]:
model_cbt_8.predict(test_encoded[X])

array([57499.04760294, 22668.97841796, 13720.25983871, ...,
       31247.41213645, 13811.04067773, 24572.02271138])

In [89]:
test_encoded['price_pred_cbt_8'] = model_cbt_8.predict(test_encoded[X])

In [90]:
test_encoded[['price',  'price_pred_cbt_7', 'price_pred_cbt_8']]

Unnamed: 0,price,price_pred_cbt_7,price_pred_cbt_8
4984,52990,56860.148594,57499.047603
4880,24081,22505.345563,22668.978418
9132,12999,13185.289409,13720.259839
6451,11995,32690.270386,32697.795077
7199,29875,35174.795657,36179.639853
...,...,...,...
9627,14999,14038.910167,14216.430281
6096,29676,30598.034566,29971.629218
1747,30570,31846.337080,31247.412136
4023,14999,13731.286704,13811.040678


In [91]:
error(test_encoded['price'], test_encoded['price_pred_cbt_8'])

1454.3054639485795
0.06842211463027473


**Learning rate set to 0.016757**

**result on test_encoded set:**
- 1454.3054639485795
- 0.06842211463027473

**result on validation_encoded set:**
- bestTest = 0.06635979198


In [92]:
test_encoded['price_pred_8_err'] = test_encoded['price_pred_cbt_7']  - test_encoded['price']
test_encoded['price_pred_8_percent_err_abs'] = (( - (100 - ((test_encoded['price_pred_cbt_8']  / test_encoded['price']) * 100))) / 100)

In [93]:
test_encoded[['price','price_pred_cbt_8', 'price_pred_8_err','price_pred_8_percent_err_abs']]\
.head(10)\
.reset_index().style.format({
    'price_pred_8_percent_err_abs': '{:%}',

    'price_pred_cbt_8': '£{0:,.0f}',
    'price_pred_8_err': '£{0:,.0f}',
})

Unnamed: 0,index,price,price_pred_cbt_8,price_pred_8_err,price_pred_8_percent_err_abs
0,4984,52990,"£57,499","£3,870",8.509243%
1,4880,24081,"£22,669","£-1,576",-5.863633%
2,9132,12999,"£13,720",£186,5.548579%
3,6451,11995,"£32,698","£20,695",172.595207%
4,7199,29875,"£36,180","£5,300",21.103397%
5,2094,21986,"£25,295","£4,285",15.052745%
6,3592,20490,"£20,675",£369,0.902838%
7,7942,16220,"£16,456",£-462,1.457199%
8,3284,20980,"£22,707","£1,726",8.232316%
9,4112,31840,"£33,447","£1,703",5.047241%


## model_cbt_9

In [94]:
parameters = {
    'iterations':10000,
    'verbose':300,
    'od_type' : "Iter",
    'od_wait' : 500,
    'eval_metric': 'MAPE',
    'loss_function': 'MAE',
    'random_seed':42
}

In [95]:
model_cbt_9 = CatBoostRegressor(**parameters)

In [96]:
model_cbt_9.fit(
    train_encoded[X],
    train_encoded[y],
    eval_set=(val_encoded[X],val_encoded[y])
)

0:	learn: 0.4075303	test: 0.4018773	best: 0.4018773 (0)	total: 3.91ms	remaining: 39.1s
300:	learn: 0.0783995	test: 0.0787958	best: 0.0787958 (300)	total: 574ms	remaining: 18.5s
600:	learn: 0.0664213	test: 0.0695095	best: 0.0695095 (600)	total: 1.16s	remaining: 18.1s
900:	learn: 0.0609598	test: 0.0669543	best: 0.0669504 (899)	total: 1.72s	remaining: 17.4s
1200:	learn: 0.0577473	test: 0.0661631	best: 0.0661631 (1200)	total: 2.25s	remaining: 16.5s
1500:	learn: 0.0555620	test: 0.0658629	best: 0.0658597 (1492)	total: 2.83s	remaining: 16s
1800:	learn: 0.0538923	test: 0.0655685	best: 0.0655684 (1797)	total: 3.44s	remaining: 15.7s
2100:	learn: 0.0525675	test: 0.0653075	best: 0.0653031 (2094)	total: 4.04s	remaining: 15.2s
2400:	learn: 0.0515308	test: 0.0651607	best: 0.0651607 (2400)	total: 4.68s	remaining: 14.8s
2700:	learn: 0.0507136	test: 0.0650150	best: 0.0650150 (2700)	total: 5.25s	remaining: 14.2s
3000:	learn: 0.0498196	test: 0.0649562	best: 0.0649432 (2974)	total: 5.88s	remaining: 13.7s
3

<catboost.core.CatBoostRegressor at 0x7f2babe5f1f0>

In [97]:
model_cbt_9.best_iteration_

4884

In [98]:
model_cbt_9.predict(test_encoded[X])

array([56965.15647403, 22484.8071941 , 13931.29522322, ...,
       32010.05090573, 13603.16216222, 24204.7979296 ])

In [99]:
test_encoded['price_pred_cbt_9'] = model_cbt_9.predict(test_encoded[X])

In [100]:
test_encoded[['price',  'price_pred_cbt_7', 'price_pred_cbt_8', 'price_pred_cbt_9']]

Unnamed: 0,price,price_pred_cbt_7,price_pred_cbt_8,price_pred_cbt_9
4984,52990,56860.148594,57499.047603,56965.156474
4880,24081,22505.345563,22668.978418,22484.807194
9132,12999,13185.289409,13720.259839,13931.295223
6451,11995,32690.270386,32697.795077,32462.425273
7199,29875,35174.795657,36179.639853,34053.521050
...,...,...,...,...
9627,14999,14038.910167,14216.430281,13924.932088
6096,29676,30598.034566,29971.629218,31576.058423
1747,30570,31846.337080,31247.412136,32010.050906
4023,14999,13731.286704,13811.040678,13603.162162


In [101]:
error(test_encoded['price'], test_encoded['price_pred_cbt_9'])

1433.9590818347572
0.06679151720587649


**result on test_encoded set:**
- 1454.3054639485795
- 0.06842211463027473


**result on validation_encoded set:**
- bestTest = 0.06467677366


In [102]:
test_encoded['price_pred_9_err'] = test_encoded['price_pred_cbt_9']  - test_encoded['price']
test_encoded['price_pred_9_percent_err_abs'] =  (( - (100 - ((test_encoded['price_pred_cbt_9']  / test_encoded['price']) * 100))) / 100)

In [103]:
test_encoded[['price','price_pred_cbt_9', 'price_pred_9_err','price_pred_9_percent_err_abs']]\
.head(10)\
.reset_index().style.format({
    'price_pred_9_percent_err_abs': '{:%}',

    'price_pred_cbt_9': '£{0:,.0f}',
    'price_pred_9_err': '£{0:,.0f}',
})

Unnamed: 0,index,price,price_pred_cbt_9,price_pred_9_err,price_pred_9_percent_err_abs
0,4984,52990,"£56,965","£3,975",7.501711%
1,4880,24081,"£22,485","£-1,596",-6.628432%
2,9132,12999,"£13,931",£932,7.172053%
3,6451,11995,"£32,462","£20,467",170.632974%
4,7199,29875,"£34,054","£4,179",13.986681%
5,2094,21986,"£26,020","£4,034",18.345795%
6,3592,20490,"£20,928",£438,2.139357%
7,7942,16220,"£14,703","£-1,517",-9.351672%
8,3284,20980,"£22,924","£1,944",9.267483%
9,4112,31840,"£33,182","£1,342",4.214804%


## model_cbt_10

In [104]:
parameters = {
    'iterations':30000,
    'learning_rate': 0.01,
    'verbose':300,
    'od_type' : "Iter",
    'od_wait' : 500,
    'eval_metric': 'MAPE',
    'loss_function': 'MAE',
    'random_seed':42
}

In [105]:
model_cbt_10 = CatBoostRegressor(**parameters)

In [106]:
model_cbt_10.fit(
    train_encoded[X],
    train_encoded[y],
    eval_set=(val_encoded[X],val_encoded[y])
)

0:	learn: 0.4137104	test: 0.4080776	best: 0.4080776 (0)	total: 2.88ms	remaining: 1m 26s
300:	learn: 0.1146232	test: 0.1111172	best: 0.1111172 (300)	total: 581ms	remaining: 57.4s
600:	learn: 0.0859719	test: 0.0846389	best: 0.0846389 (600)	total: 1.16s	remaining: 57s
900:	learn: 0.0780526	test: 0.0784110	best: 0.0784110 (900)	total: 1.72s	remaining: 55.5s
1200:	learn: 0.0724956	test: 0.0735891	best: 0.0735891 (1200)	total: 2.37s	remaining: 56.9s
1500:	learn: 0.0692150	test: 0.0712102	best: 0.0712102 (1500)	total: 2.94s	remaining: 55.8s
1800:	learn: 0.0661655	test: 0.0693313	best: 0.0693313 (1800)	total: 3.52s	remaining: 55.1s
2100:	learn: 0.0638923	test: 0.0680806	best: 0.0680806 (2100)	total: 4.1s	remaining: 54.4s
2400:	learn: 0.0620651	test: 0.0672313	best: 0.0672313 (2400)	total: 4.66s	remaining: 53.6s
2700:	learn: 0.0605792	test: 0.0666145	best: 0.0666145 (2700)	total: 5.28s	remaining: 53.4s
3000:	learn: 0.0593958	test: 0.0661510	best: 0.0661510 (3000)	total: 5.87s	remaining: 52.8s
3

<catboost.core.CatBoostRegressor at 0x7f2babe5ee60>

In [107]:
model_cbt_10.best_iteration_

11265

In [108]:
model_cbt_10.predict(test_encoded[X])

array([56843.00721837, 22496.69263505, 13877.44212511, ...,
       31319.25248717, 13583.00399137, 24192.84495025])

In [109]:
test_encoded['price_pred_cbt_10'] = model_cbt_10.predict(test_encoded[X])

In [110]:
test_encoded[['price',  'price_pred_cbt_7', 'price_pred_cbt_8', 'price_pred_cbt_9', 'price_pred_cbt_10']]

Unnamed: 0,price,price_pred_cbt_7,price_pred_cbt_8,price_pred_cbt_9,price_pred_cbt_10
4984,52990,56860.148594,57499.047603,56965.156474,56843.007218
4880,24081,22505.345563,22668.978418,22484.807194,22496.692635
9132,12999,13185.289409,13720.259839,13931.295223,13877.442125
6451,11995,32690.270386,32697.795077,32462.425273,32804.492621
7199,29875,35174.795657,36179.639853,34053.521050,33272.447678
...,...,...,...,...,...
9627,14999,14038.910167,14216.430281,13924.932088,14102.385227
6096,29676,30598.034566,29971.629218,31576.058423,31627.682101
1747,30570,31846.337080,31247.412136,32010.050906,31319.252487
4023,14999,13731.286704,13811.040678,13603.162162,13583.003991


In [111]:
error(test_encoded['price'], test_encoded['price_pred_cbt_10'])

1441.9741130619473
0.06695377267167792


**result on test_encoded set:**
- 1441.9741130619473
- 0.06695377267167792



**result on validation_encoded set:**
- bestTest = 0.06407486694


In [112]:
test_encoded['price_pred_10_err'] = test_encoded['price_pred_cbt_10']  - test_encoded['price']
test_encoded['price_pred_10_percent_err_abs'] =  (( - (100 - ((test_encoded['price_pred_cbt_10']  / test_encoded['price']) * 100))) / 100)

In [113]:
test_encoded[['price','price_pred_cbt_10', 'price_pred_10_err','price_pred_10_percent_err_abs']]\
.head(10)\
.reset_index().style.format({
    'price_pred_10_percent_err_abs': '{:%}',

    'price_pred_cbt_10': '£{0:,.0f}',
    'price_pred_10_err': '£{0:,.0f}',
})

Unnamed: 0,index,price,price_pred_cbt_10,price_pred_10_err,price_pred_10_percent_err_abs
0,4984,52990,"£56,843","£3,853",7.271197%
1,4880,24081,"£22,497","£-1,584",-6.579076%
2,9132,12999,"£13,877",£878,6.757767%
3,6451,11995,"£32,804","£20,809",173.484724%
4,7199,29875,"£33,272","£3,397",11.372210%
5,2094,21986,"£26,152","£4,166",18.946783%
6,3592,20490,"£20,968",£478,2.334924%
7,7942,16220,"£15,210","£-1,010",-6.224515%
8,3284,20980,"£22,924","£1,944",9.267575%
9,4112,31840,"£32,981","£1,141",3.584363%


## model 11. Learning With Complete Data.

In [114]:
model_cbt_7.best_iteration_

996

In [115]:
parameters = {
    'iterations': model_cbt_7.best_iteration_ + 1,
    'eval_metric': 'MAPE',
    'random_seed':42,
    'verbose':100
}

In [116]:
model_cbt_11 = CatBoostRegressor(**parameters)

In [117]:
model_cbt_11.fit(
    train_full_encoded[X],
    train_full_encoded[y]
)

Learning rate set to 0.057688
0:	learn: 0.4627935	total: 2.01ms	remaining: 2s
100:	learn: 0.0956411	total: 218ms	remaining: 1.93s
200:	learn: 0.0802715	total: 449ms	remaining: 1.78s
300:	learn: 0.0733093	total: 676ms	remaining: 1.56s
400:	learn: 0.0694633	total: 895ms	remaining: 1.33s
500:	learn: 0.0669526	total: 1.09s	remaining: 1.08s
600:	learn: 0.0648566	total: 1.28s	remaining: 845ms
700:	learn: 0.0633258	total: 1.48s	remaining: 624ms
800:	learn: 0.0619799	total: 1.66s	remaining: 407ms
900:	learn: 0.0606913	total: 1.85s	remaining: 198ms
996:	learn: 0.0594990	total: 2.02s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f2babe5f190>

In [118]:
model_cbt_11.predict(test_encoded[X])

array([56728.83202988, 22741.42346116, 13596.58155801, ...,
       31560.59345602, 13496.63325881, 25050.81843774])

In [119]:
test_encoded['price_pred_cbt_11'] = model_cbt_11.predict(test_encoded[X])

In [120]:
test_encoded[['price',  'price_pred_cbt_7', 'price_pred_cbt_8', 'price_pred_cbt_9', 'price_pred_cbt_10', 'price_pred_cbt_11']]

Unnamed: 0,price,price_pred_cbt_7,price_pred_cbt_8,price_pred_cbt_9,price_pred_cbt_10,price_pred_cbt_11
4984,52990,56860.148594,57499.047603,56965.156474,56843.007218,56728.832030
4880,24081,22505.345563,22668.978418,22484.807194,22496.692635,22741.423461
9132,12999,13185.289409,13720.259839,13931.295223,13877.442125,13596.581558
6451,11995,32690.270386,32697.795077,32462.425273,32804.492621,31992.154426
7199,29875,35174.795657,36179.639853,34053.521050,33272.447678,34268.745261
...,...,...,...,...,...,...
9627,14999,14038.910167,14216.430281,13924.932088,14102.385227,13868.907190
6096,29676,30598.034566,29971.629218,31576.058423,31627.682101,30918.292795
1747,30570,31846.337080,31247.412136,32010.050906,31319.252487,31560.593456
4023,14999,13731.286704,13811.040678,13603.162162,13583.003991,13496.633259


In [121]:
error(test_encoded['price'], test_encoded['price_pred_cbt_11'])

1459.7858677038416
0.06920057489079463


**result on validation set:**
- 1459.7858677038416
- 0.06920057489079463



## model 12. Learning With Complete Data and tunned params

In [122]:
parameters = {
    'iterations':50000,
    'verbose':1000,
    'od_type' : "Iter",
    'od_wait' : 500,
    'eval_metric': 'MAPE',
    'loss_function': 'MAE',
    'random_seed':42
}

In [123]:
model_cbt_12 = CatBoostRegressor(**parameters)

In [124]:
model_cbt_12.fit(
    train_full_encoded[X],
    train_full_encoded[y]
)

0:	learn: 0.4058407	total: 2.79ms	remaining: 2m 19s
1000:	learn: 0.0608655	total: 2.43s	remaining: 1m 58s
2000:	learn: 0.0544658	total: 5.09s	remaining: 2m 2s
3000:	learn: 0.0514615	total: 7.66s	remaining: 1m 59s
4000:	learn: 0.0494942	total: 10s	remaining: 1m 55s
5000:	learn: 0.0481531	total: 12.6s	remaining: 1m 53s
6000:	learn: 0.0470651	total: 15.2s	remaining: 1m 51s
7000:	learn: 0.0461264	total: 17.8s	remaining: 1m 49s
8000:	learn: 0.0453199	total: 20.3s	remaining: 1m 46s
9000:	learn: 0.0446224	total: 23s	remaining: 1m 44s
10000:	learn: 0.0440706	total: 25.8s	remaining: 1m 43s
11000:	learn: 0.0435923	total: 28.4s	remaining: 1m 40s
12000:	learn: 0.0430819	total: 31.1s	remaining: 1m 38s
13000:	learn: 0.0426741	total: 33.9s	remaining: 1m 36s
14000:	learn: 0.0423172	total: 36.7s	remaining: 1m 34s
15000:	learn: 0.0419626	total: 39.5s	remaining: 1m 32s
16000:	learn: 0.0416514	total: 42.3s	remaining: 1m 29s
17000:	learn: 0.0413076	total: 45.2s	remaining: 1m 27s
18000:	learn: 0.0410140	tot

<catboost.core.CatBoostRegressor at 0x7f2babe37c40>

In [125]:
model_cbt_12.predict(test_encoded[X])

array([57231.65316068, 22739.65474344, 14169.22705058, ...,
       31236.38425876, 13167.31184567, 24340.1518666 ])

In [126]:
test_encoded['price_pred_cbt_12'] = model_cbt_12.predict(test_encoded[X])

In [127]:
error(test_encoded['price'], test_encoded['price_pred_cbt_12'])

1405.1858032964403
0.06504003737857018


**result on validation set:**
- 1405.1858032964403
- 0.06504003737857018




# Conclusion:

In [128]:
test_encoded['price_pred_7_err'] = test_encoded['price_pred_cbt_7']  - test_encoded['price']
test_encoded['price_pred_7_percent_err'] =  (( - (100 - ((test_encoded['price_pred_cbt_7']  / test_encoded['price']) * 100))) / 100)

test_encoded['price_pred_8_err'] = test_encoded['price_pred_cbt_8']  - test_encoded['price']
test_encoded['price_pred_8_percent_err'] =  (( - (100 - ((test_encoded['price_pred_cbt_8']  / test_encoded['price']) * 100))) / 100)

test_encoded['price_pred_9_err'] = test_encoded['price_pred_cbt_9']  - test_encoded['price']
test_encoded['price_pred_9_percent_err'] =  (( - (100 - ((test_encoded['price_pred_cbt_9']  / test_encoded['price']) * 100))) / 100)

test_encoded['price_pred_10_err'] = test_encoded['price_pred_cbt_10']  - test_encoded['price']
test_encoded['price_pred_10_percent_err'] =  (( - (100 - ((test_encoded['price_pred_cbt_10']  / test_encoded['price']) * 100))) / 100)

test_encoded['price_pred_11_err'] = test_encoded['price_pred_cbt_11']  - test_encoded['price']
test_encoded['price_pred_11_percent_err'] =  (( - (100 - ((test_encoded['price_pred_cbt_11']  / test_encoded['price']) * 100))) / 100)

test_encoded['price_pred_12_err'] = test_encoded['price_pred_cbt_12']  - test_encoded['price']
test_encoded['price_pred_12_percent_err'] =  (( - (100 - ((test_encoded['price_pred_cbt_12']  / test_encoded['price']) * 100))) / 100)

In [129]:
test_encoded[[
    'price',
             'price_pred_cbt_7', 'price_pred_7_err','price_pred_7_percent_err',
             'price_pred_cbt_8', 'price_pred_8_err','price_pred_8_percent_err',
             'price_pred_cbt_9', 'price_pred_9_err','price_pred_9_percent_err',
             'price_pred_cbt_10', 'price_pred_10_err','price_pred_10_percent_err',
             'price_pred_cbt_11', 'price_pred_11_err','price_pred_11_percent_err',
             'price_pred_cbt_12', 'price_pred_12_err','price_pred_12_percent_err']]\
.reset_index().style.format({
    'price_pred_7_percent_err': '{:.3%}','price_pred_cbt_7': '£{0:,.0f}','price_pred_7_err': '£{0:,.0f}',
    'price_pred_8_percent_err': '{:.3%}','price_pred_cbt_8': '£{0:,.0f}','price_pred_8_err': '£{0:,.0f}',
    'price_pred_9_percent_err': '{:.3%}','price_pred_cbt_9': '£{0:,.0f}','price_pred_9_err': '£{0:,.0f}',
    'price_pred_10_percent_err': '{:.3%}','price_pred_cbt_10': '£{0:,.0f}','price_pred_10_err': '£{0:,.0f}',
    'price_pred_11_percent_err': '{:.3%}','price_pred_cbt_11': '£{0:,.0f}','price_pred_11_err': '£{0:,.0f}',
    'price_pred_12_percent_err': '{:.3%}','price_pred_cbt_12': '£{0:,.0f}','price_pred_12_err': '£{0:,.0f}',
})

Unnamed: 0,index,price,price_pred_cbt_7,price_pred_7_err,price_pred_7_percent_err,price_pred_cbt_8,price_pred_8_err,price_pred_8_percent_err,price_pred_cbt_9,price_pred_9_err,price_pred_9_percent_err,price_pred_cbt_10,price_pred_10_err,price_pred_10_percent_err,price_pred_cbt_11,price_pred_11_err,price_pred_11_percent_err,price_pred_cbt_12,price_pred_12_err,price_pred_12_percent_err
0,4984,52990,"£56,860","£3,870",7.304%,"£57,499","£4,509",8.509%,"£56,965","£3,975",7.502%,"£56,843","£3,853",7.271%,"£56,729","£3,739",7.056%,"£57,232","£4,242",8.005%
1,4880,24081,"£22,505","£-1,576",-6.543%,"£22,669","£-1,412",-5.864%,"£22,485","£-1,596",-6.628%,"£22,497","£-1,584",-6.579%,"£22,741","£-1,340",-5.563%,"£22,740","£-1,341",-5.570%
2,9132,12999,"£13,185",£186,1.433%,"£13,720",£721,5.549%,"£13,931",£932,7.172%,"£13,877",£878,6.758%,"£13,597",£598,4.597%,"£14,169","£1,170",9.002%
3,6451,11995,"£32,690","£20,695",172.532%,"£32,698","£20,703",172.595%,"£32,462","£20,467",170.633%,"£32,804","£20,809",173.485%,"£31,992","£19,997",166.712%,"£32,497","£20,502",170.922%
4,7199,29875,"£35,175","£5,300",17.740%,"£36,180","£6,305",21.103%,"£34,054","£4,179",13.987%,"£33,272","£3,397",11.372%,"£34,269","£4,394",14.707%,"£33,949","£4,074",13.637%
5,2094,21986,"£26,271","£4,285",19.489%,"£25,295","£3,309",15.053%,"£26,020","£4,034",18.346%,"£26,152","£4,166",18.947%,"£26,020","£4,034",18.349%,"£25,041","£3,055",13.894%
6,3592,20490,"£20,859",£369,1.801%,"£20,675",£185,0.903%,"£20,928",£438,2.139%,"£20,968",£478,2.335%,"£20,794",£304,1.485%,"£20,408",£-82,-0.401%
7,7942,16220,"£15,758",£-462,-2.848%,"£16,456",£236,1.457%,"£14,703","£-1,517",-9.352%,"£15,210","£-1,010",-6.225%,"£16,314",£94,0.582%,"£16,954",£734,4.526%
8,3284,20980,"£22,706","£1,726",8.225%,"£22,707","£1,727",8.232%,"£22,924","£1,944",9.267%,"£22,924","£1,944",9.268%,"£22,426","£1,446",6.894%,"£21,709",£729,3.476%
9,4112,31840,"£33,543","£1,703",5.348%,"£33,447","£1,607",5.047%,"£33,182","£1,342",4.215%,"£32,981","£1,141",3.584%,"£33,511","£1,671",5.249%,"£33,083","£1,243",3.904%
