# Introducing:

## Context:
The cleaned data set contains information of price, transmission, mileage, fuel type, road tax, miles per gallon (mpg), and engine size. I've removed duplicate listings and cleaned the columns, but have included a notebook showing the process and the original data for anyone who wants to check/improve my work.

## Task:
*Predict cars price*

## Used libraries:
- pandas
- numpy
- matplotlib
- sklearn
- xgboost

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('dark_background')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_percentage_error

import xgboost as xgb
from xgboost import XGBRegressor

# Data preparation

## Read data

In [5]:
df = pd.read_csv('/home/antonius/Projects/DS_Projects/Data-analysis-with-data-science/02_uk_used_cars_price_prediction/bmw.csv')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0


## Format data

In [6]:
len(df)

10781

In [7]:
df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize'],
      dtype='object')

### Skips, Nans

In [8]:
df.isna().mean().sort_values(ascending=False)

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

In [9]:
df.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
dtype: object

In [10]:
df.select_dtypes(include='object')

Unnamed: 0,model,transmission,fuelType
0,5 Series,Automatic,Diesel
1,6 Series,Automatic,Petrol
2,5 Series,Automatic,Diesel
3,1 Series,Automatic,Diesel
4,7 Series,Automatic,Diesel
...,...,...,...
10776,X3,Automatic,Diesel
10777,5 Series,Automatic,Diesel
10778,3 Series,Manual,Petrol
10779,1 Series,Automatic,Diesel


*cat_features: model, transmission, fuelType*

In [11]:
df['model'].unique()

array([' 5 Series', ' 6 Series', ' 1 Series', ' 7 Series', ' 2 Series',
       ' 4 Series', ' X3', ' 3 Series', ' X5', ' X4', ' i3', ' X1', ' M4',
       ' X2', ' X6', ' 8 Series', ' Z4', ' X7', ' M5', ' i8', ' M2',
       ' M3', ' M6', ' Z3'], dtype=object)

In [12]:
df['transmission'].unique()

array(['Automatic', 'Manual', 'Semi-Auto'], dtype=object)

In [13]:
df['fuelType'].unique()

array(['Diesel', 'Petrol', 'Other', 'Hybrid', 'Electric'], dtype=object)

# Func for Checking model

In [14]:
def error(y_true,y_pred):
  print(mean_absolute_error(y_true,y_pred))
  print(mean_absolute_percentage_error(y_true,y_pred))

# One-Hot Encoding
*cat_features: model, transmission, fuelType*

In [15]:
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0
...,...,...,...,...,...,...,...,...,...
10776,X3,2016,19000,Automatic,40818,Diesel,150,54.3,2.0
10777,5 Series,2016,14600,Automatic,42947,Diesel,125,60.1,2.0
10778,3 Series,2017,13100,Manual,25468,Petrol,200,42.8,2.0
10779,1 Series,2014,9930,Automatic,45000,Diesel,30,64.2,2.0


In [16]:
df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize'],
      dtype='object')

## Perform One-Hot Encoding
*cat_features: model, transmission, fuelType*

### Perform transmission

In [17]:
#creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')

In [18]:
#perform one-hot encoding on cat_features: transmission
encoder_df = pd.DataFrame(encoder.fit_transform(df[['transmission']]).toarray())

In [19]:
#merge one-hot encoded columns back with original DataFrame
encoded_df = df.join(encoder_df)

In [20]:
encoded_df.head(1)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,0,1,2
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0


In [21]:
encoded_df.columns

Index([       'model',         'year',        'price', 'transmission',
            'mileage',     'fuelType',          'tax',          'mpg',
         'engineSize',              0,              1,              2],
      dtype='object')

In [22]:
df['transmission'].value_counts().reset_index()

Unnamed: 0,index,transmission
0,Semi-Auto,4666
1,Automatic,3588
2,Manual,2527


In [23]:
encoded_df[[0, 1, 2]].value_counts()

0    1    2  
0.0  0.0  1.0    4666
1.0  0.0  0.0    3588
0.0  1.0  0.0    2527
dtype: int64

#### Rename Columns

In [24]:
encoded_df.columns

Index([       'model',         'year',        'price', 'transmission',
            'mileage',     'fuelType',          'tax',          'mpg',
         'engineSize',              0,              1,              2],
      dtype='object')

In [25]:
encoded_df.columns = [
    'model', 'year', 'price', 'transmission',
    'mileage', 'fuelType', 'tax',
    'mpg', 'engineSize',
    'is_automatic','is_manual', 'is_semi_auto']
encoded_df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0,1.0,0.0,0.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0,1.0,0.0,0.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5,1.0,0.0,0.0
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10776,X3,2016,19000,Automatic,40818,Diesel,150,54.3,2.0,1.0,0.0,0.0
10777,5 Series,2016,14600,Automatic,42947,Diesel,125,60.1,2.0,1.0,0.0,0.0
10778,3 Series,2017,13100,Manual,25468,Petrol,200,42.8,2.0,0.0,1.0,0.0
10779,1 Series,2014,9930,Automatic,45000,Diesel,30,64.2,2.0,1.0,0.0,0.0


In [26]:
encoded_df['is_semi_auto'].value_counts().reset_index()

Unnamed: 0,index,is_semi_auto
0,0.0,6115
1,1.0,4666


#### Drop the Original Categuniquecal Variable

In [27]:
encoded_df.drop('transmission', axis=1, inplace=True)

In [28]:
encoded_df.head(1)

Unnamed: 0,model,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto
0,5 Series,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0


### Perform model

In [29]:
#perform one-hot encoding on cat_features: model
encoder_df = pd.DataFrame(encoder.fit_transform(encoded_df[['model']]).toarray())

In [30]:
#merge one-hot encoded columns back with original DataFrame
encoded_df2 = encoded_df.join(encoder_df)

In [31]:
encoded_df.head(1)

Unnamed: 0,model,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto
0,5 Series,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0


In [32]:
encoded_df2.head(1)

Unnamed: 0,model,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,...,14,15,16,17,18,19,20,21,22,23
0,5 Series,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
encoded_df2.columns

Index([       'model',         'year',        'price',      'mileage',
           'fuelType',          'tax',          'mpg',   'engineSize',
       'is_automatic',    'is_manual', 'is_semi_auto',              0,
                    1,              2,              3,              4,
                    5,              6,              7,              8,
                    9,             10,             11,             12,
                   13,             14,             15,             16,
                   17,             18,             19,             20,
                   21,             22,             23],
      dtype='object')

In [34]:
encoded_df2['model'].value_counts().reset_index()

Unnamed: 0,index,model
0,3 Series,2443
1,1 Series,1969
2,2 Series,1229
3,5 Series,1056
4,4 Series,995
5,X1,804
6,X3,551
7,X5,468
8,X2,288
9,X4,179


In [35]:
encoded_df2[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
            11, 12, 13, 14, 15, 16, 17, 18, 19, 
            20, 21, 22, 23]].value_counts()

0    1    2    3    4    5    6    7    8    9    10   11   12   13   14   15   16   17   18   19   20   21   22   23 
0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    2443
1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1969
0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1229
     0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0    1056
               1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     995
               0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     804
                                                                 0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0 

#### Rename Columns

In [36]:
encoded_df2.columns

Index([       'model',         'year',        'price',      'mileage',
           'fuelType',          'tax',          'mpg',   'engineSize',
       'is_automatic',    'is_manual', 'is_semi_auto',              0,
                    1,              2,              3,              4,
                    5,              6,              7,              8,
                    9,             10,             11,             12,
                   13,             14,             15,             16,
                   17,             18,             19,             20,
                   21,             22,             23],
      dtype='object')

In [37]:
encoded_df2.columns = [
    'model','year','price','mileage',
    'fuelType','tax','mpg','engineSize',
    'is_automatic','is_manual','is_semi_auto',
    'Is_1_series','Is_2_series','Is_3_series','Is_4_series','Is_5_series',
    'Is_6_series','Is_7_series','Is_8_series','Is_M2','Is_M3','Is_M4','Is_M5','Is_M6',
    'Is_X1','Is_X2','Is_X3','Is_X4','Is_X5','Is_X6','Is_X7',
    'Is_Z3','Is_z4','Is_i3','Is_i8'
]
pd.set_option('display.max_columns', None)
encoded_df2[['model','Is_1_series','Is_2_series','Is_3_series','Is_4_series','Is_5_series',
    'Is_6_series','Is_7_series','Is_8_series','Is_M2','Is_M3','Is_M4','Is_M5','Is_M6',
    'Is_X1','Is_X2','Is_X3','Is_X4','Is_X5','Is_X6','Is_X7',
    'Is_Z3','Is_z4','Is_i3','Is_i8'
]]

Unnamed: 0,model,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8
0,5 Series,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6 Series,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5 Series,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1 Series,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7 Series,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,X3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10777,5 Series,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10778,3 Series,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10779,1 Series,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Drop the Original Categorical Variable

In [38]:
encoded_df2.drop('model', axis=1, inplace=True)

In [39]:
encoded_df2.head(1)

Unnamed: 0,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8
0,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Perform fuelType

In [40]:
#perform one-hot encoding on cat_features: fuelType
encoder_df = pd.DataFrame(encoder.fit_transform(encoded_df2[['fuelType']]).toarray())

In [41]:
#merge one-hot encoded columns back with original DataFrame
encoded_df3 = encoded_df2.join(encoder_df)

In [42]:
encoded_df3.head(1)

Unnamed: 0,year,price,mileage,fuelType,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,0,1,2,3,4
0,2014,11200,67068,Diesel,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [43]:
encoded_df3.columns

Index([        'year',        'price',      'mileage',     'fuelType',
                'tax',          'mpg',   'engineSize', 'is_automatic',
          'is_manual', 'is_semi_auto',  'Is_1_series',  'Is_2_series',
        'Is_3_series',  'Is_4_series',  'Is_5_series',  'Is_6_series',
        'Is_7_series',  'Is_8_series',        'Is_M2',        'Is_M3',
              'Is_M4',        'Is_M5',        'Is_M6',        'Is_X1',
              'Is_X2',        'Is_X3',        'Is_X4',        'Is_X5',
              'Is_X6',        'Is_X7',        'Is_Z3',        'Is_z4',
              'Is_i3',        'Is_i8',              0,              1,
                    2,              3,              4],
      dtype='object')

In [44]:
encoded_df3['fuelType'].value_counts().reset_index()

Unnamed: 0,index,fuelType
0,Diesel,7027
1,Petrol,3417
2,Hybrid,298
3,Other,36
4,Electric,3


In [45]:
encoded_df3[[0, 1, 2, 3, 4]].value_counts()

0    1    2    3    4  
1.0  0.0  0.0  0.0  0.0    7027
0.0  0.0  0.0  0.0  1.0    3417
          1.0  0.0  0.0     298
          0.0  1.0  0.0      36
     1.0  0.0  0.0  0.0       3
dtype: int64

#### Rename Columns

In [46]:
encoded_df3.columns

Index([        'year',        'price',      'mileage',     'fuelType',
                'tax',          'mpg',   'engineSize', 'is_automatic',
          'is_manual', 'is_semi_auto',  'Is_1_series',  'Is_2_series',
        'Is_3_series',  'Is_4_series',  'Is_5_series',  'Is_6_series',
        'Is_7_series',  'Is_8_series',        'Is_M2',        'Is_M3',
              'Is_M4',        'Is_M5',        'Is_M6',        'Is_X1',
              'Is_X2',        'Is_X3',        'Is_X4',        'Is_X5',
              'Is_X6',        'Is_X7',        'Is_Z3',        'Is_z4',
              'Is_i3',        'Is_i8',              0,              1,
                    2,              3,              4],
      dtype='object')

In [47]:
encoded_df3.columns = [
    'year','price','mileage','fuelType',
    'tax','mpg','engineSize',
    'is_automatic','is_manual','is_semi_auto',
    'Is_1_series',  'Is_2_series','Is_3_series',  'Is_4_series',
    'Is_5_series','Is_6_series','Is_7_series',  'Is_8_series',
    'Is_M2','Is_M3','Is_M4','Is_M5','Is_M6',
    'Is_X1','Is_X2','Is_X3','Is_X4','Is_X5','Is_X6','Is_X7',
    'Is_Z3','Is_z4','Is_i3','Is_i8',
    'Is_diesel','Is_electric','Is_hybrid','Is_other_fuel','Is_petrol'
    
]
encoded_df3[[
     'fuelType','Is_diesel','Is_electric','Is_hybrid','Is_other_fuel','Is_petrol'
]]

Unnamed: 0,fuelType,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,Diesel,1.0,0.0,0.0,0.0,0.0
1,Petrol,0.0,0.0,0.0,0.0,1.0
2,Diesel,1.0,0.0,0.0,0.0,0.0
3,Diesel,1.0,0.0,0.0,0.0,0.0
4,Diesel,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
10776,Diesel,1.0,0.0,0.0,0.0,0.0
10777,Diesel,1.0,0.0,0.0,0.0,0.0
10778,Petrol,0.0,0.0,0.0,0.0,1.0
10779,Diesel,1.0,0.0,0.0,0.0,0.0


#### Drop the Original Categorical Variable

In [48]:
encoded_df3.drop('fuelType', axis=1, inplace=True)

In [49]:
encoded_df3

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2018,27000,14827,145,42.8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016,16000,62794,160,51.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,12750,26676,145,72.4,1.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,14500,39554,160,50.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,19000,40818,150,54.3,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10777,2016,14600,42947,125,60.1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10778,2017,13100,25468,200,42.8,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10779,2014,9930,45000,30,64.2,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


**The one-hot encoding is complete and we can now feed this pandas DataFrame into any machine learning algorithm**

## Final Encoded Dataframe

In [50]:
final_encoded_df = encoded_df3

In [51]:
final_encoded_df

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2018,27000,14827,145,42.8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016,16000,62794,160,51.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,12750,26676,145,72.4,1.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,14500,39554,160,50.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,19000,40818,150,54.3,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10777,2016,14600,42947,125,60.1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10778,2017,13100,25468,200,42.8,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10779,2014,9930,45000,30,64.2,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Split Encoded Dataset into train, validation, test, train_full

- train - 60%
- val - 20%
- test - 20%
- train_full - 80%

In [52]:
final_encoded_df.head(1)

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [53]:
train_encoded, test_encoded = train_test_split(final_encoded_df, train_size=0.6, random_state=42)

In [54]:
len(train_encoded)

6468

In [55]:
len(test_encoded)

4313

In [56]:
len(train_encoded) / len(final_encoded_df)

0.5999443465355718

In [57]:
len(test_encoded) / len (final_encoded_df)

0.4000556534644282

In [58]:
val_encoded, test_encoded = train_test_split(test_encoded, train_size=0.5, random_state=42)

- train 60%
- val 20%
- test 20%

In [59]:
len(train_encoded) / len(final_encoded_df)

0.5999443465355718

In [60]:
len(val_encoded) / len(final_encoded_df)

0.19998144884519062

In [61]:
len(test_encoded) / len(final_encoded_df)

0.20007420461923756

- merge train and val
- train_full 80%
- test 20%

In [62]:
len(train_encoded)

6468

In [63]:
len(val_encoded)

2156

In [64]:
len(train_encoded) + len(val_encoded)

8624

In [65]:
train_full_encoded = pd.concat([train_encoded,val_encoded])

In [66]:
len(train_full_encoded) / len(final_encoded_df)

0.7999257953807625

In [67]:
len(test_encoded) / len(final_encoded_df)

0.20007420461923756

In [68]:
test_encoded

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
4984,2019,52990,3086,145,34.9,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4880,2018,24081,13245,150,60.1,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9132,2016,12999,68949,200,43.5,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6451,2020,11995,10,150,34.5,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7199,2020,29875,150,145,42.2,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9627,2015,14999,78680,160,52.3,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6096,2019,29676,7365,145,41.5,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1747,2019,30570,3067,145,49.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4023,2016,14999,70054,0,148.7,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# XGBoost

In [69]:
final_encoded_df.head()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,is_automatic,is_manual,is_semi_auto,Is_1_series,Is_2_series,Is_3_series,Is_4_series,Is_5_series,Is_6_series,Is_7_series,Is_8_series,Is_M2,Is_M3,Is_M4,Is_M5,Is_M6,Is_X1,Is_X2,Is_X3,Is_X4,Is_X5,Is_X6,Is_X7,Is_Z3,Is_z4,Is_i3,Is_i8,Is_diesel,Is_electric,Is_hybrid,Is_other_fuel,Is_petrol
0,2014,11200,67068,125,57.6,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2018,27000,14827,145,42.8,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2016,16000,62794,160,51.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2017,12750,26676,145,72.4,1.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2014,14500,39554,160,50.4,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [70]:
X = ['year', 'mileage', 'tax', 'mpg', 'engineSize', 'is_automatic',
       'is_manual', 'is_semi_auto', 'Is_1_series', 'Is_2_series',
       'Is_3_series', 'Is_4_series', 'Is_5_series', 'Is_6_series',
       'Is_7_series', 'Is_8_series', 'Is_M2', 'Is_M3', 'Is_M4', 'Is_M5',
       'Is_M6', 'Is_X1', 'Is_X2', 'Is_X3', 'Is_X4', 'Is_X5', 'Is_X6', 'Is_X7',
       'Is_Z3', 'Is_z4', 'Is_i3', 'Is_i8', 'Is_diesel', 'Is_electric',
       'Is_hybrid', 'Is_other_fuel', 'Is_petrol']

y = ['price']

In [71]:
parameters = {
    'max_depth':6, 
    'eta':0.1, 
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed' :42,
    'eval_metric': 'mape'
}

In [72]:
model_xgb_1 = XGBRegressor(**parameters)

In [73]:
model_xgb_1.fit(
    train_encoded[X],
    train_encoded[y],
    eval_set=[(val_encoded[X],val_encoded[y])]
)

[0]	validation_0-mape:0.89666
[1]	validation_0-mape:0.80415
[2]	validation_0-mape:0.72158
[3]	validation_0-mape:0.64686
[4]	validation_0-mape:0.58030
[5]	validation_0-mape:0.51998
[6]	validation_0-mape:0.46611
[7]	validation_0-mape:0.41779
[8]	validation_0-mape:0.37374
[9]	validation_0-mape:0.33457
[10]	validation_0-mape:0.29928
[11]	validation_0-mape:0.26809
[12]	validation_0-mape:0.24047
[13]	validation_0-mape:0.21587
[14]	validation_0-mape:0.19469
[15]	validation_0-mape:0.17576
[16]	validation_0-mape:0.15950
[17]	validation_0-mape:0.14564
[18]	validation_0-mape:0.13412
[19]	validation_0-mape:0.12455
[20]	validation_0-mape:0.11637
[21]	validation_0-mape:0.10994
[22]	validation_0-mape:0.10417
[23]	validation_0-mape:0.09984
[24]	validation_0-mape:0.09617
[25]	validation_0-mape:0.09370
[26]	validation_0-mape:0.09136
[27]	validation_0-mape:0.08972
[28]	validation_0-mape:0.08837
[29]	validation_0-mape:0.08679
[30]	validation_0-mape:0.08549
[31]	validation_0-mape:0.08485
[32]	validation_0-

In [74]:
model_xgb_1.predict(train_encoded[X])

array([26871.654, 33483.67 , 22344.93 , ..., 27498.684, 18896.076,
       14223.466], dtype=float32)

In [75]:
test_encoded['price_pred_xgb_1'] = model_xgb_1.predict(train_encoded[X])

ValueError: Length of values (6468) does not match length of index (2157)

In [102]:
error(train_encoded['price'], train_encoded['price_pred_xgb_1'])

1344.701444589288
0.0629854612988122


**result on test_encoded set:**
- 1344.701444589288
- 0.0629854612988122

In [125]:
train_encoded[['price',  'price_pred_xgb_1']].head(10).style.format({
    'price': '£{0:,.0f}',
    'price_pred_xgb_1': '£{0:,.0f}',
})

Unnamed: 0,price,price_pred_xgb_1
5133,"£25,000","£1,872"
2121,"£31,980","£1,504"
6576,"£22,995",£-650
8330,"£12,999",£-27
3161,"£32,980","£1,680"
2630,"£26,490","£1,651"
5703,"£20,750",£576
9180,"£8,799",£-87
6369,"£28,450",£517
10681,"£21,491",£-214
