# Import Libraries

In [16]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

# Data Preprocessing

In [18]:
# Read ford.csv into a pandas DataFrame and display it
csv_path = 'ford.csv'
dataset = pd.read_csv(csv_path)
dataset

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,Manual,16700,Petrol,150,47.1,1.4
17962,B-MAX,2014,7499,Manual,40700,Petrol,30,57.7,1.0
17963,Focus,2015,9999,Manual,7010,Diesel,20,67.3,1.6
17964,KA,2018,8299,Manual,5007,Petrol,145,57.7,1.2


In [19]:
# Count the missing entries in every column (isna is the same check as isnull)
dataset.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [20]:
# List the distinct values of the two text columns, one line per column
for column in ('transmission', 'fuelType'):
    print(dataset[column].unique())

['Automatic' 'Manual' 'Semi-Auto']
['Petrol' 'Diesel' 'Hybrid' 'Electric' 'Other']


In [21]:
# Integer codes for the two categorical columns; the keys are the categories
# listed by the unique() check in the previous cell.
transmission_codes = {'Automatic': 0, 'Manual': 1, 'Semi-Auto': 2}
fuel_type_codes = {'Petrol': 0, 'Diesel': 1, 'Hybrid': 2, 'Electric': 3, 'Other': 4}

# Build a new frame instead of replacing in place. Values that are not keys of
# a mapping (including codes produced by an earlier run of this cell) pass
# through unchanged, so re-running the cell is harmless.
dataset = dataset.assign(
    transmission=dataset['transmission'].map(lambda value: transmission_codes.get(value, value)),
    fuelType=dataset['fuelType'].map(lambda value: fuel_type_codes.get(value, value)),
)

# Make the integer dtype explicit rather than relying on replace() to downcast
# the object columns; the cast raises if an unmapped category is still present.
dataset = dataset.astype({'transmission': 'int64', 'fuelType': 'int64'})

In [22]:
# Display the frame again to check that transmission and fuelType now hold integer codes
dataset

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,0,15944,0,150,57.7,1.0
1,Focus,2018,14000,1,9083,0,150,57.7,1.0
2,Focus,2017,13000,1,12456,0,150,57.7,1.0
3,Fiesta,2019,17500,1,10460,0,145,40.3,1.5
4,Fiesta,2019,16500,0,1482,0,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,1,16700,0,150,47.1,1.4
17962,B-MAX,2014,7499,1,40700,0,30,57.7,1.0
17963,Focus,2015,9999,1,7010,1,20,67.3,1.6
17964,KA,2018,8299,1,5007,0,145,57.7,1.2


# Splitting Data into features and labels

In [23]:
# Features: every column except the car model name and the target price
feature_frame = dataset.drop(columns=['model', 'price'])
x = feature_frame.to_numpy()

# Target: the price column
y = dataset['price']
y

0        12000
1        14000
2        13000
3        17500
4        16500
         ...  
17961     8999
17962     7499
17963     9999
17964     8299
17965     8299
Name: price, Length: 17966, dtype: int64

# Let's standardize the data

In [24]:
# Create the scaler and fit it on the feature matrix so it learns the
# per-column statistics used for standardization.
# NOTE(review): the fit uses every row of x, while the train/test split only
# happens in a later cell, so the test rows contribute to these statistics —
# confirm this is intended.
scaler = StandardScaler()
scaler.fit(x)

In [25]:
# Apply the fitted scaler to the feature matrix and display the result
standardized_x = scaler.transform(x)
standardized_x

array([[ 0.06512772, -2.67003231, -0.38099808, ...,  0.59135805,
        -0.02044162, -0.81138621],
       [ 0.55286624,  0.04135139, -0.73335899, ...,  0.59135805,
        -0.02044162, -0.81138621],
       [ 0.06512772,  0.04135139, -0.56013157, ...,  0.59135805,
        -0.02044162, -0.81138621],
       ...,
       [-0.91034931,  0.04135139, -0.83982222, ..., -1.50505332,
         0.92766777,  0.57636151],
       [ 0.55286624,  0.04135139, -0.94269045, ...,  0.51072684,
        -0.02044162, -0.34880364],
       [-0.91034931,  0.04135139, -0.94269045, ..., -1.47280084,
        -0.02044162, -0.81138621]])

In [26]:
# From here on x is the standardized feature matrix; y stays the unscaled price column
x, y = standardized_x, dataset['price']

In [27]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)

# Shapes of the full, training and testing arrays: features first, then target
for whole, train_part, test_part in ((x, x_train, x_test), (y, y_train, y_test)):
    print(whole.shape, train_part.shape, test_part.shape)

(17966, 7) (16169, 7) (1797, 7)
(17966,) (16169,) (1797,)


# XGB Regressor

In [28]:
# create a new XGBoost regressor; no hyperparameters are passed, so the library defaults apply
xgb_model = XGBRegressor()

In [29]:
# train the regressor on the training split (features x_train, target y_train)
xgb_model.fit(x_train, y_train)

# Model Evaluation

In [31]:
# Predict prices for the same rows the model was trained on
training_data_pred = xgb_model.predict(x_train)

# R2 score and mean absolute error of those predictions against the true prices
score_1 = metrics.r2_score(y_true=y_train, y_pred=training_data_pred)
mae = mean_absolute_error(y_true=y_train, y_pred=training_data_pred)

print("R2 score on training data :", score_1)
print("Mean absolute error on training data :", mae)

R2 score on training data : 0.9531964580818831
Mean absolute error on training data : 740.4128602406732


In [32]:
# Predict prices for the held-out rows
testing_data_pred = xgb_model.predict(x_test)

# R2 score and mean absolute error of those predictions against the true prices.
# Note: score_1 and mae are reused here, so the training values are overwritten.
score_1 = metrics.r2_score(y_true=y_test, y_pred=testing_data_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=testing_data_pred)

print("R2 score on testing data :", score_1)
print("Mean absolute error on testing data :", mae)

R2 score on testing data : 0.9116280023596516
Mean absolute error on testing data : 907.3636198473694


# Making Predictions

In [36]:
# One car, with the values in the order of the training feature columns:
# year, transmission, mileage, fuelType, tax, mpg, engineSize
input_data = (2015, 1, 7010, 1,  20, 67.3, 1.6)

# Wrapping the tuple in a list yields a single-row 2-D array, which is the
# layout the scaler and the model expect
input_changed = np.array([input_data])

# Scale the row with the scaler that was fitted on the training features
std_input = scaler.transform(input_changed)

prediction = xgb_model.predict(std_input)
print(prediction)

estimated_price = prediction[0]
print("This car price estimation :", estimated_price)

[10241.161]
This car price estimation : 10241.161


# Saving Model and Scaler

In [37]:
import joblib

# Persist both fitted objects: predictions on new data need the scaler as well
# as the regressor
model_path = 'xgb_model.pkl'
scaler_path = 'scaler.pkl'

joblib.dump(xgb_model, model_path)
joblib.dump(scaler, scaler_path)

['scaler.pkl']