<a href="https://colab.research.google.com/github/ahamedistiaque/audi_used_car_analysis/blob/main/ford_used_car_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [275]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

pd.options.display.max_columns =35

sns.set_context("talk")
sns.set_style("darkgrid")

In [276]:
path='/content/drive/MyDrive/Colab Notebooks/Data Processing for Machine Learning With Python/ford.csv'

In [277]:
df=pd.read_csv(path)
display(df.sample(5))

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
15656,Focus,2016,9275,Manual,27450,Petrol,61.4,1.0
17178,Fiesta,2016,8495,Manual,41966,Petrol,62.8,1.0
12420,Fiesta,2018,9890,Manual,28298,Petrol,65.7,1.0
12597,Puma,2019,21000,Manual,150,Petrol,50.4,1.0
781,EcoSport,2016,7498,Manual,48010,Diesel,64.2,1.5


### Meaning of each column
1.   model :Audi Car Model
2.   Price	: Car Price
3.   transmission	: Total transmission
4.   mileage :Total miles travelled
5.   fuelType	: Type of fuel runs the car
6.   mpg	: Runs car Miles per gallon	
7.   engineSize	: volume of the cylinders in engine
8.   Year	: Car  Making year


In [278]:
print("Number of rows",df.shape[0])
print("Number of columns",df.shape[1])

Number of rows 17964
Number of columns 8


In [279]:
# check for the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17964 entries, 0 to 17963
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17964 non-null  object 
 1   year          17964 non-null  int64  
 2   price         17964 non-null  int64  
 3   transmission  17964 non-null  object 
 4   mileage       17964 non-null  int64  
 5   fuelType      17964 non-null  object 
 6   mpg           17964 non-null  float64
 7   engineSize    17964 non-null  float64
dtypes: float64(2), int64(3), object(3)
memory usage: 1.1+ MB


In [280]:
# Data statistics
df.describe()

Unnamed: 0,year,price,mileage,mpg,engineSize
count,17964.0,17964.0,17964.0,17964.0,17964.0
mean,2016.864173,12280.078435,23361.880149,57.907832,1.350824
std,2.024987,4741.318119,19471.243292,10.125632,0.432383
min,1996.0,495.0,1.0,20.8,0.0
25%,2016.0,8999.0,9987.0,52.3,1.0
50%,2017.0,11291.0,18242.5,58.9,1.2
75%,2018.0,15299.0,31052.0,65.7,1.5
max,2020.0,54995.0,177644.0,201.8,5.0


In [281]:
df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'mpg',
       'engineSize'],
      dtype='object')

In [282]:
# Missing values in the dataset
df.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
mpg             0
engineSize      0
dtype: int64

In [283]:
print("FORD CAR MODEL :",df.model.unique())
print("FORD Total  CAR MODEL :",len(df.model.unique()))

FORD CAR MODEL : [' Fiesta' ' Focus' ' Puma' ' Kuga' ' EcoSport' ' C-MAX' ' Mondeo' ' Ka+'
 ' Tourneo Custom' ' S-MAX' ' B-MAX' ' Edge' ' Tourneo Connect'
 ' Grand C-MAX' ' KA' ' Galaxy' ' Mustang' ' Grand Tourneo Connect'
 ' Fusion' ' Ranger' ' Streetka' ' Escort' ' Transit Tourneo']
FORD Total  CAR MODEL : 23


In [284]:
df.model.value_counts()

 Fiesta                   6556
 Focus                    4588
 Kuga                     2225
 EcoSport                 1143
 C-MAX                     543
 Ka+                       531
 Mondeo                    526
 B-MAX                     355
 S-MAX                     296
 Grand C-MAX               247
 Galaxy                    228
 Edge                      208
 KA                        199
 Puma                       80
 Tourneo Custom             69
 Grand Tourneo Connect      59
 Mustang                    57
 Tourneo Connect            33
 Fusion                     16
 Streetka                    2
 Ranger                      1
 Escort                      1
 Transit Tourneo             1
Name: model, dtype: int64

In [285]:
df.fuelType.unique()

array(['Petrol', 'Diesel', 'Hybrid', 'Electric', 'Other'], dtype=object)

In [286]:
print(df.fuelType.value_counts())

Petrol      12177
Diesel       5762
Hybrid         22
Electric        2
Other           1
Name: fuelType, dtype: int64


In [287]:
display(df.transmission.unique())

print(df.transmission.value_counts())

array(['Automatic', 'Manual', 'Semi-Auto'], dtype=object)

Manual       15517
Automatic     1360
Semi-Auto     1087
Name: transmission, dtype: int64


#**Encoding**

In [288]:
#label encoding for categorical data

LE = LabelEncoder()
df["transmission_N"] = LE.fit_transform(df["transmission"])
print(LE.classes_)

df["fuelType_N"] = LE.fit_transform(df["fuelType"])
print(LE.classes_)

display(df.sample(10))

['Automatic' 'Manual' 'Semi-Auto']
['Diesel' 'Electric' 'Hybrid' 'Other' 'Petrol']


Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,transmission_N,fuelType_N
7194,Focus,2018,11250,Manual,8564,Petrol,60.1,1.0,1,4
13312,Fiesta,2020,14500,Manual,7,Petrol,56.5,1.0,1,4
198,Fiesta,2016,9500,Automatic,7350,Petrol,57.7,1.0,0,4
9248,Focus,2019,15240,Manual,13206,Diesel,80.7,1.5,1,0
1109,Fiesta,2018,11030,Manual,20271,Petrol,65.7,1.0,1,4
12947,Tourneo Custom,2019,25495,Automatic,9750,Diesel,31.7,2.0,0,0
15718,Fiesta,2016,7750,Manual,45000,Petrol,65.7,1.0,1,4
9691,Fiesta,2014,6100,Manual,49620,Petrol,54.3,1.2,1,4
8226,Focus,2018,17000,Manual,12657,Diesel,67.3,2.0,1,0
4820,B-MAX,2014,7998,Manual,9181,Petrol,55.4,1.0,1,4


In [289]:
#check data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17964 entries, 0 to 17963
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   model           17964 non-null  object 
 1   year            17964 non-null  int64  
 2   price           17964 non-null  int64  
 3   transmission    17964 non-null  object 
 4   mileage         17964 non-null  int64  
 5   fuelType        17964 non-null  object 
 6   mpg             17964 non-null  float64
 7   engineSize      17964 non-null  float64
 8   transmission_N  17964 non-null  int64  
 9   fuelType_N      17964 non-null  int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 1.4+ MB


In [290]:
df=df.drop(['transmission','fuelType'],axis=1)

In [291]:
###  One-hot Encoding
df_model = pd.get_dummies(df["model"])
display(df_model.sample(10))


Unnamed: 0,B-MAX,C-MAX,EcoSport,Edge,Escort,Fiesta,Focus,Fusion,Galaxy,Grand C-MAX,Grand Tourneo Connect,KA,Ka+,Kuga,Mondeo,Mustang,Puma,Ranger,S-MAX,Streetka,Tourneo Connect,Tourneo Custom,Transit Tourneo
6764,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6958,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1139,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4464,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5407,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5410,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13622,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8900,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14459,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15329,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [292]:

df_final= pd.get_dummies(df)

display(df_final.sample(10))
print(df_final.shape)

Unnamed: 0,year,price,mileage,mpg,engineSize,transmission_N,fuelType_N,model_ B-MAX,model_ C-MAX,model_ EcoSport,model_ Edge,model_ Escort,model_ Fiesta,model_ Focus,model_ Fusion,model_ Galaxy,model_ Grand C-MAX,model_ Grand Tourneo Connect,model_ KA,model_ Ka+,model_ Kuga,model_ Mondeo,model_ Mustang,model_ Puma,model_ Ranger,model_ S-MAX,model_ Streetka,model_ Tourneo Connect,model_ Tourneo Custom,model_ Transit Tourneo
17673,2015,10450,42020,54.3,1.6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
9958,2016,9077,22910,64.2,1.5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13462,2013,5250,62214,54.3,1.2,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17868,2018,10250,4730,43.5,1.2,1,4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
16612,2017,12499,11830,57.7,1.0,1,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17614,2018,11995,7000,65.7,1.0,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5849,2018,9070,12268,64.2,1.1,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5582,2018,13700,10606,57.7,1.0,1,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10772,2018,10699,12219,65.7,1.0,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13325,2019,25500,5573,37.7,2.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


(17964, 30)


In [293]:
len(df_final.columns.unique())

30

In [294]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17964 entries, 0 to 17963
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   year                          17964 non-null  int64  
 1   price                         17964 non-null  int64  
 2   mileage                       17964 non-null  int64  
 3   mpg                           17964 non-null  float64
 4   engineSize                    17964 non-null  float64
 5   transmission_N                17964 non-null  int64  
 6   fuelType_N                    17964 non-null  int64  
 7   model_ B-MAX                  17964 non-null  uint8  
 8   model_ C-MAX                  17964 non-null  uint8  
 9   model_ EcoSport               17964 non-null  uint8  
 10  model_ Edge                   17964 non-null  uint8  
 11  model_ Escort                 17964 non-null  uint8  
 12  model_ Fiesta                 17964 non-null  uint8  
 13  m

### **Data train for ML **

In [295]:
X = df_final.drop(['price'],axis=1)
y =df_final['price']


print("Shape of X = ", X.shape)
print("Shape of y = ", y.shape)

Shape of X =  (17964, 29)
Shape of y =  (17964,)


### Split train and test set


In [296]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.2, 
                                                    random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(14371, 29) (3593, 29) (14371,) (3593,)


### **Applied Linear Regression Model**

In [297]:
model = LinearRegression()
model = model.fit(X_train, y_train)

In [298]:
y_pred = model.predict(X_test)
print(y_pred)

[14243.39747194 10730.54047355 11934.76287472 ...  6852.67724184
  9067.37100945 15976.80870748]


### Linear Regression Coefficients and intercept


In [299]:
coefficients = pd.DataFrame({'features':X.columns, 'coefficients':np.squeeze(model.coef_)})
coefficients = coefficients.sort_values(by='coefficients')
display(coefficients)

Unnamed: 0,features,coefficients
18,model_ Ka+,-7308.331
17,model_ KA,-4760.679
6,model_ B-MAX,-4205.576
11,model_ Fiesta,-3155.488
7,model_ C-MAX,-3137.655
15,model_ Grand C-MAX,-2666.527
8,model_ EcoSport,-2624.942
28,model_ Transit Tourneo,-2526.347
20,model_ Mondeo,-1244.226
12,model_ Focus,-1119.087


### **Find Prediction Error**


In [300]:
 # MAE, MAPE, MSE, RMSE, coefficient of determination values

MAE = mean_absolute_error(y_test, y_pred)# LOWER IS BETTER
print("MAE = ", MAE)

MAPE= mean_absolute_percentage_error(y_test, y_pred)
print("MAPE = ", MAPE)

MSE = mean_squared_error(y_test, y_pred, squared=True)# LOWER IS BETTER
print("MSE = ", MSE)

RMSE = mean_squared_error(y_test, y_pred, squared=False) # LOWER IS BETTER
print("RMSE = ", RMSE)

r2 = r2_score(y_test, y_pred)# Higher IS BETTER
print("r_squared = ", r2)

MAE =  1375.7264287039127
MAPE =  0.14551952782143188
MSE =  3485754.4818040915
RMSE =  1867.017536555051
r_squared =  0.8458727003729856


### Visualization



In [302]:
y_test['pred'] = y_pred
y_test['x'] = np.arange(0,y_test.shape[0],1).tolist()

plt.figure(figsize=(30,12))

sns.lineplot(x='x', y='price', data=y_test, label='actual target')
sns.lineplot(x='x', y='pred', data=y_test, label='predicted target')
plt.show()

ValueError: ignored

<Figure size 2160x864 with 0 Axes>