<a href="https://colab.research.google.com/github/ahamedistiaque/audi_used_car_analysis/blob/main/ford_used_car_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [186]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

pd.options.display.max_columns =35

sns.set_context("talk")
sns.set_style("darkgrid")

In [142]:
path='/content/drive/MyDrive/Colab Notebooks/Data Processing for Machine Learning With Python/ford.csv'

In [143]:
df=pd.read_csv(path)
display(df.sample(5))

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
12256,Focus,2018,11150,Manual,15180,Petrol,60.1,1.0
11712,Kuga,2019,21999,Manual,5900,Diesel,47.9,2.0
10754,Fiesta,2016,9775,Manual,34296,Petrol,65.7,1.0
7083,Fiesta,2014,5991,Manual,62111,Petrol,54.3,1.2
16355,Focus,2010,3000,Manual,81124,Diesel,47.9,2.0


### Meaning of each column
1.   model :Audi Car Model
2.   Price	: Car Price
3.   transmission	: Total transmission
4.   mileage :Total miles travelled
5.   fuelType	: Type of fuel runs the car
6.   mpg	: Runs car Miles per gallon	
7.   engineSize	: volume of the cylinders in engine
8.   Year	: Car  Making year


In [144]:
print("Number of rows",df.shape[0])
print("Number of columns",df.shape[1])

Number of rows 17964
Number of columns 8


In [145]:
# check for the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17964 entries, 0 to 17963
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17964 non-null  object 
 1   year          17964 non-null  int64  
 2   price         17964 non-null  int64  
 3   transmission  17964 non-null  object 
 4   mileage       17964 non-null  int64  
 5   fuelType      17964 non-null  object 
 6   mpg           17964 non-null  float64
 7   engineSize    17964 non-null  float64
dtypes: float64(2), int64(3), object(3)
memory usage: 1.1+ MB


In [146]:
# Data statistics
df.describe()

Unnamed: 0,year,price,mileage,mpg,engineSize
count,17964.0,17964.0,17964.0,17964.0,17964.0
mean,2016.864173,12280.078435,23361.880149,57.907832,1.350824
std,2.024987,4741.318119,19471.243292,10.125632,0.432383
min,1996.0,495.0,1.0,20.8,0.0
25%,2016.0,8999.0,9987.0,52.3,1.0
50%,2017.0,11291.0,18242.5,58.9,1.2
75%,2018.0,15299.0,31052.0,65.7,1.5
max,2020.0,54995.0,177644.0,201.8,5.0


In [147]:
df.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'mpg',
       'engineSize'],
      dtype='object')

In [148]:
# Missing values in the dataset
df.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
mpg             0
engineSize      0
dtype: int64

In [149]:
print("FORD CAR MODEL :",df.model.unique())
print("FORD Total  CAR MODEL :",len(df.model.unique()))

FORD CAR MODEL : [' Fiesta' ' Focus' ' Puma' ' Kuga' ' EcoSport' ' C-MAX' ' Mondeo' ' Ka+'
 ' Tourneo Custom' ' S-MAX' ' B-MAX' ' Edge' ' Tourneo Connect'
 ' Grand C-MAX' ' KA' ' Galaxy' ' Mustang' ' Grand Tourneo Connect'
 ' Fusion' ' Ranger' ' Streetka' ' Escort' ' Transit Tourneo']
FORD Total  CAR MODEL : 23


In [150]:
df.model.value_counts()

 Fiesta                   6556
 Focus                    4588
 Kuga                     2225
 EcoSport                 1143
 C-MAX                     543
 Ka+                       531
 Mondeo                    526
 B-MAX                     355
 S-MAX                     296
 Grand C-MAX               247
 Galaxy                    228
 Edge                      208
 KA                        199
 Puma                       80
 Tourneo Custom             69
 Grand Tourneo Connect      59
 Mustang                    57
 Tourneo Connect            33
 Fusion                     16
 Streetka                    2
 Ranger                      1
 Escort                      1
 Transit Tourneo             1
Name: model, dtype: int64

In [151]:
df.fuelType.unique()

array(['Petrol', 'Diesel', 'Hybrid', 'Electric', 'Other'], dtype=object)

In [152]:
print(df.fuelType.value_counts())

Petrol      12177
Diesel       5762
Hybrid         22
Electric        2
Other           1
Name: fuelType, dtype: int64


In [153]:
display(df.transmission.unique())

print(df.transmission.value_counts())

array(['Automatic', 'Manual', 'Semi-Auto'], dtype=object)

Manual       15517
Automatic     1360
Semi-Auto     1087
Name: transmission, dtype: int64


#**Encoding**

In [154]:
#label encoding for categorical data

LE = LabelEncoder()
df["transmission_N"] = LE.fit_transform(df["transmission"])
print(LE.classes_)

df["fuelType_N"] = LE.fit_transform(df["fuelType"])
print(LE.classes_)

display(df.sample(10))

['Automatic' 'Manual' 'Semi-Auto']
['Diesel' 'Electric' 'Hybrid' 'Other' 'Petrol']


Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,transmission_N,fuelType_N
7330,Kuga,2015,12999,Manual,22500,Diesel,60.1,2.0,1,0
8221,Kuga,2015,9799,Semi-Auto,65000,Diesel,52.3,2.0,2,0
17680,Fiesta,2009,3800,Manual,67253,Petrol,49.6,1.2,1,4
16677,B-MAX,2017,11995,Manual,12500,Petrol,57.7,1.0,1,4
6023,Fiesta,2018,11490,Manual,30600,Petrol,65.7,1.0,1,4
913,Fiesta,2013,5200,Manual,50501,Petrol,65.7,1.0,1,4
14032,Kuga,2016,12490,Manual,32000,Petrol,45.6,1.5,1,4
16482,Fiesta,2018,14599,Manual,18528,Petrol,62.8,1.0,1,4
7830,Focus,2019,15510,Manual,7453,Petrol,60.1,1.0,1,4
6420,Fiesta,2018,12500,Manual,10827,Petrol,65.7,1.0,1,4


In [155]:
#check data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17964 entries, 0 to 17963
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   model           17964 non-null  object 
 1   year            17964 non-null  int64  
 2   price           17964 non-null  int64  
 3   transmission    17964 non-null  object 
 4   mileage         17964 non-null  int64  
 5   fuelType        17964 non-null  object 
 6   mpg             17964 non-null  float64
 7   engineSize      17964 non-null  float64
 8   transmission_N  17964 non-null  int64  
 9   fuelType_N      17964 non-null  int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 1.4+ MB


In [158]:
df=df.drop(['transmission','fuelType'],axis=1)

In [159]:
###  One-hot Encoding
df_model = pd.get_dummies(df["model"])
display(df_model.sample(10))


Unnamed: 0,B-MAX,C-MAX,EcoSport,Edge,Escort,Fiesta,Focus,Fusion,Galaxy,Grand C-MAX,Grand Tourneo Connect,KA,Ka+,Kuga,Mondeo,Mustang,Puma,Ranger,S-MAX,Streetka,Tourneo Connect,Tourneo Custom,Transit Tourneo
9919,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3464,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15676,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12056,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14448,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
6674,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10207,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16052,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
12085,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17330,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [164]:

df_final= pd.get_dummies(df)

display(df_final.sample(10))
print(df_final.shape)

Unnamed: 0,year,price,mileage,mpg,engineSize,transmission_N,fuelType_N,model_ B-MAX,model_ C-MAX,model_ EcoSport,model_ Edge,model_ Escort,model_ Fiesta,model_ Focus,model_ Fusion,model_ Galaxy,model_ Grand C-MAX,model_ Grand Tourneo Connect,model_ KA,model_ Ka+,model_ Kuga,model_ Mondeo,model_ Mustang,model_ Puma,model_ Ranger,model_ S-MAX,model_ Streetka,model_ Tourneo Connect,model_ Tourneo Custom,model_ Transit Tourneo
2142,2017,13650,38969,60.1,2.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2849,2013,6500,71250,67.3,1.6,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
336,2018,9949,27399,64.2,1.1,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4467,2016,7950,16674,54.3,1.2,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13267,2013,6595,31200,65.7,1.0,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
849,2019,11498,14132,58.9,1.0,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15610,2015,6800,32204,54.3,1.2,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14867,2015,12434,30970,52.3,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
9483,2018,12500,9131,58.9,1.0,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10662,2018,9500,14562,55.4,1.1,1,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


(17964, 30)


In [167]:
len(df_final.columns.unique())

30

In [168]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17964 entries, 0 to 17963
Data columns (total 30 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   year                          17964 non-null  int64  
 1   price                         17964 non-null  int64  
 2   mileage                       17964 non-null  int64  
 3   mpg                           17964 non-null  float64
 4   engineSize                    17964 non-null  float64
 5   transmission_N                17964 non-null  int64  
 6   fuelType_N                    17964 non-null  int64  
 7   model_ B-MAX                  17964 non-null  uint8  
 8   model_ C-MAX                  17964 non-null  uint8  
 9   model_ EcoSport               17964 non-null  uint8  
 10  model_ Edge                   17964 non-null  uint8  
 11  model_ Escort                 17964 non-null  uint8  
 12  model_ Fiesta                 17964 non-null  uint8  
 13  m

### **Data train for ML **

In [179]:
X = df_final.drop(['price'],axis=1)
y =df_final['price']


print("Shape of X = ", X.shape)
print("Shape of y = ", y.shape)

Shape of X =  (17964, 29)
Shape of y =  (17964,)


### Split train and test set


In [180]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.2, 
                                                    random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(14371, 29) (3593, 29) (14371,) (3593,)


### **Applied Linear Regression Model**

In [181]:
model = LinearRegression()
model = model.fit(X_train, y_train)

In [182]:
y_pred = model.predict(X_test)
print(y_pred)

[14243.39747194 10730.54047355 11934.76287472 ...  6852.67724184
  9067.37100945 15976.80870748]


In [183]:
print(y_test,y_pred)


1087     16700
9367      9690
4705     10999
10336    29350
8509     11250
         ...  
14866    13487
11183    15299
13788     5495
17265     5685
16043    16495
Name: price, Length: 3593, dtype: int64 [14243.39747194 10730.54047355 11934.76287472 ...  6852.67724184
  9067.37100945 15976.80870748]


### **Find Prediction Error**


In [188]:
 # MAE, MAPE, MSE, RMSE, coefficient of determination values

MAE = mean_absolute_error(y_test, y_pred)# LOWER IS BETTER
print("MAE = ", MAE)

MAPE= mean_absolute_percentage_error(y_test, y_pred)
print("MAPE = ", MAPE)

MSE = mean_squared_error(y_test, y_pred, squared=True)# LOWER IS BETTER
print("MSE = ", MSE)

RMSE = mean_squared_error(y_test, y_pred, squared=False) # LOWER IS BETTER
print("RMSE = ", RMSE)

r2 = r2_score(y_test, y_pred)# Higher IS BETTER
print("r_squared = ", r2)

MAE =  1375.7264287039127
MAPE =  0.14551952782143188
MSE =  3485754.4818040915
RMSE =  1867.017536555051
r_squared =  0.8458727003729856
