# Car Price Prediction ML

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn import metrics

# Data Collection & Preprocessing


In [4]:
Car_pricepred=pd.read_csv("Car Price Prediction.csv") # load the raw car listings CSV into a DataFrame

In [5]:
Car_pricepred.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [11]:
Car_pricepred.shape

(4340, 8)

In [13]:
Car_pricepred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [14]:
Car_pricepred.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [16]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): the recorded output reports 763 duplicates, yet no visible cell
# in this notebook drops them - confirm whether they should be removed before
# the train/test split.
Car_pricepred.duplicated().sum()

np.int64(763)

# Checking the distribution of Categorical Data

In [17]:
Car_pricepred["fuel"].value_counts()


fuel
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64

In [19]:
Car_pricepred["seller_type"].value_counts()

seller_type
Individual          3244
Dealer               994
Trustmark Dealer     102
Name: count, dtype: int64

In [21]:
Car_pricepred["transmission"].value_counts()

transmission
Manual       3892
Automatic     448
Name: count, dtype: int64

In [24]:
Car_pricepred["owner"].value_counts()

owner
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: count, dtype: int64

In [22]:
Car_pricepred.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


# Encoding the Categorical Data

In [10]:
# Integer code assigned to every label of each categorical column.
CATEGORY_CODES = {
    "fuel": {"Petrol": 0, "Diesel": 1, "CNG": 2, "LPG": 3, "Electric": 4},
    "seller_type": {"Individual": 0, "Dealer": 1, "Trustmark Dealer": 2},
    "transmission": {"Manual": 0, "Automatic": 1},
    "owner": {
        "First Owner": 0,
        "Second Owner": 1,
        "Third Owner": 2,
        "Fourth & Above Owner": 3,
        "Test Drive Car": 4,
    },
}

for column, codes in CATEGORY_CODES.items():
    # Already-encoded columns are skipped so re-running this cell is harmless.
    if pd.api.types.is_numeric_dtype(Car_pricepred[column]):
        continue
    encoded = Car_pricepred[column].map(codes)
    # map() turns any label missing from the table into NaN; stop here instead
    # of silently passing an unencoded value on to the model.
    if encoded.isna().any():
        unknown = Car_pricepred.loc[encoded.isna(), column].unique().tolist()
        raise ValueError(f"Unmapped values in column {column!r}: {unknown}")
    # Explicit cast: the column becomes int64 without relying on
    # DataFrame.replace's deprecated silent downcasting of object columns.
    Car_pricepred[column] = encoded.astype("int64")

In [11]:
Car_pricepred.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,0,0,0,0
1,Maruti Wagon R LXI Minor,2007,135000,50000,0,0,0,0
2,Hyundai Verna 1.6 SX,2012,600000,100000,1,0,0,0
3,Datsun RediGO T Option,2017,250000,46000,0,0,0,0
4,Honda Amaze VX i-DTEC,2014,450000,141000,1,0,0,1


In [12]:
# Optionally drop an unnecessary column (left disabled below)
#Car_pricepred.drop(columns=["fuel"],inplace=True)

In [13]:
Car_pricepred.describe()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
count,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419,0.531336,0.276037,0.103226,0.46659
std,4.215344,578548.7,46644.102194,0.549621,0.496892,0.304289,0.74033
min,1992.0,20000.0,1.0,0.0,0.0,0.0,0.0
25%,2011.0,208749.8,35000.0,0.0,0.0,0.0,0.0
50%,2014.0,350000.0,60000.0,1.0,0.0,0.0,0.0
75%,2016.0,600000.0,90000.0,1.0,1.0,0.0,1.0
max,2020.0,8900000.0,806599.0,4.0,2.0,1.0,4.0


# Data Analysis

In [15]:
# Print the distinct values of every column to eyeball categories and ranges.
for col in Car_pricepred.columns:
    # The header needs a separator; the original concatenation printed
    # "Unique Values ofname".
    print(f"Unique Values of {col}")
    print(Car_pricepred[col].unique())
    print("============================")
    

Unique Values ofname
['Maruti 800 AC' 'Maruti Wagon R LXI Minor' 'Hyundai Verna 1.6 SX' ...
 'Mahindra Verito 1.5 D6 BSIII'
 'Toyota Innova 2.5 VX (Diesel) 8 Seater BS IV'
 'Hyundai i20 Magna 1.4 CRDi']
Unique Values ofyear
[2007 2012 2017 2014 2016 2015 2018 2019 2013 2011 2010 2009 2006 1996
 2005 2008 2004 1998 2003 2002 2020 2000 1999 2001 1995 1997 1992]
Unique Values ofselling_price
[  60000  135000  600000  250000  450000  140000  550000  240000  850000
  365000  260000 1650000  585000 1195000  390000 1964999 1425000  975000
 1190000  930000  525000 1735000 1375000  900000 1300000 1400000  229999
 1550000 1250000  625000 1050000  560000  290000  275000  411000  150000
  500000  100000  725000  401000  750000  310000  665000  465000  160000
  675000  300000   70000  151000  280000  350000  570000  125000  130000
  925000  200000  248000   80000  650000  495000  371000 1025000 8150000
  325000 1470000 2800000  210000 1150000 4500000 2750000 1975000  175000
 2500000  628000  399000

In [21]:
Car_pricepred.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti,2007,60000,70000,0,0,0,0
1,Maruti,2007,135000,50000,0,0,0,0
2,Hyundai,2012,600000,100000,1,0,0,0
3,Datsun,2017,250000,46000,0,0,0,0
4,Honda,2014,450000,141000,1,0,0,1


In [16]:
def get_brand_name(Car_pricepred):
    """Return the brand, i.e. the first whitespace-delimited word of a car name.

    Args:
        Car_pricepred: Full car name string such as ``'Maruti 800 AC'``.
            The parameter name is kept for compatibility; inside this
            function it is a single string, not a DataFrame.

    Returns:
        The first word of the name, or ``''`` when the name is empty or
        contains only whitespace.
    """
    # split() with no separator ignores leading, trailing and repeated
    # whitespace. The original split(' ')[0] returned '' for a name with a
    # leading space, and its trailing strip(' ') could never remove anything.
    words = Car_pricepred.split()
    # NOTE: multi-word makes are still cut to their first word (the notebook's
    # recorded brands include 'Land', presumably from 'Land Rover').
    return words[0] if words else ''
    

In [17]:
get_brand_name('Maruti 800 AC')

'Maruti'

In [18]:
# Collapse each full model name to its brand. get_brand_name calls .split on
# every value, so this cell only works while `name` still holds strings.
Car_pricepred['name']=Car_pricepred['name'].apply(get_brand_name)

In [20]:
Car_pricepred['name'].unique()

array(['Maruti', 'Hyundai', 'Datsun', 'Honda', 'Tata', 'Chevrolet',
       'Toyota', 'Jaguar', 'Mercedes-Benz', 'Audi', 'Skoda', 'Jeep',
       'BMW', 'Mahindra', 'Ford', 'Nissan', 'Renault', 'Fiat',
       'Volkswagen', 'Volvo', 'Mitsubishi', 'Land', 'Daewoo', 'MG',
       'Force', 'Isuzu', 'OpelCorsa', 'Ambassador', 'Kia'], dtype=object)

In [23]:
# Remember object has no split method

In [24]:
# Re-inspect the distinct values per column now that `name` holds brands only.
for col in Car_pricepred.columns:
    # Separator added: the original concatenation printed "Unique Values ofname".
    print(f"Unique Values of {col}")
    print(Car_pricepred[col].unique())
    print("============================")
    

Unique Values ofname
['Maruti' 'Hyundai' 'Datsun' 'Honda' 'Tata' 'Chevrolet' 'Toyota' 'Jaguar'
 'Mercedes-Benz' 'Audi' 'Skoda' 'Jeep' 'BMW' 'Mahindra' 'Ford' 'Nissan'
 'Renault' 'Fiat' 'Volkswagen' 'Volvo' 'Mitsubishi' 'Land' 'Daewoo' 'MG'
 'Force' 'Isuzu' 'OpelCorsa' 'Ambassador' 'Kia']
Unique Values ofyear
[2007 2012 2017 2014 2016 2015 2018 2019 2013 2011 2010 2009 2006 1996
 2005 2008 2004 1998 2003 2002 2020 2000 1999 2001 1995 1997 1992]
Unique Values ofselling_price
[  60000  135000  600000  250000  450000  140000  550000  240000  850000
  365000  260000 1650000  585000 1195000  390000 1964999 1425000  975000
 1190000  930000  525000 1735000 1375000  900000 1300000 1400000  229999
 1550000 1250000  625000 1050000  560000  290000  275000  411000  150000
  500000  100000  725000  401000  750000  310000  665000  465000  160000
  675000  300000   70000  151000  280000  350000  570000  125000  130000
  925000  200000  248000   80000  650000  495000  371000 1025000 8150000
  325000 14

In [30]:
from sklearn.preprocessing import LabelEncoder

# Map each brand string to an integer code and store the codes in a new
# `name_encoded` column; `name` itself is left unchanged by this cell.
le = LabelEncoder()
Car_pricepred['name_encoded'] = le.fit_transform(Car_pricepred['name'])


In [32]:
Car_pricepred.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,name_encoded
0,Maruti,2007,60000,70000,0,0,0,0,18
1,Maruti,2007,135000,50000,0,0,0,0,18
2,Hyundai,2012,600000,100000,1,0,0,0,10
3,Datsun,2017,250000,46000,0,0,0,0,5
4,Honda,2014,450000,141000,1,0,0,1,9


# Label Encoding

In [33]:
from sklearn.preprocessing import LabelEncoder

# Fit a fresh encoder and overwrite `name` with its integer codes.
# NOTE(review): `name_encoded` (added in an earlier cell) appears to hold the
# same codes, so the frame now carries two identical columns, and both are
# passed to the model because only `selling_price` is dropped from the
# features - confirm whether one of them should be removed.
le = LabelEncoder()
Car_pricepred['name'] = le.fit_transform(Car_pricepred['name'])


In [37]:
Car_pricepred.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,name_encoded
0,18,2007,60000,70000,0,0,0,0,18
1,18,2007,135000,50000,0,0,0,0,18
2,10,2012,600000,100000,1,0,0,0,10
3,5,2017,250000,46000,0,0,0,0,5
4,9,2014,450000,141000,1,0,0,1,9


In [39]:
Car_pricepred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   name           4340 non-null   int64
 1   year           4340 non-null   int64
 2   selling_price  4340 non-null   int64
 3   km_driven      4340 non-null   int64
 4   fuel           4340 non-null   int64
 5   seller_type    4340 non-null   int64
 6   transmission   4340 non-null   int64
 7   owner          4340 non-null   int64
 8   name_encoded   4340 non-null   int64
dtypes: int64(9)
memory usage: 305.3 KB


In [41]:
Car_pricepred["fuel"].unique()

array([0, 1, 2, 3, 4])

In [42]:
Car_pricepred["fuel"].value_counts()

fuel
1    2153
0    2123
2      40
3      23
4       1
Name: count, dtype: int64

In [43]:
Car_pricepred

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,name_encoded
0,18,2007,60000,70000,0,0,0,0,18
1,18,2007,135000,50000,0,0,0,0,18
2,10,2012,600000,100000,1,0,0,0,10
3,5,2017,250000,46000,0,0,0,0,5
4,9,2014,450000,141000,1,0,0,1,9
...,...,...,...,...,...,...,...,...,...
4335,10,2014,409999,80000,1,0,0,1,10
4336,10,2014,409999,80000,1,0,0,1,10
4337,18,2009,110000,83000,0,0,0,1,18
4338,10,2016,865000,90000,1,0,0,0,10


In [46]:
# Target: the selling price. Features: every remaining column.
output_data=Car_pricepred["selling_price"]
input_data=Car_pricepred.drop("selling_price",axis="columns")

In [47]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore the fitted model and every downstream prediction, reproducible
# across kernel restarts.
x_train,x_test,y_train,y_test=train_test_split(
    input_data,output_data,test_size=0.2,random_state=42)

# Model Creation

In [48]:
model=LinearRegression()

# Train Model

In [49]:
model.fit(x_train,y_train)

In [50]:
# Now predict selling prices for the test data

In [51]:
# Predicted selling prices for the held-out test rows.
# NOTE(review): the output recorded in this notebook contains negative
# predicted prices, and no evaluation metric is computed in the visible cells
# even though sklearn.metrics is imported - confirm the model quality.
predict=model.predict(x_test)

In [52]:
predict

array([ 7.34200641e+05, -2.02363298e+05,  4.47338621e+05,  3.31088459e+05,
        7.12630675e+03,  7.04500301e+05,  3.49107077e+05,  4.85476071e+05,
        1.50532704e+06,  6.29122056e+05,  2.85345045e+05, -1.54695637e+05,
        3.06066015e+05,  5.64669653e+05,  6.39727149e+05,  2.03299576e+05,
        3.55406843e+05,  3.15848818e+05,  1.97704021e+05,  7.09698946e+05,
        4.22336742e+05,  6.16069262e+05,  3.80641184e+05,  6.45741374e+05,
        7.21216736e+05,  5.25462494e+05,  1.30559697e+06,  2.48576949e+05,
        1.45434810e+06,  4.62186033e+05,  4.27559923e+05,  2.77502371e+05,
        4.70755789e+05,  4.68093217e+05,  6.21896486e+05,  7.85000465e+05,
        7.01468374e+05,  5.71930053e+05,  3.39604417e+03,  3.08965334e+05,
        1.06645176e+06,  1.43919465e+06,  1.33029758e+06,  5.33250460e+05,
       -2.64911645e+05,  5.71865079e+05, -2.70738869e+05,  3.17736455e+05,
        2.97496734e+05,  1.82206231e+05,  2.98348882e+05,  3.48297848e+05,
        2.89071061e+05,  

In [53]:
x_train.head(1)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,name_encoded
3547,17,2019,10000,1,0,0,0,17


In [62]:
# A single hand-written car, keyed column by column with the same feature
# names, in the same order, as the training frame.
input_data_model=pd.DataFrame({
    "name":[17],
    "year":[2023],
    "km_driven":[10000],
    "fuel":[1],
    "seller_type":[0],
    "transmission":[0],
    "owner":[0],
    "name_encoded":[17],
})


In [63]:
model.predict(input_data_model)

array([918796.40561892])

In [65]:
import pickle as pk

In [66]:
# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises; the original left the
# handle returned by open() unclosed.
with open('model.pkl','wb') as model_file:
    pk.dump(model,model_file)