# Data Description



| Attribute | Description |
|----------|----------|
|Name	|The brand and model of the car
|Location	|The location in which the car is being sold or is available for purchase
|Year	|The year or edition of the model
|Kilometers_Driven	|The total kilometers driven in the car by the previous owner(s) in KM
|Fuel_Type	|The type of fuel used by the car
|Transmission	|The type of transmission used by the car
|Owner_Type	|Whether the ownership is Firsthand, Second hand or other
|Mileage	|The standard mileage offered by the car company in kmpl or km/kg
|Engine	|The displacement volume of the engine in cc
|Power	|The maximum power of the engine in bhp
|Seats	|The number of seats in the car
|New_Price	|Price of new model
|Price	|The price of the used car in INR Lakhs

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder

# Reading Data

In [2]:
df = pd.read_csv('F:\\AI corse\\Machine Learning projects\\used car datasets\\train.csv')

# Explore Data

##### Handling Missing

In [3]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [4]:
df.columns

Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'New_Price', 'Price'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


In [6]:
df.isnull().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [7]:
df.New_Price.value_counts()

95.13 Lakh    6
4.78 Lakh     6
63.71 Lakh    6
11.26 Lakh    5
44.28 Lakh    5
             ..
1.27 Cr       1
38.11 Lakh    1
8.12 Lakh     1
71.15 Lakh    1
9.93 Lakh     1
Name: New_Price, Length: 540, dtype: int64

In [8]:
df.drop("New_Price" , axis =1 , inplace = True)

In [9]:
df.head(1)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75


In [10]:
df.Seats.value_counts()

5.0     5014
7.0      674
8.0      134
4.0       99
6.0       31
2.0       16
10.0       5
9.0        3
0.0        1
Name: Seats, dtype: int64

In [11]:
df[df["Seats"] == 10]
# i check on google and found that thess cars have already ten seats 

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
814,Toyota Qualis FS B2,Pune,2004,77757,Diesel,Manual,Second,13.1 kmpl,2446 CC,75 bhp,10.0,3.5
917,Tata Sumo DX,Mumbai,2013,20000,Diesel,Manual,First,14.07 kmpl,1978 CC,83.1 bhp,10.0,5.25
1907,Toyota Qualis FS B3,Bangalore,2002,63000,Diesel,Manual,Third,13.1 kmpl,2446 CC,75 bhp,10.0,3.65
2267,Toyota Qualis RS E2,Pune,2004,215750,Diesel,Manual,Second,0.0 kmpl,2446 CC,null bhp,10.0,3.5
2575,Chevrolet Tavera LS B3 10 Seats BSIII,Hyderabad,2015,120000,Diesel,Manual,First,14.8 kmpl,2499 CC,80 bhp,10.0,5.5


# # #  #  # # #  # #  # #  # # # # # # # # 

In [12]:
# new methode (both ways working )
#df.at[3999 , "Seats"] = 5.0
df.iloc[3999, 10] = 5.0

In [13]:
df.Seats.value_counts()

5.0     5015
7.0      674
8.0      134
4.0       99
6.0       31
2.0       16
10.0       5
9.0        3
Name: Seats, dtype: int64

In [14]:
df.isnull()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
6014,False,False,False,False,False,False,False,False,False,False,False,False
6015,False,False,False,False,False,False,False,False,False,False,False,False
6016,False,False,False,False,False,False,False,False,False,False,False,False
6017,False,False,False,False,False,False,False,False,False,False,False,False


In [15]:
df.isnull().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                36
Seats                42
Price                 0
dtype: int64

In [16]:
# new method to calculate the percentage of missing data
df.isnull().mean() 

Name                 0.000000
Location             0.000000
Year                 0.000000
Kilometers_Driven    0.000000
Fuel_Type            0.000000
Transmission         0.000000
Owner_Type           0.000000
Mileage              0.000332
Engine               0.005981
Power                0.005981
Seats                0.006978
Price                0.000000
dtype: float64

In [17]:
df[df.Power.isnull() == True]['Name'].value_counts()


Maruti Swift 1.3 VXi                            6
BMW 5 Series 520d Sedan                         5
Hyundai Santro GLS II - Euro II                 4
Honda City 1.5 GXI                              3
Land Rover Range Rover 3.0 D                    2
Hyundai Santro Xing XG                          2
Maruti Swift 1.3 LXI                            2
Maruti Wagon R Vx                               1
Fiat Punto 1.2 Dynamic                          1
Honda Jazz Select Edition                       1
Fiat Punto 1.3 Emotion                          1
Honda CR-V AT With Sun Roof                     1
Maruti Swift 1.3 VXI ABS                        1
Toyota Etios Liva V                             1
Hyundai Santro LP zipPlus                       1
Land Rover Range Rover Sport 2005 2012 Sport    1
Maruti Swift 1.3 ZXI                            1
Honda City 1.3 DX                               1
Fiat Punto 1.4 Emotion                          1
Name: Name, dtype: int64

In [18]:
# i can use df.loc to add them together but i used df.at to use all possiple ways  // i used loc in seats
df.at[194  ,'Power' ] = '100 bhp'

In [19]:
df.at[1294  ,'Power' ] = '100 bhp'

In [20]:
df.at[5270	 ,'Power' ] = '100 bhp'

In [21]:
df.at[1385	 ,'Power' ] = '100 bhp'

In [22]:
df[df['Name'] == 'Honda City 1.5 GXI']

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
194,Honda City 1.5 GXI,Ahmedabad,2007,60006,Petrol,Manual,First,0.0 kmpl,,100 bhp,,2.95
1385,Honda City 1.5 GXI,Pune,2004,115000,Petrol,Manual,Second,0.0 kmpl,,100 bhp,,1.5
3305,Honda City 1.5 GXI,Jaipur,2007,75055,Petrol,Manual,First,12.8 kmpl,1493 CC,100 bhp,5.0,2.3
4767,Honda City 1.5 GXI,Mumbai,2005,81000,Petrol,Manual,Second,12.8 kmpl,1493 CC,100 bhp,5.0,1.1
5270,Honda City 1.5 GXI,Bangalore,2002,53000,Petrol,Manual,Second,0.0 kmpl,,100 bhp,,1.85


In [23]:
df.isnull().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                32
Seats                42
Price                 0
dtype: int64

In [24]:
df[df.Seats.isnull() == True]['Name'].value_counts()

Maruti Swift 1.3 VXi                            6
BMW 5 Series 520d Sedan                         5
Hyundai Santro GLS II - Euro II                 4
Honda City 1.5 GXI                              3
Maruti Estilo LXI                               3
Land Rover Range Rover 3.0 D                    2
Hyundai Santro Xing XG                          2
Maruti Swift 1.3 LXI                            2
Fiat Punto 1.2 Dynamic                          1
Honda Jazz Select Edition                       1
Fiat Punto 1.3 Emotion                          1
Honda CR-V AT With Sun Roof                     1
Ford Endeavour Hurricane LE                     1
Maruti Wagon R Vx                               1
Maruti Swift 1.3 VXI ABS                        1
Toyota Etios Liva V                             1
Hyundai Santro LP zipPlus                       1
Honda City 1.5 EXI                              1
Land Rover Range Rover Sport 2005 2012 Sport    1
Maruti Swift 1.3 ZXI                            1


In [25]:
df[df.Name =='BMW 5 Series 520d Sedan']

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
2530,BMW 5 Series 520d Sedan,Kochi,2014,64158,Diesel,Automatic,First,18.48 kmpl,,,,17.89
2623,BMW 5 Series 520d Sedan,Pune,2012,95000,Diesel,Automatic,Second,18.48 kmpl,,,,18.0
3272,BMW 5 Series 520d Sedan,Mumbai,2008,81000,Diesel,Automatic,Second,18.48 kmpl,,,,10.5
3520,BMW 5 Series 520d Sedan,Delhi,2012,90000,Diesel,Automatic,First,18.48 kmpl,,,,14.5
4577,BMW 5 Series 520d Sedan,Delhi,2012,72000,Diesel,Automatic,Third,18.48 kmpl,,,,13.85


In [26]:
df.iloc[df[df.Name =='Land Rover Range Rover 3.0 D'].index, 10] = 7.0

In [27]:
df.iloc[df[df.Name =='Land Rover Range Rover Sport 2005 2012 Sport'].index, 10] = 7.0

In [28]:
df.Seats =df.Seats.fillna(5.0)

In [29]:
df.Seats.isnull().sum()

0

In [30]:
df = df.dropna(axis = 0)

In [31]:
df.isnull().sum()

Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

##### handling datatype

In [32]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5981 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               5981 non-null   object 
 1   Location           5981 non-null   object 
 2   Year               5981 non-null   int64  
 3   Kilometers_Driven  5981 non-null   int64  
 4   Fuel_Type          5981 non-null   object 
 5   Transmission       5981 non-null   object 
 6   Owner_Type         5981 non-null   object 
 7   Mileage            5981 non-null   object 
 8   Engine             5981 non-null   object 
 9   Power              5981 non-null   object 
 10  Seats              5981 non-null   float64
 11  Price              5981 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 607.4+ KB


In [34]:
df.Name.value_counts()

Mahindra XUV500 W8 2WD                 49
Maruti Swift VDI                       45
Maruti Swift Dzire VDI                 34
Honda City 1.5 S MT                    34
Maruti Swift VDI BSIV                  31
                                       ..
Porsche Panamera Diesel 250hp           1
Hyundai i20 Active 1.2 SX Dual Tone     1
Maruti Swift AMT DDiS VDI               1
Maruti Wagon R VXI BS IV with ABS       1
Mahindra Xylo D4 BSIV                   1
Name: Name, Length: 1858, dtype: int64

In [35]:
df.Name = df.Name.str.split().str[0:2].str.join(' ')

In [36]:
df.Name.value_counts()

Maruti Swift         343
Honda City           266
Hyundai i20          247
Hyundai Verna        170
Toyota Innova        164
                    ... 
Volkswagen Beetle      1
Fiat Petra             1
Mini Countryman        1
Volkswagen Tiguan      1
Jaguar F               1
Name: Name, Length: 214, dtype: int64

In [37]:
df.Location.value_counts()

Mumbai        785
Hyderabad     741
Kochi         648
Coimbatore    634
Pune          613
Delhi         549
Kolkata       531
Chennai       492
Jaipur        411
Bangalore     354
Ahmedabad     223
Name: Location, dtype: int64

In [38]:
df['Age'] = df.Year.max() - df.Year
df.drop('Year', axis=1, inplace=True)

In [39]:
 df.Kilometers_Driven.value_counts()

60000    82
45000    70
65000    67
70000    60
50000    60
         ..
82085     1
68465     1
63854     1
64241     1
27365     1
Name: Kilometers_Driven, Length: 3081, dtype: int64

In [40]:
fig = px.scatter_matrix(df, dimensions=['Kilometers_Driven'])

fig.show()

  dims = [


In [41]:
px.box(df ,y='Kilometers_Driven' )

In [42]:
df[df['Kilometers_Driven']== 6500000]

Unnamed: 0,Name,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Age
2328,BMW X5,Chennai,6500000,Diesel,Automatic,First,15.97 kmpl,2993 CC,258 bhp,5.0,65.0,2


In [43]:
df.drop(df[df['Kilometers_Driven']== 131000].index, axis = 0 , inplace= True)

In [44]:
px.box(df ,y='Kilometers_Driven' )

In [45]:
df[df['Kilometers_Driven']> 131000].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 29 to 5957
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               201 non-null    object 
 1   Location           201 non-null    object 
 2   Kilometers_Driven  201 non-null    int64  
 3   Fuel_Type          201 non-null    object 
 4   Transmission       201 non-null    object 
 5   Owner_Type         201 non-null    object 
 6   Mileage            201 non-null    object 
 7   Engine             201 non-null    object 
 8   Power              201 non-null    object 
 9   Seats              201 non-null    float64
 10  Price              201 non-null    float64
 11  Age                201 non-null    int64  
dtypes: float64(2), int64(2), object(8)
memory usage: 20.4+ KB


In [46]:
 df.Fuel_Type.value_counts()

Diesel    3193
Petrol    2718
CNG         56
LPG         10
Name: Fuel_Type, dtype: int64

In [47]:
df.Transmission .value_counts()

Manual       4268
Automatic    1709
Name: Transmission, dtype: int64

In [48]:
df.Owner_Type.value_counts()

First             4901
Second             957
Third              111
Fourth & Above       8
Name: Owner_Type, dtype: int64

In [49]:
df.Mileage.value_counts

<bound method IndexOpsMixin.value_counts of 0       26.6 km/kg
1       19.67 kmpl
2        18.2 kmpl
3       20.77 kmpl
4        15.2 kmpl
           ...    
6014     28.4 kmpl
6015     24.4 kmpl
6016     14.0 kmpl
6017     18.9 kmpl
6018    25.44 kmpl
Name: Mileage, Length: 5977, dtype: object>

In [50]:
df['mileage KMPL'] =df.Mileage.str.replace('kmpl' , '')

In [51]:
df['mileage KMPL'] =df.Mileage.str.split().str[0]

In [52]:
df.drop("Mileage" , axis= 1, inplace= True)

In [53]:
df['mileage KMPL'].value_counts()

18.9     172
17.0     171
18.6     119
20.36     88
21.1      87
        ... 
9.7        1
27.28      1
14.57      1
14.33      1
17.24      1
Name: mileage KMPL, Length: 430, dtype: int64

In [54]:
df['mileage KMPL'] = df['mileage KMPL'].astype(float)

In [55]:
df["mileage KMPL"].info()

<class 'pandas.core.series.Series'>
Int64Index: 5977 entries, 0 to 6018
Series name: mileage KMPL
Non-Null Count  Dtype  
--------------  -----  
5977 non-null   float64
dtypes: float64(1)
memory usage: 93.4 KB


In [56]:
df.Engine.value_counts()

1197 CC    606
1248 CC    511
1498 CC    304
998 CC     259
2179 CC    240
          ... 
1489 CC      1
1422 CC      1
1978 CC      1
2694 CC      1
1797 CC      1
Name: Engine, Length: 145, dtype: int64

In [57]:
df.Engine = df.Engine.str.split(" ").str[0] 

In [58]:
df.Engine.value_counts()

1197    606
1248    511
1498    304
998     259
2179    240
       ... 
1489      1
1422      1
1978      1
2694      1
1797      1
Name: Engine, Length: 145, dtype: int64

In [59]:
df.Engine = df.Engine.astype(int)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5977 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               5977 non-null   object 
 1   Location           5977 non-null   object 
 2   Kilometers_Driven  5977 non-null   int64  
 3   Fuel_Type          5977 non-null   object 
 4   Transmission       5977 non-null   object 
 5   Owner_Type         5977 non-null   object 
 6   Engine             5977 non-null   int32  
 7   Power              5977 non-null   object 
 8   Seats              5977 non-null   float64
 9   Price              5977 non-null   float64
 10  Age                5977 non-null   int64  
 11  mileage KMPL       5977 non-null   float64
dtypes: float64(3), int32(1), int64(2), object(6)
memory usage: 583.7+ KB


In [61]:
df.head()

Unnamed: 0,Name,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Engine,Power,Seats,Price,Age,mileage KMPL
0,Maruti Wagon,Mumbai,72000,CNG,Manual,First,998,58.16 bhp,5.0,1.75,9,26.6
1,Hyundai Creta,Pune,41000,Diesel,Manual,First,1582,126.2 bhp,5.0,12.5,4,19.67
2,Honda Jazz,Chennai,46000,Petrol,Manual,First,1199,88.7 bhp,5.0,4.5,8,18.2
3,Maruti Ertiga,Chennai,87000,Diesel,Manual,First,1248,88.76 bhp,7.0,6.0,7,20.77
4,Audi A4,Coimbatore,40670,Diesel,Automatic,Second,1968,140.8 bhp,5.0,17.74,6,15.2


In [62]:
df.Power.value_counts()

74 bhp        234
98.6 bhp      131
73.9 bhp      125
140 bhp       123
78.9 bhp      111
             ... 
76.9 bhp        1
201 bhp         1
500 bhp         1
199.3 bhp       1
181.04 bhp      1
Name: Power, Length: 371, dtype: int64

In [63]:
df.Power = df.Power.str.split(" ").str[0]

In [64]:
df.Power = df.Power.replace("null", np.nan)

In [65]:
df.isnull().sum()

Name                   0
Location               0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Engine                 0
Power                107
Seats                  0
Price                  0
Age                    0
mileage KMPL           0
dtype: int64

In [66]:
df.dropna(axis = 0, inplace=True)

In [67]:
df.isnull().sum()

Name                 0
Location             0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Engine               0
Power                0
Seats                0
Price                0
Age                  0
mileage KMPL         0
dtype: int64

In [68]:
df.Power = df.Power.astype(float)

In [69]:
df.corr()





Unnamed: 0,Kilometers_Driven,Engine,Power,Seats,Price,Age,mileage KMPL
Kilometers_Driven,1.0,0.092479,0.033387,0.082648,-0.008162,0.168994,-0.060369
Engine,0.092479,1.0,0.866629,0.40078,0.65873,0.066357,-0.635543
Power,0.033387,0.866629,1.0,0.10169,0.773015,-0.015428,-0.537704
Seats,0.082648,0.40078,0.10169,1.0,0.055827,-0.008641,-0.330184
Price,-0.008162,0.65873,0.773015,0.055827,1.0,-0.299732,-0.340869
Age,0.168994,0.066357,-0.015428,-0.008641,-0.299732,1.0,-0.284179
mileage KMPL,-0.060369,-0.635543,-0.537704,-0.330184,-0.340869,-0.284179,1.0


In [70]:
px.imshow(df.corr(), width=800, height=800, title="Heatmap of Car Price Prediction Dataset")





In [71]:
df.describe()

Unnamed: 0,Kilometers_Driven,Engine,Power,Seats,Price,Age,mileage KMPL
count,5870.0,5870.0,5870.0,5870.0,5870.0,5870.0,5870.0
mean,58274.59,1625.218228,113.265489,5.283135,9.603528,5.521124,18.275876
std,92166.52,601.222481,53.877399,0.804595,11.251569,3.16424,4.369353
min,171.0,624.0,34.2,2.0,0.44,0.0,0.0
25%,33417.5,1198.0,75.0,5.0,3.51,3.0,15.26
50%,52588.0,1495.0,97.7,5.0,5.75,5.0,18.2
75%,72203.5,1985.0,138.1,5.0,10.0,7.0,21.1
max,6500000.0,5998.0,560.0,10.0,160.0,21.0,33.54


In [72]:
df[df.Price == 160.0]

Unnamed: 0,Name,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Engine,Power,Seats,Price,Age,mileage KMPL
4079,Land Rover,Hyderabad,25000,Diesel,Automatic,First,2993,255.0,5.0,160.0,2,13.33


In [73]:
px.box(df , y='Price')

In [74]:
df[df['Price']> 80]

Unnamed: 0,Name,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Engine,Power,Seats,Price,Age,mileage KMPL
1505,Land Rover,Kochi,26013,Diesel,Automatic,First,2993,255.0,5.0,97.07,0,12.65
1974,BMW 7,Coimbatore,28060,Petrol,Automatic,First,2979,320.0,5.0,93.67,1,12.05
1984,BMW 7,Bangalore,17465,Petrol,Automatic,First,2979,320.0,5.0,93.0,2,12.05
2095,Mercedes-Benz SLC,Coimbatore,2526,Petrol,Automatic,First,2996,362.07,2.0,83.96,0,19.0
4079,Land Rover,Hyderabad,25000,Diesel,Automatic,First,2993,255.0,5.0,160.0,2,13.33
4691,Mercedes-Benz SLK-Class,Bangalore,3000,Petrol,Automatic,Second,5461,421.0,2.0,90.0,5,12.0
5535,BMW X6,Ahmedabad,97003,Diesel,Automatic,First,2993,308.43,5.0,85.0,4,15.87
5781,Lamborghini Gallardo,Delhi,6500,Petrol,Automatic,Third,5204,560.0,2.0,120.0,8,6.4
5919,Jaguar F,Hyderabad,8000,Petrol,Automatic,First,5000,488.1,2.0,100.0,4,12.5


In [75]:
px.box(df , y='Price')

In [76]:
df.drop(4079 , axis = 0, inplace =True)

In [77]:
df.describe()

Unnamed: 0,Kilometers_Driven,Engine,Power,Seats,Price,Age,mileage KMPL
count,5869.0,5869.0,5869.0,5869.0,5869.0,5869.0,5869.0
mean,58280.26,1624.985176,113.241339,5.283183,9.577903,5.521724,18.276718
std,92173.35,601.008485,53.850207,0.804655,11.079895,3.164176,4.369249
min,171.0,624.0,34.2,2.0,0.44,0.0,0.0
25%,33425.0,1198.0,75.0,5.0,3.51,3.0,15.26
50%,52600.0,1495.0,97.7,5.0,5.75,5.0,18.2
75%,72204.0,1985.0,138.1,5.0,10.0,7.0,21.1
max,6500000.0,5998.0,560.0,10.0,120.0,21.0,33.54


In [78]:
#new method to use describe with objects
df.describe(include=['O'])

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type
count,5869,5869,5869,5869,5869
unique,207,11,4,2,4
top,Maruti Swift,Mumbai,Diesel,Manual,First
freq,326,774,3148,4169,4835


In [79]:
df.isnull().sum()

Name                 0
Location             0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Engine               0
Power                0
Seats                0
Price                0
Age                  0
mileage KMPL         0
dtype: int64

## scaling data

In [80]:
scaler = MinMaxScaler()
#scaler = StandardScaler()

In [81]:
numerical_cols = list(df.select_dtypes(include=['int64', 'float64','int32']).columns)

In [82]:
numerical_cols

['Kilometers_Driven',
 'Engine',
 'Power',
 'Seats',
 'Price',
 'Age',
 'mileage KMPL']

In [83]:
numerical_cols.remove('Price')

In [84]:
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [85]:
df.isnull().sum()

Name                 0
Location             0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Engine               0
Power                0
Seats                0
Price                0
Age                  0
mileage KMPL         0
dtype: int64

In [86]:
df

Unnamed: 0,Name,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Engine,Power,Seats,Price,Age,mileage KMPL
0,Maruti Wagon,Mumbai,0.011051,CNG,Manual,First,0.069594,0.045569,0.375,1.75,0.428571,0.793083
1,Hyundai Creta,Pune,0.006282,Diesel,Manual,First,0.178266,0.174971,0.375,12.50,0.190476,0.586464
2,Honda Jazz,Chennai,0.007051,Petrol,Manual,First,0.106997,0.103652,0.375,4.50,0.380952,0.542636
3,Maruti Ertiga,Chennai,0.013359,Diesel,Manual,First,0.116115,0.103766,0.625,6.00,0.333333,0.619261
4,Audi A4,Coimbatore,0.006231,Diesel,Automatic,Second,0.250093,0.202739,0.375,17.74,0.285714,0.453190
...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift,Delhi,0.004184,Diesel,Manual,First,0.116115,0.075694,0.375,4.75,0.238095,0.846750
6015,Hyundai Xcent,Jaipur,0.015359,Diesel,Manual,First,0.092296,0.069989,0.375,4.00,0.190476,0.727490
6016,Mahindra Xylo,Jaipur,0.008435,Diesel,Manual,Second,0.348716,0.147965,0.750,2.90,0.333333,0.417412
6017,Maruti Wagon,Kolkata,0.007051,Petrol,Manual,First,0.069594,0.062571,0.375,2.65,0.285714,0.563506


In [87]:
df.Owner_Type

0        First
1        First
2        First
3        First
4       Second
         ...  
6014     First
6015     First
6016    Second
6017     First
6018     First
Name: Owner_Type, Length: 5869, dtype: object

## Handling Categorical Data

In [88]:
df = pd.get_dummies(columns= ['Owner_Type' , 'Fuel_Type','Transmission'] , data = df)

In [89]:
df

Unnamed: 0,Name,Location,Kilometers_Driven,Engine,Power,Seats,Price,Age,mileage KMPL,Owner_Type_First,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Automatic,Transmission_Manual
0,Maruti Wagon,Mumbai,0.011051,0.069594,0.045569,0.375,1.75,0.428571,0.793083,1,0,0,0,1,0,0,0,0,1
1,Hyundai Creta,Pune,0.006282,0.178266,0.174971,0.375,12.50,0.190476,0.586464,1,0,0,0,0,1,0,0,0,1
2,Honda Jazz,Chennai,0.007051,0.106997,0.103652,0.375,4.50,0.380952,0.542636,1,0,0,0,0,0,0,1,0,1
3,Maruti Ertiga,Chennai,0.013359,0.116115,0.103766,0.625,6.00,0.333333,0.619261,1,0,0,0,0,1,0,0,0,1
4,Audi A4,Coimbatore,0.006231,0.250093,0.202739,0.375,17.74,0.285714,0.453190,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift,Delhi,0.004184,0.116115,0.075694,0.375,4.75,0.238095,0.846750,1,0,0,0,0,1,0,0,0,1
6015,Hyundai Xcent,Jaipur,0.015359,0.092296,0.069989,0.375,4.00,0.190476,0.727490,1,0,0,0,0,1,0,0,0,1
6016,Mahindra Xylo,Jaipur,0.008435,0.348716,0.147965,0.750,2.90,0.333333,0.417412,0,0,1,0,0,1,0,0,0,1
6017,Maruti Wagon,Kolkata,0.007051,0.069594,0.062571,0.375,2.65,0.285714,0.563506,1,0,0,0,0,0,0,1,0,1


In [90]:
df.isnull().sum()

Name                         0
Location                     0
Kilometers_Driven            0
Engine                       0
Power                        0
Seats                        0
Price                        0
Age                          0
mileage KMPL                 0
Owner_Type_First             0
Owner_Type_Fourth & Above    0
Owner_Type_Second            0
Owner_Type_Third             0
Fuel_Type_CNG                0
Fuel_Type_Diesel             0
Fuel_Type_LPG                0
Fuel_Type_Petrol             0
Transmission_Automatic       0
Transmission_Manual          0
dtype: int64

In [91]:
encoder = BinaryEncoder(cols=[ 'Name', 'Location'])


In [92]:
df=encoder.fit_transform(df)

In [93]:
df

Unnamed: 0,Name_0,Name_1,Name_2,Name_3,Name_4,Name_5,Name_6,Name_7,Location_0,Location_1,...,Owner_Type_First,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Automatic,Transmission_Manual
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
2,0,0,0,0,0,0,1,1,0,0,...,1,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
4,0,0,0,0,0,1,0,1,0,1,...,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,0,0,0,0,1,1,0,1,1,0,...,1,0,0,0,0,1,0,0,0,1
6015,0,0,1,0,0,0,0,1,0,1,...,1,0,0,0,0,1,0,0,0,1
6016,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,1
6017,0,0,0,0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,1,0,1


## split data

In [94]:
x= df.drop('Price' , axis = 1 )

In [95]:
x

Unnamed: 0,Name_0,Name_1,Name_2,Name_3,Name_4,Name_5,Name_6,Name_7,Location_0,Location_1,...,Owner_Type_First,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Automatic,Transmission_Manual
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
2,0,0,0,0,0,0,1,1,0,0,...,1,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
4,0,0,0,0,0,1,0,1,0,1,...,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,0,0,0,0,1,1,0,1,1,0,...,1,0,0,0,0,1,0,0,0,1
6015,0,0,1,0,0,0,0,1,0,1,...,1,0,0,0,0,1,0,0,0,1
6016,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,1
6017,0,0,0,0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,1,0,1


In [96]:
y = df['Price']

In [97]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [98]:
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

In [99]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5869 entries, 0 to 6018
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Name_0                     5869 non-null   int64  
 1   Name_1                     5869 non-null   int64  
 2   Name_2                     5869 non-null   int64  
 3   Name_3                     5869 non-null   int64  
 4   Name_4                     5869 non-null   int64  
 5   Name_5                     5869 non-null   int64  
 6   Name_6                     5869 non-null   int64  
 7   Name_7                     5869 non-null   int64  
 8   Location_0                 5869 non-null   int64  
 9   Location_1                 5869 non-null   int64  
 10  Location_2                 5869 non-null   int64  
 11  Location_3                 5869 non-null   int64  
 12  Kilometers_Driven          5869 non-null   float64
 13  Engine                     5869 non-null   float

In [101]:
lr = LinearRegression()
lr.fit(x_train, y_train)

print('Training Score: ', lr.score(x_train, y_train))
print('Testing Score: ', lr.score(x_test, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, lr.predict(x_test))))

Training Score:  0.8803456886168821
Testing Score:  0.8716902726911508
RMSE:  0.2611926386809389


In [102]:
px.histogram(y_train)

In [103]:

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)

rf.fit(x_train, y_train)

print('Training Score: ', rf.score(x_train, y_train))
print('Testing Score: ', rf.score(x_test, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, rf.predict(x_test))))

Training Score:  0.9912751919577898
Testing Score:  0.9429243253445748
RMSE:  0.17420341670440476
