In [2]:
#importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### Load the provided training dataset and preview its first rows:

In [263]:
# Load the training workbook into `data` and preview the first five rows.
data = pd.read_excel(io="Data_Train.xlsx")
data.head(n=5)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


**Extracting basic information about our dataset:**
   * Columns
   * Total Entries
   * Data types of columns
   * Summary statistics of the numeric columns
   * Missing values

In [215]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(6019, 12)

In [216]:
# Column labels available in the loaded frame.
data.columns

Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'Price'],
      dtype='object')

In [259]:
# Per-column flag: True if the column holds at least one missing value.
print(data.isna().any())
print("\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
data.info()
print("\n")
# Summary statistics of the numeric columns (rendered as the cell's output).
data.describe()

Name                 False
Location             False
Year                 False
Kilometers_Driven    False
Fuel_Type            False
Transmission         False
Owner_Type           False
Mileage               True
Engine                True
Power                 True
Seats                 True
Price                False
dtype: bool


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              598

Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,6019.0,6019.0,5977.0,6019.0
mean,2013.358199,58738.38,5.278735,9.479468
std,3.269742,91268.84,0.80884,11.187917
min,1998.0,171.0,0.0,0.44
25%,2011.0,34000.0,5.0,3.5
50%,2014.0,53000.0,5.0,5.64
75%,2016.0,73000.0,5.0,9.95
max,2019.0,6500000.0,10.0,160.0


**Columns with missing values are ['Mileage', 'Engine', 'Power', 'Seats']**

Mileage, Engine and Power are stored as text with unit suffixes (e.g. `26.6 km/kg`, `998 CC`, `58.16 bhp`), so we'll convert them to numeric values for EDA. Seats is already numeric.

**From this we also see that the minimum value of Seats is 0, which isn't physically feasible, so it's an error in the data.**

In [308]:
# Convert the unit-suffixed text columns (Mileage, Power, Engine) to floats and
# impute the gaps in Power and Engine with their column means.
#
# Raw cells look like "26.6 km/kg", "998 CC" or "58.16 bhp" and may also hold
# placeholders such as "null"/"nan", so pd.to_numeric cannot parse them as-is:
# the numeric token has to be extracted first.
def _leading_number(column):
    """Return `column` as floats, keeping only the first whitespace-separated token.

    Values that are not numbers (missing cells, 'null', 'nan', empty strings)
    become NaN. Already-numeric input passes through unchanged, so this cell
    can safely be re-run.
    """
    first_token = column.astype(str).str.split().str.get(0)
    return pd.to_numeric(first_token, errors='coerce')


# Mileage: conversion only; its missing values are imputed later in the
# "Cleaning Data" section.
data['Mileage'] = _leading_number(data['Mileage'])

# Power: convert, then replace missing values with the column mean.
data['Power'] = _leading_number(data['Power'])
meanPower = data['Power'].mean()
data['Power'] = data['Power'].fillna(meanPower)

# Engine: convert, then replace missing values with the column mean.
data['Engine'] = _leading_number(data['Engine'])
meanEngine = data['Engine'].mean()
data['Engine'] = data['Engine'].fillna(meanEngine)

# Confirm the new dtypes / non-null counts, then show the numeric summary.
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5950 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               5950 non-null   object 
 1   Location           5950 non-null   object 
 2   Year               5950 non-null   int64  
 3   Kilometers_Driven  5950 non-null   int64  
 4   Fuel_Type          5950 non-null   object 
 5   Transmission       5950 non-null   object 
 6   Owner_Type         5950 non-null   object 
 7   Mileage            5950 non-null   float64
 8   Engine             5950 non-null   float64
 9   Power              5950 non-null   float64
 10  Seats              5950 non-null   float64
 11  Price              5950 non-null   float64
 12  meanEngine         5950 non-null   float64
 13  Brand              5950 non-null   object 
dtypes: float64(6), int64(2), object(6)
memory usage: 697.3+ KB


Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,meanEngine
count,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0
mean,2013.406555,58563.63,18.343571,1619.688945,113.120496,5.281081,9.456319,1619.688945
std,3.220781,91659.73,4.173886,597.264489,53.341649,0.803033,11.13839,597.264489
min,1998.0,171.0,6.4,72.0,34.2,2.0,0.44,72.0
25%,2012.0,33904.5,15.3,1198.0,77.0,5.0,3.5,1198.0
50%,2014.0,53000.0,18.2,1493.0,98.6,5.0,5.65,1493.0
75%,2016.0,72977.25,21.1,1968.0,138.03,5.0,9.915,1968.0
max,2019.0,6500000.0,33.54,5998.0,560.0,10.0,160.0,5998.0


**Let's filter our dataframe to drop rows with '0' in columns where a zero value is not physically possible.**

For example, Mileage and Seats cannot be 0 for any real car, so rows with such values are removed from the dataset.

In [273]:
# Keep only rows where none of these measurements is exactly 0.
# NaN != 0 evaluates to True, so rows with missing values survive this filter
# and are handled by the imputation step instead.
# .copy() makes `data` an independent frame, so the column assignments in later
# cells do not raise SettingWithCopyWarning.
data = data.loc[(data[['Mileage', 'Power', 'Engine', 'Seats']] != 0).all(axis=1)].copy()

In [291]:
# reset_index returns a new frame; assign it back, otherwise `data` keeps the
# gapped index left behind by the row filter and only the displayed copy is renumbered.
data = data.reset_index(drop=True)
data

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,meanEngine,Brand
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.60,998.0,58.16,5.0,1.75,998.0,Maruti
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.20,5.0,12.50,1582.0,Hyundai
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.20,1199.0,88.70,5.0,4.50,1199.0,Honda
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.00,1248.0,Maruti
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.20,1968.0,140.80,5.0,17.74,1968.0,Audi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5945,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.40,1248.0,74.00,5.0,4.75,1248.0,Maruti
5946,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.40,1120.0,71.00,5.0,4.00,1120.0,Hyundai
5947,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.00,2498.0,112.00,8.0,2.90,2498.0,Mahindra
5948,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.90,998.0,67.10,5.0,2.65,998.0,Maruti


In [292]:
# Re-check the numeric summary (e.g. the minimum of Seats) once the zero-value
# rows have been filtered out.
data.describe()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,meanEngine
count,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0,5950.0
mean,2013.406555,58563.63,18.343571,1619.688945,113.120496,5.281081,9.456319,1619.688945
std,3.220781,91659.73,4.173886,597.264489,53.341649,0.803033,11.13839,597.264489
min,1998.0,171.0,6.4,72.0,34.2,2.0,0.44,72.0
25%,2012.0,33904.5,15.3,1198.0,77.0,5.0,3.5,1198.0
50%,2014.0,53000.0,18.2,1493.0,98.6,5.0,5.65,1493.0
75%,2016.0,72977.25,21.1,1968.0,138.03,5.0,9.915,1968.0
max,2019.0,6500000.0,33.54,5998.0,560.0,10.0,160.0,5998.0


In [276]:
# Number of missing values in each column.
data.isna().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               25
Power                 0
Seats                30
Price                 0
meanEngine            0
dtype: int64

**Checking if there is any duplicate data that can be dropped:**

In [46]:
# Count rows flagged by DataFrame.duplicated(), i.e. rows that repeat an earlier row.
data.duplicated().sum()

0

### Cleaning Data:

**Imputing missing values (NaNs)**

In [277]:
# Missing values per column, checked right before imputation.
data.isna().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               25
Power                 0
Seats                30
Price                 0
meanEngine            0
dtype: int64

In [280]:
# Impute the remaining gaps.
# Mileage and Engine are continuous measurements, so the column mean is used.
meanMileage = data['Mileage'].mean()
data['Mileage'] = data['Mileage'].fillna(meanMileage)

meanEngine = data['Engine'].mean()
data['Engine'] = data['Engine'].fillna(meanEngine)

# Seats is a count: filling with the mean (about 5.28 according to describe())
# would create cars with a fractional number of seats, so the most common
# observed seat count is used instead.
meanSeats = data['Seats'].mean()  # still computed so any later cell reading it keeps working
seatModes = data['Seats'].mode()
if not seatModes.empty:  # mode() is empty only when every Seats value is missing
    data['Seats'] = data['Seats'].fillna(seatModes.iloc[0])

**Checking Null Values after replacing:**

In [282]:
# Every column should now report zero missing values.
data.isna().sum()

Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
meanEngine           0
dtype: int64

**We will use the brand (the first word of `Name`) rather than the full model name as a feature, so we add a `Brand` column to every row of the dataset.**

In [283]:
# Derive the brand from the listing title: the first whitespace-separated
# token of `Name` (e.g. "Maruti Wagon R LXI CNG" -> "Maruti").
# NOTE(review): a brand written as two words would be cut to its first word.
data['Brand'] = data['Name'].str.split(n=1).str[0]
data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,meanEngine,Brand
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75,998.0,Maruti
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5,1582.0,Hyundai
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5,1199.0,Honda
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0,1248.0,Maruti
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74,1968.0,Audi


#### Filter bad data:

In [305]:
# Filter bad data
dataClean = data[
    (data["Year"].between(2004, 2019, inclusive=True)) &
    (data["Power"].between(100, 600, inclusive=True)) &
    (data["Price"].between(3, 120, inclusive=True)) &
    (data["Kilometers_Driven"].between(100, 350000, inclusive=True))]

In [307]:
# reset_index returns a new frame; assign it back so dataClean really gets a
# contiguous 0..n-1 index instead of only displaying a renumbered copy.
dataClean = dataClean.reset_index(drop=True)
dataClean

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,meanEngine,Brand
0,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.20,5.0,12.50,1582.0,Hyundai
1,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.20,1968.0,140.80,5.0,17.74,1968.0,Audi
2,Toyota Innova Crysta 2.8 GX AT 8S,Mumbai,2016,36000,Diesel,Automatic,First,11.36,2755.0,171.50,8.0,17.50,2755.0,Toyota
3,Volkswagen Vento Diesel Comfortline,Pune,2013,64430,Diesel,Manual,First,20.54,1598.0,103.60,5.0,5.20,1598.0,Volkswagen
4,Maruti Ciaz Zeta,Kochi,2018,25692,Petrol,Manual,First,21.56,1462.0,103.25,5.0,9.95,1462.0,Maruti
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2637,Volkswagen Vento 1.6 Highline,Mumbai,2011,38000,Petrol,Manual,First,16.09,1598.0,103.50,5.0,3.25,1598.0,Volkswagen
2638,Mercedes-Benz M-Class ML 320 CDI,Mumbai,2009,102002,Diesel,Automatic,First,8.70,2987.0,224.34,5.0,10.75,2987.0,Mercedes-Benz
2639,Porsche Panamera Diesel,Hyderabad,2013,40000,Diesel,Automatic,Second,17.85,2967.0,300.00,4.0,45.00,2967.0,Porsche
2640,Toyota Camry Hybrid,Mumbai,2015,33500,Petrol,Automatic,First,19.16,2494.0,158.20,5.0,19.75,2494.0,Toyota
