In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from ydata_profiling import ProfileReport

# Data Exploration

In [2]:
# Read the raw training set from disk and preview its first rows.
train_csv_path = '../data/train-data.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [3]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB


In [4]:
# Render floats with thousands separators and one decimal place,
# then summarise the numeric columns.
pd.set_option('display.float_format', '{:,.1f}'.format)
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,6019.0,6019.0,6019.0,5977.0,6019.0
mean,3009.0,2013.4,58738.4,5.3,9.5
std,1737.7,3.3,91268.8,0.8,11.2
min,0.0,1998.0,171.0,0.0,0.4
25%,1504.5,2011.0,34000.0,5.0,3.5
50%,3009.0,2014.0,53000.0,5.0,5.6
75%,4513.5,2016.0,73000.0,5.0,9.9
max,6018.0,2019.0,6500000.0,10.0,160.0


In [5]:
# Summarise the object-dtype (text) columns: count, unique values, most frequent value.
df.describe(include='object')

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,New_Price
count,6019,6019,6019,6019,6019,6017,5983,5983,824
unique,1876,11,5,2,4,442,146,372,540
top,Mahindra XUV500 W8 2WD,Mumbai,Diesel,Manual,First,18.9 kmpl,1197 CC,74 bhp,95.13 Lakh
freq,49,790,3205,4299,4929,172,606,235,6


In [6]:
# Build an automated profiling report of the raw frame and save it as HTML.
report_path = Path("../reports/first-exploration.html")
# Create the output folder first so a fresh checkout without ../reports/
# does not fail when the report is written.
report_path.parent.mkdir(parents=True, exist_ok=True)

profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_file(report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 14/14 [00:00<00:00, 150.99it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Count fully duplicated listings. The row-id column 'Unnamed: 0' is unique per
# row, so it must be left out of the comparison; otherwise no two rows can ever
# match and the count is always 0. errors='ignore' keeps the cell working if
# that column has already been removed.
df.drop(columns=['Unnamed: 0'], errors='ignore').duplicated().sum()

0

In [8]:
# Switch float rendering to three decimals, then show the percentage of
# missing values in each column.
pd.set_option('display.float_format', '{:,.3f}'.format)
df.isna().mean().mul(100)

Unnamed: 0           0.000
Name                 0.000
Location             0.000
Year                 0.000
Kilometers_Driven    0.000
Fuel_Type            0.000
Transmission         0.000
Owner_Type           0.000
Mileage              0.033
Engine               0.598
Power                0.598
Seats                0.698
New_Price           86.310
Price                0.000
dtype: float64

In [9]:
# Drop the row-id column 'Unnamed: 0' and the mostly-missing 'New_Price' column.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it no longer raises KeyError for already-removed columns.
df = df.drop(columns=['Unnamed: 0', 'New_Price'], errors='ignore')

In [10]:
# Collect the names of all numeric columns.
numeric_cols = list(df.select_dtypes(include='number').columns)

# One interactive histogram per numeric column, with a box plot in the margin
# to make outliers visible.
for column in numeric_cols:
    fig = px.histogram(
        df,
        x=column,
        nbins=30,
        marginal='box',
        title=f'Distribution of {column}',
    )
    fig.show()

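> Takeaways:
* `Unnamed: 0` → drop (it only repeats the row index).
* `Name` → feature engineering: extract Brand and Model.
* `Location` → encode.
* `Year` → derive Age = reference year − Year, using (max Year in the dataset + 1) as the reference year.
* `Kilometers_Driven` → handle outliers (max is 6,500,000 vs. a 75th percentile of 73,000).
* `Fuel_Type`, `Transmission` & `Owner_Type` → encode.
* `Fuel_Type` → take care of the 'Electric' category.
* `Mileage`, `Engine` & `Power` → extract the numeric values and convert to float (mind the units, e.g. km/kg vs. kmpl in `Mileage`).
* `Seats` → handle zeros (the minimum is 0, which is not a valid seat count).
* `New_Price` → drop (too many missing values, about 86%).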