# Data Exploration

In [18]:
import pandas as pd

# Load the training set. The file carries an 'Unnamed: 0' column
# (presumably a saved row index -- confirm against the CSV).
df = pd.read_csv('train-data.csv')
# Preview the first five rows.
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB


In [20]:
pd.options.display.float_format = '{:,.1f}'.format
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,6019.0,6019.0,6019.0,5977.0,6019.0
mean,3009.0,2013.4,58738.4,5.3,9.5
std,1737.7,3.3,91268.8,0.8,11.2
min,0.0,1998.0,171.0,0.0,0.4
25%,1504.5,2011.0,34000.0,5.0,3.5
50%,3009.0,2014.0,53000.0,5.0,5.6
75%,4513.5,2016.0,73000.0,5.0,9.9
max,6018.0,2019.0,6500000.0,10.0,160.0


In [21]:
df.describe(include='O')

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,New_Price
count,6019,6019,6019,6019,6019,6017,5983,5983,824
unique,1876,11,5,2,4,442,146,372,540
top,Mahindra XUV500 W8 2WD,Mumbai,Diesel,Manual,First,18.9 kmpl,1197 CC,74 bhp,95.13 Lakh
freq,49,790,3205,4299,4929,172,606,235,6


In [22]:
# from ydata_profiling import ProfileReport
# profile = ProfileReport(df, title="Pandas Profiling Report")
# profile.to_file("your_report.html")

> Data Preprocessing Steps:
* Unnamed: 0 > Drop
* Name > Feature Engineering > Brand, Model
* Location > encoding
* Year > Age = Current Year - Year (Current Year taken as Max Year in dataset + 1)
* Kilometers_Driven > Handle outliers
* Fuel_Type, Transmission & Owner_Type > encoding
* Fuel_Type > Take care of 'Electric' category
* Mileage, Engine & Power > Extract numeric values & convert to float (Take care of Units)
* Seats > Handle Zeros
* New_Price > Drop (Too many missing values)

In [23]:
df.duplicated().sum()

0

In [24]:
pd.options.display.float_format = '{:,.3f}'.format
df.isnull().mean() *100

Unnamed: 0           0.000
Name                 0.000
Location             0.000
Year                 0.000
Kilometers_Driven    0.000
Fuel_Type            0.000
Transmission         0.000
Owner_Type           0.000
Mileage              0.033
Engine               0.598
Power                0.598
Seats                0.698
New_Price           86.310
Price                0.000
dtype: float64

In [25]:
# Drop 'Unnamed: 0' (a 0..6018 row counter) and 'New_Price' (~86% missing).
# Reassigning with errors='ignore' keeps this cell safe to re-run: the previous
# in-place drop raised KeyError on a second execution because the columns were
# already gone.
df = df.drop(columns=['Unnamed: 0', 'New_Price'], errors='ignore')

# Name

In [49]:
df.Name.sample(10)

4638                 Mahindra Xylo E4 ABS BS III
328            Maruti Zen Estilo LXI Green (CNG)
1205                      Honda CR-V 2.0L 2WD MT
1861              Toyota Fortuner 4x2 4 Speed AT
5436              Toyota Fortuner 4x2 4 Speed AT
4931                         Mahindra TUV 300 T8
4869    Land Rover Range Rover 3.6 TDV8 Vogue SE
5962                             Maruti Ritz VDi
4038      Toyota Innova 2.5 GX (Diesel) 7 Seater
5480               Hyundai Grand i10 Asta Option
Name: Name, dtype: object

In [51]:
df.Name.str.contains('Land Rover').sum()

60

In [9]:
df.Name[df.Name.str.contains('Land Rover')].sample(5)

1505       Land Rover Range Rover Sport SE
5311        Land Rover Freelander 2 TD4 SE
1259         Land Rover Freelander 2 TD4 S
311     Land Rover Range Rover HSE Dynamic
2528       Land Rover Discovery SE 3.0 TD6
Name: Name, dtype: object

In [53]:
sample_car = df.loc[10, 'Name']
sample_car.split(' ')[0]  # Brand

'Maruti'

In [59]:
def get_brand(name):
    """Return the brand part of a listing name.

    The brand is the text before the first space, except for listings that
    start with 'Land Rover', whose brand spans two words.
    """
    two_word_brand = 'Land Rover'
    first_token = name.partition(' ')[0]
    return two_word_brand if name.startswith(two_word_brand) else first_token

sample_car = df.loc[1505, 'Name']
get_brand(sample_car)

'Land Rover'

In [60]:
df['Brand'] = df.Name.apply(get_brand)

In [61]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75,Maruti
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5,Hyundai
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5,Honda
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0,Maruti
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74,Audi


In [62]:
df[df.Name.str.contains('Land Rover')].sample(5)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand
2306,Land Rover Range Rover Sport 2005 2012 HSE,Hyderabad,2010,45000,Diesel,Automatic,First,14.7 kmpl,2993 CC,241.4 bhp,6.0,40.0,Land Rover
4814,Land Rover Freelander 2 SE,Delhi,2013,68000,Diesel,Automatic,Second,12.39 kmpl,2179 CC,147.51 bhp,5.0,15.75,Land Rover
5016,Land Rover Freelander 2 TD4 HSE,Delhi,2013,72000,Diesel,Automatic,First,0.0 kmpl,2179 CC,115 bhp,5.0,15.5,Land Rover
1513,Land Rover Range Rover 2.2L Pure,Bangalore,2014,56000,Diesel,Automatic,First,12.7 kmpl,2179 CC,187.7 bhp,5.0,34.5,Land Rover
410,Land Rover Range Rover 2.2L Pure,Coimbatore,2014,95649,Diesel,Automatic,First,12.7 kmpl,2179 CC,187.7 bhp,5.0,29.67,Land Rover


In [95]:
import numpy as np

# Peek at the second token of a few random listing names to see whether it is
# a usable "model" value.
# A seeded generator makes the preview reproducible, and .iloc is used because
# the drawn integers are row positions, not index labels (the two only coincide
# while the frame still has its default RangeIndex).
rng = np.random.default_rng(42)
for pos in rng.integers(0, len(df), size=5):
    sample_car = df['Name'].iloc[pos]
    print(sample_car.split(' ')[1])   # MODEL

Bolero
Grand
Verna
Amaze
i20


In [99]:
sample_car = df[df.Name.str.contains('Land Rover')].reset_index().loc[50, 'Name']
print(sample_car)

Land Rover Discovery Sport TD4 HSE 7S


In [100]:
def get_model(name):
    """Return the model part of a listing name.

    Rules, in order:
      * names starting with 'Land Rover Range Rover' -> 'Range Rover'
      * other names starting with 'Land Rover'       -> the third token
      * everything else                              -> the second token

    Only a single token is taken, so a multi-word model such as 'Wagon R'
    is reduced to 'Wagon'.

    Returns np.nan when the name has no token at the expected position
    (e.g. a bare brand like 'Maruti'), instead of raising IndexError and
    aborting the whole ``apply``.
    """
    if name.startswith('Land Rover Range Rover'):
        return 'Range Rover'

    tokens = name.split(' ')
    # The two-word brand pushes the model one token to the right.
    model_pos = 2 if name.startswith('Land Rover') else 1
    if model_pos >= len(tokens):
        return np.nan
    return tokens[model_pos]
    
df['Model'] = df.Name.apply(get_model)

In [106]:
df[df.Name.str.contains('Land Rover')].sample(5)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Model
521,Land Rover Range Rover HSE Dynamic,Coimbatore,2017,49275,Diesel,Automatic,First,12.7 kmpl,2179 CC,187.7 bhp,5.0,45.64,Land Rover,Range Rover
4925,Land Rover Discovery Sport TD4 S,Delhi,2017,22000,Diesel,Automatic,First,12.83 kmpl,2179 CC,147.5 bhp,5.0,35.0,Land Rover,Discovery
941,Land Rover Range Rover Evoque 2.0 TD4 HSE Dynamic,Kochi,2018,31427,Diesel,Automatic,First,15.68 kmpl,1999 CC,177 bhp,5.0,59.72,Land Rover,Range Rover
4079,Land Rover Range Rover 3.0 Diesel LWB Vogue,Hyderabad,2017,25000,Diesel,Automatic,First,13.33 kmpl,2993 CC,255 bhp,5.0,160.0,Land Rover,Range Rover
3218,Land Rover Range Rover 2.2L Dynamic,Pune,2015,52000,Diesel,Automatic,First,12.7 kmpl,2179 CC,187.7 bhp,5.0,39.75,Land Rover,Range Rover


In [107]:
df.describe(include='O')

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Brand,Model
count,6019,6019,6019,6019,6019,6017,5983,5983,6019,6019
unique,1876,11,5,2,4,442,146,372,31,214
top,Mahindra XUV500 W8 2WD,Mumbai,Diesel,Manual,First,18.9 kmpl,1197 CC,74 bhp,Maruti,Swift
freq,49,790,3205,4299,4929,172,606,235,1211,353


In [108]:
# 'Name' has been decomposed into 'Brand' and 'Model', so it is no longer needed.
# Reassign with errors='ignore' so re-running the cell does not raise KeyError.
df = df.drop(columns=['Name'], errors='ignore')

# Age

In [110]:
# Age in years, measured against a reference year one past the newest model
# year in the data, so the newest cars get Age == 1.
df['Age'] = (df['Year'].max() + 1) - df['Year']

# Kilometers_Driven

In [None]:
import seaborn as sns
import plotly.express as px

px.histogram(df, x='Kilometers_Driven', marginal='box', title='Km Distribution')

In [24]:
df[df.Kilometers_Driven == df.Kilometers_Driven.max()]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Model,Age
2328,Chennai,2017,6500000,Diesel,Automatic,First,15.97 kmpl,2993 CC,258 bhp,5.0,65.0,BMW,X5,3


In [114]:
# Remove the implausible 6,500,000 km reading shown above.
# Rows are selected with a fixed plausibility ceiling rather than "whatever the
# current maximum is": the max-based version dropped a further legitimate row
# (775,000 km next) every time the cell was re-run.
KM_PLAUSIBLE_MAX = 1_000_000
km_outlier = df.index[df.Kilometers_Driven > KM_PLAUSIBLE_MAX]
df = df.drop(index=km_outlier)

In [None]:
px.histogram(df, x='Kilometers_Driven', marginal='box', title='Km Distribution')

In [116]:
df.Kilometers_Driven.quantile([0.8, 0.9, 0.95, 0.99, 0.995, 0.999, 1])

0.800    79,000.000
0.900    96,893.600
0.950   120,000.000
0.990   171,871.500
0.995   200,000.000
0.999   299,988.474
1.000   775,000.000
Name: Kilometers_Driven, dtype: float64

In [117]:
df[df.Kilometers_Driven > df.Kilometers_Driven.quantile(0.999)].shape

(7, 14)

In [118]:
df[df.Kilometers_Driven > df.Kilometers_Driven.quantile(0.999)]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Model,Age
340,Kolkata,2013,775000,Diesel,Automatic,First,19.3 kmpl,1968 CC,141 bhp,5.0,7.5,Skoda,Octavia,7
358,Chennai,2009,620000,Petrol,Manual,First,20.36 kmpl,1197 CC,78.9 bhp,5.0,2.7,Hyundai,i10,11
1860,Chennai,2013,720000,Diesel,Manual,First,20.54 kmpl,1598 CC,103.6 bhp,5.0,5.9,Volkswagen,Vento,7
2823,Chennai,2015,480000,Diesel,Automatic,First,16.96 kmpl,1968 CC,138.03 bhp,5.0,13.0,Volkswagen,Jetta,5
3092,Kolkata,2015,480000,Petrol,Manual,First,17.4 kmpl,1497 CC,117.3 bhp,5.0,5.0,Honda,City,5
3649,Jaipur,2008,300000,Diesel,Manual,First,17.0 kmpl,1405 CC,70 bhp,5.0,1.0,Tata,Indigo,12
4491,Bangalore,2013,445000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.9 bhp,5.0,4.45,Hyundai,i20,7


In [119]:
# Tukey fences for Kilometers_Driven: 1.5 * IQR beyond the quartiles.
km = df.Kilometers_Driven
q1 = km.quantile(0.25)
q3 = km.quantile(0.75)
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

Lower Bound: -24500.0, Upper Bound: 131500.0


In [120]:
df[df.Kilometers_Driven > upper_bound].shape

(201, 14)

In [121]:
# Working frame for the rest of the cleaning: everything below the 99.9th
# percentile of Kilometers_Driven.
# Filter first and copy the *result*, so df_c is an independent frame; copying
# before filtering left df_c as a filtered slice, and later column assignments
# on it can trigger SettingWithCopyWarning.
km_cap = df.Kilometers_Driven.quantile(0.999)
df_c = df[df.Kilometers_Driven < km_cap].copy()

In [122]:
px.histogram(df_c, x='Kilometers_Driven', marginal='box', title='Km Distribution')

In [123]:
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Model,Age
0,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75,Maruti,Wagon,10
1,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5,Hyundai,Creta,5
2,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5,Honda,Jazz,9
3,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0,Maruti,Ertiga,8
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74,Audi,A4,7


# Fuel_Type

In [125]:
df_c.Fuel_Type.unique()

array(['CNG', 'Diesel', 'Petrol', 'LPG', 'Electric'], dtype=object)

In [127]:
df_c[df_c.Fuel_Type == 'Electric']

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Model,Age
4446,Chennai,2016,50000,Electric,Automatic,First,,72 CC,41 bhp,5.0,13.0,Mahindra,E,4
4904,Mumbai,2011,44000,Electric,Automatic,First,,1798 CC,73 bhp,5.0,12.75,Toyota,Prius,9


In [128]:
df_c.isna().mean() * 100

Location            0.000
Year                0.000
Kilometers_Driven   0.000
Fuel_Type           0.000
Transmission        0.000
Owner_Type          0.000
Mileage             0.033
Engine              0.599
Power               0.599
Seats               0.699
Price               0.000
Brand               0.000
Model               0.000
Age                 0.000
dtype: float64

In [129]:
# Discard every row that still has a missing value. The two 'Electric' cars
# have no Mileage, so this is also the step that removes that fuel category.
df_c = df_c.dropna()

In [130]:
df_c.isna().mean() * 100

Location            0.000
Year                0.000
Kilometers_Driven   0.000
Fuel_Type           0.000
Transmission        0.000
Owner_Type          0.000
Mileage             0.000
Engine              0.000
Power               0.000
Seats               0.000
Price               0.000
Brand               0.000
Model               0.000
Age                 0.000
dtype: float64

In [131]:
df_c.Fuel_Type.unique()

array(['CNG', 'Diesel', 'Petrol', 'LPG'], dtype=object)

# Mileage

In [134]:
df_c.Mileage.str.split(' ').str[1].unique()

array(['km/kg', 'kmpl'], dtype=object)

In [135]:
df_c.Fuel_Type.unique()

array(['CNG', 'Diesel', 'Petrol', 'LPG'], dtype=object)

In [136]:
# fuel densities (kg/L), used to turn a per-kilogram mileage into a per-litre one
fuel_densities = {
    "CNG": 0.43,
    "Diesel": 0.832,
    "Petrol": 0.74,
    "LPG": 0.54
}

def convert_to_kmpl(row):
    """Return the row's mileage as a float expressed in km per litre.

    Parameters
    ----------
    row : mapping with 'Fuel_Type' and 'Mileage' entries
        'Mileage' is expected to look like '<number> <unit>', where the unit
        is 'km/kg' or 'kmpl'.

    Returns
    -------
    float
        * '<x> km/kg' -> x multiplied by the density of the row's fuel
        * any other unit -> x unchanged
        * np.nan when the fuel has no entry in ``fuel_densities`` or when the
          mileage is missing / not in '<number> <unit>' form. The previous
          version raised AttributeError, IndexError or ValueError in the
          latter case, which aborted the whole ``DataFrame.apply``.
    """
    raw = row['Mileage']
    density = fuel_densities.get(row['Fuel_Type'])
    if density is None or not isinstance(raw, str):
        return np.nan

    # Split once and reuse both parts.
    parts = raw.split()
    if len(parts) < 2:
        return np.nan
    try:
        mileage = float(parts[0])
    except ValueError:
        # Non-numeric placeholder in the value slot.
        return np.nan

    return mileage * density if parts[1] == 'km/kg' else mileage

# Apply conversion to DataFrame
df_c['Mileage'] = df_c.apply(convert_to_kmpl, axis=1)

In [137]:
df_c.isna().mean() * 100

Location            0.000
Year                0.000
Kilometers_Driven   0.000
Fuel_Type           0.000
Transmission        0.000
Owner_Type          0.000
Mileage             0.000
Engine              0.000
Power               0.000
Seats               0.000
Price               0.000
Brand               0.000
Model               0.000
Age                 0.000
dtype: float64

# Engine

In [138]:
df_c.Engine.str.split(' ').str[1].unique()

array(['CC'], dtype=object)

In [139]:
def get_engine(engine):
    """Parse an engine string such as '998 CC' and return its leading number as a float."""
    displacement_text = engine.partition(' ')[0]
    return float(displacement_text)

df_c['Engine'] = df_c['Engine'].apply(get_engine)

# Power

In [140]:
df_c.Power.str.split(' ').str[1].unique()

array(['bhp'], dtype=object)

In [146]:
df_c.Power.sample(10)

5215       110 bhp
5848    174.33 bhp
4020        74 bhp
4349       136 bhp
1835      73.9 bhp
880      67.04 bhp
2965       162 bhp
1419      null bhp
2154    108.45 bhp
3138        60 bhp
Name: Power, dtype: object

In [147]:
def get_power(power):
    """Parse a power string such as '58.16 bhp' and return its leading number as a float.

    Returns np.nan when the value cannot be parsed: a placeholder like
    'null bhp' (ValueError) or a non-string such as NaN/None (AttributeError).

    Only parsing errors are caught. The previous bare ``except:`` also
    swallowed KeyboardInterrupt/SystemExit, so interrupting the kernel during
    a long ``apply`` silently produced NaN instead of stopping.
    """
    try:
        return float(power.split(' ')[0])
    except (AttributeError, TypeError, ValueError):
        return np.nan

df_c['Power'] = df_c['Power'].apply(get_power)
# df_c['Power'] = pd.to_numeric(df_c['Power'].str.split(' ').str[0], errors='coerce')

In [148]:
df_c.isna().mean() * 100

Location            0.000
Year                0.000
Kilometers_Driven   0.000
Fuel_Type           0.000
Transmission        0.000
Owner_Type          0.000
Mileage             0.000
Engine              0.000
Power               1.726
Seats               0.000
Price               0.000
Brand               0.000
Model               0.000
Age                 0.000
dtype: float64

# Seats

In [149]:
df_c = df_c[df_c.Seats > 0]

In [156]:
df_c.Seats.value_counts(normalize=True) * 100

Seats
5.000    83.875
7.000    11.297
8.000     2.246
4.000     1.659
6.000     0.520
2.000     0.268
10.000    0.084
9.000     0.050
Name: proportion, dtype: float64

# Univariate Analysis

In [150]:
df_c.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Model,Age
0,Mumbai,2010,72000,CNG,Manual,First,11.438,998.0,58.16,5.0,1.75,Maruti,Wagon,10
1,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5,Hyundai,Creta,5
2,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5,Honda,Jazz,9
3,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0,Maruti,Ertiga,8
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74,Audi,A4,7


In [151]:
df_c.describe()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,Age
count,5966.0,5966.0,5966.0,5966.0,5864.0,5966.0,5966.0,5966.0
mean,2013.388,57012.074,18.03,1621.2,113.263,5.28,9.495,6.612
std,3.248,33614.788,4.499,600.789,53.877,0.807,11.189,3.248
min,1998.0,171.0,0.0,624.0,34.2,2.0,0.44,1.0
25%,2012.0,33868.5,15.1,1198.0,75.0,5.0,3.5,4.0
50%,2014.0,53000.0,18.0,1493.0,97.7,5.0,5.65,6.0
75%,2016.0,72705.75,21.0,1984.0,138.1,5.0,9.95,8.0
max,2019.0,299322.0,28.4,5998.0,560.0,10.0,160.0,22.0


In [152]:
df_c.describe(include='O')

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Brand,Model
count,5966,5966,5966,5966,5966,5966
unique,11,4,2,4,31,211
top,Mumbai,Diesel,Manual,First,Maruti,Swift
freq,784,3190,4261,4894,1197,343


In [153]:
numeric_cols = df_c.select_dtypes(include=[np.number]).columns.tolist()

for col in numeric_cols:
    fig = px.histogram(df_c, x=col, marginal='box', title=f'Distribution of {col}', nbins=30)
    fig.show()

In [157]:
# Trim the long tails of the numeric features. The cut-offs are named so they
# can be tuned in one place (presumably chosen from the histograms above --
# confirm before changing).
MIN_YEAR = 2006
MAX_ENGINE_CC = 3000
MAX_KM_DRIVEN = 180000
MAX_POWER_BHP = 270

keep_rows = (
    (df_c.Year >= MIN_YEAR)
    & (df_c.Engine < MAX_ENGINE_CC)
    & (df_c.Kilometers_Driven < MAX_KM_DRIVEN)
    # NaN < x evaluates to False, so rows whose Power could not be parsed
    # ('null bhp' -> NaN) are removed by this condition as well.
    & (df_c.Power < MAX_POWER_BHP)
)
# .copy() gives an independent frame, so the column assignment below does not
# write into a filtered slice (SettingWithCopyWarning).
df_c = df_c.loc[keep_rows].copy()

# A mileage of exactly 0 is treated as "not recorded" and turned into NaN.
df_c['Mileage'] = df_c['Mileage'].replace(0, np.nan)

In [158]:
for col in numeric_cols:
    fig = px.histogram(df_c, x=col, marginal='box', title=f'Distribution of {col}', nbins=30)
    fig.show()

In [163]:
# Share of rows removed on the way from `df` to the cleaned `df_c`.
# NOTE: the baseline is `df` as it stands at this point (the 6,500,000 km row
# has already been dropped from it), not the row count of the raw CSV.
dropped_rows_pct = 1 - (df_c.shape[0] / len(df))
print(f"Dropped rows percentage: {dropped_rows_pct:.2%}")

Dropped rows percentage: 6.63%


In [164]:
df_c.isna().mean() * 100

Location            0.000
Year                0.000
Kilometers_Driven   0.000
Fuel_Type           0.000
Transmission        0.000
Owner_Type          0.000
Mileage             0.463
Engine              0.000
Power               0.000
Seats               0.000
Price               0.000
Brand               0.000
Model               0.000
Age                 0.000
dtype: float64

In [169]:
df_c.describe()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,Age
count,5619.0,5619.0,5593.0,5619.0,5619.0,5619.0,5619.0,5619.0
mean,2013.69,55170.487,18.412,1586.869,109.971,5.279,9.182,6.31
std,2.858,30438.283,4.094,536.306,46.006,0.769,10.065,2.858
min,2006.0,171.0,5.676,624.0,34.2,2.0,0.44,1.0
25%,2012.0,33000.0,15.5,1197.0,75.0,5.0,3.6,4.0
50%,2014.0,52000.0,18.5,1461.0,92.7,5.0,5.74,6.0
75%,2016.0,71436.5,21.1,1968.0,138.03,5.0,9.75,8.0
max,2019.0,179000.0,28.4,2999.0,265.0,10.0,160.0,14.0


In [93]:
df_c.corr(numeric_only=True)['Price'].sort_values(ascending=False)

Price                1.000000
Power                0.764098
Engine               0.635854
Year                 0.318702
Seats                0.123282
Kilometers_Driven   -0.166846
Mileage             -0.308463
Age                 -0.318702
Name: Price, dtype: float64

# Data Splitting

* Category Columns > Encoding
* Numeric Columns > Handle missing values and Scaling

In [170]:
# Data Splitting

# Features: every column except the target. 'Year' is also removed --
# presumably because 'Age' was derived from it (Age = max Year + 1 - Year)
# and carries the same information.
X = df_c.drop(columns=['Price', 'Year'])
# Target: the listed price.
y = df_c['Price']

In [171]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [172]:
X_train.shape

(4495, 12)

In [174]:
# Splitting Columns

# NOTE(review): 'Seats' is still a column of X but is not listed in any group
# below. If the ColumnTransformer that consumes these lists keeps its default
# handling of unlisted columns, 'Seats' is silently left out of the model --
# confirm whether that is intended.
num_cols = ['Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Age']  # Impute Missing Values and Scale
nom_cat_cols = ['Location', 'Fuel_Type', 'Transmission', 'Brand', 'Model']  # USE Binary ENCODING
ord_cat_cols = ['Owner_Type']   # USE ORDINAL ENCODING

In [101]:
# !pip install category-encoders

In [175]:
from category_encoders import BinaryEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numeric features: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Nominal categoricals: binary encoding.
nominal_categorical_transformer = Pipeline(
    steps=[
        ('binary', BinaryEncoder()),
    ]
)

# Ordinal categorical (Owner_Type): the category order is given explicitly,
# from 'Fourth & Above' (encoded lowest) up to 'First' (encoded highest).
ordinal_categorical_transformer = Pipeline(
    steps=[
        (
            'ordinal',
            OrdinalEncoder(categories=[['Fourth & Above', 'Third', 'Second', 'First']]),
        ),
    ]
)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_prep', numerical_transformer, num_cols),
        ('nom_prep', nominal_categorical_transformer, nom_cat_cols),
        ('ord_prep', ordinal_categorical_transformer, ord_cat_cols),
    ]
)

# Fit on the training split only; the test split is transformed with the
# already-fitted preprocessor.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

In [177]:
X_train_prep

array([[-1.08423424, -0.4720888 , -0.72760357, ...,  0.        ,
         1.        ,  3.        ],
       [ 0.4521476 ,  0.02363623,  2.62788967, ...,  1.        ,
         0.        ,  3.        ],
       [-1.14876227,  0.14573599, -0.72572795, ...,  1.        ,
         1.        ,  3.        ],
       ...,
       [ 2.21735226, -1.42935092,  2.61851154, ...,  0.        ,
         1.        ,  2.        ],
       [ 0.74634838,  0.61459907, -1.10272859, ...,  0.        ,
         0.        ,  2.        ],
       [-0.72465551, -0.14730344, -0.72760357, ...,  1.        ,
         1.        ,  3.        ]])

In [179]:
# Model Building
from sklearn.linear_model import LinearRegression

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_prep, y_train)

print('Intercept:', lr.intercept_)
print('Coefficients:', lr.coef_)

Intercept: 9.750424769747077
Coefficients: [-8.90155690e-01 -1.24033059e-01  1.18462563e+00  5.28321100e+00
 -2.63386713e+00 -8.18486731e-01  1.00674375e+00  8.65178500e-01
 -4.34365940e-01  1.07694599e+00  4.07585365e-01 -1.14104436e+00
  9.10786482e-01 -9.10786482e-01  6.61867211e+00 -1.30306878e-01
 -1.39771606e+00  4.14068629e-01 -2.71369839e+00  1.58651269e+00
  1.30750935e+00 -1.32931466e+00 -5.35514811e-03 -3.10238440e-03
 -1.05808800e+00 -6.10802860e-01  7.57654516e-01  6.83253308e-01]


In [None]:
# Predictions on the held-out split
y_pred = lr.predict(X_test_prep)

# Model evaluation.
# LinearRegression.score returns the coefficient of determination (R^2), not a
# classification accuracy, so the values are labelled as R^2.
print('Training R^2:', lr.score(X_train_prep, y_train))
print('Testing R^2:', lr.score(X_test_prep, y_test))

Training Accuracy: 0.7399991262351753
Testing Accuracy: 0.7573599302557424
