### Car Selling Price Prediction
#### In this notebook we predict the car selling price using Linear Regression.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### Load the Dataset

In [2]:
# Load the car dataset; the path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv("../Dataset/cardata.csv")

#### Exploratory Data Analysis (EDA)

In [3]:
# Preview the first five rows to see which columns the raw data provides.
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


#### Checking the Null Values and Data Types

In [5]:
# Per-column dtype and non-null count, used to spot missing values and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [6]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

### Observations:
#### 1) The Dataset does not contain any missing values.
#### 2) The `Car_Name` column is not required for Linear Regression, so it is dropped below.


In [7]:
# Car_Name is not used as a regression feature (see observation 2 above).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['Car_Name'], errors='ignore')

In [8]:
# Confirm that Car_Name is no longer among the columns.
df.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


#### 3) We need to separate the numerical and categorical variables.

In [9]:
# Split the columns by dtype: object (string) columns go to df_cat,
# numeric columns go to df_num. Each call returns a new DataFrame.
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(include=np.number)

In [10]:
# Check that only the numeric columns were selected.
df_num.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
0,2014,3.35,5.59,27000,0
1,2013,4.75,9.54,43000,0
2,2017,7.25,9.85,6900,0
3,2011,2.85,4.15,5200,0
4,2014,4.6,6.87,42450,0


In [11]:
# Check that only the categorical (object) columns were selected.
df_cat.head()

Unnamed: 0,Fuel_Type,Seller_Type,Transmission
0,Petrol,Dealer,Manual
1,Diesel,Dealer,Manual
2,Petrol,Dealer,Manual
3,Petrol,Dealer,Manual
4,Diesel,Dealer,Manual


#### Categorical Variables Fuel_Type, Seller_Type and Transmission need to be encoded

In [13]:
# One-hot encode the categorical columns. drop_first=True leaves out one level
# per feature (e.g. there is no Seller_Type_Dealer column), so the dummies for
# a feature are not perfectly collinear with each other.
dummy_var = pd.get_dummies(df_cat, drop_first=True)

In [14]:
# Inspect the generated dummy columns (boolean at this point).
dummy_var.head()

Unnamed: 0,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,False,True,False,True
1,True,False,False,True
2,False,True,False,True
3,False,True,False,True
4,True,False,False,True


#### Converting the Boolean Values of Categorical Variables to 0 and 1

In [15]:
# Cast every dummy column from bool to int (0/1) in a single call. This avoids
# listing the column names by hand, so the cell stays correct if the set of
# encoded category levels changes.
dummy_var = dummy_var.astype(int)

In [16]:
# Verify that the dummy columns now hold 0/1 integers instead of booleans.
dummy_var.head()

Unnamed: 0,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,0,1,0,1
1,1,0,0,1
2,0,1,0,1
3,0,1,0,1
4,1,0,0,1
