# Problem Statement:

- Predict the selling price of a car from its `km_driven`, manufacturing year, owner, fuel, seller type and transmission.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the CarDekho car listings from the CSV file (path is relative to the
# working directory) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="CAR DETAILS FROM CAR DEKHO.csv")
data.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
# (number of rows, number of columns) of the loaded frame
data.shape

(4340, 8)

# Observations:
- There are 4340 rows and 8 columns in the data.
- Each row represents the details of one car.

# Step 3 : Data Preprocessing & Data Wrangling

In [5]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

#### Assignment: Apply `str.split()` to the `name` column of the data. In the output, the first column gives the car's company name and the second column gives the car's model name. Use these two columns for encoding instead of dropping `name`.

In [7]:
# The free-text car name is not used as a feature below, so remove that column.
data = data.drop(columns="name")
data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [8]:
# Distinct ownership labels, in order of first appearance.
pd.unique(data["owner"])

array(['First Owner', 'Second Owner', 'Fourth & Above Owner',
       'Third Owner', 'Test Drive Car'], dtype=object)

In [9]:
# Ordinal code for each ownership label: the code is the label's position in
# the list below (test-drive cars first, then first to fourth-and-above owner).
dic = {
    label: rank
    for rank, label in enumerate([
        'Test Drive Car',
        "First Owner",
        "Second Owner",
        "Third Owner",
        'Fourth & Above Owner',
    ])
}
dic

{'Test Drive Car': 0,
 'First Owner': 1,
 'Second Owner': 2,
 'Third Owner': 3,
 'Fourth & Above Owner': 4}

In [10]:
# Encode the ownership label as an ordinal integer using the mapping in `dic`.
# The explicit astype pins an integer column: pandas >= 2.2 deprecates the
# silent object -> int downcast that `replace` used to perform, so without it
# the column may stay object-typed on newer versions.
# Any label missing from `dic` is left as text by `replace` and then makes
# astype raise, instead of slipping into the model unnoticed.
# Re-running the cell is safe: already-encoded integers are not keys of `dic`.
data["owner"] = data["owner"].replace(dic).astype("int64")

In [11]:
# Preview the frame to confirm `owner` now holds the numeric codes.
data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,1
1,2007,135000,50000,Petrol,Individual,Manual,1
2,2012,600000,100000,Diesel,Individual,Manual,1
3,2017,250000,46000,Petrol,Individual,Manual,1
4,2014,450000,141000,Diesel,Individual,Manual,2


In [14]:
## Apply one hot encoding on the remaining text cols
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 returns True/False columns by default).
# NOTE(review): every category level is kept, so the indicators of each
# group sum to 1 and are collinear with the model intercept; consider
# drop_first=True - that would change the downstream column set, so it is
# not applied here.

data_ohe = pd.get_dummies(data[['fuel', 'seller_type', "transmission"]], dtype=int)
data_ohe

Unnamed: 0,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual
0,0,0,0,0,1,0,1,0,0,1
1,0,0,0,0,1,0,1,0,0,1
2,0,1,0,0,0,0,1,0,0,1
3,0,0,0,0,1,0,1,0,0,1
4,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
4335,0,1,0,0,0,0,1,0,0,1
4336,0,1,0,0,0,0,1,0,0,1
4337,0,0,0,0,1,0,1,0,0,1
4338,0,1,0,0,0,0,1,0,0,1


In [15]:
# Append the indicator columns to the right of the existing columns; rows are
# matched on the index. Run once: re-running appends the indicators again.
data = pd.concat(objs=[data, data_ohe], axis="columns")
data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual
0,2007,60000,70000,Petrol,Individual,Manual,1,0,0,0,0,1,0,1,0,0,1
1,2007,135000,50000,Petrol,Individual,Manual,1,0,0,0,0,1,0,1,0,0,1
2,2012,600000,100000,Diesel,Individual,Manual,1,0,1,0,0,0,0,1,0,0,1
3,2017,250000,46000,Petrol,Individual,Manual,1,0,0,0,0,1,0,1,0,0,1
4,2014,450000,141000,Diesel,Individual,Manual,2,0,1,0,0,0,0,1,0,0,1


In [16]:
# The original text columns are now represented by their indicator columns,
# so remove them from the frame.
data = data.drop(columns=['fuel', 'seller_type', 'transmission'])
data.head()

Unnamed: 0,year,selling_price,km_driven,owner,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual
0,2007,60000,70000,1,0,0,0,0,1,0,1,0,0,1
1,2007,135000,50000,1,0,0,0,0,1,0,1,0,0,1
2,2012,600000,100000,1,0,1,0,0,0,0,1,0,0,1
3,2017,250000,46000,1,0,0,0,0,1,0,1,0,0,1
4,2014,450000,141000,2,0,1,0,0,0,0,1,0,0,1


# Separate X and y

In [18]:
# Target: the price to predict. Features: every remaining column.
y = data['selling_price']
X = data.drop(columns="selling_price")

# Split the data into train set and test set

In [22]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Apply Linear Regression on train set

In [23]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator with scikit-learn's default settings (not fitted yet).
lr = LinearRegression()
lr

In [24]:
# Fit the model on the training split only; the test split stays unseen.
lr.fit(X_train, y_train)

# Perform predictions

In [26]:
# Predicted selling price for every row of the held-out test set.
# NOTE(review): the displayed output contains negative prices, which a plain
# linear model on the raw price can produce - worth addressing before use.
y_pred = lr.predict(X_test)
y_pred

array([1421696.312325  , 1443080.75115816,  195855.27230109,
       1702358.70957553,  352563.61662196,  221046.21709551,
       1437459.0289212 ,  322758.81879197, 1548803.65158464,
        461902.24832131,  125648.37690975,  458446.08456835,
       1320144.99198051,  601279.31432097,   61669.48046909,
        646272.260455  ,  519515.92369486,  469325.2797861 ,
         -8627.74148954,  626613.56901845, -299044.01215623,
        541267.28559215,  524623.89983346,  556903.32050954,
       1191624.65145223, 1226217.06820542,  549351.50074844,
        617973.15963605,  515195.71900366,  436258.08766262,
        410336.85951543,  489274.49085647,  508350.43190998,
        723418.99257402,  237412.53536569,  591223.84442084,
        333991.3509891 ,   31749.2416534 ,   96818.09144841,
        436408.4261039 ,  140020.13836041,  640032.36824545,
        527120.07282454, 1429939.01941969,  396789.2719921 ,
        252461.59683333,  -53003.73530099,  326492.05003347,
        532626.87620975,

In [27]:
# Inspect the held-out feature rows; the index keeps each row's label from the full data set.
X_test

Unnamed: 0,year,km_driven,owner,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual
2089,2012,70070,1,0,1,0,0,0,1,0,0,1,0
1077,2013,86000,1,0,1,0,0,0,1,0,0,1,0
1495,2014,120000,3,0,0,0,0,1,0,1,0,0,1
555,2019,30000,1,0,1,0,0,0,1,0,0,1,0
2615,2009,95000,2,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1359,2008,102000,1,0,0,0,0,1,1,0,0,0,1
4197,2010,120000,3,0,1,0,0,0,0,1,0,0,1
1818,2013,42000,3,0,0,0,0,1,0,1,0,0,1
2644,2005,60000,3,0,0,0,0,1,0,1,0,0,1


In [28]:
# Fitted coefficients, one per feature column, in the column order of X_train.
lr.coef_

array([ 3.51486110e+04, -8.64040938e-01, -2.42764443e+04,  4.87114099e+04,
        3.38084510e+05, -5.45416503e+05,  1.07110005e+05,  5.15105782e+04,
       -2.83476117e+04, -1.02836250e+05,  1.31183862e+05,  4.21690620e+05,
       -4.21690620e+05])

In [29]:
# Fitted intercept (constant term) of the linear model.
# NOTE(review): the very large magnitude is presumably because `year` is not
# centred (values around 2000) - confirm before interpreting it.
lr.intercept_

-69943916.70296131

# Check Accuracy and error

In [30]:
from sklearn.metrics import r2_score, mean_squared_error

# R^2 of the predictions against the true prices of the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.5020508720207613

In [32]:
# Root-mean-squared error on the test split, in the same units as selling_price.
# Arguments follow scikit-learn's (y_true, y_pred) order, matching the
# r2_score call; the value is unchanged because squared error is symmetric.
np.sqrt(mean_squared_error(y_test, y_pred))

430016.28756922344