# Multiple Linear Regression- Car dataset

## Importing the libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the dataset

In [3]:
dataset=pd.read_csv('CAR.csv')

In [4]:
dataset.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           4337 non-null   int64 
 1   selling_price  4337 non-null   int64 
 2   km_driven      4337 non-null   int64 
 3   fuel           4337 non-null   object
 4   seller_type    4337 non-null   object
 5   transmission   4337 non-null   object
 6   owner          4337 non-null   object
dtypes: int64(3), object(4)
memory usage: 237.3+ KB


In [6]:
dataset.describe()

Unnamed: 0,year,selling_price,km_driven
count,4337.0,4337.0,4337.0
mean,2013.094766,504274.3,66223.535624
std,4.20875,578712.1,46657.695779
min,1992.0,20000.0,1.0
25%,2011.0,210000.0,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [7]:
dataset.isnull().sum()

year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

### Split the dataset into independent and Dependent variables

In [8]:
dataset.columns

Index(['year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

In [9]:
X=dataset[['year','km_driven', 'fuel', 'seller_type',
       'transmission','owner']]
Y=dataset[['selling_price']]

### Work with the catagorical data

In [10]:
X=pd.get_dummies(dataset[['year','km_driven', 'fuel', 'seller_type',
       'transmission','owner']],drop_first=True)

In [11]:
X.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


## Splitting the dataset into the Training set and Test set
- Random State 20

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,
                                               test_size=.10,
                                              random_state=20)

## Training the Multiple Linear Regression model on the Training set

In [13]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression() 
regressor.fit(X_train.values,Y_train)

LinearRegression()

## Intercept and Coefficient

In [14]:
print("Intercept: ", regressor.intercept_)
print("Coefficients:")
list(zip(X, regressor.coef_))

Intercept:  [-72051850.73685972]
Coefficients:


[('year',
  array([ 3.64221128e+04, -9.56227391e-01,  2.87855640e+05, -6.03437311e+05,
          5.03964944e+04, -2.48512771e+03, -5.98400794e+04,  1.59131973e+05,
         -8.80142541e+05,  6.64418352e+02, -3.81841591e+04,  1.72448130e+05,
         -2.81224681e+04]))]

## Predicting the Test set results

In [15]:
Y_pred_regressor= regressor.predict(X_test.values)
#Predicted values
print("Prediction for test set: {}".format(Y_pred_regressor))

Prediction for test set: [[ 4.72692606e+05]
 [ 5.18112639e+05]
 [ 4.03974127e+05]
 [ 1.40174289e+06]
 [ 1.43603326e+05]
 [ 2.73821195e+05]
 [ 7.21174734e+05]
 [ 7.34074199e+05]
 [ 6.55931518e+05]
 [ 4.21271692e+05]
 [ 8.13143507e+05]
 [-9.99622074e+04]
 [ 1.36366005e+06]
 [ 5.85553341e+05]
 [ 6.26251806e+05]
 [ 1.63227291e+05]
 [ 2.26735737e+05]
 [ 1.45430308e+05]
 [ 4.62630915e+05]
 [ 6.71748475e+05]
 [ 4.91252800e+05]
 [ 4.21271692e+05]
 [ 4.55395041e+05]
 [ 3.31129902e+05]
 [ 6.65628073e+05]
 [ 6.47592994e+05]
 [ 3.84849579e+05]
 [ 4.91117343e+05]
 [ 9.89650153e+05]
 [ 6.56065799e+05]
 [ 3.89721753e+05]
 [ 2.76539468e+05]
 [ 4.72420253e+05]
 [ 7.28910025e+05]
 [ 5.50151584e+05]
 [ 1.07181213e+05]
 [ 1.72225211e+05]
 [ 7.48034573e+05]
 [ 4.28141793e+05]
 [ 7.50415537e+05]
 [ 5.35410204e+05]
 [ 6.89533758e+05]
 [ 1.47148244e+06]
 [ 5.33326806e+05]
 [ 4.40095403e+05]
 [ 4.91630273e+05]
 [ 2.39894298e+04]
 [ 8.35228493e+05]
 [ 6.96319286e+04]
 [ 5.29248175e+04]
 [ 7.19123715e+04]
 [ 7.2

### Calculate RMSE, R-Square

In [16]:
from sklearn.metrics import mean_squared_error, r2_score
import math
print(f"RMSE : {math.sqrt(mean_squared_error(Y_test,Y_pred_regressor))}")
print(f'R- Sqaure : {r2_score(Y_test,Y_pred_regressor)}')

RMSE : 330481.49035666464
R- Sqaure : 0.5209839170703832


## Validation case scenario:
#### 1. Predict how much will be the car selling price for a car of 
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

** 465000 ** actual


In [17]:
X_test.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
406,2012,80000,1,0,0,0,1,0,1,0,0,0,1
3027,2013,100000,1,0,0,0,1,0,1,0,0,0,0
2277,2016,30000,0,0,0,1,1,0,1,0,0,0,0
799,2017,7658,0,0,0,1,0,0,0,0,0,0,0
2738,2012,110000,0,0,0,1,1,0,1,0,1,0,0


In [18]:
regressor.predict([[2014,70000,1,0,0,0,0,0,1,0,0,0,0]])

array([[643061.65324564]])