# Multiple Linear Regression- Car dataset

## Importing the libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [3]:
data = pd.read_csv('CAR.csv')

In [4]:
data.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           4337 non-null   int64 
 1   selling_price  4337 non-null   int64 
 2   km_driven      4337 non-null   int64 
 3   fuel           4337 non-null   object
 4   seller_type    4337 non-null   object
 5   transmission   4337 non-null   object
 6   owner          4337 non-null   object
dtypes: int64(3), object(4)
memory usage: 237.3+ KB


### Split the dataset into independent and Dependent variables

In [6]:
data.columns

Index(['year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

In [7]:
x = data[['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]
y = data[['selling_price']]

### Work with the catagorical data

In [8]:
x = pd.get_dummies(data[['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']], drop_first=True)

In [9]:
x.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


## Splitting the dataset into the Training set and Test set
- Random State 20

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_rest, y_train, y_rest = train_test_split(x, y, test_size=.20, random_state=20)

In [11]:
x_test, x_val, y_test, y_val = train_test_split(x_rest, y_rest, test_size=.50, random_state=42)

## Training the Multiple Linear Regression model on the Training set

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
regressor = LinearRegression()
regressor.fit(x_train.values, y_train)

LinearRegression()

## Intercept and Coefficient

In [14]:
print("Coefficients", regressor.coef_)


Coefficients [[ 3.62245575e+04 -8.31584195e-01  2.88702246e+05  2.91038305e-11
   4.65464254e+04  2.64057958e+03 -6.04980455e+04  1.71882689e+05
  -8.64323880e+05 -3.50851884e+03 -4.04890692e+04  1.83178786e+05
  -2.83903020e+04]]


In [15]:
print("Intercept", regressor.intercept_)

Intercept [-71683645.58006924]


## Predicting the Test set results

In [16]:
y_pred = regressor.predict(x_test.values)

In [17]:
print(y_pred)

[[  30749.25003727]
 [ 821100.12192748]
 [ 218516.58653592]
 [  37857.7797745 ]
 [ 289153.1110432 ]
 [ 630196.99126908]
 [ 477192.70902458]
 [ 155638.50364208]
 [ 259274.06596197]
 [ 522407.14628579]
 [ 118866.61699857]
 [ 266018.17558792]
 [  33294.33702557]
 [ 466224.56814849]
 [ 486101.24020372]
 [ 561650.87397622]
 [ 325535.21483561]
 [ 107589.58539997]
 [ 521161.80473301]
 [ 223823.10675913]
 [ 494449.82056004]
 [1315113.49566038]
 [ 529477.64668462]
 [1633241.79851329]
 [ 103304.95575809]
 [ 467989.02646664]
 [ 146019.33954766]
 [  37446.95711522]
 [ -90164.88577037]
 [ 810616.06172216]
 [ 523238.73048094]
 [ 637791.47279738]
 [  13186.69613707]
 [ 489095.16825539]
 [  63049.18599494]
 [  17752.7936372 ]
 [ 265027.32087488]
 [ 506977.10013007]
 [  91023.64423056]
 [ -87137.95338957]
 [ 235100.13835773]
 [ 473660.21568102]
 [ 613260.04960123]
 [ 401984.70437928]
 [ 282298.13757341]
 [ 566864.3872713 ]
 [ 716061.67707035]
 [1453574.57884182]
 [ 674016.0302508 ]
 [1056802.51602727]


### Calculate RMSE, R-Square

In [18]:
from sklearn.metrics import mean_squared_error, r2_score
import math

In [19]:
print(f"R-Square: {r2_score(y_test, y_pred):.2f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"RMSE: {math.sqrt(mean_squared_error(y_test, y_pred)):.2f}")

R-Square: 0.50
MSE: 137746947725.80
RMSE: 371142.76


## Validation case scenario:
#### 1. Predict how much will be the car selling price for a car of 
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

** 465000 ** actual


In [20]:
x_test.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
3520,2006,60000,0,0,0,1,0,0,1,0,1,0,0
1063,2018,25000,1,0,0,0,0,0,1,0,0,0,0
1248,2006,120000,1,0,0,0,1,0,1,0,0,0,1
583,2005,56580,0,0,0,1,0,0,1,0,0,0,0
1189,2014,25000,0,0,0,1,1,0,1,0,1,0,0


In [23]:
regressor.predict([[2014, 70000, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]])

array([[810663.29247583]])

The predicted value was 810,663. The actual value was 465000.