**Importing the Libraries**

In [1]:
# Loading some basic libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns   # this is for visualisation
import csv

In [2]:
# Import some machine learning Libraries

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

In [3]:
# Suppress warnings to keep the notebook output clean.
# NOTE: called without a category, this filter silences every warning for the
# rest of the session, including deprecation warnings from the libraries used.
import warnings
warnings.filterwarnings('ignore')

**Read the CSV File**

In [4]:
# Load the data; the dataset was downloaded from the UCI Machine Learning Repository.
# The path is relative, so 'auto-mpg.csv' must sit in the notebook's working directory.
dataset = pd.read_csv('auto-mpg.csv')
dataset


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


**Goal is to predict the fuel efficiency. This is a Linear Regression Problem.**

**Checking the Shape of the Data**

In [5]:
dataset.shape

(398, 9)

**Checking the Data Information**

In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


MPG is the dependent variable.

**Checking the null values**

In [None]:
dataset.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

**Data Cleaning**

There is an abnormality in the data: some `horsepower` entries contain `?` instead of a number.

In [None]:
# Show the rows whose horsepower entry is not made up purely of digits
dataset.loc[dataset['horsepower'].str.isdigit().eq(False)]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,?,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,?,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,?,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,?,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl


**Replace this question mark with nan**

In [None]:
# Swap the '?' entries for NaN so that pandas counts them as missing values
dataset['horsepower'] = dataset['horsepower'].replace({'?': np.nan})

In [None]:
dataset.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

Now we have 6 null values; to handle them, we fill them with the median value of horsepower.

**Now, we fill the null values with the median of horsepower**

In [None]:
# horsepower is still an object (string) column at this point, so the median
# is taken from a numeric view of it: newer pandas releases refuse to compute
# the median of string data. Only the gaps are filled here; the column's dtype
# is left unchanged.
horsepower_median = pd.to_numeric(dataset['horsepower'], errors='coerce').median()
dataset['horsepower'] = dataset['horsepower'].fillna(horsepower_median)

In [None]:
dataset.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

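*The missing values are now filled, but `horsepower` is still stored as text and must be converted to a numeric type before fitting the linear regression.*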

**Checking the Data Types**

In [None]:
dataset.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

***We cannot give the object datatype to the algorithm; we need to convert it to numbers!***

In [None]:
# Cast horsepower from object to float64 so it can be used as a numeric feature
dataset['horsepower'] = dataset['horsepower'].astype(np.float64)

In [None]:
dataset.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

**Correlation of Various Features**

In [None]:
# Correlate the numeric columns only: the text column 'car name' is still in
# the frame here, and pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr() (it raises instead). The correlation matrix is
# symmetric, so no transpose is needed.
dataset.select_dtypes(include='number').corr()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
mpg,1.0,-0.775396,-0.804203,-0.773453,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.841284,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.895778,0.932824,-0.543684,-0.370164,-0.609409
horsepower,-0.773453,0.841284,0.895778,1.0,0.862442,-0.68659,-0.413733,-0.452096
weight,-0.831741,0.896017,0.932824,0.862442,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.68659,-0.417457,1.0,0.288137,0.205873
model year,0.579267,-0.348746,-0.370164,-0.413733,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.452096,-0.581024,0.205873,0.180662,1.0


**Data pre-processing**

In [None]:
# Remove the text column 'car name'; only the numeric columns are kept from here on
dataset = dataset.drop(columns=['car name'])

In [None]:
dataset.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1
5,15.0,8,429.0,198.0,4341,10.0,70,1
6,14.0,8,454.0,220.0,4354,9.0,70,1
7,14.0,8,440.0,215.0,4312,8.5,70,1
8,14.0,8,455.0,225.0,4425,10.0,70,1
9,15.0,8,390.0,190.0,3850,8.5,70,1


In [None]:
dataset.shape

(398, 8)

**Separate the dependent and independent features**

In [None]:
# Target: mpg, kept as a one-column DataFrame (double brackets) rather than a Series
y = dataset[['mpg']]
# Features: every remaining column
x = dataset.drop(columns=['mpg'])

In [None]:
from sklearn import preprocessing

# Standardise every column of the features and of the target.
# preprocessing.scale returns a bare NumPy array, so each result is wrapped
# back into a DataFrame to keep the original column labels.
# NOTE(review): the scaling is computed on the full x / y, before the
# train/test split, so the held-out rows contribute to the scaling statistics.
x_scaled = pd.DataFrame(preprocessing.scale(x), columns=x.columns)
y_scaled = pd.DataFrame(preprocessing.scale(y), columns=y.columns)


**Train-Test split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    y_scaled,
    test_size=0.30,
    random_state=1,
)

In [None]:
# Report the shape of each piece of the split
for split_name, split in (
    ('x_train', x_train),
    ('x_test', x_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(split_name, split.shape)

x_train (278, 7)
x_test (120, 7)
y_train (278, 1)
y_test (120, 1)


**Build a Linear Regression Model**

In [None]:
# Ordinary least squares on the standardised training data
regression_model = LinearRegression()
regression_model.fit(x_train, y_train)

# y_train has a single column, so coef_ is 2-D with one row; pair that row
# with the feature names.
for col_name, coef in zip(x_train.columns, regression_model.coef_[0]):
    print("The co-ef for {} is {}".format(col_name, coef))

The co-ef for cylinders is -0.08561436895562706
The co-ef for displacement is 0.3044182253593021
The co-ef for horsepower is -0.09718466302484209
The co-ef for weight is -0.7628632829136761
The co-ef for acceleration is 0.021591275172924747
The co-ef for model year is 0.3749408074118709
The co-ef for origin is 0.12302637024556841


The coefficients are small, so the model is not complex.

In [None]:
# The model was fit on a one-column 2-D target, so intercept_ is an array;
# element 0 is the intercept for mpg.
intercept = regression_model.intercept_[0]
print("Our model intercept is {}".format(intercept))

Our model intercept is 0.012833128293992874


**Ridge Regression**

In [None]:
# Ridge regression; alpha sets the strength of the penalty
ridge = Ridge(alpha=.3)
ridge.fit(x_train, y_train)

# As with the plain linear model, coef_ is 2-D with one row for the single target
for col, coef in zip(x_train.columns, ridge.coef_[0]):
    print("Ridge model coeff for {} is : {}".format(col, coef))

Ridge model coeff for cylinders is : -0.08073001909555927
Ridge model coeff for displacement is : 0.28822207417231654
Ridge model coeff for horsepower is : -0.09985675156606108
Ridge model coeff for weight is : -0.7510550916024499
Ridge model coeff for acceleration is : 0.019127698314832926
Ridge model coeff for model year is : 0.37378312489584287
Ridge model coeff for origin is : 0.1221271728580029


Not much difference in the coefficients compared with the plain linear regression.

**Lasso Regression**

In [None]:
# Lasso regression; alpha sets the strength of the penalty
lasso = Lasso(alpha=.1)
lasso.fit(x_train, y_train)

# coef_ is indexed as a flat array here (one entry per feature), so it is
# paired directly with the column labels.
for col, coef in zip(x_train.columns, lasso.coef_):
    print("Lasso model coeff for {} is : {}".format(col, coef))

Lasso model coeff for cylinders is : -0.0
Lasso model coeff for displacement is : -0.0
Lasso model coeff for horsepower is : -0.013280002937314536
Lasso model coeff for weight is : -0.6205207866794482
Lasso model coeff for acceleration is : 0.0
Lasso model coeff for model year is : 0.29198732924913484
Lasso model coeff for origin is : 0.021567653979880638


Lasso reduces several of the coefficients (cylinders, displacement, acceleration) to exactly zero.

**Compare the score**

In [None]:
# Score of the linear model on the training split, then on the held-out split
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(regression_model.score(features, target))

0.8081802739111359
0.8472274567567306


In [None]:
# Score of the ridge model on the training split, then on the held-out split
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(ridge.score(features, target))

0.8081651504849107
0.8475401122140553


In [None]:
# Score of the lasso model on the training split, then on the held-out split
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(lasso.score(features, target))

0.7853770917055521
0.8277658025171161
