In [1]:
# 1 Mile is 1.6 KM
# 1 Gallon is 3.7 km

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [57]:
df = sns.load_dataset('mpg')

In [58]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [59]:
df.drop('name', axis = 1, inplace = True)


In [60]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [62]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [63]:
df.shape

(398, 8)

In [64]:
# Since we have not done the outlier treatment
# Then better answer would be to replace the Null Values with the
# Median Values

df['horsepower'].median()

93.5

In [65]:
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

In [66]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [68]:
df['origin'].value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

In [69]:
# Since it is a Categorical Data
# I will be converting the Categorical Feature into Numerical Feature
# Which is called as Data Encoding

In [70]:
df['origin'] = df['origin'].map({
    "usa" : 1,
    "japan" : 2,
    "europe" : 3
})

In [71]:
df['origin'].head()

0    1
1    1
2    1
3    1
4    1
Name: origin, dtype: int64

In [72]:
# Seperate the data into X and Y (Dependent & Independent Variables)

In [73]:
x = df.drop('mpg', axis = 1)
y = df['mpg']

In [74]:
x

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,1
1,8,350.0,165.0,3693,11.5,70,1
2,8,318.0,150.0,3436,11.0,70,1
3,8,304.0,150.0,3433,12.0,70,1
4,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1
394,4,97.0,52.0,2130,24.6,82,3
395,4,135.0,84.0,2295,11.6,82,1
396,4,120.0,79.0,2625,18.6,82,1


In [75]:
# Now Splitting data into Training and Testing

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)


In [76]:
x_train.shape

(278, 7)

In [77]:
x_test.shape

(120, 7)

In [78]:
# Simple Linear Regression Model

In [79]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [80]:
model

In [81]:
x.isna().sum()

cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [82]:
model.fit(x_train, y_train)

In [83]:
model.coef_

array([-0.31761423,  0.02623748, -0.01827076, -0.00748775,  0.05040673,
        0.84709514,  1.51909584])

In [86]:
for i, col_name in enumerate(x_train.columns):
    print(f"Coeff for {col_name} : {model.coef_[i]}")

Coeff for cylinders : -0.31761423027992697
Coeff for displacement : 0.026237482599078894
Coeff for horsepower : -0.018270764913124512
Coeff for weight : -0.0074877503983619064
Coeff for acceleration : 0.05040673461971419
Coeff for model_year : 0.8470951427061366
Coeff for origin : 1.519095838797505


In [87]:
# Coefficients are Some what smaller, If one independent Variable changes
# There will not be much of a Difference in the Prediction
# This is sometime called as smoother model

# These features might not be contributing to the model Training

In [92]:
from sklearn.metrics import r2_score

y_pred = model.predict(x_test)
print(f"R2 Score : {r2_score(y_test, y_pred)}")

R2 Score : 0.8348001123742286


In [93]:
# Regularized Model

In [94]:
from sklearn.linear_model import Ridge
ridge_model = Ridge(alpha = 0.1)
ridge_model

In [95]:
ridge_model.fit(x_train, y_train)

In [96]:
for i, col_name in enumerate(x_train.columns):
    print(f"Coeff for {col_name} : {ridge_model.coef_[i]}")

Coeff for cylinders : -0.317003210100659
Coeff for displacement : 0.026213249757982997
Coeff for horsepower : -0.01826325248144963
Coeff for weight : -0.0074873260502130775
Coeff for acceleration : 0.05036896947442412
Coeff for model_year : 0.8470062938903169
Coeff for origin : 1.517452828565388


In [99]:
# Now for the Ridge Regression Model Evaluation
y_ridge_pred = ridge_model.predict(x_test)

In [101]:
print(f"R2 Score Linear Regression: {r2_score(y_test, y_pred)}")
print(f"R2 Score Ridge Linear Regression: {r2_score(y_ridge_pred, y_test)}")


R2 Score Linear Regression: 0.8348001123742286
R2 Score Ridge Linear Regression: 0.8161948884653054


In [102]:
# We dont see much Variation in the coeff of 
# Ridge Regression as compared to Lienar Regression

from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha = 0.5)
lasso_model


In [103]:
lasso_model.fit(x_train, y_train)

In [104]:
for i, col_name in enumerate(x_train.columns):
    print(f"Coeff for {col_name} : {lasso_model.coef_[i]}")

Coeff for cylinders : -0.0
Coeff for displacement : 0.006208198888300369
Coeff for horsepower : -0.01105838298716959
Coeff for weight : -0.006982673168023089
Coeff for acceleration : 0.0
Coeff for model_year : 0.744654952003819
Coeff for origin : 0.0


In [105]:
# Helps in the Feature Selection

In [106]:
y_lasso_pred = lasso_model.predict(x_test)

In [107]:
print(f"R2 Score Linear Regression: {r2_score(y_test, y_pred)}")
print(f"R2 Score Ridge Linear Regression: {r2_score(y_ridge_pred, y_test)}")
print(f"R2 Score Lasso Linear Regression: {r2_score(y_lasso_pred, y_test)}")


R2 Score Linear Regression: 0.8348001123742286
R2 Score Ridge Linear Regression: 0.8161948884653054
R2 Score Lasso Linear Regression: 0.8011805866559626


In [113]:
# Elastic Net Regression

from sklearn.linear_model import ElasticNet
elastic_model = ElasticNet(alpha = 1, l1_ratio = 0.5)
elastic_model

In [114]:
elastic_model.fit(x_train, y_train)

In [115]:
for i, col_name in enumerate(x_train.columns):
    print(f"Coeff for {col_name} : {elastic_model.coef_[i]}")

Coeff for cylinders : -0.0
Coeff for displacement : 0.0058888699536675465
Coeff for horsepower : -0.012403874933570107
Coeff for weight : -0.00693455051625763
Coeff for acceleration : 0.0
Coeff for model_year : 0.7133150744603873
Coeff for origin : 0.0


In [116]:
y_elastic_pred = elastic_model.predict(x_test)

In [117]:
print(f"R2 Score Linear Regression: {r2_score(y_test, y_pred)}")
print(f"R2 Score Ridge Linear Regression: {r2_score(y_ridge_pred, y_test)}")
print(f"R2 Score Lasso Linear Regression: {r2_score(y_lasso_pred, y_test)}")
print(f"R2 Score Elastic Net Linear Regression: {r2_score(y_elastic_pred, y_test)}")


R2 Score Linear Regression: 0.8348001123742286
R2 Score Ridge Linear Regression: 0.8161948884653054
R2 Score Lasso Linear Regression: 0.8011805866559626
R2 Score Elastic Net Linear Regression: 0.7998978137311015
