In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = sns.load_dataset('mpg')

In [4]:
df.head(4)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst


In [5]:
df.drop('name' , axis = 1 , inplace = True)

In [6]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [7]:
df['horsepower'].median()

93.5

In [10]:
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

In [11]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [14]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
dtype: object

In [16]:
df['origin'].value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

In [17]:
df['origin'] =  df['origin'].map({'usa' : 1 , 'japan' : 2, 'europe' : 3})

In [18]:
df['origin']

0      1
1      1
2      1
3      1
4      1
      ..
393    1
394    3
395    1
396    1
397    1
Name: origin, Length: 398, dtype: int64

In [19]:
df['origin'] = df['origin'].astype(int)

In [21]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin            int32
dtype: object

In [22]:
# Separate X and Y 
X = df.drop('mpg', axis = 1)
y = df['mpg']

In [23]:
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,1
1,8,350.0,165.0,3693,11.5,70,1
2,8,318.0,150.0,3436,11.0,70,1
3,8,304.0,150.0,3433,12.0,70,1
4,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1
394,4,97.0,52.0,2130,24.6,82,3
395,4,135.0,84.0,2295,11.6,82,1
396,4,120.0,79.0,2625,18.6,82,1


In [24]:
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

# Train test Display 

In [25]:
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test  = train_test_split(X,y, test_size = 0.3, random_state = 1)

In [32]:
X_train.shape, X_test.shape

((278, 7), (120, 7))

In [35]:
# Simple Linear Regression model
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()

In [36]:
regression_model

In [37]:
regression_model.fit(X_train, y_train)

In [42]:
for i, col_name in enumerate(X_train.columns):
    print (i, col_name)

0 cylinders
1 displacement
2 horsepower
3 weight
4 acceleration
5 model_year
6 origin


In [47]:
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient for {col_name} is {regression_model.coef_[i]}") 

The coefficient for cylinders is -0.3176142302799288
The coefficient for displacement is 0.026237482599078942
The coefficient for horsepower is -0.018270764913124453
The coefficient for weight is -0.007487750398361909
The coefficient for acceleration is 0.050406734619714344
The coefficient for model_year is 0.8470951427061375
The coefficient for origin is 1.5190958387975046


In [None]:
# Coefficient are relatively smaller, if oneindependent variable changes
# slightly there will be not much difference in prediction.
# This is sometime is called smoother model.

In [52]:
from sklearn.metrics import r2_score

y_pred_linear = regression_model.predict(X_test)
r2_linear = r2_score(y_test, y_pred_linear)


print(f"R square of linear regression: {r2_linear}")

R square of linear regression: 0.8348001123742285


In [54]:
# Ridge Regression 
from sklearn.linear_model import Ridge
ridge_regression_model = Ridge(alpha = 0.1)
ridge_regression_model.fit(X_train, y_train)

for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient for {col_name} is {ridge_regression_model.coef_[i]}") 



The coefficient for cylinders is -0.317003210100651
The coefficient for displacement is 0.026213249757982896
The coefficient for horsepower is -0.01826325248144899
The coefficient for weight is -0.007487326050213115
The coefficient for acceleration is 0.05036896947442574
The coefficient for model_year is 0.8470062938903152
The coefficient for origin is 1.5174528285653759


# Ridge Regression Evaluation Metrics

In [58]:
from sklearn.metrics import r2_score

y_pred_ridge = ridge_regression_model.predict(X_test)
r2_ridge = r2_score(y_test, y_pred_ridge) 

print(f"R-square of Ridge regression: {r2_ridge}")

R-square of Ridge regression: 0.8348084889168358


In [59]:
# We dont see much variation in coeff of ridge regression as compared to linear regression 

In [63]:
from sklearn.linear_model import Lasso
lasso_regression_model = Lasso(alpha = 0.5)
lasso_regression_model.fit(X_train, y_train)

for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient for {col_name} is {ridge_regression_model.coef_[i]}")

The coefficient for cylinders is -0.317003210100651
The coefficient for displacement is 0.026213249757982896
The coefficient for horsepower is -0.01826325248144899
The coefficient for weight is -0.007487326050213115
The coefficient for acceleration is 0.05036896947442574
The coefficient for model_year is 0.8470062938903152
The coefficient for origin is 1.5174528285653759


In [64]:
# Features coefficients are 0, Lasso helps in feature selection 

In [66]:
from sklearn.metrics import r2_score

y_pred_lasso = lasso_regression_model.predict(X_test)
r2_lasso = r2_score(y_test, y_pred_lasso) 

print(f"R-square of Lasso Regression: {r2_lasso}")

R-square of Lasso Regression: 0.8277934716635554


In [68]:
from sklearn.linear_model import ElasticNet 
elastic_net_model = ElasticNet(alpha = 1, l1_ratio = 0.5)
elastic_net_model.fit(X_train, y_train)

In [69]:
for i, col_name in enumerate(X_train.columns):
    print(f"The coefficient for {col_name} is {elastic_net_model.coef_[i]}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.0058888699536675535
The coefficient for horsepower is -0.012403874933570123
The coefficient for weight is -0.00693455051625763
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.7133150744603872
The coefficient for origin is 0.0


In [73]:
y_pred_elastic_net = elastic_net_model.predict(X_test)
r2_elastic_net = r2_score(y_test, y_pred_elastic_net)
print(f"R-Squared score for Elastic Net Regression:{r2_elastic_net}")

R-Squared score for Elastic Net Regression:0.8284840073256805


# Cross Validation Implementation

In [74]:
from sklearn.linear_model import LassoCV
lassocv = LassoCV(cv=5)
lassocv.fit(X_train, y_train)
y_pred =  lassocv.predict(X_test)
score = r2_score(y_test, y_pred)
print("R2 Score", score)

R2 Score 0.808280598384475


In [75]:
from sklearn.linear_model import RidgeCV
ridgecv = RidgeCV(cv=5)
ridgecv.fit(X_train, y_train)
y_pred =  ridgecv.predict(X_test)
score = r2_score(y_test, y_pred)
print("R2 Score", score)

R2 Score 0.8354145247502054


In [76]:
ridgecv.get_params()

{'alpha_per_target': False,
 'alphas': (0.1, 1.0, 10.0),
 'cv': 5,
 'fit_intercept': True,
 'gcv_mode': None,
 'scoring': None,
 'store_cv_values': False}

In [79]:
from sklearn.linear_model import ElasticNetCV
elasticnetcv = ElasticNetCV(cv=5)
elasticnetcv.fit(X_train, y_train)
y_pred =  elasticnetcv.predict(X_test)
score = r2_score(y_test, y_pred)
print("R2 Score", score)

R2 Score 0.792863401804916


In [80]:
elasticnetcv.get_params()

{'alphas': None,
 'copy_X': True,
 'cv': 5,
 'eps': 0.001,
 'fit_intercept': True,
 'l1_ratio': 0.5,
 'max_iter': 1000,
 'n_alphas': 100,
 'n_jobs': None,
 'positive': False,
 'precompute': 'auto',
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'verbose': 0}