In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load seaborn's `mpg` dataset and discard the `name` column in one chained
# expression, so re-running the cell always rebuilds `df` from scratch.
df = sns.load_dataset('mpg').drop(columns="name")
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [3]:
# Count the missing values in each column.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [4]:
# Median of `horsepower`; pandas skips missing entries by default.
df['horsepower'].median()

93.5

In [5]:
# Replace the missing `horsepower` entries with the column median.
# NOTE(review): the median is taken over the whole frame, i.e. before the
# train/test split performed further down.
horsepower_median = df['horsepower'].median()
df['horsepower'] = df['horsepower'].fillna(horsepower_median)

In [6]:
# Display the frame again after `horsepower` has been filled.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [7]:
# Re-count the missing values per column to confirm none remain.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [8]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [9]:
# List the distinct values of the `origin` column.
df['origin'].unique()

array(['usa', 'japan', 'europe'], dtype=object)

In [10]:
# Number of rows for each `origin` value.
df['origin'].value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

In [11]:
# Encode the categorical `origin` column as integer codes so it can be passed
# to the regressors below.
# NOTE(review): the codes impose an arbitrary order (usa < japan < europe) on a
# nominal feature; one-hot encoding may be more appropriate -- confirm.
origin_codes = {"usa": 1, "japan": 2, "europe": 3}
# Look each value up with a pass-through default so that re-running this cell
# is a no-op. A plain `.map(origin_codes)` would turn the already-encoded
# 1/2/3 into NaN on a second run and break every later `fit`.
df['origin'] = df['origin'].map(lambda value: origin_codes.get(value, value))

In [12]:
# Display the frame with `origin` now holding integer codes.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1
394,44.0,4,97.0,52.0,2130,24.6,82,3
395,32.0,4,135.0,84.0,2295,11.6,82,1
396,28.0,4,120.0,79.0,2625,18.6,82,1


In [13]:
# Separate the target (`mpg`) from the remaining predictor columns.
y = df['mpg']
X = df.drop(columns='mpg')

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [15]:
from sklearn.linear_model import LinearRegression
# Plain linear regression with default settings; the regularised models further
# down are scored against the same test split.
regression_model = LinearRegression()
# Bare expression so the notebook renders the (still unfitted) estimator.
regression_model

In [16]:
# Fit on the training split only; X_test / y_test stay untouched for scoring.
regression_model.fit(X_train, y_train)

In [18]:
# Fitted coefficients, one per column of X_train and in the same order.
regression_model.coef_

array([-0.31761423,  0.02623748, -0.01827076, -0.00748775,  0.05040673,
        0.84709514,  1.51909584])

In [17]:
# Pair every feature name with its fitted coefficient and print one line each.
for col_name, coefficient in zip(X_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coefficient}")

The coefficient for cylinders is -0.3176142302799369
The coefficient for displacement is 0.026237482599078946
The coefficient for horsepower is -0.018270764913124595
The coefficient for weight is -0.007487750398361897
The coefficient for acceleration is 0.0504067346197138
The coefficient for model_year is 0.8470951427061365
The coefficient for origin is 1.5190958387975024


In [19]:
from sklearn.metrics import r2_score

# Score the plain linear model on the held-out split (R^2).
y_pred_linear = regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred_linear)

0.8348001123742285

In [20]:
# Ridge regression
from sklearn.linear_model import Ridge

# `fit` returns the estimator itself, so construction and fitting are chained.
ridge_regression_model = Ridge(alpha=100).fit(X_train, y_train)
for col_name, coefficient in zip(X_train.columns, ridge_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coefficient}")

The coefficient for cylinders is -0.10121103813475338
The coefficient for displacement is 0.01549704723498359
The coefficient for horsepower is -0.015154146537673992
The coefficient for weight is -0.0072516011271230064
The coefficient for acceleration is 0.030640385245588053
The coefficient for model_year is 0.7914981246881716
The coefficient for origin is 0.7240611102907413


In [21]:
from sklearn.metrics import r2_score

# R^2 of the ridge model on the held-out split.
y_pred = ridge_regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8350365150570258

In [22]:
# Lasso regression
from sklearn.linear_model import Lasso

# `fit` returns the estimator itself, so construction and fitting are chained.
lasso_regression_model = Lasso(alpha=0.5).fit(X_train, y_train)
for col_name, coefficient in zip(X_train.columns, lasso_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coefficient}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.006208198888300358
The coefficient for horsepower is -0.011058382987169565
The coefficient for weight is -0.0069826731680230885
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.744654952003819
The coefficient for origin is 0.0


In [23]:
from sklearn.metrics import r2_score

# R^2 of the lasso model on the held-out split.
y_pred = lasso_regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8277934716635555

In [24]:
# Elastic net regression
from sklearn.linear_model import ElasticNet

# `fit` returns the estimator itself, so construction and fitting are chained.
elastic_regression_model = ElasticNet(alpha=0.8, l1_ratio=0.4).fit(X_train, y_train)
for col_name, coefficient in zip(X_train.columns, elastic_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coefficient}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.007991329781571462
The coefficient for horsepower is -0.013891824664720713
The coefficient for weight is -0.006988137750203173
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.736838267738092
The coefficient for origin is 0.20426289739925593


In [25]:
from sklearn.metrics import r2_score

# R^2 of the elastic-net model on the held-out split.
y_pred = elastic_regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8308672338152328

In [26]:
from sklearn.linear_model import LassoCV

# Lasso with 5-fold cross-validation on the training split, then scored (R^2)
# on the held-out split. `r2_score` comes from the import in an earlier cell.
lassocv = LassoCV(cv=5).fit(X_train, y_train)
y_pred = lassocv.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8082805983844751