In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides deprecation and solver warnings raised by the
# libraries used below — confirm that suppressing all of them is intended.
warnings.filterwarnings('ignore')

In [5]:
# Load seaborn's "mpg" sample dataset and discard the free-text `name` column
# in a single chained expression (no in-place mutation).
df = sns.load_dataset('mpg').drop(columns="name")
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [6]:
# Count the missing values in every column.
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [7]:
# Median horsepower; this is the value used to impute the gaps in the next cell.
df.horsepower.median()

93.5

In [8]:
# Fill the missing horsepower entries with the column median.
# NOTE(review): the median is computed over all rows, i.e. before the
# train/test split performed later in the notebook.
df.loc[df['horsepower'].isna(), 'horsepower'] = df['horsepower'].median()

In [9]:
# Display the frame after the horsepower imputation.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [10]:
# Re-check the per-column missing-value counts after imputation.
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [11]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [12]:
# Distinct labels present in the `origin` column.
df['origin'].unique()

array(['usa', 'japan', 'europe'], dtype=object)

In [13]:
# Number of rows per `origin` label.
df['origin'].value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

In [14]:
# Encode the categorical `origin` column as integer codes.
#
# `Series.map` converts every value that is missing from the mapping to NaN,
# so re-running this cell on the already-encoded column would silently turn
# the whole column into NaN. Encode only while the column is still
# non-numeric, and fail loudly if a label is not covered by the mapping.
#
# NOTE(review): the codes 1/2/3 impose an arbitrary order on a nominal
# feature; one-hot encoding may suit the linear models fitted below better —
# confirm before relying on the `origin` coefficient.
origin_codes = {"usa": 1, "japan": 2, "europe": 3}
if not pd.api.types.is_numeric_dtype(df['origin']):
    origin_encoded = df['origin'].map(origin_codes)
    unmapped_labels = df.loc[origin_encoded.isna(), 'origin'].unique()
    if len(unmapped_labels):
        raise ValueError(f"Unmapped origin labels: {list(unmapped_labels)}")
    df['origin'] = origin_encoded.astype('int64')

In [15]:
# Display the frame after `origin` has been converted to integer codes.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1
394,44.0,4,97.0,52.0,2130,24.6,82,3
395,32.0,4,135.0,84.0,2295,11.6,82,1
396,28.0,4,120.0,79.0,2625,18.6,82,1


In [16]:
# Separate the regression target (`mpg`) from the predictor columns.
y = df['mpg']
X = df.drop(columns='mpg')

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [18]:
from sklearn.linear_model import LinearRegression
# Create a linear regressor with default settings; it is fitted in the next cell.
regression_model = LinearRegression()
# Bare expression so the notebook renders the estimator.
regression_model

In [19]:
# Fit the linear model on the training split.
regression_model.fit(X_train, y_train)

In [20]:
# Fitted coefficients; the next cell pairs them by position with X_train's columns.
regression_model.coef_

array([-0.31761423,  0.02623748, -0.01827076, -0.00748775,  0.05040673,
        0.84709514,  1.51909584])

In [21]:
# Report each fitted coefficient next to the feature it belongs to.
for col_name, coef in zip(X_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.3176142302799369
The coefficient for displacement is 0.026237482599078946
The coefficient for horsepower is -0.018270764913124595
The coefficient for weight is -0.007487750398361897
The coefficient for acceleration is 0.0504067346197138
The coefficient for model_year is 0.8470951427061365
The coefficient for origin is 1.5190958387975024


In [22]:
from sklearn.metrics import r2_score

# R^2 of the plain linear model on the held-out split.
y_pred_linear = regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred_linear)

0.8348001123742285

In [23]:
# Ridge regression (L2-penalised linear model)

from sklearn.linear_model import Ridge

# NOTE(review): the features are passed in unscaled (no scaling step appears
# earlier in this notebook), so the penalty weighs columns with different
# units unevenly — confirm this is intended.
ridge_regression_model = Ridge(alpha=100).fit(X_train, y_train)
for col_name, coef in zip(X_train.columns, ridge_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.10121103813475338
The coefficient for displacement is 0.01549704723498359
The coefficient for horsepower is -0.015154146537673992
The coefficient for weight is -0.0072516011271230064
The coefficient for acceleration is 0.030640385245588053
The coefficient for model_year is 0.7914981246881716
The coefficient for origin is 0.7240611102907413


In [24]:
from sklearn.metrics import r2_score

# R^2 of the ridge model on the held-out split.
y_pred = ridge_regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8350365150570258

In [25]:
# Lasso regression (L1-penalised linear model)

from sklearn.linear_model import Lasso

# NOTE(review): as with the ridge cell, the features are unscaled when the
# penalty is applied — confirm this is intended.
lasso_regression_model = Lasso(alpha=0.5).fit(X_train, y_train)
for col_name, coef in zip(X_train.columns, lasso_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.006208198888300358
The coefficient for horsepower is -0.011058382987169565
The coefficient for weight is -0.0069826731680230885
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.744654952003819
The coefficient for origin is 0.0


In [26]:
from sklearn.metrics import r2_score

# R^2 of the lasso model on the held-out split.
y_pred = lasso_regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8277934716635555

In [27]:
# Show the predictions currently bound to y_pred (assigned from the lasso model in the preceding cell).
y_pred 

array([22.69308712, 25.44502578, 20.1201592 , 25.6877054 , 24.16568459,
       15.29124546, 28.89478105, 33.56080467, 17.18003536, 10.81844918,
       31.03098259, 17.07781142, 22.23820371, 26.4095746 , 35.6028448 ,
       22.64394305,  9.97914351, 21.31421487,  7.94669506, 32.22296702,
       25.4356543 , 31.22301038, 22.05467198, 25.31778893, 26.2369358 ,
       28.52843209, 31.44019952, 31.82300721, 16.06063293, 30.65392731,
       28.09631905,  9.8002893 , 20.28800132, 27.68011881, 25.86153075,
       13.65982769, 28.27396319,  7.81651456, 32.63699742, 25.08560397,
       26.18194574, 25.67744208, 20.95695191, 32.54986101, 27.46815162,
       22.44593828, 20.88416017, 11.81077144, 28.15387812, 19.16888385,
       24.57313398, 27.5195837 , 16.53976433, 12.01758536, 30.23126578,
       24.91494799,  9.31892902, 12.96216784, 30.14319044, 34.85611133,
       34.76087733, 34.85611133, 18.22701814, 28.83818661, 19.58352668,
       32.06083487, 28.2013323 , 26.90989689, 30.5307212 , 12.78

In [28]:
# Elastic net (combined L1 + L2 penalty; l1_ratio sets the mix)
from sklearn.linear_model import ElasticNet

# NOTE(review): features are unscaled here as well — confirm this is intended.
elastic_regression_model = ElasticNet(alpha=0.8, l1_ratio=0.4).fit(X_train, y_train)
for col_name, coef in zip(X_train.columns, elastic_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.007991329781571462
The coefficient for horsepower is -0.013891824664720713
The coefficient for weight is -0.006988137750203173
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.736838267738092
The coefficient for origin is 0.20426289739925593


In [29]:
from sklearn.metrics import r2_score

# R^2 of the elastic-net model on the held-out split.
y_pred = elastic_regression_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8308672338152328

In [30]:
from sklearn.linear_model import LassoCV

# Lasso whose penalty strength is chosen by 5-fold cross-validation on the
# training split, then scored (R^2) on the held-out split.
# Relies on `r2_score` imported in an earlier cell.
lassocv = LassoCV(cv=5).fit(X_train, y_train)
y_pred = lassocv.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8082805983844751

In [None]:
# Predictions of the fitted linear model on the training rows.
regression_model.predict(X_train)

In [32]:
import pickle

# Persist the fitted linear model and the training features so they can be
# reloaded later. The context managers guarantee that each file is flushed
# and closed as soon as its dump finishes; a bare open() leaves the handle
# open until it happens to be garbage-collected.
with open("regression_model.pkl", "wb") as model_file:
    pickle.dump(regression_model, model_file)
with open("X_train.pkl", "wb") as features_file:
    pickle.dump(X_train, features_file)

In [2]:
import pickle

# Reload the artifacts written by the previous cell and predict with the
# restored model. Files are opened through context managers so the handles
# are closed deterministically instead of being left to the garbage collector.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# pickle files that this notebook (or another trusted source) produced.
with open("regression_model.pkl", "rb") as model_file:
    regression_model_from_pkl = pickle.load(model_file)
with open("X_train.pkl", "rb") as features_file:
    X_train = pickle.load(features_file)
regression_model_from_pkl.predict(X_train)

array([31.54753543, 27.0578835 , 22.47424124, 15.26127428, 34.38558269,
       33.22769161, 12.66468801, 31.65184707, 28.86032715, 29.29805258,
       29.62492446, 25.48789259, 10.02197588, 33.80166701, 24.83805413,
       13.61678217, 29.35368782, 16.74448452, 31.58020152, 22.20328394,
       16.45386704, 21.41146049, 11.48833394, 25.80667767, 26.59234957,
       24.28498262, 13.55125233, 21.26424309, 19.28396933, 22.749538  ,
       15.23697238,  9.87750757, 15.15655021, 32.6495694 , 29.3015285 ,
       11.42645116, 11.29393847, 25.37995474, 23.33290739, 17.84807076,
       20.46385247, 35.42564486, 28.39035288, 29.99738962,  6.27464876,
       15.91514038, 17.28659067, 28.80343231, 33.71325339, 20.77373126,
       31.54169488, 34.4567098 , 23.02010589, 21.55198433, 28.87468303,
       31.84998838, 21.52873564, 15.68771888, 24.65484857, 31.01792721,
       33.97980796, 16.97654248, 27.99166263, 20.66462305, 28.49600655,
       29.81796064, 12.07883805, 25.1551553 ,  5.45200933, 28.45