# Ridge and Lasso method

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import sklearn

In [2]:
mpg_df = pd.read_csv("auto-mpg.csv")
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [3]:
mpg_df['origin'].value_counts()

origin
1    249
3     79
2     70
Name: count, dtype: int64

In [4]:
mpg_df = mpg_df.drop('car name',axis=1)
mpg_df['origin'] = mpg_df['origin'].replace({1 : 'america' , 2: 'europe', 3: 'asia'})
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,america
1,15.0,8,350.0,165,3693,11.5,70,america
2,18.0,8,318.0,150,3436,11.0,70,america
3,16.0,8,304.0,150,3433,12.0,70,america
4,17.0,8,302.0,140,3449,10.5,70,america


In [5]:
# one hot encoder
mpg_df = pd.get_dummies(mpg_df , columns = ['origin'], dtype=int)
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,1,0,0


In [6]:
# # one hot encoder
# mpg_df = pd.get_dummies(mpg_df , columns = ['horsepower'], dtype=int)
# mpg_df.head()

In [7]:
mpg_df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin_america,origin_asia,origin_europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0,0.0,0.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0,1.0,1.0


In [8]:
# mpg_df = mpg_df.replace('?', np.nan)
# mpg_df = mpg_df.apply(lambda x : x.fillna(x.median()),axis = 0)

# # mpg_df = mpg_df.apply(lambda x : x.fillna(x.median()),axis = 0)

mpg_df = mpg_df.replace('?', np.nan)

num_cols = mpg_df.select_dtypes(include='number').columns
mpg_df[num_cols] = mpg_df[num_cols].apply(
    lambda x: x.fillna(x.median())
)


In [9]:
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mpg             398 non-null    float64
 1   cylinders       398 non-null    int64  
 2   displacement    398 non-null    float64
 3   horsepower      392 non-null    object 
 4   weight          398 non-null    int64  
 5   acceleration    398 non-null    float64
 6   model year      398 non-null    int64  
 7   origin_america  398 non-null    int64  
 8   origin_asia     398 non-null    int64  
 9   origin_europe   398 non-null    int64  
dtypes: float64(3), int64(6), object(1)
memory usage: 31.2+ KB


In [10]:
# mpg_df['horsepower'].value_counts()

In [11]:
# split the data into independent and dependent variable.
x = mpg_df.drop('mpg',axis = 1)
y = mpg_df[['mpg']]

In [12]:
x

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
0,8,307.0,130,3504,12.0,70,1,0,0
1,8,350.0,165,3693,11.5,70,1,0,0
2,8,318.0,150,3436,11.0,70,1,0,0
3,8,304.0,150,3433,12.0,70,1,0,0
4,8,302.0,140,3449,10.5,70,1,0,0
...,...,...,...,...,...,...,...,...,...
393,4,140.0,86,2790,15.6,82,1,0,0
394,4,97.0,52,2130,24.6,82,0,0,1
395,4,135.0,84,2295,11.6,82,1,0,0
396,4,120.0,79,2625,18.6,82,1,0,0


In [13]:
y

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0
...,...
393,27.0
394,44.0
395,32.0
396,28.0


In [14]:
# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled = pd.DataFrame(x_scaled, columns=x.columns)

In [15]:
x_scaled.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin_america,origin_asia,origin_europe
0,1.498191,1.090604,0.664133,0.63087,-1.295498,-1.627426,0.773559,-0.497643,-0.461968
1,1.498191,1.503514,1.574594,0.854333,-1.477038,-1.627426,0.773559,-0.497643,-0.461968
2,1.498191,1.196232,1.184397,0.55047,-1.658577,-1.627426,0.773559,-0.497643,-0.461968
3,1.498191,1.061796,1.184397,0.546923,-1.295498,-1.627426,0.773559,-0.497643,-0.461968
4,1.498191,1.042591,0.924265,0.565841,-1.840117,-1.627426,0.773559,-0.497643,-0.461968


In [16]:
# split the data into test and train
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scaled,y,test_size = 0.2, random_state = 0)

In [17]:
x_train = x_train.fillna(x_train.median())
x_test = x_test.fillna(x_train.median())

In [18]:
from sklearn.linear_model import LinearRegression

# Base model - Linear Regression
regression_model = LinearRegression()
regression_model.fit(x_train, y_train)

for idx, col_name in enumerate(x_train.columns):
    print(
        "The coefficient for {} is {}".format(
            col_name, regression_model.coef_[0][idx]
        )
    )

The coefficient for cylinders is -0.6801762707431223
The coefficient for displacement is 2.625054696462074
The coefficient for horsepower is -0.8047710869668578
The coefficient for weight is -5.6397476721701825
The coefficient for acceleration is 0.5251498674495471
The coefficient for model year is 2.826267845230558
The coefficient for origin_america is -0.7951470266011219
The coefficient for origin_asia is 0.5963244516756391
The coefficient for origin_europe is 0.38602557552956207


In [19]:
intercept = regression_model.intercept_[0]
print(
        "The intercept value for our model is {}".format(intercept)
    )

The intercept value for our model is 23.39272105075148


In [20]:
y_pred_reg_train = regression_model.predict(x_train)
y_pred_reg_test = regression_model.predict(x_test)

In [21]:
from sklearn.metrics import r2_score

In [22]:
print("linear regression training accuracy", r2_score(y_train,y_pred_reg_train))
print("linear regression test accuracy", r2_score(y_test,y_pred_reg_test))

linear regression training accuracy 0.8212578704640939
linear regression test accuracy 0.8273880276812928


# Regularisation with Ridge and lasso

In [23]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [24]:
ridge = Ridge(alpha = 0.3)
ridge.fit(x_train,y_train)
print("ridge model :", (ridge.coef_))

ridge model : [-0.65274626  2.51973645 -0.81228857 -5.56807017  0.511903    2.81900877
 -0.79034949  0.59547942  0.38081236]


In [25]:
print(ridge.score(x_train,y_train))
print(ridge.score(x_test,y_test))

0.8212467625784774
0.8272298895116833


In [26]:
lasso = Lasso(alpha = 0.1)
lasso.fit(x_train,y_train)
print("lasso model :" ,(lasso.coef_))

lasso model : [-0.          0.         -0.35044392 -4.48679158  0.2969366   2.68875359
 -0.89086405  0.1930531   0.        ]


In [27]:
print(lasso.score(x_train,y_train))
print(lasso.score(x_test,y_test))

0.8153980808467995
0.8274402809570914


In [30]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 2, interaction_only = True)

In [31]:
x_scaled = x_scaled.fillna(x_scaled.median())

In [32]:
x_poly = poly.fit_transform(x_scaled)
x_train, x_test, y_train, y_test = train_test_split(x_poly,y,test_size = 0.3, random_state = 1)

In [33]:
x_train.shape

(278, 46)

In [34]:
regression_model.fit(x_train,y_train)
print(regression_model.coef_[0])

[-3.66533396e-13  3.08674097e-02 -5.40788932e-01 -1.93073535e+00
 -5.28856456e+00 -6.14724891e-01  3.06044815e+00  2.02832367e-01
  1.40652041e-01 -4.05192731e-01 -1.82061261e+00  1.88395963e-01
  1.93751225e+00  1.50917544e+00 -1.51340090e+00 -4.39259295e-02
  1.15453574e+00 -1.15373036e+00  3.43675129e-01  1.63304828e+00
 -9.60277024e-01  2.53920406e+00  3.55367935e-01  4.33704459e-01
 -9.06113934e-01 -6.62328037e-01 -2.85315383e-01 -1.52000420e+00
 -6.70902663e-01  4.96001930e-01  3.33193211e-01 -1.45030101e-01
  2.48843897e-01  4.22773438e-01 -9.59035304e-01  4.67327993e-01
  4.96785043e-01 -6.58923095e-01  2.17025478e-01  6.10239238e-01
 -5.01632741e-01  4.13729055e-01  2.04215276e-01 -2.82762752e-01
  4.30101139e-01  1.36664695e-01]


In [35]:
ridge = Ridge(alpha = 0.3)
ridge.fit(x_train,y_train)
print("ridge model :", (ridge.coef_))

ridge model : [ 0.          0.06744609 -0.61768727 -1.98571407 -5.15374739 -0.62408399
  3.03736622  0.1703171   0.16030007 -0.38444456 -1.48866129  0.028376
  1.72736026  1.41986952 -1.38667845 -0.05716879  1.13571407 -1.11717748
  0.30229162  1.53943236 -0.84223503  2.38631123  0.21448411  0.5070261
 -0.80384191 -0.47782161 -0.30297733 -1.51356956 -0.62132424  0.43747173
  0.33149025 -0.1444153   0.25605085  0.47589402 -0.93889303  0.37869981
  0.47843425 -0.67043791  0.22923541  0.6120847  -0.498885    0.40339263
  0.21155153 -0.29198126  0.41830046  0.11726274]


In [36]:
lasso = Lasso(alpha = 0.3)
lasso.fit(x_train,y_train)
print("lasso model :", (lasso.coef_))

lasso model : [ 0.         -0.         -0.         -1.48894701 -4.87585559  0.
  2.6605066  -0.2251455   0.          0.          0.          0.
  0.          0.         -0.          0.          0.         -0.
  0.          0.93725827 -0.         -0.          0.          0.
 -0.          0.15425897 -0.         -0.64618649  0.         -0.
 -0.         -0.         -0.0283077   0.         -0.         -0.
  0.         -0.40865604  0.          0.30912684 -0.10347315  0.
  0.         -0.         -0.         -0.        ]


In [37]:
print(ridge.score(x_train,y_train))
print(ridge.score(x_test,y_test))

0.9025974597510317
0.8673781731805364


In [38]:
print(lasso.score(x_train,y_train))
print(lasso.score(x_test,y_test))

0.8748927883097986
0.8827308949878448
