## Logistic regression model using Ridge and Lasso regularization

In [1]:
# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

In [2]:
# Read the csv file

data=pd.read_csv('2. car-mpg.csv')
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [3]:
# Drop the car name

data=data.drop('car_name', axis=1)
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,1,0
1,15.0,8,350.0,165,3693,11.5,70,1,0
2,18.0,8,318.0,150,3436,11.0,70,1,0
3,16.0,8,304.0,150,3433,12.0,70,1,0
4,17.0,8,302.0,140,3449,10.5,70,1,0


In [4]:
data['origin']=data['origin'].replace({1:'america', 2: 'europe', 3:'asia'})
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,america,0
1,15.0,8,350.0,165,3693,11.5,70,america,0
2,18.0,8,318.0,150,3436,11.0,70,america,0
3,16.0,8,304.0,150,3433,12.0,70,america,0
4,17.0,8,302.0,140,3449,10.5,70,america,0


In [5]:
data=pd.get_dummies(data, columns=['origin'])
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,0,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,0,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,0,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,0,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,0,1,0,0


In [6]:
data.describe()

Unnamed: 0,mpg,cyl,disp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.530151,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.499718,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0,1.0,0.0,0.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0,1.0,1.0,1.0


In [7]:
# hp is missing from the above dataset it means that hp is having any other data type than numeric one.

hpisdigit=pd.DataFrame(data.hp.str.isdigit())
data[hpisdigit['hp']==False]

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,?,2046,19.0,71,1,1,0,0
126,21.0,6,200.0,?,2875,17.0,74,0,1,0,0
330,40.9,4,85.0,?,1835,17.3,80,1,0,0,1
336,23.6,4,140.0,?,2905,14.3,80,1,1,0,0
354,34.5,4,100.0,?,2320,15.8,81,1,0,0,1
374,23.0,4,151.0,?,3035,20.5,82,1,1,0,0


In [8]:
data=data.replace('?', np.nan)
data[hpisdigit['hp']==False]

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
32,25.0,4,98.0,,2046,19.0,71,1,1,0,0
126,21.0,6,200.0,,2875,17.0,74,0,1,0,0
330,40.9,4,85.0,,1835,17.3,80,1,0,0,1
336,23.6,4,140.0,,2905,14.3,80,1,1,0,0
354,34.5,4,100.0,,2320,15.8,81,1,0,0,1
374,23.0,4,151.0,,3035,20.5,82,1,1,0,0


In [9]:
medianfiller= lambda x: x.fillna(x.median())

data=data.apply(medianfiller, axis=0)

In [10]:
data['hp']=data['hp'].astype('float64')
data.describe()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,car_type,origin_america,origin_asia,origin_europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.30402,2970.424623,15.56809,76.01005,0.530151,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,38.222625,846.841774,2.757689,3.697627,0.499718,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,0.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,76.0,2223.75,13.825,73.0,0.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,125.0,3608.0,17.175,79.0,1.0,1.0,0.0,0.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0,1.0,1.0,1.0,1.0


In [11]:
# Separate independent & Dependent variable

x=data.drop('mpg', axis=1)

y=data[['mpg']]

In [12]:
# scaling the data points
from sklearn import preprocessing

x_scaled=preprocessing.scale(x)
x_scaled=pd.DataFrame(x_scaled, columns=x.columns)

y_scaled=preprocessing.scale(y)
y_scaled=pd.DataFrame(y_scaled, columns=y.columns)


In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test= train_test_split(x_scaled, y_scaled, test_size=0.30, random_state=1)

In [14]:
### Fit a simple linear model

regression_model=LinearRegression()
regression_model.fit(x_train, y_train)

LinearRegression()

In [15]:
for idx, col_name in enumerate(x_train.columns):
    print("The coeficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coeficient for cyl is 0.3210223856916105
The coeficient for disp is 0.3248343091848387
The coeficient for hp is -0.22916950059437668
The coeficient for wt is -0.7112101905072296
The coeficient for acc is 0.014713682764190954
The coeficient for yr is 0.37558119495107434
The coeficient for car_type is 0.3814769484233102
The coeficient for origin_america is -0.074722475475842
The coeficient for origin_asia is 0.04451525203567801
The coeficient for origin_europe is 0.048348549539454166


In [16]:
intercept=regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 0.019284116103639715


In [17]:
#Create a regularized RIDGE model and note the coefficients

ridge = Ridge(alpha=.3)
ridge.fit(x_train,y_train)
print ("Ridge model:", (ridge.coef_))

Ridge model: [[ 0.31649043  0.31320707 -0.22876025 -0.70109447  0.01295851  0.37447352
   0.37725608 -0.07423624  0.04441039  0.04784031]]


In [19]:
# Create a regularized LASSO model and note the coefficients

lasso=Lasso(alpha= 0.1)
lasso.fit(x_train, y_train)
print("Lasso Model:", (lasso.coef_))

Lasso Model: [-0.         -0.         -0.01690287 -0.51890013  0.          0.28138241
  0.1278489  -0.01642647  0.          0.        ]


In [20]:
# Comparing the score

print(regression_model.score(x_train, y_train))
print(regression_model.score(x_test, y_test))

0.8343770256960538
0.8513421387780066


In [21]:
print(ridge.score(x_train,y_train))
print(ridge.score(x_test, y_test))

0.8343617931312616
0.8518882171608506


In [22]:
print(lasso.score(x_train, y_train))
print(lasso.score(x_test, y_test))

0.7938010766228453
0.8375229615977083


In [23]:
# Let us generate polynomial models reflecting the non-linear interaction between some dimensions¶

from sklearn.preprocessing import PolynomialFeatures

poly=PolynomialFeatures(degree=2, interaction_only=True)

In [24]:
x_poly=poly.fit_transform(x_scaled)

x_train, x_test, y_train, y_test= train_test_split(x_poly, y, test_size=0.30, random_state=1)

In [26]:
x_train.shape

(278, 56)

In [27]:
# Fit a simple regression model on poly feature

regression_model.fit(x_train, y_train)
print(regression_model.coef_[0])

[ 3.24082770e-13 -1.14204220e+12 -4.43738735e+00 -2.24947964e+00
 -2.98166341e+00 -1.56730367e+00  3.00442772e+00 -1.52060575e+12
 -7.80788356e+11  3.71375223e+12 -3.23609457e+12 -1.15918732e+00
 -1.43925476e+00 -3.57818604e-03  2.58444214e+00 -1.91918182e+00
 -3.65891647e+12 -6.45319147e+12 -2.39436996e+12 -2.28543203e+12
  3.90441895e-01  2.09503174e-01 -4.23446655e-01  3.58471680e+00
 -2.02703094e+00 -9.03672940e+11 -7.44778888e+11 -7.10893285e+11
  2.47772217e-01 -6.70440674e-01 -1.92620850e+00 -7.47558594e-01
 -2.15947171e+11 -1.77976884e+11 -1.69879374e+11 -1.72500610e-01
  5.30212402e-01 -3.32050323e+00  1.69388998e+12  1.39605098e+12
  1.33253411e+12  5.85876465e-01  1.53894043e+00  4.76389633e+11
  3.92625390e+11  3.74761903e+11  4.00207520e-01 -1.27131857e+10
 -1.04778089e+10 -1.00010944e+10 -1.09798815e+12  8.13175594e+11
  7.76178109e+11  2.20248210e+11 -5.15971535e+12  2.83957085e+12]


In [35]:
ridge=Ridge(alpha=0.3)
ridge.fit(x_train,y_train)
print(ridge.coef_)

[[ 0.          3.73512981 -2.93500874 -2.13974194 -3.56547812 -1.28898893
   3.01290805  2.04739082  0.0786974   0.21972225 -0.3302341  -1.46231096
  -1.17221896  0.00856067  2.48054694 -1.67596093  0.99537516 -2.29024279
   4.7699338  -2.08598898  0.34009408  0.35024058 -0.41761834  3.06970569
  -2.21649433  1.86339518 -2.62934278  0.38596397  0.12088534 -0.53440382
  -1.88265835 -0.7675926  -0.90146842  0.52416091  0.59678246 -0.26349448
   0.5827378  -3.02842915 -0.36548074  0.5956112  -0.15941014  0.49168856
   1.45652375 -0.43819158 -0.20964198  0.77665496  0.36489921 -0.4750838
   0.3551047   0.23188557 -1.42941282  2.06831543 -0.34986402 -0.32320394
   0.39054656  0.06283411]]


In [31]:
print(ridge.score(x_train, y_train))
print(ridge.score(x_test, y_test))

0.9143225702003365
0.861339805369854


In [36]:
lasso=Lasso(alpha=0.01)
lasso.fit(x_train, y_train)
print(lasso.coef_)

[ 0.          0.52263805 -0.5402102  -1.99423315 -4.55360385 -0.85285179
  2.99044036  0.00711821 -0.          0.76073274 -0.         -0.
 -0.19736449  0.          2.04221833 -1.00014513  0.         -0.
  4.28412669 -0.          0.          0.31442062 -0.          2.13894094
 -1.06760107  0.         -0.          0.          0.         -0.44991392
 -1.55885506 -0.         -0.68837902  0.          0.17455864 -0.34653644
  0.3313704  -2.84931966  0.         -0.34340563  0.00815105  0.47019445
  1.25759712 -0.69634581  0.          0.55528147  0.2948979  -0.67289549
  0.06490671  0.         -1.19639935  1.06711702  0.         -0.88034391
  0.         -0.        ]


In [38]:
print(lasso.score(x_train, y_train))
print(lasso.score(x_test, y_test))

0.9098286193898272
0.8695296858772456
