# Regularization
## 1. Import Libraries

In [1]:
#Import Libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
import warnings
warnings.filterwarnings('ignore')

`from sklearn.impute import SimpleImputer`
- The SimpleImputer class from the sklearn.impute module in scikit-learn is used to handle missing values in a dataset by imputing/replacing them with a specified strategy (such as mean, median, most frequent, or a constant value)

`from sklearn.preprocessing import OrdinalEncoder`
- The OrdinalEncoder from the sklearn.preprocessing module in scikit-learn is used to encode categorical features into ordinal integers. It assigns a unique integer value to each category present in the input data.

`What is Ordinal integers?`
- Ordinal integers are a set of integers used to represent ordered or ranked categories or values. In machine learning and data preprocessing, ordinal integers are often assigned to categorical variables to encode them numerically based on their inherent order or ranking.
- For instance, consider a categorical variable like T-shirt sizes: 'small', 'medium', and 'large'. Ordinal integers might be assigned in a way that reflects their order, such as:

1. 'small' might be encoded as 0
2. 'medium' might be encoded as 1
3. 'large' might be encoded as 2

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/aishwaryamate/Datasets/main/cars.csv')
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,sedan,rwd,front,68.9,55.5,ohc,141,114,23,28,16845
201,-1,95,volvo,gas,sedan,rwd,front,68.8,55.5,ohc,141,160,19,25,19045
202,-1,95,volvo,gas,sedan,rwd,front,68.9,55.5,ohcv,173,134,18,23,21485
203,-1,95,volvo,diesel,sedan,rwd,front,68.9,55.5,ohc,145,106,26,27,22470


## 2. EDA

In [3]:
df.describe()

Unnamed: 0,symboling,width,height,engine-size,city-mpg,highway-mpg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,65.907805,53.724878,126.907317,25.219512,30.75122,13227.478049
std,1.245307,2.145204,2.443522,41.642693,6.542142,6.886443,7902.651615
min,-2.0,60.3,47.8,61.0,13.0,16.0,5118.0
25%,0.0,64.1,52.0,97.0,19.0,25.0,7788.0
50%,1.0,65.5,54.1,120.0,24.0,30.0,10345.0
75%,2.0,66.9,55.5,141.0,30.0,34.0,16500.0
max,3.0,72.3,59.8,326.0,49.0,54.0,45400.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   body-style         205 non-null    object 
 5   drive-wheels       205 non-null    object 
 6   engine-location    205 non-null    object 
 7   width              205 non-null    float64
 8   height             205 non-null    float64
 9   engine-type        205 non-null    object 
 10  engine-size        205 non-null    int64  
 11  horsepower         205 non-null    object 
 12  city-mpg           205 non-null    int64  
 13  highway-mpg        205 non-null    int64  
 14  price              205 non-null    int64  
dtypes: float64(2), int64(5), object(8)
memory usage: 24.2+ KB


In [5]:
df.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
body-style           0
drive-wheels         0
engine-location      0
width                0
height               0
engine-type          0
engine-size          0
horsepower           0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [6]:
df['normalized-losses'].value_counts()

?      41
161    11
91      8
150     7
134     6
128     6
104     6
85      5
94      5
65      5
102     5
74      5
168     5
103     5
95      5
106     4
93      4
118     4
148     4
122     4
83      3
125     3
154     3
115     3
137     3
101     3
119     2
87      2
89      2
192     2
197     2
158     2
81      2
188     2
194     2
153     2
129     2
108     2
110     2
164     2
145     2
113     2
256     1
107     1
90      1
231     1
142     1
121     1
78      1
98      1
186     1
77      1
Name: normalized-losses, dtype: int64

In [7]:
# ? are missing values

In [8]:
df[df['horsepower']== '?']

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
130,0,?,renault,gas,wagon,fwd,front,66.5,55.2,ohc,132,?,23,31,9295
131,2,?,renault,gas,hatchback,fwd,front,66.6,50.5,ohc,132,?,23,31,9895


In [9]:
df.replace('?', np.nan, inplace=True)

In [10]:
df.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
body-style            0
drive-wheels          0
engine-location       0
width                 0
height                0
engine-type           0
engine-size           0
horsepower            2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [11]:
df.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
body-style            object
drive-wheels          object
engine-location       object
width                float64
height               float64
engine-type           object
engine-size            int64
horsepower            object
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

In [12]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [13]:
# Replacing missing values
# Use the median if an outlier is present.
# use mean if outlier is not present present 
# If we want to use fillna, replace the object data type with an int data type using astype.

In [14]:
# But without using astype
# Using SimpleImputer

In [15]:
si = SimpleImputer(strategy='median')

In [16]:
df.iloc[:, [1,11]]

Unnamed: 0,normalized-losses,horsepower
0,,111
1,,111
2,,154
3,164,102
4,164,115
...,...,...
200,95,114
201,95,160
202,95,134
203,95,106


In [17]:
df.iloc[:, [1,11]] = si.fit_transform(df.iloc[:, [1,11]])

In [18]:
#Replaced
df.iloc[:, [1,11]]

Unnamed: 0,normalized-losses,horsepower
0,115.0,111.0
1,115.0,111.0
2,115.0,154.0
3,164.0,102.0
4,164.0,115.0
...,...,...
200,95.0,114.0
201,95.0,160.0
202,95.0,134.0
203,95.0,106.0


In [19]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,115.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,13495
1,3,115.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,16500
2,1,115.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154.0,19,26,16500
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102.0,24,30,13950
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115.0,18,22,17450


In [20]:
df.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
body-style           0
drive-wheels         0
engine-location      0
width                0
height               0
engine-type          0
engine-size          0
horsepower           0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [21]:
# Encoding

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    float64
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   body-style         205 non-null    object 
 5   drive-wheels       205 non-null    object 
 6   engine-location    205 non-null    object 
 7   width              205 non-null    float64
 8   height             205 non-null    float64
 9   engine-type        205 non-null    object 
 10  engine-size        205 non-null    int64  
 11  horsepower         205 non-null    float64
 12  city-mpg           205 non-null    int64  
 13  highway-mpg        205 non-null    int64  
 14  price              205 non-null    int64  
dtypes: float64(4), int64(5), object(6)
memory usage: 24.2+ KB


In [23]:
# All categorical calumns but this is not good way
df[['make', 'fuel-type', 'body-style', 'drive-wheels', 'engine-location', 'engine-type']]

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,alfa-romero,gas,convertible,rwd,front,dohc
1,alfa-romero,gas,convertible,rwd,front,dohc
2,alfa-romero,gas,hatchback,rwd,front,ohcv
3,audi,gas,sedan,fwd,front,ohc
4,audi,gas,sedan,4wd,front,ohc
...,...,...,...,...,...,...
200,volvo,gas,sedan,rwd,front,ohc
201,volvo,gas,sedan,rwd,front,ohc
202,volvo,gas,sedan,rwd,front,ohcv
203,volvo,diesel,sedan,rwd,front,ohc


In [24]:
#Easy way to select same data types coumns
df.select_dtypes(object)

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,alfa-romero,gas,convertible,rwd,front,dohc
1,alfa-romero,gas,convertible,rwd,front,dohc
2,alfa-romero,gas,hatchback,rwd,front,ohcv
3,audi,gas,sedan,fwd,front,ohc
4,audi,gas,sedan,4wd,front,ohc
...,...,...,...,...,...,...
200,volvo,gas,sedan,rwd,front,ohc
201,volvo,gas,sedan,rwd,front,ohc
202,volvo,gas,sedan,rwd,front,ohcv
203,volvo,diesel,sedan,rwd,front,ohc


In [25]:
cat_cols = df.select_dtypes(object).columns
cat_cols

Index(['make', 'fuel-type', 'body-style', 'drive-wheels', 'engine-location',
       'engine-type'],
      dtype='object')

In [26]:
oe = OrdinalEncoder()
df[cat_cols] = oe.fit_transform(df[cat_cols])

In [27]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,115.0,0.0,1.0,0.0,2.0,0.0,64.1,48.8,0.0,130,111.0,21,27,13495
1,3,115.0,0.0,1.0,0.0,2.0,0.0,64.1,48.8,0.0,130,111.0,21,27,16500
2,1,115.0,0.0,1.0,2.0,2.0,0.0,65.5,52.4,5.0,152,154.0,19,26,16500
3,2,164.0,1.0,1.0,3.0,1.0,0.0,66.2,54.3,3.0,109,102.0,24,30,13950
4,2,164.0,1.0,1.0,3.0,0.0,0.0,66.4,54.3,3.0,136,115.0,18,22,17450


In [28]:
#Encoded
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    float64
 2   make               205 non-null    float64
 3   fuel-type          205 non-null    float64
 4   body-style         205 non-null    float64
 5   drive-wheels       205 non-null    float64
 6   engine-location    205 non-null    float64
 7   width              205 non-null    float64
 8   height             205 non-null    float64
 9   engine-type        205 non-null    float64
 10  engine-size        205 non-null    int64  
 11  horsepower         205 non-null    float64
 12  city-mpg           205 non-null    int64  
 13  highway-mpg        205 non-null    int64  
 14  price              205 non-null    int64  
dtypes: float64(10), int64(5)
memory usage: 24.2 KB


## Model Buliding

In [29]:
x = df.iloc[:,:-1]
y = df['price']

In [30]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.3, random_state=1)

In [31]:
lr = LinearRegression()
lr.fit(xtrain, ytrain)
ypred = lr.predict(xtest)

In [32]:
lr.intercept_

-64935.11357978259

In [33]:
#Slope value
lr.coef_

array([ 5.71727164e+01,  4.76320989e-01, -2.01309566e+02, -6.22705136e+02,
       -1.63712110e+02,  1.88863899e+03,  1.63884484e+04,  7.90632094e+02,
        3.61221503e+02,  2.81207534e+02,  9.82290864e+01, -1.06474945e+01,
        3.08435166e+02, -4.17126915e+02])

In [34]:
lr.score(xtrain, ytrain), lr.score(xtest, ytest)

(0.8504229026078215, 0.7964854785429524)

In [35]:
#Overfitted model

## Lasso
- "Least Absolute Shrinkage and Selection Operator," is a regression analysis technique used for variable selection and regularization in linear models. It adds a penalty term to the standard linear regression objective function, which helps prevent overfitting and encourages simpler models by imposing a constraint on the sum of the absolute values of the model's coefficients.

In [36]:
l1 = Lasso()
l1.fit(xtrain,ytrain)
l1.score(xtrain, ytrain), l1.score(xtest, ytest)

(0.8504215478243032, 0.7966615211575686)

In [37]:
l1.coef_

array([ 5.70766693e+01,  4.67141700e-01, -2.01139953e+02, -6.14206970e+02,
       -1.64751649e+02,  1.88558824e+03,  1.63181959e+04,  7.88680118e+02,
        3.61783500e+02,  2.81248690e+02,  9.83193653e+01, -1.05310631e+01,
        3.07274772e+02, -4.15725804e+02])

## Hyperparameter Tuning

In [38]:
# We are looking for Alpha Value which make Train and test score same or approx same
for i in range(100,200):
    l1 = Lasso(alpha=i)
    l1.fit(xtrain,ytrain)
    print(f"Alpha {i} Train {l1.score(xtrain, ytrain)} Test {l1.score(xtest, ytest)}")

Alpha 100 Train 0.8372483974026499 Test 0.8092040910955614
Alpha 101 Train 0.8369899229541407 Test 0.8092979610738258
Alpha 102 Train 0.8367288785613758 Test 0.8093910017461712
Alpha 103 Train 0.8364651676681871 Test 0.8094832684251526
Alpha 104 Train 0.8361989818307348 Test 0.8095746503624812
Alpha 105 Train 0.8359302253870345 Test 0.8096652035299016
Alpha 106 Train 0.8356588978359675 Test 0.8097549282309099
Alpha 107 Train 0.8353849991129144 Test 0.8098438245155563
Alpha 108 Train 0.8351084257059385 Test 0.809931945484127
Alpha 109 Train 0.8348293842179737 Test 0.8100191840285438
Alpha 110 Train 0.8345477714936769 Test 0.8101055942385992
Alpha 111 Train 0.8342635877040369 Test 0.8101911760417996
Alpha 112 Train 0.8339767222137543 Test 0.8102759813628972
Alpha 113 Train 0.8336873954613211 Test 0.8103599056262563
Alpha 114 Train 0.8333954973355284 Test 0.810443001666504
Alpha 115 Train 0.8331010276993127 Test 0.810525269547536
Alpha 116 Train 0.8328038691204882 Test 0.8106067595950118


In [39]:
l1 = Lasso(alpha=175)
l1.fit(xtrain,ytrain)
l1.score(xtrain, ytrain), l1.score(xtest, ytest)

(0.8107234361496893, 0.8139459609474125)

In [40]:
# We got generalized model

In [41]:
l1.coef_

array([  26.15091464,   -0.        , -173.02547787,   -0.        ,
       -363.80743898, 1247.84193718, 3867.709301  ,  391.04314711,
        451.57194036,  297.35122938,  113.32977728,   11.00742675,
         70.90090498, -160.35202828])

In [42]:
# columns with value 0 are of no use drop those cols

In [43]:
xtrain.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'body-style',
       'drive-wheels', 'engine-location', 'width', 'height', 'engine-type',
       'engine-size', 'horsepower', 'city-mpg', 'highway-mpg'],
      dtype='object')

In [44]:
# Thats how Lasso are used for Regularization and Feature Selection

## Rigid

In [45]:
l2 = Ridge()
l2.fit(xtrain, ytrain)
l2.score(xtrain, ytrain), l2.score(xtest, ytest)

(0.8435840853399225, 0.8075632224690537)

In [46]:
# Tuning
for i in range(1, 100):
    l2 = Ridge(alpha=i)
    l2.fit(xtrain,ytrain)
    print(f"Alpha {i} Train {l2.score(xtrain, ytrain)} Test {l2.score(xtest, ytest)}")

Alpha 1 Train 0.8435840853399225 Test 0.8075632224690537
Alpha 2 Train 0.8356695734845092 Test 0.8112192014374255
Alpha 3 Train 0.8296379623431073 Test 0.8129299663310143
Alpha 4 Train 0.8250699092246864 Test 0.8138839096972439
Alpha 5 Train 0.8215093087765016 Test 0.8144682684596014
Alpha 6 Train 0.8186486103834849 Test 0.814843562726523
Alpha 7 Train 0.8162882573020809 Test 0.8150880725612083
Alpha 8 Train 0.8142964263180523 Test 0.815244730507158
Alpha 9 Train 0.812583522729097 Test 0.8153392574436445
Alpha 10 Train 0.8110868722186451 Test 0.8153881483263244
Alpha 11 Train 0.8097614513602518 Test 0.8154025610279507
Alpha 12 Train 0.8085741366835051 Test 0.8153903693701571
Alpha 13 Train 0.8075000372738095 Test 0.8153573212892009
Alpha 14 Train 0.8065200924084595 Test 0.8153077294072232
Alpha 15 Train 0.8056194580416446 Test 0.8152449025615436
Alpha 16 Train 0.804786398073303 Test 0.8151714264843272
Alpha 17 Train 0.8040115065187471 Test 0.8150893527689589
Alpha 18 Train 0.8032871514

In [47]:
l2 = Ridge(alpha=7)
l2.fit(xtrain, ytrain)
l2.score(xtrain, ytrain), l2.score(xtest, ytest)

(0.8162882573020809, 0.8150880725612083)

In [48]:
l2.coef_

array([ 2.31139279e+02, -3.29204695e+00, -1.90126861e+02, -1.01249352e+03,
       -6.04047656e+02,  1.71736131e+03,  3.40160377e+03,  3.91530395e+02,
        5.57861287e+02,  5.13867686e+02,  1.02968876e+02,  2.01227011e+01,
        2.13425523e+02, -2.79915768e+02])

In [49]:
# We don't any 0 because Rigid do not perform Feature Selection

## Elastic Net

In [50]:
em = ElasticNet()
em.fit(xtrain, ytrain)
em.score(xtrain, ytrain), em.score(xtest, ytest)

(0.7866253599240464, 0.8098264817256129)

In [51]:
# Generalized model we dont have to perform Hyperparameter tuning
# Still we can perform Hyperparameter tuning