# Regularisation for Linear Regression

In [8]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
# Load the advertising data set; the CSV is expected in the working directory.
df = pd.read_csv("Advertising.csv")

In [10]:
# Preview the loaded frame (pandas truncates the middle rows in the display).
df

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


In [11]:
# Feature matrix: every column of the frame except the target.
X = df.drop(columns='sales')

In [12]:
# Target vector: the sales column.
y = df['sales']

In [6]:
# Inspect the target series.
y

0      22.1
1      10.4
2       9.3
3      18.5
4      12.9
       ... 
195     7.6
196     9.7
197    12.8
198    25.5
199    13.4
Name: sales, Length: 200, dtype: float64

In [13]:
# Expand the raw features into all polynomial and interaction terms up to
# degree 3; the constant (bias) column is left out.
from sklearn.preprocessing import PolynomialFeatures
polyconverter = PolynomialFeatures(include_bias=False, degree=3)
polyconverter.fit(X)
polyfset = polyconverter.transform(X)

In [9]:
# (rows, features) before the polynomial expansion.
X.shape

(200, 3)

In [10]:
# (rows, features) after the polynomial expansion.
polyfset.shape

(200, 19)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    polyfset,
    y,
    random_state=42,
    test_size=0.3,
)

In [13]:
# Inspect the training features as returned by the split.
X_train

array([[2.8430000e+02, 1.0600000e+01, 6.4000000e+00, ..., 7.1910400e+02,
        4.3417600e+02, 2.6214400e+02],
       [1.8490000e+02, 2.1000000e+01, 2.2000000e+01, ..., 9.7020000e+03,
        1.0164000e+04, 1.0648000e+04],
       [1.1290000e+02, 1.7400000e+01, 3.8600000e+01, ..., 1.1686536e+04,
        2.5925304e+04, 5.7512456e+04],
       ...,
       [2.1770000e+02, 3.3500000e+01, 5.9000000e+01, ..., 6.6212750e+04,
        1.1661350e+05, 2.0537900e+05],
       [1.6560000e+02, 1.0000000e+01, 1.7600000e+01, ..., 1.7600000e+03,
        3.0976000e+03, 5.4517760e+03],
       [2.8020000e+02, 1.0100000e+01, 2.1400000e+01, ..., 2.1830140e+03,
        4.6253960e+03, 9.8003440e+03]])

In [14]:
# (rows, features) of the training split.
X_train.shape

(140, 19)

In [17]:
# Feature scaling!
from sklearn.preprocessing import StandardScaler

In [18]:
# Fit the scaler on the training split only: we do not want to assume ANY
# information from the test set.
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [19]:
# Apply the statistics learned from the training split to both splits.
# NOTE: X_train / X_test are overwritten here, so re-running this cell would
# scale the already-scaled data a second time.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [19]:
# Much smaller values! StandardScaler standardises every feature (zero mean,
# unit variance on the training split), so the regularisation penalty treats
# all coefficients on a comparable scale.
X_train

array([[ 1.53003874, -0.8862929 , -1.16598019, ..., -0.72447935,
        -0.64260728, -0.56741817],
       [ 0.34103865, -0.17314426, -0.40960132, ..., -0.49005815,
        -0.48542724, -0.48607391],
       [-0.5202089 , -0.42000341,  0.39526337, ..., -0.43826891,
        -0.23081191, -0.11902137],
       ...,
       [ 0.73338476,  0.68400555,  1.38437419, ...,  0.98466871,
         1.23420682,  1.03910131],
       [ 0.11017646, -0.92743609, -0.62293895, ..., -0.69731572,
        -0.59958111, -0.52677186],
       [ 1.48099548, -0.9205789 , -0.43869282, ..., -0.68627658,
        -0.57490039, -0.49271293]])

In [20]:
# The full polynomial feature matrix as built before the split and scaling.
polyfset

array([[2.30100000e+02, 3.78000000e+01, 6.92000000e+01, ...,
        9.88757280e+04, 1.81010592e+05, 3.31373888e+05],
       [4.45000000e+01, 3.93000000e+01, 4.51000000e+01, ...,
        6.96564990e+04, 7.99365930e+04, 9.17338510e+04],
       [1.72000000e+01, 4.59000000e+01, 6.93000000e+01, ...,
        1.46001933e+05, 2.20434291e+05, 3.32812557e+05],
       ...,
       [1.77000000e+02, 9.30000000e+00, 6.40000000e+00, ...,
        5.53536000e+02, 3.80928000e+02, 2.62144000e+02],
       [2.83600000e+02, 4.20000000e+01, 6.62000000e+01, ...,
        1.16776800e+05, 1.84062480e+05, 2.90117528e+05],
       [2.32100000e+02, 8.60000000e+00, 8.70000000e+00, ...,
        6.43452000e+02, 6.50934000e+02, 6.58503000e+02]])

# RIDGE REGRESSION (L2)

- Regularisation technique to help reduce the potential for overfitting to the training data.
- Adds a penalty term to the error that is based on the squared values of the coefficients.
- Will detect issues easily.
- Can allow us to better generalise to unseen data.
- Makes the model not TOO responsive to the training data, so that it generalises to new, unseen data.
- Punishes very steep slopes.
- We can give a range of lambda values and let the computer go through them and return the best-performing one.

In [21]:
# lambda = alpha in sklearn (alpha is a unified term)
from sklearn.linear_model import Ridge

In [22]:
# Print the Ridge docstring (constructor signature and parameters).
help(Ridge)

Help on class Ridge in module sklearn.linear_model._ridge:

class Ridge(sklearn.base.MultiOutputMixin, sklearn.base.RegressorMixin, _BaseRidge)
 |  Ridge(alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=None)
 |  
 |  Linear least squares with l2 regularization.
 |  
 |  Minimizes the objective function::
 |  
 |  ||y - Xw||^2_2 + alpha * ||w||^2_2
 |  
 |  This model solves a regression model where the loss function is
 |  the linear least squares function and regularization is given by
 |  the l2-norm. Also known as Ridge Regression or Tikhonov regularization.
 |  This estimator has built-in support for multi-variate regression
 |  (i.e., when y is a 2d-array of shape (n_samples, n_targets)).
 |  
 |  Read more in the :ref:`User Guide <ridge_regression>`.
 |  
 |  Parameters
 |  ----------
 |  alpha : {float, ndarray of shape (n_targets,)}, default=1.0
 |      Regularization strength; must be a positive float. Regul

In [23]:
# alpha is sklearn's name for the regularisation strength (lambda).
ridge_model = Ridge(alpha=10) # Fitted and used just like a plain linear regression model.

In [24]:
# Fit the ridge model on the scaled training split.
ridge_model.fit(X_train, y_train)

Ridge(alpha=10)

In [25]:
# Predict on the held-out test split.
testpreds = ridge_model.predict(X_test)

In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [28]:
# Mean absolute error of the ridge predictions on the test split.
MAE = mean_absolute_error(y_true=y_test, y_pred=testpreds)

In [29]:
# Mean squared error of the ridge predictions on the test split.
MSE = mean_squared_error(y_true=y_test, y_pred=testpreds)

In [30]:
# Test-set MAE of the Ridge(alpha=10) model.
MAE

0.62965913467586

In [31]:
# Test-set MSE of the Ridge(alpha=10) model.
MSE

0.7950089683107227

In [32]:
# Root mean squared error: the square root of the MSE, back in the units of the target.
RMSE = np.sqrt(MSE)

In [33]:
# Test-set RMSE of the Ridge(alpha=10) model.
RMSE

0.8916327541710896

In [54]:
from sklearn.linear_model import RidgeCV  # Ridge with built-in cross-validation.

# All sklearn scorers follow a "higher is better" convention, which is why the
# error metric is requested in its negated form.
ridge_cv_model = RidgeCV(
    scoring='neg_mean_absolute_error',  # metric used to pick the winning alpha
    alphas=(0.01, 0.05, 0.1, 1.0, 10),  # candidate regularisation strengths
)

In [55]:
# Fit on the training split; the candidate alphas are compared by cross-validation.
ridge_cv_model.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.01,  0.05,  0.1 ,  1.  , 10.  ]),
        scoring='neg_mean_absolute_error')

In [56]:
# The candidate alpha that performed best in cross-validation.
ridge_cv_model.alpha_

0.05

In [42]:
# `SCORERS` was deprecated in scikit-learn 1.2 and removed in 1.3. On newer
# versions, rebuild the same name -> scorer mapping from the public helpers so
# that code using `SCORERS` (e.g. `SCORERS.keys()`) keeps working.
try:
    from sklearn.metrics import SCORERS
except ImportError:
    from sklearn.metrics import get_scorer, get_scorer_names
    SCORERS = {name: get_scorer(name) for name in get_scorer_names()}

In [44]:
# List the names of the available scorers (the strings accepted by `scoring=`).
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [57]:
# Replace the earlier predictions with the RidgeCV model's test-set predictions.
testpreds = ridge_cv_model.predict(X_test)

In [58]:
# Mean absolute error of the RidgeCV predictions on the test split.
MAE = mean_absolute_error(y_true=y_test, y_pred=testpreds)

In [59]:
# Mean squared error of the RidgeCV predictions on the test split.
MSE = mean_squared_error(y_true=y_test, y_pred=testpreds)

In [60]:
# Test-set MAE of the cross-validated ridge model.
MAE

0.4275529937963091

In [61]:
# Wow, we really improved the results: the test-set MSE is far below the
# roughly 0.795 obtained with the fixed alpha=10 ridge model.
MSE

0.31258904975397156

In [62]:
# Root mean squared error of the cross-validated ridge model.
RMSE = np.sqrt(MSE)

In [63]:
# Test-set RMSE of the cross-validated ridge model.
RMSE

0.5590966372229148

In [64]:
# One coefficient per polynomial feature; ridge shrinks them but none is exactly zero.
ridge_cv_model.coef_

array([ 6.60359019,  0.35104375,  0.66131892, -8.01826635,  4.09247469,
       -1.53498094,  0.0599518 ,  0.14458021, -0.11938083,  3.27316687,
       -0.62046917,  0.9815355 ,  0.39000934, -0.5503902 ,  0.40823573,
       -0.2747367 ,  0.30144979, -0.05989389, -0.15428326])

# LASSO Regression (L1 Regularisation)

- Very similar to ridge, but the penalty is based on the absolute value of the coefficients instead of their square, which shrinks the coefficients.
- Can yield sparse models where some coefficients are exactly 0 or close to it.
- least absolute shrinkage and selection operator (LASSO)

In [67]:
from sklearn.linear_model import Lasso, LassoCV

In [72]:
lasso_cv_model = LassoCV(
    cv=5,            # 5-fold cross-validation
    max_iter=10000,  # high iteration cap, set to avoid a ConvergenceWarning
    n_alphas=100,    # number of alphas to check; the higher, the more we try
    eps=0.001,       # ratio alpha_min / alpha_max; smaller means a wider range of alphas
)

In [73]:
# Fit on the scaled training split; the alpha is chosen by cross-validation.
lasso_cv_model.fit(X_train, y_train)

LassoCV(cv=5, max_iter=10000)

In [74]:
# The alpha selected by cross-validation.
lasso_cv_model.alpha_

0.0049245318064748715

In [75]:
# Replace the earlier predictions with the LassoCV model's test-set predictions.
testpreds = lasso_cv_model.predict(X_test)

In [76]:
# Test-set error metrics for the LassoCV predictions.
MAE = mean_absolute_error(y_true=y_test, y_pred=testpreds)
MSE = mean_squared_error(y_true=y_test, y_pred=testpreds)

In [77]:
# Test-set MAE of the LassoCV model.
MAE

0.5123045552899828

In [78]:
# Test-set MSE of the LassoCV model.
MSE

0.3979140711021839

In [79]:
# About half of the coefficients (9 of 19) are exactly 0, yet we still get a
# reasonable MAE and MSE while considering fewer features!
lasso_cv_model.coef_

array([ 5.15048089,  0.4274257 ,  0.29684446, -4.53337994,  3.38937185,
       -0.4288993 ,  0.        ,  0.        ,  0.        ,  1.17891049,
       -0.        ,  0.        ,  0.16706037, -0.        ,  0.        ,
        0.        ,  0.11083672,  0.        ,  0.06155549])

# Elastic Net Regularisation

In [1]:
from sklearn.linear_model import ElasticNetCV

In [3]:
# Lets cross-validation choose the best ratio between the L1 and L2 penalties.
elastic_model = ElasticNetCV(
    max_iter=100000,
    n_alphas=100,
    eps=0.001,
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
)

In [20]:
# Fit on the scaled training split; alpha and the L1 ratio are chosen by cross-validation.
elastic_model.fit(X_train, y_train)

ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=100000)

In [21]:
# The candidate L1 ratios passed to the constructor (no trailing underscore).
elastic_model.l1_ratio

[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]

In [23]:
# The best ratio found by cross-validation; 1.0 is a pure L1 penalty, i.e.
# it decided LASSO was the way to go!
elastic_model.l1_ratio_

1.0

In [24]:
# The alpha selected by cross-validation.
elastic_model.alpha_

0.0049245318064748715

In [25]:
# Predict on the held-out test split with the elastic net model.
testpreds = elastic_model.predict(X_test)

In [28]:
# Test-set error metrics for the elastic net predictions.
MAE = mean_absolute_error(y_true=y_test, y_pred=testpreds)
MSE = mean_squared_error(y_true=y_test, y_pred=testpreds)

In [29]:
# Test-set MAE of the elastic net model.
MAE

0.5123045552899828

In [30]:
# Test-set MSE of the elastic net model.
MSE

0.3979140711021839