---
title: Regularization methods - lasso, ridge, and elastic net
teaching: 45
exercises: 2
keypoints:
- ""
objectives:
- ""
questions:
- ""
---

### Split data into train/test sets and zscore
We will now split our data into two separate groupings — one for fitting or training the model ("train set") and another for testing ("test set") the model's ability to generalize to data that was excluded during training. The amount of data you exclude for the test set should be large enough that the model can be vetted against a diverse range of samples. A common rule of thumb is to use 3/4 of the data for training, and 1/3 for testing.

In [12]:
from sklearn.model_selection import train_test_split

# Perform train/test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.33, 
                                                    random_state=0)
print(X_train.shape)
print(X_test.shape)

print(type(y_train))
print(type(X_train))



(978, 215)
(482, 215)
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [None]:
# sklearn version
from sklearn.linear_model import LinearRegression

# stats model version (for hypothesis testing)
from statsmodels.formula.api import ols

def train_linear_model(X_train, y_train, model_type):
    if model_type == "unregularized":
        reg = LinearRegression().fit(X_train,y_train)
#         reg = ols("dist ~ speed", data=cars).fit()
    else:
        raise ValueError('Unexpected model_type encountered; model_type = ' + model_type)
  
    # print number of estimated model coefficients. Need to add one to account for y-intercept (not included in reg.coef_ call)
    print('# model coefs = ' + str(len(reg.coef_)+1))

    return reg



Define a function `measure_model_err` to help us measure the model's performance (train/test RMSE)

In [None]:
from sklearn import metrics

def measure_model_err(X_train, X_test, y_train, y_test, reg):
    
    # 1) get model predicitons based on transformed (z-scored) predictor vars
    y_pred_train=reg.predict(X_train)
    y_pred_test=reg.predict(X_test)
    
    # 2) reverse log transformation (exponential)
    y_pred_train=np.exp(y_pred_train)
    y_pred_test=np.exp(y_pred_test)
    y_train=np.exp(y_train)
    y_test=np.exp(y_test)
    
    # 3) calculate RMSE for train and test sets
    RMSE_train = metrics.mean_squared_error(y_train, y_pred_train,squared=False) # squared=False to get RMSE instead of MSE
    R2_train = reg.score(X_train, y_train) # returns R^2 ("coef of determination")
    RMSE_test = metrics.mean_squared_error(y_test, y_pred_test,squared=False) 
    R2_test = reg.score(X_test, y_test) # returns R^2 ("coef of determination")

    return RMSE_train, RMSE_test, R2_train, R2_test

Define a function `fit_eval_model` that will call both `train_linear_model` and `measure_model_err` and report back on model performance.

In [None]:
def fit_eval_model(X_train, y_train, X_test, y_test, predictor_vars, model_type):
    '''This function uses the predictor vars specified by predictor_vars to predict housing price. Function returns RMSE for both train and test data'''
    # Convert response vectors from pandas series to numpy arrays. 
    # This is necessary for downstream analyses (required format for linear regression fucntion we'll use).
    y_train=np.array(y_train) 
    y_test=np.array(y_test) 

    # Index specific predictor vars. Use reshape to handle case of just one predictor var (convert to shape=[numRows,numvars] rather than shape=[numRows,] )
    X_train=np.array(X_train[predictor_vars]).reshape(-1, len(predictor_vars)) # index subset of predictor vars
    X_test=np.array(X_test[predictor_vars]).reshape(-1, len(predictor_vars)) # do the same for test set

    # report predictor var if there's only one
    if len(predictor_vars)==1:
        preview_predict_var = ' (' + predictor_vars[0] + ')'
    else:
        preview_predict_var = ''

    # print number of observations in train/test sets as well as number of features used to predict housing price
    print('# of predictor vars = ' + str(len(predictor_vars)) + preview_predict_var)
    print('# of train observations = ' + str(X_train.shape[0]))
    print('# of test observations = ' + str(X_test.shape[0]))
  
    # fit model to training data
    reg = train_linear_model(X_train, y_train, model_type)

    # get train and test set RMSE
    RMSE_train, RMSE_test = measure_model_err(X_train, X_test, y_train, y_test, reg)

    # print results
    print('Train RMSE:', RMSE_train)
    print('Test RMSE:', RMSE_test)
    perc_diff = (RMSE_test-RMSE_train)/RMSE_train
    perc_diff = "{:.0%}".format(perc_diff)
    print('(Test-Train)/Train:', perc_diff)
    return RMSE_train, RMSE_test


In [63]:
import pandas as pd 

all_feats=X_train.columns
RMSE_train_list=[None] * len(all_feats)
RMSE_test_list=[None] * len(all_feats)

feat_index=0
for feat in all_feats:  
    # fit univariate model and return train/test RMSE
    RMSE_train, RMSE_test = fit_eval_model(X_train, y_train, 
                                           X_test, y_test,
                                           [feat],'unregularized')
    print('')
    # store model errors
    RMSE_train_list[feat_index] = RMSE_train
    RMSE_test_list[feat_index] = RMSE_test#metrics.mean_squared_error(y_test, predicted_test,squared=False) # squared=False to get RMSE instead of MSE
    feat_index+=1
    
# store errors in pandas dataframe for ease of access downstream
df_model_err = pd.DataFrame()
df_model_err['Predictor Variable'] = all_feats
df_model_err['Train RMSE'] = RMSE_train_list
df_model_err['Test RMSE'] = RMSE_test_list


# of predictor vars = 1 (MSSubClass_20.0)
# of train observations = 978
# of test observations = 482
# model coefs = 2


ValueError: too many values to unpack (expected 2)

## Fit multivariate model using all predictor vars

#### Predictive Models VS Interpretable Models
* **Interpretable models**: Models trained with linear regression are the most interpretable kind of regression models available - meaning it’s easier to take action from the results of a linear regression model. However, if the assumptions are not satisfied, the interpretation of the results will not always be valid. This can be very dangerous depending on the application.

#### Assumptions of multivariate regression (for statistical/hypothesis testing)
1. Independence: All observations are independent
2. Linearity: The relationship between the dependent variable and the independent variables should be linear

    a. **Note**: In practice, linear models are often used to model nonlinear relationships due to complexity (number of model parameters/coefs that need to be estimated) of nonlinear models. When using a linear model to model nonlinear relationships, it usually best to use resulting model for predictive purposes only. 
3. Normality: For each value of the dependent variable, the distribution of the dependent variable must be normal.
4. Homoscedasticity: The residuals of a good model should be normally and randomly distributed i.e. the unknown does not depend on X ("homoscedasticity")

In [None]:
print(len(labels)) 

213


In [None]:
help(fit_eval_model)

Help on function fit_eval_model in module __main__:

fit_eval_model(X_train, y_train, X_test, y_test, predictor_vars, model_type)
    This function uses the predictor vars specified by predictor_vars to predict housing price. Function returns RMSE for both train and test data



In [None]:
# fit model using all features/predictors available
RMSE_train, RMSE_test = fit_eval_model(X_train, y_train, X_test, y_test, labels, 'unregularized')

# of predictor vars = 213
# of train observations = 978
# of test observations = 482
# model coefs = 214
Train RMSE: 21981.654614715466
Test RMSE: 3562241001.482347
(Test-Train)/Train: 16205418%


### Discuss
Is this a good model? Does this model encounter overfitting?

Flesh this out. How many features, how many observations, how many model coefs

## Regularized regression: ridge, lasso, elastic net


### Ridge and RidgeCV
- Show ridge optimization equation
- Default CV is Leave-One-Out. In this form of CV, all samples in the data except for one are used as the inital training set. The left out sample is used a validation set.
- One alpha value used for entire model; larger alphas give more weight to the penalty/regularization term of the loss function

Edit function below to use multiple regression techniques (add model_type input)





In [None]:
# edit train_linear_model to train ridge models as well
def train_linear_model(X_train, y_train, model_type):
    if model_type == "unregularized":
        reg = LinearRegression().fit(X_train,y_train)
    elif model_type == 'ridge':
        reg = RidgeCV(alphas=[1e-3,1e-2,1e-1,1,10,100,1000], store_cv_values=True).fit(X_train,y_train)
        print(reg.cv_values_.shape) # num_datapoints x num_alphas
        print(np.mean(reg.cv_values_, axis=0))
        print(reg.alpha_)
    else:
        raise ValueError('Unexpected model_type encountered; model_type = ' + model_type)

    # print number of estimated model coefficients. Need to add one to account for y-intercept (not included in reg.coef_ call)
    print('# model coefs = ' + str(len(reg.coef_)+1))

    return reg



In [None]:
# import sklearn's ridge model with built-in cross-validation
from sklearn.linear_model import RidgeCV 

# fit model using multivariate_model_feats and ridge regression
RMSE_train, RMSE_test = fit_eval_model(X_train, y_train, X_test, y_test, labels, 'ridge')

# of predictor vars = 213
# of train observations = 978
# of test observations = 482
(978, 7)
[1.01586692e+09 1.01401918e+09 9.99400573e+08 9.57029390e+08
 9.43452552e+08 1.02279420e+09 1.21826389e+09]
10.0
# model coefs = 214
Train RMSE: 25463.82775189401
Test RMSE: 39003.787373887266
(Test-Train)/Train: 53%


- What is the model's train and test error? How does this compare to the unregularized model we fit using all predictor variables? How does this model compare to the best univariate model we fit?
  - The ridge model does much better (i.e., in terms of Test RMSE) than the unregularized model that uses all predictor vars.
  - Unregularized_all_predictors_testRMSE: 3562241001
  - Unregularized_best_univariate_testRMSE: 48243
  - Regularized_all_predictors_testRMSE: 39004

- What alpha value was selected using RidgeCV? Is it a lower or higher value? What does this value tell you about the model?
  - This model is highly regularized/penalized since it has a large alpha value



### LASSO
- explain why there's a random state param in LASSO but not ridge


In [None]:
# edit train_linear_model to train ridge models as well
def train_linear_model(X_train, y_train, model_type):
    if model_type == "unregularized":
        reg = LinearRegression().fit(X_train,y_train)
    elif model_type == 'ridge':
        reg = RidgeCV(alphas=[1e-3,1e-2,1e-1,1,10,100,1000], store_cv_values=True).fit(X_train,y_train)
        print(reg.cv_values_.shape) # num_datapoints x num_alphas
        print(np.mean(reg.cv_values_, axis=0))
        print(reg.alpha_)
    elif model_type == 'lasso':
        reg = LassoCV(random_state=0, alphas=[1e-3,1e-2,1e-1,1,10,100,1000], max_iter=100000, tol=1e-3).fit(X_train,y_train)
        print(reg.alpha_)
        print(reg.alphas_)

    else:
        raise ValueError('Unexpected model_type encountered; model_type = ' + model_type)

    # print number of estimated model coefficients. Need to add one to account for y-intercept (not included in reg.coef_ call)
    print('# model coefs = ' + str(len(reg.coef_)+1))

    return reg



In [None]:
# import sklearn's lasso model with built-in cross-validation
from sklearn.linear_model import LassoCV 

# fit model using multivariate_model_feats and ridge regression
RMSE_train, RMSE_test = fit_eval_model(X_train, y_train, X_test, y_test, labels, 'lasso')

# of predictor vars = 213
# of train observations = 978
# of test observations = 482


  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,


100.0
[1.e+03 1.e+02 1.e+01 1.e+00 1.e-01 1.e-02 1.e-03]
# model coefs = 214
Train RMSE: 23844.411315377245
Test RMSE: 41216.23320716389
(Test-Train)/Train: 73%


Add elastic net option to function

In [None]:
# edit train_linear_model to train ridge models as well
def train_linear_model(X_train, y_train, model_type):
    if model_type == "unregularized":
        reg = LinearRegression().fit(X_train,y_train)
    elif model_type == 'ridge':
        reg = RidgeCV(alphas=[1e-3,1e-2,1e-1,1,10,100,1000], store_cv_values=True).fit(X_train,y_train)
        print(reg.cv_values_.shape) # num_datapoints x num_alphas
        print(np.mean(reg.cv_values_, axis=0))
        print('alpha:', reg.alpha_)
    elif model_type == 'lasso':
        reg = LassoCV(random_state=0, alphas=[1e-3,1e-2,1e-1,1,10,100,1000], max_iter=100000, tol=1e-3).fit(X_train,y_train)
        print('alpha:', reg.alpha_)
        print('alphas:', reg.alphas_)
    elif model_type == 'elastic':
        reg = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],alphas=[1e-5,1e-4,1e-3,1e-2,1e-1,1,10]).fit(X_train,y_train)
        print('alpha:', reg.alpha_)
        print('l1_ratio:', reg.l1_ratio_)
    else:
        raise ValueError('Unexpected model_type encountered; model_type = ' + model_type)

    # print number of estimated model coefficients. Need to add one to account for y-intercept (not included in reg.coef_ call)
    print('# model coefs = ' + str(len(reg.coef_)+1))

    return reg



In [None]:
from sklearn.linear_model import ElasticNetCV

# fit model using multivariate_model_feats and ridge regression
RMSE_train, RMSE_test = fit_eval_model(X_train, y_train, X_test, y_test, labels, 'elastic')

# of predictor vars = 213
# of train observations = 978
# of test observations = 482


  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  positive,
  po

0.1
0.95
# model coefs = 214
Train RMSE: 24561.74964572943
Test RMSE: 39333.83728843832
(Test-Train)/Train: 60%


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [None]:
# Diabetes dataset

# from sklearn import datasets
# example datasets from sklean: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes
# iris_X, iris_y = datasets.load_iris(return_X_y=True)
# more info on diabetes dataset: https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset
# diabetes = datasets.load_diabetes(return_X_y=False,as_frame=False)
# print(type(diabetes))
# feat_names=diabetes['feature_names']
# print(feat_names)
# data=diabetes['data']
# target=diabetes['target'] # the target is a quantitative measure of disease progression one year after baseline
# print(data.shape)
# print(target.shape)
# print(diabetes_X.shape) # 442 observations, 10 features
# diabetes_y

# California housing dataset

# from sklearn.datasets import fetch_california_housing
# housing = fetch_california_housing()
# # housing
# feat_names=housing['feature_names']
# print(feat_names)
# print(len(feat_names))