# Notebook 4: Ridge and Lasso Regression
Since we are dealing with a lot of multicollinearity in our world bank data, we expect Ridge and Lasso regression to perform better than the least squares regression from Notebook 2.

In [5]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import tqdm
import glob
import pandas as pd
import sklearn
from src import ana_utils as utils

from sklearn.linear_model import LinearRegression

np.random.seed(7)

Import datasets

In [6]:
wb_data = pd.read_csv("data/wb_data.csv", index_col="Country Name")
wb_data_short = pd.read_csv("data/wb_data_short.csv", index_col="Country Name")
whr_data = pd.read_csv("data/whr_data.csv", index_col="Country name")

## Ridge Regression

In [7]:
ridge = sklearn.linear_model.Ridge()

In [8]:
test_size = 1
# For the full dense indicator data
loss_list, mean_loss, coef_list, avg_coefs = utils.n_fold_ceval(reg_model=ridge, n=1000, data=wb_data, gt=whr_data, test_size=test_size, scaling="normalize")
print("Mean loss (full set of indicators):", mean_loss)
print("The average size of the first ten coefficients ((full set of indicators)):", avg_coefs[:10], "\n")

loss_list, mean_loss, coef_list, avg_coefs = utils.n_fold_ceval(reg_model=ridge, n=1000, data=wb_data, gt=whr_data, test_size=test_size, scaling="normalize")
print("Mean loss (reduced set of indicators):", mean_loss)
print("The average size of the first ten coefficients (reduced set of indicators):", avg_coefs[:10])

Mean loss (full set of indicators): 0.35347341405590865
The average size of the first ten coefficients ((full set of indicators)): [ 0.3332  0.153   0.2141  0.218  -0.0819 -0.1467  0.2313 -0.2651 -0.2095
 -0.743 ] 

Mean loss (reduced set of indicators): 0.3430736714759553
The average size of the first ten coefficients (reduced set of indicators): [ 0.3331  0.1532  0.2142  0.2193 -0.0818 -0.1467  0.2314 -0.2651 -0.2094
 -0.743 ]


## Lasso regression

In [9]:
lasso = sklearn.linear_model.Lasso()

In [12]:
# For the full dense indicator data
loss_list, mean_loss, coef_list, avg_coefs = utils.n_fold_ceval(reg_model=lasso, n=1000, data=wb_data, gt=whr_data, test_size=test_size, scaling="normalize")
print("Mean loss (full set of indicators):", mean_loss)
print("The average size of the first ten coefficients ((full set of indicators)):", avg_coefs, "\n")

loss_list, mean_loss, coef_list, avg_coefs = utils.n_fold_ceval(reg_model=lasso, n=1000, data=wb_data, gt=whr_data, test_size=test_size, scaling="normalize")
print("Mean loss (reduced set of indicators):", mean_loss)
print("The average size of the first ten coefficients (reduced set of indicators):", avg_coefs)

Mean loss (full set of indicators): 1.31077610493978
The average size of the first ten coefficients ((full set of indicators)): 0.0 

Mean loss (reduced set of indicators): 1.2630835249502004
The average size of the first ten coefficients (reduced set of indicators): 0.0


## Optimizing penalty parameter $\alpha$

In [16]:
alphas = [0.01, 0.1, 1, 10]
ridge_cv = sklearn.linear_model.RidgeCV(alphas=alphas)
lasso_cv = sklearn.linear_model.LassoCV(alphas=alphas, normalize=True)

In [17]:
# For the full dense indicator data
loss_list, mean_loss, coef_list, avg_coefs = utils.n_fold_ceval(reg_model=ridge_cv, n=1000, data=wb_data, gt=whr_data, test_size=test_size, scaling="normalize")
#print("Mean loss (full set of indicators):", mean_loss)
#print("The average size of the first ten coefficients ((full set of indicators)):", avg_coefs[:10], "\n")

# For the full dense indicator data
loss_list, mean_loss, coef_list, avg_coefs = utils.n_fold_ceval(reg_model=lasso_cv, n=1000, data=wb_data, gt=whr_data, test_size=test_size, scaling="no_scaling")
#print("Mean loss (full set of indicators):", mean_loss)
#print("The average size of the first ten coefficients ((full set of indicators)):", avg_coefs[:10], "\n")

In [21]:
print("Best alpha for ridge: ", ridge_cv.alpha_)
#print("Best alpha for lasso: ", lasso_cv.alpha_)
mean_loss

Best alpha for ridge:  0.1


0.36599939362370426