In [33]:
import numpy as np
import pandas as pd
import random
import sklearn.linear_model

# Problem 2

## Part (b)

### Generate Data

In [22]:
n, ps, rhos = 100, [10, 25, 50], [0, 0.25, 0.5]

In [47]:
def generate_data(n, p, rho, signal='sparse'):
    if signal == 'sparse':
        beta = np.array([[2/np.sqrt(n) * (j <= np.sqrt(p)) for j in range(1,p+1,1)]]).T
    elif signal == 'dense':
        beta = np.array([[5/(j*np.sqrt(n)) for j in range(1,p+1,1)]]).T
    else:
        return
    R2 = 0.8
    sigma_rho = np.reshape([rho**(np.abs(i-j)) for i in range(1,p+1,1) for j in range(1,p+1,1)], (-1, p))
    sigma2 = ((1-R2)/R2 * (beta.T) @ sigma_rho @ beta)[0][0]
    sigma = np.sqrt(sigma2)
    epsilon = np.random.normal(0, sigma, n).reshape(n,1)
    X = np.random.multivariate_normal(np.zeros(p), sigma_rho, n)
    Y = (X @ beta + epsilon).flatten()
    return X, Y

In [57]:
# Example
# X, Y = generate_data(100, 10, 0, 'sparse')
# X.shape, Y.shape

In [58]:
# Example
# X, Y = generate_data(100, 10, 0, 'dense')
# X.shape, Y.shape

### LASSO - AIC/BIC/LOO-CV

In [331]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoLarsIC
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LassoCV

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Code source: 
# https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html 
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV
# https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html

def fit_LASSO(X, Y):
    # Tune alpha
    
    ## AIC:
    lasso_lars_ic = make_pipeline(
        StandardScaler(), LassoLarsIC(criterion="aic", normalize=False)
    ).fit(X, Y)
    
    alpha_aic = lasso_lars_ic[-1].alpha_
    
    ## BIC:
    lasso_lars_ic.set_params(lassolarsic__criterion="bic").fit(X, Y)
    #results["BIC criterion"] = lasso_lars_ic[-1].criterion_
    alpha_bic = lasso_lars_ic[-1].alpha_
    
    ## LOO-CV:
    ### LOO-CV is just k-fold CV with k = number of observations (n)
    lasso_model = make_pipeline(StandardScaler(), LassoCV(cv=n, n_alphas=20)).fit(X, Y)
    lasso = lasso_model[-1]
    alpha_cv = lasso.alpha_
    
    # Fit LASSO with tuned alpha:
    
    ## with alpha from AIC:
    clf = Lasso(alpha=alpha_aic)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    mse_aic = mean_squared_error(Y, Y_pred)
    
    ## with alpha from BIC:
    clf = Lasso(alpha=alpha_aic)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    mse_bic = mean_squared_error(Y, Y_pred)
    
    ## with alpha from LOO-CV:
    clf = Lasso(alpha=alpha_cv)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    mse_cv = mean_squared_error(Y, Y_pred)
    
    return [mse_aic, mse_bic, mse_cv]

In [330]:
def fit_model(n, p, rho, signal):
    data_list = [generate_data(n, p, rho, signal) for i in range(1000)]
    mse_ls = np.array([fit_LASSO(data_list[i][0],data_list[i][1]) for i in range(1000)])
    return np.mean(mse_ls, axis = 0)

In [236]:
import time
start_time = time.time()
result_sparse = [fit_model(n, p, rho, 'sparse') for p in ps for rho in rhos]
end_time = time.time()
run_time = end_time - start_time

In [336]:
# Run time in minutes
run_time / 60

37.61602555116018

In [240]:
start_time = time.time()
result_dense = [fit_model(n, p, rho, 'dense') for p in ps for rho in rhos]
end_time = time.time()
run_time2 = end_time - start_time

In [335]:
# Run time in minutes
run_time2 / 60

36.08029566208521

In [308]:
result_df_sparse = pd.DataFrame(result_sparse, columns = ['AIC.MSE', 'BIC.MSE', 'LOOCV.MSE'])
result_df_dense = pd.DataFrame(result_dense, columns = ['AIC.MSE', 'BIC.MSE', 'LOOCV.MSE'])

result_df = pd.concat([result_df_sparse, result_df_dense])
p_val = np.array([[p,rho] for p in ps for rho in rhos]*2)[:,0]
rho_val = np.array([[p,rho] for p in ps for rho in rhos]*2)[:,1]

result_df['p'] = p_val
result_df['rho'] = rho_val
result_df['Estimator'] = ['LASSO']*result_df.shape[0]
result_df['Signal'] = ['Sparse']*(int(result_df.shape[0]/2))+['Dense']*(int(result_df.shape[0]/2))
result_df = result_df[['p','rho','Estimator','Signal','AIC.MSE', 'BIC.MSE', 'LOOCV.MSE']]
result_df.reset_index(inplace=True, drop=True)

In [309]:
result_df

Unnamed: 0,p,rho,Estimator,Signal,AIC.MSE,BIC.MSE,LOOCV.MSE
0,10.0,0.0,LASSO,Sparse,0.02777,0.02777,0.027783
1,10.0,0.25,LASSO,Sparse,0.038466,0.038466,0.038554
2,10.0,0.5,LASSO,Sparse,0.050768,0.050768,0.050914
3,25.0,0.0,LASSO,Sparse,0.042836,0.042836,0.043202
4,25.0,0.25,LASSO,Sparse,0.064751,0.064751,0.065386
5,25.0,0.5,LASSO,Sparse,0.096796,0.096796,0.097836
6,50.0,0.0,LASSO,Sparse,0.05386,0.05386,0.054426
7,50.0,0.25,LASSO,Sparse,0.084834,0.084834,0.086605
8,50.0,0.5,LASSO,Sparse,0.137249,0.137249,0.140839
9,10.0,0.0,LASSO,Dense,0.086454,0.086454,0.086365
