In [1]:
from numba import njit, jit

import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from numpy import random
from numpy import linalg
from numpy import matlib
from scipy import sparse
from scipy import stats
from scipy.optimize import minimize
import statsmodels.api as sm
import matplotlib
import itertools as it
from matplotlib import rc


import pandas as pd
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['text.latex.preamble'] = [
    r'\usepackage{amssymb}',
    r'\usepackage{amsmath}',
    r'\usepackage{xcolor}',
    r'\renewcommand*\familydefault{\sfdefault}']
matplotlib.rcParams['pgf.texsystem'] = 'pdflatex'
matplotlib.rcParams['pgf.preamble']  = [
    r'\usepackage[utf8x]{inputenc}',
    r'\usepackage{amssymb}',
    r'\usepackage[T1]{fontenc}',
    r'\usepackage{amsmath}',
    r'\usepackage{sansmath}']

from IPython.display import set_matplotlib_formats
%matplotlib inline
set_matplotlib_formats('svg')

import warnings

warnings.filterwarnings('ignore')

### Question 1

#### a)

In [2]:
def lasso_objective(b, y, X, lmbda):
    
    """
        Function that accepts the guess for the parameter vector and LASSO penalty multipler λ, 
         and computes the LASSO objective function based on the input data.
        :param b: Parameter vector.
        :param y: Outcome variable, vector of size N.
        :param X: Covariate variables (may or may not include ι), matrix of size N x P.
        :param lmbda: LASSO penalty.        
        :return: Objective function evaluated using inputs.
    """

    # Return the objective function if matrix multiplication Xβ is compatible.
    try:
        obj = np.square(y - X @ b).sum() + lmbda * norm(b, ord=1)
        return obj
    except:
        print("Error: The number of covariates is not compatible with given coefficient vector.")
        return np.inf       



#### b)

In [17]:
def dualsol(bj, lmbda):
    """
        Function that returns the solution for a single coordinate in the Cyclic
         Coordinate Descent algorithm given the OLS coordinate estimate and the
         LASSO penalty multipler.
        :param bj: OLS estimate for coordinate j.
        :param lmbda: LASSO penalty multiplier. 
        :return: LASSO coordinate estimate.
    """
   
    if bj < - lmbda:
        return (bj + lmbda)
    elif rho >  lamda:
        return (bj - lmbda)
    else: 
        return 0


def lasso_cdg(bstart, y, X, lmbda, eps=1e-6, maxiter=1000, standardized=False):
    
    """
        Function that performs the LASSO estimation through the Cyclic Coordinate Descent algorithm.
        :param b: Initial guess for the parameter vector (may or may not include b0, which will be trimmed out if so)
        :param y: Outcome variable, vector of size N.
        :param X: Covariate variables (may or may not include ι), matrix of size N x P.
        :param lmbda: LASSO penalty multiplier.  
        :param eps: Norm stopping criterion.
        :param maxiter: Iteration number stopping criterion.
        :param standardized: Indicator for whether the data has been standardized.
        :return: List containing:
            - :estimate: final coefficient vector estimate, 
            - :objectives: vector containing LASSO objective function values
            - :steps: vector containing norm of difference in estimated parameter vectors
            - :status: string regarding which stopping criterion was used.
    """
    
    p, N, b_guess = bstart.size, y.size, bstart
       
    if N != X.shape[0]:      
        print("Error: Covariate matrix is incompatible with outcome variable.")
        return None
    elif p != X.shape[1]:
        print("Error: Covariate matrix is incompatible with parameter vector.")
        return None
    

        
    # Standardize data if not done so
    if standardized is False:
        X_mean, y_mean = X.mean(axis=0), y.mean()
        X_std, y_std = X.std(axis=0), y.std()
        X, y = zscore(X, axis=0), zscore(y) 
        
    # LASSO objective
    lasso_obj = lambda b : lasso_objective(b, y, X, lmbda)
        
    keyDict = {"estimate", "objectives", "steps", "status"}
    output = dict([(key, []) for key in keyDict])
    
    niter, dist = 1, 1
    
    # While loop to perform LASSO minimization using two stopping criterion.
    while niter < maxiter and dist > eps:
        
        b_old = b_guess
        
        for j in np.arange(0, p):
            
            # Extract j^{th} covariate vector
            Xj = X[:,j].reshape(-1,1)
            
            # Compute OLS solution for β_j taking β_{-j} as given
            bj = X_j.T @ (y - X @ b_guess + b_guess[j] * Xj)
            
            # Update guess for j^{th} coordinate using LASSO closed form solution under CDG
            b_guess[j] = dualsol(bj, lmbda) 
            
        b0 = y_mean - np.dot(X_mean, b_guess)            
        
        output["estimate"].append(np.array([b0, b_guess]))
        output["objectives"].append(lasso_obj(b_guess))
        output["steps"].append(norm(b_old - b_guess, ord=np.inf))
        
        if norm(b_old - b_guess, ord=np.inf) < eps:
            output["status"] = "convergence"             
            return output  
                               
    output["status"] = "maxiter exceed"    
    
    return output
      

#### c)

In [None]:
def lasso_cdg_active(bstart,data,lmbda,epsilon=1e-06,maxiter=1000,standardized=False,cycle_len):
    p_coeff = bstart.size
    y = data[:,0]
    X = data[:,np.arange(1,data[0,:].size)]
    N,p = X.shape
    if p!=p_coeff:
        print("Error: Covariate number is not compatible with given coefficient vector")
        return
    else:
        if standardized==False:
            X = X / (np.linalg.norm(X,axis = 0))
        keyDict = {"estimate","objectives","steps","status"}
        output = dict([(key, []) for key in keyDict])
        for i in range(maxiter):
            if i%cycle_len==0:
                for j in range(p):
                    obj = lambda x : lasso_objective(np.hstack((bstart[0:j],x,bstart[j+1:p])),data,lmbda)
                    res = minimize(obj, x0=bstart[j], method='Nelder-Mead')
                    output["estimate"] = np.hstack((bstart[0:j],res.x,bstart[j+1:p]))
                    output["objectives"] = res.fun
                    output["steps"] = np.linalg.norm(bstart-output["estimate"],ord=np.inf)
            else:
                for j in np.nonzero(output["estimate"])[0].tolist():
                    obj = lambda x : lasso_objective(np.hstack((bstart[0:j],x,bstart[j+1:p])),data,lmbda)
                    res = minimize(obj, x0=bstart[j], method='Nelder-Mead')
                    output["estimate"] = np.hstack((bstart[0:j],res.x,bstart[j+1:p]))
                    output["objectives"] = res.fun
                    output["steps"] = np.linalg.norm(bstart-output["estimate"],ord=np.inf)
            if output["steps"]<epsilon:
                output["status"] = "convergence"
                return output                             
            bstart = output["estimate"]                            
        output["status"] = "maxiter exceed"        
        return output  

### Question 3

In [5]:
def sim_generateData(N, rho): 
  # generate X
    X1 = np.random.normal(size=N)
    X2 = rho * X1 + np.sqrt(1-rho**2) * np.random.normal(size=N)
  
  # generate Y
    Y = X1 + (1/np.sqrt(N)) * X2 + np.random.normal(size=N)
    data = {'Y': Y, 'X1': X1, 'X2': X2}
    df = pd.DataFrame (data, columns = ['Y','X1','X2'],index=range(N))

  # return dataset
    return(df)


#### a)

In [6]:
def pretest_estimateCoefficients(dat, alpha):
    # read dimensions
    N = dat.shape[0]
    dn= 1/(N**(1/4))
    X=dat.loc[:, dat.columns.isin(['X1','X2'])]
    mod = sm.OLS(dat.Y, X)
    res = mod.fit()  
    keyDict = {"coefficient","lb","ub","test"}
    output = dict([(key, []) for key in keyDict])
    if np.abs(res.params[1])>dn:
        output['coefficient']=res.params[0]
        output['lb']=res.conf_int(alpha=alpha, cols=None)[0][0]
        output['ub']=res.conf_int(alpha=alpha, cols=None)[1][0]
        output['test']= False
    else:
        X=dat.X1
        mod = sm.OLS(dat.Y, X)
        res = mod.fit()  
        output['coefficient']=res.params[0]
        output['lb']=res.conf_int(alpha=alpha, cols=None)[0][0]
        output['ub']=res.conf_int(alpha=alpha, cols=None)[1][0]
        output['test']= True    
    return output 

  



#### b)

In [7]:
def pretest_simulateCoefficients(N, rho, alpha, S):
    results = pd.DataFrame(columns=("coefficient","lb","ub","test"))
    # perform Monte Carlo simulation
    for k in range(S): 
        dat = sim_generateData(N, rho)
        results=pd.concat([results,pd.DataFrame(pretest_estimateCoefficients(dat, alpha),index=[k])])

    return(results)

# set seed
random.seed(100)
# specify model
# list of N values
N_array = np.array((100, 200, 400, 700, 1000))
# correlation between X's
rho = 0.9
# significance level
alpha = 0.05
# number of Monte Carlo replications
S = 1000

result = pd.DataFrame(columns=("N","coverageProb","shortModelSelectionProb"))
k=0
# perform simulation
for i in N_array:

    # simulate pretest estimators
    results = pretest_simulateCoefficients(N=i, rho=rho, alpha=alpha, S=S)
    # check if each confidence interval contains the true value
    includeTrueValue = (results['lb'] <= 1) * (1 <= results['ub'])
    # return the result
    result=pd.concat([result,pd.DataFrame({"N":i,"coverageProb":includeTrueValue.mean(),"shortModelSelectionProb":results['test'].mean()},index=[k])])
    k=k+1
print(result)



      N  coverageProb  shortModelSelectionProb
0   100         0.845                    0.798
1   200         0.835                    0.861
2   400         0.853                    0.932
3   700         0.846                    0.963
4  1000         0.845                    0.983


### Question 4

In [8]:
def sim_generateData2(N, p, rho):
    # generate X
    X1 = np.random.normal(size=N)
    Xp = (rho / (p-1)) * np.transpose([X1]*(p-1))+ np.sqrt(0.5 * (1-rho**2) / (p-1)) * np.random.normal(size=(N,p-1))
    # generate Y
    # bind Y and X
    Y = X1 + ((1/np.sqrt(N)) * Xp).sum() + np.sqrt(0.5 * (1-rho**2)) * np.random.normal(size=N)
    dat=pd.concat([pd.DataFrame(Y,columns=['Y']),pd.DataFrame(X1,columns=['X1']),pd.DataFrame(Xp,columns=[i + j for i, j in zip(['X']*(p-1),map(str,list(range(2,p+1))))])],axis=1)
    
    return(dat)


#### a)

In [9]:
def doubleLasso_estimateCoefficients_fixedLambda(dat, lmbda, alpha):
  
    # read dimensions
    N = dat.shape[0]
    p = dat.shape[1] - 1
  
    
    lasso1 = lasso_cdg(bstart=np.ones(p-1),y=dat.Y, X=dat[dat.drop(['Y','X1'], axis=1).columns],lmbda=lmbda[0])
    cov1 = dat[dat.drop(['Y','X1'], axis=1).columns].columns[np.nonzero(lasso1["estimate"])[0].tolist()]
    
    lasso2 = lasso_cdg(bstart=np.ones(p-1),y=dat.X1, X=dat[dat.drop(['Y','X1'], axis=1).columns],lmbda=lmbda[1])
    cov2 = dat[dat.drop(['Y','X1'], axis=1).columns].columns[np.nonzero(lasso2["estimate"])[0].tolist()]
    
    X = dat.loc[:, dat.columns.isin(['X1']) | dat.columns.isin(cov1) | dat.columns.isin(cov2)]
    mod = sm.OLS(dat.Y, X)
    res = mod.fit() 
    output['coefficient']=res.params[0]
    output['lb']=res.conf_int(alpha=alpha, cols=None)[0][0]
    output['ub']=res.conf_int(alpha=alpha, cols=None)[1][0]
    output['nreg']=res.params.shape[0]-1
    
    return output
    


#### b)

In [12]:
def doubleLasso_simulateCoefficients_fixedLambda(N, p, rho, lmbda, alpha, S):
  
    # prepare storage space
    results = pd.DataFrame(columns=("coefficient","lb","ub","nreg"))

  
    # perform Monte Carlo simulation
    for k in range(S):
        dat = sim_generateData2(N, p, rho)
        results=pd.concat([results,pd.DataFrame(doubleLasso_estimateCoefficients_fixedLambda(dat, lmbda, alpha),index=[k])])

  
    return(results)



# set seed
random.seed(100)

# specify model
# list of N values
N_array = np.array((100, 200, 400, 700, 1000))
# number of regressors
p = 3
# correlation between X's
rho = 0.9
# lasso penalty multipliers
lmbda = np.array((0.7, 0.7))
# significance level
alpha = 0.05
# number of Monte Carlo replications
S = 1000

result = pd.DataFrame(columns=("N","coverageProb","meanAdditionalRegressors"))
k=0
# perform simulation
for i in N_array:

    # simulate pretest estimators
    results = doubleLasso_simulateCoefficients_fixedLambda(N=i, p=p, rho=rho, lmbda=lmbda, alpha=alpha, S=S)
    # check if each confidence interval contains the true value
    includeTrueValue = (results['lb'] <= 1) * (1 <= results['ub'])
    # return the result
    result=pd.concat([result,pd.DataFrame({"N":i,"coverageProb":includeTrueValue.mean(),"shortModelSelectionProb":results['nreg'].mean()},index=[k])])
    k=k+1
print(result)


TypeError: lasso_cdg() missing 1 required positional argument: 'lmbda'

#### c)

In [None]:
def doubleLasso_estimateCoefficients_cvLambda(dat, alpha):
 
    # read dimensions
    N = dat.shape[0]
    p = dat.shape[1] - 1
  
    lasso1 = lasso_cdg_kfold(np.ones(p-1),dat[dat.drop(['X1'], axis=1).columns])
    cov1 = dat[dat.drop(['X1'], axis=1).columns].columns[np.nonzero(lasso1["estimate"])[0].tolist()]
    
    lasso2 = lasso_cdg_kfold(np.ones(p-1),dat[dat.drop(['Y'], axis=1).columns])
    cov2 = dat[dat.drop(['Y'], axis=1).columns].columns[np.nonzero(lasso2["estimate"])[0].tolist()]
    
    X = dat.loc[:, dat.columns.isin(['X1']) | dat.columns.isin(cov1) | dat.columns.isin(cov2)]
    mod = sm.OLS(dat.Y, X)
    res = mod.fit() 
    output['coefficient']=res.params[0]
    output['lb']=res.conf_int(alpha=alpha, cols=None)[0][0]
    output['ub']=res.conf_int(alpha=alpha, cols=None)[1][0]
    output['nreg']=res.params.shape[0]-1
    
    return output
  


#### d)

In [None]:
def doubleLasso_simulateCoefficients_cvLambda(N, p, rho, alpha, S):
  
    # prepare storage space
    results = pd.DataFrame(columns=("coefficient","lb","ub","nreg"))

  
    # perform Monte Carlo simulation
    for k in range(S):
        dat = sim_generateData2(N, p, rho)
        results=pd.concat([results,pd.DataFrame(doubleLasso_estimateCoefficients_cvLambda(dat, lmbda, alpha),index=[k])])

  
    return(results)



# set seed
random.seed(100)

# specify model
# list of N values
N_array = np.array((100, 200, 400, 700, 1000))
# number of regressors
 p = 3
# correlation between X's
rho = 0.9
# significance level
alpha = 0.05
# number of Monte Carlo replications
S = 1000

result = pd.DataFrame(columns=("N","coverageProb","meanAdditionalRegressors"))
k=0
# perform simulation
for i in N_array:

    # simulate pretest estimators
    results = doubleLasso_simulateCoefficients_cvLambda(N=i, p=p, rho=rho, alpha=alpha, S=S)
    # check if each confidence interval contains the true value
    includeTrueValue = (results['lb'] <= 1) * (1 <= results['ub'])
    # return the result
    result=pd.concat([result,pd.DataFrame({"N":i,"coverageProb":includeTrueValue.mean(),"shortModelSelectionProb":results['nreg'].mean()},index=[k])])
    k=k+1
print(result)
