In [336]:
from numba import njit, jit

import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from numpy import random
from numpy import linalg
from numpy import matlib
from scipy import sparse
from scipy import stats
from scipy.optimize import minimize
import statsmodels.api as sm
import matplotlib
import itertools as it
from matplotlib import rc


import pandas as pd
# LaTeX rendering for figure text.
# The preamble rcParams are given as single newline-joined strings: newer
# Matplotlib releases no longer accept a list of lines for these keys, while a
# plain string is accepted by older releases as well.
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['text.latex.preamble'] = '\n'.join([
    r'\usepackage{amssymb}',
    r'\usepackage{amsmath}',
    r'\usepackage{xcolor}',
    r'\renewcommand*\familydefault{\sfdefault}'])
matplotlib.rcParams['pgf.texsystem'] = 'pdflatex'
matplotlib.rcParams['pgf.preamble'] = '\n'.join([
    r'\usepackage[utf8x]{inputenc}',
    r'\usepackage{amssymb}',
    r'\usepackage[T1]{fontenc}',
    r'\usepackage{amsmath}',
    r'\usepackage{sansmath}'])

# Render figures inline in the notebook, using the SVG output format.
from IPython.display import set_matplotlib_formats
%matplotlib inline
set_matplotlib_formats('svg')

import warnings

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

### Question 1

#### a)

In [32]:
def lasso_objective(b,data,lmbda):
    """Evaluate the lasso criterion ``sum((y - X b)^2) + lmbda * ||b||_1``.

    Parameters
    ----------
    b : numpy.ndarray
        Coefficient vector, one entry per regressor.
    data : numpy.ndarray
        2-D array whose first column is the response ``y`` and whose
        remaining columns form the regressor matrix ``X``.
    lmbda : float
        Multiplier applied to the L1 norm of ``b``.

    Returns
    -------
    float or None
        The objective value. If the number of regressor columns differs
        from ``b.size`` an error message is printed and ``None`` is returned.
    """
    response = data[:, 0]
    # Regressor count is read off the first row, exactly as the response is
    # read off the first column.
    n_regressors = data[0, 1:].size
    if n_regressors != b.size:
        print("Error: Covariate number is not compatible with given coefficient vector")
        return None

    design = data[:, 1:]
    residuals = response - design @ b
    rss = np.square(residuals).sum()
    penalty = lmbda * np.linalg.norm(b, ord=1)
    return rss + penalty
        


  



#### b)

In [107]:
def lasso_cdg(bstart,data,lmbda,epsilon=1e-06,maxiter=1000,standardized=False):
    """Minimise the lasso objective by cyclic coordinate descent.

    Each sweep visits the coordinates ``0..p-1`` in order; coordinate ``j`` is
    updated by a one-dimensional Nelder-Mead minimisation of
    ``lasso_objective`` with all other coordinates held at their most recent
    values.

    Parameters
    ----------
    bstart : array-like
        Starting coefficient vector (length ``p``). It is not modified.
    data : numpy.ndarray
        2-D array; column 0 is the response, columns 1.. are the regressors.
    lmbda : float
        Penalty multiplier passed through to ``lasso_objective``.
    epsilon : float, optional
        Convergence tolerance on the sup-norm change of the coefficient
        vector over one full sweep.
    maxiter : int, optional
        Maximum number of sweeps.
    standardized : bool, optional
        Accepted for interface compatibility.
        NOTE(review): the previous implementation computed a column-normalised
        copy of the regressors when this flag was False but never used it, so
        the flag has never affected the result. The objective is evaluated on
        ``data`` exactly as passed; confirm whether rescaling was intended.

    Returns
    -------
    dict or None
        Keys ``"estimate"`` (coefficients after the last sweep),
        ``"objectives"`` (objective value at that estimate), ``"steps"``
        (sup-norm change over the last sweep) and ``"status"``
        (``"convergence"`` or ``"maxiter exceed"``). ``None`` is returned,
        after printing a message, when ``bstart`` does not match the number
        of regressor columns.
    """
    beta = np.array(bstart, dtype=float).ravel()
    p = data.shape[1] - 1
    if p != beta.size:
        print("Error: Covariate number is not compatible with given coefficient vector")
        return

    output = {"estimate": [], "objectives": [], "steps": [], "status": []}
    for _ in range(maxiter):
        previous = beta.copy()
        for j in range(p):
            # One-dimensional objective in coordinate j, all others fixed at
            # their current values.
            obj = lambda x: lasso_objective(np.hstack((beta[:j], x, beta[j+1:])), data, lmbda)
            res = minimize(obj, x0=beta[j], method='Nelder-Mead')
            beta[j] = np.ravel(res.x)[0]

        output["estimate"] = beta.copy()
        output["objectives"] = lasso_objective(beta, data, lmbda)
        # Convergence is judged on a *full* sweep. Checking after each single
        # coordinate would stop as soon as one coordinate happened not to
        # move, before the remaining coordinates had been updated.
        output["steps"] = np.linalg.norm(previous - beta, ord=np.inf)
        if output["steps"] < epsilon:
            output["status"] = "convergence"
            return output

    output["status"] = "maxiter exceed"
    return output

                
                
            

#### c)

In [None]:
def lasso_cdg_active(bstart,data,lmbda,epsilon=1e-06,maxiter=1000,standardized=False,cycle_len=10):
    """Coordinate descent for the lasso with active-set sweeps.

    Every ``cycle_len``-th sweep (starting with the first) visits all
    coordinates; the sweeps in between only visit the coordinates whose
    current estimate is non-zero. Each coordinate is updated by a
    one-dimensional Nelder-Mead minimisation of ``lasso_objective`` with the
    other coordinates held at their most recent values.

    Parameters
    ----------
    bstart : array-like
        Starting coefficient vector (length ``p``). It is not modified.
    data : numpy.ndarray
        2-D array; column 0 is the response, columns 1.. are the regressors.
    lmbda : float
        Penalty multiplier passed through to ``lasso_objective``.
    epsilon : float, optional
        Convergence tolerance on the sup-norm change over one sweep.
    maxiter : int, optional
        Maximum number of sweeps (full and active-set sweeps both count).
    standardized : bool, optional
        Accepted for interface compatibility.
        NOTE(review): the previous implementation computed a column-normalised
        copy of the regressors when this flag was False but never used it;
        the objective is evaluated on ``data`` exactly as passed.
    cycle_len : int, optional
        Period of the full sweeps. A default is supplied because a parameter
        without a default cannot follow parameters with defaults.

    Returns
    -------
    dict or None
        Keys ``"estimate"``, ``"objectives"``, ``"steps"`` and ``"status"``
        (``"convergence"`` or ``"maxiter exceed"``). ``None`` is returned,
        after printing a message, when ``bstart`` does not match the number
        of regressor columns.

    Raises
    ------
    ValueError
        If ``cycle_len`` is smaller than 1.
    """
    if cycle_len < 1:
        raise ValueError("cycle_len must be a positive integer")

    beta = np.array(bstart, dtype=float).ravel()
    p = data.shape[1] - 1
    if p != beta.size:
        print("Error: Covariate number is not compatible with given coefficient vector")
        return

    output = {"estimate": [], "objectives": [], "steps": [], "status": []}
    for i in range(maxiter):
        if i % cycle_len == 0:
            coordinates = range(p)
        else:
            # Active set: coordinates that are currently non-zero.
            coordinates = np.nonzero(beta)[0].tolist()

        previous = beta.copy()
        for j in coordinates:
            obj = lambda x: lasso_objective(np.hstack((beta[:j], x, beta[j+1:])), data, lmbda)
            res = minimize(obj, x0=beta[j], method='Nelder-Mead')
            # Write the update back immediately so later coordinates in this
            # sweep see it; otherwise only the last coordinate's update would
            # survive the sweep.
            beta[j] = np.ravel(res.x)[0]

        output["estimate"] = beta.copy()
        output["objectives"] = lasso_objective(beta, data, lmbda)
        output["steps"] = np.linalg.norm(previous - beta, ord=np.inf)
        # NOTE(review): as before, convergence is accepted after any sweep,
        # including an active-set sweep that did not revisit zero coordinates.
        if output["steps"] < epsilon:
            output["status"] = "convergence"
            return output

    output["status"] = "maxiter exceed"
    return output

### Question 3

In [138]:
def sim_generateData(N, rho): 
    """Simulate one sample of size ``N`` from the two-regressor design.

    ``X1`` is standard normal, ``X2 = rho*X1 + sqrt(1-rho^2)*noise`` and
    ``Y = X1 + X2/sqrt(N) + noise``, with all noise terms drawn from
    NumPy's global random state.

    Returns a DataFrame with columns ``Y``, ``X1``, ``X2`` and index ``0..N-1``.
    """
    # The three draws happen in this fixed order (X1, X2 noise, Y noise) so
    # that a seeded run reproduces the same sample.
    x1 = np.random.normal(size=N)
    x2 = rho * x1 + np.sqrt(1-rho**2) * np.random.normal(size=N)
    y = x1 + (1/np.sqrt(N)) * x2 + np.random.normal(size=N)

    names = ['Y', 'X1', 'X2']
    sample = dict(zip(names, (y, x1, x2)))
    return pd.DataFrame(sample, columns=names, index=range(N))


#### a)

In [322]:
def pretest_estimateCoefficients(dat, alpha):
    """Pretest estimator of the coefficient on ``X1``.

    The long model (``Y`` on ``X1`` and ``X2``, no intercept) is fitted first.
    If the absolute ``X2`` coefficient exceeds the threshold ``N**(-1/4)`` the
    long-model results for ``X1`` are reported; otherwise the short model
    (``Y`` on ``X1`` only) is fitted and reported.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain the columns ``Y``, ``X1`` and ``X2``.
    alpha : float
        Significance level for the confidence interval.

    Returns
    -------
    dict
        ``"coefficient"``: estimate for ``X1``; ``"lb"``/``"ub"``: bounds of
        its ``1 - alpha`` confidence interval; ``"test"``: ``True`` when the
        short model was selected, ``False`` otherwise.
    """
    # read dimensions
    N = dat.shape[0]
    dn = 1/(N**(1/4))

    long_fit = sm.OLS(dat['Y'], dat[['X1', 'X2']]).fit()
    # Coefficients are looked up by label: positional [0]/[1] on a
    # label-indexed Series is deprecated in pandas and silently depends on
    # the column order of `dat`.
    keep_long = np.abs(long_fit.params['X2']) > dn
    fit = long_fit if keep_long else sm.OLS(dat['Y'], dat[['X1']]).fit()

    # conf_int returns one row per regressor, column 0 = lower, 1 = upper.
    interval = fit.conf_int(alpha=alpha)
    return {
        'coefficient': fit.params['X1'],
        'lb': interval.loc['X1', 0],
        'ub': interval.loc['X1', 1],
        'test': not keep_long,
    }

  



#### b)

In [326]:
def pretest_simulateCoefficients(N, rho, alpha, S):
    """Run ``S`` Monte Carlo replications of the pretest estimator.

    Each replication draws a fresh sample with ``sim_generateData(N, rho)``
    and applies ``pretest_estimateCoefficients`` at level ``alpha``.

    Returns a DataFrame with one row per replication (index ``0..S-1``) and
    the columns ``coefficient``, ``lb``, ``ub``, ``test``.
    """
    # Collect the per-replication dicts and build the frame once; growing a
    # DataFrame with pd.concat inside the loop is quadratic in S and starts
    # from an empty object-dtype frame.
    records = [
        pretest_estimateCoefficients(sim_generateData(N, rho), alpha)
        for _ in range(S)
    ]
    return pd.DataFrame(records, columns=["coefficient", "lb", "ub", "test"])

# set seed (`random` is numpy.random, imported at the top of the notebook)
random.seed(100)
# specify model
# list of N values
N_array = np.array((100, 200, 400, 700, 1000))
# correlation between X's
rho = 0.9
# significance level
alpha = 0.05
# number of Monte Carlo replications
S = 1000

# perform simulation: one summary row per sample size, assembled into a
# DataFrame once at the end instead of concatenating inside the loop
summary_rows = []
for n_obs in N_array:

    # simulate pretest estimators
    results = pretest_simulateCoefficients(N=n_obs, rho=rho, alpha=alpha, S=S)
    # check if each confidence interval contains the true value (beta_1 = 1)
    includeTrueValue = (results['lb'] <= 1) & (1 <= results['ub'])
    # store the result
    summary_rows.append({
        "N": n_obs,
        "coverageProb": includeTrueValue.mean(),
        "shortModelSelectionProb": results['test'].mean(),
    })

result = pd.DataFrame(summary_rows, columns=["N", "coverageProb", "shortModelSelectionProb"])
print(result)



      N  coverageProb  shortModelSelectionProb
0   100         0.845                    0.798
1   200         0.835                    0.861
2   400         0.853                    0.932
3   700         0.846                    0.963
4  1000         0.845                    0.983


### Question 4

In [397]:
def sim_generateData2(N, p, rho):
    """Simulate one sample of size ``N`` with ``p`` regressors.

    ``X1`` is standard normal. Each of the ``p-1`` additional regressors is
    ``rho/(p-1) * X1`` plus independent normal noise with standard deviation
    ``sqrt(0.5*(1-rho^2)/(p-1))``. The response is
    ``Y = X1 + sum_j X_j / sqrt(N) + sqrt(0.5*(1-rho^2)) * noise``, where the
    sum runs over the additional regressors of the *same observation*.

    Parameters
    ----------
    N : int
        Number of observations.
    p : int
        Total number of regressors (``X1`` plus ``p-1`` additional ones).
    rho : float
        Correlation parameter of the design.

    Returns
    -------
    pandas.DataFrame
        Columns ``Y``, ``X1``, ``X2``, ..., ``Xp``; index ``0..N-1``.

    Raises
    ------
    ValueError
        If ``p < 2`` (the design divides by ``p-1``).
    """
    if p < 2:
        raise ValueError("p must be at least 2 (X1 plus at least one additional regressor)")

    # generate X (draw order X1, regressor noise, response noise is fixed so
    # that seeded runs are reproducible)
    X1 = np.random.normal(size=N)
    Xp = (rho / (p-1)) * X1[:, None] + np.sqrt(0.5 * (1-rho**2) / (p-1)) * np.random.normal(size=(N, p-1))

    # generate Y: sum the additional regressors row by row. A bare .sum()
    # collapses the whole matrix to one scalar, which would add the same
    # constant to every observation.
    Y = X1 + ((1/np.sqrt(N)) * Xp).sum(axis=1) + np.sqrt(0.5 * (1-rho**2)) * np.random.normal(size=N)

    # bind Y and X
    extra_names = [f"X{j}" for j in range(2, p+1)]
    dat = pd.concat(
        [pd.DataFrame({'Y': Y, 'X1': X1}), pd.DataFrame(Xp, columns=extra_names)],
        axis=1,
    )
    return dat


#### a)

In [None]:
def doubleLasso_estimateCoefficients_fixedLambda(dat, lmbda, alpha):
    """Placeholder for the double-lasso estimator with fixed penalties.

    Only the sample size ``N`` (rows of ``dat``) and ``p`` (columns of
    ``dat`` minus one) are read; the estimation step is not implemented, so
    the function currently returns ``None`` and ``lmbda`` / ``alpha`` are
    unused.

    NOTE(review): the simulation code in this notebook builds a DataFrame from
    the return value and reads the columns ``coefficient``, ``lb``, ``ub`` and
    ``nreg`` - presumably this should return a dict with those keys; confirm
    when filling in the body.
    """
  
    # read dimensions
    N = dat.shape[0]
    p = dat.shape[1] - 1
  
    ##### FILL IN THE REST #####
  


#### b)

In [None]:
def doubleLasso_simulateCoefficients_fixedLambda(N, p, rho, lmbda, alpha, S):
    """Run ``S`` Monte Carlo replications of the fixed-penalty double lasso.

    Each replication draws a fresh sample with ``sim_generateData2(N, p, rho)``
    and applies ``doubleLasso_estimateCoefficients_fixedLambda`` with the
    given ``lmbda`` and ``alpha``.

    Returns a DataFrame with one row per replication (index ``0..S-1``) and
    the columns ``coefficient``, ``lb``, ``ub``, ``nreg``.
    """
    # perform Monte Carlo simulation; the per-replication results are
    # collected in a list and turned into a DataFrame once, rather than
    # growing a frame with pd.concat on every iteration
    records = [
        doubleLasso_estimateCoefficients_fixedLambda(sim_generateData2(N, p, rho), lmbda, alpha)
        for _ in range(S)
    ]
    return pd.DataFrame(records, columns=["coefficient", "lb", "ub", "nreg"])



# set seed (NumPy's global generator is the one the simulators draw from)
np.random.seed(100)

# specify model
# list of N values
N_array = np.array((100, 200, 400, 700, 1000))
# number of regressors
p = 3
# correlation between X's
rho = 0.9
# lasso penalty multipliers
lmbda = np.array((0.7, 0.7))
# significance level
alpha = 0.05
# number of Monte Carlo replications
S = 1000

# perform simulation: one summary row per sample size, assembled into a
# DataFrame once at the end
summary_rows = []
for n_obs in N_array:

    # simulate double-lasso estimators
    results = doubleLasso_simulateCoefficients_fixedLambda(N=n_obs, p=p, rho=rho, lmbda=lmbda, alpha=alpha, S=S)
    # check if each confidence interval contains the true value (beta_1 = 1)
    includeTrueValue = (results['lb'] <= 1) & (1 <= results['ub'])
    # store the result under the same column names the summary table declares
    summary_rows.append({
        "N": n_obs,
        "coverageProb": includeTrueValue.mean(),
        "meanAdditionalRegressors": results['nreg'].mean(),
    })

result = pd.DataFrame(summary_rows, columns=["N", "coverageProb", "meanAdditionalRegressors"])
print(result)


#### c)

In [None]:
def doubleLasso_estimateCoefficients_cvLambda(dat, alpha):
    """Placeholder for the double-lasso estimator with cross-validated penalty.

    Only the sample size ``N`` (rows of ``dat``) and ``p`` (columns of
    ``dat`` minus one) are read; the estimation step is not implemented, so
    the function currently returns ``None`` and ``alpha`` is unused.

    NOTE(review): the simulation code in this notebook builds a DataFrame from
    the return value and reads the columns ``coefficient``, ``lb``, ``ub`` and
    ``nreg`` - presumably this should return a dict with those keys; confirm
    when filling in the body.
    """
 
    # read dimensions
    N = dat.shape[0]
    p = dat.shape[1] - 1
  
  ##### FILL IN THE REST #####
  


#### d)

In [None]:
def doubleLasso_simulateCoefficients_cvLambda(N, p, rho, alpha, S):
    """Run ``S`` Monte Carlo replications of the cross-validated double lasso.

    Each replication draws a fresh sample with ``sim_generateData2(N, p, rho)``
    and applies ``doubleLasso_estimateCoefficients_cvLambda`` at level
    ``alpha``.

    Returns a DataFrame with one row per replication (index ``0..S-1``) and
    the columns ``coefficient``, ``lb``, ``ub``, ``nreg``.
    """
    # perform Monte Carlo simulation. The estimator takes (dat, alpha) only:
    # no penalty argument is passed, since the cross-validated variant has no
    # `lmbda` parameter and this function does not receive one either.
    records = [
        doubleLasso_estimateCoefficients_cvLambda(sim_generateData2(N, p, rho), alpha)
        for _ in range(S)
    ]
    return pd.DataFrame(records, columns=["coefficient", "lb", "ub", "nreg"])



# set seed (NumPy's global generator is the one the simulators draw from)
np.random.seed(100)

# specify model
# list of N values
N_array = np.array((100, 200, 400, 700, 1000))
# number of regressors
p = 3
# correlation between X's
rho = 0.9
# significance level
alpha = 0.05
# number of Monte Carlo replications
S = 1000

# perform simulation: one summary row per sample size, assembled into a
# DataFrame once at the end
summary_rows = []
for n_obs in N_array:

    # simulate double-lasso estimators
    results = doubleLasso_simulateCoefficients_cvLambda(N=n_obs, p=p, rho=rho, alpha=alpha, S=S)
    # check if each confidence interval contains the true value (beta_1 = 1)
    includeTrueValue = (results['lb'] <= 1) & (1 <= results['ub'])
    # store the result under the same column names the summary table declares
    summary_rows.append({
        "N": n_obs,
        "coverageProb": includeTrueValue.mean(),
        "meanAdditionalRegressors": results['nreg'].mean(),
    })

result = pd.DataFrame(summary_rows, columns=["N", "coverageProb", "meanAdditionalRegressors"])
print(result)
