In [1]:
# Import relevant packages for splitting data
using LinearAlgebra, GLM, DataFrames, Statistics, Random, Distributions, Tables, TableOperations, StatsBase, FreqTables, DataFrames

In [2]:
# Define a function which turn a list or vector-like object into a proper two
# dimensional column vector

function cvec(a)
    """ Turn a list or vector-like object into a proper column vector
    Input
    a: List or vector-like object, has to be a potential input for np.array()
    Output
    vec: two dimensional NumPy array, with the first dimension weakly greater
         than the second (resulting in a column vector for a vector-like input)
    """
    
    # Conver input into a two dimensional NumPy array
    vec = cat([a], dims = 2) 

    # Check whether the second dimension is strictly greater than the first
    # (remembering Python's zero indexing)
    
    if size(vec)[1] < size(vec)[2]
        # If so, transpose the input vector
        vec = transpose(vec)
    end
   
    # Return the column vector
    return vec

end
    

cvec (generic function with 1 method)

In [3]:
import Statistics.cor
function corre(y, X)
    
    """ Return correlation coefficients between columns of matrices
    Inputs
    y: n by 1 NumPy array
    X: n by k NumPy array
    Outputs
    corr: list of length k, where the k-th element is the correlation
          coefficient between y and the k-th column of X
    """
    # Concatenate y and X into a single NumPy array
    yX = hcat(y, X)
    
    # Get the correlation coefficients between all columns of that array
    corr = cor(yX)
    
    # Get the first row, starting at the first off-diagonal element (these are
    # the correlation coefficients between y and each column of X
    corr = corr[1, :] 
    
    # Return the result
    return corr

end

corre (generic function with 1 method)

In [4]:
function init_values(X, y, number::Int64=5, intercetp::Bool=true)
    """ Return an initial parameter guess for a LASSO model
    Inputs
    y: n by 1 NumPy array, outcome variable
    X: n by k NumPy array, RHS variables
    Outputs
    residuals: n ny 1 NumPy array, residuals for initial parameter guess
    coefficients: k by 1 NumPy array, initial coefficient values
    """
    # Make sure y is a proper column vector
    #y = cvec(y)
    
    # Get the absolute value of correlations between y and X
    corr = broadcast(abs, cor(y, X)[1, :])
    
    # Get the number of columns of X
    kx = size(X)[2]
    
    # Make an index selecting the five columns of X which are most correlated
    # with y (since .argsort() always sorts in increasing order, selecting from
    # the back gets the most highly correlated columns)
    index = sortperm(corr, rev=true)[1: min(number, kx)]
    
    # Set up an array of coefficient guesses
    coefficients = zeros(kx)
    
    # Regress y on the five most correlated columns of X, including an intercept
    # if desired
   reg = lm(X[:, index], y)
    
    # Replace the guesses for the estimated coefficients (note that .coef_ does
    # not return the estimated intercept, if one was included in the model)
    
    coefficients[index] = GLM.coef(reg)
    
    # Replace any NANs as zeros
    replace!(coefficients, NaN=>0)
    
    # Get the regression residuals
    residuals = y - predict(reg, X[:, index])
    
    return residuals, reg, index, coefficients, corr
    return index
    
end


init_values (generic function with 3 methods)

In [5]:
# function LassoShooting_fit( x, y, lmbda, maxIter::Int = 1000, optTol::Float64 = 10^(-5), zeroThreshold::Float64 = 10^(-6),
#                             XX = nothing, Xy = nothing, beta_start = nothing)

In [6]:
# function LassoShooting_fit( x, y, lmbda, control::control, 
#                             XX = nothing, Xy = nothing, beta_start = nothing)

function LassoShooting_fit( x, y, lmbda, maxIter::Int = 1000, optTol::Float64 = 10^(-5), zeroThreshold::Float64 = 10^(-6),
                            XX = nothing, Xy = nothing, beta_start = nothing)
        
     """ Shooting LASSO algorithm with variable dependent penalty weights
    Inputs
    x: n by p NumPy array, RHS variables
    y: n by 1 NumPy array, outcome variable
    lmbda: p by 1 NumPy array, variable dependent penalty terms. The j-th
           element is the penalty term for the j-th RHS variable.
    maxIter: integer, maximum number of shooting LASSO updated
    optTol: scalar, algorithm terminated once the sum of absolute differences
            between the updated and current weights is below optTol
    zeroThreshold: scalar, if any final weights are below zeroThreshold, they
                   will be set to zero instead
    XX: k by k NumPy array, pre-calculated version of x'x
    Xy: k by 1 NumPy array, pre-calculated version of x'y
    beta_start: k by 1 NumPy array, initial weights
    Outputs
    w: k by 1 NumPy array, final weights
    wp: k by m + 1 NumPy array, where m is the number of iterations the
        algorithm took. History of weight updates, starting with the initial
        weights.
    m: integer, number of iterations the algorithm took
    """
    n = size(x)[1]
    p = size(x)[2]
    
    # Check whether XX and Xy were provided, calculate them if not
    if (isnothing(XX))
        XX = x'*x
    end

    if (isnothing(Xy))
        Xy = x'*y
    end

    # Check whether an initial value for the intercept was provided

    if (isnothing(beta_start))
        # If not, use init_values from help_functions, which will return
        # regression estimates for the five variables in x which are most
        # correlated with y, and initialize all other coefficients as zero
        beta = init_values(x, y)[4]

    else
        # Otherwise, use the provided initial weights
        beta = beta_start
    end

    # Set up a history of weights over time, starting with the initial ones
    wp = beta

    # Keep track of the number of iterations
    m = 1

    # Create versions of XX and Xy which are just those matrices times two
    XX2 = XX * 2
    Xy2 = Xy * 2

    #@unpack maxIter, optTol, zeroThreshold = control()

    # Go through all iteration
    while m<maxIter

        # Save the last set of weights (the .copy() is important, otherwise
        # beta_old will be updated every time beta is changed during the
        # following loop)
        beta_old = copy(beta)

        # Go through all parameters
        for j in 1:p
            
            # Calculate the shoot
            S0 = sum( XX2[j, :].*beta ) - XX2[j, j].*beta[j] - Xy2[j]

            # Update the weights
            if sum(isnothing(XX)) >= 1
                beta[j] = 0

            elseif S0 >lmbda[j]
                beta[j] = (lmbda[j] - S0) / XX2[j,j]

            elseif S0 < -lmbda[j]
                beta[j] = (-lmbda[j] - S0) / XX2[j,j]

            elseif broadcast(abs, S0) <= lmbda[j]
                beta[j] = 0

            end
        end

        # Add the updated weights to the history of weights
        wp = hcat(wp, beta)

        # Check whether the weights are within tolerance
        if sum(broadcast(abs, beta - beta_old)) < optTol
            # If so, break the while loop
            break
        end

        # Increase the iteration counter
        m = m + 1
    end

    # Set the final weights to the last updated weights
    w = beta   

    # Set weights which are within zeroThreshold to zero
    w[broadcast(abs, w) .< zeroThreshold] .= 0
    
    #return beta,  w
    return Dict("coefficients" => w, "coef_list" => wp, "num_it" => m)
    return w, wp, m
    #return XX2, Xy2
    

end
        

LassoShooting_fit (generic function with 7 methods)

# Test 1 

In [7]:
# We have to make sure that both variables are the same type (Integers or floats) to avoid errors when running the regression
n = 10
p = Int(n/2)

X = randn(n, p)
beta = randn(p)
lmbda = randn(p)
Y = randn(n)

10-element Vector{Float64}:
 -0.2568680898870918
  0.27995341717389755
  0.5841303002261958
  0.12104219332168611
 -0.24617948897107803
 -0.9064860212129372
 -1.0382922158735532
 -0.7820541793330622
  1.4371870265428164
 -1.330092605439802

In [8]:
Y

10-element Vector{Float64}:
 -0.2568680898870918
  0.27995341717389755
  0.5841303002261958
  0.12104219332168611
 -0.24617948897107803
 -0.9064860212129372
 -1.0382922158735532
 -0.7820541793330622
  1.4371870265428164
 -1.330092605439802

In [9]:
lm(X, Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
─────────────────────────────────────────────────────────────────
         Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
─────────────────────────────────────────────────────────────────
x1  -0.340826     0.199567  -1.71    0.1484  -0.853829   0.172177
x2   0.0278204    0.385424   0.07    0.9453  -0.962944   1.01858
x3   0.149146     0.337878   0.44    0.6773  -0.719397   1.01769
x4   0.283065     0.283483   1.00    0.3639  -0.445653   1.01178
x5   0.32713      0.264828   1.24    0.2716  -0.353631   1.00789
─────────────────────────────────────────────────────────────────


In [10]:
init_values(X, Y)
#

([-0.13399061240200094, 0.7351124116919241, 0.9079991999088363, 0.11287806607115711, 0.11788449416064653, 0.2265934992376083, -0.12817677633495028, -0.46934818556002667, 0.879496174638273, -0.1309568822832714], LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
─────────────────────────────────────────────────────────────────
         Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
─────────────────────────────────────────────────────────────────
x1   0.32713      0.264828   1.24    0.2716  -0.353631   1.00789
x2  -0.340826     0.199567  -1.71    0.1484  -0.853829   0.172177
x3   0.0278204    0.385424   0.07    0.9453  -0.962944   1.01858
x4   0.283065     0.283483   1.00    0.3639  -0.445653   1.01178
x5   0.149146     0.337878   0.44    0.6773  -0.719397   1.01769
─────────────────────────────────────────────────────────────────
, [5, 1, 2, 4, 3], [-0.3408259898803487, 0.02782043029355669, 0.1491463

In [11]:
LassoShooting_fit(X, Y, lmbda)

Dict{String, Any} with 3 entries:
  "coef_list"    => [-0.361246 -0.361246 … -0.35562 -0.35562; -0.056243 -0.0562…
  "num_it"       => 19
  "coefficients" => [-0.35562, -0.211748, 0.065043, 0.242178, 0.17185]

# Test 2

In [12]:
using RData, LinearAlgebra, GLM, DataFrames, Statistics, Random, Distributions, DataStructures, NamedArrays, PrettyTables

In [13]:
# Importing .Rdata file
growth_read = load("../../data/GrowthData.RData")

# Since growth_read is a dictionary, we check if there is a key called "GrowthData", the one we need for our analyze
haskey(growth_read, "GrowthData")
# Now we save that dataframe with a new name
growth = growth_read["GrowthData"]
names(growth)

Y = growth[!, "Outcome"]
X_2 = select(growth, Not(["Outcome"]))
X_2 = convert(Matrix, Matrix(X_2[:, 2:5]))
lmbda = randn(size(X_2)[2])
lm(X_2, Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
──────────────────────────────────────────────────────────────────────
          Coef.  Std. Error      t  Pr(>|t|)    Lower 95%    Upper 95%
──────────────────────────────────────────────────────────────────────
x1   0.00289408  0.00239153   1.21    0.2295  -0.00186013   0.00764829
x2  -0.0572642   0.023258    -2.46    0.0158  -0.1035      -0.0110289
x3   0.132428    0.0744655    1.78    0.0789  -0.015604     0.280461
x4   0.0845833   0.246612     0.34    0.7324  -0.405665     0.574832
──────────────────────────────────────────────────────────────────────


In [14]:
LassoShooting_fit(X_2, Y, lmbda)["coefficients"]

4-element Vector{Float64}:
 -0.004679457919545483
 -0.1293718715001274
  0.557765308642275
 -0.8442619960488519

# Test 3

In [15]:
# using Pkg
# Pkg.add("CSV")
# Pkg.add("DataFrames")
# Pkg.add("Dates")
# Pkg.add("Plots")
using CSV
using DataFrames
using Dates
#using Plots

In [16]:
#Reading the CSV file into a DataFrame
#We have to set the category type for some variable
data = CSV.File("../../data/wage2015_subsample_inference.csv"; types = Dict("occ" => String,"occ2"=> String,"ind"=>String,"ind2"=>String)) |> DataFrame
println("Number of Rows : ", size(data)[1],"\n","Number of Columns : ", size(data)[2],) #rows
[eltype(col) for col = eachcol(data)]
n = size(data)[1]
z = select(data, Not([:rownames, :lwage, :wage]))
p = size(z)[2]
# remember y has to be a vector
y = data[!, "lwage"]

X_3 = convert(Matrix, Matrix(z[:, 1:5]))
Y_3 = y
lambda_3 = [0.1, 0.2, 0.3, 0.4, 0.5]
#lmbda = randn(size(X_3)[2])

Number of Rows : 5150
Number of Columns : 21


5-element Vector{Float64}:
 0.1
 0.2
 0.3
 0.4
 0.5

In [17]:
LassoShooting_fit(X_3, Y_3, lambda_3)["coefficients"]

5-element Vector{Float64}:
 0.8225235638719542
 2.4302250366825917
 2.455801713293631
 2.473843233955627
 2.7338192867069036

# lambdaCalculation 

In [18]:
function lambdaCalculation( x, y, lmbda, maxIter::Int = 1000, optTol::Float64 = 10^(-5), zeroThreshold::Float64 = 10^(-6),
                            XX = nothing, Xy = nothing, beta_start = nothing)

end 

lambdaCalculation (generic function with 7 methods)

In [19]:
function lambdaCalculation(     homoskedastic::Bool=false, X_dependent_lambda::Bool=false,
                                lambda_start=nothing, c::Float64=1.1, gamma::Float64=0.1, 
                                numSim::Int=5000, y=nothing, x=nothing, par::Bool=true, 
                                corecap::Float64=Inf, fix_seed::Bool=true)
end

lambdaCalculation (generic function with 19 methods)

In [20]:
function lambdaCalculation(     homoskedastic::Bool=false, X_dependent_lambda::Bool=false,
                                lambda_start=nothing, c::Float64=1.1, gamma::Float64=0.1, 
                                numSim::Int=5000, y=nothing, x=nothing, par::Bool=true, 
                                corecap::Float64=Inf, fix_seed::Bool=true)
    # Get number of observations n and number of variables p
    n, p = size(X)

    # Get number of simulations to use (if simulations are necessary)
    R = numSim

    # Go through all possible combinations of homoskedasticy/heteroskedasticity
    # and X-dependent or independent error terms. The first two cases are
    # special cases: Handling the case there homoskedastic was set to None, and
    # where lambda_start was provided.
    #

    # 1) If homoskedastic was set to None (special case)
    if (isnothing(homoskedastic))

        # Initialize lambda
            lmbda0 = lambda_start

        Ups0 = (1 /sqrt(n)) * sqrt.((y.^2)'*(x.^2))

        # Calculate the final vector of penalty terms
            lmbda = lmbda0 * Ups0

    # 2) If lambda_start was provided (special case)
    elseif (isnothing(lambda_start)) == 0
            # Check whether a homogeneous penalty term was provided (a scalar)
            if maximum(size(lambda_start)) == 1
                # If so, repeat that p times as the penalty term
                lmbda = ones(p,1).*lambda_start

            else
                # Otherwise, use the provided vector of penalty terms as is
                lmbda = lambda_start
            end

    # 3) Homoskedastic and X-independent
    elseif homoskedastic == true &  X_dependent_lambda == false

            # Initilaize lambda
            lmbda0 = 2 * c * sqrt(n) * quantile(Normal(0.0, 1.0),1 - gamma/(2*p))

            # Use ddof=1(corrected = true in Julia) to be consistent with R's var() function (in Julia by defaul the DDF is N-1)
            Ups0 = sqrt(var(y, corrected = true))

            # Calculate the final vector of penalty terms
            lmbda = zeros(p,1) .+ lmbda0 * Ups0

    # 4) Homoskedastic and X-dependent
    elseif homoskedastic == true & X_dependent_lambda == true

            psi = mean.(eachcol(x.^2))
            tXtpsi = (x' ./ sqrt(psi))'

            R = 5000
            sim = zeros(R,1)

            for l in 1:R
                    g = reshape(repeat(randn(n), inner = p),(p, n))'
                    sim[l] = n * maximum(2*abs.(mean.(eachcol(tXtpsi.* g))))
            end

            # Initialize lambda based on the simulated quantiles
            lambda0 = c*quantile(vec(sim), 1 - gamma)

            Ups0 = sqrt(var(y, corrected = true))

            # Calculate the final vector of penalty terms
            lmbda = zeros(p,1) .+ lmbda0 * Ups0

    # 5) Heteroskedastic and X-independent
    elseif homoskedastic == false &  X_dependent_lambda == false

            # The original includes the comment, "1=num endogenous variables"
            lmbda0 = 2 * c * sqrt(n) * quantile(Normal(0.0, 1.0),1 - gamma/(2*p*1))

            Ups0 = (1 /sqrt(n)) * sqrt.((y.^2)'*(x.^2))
            lmbda = lmbda0 * Ups0

    # 6) Heteroskedastic and X-dependent
    elseif homoskedastic == false &  X_dependent_lambda == true

            eh = y
            ehat = reshape(repeat(eh, inner = p),(p, n))'

            xehat = x.*ehat
            psi = mean.(eachcol(xehat.^2))'
            tXehattpsi = (xehat./sqrt.(psi))

            R = 5000
            sim = zeros(R,1)

            for l in 1:R
                    g = reshape(repeat(randn(n), inner = p),(p, n))'
                    sim[l] = n * maximum(2*abs.(mean.(eachcol(tXtpsi.* g))))
            end

            # Initialize lambda based on the simulated quantiles
            lambda0 = c*quantile(vec(sim), 1 - gamma)

            Ups0 = (1 /sqrt(n)) * sqrt.((y.^2)'*(x.^2))

            lmbda = lmbda0 * Ups0

    return Dict("lambda0" => lmbda0, "lambda" => lmbda, "Ups0" => Ups0) 
    end
end

lambdaCalculation (generic function with 19 methods)

In [169]:
y
eh = y
ehat = reshape(repeat(eh, inner = p),(p, n))'
xehat = x.*ehat
psi = mean.(eachcol(xehat.^2))'
tXehattpsi = (xehat./sqrt.(psi))

10×5 Matrix{Float64}:
  2.41485    0.110211   0.577757    1.03164    -0.669994
  0.272156  -2.09978   -2.37812    -2.44758    -0.251323
  0.516117  -0.839626  -0.0697467  -0.288397   -0.0163569
  0.129216  -0.638389  -1.36361    -0.1055      0.0612663
  0.660068   0.2317     1.35678    -0.893214   -0.916824
  1.49419    0.103511   0.0508912   0.093249    2.70922
 -0.636928  -1.19961   -0.369506    0.56272     0.424792
 -0.459297   0.618394   0.259876    0.0936285  -0.10345
 -0.475957  -0.485315   0.268583   -0.509746    0.0808326
  0.547602  -1.53129    0.163746    1.20787     1.05157

# How to translate Python Class to Julia Function

In [27]:
mutable struct State
    foo::Int
    bar::Float64
end

function dosomething(s::State)
    s.foo + s.bar
end

function dosomethingelse(s::State, n::Int)
    n * s.bar
end


s = State(1, 10)
dosomething(s)  # returns 11
dosomethingelse(s, 10)  # returns 100

100.0

In [28]:
mutable struct State
    
    colnames=nothing
    post::Bool=true
    intercept::Bool=true
    model::Bool=true
    homoskedastic::Bool=false
    X_dependent_lambda::Bool=false
    lambda_start=nothing
    c::Float64=1.1
    gamma=nothing
    numSim::Int=5000
    numIter::Int=15
    numIter::Int=15
    tol::Float64 = 10^(-5)
    threshold::Float64=-Inf
    par::Bool=true
    corecap::Float64=Inf
    fix_seed::Bool=true
    
end





function lambdaCalculation(     homoskedastic::Bool=false, X_dependent_lambda::Bool=false,
                                lambda_start=nothing, c::Float64=1.1, gamma::Float64=0.1, 
                                numSim::Int=5000, y=nothing, x=nothing, par::Bool=true, 
                                corecap::Float64=Inf, fix_seed::Bool=true)

LoadError: syntax: "colnames = nothing" inside type definition is reserved around In[28]:1

In [23]:
    def __init__(self, x, y, colnames=None, post=True, intercept=True,
                 model=True, homoskedastic=False, X_dependent_lambda=False,
                 lambda_start=None, c=1.1, gamma=None, numSim=5000, numIter=15,
                 tol=10**(-5), threshold=-np.inf, par=True, corecap=np.inf,
                 fix_seed=True):

LoadError: syntax: extra token "State" after end of expression

In [154]:
xehat.^2

5×5 Matrix{Float64}:
  9.39943    9.39943    9.39943    9.39943    9.39943
 17.3247    17.3247    17.3247    17.3247    17.3247
  0.211723   0.211723   0.211723   0.211723   0.211723
  1.00107    1.00107    1.00107    1.00107    1.00107
  5.10129    5.10129    5.10129    5.10129    5.10129

In [143]:
xehat

5×5 Matrix{Float64}:
  3.06585    3.06585    3.06585    3.06585    3.06585
 -4.16229   -4.16229   -4.16229   -4.16229   -4.16229
 -0.460134  -0.460134  -0.460134  -0.460134  -0.460134
 -1.00054   -1.00054   -1.00054   -1.00054   -1.00054
  2.2586     2.2586     2.2586     2.2586     2.2586

In [147]:
sqrt.(psi)

1×5 Matrix{Float64}:
 2.57053  2.57053  2.57053  2.57053  2.57053

In [136]:
xehat

5×5 Matrix{Float64}:
  3.06585    3.06585    3.06585    3.06585    3.06585
 -4.16229   -4.16229   -4.16229   -4.16229   -4.16229
 -0.460134  -0.460134  -0.460134  -0.460134  -0.460134
 -1.00054   -1.00054   -1.00054   -1.00054   -1.00054
  2.2586     2.2586     2.2586     2.2586     2.2586

In [135]:
psi

1×5 adjoint(::Vector{Float64}) with eltype Float64:
 -0.0597025  -0.0597025  -0.0597025  -0.0597025  -0.0597025

In [134]:
sqrt(psi)

LoadError: MethodError: no method matching sqrt(::Adjoint{Float64, Vector{Float64}})
[0mClosest candidates are:
[0m  sqrt([91m::Union{Float32, Float64}[39m) at C:\Users\Alexander\AppData\Local\Programs\Julia-1.7.2\share\julia\base\math.jl:566
[0m  sqrt([91m::StridedMatrix{T}[39m) where T<:Union{Real, Complex} at C:\Users\Alexander\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\dense.jl:836
[0m  sqrt([91m::Diagonal[39m) at C:\Users\Alexander\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\diagonal.jl:592
[0m  ...

In [128]:
psi = mean.(eachcol(xehat))'

1×5 adjoint(::Vector{Float64}) with eltype Float64:
 -0.0597025  -0.0597025  -0.0597025  -0.0597025  -0.0597025

In [116]:
y*ones(1, p)

10×5 Matrix{Float64}:
  0.638277   0.638277   0.638277   0.638277   0.638277
 -0.805969  -0.805969  -0.805969  -0.805969  -0.805969
 -0.399263  -0.399263  -0.399263  -0.399263  -0.399263
  0.203019   0.203019   0.203019   0.203019   0.203019
  0.46159    0.46159    0.46159    0.46159    0.46159
  1.1679     1.1679     1.1679     1.1679     1.1679
  0.561702   0.561702   0.561702   0.561702   0.561702
  0.737444   0.737444   0.737444   0.737444   0.737444
  0.224127   0.224127   0.224127   0.224127   0.224127
  0.584296   0.584296   0.584296   0.584296   0.584296

In [112]:
y.*ones(p,1)

LoadError: DimensionMismatch("arrays could not be broadcast to a common size; got a dimension with lengths 10 and 5")

In [126]:
x.^2

10×5 Matrix{Float64}:
 6.75773    0.015731    0.0767582    1.65955     1.00093
 0.0538316  3.58128     0.815611     5.85853     0.0883306
 0.788887   2.33335     0.00285879   0.331446    0.00152463
 0.191249   5.21704     4.22627      0.171546    0.0827277
 0.965387   0.132943    0.809392     2.37874     3.58377
 0.772758   0.00414472  0.000177881  0.00404976  4.88835
 0.607025   2.40655     0.04054      0.637564    0.519543
 0.183133   0.371022    0.0116339    0.0102402   0.0178766
 2.12905    2.47393     0.134531     3.28602     0.118159
 0.414669   3.62391     0.00735741   2.71472     2.94232

In [125]:
psi = mean(x.^2)

1.3888938829818727

In [109]:
Ups0 = (1 /sqrt(n)) * sqrt.((y.^2)'*(x.^2))

1×5 Matrix{Float64}:
 0.687098  0.726379  0.306074  0.79703  0.953105

In [None]:
Ups0 <- 1/sqrt(n) * sqrt(t(t(y^2) %*% (x^2)))

In [None]:
Ups0 <- sqrt(var(y))

In [75]:
repeat(randn(10), inner = p)

50-element Vector{Float64}:
 -0.5426606288066931
 -0.5426606288066931
 -0.5426606288066931
 -0.5426606288066931
 -0.5426606288066931
  1.2813129314021943
  1.2813129314021943
  1.2813129314021943
  1.2813129314021943
  1.2813129314021943
 -1.4541396281032473
 -1.4541396281032473
 -1.4541396281032473
  ⋮
 -0.537725859397369
 -0.537725859397369
  0.6457751734360266
  0.6457751734360266
  0.6457751734360266
  0.6457751734360266
  0.6457751734360266
 -0.40593715325349267
 -0.40593715325349267
 -0.40593715325349267
 -0.40593715325349267
 -0.40593715325349267

In [78]:
g = reshape(repeat(randn(10), inner = p),(5, 10))'

10×5 adjoint(::Matrix{Float64}) with eltype Float64:
  0.705075   0.705075   0.705075   0.705075   0.705075
  1.51667    1.51667    1.51667    1.51667    1.51667
 -0.728552  -0.728552  -0.728552  -0.728552  -0.728552
  0.125279   0.125279   0.125279   0.125279   0.125279
  1.42334    1.42334    1.42334    1.42334    1.42334
 -0.600246  -0.600246  -0.600246  -0.600246  -0.600246
 -0.893547  -0.893547  -0.893547  -0.893547  -0.893547
  0.236745   0.236745   0.236745   0.236745   0.236745
 -0.789944  -0.789944  -0.789944  -0.789944  -0.789944
 -0.147113  -0.147113  -0.147113  -0.147113  -0.147113

In [74]:
matrix(repeat(randn(10), inner = p))

LoadError: UndefVarError: matrix not defined

In [97]:
R = 100
sim = zeros(R,1)
for l in 1:R
    g = reshape(repeat(randn(10), inner = p),(5, 10))'
    sim[l] = maximum(2*abs.(mean.(eachcol(tXtpsi.* g))))
end




In [None]:
lambda0 = c*quantile(vec(sim), 1 - gamma)

In [81]:
g = reshape(repeat(randn(10), inner = p),(5, 10))'

10×5 adjoint(::Matrix{Float64}) with eltype Float64:
 -0.651544  -0.651544  -0.651544  -0.651544  -0.651544
 -0.449609  -0.449609  -0.449609  -0.449609  -0.449609
 -1.22734   -1.22734   -1.22734   -1.22734   -1.22734
 -0.939588  -0.939588  -0.939588  -0.939588  -0.939588
  0.87259    0.87259    0.87259    0.87259    0.87259
 -0.557327  -0.557327  -0.557327  -0.557327  -0.557327
 -0.232092  -0.232092  -0.232092  -0.232092  -0.232092
  0.523636   0.523636   0.523636   0.523636   0.523636
 -0.377307  -0.377307  -0.377307  -0.377307  -0.377307
  1.73438    1.73438    1.73438    1.73438    1.73438

100-element Vector{Float64}:
 0.9058930636221756
 0.9974339887492555
 0.8915361518976817
 0.4059838910163601
 0.7794933188038025
 0.744080936762073
 0.4901078406889143
 1.604911856999283
 0.5387011545821052
 1.8805858054106046
 1.1352847145790745
 0.6435084321974089
 0.6898252417611304
 ⋮
 0.8961491292420162
 0.842828918284426
 1.2523321965848266
 1.2214504204417402
 0.5665618497153938
 0.7392078708070684
 1.1226014126091135
 1.1197690588942042
 0.8127075867966795
 0.4388331744185597
 0.6802999047321783
 0.9591691065536855

In [104]:
quantile(vec(sim), 0.5)

0.9904178448293439

In [106]:
vector = [6.52276955293844, 6.210569320588091, 4.7864301479581, 9.1187992502725, 8.42000515087866, 6.17048265730705]

6-element Vector{Float64}:
 6.52276955293844
 6.210569320588091
 4.7864301479581
 9.1187992502725
 8.42000515087866
 6.17048265730705

In [108]:
quantile(vector, 1 - gamma)

8.76940220057558

In [72]:
n * max(2 * abs(colMeans(tXtpsi * g)))

10×5 adjoint(::Matrix{Float64}) with eltype Float64:
  2.2058     0.106425    0.235087    1.0931     -0.848923
 -0.196872   1.60577     0.766314    2.05381     0.252186
 -0.753655   1.29615     0.0453687   0.488508    0.033132
  0.371078  -1.93811    -1.74439    -0.351444    0.244057
  0.833712   0.309384    0.763387   -1.3087     -1.60633
  0.745911   0.0546277   0.011317    0.0539983   1.87606
 -0.661103  -1.31632    -0.170847    0.677528    0.611613
 -0.363119   0.516851    0.0915226   0.0858657  -0.113451
 -1.23811   -1.33463     0.311227   -1.53816     0.291675
  0.546407  -1.6153      0.0727827   1.39807     1.45549

In [86]:
tXtpsi

10×5 adjoint(::Matrix{Float64}) with eltype Float64:
  2.2058     0.106425    0.235087    1.0931     -0.848923
 -0.196872   1.60577     0.766314    2.05381     0.252186
 -0.753655   1.29615     0.0453687   0.488508    0.033132
  0.371078  -1.93811    -1.74439    -0.351444    0.244057
  0.833712   0.309384    0.763387   -1.3087     -1.60633
  0.745911   0.0546277   0.011317    0.0539983   1.87606
 -0.661103  -1.31632    -0.170847    0.677528    0.611613
 -0.363119   0.516851    0.0915226   0.0858657  -0.113451
 -1.23811   -1.33463     0.311227   -1.53816     0.291675
  0.546407  -1.6153      0.0727827   1.39807     1.45549

0.40868440531922995

In [93]:
mean.(eachcol(tXtpsi.* g))

5-element Vector{Float64}:
  0.09175619074607241
 -0.20434220265961497
  0.18418159031061226
 -0.01841595885434937
 -0.006452323111749836

In [58]:
Ups0 = sqrt(var(y, corrected = true))

0.5723082480468004

In [45]:
alex = 45
mute = alex

45

In [48]:
if (isnothing(mute))  == 0
    println(alex)
end 

45


In [47]:
isnothing(mute)

false

In [4]:
# We have to make sure that both variables are the same type (Integers or floats) to avoid errors when running the regression
n = 10
p = Int(n/2)

x = randn(n, p)
beta = randn(p)
lmbda = randn(p)
y = randn(n)
gamma = 0.1
c = 1.1


1.1

In [9]:
psi = mean.(eachcol(x.^2))



tXtpsi = (x' ./ sqrt.(psi))'

10×5 adjoint(::Matrix{Float64}) with eltype Float64:
 -0.276339    0.638309    0.44012    -1.99609    -0.696107
 -0.930588   -0.792754    1.56848     1.2595     -0.37923
  0.559535   -0.345888   -1.43638    -0.323765   -0.636522
  0.286299   -0.313657   -1.47366     0.0882716  -1.68812
  1.16986     0.0302914  -0.349372    1.23167     1.11022
 -2.53313     2.02852    -1.46518    -0.828138   -0.438008
  0.0846915  -0.327124   -0.641858    0.886998    0.800242
  0.814653   -0.0714077   0.302399    0.539086    0.211448
  0.210722    0.221536    0.0479571  -1.01492    -1.73667
 -0.402537    2.11403     0.580321   -0.0798286   0.99555

In [85]:
tXtpsi = (x' ./ sqrt(psi))'

10×5 adjoint(::Matrix{Float64}) with eltype Float64:
  2.2058     0.106425    0.235087    1.0931     -0.848923
 -0.196872   1.60577     0.766314    2.05381     0.252186
 -0.753655   1.29615     0.0453687   0.488508    0.033132
  0.371078  -1.93811    -1.74439    -0.351444    0.244057
  0.833712   0.309384    0.763387   -1.3087     -1.60633
  0.745911   0.0546277   0.011317    0.0539983   1.87606
 -0.661103  -1.31632    -0.170847    0.677528    0.611613
 -0.363119   0.516851    0.0915226   0.0858657  -0.113451
 -1.23811   -1.33463     0.311227   -1.53816     0.291675
  0.546407  -1.6153      0.0727827   1.39807     1.45549

In [54]:
quantile(Normal(0.0, 1.0),1 - gamma/(2*p))

2.326347874040846

In [55]:
# Initilaize lambda
lmbda0 = 2 * c * sqrt(n) * quantile(Normal(0.0, 1.0),1 - gamma/(2*p))

16.184427406091057

In [32]:
sqrt((y.^2.0)'*(x.^2.0))

LoadError: MethodError: no method matching sqrt(::Adjoint{Float64, Vector{Float64}})
[0mClosest candidates are:
[0m  sqrt([91m::Union{Float32, Float64}[39m) at C:\Users\Alexander\AppData\Local\Programs\Julia-1.7.2\share\julia\base\math.jl:566
[0m  sqrt([91m::StridedMatrix{T}[39m) where T<:Union{Real, Complex} at C:\Users\Alexander\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\dense.jl:836
[0m  sqrt([91m::Diagonal[39m) at C:\Users\Alexander\AppData\Local\Programs\Julia-1.7.2\share\julia\stdlib\v1.7\LinearAlgebra\src\diagonal.jl:592
[0m  ...

In [None]:
Ups0 = (1 / np.sqrt(n)) * np.sqrt((y**2).T @ (x**2)).T

In [None]:
lambda0 <- 2 * penalty$c * sqrt(n) * qnorm(1 - penalty$gamma/(2 *p))

In [None]:
intercetp::Bool=true

In [None]:
LassoShooting_fit(X, Y, lmbda)

In [None]:
init_values(X, Y)[4]

In [None]:
reg = lm(X, Y)

In [None]:
homoskedastic=False, X_dependent_lambda=False,
                      lambda_start=None, c=1.1, gamma=0.1, numSim=5000, y=None,
                      x=None, par=True, corecap=np.inf, fix_seed=True

In [None]:
init_values(X, Y)

In [None]:
X

In [1]:
import Pkg; Pkg.add("RData")
import Pkg; Pkg.add("CodecBzip2")
import Pkg; Pkg.add("DataStructures")
import Pkg; Pkg.add("NamedArrays")
import Pkg; Pkg.add("PrettyTables")
import Pkg; Pkg.add("Lasso")

[32m[1m    Updating[22m[39m registry at `C:\Users\Alexander\.julia\registries\General.toml`
│   exception = Downloads.RequestError("https://pkg.julialang.org/registries", 35, "schannel: failed to receive handshake, SSL/TLS connection failed", Downloads.Response("https", "https://sa.pkg.julialang.org/registries", 301, "HTTP/1.1 301 SA internal redirect trigger", ["connection" => "close", "content-length" => "0", "server" => "Varnish", "retry-after" => "0", "location" => "https://sa.pkg.julialang.org/registries", "x-geo-continent" => "SA", "x-geo-country" => "PE", "x-geo-region" => "CAL", "accept-ranges" => "bytes", "date" => "Sun, 13 Mar 2022 15:56:16 GMT", "via" => "1.1 varnish", "x-served-by" => "cache-lim12125-LIM", "x-cache" => "HIT", "x-cache-hits" => "0", "x-timer" => "S1647186976.287239,VS0,VE0"]))
└ @ Pkg.Registry C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.7\Pkg\src\Registry\Registry.jl:82
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  N

LoadError: Unable to automatically install 'Bzip2' from 'C:\Users\Alexander\.julia\packages\Bzip2_jll\iOonP\Artifacts.toml'

In [2]:
using Pkg

Pkg.PlatformEngines.probe_platform_engines!()

Pkg.PlatformEngines.download("https://github.com/JuliaBinaryWrappers/MKL_jll.jl/releases/download/MKL-v2020.0.166%2B0/MKL.v2020.0.166.x86_64-apple-darwin14.tar.gz", "MKL_jll.tar.gz"; verbose=true)

LoadError: HTTP/1.1 302 Found (Send failure: Connection was reset) while requesting https://github.com/JuliaBinaryWrappers/MKL_jll.jl/releases/download/MKL-v2020.0.166%2B0/MKL.v2020.0.166.x86_64-apple-darwin14.tar.gz

In [3]:
curl --version

LoadError: syntax: invalid operator "--"

In [5]:
using RData, LinearAlgebra, GLM, DataFrames, Statistics, Random, Distributions, DataStructures, NamedArrays, PrettyTables

In [13]:
# Importing .Rdata file
growth_read = load("../../data/GrowthData.RData")

# Since growth_read is a dictionary, we check if there is a key called "GrowthData", the one we need for our analyze
haskey(growth_read, "GrowthData")
# Now we save that dataframe with a new name
growth = growth_read["GrowthData"]
names(growth)

Y = growth[!, "Outcome"]
Y_2 = DataFrame([Y], [:Y])
X_2 = select(growth, Not(["Outcome"]))

Dict{String, Any} with 1 entry:
  "GrowthData" => [1m90×63 DataFrame[0m…

Unnamed: 0_level_0,intercept,gdpsh465,bmp1l,freeop,freetar,h65,hm65,hf65,p65
Unnamed: 0_level_1,Int32,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1,6.59167,0.2837,0.153491,0.043888,0.007,0.013,0.001,0.29
2,1,6.82979,0.6141,0.313509,0.061827,0.019,0.032,0.007,0.91
3,1,8.89508,0.0,0.204244,0.009186,0.26,0.325,0.201,1.0
4,1,7.56528,0.1997,0.248714,0.03627,0.061,0.07,0.051,1.0
5,1,7.1624,0.174,0.299252,0.037367,0.017,0.027,0.007,0.82
6,1,7.21891,0.0,0.258865,0.02088,0.023,0.038,0.006,0.5
7,1,7.8536,0.0,0.182525,0.014385,0.039,0.063,0.014,0.92
8,1,7.70391,0.2776,0.215275,0.029713,0.024,0.035,0.013,0.69
9,1,9.06346,0.0,0.109614,0.002171,0.402,0.488,0.314,1.0
10,1,8.15191,0.1484,0.110885,0.028579,0.145,0.173,0.114,1.0


In [28]:
X_2 = convert(Matrix, Matrix(X_2[:, 2:5]))
Y_2 = convert(Matrix, Matrix(Y_2))
lmbda = randn(size(X_2)[2])

4-element Vector{Float64}:
  0.2453949358186934
 -0.08642608748746254
 -0.6178751708887436
 -0.23957066264789037

In [29]:
lm(X_2, Y_2)

LoadError: MethodError: no method matching fit(::Type{LinearModel}, ::Matrix{Float64}, ::Matrix{Float64}, ::Nothing)
[0mClosest candidates are:
[0m  fit(::Type{LinearModel}, ::AbstractMatrix{<:Real}, [91m::AbstractVector{<:Real}[39m, ::Union{Nothing, Bool}; wts, dropcollinear) at C:\Users\Alexander\.julia\packages\GLM\gt3bb\src\lm.jl:161
[0m  fit([91m::Type{StatsBase.Histogram}[39m, ::Any...; kwargs...) at C:\Users\Alexander\.julia\packages\StatsBase\pJqvO\src\hist.jl:383
[0m  fit(::Type{T}, [91m::FormulaTerm[39m, ::Any, ::Any...; contrasts, kwargs...) where T<:RegressionModel at C:\Users\Alexander\.julia\packages\StatsModels\57Kc9\src\statsmodel.jl:78
[0m  ...

In [30]:
X_2

90×4 Matrix{Float64}:
 6.59167  0.2837  0.153491  0.043888
 6.82979  0.6141  0.313509  0.061827
 8.89508  0.0     0.204244  0.009186
 7.56528  0.1997  0.248714  0.03627
 7.1624   0.174   0.299252  0.037367
 7.21891  0.0     0.258865  0.02088
 7.8536   0.0     0.182525  0.014385
 7.70391  0.2776  0.215275  0.029713
 9.06346  0.0     0.109614  0.002171
 8.15191  0.1484  0.110885  0.028579
 6.92952  0.0296  0.165784  0.020115
 7.23778  0.2151  0.078488  0.011581
 8.11582  0.4318  0.137482  0.026547
 ⋮                          
 7.89469  0.1062  0.247626  0.037392
 7.17549  0.0     0.179933  0.046376
 9.03097  0.0     0.293138  0.005517
 8.99554  0.0     0.30472   0.011658
 8.23483  0.0363  0.288405  0.011589
 8.33255  0.0     0.345485  0.006503
 8.64559  0.0     0.28844   0.005995
 8.99106  0.0     0.371898  0.014586
 8.02519  0.005   0.296437  0.013615
 9.03014  0.0     0.265778  0.008629
 8.86531  0.0     0.282939  0.005048
 8.91234  0.0     0.150366  0.024377

In [172]:
# # Get number of observations n and number of variables p
# n, p = size(X)

# # Get number of simulations to use (if simulations are necessary)
# R = numSim

# # Go through all possible combinations of homoskedasticy/heteroskedasticity
# # and X-dependent or independent error terms. The first two cases are
# # special cases: Handling the case there homoskedastic was set to None, and
# # where lambda_start was provided.
# #

# # 1) If homoskedastic was set to None (special case)
# if (isnothing(homoskedastic))
    
#     # Initialize lambda
#         lmbda0 = lambda_start
    
#     Ups0 = (1 /sqrt(n)) * sqrt.((y.^2)'*(x.^2))
            
#     # Calculate the final vector of penalty terms
#         lmbda = lmbda0 * Ups0
    
# # 2) If lambda_start was provided (special case)
# elseif (isnothing(lambda_start)) == 0
    
#     # Check whether a homogeneous penalty term was provided (a scalar)
#     if maximum(size(lambda_start)) == 1
#         # If so, repeat that p times as the penalty term
#         lmbda = ones(p,1).*lambda_start
    
#     else:
#         # Otherwise, use the provided vector of penalty terms as is
#         lmbda = lambda_start
#     end
        
# # 3) Homoskedastic and X-independent
# elseif homoskedastic == true &  X_dependent_lambda == false
    
#     # Initilaize lambda
#     lmbda0 = 2 * c * sqrt(n) * quantile(Normal(0.0, 1.0),1 - gamma/(2*p))
    
#     # Use ddof=1(corrected = true in Julia) to be consistent with R's var() function (in Julia by defaul the DDF is N-1)
#     Ups0 = sqrt(var(y, corrected = true))
    
#     # Calculate the final vector of penalty terms
#     lmbda = zeros(p,1) .+ lmbda0 * Ups0

# # 4) Homoskedastic and X-dependent
# elseif homoskedastic == true & X_dependent_lambda == true
#     psi = mean.(eachcol(x.^2))
#     tXtpsi = (x' ./ sqrt.(psi))'
    
#     R = 5000
#     sim = zeros(R,1)
    
#     for l in 1:R
#         g = reshape(repeat(randn(10), inner = p),(p, n))'
#         sim[l] = n * maximum(2*abs.(mean.(eachcol(tXtpsi.* g))))
#     end
    
#     # Initialize lambda based on the simulated quantiles
#     lambda0 = c*quantile(vec(sim), 1 - gamma)
    
#     Ups0 = sqrt(var(y, corrected = true))
    
#     # Calculate the final vector of penalty terms
#     lmbda = zeros(p,1) .+ lmbda0 * Ups0
 
# # 5) Heteroskedastic and X-independent
# elseif homoskedastic == false &  X_dependent_lambda == false
    
#     # The original includes the comment, "1=num endogenous variables"
#     lmbda0 = 2 * c * sqrt(n) * quantile(Normal(0.0, 1.0),1 - gamma/(2*p*1))
    
#     Ups0 = (1 /sqrt(n)) * sqrt.((y.^2)'*(x.^2))
#     lmbda = lmbda0 * Ups0

# # 6) Heteroskedastic and X-dependent
# elseif homoskedastic == false &  X_dependent_lambda == true
    
#     eh = y
#     ehat = reshape(repeat(eh, inner = p),(p, n))'
    
#     xehat = x.*ehat
#     psi = mean.(eachcol(xehat.^2))'
#     tXehattpsi = (xehat./sqrt.(psi))
    
#     R = 5000
#     sim = zeros(R,1)
    
#     for l in 1:R
#         g = reshape(repeat(randn(10), inner = p),(p, n))'
#         sim[l] = n * maximum(2*abs.(mean.(eachcol(tXtpsi.* g))))
#     end
    
#     # Initialize lambda based on the simulated quantiles
#     lambda0 = c*quantile(vec(sim), 1 - gamma)
    
#     Ups0 = (1 /sqrt(n)) * sqrt.((y.^2)'*(x.^2))
    
#     lmbda = lmbda0 * Ups0

# return Dict("lambda0" => lmbda0, "lambda" => lmbda, "Ups0" => Ups0) 
    
# end

LoadError: UndefVarError: numSim not defined