In [36]:
# Import relevant packages for splitting data
using LinearAlgebra, GLM, DataFrames, Statistics, Random, Distributions, Tables, TableOperations, StatsBase, FreqTables, DataFrames

In [37]:
# Define a function which turn a list or vector-like object into a proper two
# dimensional column vector

function cvec(a)
    """ Turn a list or vector-like object into a proper column vector
    Input
    a: List or vector-like object, has to be a potential input for np.array()
    Output
    vec: two dimensional NumPy array, with the first dimension weakly greater
         than the second (resulting in a column vector for a vector-like input)
    """
    
    # Conver input into a two dimensional NumPy array
    vec = cat([a], dims = 2) 

    # Check whether the second dimension is strictly greater than the first
    # (remembering Python's zero indexing)
    
    if size(vec)[1] < size(vec)[2]
        # If so, transpose the input vector
        vec = transpose(vec)
    end
   
    # Return the column vector
    return vec

end
    

cvec (generic function with 1 method)

In [39]:
import Statistics.cor
function corre(y, X)
    
    """ Return correlation coefficients between columns of matrices
    Inputs
    y: n by 1 NumPy array
    X: n by k NumPy array
    Outputs
    corr: list of length k, where the k-th element is the correlation
          coefficient between y and the k-th column of X
    """
    # Concatenate y and X into a single NumPy array
    yX = hcat(y, X)
    
    # Get the correlation coefficients between all columns of that array
    corr = cor(yX)
    
    # Get the first row, starting at the first off-diagonal element (these are
    # the correlation coefficients between y and each column of X
    corr = corr[1, :] 
    
    # Return the result
    return corr

end

corre (generic function with 1 method)

In [41]:
function init_values(X, y, number::Int64=5, intercetp::Bool=true)
    """ Return an initial parameter guess for a LASSO model
    Inputs
    y: n by 1 NumPy array, outcome variable
    X: n by k NumPy array, RHS variables
    Outputs
    residuals: n ny 1 NumPy array, residuals for initial parameter guess
    coefficients: k by 1 NumPy array, initial coefficient values
    """
    # Make sure y is a proper column vector
    #y = cvec(y)
    
    # Get the absolute value of correlations between y and X
    corr = broadcast(abs, cor(y, X)[1, :])
    
    # Get the number of columns of X
    kx = size(X)[2]
    
    # Make an index selecting the five columns of X which are most correlated
    # with y (since .argsort() always sorts in increasing order, selecting from
    # the back gets the most highly correlated columns)
    index = sortperm(corr, rev=true)[1: min(number, kx)]
    
    # Set up an array of coefficient guesses
    coefficients = zeros(kx)
    
    # Regress y on the five most correlated columns of X, including an intercept
    # if desired
#    reg = lm(X[:, index], y)
    
#     # Replace the guesses for the estimated coefficients (note that .coef_ does
#     # not return the estimated intercept, if one was included in the model)
    
#     coefficients[index] = GLM.coef(reg)
    
#     # Replace any NANs as zeros
#     replace!(coefficients, NaN=>0)
    
#     # Get the regression residuals
#     residuals = y - predict(reg, X[:, index])
    
#     return residuals, reg, index, coefficients, corr
    return index
    
end


init_values (generic function with 3 methods)

In [42]:
# function LassoShooting_fit( x, y, lmbda, maxIter::Int = 1000, optTol::Float64 = 10^(-5), zeroThreshold::Float64 = 10^(-6),
#                             XX = nothing, Xy = nothing, beta_start = nothing)

In [43]:
# function LassoShooting_fit( x, y, lmbda, control::control, 
#                             XX = nothing, Xy = nothing, beta_start = nothing)

function LassoShooting_fit( x, y, lmbda, maxIter::Int = 1000, optTol::Float64 = 10^(-5), zeroThreshold::Float64 = 10^(-6),
                            XX = nothing, Xy = nothing, beta_start = nothing)
        
     """ Shooting LASSO algorithm with variable dependent penalty weights
    Inputs
    x: n by p NumPy array, RHS variables
    y: n by 1 NumPy array, outcome variable
    lmbda: p by 1 NumPy array, variable dependent penalty terms. The j-th
           element is the penalty term for the j-th RHS variable.
    maxIter: integer, maximum number of shooting LASSO updated
    optTol: scalar, algorithm terminated once the sum of absolute differences
            between the updated and current weights is below optTol
    zeroThreshold: scalar, if any final weights are below zeroThreshold, they
                   will be set to zero instead
    XX: k by k NumPy array, pre-calculated version of x'x
    Xy: k by 1 NumPy array, pre-calculated version of x'y
    beta_start: k by 1 NumPy array, initial weights
    Outputs
    w: k by 1 NumPy array, final weights
    wp: k by m + 1 NumPy array, where m is the number of iterations the
        algorithm took. History of weight updates, starting with the initial
        weights.
    m: integer, number of iterations the algorithm took
    """
    n = size(x)[1]
    p = size(x)[2]
    
    # Check whether XX and Xy were provided, calculate them if not
    if (isnothing(XX))
        XX = x'*x
    end

    if (isnothing(Xy))
        Xy = x'*y
    end

    # Check whether an initial value for the intercept was provided

    if (isnothing(beta_start))
        # If not, use init_values from help_functions, which will return
        # regression estimates for the five variables in x which are most
        # correlated with y, and initialize all other coefficients as zero
        beta = init_values(x, y)[4]

    else
        # Otherwise, use the provided initial weights
        beta = beta_start
    end

    # Set up a history of weights over time, starting with the initial ones
    wp = beta

    # Keep track of the number of iterations
    m = 1

    # Create versions of XX and Xy which are just those matrices times two
    XX2 = XX * 2
    Xy2 = Xy * 2

    #@unpack maxIter, optTol, zeroThreshold = control()

    # Go through all iteration
    while m<maxIter

        # Save the last set of weights (the .copy() is important, otherwise
        # beta_old will be updated every time beta is changed during the
        # following loop)
        beta_old = copy(beta)

        # Go through all parameters
        for j in 1:p
            
            # Calculate the shoot
            S0 = sum( XX2[j, :].*beta ) - XX2[j, j].*beta[j] - Xy2[j]

            # Update the weights
            if sum(isnothing(XX)) >= 1
                beta[j] = 0

            elseif S0 >lmbda[j]
                beta[j] = (lmbda[j] - S0) / XX2[j,j]

            elseif S0 < -lmbda[j]
                beta[j] = (-lmbda[j] - S0) / XX2[j,j]

            elseif broadcast(abs, S0) <= lmbda[j]
                beta[j] = 0

            end
        end

        # Add the updated weights to the history of weights
        wp = hcat(wp, beta)

        # Check whether the weights are within tolerance
        if sum(broadcast(abs, beta - beta_old)) < optTol
            # If so, break the while loop
            break
        end

        # Increase the iteration counter
        m = m + 1
    end

    # Set the final weights to the last updated weights
    w = beta   

    # Set weights which are within zeroThreshold to zero
    w[broadcast(abs, w) .< zeroThreshold] .= 0
    
    #return beta,  w
    return Dict("coefficients" => w, "coef_list" => wp, "num_it" => m)
    return w, wp, m
    #return XX2, Xy2
    

end
        

LassoShooting_fit (generic function with 7 methods)

In [56]:
# We have to make sure that both variables are the same type (Integers or floats) to avoid errors when running the regression
n = 10
p = Int(n/2)

X = randn(n, p)
beta = randn(p)
lmbda = randn(p)
Y = randn(n)

10-element Vector{Float64}:
  0.9390236880834457
  1.062875086192527
  0.5817862916124238
 -0.09281756897873912
  0.2546410204259804
  0.9994262690418787
  0.5914104665434333
  0.9514550733700607
 -0.3828570823089012
  0.3774992183051544

In [57]:
lm(X, Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
─────────────────────────────────────────────────────────────────
         Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
─────────────────────────────────────────────────────────────────
x1  -0.443655     0.259419  -1.71    0.1479  -1.11051    0.223204
x2  -0.0201311    0.225097  -0.09    0.9322  -0.598762   0.5585
x3   0.0879272    0.207484   0.42    0.6893  -0.445427   0.621282
x4   0.121388     0.303989   0.40    0.7061  -0.660041   0.902818
x5   0.112992     0.298344   0.38    0.7204  -0.653925   0.879908
─────────────────────────────────────────────────────────────────


In [59]:
init_values(X, Y)
#LassoShooting_fit(X, Y, lmbda)

5-element Vector{Int64}:
 3
 1
 2
 4
 5

In [60]:
using RData, LinearAlgebra, GLM, DataFrames, Statistics, Random, Distributions, DataStructures, NamedArrays, PrettyTables

In [61]:
# Importing .Rdata file
growth_read = load("../../data/GrowthData.RData")

# Since growth_read is a dictionary, we check if there is a key called "GrowthData", the one we need for our analyze
haskey(growth_read, "GrowthData")
# Now we save that dataframe with a new name
growth = growth_read["GrowthData"]
names(growth)

Y = growth[!, "Outcome"]
Y_2 = DataFrame([Y], [:Y])
X_2 = select(growth, Not(["Outcome"]))

Unnamed: 0_level_0,intercept,gdpsh465,bmp1l,freeop,freetar,h65,hm65,hf65,p65
Unnamed: 0_level_1,Int32,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1,6.59167,0.2837,0.153491,0.043888,0.007,0.013,0.001,0.29
2,1,6.82979,0.6141,0.313509,0.061827,0.019,0.032,0.007,0.91
3,1,8.89508,0.0,0.204244,0.009186,0.26,0.325,0.201,1.0
4,1,7.56528,0.1997,0.248714,0.03627,0.061,0.07,0.051,1.0
5,1,7.1624,0.174,0.299252,0.037367,0.017,0.027,0.007,0.82
6,1,7.21891,0.0,0.258865,0.02088,0.023,0.038,0.006,0.5
7,1,7.8536,0.0,0.182525,0.014385,0.039,0.063,0.014,0.92
8,1,7.70391,0.2776,0.215275,0.029713,0.024,0.035,0.013,0.69
9,1,9.06346,0.0,0.109614,0.002171,0.402,0.488,0.314,1.0
10,1,8.15191,0.1484,0.110885,0.028579,0.145,0.173,0.114,1.0


In [62]:
X_2 = convert(Matrix, Matrix(X_2[:, 2:5]))
Y_2 = convert(Matrix, Matrix(Y_2))
lmbda = randn(size(X_2)[2])

4-element Vector{Float64}:
 -1.3201593173151316
  0.48202075083438606
 -0.28271178764718186
 -0.7896844104717727

In [63]:
lm(X_2, Y_2)

LoadError: MethodError: no method matching fit(::Type{LinearModel}, ::Matrix{Float64}, ::Matrix{Float64}, ::Nothing)
[0mClosest candidates are:
[0m  fit(::Type{LinearModel}, ::AbstractMatrix{<:Real}, [91m::AbstractVector{<:Real}[39m, ::Union{Nothing, Bool}; wts, dropcollinear) at C:\Users\Alexander\.julia\packages\GLM\gt3bb\src\lm.jl:161
[0m  fit([91m::Type{Histogram}[39m, ::Any...; kwargs...) at C:\Users\Alexander\.julia\packages\StatsBase\pJqvO\src\hist.jl:383
[0m  fit(::Type{T}, [91m::FormulaTerm[39m, ::Any, ::Any...; contrasts, kwargs...) where T<:RegressionModel at C:\Users\Alexander\.julia\packages\StatsModels\57Kc9\src\statsmodel.jl:78
[0m  ...

In [64]:
# using Pkg
# Pkg.add("CSV")
# Pkg.add("DataFrames")
# Pkg.add("Dates")
# Pkg.add("Plots")
using CSV
using DataFrames
using Dates
#using Plots

In [68]:
#Reading the CSV file into a DataFrame
#We have to set the category type for some variable
data = CSV.File("../../data/wage2015_subsample_inference.csv"; types = Dict("occ" => String,"occ2"=> String,"ind"=>String,"ind2"=>String)) |> DataFrame
println("Number of Rows : ", size(data)[1],"\n","Number of Columns : ", size(data)[2],) #rows
[eltype(col) for col = eachcol(data)]
n = size(data)[1]
z = select(data, Not([:rownames, :lwage, :wage]))
p = size(z)[2] 
y = data[!, names(data, "lwage")]
z

Number of Rows : 5150
Number of Columns : 21


Unnamed: 0_level_0,sex,shs,hsg,scl,clg,ad,mw,so,we
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
10,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [69]:
X = convert(Matrix, Matrix(z[:, 1:5]))
Y = convert(Matrix, Matrix(y))
lmbda = randn(size(X)[2])

5-element Vector{Float64}:
  0.08874093642803604
 -0.006235975855888859
 -0.15433130990008584
  0.165982906183319
 -1.964787191448247

In [70]:
reg = lm(X, Y)

LoadError: MethodError: no method matching fit(::Type{LinearModel}, ::Matrix{Float64}, ::Matrix{Float64}, ::Nothing)
[0mClosest candidates are:
[0m  fit(::Type{LinearModel}, ::AbstractMatrix{<:Real}, [91m::AbstractVector{<:Real}[39m, ::Union{Nothing, Bool}; wts, dropcollinear) at C:\Users\Alexander\.julia\packages\GLM\gt3bb\src\lm.jl:161
[0m  fit([91m::Type{Histogram}[39m, ::Any...; kwargs...) at C:\Users\Alexander\.julia\packages\StatsBase\pJqvO\src\hist.jl:383
[0m  fit(::Type{T}, [91m::FormulaTerm[39m, ::Any, ::Any...; contrasts, kwargs...) where T<:RegressionModel at C:\Users\Alexander\.julia\packages\StatsModels\57Kc9\src\statsmodel.jl:78
[0m  ...

In [None]:
LassoShooting_fit(X, Y, lmbda)

In [None]:
init_values(X, Y)[4]

In [None]:
reg = lm(X, Y)

In [None]:
LassoShooting_fit(X, Y, lmbda)

In [None]:
init_values(X, Y)

In [None]:
X

In [1]:
import Pkg; Pkg.add("RData")
import Pkg; Pkg.add("CodecBzip2")
import Pkg; Pkg.add("DataStructures")
import Pkg; Pkg.add("NamedArrays")
import Pkg; Pkg.add("PrettyTables")
import Pkg; Pkg.add("Lasso")

[32m[1m    Updating[22m[39m registry at `C:\Users\Alexander\.julia\registries\General.toml`
│   exception = Downloads.RequestError("https://pkg.julialang.org/registries", 35, "schannel: failed to receive handshake, SSL/TLS connection failed", Downloads.Response("https", "https://sa.pkg.julialang.org/registries", 301, "HTTP/1.1 301 SA internal redirect trigger", ["connection" => "close", "content-length" => "0", "server" => "Varnish", "retry-after" => "0", "location" => "https://sa.pkg.julialang.org/registries", "x-geo-continent" => "SA", "x-geo-country" => "PE", "x-geo-region" => "CAL", "accept-ranges" => "bytes", "date" => "Sun, 13 Mar 2022 15:56:16 GMT", "via" => "1.1 varnish", "x-served-by" => "cache-lim12125-LIM", "x-cache" => "HIT", "x-cache-hits" => "0", "x-timer" => "S1647186976.287239,VS0,VE0"]))
└ @ Pkg.Registry C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.7\Pkg\src\Registry\Registry.jl:82
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  N

LoadError: Unable to automatically install 'Bzip2' from 'C:\Users\Alexander\.julia\packages\Bzip2_jll\iOonP\Artifacts.toml'

In [2]:
using Pkg

Pkg.PlatformEngines.probe_platform_engines!()

Pkg.PlatformEngines.download("https://github.com/JuliaBinaryWrappers/MKL_jll.jl/releases/download/MKL-v2020.0.166%2B0/MKL.v2020.0.166.x86_64-apple-darwin14.tar.gz", "MKL_jll.tar.gz"; verbose=true)

LoadError: HTTP/1.1 302 Found (Send failure: Connection was reset) while requesting https://github.com/JuliaBinaryWrappers/MKL_jll.jl/releases/download/MKL-v2020.0.166%2B0/MKL.v2020.0.166.x86_64-apple-darwin14.tar.gz

In [3]:
curl --version

LoadError: syntax: invalid operator "--"

In [5]:
using RData, LinearAlgebra, GLM, DataFrames, Statistics, Random, Distributions, DataStructures, NamedArrays, PrettyTables

In [13]:
# Importing .Rdata file
growth_read = load("../../data/GrowthData.RData")

# Since growth_read is a dictionary, we check if there is a key called "GrowthData", the one we need for our analyze
haskey(growth_read, "GrowthData")
# Now we save that dataframe with a new name
growth = growth_read["GrowthData"]
names(growth)

Y = growth[!, "Outcome"]
Y_2 = DataFrame([Y], [:Y])
X_2 = select(growth, Not(["Outcome"]))

Dict{String, Any} with 1 entry:
  "GrowthData" => [1m90×63 DataFrame[0m…

Unnamed: 0_level_0,intercept,gdpsh465,bmp1l,freeop,freetar,h65,hm65,hf65,p65
Unnamed: 0_level_1,Int32,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1,6.59167,0.2837,0.153491,0.043888,0.007,0.013,0.001,0.29
2,1,6.82979,0.6141,0.313509,0.061827,0.019,0.032,0.007,0.91
3,1,8.89508,0.0,0.204244,0.009186,0.26,0.325,0.201,1.0
4,1,7.56528,0.1997,0.248714,0.03627,0.061,0.07,0.051,1.0
5,1,7.1624,0.174,0.299252,0.037367,0.017,0.027,0.007,0.82
6,1,7.21891,0.0,0.258865,0.02088,0.023,0.038,0.006,0.5
7,1,7.8536,0.0,0.182525,0.014385,0.039,0.063,0.014,0.92
8,1,7.70391,0.2776,0.215275,0.029713,0.024,0.035,0.013,0.69
9,1,9.06346,0.0,0.109614,0.002171,0.402,0.488,0.314,1.0
10,1,8.15191,0.1484,0.110885,0.028579,0.145,0.173,0.114,1.0


In [28]:
X_2 = convert(Matrix, Matrix(X_2[:, 2:5]))
Y_2 = convert(Matrix, Matrix(Y_2))
lmbda = randn(size(X_2)[2])

4-element Vector{Float64}:
  0.2453949358186934
 -0.08642608748746254
 -0.6178751708887436
 -0.23957066264789037

In [29]:
lm(X_2, Y_2)

LoadError: MethodError: no method matching fit(::Type{LinearModel}, ::Matrix{Float64}, ::Matrix{Float64}, ::Nothing)
[0mClosest candidates are:
[0m  fit(::Type{LinearModel}, ::AbstractMatrix{<:Real}, [91m::AbstractVector{<:Real}[39m, ::Union{Nothing, Bool}; wts, dropcollinear) at C:\Users\Alexander\.julia\packages\GLM\gt3bb\src\lm.jl:161
[0m  fit([91m::Type{StatsBase.Histogram}[39m, ::Any...; kwargs...) at C:\Users\Alexander\.julia\packages\StatsBase\pJqvO\src\hist.jl:383
[0m  fit(::Type{T}, [91m::FormulaTerm[39m, ::Any, ::Any...; contrasts, kwargs...) where T<:RegressionModel at C:\Users\Alexander\.julia\packages\StatsModels\57Kc9\src\statsmodel.jl:78
[0m  ...

In [30]:
X_2

90×4 Matrix{Float64}:
 6.59167  0.2837  0.153491  0.043888
 6.82979  0.6141  0.313509  0.061827
 8.89508  0.0     0.204244  0.009186
 7.56528  0.1997  0.248714  0.03627
 7.1624   0.174   0.299252  0.037367
 7.21891  0.0     0.258865  0.02088
 7.8536   0.0     0.182525  0.014385
 7.70391  0.2776  0.215275  0.029713
 9.06346  0.0     0.109614  0.002171
 8.15191  0.1484  0.110885  0.028579
 6.92952  0.0296  0.165784  0.020115
 7.23778  0.2151  0.078488  0.011581
 8.11582  0.4318  0.137482  0.026547
 ⋮                          
 7.89469  0.1062  0.247626  0.037392
 7.17549  0.0     0.179933  0.046376
 9.03097  0.0     0.293138  0.005517
 8.99554  0.0     0.30472   0.011658
 8.23483  0.0363  0.288405  0.011589
 8.33255  0.0     0.345485  0.006503
 8.64559  0.0     0.28844   0.005995
 8.99106  0.0     0.371898  0.014586
 8.02519  0.005   0.296437  0.013615
 9.03014  0.0     0.265778  0.008629
 8.86531  0.0     0.282939  0.005048
 8.91234  0.0     0.150366  0.024377