In [26]:
# Julia version code
"""
nonnegative linear regression 
"""
# Discussion from Dec 22
# 1. Currently, our runtime is O(n/sqrt(eps)*(m+n)). The per iteration O(m)
# is unavoidable; however, IF we are NOT required to output the optimizer xtilde_ktotal, 
# then by maintaining 1^T x in each iteration (at a cost of O(1)), we can completely
# avoid O(n) per iteration. 

using LinearAlgebra, BenchmarkTools, Plots, Convex, SCS, NonNegLeastSquares, MLDatasets


In [213]:
function alg_ours_with_restart(C::Matrix{Float64}, b::Matrix{Float64}, ϵ::Float64 )
    extra_term_nnls = 0.5*norm(b)^2
    m, n = size(C)
    #number of times we restart
    K = 10 #ceil(log2(1/ϵ)); We use numbers to test out restart based on iteration count rather than a metric. 
    col_norm = norm.(eachcol(C))
    inv_col_norm_square = 1.0 ./(col_norm.^2)
    idx_seq = 1:n
    
    x0 = zeros(n)
    y0 = zeros(m)
    z0 = zeros(m)
    
    
    gamma_scaling = 1
    
    for i=1:K
        gamma_scaling+=1 
        gamma = 50/gamma_scaling 
        xktilde, yktilde, zktilde = alg_our_core(C, x0, y0, z0, m, n, inv_col_norm_square, idx_seq, ϵ, gamma)
        x0[:] = xktilde[:]
        y0[:] = yktilde[:]
        z0[:] = zktilde[:]
        
    end
    return -sum(x0) + 0.5* norm(z0)^2 +extra_term_nnls
end

function alg_our_core(C, x0, y0, z0, m, n, inv_col_norm_square, idx_seq, ϵ, gamma)
        # reset all the scaling factors 
        previous_A = 1.0/n
        previous_a = previous_A #a_1, A_1
        a = 1.0/(n*n) # a_2
        A = (n+1.0)/(n * n) # A_2
        
        # compute x1 using the input x0 
        # we redefined phio(x) = 1/2 * ||x-x0||_A^2, hence updating x requires x0 
        # the step p(j)+=1/||A:j||^2 implicitly assumes ybar_0 = 0 
        # To allow for ybar_0 \neq 0, we change p(j) a bit 
       
        ybar = copy(y0)
        j = rand(idx_seq)
        Aty0m = 1 - dot(ybar, C[:, j]) #                                     dot(̄ȳ, C[:, j])
    
        p = copy(x0) 
        x = copy(x0)
        p[j] += inv_col_norm_square[j]*Aty0m 
        x[j] = min(inv_col_norm_square[j], max(0, p[j])) #x and x0 differ only at j 
        
        # compute y1
        # note that y0^(R) and y1^(R) are independent of each othre
        # y1^(R) = Ax1^(R) = Ax0^(R) + A*(x1^R - x0^R) = z0^R + A*(x1^R - x0^R)
        # y0^R may be chosen to be either ytildeK or 0 (our analysis uses 0)
        # Further note that if y0^R = ytildeK, then we must ALSO choose ybar_0 = ytildeK, and 
        # this changes how x is init. 
        previous_y = copy(y0) 
        z = copy(z0) 
        z += C[:, j] * (x[j] - x0[j]) # z_1 = A x_1 = A (x_0 + (x_1 - x_0))
        y = copy(z) # y_1 = A xtilde1 = A x_1 = z_1 
    
        # compute ȳ, ỹ (because we need to return it), and some auxiliary variables 
        ybar[:] = y[:] + previous_a/a * (y[:] - previous_y[:]) #ybar_1 
        s = zeros(n) # need this so that xtildek = xk + sk/Ak; s_1 = 0 (see Chaobing's lemma for why this is needed)
        ỹ = copy(y) # ytildek = convex comb of yi's, so ytilde1 = y1

        # restart value init; -1^{\top}x+0.5\|Ax\|^{2}+.5*\|y\|^{2}+\frac{1}{2\epsilon}\|(-A^{\top}y+1)^{+}\|^{2}
        restart_coeff = 5000
        Atym = -C'*y0 .+ 1 
        truncated_Atym = ((Atym) .> 0).*Atym
        restart_val_prev = -sum(x0)+ 0.5* norm(z0)^2 +0.5*norm(y0)^2 + restart_coeff*norm(truncated_Atym)^2
        restart_val_curr = restart_val_prev
            
        # inits for restart
        iter_count = 0 
        Flag = true
        Ax0 = zeros(m)

        while (Flag)

            # updates related to x
            j = rand(idx_seq)
            p[j] += - n * inv_col_norm_square[j] * a * (sum(C[:,j] .* ybar) - 1)
            prev_xj = x[j]
            x[j] = min(inv_col_norm_square[j], max(0, p[j]))
            # update s so that we may return xtildek at only O(1) cost
            s[j] += ((n-1) * a -  previous_A) * (x[j] - prev_xj)
        
            # updates related to y 
            previous_y[:] = y[:]
            z[:] += C[:, j] * (x[j] - prev_xj)
            y[:] = previous_A/A * y[:] + a/A * z[:] + (n-1) * a/A * (x[j] - prev_xj) * C[:,j]
            # need to update ytilde each time because that's what we want to return, 
            # and we aren't saving all the yi's. 
            ỹ[:] = previous_A/A * ỹ[:] + a/A * y[:]
        
            # update scaling factors 
            previous_a, previous_A = a, A
            a = min(n * a/(n-1), sqrt(A)/(2*n))
            A += a
        
            # update ȳ (note that ȳ_k depends on a_k and a_{k+1})
            ybar[:] = y[:] + previous_a/a * (y[:] - previous_y[:])
        
            # restart stuff 
            iter_count+=1
            # Since we are computing the restart condition without any optimizations, 
            # and the restart condition likely involves (expensive) matrix-vector products, 
            # we check it only after a certain number of iters have passed. 

            if ((restart_val_curr < 0)|| (iter_count% ceil(n*gamma) ==0))
                # compute the restart condition 
                Atym = -C'*ỹ .+ 1 
                truncated_Atym = ((Atym) .> 0).*Atym  
                sumx0 = sum(x + (1.0/previous_A) * s)
                Ax0 = C*(x + (1.0/previous_A) * s)
                restart_val_curr = -sumx0+ 0.5* norm(Ax0)^2 +0.5*norm(ỹ)^2 + restart_coeff*norm(truncated_Atym)^2
                if (restart_val_curr < 0)
                    Flag = false
                end
                # For now, we want to see if restarting works or not without checking the metric 
                # and therefore we force restart by artificially fixing the Flag to false. 
                # Once we can get restart to work, we'll put the Flag = false inside the 
                # if condition of restart_metric. 
                if (restart_val_curr <= 0.5*restart_val_prev)
                   # print("curr = ", restart_val_curr, ", prev = ", restart_val_prev, "obj = ", -sumx0+ 0.5* norm(Ax0)^2,"\n")
                    Flag = false
                end
            end
            
        end
        return x + (1.0/previous_A) * s, ỹ, Ax0
end

alg_our_core (generic function with 3 methods)

In [210]:
function alg_ours_without_restart(C::Matrix{Float64}, b::Matrix{Float64}, ϵ::Float64)
    
    extra_term_nnls = 0.5*norm(b)^2
    m, n = size(C)
    K = ceil(n / √ϵ)
    previous_A = 1.0/n
    previous_a = previous_A
    a = 1.0/(n*n)
    A = (n+1.0) /(n * n)
    col_norm = norm.(eachcol(C))
    inv_col_norm_square = 1.0 ./(col_norm.^2)
    idx_seq = 1:n
    x = zeros(n)
    p = zeros(n)
    j = rand(idx_seq)
    p[j] += inv_col_norm_square[j]
    x[j] = p[j]
    # x̃ = deepcopy(x)
    previous_y = zeros(m)
    y = x[j] * C[:, j]
    # record Ax
    z = x[j] * C[:, j]
    ȳ = (n+1) * y
    s = zeros(n)
    #func_value = 0
    func_value=zeros(Int(ceil(K/n)))
    
    for k = 2:K
        j = rand(idx_seq)
        p[j] += - n * inv_col_norm_square[j] * a * (sum(C[:,j] .* ȳ) - 1)
        prev_xj = x[j]
        x[j] = min(inv_col_norm_square[j], max(0, p[j]))
        # record Ax
        z[:] += C[:, j] * (x[j] - prev_xj)
        previous_y[:] = y[:]
        y[:] = previous_A/A * y[:] + a/A * z[:] + (n-1) * a/A * (x[j] - prev_xj) * C[:,j]
        s[j] += ((n-1) * a -  previous_A) * (x[j] - prev_xj)
        previous_a, previous_A = a, A
        a = min(n * a/(n-1), sqrt(A)/(2*n))
        ȳ[:] = y[:] + previous_a/a * (y[:] - previous_y[:])
        A += a

        if k % n == 0
            x̃ = x + 1.0/previous_A * s
            C_x̃ = C * x̃
            func_value[Int(k/n)] = 0.5 * sum(C_x̃ .* C_x̃) - sum(x̃)+extra_term_nnls
            #@info "pass: $(k/n), func_value: $func_value"
        end
    end
    return(func_value)
end



alg_ours_without_restart (generic function with 1 method)

In [211]:
# https://github.com/ahwillia/NonNegLeastSquares.jl
function alg_lawsonhanson(A, b)
    xnnls = nonneg_lsq(A,b;alg=:nnls)  # NNLS

    nnls_optval = 0.5*norm(A*xnnls - b)^2
    
    print("\n nnls package value is ", nnls_optval, ", and time is ")
end

alg_lawsonhanson (generic function with 1 method)

In [216]:
# Main code
function remove_col1(A,b)#Chenghui has an idea to optimize this for speed ("filter")
    s=A'*b # n*1 
    B=A[:,vec(s.>0)] # m*b matrix where b is smaller than n
    s=s[vec(s.>0)] # s is b*1 in dimensions
    return B./s'
end

epsilon = 0.0001 

############# 
n = 50000 # variable dimension 
m = 10 # Number of data points
b=rand(m,1)-repeat([0.3],m,1)
A_init =  max.(0, randn(m, n)) #rand(m,n)#
############

############ Mnist： uncomment 
#train_x, train_y = MNIST.traindata()
#A = Array{Int64}
#b = Array{Int64}
#A_init = reshape(train_x,60000,28*28)
#b = train_y
#test_x,  test_y  = MNIST.testdata()
############


 A = remove_col1(A_init,b)
############ Mnist: uncomment this
# A = Float64.(A)
# b = Float64.(b)
# b1 = vcat(b')
############

(m,n) = size(A) # Redefine the size number n and m to prevent triviality.
@time begin
our_result = alg_ours_without_restart(A, b, epsilon)
    print("our result without restart is ", our_result[end], ", and time is ")
#our_result = alg_ours(A, b1, epsilon)
end

@time begin
our_result = alg_ours_with_restart(A, b, epsilon)
print("our result is ", our_result, ", and time is, ")
#our_result = alg_ours(A, b1, epsilon)
end

@time begin 
alg_lawsonhanson(A, b)
end


our result without restart is 0.004256918459444536, and time is  16.048316 seconds (99.30 M allocations: 13.394 GiB, 17.83% gc time)
our result is 0.0042568381496070895, and time is,  67.634191 seconds (619.44 M allocations: 83.587 GiB, 15.04% gc time)

 nnls package value is 0.004256811891241667, and time is   0.423062 seconds (56 allocations: 5.305 MiB)


In [127]:
K = 9
2^K

512