In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray, tools, cumath
import numpy as np
import time
from tqdm import tqdm
import math

In [2]:
# Read the CUDA source text from a file next to the notebook, then hand it to
# PyCUDA's SourceModule, which compiles it; the result is kept in FISTA_module.
with open('./FISTA_kernel.cu') as f:
    FISTA_kernel = f.read()
    
FISTA_module = SourceModule(FISTA_kernel)

In [3]:
# Look up the two kernels by name in the compiled module so they can be
# launched from Python. Their implementation lives in FISTA_kernel.cu, which
# is not shown in this notebook.
Sgemv = FISTA_module.get_function("Sgemv")
soft_thresh_S = FISTA_module.get_function("soft_thresh_S")

In [3]:
def FISTA_S(beta, X, y, lam, L, eta, tol = 1e-08, max_iter = 5000):
    """Single-precision FISTA with backtracking, driven from Python via PyCUDA.

    The gradient step, threshold and backtracking test below match the lasso
    objective 0.5 * ||y - X b||^2 + lam * ||b||_1, provided the external
    kernels behave as assumed in the inline comments (their source,
    FISTA_kernel.cu, is not visible here).

    Parameters
    ----------
    beta : array-like of length p, or None
        Starting point of the iteration. None starts from the zero vector.
    X : array-like, shape (n, p)
        Design matrix. Converted to a C-contiguous float32 array before upload.
    y : array-like of length n
        Response vector. Converted to float32 before upload.
    lam : float
        Weight of the l1 penalty; the threshold of each step is lam / L_cur.
    L : float
        Initial step-size constant for the backtracking search.
    eta : float
        Factor by which the step-size constant is multiplied while the
        backtracking condition fails. The search only makes progress when
        eta > 1.
    tol : float
        The iteration stops once the Euclidean norm of the change between
        consecutive iterates drops below this value.
    max_iter : int
        Upper bound on the number of outer iterations (at least 1).

    Returns
    -------
    tuple
        (beta_hat, crit, k): the final iterate as a NumPy array, a float32
        array of length max_iter whose first k + 1 entries hold the
        per-iteration change norms (the remainder stays zero), and the
        zero-based index k of the last iteration performed.

    Raises
    ------
    ValueError
        If max_iter < 1, X is not 2-D, or beta / y do not match X's shape.
    """
    if max_iter < 1:
        # Without at least one pass there is no iterate (and no k) to return.
        raise ValueError("max_iter must be at least 1")

    # Every scalar handed to the kernels is float32/int32, so the buffers must
    # be float32 as well. C order is the layout the notebook's own inputs use;
    # assumes the kernel indexes X row-major -- TODO confirm in FISTA_kernel.cu.
    X = np.ascontiguousarray(X, dtype=np.float32)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array")
    y = np.ascontiguousarray(y, dtype=np.float32).ravel()
    if y.size != X.shape[0]:
        raise ValueError("y must have one entry per row of X")

    n = np.int32(X.shape[0])
    p = np.int32(X.shape[1])
    t = np.float32(1.0)
    One = np.float32(1.0)
    MOne = np.float32(-1.0)
    IntOne = np.int32(1)
    IntZero = np.int32(0)

    # Honour the caller's starting point instead of silently using zeros.
    if beta is None:
        beta0 = np.zeros(p, dtype=np.float32)
    else:
        beta0 = np.array(beta, dtype=np.float32).ravel()
        if beta0.size != p:
            raise ValueError("beta must have one entry per column of X")

    crit = np.zeros(max_iter, dtype=np.float32)
    d_beta_p = gpuarray.to_gpu(beta0)      # extrapolated point
    d_beta_prev = gpuarray.to_gpu(beta0)   # previous iterate
    d_X = gpuarray.to_gpu(X)
    d_y = gpuarray.to_gpu(y)
    d_beta = gpuarray.empty(p, dtype = np.float32)  # output buffer of soft_thresh_S

    # One-dimensional launch: 32 threads per block, enough blocks to cover
    # p (resp. n) elements. Integer ceiling division avoids float rounding.
    TPB = (32, 1, 1)
    BPG_p = ((int(p) + TPB[0] - 1) // TPB[0], 1, 1)
    BPG_n = ((int(n) + TPB[0] - 1) // TPB[0], 1, 1)

    L_prev = L
    for k in range(max_iter):
        # Residual at the extrapolated point. Sgemv is assumed to accumulate
        # alpha * X @ v (flag 0) or alpha * X.T @ v (flag 1) into its output
        # vector -- TODO confirm against FISTA_kernel.cu.
        d_ymXbp = d_y.copy()
        Sgemv(MOne, d_X, d_beta_p,
              d_ymXbp, n, p, IntZero,
              grid = BPG_n, block = TPB)
        # Fetched once per outer iteration; it does not change while backtracking.
        rbp_sq = gpuarray.dot(d_ymXbp, d_ymXbp).get()
        d_XTrbp = gpuarray.zeros(p, np.float32)
        Sgemv(One, d_X, d_ymXbp, d_XTrbp, n, p,
              IntOne, grid = BPG_p, block = TPB)

        # Backtracking: try L_prev * eta**i for i = 0, 1, ... until the
        # decrease condition LHS <= RHS holds.
        i_k = -1
        cond = True
        while cond:
            i_k += 1
            eta_ik = eta ** i_k
            L_cur = L_prev * eta_ik
            alpha = np.float32(1.0/L_cur)
            d_bstar = d_beta_p + alpha*d_XTrbp
            alpha = np.float32(lam/L_cur)
            # soft_thresh_S is assumed to write the thresholded d_bstar into
            # d_beta -- TODO confirm against FISTA_kernel.cu.
            soft_thresh_S(d_bstar, alpha, d_beta, p,
                          grid = BPG_p, block = TPB)
            d_diff_beta = d_beta - d_beta_p
            h_RHS_1st = gpuarray.dot(d_diff_beta,
                                     d_diff_beta)
            h_RHS_2nd = gpuarray.dot(d_diff_beta,
                                     d_XTrbp)

            RHS1 = L_cur * h_RHS_1st.get()
            RHS2 = np.float32(2.0)*h_RHS_2nd.get()
            RHS = RHS1 - RHS2

            # Residual at the candidate iterate.
            d_ymXb = d_y.copy()
            Sgemv(MOne, d_X, d_beta, d_ymXb, n, p,
                  IntZero, grid = BPG_n, block = TPB)
            d_ymXb2 = gpuarray.dot(d_ymXb, d_ymXb)
            LHS = d_ymXb2.get() - rbp_sq
            cond = (LHS > RHS)

        # Keep the accepted constant as the starting value of the next search.
        L_prev = L_cur
        tnext = np.float32((1.0+np.sqrt(1+4*t*t))/2)
        d_diff_beta = d_beta - d_beta_prev
        alpha = np.float32((t - 1.0)/tnext)
        # Momentum step: extrapolate along the last change of the iterate.
        d_beta_p = d_beta + alpha * d_diff_beta
        d_diff_b_sq = gpuarray.dot(d_diff_beta,
                                   d_diff_beta)
        crit[k] = np.sqrt(d_diff_b_sq.get())

        if crit[k] < tol:
            break

        t = tnext
        # d_beta is overwritten in place by the next soft_thresh_S launch,
        # so the previous iterate needs its own copy.
        d_beta_prev = d_beta.copy()

    return d_beta.get(), crit, k

In [13]:
import numpy as np

# Problem size: n observations, p predictors.
n, p = 100, 200

# Fixed seed so the simulated design matrix and noise are reproducible.
np.random.seed(2022)
X = np.random.randn(n, p).astype(np.float32)

# Sparse ground truth: the leading 5% of the coefficients are one, the rest zero.
tr_beta = np.zeros(p, dtype=np.float32)
tr_beta[:int(0.05 * p)] = 1.0

# Response = linear signal plus standard-normal noise, all in float32.
y = X.dot(tr_beta) + np.random.randn(n).astype(np.float32)

In [14]:
# Solver inputs, all held as float32 to match the single-precision solver.
# NOTE(review): `tol` is defined here, but the visible FISTA_S call passes its
# own literal tolerance rather than this variable.
beta = np.zeros(p, dtype=np.float32)
lam = np.float32(np.sqrt(2 * np.log(p) / n))
L, eta = np.float32(10), np.float32(2)
tol = np.float32(1e-04)

In [15]:
# Run the GPU solver. FISTA_S returns a 3-tuple: the coefficient vector, the
# per-iteration criterion array, and the index of the last iteration.
# The cell then displays the first ten coefficients.
out = FISTA_S(beta, X, y, lam, L, eta, tol = 1e-04, max_iter = 5000)
out[0][:10]

array([0.64001155, 1.0940939 , 0.8156298 , 1.1276115 , 0.68376046,
       0.8235541 , 0.74824077, 1.0680112 , 0.90822124, 1.1011746 ],
      dtype=float32)

In [4]:
import time
import numpy as np
# NOTE(review): nothing from this wildcard import appears to be used in the
# visible cells; kept as-is.
from ctypes import *

# Problem size: n observations, p predictors.
n, p = 100, 200

# Fixed seed so the simulated design matrix and noise are reproducible.
np.random.seed(2022)
X = np.random.randn(n, p).astype(np.float32)

# Sparse ground truth: the leading 5% of the coefficients are one, the rest zero.
tr_beta = np.zeros(p, dtype=np.float32)
tr_beta[:int(0.05 * p)] = 1.0

# Response = linear signal plus standard-normal noise, all in float32.
y = X.dot(tr_beta) + np.random.randn(n).astype(np.float32)


import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray, tools, cumath
import math

# Read the CUDA source text from a file next to the notebook, then hand it to
# PyCUDA's SourceModule, which compiles it; the result is kept in FISTA_module.
with open('./FISTA_kernel.cu') as f:
    FISTA_kernel = f.read()
    
FISTA_module = SourceModule(FISTA_kernel)

# Look up the two kernels by name in the compiled module so they can be
# launched from Python. Their implementation is not shown in this notebook.
Sgemv = FISTA_module.get_function("Sgemv")
soft_thresh_S = FISTA_module.get_function("soft_thresh_S")

def FISTA_S(beta, X, y, lam, L, eta, tol = 1e-08, max_iter = 5000):
    """Single-precision FISTA with backtracking, driven from Python via PyCUDA.

    The gradient step, threshold and backtracking test below match the lasso
    objective 0.5 * ||y - X b||^2 + lam * ||b||_1, provided the external
    kernels behave as assumed in the inline comments (their source,
    FISTA_kernel.cu, is not visible here).

    Parameters
    ----------
    beta : array-like of length p, or None
        Starting point of the iteration. None starts from the zero vector.
    X : array-like, shape (n, p)
        Design matrix. Converted to a C-contiguous float32 array before upload.
    y : array-like of length n
        Response vector. Converted to float32 before upload.
    lam : float
        Weight of the l1 penalty; the threshold of each step is lam / L_cur.
    L : float
        Initial step-size constant for the backtracking search.
    eta : float
        Factor by which the step-size constant is multiplied while the
        backtracking condition fails. The search only makes progress when
        eta > 1.
    tol : float
        The iteration stops once the Euclidean norm of the change between
        consecutive iterates drops below this value.
    max_iter : int
        Upper bound on the number of outer iterations (at least 1).

    Returns
    -------
    tuple
        (beta_hat, crit, k): the final iterate as a NumPy array, a float32
        array of length max_iter whose first k + 1 entries hold the
        per-iteration change norms (the remainder stays zero), and the
        zero-based index k of the last iteration performed.

    Raises
    ------
    ValueError
        If max_iter < 1, X is not 2-D, or beta / y do not match X's shape.
    """
    if max_iter < 1:
        # Without at least one pass there is no iterate (and no k) to return.
        raise ValueError("max_iter must be at least 1")

    # Every scalar handed to the kernels is float32/int32, so the buffers must
    # be float32 as well. C order is the layout the notebook's own inputs use;
    # assumes the kernel indexes X row-major -- TODO confirm in FISTA_kernel.cu.
    X = np.ascontiguousarray(X, dtype=np.float32)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array")
    y = np.ascontiguousarray(y, dtype=np.float32).ravel()
    if y.size != X.shape[0]:
        raise ValueError("y must have one entry per row of X")

    n = np.int32(X.shape[0])
    p = np.int32(X.shape[1])
    t = np.float32(1.0)
    One = np.float32(1.0)
    MOne = np.float32(-1.0)
    IntOne = np.int32(1)
    IntZero = np.int32(0)

    # Honour the caller's starting point instead of silently using zeros.
    if beta is None:
        beta0 = np.zeros(p, dtype=np.float32)
    else:
        beta0 = np.array(beta, dtype=np.float32).ravel()
        if beta0.size != p:
            raise ValueError("beta must have one entry per column of X")

    crit = np.zeros(max_iter, dtype=np.float32)
    d_beta_p = gpuarray.to_gpu(beta0)      # extrapolated point
    d_beta_prev = gpuarray.to_gpu(beta0)   # previous iterate
    d_X = gpuarray.to_gpu(X)
    d_y = gpuarray.to_gpu(y)
    d_beta = gpuarray.empty(p, dtype = np.float32)  # output buffer of soft_thresh_S

    # One-dimensional launch: 32 threads per block, enough blocks to cover
    # p (resp. n) elements. Integer ceiling division avoids float rounding.
    TPB = (32, 1, 1)
    BPG_p = ((int(p) + TPB[0] - 1) // TPB[0], 1, 1)
    BPG_n = ((int(n) + TPB[0] - 1) // TPB[0], 1, 1)

    L_prev = L
    for k in range(max_iter):
        # Residual at the extrapolated point. Sgemv is assumed to accumulate
        # alpha * X @ v (flag 0) or alpha * X.T @ v (flag 1) into its output
        # vector -- TODO confirm against FISTA_kernel.cu.
        d_ymXbp = d_y.copy()
        Sgemv(MOne, d_X, d_beta_p,
              d_ymXbp, n, p, IntZero,
              grid = BPG_n, block = TPB)
        # Fetched once per outer iteration; it does not change while backtracking.
        rbp_sq = gpuarray.dot(d_ymXbp, d_ymXbp).get()
        d_XTrbp = gpuarray.zeros(p, np.float32)
        Sgemv(One, d_X, d_ymXbp, d_XTrbp, n, p,
              IntOne, grid = BPG_p, block = TPB)

        # Backtracking: try L_prev * eta**i for i = 0, 1, ... until the
        # decrease condition LHS <= RHS holds.
        i_k = -1
        cond = True
        while cond:
            i_k += 1
            eta_ik = eta ** i_k
            L_cur = L_prev * eta_ik
            alpha = np.float32(1.0/L_cur)
            d_bstar = d_beta_p + alpha*d_XTrbp
            alpha = np.float32(lam/L_cur)
            # soft_thresh_S is assumed to write the thresholded d_bstar into
            # d_beta -- TODO confirm against FISTA_kernel.cu.
            soft_thresh_S(d_bstar, alpha, d_beta, p,
                          grid = BPG_p, block = TPB)
            d_diff_beta = d_beta - d_beta_p
            h_RHS_1st = gpuarray.dot(d_diff_beta,
                                     d_diff_beta)
            h_RHS_2nd = gpuarray.dot(d_diff_beta,
                                     d_XTrbp)

            RHS1 = L_cur * h_RHS_1st.get()
            RHS2 = np.float32(2.0)*h_RHS_2nd.get()
            RHS = RHS1 - RHS2

            # Residual at the candidate iterate.
            d_ymXb = d_y.copy()
            Sgemv(MOne, d_X, d_beta, d_ymXb, n, p,
                  IntZero, grid = BPG_n, block = TPB)
            d_ymXb2 = gpuarray.dot(d_ymXb, d_ymXb)
            LHS = d_ymXb2.get() - rbp_sq
            cond = (LHS > RHS)

        # Keep the accepted constant as the starting value of the next search.
        L_prev = L_cur
        tnext = np.float32((1.0+np.sqrt(1+4*t*t))/2)
        d_diff_beta = d_beta - d_beta_prev
        alpha = np.float32((t - 1.0)/tnext)
        # Momentum step: extrapolate along the last change of the iterate.
        d_beta_p = d_beta + alpha * d_diff_beta
        d_diff_b_sq = gpuarray.dot(d_diff_beta,
                                   d_diff_beta)
        crit[k] = np.sqrt(d_diff_b_sq.get())

        if crit[k] < tol:
            break

        t = tnext
        # d_beta is overwritten in place by the next soft_thresh_S launch,
        # so the previous iterate needs its own copy.
        d_beta_prev = d_beta.copy()

    return d_beta.get(), crit, k

# Time repeated solves of the same problem; one wall-time sample per repetition.
niter = 11
comp_time_pycuda = np.zeros(niter)

# The solver inputs are identical for every repetition, so build them once,
# outside the loop.
beta = np.zeros(p, dtype = np.float32)
lam = np.sqrt(2*np.log(p)/n).astype(np.float32)
L = np.float32(10)
eta = np.float32(2)

for i in range(niter):

    # perf_counter is a monotonic, high-resolution clock intended for interval
    # timing; time.time() can jump if the system clock is adjusted.
    t1 = time.perf_counter()
    out = FISTA_S(beta, X, y, lam, L, eta, tol = 1e-4, max_iter = 5000)
    t2 = time.perf_counter()
    comp_time_pycuda[i] = t2-t1

    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_pycuda[i])
    



  1 -th iteration, Collapsed Time:  0.8366124629974365

  2 -th iteration, Collapsed Time:  0.7454531192779541

  3 -th iteration, Collapsed Time:  0.7443373203277588

  4 -th iteration, Collapsed Time:  0.739147424697876

  5 -th iteration, Collapsed Time:  0.7435777187347412

  6 -th iteration, Collapsed Time:  0.7384746074676514

  7 -th iteration, Collapsed Time:  0.7419803142547607

  8 -th iteration, Collapsed Time:  0.7370586395263672

  9 -th iteration, Collapsed Time:  0.7389974594116211

  10 -th iteration, Collapsed Time:  0.7383205890655518

  11 -th iteration, Collapsed Time:  0.7397584915161133
