In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray, tools, cumath
import numpy as np
import time
from tqdm import tqdm
import math

In [2]:
# Read the CUDA source that sits next to the notebook and compile it with
# PyCUDA's SourceModule so its kernels can be looked up by name.
with open('./FISTA_kernel.cu') as kernel_file:
    FISTA_kernel = kernel_file.read()

FISTA_module = SourceModule(FISTA_kernel)

In [3]:
# Fetch handles to the two device functions that FISTA_D launches.
Dgemv, soft_thresh_D = (
    FISTA_module.get_function(kernel_name)
    for kernel_name in ("Dgemv", "soft_thresh_D")
)

In [4]:
def FISTA_D(beta, X, y, lam, L, eta, tol = 1e-08, max_iter = 5000):
    """Run FISTA with backtracking on the GPU in double precision.

    Every outer iteration takes a gradient step of size ``1 / L_cur`` from
    the extrapolated point, passes the result through the ``soft_thresh_D``
    kernel with threshold ``lam / L_cur``, and multiplies the step constant
    by successive powers of ``eta`` until

        ||y - X b||^2 - ||y - X b_p||^2
            <= L_cur * ||b - b_p||^2 - 2 * (b - b_p) . X^T (y - X b_p)

    holds.  The extrapolated point is then updated with the usual
    ``t_{k+1} = (1 + sqrt(1 + 4 t_k^2)) / 2`` momentum weight.

    NOTE(review): ``Dgemv`` and ``soft_thresh_D`` are compiled from
    ``FISTA_kernel.cu``, which is not visible here.  The comments below
    assume ``Dgemv(a, A, v, out, n, p, flag)`` accumulates ``a * op(A) @ v``
    into ``out`` (``flag`` selecting the transpose) and that ``soft_thresh_D``
    writes the soft-thresholded input into its output buffer - confirm
    against the kernel source.

    Parameters
    ----------
    beta : array_like of length p, or None
        Starting point.  ``None`` starts from zeros.
    X : array_like, shape (n, p)
        Design matrix; copied to a C-contiguous float64 array if needed.
    y : array_like, length n
        Response; copied to a contiguous float64 array if needed.
    lam : float
        Penalty weight used for the threshold ``lam / L_cur``.
    L : float
        Initial value of the backtracking step constant.
    eta : float
        Growth factor applied to the step constant while backtracking.
    tol : float, optional
        Stop once the Euclidean norm of the change in ``beta`` between two
        consecutive outer iterations drops below this value.
    max_iter : int, optional
        Maximum number of outer iterations (at least 1).

    Returns
    -------
    tuple
        ``(beta_hat, crit, k)``: the final coefficients copied back to the
        host, the length-``max_iter`` array of per-iteration change norms
        (entries after the stopping iteration stay zero), and the zero-based
        index of the last iteration executed.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1 or ``beta`` does not have length p.
    """
    if max_iter < 1:
        # Without a single pass through the loop there is no iterate (and no
        # ``k``) to return; fail with a clear message instead.
        raise ValueError("max_iter must be at least 1")

    # Every device buffer below is float64, so make sure the host data is
    # float64 and contiguous before it is uploaded.
    X = np.ascontiguousarray(X, dtype=np.float64)
    y = np.ascontiguousarray(y, dtype=np.float64)

    n = np.int32(X.shape[0])
    p = np.int32(X.shape[1])

    # Honour the caller's starting point; a zero vector reproduces the
    # previous behaviour exactly.
    if beta is None:
        beta_init = np.zeros(p, dtype=np.float64)
    else:
        beta_init = np.array(beta, dtype=np.float64).reshape(-1)
        if beta_init.shape[0] != p:
            raise ValueError("beta must have one entry per column of X")

    t = np.float64(1.0)
    One = np.float64(1.0)
    MOne = np.float64(-1.0)
    # Flag values handed to Dgemv as its last argument; their meaning is
    # defined by the kernel source - presumably 1 selects X^T, TODO confirm.
    IntOne = np.int64(1)
    IntZero = np.int64(0)

    crit = np.zeros(max_iter, dtype=np.float64)
    d_beta_p = gpuarray.to_gpu(beta_init)      # extrapolated point
    d_beta_prev = gpuarray.to_gpu(beta_init)   # previous iterate
    d_beta = gpuarray.to_gpu(beta_init)        # current iterate (kernel output)
    d_X = gpuarray.to_gpu(X)
    d_y = gpuarray.to_gpu(y)

    # One-dimensional launch configuration: 32 threads per block and enough
    # blocks to cover p (resp. n) elements.
    TPB = (32, 1, 1)
    bpg_p = math.ceil(np.float64(p)/TPB[0])
    bpg_n = math.ceil(np.float64(n)/TPB[0])
    BPG_p = (bpg_p, 1, 1)
    BPG_n = (bpg_n, 1, 1)

    L_prev = L
    for k in range(max_iter):
        # Residual at the extrapolated point and its squared norm.
        d_ymXbp = d_y.copy()
        Dgemv(MOne, d_X, d_beta_p,
              d_ymXbp, n, p, IntZero,
              grid = BPG_n, block = TPB)
        # Downloaded once here; it does not change during backtracking.
        rss_p = gpuarray.dot(d_ymXbp, d_ymXbp).get()
        # X^T applied to that residual, accumulated into a zeroed buffer.
        d_XTrbp = gpuarray.zeros(p, np.float64)
        Dgemv(One, d_X, d_ymXbp, d_XTrbp, n, p,
              IntOne, grid = BPG_p, block = TPB)

        # Backtracking: try L_prev, L_prev * eta, L_prev * eta^2, ...
        i_k = -1
        cond = True
        while cond:
            i_k += 1
            eta_ik = eta ** i_k
            L_cur = L_prev * eta_ik
            alpha = np.float64(1.0/L_cur)
            d_bstar = d_beta_p + alpha*d_XTrbp
            alpha = np.float64(lam/L_cur)
            soft_thresh_D(d_bstar, alpha, d_beta, p,
                         grid = BPG_p, block = TPB)
            d_diff_beta = d_beta - d_beta_p
            h_RHS_1st = gpuarray.dot(d_diff_beta, d_diff_beta)
            h_RHS_2nd = gpuarray.dot(d_diff_beta, d_XTrbp)

            RHS = L_cur * h_RHS_1st.get() - 2.0 * h_RHS_2nd.get()

            # Residual at the candidate iterate.
            d_ymXb = d_y.copy()
            Dgemv(MOne, d_X, d_beta, d_ymXb, n, p, IntZero,
                   grid = BPG_n, block = TPB)
            LHS = gpuarray.dot(d_ymXb, d_ymXb).get() - rss_p
            cond = (LHS > RHS)

        L_prev = L_cur
        tnext = np.float64( (1.0 + np.sqrt(1 + 4* (t**2))) / 2.0 )
        d_diff_beta = d_beta - d_beta_prev
        d_beta_p = d_beta + np.float64((t - 1.0) / tnext) * d_diff_beta
        crit[k] = np.sqrt(gpuarray.dot(d_diff_beta, d_diff_beta).get())

        if crit[k] < tol:
            break

        t = tnext
        d_beta_prev = d_beta.copy()

    return d_beta.get(), crit, k

In [5]:
# Simulated sparse regression problem: n observations, p predictors.
n, p = 100, 200

import numpy as np
np.random.seed(2022)  # fixed seed so the design matrix and noise are repeatable

# Standard normal design matrix (drawn first, so the random stream order is
# the same as before).
X = np.random.randn(n, p)

# True coefficient vector: the leading 5% of the entries are 1, the rest 0.
tr_beta = np.zeros(p, dtype=np.float64)
tr_beta[:int(0.05*p)] = 1.0

# Response = linear signal + standard normal noise.
y = X.dot(tr_beta) + np.random.randn(n)

In [6]:
# Inputs for the solver, all in double precision.
beta = np.zeros(p, dtype=np.float64)            # starting coefficients
lam = np.float64(np.sqrt(2 * np.log(p) / n))    # penalty weight
L = np.float64(10.0)                            # initial step constant
eta = np.float64(2.0)                           # backtracking growth factor
tol = np.float64(1e-04)                         # stopping tolerance

In [7]:
# Fit on the simulated data, then show the first ten fitted coefficients.
out = FISTA_D(beta, X, y, lam, L, eta, tol=tol, max_iter=5000)
out[0][:10]

array([0.64070707, 1.11958681, 0.82775298, 1.11552658, 0.69531896,
       0.82641614, 0.73400349, 1.05100404, 0.91740151, 1.10698033])

In [8]:
# Simulated sparse regression problem: n observations, p predictors.
n, p = 100, 200

import time
import numpy as np
from ctypes import *

np.random.seed(2022)  # fixed seed so the design matrix and noise are repeatable

# Standard normal design matrix (drawn first, so the random stream order is
# the same as before).
X = np.random.randn(n, p)

# True coefficient vector: the leading 5% of the entries are 1, the rest 0.
tr_beta = np.zeros(p, dtype=np.float64)
tr_beta[:int(0.05*p)] = 1.0

# Response = linear signal + standard normal noise.
y = X.dot(tr_beta) + np.random.randn(n)


import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray, tools, cumath
import math

# Read the CUDA source that sits next to the notebook and compile it with
# PyCUDA's SourceModule so its kernels can be looked up by name.
with open('./FISTA_kernel.cu') as kernel_file:
    FISTA_kernel = kernel_file.read()

FISTA_module = SourceModule(FISTA_kernel)

# Fetch handles to the two device functions that FISTA_D launches.
Dgemv, soft_thresh_D = (
    FISTA_module.get_function(kernel_name)
    for kernel_name in ("Dgemv", "soft_thresh_D")
)

def FISTA_D(beta, X, y, lam, L, eta, tol = 1e-08, max_iter = 5000):
    """Run FISTA with backtracking on the GPU in double precision.

    Every outer iteration takes a gradient step of size ``1 / L_cur`` from
    the extrapolated point, passes the result through the ``soft_thresh_D``
    kernel with threshold ``lam / L_cur``, and multiplies the step constant
    by successive powers of ``eta`` until

        ||y - X b||^2 - ||y - X b_p||^2
            <= L_cur * ||b - b_p||^2 - 2 * (b - b_p) . X^T (y - X b_p)

    holds.  The extrapolated point is then updated with the usual
    ``t_{k+1} = (1 + sqrt(1 + 4 t_k^2)) / 2`` momentum weight.

    NOTE(review): ``Dgemv`` and ``soft_thresh_D`` are compiled from
    ``FISTA_kernel.cu``, which is not visible here.  The comments below
    assume ``Dgemv(a, A, v, out, n, p, flag)`` accumulates ``a * op(A) @ v``
    into ``out`` (``flag`` selecting the transpose) and that ``soft_thresh_D``
    writes the soft-thresholded input into its output buffer - confirm
    against the kernel source.

    Parameters
    ----------
    beta : array_like of length p, or None
        Starting point.  ``None`` starts from zeros.
    X : array_like, shape (n, p)
        Design matrix; copied to a C-contiguous float64 array if needed.
    y : array_like, length n
        Response; copied to a contiguous float64 array if needed.
    lam : float
        Penalty weight used for the threshold ``lam / L_cur``.
    L : float
        Initial value of the backtracking step constant.
    eta : float
        Growth factor applied to the step constant while backtracking.
    tol : float, optional
        Stop once the Euclidean norm of the change in ``beta`` between two
        consecutive outer iterations drops below this value.
    max_iter : int, optional
        Maximum number of outer iterations (at least 1).

    Returns
    -------
    tuple
        ``(beta_hat, crit, k)``: the final coefficients copied back to the
        host, the length-``max_iter`` array of per-iteration change norms
        (entries after the stopping iteration stay zero), and the zero-based
        index of the last iteration executed.

    Raises
    ------
    ValueError
        If ``max_iter`` is smaller than 1 or ``beta`` does not have length p.
    """
    if max_iter < 1:
        # Without a single pass through the loop there is no iterate (and no
        # ``k``) to return; fail with a clear message instead.
        raise ValueError("max_iter must be at least 1")

    # Every device buffer below is float64, so make sure the host data is
    # float64 and contiguous before it is uploaded.
    X = np.ascontiguousarray(X, dtype=np.float64)
    y = np.ascontiguousarray(y, dtype=np.float64)

    n = np.int32(X.shape[0])
    p = np.int32(X.shape[1])

    # Honour the caller's starting point; a zero vector reproduces the
    # previous behaviour exactly.
    if beta is None:
        beta_init = np.zeros(p, dtype=np.float64)
    else:
        beta_init = np.array(beta, dtype=np.float64).reshape(-1)
        if beta_init.shape[0] != p:
            raise ValueError("beta must have one entry per column of X")

    t = np.float64(1.0)
    One = np.float64(1.0)
    MOne = np.float64(-1.0)
    # Flag values handed to Dgemv as its last argument; their meaning is
    # defined by the kernel source - presumably 1 selects X^T, TODO confirm.
    IntOne = np.int64(1)
    IntZero = np.int64(0)

    crit = np.zeros(max_iter, dtype=np.float64)
    d_beta_p = gpuarray.to_gpu(beta_init)      # extrapolated point
    d_beta_prev = gpuarray.to_gpu(beta_init)   # previous iterate
    d_beta = gpuarray.to_gpu(beta_init)        # current iterate (kernel output)
    d_X = gpuarray.to_gpu(X)
    d_y = gpuarray.to_gpu(y)

    # One-dimensional launch configuration: 32 threads per block and enough
    # blocks to cover p (resp. n) elements.
    TPB = (32, 1, 1)
    bpg_p = math.ceil(np.float64(p)/TPB[0])
    bpg_n = math.ceil(np.float64(n)/TPB[0])
    BPG_p = (bpg_p, 1, 1)
    BPG_n = (bpg_n, 1, 1)

    L_prev = L
    for k in range(max_iter):
        # Residual at the extrapolated point and its squared norm.
        d_ymXbp = d_y.copy()
        Dgemv(MOne, d_X, d_beta_p,
              d_ymXbp, n, p, IntZero,
              grid = BPG_n, block = TPB)
        # Downloaded once here; it does not change during backtracking.
        rss_p = gpuarray.dot(d_ymXbp, d_ymXbp).get()
        # X^T applied to that residual, accumulated into a zeroed buffer.
        d_XTrbp = gpuarray.zeros(p, np.float64)
        Dgemv(One, d_X, d_ymXbp, d_XTrbp, n, p,
              IntOne, grid = BPG_p, block = TPB)

        # Backtracking: try L_prev, L_prev * eta, L_prev * eta^2, ...
        i_k = -1
        cond = True
        while cond:
            i_k += 1
            eta_ik = eta ** i_k
            L_cur = L_prev * eta_ik
            alpha = np.float64(1.0/L_cur)
            d_bstar = d_beta_p + alpha*d_XTrbp
            alpha = np.float64(lam/L_cur)
            soft_thresh_D(d_bstar, alpha, d_beta, p,
                         grid = BPG_p, block = TPB)
            d_diff_beta = d_beta - d_beta_p
            h_RHS_1st = gpuarray.dot(d_diff_beta, d_diff_beta)
            h_RHS_2nd = gpuarray.dot(d_diff_beta, d_XTrbp)

            RHS = L_cur * h_RHS_1st.get() - 2.0 * h_RHS_2nd.get()

            # Residual at the candidate iterate.
            d_ymXb = d_y.copy()
            Dgemv(MOne, d_X, d_beta, d_ymXb, n, p, IntZero,
                   grid = BPG_n, block = TPB)
            LHS = gpuarray.dot(d_ymXb, d_ymXb).get() - rss_p
            cond = (LHS > RHS)

        L_prev = L_cur
        tnext = np.float64( (1.0 + np.sqrt(1 + 4* (t**2))) / 2.0 )
        d_diff_beta = d_beta - d_beta_prev
        d_beta_p = d_beta + np.float64((t - 1.0) / tnext) * d_diff_beta
        crit[k] = np.sqrt(gpuarray.dot(d_diff_beta, d_diff_beta).get())

        if crit[k] < tol:
            break

        t = tnext
        d_beta_prev = d_beta.copy()

    return d_beta.get(), crit, k


# Time repeated solves of the same problem; one wall-clock measurement per run.
niter = 11
comp_time_pycuda = np.zeros(niter)

# The solver settings are identical for every repetition, so build them once.
lam = np.sqrt(2*np.log(p)/n).astype(np.float64)
L = np.float64(10)
eta = np.float64(2.0)

for i in range(niter):

    # Fresh zero starting point for every repetition.
    beta = np.zeros(p, dtype = np.float64)

    # perf_counter is monotonic and high-resolution, which makes it the
    # appropriate clock for measuring short intervals.
    t1 = time.perf_counter()
    out, cr, k = FISTA_D(beta, X, y, lam, L, eta, tol = 1e-4, max_iter = 5000)
    t2 = time.perf_counter()
    comp_time_pycuda[i] = t2-t1

    print('\n ',i+1,'-th iteration, Collapsed Time: ', comp_time_pycuda[i])
    



  1 -th iteration, Collapsed Time:  0.8120038509368896

  2 -th iteration, Collapsed Time:  0.744074821472168

  3 -th iteration, Collapsed Time:  0.7436754703521729

  4 -th iteration, Collapsed Time:  0.7423455715179443

  5 -th iteration, Collapsed Time:  0.7437632083892822

  6 -th iteration, Collapsed Time:  0.743818998336792

  7 -th iteration, Collapsed Time:  0.7405283451080322

  8 -th iteration, Collapsed Time:  0.7430922985076904

  9 -th iteration, Collapsed Time:  0.7442173957824707

  10 -th iteration, Collapsed Time:  0.7441787719726562

  11 -th iteration, Collapsed Time:  0.7445058822631836


In [9]:
# Zero-based index of the last outer iteration executed by the final
# FISTA_D call in the benchmark loop (third element of its return value).
k

873