In [1]:
from numba import cuda, jit, float32
import numpy as np
from timeit import default_timer as timer
import matplotlib.pyplot as plt

# Least squares (lstsq) on the GPU with SGD

최소제곱 문제를 GPU에서 SGD로 풀어 계산을 빠르게 하려는 방법

In [2]:
# Allocate a dense 10000 x 1000 random matrix and report its memory footprint in MB.
A = np.random.rand(10000, 1000)
print(f"bytes: {A.nbytes / 1e6} MB")

bytes: 80.0 MB


$
f(x) = 
\begin{Vmatrix}
Ax - b
\end{Vmatrix}^2
\qquad
\nabla f(x) = 2A^T(Ax-b)
$

## part 1: $Ax - b$

In [3]:
## vector addition
@cuda.jit
def vector_add(x, y, out):
    """CUDA kernel: element-wise sum, ``out[i] = x[i] + y[i]``.

    Each thread processes the single element at its absolute 1-D grid
    position (``cuda.grid(1)``), so the launch must provide at least as many
    threads as there are elements to compute.

    Parameters
    ----------
    x, y : 1-D arrays
        Operands, read only.
    out : 1-D array
        Destination, written in place.

    Only indices valid for all three arrays are processed; if the arrays
    differ in length, the elements beyond the shortest one are left untouched.
    """
    i = cuda.grid(1)

    # Check the index against every array, not just ``x``: device arrays are
    # not bounds-checked, so a shorter ``y`` or ``out`` would otherwise be
    # read or written past its end.
    if i < x.size and i < y.size and i < out.size:
        out[i] = x[i] + y[i]

## vector subtraction
@cuda.jit
def vector_sub(x, y, out):
    """CUDA kernel: element-wise difference, ``out[i] = x[i] - y[i]``.

    Each thread processes the single element at its absolute 1-D grid
    position (``cuda.grid(1)``), so the launch must provide at least as many
    threads as there are elements to compute.

    Parameters
    ----------
    x : 1-D array
        Minuend, read only.
    y : 1-D array
        Subtrahend, read only.
    out : 1-D array
        Destination, written in place.

    Only indices valid for all three arrays are processed; if the arrays
    differ in length, the elements beyond the shortest one are left untouched.
    """
    i = cuda.grid(1)

    # Check the index against every array, not just ``x``: device arrays are
    # not bounds-checked, so a shorter ``y`` or ``out`` would otherwise be
    # read or written past its end.
    if i < x.size and i < y.size and i < out.size:
        out[i] = x[i] - y[i]

In [4]:
n = 10000

## generate data
x = np.random.rand(n)
y = np.random.rand(n)
out = np.zeros(n)

## send to GPU device
x_ = cuda.to_device(x)
y_ = cuda.to_device(y)
out_ = cuda.to_device(out)

## calculate on GPU
vector_add[100,128](x_,y_,out_)

## calculation result check
print(np.allclose(x+y, out_.copy_to_host()))

## time check
cuda.synchronize()
%timeit vector_add[100,128](x_,y_,out_)
cuda.synchronize()

True
1000 loops, best of 5: 547 µs per loop


## part 2: $A^Tx$ and $Ax$

In [5]:
## matrix vector multiplication
@cuda.jit
def matrix_vector_mul(A, x, out):
    """CUDA kernel: dense matrix-vector product, ``out[i] = sum_j A[i, j] * x[j]``.

    One thread computes one output row: the thread at absolute 1-D grid
    position ``i`` (``cuda.grid(1)``) walks row ``i`` of ``A``. The launch
    must therefore provide at least ``A.shape[0]`` threads.

    Parameters
    ----------
    A : 2-D array
        Matrix, read only.
    x : 1-D array
        Vector, read only. Its first ``A.shape[1]`` entries are used; it must
        be at least that long (this is not checked inside the kernel).
    out : 1-D array
        Destination, written in place. Rows beyond ``out.size`` are skipped.
    """
    i = cuda.grid(1)

    # Guard on the output length as well as the row count: device arrays are
    # not bounds-checked, so a short ``out`` would otherwise be written past
    # its end.
    if i < A.shape[0] and i < out.size:
        acc = 0
        for j in range(A.shape[1]):
            acc += A[i,j] * x[j]
        out[i] = acc

In [10]:
## generate data
A = np.random.rand(100,1000)
x = np.random.rand(1000)
out = np.zeros(100)

## send to GPU device
A_ = cuda.to_device(A)
x_ = cuda.to_device(x)
out_ = cuda.to_device(out)

## calculate on GPU
matrix_vector_mul[100,128](A_, x_, out_)

## calculation result check
print(np.allclose(A@x, out_.copy_to_host()))

## time check
cuda.synchronize()
%timeit matrix_vector_mul[100,128](A_, x_, out_)
cuda.synchronize()

True
1000 loops, best of 5: 810 µs per loop


## part 3: SGD

In [7]:
## optimize function
@cuda.jit
def optimize(grad, lr, x):
    """CUDA kernel: in-place gradient-descent step, ``x[i] -= grad[i] * lr * 2``.

    Each thread updates the single element at its absolute 1-D grid position
    (``cuda.grid(1)``), so the launch must provide at least ``x.size`` threads.

    Parameters
    ----------
    grad : 1-D array
        Gradient direction, read only. The factor 2 is applied here, so this
        is presumably ``A^T (Ax - b)`` without the leading 2 of
        ``2 A^T (Ax - b)`` -- confirm against the caller.
    lr : scalar
        Learning rate.
    x : 1-D array
        Parameters, updated in place.

    Only indices valid for both ``x`` and ``grad`` are updated.
    """
    i = cuda.grid(1)

    # Guard on ``grad`` as well as ``x``: device arrays are not
    # bounds-checked, so a shorter ``grad`` would otherwise be read past its end.
    if i < x.size and i < grad.size:
        x[i] -= grad[i] * lr * 2

In [8]:
## generate data
theta = np.random.rand(100)
grad = np.random.rand(100)
lr = 0.1

## send to GPU device
theta_ = cuda.to_device(theta)
grad_ = cuda.to_device(grad)

theta -= grad * lr * 2

## calculate on GPU
optimize[1,128](grad_,lr,theta_)

## calculation result check
print(np.allclose(theta, theta_.copy_to_host()))

## time check
cuda.synchronize()
%timeit optimize[1,128](grad_,lr,theta_)
cuda.synchronize()

True
1000 loops, best of 5: 236 µs per loop


# Test

In [11]:
# Host-side data for the least-squares test: A is 10000 x 1000, b has one
# entry per row of A, and x is a random starting point with one entry per
# column of A.
A = np.random.rand(10000, 1000)
b = np.random.rand(10000)
x = np.random.rand(1000)

# Device copies of the right-hand side and of the current iterate.
b_, x_ = cuda.to_device(b), cuda.to_device(x)

# Zero-initialised device work buffers: Ax and dum take b's shape,
# grad takes x's shape.
Ax = cuda.to_device(np.zeros(b.shape, dtype=b.dtype))
dum = cuda.to_device(np.zeros(b.shape, dtype=b.dtype))
grad = cuda.to_device(np.zeros(x.shape, dtype=x.dtype))

# Kernel launch constants (presumably threads per block / blocks per grid).
TPB, BPG = 128, 10

# Histories: one iterate and one residual norm are appended per mini-batch.
x_hat, error_list = [], []

In [13]:
lr = 1e-2/A.shape[1]
#############################################
# Mini-batch SGD on f(x) = ||Ax - b||^2. For each batch, a random subset of
# rows is uploaded and several gradient steps x -= 2 * lr * A^T (Ax - b) are
# taken on the device.
BATCH_SIZE = 100        # rows drawn (with replacement) per mini-batch
N_BATCHES = 5           # number of mini-batches
STEPS_PER_BATCH = 10    # gradient steps taken on each mini-batch

# Numba's launch syntax is kernel[blocks_per_grid, threads_per_block]. Size
# each grid from the length of the array the kernel writes, so every element
# gets a thread and blocks hold TPB threads each.
blocks_batch = (BATCH_SIZE + TPB - 1) // TPB    # kernels writing BATCH_SIZE values
blocks_param = (A.shape[1] + TPB - 1) // TPB    # kernels writing one value per column of A

# Work buffers sized to the mini-batch. vector_sub bounds its index by the
# size of its first argument, so a buffer longer than b_ would make it read
# past the end of b_.
Ax = cuda.to_device(np.zeros(BATCH_SIZE))
dum = cuda.to_device(np.zeros(BATCH_SIZE))

for batch_no in range(N_BATCHES):
    index = np.random.choice(A.shape[0], BATCH_SIZE)
    A_batch = A[index]
    A_ = cuda.to_device(A_batch)
    # The transposed batch is the same for every step below, so build and
    # upload it once per batch instead of transposing inside the inner loop.
    A_T_ = cuda.to_device(np.ascontiguousarray(A_batch.T))
    b_ = cuda.to_device(b[index])

    # All kernels are launched on the same (default) stream and therefore run
    # in issue order; no synchronisation is needed between them.
    for step in range(STEPS_PER_BATCH):
        matrix_vector_mul[blocks_batch, TPB](A_, x_, Ax)        # Ax
        vector_sub[blocks_batch, TPB](Ax, b_, dum)              # Ax - b
        matrix_vector_mul[blocks_param, TPB](A_T_, dum, grad)   # A^T (Ax - b)
        optimize[blocks_param, TPB](grad, lr, x_)               # x -= 2 * lr * grad

    # Wait for the queued kernels, then fetch the iterate and score it
    # against the full problem on the host.
    cuda.synchronize()
    x_update = x_.copy_to_host()
    x_hat.append(x_update)
    error = np.linalg.norm(A@x_update - b)
    error_list.append(error)
    print(error)

267.27832385383084
264.39529072322307
263.5892251144037
263.8945170299067
263.1559329881729


50번 반복에 13초 소요...<br>
대실패...