In [2]:
from numba import cuda, jit, float32
import numpy as np
from timeit import default_timer as timer
from time import time
import matplotlib.pyplot as plt

# 2022_04_26.ipynb 에 있는 함수들


In [3]:
## vector addition
@cuda.jit
def vector_add(x, y, out):
    """CUDA kernel: element-wise sum, ``out[i] = x[i] + y[i]``.

    Each thread handles the single element selected by its absolute index
    in a 1-D launch grid. Threads whose index lies outside any of the three
    arrays do nothing, so launching more threads than elements is safe and
    a shorter ``y`` or ``out`` is never indexed past its end.

    Parameters
    ----------
    x, y : device arrays indexed with one index (presumably 1-D)
        Input operands.
    out : device array indexed with one index
        Destination, written in place.
    """
    i = cuda.grid(1)  # absolute thread index in the 1-D grid

    # Kernels are not bounds-checked by default, so guard on every array
    # that is indexed below, not only on x.
    if i < x.size and i < y.size and i < out.size:
        out[i] = x[i] + y[i]

## vector subtraction
@cuda.jit
def vector_sub(x, y, out):
    """CUDA kernel: element-wise difference, ``out[i] = x[i] - y[i]``.

    Each thread handles the single element selected by its absolute index
    in a 1-D launch grid. Threads whose index lies outside any of the three
    arrays do nothing, so launching more threads than elements is safe and
    a shorter ``y`` or ``out`` is never indexed past its end.

    Parameters
    ----------
    x, y : device arrays indexed with one index (presumably 1-D)
        Minuend and subtrahend.
    out : device array indexed with one index
        Destination, written in place.
    """
    i = cuda.grid(1)  # absolute thread index in the 1-D grid

    # Kernels are not bounds-checked by default. Guarding only on x.size
    # lets a thread read past the end of y (or write past the end of out)
    # whenever x is the longest of the three arrays.
    if i < x.size and i < y.size and i < out.size:
        out[i] = x[i] - y[i]

## matrix vector multiplication
@cuda.jit
def matrix_vector_mul(A, x, out):
    """CUDA kernel: matrix-vector product, ``out[i] = sum_j A[i, j] * x[j]``.

    Each thread computes the dot product of one row of ``A`` with ``x``.
    Threads whose index is not a valid row of ``A`` or not a valid index
    into ``out`` do nothing.

    Parameters
    ----------
    A : 2-D device array
        Matrix operand, indexed as ``A[i, j]``.
    x : device array indexed with one index
        Vector operand. It must hold at least ``A.shape[1]`` elements; this
        is the caller's responsibility and is not checked here.
    out : device array indexed with one index
        Destination for the per-row results, written in place.
    """
    i = cuda.grid(1)  # absolute thread index in the 1-D grid == row index

    # Guard on out as well as on the row count: kernels are not
    # bounds-checked by default, so a short out would be written past its end.
    if i < A.shape[0] and i < out.size:
        # Integer start value keeps the accumulator generic: it is promoted
        # by the first addition when the inputs are floating point.
        tmp = 0
        for j in range(A.shape[1]):
            tmp += A[i, j] * x[j]
        out[i] = tmp

## optimize function
@cuda.jit
def optimize(grad, lr, x):
    """CUDA kernel: in-place descent step, ``x[i] -= grad[i] * lr * 2``.

    Each thread updates the single element selected by its absolute index
    in a 1-D launch grid. Threads whose index lies outside ``x`` or ``grad``
    do nothing.

    Parameters
    ----------
    grad : device array indexed with one index
        Update direction. The factor 2 is applied inside this kernel, so the
        caller passes the direction without it.
    lr : scalar
        Step size.
    x : device array indexed with one index
        Parameters, modified in place.
    """
    i = cuda.grid(1)  # absolute thread index in the 1-D grid

    # grad is read as well as x, so both lengths must cover i; kernels are
    # not bounds-checked by default.
    if i < x.size and i < grad.size:
        x[i] -= grad[i] * lr * 2

# CPU 에서의 시간 측정

In [8]:
# Problem setup: a random dense system (A, b) and a random starting point x.
# Seed the global NumPy RNG so the data drawn here, and the mini-batch indices
# drawn later with np.random.choice, are reproducible after a kernel restart.
np.random.seed(0)

A = np.random.rand(10000,1000)
b = np.random.rand(10000)
x = np.random.rand(1000)

# Device-side state. x_ is a copy of x taken at this point, so later in-place
# changes to the host array x do not affect it.
x_ = cuda.to_device(x)
# Zero-initialised device buffers: Ax and dum have the length of b,
# grad has the length of x.
Ax = cuda.to_device(np.zeros_like(b))
dum = cuda.to_device(np.zeros_like(b))
grad = cuda.to_device(np.zeros_like(x))

# Launch configuration shared by every kernel call:
# BPG * TPB = 102400 threads, more than the longest vector here (10000).
## Blocks per grid
BPG = 100
## Threads per block
TPB = 1024

# Residual norms recorded during training.
error_list = []
# Step size, scaled by the number of columns of A.
lr = 1e-3/A.shape[1]

In [9]:
# Mini-batch gradient descent on ||A x - b||^2 with NumPy on the CPU.
# Timing uses default_timer (imported above as `timer`) rather than
# time.time(): it is the clock timeit uses for measuring short intervals.
# NOTE: the timed region also contains the full-data residual evaluation
# done once per outer iteration.
t1 = timer()
for i in range(10):
    # Draw 100 row indices; np.random.choice samples with replacement by default.
    index = np.random.choice(A.shape[0],100)
    AA = A[index,:]
    bb = b[index]
    for j in range(10):
        q = np.dot(AA,x)
        # Gradient of ||AA x - bb||^2 with respect to x.
        grd = 2 * np.dot(AA.T,(q - bb))
        # In-place update: the host array x itself is modified, so re-running
        # this cell continues from the current x rather than the initial one.
        x -= grd * lr

    # Residual norm on the full data set after this mini-batch.
    error = np.linalg.norm(A@x - b)
    error_list.append(error)

t2 = timer()

# theta is another name for the same array object as x, not a copy.
theta = x
error = np.linalg.norm(A@theta - b)

print(error)
print(f"It took {t2 - t1} seconds to get theta")

312.91974599168407
It took 0.08751988410949707 seconds to get theta


In [10]:
# Mini-batch gradient descent on ||A x - b||^2 on the GPU, using the CUDA
# kernels defined earlier in this notebook. x_ is updated in place on the
# device, so re-running this cell continues from the current x_.
batch_size = 100

# Scratch buffers sized to the mini-batch. vector_sub bounds its work by the
# length of its first argument, so a buffer longer than b_ would make threads
# index b_ past its end. Every element is overwritten before it is read.
Ax_batch = cuda.to_device(np.zeros(batch_size, dtype=b.dtype))
residual = cuda.to_device(np.zeros(batch_size, dtype=b.dtype))

# NOTE: the first launch of each kernel triggers JIT compilation, and that
# one-off cost is part of the time measured below.
t1 = timer()
for i in range(10):
    # Draw row indices; np.random.choice samples with replacement by default.
    index = np.random.choice(A.shape[0], batch_size)
    A_ = cuda.to_device(A[index])
    b_ = cuda.to_device(b[index])
    # The transpose does not depend on j, so build it once per batch
    # instead of once per inner step.
    A_T = A_.T

    for j in range(10):
        # Launches issued on the same stream run in issue order, so no
        # synchronize is needed between dependent kernels.
        matrix_vector_mul[BPG,TPB](A_, x_, Ax_batch)      # Ax_batch = A_ x
        vector_sub[BPG,TPB](Ax_batch, b_, residual)       # residual = A_ x - b_
        matrix_vector_mul[BPG,TPB](A_T, residual, grad)   # grad = A_^T residual
        optimize[BPG,TPB](grad, lr, x_)                   # x_ -= 2 * lr * grad

# Kernel launches are asynchronous: wait for all queued work to finish
# before stopping the clock.
cuda.synchronize()
t2 = timer()

x_hat = x_.copy_to_host()
error_gpu = np.linalg.norm(A@x_hat - b)

print(error_gpu)
print(f"It took {t2 - t1} seconds to get x_hat")

305.9553131085096
It took 23.9690580368042 seconds to get x_hat
