In [1]:
!pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycuda
  Downloading pycuda-2021.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 15.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pytools>=2011.2
  Downloading pytools-2022.1.9.tar.gz (69 kB)
[K     |████████████████████████████████| 69 kB 7.8 MB/s 
[?25hCollecting mako
  Downloading Mako-1.2.0-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 7.0 MB/s 
Collecting platformdirs>=2.2.0
  Downloading platformdirs-2.5.2-py3-none-any.whl (14 kB)
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (PEP 517) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2021.1-cp37-cp37m-linux_x86_64.whl size=626634 sha256=3b4ab6da57ef6ca3122df90be7882afdd3969d19e900d

In [2]:
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.pyplot as plt
from time import time

In [None]:
import pycuda.autoinit

calculate_ker = SourceModule(
    """
    // Global row index for a 1-D launch: block = (BD,1,1), grid = (GD,1,1).
    #define _X (blockIdx.x * blockDim.x + threadIdx.x)

    // The number of columns of A (= length of theta) is taken from the block
    // size, so the kernel must be launched with blockDim.x == width of A.
    #define _WIDTH (blockDim.x)

    // Row-major flat index. Arguments are parenthesised so that expressions
    // such as _INDEX1(i + 1, j) expand correctly.
    #define _INDEX1(row, col) ((row) * _WIDTH + (col))

    // Residual of a linear system: out = A * theta - b.
    //   out:   vector with one element per launched thread (overwritten)
    //   A:     row-major matrix with _WIDTH columns, one row per thread
    //   b:     vector, same length as out
    //   theta: vector of length _WIDTH
    __global__ void mat_vec_ker(float *out, float *A, float *b, float *theta)
    {
        int x = _X;

        // Accumulate in a register instead of doing "out[x] +=" on global
        // memory: the result no longer depends on `out` having been zeroed
        // by the caller, and each thread issues a single global store.
        float acc = 0.0f;
        for (int j = 0; j < _WIDTH; j++)
        {
            acc += A[_INDEX1(x, j)] * theta[j];
        }

        // Every thread touches only its own out[x], so no barrier is needed.
        out[x] = acc - b[x];
    }
    """
)

gradient_ker = SourceModule(
    """
    #define _X (threadIdx.x)
    #define _B (blockIdx.x)
    #define _G (gridDim.x)

    // Number of columns of A, taken from the block size: both kernels must be
    // launched with blockDim.x == width of A.
    #define _WIDTH (blockDim.x)

    // Per-block partial products of A^T * out.
    //   grad_jerk: [BD, n] row-major, n = gridDim.x (overwritten).
    //              grad_jerk[x, B] = sum_k A[B*W + k, x] * out[B*W + k],
    //              i.e. block B covers rows B*W .. B*W + W - 1 of A.
    //   out:       residual vector, one element per row of A
    //   A:         row-major matrix with _WIDTH columns
    //   width:     not read by the kernel (the width comes from blockDim.x);
    //              kept so the existing launch code keeps working.
    __global__ void gradient_ker(float *grad_jerk, float *out, float *A, int width)
    {
        int x = _X;
        int row0 = _B * _WIDTH;  // first row of A handled by this block

        // Register accumulator: the result does not depend on grad_jerk
        // having been zeroed beforehand.
        float acc = 0.0f;
        for (int k = 0; k < _WIDTH; k++)
        {
            acc += A[(row0 + k) * _WIDTH + x] * out[row0 + k];
        }

        grad_jerk[x * _G + _B] = acc;
    }

    // Reduce the per-block partials: grad[x] = sum_k grad_jerk[x, k].
    // gridDim.x is used as the number of partial columns, so this kernel has
    // to be launched with the same grid size as gradient_ker.
    __global__ void finish_ker(float *grad, float *grad_jerk)
    {
        int x = _X;

        float acc = 0.0f;
        for (int k = 0; k < _G; k++)
        {
            acc += grad_jerk[x * _G + k];
        }

        // Plain store rather than "+=": every block computes the same value
        // for grad[x], so a read-modify-write here would race between blocks
        // and could count the sum more than once. Storing is idempotent.
        grad[x] = acc;
    }
    """
)

update_ker = SourceModule(
    """
    // Global element index for a 1-D launch. The block offset is
    // blockIdx.x * blockDim.x; blockDim.y plays no part in a 1-D block.
    #define _X (blockIdx.x * blockDim.x + threadIdx.x)

    // One gradient-descent step: theta_new = theta - lr * grad.
    // One thread per element; launch exactly as many threads as there are
    // elements (the kernel performs no bounds check).
    __global__ void update_ker(float *theta_new, float *theta, float lr, float *grad)
    {
        int x = _X;

        // Purely element-wise, so no barrier is required.
        theta_new[x] = theta[x] - grad[x] * lr;
    }
    """
)

# Host-side callables for the compiled kernels.
multiply = calculate_ker.get_function("mat_vec_ker")
gradient, finish = (
    gradient_ker.get_function(kernel_name)
    for kernel_name in ("gradient_ker", "finish_ker")
)
update = update_ker.get_function("update_ker")

In [None]:
# set matrix size
# BD: blockDim, GD: gridDim
# The kernels read the matrix width from blockDim.x, so width must equal BD.
BD = 32 * 32
GD_x = 2
length = BD * GD_x
width = BD

lr = np.float32(1e-6)

A = np.float32(np.random.randn(length,width))
b = np.float32(np.random.randn(length))
theta = np.float32(np.random.randn(width))
grad_jerk = np.float32(np.zeros((BD,GD_x)))

A_gpu = gpuarray.to_gpu(A.reshape(width*length))
b_gpu = gpuarray.to_gpu(b)
theta_gpu = gpuarray.to_gpu(theta)
theta_new_gpu = gpuarray.to_gpu(theta)
out_gpu = gpuarray.empty_like(b_gpu)
grad_jerk_gpu = gpuarray.to_gpu(grad_jerk.reshape(BD*GD_x))
grad_gpu = gpuarray.empty_like(theta_gpu)

# Zero-filled templates for resetting the accumulators. These must be
# zeros_like, not empty_like: empty_like returns uninitialised device memory,
# and copying it into the accumulators would seed them with garbage.
init_gpu1 = gpuarray.zeros_like(theta_gpu)
init_gpu2 = gpuarray.zeros_like(b_gpu)
init_gpu3 = gpuarray.zeros_like(grad_jerk_gpu)

# initialize
grad_jerk_gpu[:] = init_gpu3
out_gpu[:] = init_gpu2
grad_gpu[:] = init_gpu1
theta_gpu[:] = theta_new_gpu[:]
theta_new_gpu[:] = init_gpu1

# in GPU
t1 = time()
multiply(out_gpu, A_gpu, b_gpu, theta_gpu, block=(BD,1,1), grid=(GD_x,1,1))
gradient(grad_jerk_gpu, out_gpu, A_gpu, np.int32(GD_x), block=(BD,1,1), grid=(GD_x,1,1))
# finish_ker uses gridDim.x as the number of partial columns, so it has to be
# launched with the same grid as gradient_ker.
finish(grad_gpu, grad_jerk_gpu, block=(BD,1,1), grid=(GD_x,1,1))
update(theta_new_gpu, theta_gpu, lr, grad_gpu, block=(BD,1,1), grid=(1,1,1))
# Kernel launches return before the GPU has finished; wait for completion so
# the interval covers the actual computation and not just the launch overhead.
drv.Context.synchronize()
t2 = time()

# result_gpu = out_gpu.get()
# result_gpu = grad_gpu.get()
result_gpu = theta_new_gpu.get()

# in CPU
# result_cpu = np.dot(A, theta) - b
# result_cpu = np.dot(A.T, (np.dot(A,theta) - b))
result_cpu = theta - np.dot(A.T, (np.dot(A, theta) - b)) * lr

print(f"Does it right?: {np.allclose(result_cpu, result_gpu)}")
print(f"How about 2-Norm?: {np.linalg.norm(result_cpu - result_gpu)}")
print(f"How does it take?: {t2 -t1}")

Does it right?: True
How about 2-Norm?: 2.54491453688388e-07
How does it take?: 0.0002651214599609375


# Not yet

In [82]:
ker1 = SourceModule(
    """
    // Fused residual + partial-gradient kernel.
    // Every function indexes by threadIdx.x only and main_function relies on
    // __syncthreads(), which synchronises a single block: launch this kernel
    // as ONE 1-D block with at least `length` threads.
    // (The thread index is passed around as `tid` instead of a one-letter
    // macro named `x`, which would rewrite any other `.x` member access.)

    // out[tid] = dot(row `tid` of matrix, vector).
    // matrix is row-major with `width` columns.
    __device__ void tall_matrix_multiply(float *matrix, float *vector, float *out, int width, int tid) {
        int index = tid * width;
        float acc = 0.0f;  // register accumulator: out need not be zeroed

        for (int i = 0; i < width; i++) {
            acc += matrix[index + i] * vector[i];
        }
        out[tid] = acc;
    }

    // vector1[tid] -= vector2[tid]. A plain per-thread subtraction; each
    // thread owns its element, so no atomic operation is involved.
    __device__ void vector_sub(float *vector1, float *vector2, int tid) {
        vector1[tid] -= vector2[tid];
    }

    // Partial products of a row-major [width x length] matrix with `vector`.
    // Each row is cut into wideness = length / width chunks of `width`
    // elements; thread tid = row * wideness + col stores in grad[tid] the dot
    // product of chunk `col` of row `row` with the matching slice of vector.
    // Summing grad over `col` on the host yields matrix * vector.
    __device__ void wide_matrix_multiply(float *matrix, float *vector, float *grad, int width, int length, int tid) {
        if (width <= 0 || length < width) {
            return;  // would divide by zero below
        }

        int wideness = length / width;
        int row = tid / wideness;
        int col = tid % wideness;
        if (row >= width) {
            return;  // surplus thread: no grad element belongs to it
        }

        int index1 = row * length;
        float acc = 0.0f;  // register accumulator: grad need not be zeroed

        for (int i = 0; i < width; i++) {
            int k = col * width + i;

            acc += matrix[index1 + k] * vector[k];
        }
        grad[row * wideness + col] = acc;
    }

    // matrix1: [length x width] row-major, matrix2: its transpose
    // ([width x length] row-major). Computes
    //   out  = matrix1 * vector1 - vector2
    //   grad = per-chunk partials of matrix2 * out (see wide_matrix_multiply)
    __global__ void main_function(float *matrix1, float *matrix2, float *vector1, float *vector2, float *out, float *grad, int width, int length) {
        const int tid = threadIdx.x;

        // Threads beyond `length` skip the work but still reach the barrier.
        if (tid < length) {
            tall_matrix_multiply(matrix1, vector1, out, width, tid);
            vector_sub(out, vector2, tid);
        }

        // The gradient stage reads out[] elements written by other threads.
        __syncthreads();

        if (tid < length) {
            wide_matrix_multiply(matrix2, out, grad, width, length, tid);
        }
    }
    """
)

calculate = ker1.get_function("main_function")

In [88]:
length = 400
width = 25
# The kernel splits every row of A.T into length // width chunks of `width`
# elements, so length has to be an exact multiple of width.
assert length % width == 0, "length must be a multiple of width"
wideness = np.int32(length // width)

# NOTE(review): the GPU time is scaled by this factor before being compared
# with a single lstsq solve - presumably the number of gradient steps a full
# descent would need. Confirm the intended value.
gpu_time_scale = 300

A = np.float32(np.random.randn(length,width))
b = np.float32(np.random.randn(length))
theta = np.float32(np.random.randn(width))
out = np.float32(np.zeros_like(b))
grad = np.float32(np.zeros((width,wideness)))

A1_gpu = gpuarray.to_gpu(A.reshape(length*width))
A2_gpu = gpuarray.to_gpu(A.T.reshape(length*width))
b_gpu = gpuarray.to_gpu(b)
theta_gpu = gpuarray.to_gpu(theta)
out_gpu = gpuarray.to_gpu(out)
grad_gpu = gpuarray.to_gpu(grad)

t1 = time()
# One block of `length` threads: the kernel indexes by threadIdx.x only.
calculate(A1_gpu, A2_gpu, theta_gpu, b_gpu, out_gpu, grad_gpu, np.int32(width), np.int32(length), block=(length,1,1), grid=(1,1,1))
# The launch is asynchronous; wait for the kernel to finish so that the
# measured interval is the run time and not only the launch overhead.
drv.Context.synchronize()
t2 = time()

t3 = time()
np.linalg.lstsq(A, b, rcond=None)
t4 = time()

# Each row of grad holds the per-chunk partial sums of one gradient component.
result_gpu = np.sum(grad_gpu.get(), axis=1)
result_cpu = np.dot(A.T,(np.dot(A,theta) - b))

print(f"Does it right?: {np.allclose(result_cpu, result_gpu)}")
print(f"How about 2-Norm?: {np.linalg.norm(result_cpu - result_gpu)}")
print(f"How does it take in GPU?: {round((t2 - t1) * gpu_time_scale, 5)} s")
print(f"How does it take in CPU?: {round(t4 - t3, 4)} s")
print(f"It's good?: {round((t2 - t1) * gpu_time_scale / (t4 - t3), 4)}")

Does it right?: True
How about 2-Norm?: 0.00030691130086779594
How does it take in GPU?: 0.06852 s
How does it take in CPU?: 0.0052 s
It's good?: 13.215


In [87]:
# Display the element-wise difference between the GPU and CPU results.
result_gpu - result_cpu

array([-1.5258789e-05, -6.1035156e-05,  1.5258789e-05,  0.0000000e+00,
       -6.1035156e-05,  0.0000000e+00,  3.0517578e-05,  6.1035156e-05,
        0.0000000e+00, -2.6702881e-05, -6.1035156e-05, -3.0517578e-05,
        1.8310547e-04,  0.0000000e+00,  0.0000000e+00,  1.5258789e-05,
       -3.0517578e-05,  0.0000000e+00, -1.2207031e-04,  1.2207031e-04,
        0.0000000e+00,  6.1035156e-05,  2.4414062e-04,  6.1035156e-05,
        0.0000000e+00], dtype=float32)

# Success~~