In [None]:
!pip install pycuda

In [None]:
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.pyplot as plt
from time import time
import math

In [None]:
class Shared:
    """State shared between the gradient kernels and the optimizers.

    Keeps float32 copies of the problem data on the host, mirrors them on the
    GPU as pycuda arrays, and stores the launch sizes and the learning rate.
    """

    def __init__(self, A, b, learning_rate, constrained=None):
        """Copy the problem to the GPU and allocate the work buffers.

        Args:
            A: 2-D array; ``A.shape[0]`` is stored as ``length`` and
                ``A.shape[1]`` as ``width``.
            b: right-hand side, uploaded as a float32 vector.
            learning_rate: step size, stored unchanged.
            constrained: optional value stored unchanged on
                ``self.constrained``; nothing in this class reads it.
        """
        ## input matrix's shape
        self.length = A.shape[0]
        self.width = A.shape[1]

        ## in CPU
        self.A = np.float32(A)
        self.b = np.float32(b)
        self.theta = np.zeros(self.width, dtype=np.float32)
        self.out = np.zeros(self.length, dtype=np.float32)
        self.grad = np.zeros(self.width, dtype=np.float32)
        ## taken as an argument: there is no module-level `constrained` to read
        self.constrained = constrained

        ## in GPU (A is uploaded flattened, row after row)
        self.GPU_A = gpuarray.to_gpu(self.A.reshape(self.length*self.width))
        self.GPU_b = gpuarray.to_gpu(self.b)
        self.GPU_theta = gpuarray.to_gpu(self.theta)
        self.GPU_out = gpuarray.to_gpu(self.out)
        self.GPU_grad = gpuarray.to_gpu(self.grad)

        ## for initialize out vector and grad vector
        ## these are copied over out/grad before the kernels accumulate with +=,
        ## so they must really hold zeros (empty_like leaves memory uninitialized)
        self.init_out = gpuarray.zeros_like(self.GPU_out)
        self.init_grad = gpuarray.zeros_like(self.GPU_grad)

        ## TPB: thread_per_block, BPG: block_per_grid
        self.TPB, self.BPG = self.optimal_block_size(self.length)

        ## learning_rate
        self.learning_rate = learning_rate

    def optimal_block_size(self, n):
        """Split ``n`` rows into launch sizes.

        Args:
            n: number of rows to cover.

        Returns:
            ``(thread_per_block, block_per_grid)`` with
            ``thread_per_block * block_per_grid > n``.
        """
        ## at least one thread, otherwise n < 2 would divide by zero below
        thread_per_block = max(1, int(math.sqrt(n / 2)))

        block_per_grid = int(n / thread_per_block) + 1

        return thread_per_block, block_per_grid

    def momentum(self):
        """Set the momentum hyper-parameters ``s`` (step size) and ``beta``."""
        self.s = self.learning_rate
        self.beta = 1/3

    def nesterov(self):
        """Placeholder for Nesterov hyper-parameters.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

In [None]:
class GetGradient:
    """Computes the least-squares residual and its gradient on the GPU.

    ``run`` launches two kernels: ``first`` writes ``out = A.theta - b`` and
    ``second`` adds ``A^T.out`` onto ``grad``.
    """

    ## must match the size of the __shared__ array declared in the `second` kernel
    _MAX_PARTIAL_SUMS = 1000

    def __init__(self, shared):
        """Keep a reference to the shared state and compile the kernels."""
        self.shared = shared

        self.kernel_function()

    def run(self):
        """Launch both kernels on the buffers held by ``self.shared``.

        ``grad`` is accumulated with ``+=``, so whatever it holds before the
        call is kept and added to.

        Raises:
            ValueError: if ``shared.BPG`` is larger than the shared-memory
                buffer of the ``second`` kernel.
        """
        shared = self.shared

        ## `second` runs BPG threads per block, one shared-memory slot each
        if shared.BPG > self._MAX_PARTIAL_SUMS:
            raise ValueError(
                f"block_per_grid={shared.BPG} exceeds the "
                f"{self._MAX_PARTIAL_SUMS} partial sums the `second` kernel can hold"
            )

        ## get out = np.dot(A, theta) - b
        ## (the kernel overwrites out, so no reset is needed beforehand)
        self.first(shared.GPU_out,
                   shared.GPU_A,
                   shared.GPU_theta,
                   shared.GPU_b,
                   np.int32(shared.length),
                   np.int32(shared.width),
                   block=(shared.TPB,1,1),
                   grid=(shared.BPG,1,1))

        ## get grad += np.dot(A.T, out)
        self.second(shared.GPU_grad,
                    shared.GPU_A,
                    shared.GPU_out,
                    np.int32(shared.TPB),
                    np.int32(shared.BPG),
                    np.int32(shared.length),
                    np.int32(shared.width),
                    block=(shared.BPG,1,1),
                    grid=(shared.width,1,1))

    def kernel_function(self):
        """Compile the CUDA sources and store the kernels as ``first`` / ``second``."""
        ## block=(thread_per_block,1,1), grid=(block_per_grid,1,1)
        ## one thread per row of A
        first_ker_function = \
        """
        __global__ void first(float* out, float* A, float* theta, float* b, int length, int width) {

            const int row = threadIdx.x + blockIdx.x * blockDim.x;

            if (row < length) {
                float acc = 0.0f;

                for (int j = 0; j < width; j++) {
                    acc += A[row * width + j] * theta[j];
                }

                out[row] = acc - b[row];
            }
        }
        """
        first_ker = SourceModule(first_ker_function)

        ## block=(block_per_grid,1,1), grid=(width,1,1)
        ## one block per column of A; each thread sums thread_per_block rows
        second_ker_function = \
        """
        #define tx (threadIdx.x)
        #define bx (blockIdx.x)

        __global__ void second(float* grad, float* A, float* out, int thread_per_block, int block_per_grid, int length, int width) {

            __shared__ float grad_jerk[1000];

            grad_jerk[tx] = 0;

            for (int i = 0; i < thread_per_block; i++) {
                int index1 = tx * thread_per_block + i;

                // the row ranges of the last threads run past the matrix
                if (index1 < length) {
                    int index2 = index1 * width + bx;

                    grad_jerk[tx] += A[index2] * out[index1];
                }
            }

            __syncthreads();

            if (tx == 0) {
                for (int i = 0; i < block_per_grid; i++) {
                    grad[bx] += grad_jerk[i];
                }
            }
        }
        """
        second_ker = SourceModule(second_ker_function)

        self.first = first_ker.get_function("first")
        self.second = second_ker.get_function("second")

    def initialize(self):
        """Set every entry of the residual buffer ``GPU_out`` to zero."""
        self.shared.GPU_out.fill(0)

In [None]:
class Optimizer:
    """Base class of the GPU update rules.

    Subclasses provide ``kernel_function`` (compile the kernels), ``run``
    (apply one update to ``shared.GPU_theta``) and ``initialize`` (prepare
    ``shared.GPU_grad`` for the next gradient accumulation).
    """

    ## cap on threads per block for the element-wise kernels
    _MAX_THREADS_PER_BLOCK = 256

    def __init__(self, shared):
        """Keep a reference to the shared state and compile the kernels."""
        self.shared = shared

        self.kernel_function()

    def run(self):
        """Apply one update step. Must be overridden."""
        raise NotImplementedError()

    def kernel_function(self):
        """Compile the kernels of the update rule. Must be overridden."""
        raise NotImplementedError()

    def initialize(self):
        """Prepare the gradient buffer for the next step. Must be overridden."""
        raise NotImplementedError()

    def _launch_config(self):
        """Return ``(block, grid)`` tuples giving one thread per entry of theta.

        Up to ``_MAX_THREADS_PER_BLOCK`` entries fit in a single block;
        beyond that the work is spread over several blocks.
        """
        width = int(self.shared.width)
        threads = max(1, min(width, self._MAX_THREADS_PER_BLOCK))
        ## ceiling division
        blocks = max(1, -(-width // threads))

        return (threads, 1, 1), (blocks, 1, 1)



class GradientMethod(Optimizer):
    """Plain gradient descent: ``theta -= learning_rate * grad``."""

    def run(self):
        """Update theta on the GPU, then zero the gradient buffer."""
        block, grid = self._launch_config()

        self.gradient_method(self.shared.GPU_theta,
                             self.shared.GPU_grad,
                             np.float32(self.shared.learning_rate),
                             np.int32(self.shared.width),
                             block=block,
                             grid=grid)

        self.initialize()

    def kernel_function(self):
        """Compile the update kernel and store it as ``self.gradient_method``."""
        gradient_method_ker = SourceModule(
            """
            __global__ void gradient_method (float* theta, float* grad, float learning_rate, int width) {
                const int idx = threadIdx.x + blockIdx.x * blockDim.x;

                if (idx < width) {
                    theta[idx] -= learning_rate * grad[idx];
                }
            }
            """
        )

        self.gradient_method = gradient_method_ker.get_function("gradient_method")

    def initialize(self):
        """Zero the gradient buffer so the next accumulation starts from zero."""
        self.shared.GPU_grad.fill(0)



class MomentumMethod(Optimizer):
    """Gradient step that keeps a decayed copy of the previous gradients.

    Reads ``shared.s`` and ``shared.beta``, which ``Shared.momentum()`` sets;
    that method has to be called before ``run``.
    """

    def run(self):
        """Update theta with step ``s``, then scale the gradient by ``beta``."""
        block, grid = self._launch_config()

        self.momentum_method(self.shared.GPU_theta,
                             self.shared.GPU_grad,
                             np.float32(self.shared.s),
                             np.int32(self.shared.width),
                             block=block,
                             grid=grid)

        self.initialize()

    def kernel_function(self):
        """Compile both kernels; store them as ``momentum_method`` / ``momentum``."""
        momentum_ker = SourceModule(
            """
            __global__ void momentum_method (float* theta, float* grad, float s, int width) {
                const int idx = threadIdx.x + blockIdx.x * blockDim.x;

                if (idx < width) {
                    theta[idx] -= s * grad[idx];
                }
            }

            __global__ void momentum (float* grad, float beta, int width) {
                const int idx = threadIdx.x + blockIdx.x * blockDim.x;

                if (idx < width) {
                    grad[idx] *= beta;
                }
            }
            """
        )

        self.momentum_method = momentum_ker.get_function("momentum_method")
        self.momentum = momentum_ker.get_function("momentum")

    def initialize(self):
        """Scale the gradient buffer by ``beta`` instead of clearing it.

        The next gradient is added on top of the scaled values.
        """
        block, grid = self._launch_config()

        self.momentum(self.shared.GPU_grad,
                      np.float32(self.shared.beta),
                      np.int32(self.shared.width),
                      block=block,
                      grid=grid)

In [None]:
## fixed seed so that Restart & Run All rebuilds the same problem
np.random.seed(0)

## problem: 2000 rows, 50 unknowns, entries drawn uniformly from [0, 1)
A = np.random.rand(2000,50)
b = np.random.rand(2000)
learning_rate = 3e-5

In [None]:
## GPU run: the timed span covers building the solver, solve() and
## fetching theta from the GPU array
t1 = time()
lstsq = LeastSquare(
    A,
    b,
    learning_rate,
    iteration=5,
    optimize_method="momentum",
)
lstsq.solve()
opt_theta_gpu = lstsq.shared.GPU_theta.get()
t2 = time()

print(f"Duration: {round(t2 - t1,  4)} s")

## plot lstsq.error, then print its last entry
fig, ax = plt.subplots()
ax.plot(lstsq.error)
plt.show()

print(lstsq.error[-1])

In [None]:
## CPU reference: NumPy's least-squares solver on the same A and b
t1 = time()
opt_theta = np.linalg.lstsq(A, b, rcond=None)[0]
t2 = time()

print(f"Duration: {round(t2 - t1, 4)} s")

## norm of the residual A.theta - b for the reference solution
print(np.linalg.norm(A.dot(opt_theta) - b))

In [None]:
## benchmark settings: problems with 1000, 2000, ..., 10000 rows and 50 columns
n_trials = 10
lengths = [(i + 1) * 1000 for i in range(n_trials)]

## column 0: GPU solver, column 1: np.linalg.lstsq
error = np.zeros((n_trials, 2))
duration = np.zeros((n_trials, 2))

for i, length in enumerate(lengths):
    A = np.random.randn(length,50)
    b = np.random.randn(length)
    ## NOTE(review): this is 0 on the first pass (i == 0) and grows with the
    ## problem size -- confirm that this schedule is intended
    learning_rate = 2e-5 * i

    t1 = time()
    lstsq = LeastSquare(A, b, learning_rate, iteration=5, optimize_method="momentum")
    lstsq.solve()
    opt_theta_gpu = lstsq.shared.GPU_theta.get()
    t2 = time()

    error[i,0] = np.linalg.norm(np.dot(A, opt_theta_gpu) - b)
    duration[i,0] = t2 - t1

    t1 = time()
    opt_theta = np.linalg.lstsq(A,b, rcond=None)[0]
    t2 = time()

    error[i,1] = np.linalg.norm(np.dot(A, opt_theta) - b)
    duration[i,1] = t2 - t1

## plot against the number of rows so the x-axis reads as problem size
fig, (ax_error, ax_duration) = plt.subplots(1, 2, figsize=(16,8))

ax_error.plot(lengths, error[:,0], label="error by GPU")
ax_error.plot(lengths, error[:,1], label="error by CPU")
ax_error.set(title="Residual norm", xlabel="rows in A", ylabel="||A theta - b||")
ax_error.legend()

ax_duration.plot(lengths, duration[:,0], label="duration in GPU")
ax_duration.plot(lengths, duration[:,1], label="duration in CPU")
ax_duration.set(title="Wall-clock time", xlabel="rows in A", ylabel="seconds")
ax_duration.legend()

plt.show()