In [7]:
!pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.pyplot as plt
from time import time

In [10]:
ker = SourceModule(
    """
    #define x (threadIdx.x)

    __device__ void tall_matrix_multiply(float *matrix, float *vector, float *out, int width) {
        int index = x * width;

        for (int i = 0; i < width; i++) {
            out[x] += matrix[index + i] * vector[i];
        }
    }

    __device__ void atomic_sub(float *vector1, float *vector2, int width) {
        
        vector1[x] -= vector2[x];
    }

    __device__ void wide_matrix_multiply(float *matrix, float *vector, float *grad, int width, int length) {
        int wideness = length / width;
        int row = x / wideness;
        int col = x % wideness;
        int index1 = row * length;
        int index2 = row * wideness;

        for (int i = 0; i < width; i++) {
            int k = col * width + i;

            grad[index2 + col] += matrix[index1 + k] * vector[k];
        }

        if (x < width) {
            int index3 = wideness * x;

            for (int i = 1; i < wideness; i++) {
                grad[index3] += grad[index3 + i];
            }
        }
    }

    __device__ void update(float *vector, float *grad, float learning_rate, int width, int length) {
        int wideness = length / width;

        if (x < width) {
            int index3 = x * wideness;

            vector[x] -= grad[index3] * learning_rate;
        }
    }

    __global__ void main_function(float *matrix1, float *matrix2, float *vector1, float *vector2, float *out, float *grad, int width, int length, float learning_rate) {

        // optimize
        tall_matrix_multiply(matrix1, vector1, out, width);

        __syncthreads();

        atomic_sub(out, vector2, width);
        
        __syncthreads();
        
        wide_matrix_multiply(matrix2, out, grad, width, length);
        
        __syncthreads();

        update(vector1, grad, learning_rate, width, length);

        __syncthreads();

    }
    """
)

calculate = ker.get_function("main_function")

In [46]:
length = 1000
width = 20
wideness = np.int32(length / width)

lr = np.float32(2e-4)

A = np.float32(np.random.rand(length,width))
b = np.float32(np.random.rand(length))
theta = np.float32(np.random.rand(width))
out = np.float32(np.zeros_like(b))
grad = np.float32(np.zeros((width,wideness)))

A1_gpu = gpuarray.to_gpu(A.reshape(length*width))
A2_gpu = gpuarray.to_gpu(A.T.reshape(length*width))
b_gpu = gpuarray.to_gpu(b)
theta_gpu = gpuarray.to_gpu(theta)
out_gpu = gpuarray.to_gpu(out)
grad_gpu = gpuarray.to_gpu(grad.reshape(width*wideness))

init_vec1 = gpuarray.empty_like(out_gpu)
init_vec2 = gpuarray.empty_like(grad_gpu)

t1 = time()
for i in range(10):
    calculate(A1_gpu, A2_gpu, theta_gpu, b_gpu, out_gpu, grad_gpu, np.int32(width), np.int32(length), lr, block=(length,1,1), grid=(1,1,1))
    out_gpu[:] = init_vec1[:]
    grad_gpu[:] = init_vec2[:]

t2 = time()
theta_gpu = theta_gpu.get()

t3 = time()
theta_cpu = np.linalg.lstsq(A, b, rcond=None)[0]
t4 = time()

error_gpu = np.linalg.norm(np.dot(A, theta_gpu) - b)
error_cpu = np.linalg.norm(np.dot(A, theta_cpu) - b)

print(f"Error in GPU: {error_gpu}")
print(f"Error in CPU: {error_cpu}")
print(f"How does it take in GPU?: {round((t2 - t1) / (t4 - t3), 5)}")

Error in GPU: 13.0409574508667
Error in CPU: 9.464192390441895
How does it take in GPU?: 1.5276


In [None]:
class lstsq_dg:

    def __init__(self, A, b, learning_rate=5e-5, initial_value="gaussian"):
        self.A = np.float32(A)
        self.b = np.float32(b)
        self.lr = np.float32(learning_rate)
        
        # get data size
        self.length = self.A.shape[0]
        self.width = self.A.shape[1]
        self.wideness = np.int32(self.length / self.width)

        # create theta with...
        if initial_value == "gaussain":
            self.theta = np.float32(np.random.randn(self.width))
        else:
            self.theta = np.float32(np.random.rand(self.width))

        self.out = np.float32(np.zeros_like(self.b))
        self.grad = np.float32(np.zeros((self.width, self.wideness)))

        # to gpu
        self.A1_gpu = gpuarray.to_gpu(self.A.reshape(self.length*self.width))
        self.A2_gpu = gpuarray.to_gpu(self.A.T.reshape(self.length*self.width))
        self.b_gpu = gpuarray.to_gpu(self.b)
        self.theta_gpu = gpuarray.to_gpu(self.theta)
        self.out_gpu = gpuarray.to_gpu(self.out)
        self.grad_gpu = gpuarray.to_gpu(self.grad.reshape(self.width*self.wideness))

        # for initialize
        self.init_vec1 = gpuarray.empty_like(self.out_gpu)
        self.init_vec2 = gpuarray.empty_like(self.grad_gpu)

    def lstsq(self):
        for iters in range(50):
            # initialize
            self.out_gpu[:] = self.init_vec1[:]
            self.grad_gpu[:] = self.init_vec2[:]

            # calculate
            calculate(self.A1_gpu, self.A2_gpu, self.theta_gpu, self.b_gpu,\
                        self.out_gpu, self.grad_gpu, np.int32(self.width), np.int32(self.length), self.lr,\
                        block=(self.length,1,1), grid=(1,1,1))

        return self.theta_gpu.get()

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Colab Notebooks/lstsq')

In [None]:
lr = np.float32(3e-4)

R = np.loadtxt("R.txt")[:1000]
b = np.float32(np.random.rand(1000))

dg = lstsq_dg(R, b, learning_rate=lr)
t1 = time()
theta_gpu = dg.lstsq()
t2 = time()

t3 = time()
theta_cpu = np.linalg.lstsq(R, b, rcond=None)[0]
t4 = time()

error_gpu = np.linalg.norm(np.dot(R, theta_gpu) - b)
error_cpu = np.linalg.norm(np.dot(R, theta_cpu) - b)

print(f"Error in GPU: {error_gpu}")
print(f"Error in CPU: {error_cpu}")
print(f"How does it take in GPU?: {round((t2 - t1) / (t4 - t3), 5)}")

Error in GPU: 17.95802390809903
Error in CPU: 8.71697202338243
How does it take in GPU?: 14.16241
