In [None]:
import os
from google.colab import drive
import matplotlib.pyplot as plt
import pandas as pd
# Mount Google Drive into the Colab runtime's filesystem.
drive.mount('/content/drive')
# Switch to the project folder so the relative file names opened by later
# cells (e.g. "data_parallel_result.txt") resolve inside it.
os.chdir('/content/drive/My Drive/Colab Notebooks/lstsq/Pycuda')

Mounted at /content/drive


아래에서 사용한 방식은 GPU를 두 개 쓴다는 가정하에, 각 GPU가 전체 데이터 셋을 나누어 가져간 뒤 각자 SGD로 학습하고, 매 epoch마다 결과를 합쳐(평균을 계산) 그 결과를 시작점으로 다시 학습하는 방법입니다. 앞서 읽은 논문의 용어로 표현하면 **data parallelism**에 해당하는 방법입니다.

```python
import numpy as np
import matplotlib.pyplot as plt
from time import time 

class LeastSquare():
    """Least-squares solver that simulates data-parallel gradient descent.

    The rows of ``A`` / ``b`` are split into ``num_gpu`` contiguous shards,
    one per simulated GPU.  In every epoch each worker starts from the same
    shared estimate, runs gradient descent on a random mini-batch drawn from
    its own shard, and the workers' results are averaged into the next
    shared estimate.

    Args:
        A: 2-D array of shape (rows, cols).
        b: 1-D array with one entry per row of ``A``.
        num_gpu: number of simulated workers (shards).
        epoches: number of average-and-restart rounds performed by ``run``.
        batch_size: number of rows sampled (with replacement) per worker
            and epoch.

    Raises:
        ValueError: if ``num_gpu`` is smaller than 1 or ``A`` has fewer rows
            than ``num_gpu``.
    """

    def __init__(self, A, b, num_gpu=2, epoches=2, batch_size=1000):
        if num_gpu < 1:
            raise ValueError("num_gpu must be at least 1")
        if A.shape[0] < num_gpu:
            raise ValueError("A needs at least one row per simulated GPU")

        self.A = A
        self.b = b
        # Step size shrinks with the number of unknowns.
        self.lr = 1e-3/A.shape[1]
        self.num_gpu = num_gpu
        self.epoches = epoches
        self.batch_size = batch_size
        # Shared estimate; replaced by the workers' average after each epoch.
        self.x_hat = np.random.rand(A.shape[1])
        # Row j holds the most recent result of worker j.
        self.x_list = np.zeros((self.num_gpu, self.A.shape[1]))
        # One entry per check() call, in call order.
        self.error_list = []
        # Nominal shard size; the last shard also receives the remainder.
        self.n = self.A.shape[0] // num_gpu
        # Two-way split kept for code that reads these attributes directly.
        self.A1 = A[:self.n, :]
        self.b1 = b[:self.n]
        self.A2 = A[self.n:, :]
        self.b2 = b[self.n:]
        self._shards = self._split_rows(A, b)

    def _split_rows(self, A, b):
        """Return a list of ``num_gpu`` contiguous ``(A_part, b_part)`` shards."""
        bounds = [k * self.n for k in range(self.num_gpu)] + [A.shape[0]]
        return [
            (A[lo:hi, :], b[lo:hi])
            for lo, hi in zip(bounds[:-1], bounds[1:])
        ]

    def run(self):
        """Run ``epoches`` rounds of per-worker optimisation plus averaging.

        After each worker finishes, the residual of its result on the full
        data set is appended to ``error_list`` (worker 0 first, then worker
        1, ... for every epoch).

        Returns:
            The final averaged estimate (also stored in ``self.x_hat``).
        """
        for _ in range(self.epoches):
            for j in range(self.num_gpu):
                A, b = self.initialize(j)
                # optimize() works on a copy, so every worker really starts
                # from the same shared estimate.
                x_ = self.optimize(A, b, self.x_hat)
                self.x_list[j, :] = x_
                self.check(x_)

            self.x_hat = np.sum(self.x_list, axis=0) / self.num_gpu

        return self.x_hat

    ## initialize
    def initialize(self, num_gpu):
        """Draw a random mini-batch from the shard of worker ``num_gpu``.

        Args:
            num_gpu: zero-based worker index.

        Returns:
            ``(A_batch, b_batch)`` with ``batch_size`` rows sampled with
            replacement from that worker's shard.
        """
        A_shard, b_shard = self._shards[num_gpu]
        index = np.random.choice(A_shard.shape[0], self.batch_size)

        return A_shard[index, :], b_shard[index]

    def optimize(self, A, b, x, iters_per_epoch=500):
        """Run gradient descent on ``||A x - b||^2`` starting from ``x``.

        The caller's ``x`` is left untouched; the iterations operate on a
        private floating-point copy.

        Returns:
            The estimate after ``iters_per_epoch`` gradient steps.
        """
        x = np.array(x, dtype=float)  # private copy, never alias the input
        for _ in range(iters_per_epoch):
            b_ = np.dot(A, x)
            grad = 2 * np.dot(A.T, (b_ - b))
            x -= grad * self.lr

        return x

    def check(self, x):
        """Return the residual norm of ``x`` on the full data set.

        The value is also appended to ``error_list``.
        """
        b_ = self.A @ x
        error = np.linalg.norm(self.b - b_)
        self.error_list.append(error)

        return error

if __name__ == "__main__":
    # Benchmark script: solve a random dense least-squares problem with the
    # simulated data-parallel solver, compare against np.linalg.lstsq, and
    # dump the results, the error curves and the timings.
    A = np.random.rand(10000,1000)
    b = np.random.rand(10000)
    epoch = 40

    # Set-up time is counted as "something else", not as calculation time.
    t1 = time()
    lstsq = LeastSquare(A,b,epoches=epoch)
    t2 = time()
    dump_time1 = t2 - t1

    t1 = time()
    theta = lstsq.run()
    # Snapshot the per-worker errors now: check() below appends one more
    # entry (the error of the averaged solution), which would otherwise
    # shift the [-2]/[-1] and [::2]/[1::2] indexing used further down.
    gpu_errors = list(lstsq.error_list)
    error = lstsq.check(theta)
    t2 = time()
    calculation_time = t2 - t1

    # Reference solution from NumPy's direct solver.
    t1 = time()
    x = np.linalg.lstsq(A, b ,rcond=None)[0]
    lstsq_error = np.linalg.norm(lstsq.A @ x - lstsq.b)
    t2 = time()
    lstsq_time = t2 - t1

    t1 = time()
    # The context manager closes the file even if a write fails.
    with open("data_parallel_result.txt", "w", encoding="utf-8") as result:
        result.write(f"error: {error}")
        result.write("\n")
        result.write(f"lstsq error: {lstsq_error}")
        result.write("\n")
        # run() records worker 0 and then worker 1 in every epoch, so the
        # last two snapshot entries are the final errors of GPU1 and GPU2.
        result.write(f"GPU1 error: {gpu_errors[-2]}")
        result.write("\n")
        result.write(f"GPU2 error: {gpu_errors[-1]}")
        result.write("\n")
        result.write(f"optimal x: {theta}")
    t2 = time()
    dump_time2 = t2 - t1

    t1 = time()
    fig = plt.figure(figsize=(16,8))
    # Left panel: GPU1 error per epoch (even positions of the snapshot).
    plt.subplot(121)
    plt.plot(gpu_errors[::2])
    plt.xlabel("epoches")
    plt.ylabel("error")
    # Right panel: GPU2 error per epoch (odd positions of the snapshot).
    plt.subplot(122)
    plt.plot(gpu_errors[1::2])
    plt.xlabel("epoches")
    plt.ylabel("error")
    plt.savefig("data_parallel_error.png", dpi=fig.dpi)
    t2 = time()
    dump_time3 = t2 - t1

    dump_time = dump_time1 + dump_time2 + dump_time3

    print(f"It took {calculation_time} seconds to calculate the least square probelm.")
    print(f"It took {dump_time} seconds to something else.")
    print(f"It took {lstsq_time} seconds to calculate the np.linalg.lstsq.")
    print(f"rms between x and theta: {np.linalg.norm(x - theta)}")
```

data_parallel_result는 계산 결과를 담은 txt파일입니다.

In [None]:
# Print the first 10 lines of the result file. The context manager closes
# the handle even if reading or printing raises.
with open("data_parallel_result.txt", "r", encoding="utf-8") as lstsq_result:
    for i in range(10):
        # readline() returns "" past the end of the file, so short files
        # simply produce blank lines.
        line = lstsq_result.readline()
        print(line)

error: 28.8149046710103

lstsq error: 27.347241765250775

GPU1 error: 28.753315811131504

GPU2 error: 28.8149046710103

optimal x: [ 1.01048599e-02 -1.63962344e-02  8.18530410e-03 -3.00432717e-04

 -1.13814665e-02  4.82772075e-03  2.31549080e-02  1.78982011e-02

  1.54345318e-02  2.03025957e-02  2.04379305e-02 -2.31554456e-02

 -1.19348363e-02  1.61581271e-02  4.95154600e-04 -3.03585364e-03

 -2.88226547e-03  5.94086178e-03  2.13699856e-02 -1.58383437e-02

  2.85404116e-02  1.40183391e-02 -3.37064614e-02  4.07298851e-02



data_parallel_profile은 CPU에서 각 계산 과정의 소요시간을 세부적으로 기록한 txt 파일입니다.

In [None]:
# Print the first 10 lines of the profiler dump (the file is read as
# UTF-16). The context manager closes the handle even if reading or
# printing raises.
with open("data_parallel_profile.txt", "r", encoding="utf-16") as lstsq_result:
    for i in range(10):
        line = lstsq_result.readline()
        print(line)

## measured while about 25% of the CPU was in use....

It took 20.454949855804443 seconds to calculate the least square probelm.

It took 0.6010358333587646 seconds to something else.

It took 1.4236323833465576 seconds to calculate the np.linalg.lstsq.

rms between x and theta: 0.38661562483931255

         1045900 function calls (1033597 primitive calls) in 23.393 seconds



   Ordered by: cumulative time



   ncalls  tottime  percall  cumtime  percall filename:lineno(function)

    612/1    0.002    0.000   23.394   23.394 {built-in method builtins.exec}



이는 데이터 셋의 크기를 100배로, 즉 (십만, 만) 크기로 늘려 실행한 결과입니다. 이때는 epoch을 40이 아닌 20으로 주었기 때문에 앞선 결과보다 덜 수렴한 모습입니다.

In [None]:
# Print the first 10 lines of the result file of the larger run. The
# context manager closes the handle even if reading or printing raises.
with open("data_parallel_result_2.txt", "r", encoding="utf-8") as lstsq_result:
    for i in range(10):
        line = lstsq_result.readline()
        print(line)

## Timings printed by that run:
## It took 10.855637550354004 seconds to calculate the least square probelm. 
## It took 0.6464803218841553 seconds to something else.                                                                   
## It took 9.001726627349854 seconds to calculate the np.linalg.lstsq. 
## rms between x and theta: 0.9537647484008444

error: 124.77371425213892

lstsq error: 90.8337995556563

GPU1 error: 122.82445530120378

GPU2 error: 124.77371425213892

optimal x: [-4.42302010e-02  3.79321436e-02 -1.13119200e-02  1.14975630e-02

  3.34142908e-02  1.68574959e-02  6.22364496e-02  1.63532743e-02

  1.13642179e-02  1.76564975e-02 -2.60937915e-02  1.06707097e-02

  3.32064801e-02 -3.52888164e-05 -4.70384507e-02 -2.18729501e-02

  2.03667184e-02  1.70091327e-02 -1.94876913e-02 -3.78571168e-03

  5.15585398e-02 -7.76529844e-04  1.13756708e-02  1.54584058e-02



이후 내용은 PYCUDA 책을 읽으면서 내용을 정리한 노트입니다.

# Querying your GPU

In [None]:
# IPython shell escape: install PyCUDA into the current runtime.
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2021.1.tar.gz (1.7 MB)
[?25l[K     |▏                               | 10 kB 31.7 MB/s eta 0:00:01[K     |▍                               | 20 kB 17.6 MB/s eta 0:00:01[K     |▋                               | 30 kB 9.8 MB/s eta 0:00:01[K     |▉                               | 40 kB 8.2 MB/s eta 0:00:01[K     |█                               | 51 kB 4.4 MB/s eta 0:00:01[K     |█▏                              | 61 kB 5.2 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.3 MB/s eta 0:00:01[K     |█▋                              | 81 kB 5.4 MB/s eta 0:00:01[K     |█▊                              | 92 kB 6.0 MB/s eta 0:00:01[K     |██                              | 102 kB 5.1 MB/s eta 0:00:01[K     |██▏                             | 112 kB 5.1 MB/s eta 0:00:01[K     |██▍                             | 122 kB 5.1 MB/s eta 0:00:01[K     |██▌                             | 133 kB 5.1 MB/s eta 0:00:01[K     |██▊   

## Querying your GPU with PyCUDA

In [None]:
import pycuda.driver as drv
## always run this line (or import pycuda.autoinit instead) before using the driver API
drv.init()

In [None]:
# Report how many CUDA devices the driver can see.
print(f"Detected {drv.Device.count()} CUDA Capable device (s)")

Detected 1 CUDA Capable device (s)


`pycuda.driver.Device` represents a single CUDA device
and provides methods such as:<br>
1. __count__: returns the number of available devices
2. __compute_capability__: returns the device's compute capability
3. __total_memory__: returns the total amount of device memory in bytes

In [None]:
# Print the name, compute capability and memory size of every visible device.
# `gpu_device` and `compute_capability` stay bound to the last device after
# the loop and are reused by the following cells.
device_count = drv.Device.count()
for i in range(device_count):
    gpu_device = drv.Device(i)
    print(f"Device {i}: {gpu_device.name()}")

    # compute_capability() yields a (major, minor) pair; fold it into a float.
    capability_pair = gpu_device.compute_capability()
    compute_capability = float("%d.%d" % capability_pair)
    print(f"\t Compute Capability: {compute_capability}")

    # total_memory() is divided by 1024**2 to report whole megabytes.
    total_megabytes = gpu_device.total_memory() // (1024 ** 2)
    print(f"\t Total Memory: {total_megabytes} megabytes")

Device 0: Tesla T4
	 Compute Capability: 7.5
	 Total Memory: 15109 megabytes


With the __get_attributes__ method we can look up a GPU's attributes; it returns them as a Python dictionary.

In [None]:
# Re-key the attribute dictionary by the string form of each attribute so
# entries can be looked up by plain names such as "MULTIPROCESSOR_COUNT".
device_attributes_tuples = gpu_device.get_attributes().items()
device_attributes = {
    str(attribute): value for attribute, value in device_attributes_tuples
}

In [None]:
# Number of multiprocessors reported by the device attributes.
num_mp = device_attributes["MULTIPROCESSOR_COUNT"]

In [None]:
# FP32 CUDA cores per multiprocessor, keyed by compute capability.
# This is not exposed as a device attribute, so it is looked up from NVIDIA's
# published per-architecture figures. An unknown capability raises KeyError.
cuda_cores_per_mp = {
    3.0: 192, 3.2: 192, 3.5: 192, 3.7: 192,   # Kepler
    5.0: 128, 5.1: 128, 5.2: 128, 5.3: 128,   # Maxwell
    6.0: 64, 6.1: 128, 6.2: 128,              # Pascal
    7.0: 64, 7.2: 64,                         # Volta
    7.5: 64,                                  # Turing (e.g. Tesla T4)
    8.0: 64, 8.6: 128, 8.7: 128, 8.9: 128,    # Ampere / Ada
    9.0: 128,                                 # Hopper
}[compute_capability]

In [None]:
# Total CUDA cores = multiprocessor count x cores per multiprocessor.
print(f"\t ({num_mp}) Multiprocessors, ({cuda_cores_per_mp}) CUDA Cores / Multiprocessors: {num_mp * cuda_cores_per_mp} CUDA Cores")

	 (40) Multiprocessors, (128) CUDA Cores / Multiprocessors: 5120 CUDA Cores


# Using PyCUDA's gpuarray class

Like NumPy's array class, PyCUDA's gpuarray class plays an analogously prominent role within GPU programming in Python.<br>
This has all of the features you know and love from NumPy:<br>
1. __multidimensional vector/matrix/tensor shape structuring__
2. __array-slicing, array unraveling__
3. __overloaded operators for point-wise computations__

## Transferring data to and from the GPU with gpuarray

GPU has its own memory apart from the host computer's memory, which is known as __device memory__(Sometimes this is known more specifically as __global device memory__, to differentiate this from the additional cache memory, shared memory, and register memory that is also on the GPU.)<br>
Unlike the malloc and free functions in __C__ or the new and delete operators in __C++__, memory management in CUDA is complicated further by the additional task of transferring data back and forth between the CPU and the GPU (with commands such as __cudaMemcpyHostToDevice__ and __cudaMemcpyDeviceToHost__).
> cudaMalloc: memory allocations<br>
> cudaFree: deallocations

Fortunately, PyCUDA covers all of the overhead of memory allocation, deallocation, and data transfers with the __gpuarray__ class.

In [None]:
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from time import time

In [None]:
# Round trip: host array -> GPU -> scale on the GPU -> back to the host.
host_data = np.array([1,2,3,4,5], dtype=np.float32)
# Copy the host array into device memory as a gpuarray.
device_data = gpuarray.to_gpu(host_data)
# The multiplication is carried out on the gpuarray, i.e. on the device.
device_data_x2 = 2 * device_data
# get() copies the result back into a NumPy array on the host.
host_data_x2 = device_data_x2.get()
print(host_data_x2)

[ 2.  4.  6.  8. 10.]


One thing to note:<br>
> set the array elements' type explicitly

We set the type to np.float32, which corresponds directly to the float type in __C/C++__.

This has two benefits.<br>
1. We can reduce the unnecessary overhead of using a type that may take up more computation time or memory than needed.
2. Since we will soon be writing portions of code in inline CUDA C, we have to be very specific with types or our code won't work correctly, keeping in mind that C is a statically-typed language.

## Basic pointwise arithmetic operations with gpuarray

We saw that we can use the Python multiplication operator(*) to multiply each element in a gpuarray object by a scalar value; note...<br>
> a pointwise operation is intrinsically parallelizable, so when we use this operation on a gpuarray object PyCUDA is able to offload each multiplication onto a single thread. The point is that the computation of one element is not dependent on the computation of any other element.

In [None]:
# Small float32 host arrays used by the pointwise-arithmetic examples below.
x_host = np.array([1,2,3], dtype=np.float32)
y_host = np.array([1,1,1], dtype=np.float32)
z_host = np.array([2,2,2], dtype=np.float32)

# Device-side copies of the same arrays, so every host expression below can
# be compared with its gpuarray counterpart.
x_device = gpuarray.to_gpu(x_host)
y_device = gpuarray.to_gpu(y_host)
z_device = gpuarray.to_gpu(z_host)

In [None]:
# Host (NumPy) reference: element-wise addition.
x_host + y_host

array([2., 3., 4.], dtype=float32)

In [None]:
# Same addition on gpuarrays; get() copies the result back to the host.
(x_device + y_device).get()

array([2., 3., 4.], dtype=float32)

In [None]:
# Host (NumPy) reference: element-wise power.
x_host ** z_host

array([1., 4., 9.], dtype=float32)

In [None]:
# Same element-wise power on gpuarrays.
(x_device ** z_device).get()

array([1., 4., 9.], dtype=float32)

In [None]:
# Host (NumPy) reference: element-wise division.
x_host / x_host

array([1., 1., 1.], dtype=float32)

In [None]:
# Same element-wise division on gpuarrays.
(x_device / x_device).get()

array([1., 1., 1.], dtype=float32)

In [None]:
# Host (NumPy) reference: element-wise subtraction.
z_host - x_host

array([ 1.,  0., -1.], dtype=float32)

In [None]:
# Same element-wise subtraction on gpuarrays.
(z_device - x_device).get()

array([ 1.,  0., -1.], dtype=float32)

In [None]:
# Host (NumPy) reference: divide every element by a scalar.
z_host / 2

array([1., 1., 1.], dtype=float32)

In [None]:
# Same scalar division on a gpuarray.
(z_device / 2).get()

array([1., 1., 1.], dtype=float32)

In [None]:
# Host (NumPy) reference: subtract a scalar from every element.
x_host - 1

array([0., 1., 2.], dtype=float32)

In [None]:
# Same scalar subtraction on a gpuarray.
(x_device - 1).get()

array([0., 1., 2.], dtype=float32)

In [None]:
# Compare the time of a scalar multiplication on the CPU and on the GPU.
host_data = np.float32(np.random.random(50000000))

t1 = time()
host_data_2x = host_data * np.float32(2)
t2 = time()

print(f"total time to compute on CPU: {t2 - t1}")
# The host-to-device copy is deliberately kept outside the timed region.
device_data = gpuarray.to_gpu(host_data)

t1 = time()
device_data_2x = device_data * np.float32(2)
# The multiplication only *launches* a kernel and returns immediately.
# Wait for the GPU to finish, otherwise the measured time is just the
# launch overhead rather than the computation.
pycuda.autoinit.context.synchronize()
t2 = time()

from_device = device_data_2x.get()
print(f"total time to compute on GPU: {t2 - t1}")

print(f"Is the host computation the same as the GPU computation?: {np.allclose(from_device, host_data_2x)}")

total time to compute on CPU: 0.03358745574951172
total time to compute on GPU: 0.0008957386016845703
Is the host computation the same as the GPU computation?: True


In [None]:
# Second run of the same CPU-vs-GPU timing comparison on fresh random data.
host_data = np.float32(np.random.random(50000000))

t1 = time()
host_data_2x = host_data * np.float32(2)
t2 = time()

print(f"total time to compute on CPU: {t2 - t1}")
# The host-to-device copy is deliberately kept outside the timed region.
device_data = gpuarray.to_gpu(host_data)

t1 = time()
device_data_2x = device_data * np.float32(2)
# The multiplication only *launches* a kernel and returns immediately.
# Wait for the GPU to finish, otherwise the measured time is just the
# launch overhead rather than the computation.
pycuda.autoinit.context.synchronize()
t2 = time()

from_device = device_data_2x.get()
print(f"total time to compute on GPU: {t2 - t1}")

print(f"Is the host computation the same as the GPU computation?: {np.allclose(from_device, host_data_2x)}")

total time to compute on CPU: 0.03511476516723633
total time to compute on GPU: 0.0007052421569824219
Is the host computation the same as the GPU computation?: True


In [None]:
# Read the benchmark script's source text into a string; the handle is
# closed automatically when the block exits.
with open("time_calc0.py", mode="r") as script_file:
    time_calc_code = script_file.read()

In [None]:
# IPython magic: run the loaded script text under the profiler and sort the
# report by cumulative time.
%prun -s cumulative exec(time_calc_code)

total time to compute on CPU: 0.03452444076538086
total time to compute on GPU: 0.0006413459777832031
Is the host computation the same as the GPU computation?: True
 