In [1]:
# Shell escape: print the NVIDIA driver / CUDA versions and the state of the
# attached GPU before any GPU code is run.
!nvidia-smi

Sun Mar 26 05:25:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    16W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Uncomment on a fresh environment to install the GPU dependencies used below:
#!pip install pycuda scikit-cuda

In [3]:
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy as np
import skcuda.linalg as linalg

import random
import time
import numpy as np
import matplotlib.pyplot as plt




In [4]:
# One-time initialisation of skcuda.linalg; run this before any of the
# linalg.* calls in the cells below.
linalg.init()

### Matrix multiplication demo

In [5]:
# Three small float32 factors whose shapes chain: (5x4) (4x3) (3x2).
a = np.random.rand(5, 4).astype(np.float32)
b = np.random.rand(4, 3).astype(np.float32)
c = np.random.rand(3, 2).astype(np.float32)
print(f'mtx shapes: {a.shape}, {b.shape} ,{c.shape}')

# Copy each host array to the GPU.
a_gpu, b_gpu, c_gpu = (gpuarray.to_gpu(host) for host in (a, b, c))

# Chain product on the GPU, then fetch the result back as a NumPy array.
d_gpu = linalg.mdot(a_gpu, b_gpu, c_gpu)
gpu_result = d_gpu.get()

# Reference product on the CPU, grouped as a (b c).
cpu_result = a @ (b @ c)


mtx shapes: (5, 4), (4, 3) ,(3, 2)


In [6]:
# Show the CPU and GPU products one after the other for a visual comparison.
print(f'CPU: {cpu_result}', f'GPU: {gpu_result}', sep='\n')

CPU: [[2.610798  2.4132574]
 [1.7176193 1.6206465]
 [2.2204137 2.0662577]
 [2.4080937 2.2284636]
 [1.3815416 1.2763608]]
GPU: [[2.6107981 2.4132574]
 [1.7176192 1.6206464]
 [2.2204137 2.0662577]
 [2.4080937 2.2284636]
 [1.3815416 1.2763606]]


In [7]:
# Element-wise agreement check with NumPy's default tolerances; the two printed
# results differ only in the last float32 digits.
np.allclose(cpu_result, gpu_result)

True

### Linear regression demo

In [8]:
# Synthetic linear-regression problem:  Y = X C + ones b + nu
SEED = 0                            # fixed seed so "Restart & Run All" reproduces the same data
rng = np.random.default_rng(SEED)

n = 100000                          # number of samples (rows of X and Y)
dx = 1500                           # number of input features (columns of X)
dy = 500                            # number of outputs (columns of Y)


def _normal32(rows, cols):
    """Return a (rows, cols) float32 array of standard-normal samples.

    Sampling directly in float32 avoids the temporary float64 array (about
    1.2 GB for X) that drawing in double precision and converting would need.
    """
    return rng.standard_normal((rows, cols), dtype=np.float32)


C = _normal32(dx, dy)               # true coefficients of X
# NOTE: despite its name, `ones` is filled with Gaussian samples. A real
# all-ones block of width dy would give A = [X, ones] duplicate columns and
# make A^T A singular.
ones = _normal32(n, dy)
b = _normal32(dy, dy)               # true coefficients of `ones` (rebinds `b` from the demo above)
nu = 0.1 * _normal32(n, dy)         # additive observation noise
X = _normal32(n, dx)
print(f'X shape: {X.shape}')
print(f'C shape: {C.shape}')
print(f'ones shape: {ones.shape}')
print(f'b shape: {b.shape}')

Y = X @ C + ones @ b + nu
print(f'Y shape: {Y.shape}')

X shape: (100000, 1500)
C shape: (1500, 500)
ones shape: (100000, 500)
b shape: (500, 500)
Y shape: (100000, 500)


$$Y=XC+1b+\nu$$
$$ Y\in\mathbb{R}^{n\times d_y}, X\in\mathbb{R}^{n\times d_x}, C\in\mathbb{R}^{d_x\times d_y}$$
$$1\in\mathbb{R}^{n\times d_y}, b\in\mathbb{R}^{d_y\times d_y}, \nu\in\mathbb{R}^{n\times d_y}$$

$$_{n}\{\underbrace{\begin{bmatrix}X,1\end{bmatrix}}_{d_x+d_y}\underbrace{\begin{bmatrix} C \\ b \end{bmatrix}}_{d_y} \approx Y\in\mathbb{R}^{n\times d_y}$$
$$A=\begin{bmatrix}X,1\end{bmatrix} \in\mathbb{R}^{n\times(d_x+d_y)}, D=\begin{bmatrix}C\\b\end{bmatrix} \in\mathbb{R}^{(d_x+d_y)\times d_y}$$
$$AD \approx Y\in\mathbb{R}^{n\times d_y}$$

In [9]:
# Design matrix A = [X, ones]: the two blocks are joined side by side (column-wise).
A = np.concatenate((X, ones), axis=1)
print(A.shape)

(100000, 2000)


In [10]:
# Ground-truth coefficient matrix D = [C; b], stacked row-wise so that
# A @ D equals X @ C + ones @ b.
D = np.concatenate((C, b), axis=0)
print(D.shape)

(2000, 500)


$$A^TAD \approx A^TY\in\mathbb{R}^{(d_x+d_y)\times d_y}$$
$$D \approx (A^TA)^{-1}A^TY\in\mathbb{R}^{(d_x+d_y)\times d_y}$$

**Calculate regression on the CPU**

In [11]:
# Solve the normal equations  D ~ (A^T A)^{-1} A^T Y  with NumPy on the CPU.
#
# time.perf_counter() is used for the measurement: it is monotonic and has the
# highest available resolution, whereas time.time() follows the wall clock and
# can jump if the system time is adjusted.
#
# The explicit inverse is kept on purpose so that the CPU performs the same
# steps as the GPU cell; np.linalg.solve / np.linalg.lstsq would normally be
# the numerically preferable way to solve this system.
start = time.perf_counter()
term1 = np.linalg.inv(np.matmul(A.T, A))    # (A^T A)^{-1}
term2 = np.matmul(A.T, Y)                   # A^T Y
res_cpu = np.matmul(term1, term2)           # estimated D
t = time.perf_counter() - start             # `t` holds the elapsed seconds
print(f'Elapsed time: {t}')

Ellapsed time: 10.136053562164307


**Calculate regression on the GPU**

In [12]:
# Upload the design matrix and the targets to the GPU. This is done in its own
# cell, before the GPU timer is started, so the transfer is not part of the
# measured time.
A_gpu, Y_gpu = gpuarray.to_gpu(A), gpuarray.to_gpu(Y)
print(A_gpu.shape, Y_gpu.shape, sep='\n')

(100000, 2000)
(100000, 500)


In [13]:
# Same normal-equation solve as the CPU cell, executed with skcuda on the GPU.
#
# GPU work can be queued asynchronously with respect to the host, so the CUDA
# context is synchronised before the clock is started and again before it is
# read. Without the second synchronisation the timer may stop while kernels
# are still running, which under-reports the GPU time.
pycuda.autoinit.context.synchronize()
start = time.perf_counter()
At_gpu = linalg.transpose(A_gpu)                        # compute A^T once, reuse for both products
term1_gpu = linalg.inv(linalg.mdot(At_gpu, A_gpu))      # (A^T A)^{-1}
term2_gpu = linalg.mdot(At_gpu, Y_gpu)                  # A^T Y
res_gpu = linalg.mdot(term1_gpu, term2_gpu)             # estimated D (still on the GPU)
pycuda.autoinit.context.synchronize()                   # wait for all queued GPU work
t = time.perf_counter() - start                         # `t` holds the elapsed seconds
print(f'Elapsed time: {t}')

Ellapsed time: 0.6024169921875


**Check if the results are close to each other on the CPU and the GPU**

In [14]:
# Largest absolute element-wise deviation between the CPU and GPU solutions.
np.abs(res_cpu - res_gpu.get()).max()

0.00012373924

In [15]:
# Agreement check with the absolute tolerance raised to 1e-4 (rtol is left at
# NumPy's default) to allow for float32 round-off differences between the two
# implementations.
np.allclose(res_cpu, res_gpu.get(),atol=1e-4)

True

In [16]:
# Top-left 4x4 corner of the CPU solution, for a visual comparison with the GPU one.
res_cpu[:4,:4]

array([[ 0.70357364, -2.0245886 , -0.42264533,  0.9154077 ],
       [-0.9699507 ,  0.2234117 ,  0.64285135, -0.3408896 ],
       [ 1.7895846 ,  1.265549  , -0.5181347 ,  0.27114543],
       [-0.89868844, -1.166698  ,  0.9022783 , -0.52433676]],
      dtype=float32)

In [17]:
# Top-left 4x4 corner of the GPU solution, fetched back to the host with .get().
res_gpu.get()[:4,:4]

array([[ 0.70358986, -2.0246353 , -0.42265856,  0.91542226],
       [-0.96996266,  0.22341564,  0.6428619 , -0.34089154],
       [ 1.7896237 ,  1.2655714 , -0.5181428 ,  0.27114752],
       [-0.8987027 , -1.166715  ,  0.9022933 , -0.5243461 ]],
      dtype=float32)