In [1]:
# Show the GPU model, driver and CUDA version visible to this runtime.
!nvidia-smi

Tue Mar 21 23:05:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    24W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycuda
  Downloading pycuda-2022.2.2.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2
  Downloading pytools-2022.1.14.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 KB[0m [31m294.9 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting appdirs>=1.4.0
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Building wheels 

In [5]:
!pip install scikit-cuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-cuda
  Downloading scikit_cuda-0.5.3-py2.py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.8/114.8 KB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-cuda
Successfully installed scikit-cuda-0.5.3


In [1]:
import pycuda.autoinit
from pycuda import gpuarray
import numpy as np
from skcuda import cublas
from time import time



In [2]:
# Problem size: A is m x k, B is k x n and C is m x n.
m, n, k = 5000, 8000, 10000


In [3]:
# GEMM precision selector: 'S' = single precision, 'D' = double precision.
# The same letter is later substituted into the cuBLAS routine name.
precision = 'S'

# Translate the selector into the matching NumPy dtype name. An unsupported
# selector is rejected here with a clear message instead of leaving
# float_type undefined (which would surface as a NameError further down).
_FLOAT_TYPES = {'S': 'float32', 'D': 'float64'}
if precision not in _FLOAT_TYPES:
    raise ValueError(f"precision must be 'S' or 'D', got {precision!r}")
float_type = _FLOAT_TYPES[precision]
print(float_type)

float32


In [4]:
# Small 2 x 4 example: by default NumPy stores arrays in row-major (C) order.
a = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])
a

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [5]:
# Seed NumPy's global generator so the operands (and any later np.random
# draws, such as alpha and beta) are reproducible across Restart & Run All.
np.random.seed(0)

# Random host operands for C := alpha*A*B + beta*C, cast to the selected
# precision: A is m x k, B is k x n, C is m x n.
A = np.random.randn(m, k).astype(float_type)
B = np.random.randn(k, n).astype(float_type)
C = np.random.randn(m, n).astype(float_type)

# Sanity check of the operand shapes.
print(A.shape)
print(B.shape)
print(C.shape)

(5000, 10000)
(10000, 8000)
(5000, 8000)


In [6]:

# cuBLAS operates on column-major (Fortran-order) buffers, whereas NumPy
# allocates row-major (C-order) arrays by default. A C-ordered copy of each
# transpose has the same memory layout as the original matrix stored
# column-major, so that is what gets uploaded.
A_cm, B_cm, C_cm = (host.T.copy() for host in (A, B, C))

# Upload the column-major buffers to the device.
A_gpu = gpuarray.to_gpu(A_cm)
B_gpu = gpuarray.to_gpu(B_cm)
C_gpu = gpuarray.to_gpu(C_cm)

# Random scalars for C := alpha*A*B + beta*C.
# (Choosing alpha = 1 and beta = 0 would give the plain product A*B.)
alpha = np.random.randn()
beta = np.random.randn()

# No transposition inside cuBLAS: the data was already laid out column-major.
transa = transb = cublas._CUBLAS_OP['N']

# Leading dimensions: the row counts of the column-major A (m x k),
# B (k x n) and C (m x n).
lda, ldb, ldc = m, k, m


We will calculate $C := \alpha AB + \beta C$, where $A$ is $m \times k$, $B$ is $k \times n$, and $C$ is $m \times n$.

**CPU**

In [7]:
# Time the reference computation res = alpha*A*B + beta*C with NumPy on the CPU.
t_start = time()
res = alpha * np.matmul(A, B) + beta * C
t = time() - t_start

# Operation-count estimate for the full update: 2*m*n*(k+1) floating-point
# operations, converted to GFLOPS using the measured wall-clock time.
flop_count = 2 * m * n * (k + 1)
gflops = flop_count * (10**-9) / t
print(f'CPU performance: {gflops} GFLOPS')

print(res.shape)

CPU performance: 64.8450175196471 GFLOPS
(5000, 8000)


In [8]:
# Peek at the top-left 5 x 5 corner of the CPU result.
res[:5, :5]

array([[-0.768347  ,  0.29589224,  1.9595716 ,  0.7178556 , -0.06956333],
       [ 0.94150496,  0.9923849 , -2.6909096 ,  1.2682674 ,  0.6880198 ],
       [ 0.10186237,  0.34578502,  0.8439426 , -0.5151506 , -0.8963978 ],
       [-1.5527649 ,  1.5836842 ,  2.09026   ,  0.46388733,  0.36500576],
       [-0.7837318 ,  0.01005572,  1.0571122 ,  0.3560819 ,  0.20570031]],
      dtype=float32)

**GPU**

`cublasSgemm` / `cublasDgemm`: general matrix-matrix multiplication (GEMM) in single or double precision, respectively.

In [10]:
	# Time the cuBLAS GEMM, C_gpu := alpha*A_gpu*B_gpu + beta*C_gpu.
	# The measured interval also covers creating and destroying the handle.
	t = time()
	handle = cublas.cublasCreate()
	try:
		# Pick cublasSgemm or cublasDgemm from the precision letter by attribute
		# lookup instead of building and exec-ing a source string.
		gemm = getattr(cublas, 'cublas%sgemm' % precision)

		# A_gpu, B_gpu and C_gpu hold column-major data (NumPy defaults to
		# row-major, so the host arrays were transposed before upload).
		gemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda,
			B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)
	finally:
		# Always release the handle, even if the GEMM call raises.
		# NOTE(review): cublasDestroy is documented to synchronize the device,
		# which is what makes the host-side timing meaningful -- confirm.
		cublas.cublasDestroy(handle)
	t = time() - t

In [11]:
# Convert the elapsed time t of the GPU run into GFLOPS, using the same
# 2*m*n*(k+1) operation-count estimate as the CPU measurement.
flop_count = 2 * m * n * (k + 1)
gflops = flop_count * (10**-9) / t
print(f'GPU performance: {gflops} GFLOPS')


GPU performance: 3229.2222934186307 GFLOPS


In [12]:
# Copy the GEMM result back from the device into a host NumPy array.
# C_gpu was uploaded as the transpose of C, so res_gpu has shape (n, m).
res_gpu=C_gpu.get()

In [13]:
# Compare the CPU reference with the GPU result (transposed back to m x n),
# allowing an absolute tolerance of 1e-2 for float rounding differences.
np.allclose(res, res_gpu.T, atol=1e-2)

True

In [14]:
# Largest element-wise absolute difference between the CPU and GPU results.
np.abs(res - res_gpu.T).max()

1.1444092e-05

In [15]:
# Top-left 4 x 4 corner of the GPU result, transposed back to row-major view.
res_gpu[:4, :4].T

array([[-0.768347  ,  0.29589278,  1.9595715 ,  0.71785575],
       [ 0.9415051 ,  0.9923851 , -2.6909108 ,  1.2682693 ],
       [ 0.10186085,  0.34578395,  0.84394276, -0.5151509 ],
       [-1.5527645 ,  1.5836843 ,  2.0902603 ,  0.46388772]],
      dtype=float32)

In [16]:
# Matching 4 x 4 corner of the CPU reference result, for visual comparison.
res[:4, :4]

array([[-0.768347  ,  0.29589224,  1.9595716 ,  0.7178556 ],
       [ 0.94150496,  0.9923849 , -2.6909096 ,  1.2682674 ],
       [ 0.10186237,  0.34578502,  0.8439426 , -0.5151506 ],
       [-1.5527649 ,  1.5836842 ,  2.09026   ,  0.46388733]],
      dtype=float32)