<a href="https://colab.research.google.com/github/YuqiShi/test/blob/master/Cuda_tutorials.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%pip install pycuda

Collecting pycuda
  Downloading pycuda-2022.2.2.tar.gz (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m1.0/1.7 MB[0m [31m64.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m1.0/1.7 MB[0m [31m64.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2023.1-py2.py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.4/70.4 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting mako (from pycuda)
  Do

In [4]:
import pycuda.driver as drv

drv.init()

print("%d device(s) found!" %drv.Device.count())

for ordinal in range(drv.Device.count()):
  dev = drv.Device(ordinal)
  print("Device #%d: %s" % (ordinal, dev.name()))
  print("   Compute Capability: %d.%d" % dev.compute_capability())
  print("   Totel Memory: %s GB" % (dev.total_memory()//(1024*1024*1024)))
  atts = [(str(att),value)
          for att, value in list(dev.get_attributes().items())]
  atts.sort()

  for att, value in atts:
    print(f"        {att}:{value}")


1 device(s) found!
Device #0: Tesla T4
   Compute Capability: 7.5
   Totel Memory: 14 GB
        ASYNC_ENGINE_COUNT:3
        CAN_MAP_HOST_MEMORY:1
        CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM:1
        CLOCK_RATE:1590000
        COMPUTE_CAPABILITY_MAJOR:7
        COMPUTE_CAPABILITY_MINOR:5
        COMPUTE_MODE:DEFAULT
        COMPUTE_PREEMPTION_SUPPORTED:1
        CONCURRENT_KERNELS:1
        CONCURRENT_MANAGED_ACCESS:1
        DIRECT_MANAGED_MEM_ACCESS_FROM_HOST:0
        ECC_ENABLED:1
        GENERIC_COMPRESSION_SUPPORTED:0
        GLOBAL_L1_CACHE_SUPPORTED:1
        GLOBAL_MEMORY_BUS_WIDTH:256
        GPU_OVERLAP:1
        HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED:1
        HANDLE_TYPE_WIN32_HANDLE_SUPPORTED:0
        HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED:0
        HOST_NATIVE_ATOMIC_SUPPORTED:0
        INTEGRATED:0
        KERNEL_EXEC_TIMEOUT:0
        L2_CACHE_SIZE:4194304
        LOCAL_L1_CACHE_SUPPORTED:1
        MANAGED_MEMORY:1
        MAXIMUM_SURFACE1D_LAYERED_LAYERS:2

In [12]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

mod = SourceModule("""
    #include <stdio.h>

    __global__ void say_hi()
    {
        printf("I am %dth thread in threadIdx.x:%d.threadIdx.y:%d blockIdx.:%d blockIdx.y:%d blockDim.x:%d blockDim.y:%d gridDim.x:%d gridDim.y:%d\\n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,blockDim.x,blockDim.y,gridDim.x,gridDim.y);
    }
    """)

func = mod.get_function("say_hi")
func(block=(4, 4, 1), grid=(2, 2, 1))


  globals().clear()


In [14]:
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
    const int i = threadIdx.x;
    dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)

dest = numpy.zeros_like(a)
multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(400,1,1), grid=(1,1))

print(dest-a*b)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [18]:
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import numpy

free_bytes, total_bytes = cuda.mem_get_info()
exp = 10
while True:
    fill_floats = free_bytes / 4 -(1<<exp)
    if fill_floats < 0:
        raise RuntimeError("couldn't find allocatable size")
    try:
        print("alloc", fill_floats)
        ary = gpuarray.zeros((fill_floats,), dtype=numpy.float32)
        break
    except:
        pass
    exp += 1

ary.fill(float("nan"))

print("fill %d out of %d bytes with NaNs" % (fill_floats*4, free_bytes))

alloc 3928701952.0
alloc 3928700928.0
alloc 3928698880.0
alloc 3928694784.0
alloc 3928686592.0
alloc 3928670208.0
alloc 3928637440.0
alloc 3928571904.0
alloc 3928440832.0
alloc 3928178688.0
alloc 3927654400.0
alloc 3926605824.0
alloc 3924508672.0
alloc 3920314368.0
alloc 3911925760.0
alloc 3895148544.0
alloc 3861594112.0
alloc 3794485248.0
alloc 3660267520.0
alloc 3391832064.0
alloc 2854961152.0
alloc 1781219328.0


RuntimeError: ignored

In [3]:
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.curandom import rand as curand

a_gpu = curand((50,))
b_gpu = curand((50,))

from pycuda.elementwise import ElementwiseKernel
lin_comb = ElementwiseKernel(
        "float a, float *x, float b, float *y, float *z",
        "z[i] = my_f(a*x[i], b*y[i])",
        "linear_combination",
        preamble="""
        __device__ float my_f(float x, float y)
        {
            return sin(x*y);
        }
        """)

c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
print(c_gpu)
import numpy.linalg as la
print(la.norm(c_gpu.get() - numpy.sin((5*a_gpu*6*b_gpu).get())) < 1e-5)



[ 0.7231871   0.8806899   0.35798535 -0.99658346 -0.32457775  0.6996876
 -0.27785546  0.8788663  -0.5593305   0.06763591  0.03788458 -0.99138665
 -0.34647107  0.27459607  0.20271116 -0.12944542  0.99667215  0.7766698
 -0.00384076  0.6894252   0.00813597 -0.24195908  0.8352214  -0.9998702
  0.98412627 -0.6304864  -0.3948932   0.99810153  0.24599268 -0.860303
  0.9541138   0.41623122  0.89988786  0.33793125  0.53693366 -0.8019235
  0.18725929  0.12536752 -0.44190723 -0.9646584   0.4871217  -0.75611264
 -0.9975576   0.28726465  0.07722988 -0.95213723  0.20477505  0.90327436
 -0.91743916  0.12283815]
True


In [8]:
import pycuda.driver as cuda
import pycuda.autoinit , pycuda.compiler
import numpy as np
from pycuda.compiler import SourceModule


a = np.random.randn(4,4).astype(np.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

mod = SourceModule("""
    __global__ void doublify(float *a)
    {
    int idx = threadIdx.x + threadIdx.y*4;
    a[idx] *= 2;
    }
    """)
func = mod.get_function("doublify")
func(a_gpu, block=(4,4,1), grid=(1,1), shared=0)
a_doubled = np.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print("doubled a:")
print(a_doubled)
print("original a:")
print(a)

doubled a:
[[-0.44884303  1.0370659  -0.02796388  2.768781  ]
 [-0.850241   -0.01396938 -0.6562841  -3.4277756 ]
 [-1.2917259   0.9839839  -0.17862996 -2.0675328 ]
 [-2.152982   -0.4433159   0.1991783  -0.66262203]]
original a:
[[-0.22442152  0.51853293 -0.01398194  1.3843905 ]
 [-0.4251205  -0.00698469 -0.32814205 -1.7138878 ]
 [-0.64586294  0.49199194 -0.08931498 -1.0337664 ]
 [-1.076491   -0.22165795  0.09958915 -0.33131102]]


  globals().clear()
  globals().clear()


In [9]:
#!python
# Conway's Game of Life Accelerated with PyCUDA
# Luis Villasenor
# lvillasen@gmail.com
# 3/26/2016
# Licence: GPLv3
# Usage: python GameOfLife.py n n_iter
# where n is the board size and n_iter the number of iterations
import pycuda.driver as cuda
import pycuda.tools
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import sys
import numpy as np
from pylab import cm as cm
import matplotlib.pyplot as plt
n=int(sys.argv[1])
n_iter=int(sys.argv[2])
n_block=16
n_grid=int(n/n_block);
n=n_block*n_grid;
def random_init(n):
    #np.random.seed(100)
    M=np.zeros((n,n)).astype(np.int32)
    for i in range(n):
        for j in range(n):
            M[j,i]=np.int32(np.random.randint(2))
    return M
mod = SourceModule("""
__global__ void step(int *C, int *M)
{
  int count;
  int n_x = blockDim.x*gridDim.x;
  int i = threadIdx.x + blockDim.x*blockIdx.x;
  int j = threadIdx.y + blockDim.y*blockIdx.y;
  int threadId = j*n_x+i;
  int i_left; int i_right; int j_down; int j_up;
  if(i==0) {i_left=n_x-1;} else {i_left=i-1;}
  if(i==n_x-1) {i_right=0;} else {i_right=i+1;}
  if(j==0) {j_down=n_x-1;} else {j_down=j-1;}
  if(j==n_x-1) {j_up=0;} else {j_up=j+1;}
  count = C[j*n_x+i_left] + C[j_down*n_x+i]
    + C[j*n_x+i_right] + C[j_up*n_x+i] + C[j_up*n_x+i_left]
    + C[j_down*n_x+i_right] + C[j_down*n_x+i_left]
    + C[j_up*n_x+i_right];

// Modify matrix M according to the rules B3/S23:
//A cell is "Born" if it has exactly 3 neighbours,
//A cell "Survives" if it has 2 or 3 living neighbours; it dies otherwise.
  if(count < 2 || count > 3) M[threadId] = 0; // cell dies
  if(count == 2) M[threadId] = C[threadId];// cell stays the same
  if(count == 3) M[threadId] = 1; // cell either stays alive, or is born
}
""")
func = mod.get_function("step")
C=random_init(n)
M = np.empty_like(C)
C_gpu = gpuarray.to_gpu( C )
M_gpu = gpuarray.to_gpu( M )
for k in range(n_iter):
  func(C_gpu,M_gpu,block=(n_block,n_block,1),grid=(n_grid,n_grid,1))
  C_gpu, M_gpu = M_gpu, C_gpu
print("%d live cells after %d iterations" %(np.sum(C_gpu.get()),n_iter))
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111)
fig.suptitle("Conway's Game of Life Accelerated with PyCUDA")
ax.set_title('Number of Iterations = %d'%(n_iter))
myobj =plt.imshow(C_gpu.get(),origin='lower',cmap='Greys',  interpolation='nearest',vmin=0, vmax=1)
plt.pause(.01)
plt.draw()
m=n_iter
while True:
    m+=1
    func(C_gpu,M_gpu,block=(n_block,n_block,1),grid=(n_grid,n_grid,1))
    C_gpu, M_gpu = M_gpu, C_gpu
    myobj.set_data(C_gpu.get())
    ax.set_title('Number of Iterations = %d'%(m))
    plt.pause(.01)
    plt.draw()

ValueError: ignored