<a href="https://colab.research.google.com/github/anshulsawant/llm-systems/blob/main/cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting started with CUDA

## Setup

In [70]:
import torch, os, math
import torchvision as tv
import torchvision.transforms.functional as tvf
from torchvision import io
import matplotlib.pyplot as plt
from torch.utils.cpp_extension import load_inline

In [71]:
!pip install wurlitzer ninja



### Python Block Kernel

1. **Streaming Multiprocessors (SMs):** In NVIDIA GPUs, SMs are the fundamental units of execution. Each SM can execute multiple threads concurrently.
2. **Thread Blocks:** A thread block is a group of threads that can cooperate among themselves through shared memory and synchronization. All threads in a block are executed on the same SM. This means they can share resources such as shared memory and can synchronize their execution with each other.
3. **Shared Memory:** Shared memory is a small memory space on the GPU that is shared among the threads in a block. It is much faster than global memory (the main GPU memory), but it is also limited in size. Threads in the same block can use shared memory to share data with each other efficiently.

- The RTX 3090, based on the Ampere architecture, has 82 SMs.
- Each SM in GA10x GPUs contains 128 CUDA Cores, four third-generation Tensor Cores, a 256 KB Register File, and 128 KB of L1/Shared Memory.
- In CUDA, all threads in a block have the potential to run concurrently. However, the actual concurrency depends on the number of CUDA cores per SM and the resources required by the threads.

### CUDA Setup

In [72]:
# Makes CUDA kernel launches synchronous, so a failing kernel is reported at the
# call that launched it. This is slow but good for dev.
os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [73]:
%load_ext wurlitzer

The wurlitzer extension is already loaded. To reload it, use:
  %reload_ext wurlitzer


In [74]:
def load_cuda(cuda_src, cpp_src, funcs, opt=False, verbose=False):
    """Compile CUDA/C++ source strings into an inline PyTorch extension.

    Args:
        cuda_src: CUDA source text, passed as the single entry of `cuda_sources`.
        cpp_src: C++ source text, passed as the single entry of `cpp_sources`.
        funcs: Function names handed to `load_inline` as `functions`.
        opt: When truthy, `-O2` is passed as an extra CUDA compiler flag.
        verbose: Forwarded unchanged to `load_inline`.

    Returns:
        Whatever `load_inline` returns for the extension named "inline_ext".
    """
    nvcc_flags = []
    if opt:
        nvcc_flags.append("-O2")
    return load_inline(
        name="inline_ext",
        cpp_sources=[cpp_src],
        cuda_sources=[cuda_src],
        functions=funcs,
        extra_cuda_cflags=nvcc_flags,
        verbose=verbose,
    )

In [75]:
# Shared prelude prepended to every inline CUDA source: the torch extension
# headers, input-validation macros, and a ceiling-division helper for grid sizing.
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// Wrapped in do/while(0) so the two checks expand to one statement and stay
// together when the macro is used under an unbraced if/else.
#define CHECK_INPUT(x) do { CHECK_CUDA(x); CHECK_CONTIGUOUS(x); } while (0)

// Ceiling division: number of size-b chunks needed to cover a elements.
inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
'''

<img src="attachment:4590626e-3f24-4381-a14b-50162f737579.png" width="500">

## Matmul

In [76]:
from torch import tensor

In [81]:
torch.manual_seed(1)
weights = torch.randn(784,10)
weights

tensor([[-1.53, -0.75, -0.65,  ..., -1.61, -0.71,  0.30],
        [-0.78, -0.25, -0.22,  ..., -1.16,  0.70,  0.20],
        [ 0.87,  0.24, -0.66,  ..., -1.45,  0.06, -0.62],
        ...,
        [ 0.51,  0.47, -0.26,  ...,  0.65,  0.43, -1.29],
        [ 0.52,  1.03,  0.81,  ..., -0.10,  2.26, -0.28],
        [-1.49,  0.39, -0.55,  ..., -0.19, -0.51,  0.54]])

### Python matmul

In [113]:
m1 = torch.randn(5, 784)
m2 = weights
m1.shape,m2.shape

(torch.Size([5, 784]), torch.Size([784, 10]))

In [114]:
ar,ac = m1.shape # n_rows * n_cols
br,bc = m2.shape
(ar,ac),(br,bc)

((5, 784), (784, 10))

In [116]:
t1 = torch.zeros(ar, bc)
t1.shape

torch.Size([5, 10])

In [117]:
# Reference result: accumulate every product m1[i,k]*m2[k,j] into t1[i,j],
# one scalar at a time (ar*bc*ac multiply-adds in total).
for i in range(ar):         # 5
    for j in range(bc):     # 10
        for k in range(ac): # 784
            t1[i,j] += m1[i,k] * m2[k,j]

In [118]:
t1.shape

torch.Size([5, 10])

In [119]:
import numpy as np
np.set_printoptions(precision=2, linewidth=140)
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)

In [120]:
t1

tensor([[ -2.96,  -7.03,  26.43,  -4.98,  -1.75,  30.36, -30.21,   9.64,   0.34, -30.28],
        [ -1.93, -19.66, -14.86, -87.22,  26.14,  48.94,  42.18, -28.02, -34.93,  16.13],
        [ -6.34,  -2.34,   8.88,  -3.85, -56.96, -35.07,   1.56,  -0.48, -38.84, -36.10],
        [-18.06,  -7.81,  19.90,  -7.21,  -6.16, -21.42, -21.30,  60.35, -26.90, -52.29],
        [-16.04,  38.37,  49.53,  19.45,  11.70, -18.82,   2.46,  -6.98, -32.79,   0.50]])

In [121]:
def matmul(a,b):
    """Naive triple-loop matrix product of two 2-D tensors.

    Args:
        a: Left operand; its shape must unpack into (rows, inner).
        b: Right operand; its shape must unpack into (inner, cols).

    Returns:
        A new tensor created with `torch.zeros(rows, cols)` holding the product,
        accumulated one scalar multiply-add at a time.
    """
    (n_rows, inner), (_, n_cols) = a.shape, b.shape
    result = torch.zeros(n_rows, n_cols)
    for row in range(n_rows):
        for col in range(n_cols):
            for t in range(inner):
                result[row, col] += a[row, t] * b[t, col]
    return result

In [122]:
%time _=matmul(m1, m2)

CPU times: user 768 ms, sys: 2.36 ms, total: 770 ms
Wall time: 771 ms


In [123]:
ar*bc*ac

39200

### 2d Python kernel

In [124]:
from types import SimpleNamespace as ns

In [125]:
def blk_kernel2d(f, blocks, threads, *args):
    """Sequentially emulate a 2-D kernel launch in pure Python.

    Calls `f(block_index, thread_index, threads, *args)` once for every
    (block, thread) pair. Blocks are visited y-major then x, and inside each
    block threads are visited y-major then x.

    Args:
        f: The "kernel" callable.
        blocks: Object with `.x` / `.y` giving the grid size in blocks.
        threads: Object with `.x` / `.y` giving the block size in threads;
            also passed to `f` as the block dimensions.
        *args: Extra positional arguments forwarded to every call of `f`.
    """
    launch_order = (
        (bx, by, tx, ty)
        for by in range(blocks.y)
        for bx in range(blocks.x)
        for ty in range(threads.y)
        for tx in range(threads.x)
    )
    for bx, by, tx, ty in launch_order:
        f(ns(x=bx, y=by), ns(x=tx, y=ty), threads, *args)

In [126]:
def matmul_bk(blockidx, threadidx, blockdim, m, n, out, h, w, k):
    """Per-thread matmul "kernel": compute one element of the output.

    Args:
        blockidx, threadidx, blockdim: Objects with `.x` / `.y`; together they
            give this thread's global row (y) and column (x).
        m: Flat, row-major left operand, indexed as `m[row*k + i]`.
        n: Flat, row-major right operand, indexed as `n[i*w + col]`.
        out: Flat, row-major output buffer; `out[row*w + col]` is written.
        h, w: Output height and width; threads outside them do nothing.
        k: Length of the shared inner dimension.
    """
    row = threadidx.y + blockidx.y*blockdim.y
    col = threadidx.x + blockidx.x*blockdim.x

    # Threads that fall past the edge of the output have no work to do.
    in_bounds = row < h and col < w
    if not in_bounds:
        return

    m_base = row*k
    acc = 0.
    for step in range(k):
        acc += m[m_base + step] * n[step*w + col]
    out[row*w + col] = acc

In [127]:
def matmul_2d(m, n):
    """Matrix product computed by running `matmul_bk` under the Python launcher.

    Args:
        m: 2-D tensor of shape (h, k).
        n: 2-D tensor of shape (k, w).

    Returns:
        An (h, w) tensor with m's dtype holding the product.

    Raises:
        AssertionError: If the inner dimensions of `m` and `n` differ.
    """
    h, k = m.shape
    k2, w = n.shape
    assert k==k2, "Size mismatch!"

    # 16x16 threads per block, and enough blocks to cover the whole output.
    tpb = ns(x=16,y=16)
    blocks = ns(x=math.ceil(w/tpb.x), y=math.ceil(h/tpb.y))

    output = torch.zeros(h, w, dtype=m.dtype)
    # The kernel indexes flat buffers. The writes must land in `output`, so this
    # relies on flatten() returning a view of the freshly allocated tensor.
    flat_args = (m.flatten(), n.flatten(), output.flatten())
    blk_kernel2d(matmul_bk, blocks, tpb, *flat_args, h, w, k)
    return output

In [128]:
res = matmul_2d(m1, m2)
torch.isclose(t1, res).all()

tensor(True)

### Broadcasting

In [129]:
def matmul(a,b):
    """Matrix product using broadcasting, one output row per iteration.

    Each row of `a` is turned into a column, multiplied against all of `b`
    by broadcasting, and summed over the inner dimension.

    Args:
        a: Left operand; its shape must unpack into (rows, inner).
        b: Right operand; its shape must unpack into (inner, cols).

    Returns:
        A new tensor created with `torch.zeros(rows, cols)` holding the product.
    """
    (n_rows, _), (_, n_cols) = a.shape, b.shape
    result = torch.zeros(n_rows, n_cols)
    for idx, row in enumerate(a):
        scaled = row[:, None] * b
        result[idx] = scaled.sum(dim=0)
    return result

In [130]:
# NOTE(review): this comparison uses torch.isclose's default tolerances, while the
# later CUDA/PyTorch checks pass atol=1e-5. The False shown here is presumably float32
# rounding from the different summation order, not a wrong product — TODO confirm.
torch.isclose(t1,matmul(m1, m2)).all()

tensor(False)

In [131]:
%time _=matmul(m1, m2)

CPU times: user 518 µs, sys: 0 ns, total: 518 µs
Wall time: 527 µs


In [132]:
# NOTE(review): `x_train` is not defined in any cell visible in this notebook, so this
# cell presumably depends on hidden kernel state. Judging by the (50000, 10) result and
# m2 being (784, 10), it is presumably a (50000, 784) float tensor — TODO load it explicitly.
# This also rebinds `m1`, replacing the 5x784 matrix used by the cells above.
m1 = x_train
tr = matmul(m1, m2)
tr.shape

torch.Size([50000, 10])

In [133]:
%time _=matmul(m1, m2)

CPU times: user 1.27 s, sys: 6.79 ms, total: 1.27 s
Wall time: 1.27 s


In [134]:
ar,ac = m1.shape
br,bc = m2.shape
ar*bc*ac

392000000

### CUDA matmul

In [135]:
# CUDA source for a naive matmul. In `matmul_k` each thread computes one element of
# the h x w output as a k-term dot product over row-major flat buffers; threads that
# fall outside the output return immediately. The host wrapper `matmul` validates both
# inputs, allocates the output with m's options, and launches 16x16-thread blocks on a
# grid sized by ceiling division so the whole output is covered.
# NOTE(review): both tensors are read through data_ptr<float>(), so float32 inputs
# are assumed — confirm callers never pass other dtypes.
cuda_src = cuda_begin + r'''
__global__ void matmul_k(float* m, float* n, float* out, int h, int w, int k) {
    int r = blockIdx.y*blockDim.y + threadIdx.y;
    int c = blockIdx.x*blockDim.x + threadIdx.x;

    if (r>=h || c>=w) return;
    float o = 0;
    for (int i = 0; i<k; ++i) o += m[r*k+i] * n[i*w+c];
    out[r*w+c] = o;
}

torch::Tensor matmul(torch::Tensor m, torch::Tensor n) {
    CHECK_INPUT(m); CHECK_INPUT(n);
    int h = m.size(0);
    int w = n.size(1);
    int k = m.size(1);
    TORCH_CHECK(k==n.size(0), "Size mismatch!");
    auto output = torch::zeros({h, w}, m.options());

    dim3 tpb(16,16);
    dim3 blocks(cdiv(w, tpb.x), cdiv(h, tpb.y));
    matmul_k<<<blocks, tpb>>>(
        m.data_ptr<float>(), n.data_ptr<float>(), output.data_ptr<float>(), h, w, k);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}
'''

In [136]:
cpp_src = "torch::Tensor matmul(torch::Tensor m, torch::Tensor n);"

In [137]:
module = load_cuda(cuda_src, cpp_src, ['matmul'])

In [138]:
m1c,m2c = m1.contiguous().cuda(), m2.contiguous().cuda()

In [139]:
torch.isclose(tr,module.matmul(m1c, m2c).cpu(), atol=1e-5).all()

tensor(True)

In [140]:
%%time
res=module.matmul(m1c, m2c).cpu()
res.shape

CPU times: user 6.56 ms, sys: 0 ns, total: 6.56 ms
Wall time: 5.83 ms


torch.Size([50000, 10])

### PyTorch

In [141]:
torch.isclose(tr,(m1c@m2c).cpu(), atol=1e-5).all()

tensor(True)

In [142]:
%timeit -n 10 _=(m1c@m2c).cpu()

2.06 ms ± 54.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
