<a href="https://colab.research.google.com/github/ardywibowo/cuda-mode/blob/main/chapter3/chapter3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 3

## Setup

In [6]:
!pip install ninja
!sudo apt update
!sudo apt install g++-11 -y
!sudo apt install ccache -y

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 119 kB in 2s (73.5 kB/s)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
32 packages can be upgraded. Run 'apt list --upgradable' to see them.
Reading package lists... Done
Building dependency tree..

In [7]:
import torch
import torch.utils.cpp_extension
import os
# Point the CC / CXX environment variables at the ccache wrapper paths for
# gcc-11 / g++-11; this runs before any extension is built further down.
os.environ.update({
    'CC': '/usr/lib/ccache/gcc-11',
    'CXX': '/usr/lib/ccache/g++-11',
})

In [8]:
# Shared C++/CUDA preamble that every `cuda_src` below is prefixed with.
# It pulls in the torch extension headers and defines:
#   * CHECK_CUDA / CHECK_CONTIGUOUS / CHECK_INPUT - TORCH_CHECK-based input
#     validation macros (tensor is on a CUDA device, tensor is contiguous);
#   * cdiv(a, b) - ceiling division on unsigned ints, used to size launch grids.
# NOTE(review): CHECK_INPUT expands to two statements, so it must only be used
# as a standalone statement (as the host functions in this notebook do).
cuda_begin = """
//cuda
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
//!cuda
"""

## Problem 1

In this chapter we implemented a matrix multiplication kernel that has each thread produce one output matrix element. In this question, you will implement different matrix-matrix multiplication kernels and compare them.

a. Write a kernel that has each thread produce one output matrix row. Fill in the execution configuration parameters for the design.

In [15]:
# Problem 1a: matrix multiplication where each thread produces one output row.
cuda_src = cuda_begin + \
"""
//cuda
// Each thread computes one full row of out = m * n.
// The indexing below treats m as h x k, n as k x w and out as h x w (row-major).
__global__ void matmul_row(float* m, float* n, float* out, int h, int w, int k) {
    int r = blockIdx.x*blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no row to compute.
    if (r >= h) return;

    for (int c = 0; c < w; ++c) {
        float o = 0;
        for (int i = 0; i<k; ++i) {
            o += m[r*k + i] * n[i*w + c];
        }
        out[r*w+c] = o;
    }
}

// Host wrapper: validates the inputs, allocates the output and launches matmul_row.
torch::Tensor matmul(torch::Tensor m, torch::Tensor n) {
    CHECK_INPUT(m); CHECK_INPUT(n);
    int h = m.size(0);
    int w = n.size(1);
    int k = m.size(1);
    TORCH_CHECK(k == n.size(0), "Size mismatch!");
    auto output = torch::zeros({h, w}, m.options());

    // 1D launch with one thread per output row, so the grid is sized from h.
    dim3 tpb(256);
    dim3 blocks(cdiv(h, tpb.x));
    matmul_row<<<blocks, tpb>>>(
        m.data_ptr<float>(), n.data_ptr<float>(), output.data_ptr<float>(), h, w, k);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}
//!cuda
"""

# C++ side only needs the declaration of the function exported to Python.
cpp_src = \
"""
//cuda
torch::Tensor matmul(torch::Tensor m, torch::Tensor n);
//!cuda
"""

# JIT-compile the extension; --ptxas-options=-v asks ptxas for verbose
# per-kernel compilation output.
module = torch.utils.cpp_extension.load_inline(
    "test_ext", cpp_src, cuda_src,
    functions=['matmul'], extra_cuda_cflags=['--ptxas-options=-v'], verbose=True)

# Random inputs: unlike an all-ones matrix, they expose row/column indexing
# mistakes, and they match the inputs used for the column-wise kernel.
n = 32
A = torch.randn(n, n, device='cuda')
B = torch.randn(n, n, device='cuda')

# Compare the custom kernel against torch.matmul on the same inputs.
out = module.matmul(A, B); torch.cuda.synchronize()
reference = torch.matmul(A, B)
print("Out:", out)
print("Reference:", reference)
print("Correct Implementation:", torch.allclose(out, reference))

import time
num_trials = 1_000

# Profile num_trials launches, synchronizing after each call so every launch
# has finished before the next one is issued.
with torch.profiler.profile() as prof:
    for i in range(num_trials):
        module.matmul(A, B)
        torch.cuda.synchronize()

print(prof.key_averages().table())

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
No modifications detected for re-loaded extension module test_ext_v2, skipping build step...
Loading extension module test_ext_v2...


Out: tensor([[ -2.4887,  -1.7471,   2.4387,  ...,   0.1727,   1.1549,   1.4592],
        [ -3.4692,   8.6338,  -2.4712,  ...,   3.5144,   0.9575,   2.7203],
        [ -6.9637,   5.5027,   1.5716,  ...,   3.4485,   0.2339,   6.3301],
        ...,
        [  1.2553,  -0.8796,  -5.3700,  ...,   1.4417,   1.7112,   3.1901],
        [ 11.2950,   1.3072,  -3.6682,  ...,  10.2170,  -3.2171,  -1.7322],
        [-10.9152,  -0.5431,  -4.4932,  ...,  -6.4946,  -2.2655,  -9.4765]],
       device='cuda:0')
Reference: tensor([[ -2.4887,  -1.7471,   2.4387,  ...,   0.1727,   1.1549,   1.4592],
        [ -3.4692,   8.6338,  -2.4712,  ...,   3.5144,   0.9575,   2.7203],
        [ -6.9637,   5.5027,   1.5716,  ...,   3.4485,   0.2339,   6.3301],
        ...,
        [  1.2553,  -0.8796,  -5.3700,  ...,   1.4417,   1.7112,   3.1901],
        [ 11.2950,   1.3072,  -3.6682,  ...,  10.2170,  -3.2171,  -1.7322],
        [-10.9152,  -0.5431,  -4.4932,  ...,  -6.4946,  -2.2655,  -9.4765]],
       device='cuda:

b. Write a kernel that has each thread produce one output matrix column. Fill in the execution configuration parameters for the design.

In [16]:
# Problem 1b: matrix multiplication where each thread produces one output column.
cuda_src = cuda_begin + \
"""
//cuda
// Each thread computes one full column of out = m * n.
// The indexing below treats m as h x k, n as k x w and out as h x w (row-major).
__global__ void matmul_col(float* m, float* n, float* out, int h, int w, int k) {
    int c = blockIdx.x*blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no column to compute.
    if (c >= w) return;

    for (int r = 0; r < h; ++r) {
        float o = 0;
        for (int i = 0; i<k; ++i) {
            o += m[r*k + i] * n[i*w + c];
        }
        out[r*w+c] = o;
    }
}

// Host wrapper: validates the inputs, allocates the output and launches matmul_col.
torch::Tensor matmul(torch::Tensor m, torch::Tensor n) {
    CHECK_INPUT(m); CHECK_INPUT(n);
    int h = m.size(0);
    int w = n.size(1);
    int k = m.size(1);
    TORCH_CHECK(k == n.size(0), "Size mismatch!");
    auto output = torch::zeros({h, w}, m.options());

    // 1D launch with one thread per output COLUMN, so the grid must be sized
    // from w (not h); otherwise columns beyond cdiv(h, 256) * 256 are never written.
    dim3 tpb(256);
    dim3 blocks(cdiv(w, tpb.x));
    matmul_col<<<blocks, tpb>>>(
        m.data_ptr<float>(), n.data_ptr<float>(), output.data_ptr<float>(), h, w, k);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}
//!cuda
"""

# C++ side only needs the declaration of the function exported to Python.
cpp_src = \
"""
//cuda
torch::Tensor matmul(torch::Tensor m, torch::Tensor n);
//!cuda
"""

# JIT-compile the extension; --ptxas-options=-v asks ptxas for verbose
# per-kernel compilation output.
module = torch.utils.cpp_extension.load_inline(
    "test_ext", cpp_src, cuda_src,
    functions=['matmul'], extra_cuda_cflags=['--ptxas-options=-v'], verbose=True)

n = 32
A = torch.randn(n, n, device='cuda')
B = torch.randn(n, n, device='cuda')

# Compare the custom kernel against torch.matmul on the same inputs.
out = module.matmul(A, B); torch.cuda.synchronize()
reference = torch.matmul(A, B)
print("Out:", out)
print("Reference:", reference)
print("Correct Implementation:", torch.allclose(out, reference))

# Regression check for the launch configuration: few rows but more columns
# than a single 256-thread block covers, so a grid sized from h would leave
# the trailing columns at zero.
A_wide = torch.randn(8, 16, device='cuda')
B_wide = torch.randn(16, 300, device='cuda')
print("Correct on wide output:",
      torch.allclose(module.matmul(A_wide, B_wide), torch.matmul(A_wide, B_wide), atol=1e-5))

import time
num_trials = 1_000

# Profile num_trials launches, synchronizing after each call so every launch
# has finished before the next one is issued.
with torch.profiler.profile() as prof:
    for i in range(num_trials):
        module.matmul(A, B)
        torch.cuda.synchronize()

print(prof.key_averages().table())

Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
The input conditions for extension module test_ext have changed. Bumping to version 3 and re-building as test_ext_v3...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/test_ext/build.ninja...
Building extension module test_ext_v3...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module test_ext_v3...


Out: tensor([[-5.4679,  1.1751, -1.0962,  ...,  2.7903,  6.5714, -1.7081],
        [ 6.8990, -6.2920, -4.9214,  ...,  2.4059, -9.0105,  3.9678],
        [-4.2295,  1.0672, -5.3186,  ...,  1.6381,  3.7604, 11.1707],
        ...,
        [-0.5478,  3.4716, -0.5633,  ..., -0.1780,  2.3237, -5.7205],
        [-0.3733, -7.9170, -1.0061,  ..., -3.9494,  0.3245, -1.2547],
        [-4.4514,  1.6518, -1.3146,  ..., -3.8617, -3.5407,  4.0330]],
       device='cuda:0')
Reference: tensor([[-5.4679,  1.1751, -1.0962,  ...,  2.7903,  6.5714, -1.7081],
        [ 6.8990, -6.2920, -4.9214,  ...,  2.4059, -9.0105,  3.9678],
        [-4.2295,  1.0672, -5.3186,  ...,  1.6381,  3.7604, 11.1707],
        ...,
        [-0.5478,  3.4716, -0.5633,  ..., -0.1780,  2.3237, -5.7205],
        [-0.3733, -7.9170, -1.0061,  ..., -3.9494,  0.3245, -1.2547],
        [-4.4514,  1.6518, -1.3146,  ..., -3.8617, -3.5407,  4.0330]],
       device='cuda:0')
Correct Implementation: True
---------------------------------------

c. Analyze the pros and cons of each of the two kernel designs.

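The pros and cons of the row-wise and column-wise kernels depend on the shape of the matrices and on how each kernel accesses memory. Let A be of size (M x K) and B of size (K x N), so the output is (M x N). `matmul_row` uses one thread per output row (M useful threads) and `matmul_col` uses one thread per output column (N useful threads). If M > N, there are more rows than columns, so parallelizing over rows exposes more parallelism and `matmul_row` tends to be faster, and vice versa. The memory access patterns also differ because the matrices are stored row-major: in `matmul_col`, neighboring threads read neighboring elements of B and write neighboring elements of the output, so those accesses can be coalesced, whereas in `matmul_row` neighboring threads touch elements of A and of the output that are a whole row apart. Finally, both designs launch far fewer threads than the one-thread-per-element kernel, and each thread does an entire row's or column's worth of work, so small matrices leave most of the GPU idle.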

## Problem 2

Write a matrix-vector multiplication kernel and the host stub function that can be called with four parameters: pointer to the output matrix, pointer to the input matrix, pointer to the input vector, and the number of elements in each dimension. Use one thread to calculate an output vector element.

In [None]:
# Problem 2: matrix-vector multiplication, one thread per output vector element.
cuda_src = cuda_begin + \
"""
//cuda
// Kernel with the four parameters the exercise asks for: output vector, input
// matrix (n x n, row-major), input vector, and the size n of each dimension.
// Thread r computes out[r] = sum_i m[r*n + i] * v[i].
__global__ void matvec_kernel(float* out, float* m, float* v, int n) {
    int r = blockIdx.x*blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to compute.
    if (r >= n) return;

    float o = 0;
    for (int i = 0; i<n; ++i) {
        o += m[r*n + i] * v[i];
    }
    out[r] = o;
}

// Host stub: validates the inputs, allocates the output vector and launches
// matvec_kernel. Expects a square 2D matrix m and a 1D vector v of matching size.
torch::Tensor matvec(torch::Tensor m, torch::Tensor v) {
    CHECK_INPUT(m); CHECK_INPUT(v);
    TORCH_CHECK(m.dim() == 2, "m must be a 2D matrix");
    TORCH_CHECK(v.dim() == 1, "v must be a 1D vector");
    int n = m.size(0);
    TORCH_CHECK(n == m.size(1), "m must be square");
    TORCH_CHECK(n == v.size(0), "Size mismatch!");
    auto output = torch::zeros({n}, m.options());

    // 1D launch with one thread per output element.
    dim3 tpb(256);
    dim3 blocks(cdiv(n, tpb.x));
    matvec_kernel<<<blocks, tpb>>>(
        output.data_ptr<float>(), m.data_ptr<float>(), v.data_ptr<float>(), n);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}
//!cuda
"""

# C++ side only needs the declaration of the function exported to Python.
cpp_src = \
"""
//cuda
torch::Tensor matvec(torch::Tensor m, torch::Tensor v);
//!cuda
"""

# JIT-compile the extension; --ptxas-options=-v asks ptxas for verbose
# per-kernel compilation output.
module = torch.utils.cpp_extension.load_inline(
    "test_ext", cpp_src, cuda_src,
    functions=['matvec'], extra_cuda_cflags=['--ptxas-options=-v'], verbose=True)

n = 32
A = torch.randn(n, n, device='cuda')
v = torch.randn(n, device='cuda')

# Compare the custom kernel against torch.matmul (matrix @ vector) on the same inputs.
# A small absolute tolerance absorbs float32 summation-order differences.
out = module.matvec(A, v); torch.cuda.synchronize()
reference = torch.matmul(A, v)
print("Out:", out)
print("Reference:", reference)
print("Correct Implementation:", torch.allclose(out, reference, atol=1e-5))

import time
num_trials = 1_000

# Profile num_trials launches, synchronizing after each call so every launch
# has finished before the next one is issued.
with torch.profiler.profile() as prof:
    for i in range(num_trials):
        module.matvec(A, v)
        torch.cuda.synchronize()

print(prof.key_averages().table())