# Matrix Multiplication with PyCUDA

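This notebook demonstrates how to perform matrix multiplication using PyCUDA. The CUDA kernel shown further down is a straightforward implementation in which each thread computes one element of the result matrix; as an exercise, try writing your own version before reading it.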

In [None]:
# Import Required Libraries
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

from cuda_helpers import profile_gpu

## Define Matrices

Let's define two matrices to multiply. You can change their size and values as needed.

In [None]:
# Define two square N x N float32 matrices A and B with standard-normal entries.
# A fixed seed makes every "Restart & Run All" produce the same inputs, so the
# GPU-vs-NumPy comparison at the end of the notebook is repeatable.
N = 1024
SEED = 42
rng = np.random.default_rng(SEED)
# Draw directly in float32 (the element type the CUDA kernel reads) instead of
# generating float64 values and down-casting them afterwards.
A = rng.standard_normal((N, N), dtype=np.float32)
B = rng.standard_normal((N, N), dtype=np.float32)
print("Matrix A:\n", A)
print("Matrix B:\n", B)

## Perform Matrix Multiplication on the GPU

We will multiply matrices A and B using a CUDA kernel. In the kernel below, each thread computes one element of the result as the dot product of a row of A and a column of B.

In [None]:
# Device-side buffers for the two operands and the product.

# Operand A: reserve A.nbytes on the device and upload the host array.
d_a = cuda.mem_alloc(A.nbytes)
cuda.memcpy_htod(d_a, A)

# Operand B: same pattern, sized from B.
d_b = cuda.mem_alloc(B.nbytes)
cuda.memcpy_htod(d_b, B)

# Output buffer: sized like A; no host data is copied into it here.
d_c = cuda.mem_alloc(A.nbytes)

# CUDA kernel for square matrix multiplication, C = A * B.
# Each thread computes one element of C. All three buffers are indexed as
# N x N row-major float arrays (element (row, col) lives at row * N + col).
kernel_code = """
__global__ void matmul(float *A, float *B, float *C, int N) {
    // Column (x) and row (y) of the output element owned by this thread.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid may overhang the matrix; surplus threads have nothing to do.
    if (col >= N || row >= N) {
        return;
    }

    // Index with size_t so that row * N + k cannot overflow a 32-bit int
    // when N * N exceeds INT_MAX.
    const size_t n = N;
    const size_t row_start = (size_t)row * n;

    // Dot product of row `row` of A with column `col` of B.
    float sum = 0.0f;
    for (size_t k = 0; k < n; k++) {
        sum += A[row_start + k] * B[k * n + col];
    }

    C[row_start + col] = sum;
}
"""

# Compile the kernel source with PyCUDA and look up the entry point by name.
mod = SourceModule(kernel_code)
matmul = mod.get_function("matmul")

In [None]:
# Threads per block (x, y, z).
block_size = (8, 8, 1)

# Blocks per grid. Round up (ceil division) so that a partial tile at the
# matrix edge still gets a block when N is not a multiple of the block size;
# the kernel's bounds check makes the surplus threads return immediately.
# Floor division would silently leave the last rows/columns of C uncomputed.
grid_size = (
    (N + block_size[0] - 1) // block_size[0],  # x dimension -> columns of C
    (N + block_size[1] - 1) // block_size[1],  # y dimension -> rows of C
    1,
)

print(f'Launching with grid_size={grid_size}, block_size={block_size}')

n_warmup = 2
n_iters = 20


def launch():
    """Launch the matmul kernel once on the device buffers allocated above.

    Returns whatever the PyCUDA kernel call returns, so the profiling helper
    sees the same value it would get from calling the kernel directly.
    """
    return matmul(d_a, d_b, d_c, np.int32(N), block=block_size, grid=grid_size)


# profile_gpu comes from the local cuda_helpers module; its result is not used here.
_ = profile_gpu(launch, n_warmup=n_warmup, n_iters=n_iters)

## Display Results

After running the kernel, copy the result back to the host and display it.

Refer to the [solution](./CUDA/matrix_multiplication_solution_global.cu) if you get stuck.

In [None]:
# Copy the result from the GPU into a host array shaped like A and display it.
C = np.empty_like(A)
cuda.memcpy_dtoh(C, d_c)
print("Result matrix C (A x B):\n", C)

# CPU reference computed with NumPy from the same host inputs.
c_numpy = np.matmul(A, B)

# Compare GPU and CPU results. atol=1.5e-3 keeps the absolute tolerance that
# assert_almost_equal(decimal=3) applied; the small rtol lets the check scale
# with the magnitude of the entries, which grows with N. The two results are
# not expected to match bit-for-bit because the float32 summation order may differ.
np.testing.assert_allclose(C, c_numpy, rtol=1e-5, atol=1.5e-3)