In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [10]:
%%writefile main.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "kernel.cuh"

#define MAX_PRINT_SIZE 10

// Prints `name` followed by the top-left corner of a row-major rows x cols
// integer matrix. At most MAX_PRINT_SIZE rows and MAX_PRINT_SIZE columns are
// shown; "..." marks every truncated row and, on its own line, truncated rows.
void print_matrix(int *mat, int rows, int cols, const char* name) {
    const int shown_rows = (rows < MAX_PRINT_SIZE) ? rows : MAX_PRINT_SIZE;
    const int shown_cols = (cols < MAX_PRINT_SIZE) ? cols : MAX_PRINT_SIZE;
    const int cols_truncated = (cols > MAX_PRINT_SIZE);

    printf("%s =\n", name);

    int r = 0;
    while (r < shown_rows) {
        int c = 0;
        while (c < shown_cols) {
            printf("%5d ", mat[r * cols + c]);
            ++c;
        }
        // Signal that this row continues beyond what was printed.
        if (cols_truncated)
            printf("...");
        printf("\n");
        ++r;
    }

    // Signal that further rows exist below the printed ones.
    if (rows > MAX_PRINT_SIZE)
        printf("...\n");
    printf("\n");
}

// Entry point. Reads the dimensions m, n, k from stdin, fills A (m x n) and
// B (n x k) with random digits 0-9, multiplies them on the GPU and on the CPU,
// prints the matrices, then reports both timings and whether the results agree.
//
// Returns EXIT_FAILURE when the input is not three positive integers or when
// pinned host memory cannot be allocated; returns 0 otherwise.
int main() {
    int m, n, k;
    printf("Enter m (rows of A), n (cols of A / rows of B), k (cols of B): ");
    // scanf must convert all three values; otherwise m/n/k would be used
    // uninitialised. Non-positive sizes make no sense for the buffers below.
    if (scanf("%d %d %d", &m, &n, &k) != 3 || m <= 0 || n <= 0 || k <= 0) {
        fprintf(stderr, "Invalid input: expected three positive integers.\n");
        return EXIT_FAILURE;
    }

    // Element counts are computed in size_t so that m * n (etc.) cannot
    // overflow a 32-bit int for large matrices.
    const size_t elems_a = (size_t)m * n;
    const size_t elems_b = (size_t)n * k;
    const size_t elems_c = (size_t)m * k;

    // Pinned (page-locked) host buffers; stop at the first failed allocation.
    int *h_a = NULL, *h_b = NULL, *h_c = NULL, *h_cpu = NULL;
    cudaError_t alloc_status = cudaMallocHost(&h_a, sizeof(int) * elems_a);
    if (alloc_status == cudaSuccess)
        alloc_status = cudaMallocHost(&h_b, sizeof(int) * elems_b);
    if (alloc_status == cudaSuccess)
        alloc_status = cudaMallocHost(&h_c, sizeof(int) * elems_c);
    if (alloc_status == cudaSuccess)
        alloc_status = cudaMallocHost(&h_cpu, sizeof(int) * elems_c);
    if (alloc_status != cudaSuccess) {
        fprintf(stderr, "cudaMallocHost failed: %s\n", cudaGetErrorString(alloc_status));
        if (h_a) cudaFreeHost(h_a);
        if (h_b) cudaFreeHost(h_b);
        if (h_c) cudaFreeHost(h_c);
        if (h_cpu) cudaFreeHost(h_cpu);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < elems_a; ++i)
        h_a[i] = rand() % 10;
    for (size_t i = 0; i < elems_b; ++i)
        h_b[i] = rand() % 10;

    cudaEvent_t start, stop;
    // Initialised so the report below never prints indeterminate values if an
    // event call fails.
    float gpu_time = 0.0f, cpu_time = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // GPU multiplication (the timed region includes whatever matrixMultiply
    // does internally, not just the kernel).
    cudaEventRecord(start);
    matrixMultiply(h_a, h_b, h_c, m, n, k);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&gpu_time, start, stop);

    // matrixMultiply returns void, so pick up any error the runtime recorded
    // during the GPU path instead of letting it pass unnoticed.
    cudaError_t gpu_status = cudaGetLastError();
    if (gpu_status != cudaSuccess)
        fprintf(stderr, "GPU path reported an error: %s\n", cudaGetErrorString(gpu_status));

    // CPU multiplication
    // NOTE(review): CUDA events are recorded on the device stream, so timing
    // host-only work with them is an approximation; consider a host clock.
    cudaEventRecord(start);
    cpu_matrix_mult(h_a, h_b, h_cpu, m, n, k);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&cpu_time, start, stop);

    // Print matrices
    print_matrix(h_a, m, n, "Matrix A");
    print_matrix(h_b, n, k, "Matrix B");
    print_matrix(h_c, m, k, "Resultant Matrix C (GPU)");

    // Verify results: the GPU output must equal the CPU reference exactly.
    int correct = 1;
    for (size_t i = 0; i < elems_c; ++i) {
        if (h_c[i] != h_cpu[i]) {
            correct = 0;
            break;
        }
    }

    printf("GPU time: %.3f ms\n", gpu_time);
    printf("CPU time: %.3f ms\n", cpu_time);
    if (correct)
        printf("✅ Results match! Speedup = %.2fx\n", cpu_time / gpu_time);
    else
        printf("❌ Results mismatch.\n");

    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    cudaFreeHost(h_cpu);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Overwriting main.cu


In [8]:
%%writefile kernel.cuh
#ifndef KERNEL_CUH_
#define KERNEL_CUH_

// GPU path: multiplies A (m x n) by B (n x k) and writes the m x k product to
// h_c. All three pointers are host buffers holding row-major data; the
// implementation in kernel.cu handles device allocation, copies and the launch.
void matrixMultiply(int *h_a, int *h_b, int *h_c, int m, int n, int k);
// CPU reference: computes c = a * b on the host with the same dimension
// convention (a is m x n, b is n x k, c is m x k, all row-major).
void cpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k);

#endif

Overwriting kernel.cuh


In [9]:
%%writefile kernel.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include "kernel.cuh"

#define BLOCK_SIZE 16

// Kernel: each thread computes one element of C = A * B.
//
//   a : m x n input, row-major
//   b : n x k input, row-major
//   c : m x k output, row-major
//
// Expects a 2D launch in which the x dimension spans the columns of C and the
// y dimension spans its rows. Threads that fall outside the m x k result do
// nothing, so the grid may overshoot the matrix.
__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: the launch rounds up to whole blocks.
    if (row >= m || col >= k)
        return;

    // Offsets are formed in size_t: row * n and i * k exceed the range of a
    // 32-bit int once a matrix holds more than 2^31 elements.
    const size_t a_row_start = (size_t)row * n;
    int sum = 0;
    for (int i = 0; i < n; ++i)
        sum += a[a_row_start + i] * b[(size_t)i * k + col];

    c[(size_t)row * k + col] = sum;
}

// Reports a failed CUDA runtime call on stderr, naming the step in `what`.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_step_ok(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Multiplies A (m x n) by B (n x k) on the GPU and stores the m x k product
// in h_c.
//
//   h_a, h_b : host input matrices, row-major
//   h_c      : host output buffer with room for m * k ints
//
// Allocates device buffers, copies the inputs over, launches gpu_matrix_mult
// with BLOCK_SIZE x BLOCK_SIZE thread blocks, copies the result back and frees
// the device buffers. If any CUDA step fails, a message is written to stderr,
// the remaining steps are skipped, the device buffers are still released and
// h_c is left unmodified.
void matrixMultiply(int *h_a, int *h_b, int *h_c, int m, int n, int k) {
    if (m < 0 || n < 0 || k < 0) {
        fprintf(stderr, "matrixMultiply: negative dimension (m=%d, n=%d, k=%d)\n", m, n, k);
        return;
    }
    // An empty result needs no work, and a zero-sized grid is an invalid launch.
    if (m == 0 || k == 0)
        return;

    // Start from null so the cleanup below is safe after a partial failure.
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    const size_t sizeA = sizeof(int) * m * n;
    const size_t sizeB = sizeof(int) * n * k;
    const size_t sizeC = sizeof(int) * m * k;

    // Short-circuit evaluation stops at the first failing step.
    bool ok = cuda_step_ok(cudaMalloc(&d_a, sizeA), "cudaMalloc(d_a)")
           && cuda_step_ok(cudaMalloc(&d_b, sizeB), "cudaMalloc(d_b)")
           && cuda_step_ok(cudaMalloc(&d_c, sizeC), "cudaMalloc(d_c)")
           && cuda_step_ok(cudaMemcpy(d_a, h_a, sizeA, cudaMemcpyHostToDevice), "copy A to device")
           && cuda_step_ok(cudaMemcpy(d_b, h_b, sizeB, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // Ceil-divide so partial tiles at the right/bottom edges are covered.
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid((k + BLOCK_SIZE - 1) / BLOCK_SIZE, (m + BLOCK_SIZE - 1) / BLOCK_SIZE);

        gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);

        // A launch returns no status itself: configuration errors show up in
        // cudaGetLastError(), execution errors at the following synchronize.
        ok = cuda_step_ok(cudaGetLastError(), "gpu_matrix_mult launch")
          && cuda_step_ok(cudaDeviceSynchronize(), "gpu_matrix_mult execution")
          && cuda_step_ok(cudaMemcpy(h_c, d_c, sizeC, cudaMemcpyDeviceToHost), "copy C to host");
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}

// CPU reference implementation of C = A * B.
//
//   a : m x n input, row-major
//   b : n x k input, row-major
//   c : m x k output, row-major; every element is overwritten
//
// Offsets are computed in size_t so that i * n, h * k and i * k cannot
// overflow a 32-bit int when a matrix holds more than 2^31 elements.
void cpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k) {
    for (int i = 0; i < m; ++i) {
        const size_t a_row_start = (size_t)i * n;
        const size_t c_row_start = (size_t)i * k;
        for (int j = 0; j < k; ++j) {
            // Dot product of row i of A with column j of B.
            int sum = 0;
            for (int h = 0; h < n; ++h)
                sum += a[a_row_start + h] * b[(size_t)h * k + j];
            c[c_row_start + j] = sum;
        }
    }
}


Overwriting kernel.cu


In [11]:
!nvcc -arch=sm_75 main.cu kernel.cu -o matrixmul
!./matrixmul

Enter m (rows of A), n (cols of A / rows of B), k (cols of B): 10 10 10
Matrix A =
    3     6     7     5     3     5     6     2     9     1 
    2     7     0     9     3     6     0     6     2     6 
    1     8     7     9     2     0     2     3     7     5 
    9     2     2     8     9     7     3     6     1     2 
    9     3     1     9     4     7     8     4     5     0 
    3     6     1     0     6     3     2     0     6     1 
    5     5     4     7     6     5     6     9     3     7 
    4     5     2     5     4     7     4     4     3     0 
    7     8     6     8     8     4     3     1     4     9 
    2     0     6     8     9     2     6     6     4     9 

Matrix B =
    5     0     4     8     7     1     7     2     7     2 
    2     6     1     0     6     1     5     9     4     9 
    0     9     1     7     7     1     1     5     9     7 
    7     6     7     3     6     5     6     3     9     4 
    8     1     2     9     3     9     0     8    

In [None]:

"""

## 🚀 Project Overview

This CUDA program performs **matrix multiplication on both CPU and GPU**, compares their performance, and verifies if the results match.

It has 3 files:

| File         | Purpose                                                                |
| ------------ | ---------------------------------------------------------------------- |
| `main.cu`    | Takes input, initializes matrices, times CPU & GPU ops, prints results |
| `kernel.cu`  | Defines CUDA GPU kernel and CPU function for matrix multiplication     |
| `kernel.cuh` | Header file for function declarations                                  |

---

## 🔍 Key Components Explained

### 🔹 `main.cu`

1. **Takes input**: Dimensions of matrices A (m×n), B (n×k).
2. **Allocates memory**: Uses `cudaMallocHost` (pinned memory for faster GPU transfers).
3. **Fills A and B** with random integers.
4. **Calls `matrixMultiply`** to compute matrix C on GPU.
5. **Also computes on CPU** using `cpu_matrix_mult`.
6. **Times both operations** using `cudaEvent_t` timers.
7. **Prints matrices** (up to 10×10).
8. **Compares results** and prints speedup.

---

### 🔹 `kernel.cu`

* Contains the **GPU kernel function**:

```cpp
__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k);
```

* Each GPU thread computes **one element** of the result matrix C.
* Block and grid dimensions allow the work to be parallelized.

It also includes the **CPU version** of matrix multiplication.

---

### 🔹 `kernel.cuh`

* Just declares the functions from `kernel.cu` so they can be used in `main.cu`.

---

## 🧠 Key Concepts in the Code

| Concept           | Explanation                                                |
| ----------------- | ---------------------------------------------------------- |
| `__global__`      | CUDA keyword for kernel functions run on GPU               |
| `dim3`            | Used to configure grid and block dimensions                |
| `cudaMallocHost`  | Allocates **pinned memory** on host (faster for transfers) |
| `cudaMemcpy`      | Transfers data between CPU and GPU                         |
| `cudaEventRecord` | Used to record timestamps for measuring performance        |
| `BLOCK_SIZE`      | Block side length (16x16 block = 256 threads per block)    |

---

## ❓ Q\&A Section (For Learning & Interviews)

### 🔸 **Basic Conceptual Qs**

**Q1:** What is the purpose of using GPU for matrix multiplication?
**A:** GPUs can spread the work across thousands of concurrent threads (here, one thread per output element), making it much faster than the CPU for large matrices.

**Q2:** What does `__global__` mean in CUDA?
**A:** It marks a function as a GPU kernel callable from the CPU (host).

**Q3:** What is `dim3`?
**A:** It’s a CUDA data type to define 1D/2D/3D grid or block dimensions.

---

### 🔸 **Code Understanding Qs**

**Q4:** Why is `cudaMallocHost()` used instead of `malloc()`?
**A:** It allocates **pinned (page-locked)** memory for faster transfer between CPU and GPU.

**Q5:** What does this loop in the kernel do?

```cpp
for (int i = 0; i < n; ++i)
    sum += a[row * n + i] * b[i * k + col];
```

**A:** It computes the dot product of one row of A and one column of B to get the value for `C[row][col]`.

---

### 🔸 **Performance & Debugging Qs**

**Q6:** Why do we use `cudaEventRecord` and `cudaEventElapsedTime`?
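**A:** To measure elapsed time. CUDA events are timestamped on the GPU's stream, so they accurately time GPU work (here the whole `matrixMultiply` call, including allocation and copies). This program also reuses them around the CPU computation, which gives only an approximate CPU time; a host clock would be the more direct tool for that.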

**Q7:** How do we ensure the GPU results are correct?
**A:** By comparing each value of matrix C (GPU result) with CPU result using a loop.

---

### 🔸 **Advanced/Optimization Qs**

**Q8:** What would happen if BLOCK\_SIZE is too small or too large?
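**A:** Too small → underutilized GPU. Too large → a block may exceed the 1024-threads-per-block limit (any `BLOCK_SIZE` above 32 here), so the launch fails, or it may exceed the available shared memory/registers, causing slower execution or failure.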

**Q9:** How can shared memory improve performance in matrix multiplication?
**A:** Shared memory enables fast memory access and reduces global memory loads per thread, crucial for large matrix blocks.

---

Would you like me to add **comments directly in the code** to annotate these concepts or generate **MCQs for practice**?

"""
