In [2]:
# check the Nvidia CUDA compiler driver install or not on T4 GPU of colab
!nvcc --version
# installing necessary package for running cuda kernel on the colab gpu in notebook
!pip install nvcc4jupyter --quiet

# loading the package extension
%load_ext nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpmcj_5_vw".


Matrix Addition

In [3]:
%%cuda

#include <stdio.h>
#include <cuda.h>

#define N 4  // Matrix dimension (NxN)

// CUDA kernel for element-wise matrix addition
__global__ void matrixAdd(float *A, float *B, float *C, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;  // Row index
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // Column index

    if (row < width && col < width) {
        int idx = row * width + col;
        C[idx] = A[idx] + B[idx];  // Element-wise addition
    }
}

int main() {
    int size = N * N * sizeof(float);

    // Host matrices
    float h_A[N*N], h_B[N*N], h_C[N*N];

    // Initialize input matrices with sample data
    for (int i = 0; i < N*N; i++) {
        h_A[i] = i;
        h_B[i] = i * 2;
    }

    // Device matrices
    float *d_A, *d_B, *d_C;

    // Allocate device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    // Copy matrices from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Define block and grid dimensions (2D)
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch kernel
    matrixAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // Wait for GPU to finish
    cudaDeviceSynchronize();

    // Copy result back to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Print the result matrix
    printf("Result matrix C = A + B:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%5.1f ", h_C[i * N + j]);
        }
        printf("\n");
    }

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}


Result matrix C = A + B:
  0.0   0.0   0.0   0.0 
  0.0   0.0   0.0   0.0 
  0.0   0.0   0.0   0.0 
  0.0   0.0   0.0   0.0 



In [4]:
%%cuda
#include <stdio.h>
#include <cuda.h>

#define N 512  // size of matrix

// CUDA kernel for matrix addition
__global__ void matrixAdd(float *A, float *B, float *C, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n && j < n) {
        int idx = i * n + j;
        C[idx] = A[idx] + B[idx];
    }
}

int main() {
    int size = N * N * sizeof(float);
    float *A, *B, *C;              // host memory
    float *d_A, *d_B, *d_C;        // device memory

    // Allocate host memory
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    // Initialize matrices
    for (int i = 0; i < N * N; i++) {
        A[i] = i;
        B[i] = i;
    }

    // Allocate device memory
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    // Copy data to device
    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    // Define block and grid dimensions
    dim3 dimBlock(16, 16);
    dim3 dimGrid((N + 15) / 16, (N + 15) / 16);

    // Launch kernel
    matrixAdd<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);

    // Copy result back to host
    cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);

    // Print a part of the result
    printf("C[0][0] = %f\n", C[4]);
    printf("C[N-1][N-1] = %f\n", C[N*N-1]);

    // Free device and host memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(A);
    free(B);
    free(C);

    return 0;
}


C[0][0] = 0.000000
C[N-1][N-1] = 0.000000



Matrix Multiplication

In [5]:
%%cuda
#include <stdio.h>
#include <cuda.h>

#define N 512

__global__ void matrixMul(float *A, float *B, float *C, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;

    if (row < n && col < n) {
        for (int k = 0; k < n; k++) {
            sum += A[row * n + k] * B[k * n + col];
        }
        C[row * n + col] = sum;
    }
}

int main() {
    int size = N * N * sizeof(float);
    float *A, *B, *C;
    float *d_A, *d_B, *d_C;

    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    for (int i = 0; i < N * N; i++) {
        A[i] = 1.0f;
        B[i] = 2.0f;
    }

    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    dim3 dimBlock(16, 16);
    dim3 dimGrid((N + 15) / 16, (N + 15) / 16);

    matrixMul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);

    cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);

    printf("C[0][0] = %f\n", C[0]);
    printf("C[N-1][N-1] = %f\n", C[N*N-1]);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(A);
    free(B);
    free(C);

    return 0;
}


C[0][0] = 0.000000
C[N-1][N-1] = 0.000000

