In [2]:
!nvidia-smi

Tue Dec 26 20:06:46 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
import torch
torch.cuda.is_available()
# Output would be True if Pytorch is using GPU otherwise it would be False.

True

In [4]:
import tensorflow as tf
tf.test.gpu_device_name()
# Standard output is '/device:GPU:0'

'/device:GPU:0'

In [5]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [6]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-d48okd4k
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-d48okd4k
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4293 sha256=c82ce33b0b382863bcde2d5a6d6b17078e6b837ed51e2de42acad9809d30f3be
  Stored in directory: /tmp/pip-ephem-wheel-cache-kviiot7s/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [7]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [10]:
%%cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>
#include <iostream>

#define TILE_WIDTH 16

// CUDA Kernel using Shared Memory
__global__ void MatrixMulShared(float *d_M, float *d_N, float *d_P, int Width) {
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;

    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;
    float Pvalue = 0;

    for (int m = 0; m < (Width-1)/TILE_WIDTH+1; ++m) {
        if (Row < Width && m*TILE_WIDTH+tx < Width)
            Mds[ty][tx] = d_M[Row*Width + m*TILE_WIDTH + tx];
        else
            Mds[ty][tx] = 0.0;
        if (Col < Width && m*TILE_WIDTH+ty < Width)
            Nds[ty][tx] = d_N[(m*TILE_WIDTH + ty)*Width + Col];
        else
            Nds[ty][tx] = 0.0;
        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k)
            Pvalue += Mds[ty][k] * Nds[k][tx];
        __syncthreads();
    }
    if (Row < Width && Col < Width)
        d_P[Row*Width+Col] = Pvalue;
}

// CPU Matrix Multiplication
void MatrixMulCPU(float *M, float *N, float *P, int Width) {
    for (int i = 0; i < Width; i++)
        for (int j = 0; j < Width; j++) {
            float sum = 0;
            for (int k = 0; k < Width; k++)
                sum += M[i * Width + k] * N[k * Width + j];
            P[i * Width + j] = sum;
        }
}

int main() {
    int Width = 1024;  // Matrix width
    size_t size = Width * Width * sizeof(float);
    float *h_M, *h_N, *h_P, *h_P_cpu;

    // Allocate and initialize host matrices
    h_M = (float *)malloc(size);
    h_N = (float *)malloc(size);
    h_P = (float *)malloc(size);
    h_P_cpu = (float *)malloc(size);

    // Initialize matrices with values
    // Initialize matrix A with random values
for (int i = 0; i < Width; i++) {
    for (int j = 0; j < Width; j++) {
        h_M[i * Width + j] = (float)(rand() % 100); // Random numbers between 0 and 99
    }
}

// Initialize matrix B with random values
for (int i = 0; i < Width; i++) {
    for (int j = 0; j < Width; j++) {
        h_N[i * Width + j] = (float)(rand() % 100); // Random numbers between 0 and 99
    }
}


    // Allocate device matrices
    float *d_M, *d_N, *d_P;
    cudaMalloc(&d_M, size);
    cudaMalloc(&d_N, size);
    cudaMalloc(&d_P, size);

    // Copy matrices to device
    cudaMemcpy(d_M, h_M, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_N, h_N, size, cudaMemcpyHostToDevice);

    // Launch the GPU Kernel
    dim3 dimGrid((Width-1)/TILE_WIDTH+1, (Width-1)/TILE_WIDTH+1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    // Measuring execution time for GPU
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    MatrixMulShared<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    std::cout << "GPU execution time (shared memory): " << milliseconds << " ms\n";

    // Copy result back to host
    cudaMemcpy(h_P, d_P, size, cudaMemcpyDeviceToHost);

    // CPU Matrix Multiplication for verification
    auto start_cpu = std::chrono::high_resolution_clock::now();
    MatrixMulCPU(h_M, h_N, h_P_cpu, Width);
    auto stop_cpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<float, std::milli> cpu_duration = stop_cpu - start_cpu;
    std::cout << "CPU execution time: " << cpu_duration.count() << " ms\n";

    // Compare CPU and GPU results

    // Compare CPU and GPU results
    bool result_correct = true;
    for (int i = 0; i < Width * Width; i++) {
        if (fabs(h_P_cpu[i] - h_P[i]) > 1e-5) {
            result_correct = false;
            break;
        }
    }
    if (result_correct)
        std::cout << "Results are correct!" << std::endl;
    else
        std::cout << "Results are incorrect!" << std::endl;

    // Free device memory
    cudaFree(d_M);
    cudaFree(d_N);
    cudaFree(d_P);

    // Free host memory
    free(h_M);
    free(h_N);
    free(h_P);
    free(h_P_cpu);

    return 0;
}


GPU execution time (shared memory): 73.7815 ms
CPU execution time: 7519.55 ms
Results are correct!

