In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install virtualenv

Collecting virtualenv
  Downloading virtualenv-20.25.0-py3-none-any.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting distlib<1,>=0.3.7 (from virtualenv)
  Downloading distlib-0.3.8-py2.py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.9/468.9 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: distlib, virtualenv
Successfully installed distlib-0.3.8 virtualenv-20.25.0


In [3]:
!virtualenv /content/drive/MyDrive/cuda_env

created virtual environment CPython3.10.12.final.0-64 in 16108ms
  creator CPython3Posix(dest=/content/drive/MyDrive/cuda_env, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/.local/share/virtualenv)
    added seed packages: pip==23.3.1, setuptools==69.0.2, wheel==0.42.0
  activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator


In [4]:
!source /content/drive/MyDrive/cuda_env/bin/activate

In [5]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [6]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-v2gj7geu
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-v2gj7geu
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4293 sha256=7003ac528752de1af536a9d565f84c0c8dbee689d9c0ea7854c5a58acf329ab0
  Stored in directory: /tmp/pip-ephem-wheel-cache-0t3_li_w/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [7]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [91]:
%%cu
#include <stdio.h>

#define TILE_WIDTH 3

const int MATRIX_SIZE=3;

__global__ void matrixMulShared(float *A, float *B, float *C) {
    __shared__ float sA[TILE_WIDTH][TILE_WIDTH];
    __shared__ float sB[TILE_WIDTH][TILE_WIDTH];

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    float Cvalue = 0.0;

    for (int k = 0; k < TILE_WIDTH; ++k) {
        sA[threadIdx.y][threadIdx.x] = A[row * TILE_WIDTH + k];
        sB[threadIdx.y][threadIdx.x] = B[k * TILE_WIDTH + col];

        __syncthreads();

        for (int m = 0; m < TILE_WIDTH; ++m) {
            Cvalue += sA[threadIdx.y][m] * sB[m][threadIdx.x];
        }

        __syncthreads();
    }

    C[row * TILE_WIDTH + col] = Cvalue;
}

__global__ void matrixMulGlobal(float *A, float *B, float *C) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    float Cvalue = 0.0;

    for (int k = 0; k < MATRIX_SIZE; ++k) {
        Cvalue += A[row * MATRIX_SIZE + k] * B[k * MATRIX_SIZE + col];
    }

    C[row * MATRIX_SIZE + col] = Cvalue;
}

int main() {
    const int size = 3 * 3 * sizeof(float);
    float h_A[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    float h_B[9] = {9, 5, 12, 52, -52, 15, -21, 52, 21};
    float h_C[9];

    float *d_A, *d_B, *d_C;

    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    dim3 dimGrid(1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);



    matrixMulShared<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);



    printf("Resultant Matrix:\n");
    for (int i = 0; i < 3; ++i) {
        for (int j = 0; j < 3; ++j) {
            printf("%.2f ", h_C[i * 3 + j]);
        }
        printf("\n");
    }

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    matrixMulGlobal<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    cudaEventRecord(stop);

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    printf("Time taken for matrix multiplication using Global Memory: %.2f ms\n", milliseconds);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}

Resultant Matrix:
150.00 171.00 315.00 
510.00 216.00 747.00 
870.00 261.00 1179.00 
Time taken for matrix multiplication using Global Memory: 0.03 ms

