to check the version of the NVIDIA CUDA Compiler (nvcc)

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


nvcc4jupyter package is typically used to enable CUDA compilation directly within Jupyter Notebooks

In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-w6q50w43
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-w6q50w43
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0d2ab99cccbbc682722e708515fe9c4cfc50185a
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4716 sha256=e943138a852cdd464dac15cdcf50f92041a1be9e0b7c27dbe380c0db6b42d64c
  Stored in directory: /tmp/pip-ephem-wheel-cache-yeaugp2c/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


CUDA Code

In [20]:
%%cu
//#include <bits/stdc++.h>
# include <cuda.h>
# include <fstream>
# include <iostream>
# include <sstream>
# include <string>

using namespace std;

__global__ void matproduct(int *l, int *m, int *n, int row1, int col1, int row2, int col2){
    int x = blockIdx.x;   // id of col2
    int y = blockIdx.y;   // id of row1
    int z = blockIdx.z;   // id of num_matrices

    int i;
    n[(col2 * row1 * z) + (col2 * y) + x] = 0;

    for (i = 0; i < col1; i++){
        n[(col2 * row1 * z) + (col2 * y) + x] += l[(col1 * row1 * z) + (col1 * y) + i] * m[(col2 * row2 * z) + (col2 * i) + x];
    }
}

int main(){
    int num_matrices; //1000
    int row1; //25
    int col1; //25
    int row2; //25
    int col2; //25

    ifstream file("mat_mul.txt");
    if (!file.is_open()){
        cerr << "Unable to open file." << endl;
        return 1;
    }

    string line;
    while (getline(file, line)){
        istringstream iss(line);
        int k, m, n, p;

        if (iss >> k >> m >> n >> p){
            cout << "K: " << k << ", M: " << m << ", N: " << n << ", p: " << p << endl << endl;
            num_matrices = k;
            row1 = m;
            col1 = n;
            row2 = n;
            col2 = p;
        }
        else{
            cerr << "Failed to k, m, n, p from the line." << endl;
        }
    }

    // host variables
    int a[row1 * col1 * num_matrices];
    int b[row2 * col2 * num_matrices];
    int c[row1 * col2 * num_matrices];

    // device variables
    int *d, *e, *f;
    int i, j;

    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaEventRecord(start);

    // Initialize matrices a and b
    srand(time(0));
    int element = 1;
    for (int k = 0; k < num_matrices; k++){
        for (i = 0; i < row1; i++){
            for (j = 0; j < col1; j++){
                a[(col1 * row1 * k) + (col1 * i) + j] = rand() % 10;
            }
        }

        element = 2;
        for (i = 0; i < row2; i++){
            for (j = 0; j < col2; j++){
                b[(col2 * row2 * k) + (col2 * i) + j] = rand() % 10;
            }
        }
    }

    // allocate memory on device
    cudaMalloc((void **)&d, row1 * col1 * num_matrices * sizeof(int));
    cudaMalloc((void **)&e, row2 * col2 * num_matrices * sizeof(int));
    cudaMalloc((void **)&f, row1 * col2 * num_matrices * sizeof(int));

    // copy input data to the device
    cudaMemcpy(d, a, row1 * col1 * num_matrices * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(e, b, row2 * col2 * num_matrices * sizeof(int), cudaMemcpyHostToDevice);

    // grid(x_dimension, y_dimension, z_dimension of the grid)
    dim3 grid(col2, row1, num_matrices);

    // launch kernel
    matproduct<<<grid, 1>>>(d, e, f, row1, col1, row2, col2);     // single thread per block
    // Wait for GPU to finish before exiting
    cudaDeviceSynchronize();

    // copy data from device to host
    cudaMemcpy(c, f, row1 * col2 * num_matrices * sizeof(int), cudaMemcpyDeviceToHost);

    for (int k = 0; k < 1; k++){
        printf("Matrix A[%d]:\n", k);
        for (i = 0; i < row1; i++){
            for (j = 0; j < col2; j++){
                printf("%d\t", a[(col1 * row1 * k) + (col1 * i) + j]);
            }
            printf("\n");
        }
        printf("\n");

        printf("Matrix B[%d]:\n", k);
        for (i = 0; i < row1; i++){
            for (j = 0; j < col2; j++){
                printf("%d\t", b[(col2 * row2 * k) + (col2 * i) + j]);
            }
            printf("\n");
        }
        printf("\n");

        printf("\nMatrix C[%d]:\n", k);
        for (i = 0; i < row1; i++){
            for (j = 0; j < col2; j++){
                printf("%d\t", c[(col2 * row1 * k) + (col2 * i) + j]);
            }
            printf("\n");
        }
        printf("\n");
    }

    cudaEventRecord(end);
    cudaEventSynchronize(end);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, end);
    printf("Time taken: %f\n ", milliseconds);

    cudaFree(d);
    cudaFree(e);
    cudaFree(f);

    return 0;
}

K: 100, M: 3, N: 3, p: 3

Matrix A[0]:
0	2	2	
7	9	1	
7	4	7	

Matrix B[0]:
5	0	6	
7	4	3	
8	6	8	


Matrix C[0]:
30	20	22	
106	42	77	
119	58	110	

Time taken: 0.410752
 
