In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-llnn3p19
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-llnn3p19
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4293 sha256=7dc0b6a3a589156ea1993706ab21d59de9e54f6837359b61f41520b794f5da4b
  Stored in directory: /tmp/pip-ephem-wheel-cache-hn8ajl6_/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [3]:
%load_ext nvcc_plugin


created output directory at /content/src
Out bin /content/result.out


In [4]:
%%cu
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>

// Edge length of the square shared-memory tiles used by matrixMul. The kernel
// indexes those tiles directly with threadIdx, so it has to be launched with
// blocks of exactly TILE_SIZE x TILE_SIZE threads.
const int TILE_SIZE = 2;


/**
 * Tiled integer matrix product C = A * B for square N x N row-major matrices.
 *
 * Launch layout: 2D grid of 2D blocks. The y dimension selects the output row
 * and the x dimension selects the output column. blockDim must equal
 * (TILE_SIZE, TILE_SIZE) because the shared tiles are indexed by threadIdx.
 * Static shared memory: two TILE_SIZE x TILE_SIZE int tiles per block.
 *
 * N does not have to be a multiple of TILE_SIZE: tile entries that would fall
 * outside the matrices are filled with 0 and out-of-range threads skip the
 * final store.
 *
 * @param A  left operand, N * N ints (read only)
 * @param B  right operand, N * N ints (read only)
 * @param C  output, N * N ints
 * @param N  matrix dimension
 */
__global__ void matrixMul(const int *A, const int *B, int *C, int N) {
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * blockDim.y + ty;
    const int col = blockIdx.x * blockDim.x + tx;

    __shared__ int sA[TILE_SIZE][TILE_SIZE];
    __shared__ int sB[TILE_SIZE][TILE_SIZE];

    // Number of tiles needed to cover one row/column (rounded up).
    const int numTiles = (N + TILE_SIZE - 1) / TILE_SIZE;
    int result = 0;

    for (int tile = 0; tile < numTiles; ++tile) {
        // Each thread stages one element of A and one element of B.
        const int aCol = tile * TILE_SIZE + tx;
        const int bRow = tile * TILE_SIZE + ty;

        sA[ty][tx] = (row < N && aCol < N) ? A[row * N + aCol] : 0;
        sB[ty][tx] = (col < N && bRow < N) ? B[bRow * N + col] : 0;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < TILE_SIZE; ++k) {
            result += sA[ty][k] * sB[k][tx];
        }

        // Keep the tiles intact until every thread has finished with them.
        __syncthreads();
    }

    if (row < N && col < N) {
        C[row * N + col] = result;
    }

    // Debugging output: only the first thread of the first block reports.
    const bool isFirstThread = (tx == 0 && ty == 0);
    const bool isFirstBlock = (blockIdx.x == 0 && blockIdx.y == 0);
    if (isFirstThread && isFirstBlock) {
        printf("ThreadIdx: (%d, %d), BlockIdx: (%d, %d), Result: %d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, result);
    }
}



// Writes a row-major rows x cols integer matrix to std::cout. Every element
// is followed by a single space, every row ends with a newline, and one extra
// blank line is emitted after the last row.
void printMatrix(const int *matrix, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        const int rowBase = r * cols;  // flat index of the row's first element
        for (int c = 0; c < cols; ++c) {
            std::cout << matrix[rowBase + c] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}


// Terminates the program with a diagnostic when a CUDA runtime call fails.
// `what` names the failing operation in the error message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Multiplies two fixed 3x3 integer matrices on the GPU with matrixMul,
// recomputes the product on the CPU, prints all matrices and reports whether
// the two results agree. Returns EXIT_SUCCESS only when they match; any CUDA
// runtime failure aborts with EXIT_FAILURE.
int main() {
    const int N = 3;
    const size_t bytes = N * N * sizeof(int);

    const int A[N][N] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
    const int B[N][N] = {{9, 8, 7}, {6, 5, 4}, {3, 2, 1}};

    int C_cpu[N][N]; // Result matrix from CPU
    int C_gpu[N][N]; // Result matrix from GPU

    // Device buffers for the operands and the result.
    int *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    checkCuda(cudaMalloc((void **)&d_A, bytes), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void **)&d_B, bytes), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void **)&d_C, bytes), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, &A[0][0], bytes, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, &B[0][0], bytes, cudaMemcpyHostToDevice), "copy B to device");

    // The kernel indexes its TILE_SIZE x TILE_SIZE shared tiles with threadIdx,
    // so the block shape is derived from TILE_SIZE instead of being hard-coded.
    const dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    const dim3 blocksPerGrid((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    matrixMul<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // Launches do not return a status: query launch errors explicitly, then
    // synchronize so that execution errors surface here rather than later.
    checkCuda(cudaGetLastError(), "matrixMul launch");
    checkCuda(cudaDeviceSynchronize(), "matrixMul execution");

    checkCuda(cudaMemcpy(&C_gpu[0][0], d_C, bytes, cudaMemcpyDeviceToHost), "copy C to host");

    // CPU reference product.
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            C_cpu[i][j] = 0;
            for (int k = 0; k < N; ++k) {
                C_cpu[i][j] += A[i][k] * B[k][j];
            }
        }
    }

    std::cout << "Matrix A:" << std::endl;
    printMatrix(&A[0][0], N, N);
    std::cout << "Matrix B:" << std::endl;
    printMatrix(&B[0][0], N, N);
    std::cout << "Result from CPU (C_cpu):" << std::endl;
    printMatrix(&C_cpu[0][0], N, N);

    std::cout << "Result from GPU (C_gpu):" << std::endl;
    printMatrix(&C_gpu[0][0], N, N);

    // Element-wise comparison; both loops stop at the first mismatch.
    bool resultMatch = true;
    for (int i = 0; i < N && resultMatch; ++i) {
        for (int j = 0; j < N && resultMatch; ++j) {
            if (C_cpu[i][j] != C_gpu[i][j]) {
                resultMatch = false;
            }
        }
    }

    if (resultMatch) {
        std::cout << "Results match between CPU and GPU implementations." << std::endl;
    } else {
        std::cout << "Results do not match between CPU and GPU implementations." << std::endl;
    }

    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    // Report a mismatch through the exit status as well as the message.
    return resultMatch ? EXIT_SUCCESS : EXIT_FAILURE;
}

ThreadIdx: (0, 0), BlockIdx: (0, 0), Result: 30
Matrix A:
1 2 3 
4 5 6 
7 8 9 

Matrix B:
9 8 7 
6 5 4 
3 2 1 

Result from CPU (C_cpu):
30 24 18 
84 69 54 
138 114 90 

Result from GPU (C_gpu):
30 24 18 
84 69 54 
138 114 90 

Results match between CPU and GPU implementations.

