In [1]:
!python --version
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Python 3.10.12
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp529a5t29".


In [2]:
%%cuda

#include <cuda_runtime.h>

#include <cstdlib>
#include <iostream>

/**
 * Integer matrix multiply: C = A * B for square N x N matrices.
 *
 * All three matrices are indexed as flat row-major arrays of N * N ints.
 * Each thread computes one element of C: the thread's global y index selects
 * the row and its global x index selects the column. Threads whose (row, col)
 * falls outside the matrix do nothing, so the launch grid may over-cover N.
 *
 * @param A  left operand, read-only, N * N ints
 * @param B  right operand, read-only, N * N ints
 * @param C  output, N * N ints; must not overlap A or B
 * @param N  matrix dimension
 *
 * Note: products are accumulated in a 32-bit int, so large values can still
 * overflow the result itself; only the index arithmetic is widened here.
 */
__global__ void matmul(const int* __restrict__ A, const int* __restrict__ B, int* C, int N) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail (also rejects every thread when N <= 0).
    if (row >= N || col >= N) {
        return;
    }

    // Flat offsets are computed in size_t: row * N overflows a 32-bit int
    // once N * N exceeds INT_MAX.
    const size_t n = static_cast<size_t>(N);
    const size_t rowBase = static_cast<size_t>(row) * n;

    int acc = 0;
    for (size_t k = 0; k < n; ++k) {
        acc += A[rowBase + k] * B[k * n + col];
    }
    C[rowBase + col] = acc;
}


// Terminates the program with a readable message if a CUDA runtime call
// failed; `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Prints an N x N row-major matrix, one row per line, each value followed
// by a single space.
static void printMatrix(const int* M, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            std::cout << M[i*N+j] << " ";
        }
        std::cout << "\n";
    }
}

/**
 * Demo driver: builds two N x N int matrices on the host, multiplies them on
 * the GPU with the matmul kernel, and prints both inputs and the result.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main() {
    std::cout<<"This is Atharva Pingale's code.\n";
    std::cout<<"Practical 4\n";

    const int N = 16;
    // Byte count kept in size_t so N * N * sizeof(int) cannot be truncated.
    const size_t size = static_cast<size_t>(N) * N * sizeof(int);

    int* A = nullptr, * B = nullptr, * C = nullptr;
    int* dev_A = nullptr, * dev_B = nullptr, * dev_C = nullptr;

    // Host buffers come from cudaMallocHost, device buffers from cudaMalloc.
    checkCuda(cudaMallocHost(&A, size), "cudaMallocHost(A)");
    checkCuda(cudaMallocHost(&B, size), "cudaMallocHost(B)");
    checkCuda(cudaMallocHost(&C, size), "cudaMallocHost(C)");
    checkCuda(cudaMalloc(&dev_A, size), "cudaMalloc(dev_A)");
    checkCuda(cudaMalloc(&dev_B, size), "cudaMalloc(dev_B)");
    checkCuda(cudaMalloc(&dev_C, size), "cudaMalloc(dev_C)");

    // NOTE(review): the original carried a commented-out Jupyter setting
    // (NotebookApp.iopub_data_rate_limit = 10000000) here - presumably a
    // reminder that printing large matrices can exceed the notebook's output
    // rate limit; confirm before raising N.

    // Initialize matrices A and B
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i*N+j] = i*N*j+1;
            B[i*N+j] = j*N*i+1;
        }
    }

    std::cout<<"\nPrinting A: ";
    std::cout<<"\n";
    printMatrix(A, N);
    std::cout<<"\n";

    std::cout<<"\nPrinting B : ";
    std::cout<<"\n";
    printMatrix(B, N);
    std::cout<<"\n";

    checkCuda(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Ceil-div so the grid covers the whole matrix even when N is smaller
    // than, or not a multiple of, the block edge; the kernel bounds-checks
    // the surplus threads.
    const dim3 dimBlock(16, 16);
    const dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                       (N + dimBlock.y - 1) / dimBlock.y);

    matmul<<<dimGrid, dimBlock>>>(dev_A, dev_B, dev_C, N);
    // Launch-configuration errors are only visible through cudaGetLastError;
    // faults inside the kernel surface at the next synchronizing call.
    checkCuda(cudaGetLastError(), "matmul launch");
    checkCuda(cudaDeviceSynchronize(), "matmul execution");

    checkCuda(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost), "copy C to host");

    // Print the result
    std::cout<<"\nPrinting C ( Result ) : ";
    std::cout<<"\n";
    printMatrix(C, N);
    std::cout << std::flush;

    // Free memory
    checkCuda(cudaFree(dev_A), "cudaFree(dev_A)");
    checkCuda(cudaFree(dev_B), "cudaFree(dev_B)");
    checkCuda(cudaFree(dev_C), "cudaFree(dev_C)");
    checkCuda(cudaFreeHost(A), "cudaFreeHost(A)");
    checkCuda(cudaFreeHost(B), "cudaFreeHost(B)");
    checkCuda(cudaFreeHost(C), "cudaFreeHost(C)");

    return 0;
}



This is Atharva Pingale's code.
Practical 4

Printing A: 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 17 33 49 65 81 97 113 129 145 161 177 193 209 225 241 
1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 
1 49 97 145 193 241 289 337 385 433 481 529 577 625 673 721 
1 65 129 193 257 321 385 449 513 577 641 705 769 833 897 961 
1 81 161 241 321 401 481 561 641 721 801 881 961 1041 1121 1201 
1 97 193 289 385 481 577 673 769 865 961 1057 1153 1249 1345 1441 
1 113 225 337 449 561 673 785 897 1009 1121 1233 1345 1457 1569 1681 
1 129 257 385 513 641 769 897 1025 1153 1281 1409 1537 1665 1793 1921 
1 145 289 433 577 721 865 1009 1153 1297 1441 1585 1729 1873 2017 2161 
1 161 321 481 641 801 961 1121 1281 1441 1601 1761 1921 2081 2241 2401 
1 177 353 529 705 881 1057 1233 1409 1585 1761 1937 2113 2289 2465 2641 
1 193 385 577 769 961 1153 1345 1537 1729 1921 2113 2305 2497 2689 2881 
1 209 417 625 833 1041 1249 1457 1665 1873 2081 2289 2497 2705 2913 3121 
1 225 449 673 897 1121 1345 1569