<a href="https://colab.research.google.com/github/aarushisharan/LP5/blob/main/Cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:

!pip install git+https://github.com/afnan47/cuda.git

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-gg0a9ic9
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-gg0a9ic9
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4290 sha256=38a6f42928f944184fc6335587030d45cbd1994b9246414093b0582c79e33c79
  Stored in directory: /tmp/pip-ephem-wheel-cache-_8gobwgk/wheels/bc/4e/e0/2d86bd15f671dbeb32144013f1159dba09757fde36dc51a963
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [9]:
%load_ext nvcc_plugin


created output directory at /content/src
Out bin /content/result.out


In [10]:

!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [11]:

!nvidia-smi

Sun May  4 11:21:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [7]:
%%writefile matrix.cu
#include<iostream>
#include<chrono>
#include<bits/stdc++.h>

using namespace std;
using namespace std::chrono;

__global__ void multiply(int*A, int *B, int *C, int M, int N, int K)
{
    int row= blockIdx.y* blockDim.y + threadIdx.y;
    int col= blockIdx.x* blockDim.x +threadIdx.x;

    if(row<M && col<K)
    {
        int sum=0;
        for(int i=0;i<N;i++)
        {
            sum+=A[row*N+i]*B[i*K+col];
        }
        C[row*K+col]=sum;
    }
}

void initialize(int *matrix, int rows, int cols)
{
    for(int i=0;i<rows*cols;i++)
    {
        cout<<"Enter the element = "<<i+1<<": ";
        cin>>matrix[i];
    }
}

void print(int *matrix, int rows, int cols)
{
    for(int i=0;i<rows;i++)
    {
        for(int j=0;j<cols;j++)
        {
            cout<<matrix[i*cols+j]<<" ";
        }
        cout<<endl;
    }
}

void sequentialMultiply(int* A, int* B, int* C, int M, int N, int K) {
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < K; j++) {
            int sum = 0;
           for(int k=0;k<N;k++)
           {
            sum+= A[i*N+k] * B[k*K+j];
           }
           C[i*K+j]=sum;
        }
    }
}

int main()
{
    int M, N, K;
    cout << "Enter the number of rows and columns of the first matrix: ";
    cin >> M >> N;
    cout << "Enter the number of columns of the second matrix: ";
    cin >> K;

    int* A, * B, * C;

    int matrixSize=M*K;
    size_t matrixBytes= matrixSize*sizeof(int);

    A= new int[M*N];
    B= new int[N*K];
    C= new int[M*K];

    initialize(A, M, N);
    initialize(B, N, K);

    cout << "Matrix A: \n";
    print(A, M, N);

    cout << "Matrix B: \n";
    print(B, N, K);

    int* X, * Y, * Z;
    cudaMalloc(&X, M * N * sizeof(int));
    cudaMalloc(&Y, N * K * sizeof(int));
    cudaMalloc(&Z, M * K * sizeof(int));

    cudaMemcpy(X,A,M*N*sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(Y,B,N*K*sizeof(int),cudaMemcpyHostToDevice);

    int THREADS=16;
    int BLOCKS= (M+THREADS-1)/THREADS;

    dim3 threads(THREADS, THREADS); // 16 x 16 = 256 threads per block
    dim3 blocks(BLOCKS, BLOCKS);    // 4 x 4 = 16 blocks total

    auto start = high_resolution_clock::now();
    sequentialMultiply(A, B, C, M, N, K);
    auto stop = high_resolution_clock::now();
    auto seq_duration = duration_cast<microseconds>(stop - start);

    cout << "Sequential Multiplication of matrix A and B: \n";
    print(C, M, K);

    // Parallel multiplication
    start = high_resolution_clock::now();
    multiply<<<blocks, threads>>>(X, Y, Z, M, N, K);
    cudaMemcpy(C, Z, M * K * sizeof(int), cudaMemcpyDeviceToHost);
    stop = high_resolution_clock::now();
    auto par_duration = duration_cast<microseconds>(stop - start);

    cout << "Parallel Multiplication of matrix A and B: \n";
    print(C, M, K);

    cout << "Sequential Multiplication Time: " << seq_duration.count() << " microseconds" << endl;
    cout << "Parallel Multiplication Time: " << par_duration.count() << " microseconds" << endl;

    delete[] A;
    delete[] B;
    delete[] C;

    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return 0;



}

Overwriting matrix.cu


In [8]:
!nvcc -arch=sm_75 matrix.cu -o mat

      size_t matrixBytes= matrixSize*sizeof(int);
             ^




In [9]:
!./mat

Enter the number of rows and columns of the first matrix: 2 2
Enter the number of columns of the second matrix: 2
Enter the element = 1: 2
Enter the element = 2: 2
Enter the element = 3: 4
Enter the element = 4: 5
Enter the element = 1: 4
Enter the element = 2: 5
Enter the element = 3: 6
Enter the element = 4: 7
Matrix A: 
2 2 
4 5 
Matrix B: 
4 5 
6 7 
Sequential Multiplication of matrix A and B: 
20 24 
46 55 
Parallel Multiplication of matrix A and B: 
20 24 
46 55 
Sequential Multiplication Time: 0 microseconds
Parallel Multiplication Time: 162 microseconds
