Problem Statement: Write a CUDA Program for:
1. Addition of two large vectors
2. Matrix Multiplication using CUDA

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-b7riqssx
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-b7riqssx
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5741c522547756ac4bb7a16df32106a15efb8a57
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10739 sha256=652cfa192a926bb82952bbed8fb3fd55f14b7943e816ba632daed7a6d34f5441
  Stored in directory: /tmp/pip-ephem-wheel-cache-_0ky53_a/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully bu

In [4]:
%%writefile "matrixmul.cu"
#include <cuda_runtime.h>
#include <iostream>
using namespace std;

// Multiplies two square N x N integer matrices: C = A * B.
// All three matrices are flat arrays indexed row-major (element (r, c) lives at r*N + c).
// Launch layout: 2D grid of 2D blocks; the x dimension selects the column and
// the y dimension selects the row. Each thread writes at most one element of C,
// and threads that map outside the N x N matrix exit without touching memory.
__global__ void matmul(int* A, int* B, int* C, int N) {
  const int row = threadIdx.y + blockDim.y * blockIdx.y;
  const int col = threadIdx.x + blockDim.x * blockIdx.x;

  // Guard clause: surplus threads in a partially filled block have no work.
  if (row >= N || col >= N) {
    return;
  }

  // Offset of the first element of this thread's row in A and C.
  const int rowStart = row * N;

  // Dot product of row `row` of A with column `col` of B.
  int acc = 0;
  for (int k = 0; k < N; ++k) {
    acc += A[rowStart + k] * B[k * N + col];
  }

  C[rowStart + col] = acc;
}

// Prompts on standard output and reads two N x N integer matrices from
// standard input. The first N*N values typed fill A in order, the next N*N
// values fill B. Both arrays must have room for at least N*N ints.
void take_input(int *A,int *B,int N)
{
  const int total = N * N;

  std::cout << "Enter " << total << " elements in matrix A :";
  int *dst = A;
  for (int remaining = total; remaining > 0; --remaining)
  {
    std::cin >> *dst;
    ++dst;
  }

  std::cout << "Enter " << total << " elements in matrix B :";
  dst = B;
  for (int remaining = total; remaining > 0; --remaining)
  {
    std::cin >> *dst;
    ++dst;
  }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise; `what` names the
// operation so the message identifies which step failed.
static bool cudaOk(cudaError_t status, const char* what)
{
  if (status == cudaSuccess)
  {
    return true;
  }
  std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << "\n";
  return false;
}

// Reads two N x N matrices from standard input, multiplies them on the GPU
// with the matmul kernel and prints the product.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main()
{
  int N = 2;
  // Widen before multiplying so the byte count cannot overflow int for large N.
  const size_t size = static_cast<size_t>(N) * N * sizeof(int);
  int* A = nullptr, * B = nullptr, * C = nullptr;
  int* dev_A = nullptr, * dev_B = nullptr, * dev_C = nullptr;

  // Host buffers come from cudaMallocHost, device buffers from cudaMalloc.
  // The && chain stops at the first failure, leaving later pointers null.
  bool ok = cudaOk(cudaMallocHost(&A, size), "cudaMallocHost A")
         && cudaOk(cudaMallocHost(&B, size), "cudaMallocHost B")
         && cudaOk(cudaMallocHost(&C, size), "cudaMallocHost C")
         && cudaOk(cudaMalloc(&dev_A, size), "cudaMalloc dev_A")
         && cudaOk(cudaMalloc(&dev_B, size), "cudaMalloc dev_B")
         && cudaOk(cudaMalloc(&dev_C, size), "cudaMalloc dev_C");

  if (ok)
  {
    // Initialize matrices A and B
    take_input(A,B,N);

    ok = cudaOk(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice), "copy A to device")
      && cudaOk(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice), "copy B to device");
  }

  if (ok)
  {
    // 16x16 = 256 threads per block. The grid is rounded UP so that an N that
    // is not a multiple of the block edge is still fully covered; the kernel's
    // own bounds check discards the surplus threads.
    const dim3 dimBlock(16, 16);
    const dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                       (N + dimBlock.y - 1) / dimBlock.y);

    matmul<<<dimGrid,dimBlock>>>(dev_A, dev_B, dev_C, N);

    // A launch does not return a status itself; query it explicitly. The
    // blocking copy that follows then surfaces any error raised while the
    // kernel was running.
    ok = cudaOk(cudaGetLastError(), "matmul launch")
      && cudaOk(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost), "copy C to host");
  }

  if (ok)
  {
    // Print the result
    cout<<"Result of matrix multiplication :\n";

    for (int i = 0; i < N; i++)
    {
      for (int j = 0; j < N; j++)
      {
        cout << C[i*N+j] << " ";
      }
      cout << "\n";
    }
  }

  // Free memory. Runs on the failure path too; cudaFree accepts a null
  // pointer, host pointers are checked before being released.
  cudaFree(dev_A);
  cudaFree(dev_B);
  cudaFree(dev_C);
  if (A) cudaFreeHost(A);
  if (B) cudaFreeHost(B);
  if (C) cudaFreeHost(C);
  return ok ? 0 : 1;
}

Writing matrixmul.cu


In [5]:
!nvcc matrixmul.cu

In [6]:
!./a.out

Enter 4 elements in matrix A :2 3
1 4
Enter 4 elements in matrix B :1 2
1 3
Result of matrix multiplication :
5 13 
5 14 


In [7]:
%%writefile vectoradd.cu
#include <iostream>
#include <cuda_runtime.h>
using namespace std;
// Element-wise sum of two integer vectors: C[i] = A[i] + B[i] for 0 <= i < n.
// Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
// global index is n or greater exit without touching memory.
__global__ void addVectors(int* A, int* B, int* C, int n) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;

  // Surplus threads in the last block have nothing to do.
  if (idx >= n) {
    return;
  }

  C[idx] = A[idx] + B[idx];
}


// Prompts on standard output and reads two integer vectors of length n from
// standard input: the first n values fill A, the next n values fill B.
void takeinput(int *A,int *B,int n)
{
  std::cout << "Enter " << n << " elements for array A :";
  int idx = 0;
  while (idx < n)
  {
    std::cin >> A[idx];
    ++idx;
  }

  std::cout << "\nEnter " << n << " elements for array B :";
  idx = 0;
  while (idx < n)
  {
    std::cin >> B[idx];
    ++idx;
  }
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise; `what` names the
// operation so the message identifies which step failed.
static bool cudaOk(cudaError_t status, const char* what) {
  if (status == cudaSuccess) {
    return true;
  }
  std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << "\n";
  return false;
}

// Reads two integer vectors of length n from standard input, adds them on the
// GPU with the addVectors kernel and prints the element-wise sum.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main() {
  int n = 10;
  int* A = nullptr, * B = nullptr, * C = nullptr;
  int* dev_A = nullptr, * dev_B = nullptr, * dev_C = nullptr;
  // Widen before multiplying so the byte count cannot overflow int for large n.
  const size_t size = static_cast<size_t>(n) * sizeof(int);

  // Allocate memory on the host. The && chain stops at the first failure,
  // leaving later pointers null.
  bool ok = cudaOk(cudaMallocHost(&A, size), "cudaMallocHost A")
         && cudaOk(cudaMallocHost(&B, size), "cudaMallocHost B")
         && cudaOk(cudaMallocHost(&C, size), "cudaMallocHost C");

  if (ok) {
    // Initialize the vectors
    takeinput(A,B,n);

    // Allocate memory on the device, then copy data from host to device
    ok = cudaOk(cudaMalloc(&dev_A, size), "cudaMalloc dev_A")
      && cudaOk(cudaMalloc(&dev_B, size), "cudaMalloc dev_B")
      && cudaOk(cudaMalloc(&dev_C, size), "cudaMalloc dev_C")
      && cudaOk(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice), "copy A to device")
      && cudaOk(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice), "copy B to device");
  }

  if (ok) {
    // Launch the kernel with fixed-size blocks and enough of them to cover n.
    // A single block of n threads stops working once n exceeds the per-block
    // thread limit; the kernel's bounds check discards the surplus threads of
    // the last block.
    const int blockSize = 256;
    const int numBlocks = (n + blockSize - 1) / blockSize;

    addVectors <<<numBlocks,blockSize>>>(dev_A,dev_B,dev_C,n);

    // A launch does not return a status itself; query it explicitly. The
    // blocking copy that follows then surfaces any error raised while the
    // kernel was running.
    ok = cudaOk(cudaGetLastError(), "addVectors launch")
      // Copy data from device to host
      && cudaOk(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost), "copy C to host");
  }

  if (ok) {
    // Print the results
    cout<<"\nVector Addition Result  :";
    for (int i = 0; i < n; i++) {
      cout << C[i] << " ";
    }
    cout << endl;
  }

  // Free memory. Runs on the failure path too; cudaFree accepts a null
  // pointer, host pointers are checked before being released.
  cudaFree(dev_A);
  cudaFree(dev_B);
  cudaFree(dev_C);
  if (A) cudaFreeHost(A);
  if (B) cudaFreeHost(B);
  if (C) cudaFreeHost(C);
  return ok ? 0 : 1;
}


Writing vectoradd.cu


In [8]:
!nvcc vectoradd.cu

In [9]:
!./a.out

Enter 10 elements for array A :1 2 3 4 5 6 7 8 9 10

Enter 10 elements for array B :10 9 8 7 6 5 4 3 2 1

Vector Addition Result  :11 11 11 11 11 11 11 11 11 11 
