In [1]:
! ls /usr/local/

bin    cuda	cuda-11.8  games	       include	lib64	   man	 share
colab  cuda-11	etc	   _gcs_config_ops.so  lib	licensing  sbin  src


In [2]:
! nvcc --version # nvcc compiler version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [3]:
%%writefile MatrixAddition.cu
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>

// Kernel: C_{1} matrix.
//
// Computes C[i][j] = A[i][j] + B[N - i - 1][M - j - 1] for every element of
// the N x M matrices A, B and C, all indexed as row-major flat arrays
// (element (i, j) lives at offset i * M + j).
//
// Launch layout: 2D grid of 2D blocks. The x dimension walks rows (i) and the
// y dimension walks columns (j). Both dimensions use a stride loop, so the
// result is complete for any grid/block size, including a launch that is
// smaller than the matrix. No shared memory is used.
//
// Parameters:
//   A, B : input matrices (only read).
//   C    : output matrix (only written).
//   N, M : number of rows / columns; a non-positive value makes the kernel a no-op.
__global__ void C1Matrix(const int *A, const int *B, int *C, int N, int M){
  // 64-bit indices: neither the thread index nor i * M + j can wrap around
  // when the matrix (or the grid) is very large.
  const long long rowStart  = (long long)blockIdx.x * blockDim.x + threadIdx.x;
  const long long colStart  = (long long)blockIdx.y * blockDim.y + threadIdx.y;
  const long long rowStride = (long long)gridDim.x * blockDim.x;
  const long long colStride = (long long)gridDim.y * blockDim.y;

  for(long long i = rowStart; i < N; i += rowStride){
    for(long long j = colStart; j < M; j += colStride){
      const long long here    = i * M + j;                       // (i, j)
      const long long flipped = (N - i - 1) * M + (M - j - 1);   // (N - i - 1, M - j - 1)
      C[here] = A[here] + B[flipped];
    }
  }
}

// Kernel: C_{2} matrix.
//
// Computes the element-wise blend C[i][j] = alpha * A[i][j] + (1 - alpha) * B[i][j]
// for every element of the N x M matrices, all indexed as row-major flat
// arrays (element (i, j) lives at offset i * M + j). B is read at the same
// (i, j) position as A; unlike C1Matrix, its index is NOT flipped.
//
// Launch layout: 2D grid of 2D blocks. The x dimension walks rows (i) and the
// y dimension walks columns (j). Both dimensions use a stride loop, so the
// result is complete for any grid/block size, including a launch that is
// smaller than the matrix. No shared memory is used.
//
// Parameters:
//   alpha : blend weight applied to A; (1 - alpha) is applied to B.
//           The caller in this file uses a value in [0, 1]; the kernel itself
//           does not enforce a range.
//   A, B  : integer input matrices (only read).
//   C     : float output matrix (only written).
//   N, M  : number of rows / columns; a non-positive value makes the kernel a no-op.
__global__ void C2Matrix(float alpha, const int *A, const int *B, float *C, int N, int M){
  // 64-bit indices: neither the thread index nor i * M + j can wrap around
  // when the matrix (or the grid) is very large.
  const long long rowStart  = (long long)blockIdx.x * blockDim.x + threadIdx.x;
  const long long colStart  = (long long)blockIdx.y * blockDim.y + threadIdx.y;
  const long long rowStride = (long long)gridDim.x * blockDim.x;
  const long long colStride = (long long)gridDim.y * blockDim.y;

  for(long long i = rowStart; i < N; i += rowStride){
    for(long long j = colStart; j < M; j += colStride){
      const long long idx = i * M + j; // (i, j)
      C[idx] = alpha * A[idx] + (1.0f - alpha) * B[idx];
    }
  }
}

// Abort with the failing call's file/line and the CUDA error string.
// Every CUDA runtime call returns a cudaError_t; ignoring it lets a failure
// surface later as garbage output instead of a clear message.
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t err_ = (call);                                            \
    if (err_ != cudaSuccess) {                                            \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,    \
              cudaGetErrorString(err_));                                  \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

// Print `header`, then the N x M row-major int matrix `mat`, one row per line.
static void printIntMatrix(const char *header, const int *mat, int N, int M){
  printf("%s", header);
  for(int i = 0; i < N; i++){
    for(int j = 0; j < M; j++){
      printf("%d ", mat[i*M+j]);
    }
    printf("\n");
  }
}

// Print `header`, then the N x M row-major float matrix `mat` with two decimals.
static void printFloatMatrix(const char *header, const float *mat, int N, int M){
  printf("%s", header);
  for(int i = 0; i < N; i++){
    for(int j = 0; j < M; j++){
      printf("%.2lf ", mat[i*M+j]);
    }
    printf("\n");
  }
}

// Main code executed by the host.
//
// Builds two small N x M integer matrices on the host, copies them to the GPU,
// runs the C1Matrix and C2Matrix kernels, copies both results back and prints
// A, B, C_{1} and C_{2}. Returns 0 on success; exits with EXIT_FAILURE if a
// host allocation or any CUDA call fails.
int main(void){
  const int N = 2; // no. rows
  const int M = 3; // no. columns
  const float alpha = 0.1f; // alpha value (float literal: no double round-trip)

  const size_t count      = (size_t)N * M;
  const size_t intBytes   = count * sizeof(int);
  const size_t floatBytes = count * sizeof(float);

  int* Ah = (int*)malloc(intBytes); // allocate host memory
  int* Bh = (int*)malloc(intBytes);
  int* C1h = (int*)malloc(intBytes);
  float* C2h = (float*)malloc(floatBytes);
  if(Ah == NULL || Bh == NULL || C1h == NULL || C2h == NULL){
    fprintf(stderr, "Host memory allocation failed\n");
    free(Ah); // free(NULL) is a no-op, so this is safe for whichever succeeded
    free(Bh);
    free(C1h);
    free(C2h);
    return EXIT_FAILURE;
  }

  // Fill the inputs: A[i][j] = i + j + 1 and B[i][j] = 2 * A[i][j].
  for(int i = 0; i < N; i++){
    for(int j = 0; j < M; j++){
      Ah[i*M+j] = j + 1 + i;
      Bh[i*M+j] = (j + 1 + i) * 2;
    }
  }

  int *Ad = NULL, *Bd = NULL, *C1d = NULL;
  float *C2d = NULL;

  CUDA_CHECK(cudaMalloc((void**)&Ad, intBytes)); // allocate GPU memory
  CUDA_CHECK(cudaMalloc((void**)&Bd, intBytes));
  CUDA_CHECK(cudaMalloc((void**)&C1d, intBytes));
  CUDA_CHECK(cudaMalloc((void**)&C2d, floatBytes));

  CUDA_CHECK(cudaMemcpy(Ad, Ah, intBytes, cudaMemcpyHostToDevice)); // copy data from host to device (matrix A)
  CUDA_CHECK(cudaMemcpy(Bd, Bh, intBytes, cudaMemcpyHostToDevice)); // copy data from host to device (matrix B)

  dim3 BlockSize(16, 16); // threads per block
  // Ceil-division so the grid covers all rows (x) and all columns (y).
  dim3 GridSize((N + BlockSize.x - 1) / BlockSize.x, (M + BlockSize.y - 1) / BlockSize.y); // no. of blocks

  // Kernel launches do not return an error code; a bad launch configuration
  // is only visible through cudaGetLastError().
  C1Matrix<<<GridSize, BlockSize>>>(Ad, Bd, C1d, N, M); // kernel function calling
  CUDA_CHECK(cudaGetLastError());
  C2Matrix<<<GridSize, BlockSize>>>(alpha, Ad, Bd, C2d, N, M);
  CUDA_CHECK(cudaGetLastError());

  // Surface any fault raised while the kernels were running before the
  // results are read back.
  CUDA_CHECK(cudaDeviceSynchronize());

  CUDA_CHECK(cudaMemcpy(C1h, C1d, intBytes, cudaMemcpyDeviceToHost)); // copy data from device back to host (matrix C_{1})
  CUDA_CHECK(cudaMemcpy(C2h, C2d, floatBytes, cudaMemcpyDeviceToHost)); // copy data from device back to host (matrix C_{2})

  printIntMatrix("\n Matrix A.\n", Ah, N, M);
  printIntMatrix("\n Matrix B.\n", Bh, N, M);
  printIntMatrix("\n Matrix C_{1}.\n", C1h, N, M);
  printFloatMatrix("\n Matrix C_{2}.\n", C2h, N, M);

  CUDA_CHECK(cudaFree(Ad)); // free GPU memory
  CUDA_CHECK(cudaFree(Bd));
  CUDA_CHECK(cudaFree(C1d));
  CUDA_CHECK(cudaFree(C2d));

  free(Ah); // free host memory
  free(Bh);
  free(C1h);
  free(C2h);

  return(0);
}

Writing MatrixAddition.cu


In [4]:
! nvcc MatrixAddition.cu -o test

In [5]:
! ./test


 Matrix A.
1 2 3 
2 3 4 

 Matrix B.
2 4 6 
4 6 8 

 Matrix C_{1}.
9 8 7 
8 7 6 

 Matrix C_{2}.
1.90 3.80 5.70 
3.80 5.70 7.60 
