In [1]:
! ls /usr/local/

bin    cuda	cuda-11.8  games	       include	lib64	   man	 share
colab  cuda-11	etc	   _gcs_config_ops.so  lib	licensing  sbin  src


In [2]:
! nvcc --version # nvcc compiler version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [3]:
%%writefile MatrixMultiplication.cu
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>

/* Memory allocated on the device with cudaMalloc resides in global memory.
Global memory plays the same role for the GPU as Random Access Memory does for a CPU. */

// Kernel Function: computes C = A * B, one output element per thread.
//
// All three matrices are flat, row-major arrays:
//   A holds N x K elements, B holds K x M elements, C holds N x M elements.
//
// Launch layout: a 2D grid of 2D blocks in which the x dimension spans the
// columns of C (M) and the y dimension spans the rows of C (N). The grid may
// be larger than the matrix; threads that fall outside C do nothing.
// No shared memory is used.
__global__ void MatrixMultiplication(float *A, float *B, float *C, int N, int K, int M){
  int idx = blockIdx.x * blockDim.x + threadIdx.x; // global x position == column index of C
  int idy = blockIdx.y * blockDim.y + threadIdx.y; // global y position == row index of C

  if(idy < N && idx < M){
    // Flat offsets are computed in size_t so that products such as row * K
    // cannot wrap around when a matrix holds more than 2^31 elements.
    const size_t row = (size_t)idy;
    const size_t col = (size_t)idx;

    float sum = 0.0f; // float literal: keeps the accumulator in single precision
    for(int i = 0; i < K; i++){
      // Dot product of row `row` of A with column `col` of B.
      sum += A[row * (size_t)K + (size_t)i] * B[(size_t)i * (size_t)M + col];
    }
    C[row * (size_t)M + col] = sum;
  }
}

// Aborts the program with a diagnostic when a CUDA runtime call has failed.
// `what` names the operation so the message identifies the failing step.
static void checkCuda(cudaError_t status, const char *what){
  if(status != cudaSuccess){
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE); // process teardown releases any outstanding allocations
  }
}

// Main code executed by the host.
// Builds two small matrices on the host, multiplies them on the GPU with the
// MatrixMultiplication kernel, and prints A, B and the product C.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation, a copy,
// or the kernel launch/execution fails.
int main(void){
  int N = 12; // rows matrix A
  int K = 14; // columns matrix A; rows matrix B
  int M = 16; // columns matrix B

  // Byte counts are computed once, in size_t, so the element-count product
  // is not evaluated in int.
  const size_t bytesA = (size_t)N * (size_t)K * sizeof(float);
  const size_t bytesB = (size_t)K * (size_t)M * sizeof(float);
  const size_t bytesC = (size_t)N * (size_t)M * sizeof(float);

  float *Ah, *Bh, *Ch; // host matrix pointers
  Ah = (float *)malloc(bytesA); // allocate host memory
  Bh = (float *)malloc(bytesB);
  Ch = (float *)malloc(bytesC);
  if(Ah == NULL || Bh == NULL || Ch == NULL){
    fprintf(stderr, "Host memory allocation failed\n");
    free(Ah); // free(NULL) is a no-op, so releasing all three is safe
    free(Bh);
    free(Ch);
    return EXIT_FAILURE;
  }

  for(int i = 0; i < N; i++){ // initialize elements matrix A: every entry of row i is i + 1
    for(int j = 0; j < K; j++){
      Ah[i*K+j] = i + 1.0f;
    }
  }

  for(int i = 0; i < K; i++){ // initialize elements matrix B: every entry of row i is 2 * (i + 1)
    for(int j = 0; j < M; j++){
      Bh[i*M+j] = (i + 1.0f)*2.0f;
    }
  }

  printf("\n Matrix A. \n");
  for(int i = 0; i < N; i++){
    for(int j = 0; j < K; j++){
      printf("%.2lf ", Ah[i*K+j]);
    }
    printf("\n");
  }

  printf("\n Matrix B. \n");
  for(int i = 0; i < K; i++){
    for(int j = 0; j < M; j++){
      printf("%.2lf ", Bh[i*M+j]);
    }
    printf("\n");
  }

  float *Ad, *Bd, *Cd; // device matrix pointers
  checkCuda(cudaMalloc((void **)&Ad, bytesA), "cudaMalloc(A)"); // allocate GPU memory
  checkCuda(cudaMalloc((void **)&Bd, bytesB), "cudaMalloc(B)");
  checkCuda(cudaMalloc((void **)&Cd, bytesC), "cudaMalloc(C)");

  checkCuda(cudaMemcpy(Ad, Ah, bytesA, cudaMemcpyHostToDevice), "copy A to device"); // copy data from host to device (matrix A)
  checkCuda(cudaMemcpy(Bd, Bh, bytesB, cudaMemcpyHostToDevice), "copy B to device"); // copy data from host to device (matrix B)

  dim3 BlockSize(16, 16); // threads per block
  // Ceiling division so the grid covers every column (x) and row (y) of C.
  dim3  GridSize((M + BlockSize.x - 1) / BlockSize.x, (N + BlockSize.y - 1) / BlockSize.y); // no. of blocks

  MatrixMultiplication<<<GridSize, BlockSize>>>(Ad, Bd, Cd, N, K, M); // kernel function calling
  checkCuda(cudaGetLastError(), "kernel launch");         // a launch does not return an error itself
  checkCuda(cudaDeviceSynchronize(), "kernel execution"); // surfaces faults raised while the kernel ran

  checkCuda(cudaMemcpy(Ch, Cd, bytesC, cudaMemcpyDeviceToHost), "copy C to host"); // copy data from device back to host (matrix C)

  printf("\n Matrix C. \n");
  for(int i = 0; i < N; i++){
    for(int j = 0; j < M; j++){
      printf("%.2lf ", Ch[i*M+j]);
    }
    printf("\n");
  }

  free(Ah); // free host memory
  free(Bh);
  free(Ch);

  checkCuda(cudaFree(Ad), "cudaFree(A)"); // free GPU memory
  checkCuda(cudaFree(Bd), "cudaFree(B)");
  checkCuda(cudaFree(Cd), "cudaFree(C)");

  return(0);
}

Writing MatrixMultiplication.cu


In [4]:
! nvcc MatrixMultiplication.cu -o test

In [5]:
! ./test


 Matrix A. 
1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 
2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 
3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 
4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 
5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 
6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00 
7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 
8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00 
9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00 
10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 
11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 
12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 12.00 

 Matrix B. 
2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 