In [1]:
!nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-y117vvh7
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-y117vvh7
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 5cd225851b7638f3f6d55a19328295f16c014079
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... [?25l[?25hdone
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.0.3-py3-none-any.whl size=7432 sha25

In [2]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Source files will be saved in "/tmp/tmp04xvqnh4".


In [3]:
!nvidia-smi

Fri Feb  9 17:52:48 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Task 1

## Sequential Program

In [5]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define WIDTH 16


// Matrix multiplication on the (CPU) host
void MatrixMulOnHost(int* M, int* N, int* P, int Width)
{
  for (int i = 0; i < Width; ++i)
  {
    for (int j = 0; j < Width; ++j)
    {
      double sum = 0;
      for (int k = 0; k < Width; ++k)
      {
      double a = M[i * Width + k];
      double b = N[k * Width + j];
      sum += a * b;
      }
      P[i * Width + j] = sum;
    }
  }
}

float time_diff(struct timeval *start, struct timeval *end) {
  return (end->tv_sec - start->tv_sec) + 1e-6 * (end->tv_usec - start->tv_usec);
}

int main() {
    struct timeval start, end;


    int* M, *N, *P;
    int* a;
    int size = WIDTH * WIDTH * sizeof(int);


    M = (int*)malloc(size);
    N = (int*)malloc(size);
    P = (int*)malloc(size);


    for (int i = 0; i < WIDTH * WIDTH; ++i) {
        M[i] = rand() % 10;
        N[i] = rand() % 10;
    }


    gettimeofday(&start, NULL);

    MatrixMulOnHost(M, N, P, WIDTH);

    gettimeofday(&end, NULL);


    /* printf("Result Matrix:");
    for (int j =0 ; j<WIDTH*WIDTH ; j=j+1){
        if (j%WIDTH == 0){printf("\n %d \t",*(P+j));}
        else{printf("%d\t",*(P+j));}
    }
    */

    printf("\n");
    printf("time spent: %0.8f sec\n", time_diff(&start, &end));

    free(M);
    free(N);
    free(P);
    return 0;
}


time spent: 0.00001600 sec



## Parallel - I

In [6]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda.h>

#define WIDTH 16
#define TILE_WIDTH 2
#define N_THREADS 256

// Matrix multiplication kernel – per thread code
__global__ void MatrixMulKernel(int* d_M, int* d_N, int* d_P, int Width)
{
  // Calculate the row index of the d_P element and M
  int Row = blockIdx.y*TILE_WIDTH + threadIdx.y;
  // Calculate the column index of d_P and N
  int Col = blockIdx.x*TILE_WIDTH + threadIdx.x;
  float Pvalue = 0;
  // each thread computes one element of the block sub-matrix
  for (int k = 0; k < Width; ++k) Pvalue += d_M[Row*Width+k] * d_N[k*Width+Col];
  d_P[Row*Width+Col] = Pvalue;
}

float time_diff(struct timeval *start, struct timeval *end) {
  return (end->tv_sec - start->tv_sec) + 1e-6 * (end->tv_usec - start->tv_usec);
}

int main() {

    struct timeval start;
    struct timeval end;

    int* M, *N, *P;

    int size = WIDTH * WIDTH * sizeof(int);

    M = (int*)malloc(size);
    N = (int*)malloc(size);
    P = (int*)malloc(size);

    for (int i = 0; i < WIDTH * WIDTH; ++i) {
        M[i] = rand() % 10;
        N[i] = rand() % 10;
    }

    int *d_M, *d_N, *d_P;
    cudaMalloc((void**)&d_M, size);
    cudaMalloc((void**)&d_N, size);
    cudaMalloc((void**)&d_P, size);

    cudaMemcpy(d_M, M, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_N, N, size, cudaMemcpyHostToDevice);

    gettimeofday(&start, NULL);
    MatrixMulKernel<<<(WIDTH * WIDTH + N_THREADS - 1) / N_THREADS, N_THREADS>>>(d_M, d_N, d_P, WIDTH);
    cudaDeviceSynchronize();
    gettimeofday(&end, NULL);

    cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);

    /*printf("Result Matrix:");
    for (int j =0 ; j<WIDTH*WIDTH ; j=j+1){
        if (j%WIDTH == 0){printf("\n %d \t",*(P+j));}
        else{printf("%d\t",*(P+j));}
    }
    printf("\n");
    */
    printf("time spent: %0.8f sec\n", time_diff(&start, &end));

    free(M);
    free(N);
    free(P);
    cudaFree(d_M);
    cudaFree(d_N);
    cudaFree(d_P);
    return 0;
}



time spent: 0.11361100 sec



## Parallel - II

In [7]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define WIDTH 16
#define TILE_WIDTH 2
#define N_THREADS 256

__global__ void matrixMulTiled(const int* M, const int* N, int* P, int width) {
    __shared__ int Mds[TILE_WIDTH][TILE_WIDTH];
    __shared__ int Nds[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    int row = by * TILE_WIDTH + ty;
    int col = bx * TILE_WIDTH + tx;

    int Pvalue = 0;

    for (int ph = 0; ph < width / TILE_WIDTH; ++ph) {
        Mds[ty][tx] = M[row * width + ph * TILE_WIDTH + tx];
        Nds[ty][tx] = N[(ph * TILE_WIDTH + ty) * width + col];
        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k) {
            Pvalue += Mds[ty][k] * Nds[k][tx];
        }
        __syncthreads();
    }

    P[row * width + col] = Pvalue;
}

float time_diff(struct timeval *start, struct timeval *end) {
  return (end->tv_sec - start->tv_sec) + 1e-6 * (end->tv_usec - start->tv_usec);
}

int main() {
    int* M, *N, *P;
    int size = WIDTH * WIDTH * sizeof(int);

    M = (int*)malloc(size);
    N = (int*)malloc(size);
    P = (int*)malloc(size);

    for (int i = 0; i < WIDTH * WIDTH; ++i) {
        M[i] = rand() % 10;
        N[i] = rand() % 10;
    }

    int *d_M, *d_N, *d_P;
    cudaMalloc((void**)&d_M, size);
    cudaMalloc((void**)&d_N, size);
    cudaMalloc((void**)&d_P, size);

    cudaMemcpy(d_M, M, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_N, N, size, cudaMemcpyHostToDevice);

    dim3 dimGrid(WIDTH / TILE_WIDTH, WIDTH / TILE_WIDTH);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);

    struct timeval start, end;
    gettimeofday(&start, NULL);
    matrixMulTiled<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, WIDTH);
    cudaDeviceSynchronize();
    gettimeofday(&end, NULL);

    cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);

    /*printf("Result Matrix:\n");
    for (int j = 0; j < WIDTH * WIDTH; ++j) {
        if (j % WIDTH == 0) {
            printf("\n %d \t", P[j]);
        } else {
            printf("%d\t", P[j]);
        }
    }
    */
    printf("\n");
    printf("time spent: %0.8f sec\n", time_diff(&start, &end));


    free(M);
    free(N);
    free(P);
    cudaFree(d_M);
    cudaFree(d_N);
    cudaFree(d_P);

    return 0;
}



time spent: 0.04674900 sec



# Task 2

## Parallel - I




In [14]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda.h>

const int N = 4194304;
const int threadsPerBlock = 256;
const int arraySize = N;
const int blocksPerGrid = (arraySize + threadsPerBlock - 1) / threadsPerBlock;

__global__ void reduce0(int *g_idata, int *g_odata) {
  extern __shared__ int sdata[];
  // each thread loads one element from global to shared mem
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
  sdata[tid] = g_idata[i];
  __syncthreads();
  // do reduction in shared mem
  for(unsigned int s=1; s < blockDim.x; s *= 2)
  {
    if (tid % (2*s) == 0)
    {
    sdata[tid] += sdata[tid + s];
    }
  __syncthreads();
  }
  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

float time_diff(struct timeval *start, struct timeval *end) {
  return (end->tv_sec - start->tv_sec) + 1e-6 * (end->tv_usec - start->tv_usec);
}

int main() {
    struct timeval start, end;

    int* h_input = (int*)malloc(arraySize * sizeof(int));
    int* h_output = (int*)malloc(blocksPerGrid * sizeof(int));


    for (int i = 0; i < arraySize; ++i) {
        h_input[i] = i;
    }

    int* d_input, *d_output;
    cudaMalloc((void**)&d_input, arraySize * sizeof(int));
    cudaMalloc((void**)&d_output, blocksPerGrid * sizeof(int));

    cudaMemcpy(d_input, h_input, arraySize * sizeof(int), cudaMemcpyHostToDevice);


    gettimeofday(&start, NULL);

    reduce0<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output);
    cudaDeviceSynchronize();

    gettimeofday(&end, NULL);

    cudaMemcpy(h_output, d_output, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);

    int result = 0;
    for (int i = 0; i < blocksPerGrid; ++i) {
        result += h_output[i];
    }

    printf("time spent: %0.8f sec\n", time_diff(&start, &end));

    free(h_input);
    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}

time spent: 1.74649894 sec



## Parallel - II

In [20]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda.h>

const int N = 67108864;
const int threadsPerBlock = 256;
const int arraySize = N;
const int blocksPerGrid = (arraySize + threadsPerBlock - 1) / threadsPerBlock;

__global__ void reduce0(int *g_idata, int *g_odata) {
  extern __shared__ int sdata[];
  // each thread loads one element from global to shared mem
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
  sdata[tid] = g_idata[i];
  __syncthreads();
  for (unsigned int s=1; s < blockDim.x; s *= 2)
  {
    int index = 2 * s * tid;
    if (index < blockDim.x) {
    sdata[index] += sdata[index + s];
    }
    __syncthreads();
  }
  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

float time_diff(struct timeval *start, struct timeval *end) {
  return (end->tv_sec - start->tv_sec) + 1e-6 * (end->tv_usec - start->tv_usec);
}

int main() {
    struct timeval start, end;

    int* h_input = (int*)malloc(arraySize * sizeof(int));
    int* h_output = (int*)malloc(blocksPerGrid * sizeof(int));


    for (int i = 0; i < arraySize; ++i) {
        h_input[i] = i;
    }

    int* d_input, *d_output;
    cudaMalloc((void**)&d_input, arraySize * sizeof(int));
    cudaMalloc((void**)&d_output, blocksPerGrid * sizeof(int));

    cudaMemcpy(d_input, h_input, arraySize * sizeof(int), cudaMemcpyHostToDevice);


    gettimeofday(&start, NULL);

    reduce0<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output);
    cudaDeviceSynchronize();

    gettimeofday(&end, NULL);

    cudaMemcpy(h_output, d_output, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);

    int result = 0;
    for (int i = 0; i < blocksPerGrid; ++i) {
        result += h_output[i];
    }

    printf("time spent: %0.8f sec\n", time_diff(&start, &end));

    free(h_input);
    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}

time spent: 1.75620699 sec



## Parallel - III

In [17]:
%%cuda

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda.h>

const int N = 67108864;
const int threadsPerBlock = 256;
const int arraySize = N;
const int blocksPerGrid = (arraySize + threadsPerBlock - 1) / threadsPerBlock;

__global__ void reduce0(int *g_idata, int *g_odata) {
  extern __shared__ int sdata[];
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
  sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
  __syncthreads();
  for (unsigned int s=blockDim.x/2; s>0; s>>=1)
  {
    if (tid < s) {
    sdata[tid] += sdata[tid + s];
    }
    __syncthreads();
  }
  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

float time_diff(struct timeval *start, struct timeval *end) {
  return (end->tv_sec - start->tv_sec) + 1e-6 * (end->tv_usec - start->tv_usec);
}

int main() {
    struct timeval start, end;

    int* h_input = (int*)malloc(arraySize * sizeof(int));
    int* h_output = (int*)malloc(blocksPerGrid * sizeof(int));


    for (int i = 0; i < arraySize; ++i) {
        h_input[i] = i;
    }

    int* d_input, *d_output;
    cudaMalloc((void**)&d_input, arraySize * sizeof(int));
    cudaMalloc((void**)&d_output, blocksPerGrid * sizeof(int));

    cudaMemcpy(d_input, h_input, arraySize * sizeof(int), cudaMemcpyHostToDevice);


    gettimeofday(&start, NULL);

    reduce0<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output);
    cudaDeviceSynchronize();

    gettimeofday(&end, NULL);

    cudaMemcpy(h_output, d_output, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);

    int result = 0;
    for (int i = 0; i < blocksPerGrid; ++i) {
        result += h_output[i];
    }

    printf("time spent: %0.8f sec\n", time_diff(&start, &end));

    free(h_input);
    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}

time spent: 1.70336902 sec

