<a href="https://colab.research.google.com/github/aditya-malte/Simple-LP1-Codes/blob/master/CUDA_Parrallel_Reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Select Runtime as GPU

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


In [0]:
code = """
#include <iostream>
using namespace std;

__global__ void sum(int* input) //this code runs on GPU
{
    const int tid = threadIdx.x;    //get thread id
    int step_size = 1; //set initial step_size
    int number_of_threads = blockDim.x; //get number of threads in block
    while (number_of_threads > 0)
    {
        if (tid < number_of_threads)
        {
            const int fst = tid * step_size * 2;
            const int snd = fst + step_size;
            printf("tid=%d fst=%d snd=%d|sum=%d\\n", threadIdx.x, input[fst], input[snd], input[fst]+input[snd]);
            input[fst] += input[snd];
        }
        step_size <<= 1;  //shift left operation doubles step size
        number_of_threads >>= 1;  //half the number of threads
    }
}

int main()
{
    const auto count = 8;
    const int size = count * sizeof(int);
    int h[] = {13, 27, 15, 14, 33, 2, 24, 6};   //input stored at host
    for(int i = 0; i<count; i++)
    {
      printf("%d,", h[i]);
    }
    printf("\\n");
    int* d;       //pointer stored at device

    cudaMalloc(&d, size); //allocate d with size of array
    cudaMemcpy(d, h, size, cudaMemcpyHostToDevice); //copy data from host to device

    sum <<<1, count / 2 >>>(d); //call function with one block and count/2 threads

    int result;
    cudaMemcpy(&result, d, sizeof(int), cudaMemcpyDeviceToHost);  //copy result to host

    //cout << "Sum is " << result << endl;

    //free values
    cudaFree(d);
    return 0;
}
"""


In [0]:
text_file = open("code.cu", "w")
text_file.write(code)
text_file.close()

In [0]:
!nvcc code.cu

In [5]:
!./a.out

13,27,15,14,33,2,24,6,
tid=0 fst=13 snd=27|sum=40
tid=1 fst=15 snd=14|sum=29
tid=2 fst=33 snd=2|sum=35
tid=3 fst=24 snd=6|sum=30
tid=0 fst=40 snd=29|sum=69
tid=1 fst=35 snd=30|sum=65
tid=0 fst=69 snd=65|sum=134


In [6]:
!nvprof ./a.out

13,27,15,14,33,2,24,6,
==3714== NVPROF is profiling process 3714, command: ./a.out
tid=0 fst=13 snd=27|sum=40
tid=1 fst=15 snd=14|sum=29
tid=2 fst=33 snd=2|sum=35
tid=3 fst=24 snd=6|sum=30
tid=0 fst=40 snd=29|sum=69
tid=1 fst=35 snd=30|sum=65
tid=0 fst=69 snd=65|sum=134
==3714== Profiling application: ./a.out
==3714== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   98.83%  455.10us         1  455.10us  455.10us  455.10us  sum(int*)
                    0.65%  2.9760us         1  2.9760us  2.9760us  2.9760us  [CUDA memcpy DtoH]
                    0.52%  2.4000us         1  2.4000us  2.4000us  2.4000us  [CUDA memcpy HtoD]
      API calls:   98.37%  141.96ms         1  141.96ms  141.96ms  141.96ms  cudaMalloc
                    0.52%  752.52us         1  752.52us  752.52us  752.52us  cudaLaunchKernel
                    0.43%  625.77us         1  625.77us  625.77us  625.77us  cuDeviceTotalMem
                    0.37%