# HPC H1: Parallel Operations over a Vector

## Installing Prerequisites

In [None]:
# Remove any CUDA / NVIDIA packages that are already installed.
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update

# Download and install the CUDA 9.2 toolkit from the local repository package.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

# Install the notebook plugin that provides the %%cu cell magic.
# Fetched over https: GitHub no longer serves the unauthenticated git:// protocol.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

%load_ext nvcc_plugin

## Main Assignment

In [None]:
%%cu
#include<iostream>
#include<math.h>

#define n 8

using namespace std;

//Kernel Functions

// In-place tree reduction that leaves the minimum of the array in input[0].
//
// Launch layout: one block; only threadIdx.x is used for indexing.
// Preconditions: input holds at least 2 * blockDim.x ints and blockDim.x is a
// power of two (threadCount is halved with integer division every pass, so an
// odd intermediate count would skip elements).
// The array is overwritten with partial results.
__global__ void minimum(int *input) {

    int threadId = threadIdx.x;
    int stepSize = 1;
    int threadCount = blockDim.x;

    while(threadCount>0) {
        if(threadId < threadCount) {
            int first = threadId*stepSize*2;
            int second = first + stepSize;
            //Modify Array In Place
            if(input[second] < input[first])
              input[first] = input[second];
        }
        // The next pass reads slots written by other threads in this pass,
        // so every thread must finish before anyone continues. The barrier is
        // outside the if: the loop trip count depends only on blockDim.x, so
        // all threads of the block reach it.
        __syncthreads();
        stepSize *=2;
        threadCount /= 2;
    }
}

// In-place tree reduction that leaves the maximum of the array in input[0].
//
// Launch layout: one block; only threadIdx.x is used for indexing.
// Preconditions: input holds at least 2 * blockDim.x ints and blockDim.x is a
// power of two (threadCount is halved every pass, so an odd intermediate
// count would skip elements).
// The array is overwritten with partial results.
__global__ void maximum(int *input) {
    int threadId = threadIdx.x;
    int stepSize = 1;
    int threadCount = blockDim.x;

    //Array is updated Inplace
    while(threadCount>0) {
        if(threadId < threadCount) {
            int first = threadId*stepSize*2;
            int second = first + stepSize;
            //Modify Array In Place
            if(input[second] > input[first])
              input[first] = input[second];
        }
        // The next pass reads slots written by other threads in this pass.
        // The barrier sits outside the if so that every thread reaches it.
        __syncthreads();
        stepSize <<= 1;
        threadCount >>= 1;
    }
}

// In-place tree reduction that leaves the sum of the int array in input[0].
//
// Launch layout: one block; only threadIdx.x is used for indexing.
// Preconditions: input holds at least 2 * blockDim.x ints and blockDim.x is a
// power of two (threadCount is halved every pass, so an odd intermediate
// count would skip elements).
// The array is overwritten with partial sums.
__global__ void sum(int *input) {
    const int threadId = threadIdx.x;
    int stepSize = 1;
    int threadCount = blockDim.x;

    while(threadCount > 0) {
        if(threadId < threadCount) {
            int first = threadId * stepSize * 2;
            int second = first + stepSize;
            //Modify Array In Place
            input[first] += input[second];
        }
        // The next pass adds partial sums written by other threads in this
        // pass. The barrier sits outside the if so that every thread reaches it.
        __syncthreads();
        stepSize <<= 1;
        threadCount >>= 1;
    }
}

// Replaces the element owned by each thread with its squared deviation from
// `mean`: input[i] = (input[i] - mean)^2 with i = threadIdx.x.
// There is no bounds check, so the launch must not use more threads than the
// array has elements.
__global__ void meanSquared(float *input, float mean) {
    const unsigned int idx = threadIdx.x;
    const float deviation = input[idx] - mean;
    input[idx] = deviation * deviation;
}

// In-place tree reduction that leaves the sum of the float array in input[0].
//
// Launch layout: one block; only threadIdx.x is used for indexing.
// Preconditions: input holds at least 2 * blockDim.x floats and blockDim.x is
// a power of two (threadCount is halved every pass, so an odd intermediate
// count would skip elements).
// The array is overwritten with partial sums.
__global__ void sum(float *input) {
    int threadId = threadIdx.x;
    int stepSize = 1;
    int threadCount = blockDim.x;

    while(threadCount > 0) {
        if(threadId < threadCount) {
            int first = threadId * stepSize * 2;
            int second = first + stepSize;
            //Modify array in place
            input[first] += input[second];
        }
        // The next pass adds partial sums written by other threads in this
        // pass. The barrier sits outside the if so that every thread reaches it.
        __syncthreads();
        stepSize <<= 1;
        threadCount >>= 1;
    }
}

//Host Functions

// Fills the first `size` entries of input with pseudo-random values in the
// range [0, 499]. rand() is not seeded here, so the sequence depends on
// whatever seed the caller has (or has not) set.
void initialize_vector(int *input, int size) {
    int idx = 0;
    while(idx < size) {
        input[idx] = rand() % 500;
        ++idx;
    }
}

// Writes the first `size` elements of input to cout as "[a, b, c]".
// An empty vector (size == 0) is printed as "[]". No newline is emitted.
void display_vector(int *input, int size) {
    cout<<"[";
    if(size != 0) {
        // First element has no leading separator
        cout<<input[0];
        for(int idx=1; idx<size; ++idx)
            cout<<", "<<input[idx];
    }
    cout<<"]";
}

// Computes the minimum of a host vector on the GPU.
//   arr_d : device buffer of at least `size` bytes; it is overwritten
//   arr   : host vector to reduce
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The element count must be a power of two >= 2, because the kernel is
// launched with one thread per pair of elements and halves the active thread
// count every pass.
// Returns the smallest element. A CUDA failure is reported on cerr and
// terminates the program instead of returning an uninitialised value.
int getMinimum(int* arr_d, int* arr, int size){
    const int count = size / (int)sizeof(int);
    int result = 0;

    cudaError_t status = cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice);
    if(status == cudaSuccess) {
        minimum<<<1,count/2>>>(arr_d);
        //Launch errors (e.g. an invalid block size) are only visible here
        status = cudaGetLastError();
    }
    if(status == cudaSuccess) {
        //Copying  Just the first Element of the array
        status = cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost);
    }
    if(status != cudaSuccess) {
        cerr<<"\ngetMinimum: CUDA error - "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
    return result;
}

// Computes the maximum of a host vector on the GPU.
//   arr_d : device buffer of at least `size` bytes; it is overwritten
//   arr   : host vector to reduce
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The element count must be a power of two >= 2, because the kernel is
// launched with one thread per pair of elements and halves the active thread
// count every pass.
// Returns the largest element. A CUDA failure is reported on cerr and
// terminates the program instead of returning an uninitialised value.
int getMaximum(int* arr_d, int* arr, int size){
    const int count = size / (int)sizeof(int);
    int result = 0;

    cudaError_t status = cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice);
    if(status == cudaSuccess) {
        maximum<<<1,count/2>>>(arr_d);
        //Launch errors (e.g. an invalid block size) are only visible here
        status = cudaGetLastError();
    }
    if(status == cudaSuccess) {
        //Copying  Just the first Element of the array
        status = cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost);
    }
    if(status != cudaSuccess) {
        cerr<<"\ngetMaximum: CUDA error - "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
    return result;
}

// Computes the sum of a host vector on the GPU.
//   arr_d : device buffer of at least `size` bytes; it is overwritten
//   arr   : host vector to reduce
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The element count must be a power of two >= 2, because the kernel is
// launched with one thread per pair of elements and halves the active thread
// count every pass.
// Returns the sum of all elements. A CUDA failure is reported on cerr and
// terminates the program instead of returning an uninitialised value.
int getSum(int* arr_d, int* arr, int size){
    const int count = size / (int)sizeof(int);
    int result = 0;

    cudaError_t status = cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice);
    if(status == cudaSuccess) {
        sum<<<1,count/2>>>(arr_d);
        //Launch errors (e.g. an invalid block size) are only visible here
        status = cudaGetLastError();
    }
    if(status == cudaSuccess) {
        //Copying  Just the first Element of the array
        status = cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost);
    }
    if(status != cudaSuccess) {
        cerr<<"\ngetSum: CUDA error - "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
    return result;
}

// Returns the arithmetic mean of the host vector `arr`.
//   arr_d : device scratch buffer forwarded to getSum
//   arr   : host vector
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The divisor is derived from `size` rather than from the global vector
// length, so the result stays correct for any vector the caller passes in.
float getAverage(int* arr_d, int* arr, int size){
    const int count = size / (int)sizeof(int);
    return float(getSum(arr_d,arr,size))/count;
}

// Returns the population variance (mean of squared deviations) of `arr`.
//   arr_d : device scratch buffer forwarded to getAverage
//   arr   : host vector
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The element count must be a power of two >= 2 (requirement of the tree
// reduction kernels). A temporary float copy of the vector is created on the
// host and on the device; both are released before returning.
// A host allocation or CUDA failure is reported on cerr and terminates the
// program.
float getVariance(int* arr_d, int* arr, int size){

    const int count = size / (int)sizeof(int);
    const size_t floatBytes = count * sizeof(float);

    float mean = getAverage(arr_d, arr, size);

    //Host side float copy of the input
    float *arr_float = (float *)malloc(floatBytes);
    if(arr_float == NULL) {
        cerr<<"\ngetVariance: host allocation failed"<<endl;
        exit(EXIT_FAILURE);
    }
    for(int i=0; i<count; i++)
        arr_float[i] = float(arr[i]);

    float *arr_std = NULL;
    float result = 0.0f;

    cudaError_t status = cudaMalloc((void **)&arr_std, floatBytes);
    if(status == cudaSuccess)
        status = cudaMemcpy(arr_std, arr_float, floatBytes, cudaMemcpyHostToDevice);

    //The host copy is no longer needed once it has been uploaded
    free(arr_float);

    if(status == cudaSuccess) {
        //Mean Squared - stays on the device, no copy back is needed
        meanSquared <<<1,count>>>(arr_std, mean);
        status = cudaGetLastError();
    }
    if(status == cudaSuccess) {
        //Add Mean Squared of all the elements
        sum<<<1,count/2>>>(arr_std);
        status = cudaGetLastError();
    }
    if(status == cudaSuccess)
        status = cudaMemcpy(&result, arr_std, sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(arr_std);

    if(status != cudaSuccess) {
        cerr<<"\ngetVariance: CUDA error - "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }

    return result / count;

}

// Returns the standard deviation of `arr`, i.e. the square root of the value
// produced by getVariance for the same arguments.
float getStdDeviation(int* arr_d, int* arr, int size){
    const float variance = getVariance(arr_d,arr,size);
    return sqrt(variance);
}

// Builds a random vector of n ints, computes its minimum, maximum, sum,
// average, variance and standard deviation through the GPU helper functions
// and prints the results. Returns 1 if the host or device buffer cannot be
// allocated, 0 otherwise.
int main() {

    //Host Variable
    int *arr;

    //Device Variable
    int *arr_d;

    int size = n*sizeof(int);

    //Allocate Memory to Host Variable
    arr = (int *)malloc(size);
    if(arr == NULL) {
        cerr<<"main: failed to allocate host vector"<<endl;
        return 1;
    }

    initialize_vector(arr, n);
    cout<<endl<<"Vector - ";
    display_vector(arr, n);

    //Allocate Memory to Device Variable
    cudaError_t status = cudaMalloc((void **)&arr_d, size);
    if(status != cudaSuccess) {
        cerr<<endl<<"main: cudaMalloc failed - "<<cudaGetErrorString(status)<<endl;
        free(arr);
        return 1;
    }

    //Host Function Calls - They Launch the Kernel
    //(locals are named so they do not shadow the sum kernel or std::min/max)
    int minValue = getMinimum(arr_d, arr, size);
    int maxValue = getMaximum(arr_d, arr, size);
    int total = getSum(arr_d, arr, size);
    float avg = getAverage(arr_d, arr, size);
    float var = getVariance(arr_d, arr, size);
    float stddev = getStdDeviation(arr_d, arr, size);

    //Output to the console
    cout<<endl<<"\nMinimum - "<<minValue;
    cout<<endl<<"Maximum - "<<maxValue;
    cout<<endl<<"Sum - "<<total;
    cout<<endl<<"Average - "<<avg;
    cout<<endl<<"Variance - "<<var;
    cout<<endl<<"Standard Deviation - "<<stddev;

    //Freeing space
    free(arr);
    cudaFree(arr_d);

    return 0;
}


Vector - [383, 386, 277, 415, 293, 335, 386, 492]

Minimum - 277
Maximum - 492
Sum - 2967
Average - 370.875
Variance - 4173.36
Standard Deviation - 64.6015


## Modification - Displaying Thread IDs

In [None]:
%%cu
#include<iostream>
#include<math.h>

#define n 8

using namespace std;

//Kernel Functions

// In-place tree reduction that leaves the minimum of the array in input[0].
// Every thread first prints its threadIdx.x for tracing.
//
// Launch layout: one block; only threadIdx.x is used for indexing.
// Preconditions: input holds at least 2 * blockDim.x ints and blockDim.x is a
// power of two (threadCount is halved with integer division every pass, so an
// odd intermediate count would skip elements).
// The array is overwritten with partial results.
__global__ void minimum(int *input) {

    int threadId = threadIdx.x;
    int stepSize = 1;
    int threadCount = blockDim.x;

    printf("\nMin : %d ",threadId);

    while(threadCount>0) {
        if(threadId < threadCount) {
            int first = threadId*stepSize*2;
            int second = first + stepSize;
            //Modify Array In Place
            if(input[second] < input[first])
              input[first] = input[second];
        }
        // The next pass reads slots written by other threads in this pass.
        // The barrier sits outside the if so that every thread reaches it.
        __syncthreads();
        stepSize *=2;
        threadCount /= 2;
    }
}

// In-place tree reduction that leaves the maximum of the array in input[0].
// Every thread first prints its threadIdx.x for tracing.
//
// Launch layout: one block; only threadIdx.x is used for indexing.
// Preconditions: input holds at least 2 * blockDim.x ints and blockDim.x is a
// power of two (threadCount is halved every pass, so an odd intermediate
// count would skip elements).
// The array is overwritten with partial results.
__global__ void maximum(int *input) {
    int threadId = threadIdx.x;
    int stepSize = 1;
    int threadCount = blockDim.x;

    printf("\nMax : %d ",threadId);

    //Array is updated Inplace
    while(threadCount>0) {
        if(threadId < threadCount) {
            int first = threadId*stepSize*2;
            int second = first + stepSize;
            //Modify Array In Place
            if(input[second] > input[first])
              input[first] = input[second];
        }
        // The next pass reads slots written by other threads in this pass.
        // The barrier sits outside the if so that every thread reaches it.
        __syncthreads();
        stepSize <<= 1;
        threadCount >>= 1;
    }
}

// In-place tree reduction that leaves the sum of the int array in input[0].
// Every thread first prints its threadIdx.x for tracing.
//
// Launch layout: one block; only threadIdx.x is used for indexing.
// Preconditions: input holds at least 2 * blockDim.x ints and blockDim.x is a
// power of two (threadCount is halved every pass, so an odd intermediate
// count would skip elements).
// The array is overwritten with partial sums.
__global__ void sum(int *input) {
    const int threadId = threadIdx.x;
    int stepSize = 1;
    int threadCount = blockDim.x;

    printf("\nSum : %d ",threadId);

    while(threadCount > 0) {
        if(threadId < threadCount) {
            int first = threadId * stepSize * 2;
            int second = first + stepSize;
            //Modify Array In Place
            input[first] += input[second];
        }
        // The next pass adds partial sums written by other threads in this
        // pass. The barrier sits outside the if so that every thread reaches it.
        __syncthreads();
        stepSize <<= 1;
        threadCount >>= 1;
    }
}

// Replaces the element owned by each thread with its squared deviation from
// `mean`: input[i] = (input[i] - mean)^2 with i = threadIdx.x, then prints
// the thread index for tracing.
// There is no bounds check, so the launch must not use more threads than the
// array has elements.
__global__ void meanSquared(float *input, float mean) {
    const unsigned int idx = threadIdx.x;
    const float deviation = input[idx] - mean;
    input[idx] = deviation * deviation;

    printf("\nmean squared : %d ",idx);
}

// In-place tree reduction that leaves the sum of the float array in input[0].
// Every thread first prints its threadIdx.x for tracing.
//
// Launch layout: one block; only threadIdx.x is used for indexing.
// Preconditions: input holds at least 2 * blockDim.x floats and blockDim.x is
// a power of two (threadCount is halved every pass, so an odd intermediate
// count would skip elements).
// The array is overwritten with partial sums.
__global__ void sum(float *input) {
    int threadId = threadIdx.x;
    int stepSize = 1;
    int threadCount = blockDim.x;

    printf("\nSum : %d ",threadId);

    while(threadCount > 0) {
        if(threadId < threadCount) {
            int first = threadId * stepSize * 2;
            int second = first + stepSize;
            //Modify array in place
            input[first] += input[second];
        }
        // The next pass adds partial sums written by other threads in this
        // pass. The barrier sits outside the if so that every thread reaches it.
        __syncthreads();
        stepSize <<= 1;
        threadCount >>= 1;
    }
}

//Host Functions

// Fills the first `size` entries of input with pseudo-random values in the
// range [0, 499]. rand() is not seeded here, so the sequence depends on
// whatever seed the caller has (or has not) set.
void initialize_vector(int *input, int size) {
    int idx = 0;
    while(idx < size) {
        input[idx] = rand() % 500;
        ++idx;
    }
}

// Writes the first `size` elements of input to cout as "[a, b, c]".
// An empty vector (size == 0) is printed as "[]". No newline is emitted.
void display_vector(int *input, int size) {
    cout<<"[";
    if(size != 0) {
        // First element has no leading separator
        cout<<input[0];
        for(int idx=1; idx<size; ++idx)
            cout<<", "<<input[idx];
    }
    cout<<"]";
}

// Computes the minimum of a host vector on the GPU, printing a trace header
// first.
//   arr_d : device buffer of at least `size` bytes; it is overwritten
//   arr   : host vector to reduce
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The element count must be a power of two >= 2, because the kernel is
// launched with one thread per pair of elements and halves the active thread
// count every pass.
// Returns the smallest element. A CUDA failure is reported on cerr and
// terminates the program instead of returning an uninitialised value.
int getMinimum(int* arr_d, int* arr, int size){
    cout<<"\nMinimum Function";
    const int count = size / (int)sizeof(int);
    int result = 0;

    cudaError_t status = cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice);
    if(status == cudaSuccess) {
        minimum<<<1,count/2>>>(arr_d);
        //Launch errors (e.g. an invalid block size) are only visible here
        status = cudaGetLastError();
    }
    if(status == cudaSuccess) {
        //Copying  Just the first Element of the array
        status = cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost);
    }
    if(status != cudaSuccess) {
        cerr<<"\ngetMinimum: CUDA error - "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
    return result;
}

// Computes the maximum of a host vector on the GPU, printing a trace header
// first.
//   arr_d : device buffer of at least `size` bytes; it is overwritten
//   arr   : host vector to reduce
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The element count must be a power of two >= 2, because the kernel is
// launched with one thread per pair of elements and halves the active thread
// count every pass.
// Returns the largest element. A CUDA failure is reported on cerr and
// terminates the program instead of returning an uninitialised value.
int getMaximum(int* arr_d, int* arr, int size){
    cout<<"\nMaximum Function";
    const int count = size / (int)sizeof(int);
    int result = 0;

    cudaError_t status = cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice);
    if(status == cudaSuccess) {
        maximum<<<1,count/2>>>(arr_d);
        //Launch errors (e.g. an invalid block size) are only visible here
        status = cudaGetLastError();
    }
    if(status == cudaSuccess) {
        //Copying  Just the first Element of the array
        status = cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost);
    }
    if(status != cudaSuccess) {
        cerr<<"\ngetMaximum: CUDA error - "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
    return result;
}

// Computes the sum of a host vector on the GPU, printing a trace header first.
//   arr_d : device buffer of at least `size` bytes; it is overwritten
//   arr   : host vector to reduce
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The element count must be a power of two >= 2, because the kernel is
// launched with one thread per pair of elements and halves the active thread
// count every pass.
// Returns the sum of all elements. A CUDA failure is reported on cerr and
// terminates the program instead of returning an uninitialised value.
int getSum(int* arr_d, int* arr, int size){
    cout<<"\nSum Function";
    const int count = size / (int)sizeof(int);
    int result = 0;

    cudaError_t status = cudaMemcpy(arr_d, arr, size, cudaMemcpyHostToDevice);
    if(status == cudaSuccess) {
        sum<<<1,count/2>>>(arr_d);
        //Launch errors (e.g. an invalid block size) are only visible here
        status = cudaGetLastError();
    }
    if(status == cudaSuccess) {
        //Copying  Just the first Element of the array
        status = cudaMemcpy(&result, arr_d, sizeof(int), cudaMemcpyDeviceToHost);
    }
    if(status != cudaSuccess) {
        cerr<<"\ngetSum: CUDA error - "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
    return result;
}

// Returns the arithmetic mean of the host vector `arr`, printing a trace
// header first.
//   arr_d : device scratch buffer forwarded to getSum
//   arr   : host vector
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The divisor is derived from `size` rather than from the global vector
// length, so the result stays correct for any vector the caller passes in.
float getAverage(int* arr_d, int* arr, int size){
    cout<<"\nAverage Function";
    const int count = size / (int)sizeof(int);
    return float(getSum(arr_d,arr,size))/count;
}

// Returns the population variance (mean of squared deviations) of `arr`,
// printing a trace header first.
//   arr_d : device scratch buffer forwarded to getAverage
//   arr   : host vector
//   size  : size of the vector in BYTES (element count * sizeof(int))
// The element count must be a power of two >= 2 (requirement of the tree
// reduction kernels). A temporary float copy of the vector is created on the
// host and on the device; both are released before returning.
// A host allocation or CUDA failure is reported on cerr and terminates the
// program.
float getVariance(int* arr_d, int* arr, int size){
    cout<<"\nVariance Function";

    const int count = size / (int)sizeof(int);
    const size_t floatBytes = count * sizeof(float);

    float mean = getAverage(arr_d, arr, size);

    //Host side float copy of the input
    float *arr_float = (float *)malloc(floatBytes);
    if(arr_float == NULL) {
        cerr<<"\ngetVariance: host allocation failed"<<endl;
        exit(EXIT_FAILURE);
    }
    for(int i=0; i<count; i++)
        arr_float[i] = float(arr[i]);

    float *arr_std = NULL;
    float result = 0.0f;

    cudaError_t status = cudaMalloc((void **)&arr_std, floatBytes);
    if(status == cudaSuccess)
        status = cudaMemcpy(arr_std, arr_float, floatBytes, cudaMemcpyHostToDevice);

    //The host copy is no longer needed once it has been uploaded
    free(arr_float);

    if(status == cudaSuccess) {
        //Mean Squared - stays on the device, no copy back is needed
        meanSquared <<<1,count>>>(arr_std, mean);
        status = cudaGetLastError();
    }
    if(status == cudaSuccess) {
        //Add Mean Squared of all the elements
        sum<<<1,count/2>>>(arr_std);
        status = cudaGetLastError();
    }
    if(status == cudaSuccess)
        status = cudaMemcpy(&result, arr_std, sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(arr_std);

    if(status != cudaSuccess) {
        cerr<<"\ngetVariance: CUDA error - "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }

    return result / count;

}

// Prints a trace header, then returns the standard deviation of `arr`, i.e.
// the square root of the value produced by getVariance for the same arguments.
float getStdDeviation(int* arr_d, int* arr, int size){
    cout<<"\nStd Deviation Function";
    const float variance = getVariance(arr_d,arr,size);
    return sqrt(variance);
}

// Builds a random vector of n ints and runs every GPU helper on it so that
// the thread-id traces printed by the kernels can be inspected. The computed
// statistics themselves are not printed in this variant, so the return values
// are deliberately discarded. Returns 1 if the host or device buffer cannot
// be allocated, 0 otherwise.
int main() {

    //Host Variable
    int *arr;

    //Device Variable
    int *arr_d;

    int size = n*sizeof(int);

    //Allocate Memory to Host Variable
    arr = (int *)malloc(size);
    if(arr == NULL) {
        cerr<<"main: failed to allocate host vector"<<endl;
        return 1;
    }

    initialize_vector(arr, n);

    //Allocate Memory to Device Variable
    cudaError_t status = cudaMalloc((void **)&arr_d, size);
    if(status != cudaSuccess) {
        cerr<<"\nmain: cudaMalloc failed - "<<cudaGetErrorString(status)<<endl;
        free(arr);
        return 1;
    }

    //Host Function Calls - They Launch the Kernel
    cout<<"\n===========";
    (void)getMinimum(arr_d, arr, size);
    cout<<"\n===========";
    (void)getMaximum(arr_d, arr, size);
    cout<<"\n===========";
    (void)getSum(arr_d, arr, size);
    cout<<"\n===========";
    (void)getAverage(arr_d, arr, size);
    cout<<"\n===========";
    (void)getVariance(arr_d, arr, size);
    cout<<"\n===========";
    (void)getStdDeviation(arr_d, arr, size);
    cout<<"\n===========";

    //Freeing space
    free(arr);
    cudaFree(arr_d);

    return 0;
}


Minimum Function
Min : 0 
Min : 1 
Min : 2 
Min : 3 
Maximum Function
Max : 0 
Max : 1 
Max : 2 
Max : 3 
Sum Function
Sum : 0 
Sum : 1 
Sum : 2 
Sum : 3 
Average Function
Sum Function
Sum : 0 
Sum : 1 
Sum : 2 
Sum : 3 
Variance Function
Average Function
Sum Function
Sum : 0 
Sum : 1 
Sum : 2 
Sum : 3 
mean squared : 0 
mean squared : 1 
mean squared : 2 
mean squared : 3 
mean squared : 4 
mean squared : 5 
mean squared : 6 
mean squared : 7 
Sum : 0 
Sum : 1 
Sum : 2 
Sum : 3 
Std Deviation Function
Variance Function
Average Function
Sum Function
Sum : 0 
Sum : 1 
Sum : 2 
Sum : 3 
mean squared : 0 
mean squared : 1 
mean squared : 2 
mean squared : 3 
mean squared : 4 
mean squared : 5 
mean squared : 6 
mean squared : 7 
Sum : 0 
Sum : 1 
Sum : 2 
Sum : 3 
