In [None]:
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update

In [None]:
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


In [4]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-qsy2u0t1
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-qsy2u0t1
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4305 sha256=b35e59df6dd39e1284169bbdb959e99d23802e34cd30fb4f0bb5178528465ca0
  Stored in directory: /tmp/pip-ephem-wheel-cache-8u46o_2y/wheels/c5/2b/c0/87008e795a14bbcdfc7c846a00d06981916331eb980b6c8bdf
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [5]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [6]:
%%cu

#include <iostream>
#include <math.h>
using namespace std;

__global__ void max(float *input)
{
    int tid = threadIdx.x;
    auto step = 1;
    int number_of_threads = blockDim.x;

    while (number_of_threads > 0)
    {
        if (tid < number_of_threads)
        {
            int first = tid * step * 2;
            int second = first + step;
            if (input[second] > input[first])
                input[first] = input[second];
        }
        step *= 2;
        number_of_threads /= 2;
    }
}

__global__ void min(float *input)
{
    int thread_id = threadIdx.x;
    auto step = 1;
    int number_of_threads = blockDim.x;
    while (number_of_threads > 0)
    {
        if (thread_id < number_of_threads)
        {
            int first = thread_id * step * 2;
            int second = first + step;
            if (input[second] < input[first])
                input[first] = input[second];
        }
        step = step * 2;
        number_of_threads /= 2;
    }
}

__global__ void sum(float *input)
{
    int tid = threadIdx.x;
    auto step = 1;
    int number_of_threads = blockDim.x;
    while (number_of_threads > 0)
    {
        if (tid < number_of_threads)
        {
            int first = tid * step * 2;
            int second = first + step;
            input[first] = input[first] + input[second];
        }
        step = step * 2;
        ;
        number_of_threads = number_of_threads / 2;
    }
}

__global__ void avg(float *input)
{
    const int tid = threadIdx.x;
    auto step = 1;
    int number_of_threads = blockDim.x;
    int totalElements = number_of_threads * 2;
    while (number_of_threads > 0)
    {
        if (tid < number_of_threads)
        {
            const int first = tid * step * 2;
            const int second = first + step;
            input[first] = input[first] + input[second];
        }
        step = step * 2;
        ;
        number_of_threads = number_of_threads / 2;
    }
    input[0] = input[0] / totalElements;
}

__global__ void mean_diff_sq(float *input, float mean)
{
    input[threadIdx.x] = input[threadIdx.x] - mean;
    input[threadIdx.x] = input[threadIdx.x] * input[threadIdx.x];
}

int main()
{

    //Creating an array arr of size 100 with random numbers
    int n = 100;
    float *arr = new float[n];
    int size = n * sizeof(float);
    cout << "Number of elements: " << n << endl;
    for (int i = 0; i < n; i++)
    {
        arr[i] = rand() % 1000;
        cout << arr[i] << " ";
    }
    cout << endl;

    //Calculating maximum number in array arr
    float *arr_max, result_max;
    cudaMalloc(&arr_max, size);
    cudaMemcpy(arr_max, arr, size, cudaMemcpyHostToDevice);
    //Launch max kernel on GPU with n/2 threads
    max<<<1, n / 2>>>(arr_max);
    cudaMemcpy(&result_max, arr_max, sizeof(float), cudaMemcpyDeviceToHost);
    cout << "\nThe maximum element is " << result_max << endl;

    //Calculating minimum number in array arr
    float *arr_min, result_min;
    cudaMalloc(&arr_min, size);
    cudaMemcpy(arr_min, arr, size, cudaMemcpyHostToDevice);
    //Launch min kernel on GPU with n/2 threads
    min<<<1, n / 2>>>(arr_min);
    cudaMemcpy(&result_min, arr_min, sizeof(float), cudaMemcpyDeviceToHost);
    cout << "The minimum element is " << result_min << endl;

    //Calculating sum of all numbers in array arr
    float *arr_sum, result_sum;
    cudaMalloc(&arr_sum, size);
    cudaMemcpy(arr_sum, arr, size, cudaMemcpyHostToDevice);
    //Launch sum kernel on GPU with n/2 threads
    sum<<<1, n / 2>>>(arr_sum);
    cudaMemcpy(&result_sum, arr_sum, sizeof(float), cudaMemcpyDeviceToHost);
    cout << "The sum of elements is " << result_sum << endl;

    //Calculating average of numbers in array arr
    float *arr_avg, result_avg;
    cudaMalloc(&arr_avg, size);
    cudaMemcpy(arr_avg, arr, size, cudaMemcpyHostToDevice);
    //Launch avg kernel on GPU with n/2 threads
    avg<<<1, n / 2>>>(arr_avg);
    cudaMemcpy(&result_avg, arr_avg, sizeof(float), cudaMemcpyDeviceToHost);
    cout << "The average of elements is " << result_avg << endl;

    //Calculating standard deviation of numbers in array arr
    float *arr_sd, result_variance, result_sd;
    cudaMalloc(&arr_sd, size);
    cudaMemcpy(arr_sd, arr, size, cudaMemcpyHostToDevice);
    mean_diff_sq<<<1, n>>>(arr_sd, result_avg);
    sum<<<1, n / 2>>>(arr_sd);
    cudaMemcpy(&result_variance, arr_sd, sizeof(float), cudaMemcpyDeviceToHost);
    result_variance = result_variance / n;
    result_sd = sqrt(result_variance);
    cout << "The standard deviation of elements is " << result_sd << endl;

    cudaFree(arr_min);
    cudaFree(arr_sum);
    cudaFree(arr_max);
    cudaFree(arr_avg);
    cudaFree(arr_sd);
    return 0;
}

Number of elements: 100
383 886 777 915 793 335 386 492 649 421 362 27 690 59 763 926 540 426 172 736 211 368 567 429 782 530 862 123 67 135 929 802 22 58 69 167 393 456 11 42 229 373 421 919 784 537 198 324 315 370 413 526 91 980 956 873 862 170 996 281 305 925 84 327 336 505 846 729 313 857 124 895 582 545 814 367 434 364 43 750 87 808 276 178 788 584 403 651 754 399 932 60 676 368 739 12 226 586 94 539 

The maximum element is 996
The minimum element is 11
The sum of elements is 30020
The average of elements is 135.267
The standard deviation of elements is 360.669

