In [None]:
%%writefile assign3.cu
#include <climits>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda_runtime.h>

#define ARRAY_SIZE 10000
#define BLOCK_SIZE 256

// CUDA kernel: per-block parallel reduction producing partial minimums.
//
// Every block folds the elements of arr[0..size) that its threads visit into
// one value and stores it in result[blockIdx.x]. The caller must combine the
// gridDim.x partial results to obtain the minimum of the whole array.
//
// Parameters:
//   arr    - device pointer to `size` input values (only read)
//   result - device pointer with room for at least gridDim.x ints
//   size   - number of valid elements in arr; a block that sees no element
//            writes INT_MAX
//
// Launch requirements: 1-D grid and block; blockDim.x must be a power of two
// and must not exceed BLOCK_SIZE, because the shared scratch array holds
// BLOCK_SIZE ints and the tree reduction halves the stride at every step.
__global__ void minReduction(int *arr, int *result, int size) {
    __shared__ int sharedArr[BLOCK_SIZE];
    const unsigned int tid = threadIdx.x;

    // Grid-stride pass: each thread folds every element it owns into a
    // register, so no element is skipped when the grid has fewer threads than
    // elements. 64-bit indices avoid overflow in blockIdx.x * blockDim.x.
    const long long count = size;
    const long long step = static_cast<long long>(gridDim.x) * blockDim.x;
    int best = INT_MAX;  // identity for min; also the value of idle threads
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + tid;
         i < count; i += step) {
        const int v = arr[i];
        if (v < best)
            best = v;
    }

    sharedArr[tid] = best;
    __syncthreads();

    // Tree reduction in shared memory. No bounds test against `size` is
    // needed: slots of threads past the end already hold the identity value.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride && sharedArr[tid + stride] < sharedArr[tid])
            sharedArr[tid] = sharedArr[tid + stride];
        __syncthreads();  // kept outside the branch so every thread reaches it
    }

    if (tid == 0)
        result[blockIdx.x] = sharedArr[0];
}

// CUDA kernel: per-block parallel reduction producing partial maximums.
//
// Every block folds the elements of arr[0..size) that its threads visit into
// one value and stores it in result[blockIdx.x]. The caller must combine the
// gridDim.x partial results to obtain the maximum of the whole array.
//
// Parameters:
//   arr    - device pointer to `size` input values (only read)
//   result - device pointer with room for at least gridDim.x ints
//   size   - number of valid elements in arr; a block that sees no element
//            writes INT_MIN
//
// Launch requirements: 1-D grid and block; blockDim.x must be a power of two
// and must not exceed BLOCK_SIZE, because the shared scratch array holds
// BLOCK_SIZE ints and the tree reduction halves the stride at every step.
__global__ void maxReduction(int *arr, int *result, int size) {
    __shared__ int sharedArr[BLOCK_SIZE];
    const unsigned int tid = threadIdx.x;

    // Grid-stride pass: each thread folds every element it owns into a
    // register, so no element is skipped when the grid has fewer threads than
    // elements. 64-bit indices avoid overflow in blockIdx.x * blockDim.x.
    const long long count = size;
    const long long step = static_cast<long long>(gridDim.x) * blockDim.x;
    int best = INT_MIN;  // identity for max; also the value of idle threads
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + tid;
         i < count; i += step) {
        const int v = arr[i];
        if (v > best)
            best = v;
    }

    sharedArr[tid] = best;
    __syncthreads();

    // Tree reduction in shared memory. No bounds test against `size` is
    // needed: slots of threads past the end already hold the identity value.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride && sharedArr[tid + stride] > sharedArr[tid])
            sharedArr[tid] = sharedArr[tid + stride];
        __syncthreads();  // kept outside the branch so every thread reaches it
    }

    if (tid == 0)
        result[blockIdx.x] = sharedArr[0];
}

// CUDA kernel: per-block parallel reduction producing partial sums.
//
// Every block adds up the elements of arr[0..size) that its threads visit and
// stores the total in result[blockIdx.x]. The caller must add the gridDim.x
// partial results to obtain the sum of the whole array.
//
// Parameters:
//   arr    - device pointer to `size` input values (only read)
//   result - device pointer with room for at least gridDim.x ints
//   size   - number of valid elements in arr; a block that sees no element
//            writes 0
//
// Partial sums are accumulated in int, so the values handled by one block
// must not overflow an int.
//
// Launch requirements: 1-D grid and block; blockDim.x must be a power of two
// and must not exceed BLOCK_SIZE, because the shared scratch array holds
// BLOCK_SIZE ints and the tree reduction halves the stride at every step.
__global__ void sumReduction(int *arr, int *result, int size) {
    __shared__ int sharedArr[BLOCK_SIZE];
    const unsigned int tid = threadIdx.x;

    // Grid-stride pass: each thread adds every element it owns into a
    // register, so no element is skipped when the grid has fewer threads than
    // elements. 64-bit indices avoid overflow in blockIdx.x * blockDim.x.
    const long long count = size;
    const long long step = static_cast<long long>(gridDim.x) * blockDim.x;
    int total = 0;  // identity for addition; also the value of idle threads
    for (long long i = static_cast<long long>(blockIdx.x) * blockDim.x + tid;
         i < count; i += step) {
        total += arr[i];
    }

    sharedArr[tid] = total;
    __syncthreads();

    // Tree reduction in shared memory. No bounds test against `size` is
    // needed: slots of threads past the end already hold zero.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride)
            sharedArr[tid] += sharedArr[tid + stride];
        __syncthreads();  // kept outside the branch so every thread reaches it
    }

    if (tid == 0)
        result[blockIdx.x] = sharedArr[0];
}

// Aborts the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Fills an array of ARRAY_SIZE random values in [0, 999], reduces it on the
// GPU and prints its minimum, maximum, sum and average.
//
// Each reduction kernel writes one partial result per block, so the device
// result buffer holds one int per block and the partials are combined on the
// host after every launch.
int main() {
    const int numBlocks = (ARRAY_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil-div
    const size_t arrBytes = ARRAY_SIZE * sizeof(int);
    const size_t partialBytes = numBlocks * sizeof(int);

    int *arr = new int[ARRAY_SIZE];
    int *partial = new int[numBlocks];  // host copy of the per-block results

    srand(static_cast<unsigned int>(time(NULL)));
    for (int i = 0; i < ARRAY_SIZE; ++i)
        arr[i] = rand() % 1000; // Generate random values between 0 and 999

    int *d_arr = NULL;
    int *d_result = NULL;
    checkCuda(cudaMalloc((void**)&d_arr, arrBytes), "cudaMalloc d_arr");
    // One slot per block: every block writes result[blockIdx.x].
    checkCuda(cudaMalloc((void**)&d_result, partialBytes), "cudaMalloc d_result");

    checkCuda(cudaMemcpy(d_arr, arr, arrBytes, cudaMemcpyHostToDevice),
              "copy input to device");

    // Finding minimum value
    minReduction<<<numBlocks, BLOCK_SIZE>>>(d_arr, d_result, ARRAY_SIZE);
    checkCuda(cudaGetLastError(), "minReduction launch");
    // The blocking copy also surfaces errors raised while the kernel ran.
    checkCuda(cudaMemcpy(partial, d_result, partialBytes, cudaMemcpyDeviceToHost),
              "copy min partials");
    int minVal = partial[0];
    for (int b = 1; b < numBlocks; ++b)
        if (partial[b] < minVal)
            minVal = partial[b];
    std::cout << "Minimum value: " << minVal << std::endl;

    // Finding maximum value
    maxReduction<<<numBlocks, BLOCK_SIZE>>>(d_arr, d_result, ARRAY_SIZE);
    checkCuda(cudaGetLastError(), "maxReduction launch");
    checkCuda(cudaMemcpy(partial, d_result, partialBytes, cudaMemcpyDeviceToHost),
              "copy max partials");
    int maxVal = partial[0];
    for (int b = 1; b < numBlocks; ++b)
        if (partial[b] > maxVal)
            maxVal = partial[b];
    std::cout << "Maximum value: " << maxVal << std::endl;

    // Finding sum
    sumReduction<<<numBlocks, BLOCK_SIZE>>>(d_arr, d_result, ARRAY_SIZE);
    checkCuda(cudaGetLastError(), "sumReduction launch");
    checkCuda(cudaMemcpy(partial, d_result, partialBytes, cudaMemcpyDeviceToHost),
              "copy sum partials");
    long long sumVal = 0;  // wider than the per-block partials to be safe
    for (int b = 0; b < numBlocks; ++b)
        sumVal += partial[b];
    std::cout << "Sum: " << sumVal << std::endl;

    // Finding average
    float avgVal = static_cast<float>(sumVal) / ARRAY_SIZE;
    std::cout << "Average: " << avgVal << std::endl;

    checkCuda(cudaFree(d_arr), "cudaFree d_arr");
    checkCuda(cudaFree(d_result), "cudaFree d_result");
    delete[] partial;
    delete[] arr;

    return 0;
}


Writing assign3.cu


In [None]:
%%script bash
nvcc assign3.cu -o assign3

In [None]:
!./assign3

Minimum value: 0
Maximum value: 987
Sum: 133897
Average: 13.3897
