In [2]:
%%writefile common.h

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <cstring>
#include <time.h>
#include <stdlib.h>
#include <utime.h>
#include <fstream>

#define HANDLE_NULL( a ){if (a == NULL) { \
                            printf( "Host memory failed in %s at line %d\n", \
                                    __FILE__, __LINE__ ); \
                            exit( EXIT_FAILURE );}}

enum INIT_PARAM{
	INIT_ZERO,INIT_RANDOM,INIT_ONE,INIT_ONE_TO_TEN,INIT_FOR_SPARSE_METRICS,INIT_0_TO_X
};

//simple initialization
void initialize(int * input, const int array_size,
	INIT_PARAM PARAM = INIT_ONE_TO_TEN, int x = 0);

void initialize(float * input, const int array_size,
	INIT_PARAM PARAM = INIT_ONE_TO_TEN);

void launch_dummmy_kernel();

//compare two arrays
void compare_arrays(int * a, int * b, int size);

//reduction in cpu
int reduction_cpu(int * input, const int size);

//compare results
void compare_results(int gpu_result, int cpu_result);

//print array
void print_array(int * input, const int array_size);

//print array
void print_array(float * input, const int array_size);

//print matrix
void print_matrix(int * matrix, int nx, int ny);

void print_matrix(float * matrix, int nx, int ny);

//get matrix
int* get_matrix(int rows, int columns);

//matrix transpose in CPU
void mat_transpose_cpu(int * mat, int * transpose, int nx, int ny);

//print_time_using_host_clock
void print_time_using_host_clock(clock_t start, clock_t end);

void printData(char *msg, int *in, const int size);

void compare_arrays(float * a, float * b, float size);

void sum_array_cpu(float* a, float* b, float *c, int size);

void print_arrays_toafile(int*, int , char* );

void print_arrays_toafile_side_by_side(float*,float*,int,char*);

void print_arrays_toafile_side_by_side(int*, int*, int, char*);

#endif // !COMMON_H

Writing common.h


In [3]:
%%writefile cuda_common.cuh

#ifndef CUDA_COMMON_H
#define CUDA_COMMON_H

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}

__global__ void scan_efficient_1G(int * input, int* auxiliry_array, int input_size);
__global__ void scan_summation(int * input, int * auxiliry_array, int input_size);

#endif // !CUDA_COMMON_H

//void query_device();

Writing cuda_common.cuh


In [4]:
%%writefile scan.cuh

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "common.h"
#include "cuda_common.cuh"

//inclusive scan sequential implementation
void scan_inclusive_cpu(float*,float*, int);


//inclusive scan parallel inefficient implementation
__global__ void scan_inclusive_gpu(float*, float*, int );

Writing scan.cuh


In [7]:
%%writefile one.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define BLOCK_SIZE 512

// GPU error checking macro
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Initialize array with ones
void initialize(int *input, const int array_size) {
    for (int i = 0; i < array_size; i++) {
        input[i] = 1;
    }
}

// Compare two arrays
void compare_arrays(int *a, int *b, int size) {
    bool match = true;
    for (int i = 0; i < size; i++) {
        if (a[i] != b[i]) {
            printf("Arrays don't match at index %d: a[%d]=%d, b[%d]=%d\n", i, i, a[i], i, b[i]);
            match = false;
            if (i > 10) break; // Don't spam too many differences
        }
    }
    if (match) {
        printf("Arrays match!\n");
    }
}

// Print array (for debugging)
void print_array(int *input, const int array_size) {
    printf("Array: ");
    for (int i = 0; i < array_size && i < 20; i++) { // Limit output
        printf("%d ", input[i]);
    }
    if (array_size > 20) printf("...");
    printf("\n");
}

// CPU inclusive scan implementation
void inclusive_scan_cpu(int *input, int *output, int size) {
    output[0] = input[0];
    for (int i = 1; i < size; i++) {
        output[i] = output[i-1] + input[i];
    }
}

// Naive GPU inclusive scan (has race conditions - for demonstration only)
__global__ void naive_inclusive_scan(int *input, int size) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;

    if (gid < size) {
        for (int offset = 1; offset <= gid; offset *= 2) {
            if (gid >= offset) {
                input[gid] += input[gid - offset];
            }
            __syncthreads();
        }
    }
}

// Better GPU inclusive scan using shared memory
__global__ void inclusive_scan_gpu(int *input, int *output, int size) {
    extern __shared__ int temp[];

    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Load input into shared memory
    if (gid < size) {
        temp[tid] = input[gid];
    } else {
        temp[tid] = 0;
    }
    __syncthreads();

    // Perform scan in shared memory
    for (int offset = 1; offset < blockDim.x; offset *= 2) {
        int temp_val = 0;
        if (tid >= offset && tid < blockDim.x) {
            temp_val = temp[tid - offset];
        }
        __syncthreads();

        if (tid >= offset && tid < blockDim.x) {
            temp[tid] += temp_val;
        }
        __syncthreads();
    }

    // Write result to output
    if (gid < size) {
        output[gid] = temp[tid];
    }
}

int main(int argc, char**argv)
{
    printf("Scan algorithm execution started\n");

    int input_size = 1 << 10; // Default 1024 elements

    if (argc > 1) {
        input_size = 1 << atoi(argv[1]);
    }

    printf("Input size: %d elements\n", input_size);

    const int byte_size = sizeof(int) * input_size;

    // Host memory allocation
    int *h_input = (int*)malloc(byte_size);
    int *h_output_cpu = (int*)malloc(byte_size);
    int *h_output_gpu = (int*)malloc(byte_size);

    if (!h_input || !h_output_cpu || !h_output_gpu) {
        printf("Host memory allocation failed\n");
        return -1;
    }

    // Initialize input array
    initialize(h_input, input_size);

    printf("Sample input: ");
    print_array(h_input, 10);

    // CPU scan
    clock_t cpu_start = clock();
    inclusive_scan_cpu(h_input, h_output_cpu, input_size);
    clock_t cpu_end = clock();

    printf("CPU scan completed in %f ms\n",
           ((double)(cpu_end - cpu_start) / CLOCKS_PER_SEC) * 1000);

    // Device memory allocation
    int *d_input, *d_output;
    gpuErrchk(cudaMalloc((void**)&d_input, byte_size));
    gpuErrchk(cudaMalloc((void**)&d_output, byte_size));

    // Copy input to device
    gpuErrchk(cudaMemcpy(d_input, h_input, byte_size, cudaMemcpyHostToDevice));

    // Launch GPU kernel
    dim3 block(BLOCK_SIZE);
    dim3 grid((input_size + block.x - 1) / block.x);

    printf("Grid size: %d, Block size: %d\n", grid.x, block.x);

    // Create CUDA events for timing
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    gpuErrchk(cudaEventRecord(start));

    // Launch kernel with shared memory
    inclusive_scan_gpu<<<grid, block, BLOCK_SIZE * sizeof(int)>>>(d_input, d_output, input_size);

    gpuErrchk(cudaEventRecord(stop));
    gpuErrchk(cudaEventSynchronize(stop));

    // Check for kernel launch errors
    gpuErrchk(cudaGetLastError());

    float gpu_time;
    gpuErrchk(cudaEventElapsedTime(&gpu_time, start, stop));
    printf("GPU scan completed in %f ms\n", gpu_time);

    // Copy result back to host
    gpuErrchk(cudaMemcpy(h_output_gpu, d_output, byte_size, cudaMemcpyDeviceToHost));

    // Compare results
    printf("Comparing CPU and GPU results...\n");
    compare_arrays(h_output_cpu, h_output_gpu, input_size);

    printf("Sample CPU output: ");
    print_array(h_output_cpu, 10);
    printf("Sample GPU output: ");
    print_array(h_output_gpu, 10);

    // Cleanup
    free(h_input);
    free(h_output_cpu);
    free(h_output_gpu);
    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output));
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaDeviceReset());

    printf("Execution completed successfully!\n");
    return 0;
}

Overwriting one.cu


In [8]:
!nvcc -arch=sm_75 one.cu -o one

In [10]:
!./one

Scan algorithm execution started
Input size: 1024 elements
Sample input: Array: 1 1 1 1 1 1 1 1 1 1 
CPU scan completed in 0.005000 ms
Grid size: 2, Block size: 512
GPU scan completed in 0.141312 ms
Comparing CPU and GPU results...
Arrays don't match at index 512: a[512]=513, b[512]=1
Sample CPU output: Array: 1 2 3 4 5 6 7 8 9 10 
Sample GPU output: Array: 1 2 3 4 5 6 7 8 9 10 
Execution completed successfully!
