Skip to content

Commit

Permalink
bench: added bsort dataflow microbenchmarks (modular and handcoded)
Browse files Browse the repository at this point in the history
  • Loading branch information
Shinpei Kato committed Mar 6, 2012
1 parent c9d6b44 commit 42914f1
Show file tree
Hide file tree
Showing 18 changed files with 1,679 additions and 2 deletions.
266 changes: 266 additions & 0 deletions bench/dataflow/bsort_handcoded/D_bitonic_sort.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
/*****************************************************************************
* Name : D_bitonic_sort.c
* Author : Jason Aumiller
* Version :
* Copyright :
* Description : Implements bitonic sort using CUDA driver API
* See D_bitonic_sort.h for configuration specific information.
/*****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <cuda.h>
#include <sys/time.h>

#include "D_bitonic_sort.h"

/* Reads integers from stdin, one per line. */
int readIntegers(int *ray, uint array_size) {
int i;
char str_buf[32];
uint ints_read = 0;

// Read from stdin
while (fgets(str_buf, sizeof(str_buf), stdin) != 0) {
i=atoi(str_buf);
ray[ints_read++] = i;

// Double if necessary
if ( ints_read >= array_size ) {
array_size = array_size << 1;
ray = (int*) realloc (ray, array_size * sizeof(int) );
}
}
return ints_read;
}

/* Verifies that the elements in the array are sorted. */
int verifySort(int *ray, uint array_size) {
uint errors=0;
int i;
for (i=0; i<array_size-1; ++i) {
if (ray[i] > ray[i+1]) {
++errors;
}
}
return errors;
}


void print_usage(){
printf("Usage: \n\tbitonicSort [-vO] -i | size | -p pow \n\n");
printf("Generates an array of 'size' integers (or 2^'pow') and sorts them. If the -i option is used \n");
printf("then the array is read from stdin.\n");
printf("\nOptions \n");
printf("\t -i \t Read input from stdin until EOF.\n");
printf("\t -O \t Print sorted list to stdout.\n");
printf("\t -p \t Specify 'size' as a power of 2.\n");
printf("\t -v \t Verbose.\n\n");
}

/* Generate array of random integers */
int randomInts(int *ray, uint array_size) {
int i;
srand(time(NULL));
for (i=0; i<array_size; ++i) {
ray[i] = rand() % 65536;
}
}

int main( int argc, char** argv)
{
uint num_threads;
uint num_blocks, block_size;
uint length;
uint nBytes;
int *list;
int status, verbose, c, i, j, logBlocks;
int read_stdin;
struct timeval start_time, end_time;
unsigned long total_time;
CUdevice hDevice;
CUcontext hContext;
CUmodule hModule;
CUfunction bitonicBlockFn;
CUfunction mergeBlocksFn;
CUdeviceptr pDeviceArrayA;
CUdeviceptr pDeviceArrayB;

status = SUCCESS;
verbose = 0;
read_stdin = FALSE;
length = 0;

while ((c = getopt (argc, argv, "dip:vO")) != -1) {
switch (c) {
case 'd':
verbose |= GROSS_DEBUG;
break;
case 'i':
read_stdin = TRUE;
case 'O':
verbose |= OUTPUT;
break;
case 'p':
length = 1 << atoi(optarg);
break;
case 'v':
verbose |= DEBUG;
break;
case '?':
default:
print_usage();
return FAILURE;
}
}

if ( read_stdin == TRUE ) {
/* Read sequence of integers from stdin */
list = (int*) malloc (INIT_INPUT_SIZE * sizeof(int) );
length = readIntegers(list, INIT_INPUT_SIZE);
} else if ( length > 0 ) {
list = (int*) malloc (length * sizeof(int) );
randomInts(list, length);
} else if (optind >= argc) { /* No size was given */
print_usage();
return FAILURE;
} else {
/* Generate our own integers */
length = atoi(argv[optind]);
list = (int*) malloc (length * sizeof(int) );
randomInts(list, length);
}

/*
* Phase 1:
* There will be one thread for each element to be sorted. Each
* block will perform bitonic sort on MAX_THREADS_PER_BLOCK elements.
*/

/* Initialize sizes */
num_threads = _min(length, MAX_THREADS_PER_BLOCK );
num_blocks = (length-1) / MAX_THREADS_PER_BLOCK + 1;
nBytes = length * sizeof(int);

if (verbose & DEBUG) printf("Initializing GPU.\n");

/* Start timing */
gettimeofday(&start_time, NULL);

/* Initialize GPU */
cutilDrvSafeCall( cuInit(0) );
cutilDrvSafeCall( cuDeviceGet(&hDevice, 0) );
cutilDrvSafeCall( cuCtxCreate(&hContext, 0, hDevice) );
cutilDrvSafeCall( cuModuleLoad(&hModule, MODULE_FILE) );
cutilDrvSafeCall( cuModuleGetFunction(&bitonicBlockFn, hModule, BITONIC_BLOCK_FN) );

/* Allocate memory on the device */
cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayA, nBytes) );
cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayB, nBytes) );
cutilDrvSafeCall( cuMemcpyHtoD(pDeviceArrayA, list, nBytes) );
cutilDrvSafeCall( cuFuncSetBlockShape(bitonicBlockFn, num_threads, 1, 1));
cutilDrvSafeCall( cuParamSeti(bitonicBlockFn, 0, pDeviceArrayA) );
cutilDrvSafeCall( cuParamSetSize(bitonicBlockFn, 4) );

/* Execute the kernel on the GPU */
if ( verbose & DEBUG ) printf("Launching bitonic sort kernel with %d blocks and %d threads per block.\n", num_blocks, num_threads);
cutilDrvSafeCall( cuLaunchGrid(bitonicBlockFn, num_blocks, 1) );

/*
* Phase 2:
* At this point each block is a sorted list. Now it's time to merge them.
*/

/* TODO This should go away after development */
if ( verbose & GROSS_DEBUG ) {
cuMemcpyDtoH(list, pDeviceArrayA, nBytes);
for (i=0; i<num_blocks; ++i) {
printf("### Block %d:\n", i);
for (j=0; j<num_threads; ++j) {
printf("%d\n", list[i*num_threads + j]);
}
}
}

i=0;

/* Do we need to merge blocks? */
if ( num_blocks > 1 ) {

/* There will be Log_2(num_blocks) merge steps. */
logBlocks = 0;
for (i=1; i<num_blocks; i *= 2) ++logBlocks;

if ( verbose & DEBUG ) printf("There will be %d merge steps.\n", logBlocks);

block_size = num_threads; /* How big the blocks were in the last grid launch. */
num_threads = num_blocks >> 1; /* Start with blocks/2 threads */
num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK + 1;

cutilDrvSafeCall( cuModuleGetFunction(&mergeBlocksFn, hModule, MERGE_BLOCKS_FN) );
cuParamSeti(mergeBlocksFn, 4, block_size);
cuParamSetSize(mergeBlocksFn, 16);

for (i=0; i < logBlocks; ++i) {
cuFuncSetBlockShape(mergeBlocksFn, num_threads, 1, 1);
cuParamSeti(mergeBlocksFn, 0, i); /* set merge level */

/* Merging uses a source array and destination array, the gpu has 2 arrays allocated
* so we swap which is the source and which is the destination for each iteration. */
if ( i%2 == 0 ) {
cuParamSeti(mergeBlocksFn, 8, pDeviceArrayA);
cuParamSeti(mergeBlocksFn, 12, pDeviceArrayB);
} else {
cuParamSeti(mergeBlocksFn, 8, pDeviceArrayB);
cuParamSeti(mergeBlocksFn, 12, pDeviceArrayA);
}

if ( verbose & DEBUG ) {
printf("Launching block merge kernel with %d blocks and %d threads per block\n",
num_blocks, num_threads/num_blocks);
}
cutilDrvSafeCall( cuLaunchGrid(mergeBlocksFn, num_blocks, 1) );

num_threads = num_threads >> 1;
num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK + 1;
}
}

/* Determine if answer is in pDeviceArrayA or pDeviceArrayB and copy to host */
if (i%2 == 0) cutilDrvSafeCall( cuMemcpyDtoH(list, pDeviceArrayA, nBytes) );
else cutilDrvSafeCall( cuMemcpyDtoH(list, pDeviceArrayB, nBytes) );

cutilDrvSafeCall( cuMemFree(pDeviceArrayA) );
cutilDrvSafeCall( cuMemFree(pDeviceArrayB) );
cutilDrvSafeCall( cuModuleUnload(hModule) );
cutilDrvSafeCall( cuCtxDestroy(hContext) );

/* Stop timing */
gettimeofday(&end_time, NULL);

total_time = ( (unsigned long)end_time.tv_sec * 1000000 + (unsigned long)end_time.tv_usec )
- ( (unsigned long)start_time.tv_sec * 1000000 + (unsigned long)start_time.tv_usec ) ;

printf("GPU total running time: %lu microseconds.\n", total_time);

if ( verbose & DEBUG ) printf("Verifying sort order.\n");

j = verifySort(list,length);
if ( j != 0 ){
fprintf(stderr, "***\n*** List has %d elements out of order!\n***\n", j);
status=FAILURE;
} else if (verbose & DEBUG) printf("List is sorted.\n");

if ( verbose & OUTPUT ) {
for (i=0; i < length; ++i) {
printf("%d\n", list[i]);
}
}

free(list);

return status;
}
30 changes: 30 additions & 0 deletions bench/dataflow/bsort_handcoded/D_bitonic_sort.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*****************************************************************************
* Name : D_bitonic_sort.h
* Author : Jason Aumiller
* Version :
* Copyright :
* Description :
/*****************************************************************************/


#include "myutil.h"

#define MODULE_FILE "D_bitonic_sort_K.cubin"
#define BITONIC_BLOCK_FN "_Z16D_bitonic_sort_KPi"
#define MERGE_BLOCKS_FN "_Z16D_merge_blocks_KjjPiS_"
#define uint unsigned int

#define MAX_THREADS_PER_BLOCK 1024
#define INIT_INPUT_SIZE 64

#define SUCCESS 0
#define FAILURE -1
#define TRUE 1
#define FALSE 0
#define DEBUG 0x1
#define OUTPUT 0x2
#define GROSS_DEBUG 0x4
#define _min(x1,x2) ((x1) > (x2) ? (x2):(x1))



67 changes: 67 additions & 0 deletions bench/dataflow/bsort_handcoded/D_bitonic_sort_K.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*****************************************************************************
* Name : D_bitonic_sort.h
* Author : Jason Aumiller
* Version :
* Copyright :
* Description : CUDA Kernel function for bitonic sort
/*****************************************************************************/


#include <stdio.h>


__device__ void swap(uint a, uint b, int *data){
uint temp = data[a];
data[a]=data[b];
data[b]=temp;
}

__global__ void D_bitonic_sort_K(int *data_to_sort)
{
uint desc; // bit position that determines sort order ascending/descending
uint stride; // distance to partner index
int thread = threadIdx.x;

const int idx = blockIdx.x * blockDim.x + threadIdx.x;

for (desc=2; desc<=blockDim.x; desc*=2) {
for (stride = desc>>1; stride>0; stride=stride>>1) {
int partner = idx ^ stride;
__syncthreads();
if (partner > idx) {
if ( (thread & desc) == 0 && data_to_sort[idx] > data_to_sort[partner] ) {
swap(idx,partner,data_to_sort);
} else if ( (thread & desc) != 0 && data_to_sort[idx]< data_to_sort[partner] ) {
swap(idx,partner,data_to_sort);
}
}
}
}
}

__global__ void D_merge_blocks_K(uint merge_level, uint block_size, int *data_in, int *data_out) {

//const uint block = (uint) blockIdx.x;

const uint start = block_size * ( threadIdx.x << (merge_level + 1) );
const uint range = block_size << merge_level;

int i;
int da, db, end;
da = start;
db = start + range;
end = start + 2*range;

for (i=start; i < end; ++i){
if (da >= start + range) {
data_out[i] = data_in[db++];
} else if (db >= end) {
data_out[i] = data_in[da++];
} else if ( data_in[da] <= data_in[db] ) {
data_out[i] = data_in[da++];
} else {
data_out[i] = data_in[db++];
}
}
}

25 changes: 25 additions & 0 deletions bench/dataflow/bsort_handcoded/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#######
#
# Author: Jason Aumiller
# Description: Makefile for bitonic sort micro benchmark
#
# nvcc gets the --ptxas-options=-v option so that the compiled function names
# in the .cu file are printed to stdout
#
#######

BIN = bitonicSort
CC = gcc
NVCC = nvcc -arch sm_20 -cubin --ptxas-options=-v
LIBS = -lgdev -lcuda -L /usr/local/gdev/lib64
CFLAGS = -I /usr/local/gdev/include -g

all:
$(NVCC) -o D_bitonic_sort_K.cubin D_bitonic_sort_K.cu
gcc -o $(BIN) $(CFLAGS) $(LIBS) D_bitonic_sort.c

clean:
rm -f $(BIN) *.cubin ./*~

again: clean all

Loading

0 comments on commit 42914f1

Please sign in to comment.