diff --git a/paredge/Makefile b/paredge/Makefile new file mode 100755 index 0000000..68a5e50 --- /dev/null +++ b/paredge/Makefile @@ -0,0 +1,163 @@ +################################################################################ +# +# Copyright 1993-2013 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +include ./findcudalib.mk + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +CUDA_SAMPLES_PATH ?= $(CUDA_PATH)/samples +BOUNDARY_INC_PATH ?= -I/home/andmax/boundary/lib -I/home/andmax/boundary/src +BOUNDARY_LIB_PATH ?= -L/home/andmax/boundary/build/lib/util +ADDITIONAL_LIBS ?= -lutil + +# internal flags +NVCCFLAGS := -m${OS_SIZE} +CCFLAGS := +NVCCLDFLAGS := +LDFLAGS := + +# Extra user flags +EXTRA_NVCCFLAGS ?= +EXTRA_NVCCLDFLAGS ?= +EXTRA_LDFLAGS ?= +EXTRA_CCFLAGS ?= + +# OS-specific build flags +ifneq ($(DARWIN),) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(OS_ARCH) $(STDLIB) +else + ifeq ($(OS_ARCH),armv7l) + ifeq ($(abi),gnueabi) + CCFLAGS += -mfloat-abi=softfp + else + # default to gnueabihf + override abi := gnueabihf + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard + endif + endif +endif + +ifeq ($(ARMv7),1) +NVCCFLAGS += -target-cpu-arch ARM +ifneq ($(TARGET_FS),) +CCFLAGS += --sysroot=$(TARGET_FS) +LDFLAGS += --sysroot=$(TARGET_FS) +LDFLAGS += -rpath-link=$(TARGET_FS)/lib +LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib +LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-$(abi) +endif +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + TARGET := debug +else + TARGET := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(NVCCLDFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += 
$(EXTRA_NVCCLDFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I$(CUDA_SAMPLES_PATH)/common/inc $(BOUNDARY_INC_PATH) +LIBRARIES := $(BOUNDARY_LIB_PATH) + +################################################################################ + +# Makefile include to help find GL Libraries +EXEC ?= +include ./findgllib.mk + +# OpenGL specific libraries +ifneq ($(DARWIN),) + # Mac OSX specific libraries and paths to include + LIBRARIES += -L/System/Library/Frameworks/OpenGL.framework/Libraries + LIBRARIES += -lGL -lGLU $(CUDA_SAMPLES_PATH)/common/lib/darwin/libGLEW.a + ALL_LDFLAGS += -Xlinker -framework -Xlinker GLUT +else + LIBRARIES += -L$(CUDA_SAMPLES_PATH)/common/lib/$(OSLOWER)/$(OS_ARCH) $(GLLINK) + LIBRARIES += -lGL -lGLU -lX11 -lXi -lXmu -lglut -lGLEW $(ADDITIONAL_LIBS) +endif + +# CUDA code generation flags +ifneq ($(OS_ARCH),armv7l) +GENCODE_SM10 := -gencode arch=compute_10,code=sm_10 +endif +GENCODE_SM20 := -gencode arch=compute_20,code=sm_20 +GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 +GENCODE_SM35 := -gencode arch=compute_35,code=\"sm_35,compute_35\" +#GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30) $(GENCODE_SM35) +GENCODE_FLAGS := -arch=sm_30 + +################################################################################ + +# Target rules +all: build + +build: recursiveGaussian + +recursiveGaussian_cuda.o: recursiveGaussian_cuda.cu recursiveGaussian_kernel.cuh recursiveGaussian.h + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +recAlg_cuda.o: recAlg_cuda.cu recursiveGaussian.h alg4d_gpu.cuh alg5d_gpu.cuh + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +main.o: recursiveGaussian.cpp recursiveGaussian.h + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +recursiveGaussian: recursiveGaussian_cuda.o recAlg_cuda.o main.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) -o $@ $+ $(LIBRARIES) + +run: build + $(EXEC) ./recursiveGaussian + +clean: + $(EXEC) rm -f recursiveGaussian_cuda.o recAlg_cuda.o main.o recursiveGaussian *.pgm + +clobber: clean diff --git a/paredge/README.md b/paredge/README.md new file mode 100755 index 0000000..df575fc --- /dev/null +++ b/paredge/README.md @@ -0,0 +1,4 @@ +paredge +======= + +Parallel Edge Detection diff --git a/paredge/alg4d_gpu.cuh b/paredge/alg4d_gpu.cuh new file mode 100755 index 0000000..347dd56 --- /dev/null +++ b/paredge/alg4d_gpu.cuh @@ -0,0 +1,688 @@ +/** + * @file alg4d_gpu.cuh + * @brief Algorithm 4 in the GPU (Deriche version) with borders + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef ALG4D_GPU_CUH +#define ALG4D_GPU_CUH + +#define USE_FUSION true + +//=== IMPLEMENTATION ============================================================ + +// Alg4 (Deriche) GPU ----------------------------------------------------------- + +// Collect carries -------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWC, NBCW) +void alg4d_collect_carries( Matrix *g_py1bar, + Matrix *g_ey2bar, + Matrix *g_px, + Matrix *g_ex, + float inv_width, float inv_height, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float xp = 0.f; // previous x + 
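For reference, the per-row recursion that the collect-carries and write-results kernels block-parallelize is the classic two-pass Deriche filter: a causal pass producing y1, an anticausal pass producing y2, and their sum z. The following is a minimal single-row CPU sketch, not part of the library; it assumes the coefficient layout [a0, a1, a2, a3, coefp, coefn] and weights {1, b1, b2} passed to computeFilterParams() further down, and assumes fwdI/revI apply the feedback terms -b1*y[i-1] - b2*y[i-2]. The helper name deriche_row_reference is hypothetical.

#include <vector>

// Minimal single-row CPU reference (illustrative only, under the assumptions above).
// Causal pass:      y1[i] = a0*x[i]   + a1*x[i-1] - b1*y1[i-1] - b2*y1[i-2]
// Anticausal pass:  y2[i] = a2*x[i+1] + a3*x[i+2] - b1*y2[i+1] - b2*y2[i+2]
// Output:           z[i]  = y1[i] + y2[i]
// Zero initial state corresponds to the zero-border path; the CLAMPTOEDGE branches
// instead seed the prologue/epilogue with coefp*x[0] and coefn*x[n-1].
static void deriche_row_reference(const float* x, float* z, int n,
                                  const float coefs[6], const float w[3])
{
    std::vector<float> y1(n), y2(n);
    float xp = 0.f, yp = 0.f, ypp = 0.f;            // scan left -> right
    for (int i = 0; i < n; ++i) {
        y1[i] = coefs[0]*x[i] + coefs[1]*xp - w[1]*yp - w[2]*ypp;
        xp = x[i]; ypp = yp; yp = y1[i];
    }
    float xn = 0.f, xa = 0.f, yn = 0.f, ya = 0.f;   // scan right -> left
    for (int i = n-1; i >= 0; --i) {
        y2[i] = coefs[2]*xn + coefs[3]*xa - w[1]*yn - w[2]*ya;
        xa = xn; xn = x[i]; ya = yn; yn = y2[i];
    }
    for (int i = 0; i < n; ++i) z[i] = y1[i] + y2[i];
}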
Vector py1; // current P(Y_1) + Vector px; // this block P(X) + + if (m==0) { // 1st block +#if CLAMPTOEDGE + xp = x[0]; // clamp to edge + py1[0] = c_coefs[4]*xp; + px[0] = xp; +#pragma unroll + for (int i=1; i(); + +#pragma unroll + for (int i=0; i right + for (int j=0; j ey2; // current E(Y_2) + Vector ex; // this block E(X) + + if (m==m_size-1) { // last block +#if CLAMPTOEDGE + xn = xa = x[WS-1]; // clamp to edge + ey2[0] = c_coefs[5]*xn; + ex[0] = xn; +#pragma unroll + for (int i=1; i(); + +#pragma unroll + for (int i=0; i left + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], ey2, c_weights); + xa = xn; + xn = xc; + } + + g_ey2bar[n*(m_size+1)+m].set_col(tx, ey2); + g_ex[n*(m_size+1)+m].set_col(tx, ex); + + } + +} + +// Adjust carries --------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWA, NBA) +void alg4d_adjust_carries( Matrix *g_py1bar, + Matrix *g_ey2bar, + Matrix *g_px, + Matrix *g_ex, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m, n = blockIdx.y; + Matrix *gpy1bar, *gey2bar; + Vector py1, ey2, py1bar, ey2bar; + __shared__ Matrix spy1ey2bar[NWA]; + + Matrix *gpx, *gex; + Vector px, ex; + __shared__ Matrix spxex[NWA]; + + // P(y1bar) -> P(y1) processing --------------------------------------------- + m = 0; + gpy1bar = (Matrix *)&g_py1bar[n*(m_size+1)+m+ty+1][0][tx]; + gpx = (Matrix *)&g_px[n*(m_size+1)+m+ty][0][tx]; + if (ty==0) // for boundary condition + py1 = g_py1bar[n*(m_size+1)+0].col(tx); + + // more parallelism here is not important + for (; m < m_size; m += NWA) { // for all image blocks + + if (m+ty < m_size) { // using smem as cache + spy1ey2bar[ty].set_col(tx, gpy1bar->col(0)); + spxex[ty].set_col(tx, gpx->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust py1bar left -> right + for (int w = 0; w < NWA; ++w) { + + py1bar = spy1ey2bar[w].col(tx); + px = spxex[w].col(tx); + + py1 = py1bar + py1 * c_AbF_T + px * c_TAFB_AC1P_T; + + spy1ey2bar[w].set_col(tx, py1); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m+ty < m_size) gpy1bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gpy1bar += NWA; + gpx += NWA; + + } + + // E(y2bar) -> E(y2) processing ----------------------------------------------- + m = m_size-1; + gey2bar = (Matrix *)&g_ey2bar[n*(m_size+1)+m-ty][0][tx]; + gex = (Matrix *)&g_ex[n*(m_size+1)+m+1-ty][0][tx]; + if (ty==0) // for boundary condition + ey2 = g_ey2bar[n*(m_size+1)+m_size].col(tx); + + for (; m >= 0; m -= NWA) { // for all image blocks + + // using smem as cache + if (m-ty >= 0) { + spy1ey2bar[ty].set_col(tx, gey2bar->col(0)); + spxex[ty].set_col(tx, gex->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust ey2bar right -> left + for (int w = 0; w < NWA; ++w) { + + ey2bar = spy1ey2bar[w].col(tx); + ex = spxex[w].col(tx); + + ey2 = ey2bar + ey2 * c_AbR_T + ex * c_HARB_AC2E_T; + + spy1ey2bar[w].set_col(tx, ey2); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m-ty >= 0) gey2bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gey2bar -= NWA; + gex -= NWA; + + } + +} + +// Write results transpose ------------------------------------------------------ + +template +__global__ __launch_bounds__(WS*NWW, NBCW) +void alg4d_write_results_transp( float *g_transp_out, + const Matrix *g_rows_py1, + const Matrix *g_rows_ey2, + Matrix *g_cols_py1, + Matrix *g_cols_ey2, + const Matrix 
*g_rows_px, + const Matrix *g_rows_ex, + Matrix *g_cols_pz, + Matrix *g_cols_ez, + float inv_width, float inv_height, + int m_size, int n_size, + int out_stride ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + Matrix + *py1m1 = (Matrix*)&g_rows_py1[n*(m_size+1)+m][0][tx], + *ey2m1 = (Matrix*)&g_rows_ey2[n*(m_size+1)+m+1][0][tx]; + + Matrix + *pxm1 = (Matrix*)&g_rows_px[n*(m_size+1)+m][0][tx], + *exm1 = (Matrix*)&g_rows_ex[n*(m_size+1)+m+1][0][tx]; + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float xp; // previous x + Vector py1 = py1m1->col(0); // current P(Y_1) + Vector px = pxm1->col(0); // previous block P(X) + + xp = px[0]; + +#pragma unroll // calculate block, scan left -> right + for (int j=0; j ey2 = ey2m1->col(0); // current E(Y_2) + Vector ex = exm1->col(0); // next block E(X) + + xn = ex[0]; + if (R==1) xa = xn; + else xa = ex[1]; + +#pragma unroll // calculate block, scan right -> left + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], ey2, c_weights); + xa = xn; + xn = xc; + } + + } + + // preparing for adding y1 and y2 + + __syncthreads(); // guarantee no smem usage + + if (ty==1) { + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; // save partial result in smem + + } + + __syncthreads(); // guarantee warp 0 can read smem + + if (ty==0) { // now add + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] += block[tx][i]; // x now has z + + g_transp_out += m*WS*out_stride + n*WS + tx; +#pragma unroll // write block inside valid transpose image + for (int i=0; i + &pu1bar = (Matrix&)g_cols_py1[m*(n_size+1)+n+1][0][tx]; + + float zp = 0.f; // previous z + Vector pu1; // current P(U_1) + Vector pz; // this block P(Z) + + if (n==0) { // 1st block +#if CLAMPTOEDGE + zp = x[0]; // clamp to edge + pu1[0] = c_coefs[4]*zp; + pz[0] = zp; +#pragma unroll + for (int i=1; i(); + +#pragma unroll + for (int i=0; i right + for (int j=0; j + &eu2bar = (Matrix&)g_cols_ey2[m*(n_size+1)+n][0][tx]; + + float zn = 0.f, za = 0.f; // next and after next z + Vector eu2; // current E(U_2) + Vector ez; // this block E(Z) + + if (n==n_size-1) { // last block +#if CLAMPTOEDGE + zn = za = x[WS-1]; // clamp to edge + eu2[0] = c_coefs[5]*zn; + ez[0] = zn; +#pragma unroll + for (int i=1; i(); + +#pragma unroll + for (int i=0; i left + for (int j=WS-1; j>=0; --j) { + float zc = x[j]; // current z + x[j] = c_coefs[2]*zn + c_coefs[3]*za; + x[j] = revI(x[j], eu2, c_weights); + za = zn; + zn = zc; + } + + eu2bar.set_col(0, eu2); // store eu2bar cols + g_cols_ez[m*(n_size+1)+n].set_col(tx, ez); + + } + + } // if fusion + +} + +// Alg4 Deriche in the GPU ------------------------------------------------------ + +__host__ +void alg4d_gpu( float *h_img, int width, int height, + float sigma, int order ) { + + const int B = WS; + if (R != 2) { std::cout << "R must be 2!\n"; return; } + + Vector coefs; // [a0-a3, coefp, coefn] + Vector w; // weights + w[0] = 1.f; + + computeFilterParams(coefs[0], coefs[1], coefs[2], coefs[3], + w[1], w[2], coefs[4], coefs[5], + sigma, order); + + Vector ck1; // convolution kernel 1 (plus) + ck1[0] = coefs[0]; ck1[1] = coefs[1]; ck1[2] = 0.f; + Vector ck2; // convolution kernel 2 (minus) + ck2[0] = 0.f; ck2[1] = coefs[2]; ck2[2] = coefs[3]; + + // pre-compute 
basic alg4 Deriche matrices + Matrix Ir = identity(); + Matrix Zrb = zeros(); + Matrix Zbr = zeros(); + Matrix Ib = identity(); + + Matrix AFP_T = fwd(Ir, Zrb, w), ARE_T = rev(Zrb, Ir, w); + Matrix AFB_T = fwd(Zbr, Ib, w), ARB_T = rev(Ib, Zbr, w); + Matrix AbF_T = tail(AFP_T), AbR_T = head(ARE_T); + Matrix TAFB_T = tail(AFB_T), HARB_T = head(ARB_T); + + Matrix AC1P_T = transp(conv1p(ck1)); + Matrix AC2E_T = transp(conv2e(ck2)); + + Matrix TAFB_AC1P_T = AC1P_T*TAFB_T; + Matrix HARB_AC2E_T = AC2E_T*HARB_T; + + int m_size = (width+WS-1)/WS, n_size = (height+WS-1)/WS; + + // upload to the GPU + copy_to_symbol(c_weights, w); + copy_to_symbol(c_coefs, coefs); + + copy_to_symbol(c_AbF_T, AbF_T); + copy_to_symbol(c_AbR_T, AbR_T); + + copy_to_symbol(c_TAFB_AC1P_T, TAFB_AC1P_T); + copy_to_symbol(c_HARB_AC2E_T, HARB_AC2E_T); + + float inv_width = 1.f/width, inv_height = 1.f/height; + + cudaArray *a_in; + size_t offset; + cudaChannelFormatDesc ccd = cudaCreateChannelDesc(); + cudaMallocArray(&a_in, &ccd, width, height); + cudaMemcpyToArray(a_in, 0, 0, h_img, width*height*sizeof(float), + cudaMemcpyHostToDevice); + + t_in.normalized = true; + t_in.filterMode = cudaFilterModePoint; + t_in.addressMode[0] = t_in.addressMode[1] = cudaAddressModeBorder; + + // naming convention: stride in elements and pitch in bytes + // to achieve at the same time coalescing and L2-cache colision avoidance + // the transposed and regular output images have stride of maximum size + // (4Ki) + 1 block (WS), considering up to 4096x4096 input images + int stride_img = MAXSIZE+WS, stride_transp_img = MAXSIZE+WS; + dvector d_img(height*stride_img), d_transp_img(width*stride_transp_img); + + // +1 padding is important even in zero-border to avoid if's in kernels + dvector< Matrix > + d_rows_py1bar((m_size+1)*n_size), d_rows_ey2bar((m_size+1)*n_size), + d_cols_py1bar((n_size+1)*m_size), d_cols_ey2bar((n_size+1)*m_size); + + dvector< Matrix > + d_rows_px((m_size+1)*n_size), d_rows_ex((m_size+1)*n_size), + d_cols_pz((n_size+1)*m_size), d_cols_ez((n_size+1)*m_size); + + d_rows_px.fillzero(); + d_rows_ex.fillzero(); + d_cols_pz.fillzero(); + d_cols_ez.fillzero(); + + d_rows_py1bar.fillzero(); + d_rows_ey2bar.fillzero(); + d_cols_py1bar.fillzero(); + d_cols_ey2bar.fillzero(); + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("alg4d_gpu", width*height*RUNTIMES, "iP"); + + for(int r=0; r>> + ( &d_rows_py1bar, &d_rows_ey2bar, &d_rows_px, &d_rows_ex, inv_width, inv_height, m_size ); + + alg4d_adjust_carries<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, &d_rows_px, &d_rows_ex, m_size ); + + alg4d_write_results_transp<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_transp_img, &d_rows_py1bar, &d_rows_ey2bar, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_px, &d_rows_ex, &d_cols_pz, &d_cols_ez, + inv_width, inv_height, m_size, n_size, stride_transp_img ); + + cudaUnbindTexture(t_in); + cudaBindTexture2D(&offset, t_in, d_transp_img, height, width, stride_transp_img*sizeof(float)); + + alg4d_adjust_carries<<< dim3(1, m_size), dim3(WS, NWA) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, &d_cols_pz, &d_cols_ez, n_size ); + + alg4d_write_results_transp<<< dim3(n_size, m_size), dim3(WS, NWW) >>> + ( d_img, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_py1bar, &d_cols_ey2bar, &d_cols_pz, &d_cols_ez, &d_rows_px, &d_rows_ex, + inv_height, inv_width, n_size, m_size, stride_img ); + + cudaUnbindTexture(t_in); + + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " 
<< std::flush; +#else + base_timer &timer_total = timers.gpu_add("alg4d_gpu", width*height, "iP"); + base_timer *timer; + + cudaBindTextureToArray(t_in, a_in); + + timer = &timers.gpu_add("collect-carries"); + + alg4d_collect_carries<<< dim3(m_size, n_size), dim3(WS, NWC) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, + &d_rows_px, &d_rows_ex, + inv_width, inv_height, m_size ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries"); + + alg4d_adjust_carries<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, + &d_rows_px, &d_rows_ex, + m_size ); + + timer->stop(); timer = &timers.gpu_add("write-block-collect-carries"); + + alg4d_write_results_transp<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_transp_img, &d_rows_py1bar, &d_rows_ey2bar, &d_cols_py1bar, &d_cols_ey2bar, + &d_rows_px, &d_rows_ex, &d_cols_pz, &d_cols_ez, + inv_width, inv_height, m_size, n_size, stride_transp_img ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries"); + + cudaUnbindTexture(t_in); + cudaBindTexture2D(&offset, t_in, d_transp_img, height, width, stride_transp_img*sizeof(float)); + + if (!USE_FUSION) { // it seems faster to not use FUSION - not quite + alg4d_collect_carries<<< dim3(n_size, m_size), dim3(WS, NWC) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, + &d_cols_pz, &d_cols_ez, + inv_height, inv_width, n_size ); + } + + alg4d_adjust_carries<<< dim3(1, m_size), dim3(WS, NWA) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, + &d_cols_pz, &d_cols_ez, + n_size ); + + timer->stop(); timer = &timers.gpu_add("write-block"); + + alg4d_write_results_transp<<< dim3(n_size, m_size), dim3(WS, NWW) >>> + ( d_img, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_py1bar, &d_rows_ey2bar, + &d_cols_pz, &d_cols_ez, &d_rows_px, &d_rows_ex, + inv_height, inv_width, n_size, m_size, stride_img ); + + timer->stop(); + + cudaUnbindTexture(t_in); + + timer_total.stop(); + timers.flush(); +#endif + cudaMemcpy2D(h_img, width*sizeof(float), d_img, stride_img*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToHost); + +} + +#endif // ALG4D_GPU_CUH + +/** + //checkings + Matrix pm1nx, em1nx; + Matrix bmnx; + + srand(time(NULL)); + for (int i=0; i bmny1a = conv1(pm1nx, bmnx, ck1); + Matrix bmny1b = pm1nx * AC1P_T + bmnx * AC1B_T; + + //checking 2 + Matrix bmny2a = conv2(bmnx, em1nx, ck2); + Matrix bmny2b = em1nx * AC2E_T + bmnx * AC2B_T; + + //checking 3 + Matrix pmny1bara = tail(fwd(Zbr, bmny1a, w)); + Matrix pmny1barb = pm1nx * TAFB_AC1P_T + bmnx * TAFB_AC1B_T; + + //checking 4 + Matrix emny2bara = head(rev(bmny2a, Zbr, w)); + Matrix emny2barb = em1nx * HARB_AC2E_T + bmnx * HARB_AC2B_T; + + float a, b, r, me1, mre1, me2, mre2; + mre2 = me2 = mre1 = me1 = 0; + + for (int i=0; i mre1 ? b : mre1; + } + me1 = a > me1 ? a : me1; + a = emny2bara[i][j] - emny2barb[i][j]; + a = a < 0 ? -a : a; + if (emny2bara[i][j] != 0) { + r = emny2bara[i][j] < 0 ? -emny2bara[i][j] : emny2bara[i][j]; + b = a / r; + mre2 = b > mre2 ? b : mre2; + } + me2 = a > me2 ? 
a : me2; + } + } + + //std::cout << "bmny1a x bmny1b: me = " << me1 << " mre = " << mre1 << "\n"; + //std::cout << "bmny2a x bmny2b: me = " << me2 << " mre = " << mre2 << "\n"; + std::cout << "pmny1bara x pmny1barb: me = " << me1 << " mre = " << mre1 << "\n"; + std::cout << "emny2bara x emny2barb: me = " << me2 << " mre = " << mre2 << "\n"; + +*/ diff --git a/paredge/alg4pd_gpu.cuh b/paredge/alg4pd_gpu.cuh new file mode 100755 index 0000000..f07933c --- /dev/null +++ b/paredge/alg4pd_gpu.cuh @@ -0,0 +1,557 @@ +/** + * @file alg4pd_gpu.cuh + * @brief Algorithm 4 plic in the GPU (Deriche version) with borders + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef ALG4PD_GPU_CUH +#define ALG4PD_GPU_CUH + +#define USE_FUSION true + +//=== IMPLEMENTATION ============================================================ + +// TODO LIST: +// * MAKE ADJUST CARRIES MORE PARALLEL + +// Alg4p (Deriche) GPU ---------------------------------------------------------- + +// Collect carries -------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWC, NBCW) +void alg4pd_gpu_collect_carries( Matrix *g_py1bar, + Matrix *g_ey2bar, + float inv_width, float inv_height, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float px = 0.f; // previous x + Vector py1; // current P(Y_1) + + if (m==0) { // 1st block +#if CLAMPTOEDGE + px = x[0]; // clamp to edge + py1[0] = c_coefs[4]*px; +#pragma unroll + for (int i=1; i(); + +#pragma unroll // calculate pybar, scan left -> right + for (int j=0; j ey2; // current E(Y_2) + + if (m==m_size-1) { // last block +#if CLAMPTOEDGE + nx = nnx = x[WS-1]; // clamp to edge + ey2[0] = c_coefs[5]*nx; +#pragma unroll + for (int i=1; i(); + +#pragma unroll // calculate ezhat, scan right -> left + for (int j=WS-1; j>=0; --j) { + float cx = x[j]; // current x + x[j] = c_coefs[2]*nx + c_coefs[3]*nnx; + x[j] = revI(x[j], ey2, c_weights); + nnx = nx; + nx = cx; + } + + g_ey2bar[n*(m_size+1)+m].set_col(tx, ey2); + + } + +} + +// Adjust carries --------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWA, NBA) +void alg4pd_gpu_adjust_carries( Matrix *g_py1bar, + Matrix *g_ey2bar, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m, n = blockIdx.y; + Matrix *gpy1bar, *gey2bar; + Vector py1, ey2, py1bar, ey2bar; + __shared__ Matrix spy1ey2bar[NWA]; + + // P(y1bar) -> P(y1) processing ----------------------------------------------- + m = 0; + gpy1bar = (Matrix *)&g_py1bar[n*(m_size+1)+m+ty+1][0][tx]; + if (ty==0) py1 = g_py1bar[n*(m_size+1)+0].col(tx); + + for (; m < m_size; m += NWA) { // for all image blocks + + // using smem as cache + if (m+ty < m_size) spy1ey2bar[ty].set_col(tx, gpy1bar->col(0)); + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust py1bar left -> right + for (int w = 0; w < NWA; ++w) { + + py1bar = spy1ey2bar[w].col(tx); + + py1 = py1bar + py1 * c_AbF_T; + + spy1ey2bar[w].set_col(tx, py1); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m+ty < m_size) gpy1bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gpy1bar += NWA; + + } + + // E(zhat) -> E(z) processing ----------------------------------------------- + m = 
m_size-1; + gey2bar = (Matrix *)&g_ey2bar[n*(m_size+1)+m-ty][0][tx]; + if (ty==0) ey2 = g_ey2bar[n*(m_size+1)+m_size].col(tx); + + for (; m >= 0; m -= NWA) { // for all image blocks + + // using smem as cache + if (m-ty >= 0) spy1ey2bar[ty].set_col(tx, gey2bar->col(0)); + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust ey2bar right -> left + for (int w = 0; w < NWA; ++w) { + + ey2bar = spy1ey2bar[w].col(tx); + + ey2 = ey2bar + ey2 * c_AbR_T; + + spy1ey2bar[w].set_col(tx, ey2); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m-ty >= 0) gey2bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gey2bar -= NWA; + + } + +} + +// Write results transpose ------------------------------------------------------ + +template +__global__ __launch_bounds__(WS*NWW, NBCW) +void alg4pd_gpu_write_results_transp( float *g_transp_out, + const Matrix *g_rows_py1, + const Matrix *g_rows_ey2, + Matrix *g_cols_py1, + Matrix *g_cols_ey2, + float inv_width, float inv_height, + int m_size, int n_size, + int out_stride ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + Matrix + *py1m1 = (Matrix*)&g_rows_py1[n*(m_size+1)+m][0][tx], + *ey2m1 = (Matrix*)&g_rows_ey2[n*(m_size+1)+m+1][0][tx]; + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float px = 0.f; // previous x + Vector py1 = py1m1->col(0); // current P(Y_1) + + if (m==0) { // 1st block +#if CLAMPTOEDGE + px = x[0]; // clamp to edge +#endif + } else { // other blocks + float tu = ((m-1)*WS+(WS-1)+.5f)*inv_width; // last column of previous block + float tv = (n*WS+tx+.5f)*inv_height; // one element per row + px = tex2D(t_in, tu, tv); + } + +#pragma unroll // calculate block, scan left -> right + for (int j=0; j ey2 = ey2m1->col(0); // current E(Y_2) + + if (m==m_size-1) { // last block +#if CLAMPTOEDGE + nx = nnx = x[WS-1]; // clamp to edge +#endif + } else { // other blocks + float tu = ((m+1)*WS+.5f)*inv_width; // first column of next block + float tv = (n*WS+tx+.5f)*inv_height; // one element per row + nx = tex2D(t_in, tu, tv); + nnx = tex2D(t_in, tu+inv_width, tv); // second column of next block + } + +#pragma unroll // calculate block, scan right -> left + for (int j=WS-1; j>=0; --j) { + float cx = x[j]; // current x + x[j] = c_coefs[2]*nx + c_coefs[3]*nnx; + x[j] = revI(x[j], ey2, c_weights); + nnx = nx; + nx = cx; + } + + } + + __syncthreads(); // guarantee no smem usage + + if (ty==1) { + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; // save partial result in smem + + } + + __syncthreads(); // guarantee warp 0 can read smem + + if (ty==0) { + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] += block[tx][i]; // x now has z or v + + g_transp_out += m*WS*out_stride + n*WS + tx; +#pragma unroll // write block inside valid transpose image + for (int i=0; i + &pu1bar = (Matrix&)g_cols_py1[m*(n_size+1)+n+1][0][tx]; + + float pz = 0.f; // previous z + Vector pu1; // current P(U_1) + + if (n==0) { // 1st block +#if CLAMPTOEDGE + pz = x[0]; // clamp to edge + pu1[1] = pu1[0] = c_coefs[4]*pz; + g_cols_py1[m*(n_size+1)+0].set_col(tx, pu1); +#endif + } else { // other blocks + g_transp_out += (m*WS+tx)*out_stride + (n-1)*WS + (WS-1); + pz = *g_transp_out; + } + + pu1 = zeros(); + +#pragma unroll // calculate pybar cols, scan left -> right + for (int j=0; j + 
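Conceptually, the adjust-carries kernels above apply the following sequential fix-up across the blocks of one image row, shown here as a hypothetical host-side loop that reuses the library's Vector and Matrix operators (illustrative only; py1bar and ey2bar stand for arrays of per-block carries, with index 0 and m_size holding the boundary carries). In alg4d the same loop additionally adds the px * c_TAFB_AC1P_T and ex * c_HARB_AC2E_T terms.

// Illustrative serial equivalent of alg4pd_gpu_adjust_carries for one block-row.
Vector<float,R> py1 = py1bar[0];            // boundary prologue
for (int m = 1; m <= m_size; ++m) {
    py1 = py1bar[m] + py1 * c_AbF_T;        // complete prologue of block m
    py1bar[m] = py1;
}
Vector<float,R> ey2 = ey2bar[m_size];       // boundary epilogue
for (int m = m_size-1; m >= 0; --m) {
    ey2 = ey2bar[m] + ey2 * c_AbR_T;        // complete epilogue of block m
    ey2bar[m] = ey2;
}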
&eu2bar = (Matrix&)g_cols_ey2[m*(n_size+1)+n][0][tx]; + + float nz = 0.f, nnz = 0.f; // next and next-next z + Vector eu2; // current E(U_2) + + if (n==n_size-1) { // last block +#if CLAMPTOEDGE + nz = nnz = x[WS-1]; // clamp to edge + eu2[1] = eu2[0] = c_coefs[5]*nz; + g_cols_ey2[m*(n_size+1)+n_size].set_col(tx, eu2); +#endif + } else { // other blocks + g_transp_out += (m*WS+tx)*out_stride + (n+1)*WS; + nz = *g_transp_out; + g_transp_out += 1; + nnz = *g_transp_out; + } + + eu2 = zeros(); + +#pragma unroll // calculate ezhat cols, scan right -> left + for (int j=WS-1; j>=0; --j) { + float cz = x[j]; // current z + x[j] = c_coefs[2]*nz + c_coefs[3]*nnz; + x[j] = revI(x[j], eu2, c_weights); + nnz = nz; + nz = cz; + } + + eu2bar.set_col(0, eu2); // store eu2bar cols + + } + + } // if fusion + +} + +// Alg4p Deriche in the GPU ----------------------------------------------------- + +__host__ +void alg4pd_gpu( float *h_img, int width, int height, + float sigma, int order ) { + + const int B = WS; + if (R != 2) { std::cout << "R must be 2!\n"; return; } + + Vector coefs; // [a0-a3, coefp, coefn] + Vector w; // weights + w[0] = 1.f; + + computeFilterParams(coefs[0], coefs[1], coefs[2], coefs[3], + w[1], w[2], coefs[4], coefs[5], + sigma, order); + + // pre-compute basic alg4p Deriche matrices + Matrix Ir = identity(); + Matrix Zrb = zeros(); + + Matrix AFP_T = fwd(Ir, Zrb, w), + ARE_T = rev(Zrb, Ir, w); + + Matrix AbF_T = tail(AFP_T), + AbR_T = head(ARE_T); + + int m_size = (width+WS-1)/WS, n_size = (height+WS-1)/WS; + + // upload to the GPU + copy_to_symbol(c_weights, w); + copy_to_symbol(c_coefs, coefs); + + copy_to_symbol(c_AbF_T, AbF_T); + copy_to_symbol(c_AbR_T, AbR_T); + + float inv_width = 1.f/width, inv_height = 1.f/height; + + cudaArray *a_in; + size_t offset; + cudaChannelFormatDesc ccd = cudaCreateChannelDesc(); + cudaMallocArray(&a_in, &ccd, width, height); + cudaMemcpyToArray(a_in, 0, 0, h_img, width*height*sizeof(float), + cudaMemcpyHostToDevice); + + t_in.normalized = true; + t_in.filterMode = cudaFilterModePoint; + t_in.addressMode[0] = t_in.addressMode[1] = cudaAddressModeBorder; + + // naming convention: stride in elements and pitch in bytes + // to achieve at the same time coalescing and L2-cache colision avoidance + // the transposed and regular output images have stride of maximum size + // (4Ki) + 1 block (WS), considering up to 4096x4096 input images + int stride_img = MAXSIZE+WS, stride_transp_img = MAXSIZE+WS; + dvector d_img(height*stride_img), d_transp_img(width*stride_transp_img); + + // +1 padding is important even in zero-border to avoid if's in kernels + dvector< Matrix > + d_rows_py1bar((m_size+1)*n_size), d_rows_ey2bar((m_size+1)*n_size), + d_cols_py1bar((n_size+1)*m_size), d_cols_ey2bar((n_size+1)*m_size); + + d_rows_py1bar.fillzero(); + d_rows_ey2bar.fillzero(); + d_cols_py1bar.fillzero(); + d_cols_ey2bar.fillzero(); + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("alg4pd_gpu", width*height*RUNTIMES, "iP"); + + for(int r=0; r>> + ( &d_rows_py1bar, &d_rows_ey2bar, inv_width, inv_height, m_size ); + + alg4pd_gpu_adjust_carries<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, m_size ); + + alg4pd_gpu_write_results_transp<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_transp_img, &d_rows_py1bar, &d_rows_ey2bar, &d_cols_py1bar, &d_cols_ey2bar, + inv_width, inv_height, m_size, n_size, stride_transp_img ); + + if (!USE_FUSION) { + alg4pd_gpu_collect_carries<<< dim3(n_size, m_size), dim3(WS, NWC) >>> + ( &d_cols_py1bar, 
&d_cols_ey2bar, + inv_height, inv_width, n_size ); + } + + cudaUnbindTexture(t_in); + cudaBindTexture2D(&offset, t_in, d_transp_img, height, width, stride_transp_img*sizeof(float)); + + alg4pd_gpu_adjust_carries<<< dim3(1, m_size), dim3(WS, NWA) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, n_size ); + + alg4pd_gpu_write_results_transp<<< dim3(n_size, m_size), dim3(WS, NWW) >>> + ( d_img, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_py1bar, &d_cols_ey2bar, + inv_height, inv_width, n_size, m_size, stride_img ); + + cudaUnbindTexture(t_in); + + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " << std::flush; +#else + base_timer &timer_total = timers.gpu_add("alg4pd_gpu", width*height, "iP"); + base_timer *timer; + + cudaBindTextureToArray(t_in, a_in); + + timer = &timers.gpu_add("collect-carries"); + + alg4pd_gpu_collect_carries<<< dim3(m_size, n_size), dim3(WS, NWC) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, + inv_width, inv_height, m_size ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries"); + + alg4pd_gpu_adjust_carries<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, + m_size ); + + timer->stop(); timer = &timers.gpu_add("write-block-collect-carries"); + + alg4pd_gpu_write_results_transp<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_transp_img, &d_rows_py1bar, &d_rows_ey2bar, &d_cols_py1bar, &d_cols_ey2bar, + inv_width, inv_height, m_size, n_size, stride_transp_img ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries"); + + cudaUnbindTexture(t_in); + cudaBindTexture2D(&offset, t_in, d_transp_img, height, width, stride_transp_img*sizeof(float)); + + if (!USE_FUSION) { // it seems faster to not use FUSION - not quite + alg4pd_gpu_collect_carries<<< dim3(n_size, m_size), dim3(WS, NWC) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, + inv_height, inv_width, n_size ); + } + + alg4pd_gpu_adjust_carries<<< dim3(1, m_size), dim3(WS, NWA) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, + n_size ); + + timer->stop(); timer = &timers.gpu_add("write-block"); + + alg4pd_gpu_write_results_transp<<< dim3(n_size, m_size), dim3(WS, NWW) >>> + ( d_img, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_py1bar, &d_rows_ey2bar, + inv_height, inv_width, n_size, m_size, stride_img ); + + timer->stop(); + + cudaUnbindTexture(t_in); + + timer_total.stop(); + timers.flush(); +#endif + cudaMemcpy2D(h_img, width*sizeof(float), d_img, stride_img*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToHost); + +} + +#endif // ALG4PD_GPU_CUH diff --git a/paredge/alg5d_gpu.cuh b/paredge/alg5d_gpu.cuh new file mode 100755 index 0000000..9663676 --- /dev/null +++ b/paredge/alg5d_gpu.cuh @@ -0,0 +1,855 @@ +/** + * @file alg5d_gpu.cuh + * @brief Algorithm 5 in the GPU (Deriche version) with borders + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef ALG5D_GPU_CUH +#define ALG5D_GPU_CUH + +//=== IMPLEMENTATION ============================================================ + +// Alg5 (Deriche) GPU ----------------------------------------------------------- + +// Collect carries -------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWC, NBCW) +void alg5d_collect_carries( Matrix *g_py1bar, + Matrix *g_pty1bar, + Matrix *g_ety1bar, + Matrix *g_ey2bar, + Matrix *g_pty2bar, + Matrix *g_ety2bar, + Matrix *g_ptu11hat, + Matrix *g_etu12hat, + Matrix *g_etu22hat, + Matrix *g_ptu21hat, + Matrix *g_px, + Matrix *g_ex, + float inv_width, float inv_height, + int m_size, int n_size ) { + + int 
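At the host level, alg4d_gpu and alg4pd_gpu above run the same machinery twice: a row pass that writes a transposed intermediate image, then a second pass over that intermediate (the original columns) that writes the final result back in the original orientation. A plain CPU sketch of this composition, reusing the hypothetical deriche_row_reference() from the earlier note (the GPU version fuses the transposition into the write-results kernels via g_transp_out and stride_transp_img):

#include <vector>

// Illustrative 2D composition (CPU, hypothetical helper names, assumptions as above).
static void deriche_2d_reference(const float* in, float* out, int width, int height,
                                 const float coefs[6], const float w[3])
{
    std::vector<float> rows(width*height), transp(width*height), cols(width*height);
    for (int y = 0; y < height; ++y)                       // row pass
        deriche_row_reference(&in[y*width], &rows[y*width], width, coefs, w);
    for (int y = 0; y < height; ++y)                       // transpose
        for (int x = 0; x < width; ++x)
            transp[x*height + y] = rows[y*width + x];
    for (int x = 0; x < width; ++x)                        // column pass (rows of transpose)
        deriche_row_reference(&transp[x*height], &cols[x*height], height, coefs, w);
    for (int y = 0; y < height; ++y)                       // transpose back
        for (int x = 0; x < width; ++x)
            out[y*width + x] = cols[x*height + y];
}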
tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float xp = 0.f; // previous x + Vector py1; // current P(Y_1) + Vector px; // this block P(X) + + // for now, no boundary condition (zero) + +#pragma unroll + for (int i=0; i right + for (int j=0; j ey2; // current E(Y_2) + Vector ex; // this block E(X) + +#pragma unroll + for (int i=0; i left + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], ey2, c_weights); + xa = xn; + xn = xc; + } + + g_ey2bar[n*(m_size+1)+m].set_col(tx, ey2); + g_ex[n*(m_size+1)+m].set_col(tx, ex); + + } + + __syncthreads(); + + if (ty==0) { // save y1 in smem + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; + + } + + __syncthreads(); + + if (ty==0) { // 1 warp forward-down computing u11 + +#pragma unroll // transpose regs + for (int i=0; i<32; ++i) + x[i] = block[i][tx]; + + float xp = 0.f; // previous x + Vector ptu11; // current Pt(U_11) + Vector pty1; // this block Pt(Y1) + +#pragma unroll + for (int i=0; i bottom + for (int j=0; j etu12; // current Et(U_12) + Vector ety1; // this block Et(Y1) + +#pragma unroll + for (int i=0; i top + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], etu12, c_weights); + xa = xn; + xn = xc; + } + + g_etu12hat[m*(n_size+1)+n].set_col(tx, etu12); + g_ety1bar[m*(n_size+1)+n].set_col(tx, ety1); + + } + + __syncthreads(); + + if (ty==1) { // save y2 in smem + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; + + } + + __syncthreads(); + + if (ty==1) { // 1 warp reverse-up computing u22 + +#pragma unroll // transpose regs + for (int i=0; i<32; ++i) + x[i] = block[i][tx]; + + float xn = 0.f, xa = 0.f; // next and after next x + Vector etu22; // current Et(U_22) + Vector ety2; // this block Et(Y2) + +#pragma unroll + for (int i=0; i top + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], etu22, c_weights); + xa = xn; + xn = xc; + } + + g_etu22hat[m*(n_size+1)+n].set_col(tx, etu22); + g_ety2bar[m*(n_size+1)+n].set_col(tx, ety2); + + } else if (ty==3) { // 1 warp reverse-down computing u21 + +#pragma unroll // transpose regs + for (int i=0; i<32; ++i) + x[i] = block[i][tx]; + + float xp = 0.f; // previous x + Vector ptu21; // current Pt(U_21) + Vector pty2; // this block Pt(Y2) + +#pragma unroll + for (int i=0; i bottom + for (int j=0; j +__global__ __launch_bounds__(WS*NWA, NBA) +void alg5d_adjust_carries_rows( Matrix *g_py1bar, + Matrix *g_ey2bar, + Matrix *g_px, + Matrix *g_ex, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m, n = blockIdx.y; + Matrix *gpy1bar, *gey2bar; + Vector py1, ey2, py1bar, ey2bar; + __shared__ Matrix spy1ey2bar[NWA]; + + Matrix *gpx, *gex; + Vector px, ex; + __shared__ Matrix spxex[NWA]; + + // P(y1bar) -> P(y1) processing --------------------------------------------- + m = 0; + gpy1bar = (Matrix *)&g_py1bar[n*(m_size+1)+m+ty+1][0][tx]; + gpx = (Matrix *)&g_px[n*(m_size+1)+m+ty][0][tx]; + py1 = zeros(); // zero boundary condition + + for (; m < m_size; m += NWA) { // for all image blocks + + if (m+ty < m_size) { // using smem as cache + spy1ey2bar[ty].set_col(tx, gpy1bar->col(0)); + 
spxex[ty].set_col(tx, gpx->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust py1bar left -> right + for (int w = 0; w < NWA; ++w) { + + py1bar = spy1ey2bar[w].col(tx); + px = spxex[w].col(tx); + + py1 = py1bar + py1 * c_AbF_T + px * c_TAFB_AC1P_T; + + spy1ey2bar[w].set_col(tx, py1); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m+ty < m_size) gpy1bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gpy1bar += NWA; + gpx += NWA; + + } + + // E(y2bar) -> E(y2) processing --------------------------------------------- + m = m_size-1; + gey2bar = (Matrix *)&g_ey2bar[n*(m_size+1)+m-ty][0][tx]; + gex = (Matrix *)&g_ex[n*(m_size+1)+m+1-ty][0][tx]; + ey2 = zeros(); + + for (; m >= 0; m -= NWA) { // for all image blocks + + if (m-ty >= 0) { // using smem as cache + spy1ey2bar[ty].set_col(tx, gey2bar->col(0)); + spxex[ty].set_col(tx, gex->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust ey2bar right -> left + for (int w = 0; w < NWA; ++w) { + + ey2bar = spy1ey2bar[w].col(tx); + ex = spxex[w].col(tx); + + ey2 = ey2bar + ey2 * c_AbR_T + ex * c_HARB_AC2E_T; + + spy1ey2bar[w].set_col(tx, ey2); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m-ty >= 0) gey2bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gey2bar -= NWA; + gex -= NWA; + + } + +} + +// Adjust carries columns ------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWA, NBAC) +void alg5d_adjust_carries_cols( Matrix *g_py1, + Matrix *g_pty1bar, + Matrix *g_ety1bar, + Matrix *g_ey2, + Matrix *g_pty2bar, + Matrix *g_ety2bar, + Matrix *g_ptu11hat, + Matrix *g_etu12hat, + Matrix *g_etu22hat, + Matrix *g_ptu21hat, + Matrix *g_px, + Matrix *g_ex, + int m_size, int n_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n; + Matrix *gptu11hat, *gptu21hat;//, *getu12hat, *getu22hat; + Vector ptu11hat, ptu21hat, ptu1hat, etu2hat; + Vector pty1bar, pty2bar; + Vector ptz, etz, ptu1, etu2; + + Matrix *gpty1bar, *gpty2bar;//, *gety1bar, *gety2bar; + Matrix *gpx, *gex, *gpy1, *gey2; + Vector px, ex, py1, ey2; + + __shared__ Matrix spx[NWA], sex[NWA], spy1[NWA], sey2[NWA]; + __shared__ Matrix spt1[NWA], spt2[NWA], set1[NWA], set2[NWA]; + + // store (pty1bar, pty2bar) or (ptu11hat, ptu21hat) in (spt1, spt2) + // store (ety1bar, ety2bar) or (etu22hat, etu12hat) in (set1, set2) + // store ptu1 in ptu11hat and etu2 in etu22hat + + // Pt(u11hat) -> Pt(u1) processing ------------------------------------------ + n = 0; + gptu11hat = (Matrix *)&g_ptu11hat[m*(n_size+1)+n+1+ty][0][tx]; + gptu21hat = (Matrix *)&g_ptu21hat[m*(n_size+1)+n+1+ty][0][tx]; + gpty1bar = (Matrix *)&g_pty1bar[m*(n_size+1)+n+1+ty][0][tx]; + gpty2bar = (Matrix *)&g_pty2bar[m*(n_size+1)+n+1+ty][0][tx]; + gpx = (Matrix *)&g_px[(n+ty)*(m_size+1)+m+0][0][tx]; + gex = (Matrix *)&g_ex[(n+ty)*(m_size+1)+m+1][0][tx]; + gpy1 = (Matrix *)&g_py1[(n+ty)*(m_size+1)+m+0][0][tx]; + gey2 = (Matrix *)&g_ey2[(n+ty)*(m_size+1)+m+1][0][tx]; + + ptu1 = ptz = zeros(); // zero boundary condition + + for (; n < n_size; n += NWA) { // for all image blocks + + if (n+ty < n_size) { // using smem as cache + spt1[ty].set_col(tx, gptu11hat->col(0)); + spt2[ty].set_col(tx, gptu21hat->col(0)); + spy1[ty].set_col(tx, gpy1->col(0)); + sey2[ty].set_col(tx, gey2->col(0)); + spx[ty].set_col(tx, gpx->col(0)); + sex[ty].set_col(tx, gex->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty == 0) { +#pragma unroll // adjust ptucheck 
top -> bottom + for (int w = 0; w < NWA; ++w) { + + ptu11hat = spt1[w].col(tx); + ptu21hat = spt2[w].col(tx); + py1 = spy1[w].col(tx); + ey2 = sey2[w].col(tx); + px = spx[w].col(tx); + ex = sex[w].col(tx); + + ptz = pty1bar + pty2bar + ey2 * c_ARE_T.col(tx) + py1 * c_AFP_T.col(tx); + + ptu1hat = ptu11hat + ptu21hat; + fixpet(ptu1hat, c_TAFB_AC1B, c_AFP_T, py1); + fixpet(ptu1hat, c_TAFB_AC1B, c_ARE_T, ey2); + fixpet(ptu1hat, c_TAFB_AC1B, c_AC1P_AFB_T, px); + fixpet(ptu1hat, c_TAFB_AC1B, c_AC2E_ARB_T, ex); + + ptu1 = ptu1hat + ptu1 * c_AbF_T + ptz * c_TAFB_AC1P_T; + + spt1[w].set_col(tx, ptu1); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (n+ty < n_size) gptu11hat->set_col(0, spt1[ty].col(tx)); + + gptu11hat += NWA; + //gpx, gex... + gpy1 += NWA*(m_size+1); + gey2 += NWA*(m_size+1); + + } +/* + // Et(vtilde) -> Et(v) processing ------------------------------------------- + n = n_size-1; + getvtilde = (Matrix *)&g_etvtilde[m*(n_size+1)+n-ty][0][tx]; + gptucheck = (Matrix *)&g_ptucheck[m*(n_size+1)+n-ty][0][tx]; + gpy = (Matrix *)&g_py[(n-ty)*(m_size+1)+m][0][tx]; + gez = (Matrix *)&g_ez[(n-ty)*(m_size+1)+m+1][0][tx]; + etv = zeros(); + + for (; n >= 0; n -= NWA) { // for all image blocks + + if (n-ty >= 0) { // using smem as cache + setvtilde[ty].set_col(tx, getvtilde->col(0)); + sptucheck[ty].set_col(tx, gptucheck->col(0)); + spy[ty].set_col(tx, gpy->col(0)); + sez[ty].set_col(tx, gez->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty == 0) { +#pragma unroll // adjust etvtilde bottom -> top + for (int w = 0; w < NWA; ++w) { + + etvtilde = setvtilde[w].col(tx); + ptu = sptucheck[w].col(tx); + py = spy[w].col(tx); + ez = sez[w].col(tx); + + etv = etvtilde + etv * c_AbR_T + ptu * c_HARB_AFP_T; + + fixpet(etv, c_HARB_AFB, c_ARE_T, ez); + fixpet(etv, c_HARB_AFB, c_ARB_AFP_T, py); + + setvtilde[w].set_col(tx, etv); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (n-ty >= 0) getvtilde->set_col(0, setvtilde[ty].col(tx)); + + getvtilde -= NWA; + gptucheck -= NWA; + gpy -= NWA*(m_size+1); + gez -= NWA*(m_size+1); + + } +*/ +} + +// Write results ---------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWW, NBCW) +void alg5d_write_results( float *g_out, + const Matrix *g_py1, + const Matrix *g_ey2, + const Matrix *g_ptu1, + const Matrix *g_etu2, + const Matrix *g_px, + const Matrix *g_ex, + const Matrix *g_ptz, + const Matrix *g_etz, + float inv_width, float inv_height, + int m_size, int n_size, + int out_stride ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + Matrix + *py1m1 = (Matrix*)&g_py1[n*(m_size+1)+m][0][tx], + *ey2m1 = (Matrix*)&g_ey2[n*(m_size+1)+m+1][0][tx], + *ptu1m1 = (Matrix*)&g_ptu1[m*(n_size+1)+n][0][tx], + *etu2m1 = (Matrix*)&g_etu2[m*(n_size+1)+n+1][0][tx]; + + Matrix + *pxm1 = (Matrix*)&g_px[n*(m_size+1)+m][0][tx], + *exm1 = (Matrix*)&g_ex[n*(m_size+1)+m+1][0][tx], + *ptzm1 = (Matrix*)&g_ptz[m*(n_size+1)+n][0][tx], + *etzm1 = (Matrix*)&g_etz[m*(n_size+1)+n+1][0][tx]; + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float xp; // previous x + Vector py1 = py1m1->col(0); // current P(Y_1) + Vector px = pxm1->col(0); // previous block P(X) + + xp = px[0]; + +#pragma unroll // calculate block, scan left -> right + for (int j=0; j ey2 = 
ey2m1->col(0); // current E(Y_2) + Vector ex = exm1->col(0); // next block E(X) + + xn = ex[0]; + if (R==1) xa = xn; + else xa = ex[1]; + +#pragma unroll // calculate block, scan right -> left + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], ey2, c_weights); + xa = xn; + xn = xc; + } + + } + + // preparing for adding y1 and y2 + + __syncthreads(); // guarantee no smem usage + + if (ty==1) { + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; // save partial result in smem + + } + + __syncthreads(); // guarantee warp 0 can read smem + + if (ty==0) { // now add + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i] + block[tx][i]; // smem now has z + + } + + __syncthreads(); // guarantee no smem usage + + if (ty==0) { // 1 warp down computing u1 + +#pragma unroll // transpose regs + for (int i=0; i<32; ++i) + x[i] = block[i][tx]; + + float xp; // previous x + Vector ptu1 = ptu1m1->col(0); // current Pt(U_1) + Vector ptz = pxm1->col(0); // previous block Pt(Z) + + xp = ptz[0]; + +#pragma unroll // calculate block, scan top -> bottom + for (int j=0; j etu2 = etu2m1->col(0); // current Et(U_2) + Vector etz = etzm1->col(0); // next block Et(Z) + + xn = etz[0]; + if (R==1) xa = xn; + else xa = etz[1]; + +#pragma unroll // calculate block, scan bottom -> top + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], etu2, c_weights); + xa = xn; + xn = xc; + } + + } + + // preparing for adding u1 and u2 + + __syncthreads(); // guarantee no smem usage + + if (ty==1) { + +#pragma unroll + for (int i=0; i<32; ++i) + block[i][tx] = x[i]; // save partial result in smem + + } + + __syncthreads(); // guarantee warp 0 can read smem + + if (ty==0) { // now add + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] += block[i][tx]; // regs now has v + + g_out += ((n+1)*WS-1)*out_stride + m*WS + tx; +#pragma unroll // write block + for (int i=0; i coefs; // [a0-a3, coefp, coefn] + Vector w; // weights + w[0] = 1.f; + + computeFilterParams(coefs[0], coefs[1], coefs[2], coefs[3], + w[1], w[2], coefs[4], coefs[5], + sigma, order); + + Vector ck1; // convolution kernel 1 (plus) + ck1[0] = coefs[0]; ck1[1] = coefs[1]; ck1[2] = 0.f; + Vector ck2; // convolution kernel 2 (minus) + ck2[0] = 0.f; ck2[1] = coefs[2]; ck2[2] = coefs[3]; + + // pre-compute basic alg5 Deriche matrices + Matrix Ir = identity(); + Matrix Zbr = zeros(); + Matrix Zrb = zeros(); + Matrix Ib = identity(); + + Matrix AFP_T = fwd(Ir, Zrb, w), ARE_T = rev(Zrb, Ir, w); + Matrix AFB_T = fwd(Zbr, Ib, w), ARB_T = rev(Ib, Zbr, w); + Matrix AbF_T = tail(AFP_T), AbR_T = head(ARE_T); + Matrix TAFB_T = tail(AFB_T), HARB_T = head(ARB_T); + + Matrix AC1P_T = transp(conv1p(ck1)); + Matrix AC2E_T = transp(conv2e(ck2)); + Matrix AC1B_T = transp(conv1b(ck1)); + Matrix AC2B_T = transp(conv2b(ck2)); + + Matrix TAFB_AC1P_T = AC1P_T*TAFB_T; + Matrix HARB_AC2E_T = AC2E_T*HARB_T; + + Matrix TAFB_AC1B = transp(AC1B_T*TAFB_T); + Matrix HARB_AC2B = transp(AC2B_T*HARB_T); + + Matrix AC1P_AFB_T = AC1P_T*AFB_T; + Matrix AC2E_ARB_T = AC2E_T*ARB_T; + + int m_size = (width+WS-1)/WS, n_size = (height+WS-1)/WS; + + // upload to the GPU + copy_to_symbol(c_weights, w); + copy_to_symbol(c_coefs, coefs); + + copy_to_symbol(c_AbF_T, AbF_T); + copy_to_symbol(c_AbR_T, AbR_T); + + copy_to_symbol(c_AFP_T, AFP_T); + copy_to_symbol(c_ARE_T, ARE_T); + + copy_to_symbol(c_TAFB_AC1P_T, TAFB_AC1P_T); + copy_to_symbol(c_HARB_AC2E_T, 
HARB_AC2E_T); + + copy_to_symbol(c_TAFB_AC1B, TAFB_AC1B); + copy_to_symbol(c_HARB_AC2B, HARB_AC2B); + + copy_to_symbol(c_AC1P_AFB_T, AC1P_AFB_T); + copy_to_symbol(c_AC2E_ARB_T, AC2E_ARB_T); + + float inv_width = 1.f/width, inv_height = 1.f/height; + + cudaArray *a_in; + cudaChannelFormatDesc ccd = cudaCreateChannelDesc(); + cudaMallocArray(&a_in, &ccd, width, height); + cudaMemcpyToArray(a_in, 0, 0, h_img, width*height*sizeof(float), + cudaMemcpyHostToDevice); + + t_in.normalized = true; + t_in.filterMode = cudaFilterModePoint; + t_in.addressMode[0] = t_in.addressMode[1] = cudaAddressModeBorder; + + // naming convention: stride in elements and pitch in bytes + // to achieve at the same time coalescing and L2-cache colision avoidance + // the transposed and regular output images have stride of maximum size + // (4Ki) + 1 block (WS), considering up to 4096x4096 input images + int stride_img = MAXSIZE+WS; + dvector d_img(height*stride_img); + + // +1 padding is important even in zero-border to avoid if's in kernels + dvector< Matrix > + d_py1bar((m_size+1)*n_size), d_ey2bar((m_size+1)*n_size); + dvector< Matrix > + d_pty1bar((n_size+1)*m_size), d_ety1bar((n_size+1)*m_size), + d_ptu11hat((n_size+1)*m_size), d_etu12hat((n_size+1)*m_size); + dvector< Matrix > + d_pty2bar((n_size+1)*m_size), d_ety2bar((n_size+1)*m_size), + d_ptu21hat((n_size+1)*m_size), d_etu22hat((n_size+1)*m_size); + dvector< Matrix > + d_px((m_size+1)*n_size), d_ex((m_size+1)*n_size), + d_ptz((n_size+1)*m_size), d_etz((n_size+1)*m_size); + + d_py1bar.fillzero(); + d_ey2bar.fillzero(); + d_pty1bar.fillzero(); + d_ety1bar.fillzero(); + d_ptu11hat.fillzero(); + d_etu12hat.fillzero(); + d_pty2bar.fillzero(); + d_ety2bar.fillzero(); + d_ptu21hat.fillzero(); + d_etu22hat.fillzero(); + + d_px.fillzero(); + d_ex.fillzero(); + d_ptz.fillzero(); + d_etz.fillzero(); + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("alg5d_gpu", width*height*RUNTIMES, "iP"); + + for(int r=0; r>> + ( &d_py1bar, &d_pty1bar, &d_ety1bar, + &d_ey2bar, &d_pty2bar, &d_ety2bar, + &d_ptu11hat, &d_etu12hat, &d_etu22hat, &d_ptu21hat, + &d_px, &d_ex, + inv_width, inv_height, m_size, n_size ); + + alg5d_adjust_carries_rows<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_py1bar, &d_ey2bar, &d_px, &d_ex, m_size ); + + // alg5d_adjust_carries_cols<<< dim3(m_size, 1), dim3(WS, NWA) >>> + // ( &d_py1bar, &d_pty1bar, &d_ety1bar, + // &d_ey2bar, &d_pty2bar, &d_ety2bar, + // &d_ptu11hat, &d_etu12hat, &d_etu22hat, &d_ptu21hat, + // &d_px, &d_ex, + // m_size, n_size ); + + alg5d_write_results<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_img, &d_py1bar, &d_ey2bar, &d_ptu11hat, &d_etu22hat, + &d_px, &d_ex, &d_ptz, &d_etz, + inv_width, inv_height, m_size, n_size, stride_img ); + + cudaUnbindTexture(t_in); + + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " << std::flush; +#else + base_timer &timer_total = timers.gpu_add("alg5d_gpu", width*height, "iP"); + base_timer *timer; + + timer = &timers.gpu_add("collect-carries"); + + cudaBindTextureToArray(t_in, a_in); + + alg5d_collect_carries<<< dim3(m_size, n_size), dim3(WS, NWC) >>> + ( &d_py1bar, &d_pty1bar, &d_ety1bar, + &d_ey2bar, &d_pty2bar, &d_ety2bar, + &d_ptu11hat, &d_etu12hat, &d_etu22hat, &d_ptu21hat, + &d_px, &d_ex, + inv_width, inv_height, m_size, n_size ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries-rows"); + + alg5d_adjust_carries_rows<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_py1bar, &d_ey2bar, &d_px, &d_ex, m_size ); + 
+ timer->stop(); timer = &timers.gpu_add("adjust-carries-columns"); + + // alg5d_adjust_carries_cols<<< dim3(m_size, 1), dim3(WS, NWA) >>> + // ( &d_py1bar, &d_pty1bar, &d_ety1bar, + // &d_ey2bar, &d_pty2bar, &d_ety2bar, + // &d_ptu11hat, &d_etu12hat, &d_etu22hat, &d_ptu21hat, + // &d_px, &d_ex, + // m_size, n_size ); + + timer->stop(); timer = &timers.gpu_add("write-block"); + + alg5d_write_results<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_img, &d_py1bar, &d_ey2bar, &d_ptu11hat, &d_etu22hat, + &d_px, &d_ex, &d_ptz, &d_etz, + inv_width, inv_height, m_size, n_size, stride_img ); + + timer->stop(); + + cudaUnbindTexture(t_in); + + timer_total.stop(); + timers.flush(); +#endif + cudaMemcpy2D(h_img, width*sizeof(float), d_img, stride_img*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToHost); + +} + +#endif // ALG5D_GPU_CUH diff --git a/paredge/conv.h b/paredge/conv.h new file mode 100755 index 0000000..b1eb922 --- /dev/null +++ b/paredge/conv.h @@ -0,0 +1,171 @@ +/** + * @file conv.h + * @brief Convolutions utilities definition + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef CONV_H +#define CONV_H + +#include + + +template +HOSTDEV +T conv_op(const T &x, const T &a) +{ + return x + a; +} + +// convolution ----------------------------------------------------- + +// convolution 1 matrix (left) on prologues +template +Matrix conv1p(const Vector& ck) +{ + Matrix m; + for(int i=0; iR) + m[i][j] = (T)0; + else + m[i][j] = ck[k]; + } + } + return m; +} + +// convolution 2 matrix (right) on epilogues +template +Matrix conv2e(const Vector& ck) +{ + Matrix m; + for(int i=M-1; i>=0; --i) { + for(int j=0; jR) + m[i][j] = (T)0; + else + m[i][j] = ck[k]; + } + } + return m; +} + +// convolution 1 matrix (left) on blocks +template +Matrix conv1b(const Vector& ck) +{ + Matrix m = zeros(); + for(int i=0; i=0 && k +Matrix conv2b(const Vector& ck) +{ + Matrix m = zeros(); + for(int i=0; i=0 && k +T conv1(const Vector &p, T x, const Vector &ck) +{ + x = x * ck[0]; +#pragma unroll + for(int i=0; i +void conv1_inplace(const Vector &_p, Vector &b, const Vector &ck) +{ + Vector p = _p; + for(int j=0; j=1; --k) + p[k] = p[k-1]; + p[0] = b[j]; + b[j] = x; + } +} + +template +void conv1_inplace(const Matrix &p, Matrix &b, + const Vector &ck) +{ + for(int i=0; i +Matrix conv1(const Matrix &p, const Matrix &b, + const Vector &ck) +{ + Matrix cb = b; + conv1_inplace(p, cb, ck); + return cb; +} + +// conv2 --------------------------------------------------------------- + +template +T conv2(T x, const Vector &e, const Vector &ck) +{ + x = x * ck[0]; +#pragma unroll + for(int i=0; i +void conv2_inplace(Vector &b, const Vector &_e, const Vector &ck) +{ + Vector e = _e; + for(int j=b.size()-1; j>=0; --j) { + T x = conv2(b[j], e, ck); + for (int k=R-1; k>=1; --k) + e[k] = e[k-1]; + e[0] = b[j]; + b[j] = x; + } +} + +template +void conv2_inplace(Matrix &b, const Matrix &e, + const Vector &ck) +{ + for(int i=0; i +Matrix conv2(const Matrix &b, const Matrix &e, + const Vector &ck) +{ + Matrix cb = b; + conv2_inplace(cb, e, ck); + return cb; +} + +#endif // CONV_H diff --git a/paredge/findcudalib.mk b/paredge/findcudalib.mk new file mode 100755 index 0000000..f40c2c3 --- /dev/null +++ b/paredge/findcudalib.mk @@ -0,0 +1,226 @@ +################################################################################ +# +# Copyright 1993-2013 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. 
and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# findcudalib.mk is used to find the locations for CUDA libraries and other +# Unix Platforms. This is supported Mac OS X and Linux. +# +################################################################################ + +# OS Name (Linux or Darwin) +OSUPPER = $(shell uname -s 2>/dev/null | tr "[:lower:]" "[:upper:]") +OSLOWER = $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + +# Flags to detect 32-bit or 64-bit OS platform +OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/" -e "s/armv7l/32/") +OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/") + +# Determine OS platform and unix distribution +ifeq ("$(OSLOWER)","linux") + # first search lsb_release + DISTRO = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + DISTVER = $(shell lsb_release -r -s 2>/dev/null) + ifeq ("$(DISTRO)",'') + # second search and parse /etc/issue + DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]") + DISTVER= $(shell more /etc/issue | awk '{print $$2}' | sed '1!d' 2>/dev/null + endif + ifeq ("$(DISTRO)",'') + # third, we can search in /etc/os-release or /etc/{distro}-release + DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB") + DISTVER= $(shell awk '/DISTRIB_RELEASE/' /etc/*-release | sed 's/DISTRIB_RELEASE=//' | grep -v "DISTRIB_RELEASE") + endif +endif + +# search at Darwin (unix based info) +DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) +ifneq ($(DARWIN),) + SNOWLEOPARD = $(strip $(findstring 10.6, $(shell egrep "10\.6" /System/Library/CoreServices/SystemVersion.plist))) + LION = $(strip $(findstring 10.7, $(shell egrep "10\.7" /System/Library/CoreServices/SystemVersion.plist))) + MOUNTAIN = $(strip $(findstring 10.8, $(shell egrep "10\.8" /System/Library/CoreServices/SystemVersion.plist))) + MAVERICKS = $(strip $(findstring 10.9, $(shell egrep "10\.9" /System/Library/CoreServices/SystemVersion.plist))) +endif + +# Common binaries +GCC ?= g++ +CLANG ?= /usr/bin/clang + +ifeq ("$(OSUPPER)","LINUX") + NVCC ?= $(CUDA_PATH)/bin/nvcc -ccbin $(GCC) +else + # for some newer versions of 
XCode, CLANG is the default compiler, so we need to include this + ifneq ($(MAVERICKS),) + NVCC ?= $(CUDA_PATH)/bin/nvcc -ccbin $(CLANG) + STDLIB ?= -stdlib=libstdc++ + else + NVCC ?= $(CUDA_PATH)/bin/nvcc -ccbin $(GCC) + endif +endif + +# Take command line flags that override any of these settings +ifeq ($(i386),1) + OS_SIZE = 32 + OS_ARCH = i686 +endif +ifeq ($(x86_64),1) + OS_SIZE = 64 + OS_ARCH = x86_64 +endif +ifeq ($(ARMv7),1) + OS_SIZE = 32 + OS_ARCH = armv7l +endif + +ifeq ("$(OSUPPER)","LINUX") + # Each Linux Distribuion has a set of different paths. This applies especially when using the Linux RPM/debian packages + ifeq ("$(DISTRO)","ubuntu") + CUDAPATH ?= /usr/lib/nvidia-current + CUDALINK ?= -L/usr/lib/nvidia-current + DFLT_PATH = /usr/lib + endif + ifeq ("$(DISTRO)","kubuntu") + CUDAPATH ?= /usr/lib/nvidia-current + CUDALINK ?= -L/usr/lib/nvidia-current + DFLT_PATH = /usr/lib + endif + ifeq ("$(DISTRO)","debian") + CUDAPATH ?= /usr/lib/nvidia-current + CUDALINK ?= -L/usr/lib/nvidia-current + DFLT_PATH = /usr/lib + endif + ifeq ("$(DISTRO)","suse") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","suse linux") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","opensuse") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","fedora") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","redhat") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","red") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","redhatenterpriseworkstation") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","centos") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + + ifeq ($(ARMv7),1) + CUDAPATH := /usr/arm-linux-gnueabihf/lib + CUDALINK := -L/usr/arm-linux-gnueabihf/lib + ifneq ($(TARGET_FS),) + CUDAPATH += $(TARGET_FS)/usr/lib/nvidia-current + CUDALINK += -L$(TARGET_FS)/usr/lib/nvidia-current + endif + endif + + # Search for Linux distribution path for libcuda.so + CUDALIB ?= $(shell find $(CUDAPATH) $(DFLT_PATH) -name libcuda.so -print 2>/dev/null) + + ifeq ("$(CUDALIB)",'') + $(info >>> WARNING - CUDA Driver libcuda.so is not found. Please check and re-install the NVIDIA driver. 
<<<) + EXEC=@echo "[@]" + endif +else + # This would be the Mac OS X path if we had to do anything special +endif + diff --git a/paredge/findgllib.mk b/paredge/findgllib.mk new file mode 100755 index 0000000..1525ad8 --- /dev/null +++ b/paredge/findgllib.mk @@ -0,0 +1,225 @@ +################################################################################ +# +# Copyright 1993-2013 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. 
+# +################################################################################ +# +# findgllib.mk is used to find the necessary GL Libraries for specific distributions +# this is supported on Mac OSX and Linux Platforms +# +################################################################################ + +# OS Name (Linux or Darwin) +OSUPPER = $(shell uname -s 2>/dev/null | tr "[:lower:]" "[:upper:]") +OSLOWER = $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + +# Flags to detect 32-bit or 64-bit OS platform +OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/" -e "s/armv7l/32/") +OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/") + +# Determine OS platform and unix distribution +ifeq ("$(OSLOWER)","linux") + # first search lsb_release + DISTRO = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + DISTVER = $(shell lsb_release -r -s 2>/dev/null) + # $(info DISTRO1 = $(DISTRO) $(DISTVER)) + ifeq ($(DISTRO),) + # second search and parse /etc/issue + DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]") + DISTVER= $(shell more /etc/issue | awk '{print $$2}' | sed '1!d' 2>/dev/null) + # $(info DISTRO2 = $(DISTRO) $(DISTVER)) + endif + ifeq ($(DISTRO),) + # third, we can search in /etc/os-release or /etc/{distro}-release + DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB") + DISTVER= $(shell awk '/DISTRIB_RELEASE/' /etc/*-release | sed 's/DISTRIB_RELEASE=//' | grep -v "DISTRIB_RELEASE") + # $(info DISTRO3 = $(DISTRO) $(DISTVER)) + endif +endif + +# Take command line flags that override any of these settings +ifeq ($(i386),1) + OS_SIZE = 32 + OS_ARCH = i686 +endif +ifeq ($(x86_64),1) + OS_SIZE = 64 + OS_ARCH = x86_64 +endif +ifeq ($(ARMv7),1) + OS_SIZE = 32 + OS_ARCH = armv7l +endif + +ifeq ("$(OSUPPER)","LINUX") + # $(info) >> findgllib.mk -> LINUX path <<<) + # Each set of Linux Distros have different paths for where to find their OpenGL libraries reside + ifeq ("$(DISTRO)","ubuntu") + GLPATH ?= /usr/lib/nvidia-current + GLLINK ?= -L/usr/lib/nvidia-current + DFLT_PATH ?= /usr/lib + endif + ifeq ("$(DISTRO)","kubuntu") + GLPATH ?= /usr/lib/nvidia-current + GLLINK ?= -L/usr/lib/nvidia-current + DFLT_PATH ?= /usr/lib + endif + ifeq ("$(DISTRO)","debian") + GLPATH ?= /usr/lib/nvidia-current + GLLINK ?= -L/usr/lib/nvidia-current + DFLT_PATH ?= /usr/lib + endif + ifeq ("$(DISTRO)","suse") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/X11R6/lib64 /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib64 -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","suse linux") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/X11R6/lib64 /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib64 -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","opensuse") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/X11R6/lib64 /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib64 -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","fedora") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","redhat") + ifeq 
($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","red") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","redhatenterpriseworkstation") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","centos") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + + ifeq ($(ARMv7),1) + GLPATH := /usr/arm-linux-gnueabihf/lib + GLLINK := -L/usr/arm-linux-gnueabihf/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib/nvidia-current $(TARGET_FS)/usr/lib/arm-linux-gnueabihf + GLLINK += -L$(TARGET_FS)/usr/lib/nvidia-current -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + + # find libGL, libGLU, libXi, + GLLIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libGL.so -print 2>/dev/null) + GLULIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libGLU.so -print 2>/dev/null) + X11LIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libX11.so -print 2>/dev/null) + XILIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libXi.so -print 2>/dev/null) + XMULIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libXmu.so -print 2>/dev/null) + + ifeq ("$(GLLIB)",'') + $(info >>> WARNING - libGL.so not found, refer to CUDA Samples release notes for how to find and install them. <<<) + EXEC=@echo "[@]" + endif + ifeq ("$(GLULIB)",'') + $(info >>> WARNING - libGLU.so not found, refer to CUDA Samples release notes for how to find and install them. <<<) + EXEC=@echo "[@]" + endif + ifeq ("$(X11LIB)",'') + $(info >>> WARNING - libX11.so not found, refer to CUDA Samples release notes for how to find and install them. <<<) + EXEC=@echo "[@]" + endif + ifeq ("$(XILIB)",'') + $(info >>> WARNING - libXi.so not found, refer to CUDA Samples release notes for how to find and install them. <<<) + EXEC=@echo "[@]" + endif + ifeq ("$(XMULIB)",'') + $(info >>> WARNING - libXmu.so not found, refer to CUDA Samples release notes for how to find and install them. 
<<<) + EXEC=@echo "[@]" + endif +else + # This would be the Mac OS X path if we had to do anything special +endif + diff --git a/paredge/fix_ref.bib b/paredge/fix_ref.bib new file mode 100755 index 0000000..24bd478 --- /dev/null +++ b/paredge/fix_ref.bib @@ -0,0 +1,10 @@ +@article{ruijters2012gpu, +title={GPU prefilter for accurate cubic B-spline interpolation}, +author={Ruijters, Daniel and Th{\'e}venaz, Philippe}, +journal={The Computer Journal}, +volume={55}, +number={1}, +pages={15--20}, +year={2012}, +publisher={Oxford University Press} +} diff --git a/paredge/gpu_geforcetitan.txt b/paredge/gpu_geforcetitan.txt new file mode 100755 index 0000000..7df0903 --- /dev/null +++ b/paredge/gpu_geforcetitan.txt @@ -0,0 +1,40 @@ + CUDA Device Query (Runtime API) version (CUDART static linking) + +Detected 1 CUDA Capable device(s) + +Device 0: "GeForce GTX TITAN" + CUDA Driver Version / Runtime Version 5.5 / 5.5 + CUDA Capability Major/Minor version number: 3.5 + Total amount of global memory: 6143 MBytes (6441730048 bytes) + (14) Multiprocessors, (192) CUDA Cores/MP: 2688 CUDA Cores + GPU Clock rate: 876 MHz (0.88 GHz) + Memory Clock rate: 3004 Mhz + Memory Bus Width: 384-bit + L2 Cache Size: 1572864 bytes + Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096) + Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers + Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers + Total amount of constant memory: 65536 bytes + Total amount of shared memory per block: 49152 bytes + Total number of registers available per block: 65536 + Warp size: 32 + Maximum number of threads per multiprocessor: 2048 + Maximum number of threads per block: 1024 + Max dimension size of a thread block (x,y,z): (1024, 1024, 64) + Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) + Maximum memory pitch: 2147483647 bytes + Texture alignment: 512 bytes + Concurrent copy and kernel execution: Yes with 1 copy engine(s) + Run time limit on kernels: Yes + Integrated GPU sharing Host Memory: No + Support host page-locked memory mapping: Yes + Alignment requirement for Surfaces: Yes + Device has ECC support: Disabled + Device supports Unified Addressing (UVA): Yes + Device PCI Bus ID / PCI location ID: 3 / 0 + Compute Mode: + < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > + +deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce GTX TITAN +Result = PASS + diff --git a/paredge/gpu_quadrok2000.txt b/paredge/gpu_quadrok2000.txt new file mode 100755 index 0000000..c0b6f05 --- /dev/null +++ b/paredge/gpu_quadrok2000.txt @@ -0,0 +1,40 @@ + CUDA Device Query (Runtime API) version (CUDART static linking) + +Detected 1 CUDA Capable device(s) + +Device 0: "Quadro K2000" + CUDA Driver Version / Runtime Version 6.0 / 5.5 + CUDA Capability Major/Minor version number: 3.0 + Total amount of global memory: 2048 MBytes (2147155968 bytes) + ( 2) Multiprocessors, (192) CUDA Cores/MP: 384 CUDA Cores + GPU Clock rate: 954 MHz (0.95 GHz) + Memory Clock rate: 2000 Mhz + Memory Bus Width: 128-bit + L2 Cache Size: 262144 bytes + Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096) + Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers + Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers + Total amount of constant memory: 65536 bytes + Total amount of shared memory per block: 
49152 bytes + Total number of registers available per block: 65536 + Warp size: 32 + Maximum number of threads per multiprocessor: 2048 + Maximum number of threads per block: 1024 + Max dimension size of a thread block (x,y,z): (1024, 1024, 64) + Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) + Maximum memory pitch: 2147483647 bytes + Texture alignment: 512 bytes + Concurrent copy and kernel execution: Yes with 1 copy engine(s) + Run time limit on kernels: Yes + Integrated GPU sharing Host Memory: No + Support host page-locked memory mapping: Yes + Alignment requirement for Surfaces: Yes + Device has ECC support: Disabled + Device supports Unified Addressing (UVA): Yes + Device PCI Bus ID / PCI location ID: 15 / 0 + Compute Mode: + < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > + +deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 6.0, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = Quadro K2000 +Result = PASS + diff --git a/paredge/gpud_aux.cuh b/paredge/gpud_aux.cuh new file mode 100755 index 0000000..e1b8b82 --- /dev/null +++ b/paredge/gpud_aux.cuh @@ -0,0 +1,26 @@ +/** + * @file gpud_aux.cuh + * @brief Auxiliary GPU definitions (Deriche) and implementations + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef GPUD_AUX_CUH +#define GPUD_AUX_CUH + +//=== GLOBAL-SCOPE DEFINITIONS ================================================== + +#define NBAC 5 // # of blocks to adjust carriers for cols (smem=2x bxb) + +// basics +__constant__ Vector c_coefs; + +// alg4d +__constant__ Matrix c_TAFB_AC1P_T, c_HARB_AC2E_T; + +// alg5d +__constant__ Matrix c_AFP_T; +__constant__ Matrix c_TAFB_AC1B, c_HARB_AC2B, + c_AC1P_AFB_T, c_AC2E_ARB_T; + +#endif // GPUD_AUX_CUH diff --git a/paredge/perf01.txt b/paredge/perf01.txt new file mode 100755 index 0000000..cfdae02 --- /dev/null +++ b/paredge/perf01.txt @@ -0,0 +1,64 @@ +64 64 45.160233 64.117485 48.390018 80.791672 78.122360 +128 128 109.495667 136.387070 176.345718 314.120544 298.755859 +192 192 118.469055 136.417160 291.930145 468.001984 456.378448 +256 256 147.451080 177.776108 370.707458 518.250244 517.507812 +320 320 158.310699 182.476059 407.468628 614.357361 622.562927 +384 384 195.555023 240.068359 516.386780 700.726807 723.647888 +448 448 222.558670 268.162018 564.280396 743.403870 790.598755 +512 512 253.770355 293.569672 601.107056 785.934875 824.709595 +576 576 274.284668 315.311768 621.999939 786.772888 807.223083 +640 640 295.734497 343.989288 640.378601 807.160645 823.183716 +704 704 323.370361 361.492859 650.717712 820.898560 889.293152 +768 768 339.730896 378.031494 670.750122 845.371155 864.034302 +832 832 359.282227 396.145264 679.790649 832.685486 868.159180 +896 896 380.712860 421.281403 693.714478 854.128357 902.375732 +960 960 397.700775 433.684265 687.070068 849.685059 902.116638 +1024 1024 417.481720 462.083984 704.057678 873.345520 923.876404 +1088 1088 402.461182 459.446625 686.792358 861.952332 924.522461 +1152 1152 440.991272 456.722595 703.789551 868.538269 931.912659 +1216 1216 459.310455 487.640076 705.504578 842.084534 921.665527 +1280 1280 449.966736 499.377289 711.788818 867.322998 921.989441 +1344 1344 481.741241 486.807800 679.699585 846.891663 906.499329 +1408 1408 497.357574 495.586853 719.500671 883.092529 953.552917 +1472 1472 507.167328 516.822937 718.878479 880.720154 953.618774 +1536 1536 517.084473 519.867676 727.509460 887.431213 961.191284 +1600 1600 532.673401 519.448669 719.591919 887.360229 937.163574 +1664 1664 539.005981 
524.233765 726.562134 891.095398 945.263000 +1728 1728 520.373169 501.643402 702.776062 834.652527 887.506836 +1792 1792 535.895447 523.244385 742.748413 894.789246 965.409851 +1856 1856 575.067627 531.831421 744.350342 906.591736 966.308411 +1920 1920 576.124756 520.769897 748.907837 900.844238 874.108154 +1984 1984 584.959290 523.752197 740.765381 904.273254 964.834595 +2048 2048 591.803833 524.071838 747.518860 842.683044 950.838806 +2112 2112 603.336182 417.341034 736.741028 895.728882 972.008057 +2176 2176 598.840454 420.179169 745.584595 898.460449 971.712524 +2240 2240 615.018066 429.975250 741.152893 898.690552 978.043945 +2304 2304 614.313965 433.725708 747.196228 904.179321 982.713745 +2368 2368 624.929138 444.420715 747.538086 900.160950 973.525879 +2432 2432 636.339233 453.141083 750.813049 911.986145 974.878662 +2496 2496 637.298157 456.277039 751.672119 907.004333 973.859924 +2560 2560 643.568054 459.965729 752.201172 904.241211 971.773376 +2624 2624 641.684021 467.779022 749.641296 901.607300 979.460632 +2688 2688 643.450928 469.281982 751.228638 909.232849 978.871826 +2752 2752 651.060791 478.914642 752.868530 909.955444 980.774109 +2816 2816 653.118530 483.877350 762.031189 912.872803 984.416504 +2880 2880 653.587524 487.520447 745.559814 910.216736 979.572083 +2944 2944 655.284180 490.682709 761.514893 911.182190 987.370728 +3008 3008 660.920776 498.858307 758.948547 911.210510 984.590332 +3072 3072 658.691772 497.575195 761.142700 913.306885 987.769287 +3136 3136 646.869812 502.283569 751.161499 910.928223 978.665405 +3200 3200 660.213440 509.015381 758.642639 902.398193 983.400940 +3264 3264 650.241455 505.617371 755.336914 909.077698 976.947327 +3328 3328 665.676514 517.090149 753.827515 884.583374 985.660095 +3392 3392 656.700623 519.008423 759.489990 914.666992 983.742493 +3456 3456 671.460999 512.222229 756.539673 916.689148 987.267334 +3520 3520 675.414978 516.492004 759.940918 914.387085 983.277893 +3584 3584 678.601868 517.237427 764.180664 916.786072 978.531372 +3648 3648 675.313477 520.775330 765.306946 918.075928 992.878662 +3712 3712 676.890991 523.982605 766.977905 918.668640 993.898132 +3776 3776 667.165894 522.591797 766.459656 915.686523 988.108582 +3840 3840 653.314209 509.136292 767.725037 920.161560 992.750671 +3904 3904 671.175110 526.073364 763.760864 917.518555 989.826965 +3968 3968 668.559265 516.526672 761.514465 916.361694 984.483704 +4032 4032 676.315918 529.717957 766.719421 918.734192 989.240906 +4096 4096 683.850220 536.414429 765.472290 917.671021 987.498779 diff --git a/paredge/perf02.txt b/paredge/perf02.txt new file mode 100755 index 0000000..655236d --- /dev/null +++ b/paredge/perf02.txt @@ -0,0 +1,64 @@ +64 64 51.506283 59.294174 44.573425 79.845184 77.305832 +128 128 109.759804 133.756912 174.313293 312.091583 299.324646 +192 192 122.762535 141.430588 283.593109 465.061005 458.717773 +256 256 147.209396 173.429367 393.575867 593.916260 591.784668 +320 320 174.673340 209.546921 446.827850 650.946289 661.807922 +384 384 200.363998 235.921310 508.625824 693.695251 714.228394 +448 448 228.087891 265.998657 561.434937 753.261230 788.068420 +512 512 258.715515 300.524261 586.545532 769.635986 808.467773 +576 576 278.403381 317.791931 598.871521 779.056396 822.371704 +640 640 298.956635 345.284821 648.820862 815.117310 872.488647 +704 704 321.602966 365.987305 644.256714 813.182129 876.553284 +768 768 340.564331 381.660187 665.325012 833.149719 887.295166 +832 832 361.705933 404.222443 679.782959 816.362427 903.503052 +896 896 380.311340 
413.942841 685.026123 849.975281 917.003723 +960 960 362.206085 395.687347 625.351868 860.406311 913.973877 +1024 1024 416.232849 450.659698 702.201416 863.699341 930.278931 +1088 1088 426.081848 462.295898 674.021667 859.400452 923.629883 +1152 1152 442.198517 474.325653 687.865967 868.570923 938.970520 +1216 1216 454.775848 490.639557 692.894958 871.245667 940.268982 +1280 1280 471.797180 497.931732 708.519714 878.674438 874.902893 +1344 1344 484.016663 510.985046 707.645386 881.849670 947.863403 +1408 1408 497.170349 513.555359 720.201660 874.891052 956.538025 +1472 1472 509.107422 517.038757 721.355042 881.839539 949.424072 +1536 1536 520.583801 519.536072 730.219177 885.242188 965.862488 +1600 1600 532.904846 510.202423 727.696838 890.487732 949.014099 +1664 1664 523.123352 526.567444 738.054626 890.733032 970.909485 +1728 1728 546.770813 507.838623 736.654053 889.089661 969.930908 +1792 1792 557.636536 525.379761 742.538879 893.314514 971.254883 +1856 1856 571.082336 528.900757 740.567139 899.175049 967.873840 +1920 1920 574.407471 518.737610 742.488892 901.531799 978.758911 +1984 1984 585.905640 487.733490 741.941833 902.622192 971.019287 +2048 2048 584.043701 540.448120 746.738647 899.985352 978.578430 +2112 2112 602.731689 417.438416 739.744751 902.534912 968.760681 +2176 2176 598.659363 420.051270 742.580811 901.148560 980.216858 +2240 2240 614.454346 431.110077 744.342407 903.073059 971.016602 +2304 2304 613.796448 434.699310 748.545959 903.312500 977.304565 +2368 2368 628.194336 446.281433 749.469788 908.359680 978.983582 +2432 2432 634.329895 453.890259 754.433167 908.573730 982.021301 +2496 2496 636.775696 456.107330 752.925964 911.523376 981.154968 +2560 2560 646.821655 462.572205 755.554321 911.247925 986.051453 +2624 2624 648.622314 470.022522 756.165222 912.283264 981.182373 +2688 2688 646.356140 470.587860 758.414307 913.817749 983.367249 +2752 2752 654.524658 482.325287 757.287964 914.000366 987.715088 +2816 2816 653.752808 483.601898 756.861816 910.569458 984.186890 +2880 2880 653.678711 486.463409 759.201111 914.739380 981.833252 +2944 2944 656.637756 491.532867 762.054199 916.912720 989.290771 +3008 3008 664.706726 501.213104 761.967590 916.951904 990.543945 +3072 3072 660.321533 499.157654 761.407104 917.808655 990.417725 +3136 3136 666.532654 508.196411 755.210144 912.899170 986.584900 +3200 3200 666.187378 511.404205 758.321350 910.805420 987.848267 +3264 3264 654.122070 506.821411 759.466553 915.700073 989.437317 +3328 3328 669.785522 517.792725 761.961121 916.254456 993.082336 +3392 3392 674.182617 520.439514 759.942078 914.273743 990.591492 +3456 3456 673.491943 516.815918 760.758545 916.864685 993.150208 +3520 3520 643.759521 495.513672 732.179749 918.130432 993.693604 +3584 3584 679.994507 521.850830 764.833496 917.416321 995.075928 +3648 3648 676.976074 515.804932 765.742126 881.610901 945.287842 +3712 3712 641.856262 502.447968 766.438416 920.871216 950.095276 +3776 3776 670.322266 521.055725 758.005981 907.376648 982.342468 +3840 3840 647.868469 507.671509 754.048828 899.487976 950.288391 +3904 3904 664.707642 523.931396 746.185303 911.999329 985.176880 +3968 3968 671.114563 519.500793 764.939819 893.656128 982.824585 +4032 4032 665.139038 526.386353 749.845032 897.570068 969.283752 +4096 4096 685.134766 529.514099 761.644653 913.418091 965.152466 diff --git a/paredge/readme.txt b/paredge/readme.txt new file mode 100755 index 0000000..0bba713 --- /dev/null +++ b/paredge/readme.txt @@ -0,0 +1,6 @@ +Sample: Recursive Gaussian Filter +Minimum spec: SM 1.0 + 
+This sample implements a Gaussian blur using Deriche's recursive method. The advantage of this method is that the execution time is independent of the filter width. + +Key concepts: diff --git a/paredge/recAlg_cuda.cu b/paredge/recAlg_cuda.cu new file mode 100755 index 0000000..99938e0 --- /dev/null +++ b/paredge/recAlg_cuda.cu @@ -0,0 +1,97 @@ +/** + * @file recAlg_cuda.cuh + * @brief Recursive Algorithms in the GPU + * @author Andre Maximo + * @date Jun, 2014 + */ + +#include "recursiveGaussian.h" + +#include "alg0_cpu.h" +#include "alg5_cpu.h" +#include "gpu_aux.h" +#include "alg5_gpu.h" +#include "alg5f4_aux.h" +#include "alg5f4_gpu.h" +#include "alg5f4_gauss_gpu.h" + +#include "gpud_aux.cuh" +#include "alg4pd_gpu.cuh" +#include "alg4d_gpu.cuh" +#include "alg5d_gpu.cuh" + +/** + * @brief Run alg5f4 - Recursive Gaussian filter van Vliet (our alg gray float) + * + * @param[in,out] iod Input/output data + * @param[in] width Image width + * @param[in] height Image height + * @param[in] sigma Sigma of the Gaussian kernel + */ + +extern "C" +void runAlg5f4(float *iod, int width, int height, float sigma) { + +#if CLAMPTOEDGE + const BorderType border_type = CLAMP_TO_EDGE; + const int border = 1; // 1 block outside +#else + const BorderType border_type = CLAMP_TO_ZERO; + const int border = 0; +#endif + + Vector w1; + Vector w2; + Vector w3; + weights(sigma, w1); + weights(sigma, w2); + weights(sigma, w3); // to use r=3 change R in recursiveGaussian.h + + // For lena image in gray 512x512 + // Deriche (original): 246 MiP/s + // Deriche (row/col): 289 MiP/s + // Alg5r3: 272 MiP/s + // Alg5f5: 378 MiP/s + // Alg5f4: 418 MiP/s + + //alg0_cpu<1>(iod, width, height, w1, border, border_type); + //alg0_cpu<2>(iod, width, height, w2, border, border_type); + //alg0_cpu<3>(iod, width, height, w3, border, border_type); + //alg5_gpu(iod, width, height, w3, border, border_type); + //alg5f5_gpu(iod, width, height, w1, w2, border, border_type); + alg5f4_gpu(iod, width, height, w1, w2, border, border_type); + +} + +/** + * @brief Run alg4pd - Recursive Gaussian filter Deriche (our alg gray float) + * + * @param[in,out] iod Input/output data + * @param[in] width Image width + * @param[in] height Image height + * @param[in] sigma Sigma of the Gaussian kernel + * @param[in] order Filter order + */ + +// R in recursiveGaussian.h must be 2 + +extern "C" +void runAlg4pd(float *iod, int width, int height, float sigma, int order) { + + alg4pd_gpu(iod, width, height, sigma, order); + +} + +extern "C" +void runAlg4d(float *iod, int width, int height, float sigma, int order) { + + alg4d_gpu(iod, width, height, sigma, order); + +} + +extern "C" +void runAlg5d(float *iod, int width, int height, float sigma, int order) { + + alg5d_gpu(iod, width, height, sigma, order); + +} diff --git a/paredge/recfilter.sobj b/paredge/recfilter.sobj new file mode 100755 index 0000000..378c880 Binary files /dev/null and b/paredge/recfilter.sobj differ diff --git a/paredge/recursiveGaussian.cpp b/paredge/recursiveGaussian.cpp new file mode 100755 index 0000000..f6b5d73 --- /dev/null +++ b/paredge/recursiveGaussian.cpp @@ -0,0 +1,699 @@ +#pragma warning(disable:4819) + +/* + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. 
Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + Recursive Gaussian filter + sgreen 8/1/08 + + This code sample implements a Gaussian blur using Deriche's recursive method: + http://citeseer.ist.psu.edu/deriche93recursively.html + + This is similar to the box filter sample in the SDK, but it uses the previous + outputs of the filter as well as the previous inputs. This is also known as an + IIR (infinite impulse response) filter, since its response to an input impulse + can last forever. + + The main advantage of this method is that the execution time is independent of + the filter width. + + The GPU processes columns of the image in parallel. To avoid uncoalesced reads + for the row pass we transpose the image and then transpose it back again + afterwards. + + The implementation is based on code from the CImg library: + http://cimg.sourceforge.net/ + Thanks to David Tschumperl and all the CImg contributors! +*/ + +// OpenGL Graphics includes +#include +#if defined (__APPLE__) || defined(MACOSX) +#include +#else +#include +#endif + +// CUDA includes and interop headers +#include +#include + +// CUDA utilities and system includes +#include +#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda_runtime_api.h + +// Includes +#include +#include +#include +#include + +#include "recursiveGaussian.h" + +#define MAX(a,b) ((a > b) ? a : b) + +#define USE_SIMPLE_FILTER 0 + +#define MAX_EPSILON_ERROR 5.0f +#define THRESHOLD 0.15f + +// Define the files that are to be save and the reference images for validation +const char *sOriginal[] = +{ + "lena_10.ppm", + "lena_14.ppm", + "lena_18.ppm", + "lena_22.ppm", + NULL +}; + +const char *sReference[] = +{ + "ref_10.ppm", + "ref_14.ppm", + "ref_18.ppm", + "ref_22.ppm", + NULL +}; + +char *image_filename = (char*)"lena.ppm"; +//char *image_filename = (char*)"calhouse.ppm"; +float sigma = 4.0f; +int order = 0; +int nthreads = 256; // number of threads per block + +unsigned int width, height; +unsigned int *h_img = NULL; +unsigned int *d_img = NULL; +unsigned int *d_temp = NULL; + +float *h_gimg = NULL; +float *d_gimg = NULL; +float *d_gtemp = NULL; + +GLuint pbo = 0; // OpenGL pixel buffer object +GLuint texid = 0; // texture + +StopWatchInterface *timer = 0; + +// Auto-Verification Code +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +uint frameCount = 0; + +int *pArgc = NULL; +char **pArgv = NULL; + +bool runBenchmark = true; + +const char *sSDKsample = "CUDA Recursive Gaussian"; + +extern "C" +void gaussianFilterRGBA(unsigned int *d_src, unsigned int *d_dest, unsigned int *d_temp, int width, int height, float sigma, int order, int nthreads); + +extern "C" +void rg0(float *d_src, float *d_dest, float *d_temp, int width, int height, float sigma, int order, int nthreads); + +extern "C" +void rg1(float *d_src, float *d_dest, float *d_temp, int width, int height, float sigma, int order, int nthreads); + +extern "C" +float rgbaIntToFloat1(uint c); + +extern "C" +void convert_rgbaIntToFloat(uint *id, float *od0, float *od1, float *od2, float *od3, int w, int h); + +extern "C" +void convert_rgbaFloatToUchar(uchar *od, float *id0, float *id1, float *id2, float *id3, int w, int h); + +extern "C" +void runAlg5f4(float *id, int width, int height, float sigma); + +extern "C" +void runAlg4pd(float *id, int width, int height, float sigma, int order); 
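+/*
+  Chaining the entry points for color images: together with runAlg4d() and
+  runAlg5d() declared just below, the extern "C" functions above are used per
+  color plane when filtering a packed RGBA image: the image is unpacked into
+  four float planes, each plane is filtered, and the planes are repacked to
+  8-bit. A minimal sketch of that flow (mirroring the intent of colorAlg()
+  further down; the buffer names here are illustrative only, not part of the
+  sample):
+
+      std::vector<float> r(width*height), g(width*height),
+                         b(width*height), a(width*height);
+      convert_rgbaIntToFloat(h_img, &r[0], &g[0], &b[0], &a[0], width, height);
+      runAlg5f4(&r[0], width, height, sigma);   // one call per plane
+      runAlg5f4(&g[0], width, height, sigma);
+      runAlg5f4(&b[0], width, height, sigma);
+      runAlg5f4(&a[0], width, height, sigma);
+      uchar *out = new uchar[width*height*4];
+      convert_rgbaFloatToUchar(out, &r[0], &g[0], &b[0], &a[0], width, height);
+      // ... save or display 'out', then delete [] out;
+*/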
+ +extern "C" +void runAlg4d(float *id, int width, int height, float sigma, int order); + +extern "C" +void runAlg5d(float *id, int width, int height, float sigma, int order); + +void cleanup(); + +void computeFPS() +{ + frameCount++; + fpsCount++; + + if (fpsCount == fpsLimit) + { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "%s (sigma=%4.2f): %3.1f fps", sSDKsample, sigma, ifps); + + glutSetWindowTitle(fps); + fpsCount = 0; + + fpsLimit = MAX(ifps, 1.f); + sdkResetTimer(&timer); + } +} + +// display results using OpenGL +void display() +{ + sdkStartTimer(&timer); + + // execute filter, writing results to pbo + unsigned int *d_result; + checkCudaErrors(cudaGLMapBufferObject((void **)&d_result, pbo)); + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + checkCudaErrors(cudaGLUnmapBufferObject(pbo)); + + // load texture from pbo + glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBindTexture(GL_TEXTURE_2D, texid); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + + // display results + glClear(GL_COLOR_BUFFER_BIT); + + glEnable(GL_TEXTURE_2D); + glDisable(GL_DEPTH_TEST); + + glBegin(GL_QUADS); + glTexCoord2f(0, 1); + glVertex2f(0, 0); + glTexCoord2f(1, 1); + glVertex2f(1, 0); + glTexCoord2f(1, 0); + glVertex2f(1, 1); + glTexCoord2f(0, 0); + glVertex2f(0, 1); + glEnd(); + + glDisable(GL_TEXTURE_2D); + glutSwapBuffers(); + + sdkStopTimer(&timer); + + computeFPS(); +} + +void idle() +{ + glutPostRedisplay(); +} + +void cleanup() +{ + sdkDeleteTimer(&timer); + + checkCudaErrors(cudaFree(d_img)); + checkCudaErrors(cudaFree(d_temp)); + checkCudaErrors(cudaFree(d_gimg)); + checkCudaErrors(cudaFree(d_gtemp)); + + delete [] h_img; + delete [] h_gimg; + + if (!runBenchmark) + { + if (pbo) + { + checkCudaErrors(cudaGLUnregisterBufferObject(pbo)); + glDeleteBuffersARB(1, &pbo); + } + + if (texid) + { + glDeleteTextures(1, &texid); + } + } +} + +void keyboard(uchar key, int x, int y) +{ + switch (key) + { + case 27: + exit(EXIT_SUCCESS); + break; + + case '=': + sigma+=0.1f; + break; + + case '-': + sigma-=0.1f; + + if (sigma < 0.0) + { + sigma = 0.0f; + } + + break; + + case '+': + sigma+=1.0f; + break; + + case '_': + sigma-=1.0f; + + if (sigma < 0.0) + { + sigma = 0.0f; + } + + break; + + case '0': + order = 0; + break; + + case '1': + order = 1; + sigma = 0.5f; + break; + + case '2': + order = 2; + sigma = 0.5f; + break; + + default: + break; + } + + printf("sigma = %f\n", sigma); + glutPostRedisplay(); +} + +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); + + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); +} + +void initCudaBuffers() +{ + unsigned int size = width * height * sizeof(unsigned int); + unsigned int gsize = width * height * sizeof(float); // size for gray image + + // allocate device memory + checkCudaErrors(cudaMalloc((void **) &d_img, size)); + checkCudaErrors(cudaMalloc((void **) &d_temp, size)); + checkCudaErrors(cudaMalloc((void **) &d_gimg, gsize)); + checkCudaErrors(cudaMalloc((void **) &d_gtemp, gsize)); + + checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_gimg, h_gimg, gsize, cudaMemcpyHostToDevice)); + + sdkCreateTimer(&timer); +} + + +void initGLBuffers() +{ + // create pixel buffer object to store 
final image + glGenBuffersARB(1, &pbo); + glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(GLubyte)*4, h_img, GL_STREAM_DRAW_ARB); + + glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + checkCudaErrors(cudaGLRegisterBufferObject(pbo)); + + // create texture for display + glGenTextures(1, &texid); + glBindTexture(GL_TEXTURE_2D, texid); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); +} + +void initGL(int *argc, char **argv) +{ + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); + glutInitWindowSize(width, height); + glutCreateWindow(sSDKsample); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + glutIdleFunc(idle); + + printf("Press '+' and '-' to change filter width\n"); + printf("0, 1, 2 - change filter order\n"); + + glewInit(); + + if (!glewIsSupported("GL_VERSION_2_0 GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) + { + fprintf(stderr, "Required OpenGL extensions missing."); + exit(EXIT_FAILURE); + } +} + +void +benchmark(void) +{ + // allocate memory for result + float *d_gresult; + unsigned int size = width * height * sizeof(float); + checkCudaErrors(cudaMalloc((void **) &d_gresult, size)); + + // execute the kernel (RUNTIMES number of times) + rg0(d_gimg, d_gresult, d_gtemp, width, height, sigma, order, nthreads); + + float *rg0_img = new float[width*height]; + checkCudaErrors(cudaMemcpy(rg0_img, d_gresult, width*height*sizeof(float), cudaMemcpyDeviceToHost)); + + rg1(d_gimg, d_gresult, d_gtemp, width, height, sigma, order, nthreads); + + float *rg1_img = new float[width*height]; + checkCudaErrors(cudaMemcpy(rg1_img, d_gresult, width*height*sizeof(float), cudaMemcpyDeviceToHost)); + + // algs + int ne = width*height; + float me = 0.f, mre = 0.f; + + std::vector< float > van_img(ne), ap4_img(ne), ag4_img(ne), ag5_img(ne); + for (uint i = 0; i < ne; ++i) + van_img[i] = ap4_img[i] = ag4_img[i] = ag5_img[i] = h_gimg[i]; + + runAlg5f4(&van_img[0], width, height, sigma); + + runAlg4pd(&ap4_img[0], width, height, sigma, order); + + runAlg4d(&ag4_img[0], width, height, sigma, order); + + //runAlg5d(&ag5_img[0], width, height, sigma, order); + + static char *rg0fn = (char*)"rg0.pgm"; + static char *rg1fn = (char*)"rg1.pgm"; + static char *vanfn = (char*)"van.pgm"; + static char *ap4fn = (char*)"ap4.pgm"; + static char *ag4fn = (char*)"ag4.pgm"; + + sdkSavePGM(rg0fn, rg0_img, width, height); + sdkSavePGM(rg1fn, rg1_img, width, height); + sdkSavePGM(vanfn, &van_img[0], width, height); + sdkSavePGM(ap4fn, &ap4_img[0], width, height); + sdkSavePGM(ag4fn, &ag4_img[0], width, height); + +#if RUNTIMES==1 + std::cout << " --- Checking computations ---\n"; + check_cpu_reference( rg0_img, rg1_img, ne, me, mre ); + std::cout << " rg0 x rg1: me = " << me << " mre = " << mre << "\n"; + check_cpu_reference( rg0_img, &van_img[0], ne, me, mre ); + std::cout << " rg0 x alg5f4: me = " << me << " mre = " << mre << "\n"; + check_cpu_reference( rg0_img, &ap4_img[0], ne, me, mre ); + std::cout << " rg0 x alg4pd: me = " << me << " mre = " << mre << "\n"; + check_cpu_reference( rg0_img, &ag4_img[0], ne, me, mre ); + std::cout << " rg0 x alg4d: me = " << me << " mre = " << mre << "\n"; +#endif + + delete [] rg0_img; + delete [] rg1_img; + + 
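+    // Note on the checks above: each alternative implementation is compared
+    // against the rg0 (original Deriche sample) output on the same grayscale
+    // input; me is presumably the maximum absolute per-pixel error and mre the
+    // maximum relative error (roughly max_i |out_i - ref_i| / |ref_i|) as
+    // reported by check_cpu_reference(), so values near zero mean the
+    // algorithms agree.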
checkCudaErrors(cudaFree(d_gresult)); +} + +void +colorAlg(void) +{ + int ne = width*height; + std::vector< float > *alg_img = new std::vector< float >[4]; + + for (uint i = 0; i < 4; ++i) + { + alg_img[i].resize(ne); + } + + convert_rgbaIntToFloat(h_img, &alg_img[0][0], &alg_img[1][0], + &alg_img[2][0], &alg_img[3][0], width, height); + + for (uint i = 0; i < ne; ++i) + { + runAlg5f4(&alg_img[i][0], width, height, sigma); + std::cout << "\n"; + } + + uchar *a_result = new uchar[ne*4]; + convert_rgbaFloatToUchar(a_result, &alg_img[0][0], &alg_img[1][0], + &alg_img[2][0], &alg_img[3][0], width, height); + + static char *algppmfn = (char*)"alg.ppm"; + sdkSavePPM4ub(algppmfn, a_result, width, height); + + delete [] alg_img; +} + +bool +runSingleTest(const char *ref_file, const char *exec_path) +{ + // allocate memory for result + int nTotalErrors = 0; + unsigned int *d_result; + unsigned int size = width * height * sizeof(unsigned int); + checkCudaErrors(cudaMalloc((void **) &d_result, size)); + + // warm-up + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStartTimer(&timer); + + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("Kernel execution failed"); + sdkStopTimer(&timer); + + uchar *h_result = (uchar *)malloc(width*height*4); + checkCudaErrors(cudaMemcpy(h_result, d_result, width*height*4, cudaMemcpyDeviceToHost)); + + char dump_file[1024]; + sprintf(dump_file, "lena_%02d.ppm", (int)sigma); + sdkSavePPM4ub(dump_file, h_result, width, height); + + if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false)) + { + nTotalErrors++; + } + + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + printf("%.2f Mpixels/sec\n", (width*height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); + + checkCudaErrors(cudaFree(d_result)); + free(h_result); + + printf("Summary: %d errors!\n", nTotalErrors); + + printf(nTotalErrors == 0 ? 
"Test passed\n": "Test failed!\n"); + return(nTotalErrors == 0); +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int +main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; + char *ref_file = NULL; + +#if RUNTIMES==1 + printf("%s Starting...\n\n", sSDKsample); +#endif + + // use command-line specified CUDA device, otherwise use device with highest Gflops/s + if (argc > 1) + { + if (checkCmdLineFlag(argc, (const char **)argv, "file")) + { + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + fpsLimit = frameCheckNumber; + } + } + + // Get the path of the filename + char *filename; + + if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename)) + { + image_filename = filename; + } + + // load image or random image + char *image_path = sdkFindFilePath(image_filename, argv[0]); + + if (image_path == NULL) + { + fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); + exit(EXIT_FAILURE); + } + + bool randimg = false; + + if (checkCmdLineFlag(argc, (const char **)argv, "width")) + { + width = getCmdLineArgumentInt(argc, (const char **) argv, "width"); + randimg = true; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "height")) + { + height = getCmdLineArgumentInt(argc, (const char **) argv, "height"); + randimg = true; + } + + if (randimg) + { + image_path = (char *)"rand"; + h_img = new unsigned int[width*height*4]; + srand(1234); + for (int i = 0; i < width*height*4; ++i) + h_img[i] = (unsigned int)(rand() % 256); + } else + { + sdkLoadPPM4ub(image_path, (uchar **)&h_img, &width, &height); + } + + if (!h_img) + { + printf("Error unable to generate image file: '%s'\n", image_path); + exit(EXIT_FAILURE); + } + +#if RUNTIMES==1 + printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); +#endif + + if (checkCmdLineFlag(argc, (const char **)argv, "threads")) + { + //nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); + printf("Error unable to change number of threads!'\n"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) + { + sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma"); + } + +#if RUNTIMES==1 + printf("Sigma: %f\n", sigma); +#endif + + if (checkCmdLineFlag(argc, (const char **)argv, "order")) + { + order = getCmdLineArgumentInt(argc, (const char **) argv, "order"); + } + +#if RUNTIMES==1 + printf("Filter order: %d\n", order); +#endif + + if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) + { + runBenchmark = (bool)getCmdLineArgumentInt(argc, (const char **) argv, "benchmark"); + } + +#if RUNTIMES==1 + printf("Run benchmark: %d\n", (benchmark?1:0)); +#endif + + h_gimg = new float[width*height]; // allocate gray image + + for (uint i=0; i +#include +#include + +#include +#include +#include + +#include "recursiveGaussian.h" + +#include "recursiveGaussian_kernel.cuh" +#include "rg.cuh" + +#define USE_SIMPLE_FILTER 0 + +#define WARPSIZE 32 + +//Round a / b to nearest higher integer value +int iDivUp(int a, int b) +{ + return (a % b != 0) ? 
(a / b + 1) : (a / b); +} + +/* + Transpose a 2D array (see SDK transpose example) +*/ +template< typename T > +void transpose(T *d_src, T *d_dest, uint width, int height) +{ + dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1); + dim3 threads(BLOCK_DIM, BLOCK_DIM, 1); + d_transpose<<< grid, threads >>>(d_dest, d_src, width, height); + getLastCudaError("Kernel execution failed"); +} + +/** + * @brief Computer recursive filter parameters + * @param[out] a0 Filter param a0 + * @param[out] a1 Filter param a1 + * @param[out] a2 Filter param a2 + * @param[out] a3 Filter param a3 + * @param[out] b1 Filter param b1 + * @param[out] b2 Filter param b2 + * @param[out] coefp Filter param coefficient previous + * @param[out] coefn Filter param coefficient next + * @param[in] sigma Sigma of Gaussian + * @param[in] order Filter order +*/ +extern "C" +void computeFilterParams(float& a0, float& a1, float& a2, float& a3, + float& b1, float& b2, float& coefp, float& coefn, + const float& sigma, const int& order) +{ + // compute filter coefficients + const float + nsigma = sigma < 0.1f ? 0.1f : sigma, + alpha = 1.695f / nsigma, + ema = (float)std::exp(-alpha), + ema2 = (float)std::exp(-2*alpha); + + b1 = -2*ema; + b2 = ema2; + + a0 = a1 = a2 = a3 = coefp = coefn = 0.f; + + switch (order) + { + case 0: + { + const float k = (1-ema)*(1-ema)/(1+2*alpha*ema-ema2); + a0 = k; + a1 = k*(alpha-1)*ema; + a2 = k*(alpha+1)*ema; + a3 = -k*ema2; + } + break; + + case 1: + { + const float k = (1-ema)*(1-ema)/ema; + a0 = k*ema; + a1 = a3 = 0; + a2 = -a0; + } + break; + + case 2: + { + const float + ea = (float)std::exp(-alpha), + k = -(ema2-1)/(2*alpha*ema), + kn = (-2*(-1+3*ea-3*ea*ea+ea*ea*ea)/(3*ea+1+3*ea*ea+ea*ea*ea)); + a0 = kn; + a1 = -kn*(1+k*alpha)*ema; + a2 = kn*(1-k*alpha)*ema; + a3 = -kn*ema2; + } + break; + + default: + fprintf(stderr, "gaussianFilter: invalid order parameter!\n"); + return; + } + + coefp = (a0+a1)/(1+b1+b2); + coefn = (a2+a3)/(1+b1+b2); +} + +/* + Perform Gaussian filter on a 2D image using CUDA + + Parameters: + d_src - pointer to input image in device memory + d_dest - pointer to destination image in device memory + d_temp - pointer to temporary storage in device memory + width - image width + height - image height + sigma - sigma of Gaussian + order - filter order (0, 1 or 2) +*/ + +// 8-bit RGBA version +extern "C" +void gaussianFilterRGBA(uint *d_src, uint *d_dest, uint *d_temp, int width, int height, float sigma, int order, int nthreads) +{ + float a0, a1, a2, a3, b1, b2, coefp, coefn; + computeFilterParams(a0, a1, a2, a3, b1, b2, coefp, coefn, sigma, order); + + // process columns +#if USE_SIMPLE_FILTER + d_simpleRecursive_rgba<<< iDivUp(width, nthreads), nthreads >>>(d_src, d_temp, width, height, ema); +#else + d_recursiveGaussian_rgba<<< iDivUp(width, nthreads), nthreads >>>(d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn); +#endif + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + // process rows +#if USE_SIMPLE_FILTER + d_simpleRecursive_rgba<<< iDivUp(height, nthreads), nthreads >>>(d_dest, d_temp, height, width, ema); +#else + d_recursiveGaussian_rgba<<< iDivUp(height, nthreads), nthreads >>>(d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn); +#endif + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); +} + +// 32-bit gray version 
of gaussianFilterRGBA +extern "C" +void rg0(float *d_src, float *d_dest, float *d_temp, int width, int height, float sigma, int order, int nthreads) +{ + float a0, a1, a2, a3, b1, b2, coefp, coefn; + computeFilterParams(a0, a1, a2, a3, b1, b2, coefp, coefn, sigma, order); + + // grids (width and height) and block dimension + dim3 gdW(iDivUp(width, nthreads), 1); + dim3 gdH(iDivUp(height, nthreads), 1); + dim3 bd(WARPSIZE, nthreads/WARPSIZE); // block dimension + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("rg0", width*height*RUNTIMES, "iP"); + + for (int r=0; r>>(d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + d_rg0<<< gdH, bd >>>(d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " << std::flush; +#else + base_timer &timer_total = timers.gpu_add("rg0", width*height, "iP"); + base_timer *timer; + + timer = &timers.gpu_add("rg0-rows"); + + d_rg0<<< gdW, bd >>>(d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg0-transpose"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg0-cols"); + + d_rg0<<< gdH, bd >>>(d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg0-transpose"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); + + timer->stop(); + + timer_total.stop(); + timers.flush(); +#endif +} + +// Parallel up/down Deriche Recursive Gaussian filter +extern "C" +void rg1(float *d_src, float *d_dest, float *d_temp, int width, int height, float sigma, int order, int nthreads) +{ + float a0, a1, a2, a3, b1, b2, coefp, coefn; + computeFilterParams(a0, a1, a2, a3, b1, b2, coefp, coefn, sigma, order); + + // grids (width and height) and block dimension + dim3 gdW(iDivUp(width, nthreads)*2, 1); + dim3 gdH(iDivUp(height, nthreads)*2, 1); + dim3 bd(WARPSIZE, nthreads/WARPSIZE); + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("rg1", width*height*RUNTIMES, "iP"); + + for (int r=0; r>>(d_src, d_temp, d_dest, width, height, + a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + d_rg1<<< gdH, bd >>>(d_dest, d_temp, d_src, height, width, + a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " << std::flush; +#else + base_timer &timer_total = timers.gpu_add("rg1", width*height, "iP"); + base_timer *timer; + + timer = &timers.gpu_add("rg1-rows"); + + d_rg1<<< gdW, bd >>>(d_src, d_temp, d_dest, width, height, + a0, a1, a2, a3, b1, b2, coefp, coefn); + 
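+        // Note on the launch just above: unlike rg0, each d_rg1 block covers
+        // only bdx*bdy/2 image columns, because half of its warps run the
+        // forward (causal) pass while the other half run the reverse
+        // (anticausal) pass over the same columns (see the ty < bdy2 split in
+        // d_rg1). That is why gdW and gdH are sized with iDivUp(...)*2.
+        // For example, with the default nthreads = 256 and WARPSIZE = 32 the
+        // block is 32x8 threads: warps with ty in [0,3] do the forward pass
+        // and warps with ty in [4,7] the reverse pass over the same 128
+        // columns, so twice as many blocks are needed to span the full width.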
getLastCudaError("Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg1-transpose"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg1-cols"); + + d_rg1<<< gdH, bd >>>(d_dest, d_temp, d_src, height, width, + a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg1-transpose"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); + + timer->stop(); + + timer_total.stop(); + timers.flush(); +#endif +} diff --git a/paredge/recursiveGaussian_kernel.cuh b/paredge/recursiveGaussian_kernel.cuh new file mode 100755 index 0000000..9d4420b --- /dev/null +++ b/paredge/recursiveGaussian_kernel.cuh @@ -0,0 +1,244 @@ +/* + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + Recursive Gaussian filter +*/ + +#ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_ +#define _RECURSIVEGAUSSIAN_KERNEL_CU_ + +#include +#include +#include +#include +#include + +#define BLOCK_DIM 16 + +// Transpose kernel (see transpose SDK sample for details) +template< typename T > +__global__ void d_transpose(T *odata, T *idata, int width, int height) +{ + __shared__ T block[BLOCK_DIM][BLOCK_DIM+1]; + + // read the matrix tile into shared memory + unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x; + unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y; + + if ((xIndex < width) && (yIndex < height)) + { + unsigned int index_in = yIndex * width + xIndex; + block[threadIdx.y][threadIdx.x] = idata[index_in]; + } + + __syncthreads(); + + // write the transposed matrix tile to global memory + xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x; + yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y; + + if ((xIndex < height) && (yIndex < width)) + { + unsigned int index_out = yIndex * height + xIndex; + odata[index_out] = block[threadIdx.x][threadIdx.y]; + } +} + +// RGBA version +// reads from 32-bit uint array holding 8-bit RGBA + +// convert floating point rgba color to 32-bit integer +__device__ uint rgbaFloatToInt(float4 rgba) +{ + rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] + rgba.y = __saturatef(rgba.y); + rgba.z = __saturatef(rgba.z); + rgba.w = __saturatef(rgba.w); + return (uint(rgba.w*255)<<24) | (uint(rgba.z*255)<<16) | (uint(rgba.y*255)<<8) | uint(rgba.x*255); +} + +// convert from 32-bit int to float4 +__host__ __device__ float4 rgbaIntToFloat(uint c) +{ + float4 rgba; + rgba.x = (c & 0xff) / 255.0f; + rgba.y = ((c>>8) & 0xff) / 255.0f; + rgba.z = ((c>>16) & 0xff) / 255.0f; + rgba.w = ((c>>24) & 0xff) / 255.0f; + return rgba; +} + +extern "C" +__host__ float rgbaIntToFloat1(uint c) +{ + float4 rgba = rgbaIntToFloat(c); + return (rgba.x+rgba.y+rgba.z+rgba.w)/4.f; +} + +extern "C" +__host__ void convert_rgbaIntToFloat(uint *id, float *od0, float *od1, float *od2, float *od3, int w, int h) +{ + float4 rgba; + for (int i = 0; i < w*h; ++i) { + rgba = rgbaIntToFloat(id[i]); + od0[i] = rgba.x; + od1[i] = rgba.y; + od2[i] = rgba.z; + od3[i] = rgba.w; + } +} + +__host__ float saturatef(const float& v) { + if (v > 1.f) return 1.f; + 
else if (v < 0.f) return 0.f; + return v; +} + +extern "C" +__host__ void convert_rgbaFloatToUchar(uchar *od, float *id0, float *id1, float *id2, float *id3, int w, int h) +{ + for (int i = 0; i < w*h; ++i) { + od[i*4+0] = uchar(saturatef(id0[i])*255); + od[i*4+1] = uchar(saturatef(id1[i])*255); + od[i*4+2] = uchar(saturatef(id2[i])*255); + od[i*4+3] = uchar(saturatef(id3[i])*255); + } +} + +/* + simple 1st order recursive filter + - processes one image column per thread + + parameters: + id - pointer to input data (RGBA image packed into 32-bit integers) + od - pointer to output data + w - image width + h - image height + a - blur parameter +*/ + +__global__ void +d_simpleRecursive_rgba(uint *id, uint *od, int w, int h, float a) +{ + unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + + if (x >= w) return; + + id += x; // advance pointers to correct column + od += x; + + // forward pass + float4 yp = rgbaIntToFloat(*id); // previous output + + for (int y = 0; y < h; y++) + { + float4 xc = rgbaIntToFloat(*id); + float4 yc = xc + a*(yp - xc); // simple lerp between current and previous value + *od = rgbaFloatToInt(yc); + id += w; + od += w; // move to next row + yp = yc; + } + + // reset pointers to point to last element in column + id -= w; + od -= w; + + // reverse pass + // ensures response is symmetrical + yp = rgbaIntToFloat(*id); + + for (int y = h-1; y >= 0; y--) + { + float4 xc = rgbaIntToFloat(*id); + float4 yc = xc + a*(yp - xc); + *od = rgbaFloatToInt((rgbaIntToFloat(*od) + yc)*0.5f); + id -= w; + od -= w; // move to previous row + yp = yc; + } +} + +/* + recursive Gaussian filter (RGBA uint) + + parameters: + id - pointer to input data (RGBA image packed into 32-bit integers) + od - pointer to output data + w - image width + h - image height + a0-a3, b1, b2, coefp, coefn - filter parameters +*/ + +__global__ void +d_recursiveGaussian_rgba(uint *id, uint *od, int w, int h, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn) +{ + unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + + if (x >= w) return; + + id += x; // advance pointers to correct column + od += x; + + // forward pass + float4 xp = make_float4(0.0f); // previous input + float4 yp = make_float4(0.0f); // previous output + float4 yb = make_float4(0.0f); // previous output by 2 +#if CLAMPTOEDGE + xp = rgbaIntToFloat(*id); + yb = coefp*xp; + yp = yb; +#endif + + for (int y = 0; y < h; y++) + { + float4 xc = rgbaIntToFloat(*id); + float4 yc = a0*xc + a1*xp - b1*yp - b2*yb; + *od = rgbaFloatToInt(yc); + id += w; + od += w; // move to next row + xp = xc; + yb = yp; + yp = yc; + } + + // reset pointers to point to last element in column + id -= w; + od -= w; + + // reverse pass + // ensures response is symmetrical + float4 xn = make_float4(0.0f); + float4 xa = make_float4(0.0f); + float4 yn = make_float4(0.0f); + float4 ya = make_float4(0.0f); +#if CLAMPTOEDGE + xn = xa = rgbaIntToFloat(*id); + yn = coefn*xn; + ya = yn; +#endif + + for (int y = h-1; y >= 0; y--) + { + float4 xc = rgbaIntToFloat(*id); + float4 yc = a2*xn + a3*xa - b1*yn - b2*ya; + xa = xn; + xn = xc; + ya = yn; + yn = yc; + *od = rgbaFloatToInt(rgbaIntToFloat(*od) + yc); + id -= w; + od -= w; // move to previous row + } +} + +#endif // #ifndef _GAUSSIAN_KERNEL_H_ diff --git a/paredge/results/calhouse2.xcf b/paredge/results/calhouse2.xcf new file mode 100755 index 0000000..7278b67 Binary files /dev/null and b/paredge/results/calhouse2.xcf differ diff --git a/paredge/results/g0s16.png b/paredge/results/g0s16.png new 
file mode 100755 index 0000000..3531f36 Binary files /dev/null and b/paredge/results/g0s16.png differ diff --git a/paredge/results/g1s02.png b/paredge/results/g1s02.png new file mode 100755 index 0000000..e12ed86 Binary files /dev/null and b/paredge/results/g1s02.png differ diff --git a/paredge/results/g2s02.png b/paredge/results/g2s02.png new file mode 100755 index 0000000..6124314 Binary files /dev/null and b/paredge/results/g2s02.png differ diff --git a/paredge/rg.cuh b/paredge/rg.cuh new file mode 100755 index 0000000..4f668d9 --- /dev/null +++ b/paredge/rg.cuh @@ -0,0 +1,176 @@ +/** + * @file rg.cuh + * @brief Recursive Gaussian (Deriche) filter from the original CUDA sample + * @author Andre Maximo + * @date May, 2014 + * @copyright GE Proprietary + */ + +/** + * @brief Recursive Gaussian Deriche filter + * @param[in] id Input data + * @param[out] od Output data + * @param[in] w Image width + * @param[in] h Image height + * @param[in] a0-a3, b1, b2, coefp, coefn - filter parameters +*/ + +__global__ void +d_rg0(float *id, float *od, int w, int h, + float a0, float a1, float a2, float a3, + float b1, float b2, float coefp, float coefn) { + + int tx = threadIdx.x, ty = threadIdx.y, bx = blockIdx.x, bdx = blockDim.x, bdy = blockDim.y; + int bdxy = bdx*bdy; + unsigned int x = bx*(bdxy) + ty*bdx + tx; + + if (x >= w) return; + + id += x; // advance pointers to correct column + od += x; + + // forward pass + float xp = 0.f; // previous input + float yp = 0.f; // previous output + float yb = 0.f; // previous output by 2 +#if CLAMPTOEDGE + xp = *id; + yb = coefp*xp; + yp = yb; +#endif + + for (int y = 0; y < h; y++) { + float xc = *id; + float yc = a0*xc + a1*xp - b1*yp - b2*yb; + *od = yc; + id += w; + od += w; // move to next row + xp = xc; + yb = yp; + yp = yc; + } + + // reset pointers to point to last element in column + id -= w; + od -= w; + + // reverse pass + // ensures response is symmetrical + float xn = 0.f; + float xa = 0.f; + float yn = 0.f; + float ya = 0.f; +#if CLAMPTOEDGE + xn = xa = *id; + yn = coefn*xn; + ya = yn; +#endif + + for (int y = h-1; y >= 0; y--) { + float xc = *id; + float yc = a2*xn + a3*xa - b1*yn - b2*ya; + xa = xn; + xn = xc; + ya = yn; + yn = yc; + *od = *od + yc; + id -= w; + od -= w; // move to previous row + } +} + +__global__ void +d_rg1(float *id, float *od, float *td, int w, int h, + float a0, float a1, float a2, float a3, + float b1, float b2, float coefp, float coefn) { + + int tx = threadIdx.x, ty = threadIdx.y, bx = blockIdx.x, bdx = blockDim.x, bdy = blockDim.y; + int bdxy2 = bdx*bdy/2, bdy2 = bdy/2, tyby2 = ty % bdy2; + unsigned int x = bx*bdxy2 + tyby2*bdx + tx; + + if (x >= w) return; + + id += x; // advance pointers to correct column + od += x; + td += x; + + if (ty < bdy2) { + + // forward pass + float xp = 0.f; // previous input + float yp = 0.f; // previous output + float yb = 0.f; // previous output by 2 +#if CLAMPTOEDGE + xp = *id; + yb = coefp*xp; + yp = yb; +#endif + + for (int y = 0; y < h; y++) { + float xc = *id; + float yc = a0*xc + a1*xp - b1*yp - b2*yb; + *od = yc; + id += w; + od += w; // move to next row + xp = xc; + yb = yp; + yp = yc; + } + + od -= w; + td += w*(h-1); + + } else { + + id += w*(h-1); + td += w*(h-1); // advance pointers to last row + + // reverse pass + // does NOT by itself ensure the response is symmetrical; the merge below adds both passes + float xn = 0.f; + float xa = 0.f; + float yn = 0.f; + float ya = 0.f; +#if CLAMPTOEDGE + xn = xa = *id; + yn = coefn*xn; + ya = yn; +#endif + + for (int y = h-1; y >= 0; y--) { + float xc = *id; + float yc = a2*xn + a3*xa - b1*yn -
b2*ya; + xa = xn; + xn = xc; + ya = yn; + yn = yc; + *td = yc; + id -= w; + td -= w; // move to previous row + } + + td += w; + + } + + __syncthreads(); + + if (ty < bdy2) { + + for (int y = 0; y < (h+1)/2; y++) { + *od = *od + *td; + od -= w; + td -= w; + } + + } else { + + for (int y = 0; y < h/2; y++) { + *od = *od + *td; + od += w; + td += w; + } + + } + +} diff --git a/paredge/run.sh b/paredge/run.sh new file mode 100755 index 0000000..cb1ef66 --- /dev/null +++ b/paredge/run.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +for i in $(seq 1 64); +do + echo -n "$(($i * 64)) $(($i * 64)) " + $1 -width=$(($i * 64)) -height=$(($i * 64)) + echo +done diff --git a/paredge/sage_session.sobj b/paredge/sage_session.sobj new file mode 100755 index 0000000..aecb401 Binary files /dev/null and b/paredge/sage_session.sobj differ
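Note on the rg.cuh kernels: the two half-warp branches of d_rg1 compute the causal and anti-causal halves of the same Deriche recursion that d_rg0 runs sequentially, and the loops after __syncthreads() simply add the two partial results. The sketch below is a minimal host-side, single-column reference of that computation; it is not part of the patch, and the function name deriche_column_ref and the clampToEdge flag are illustrative only (clampToEdge stands in for the CLAMPTOEDGE compile-time switch, and id/od point to the top of one column of a row-major w-by-h float image). It can be used to check the GPU output column by column.

#include <vector>

// Host reference for one image column (stride w, h rows), mirroring d_rg0/d_rg1:
//   forward :  y[n] = a0*x[n] + a1*x[n-1] - b1*y[n-1] - b2*y[n-2]
//   backward:  r[n] = a2*x[n+1] + a3*x[n+2] - b1*r[n+1] - b2*r[n+2]
//   output  :  od[n] = y[n] + r[n]
void deriche_column_ref(const float *id, float *od, int w, int h,
                        float a0, float a1, float a2, float a3,
                        float b1, float b2, float coefp, float coefn,
                        bool clampToEdge)
{
    std::vector<float> fwd(h), bwd(h);

    // causal (top-to-bottom) pass
    float xp = 0.f, yp = 0.f, yb = 0.f;            // x[n-1], y[n-1], y[n-2]
    if (clampToEdge) { xp = id[0]; yb = coefp*xp; yp = yb; }
    for (int y = 0; y < h; ++y) {
        float xc = id[y*w];
        float yc = a0*xc + a1*xp - b1*yp - b2*yb;
        fwd[y] = yc;
        xp = xc; yb = yp; yp = yc;
    }

    // anti-causal (bottom-to-top) pass
    float xn = 0.f, xa = 0.f, yn = 0.f, ya = 0.f;  // x[n+1], x[n+2], r[n+1], r[n+2]
    if (clampToEdge) { xn = xa = id[(h-1)*w]; yn = coefn*xn; ya = yn; }
    for (int y = h-1; y >= 0; --y) {
        float xc = id[y*w];
        float yc = a2*xn + a3*xa - b1*yn - b2*ya;
        xa = xn; xn = xc; ya = yn; yn = yc;
        bwd[y] = yc;
    }

    // merge: the symmetric response is the sum of the two passes,
    // which is what the accumulation loops at the end of d_rg1 compute
    for (int y = 0; y < h; ++y)
        od[y*w] = fwd[y] + bwd[y];
}

Rows are handled the same way as in the rg1 host path earlier in this patch: filter along columns, transpose, filter the transposed image, and transpose back.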