diff --git a/paredge/Makefile b/paredge/Makefile new file mode 100755 index 0000000..68a5e50 --- /dev/null +++ b/paredge/Makefile @@ -0,0 +1,163 @@ +################################################################################ +# +# Copyright 1993-2013 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +include ./findcudalib.mk + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +CUDA_SAMPLES_PATH ?= $(CUDA_PATH)/samples +BOUNDARY_INC_PATH ?= -I/home/andmax/boundary/lib -I/home/andmax/boundary/src +BOUNDARY_LIB_PATH ?= -L/home/andmax/boundary/build/lib/util +ADDITIONAL_LIBS ?= -lutil + +# internal flags +NVCCFLAGS := -m${OS_SIZE} +CCFLAGS := +NVCCLDFLAGS := +LDFLAGS := + +# Extra user flags +EXTRA_NVCCFLAGS ?= +EXTRA_NVCCLDFLAGS ?= +EXTRA_LDFLAGS ?= +EXTRA_CCFLAGS ?= + +# OS-specific build flags +ifneq ($(DARWIN),) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(OS_ARCH) $(STDLIB) +else + ifeq ($(OS_ARCH),armv7l) + ifeq ($(abi),gnueabi) + CCFLAGS += -mfloat-abi=softfp + else + # default to gnueabihf + override abi := gnueabihf + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard + endif + endif +endif + +ifeq ($(ARMv7),1) +NVCCFLAGS += -target-cpu-arch ARM +ifneq ($(TARGET_FS),) +CCFLAGS += --sysroot=$(TARGET_FS) +LDFLAGS += --sysroot=$(TARGET_FS) +LDFLAGS += -rpath-link=$(TARGET_FS)/lib +LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib +LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-$(abi) +endif +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + TARGET := debug +else + TARGET := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(NVCCLDFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += 
$(EXTRA_NVCCLDFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I$(CUDA_SAMPLES_PATH)/common/inc $(BOUNDARY_INC_PATH) +LIBRARIES := $(BOUNDARY_LIB_PATH) + +################################################################################ + +# Makefile include to help find GL Libraries +EXEC ?= +include ./findgllib.mk + +# OpenGL specific libraries +ifneq ($(DARWIN),) + # Mac OSX specific libraries and paths to include + LIBRARIES += -L/System/Library/Frameworks/OpenGL.framework/Libraries + LIBRARIES += -lGL -lGLU $(CUDA_SAMPLES_PATH)/common/lib/darwin/libGLEW.a + ALL_LDFLAGS += -Xlinker -framework -Xlinker GLUT +else + LIBRARIES += -L$(CUDA_SAMPLES_PATH)/common/lib/$(OSLOWER)/$(OS_ARCH) $(GLLINK) + LIBRARIES += -lGL -lGLU -lX11 -lXi -lXmu -lglut -lGLEW $(ADDITIONAL_LIBS) +endif + +# CUDA code generation flags +ifneq ($(OS_ARCH),armv7l) +GENCODE_SM10 := -gencode arch=compute_10,code=sm_10 +endif +GENCODE_SM20 := -gencode arch=compute_20,code=sm_20 +GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 +GENCODE_SM35 := -gencode arch=compute_35,code=\"sm_35,compute_35\" +#GENCODE_FLAGS := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30) $(GENCODE_SM35) +GENCODE_FLAGS := -arch=sm_30 + +################################################################################ + +# Target rules +all: build + +build: recursiveGaussian + +recursiveGaussian_cuda.o: recursiveGaussian_cuda.cu recursiveGaussian_kernel.cuh recursiveGaussian.h + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +recAlg_cuda.o: recAlg_cuda.cu recursiveGaussian.h alg4d_gpu.cuh alg5d_gpu.cuh + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +main.o: recursiveGaussian.cpp recursiveGaussian.h + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +recursiveGaussian: recursiveGaussian_cuda.o recAlg_cuda.o main.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) -o $@ $+ $(LIBRARIES) + +run: build + $(EXEC) ./recursiveGaussian + +clean: + $(EXEC) rm -f recursiveGaussian_cuda.o recAlg_cuda.o main.o recursiveGaussian *.pgm + +clobber: clean diff --git a/paredge/README.md b/paredge/README.md new file mode 100755 index 0000000..df575fc --- /dev/null +++ b/paredge/README.md @@ -0,0 +1,4 @@ +paredge +======= + +Parallel Edge Detection diff --git a/paredge/alg4d_gpu.cuh b/paredge/alg4d_gpu.cuh new file mode 100755 index 0000000..347dd56 --- /dev/null +++ b/paredge/alg4d_gpu.cuh @@ -0,0 +1,688 @@ +/** + * @file alg4d_gpu.cuh + * @brief Algorithm 4 in the GPU (Deriche version) with borders + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef ALG4D_GPU_CUH +#define ALG4D_GPU_CUH + +#define USE_FUSION true + +//=== IMPLEMENTATION ============================================================ + +// Alg4 (Deriche) GPU ----------------------------------------------------------- + +// Collect carries -------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWC, NBCW) +void alg4d_collect_carries( Matrix *g_py1bar, + Matrix *g_ey2bar, + Matrix *g_px, + Matrix *g_ex, + float inv_width, float inv_height, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float xp = 0.f; // previous x + 
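For reference, the per-row recursion that the collect-carries and write-results kernels block-parallelize is the classic two-pass Deriche filter: a causal pass producing y1, an anticausal pass producing y2, and their sum z. The following is a minimal single-row CPU sketch, not part of the library; it assumes the coefficient layout [a0, a1, a2, a3, coefp, coefn] and weights {1, b1, b2} passed to computeFilterParams() further down, and assumes fwdI/revI apply the feedback terms -b1*y[i-1] - b2*y[i-2]. The helper name deriche_row_reference is hypothetical.

#include <vector>

// Minimal single-row CPU reference (illustrative only, under the assumptions above).
// Causal pass:      y1[i] = a0*x[i]   + a1*x[i-1] - b1*y1[i-1] - b2*y1[i-2]
// Anticausal pass:  y2[i] = a2*x[i+1] + a3*x[i+2] - b1*y2[i+1] - b2*y2[i+2]
// Output:           z[i]  = y1[i] + y2[i]
// Zero initial state corresponds to the zero-border path; the CLAMPTOEDGE branches
// instead seed the prologue/epilogue with coefp*x[0] and coefn*x[n-1].
static void deriche_row_reference(const float* x, float* z, int n,
                                  const float coefs[6], const float w[3])
{
    std::vector<float> y1(n), y2(n);
    float xp = 0.f, yp = 0.f, ypp = 0.f;            // scan left -> right
    for (int i = 0; i < n; ++i) {
        y1[i] = coefs[0]*x[i] + coefs[1]*xp - w[1]*yp - w[2]*ypp;
        xp = x[i]; ypp = yp; yp = y1[i];
    }
    float xn = 0.f, xa = 0.f, yn = 0.f, ya = 0.f;   // scan right -> left
    for (int i = n-1; i >= 0; --i) {
        y2[i] = coefs[2]*xn + coefs[3]*xa - w[1]*yn - w[2]*ya;
        xa = xn; xn = x[i]; ya = yn; yn = y2[i];
    }
    for (int i = 0; i < n; ++i) z[i] = y1[i] + y2[i];
}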
Vector py1; // current P(Y_1) + Vector px; // this block P(X) + + if (m==0) { // 1st block +#if CLAMPTOEDGE + xp = x[0]; // clamp to edge + py1[0] = c_coefs[4]*xp; + px[0] = xp; +#pragma unroll + for (int i=1; i(); + +#pragma unroll + for (int i=0; i right + for (int j=0; j ey2; // current E(Y_2) + Vector ex; // this block E(X) + + if (m==m_size-1) { // last block +#if CLAMPTOEDGE + xn = xa = x[WS-1]; // clamp to edge + ey2[0] = c_coefs[5]*xn; + ex[0] = xn; +#pragma unroll + for (int i=1; i(); + +#pragma unroll + for (int i=0; i left + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], ey2, c_weights); + xa = xn; + xn = xc; + } + + g_ey2bar[n*(m_size+1)+m].set_col(tx, ey2); + g_ex[n*(m_size+1)+m].set_col(tx, ex); + + } + +} + +// Adjust carries --------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWA, NBA) +void alg4d_adjust_carries( Matrix *g_py1bar, + Matrix *g_ey2bar, + Matrix *g_px, + Matrix *g_ex, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m, n = blockIdx.y; + Matrix *gpy1bar, *gey2bar; + Vector py1, ey2, py1bar, ey2bar; + __shared__ Matrix spy1ey2bar[NWA]; + + Matrix *gpx, *gex; + Vector px, ex; + __shared__ Matrix spxex[NWA]; + + // P(y1bar) -> P(y1) processing --------------------------------------------- + m = 0; + gpy1bar = (Matrix *)&g_py1bar[n*(m_size+1)+m+ty+1][0][tx]; + gpx = (Matrix *)&g_px[n*(m_size+1)+m+ty][0][tx]; + if (ty==0) // for boundary condition + py1 = g_py1bar[n*(m_size+1)+0].col(tx); + + // more parallelism here is not important + for (; m < m_size; m += NWA) { // for all image blocks + + if (m+ty < m_size) { // using smem as cache + spy1ey2bar[ty].set_col(tx, gpy1bar->col(0)); + spxex[ty].set_col(tx, gpx->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust py1bar left -> right + for (int w = 0; w < NWA; ++w) { + + py1bar = spy1ey2bar[w].col(tx); + px = spxex[w].col(tx); + + py1 = py1bar + py1 * c_AbF_T + px * c_TAFB_AC1P_T; + + spy1ey2bar[w].set_col(tx, py1); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m+ty < m_size) gpy1bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gpy1bar += NWA; + gpx += NWA; + + } + + // E(y2bar) -> E(y2) processing ----------------------------------------------- + m = m_size-1; + gey2bar = (Matrix *)&g_ey2bar[n*(m_size+1)+m-ty][0][tx]; + gex = (Matrix *)&g_ex[n*(m_size+1)+m+1-ty][0][tx]; + if (ty==0) // for boundary condition + ey2 = g_ey2bar[n*(m_size+1)+m_size].col(tx); + + for (; m >= 0; m -= NWA) { // for all image blocks + + // using smem as cache + if (m-ty >= 0) { + spy1ey2bar[ty].set_col(tx, gey2bar->col(0)); + spxex[ty].set_col(tx, gex->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust ey2bar right -> left + for (int w = 0; w < NWA; ++w) { + + ey2bar = spy1ey2bar[w].col(tx); + ex = spxex[w].col(tx); + + ey2 = ey2bar + ey2 * c_AbR_T + ex * c_HARB_AC2E_T; + + spy1ey2bar[w].set_col(tx, ey2); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m-ty >= 0) gey2bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gey2bar -= NWA; + gex -= NWA; + + } + +} + +// Write results transpose ------------------------------------------------------ + +template +__global__ __launch_bounds__(WS*NWW, NBCW) +void alg4d_write_results_transp( float *g_transp_out, + const Matrix *g_rows_py1, + const Matrix *g_rows_ey2, + Matrix *g_cols_py1, + Matrix *g_cols_ey2, + const Matrix 
*g_rows_px, + const Matrix *g_rows_ex, + Matrix *g_cols_pz, + Matrix *g_cols_ez, + float inv_width, float inv_height, + int m_size, int n_size, + int out_stride ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + Matrix + *py1m1 = (Matrix*)&g_rows_py1[n*(m_size+1)+m][0][tx], + *ey2m1 = (Matrix*)&g_rows_ey2[n*(m_size+1)+m+1][0][tx]; + + Matrix + *pxm1 = (Matrix*)&g_rows_px[n*(m_size+1)+m][0][tx], + *exm1 = (Matrix*)&g_rows_ex[n*(m_size+1)+m+1][0][tx]; + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float xp; // previous x + Vector py1 = py1m1->col(0); // current P(Y_1) + Vector px = pxm1->col(0); // previous block P(X) + + xp = px[0]; + +#pragma unroll // calculate block, scan left -> right + for (int j=0; j ey2 = ey2m1->col(0); // current E(Y_2) + Vector ex = exm1->col(0); // next block E(X) + + xn = ex[0]; + if (R==1) xa = xn; + else xa = ex[1]; + +#pragma unroll // calculate block, scan right -> left + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], ey2, c_weights); + xa = xn; + xn = xc; + } + + } + + // preparing for adding y1 and y2 + + __syncthreads(); // guarantee no smem usage + + if (ty==1) { + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; // save partial result in smem + + } + + __syncthreads(); // guarantee warp 0 can read smem + + if (ty==0) { // now add + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] += block[tx][i]; // x now has z + + g_transp_out += m*WS*out_stride + n*WS + tx; +#pragma unroll // write block inside valid transpose image + for (int i=0; i + &pu1bar = (Matrix&)g_cols_py1[m*(n_size+1)+n+1][0][tx]; + + float zp = 0.f; // previous z + Vector pu1; // current P(U_1) + Vector pz; // this block P(Z) + + if (n==0) { // 1st block +#if CLAMPTOEDGE + zp = x[0]; // clamp to edge + pu1[0] = c_coefs[4]*zp; + pz[0] = zp; +#pragma unroll + for (int i=1; i(); + +#pragma unroll + for (int i=0; i right + for (int j=0; j + &eu2bar = (Matrix&)g_cols_ey2[m*(n_size+1)+n][0][tx]; + + float zn = 0.f, za = 0.f; // next and after next z + Vector eu2; // current E(U_2) + Vector ez; // this block E(Z) + + if (n==n_size-1) { // last block +#if CLAMPTOEDGE + zn = za = x[WS-1]; // clamp to edge + eu2[0] = c_coefs[5]*zn; + ez[0] = zn; +#pragma unroll + for (int i=1; i(); + +#pragma unroll + for (int i=0; i left + for (int j=WS-1; j>=0; --j) { + float zc = x[j]; // current z + x[j] = c_coefs[2]*zn + c_coefs[3]*za; + x[j] = revI(x[j], eu2, c_weights); + za = zn; + zn = zc; + } + + eu2bar.set_col(0, eu2); // store eu2bar cols + g_cols_ez[m*(n_size+1)+n].set_col(tx, ez); + + } + + } // if fusion + +} + +// Alg4 Deriche in the GPU ------------------------------------------------------ + +__host__ +void alg4d_gpu( float *h_img, int width, int height, + float sigma, int order ) { + + const int B = WS; + if (R != 2) { std::cout << "R must be 2!\n"; return; } + + Vector coefs; // [a0-a3, coefp, coefn] + Vector w; // weights + w[0] = 1.f; + + computeFilterParams(coefs[0], coefs[1], coefs[2], coefs[3], + w[1], w[2], coefs[4], coefs[5], + sigma, order); + + Vector ck1; // convolution kernel 1 (plus) + ck1[0] = coefs[0]; ck1[1] = coefs[1]; ck1[2] = 0.f; + Vector ck2; // convolution kernel 2 (minus) + ck2[0] = 0.f; ck2[1] = coefs[2]; ck2[2] = coefs[3]; + + // pre-compute 
basic alg4 Deriche matrices + Matrix Ir = identity(); + Matrix Zrb = zeros(); + Matrix Zbr = zeros(); + Matrix Ib = identity(); + + Matrix AFP_T = fwd(Ir, Zrb, w), ARE_T = rev(Zrb, Ir, w); + Matrix AFB_T = fwd(Zbr, Ib, w), ARB_T = rev(Ib, Zbr, w); + Matrix AbF_T = tail(AFP_T), AbR_T = head(ARE_T); + Matrix TAFB_T = tail(AFB_T), HARB_T = head(ARB_T); + + Matrix AC1P_T = transp(conv1p(ck1)); + Matrix AC2E_T = transp(conv2e(ck2)); + + Matrix TAFB_AC1P_T = AC1P_T*TAFB_T; + Matrix HARB_AC2E_T = AC2E_T*HARB_T; + + int m_size = (width+WS-1)/WS, n_size = (height+WS-1)/WS; + + // upload to the GPU + copy_to_symbol(c_weights, w); + copy_to_symbol(c_coefs, coefs); + + copy_to_symbol(c_AbF_T, AbF_T); + copy_to_symbol(c_AbR_T, AbR_T); + + copy_to_symbol(c_TAFB_AC1P_T, TAFB_AC1P_T); + copy_to_symbol(c_HARB_AC2E_T, HARB_AC2E_T); + + float inv_width = 1.f/width, inv_height = 1.f/height; + + cudaArray *a_in; + size_t offset; + cudaChannelFormatDesc ccd = cudaCreateChannelDesc(); + cudaMallocArray(&a_in, &ccd, width, height); + cudaMemcpyToArray(a_in, 0, 0, h_img, width*height*sizeof(float), + cudaMemcpyHostToDevice); + + t_in.normalized = true; + t_in.filterMode = cudaFilterModePoint; + t_in.addressMode[0] = t_in.addressMode[1] = cudaAddressModeBorder; + + // naming convention: stride in elements and pitch in bytes + // to achieve at the same time coalescing and L2-cache colision avoidance + // the transposed and regular output images have stride of maximum size + // (4Ki) + 1 block (WS), considering up to 4096x4096 input images + int stride_img = MAXSIZE+WS, stride_transp_img = MAXSIZE+WS; + dvector d_img(height*stride_img), d_transp_img(width*stride_transp_img); + + // +1 padding is important even in zero-border to avoid if's in kernels + dvector< Matrix > + d_rows_py1bar((m_size+1)*n_size), d_rows_ey2bar((m_size+1)*n_size), + d_cols_py1bar((n_size+1)*m_size), d_cols_ey2bar((n_size+1)*m_size); + + dvector< Matrix > + d_rows_px((m_size+1)*n_size), d_rows_ex((m_size+1)*n_size), + d_cols_pz((n_size+1)*m_size), d_cols_ez((n_size+1)*m_size); + + d_rows_px.fillzero(); + d_rows_ex.fillzero(); + d_cols_pz.fillzero(); + d_cols_ez.fillzero(); + + d_rows_py1bar.fillzero(); + d_rows_ey2bar.fillzero(); + d_cols_py1bar.fillzero(); + d_cols_ey2bar.fillzero(); + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("alg4d_gpu", width*height*RUNTIMES, "iP"); + + for(int r=0; r>> + ( &d_rows_py1bar, &d_rows_ey2bar, &d_rows_px, &d_rows_ex, inv_width, inv_height, m_size ); + + alg4d_adjust_carries<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, &d_rows_px, &d_rows_ex, m_size ); + + alg4d_write_results_transp<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_transp_img, &d_rows_py1bar, &d_rows_ey2bar, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_px, &d_rows_ex, &d_cols_pz, &d_cols_ez, + inv_width, inv_height, m_size, n_size, stride_transp_img ); + + cudaUnbindTexture(t_in); + cudaBindTexture2D(&offset, t_in, d_transp_img, height, width, stride_transp_img*sizeof(float)); + + alg4d_adjust_carries<<< dim3(1, m_size), dim3(WS, NWA) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, &d_cols_pz, &d_cols_ez, n_size ); + + alg4d_write_results_transp<<< dim3(n_size, m_size), dim3(WS, NWW) >>> + ( d_img, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_py1bar, &d_cols_ey2bar, &d_cols_pz, &d_cols_ez, &d_rows_px, &d_rows_ex, + inv_height, inv_width, n_size, m_size, stride_img ); + + cudaUnbindTexture(t_in); + + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " 
<< std::flush; +#else + base_timer &timer_total = timers.gpu_add("alg4d_gpu", width*height, "iP"); + base_timer *timer; + + cudaBindTextureToArray(t_in, a_in); + + timer = &timers.gpu_add("collect-carries"); + + alg4d_collect_carries<<< dim3(m_size, n_size), dim3(WS, NWC) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, + &d_rows_px, &d_rows_ex, + inv_width, inv_height, m_size ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries"); + + alg4d_adjust_carries<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, + &d_rows_px, &d_rows_ex, + m_size ); + + timer->stop(); timer = &timers.gpu_add("write-block-collect-carries"); + + alg4d_write_results_transp<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_transp_img, &d_rows_py1bar, &d_rows_ey2bar, &d_cols_py1bar, &d_cols_ey2bar, + &d_rows_px, &d_rows_ex, &d_cols_pz, &d_cols_ez, + inv_width, inv_height, m_size, n_size, stride_transp_img ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries"); + + cudaUnbindTexture(t_in); + cudaBindTexture2D(&offset, t_in, d_transp_img, height, width, stride_transp_img*sizeof(float)); + + if (!USE_FUSION) { // it seems faster to not use FUSION - not quite + alg4d_collect_carries<<< dim3(n_size, m_size), dim3(WS, NWC) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, + &d_cols_pz, &d_cols_ez, + inv_height, inv_width, n_size ); + } + + alg4d_adjust_carries<<< dim3(1, m_size), dim3(WS, NWA) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, + &d_cols_pz, &d_cols_ez, + n_size ); + + timer->stop(); timer = &timers.gpu_add("write-block"); + + alg4d_write_results_transp<<< dim3(n_size, m_size), dim3(WS, NWW) >>> + ( d_img, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_py1bar, &d_rows_ey2bar, + &d_cols_pz, &d_cols_ez, &d_rows_px, &d_rows_ex, + inv_height, inv_width, n_size, m_size, stride_img ); + + timer->stop(); + + cudaUnbindTexture(t_in); + + timer_total.stop(); + timers.flush(); +#endif + cudaMemcpy2D(h_img, width*sizeof(float), d_img, stride_img*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToHost); + +} + +#endif // ALG4D_GPU_CUH + +/** + //checkings + Matrix pm1nx, em1nx; + Matrix bmnx; + + srand(time(NULL)); + for (int i=0; i bmny1a = conv1(pm1nx, bmnx, ck1); + Matrix bmny1b = pm1nx * AC1P_T + bmnx * AC1B_T; + + //checking 2 + Matrix bmny2a = conv2(bmnx, em1nx, ck2); + Matrix bmny2b = em1nx * AC2E_T + bmnx * AC2B_T; + + //checking 3 + Matrix pmny1bara = tail(fwd(Zbr, bmny1a, w)); + Matrix pmny1barb = pm1nx * TAFB_AC1P_T + bmnx * TAFB_AC1B_T; + + //checking 4 + Matrix emny2bara = head(rev(bmny2a, Zbr, w)); + Matrix emny2barb = em1nx * HARB_AC2E_T + bmnx * HARB_AC2B_T; + + float a, b, r, me1, mre1, me2, mre2; + mre2 = me2 = mre1 = me1 = 0; + + for (int i=0; i mre1 ? b : mre1; + } + me1 = a > me1 ? a : me1; + a = emny2bara[i][j] - emny2barb[i][j]; + a = a < 0 ? -a : a; + if (emny2bara[i][j] != 0) { + r = emny2bara[i][j] < 0 ? -emny2bara[i][j] : emny2bara[i][j]; + b = a / r; + mre2 = b > mre2 ? b : mre2; + } + me2 = a > me2 ? 
a : me2; + } + } + + //std::cout << "bmny1a x bmny1b: me = " << me1 << " mre = " << mre1 << "\n"; + //std::cout << "bmny2a x bmny2b: me = " << me2 << " mre = " << mre2 << "\n"; + std::cout << "pmny1bara x pmny1barb: me = " << me1 << " mre = " << mre1 << "\n"; + std::cout << "emny2bara x emny2barb: me = " << me2 << " mre = " << mre2 << "\n"; + +*/ diff --git a/paredge/alg4pd_gpu.cuh b/paredge/alg4pd_gpu.cuh new file mode 100755 index 0000000..f07933c --- /dev/null +++ b/paredge/alg4pd_gpu.cuh @@ -0,0 +1,557 @@ +/** + * @file alg4pd_gpu.cuh + * @brief Algorithm 4 plic in the GPU (Deriche version) with borders + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef ALG4PD_GPU_CUH +#define ALG4PD_GPU_CUH + +#define USE_FUSION true + +//=== IMPLEMENTATION ============================================================ + +// TODO LIST: +// * MAKE ADJUST CARRIES MORE PARALLEL + +// Alg4p (Deriche) GPU ---------------------------------------------------------- + +// Collect carries -------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWC, NBCW) +void alg4pd_gpu_collect_carries( Matrix *g_py1bar, + Matrix *g_ey2bar, + float inv_width, float inv_height, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float px = 0.f; // previous x + Vector py1; // current P(Y_1) + + if (m==0) { // 1st block +#if CLAMPTOEDGE + px = x[0]; // clamp to edge + py1[0] = c_coefs[4]*px; +#pragma unroll + for (int i=1; i(); + +#pragma unroll // calculate pybar, scan left -> right + for (int j=0; j ey2; // current E(Y_2) + + if (m==m_size-1) { // last block +#if CLAMPTOEDGE + nx = nnx = x[WS-1]; // clamp to edge + ey2[0] = c_coefs[5]*nx; +#pragma unroll + for (int i=1; i(); + +#pragma unroll // calculate ezhat, scan right -> left + for (int j=WS-1; j>=0; --j) { + float cx = x[j]; // current x + x[j] = c_coefs[2]*nx + c_coefs[3]*nnx; + x[j] = revI(x[j], ey2, c_weights); + nnx = nx; + nx = cx; + } + + g_ey2bar[n*(m_size+1)+m].set_col(tx, ey2); + + } + +} + +// Adjust carries --------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWA, NBA) +void alg4pd_gpu_adjust_carries( Matrix *g_py1bar, + Matrix *g_ey2bar, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m, n = blockIdx.y; + Matrix *gpy1bar, *gey2bar; + Vector py1, ey2, py1bar, ey2bar; + __shared__ Matrix spy1ey2bar[NWA]; + + // P(y1bar) -> P(y1) processing ----------------------------------------------- + m = 0; + gpy1bar = (Matrix *)&g_py1bar[n*(m_size+1)+m+ty+1][0][tx]; + if (ty==0) py1 = g_py1bar[n*(m_size+1)+0].col(tx); + + for (; m < m_size; m += NWA) { // for all image blocks + + // using smem as cache + if (m+ty < m_size) spy1ey2bar[ty].set_col(tx, gpy1bar->col(0)); + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust py1bar left -> right + for (int w = 0; w < NWA; ++w) { + + py1bar = spy1ey2bar[w].col(tx); + + py1 = py1bar + py1 * c_AbF_T; + + spy1ey2bar[w].set_col(tx, py1); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m+ty < m_size) gpy1bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gpy1bar += NWA; + + } + + // E(zhat) -> E(z) processing ----------------------------------------------- + m = 
m_size-1; + gey2bar = (Matrix *)&g_ey2bar[n*(m_size+1)+m-ty][0][tx]; + if (ty==0) ey2 = g_ey2bar[n*(m_size+1)+m_size].col(tx); + + for (; m >= 0; m -= NWA) { // for all image blocks + + // using smem as cache + if (m-ty >= 0) spy1ey2bar[ty].set_col(tx, gey2bar->col(0)); + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust ey2bar right -> left + for (int w = 0; w < NWA; ++w) { + + ey2bar = spy1ey2bar[w].col(tx); + + ey2 = ey2bar + ey2 * c_AbR_T; + + spy1ey2bar[w].set_col(tx, ey2); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m-ty >= 0) gey2bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gey2bar -= NWA; + + } + +} + +// Write results transpose ------------------------------------------------------ + +template +__global__ __launch_bounds__(WS*NWW, NBCW) +void alg4pd_gpu_write_results_transp( float *g_transp_out, + const Matrix *g_rows_py1, + const Matrix *g_rows_ey2, + Matrix *g_cols_py1, + Matrix *g_cols_ey2, + float inv_width, float inv_height, + int m_size, int n_size, + int out_stride ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + Matrix + *py1m1 = (Matrix*)&g_rows_py1[n*(m_size+1)+m][0][tx], + *ey2m1 = (Matrix*)&g_rows_ey2[n*(m_size+1)+m+1][0][tx]; + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float px = 0.f; // previous x + Vector py1 = py1m1->col(0); // current P(Y_1) + + if (m==0) { // 1st block +#if CLAMPTOEDGE + px = x[0]; // clamp to edge +#endif + } else { // other blocks + float tu = ((m-1)*WS+(WS-1)+.5f)*inv_width; // last column of previous block + float tv = (n*WS+tx+.5f)*inv_height; // one element per row + px = tex2D(t_in, tu, tv); + } + +#pragma unroll // calculate block, scan left -> right + for (int j=0; j ey2 = ey2m1->col(0); // current E(Y_2) + + if (m==m_size-1) { // last block +#if CLAMPTOEDGE + nx = nnx = x[WS-1]; // clamp to edge +#endif + } else { // other blocks + float tu = ((m+1)*WS+.5f)*inv_width; // first column of next block + float tv = (n*WS+tx+.5f)*inv_height; // one element per row + nx = tex2D(t_in, tu, tv); + nnx = tex2D(t_in, tu+inv_width, tv); // second column of next block + } + +#pragma unroll // calculate block, scan right -> left + for (int j=WS-1; j>=0; --j) { + float cx = x[j]; // current x + x[j] = c_coefs[2]*nx + c_coefs[3]*nnx; + x[j] = revI(x[j], ey2, c_weights); + nnx = nx; + nx = cx; + } + + } + + __syncthreads(); // guarantee no smem usage + + if (ty==1) { + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; // save partial result in smem + + } + + __syncthreads(); // guarantee warp 0 can read smem + + if (ty==0) { + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] += block[tx][i]; // x now has z or v + + g_transp_out += m*WS*out_stride + n*WS + tx; +#pragma unroll // write block inside valid transpose image + for (int i=0; i + &pu1bar = (Matrix&)g_cols_py1[m*(n_size+1)+n+1][0][tx]; + + float pz = 0.f; // previous z + Vector pu1; // current P(U_1) + + if (n==0) { // 1st block +#if CLAMPTOEDGE + pz = x[0]; // clamp to edge + pu1[1] = pu1[0] = c_coefs[4]*pz; + g_cols_py1[m*(n_size+1)+0].set_col(tx, pu1); +#endif + } else { // other blocks + g_transp_out += (m*WS+tx)*out_stride + (n-1)*WS + (WS-1); + pz = *g_transp_out; + } + + pu1 = zeros(); + +#pragma unroll // calculate pybar cols, scan left -> right + for (int j=0; j + 
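Conceptually, the adjust-carries kernels above apply the following sequential fix-up across the blocks of one image row, shown here as a hypothetical host-side loop that reuses the library's Vector and Matrix operators (illustrative only; py1bar and ey2bar stand for arrays of per-block carries, with index 0 and m_size holding the boundary carries). In alg4d the same loop additionally adds the px * c_TAFB_AC1P_T and ex * c_HARB_AC2E_T terms.

// Illustrative serial equivalent of alg4pd_gpu_adjust_carries for one block-row.
Vector<float,R> py1 = py1bar[0];            // boundary prologue
for (int m = 1; m <= m_size; ++m) {
    py1 = py1bar[m] + py1 * c_AbF_T;        // complete prologue of block m
    py1bar[m] = py1;
}
Vector<float,R> ey2 = ey2bar[m_size];       // boundary epilogue
for (int m = m_size-1; m >= 0; --m) {
    ey2 = ey2bar[m] + ey2 * c_AbR_T;        // complete epilogue of block m
    ey2bar[m] = ey2;
}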
&eu2bar = (Matrix&)g_cols_ey2[m*(n_size+1)+n][0][tx]; + + float nz = 0.f, nnz = 0.f; // next and next-next z + Vector eu2; // current E(U_2) + + if (n==n_size-1) { // last block +#if CLAMPTOEDGE + nz = nnz = x[WS-1]; // clamp to edge + eu2[1] = eu2[0] = c_coefs[5]*nz; + g_cols_ey2[m*(n_size+1)+n_size].set_col(tx, eu2); +#endif + } else { // other blocks + g_transp_out += (m*WS+tx)*out_stride + (n+1)*WS; + nz = *g_transp_out; + g_transp_out += 1; + nnz = *g_transp_out; + } + + eu2 = zeros(); + +#pragma unroll // calculate ezhat cols, scan right -> left + for (int j=WS-1; j>=0; --j) { + float cz = x[j]; // current z + x[j] = c_coefs[2]*nz + c_coefs[3]*nnz; + x[j] = revI(x[j], eu2, c_weights); + nnz = nz; + nz = cz; + } + + eu2bar.set_col(0, eu2); // store eu2bar cols + + } + + } // if fusion + +} + +// Alg4p Deriche in the GPU ----------------------------------------------------- + +__host__ +void alg4pd_gpu( float *h_img, int width, int height, + float sigma, int order ) { + + const int B = WS; + if (R != 2) { std::cout << "R must be 2!\n"; return; } + + Vector coefs; // [a0-a3, coefp, coefn] + Vector w; // weights + w[0] = 1.f; + + computeFilterParams(coefs[0], coefs[1], coefs[2], coefs[3], + w[1], w[2], coefs[4], coefs[5], + sigma, order); + + // pre-compute basic alg4p Deriche matrices + Matrix Ir = identity(); + Matrix Zrb = zeros(); + + Matrix AFP_T = fwd(Ir, Zrb, w), + ARE_T = rev(Zrb, Ir, w); + + Matrix AbF_T = tail(AFP_T), + AbR_T = head(ARE_T); + + int m_size = (width+WS-1)/WS, n_size = (height+WS-1)/WS; + + // upload to the GPU + copy_to_symbol(c_weights, w); + copy_to_symbol(c_coefs, coefs); + + copy_to_symbol(c_AbF_T, AbF_T); + copy_to_symbol(c_AbR_T, AbR_T); + + float inv_width = 1.f/width, inv_height = 1.f/height; + + cudaArray *a_in; + size_t offset; + cudaChannelFormatDesc ccd = cudaCreateChannelDesc(); + cudaMallocArray(&a_in, &ccd, width, height); + cudaMemcpyToArray(a_in, 0, 0, h_img, width*height*sizeof(float), + cudaMemcpyHostToDevice); + + t_in.normalized = true; + t_in.filterMode = cudaFilterModePoint; + t_in.addressMode[0] = t_in.addressMode[1] = cudaAddressModeBorder; + + // naming convention: stride in elements and pitch in bytes + // to achieve at the same time coalescing and L2-cache colision avoidance + // the transposed and regular output images have stride of maximum size + // (4Ki) + 1 block (WS), considering up to 4096x4096 input images + int stride_img = MAXSIZE+WS, stride_transp_img = MAXSIZE+WS; + dvector d_img(height*stride_img), d_transp_img(width*stride_transp_img); + + // +1 padding is important even in zero-border to avoid if's in kernels + dvector< Matrix > + d_rows_py1bar((m_size+1)*n_size), d_rows_ey2bar((m_size+1)*n_size), + d_cols_py1bar((n_size+1)*m_size), d_cols_ey2bar((n_size+1)*m_size); + + d_rows_py1bar.fillzero(); + d_rows_ey2bar.fillzero(); + d_cols_py1bar.fillzero(); + d_cols_ey2bar.fillzero(); + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("alg4pd_gpu", width*height*RUNTIMES, "iP"); + + for(int r=0; r>> + ( &d_rows_py1bar, &d_rows_ey2bar, inv_width, inv_height, m_size ); + + alg4pd_gpu_adjust_carries<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, m_size ); + + alg4pd_gpu_write_results_transp<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_transp_img, &d_rows_py1bar, &d_rows_ey2bar, &d_cols_py1bar, &d_cols_ey2bar, + inv_width, inv_height, m_size, n_size, stride_transp_img ); + + if (!USE_FUSION) { + alg4pd_gpu_collect_carries<<< dim3(n_size, m_size), dim3(WS, NWC) >>> + ( &d_cols_py1bar, 
&d_cols_ey2bar, + inv_height, inv_width, n_size ); + } + + cudaUnbindTexture(t_in); + cudaBindTexture2D(&offset, t_in, d_transp_img, height, width, stride_transp_img*sizeof(float)); + + alg4pd_gpu_adjust_carries<<< dim3(1, m_size), dim3(WS, NWA) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, n_size ); + + alg4pd_gpu_write_results_transp<<< dim3(n_size, m_size), dim3(WS, NWW) >>> + ( d_img, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_py1bar, &d_cols_ey2bar, + inv_height, inv_width, n_size, m_size, stride_img ); + + cudaUnbindTexture(t_in); + + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " << std::flush; +#else + base_timer &timer_total = timers.gpu_add("alg4pd_gpu", width*height, "iP"); + base_timer *timer; + + cudaBindTextureToArray(t_in, a_in); + + timer = &timers.gpu_add("collect-carries"); + + alg4pd_gpu_collect_carries<<< dim3(m_size, n_size), dim3(WS, NWC) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, + inv_width, inv_height, m_size ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries"); + + alg4pd_gpu_adjust_carries<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_rows_py1bar, &d_rows_ey2bar, + m_size ); + + timer->stop(); timer = &timers.gpu_add("write-block-collect-carries"); + + alg4pd_gpu_write_results_transp<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_transp_img, &d_rows_py1bar, &d_rows_ey2bar, &d_cols_py1bar, &d_cols_ey2bar, + inv_width, inv_height, m_size, n_size, stride_transp_img ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries"); + + cudaUnbindTexture(t_in); + cudaBindTexture2D(&offset, t_in, d_transp_img, height, width, stride_transp_img*sizeof(float)); + + if (!USE_FUSION) { // it seems faster to not use FUSION - not quite + alg4pd_gpu_collect_carries<<< dim3(n_size, m_size), dim3(WS, NWC) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, + inv_height, inv_width, n_size ); + } + + alg4pd_gpu_adjust_carries<<< dim3(1, m_size), dim3(WS, NWA) >>> + ( &d_cols_py1bar, &d_cols_ey2bar, + n_size ); + + timer->stop(); timer = &timers.gpu_add("write-block"); + + alg4pd_gpu_write_results_transp<<< dim3(n_size, m_size), dim3(WS, NWW) >>> + ( d_img, &d_cols_py1bar, &d_cols_ey2bar, &d_rows_py1bar, &d_rows_ey2bar, + inv_height, inv_width, n_size, m_size, stride_img ); + + timer->stop(); + + cudaUnbindTexture(t_in); + + timer_total.stop(); + timers.flush(); +#endif + cudaMemcpy2D(h_img, width*sizeof(float), d_img, stride_img*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToHost); + +} + +#endif // ALG4PD_GPU_CUH diff --git a/paredge/alg5d_gpu.cuh b/paredge/alg5d_gpu.cuh new file mode 100755 index 0000000..9663676 --- /dev/null +++ b/paredge/alg5d_gpu.cuh @@ -0,0 +1,855 @@ +/** + * @file alg5d_gpu.cuh + * @brief Algorithm 5 in the GPU (Deriche version) with borders + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef ALG5D_GPU_CUH +#define ALG5D_GPU_CUH + +//=== IMPLEMENTATION ============================================================ + +// Alg5 (Deriche) GPU ----------------------------------------------------------- + +// Collect carries -------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWC, NBCW) +void alg5d_collect_carries( Matrix *g_py1bar, + Matrix *g_pty1bar, + Matrix *g_ety1bar, + Matrix *g_ey2bar, + Matrix *g_pty2bar, + Matrix *g_ety2bar, + Matrix *g_ptu11hat, + Matrix *g_etu12hat, + Matrix *g_etu22hat, + Matrix *g_ptu21hat, + Matrix *g_px, + Matrix *g_ex, + float inv_width, float inv_height, + int m_size, int n_size ) { + + int 
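At the host level, alg4d_gpu and alg4pd_gpu above run the same machinery twice: a row pass that writes a transposed intermediate image, then a second pass over that intermediate (the original columns) that writes the final result back in the original orientation. A plain CPU sketch of this composition, reusing the hypothetical deriche_row_reference() from the earlier note (the GPU version fuses the transposition into the write-results kernels via g_transp_out and stride_transp_img):

#include <vector>

// Illustrative 2D composition (CPU, hypothetical helper names, assumptions as above).
static void deriche_2d_reference(const float* in, float* out, int width, int height,
                                 const float coefs[6], const float w[3])
{
    std::vector<float> rows(width*height), transp(width*height), cols(width*height);
    for (int y = 0; y < height; ++y)                       // row pass
        deriche_row_reference(&in[y*width], &rows[y*width], width, coefs, w);
    for (int y = 0; y < height; ++y)                       // transpose
        for (int x = 0; x < width; ++x)
            transp[x*height + y] = rows[y*width + x];
    for (int x = 0; x < width; ++x)                        // column pass (rows of transpose)
        deriche_row_reference(&transp[x*height], &cols[x*height], height, coefs, w);
    for (int y = 0; y < height; ++y)                       // transpose back
        for (int x = 0; x < width; ++x)
            out[y*width + x] = cols[x*height + y];
}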
tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float xp = 0.f; // previous x + Vector py1; // current P(Y_1) + Vector px; // this block P(X) + + // for now, no boundary condition (zero) + +#pragma unroll + for (int i=0; i right + for (int j=0; j ey2; // current E(Y_2) + Vector ex; // this block E(X) + +#pragma unroll + for (int i=0; i left + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], ey2, c_weights); + xa = xn; + xn = xc; + } + + g_ey2bar[n*(m_size+1)+m].set_col(tx, ey2); + g_ex[n*(m_size+1)+m].set_col(tx, ex); + + } + + __syncthreads(); + + if (ty==0) { // save y1 in smem + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; + + } + + __syncthreads(); + + if (ty==0) { // 1 warp forward-down computing u11 + +#pragma unroll // transpose regs + for (int i=0; i<32; ++i) + x[i] = block[i][tx]; + + float xp = 0.f; // previous x + Vector ptu11; // current Pt(U_11) + Vector pty1; // this block Pt(Y1) + +#pragma unroll + for (int i=0; i bottom + for (int j=0; j etu12; // current Et(U_12) + Vector ety1; // this block Et(Y1) + +#pragma unroll + for (int i=0; i top + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], etu12, c_weights); + xa = xn; + xn = xc; + } + + g_etu12hat[m*(n_size+1)+n].set_col(tx, etu12); + g_ety1bar[m*(n_size+1)+n].set_col(tx, ety1); + + } + + __syncthreads(); + + if (ty==1) { // save y2 in smem + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; + + } + + __syncthreads(); + + if (ty==1) { // 1 warp reverse-up computing u22 + +#pragma unroll // transpose regs + for (int i=0; i<32; ++i) + x[i] = block[i][tx]; + + float xn = 0.f, xa = 0.f; // next and after next x + Vector etu22; // current Et(U_22) + Vector ety2; // this block Et(Y2) + +#pragma unroll + for (int i=0; i top + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], etu22, c_weights); + xa = xn; + xn = xc; + } + + g_etu22hat[m*(n_size+1)+n].set_col(tx, etu22); + g_ety2bar[m*(n_size+1)+n].set_col(tx, ety2); + + } else if (ty==3) { // 1 warp reverse-down computing u21 + +#pragma unroll // transpose regs + for (int i=0; i<32; ++i) + x[i] = block[i][tx]; + + float xp = 0.f; // previous x + Vector ptu21; // current Pt(U_21) + Vector pty2; // this block Pt(Y2) + +#pragma unroll + for (int i=0; i bottom + for (int j=0; j +__global__ __launch_bounds__(WS*NWA, NBA) +void alg5d_adjust_carries_rows( Matrix *g_py1bar, + Matrix *g_ey2bar, + Matrix *g_px, + Matrix *g_ex, + int m_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m, n = blockIdx.y; + Matrix *gpy1bar, *gey2bar; + Vector py1, ey2, py1bar, ey2bar; + __shared__ Matrix spy1ey2bar[NWA]; + + Matrix *gpx, *gex; + Vector px, ex; + __shared__ Matrix spxex[NWA]; + + // P(y1bar) -> P(y1) processing --------------------------------------------- + m = 0; + gpy1bar = (Matrix *)&g_py1bar[n*(m_size+1)+m+ty+1][0][tx]; + gpx = (Matrix *)&g_px[n*(m_size+1)+m+ty][0][tx]; + py1 = zeros(); // zero boundary condition + + for (; m < m_size; m += NWA) { // for all image blocks + + if (m+ty < m_size) { // using smem as cache + spy1ey2bar[ty].set_col(tx, gpy1bar->col(0)); + 
spxex[ty].set_col(tx, gpx->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust py1bar left -> right + for (int w = 0; w < NWA; ++w) { + + py1bar = spy1ey2bar[w].col(tx); + px = spxex[w].col(tx); + + py1 = py1bar + py1 * c_AbF_T + px * c_TAFB_AC1P_T; + + spy1ey2bar[w].set_col(tx, py1); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m+ty < m_size) gpy1bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gpy1bar += NWA; + gpx += NWA; + + } + + // E(y2bar) -> E(y2) processing --------------------------------------------- + m = m_size-1; + gey2bar = (Matrix *)&g_ey2bar[n*(m_size+1)+m-ty][0][tx]; + gex = (Matrix *)&g_ex[n*(m_size+1)+m+1-ty][0][tx]; + ey2 = zeros(); + + for (; m >= 0; m -= NWA) { // for all image blocks + + if (m-ty >= 0) { // using smem as cache + spy1ey2bar[ty].set_col(tx, gey2bar->col(0)); + spxex[ty].set_col(tx, gex->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty==0) { +#pragma unroll // adjust ey2bar right -> left + for (int w = 0; w < NWA; ++w) { + + ey2bar = spy1ey2bar[w].col(tx); + ex = spxex[w].col(tx); + + ey2 = ey2bar + ey2 * c_AbR_T + ex * c_HARB_AC2E_T; + + spy1ey2bar[w].set_col(tx, ey2); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (m-ty >= 0) gey2bar->set_col(0, spy1ey2bar[ty].col(tx)); + + gey2bar -= NWA; + gex -= NWA; + + } + +} + +// Adjust carries columns ------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWA, NBAC) +void alg5d_adjust_carries_cols( Matrix *g_py1, + Matrix *g_pty1bar, + Matrix *g_ety1bar, + Matrix *g_ey2, + Matrix *g_pty2bar, + Matrix *g_ety2bar, + Matrix *g_ptu11hat, + Matrix *g_etu12hat, + Matrix *g_etu22hat, + Matrix *g_ptu21hat, + Matrix *g_px, + Matrix *g_ex, + int m_size, int n_size ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n; + Matrix *gptu11hat, *gptu21hat;//, *getu12hat, *getu22hat; + Vector ptu11hat, ptu21hat, ptu1hat, etu2hat; + Vector pty1bar, pty2bar; + Vector ptz, etz, ptu1, etu2; + + Matrix *gpty1bar, *gpty2bar;//, *gety1bar, *gety2bar; + Matrix *gpx, *gex, *gpy1, *gey2; + Vector px, ex, py1, ey2; + + __shared__ Matrix spx[NWA], sex[NWA], spy1[NWA], sey2[NWA]; + __shared__ Matrix spt1[NWA], spt2[NWA], set1[NWA], set2[NWA]; + + // store (pty1bar, pty2bar) or (ptu11hat, ptu21hat) in (spt1, spt2) + // store (ety1bar, ety2bar) or (etu22hat, etu12hat) in (set1, set2) + // store ptu1 in ptu11hat and etu2 in etu22hat + + // Pt(u11hat) -> Pt(u1) processing ------------------------------------------ + n = 0; + gptu11hat = (Matrix *)&g_ptu11hat[m*(n_size+1)+n+1+ty][0][tx]; + gptu21hat = (Matrix *)&g_ptu21hat[m*(n_size+1)+n+1+ty][0][tx]; + gpty1bar = (Matrix *)&g_pty1bar[m*(n_size+1)+n+1+ty][0][tx]; + gpty2bar = (Matrix *)&g_pty2bar[m*(n_size+1)+n+1+ty][0][tx]; + gpx = (Matrix *)&g_px[(n+ty)*(m_size+1)+m+0][0][tx]; + gex = (Matrix *)&g_ex[(n+ty)*(m_size+1)+m+1][0][tx]; + gpy1 = (Matrix *)&g_py1[(n+ty)*(m_size+1)+m+0][0][tx]; + gey2 = (Matrix *)&g_ey2[(n+ty)*(m_size+1)+m+1][0][tx]; + + ptu1 = ptz = zeros(); // zero boundary condition + + for (; n < n_size; n += NWA) { // for all image blocks + + if (n+ty < n_size) { // using smem as cache + spt1[ty].set_col(tx, gptu11hat->col(0)); + spt2[ty].set_col(tx, gptu21hat->col(0)); + spy1[ty].set_col(tx, gpy1->col(0)); + sey2[ty].set_col(tx, gey2->col(0)); + spx[ty].set_col(tx, gpx->col(0)); + sex[ty].set_col(tx, gex->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty == 0) { +#pragma unroll // adjust ptucheck 
top -> bottom + for (int w = 0; w < NWA; ++w) { + + ptu11hat = spt1[w].col(tx); + ptu21hat = spt2[w].col(tx); + py1 = spy1[w].col(tx); + ey2 = sey2[w].col(tx); + px = spx[w].col(tx); + ex = sex[w].col(tx); + + ptz = pty1bar + pty2bar + ey2 * c_ARE_T.col(tx) + py1 * c_AFP_T.col(tx); + + ptu1hat = ptu11hat + ptu21hat; + fixpet(ptu1hat, c_TAFB_AC1B, c_AFP_T, py1); + fixpet(ptu1hat, c_TAFB_AC1B, c_ARE_T, ey2); + fixpet(ptu1hat, c_TAFB_AC1B, c_AC1P_AFB_T, px); + fixpet(ptu1hat, c_TAFB_AC1B, c_AC2E_ARB_T, ex); + + ptu1 = ptu1hat + ptu1 * c_AbF_T + ptz * c_TAFB_AC1P_T; + + spt1[w].set_col(tx, ptu1); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (n+ty < n_size) gptu11hat->set_col(0, spt1[ty].col(tx)); + + gptu11hat += NWA; + //gpx, gex... + gpy1 += NWA*(m_size+1); + gey2 += NWA*(m_size+1); + + } +/* + // Et(vtilde) -> Et(v) processing ------------------------------------------- + n = n_size-1; + getvtilde = (Matrix *)&g_etvtilde[m*(n_size+1)+n-ty][0][tx]; + gptucheck = (Matrix *)&g_ptucheck[m*(n_size+1)+n-ty][0][tx]; + gpy = (Matrix *)&g_py[(n-ty)*(m_size+1)+m][0][tx]; + gez = (Matrix *)&g_ez[(n-ty)*(m_size+1)+m+1][0][tx]; + etv = zeros(); + + for (; n >= 0; n -= NWA) { // for all image blocks + + if (n-ty >= 0) { // using smem as cache + setvtilde[ty].set_col(tx, getvtilde->col(0)); + sptucheck[ty].set_col(tx, gptucheck->col(0)); + spy[ty].set_col(tx, gpy->col(0)); + sez[ty].set_col(tx, gez->col(0)); + } + + __syncthreads(); // wait to load smem + + if (ty == 0) { +#pragma unroll // adjust etvtilde bottom -> top + for (int w = 0; w < NWA; ++w) { + + etvtilde = setvtilde[w].col(tx); + ptu = sptucheck[w].col(tx); + py = spy[w].col(tx); + ez = sez[w].col(tx); + + etv = etvtilde + etv * c_AbR_T + ptu * c_HARB_AFP_T; + + fixpet(etv, c_HARB_AFB, c_ARE_T, ez); + fixpet(etv, c_HARB_AFB, c_ARB_AFP_T, py); + + setvtilde[w].set_col(tx, etv); + + } + } + + __syncthreads(); // wait to store result in gmem + + if (n-ty >= 0) getvtilde->set_col(0, setvtilde[ty].col(tx)); + + getvtilde -= NWA; + gptucheck -= NWA; + gpy -= NWA*(m_size+1); + gez -= NWA*(m_size+1); + + } +*/ +} + +// Write results ---------------------------------------------------------------- + +template +__global__ __launch_bounds__(WS*NWW, NBCW) +void alg5d_write_results( float *g_out, + const Matrix *g_py1, + const Matrix *g_ey2, + const Matrix *g_ptu1, + const Matrix *g_etu2, + const Matrix *g_px, + const Matrix *g_ex, + const Matrix *g_ptz, + const Matrix *g_etz, + float inv_width, float inv_height, + int m_size, int n_size, + int out_stride ) { + + int tx = threadIdx.x, ty = threadIdx.y, m = blockIdx.x, n = blockIdx.y; + + __shared__ Matrix block; + read_block(block, m, n, inv_width, inv_height); + __syncthreads(); + + float x[32]; // 32 regs per thread + + Matrix + *py1m1 = (Matrix*)&g_py1[n*(m_size+1)+m][0][tx], + *ey2m1 = (Matrix*)&g_ey2[n*(m_size+1)+m+1][0][tx], + *ptu1m1 = (Matrix*)&g_ptu1[m*(n_size+1)+n][0][tx], + *etu2m1 = (Matrix*)&g_etu2[m*(n_size+1)+n+1][0][tx]; + + Matrix + *pxm1 = (Matrix*)&g_px[n*(m_size+1)+m][0][tx], + *exm1 = (Matrix*)&g_ex[n*(m_size+1)+m+1][0][tx], + *ptzm1 = (Matrix*)&g_ptz[m*(n_size+1)+n][0][tx], + *etzm1 = (Matrix*)&g_etz[m*(n_size+1)+n+1][0][tx]; + + if (ty==0) { // 1 warp forward computing y1 + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] = block[tx][i]; + + float xp; // previous x + Vector py1 = py1m1->col(0); // current P(Y_1) + Vector px = pxm1->col(0); // previous block P(X) + + xp = px[0]; + +#pragma unroll // calculate block, scan left -> right + for (int j=0; j ey2 = 
ey2m1->col(0); // current E(Y_2) + Vector ex = exm1->col(0); // next block E(X) + + xn = ex[0]; + if (R==1) xa = xn; + else xa = ex[1]; + +#pragma unroll // calculate block, scan right -> left + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], ey2, c_weights); + xa = xn; + xn = xc; + } + + } + + // preparing for adding y1 and y2 + + __syncthreads(); // guarantee no smem usage + + if (ty==1) { + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i]; // save partial result in smem + + } + + __syncthreads(); // guarantee warp 0 can read smem + + if (ty==0) { // now add + +#pragma unroll + for (int i=0; i<32; ++i) + block[tx][i] = x[i] + block[tx][i]; // smem now has z + + } + + __syncthreads(); // guarantee no smem usage + + if (ty==0) { // 1 warp down computing u1 + +#pragma unroll // transpose regs + for (int i=0; i<32; ++i) + x[i] = block[i][tx]; + + float xp; // previous x + Vector ptu1 = ptu1m1->col(0); // current Pt(U_1) + Vector ptz = pxm1->col(0); // previous block Pt(Z) + + xp = ptz[0]; + +#pragma unroll // calculate block, scan top -> bottom + for (int j=0; j etu2 = etu2m1->col(0); // current Et(U_2) + Vector etz = etzm1->col(0); // next block Et(Z) + + xn = etz[0]; + if (R==1) xa = xn; + else xa = etz[1]; + +#pragma unroll // calculate block, scan bottom -> top + for (int j=WS-1; j>=0; --j) { + float xc = x[j]; // current x + x[j] = c_coefs[2]*xn + c_coefs[3]*xa; + x[j] = revI(x[j], etu2, c_weights); + xa = xn; + xn = xc; + } + + } + + // preparing for adding u1 and u2 + + __syncthreads(); // guarantee no smem usage + + if (ty==1) { + +#pragma unroll + for (int i=0; i<32; ++i) + block[i][tx] = x[i]; // save partial result in smem + + } + + __syncthreads(); // guarantee warp 0 can read smem + + if (ty==0) { // now add + +#pragma unroll + for (int i=0; i<32; ++i) + x[i] += block[i][tx]; // regs now has v + + g_out += ((n+1)*WS-1)*out_stride + m*WS + tx; +#pragma unroll // write block + for (int i=0; i coefs; // [a0-a3, coefp, coefn] + Vector w; // weights + w[0] = 1.f; + + computeFilterParams(coefs[0], coefs[1], coefs[2], coefs[3], + w[1], w[2], coefs[4], coefs[5], + sigma, order); + + Vector ck1; // convolution kernel 1 (plus) + ck1[0] = coefs[0]; ck1[1] = coefs[1]; ck1[2] = 0.f; + Vector ck2; // convolution kernel 2 (minus) + ck2[0] = 0.f; ck2[1] = coefs[2]; ck2[2] = coefs[3]; + + // pre-compute basic alg5 Deriche matrices + Matrix Ir = identity(); + Matrix Zbr = zeros(); + Matrix Zrb = zeros(); + Matrix Ib = identity(); + + Matrix AFP_T = fwd(Ir, Zrb, w), ARE_T = rev(Zrb, Ir, w); + Matrix AFB_T = fwd(Zbr, Ib, w), ARB_T = rev(Ib, Zbr, w); + Matrix AbF_T = tail(AFP_T), AbR_T = head(ARE_T); + Matrix TAFB_T = tail(AFB_T), HARB_T = head(ARB_T); + + Matrix AC1P_T = transp(conv1p(ck1)); + Matrix AC2E_T = transp(conv2e(ck2)); + Matrix AC1B_T = transp(conv1b(ck1)); + Matrix AC2B_T = transp(conv2b(ck2)); + + Matrix TAFB_AC1P_T = AC1P_T*TAFB_T; + Matrix HARB_AC2E_T = AC2E_T*HARB_T; + + Matrix TAFB_AC1B = transp(AC1B_T*TAFB_T); + Matrix HARB_AC2B = transp(AC2B_T*HARB_T); + + Matrix AC1P_AFB_T = AC1P_T*AFB_T; + Matrix AC2E_ARB_T = AC2E_T*ARB_T; + + int m_size = (width+WS-1)/WS, n_size = (height+WS-1)/WS; + + // upload to the GPU + copy_to_symbol(c_weights, w); + copy_to_symbol(c_coefs, coefs); + + copy_to_symbol(c_AbF_T, AbF_T); + copy_to_symbol(c_AbR_T, AbR_T); + + copy_to_symbol(c_AFP_T, AFP_T); + copy_to_symbol(c_ARE_T, ARE_T); + + copy_to_symbol(c_TAFB_AC1P_T, TAFB_AC1P_T); + copy_to_symbol(c_HARB_AC2E_T, 
HARB_AC2E_T); + + copy_to_symbol(c_TAFB_AC1B, TAFB_AC1B); + copy_to_symbol(c_HARB_AC2B, HARB_AC2B); + + copy_to_symbol(c_AC1P_AFB_T, AC1P_AFB_T); + copy_to_symbol(c_AC2E_ARB_T, AC2E_ARB_T); + + float inv_width = 1.f/width, inv_height = 1.f/height; + + cudaArray *a_in; + cudaChannelFormatDesc ccd = cudaCreateChannelDesc(); + cudaMallocArray(&a_in, &ccd, width, height); + cudaMemcpyToArray(a_in, 0, 0, h_img, width*height*sizeof(float), + cudaMemcpyHostToDevice); + + t_in.normalized = true; + t_in.filterMode = cudaFilterModePoint; + t_in.addressMode[0] = t_in.addressMode[1] = cudaAddressModeBorder; + + // naming convention: stride in elements and pitch in bytes + // to achieve at the same time coalescing and L2-cache colision avoidance + // the transposed and regular output images have stride of maximum size + // (4Ki) + 1 block (WS), considering up to 4096x4096 input images + int stride_img = MAXSIZE+WS; + dvector d_img(height*stride_img); + + // +1 padding is important even in zero-border to avoid if's in kernels + dvector< Matrix > + d_py1bar((m_size+1)*n_size), d_ey2bar((m_size+1)*n_size); + dvector< Matrix > + d_pty1bar((n_size+1)*m_size), d_ety1bar((n_size+1)*m_size), + d_ptu11hat((n_size+1)*m_size), d_etu12hat((n_size+1)*m_size); + dvector< Matrix > + d_pty2bar((n_size+1)*m_size), d_ety2bar((n_size+1)*m_size), + d_ptu21hat((n_size+1)*m_size), d_etu22hat((n_size+1)*m_size); + dvector< Matrix > + d_px((m_size+1)*n_size), d_ex((m_size+1)*n_size), + d_ptz((n_size+1)*m_size), d_etz((n_size+1)*m_size); + + d_py1bar.fillzero(); + d_ey2bar.fillzero(); + d_pty1bar.fillzero(); + d_ety1bar.fillzero(); + d_ptu11hat.fillzero(); + d_etu12hat.fillzero(); + d_pty2bar.fillzero(); + d_ety2bar.fillzero(); + d_ptu21hat.fillzero(); + d_etu22hat.fillzero(); + + d_px.fillzero(); + d_ex.fillzero(); + d_ptz.fillzero(); + d_etz.fillzero(); + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("alg5d_gpu", width*height*RUNTIMES, "iP"); + + for(int r=0; r>> + ( &d_py1bar, &d_pty1bar, &d_ety1bar, + &d_ey2bar, &d_pty2bar, &d_ety2bar, + &d_ptu11hat, &d_etu12hat, &d_etu22hat, &d_ptu21hat, + &d_px, &d_ex, + inv_width, inv_height, m_size, n_size ); + + alg5d_adjust_carries_rows<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_py1bar, &d_ey2bar, &d_px, &d_ex, m_size ); + + // alg5d_adjust_carries_cols<<< dim3(m_size, 1), dim3(WS, NWA) >>> + // ( &d_py1bar, &d_pty1bar, &d_ety1bar, + // &d_ey2bar, &d_pty2bar, &d_ety2bar, + // &d_ptu11hat, &d_etu12hat, &d_etu22hat, &d_ptu21hat, + // &d_px, &d_ex, + // m_size, n_size ); + + alg5d_write_results<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_img, &d_py1bar, &d_ey2bar, &d_ptu11hat, &d_etu22hat, + &d_px, &d_ex, &d_ptz, &d_etz, + inv_width, inv_height, m_size, n_size, stride_img ); + + cudaUnbindTexture(t_in); + + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " << std::flush; +#else + base_timer &timer_total = timers.gpu_add("alg5d_gpu", width*height, "iP"); + base_timer *timer; + + timer = &timers.gpu_add("collect-carries"); + + cudaBindTextureToArray(t_in, a_in); + + alg5d_collect_carries<<< dim3(m_size, n_size), dim3(WS, NWC) >>> + ( &d_py1bar, &d_pty1bar, &d_ety1bar, + &d_ey2bar, &d_pty2bar, &d_ety2bar, + &d_ptu11hat, &d_etu12hat, &d_etu22hat, &d_ptu21hat, + &d_px, &d_ex, + inv_width, inv_height, m_size, n_size ); + + timer->stop(); timer = &timers.gpu_add("adjust-carries-rows"); + + alg5d_adjust_carries_rows<<< dim3(1, n_size), dim3(WS, NWA) >>> + ( &d_py1bar, &d_ey2bar, &d_px, &d_ex, m_size ); + 
+ timer->stop(); timer = &timers.gpu_add("adjust-carries-columns"); + + // alg5d_adjust_carries_cols<<< dim3(m_size, 1), dim3(WS, NWA) >>> + // ( &d_py1bar, &d_pty1bar, &d_ety1bar, + // &d_ey2bar, &d_pty2bar, &d_ety2bar, + // &d_ptu11hat, &d_etu12hat, &d_etu22hat, &d_ptu21hat, + // &d_px, &d_ex, + // m_size, n_size ); + + timer->stop(); timer = &timers.gpu_add("write-block"); + + alg5d_write_results<<< dim3(m_size, n_size), dim3(WS, NWW) >>> + ( d_img, &d_py1bar, &d_ey2bar, &d_ptu11hat, &d_etu22hat, + &d_px, &d_ex, &d_ptz, &d_etz, + inv_width, inv_height, m_size, n_size, stride_img ); + + timer->stop(); + + cudaUnbindTexture(t_in); + + timer_total.stop(); + timers.flush(); +#endif + cudaMemcpy2D(h_img, width*sizeof(float), d_img, stride_img*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToHost); + +} + +#endif // ALG5D_GPU_CUH diff --git a/paredge/conv.h b/paredge/conv.h new file mode 100755 index 0000000..b1eb922 --- /dev/null +++ b/paredge/conv.h @@ -0,0 +1,171 @@ +/** + * @file conv.h + * @brief Convolutions utilities definition + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef CONV_H +#define CONV_H + +#include + + +template +HOSTDEV +T conv_op(const T &x, const T &a) +{ + return x + a; +} + +// convolution ----------------------------------------------------- + +// convolution 1 matrix (left) on prologues +template +Matrix conv1p(const Vector& ck) +{ + Matrix m; + for(int i=0; iR) + m[i][j] = (T)0; + else + m[i][j] = ck[k]; + } + } + return m; +} + +// convolution 2 matrix (right) on epilogues +template +Matrix conv2e(const Vector& ck) +{ + Matrix m; + for(int i=M-1; i>=0; --i) { + for(int j=0; jR) + m[i][j] = (T)0; + else + m[i][j] = ck[k]; + } + } + return m; +} + +// convolution 1 matrix (left) on blocks +template +Matrix conv1b(const Vector& ck) +{ + Matrix m = zeros(); + for(int i=0; i=0 && k +Matrix conv2b(const Vector& ck) +{ + Matrix m = zeros(); + for(int i=0; i=0 && k +T conv1(const Vector &p, T x, const Vector &ck) +{ + x = x * ck[0]; +#pragma unroll + for(int i=0; i +void conv1_inplace(const Vector &_p, Vector &b, const Vector &ck) +{ + Vector p = _p; + for(int j=0; j=1; --k) + p[k] = p[k-1]; + p[0] = b[j]; + b[j] = x; + } +} + +template +void conv1_inplace(const Matrix &p, Matrix &b, + const Vector &ck) +{ + for(int i=0; i +Matrix conv1(const Matrix &p, const Matrix &b, + const Vector &ck) +{ + Matrix cb = b; + conv1_inplace(p, cb, ck); + return cb; +} + +// conv2 --------------------------------------------------------------- + +template +T conv2(T x, const Vector &e, const Vector &ck) +{ + x = x * ck[0]; +#pragma unroll + for(int i=0; i +void conv2_inplace(Vector &b, const Vector &_e, const Vector &ck) +{ + Vector e = _e; + for(int j=b.size()-1; j>=0; --j) { + T x = conv2(b[j], e, ck); + for (int k=R-1; k>=1; --k) + e[k] = e[k-1]; + e[0] = b[j]; + b[j] = x; + } +} + +template +void conv2_inplace(Matrix &b, const Matrix &e, + const Vector &ck) +{ + for(int i=0; i +Matrix conv2(const Matrix &b, const Matrix &e, + const Vector &ck) +{ + Matrix cb = b; + conv2_inplace(cb, e, ck); + return cb; +} + +#endif // CONV_H diff --git a/paredge/findcudalib.mk b/paredge/findcudalib.mk new file mode 100755 index 0000000..f40c2c3 --- /dev/null +++ b/paredge/findcudalib.mk @@ -0,0 +1,226 @@ +################################################################################ +# +# Copyright 1993-2013 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. 
and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. +# +################################################################################ +# +# findcudalib.mk is used to find the locations for CUDA libraries and other +# Unix Platforms. This is supported Mac OS X and Linux. +# +################################################################################ + +# OS Name (Linux or Darwin) +OSUPPER = $(shell uname -s 2>/dev/null | tr "[:lower:]" "[:upper:]") +OSLOWER = $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + +# Flags to detect 32-bit or 64-bit OS platform +OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/" -e "s/armv7l/32/") +OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/") + +# Determine OS platform and unix distribution +ifeq ("$(OSLOWER)","linux") + # first search lsb_release + DISTRO = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + DISTVER = $(shell lsb_release -r -s 2>/dev/null) + ifeq ("$(DISTRO)",'') + # second search and parse /etc/issue + DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]") + DISTVER= $(shell more /etc/issue | awk '{print $$2}' | sed '1!d' 2>/dev/null + endif + ifeq ("$(DISTRO)",'') + # third, we can search in /etc/os-release or /etc/{distro}-release + DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB") + DISTVER= $(shell awk '/DISTRIB_RELEASE/' /etc/*-release | sed 's/DISTRIB_RELEASE=//' | grep -v "DISTRIB_RELEASE") + endif +endif + +# search at Darwin (unix based info) +DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) +ifneq ($(DARWIN),) + SNOWLEOPARD = $(strip $(findstring 10.6, $(shell egrep "10\.6" /System/Library/CoreServices/SystemVersion.plist))) + LION = $(strip $(findstring 10.7, $(shell egrep "10\.7" /System/Library/CoreServices/SystemVersion.plist))) + MOUNTAIN = $(strip $(findstring 10.8, $(shell egrep "10\.8" /System/Library/CoreServices/SystemVersion.plist))) + MAVERICKS = $(strip $(findstring 10.9, $(shell egrep "10\.9" /System/Library/CoreServices/SystemVersion.plist))) +endif + +# Common binaries +GCC ?= g++ +CLANG ?= /usr/bin/clang + +ifeq ("$(OSUPPER)","LINUX") + NVCC ?= $(CUDA_PATH)/bin/nvcc -ccbin $(GCC) +else + # for some newer versions of 
XCode, CLANG is the default compiler, so we need to include this + ifneq ($(MAVERICKS),) + NVCC ?= $(CUDA_PATH)/bin/nvcc -ccbin $(CLANG) + STDLIB ?= -stdlib=libstdc++ + else + NVCC ?= $(CUDA_PATH)/bin/nvcc -ccbin $(GCC) + endif +endif + +# Take command line flags that override any of these settings +ifeq ($(i386),1) + OS_SIZE = 32 + OS_ARCH = i686 +endif +ifeq ($(x86_64),1) + OS_SIZE = 64 + OS_ARCH = x86_64 +endif +ifeq ($(ARMv7),1) + OS_SIZE = 32 + OS_ARCH = armv7l +endif + +ifeq ("$(OSUPPER)","LINUX") + # Each Linux Distribuion has a set of different paths. This applies especially when using the Linux RPM/debian packages + ifeq ("$(DISTRO)","ubuntu") + CUDAPATH ?= /usr/lib/nvidia-current + CUDALINK ?= -L/usr/lib/nvidia-current + DFLT_PATH = /usr/lib + endif + ifeq ("$(DISTRO)","kubuntu") + CUDAPATH ?= /usr/lib/nvidia-current + CUDALINK ?= -L/usr/lib/nvidia-current + DFLT_PATH = /usr/lib + endif + ifeq ("$(DISTRO)","debian") + CUDAPATH ?= /usr/lib/nvidia-current + CUDALINK ?= -L/usr/lib/nvidia-current + DFLT_PATH = /usr/lib + endif + ifeq ("$(DISTRO)","suse") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","suse linux") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","opensuse") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","fedora") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","redhat") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","red") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + ifeq ("$(DISTRO)","redhatenterpriseworkstation") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","centos") + ifeq ($(OS_SIZE),64) + CUDAPATH ?= /usr/lib64/nvidia + CUDALINK ?= -L/usr/lib64/nvidia + DFLT_PATH = /usr/lib64 + else + CUDAPATH ?= + CUDALINK ?= + DFLT_PATH = /usr/lib + endif + endif + + ifeq ($(ARMv7),1) + CUDAPATH := /usr/arm-linux-gnueabihf/lib + CUDALINK := -L/usr/arm-linux-gnueabihf/lib + ifneq ($(TARGET_FS),) + CUDAPATH += $(TARGET_FS)/usr/lib/nvidia-current + CUDALINK += -L$(TARGET_FS)/usr/lib/nvidia-current + endif + endif + + # Search for Linux distribution path for libcuda.so + CUDALIB ?= $(shell find $(CUDAPATH) $(DFLT_PATH) -name libcuda.so -print 2>/dev/null) + + ifeq ("$(CUDALIB)",'') + $(info >>> WARNING - CUDA Driver libcuda.so is not found. Please check and re-install the NVIDIA driver. 
<<<) + EXEC=@echo "[@]" + endif +else + # This would be the Mac OS X path if we had to do anything special +endif + diff --git a/paredge/findgllib.mk b/paredge/findgllib.mk new file mode 100755 index 0000000..1525ad8 --- /dev/null +++ b/paredge/findgllib.mk @@ -0,0 +1,225 @@ +################################################################################ +# +# Copyright 1993-2013 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO USER: +# +# This source code is subject to NVIDIA ownership rights under U.S. and +# international Copyright laws. +# +# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE +# CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR +# IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, +# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +# OR PERFORMANCE OF THIS SOURCE CODE. +# +# U.S. Government End Users. This source code is a "commercial item" as +# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of +# "commercial computer software" and "commercial computer software +# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) +# and is provided to the U.S. Government only as a commercial end item. +# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through +# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the +# source code with only those rights set forth herein. 
+# +################################################################################ +# +# findgllib.mk is used to find the necessary GL Libraries for specific distributions +# this is supported on Mac OSX and Linux Platforms +# +################################################################################ + +# OS Name (Linux or Darwin) +OSUPPER = $(shell uname -s 2>/dev/null | tr "[:lower:]" "[:upper:]") +OSLOWER = $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + +# Flags to detect 32-bit or 64-bit OS platform +OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/" -e "s/armv7l/32/") +OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/") + +# Determine OS platform and unix distribution +ifeq ("$(OSLOWER)","linux") + # first search lsb_release + DISTRO = $(shell lsb_release -i -s 2>/dev/null | tr "[:upper:]" "[:lower:]") + DISTVER = $(shell lsb_release -r -s 2>/dev/null) + # $(info DISTRO1 = $(DISTRO) $(DISTVER)) + ifeq ($(DISTRO),) + # second search and parse /etc/issue + DISTRO = $(shell more /etc/issue | awk '{print $$1}' | sed '1!d' | sed -e "/^$$/d" 2>/dev/null | tr "[:upper:]" "[:lower:]") + DISTVER= $(shell more /etc/issue | awk '{print $$2}' | sed '1!d' 2>/dev/null) + # $(info DISTRO2 = $(DISTRO) $(DISTVER)) + endif + ifeq ($(DISTRO),) + # third, we can search in /etc/os-release or /etc/{distro}-release + DISTRO = $(shell awk '/ID/' /etc/*-release | sed 's/ID=//' | grep -v "VERSION" | grep -v "ID" | grep -v "DISTRIB") + DISTVER= $(shell awk '/DISTRIB_RELEASE/' /etc/*-release | sed 's/DISTRIB_RELEASE=//' | grep -v "DISTRIB_RELEASE") + # $(info DISTRO3 = $(DISTRO) $(DISTVER)) + endif +endif + +# Take command line flags that override any of these settings +ifeq ($(i386),1) + OS_SIZE = 32 + OS_ARCH = i686 +endif +ifeq ($(x86_64),1) + OS_SIZE = 64 + OS_ARCH = x86_64 +endif +ifeq ($(ARMv7),1) + OS_SIZE = 32 + OS_ARCH = armv7l +endif + +ifeq ("$(OSUPPER)","LINUX") + # $(info) >> findgllib.mk -> LINUX path <<<) + # Each set of Linux Distros have different paths for where to find their OpenGL libraries reside + ifeq ("$(DISTRO)","ubuntu") + GLPATH ?= /usr/lib/nvidia-current + GLLINK ?= -L/usr/lib/nvidia-current + DFLT_PATH ?= /usr/lib + endif + ifeq ("$(DISTRO)","kubuntu") + GLPATH ?= /usr/lib/nvidia-current + GLLINK ?= -L/usr/lib/nvidia-current + DFLT_PATH ?= /usr/lib + endif + ifeq ("$(DISTRO)","debian") + GLPATH ?= /usr/lib/nvidia-current + GLLINK ?= -L/usr/lib/nvidia-current + DFLT_PATH ?= /usr/lib + endif + ifeq ("$(DISTRO)","suse") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/X11R6/lib64 /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib64 -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","suse linux") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/X11R6/lib64 /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib64 -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","opensuse") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/X11R6/lib64 /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib64 -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= /usr/X11R6/lib + GLLINK ?= -L/usr/X11R6/lib + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","fedora") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","redhat") + ifeq 
($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","red") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","redhatenterpriseworkstation") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + ifeq ("$(DISTRO)","centos") + ifeq ($(OS_SIZE),64) + GLPATH ?= /usr/lib64/nvidia + GLLINK ?= -L/usr/lib64/nvidia + DFLT_PATH ?= /usr/lib64 + else + GLPATH ?= + GLLINK ?= + DFLT_PATH ?= /usr/lib + endif + endif + + ifeq ($(ARMv7),1) + GLPATH := /usr/arm-linux-gnueabihf/lib + GLLINK := -L/usr/arm-linux-gnueabihf/lib + ifneq ($(TARGET_FS),) + GLPATH += $(TARGET_FS)/usr/lib/nvidia-current $(TARGET_FS)/usr/lib/arm-linux-gnueabihf + GLLINK += -L$(TARGET_FS)/usr/lib/nvidia-current -L$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + + # find libGL, libGLU, libXi, + GLLIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libGL.so -print 2>/dev/null) + GLULIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libGLU.so -print 2>/dev/null) + X11LIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libX11.so -print 2>/dev/null) + XILIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libXi.so -print 2>/dev/null) + XMULIB := $(shell find $(GLPATH) $(DFLT_PATH) -name libXmu.so -print 2>/dev/null) + + ifeq ("$(GLLIB)",'') + $(info >>> WARNING - libGL.so not found, refer to CUDA Samples release notes for how to find and install them. <<<) + EXEC=@echo "[@]" + endif + ifeq ("$(GLULIB)",'') + $(info >>> WARNING - libGLU.so not found, refer to CUDA Samples release notes for how to find and install them. <<<) + EXEC=@echo "[@]" + endif + ifeq ("$(X11LIB)",'') + $(info >>> WARNING - libX11.so not found, refer to CUDA Samples release notes for how to find and install them. <<<) + EXEC=@echo "[@]" + endif + ifeq ("$(XILIB)",'') + $(info >>> WARNING - libXi.so not found, refer to CUDA Samples release notes for how to find and install them. <<<) + EXEC=@echo "[@]" + endif + ifeq ("$(XMULIB)",'') + $(info >>> WARNING - libXmu.so not found, refer to CUDA Samples release notes for how to find and install them. 
<<<) + EXEC=@echo "[@]" + endif +else + # This would be the Mac OS X path if we had to do anything special +endif + diff --git a/paredge/fix_ref.bib b/paredge/fix_ref.bib new file mode 100755 index 0000000..24bd478 --- /dev/null +++ b/paredge/fix_ref.bib @@ -0,0 +1,10 @@ +@article{ruijters2012gpu, +title={GPU prefilter for accurate cubic B-spline interpolation}, +author={Ruijters, Daniel and Th{\'e}venaz, Philippe}, +journal={The Computer Journal}, +volume={55}, +number={1}, +pages={15--20}, +year={2012}, +publisher={Oxford University Press} +} diff --git a/paredge/gpu_geforcetitan.txt b/paredge/gpu_geforcetitan.txt new file mode 100755 index 0000000..7df0903 --- /dev/null +++ b/paredge/gpu_geforcetitan.txt @@ -0,0 +1,40 @@ + CUDA Device Query (Runtime API) version (CUDART static linking) + +Detected 1 CUDA Capable device(s) + +Device 0: "GeForce GTX TITAN" + CUDA Driver Version / Runtime Version 5.5 / 5.5 + CUDA Capability Major/Minor version number: 3.5 + Total amount of global memory: 6143 MBytes (6441730048 bytes) + (14) Multiprocessors, (192) CUDA Cores/MP: 2688 CUDA Cores + GPU Clock rate: 876 MHz (0.88 GHz) + Memory Clock rate: 3004 Mhz + Memory Bus Width: 384-bit + L2 Cache Size: 1572864 bytes + Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096) + Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers + Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers + Total amount of constant memory: 65536 bytes + Total amount of shared memory per block: 49152 bytes + Total number of registers available per block: 65536 + Warp size: 32 + Maximum number of threads per multiprocessor: 2048 + Maximum number of threads per block: 1024 + Max dimension size of a thread block (x,y,z): (1024, 1024, 64) + Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) + Maximum memory pitch: 2147483647 bytes + Texture alignment: 512 bytes + Concurrent copy and kernel execution: Yes with 1 copy engine(s) + Run time limit on kernels: Yes + Integrated GPU sharing Host Memory: No + Support host page-locked memory mapping: Yes + Alignment requirement for Surfaces: Yes + Device has ECC support: Disabled + Device supports Unified Addressing (UVA): Yes + Device PCI Bus ID / PCI location ID: 3 / 0 + Compute Mode: + < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > + +deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce GTX TITAN +Result = PASS + diff --git a/paredge/gpu_quadrok2000.txt b/paredge/gpu_quadrok2000.txt new file mode 100755 index 0000000..c0b6f05 --- /dev/null +++ b/paredge/gpu_quadrok2000.txt @@ -0,0 +1,40 @@ + CUDA Device Query (Runtime API) version (CUDART static linking) + +Detected 1 CUDA Capable device(s) + +Device 0: "Quadro K2000" + CUDA Driver Version / Runtime Version 6.0 / 5.5 + CUDA Capability Major/Minor version number: 3.0 + Total amount of global memory: 2048 MBytes (2147155968 bytes) + ( 2) Multiprocessors, (192) CUDA Cores/MP: 384 CUDA Cores + GPU Clock rate: 954 MHz (0.95 GHz) + Memory Clock rate: 2000 Mhz + Memory Bus Width: 128-bit + L2 Cache Size: 262144 bytes + Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096) + Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers + Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers + Total amount of constant memory: 65536 bytes + Total amount of shared memory per block: 
49152 bytes + Total number of registers available per block: 65536 + Warp size: 32 + Maximum number of threads per multiprocessor: 2048 + Maximum number of threads per block: 1024 + Max dimension size of a thread block (x,y,z): (1024, 1024, 64) + Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) + Maximum memory pitch: 2147483647 bytes + Texture alignment: 512 bytes + Concurrent copy and kernel execution: Yes with 1 copy engine(s) + Run time limit on kernels: Yes + Integrated GPU sharing Host Memory: No + Support host page-locked memory mapping: Yes + Alignment requirement for Surfaces: Yes + Device has ECC support: Disabled + Device supports Unified Addressing (UVA): Yes + Device PCI Bus ID / PCI location ID: 15 / 0 + Compute Mode: + < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > + +deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 6.0, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = Quadro K2000 +Result = PASS + diff --git a/paredge/gpud_aux.cuh b/paredge/gpud_aux.cuh new file mode 100755 index 0000000..e1b8b82 --- /dev/null +++ b/paredge/gpud_aux.cuh @@ -0,0 +1,26 @@ +/** + * @file gpud_aux.cuh + * @brief Auxiliary GPU definitions (Deriche) and implementations + * @author Andre Maximo + * @date Jun, 2014 + */ + +#ifndef GPUD_AUX_CUH +#define GPUD_AUX_CUH + +//=== GLOBAL-SCOPE DEFINITIONS ================================================== + +#define NBAC 5 // # of blocks to adjust carriers for cols (smem=2x bxb) + +// basics +__constant__ Vector c_coefs; + +// alg4d +__constant__ Matrix c_TAFB_AC1P_T, c_HARB_AC2E_T; + +// alg5d +__constant__ Matrix c_AFP_T; +__constant__ Matrix c_TAFB_AC1B, c_HARB_AC2B, + c_AC1P_AFB_T, c_AC2E_ARB_T; + +#endif // GPUD_AUX_CUH diff --git a/paredge/perf01.txt b/paredge/perf01.txt new file mode 100755 index 0000000..cfdae02 --- /dev/null +++ b/paredge/perf01.txt @@ -0,0 +1,64 @@ +64 64 45.160233 64.117485 48.390018 80.791672 78.122360 +128 128 109.495667 136.387070 176.345718 314.120544 298.755859 +192 192 118.469055 136.417160 291.930145 468.001984 456.378448 +256 256 147.451080 177.776108 370.707458 518.250244 517.507812 +320 320 158.310699 182.476059 407.468628 614.357361 622.562927 +384 384 195.555023 240.068359 516.386780 700.726807 723.647888 +448 448 222.558670 268.162018 564.280396 743.403870 790.598755 +512 512 253.770355 293.569672 601.107056 785.934875 824.709595 +576 576 274.284668 315.311768 621.999939 786.772888 807.223083 +640 640 295.734497 343.989288 640.378601 807.160645 823.183716 +704 704 323.370361 361.492859 650.717712 820.898560 889.293152 +768 768 339.730896 378.031494 670.750122 845.371155 864.034302 +832 832 359.282227 396.145264 679.790649 832.685486 868.159180 +896 896 380.712860 421.281403 693.714478 854.128357 902.375732 +960 960 397.700775 433.684265 687.070068 849.685059 902.116638 +1024 1024 417.481720 462.083984 704.057678 873.345520 923.876404 +1088 1088 402.461182 459.446625 686.792358 861.952332 924.522461 +1152 1152 440.991272 456.722595 703.789551 868.538269 931.912659 +1216 1216 459.310455 487.640076 705.504578 842.084534 921.665527 +1280 1280 449.966736 499.377289 711.788818 867.322998 921.989441 +1344 1344 481.741241 486.807800 679.699585 846.891663 906.499329 +1408 1408 497.357574 495.586853 719.500671 883.092529 953.552917 +1472 1472 507.167328 516.822937 718.878479 880.720154 953.618774 +1536 1536 517.084473 519.867676 727.509460 887.431213 961.191284 +1600 1600 532.673401 519.448669 719.591919 887.360229 937.163574 +1664 1664 539.005981 
524.233765 726.562134 891.095398 945.263000 +1728 1728 520.373169 501.643402 702.776062 834.652527 887.506836 +1792 1792 535.895447 523.244385 742.748413 894.789246 965.409851 +1856 1856 575.067627 531.831421 744.350342 906.591736 966.308411 +1920 1920 576.124756 520.769897 748.907837 900.844238 874.108154 +1984 1984 584.959290 523.752197 740.765381 904.273254 964.834595 +2048 2048 591.803833 524.071838 747.518860 842.683044 950.838806 +2112 2112 603.336182 417.341034 736.741028 895.728882 972.008057 +2176 2176 598.840454 420.179169 745.584595 898.460449 971.712524 +2240 2240 615.018066 429.975250 741.152893 898.690552 978.043945 +2304 2304 614.313965 433.725708 747.196228 904.179321 982.713745 +2368 2368 624.929138 444.420715 747.538086 900.160950 973.525879 +2432 2432 636.339233 453.141083 750.813049 911.986145 974.878662 +2496 2496 637.298157 456.277039 751.672119 907.004333 973.859924 +2560 2560 643.568054 459.965729 752.201172 904.241211 971.773376 +2624 2624 641.684021 467.779022 749.641296 901.607300 979.460632 +2688 2688 643.450928 469.281982 751.228638 909.232849 978.871826 +2752 2752 651.060791 478.914642 752.868530 909.955444 980.774109 +2816 2816 653.118530 483.877350 762.031189 912.872803 984.416504 +2880 2880 653.587524 487.520447 745.559814 910.216736 979.572083 +2944 2944 655.284180 490.682709 761.514893 911.182190 987.370728 +3008 3008 660.920776 498.858307 758.948547 911.210510 984.590332 +3072 3072 658.691772 497.575195 761.142700 913.306885 987.769287 +3136 3136 646.869812 502.283569 751.161499 910.928223 978.665405 +3200 3200 660.213440 509.015381 758.642639 902.398193 983.400940 +3264 3264 650.241455 505.617371 755.336914 909.077698 976.947327 +3328 3328 665.676514 517.090149 753.827515 884.583374 985.660095 +3392 3392 656.700623 519.008423 759.489990 914.666992 983.742493 +3456 3456 671.460999 512.222229 756.539673 916.689148 987.267334 +3520 3520 675.414978 516.492004 759.940918 914.387085 983.277893 +3584 3584 678.601868 517.237427 764.180664 916.786072 978.531372 +3648 3648 675.313477 520.775330 765.306946 918.075928 992.878662 +3712 3712 676.890991 523.982605 766.977905 918.668640 993.898132 +3776 3776 667.165894 522.591797 766.459656 915.686523 988.108582 +3840 3840 653.314209 509.136292 767.725037 920.161560 992.750671 +3904 3904 671.175110 526.073364 763.760864 917.518555 989.826965 +3968 3968 668.559265 516.526672 761.514465 916.361694 984.483704 +4032 4032 676.315918 529.717957 766.719421 918.734192 989.240906 +4096 4096 683.850220 536.414429 765.472290 917.671021 987.498779 diff --git a/paredge/perf02.txt b/paredge/perf02.txt new file mode 100755 index 0000000..655236d --- /dev/null +++ b/paredge/perf02.txt @@ -0,0 +1,64 @@ +64 64 51.506283 59.294174 44.573425 79.845184 77.305832 +128 128 109.759804 133.756912 174.313293 312.091583 299.324646 +192 192 122.762535 141.430588 283.593109 465.061005 458.717773 +256 256 147.209396 173.429367 393.575867 593.916260 591.784668 +320 320 174.673340 209.546921 446.827850 650.946289 661.807922 +384 384 200.363998 235.921310 508.625824 693.695251 714.228394 +448 448 228.087891 265.998657 561.434937 753.261230 788.068420 +512 512 258.715515 300.524261 586.545532 769.635986 808.467773 +576 576 278.403381 317.791931 598.871521 779.056396 822.371704 +640 640 298.956635 345.284821 648.820862 815.117310 872.488647 +704 704 321.602966 365.987305 644.256714 813.182129 876.553284 +768 768 340.564331 381.660187 665.325012 833.149719 887.295166 +832 832 361.705933 404.222443 679.782959 816.362427 903.503052 +896 896 380.311340 
413.942841 685.026123 849.975281 917.003723 +960 960 362.206085 395.687347 625.351868 860.406311 913.973877 +1024 1024 416.232849 450.659698 702.201416 863.699341 930.278931 +1088 1088 426.081848 462.295898 674.021667 859.400452 923.629883 +1152 1152 442.198517 474.325653 687.865967 868.570923 938.970520 +1216 1216 454.775848 490.639557 692.894958 871.245667 940.268982 +1280 1280 471.797180 497.931732 708.519714 878.674438 874.902893 +1344 1344 484.016663 510.985046 707.645386 881.849670 947.863403 +1408 1408 497.170349 513.555359 720.201660 874.891052 956.538025 +1472 1472 509.107422 517.038757 721.355042 881.839539 949.424072 +1536 1536 520.583801 519.536072 730.219177 885.242188 965.862488 +1600 1600 532.904846 510.202423 727.696838 890.487732 949.014099 +1664 1664 523.123352 526.567444 738.054626 890.733032 970.909485 +1728 1728 546.770813 507.838623 736.654053 889.089661 969.930908 +1792 1792 557.636536 525.379761 742.538879 893.314514 971.254883 +1856 1856 571.082336 528.900757 740.567139 899.175049 967.873840 +1920 1920 574.407471 518.737610 742.488892 901.531799 978.758911 +1984 1984 585.905640 487.733490 741.941833 902.622192 971.019287 +2048 2048 584.043701 540.448120 746.738647 899.985352 978.578430 +2112 2112 602.731689 417.438416 739.744751 902.534912 968.760681 +2176 2176 598.659363 420.051270 742.580811 901.148560 980.216858 +2240 2240 614.454346 431.110077 744.342407 903.073059 971.016602 +2304 2304 613.796448 434.699310 748.545959 903.312500 977.304565 +2368 2368 628.194336 446.281433 749.469788 908.359680 978.983582 +2432 2432 634.329895 453.890259 754.433167 908.573730 982.021301 +2496 2496 636.775696 456.107330 752.925964 911.523376 981.154968 +2560 2560 646.821655 462.572205 755.554321 911.247925 986.051453 +2624 2624 648.622314 470.022522 756.165222 912.283264 981.182373 +2688 2688 646.356140 470.587860 758.414307 913.817749 983.367249 +2752 2752 654.524658 482.325287 757.287964 914.000366 987.715088 +2816 2816 653.752808 483.601898 756.861816 910.569458 984.186890 +2880 2880 653.678711 486.463409 759.201111 914.739380 981.833252 +2944 2944 656.637756 491.532867 762.054199 916.912720 989.290771 +3008 3008 664.706726 501.213104 761.967590 916.951904 990.543945 +3072 3072 660.321533 499.157654 761.407104 917.808655 990.417725 +3136 3136 666.532654 508.196411 755.210144 912.899170 986.584900 +3200 3200 666.187378 511.404205 758.321350 910.805420 987.848267 +3264 3264 654.122070 506.821411 759.466553 915.700073 989.437317 +3328 3328 669.785522 517.792725 761.961121 916.254456 993.082336 +3392 3392 674.182617 520.439514 759.942078 914.273743 990.591492 +3456 3456 673.491943 516.815918 760.758545 916.864685 993.150208 +3520 3520 643.759521 495.513672 732.179749 918.130432 993.693604 +3584 3584 679.994507 521.850830 764.833496 917.416321 995.075928 +3648 3648 676.976074 515.804932 765.742126 881.610901 945.287842 +3712 3712 641.856262 502.447968 766.438416 920.871216 950.095276 +3776 3776 670.322266 521.055725 758.005981 907.376648 982.342468 +3840 3840 647.868469 507.671509 754.048828 899.487976 950.288391 +3904 3904 664.707642 523.931396 746.185303 911.999329 985.176880 +3968 3968 671.114563 519.500793 764.939819 893.656128 982.824585 +4032 4032 665.139038 526.386353 749.845032 897.570068 969.283752 +4096 4096 685.134766 529.514099 761.644653 913.418091 965.152466 diff --git a/paredge/readme.txt b/paredge/readme.txt new file mode 100755 index 0000000..0bba713 --- /dev/null +++ b/paredge/readme.txt @@ -0,0 +1,6 @@ +Sample: Recursive Gaussian Filter +Minimum spec: SM 1.0 + 
+This sample implements a Gaussian blur using Deriche's recursive method. The advantage of this method is that the execution time is independent of the filter width. + +Key concepts: diff --git a/paredge/recAlg_cuda.cu b/paredge/recAlg_cuda.cu new file mode 100755 index 0000000..99938e0 --- /dev/null +++ b/paredge/recAlg_cuda.cu @@ -0,0 +1,97 @@ +/** + * @file recAlg_cuda.cuh + * @brief Recursive Algorithms in the GPU + * @author Andre Maximo + * @date Jun, 2014 + */ + +#include "recursiveGaussian.h" + +#include "alg0_cpu.h" +#include "alg5_cpu.h" +#include "gpu_aux.h" +#include "alg5_gpu.h" +#include "alg5f4_aux.h" +#include "alg5f4_gpu.h" +#include "alg5f4_gauss_gpu.h" + +#include "gpud_aux.cuh" +#include "alg4pd_gpu.cuh" +#include "alg4d_gpu.cuh" +#include "alg5d_gpu.cuh" + +/** + * @brief Run alg5f4 - Recursive Gaussian filter van Vliet (our alg gray float) + * + * @param[in,out] iod Input/output data + * @param[in] width Image width + * @param[in] height Image height + * @param[in] sigma Sigma of the Gaussian kernel + */ + +extern "C" +void runAlg5f4(float *iod, int width, int height, float sigma) { + +#if CLAMPTOEDGE + const BorderType border_type = CLAMP_TO_EDGE; + const int border = 1; // 1 block outside +#else + const BorderType border_type = CLAMP_TO_ZERO; + const int border = 0; +#endif + + Vector w1; + Vector w2; + Vector w3; + weights(sigma, w1); + weights(sigma, w2); + weights(sigma, w3); // to use r=3 change R in recursiveGaussian.h + + // For lena image in gray 512x512 + // Deriche (original): 246 MiP/s + // Deriche (row/col): 289 MiP/s + // Alg5r3: 272 MiP/s + // Alg5f5: 378 MiP/s + // Alg5f4: 418 MiP/s + + //alg0_cpu<1>(iod, width, height, w1, border, border_type); + //alg0_cpu<2>(iod, width, height, w2, border, border_type); + //alg0_cpu<3>(iod, width, height, w3, border, border_type); + //alg5_gpu(iod, width, height, w3, border, border_type); + //alg5f5_gpu(iod, width, height, w1, w2, border, border_type); + alg5f4_gpu(iod, width, height, w1, w2, border, border_type); + +} + +/** + * @brief Run alg4pd - Recursive Gaussian filter Deriche (our alg gray float) + * + * @param[in,out] iod Input/output data + * @param[in] width Image width + * @param[in] height Image height + * @param[in] sigma Sigma of the Gaussian kernel + * @param[in] order Filter order + */ + +// R in recursiveGaussian.h must be 2 + +extern "C" +void runAlg4pd(float *iod, int width, int height, float sigma, int order) { + + alg4pd_gpu(iod, width, height, sigma, order); + +} + +extern "C" +void runAlg4d(float *iod, int width, int height, float sigma, int order) { + + alg4d_gpu(iod, width, height, sigma, order); + +} + +extern "C" +void runAlg5d(float *iod, int width, int height, float sigma, int order) { + + alg5d_gpu(iod, width, height, sigma, order); + +} diff --git a/paredge/recfilter.sobj b/paredge/recfilter.sobj new file mode 100755 index 0000000..378c880 Binary files /dev/null and b/paredge/recfilter.sobj differ diff --git a/paredge/recursiveGaussian.cpp b/paredge/recursiveGaussian.cpp new file mode 100755 index 0000000..f6b5d73 --- /dev/null +++ b/paredge/recursiveGaussian.cpp @@ -0,0 +1,699 @@ +#pragma warning(disable:4819) + +/* + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. 
Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + Recursive Gaussian filter + sgreen 8/1/08 + + This code sample implements a Gaussian blur using Deriche's recursive method: + http://citeseer.ist.psu.edu/deriche93recursively.html + + This is similar to the box filter sample in the SDK, but it uses the previous + outputs of the filter as well as the previous inputs. This is also known as an + IIR (infinite impulse response) filter, since its response to an input impulse + can last forever. + + The main advantage of this method is that the execution time is independent of + the filter width. + + The GPU processes columns of the image in parallel. To avoid uncoalesced reads + for the row pass we transpose the image and then transpose it back again + afterwards. + + The implementation is based on code from the CImg library: + http://cimg.sourceforge.net/ + Thanks to David Tschumperl and all the CImg contributors! +*/ + +// OpenGL Graphics includes +#include +#if defined (__APPLE__) || defined(MACOSX) +#include +#else +#include +#endif + +// CUDA includes and interop headers +#include +#include + +// CUDA utilities and system includes +#include +#include // includes cuda.h and cuda_runtime_api.h +#include // includes cuda_runtime_api.h + +// Includes +#include +#include +#include +#include + +#include "recursiveGaussian.h" + +#define MAX(a,b) ((a > b) ? a : b) + +#define USE_SIMPLE_FILTER 0 + +#define MAX_EPSILON_ERROR 5.0f +#define THRESHOLD 0.15f + +// Define the files that are to be save and the reference images for validation +const char *sOriginal[] = +{ + "lena_10.ppm", + "lena_14.ppm", + "lena_18.ppm", + "lena_22.ppm", + NULL +}; + +const char *sReference[] = +{ + "ref_10.ppm", + "ref_14.ppm", + "ref_18.ppm", + "ref_22.ppm", + NULL +}; + +char *image_filename = (char*)"lena.ppm"; +//char *image_filename = (char*)"calhouse.ppm"; +float sigma = 4.0f; +int order = 0; +int nthreads = 256; // number of threads per block + +unsigned int width, height; +unsigned int *h_img = NULL; +unsigned int *d_img = NULL; +unsigned int *d_temp = NULL; + +float *h_gimg = NULL; +float *d_gimg = NULL; +float *d_gtemp = NULL; + +GLuint pbo = 0; // OpenGL pixel buffer object +GLuint texid = 0; // texture + +StopWatchInterface *timer = 0; + +// Auto-Verification Code +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +uint frameCount = 0; + +int *pArgc = NULL; +char **pArgv = NULL; + +bool runBenchmark = true; + +const char *sSDKsample = "CUDA Recursive Gaussian"; + +extern "C" +void gaussianFilterRGBA(unsigned int *d_src, unsigned int *d_dest, unsigned int *d_temp, int width, int height, float sigma, int order, int nthreads); + +extern "C" +void rg0(float *d_src, float *d_dest, float *d_temp, int width, int height, float sigma, int order, int nthreads); + +extern "C" +void rg1(float *d_src, float *d_dest, float *d_temp, int width, int height, float sigma, int order, int nthreads); + +extern "C" +float rgbaIntToFloat1(uint c); + +extern "C" +void convert_rgbaIntToFloat(uint *id, float *od0, float *od1, float *od2, float *od3, int w, int h); + +extern "C" +void convert_rgbaFloatToUchar(uchar *od, float *id0, float *id1, float *id2, float *id3, int w, int h); + +extern "C" +void runAlg5f4(float *id, int width, int height, float sigma); + +extern "C" +void runAlg4pd(float *id, int width, int height, float sigma, int order); 
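+/*
+  Chaining the entry points for color images: together with runAlg4d() and
+  runAlg5d() declared just below, the extern "C" functions above are used per
+  color plane when filtering a packed RGBA image: the image is unpacked into
+  four float planes, each plane is filtered, and the planes are repacked to
+  8-bit. A minimal sketch of that flow (mirroring the intent of colorAlg()
+  further down; the buffer names here are illustrative only, not part of the
+  sample):
+
+      std::vector<float> r(width*height), g(width*height),
+                         b(width*height), a(width*height);
+      convert_rgbaIntToFloat(h_img, &r[0], &g[0], &b[0], &a[0], width, height);
+      runAlg5f4(&r[0], width, height, sigma);   // one call per plane
+      runAlg5f4(&g[0], width, height, sigma);
+      runAlg5f4(&b[0], width, height, sigma);
+      runAlg5f4(&a[0], width, height, sigma);
+      uchar *out = new uchar[width*height*4];
+      convert_rgbaFloatToUchar(out, &r[0], &g[0], &b[0], &a[0], width, height);
+      // ... save or display 'out', then delete [] out;
+*/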
+ +extern "C" +void runAlg4d(float *id, int width, int height, float sigma, int order); + +extern "C" +void runAlg5d(float *id, int width, int height, float sigma, int order); + +void cleanup(); + +void computeFPS() +{ + frameCount++; + fpsCount++; + + if (fpsCount == fpsLimit) + { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "%s (sigma=%4.2f): %3.1f fps", sSDKsample, sigma, ifps); + + glutSetWindowTitle(fps); + fpsCount = 0; + + fpsLimit = MAX(ifps, 1.f); + sdkResetTimer(&timer); + } +} + +// display results using OpenGL +void display() +{ + sdkStartTimer(&timer); + + // execute filter, writing results to pbo + unsigned int *d_result; + checkCudaErrors(cudaGLMapBufferObject((void **)&d_result, pbo)); + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + checkCudaErrors(cudaGLUnmapBufferObject(pbo)); + + // load texture from pbo + glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBindTexture(GL_TEXTURE_2D, texid); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + + // display results + glClear(GL_COLOR_BUFFER_BIT); + + glEnable(GL_TEXTURE_2D); + glDisable(GL_DEPTH_TEST); + + glBegin(GL_QUADS); + glTexCoord2f(0, 1); + glVertex2f(0, 0); + glTexCoord2f(1, 1); + glVertex2f(1, 0); + glTexCoord2f(1, 0); + glVertex2f(1, 1); + glTexCoord2f(0, 0); + glVertex2f(0, 1); + glEnd(); + + glDisable(GL_TEXTURE_2D); + glutSwapBuffers(); + + sdkStopTimer(&timer); + + computeFPS(); +} + +void idle() +{ + glutPostRedisplay(); +} + +void cleanup() +{ + sdkDeleteTimer(&timer); + + checkCudaErrors(cudaFree(d_img)); + checkCudaErrors(cudaFree(d_temp)); + checkCudaErrors(cudaFree(d_gimg)); + checkCudaErrors(cudaFree(d_gtemp)); + + delete [] h_img; + delete [] h_gimg; + + if (!runBenchmark) + { + if (pbo) + { + checkCudaErrors(cudaGLUnregisterBufferObject(pbo)); + glDeleteBuffersARB(1, &pbo); + } + + if (texid) + { + glDeleteTextures(1, &texid); + } + } +} + +void keyboard(uchar key, int x, int y) +{ + switch (key) + { + case 27: + exit(EXIT_SUCCESS); + break; + + case '=': + sigma+=0.1f; + break; + + case '-': + sigma-=0.1f; + + if (sigma < 0.0) + { + sigma = 0.0f; + } + + break; + + case '+': + sigma+=1.0f; + break; + + case '_': + sigma-=1.0f; + + if (sigma < 0.0) + { + sigma = 0.0f; + } + + break; + + case '0': + order = 0; + break; + + case '1': + order = 1; + sigma = 0.5f; + break; + + case '2': + order = 2; + sigma = 0.5f; + break; + + default: + break; + } + + printf("sigma = %f\n", sigma); + glutPostRedisplay(); +} + +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); + + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); +} + +void initCudaBuffers() +{ + unsigned int size = width * height * sizeof(unsigned int); + unsigned int gsize = width * height * sizeof(float); // size for gray image + + // allocate device memory + checkCudaErrors(cudaMalloc((void **) &d_img, size)); + checkCudaErrors(cudaMalloc((void **) &d_temp, size)); + checkCudaErrors(cudaMalloc((void **) &d_gimg, gsize)); + checkCudaErrors(cudaMalloc((void **) &d_gtemp, gsize)); + + checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_gimg, h_gimg, gsize, cudaMemcpyHostToDevice)); + + sdkCreateTimer(&timer); +} + + +void initGLBuffers() +{ + // create pixel buffer object to store 
final image + glGenBuffersARB(1, &pbo); + glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(GLubyte)*4, h_img, GL_STREAM_DRAW_ARB); + + glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + checkCudaErrors(cudaGLRegisterBufferObject(pbo)); + + // create texture for display + glGenTextures(1, &texid); + glBindTexture(GL_TEXTURE_2D, texid); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); +} + +void initGL(int *argc, char **argv) +{ + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); + glutInitWindowSize(width, height); + glutCreateWindow(sSDKsample); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + glutIdleFunc(idle); + + printf("Press '+' and '-' to change filter width\n"); + printf("0, 1, 2 - change filter order\n"); + + glewInit(); + + if (!glewIsSupported("GL_VERSION_2_0 GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) + { + fprintf(stderr, "Required OpenGL extensions missing."); + exit(EXIT_FAILURE); + } +} + +void +benchmark(void) +{ + // allocate memory for result + float *d_gresult; + unsigned int size = width * height * sizeof(float); + checkCudaErrors(cudaMalloc((void **) &d_gresult, size)); + + // execute the kernel (RUNTIMES number of times) + rg0(d_gimg, d_gresult, d_gtemp, width, height, sigma, order, nthreads); + + float *rg0_img = new float[width*height]; + checkCudaErrors(cudaMemcpy(rg0_img, d_gresult, width*height*sizeof(float), cudaMemcpyDeviceToHost)); + + rg1(d_gimg, d_gresult, d_gtemp, width, height, sigma, order, nthreads); + + float *rg1_img = new float[width*height]; + checkCudaErrors(cudaMemcpy(rg1_img, d_gresult, width*height*sizeof(float), cudaMemcpyDeviceToHost)); + + // algs + int ne = width*height; + float me = 0.f, mre = 0.f; + + std::vector< float > van_img(ne), ap4_img(ne), ag4_img(ne), ag5_img(ne); + for (uint i = 0; i < ne; ++i) + van_img[i] = ap4_img[i] = ag4_img[i] = ag5_img[i] = h_gimg[i]; + + runAlg5f4(&van_img[0], width, height, sigma); + + runAlg4pd(&ap4_img[0], width, height, sigma, order); + + runAlg4d(&ag4_img[0], width, height, sigma, order); + + //runAlg5d(&ag5_img[0], width, height, sigma, order); + + static char *rg0fn = (char*)"rg0.pgm"; + static char *rg1fn = (char*)"rg1.pgm"; + static char *vanfn = (char*)"van.pgm"; + static char *ap4fn = (char*)"ap4.pgm"; + static char *ag4fn = (char*)"ag4.pgm"; + + sdkSavePGM(rg0fn, rg0_img, width, height); + sdkSavePGM(rg1fn, rg1_img, width, height); + sdkSavePGM(vanfn, &van_img[0], width, height); + sdkSavePGM(ap4fn, &ap4_img[0], width, height); + sdkSavePGM(ag4fn, &ag4_img[0], width, height); + +#if RUNTIMES==1 + std::cout << " --- Checking computations ---\n"; + check_cpu_reference( rg0_img, rg1_img, ne, me, mre ); + std::cout << " rg0 x rg1: me = " << me << " mre = " << mre << "\n"; + check_cpu_reference( rg0_img, &van_img[0], ne, me, mre ); + std::cout << " rg0 x alg5f4: me = " << me << " mre = " << mre << "\n"; + check_cpu_reference( rg0_img, &ap4_img[0], ne, me, mre ); + std::cout << " rg0 x alg4pd: me = " << me << " mre = " << mre << "\n"; + check_cpu_reference( rg0_img, &ag4_img[0], ne, me, mre ); + std::cout << " rg0 x alg4d: me = " << me << " mre = " << mre << "\n"; +#endif + + delete [] rg0_img; + delete [] rg1_img; + + 
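+    // Note on the checks above: each alternative implementation is compared
+    // against the rg0 (original Deriche sample) output on the same grayscale
+    // input; me is presumably the maximum absolute per-pixel error and mre the
+    // maximum relative error (roughly max_i |out_i - ref_i| / |ref_i|) as
+    // reported by check_cpu_reference(), so values near zero mean the
+    // algorithms agree.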
checkCudaErrors(cudaFree(d_gresult)); +} + +void +colorAlg(void) +{ + int ne = width*height; + std::vector< float > *alg_img = new std::vector< float >[4]; + + for (uint i = 0; i < 4; ++i) + { + alg_img[i].resize(ne); + } + + convert_rgbaIntToFloat(h_img, &alg_img[0][0], &alg_img[1][0], + &alg_img[2][0], &alg_img[3][0], width, height); + + for (uint i = 0; i < ne; ++i) + { + runAlg5f4(&alg_img[i][0], width, height, sigma); + std::cout << "\n"; + } + + uchar *a_result = new uchar[ne*4]; + convert_rgbaFloatToUchar(a_result, &alg_img[0][0], &alg_img[1][0], + &alg_img[2][0], &alg_img[3][0], width, height); + + static char *algppmfn = (char*)"alg.ppm"; + sdkSavePPM4ub(algppmfn, a_result, width, height); + + delete [] alg_img; +} + +bool +runSingleTest(const char *ref_file, const char *exec_path) +{ + // allocate memory for result + int nTotalErrors = 0; + unsigned int *d_result; + unsigned int size = width * height * sizeof(unsigned int); + checkCudaErrors(cudaMalloc((void **) &d_result, size)); + + // warm-up + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStartTimer(&timer); + + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("Kernel execution failed"); + sdkStopTimer(&timer); + + uchar *h_result = (uchar *)malloc(width*height*4); + checkCudaErrors(cudaMemcpy(h_result, d_result, width*height*4, cudaMemcpyDeviceToHost)); + + char dump_file[1024]; + sprintf(dump_file, "lena_%02d.ppm", (int)sigma); + sdkSavePPM4ub(dump_file, h_result, width, height); + + if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false)) + { + nTotalErrors++; + } + + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + printf("%.2f Mpixels/sec\n", (width*height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); + + checkCudaErrors(cudaFree(d_result)); + free(h_result); + + printf("Summary: %d errors!\n", nTotalErrors); + + printf(nTotalErrors == 0 ? 
"Test passed\n": "Test failed!\n"); + return(nTotalErrors == 0); +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int +main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; + char *ref_file = NULL; + +#if RUNTIMES==1 + printf("%s Starting...\n\n", sSDKsample); +#endif + + // use command-line specified CUDA device, otherwise use device with highest Gflops/s + if (argc > 1) + { + if (checkCmdLineFlag(argc, (const char **)argv, "file")) + { + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + fpsLimit = frameCheckNumber; + } + } + + // Get the path of the filename + char *filename; + + if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename)) + { + image_filename = filename; + } + + // load image or random image + char *image_path = sdkFindFilePath(image_filename, argv[0]); + + if (image_path == NULL) + { + fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); + exit(EXIT_FAILURE); + } + + bool randimg = false; + + if (checkCmdLineFlag(argc, (const char **)argv, "width")) + { + width = getCmdLineArgumentInt(argc, (const char **) argv, "width"); + randimg = true; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "height")) + { + height = getCmdLineArgumentInt(argc, (const char **) argv, "height"); + randimg = true; + } + + if (randimg) + { + image_path = (char *)"rand"; + h_img = new unsigned int[width*height*4]; + srand(1234); + for (int i = 0; i < width*height*4; ++i) + h_img[i] = (unsigned int)(rand() % 256); + } else + { + sdkLoadPPM4ub(image_path, (uchar **)&h_img, &width, &height); + } + + if (!h_img) + { + printf("Error unable to generate image file: '%s'\n", image_path); + exit(EXIT_FAILURE); + } + +#if RUNTIMES==1 + printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); +#endif + + if (checkCmdLineFlag(argc, (const char **)argv, "threads")) + { + //nthreads = getCmdLineArgumentInt(argc, (const char **) argv, "threads"); + printf("Error unable to change number of threads!'\n"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) + { + sigma = getCmdLineArgumentFloat(argc, (const char **) argv, "sigma"); + } + +#if RUNTIMES==1 + printf("Sigma: %f\n", sigma); +#endif + + if (checkCmdLineFlag(argc, (const char **)argv, "order")) + { + order = getCmdLineArgumentInt(argc, (const char **) argv, "order"); + } + +#if RUNTIMES==1 + printf("Filter order: %d\n", order); +#endif + + if (checkCmdLineFlag(argc, (const char **)argv, "benchmark")) + { + runBenchmark = (bool)getCmdLineArgumentInt(argc, (const char **) argv, "benchmark"); + } + +#if RUNTIMES==1 + printf("Run benchmark: %d\n", (benchmark?1:0)); +#endif + + h_gimg = new float[width*height]; // allocate gray image + + for (uint i=0; i +#include +#include + +#include +#include +#include + +#include "recursiveGaussian.h" + +#include "recursiveGaussian_kernel.cuh" +#include "rg.cuh" + +#define USE_SIMPLE_FILTER 0 + +#define WARPSIZE 32 + +//Round a / b to nearest higher integer value +int iDivUp(int a, int b) +{ + return (a % b != 0) ? 
(a / b + 1) : (a / b); +} + +/* + Transpose a 2D array (see SDK transpose example) +*/ +template< typename T > +void transpose(T *d_src, T *d_dest, uint width, int height) +{ + dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1); + dim3 threads(BLOCK_DIM, BLOCK_DIM, 1); + d_transpose<<< grid, threads >>>(d_dest, d_src, width, height); + getLastCudaError("Kernel execution failed"); +} + +/** + * @brief Computer recursive filter parameters + * @param[out] a0 Filter param a0 + * @param[out] a1 Filter param a1 + * @param[out] a2 Filter param a2 + * @param[out] a3 Filter param a3 + * @param[out] b1 Filter param b1 + * @param[out] b2 Filter param b2 + * @param[out] coefp Filter param coefficient previous + * @param[out] coefn Filter param coefficient next + * @param[in] sigma Sigma of Gaussian + * @param[in] order Filter order +*/ +extern "C" +void computeFilterParams(float& a0, float& a1, float& a2, float& a3, + float& b1, float& b2, float& coefp, float& coefn, + const float& sigma, const int& order) +{ + // compute filter coefficients + const float + nsigma = sigma < 0.1f ? 0.1f : sigma, + alpha = 1.695f / nsigma, + ema = (float)std::exp(-alpha), + ema2 = (float)std::exp(-2*alpha); + + b1 = -2*ema; + b2 = ema2; + + a0 = a1 = a2 = a3 = coefp = coefn = 0.f; + + switch (order) + { + case 0: + { + const float k = (1-ema)*(1-ema)/(1+2*alpha*ema-ema2); + a0 = k; + a1 = k*(alpha-1)*ema; + a2 = k*(alpha+1)*ema; + a3 = -k*ema2; + } + break; + + case 1: + { + const float k = (1-ema)*(1-ema)/ema; + a0 = k*ema; + a1 = a3 = 0; + a2 = -a0; + } + break; + + case 2: + { + const float + ea = (float)std::exp(-alpha), + k = -(ema2-1)/(2*alpha*ema), + kn = (-2*(-1+3*ea-3*ea*ea+ea*ea*ea)/(3*ea+1+3*ea*ea+ea*ea*ea)); + a0 = kn; + a1 = -kn*(1+k*alpha)*ema; + a2 = kn*(1-k*alpha)*ema; + a3 = -kn*ema2; + } + break; + + default: + fprintf(stderr, "gaussianFilter: invalid order parameter!\n"); + return; + } + + coefp = (a0+a1)/(1+b1+b2); + coefn = (a2+a3)/(1+b1+b2); +} + +/* + Perform Gaussian filter on a 2D image using CUDA + + Parameters: + d_src - pointer to input image in device memory + d_dest - pointer to destination image in device memory + d_temp - pointer to temporary storage in device memory + width - image width + height - image height + sigma - sigma of Gaussian + order - filter order (0, 1 or 2) +*/ + +// 8-bit RGBA version +extern "C" +void gaussianFilterRGBA(uint *d_src, uint *d_dest, uint *d_temp, int width, int height, float sigma, int order, int nthreads) +{ + float a0, a1, a2, a3, b1, b2, coefp, coefn; + computeFilterParams(a0, a1, a2, a3, b1, b2, coefp, coefn, sigma, order); + + // process columns +#if USE_SIMPLE_FILTER + d_simpleRecursive_rgba<<< iDivUp(width, nthreads), nthreads >>>(d_src, d_temp, width, height, ema); +#else + d_recursiveGaussian_rgba<<< iDivUp(width, nthreads), nthreads >>>(d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn); +#endif + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + // process rows +#if USE_SIMPLE_FILTER + d_simpleRecursive_rgba<<< iDivUp(height, nthreads), nthreads >>>(d_dest, d_temp, height, width, ema); +#else + d_recursiveGaussian_rgba<<< iDivUp(height, nthreads), nthreads >>>(d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn); +#endif + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); +} + +// 32-bit gray version 
of gaussianFilterRGBA +extern "C" +void rg0(float *d_src, float *d_dest, float *d_temp, int width, int height, float sigma, int order, int nthreads) +{ + float a0, a1, a2, a3, b1, b2, coefp, coefn; + computeFilterParams(a0, a1, a2, a3, b1, b2, coefp, coefn, sigma, order); + + // grids (width and height) and block dimension + dim3 gdW(iDivUp(width, nthreads), 1); + dim3 gdH(iDivUp(height, nthreads), 1); + dim3 bd(WARPSIZE, nthreads/WARPSIZE); // block dimension + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("rg0", width*height*RUNTIMES, "iP"); + + for (int r=0; r>>(d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + d_rg0<<< gdH, bd >>>(d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " << std::flush; +#else + base_timer &timer_total = timers.gpu_add("rg0", width*height, "iP"); + base_timer *timer; + + timer = &timers.gpu_add("rg0-rows"); + + d_rg0<<< gdW, bd >>>(d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg0-transpose"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg0-cols"); + + d_rg0<<< gdH, bd >>>(d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg0-transpose"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); + + timer->stop(); + + timer_total.stop(); + timers.flush(); +#endif +} + +// Parallel up/down Deriche Recursive Gaussian filter +extern "C" +void rg1(float *d_src, float *d_dest, float *d_temp, int width, int height, float sigma, int order, int nthreads) +{ + float a0, a1, a2, a3, b1, b2, coefp, coefn; + computeFilterParams(a0, a1, a2, a3, b1, b2, coefp, coefn, sigma, order); + + // grids (width and height) and block dimension + dim3 gdW(iDivUp(width, nthreads)*2, 1); + dim3 gdH(iDivUp(height, nthreads)*2, 1); + dim3 bd(WARPSIZE, nthreads/WARPSIZE); + +#if RUNTIMES>1 + base_timer &timer_total = timers.gpu_add("rg1", width*height*RUNTIMES, "iP"); + + for (int r=0; r>>(d_src, d_temp, d_dest, width, height, + a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + d_rg1<<< gdH, bd >>>(d_dest, d_temp, d_src, height, width, + a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); + } + + timer_total.stop(); + + std::cout << std::fixed << timer_total.data_size()/(timer_total.elapsed()*1024*1024) << " " << std::flush; +#else + base_timer &timer_total = timers.gpu_add("rg1", width*height, "iP"); + base_timer *timer; + + timer = &timers.gpu_add("rg1-rows"); + + d_rg1<<< gdW, bd >>>(d_src, d_temp, d_dest, width, height, + a0, a1, a2, a3, b1, b2, coefp, coefn); + 
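+        // Note on the launch just above: unlike rg0, each d_rg1 block covers
+        // only bdx*bdy/2 image columns, because half of its warps run the
+        // forward (causal) pass while the other half run the reverse
+        // (anticausal) pass over the same columns (see the ty < bdy2 split in
+        // d_rg1). That is why gdW and gdH are sized with iDivUp(...)*2.
+        // For example, with the default nthreads = 256 and WARPSIZE = 32 the
+        // block is 32x8 threads: warps with ty in [0,3] do the forward pass
+        // and warps with ty in [4,7] the reverse pass over the same 128
+        // columns, so twice as many blocks are needed to span the full width.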
getLastCudaError("Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg1-transpose"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg1-cols"); + + d_rg1<<< gdH, bd >>>(d_dest, d_temp, d_src, height, width, + a0, a1, a2, a3, b1, b2, coefp, coefn); + getLastCudaError("Kernel execution failed"); + + timer->stop(); timer = &timers.gpu_add("rg1-transpose"); + + transpose(d_temp, d_dest, height, width); + getLastCudaError("transpose: Kernel execution failed"); + + timer->stop(); + + timer_total.stop(); + timers.flush(); +#endif +} diff --git a/paredge/recursiveGaussian_kernel.cuh b/paredge/recursiveGaussian_kernel.cuh new file mode 100755 index 0000000..9d4420b --- /dev/null +++ b/paredge/recursiveGaussian_kernel.cuh @@ -0,0 +1,244 @@ +/* + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + Recursive Gaussian filter +*/ + +#ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_ +#define _RECURSIVEGAUSSIAN_KERNEL_CU_ + +#include +#include +#include +#include +#include + +#define BLOCK_DIM 16 + +// Transpose kernel (see transpose SDK sample for details) +template< typename T > +__global__ void d_transpose(T *odata, T *idata, int width, int height) +{ + __shared__ T block[BLOCK_DIM][BLOCK_DIM+1]; + + // read the matrix tile into shared memory + unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x; + unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y; + + if ((xIndex < width) && (yIndex < height)) + { + unsigned int index_in = yIndex * width + xIndex; + block[threadIdx.y][threadIdx.x] = idata[index_in]; + } + + __syncthreads(); + + // write the transposed matrix tile to global memory + xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x; + yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y; + + if ((xIndex < height) && (yIndex < width)) + { + unsigned int index_out = yIndex * height + xIndex; + odata[index_out] = block[threadIdx.x][threadIdx.y]; + } +} + +// RGBA version +// reads from 32-bit uint array holding 8-bit RGBA + +// convert floating point rgba color to 32-bit integer +__device__ uint rgbaFloatToInt(float4 rgba) +{ + rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0] + rgba.y = __saturatef(rgba.y); + rgba.z = __saturatef(rgba.z); + rgba.w = __saturatef(rgba.w); + return (uint(rgba.w*255)<<24) | (uint(rgba.z*255)<<16) | (uint(rgba.y*255)<<8) | uint(rgba.x*255); +} + +// convert from 32-bit int to float4 +__host__ __device__ float4 rgbaIntToFloat(uint c) +{ + float4 rgba; + rgba.x = (c & 0xff) / 255.0f; + rgba.y = ((c>>8) & 0xff) / 255.0f; + rgba.z = ((c>>16) & 0xff) / 255.0f; + rgba.w = ((c>>24) & 0xff) / 255.0f; + return rgba; +} + +extern "C" +__host__ float rgbaIntToFloat1(uint c) +{ + float4 rgba = rgbaIntToFloat(c); + return (rgba.x+rgba.y+rgba.z+rgba.w)/4.f; +} + +extern "C" +__host__ void convert_rgbaIntToFloat(uint *id, float *od0, float *od1, float *od2, float *od3, int w, int h) +{ + float4 rgba; + for (int i = 0; i < w*h; ++i) { + rgba = rgbaIntToFloat(id[i]); + od0[i] = rgba.x; + od1[i] = rgba.y; + od2[i] = rgba.z; + od3[i] = rgba.w; + } +} + +__host__ float saturatef(const float& v) { + if (v > 1.f) return 1.f; + 
else if (v < 0.f) return 0.f; + return v; +} + +extern "C" +__host__ void convert_rgbaFloatToUchar(uchar *od, float *id0, float *id1, float *id2, float *id3, int w, int h) +{ + for (int i = 0; i < w*h; ++i) { + od[i*4+0] = uchar(saturatef(id0[i])*255); + od[i*4+1] = uchar(saturatef(id1[i])*255); + od[i*4+2] = uchar(saturatef(id2[i])*255); + od[i*4+3] = uchar(saturatef(id3[i])*255); + } +} + +/* + simple 1st order recursive filter + - processes one image column per thread + + parameters: + id - pointer to input data (RGBA image packed into 32-bit integers) + od - pointer to output data + w - image width + h - image height + a - blur parameter +*/ + +__global__ void +d_simpleRecursive_rgba(uint *id, uint *od, int w, int h, float a) +{ + unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + + if (x >= w) return; + + id += x; // advance pointers to correct column + od += x; + + // forward pass + float4 yp = rgbaIntToFloat(*id); // previous output + + for (int y = 0; y < h; y++) + { + float4 xc = rgbaIntToFloat(*id); + float4 yc = xc + a*(yp - xc); // simple lerp between current and previous value + *od = rgbaFloatToInt(yc); + id += w; + od += w; // move to next row + yp = yc; + } + + // reset pointers to point to last element in column + id -= w; + od -= w; + + // reverse pass + // ensures response is symmetrical + yp = rgbaIntToFloat(*id); + + for (int y = h-1; y >= 0; y--) + { + float4 xc = rgbaIntToFloat(*id); + float4 yc = xc + a*(yp - xc); + *od = rgbaFloatToInt((rgbaIntToFloat(*od) + yc)*0.5f); + id -= w; + od -= w; // move to previous row + yp = yc; + } +} + +/* + recursive Gaussian filter (RGBA uint) + + parameters: + id - pointer to input data (RGBA image packed into 32-bit integers) + od - pointer to output data + w - image width + h - image height + a0-a3, b1, b2, coefp, coefn - filter parameters +*/ + +__global__ void +d_recursiveGaussian_rgba(uint *id, uint *od, int w, int h, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn) +{ + unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + + if (x >= w) return; + + id += x; // advance pointers to correct column + od += x; + + // forward pass + float4 xp = make_float4(0.0f); // previous input + float4 yp = make_float4(0.0f); // previous output + float4 yb = make_float4(0.0f); // previous output by 2 +#if CLAMPTOEDGE + xp = rgbaIntToFloat(*id); + yb = coefp*xp; + yp = yb; +#endif + + for (int y = 0; y < h; y++) + { + float4 xc = rgbaIntToFloat(*id); + float4 yc = a0*xc + a1*xp - b1*yp - b2*yb; + *od = rgbaFloatToInt(yc); + id += w; + od += w; // move to next row + xp = xc; + yb = yp; + yp = yc; + } + + // reset pointers to point to last element in column + id -= w; + od -= w; + + // reverse pass + // ensures response is symmetrical + float4 xn = make_float4(0.0f); + float4 xa = make_float4(0.0f); + float4 yn = make_float4(0.0f); + float4 ya = make_float4(0.0f); +#if CLAMPTOEDGE + xn = xa = rgbaIntToFloat(*id); + yn = coefn*xn; + ya = yn; +#endif + + for (int y = h-1; y >= 0; y--) + { + float4 xc = rgbaIntToFloat(*id); + float4 yc = a2*xn + a3*xa - b1*yn - b2*ya; + xa = xn; + xn = xc; + ya = yn; + yn = yc; + *od = rgbaFloatToInt(rgbaIntToFloat(*od) + yc); + id -= w; + od -= w; // move to previous row + } +} + +#endif // #ifndef _GAUSSIAN_KERNEL_H_ diff --git a/paredge/results/calhouse2.xcf b/paredge/results/calhouse2.xcf new file mode 100755 index 0000000..7278b67 Binary files /dev/null and b/paredge/results/calhouse2.xcf differ diff --git a/paredge/results/g0s16.png b/paredge/results/g0s16.png new 
file mode 100755 index 0000000..3531f36 Binary files /dev/null and b/paredge/results/g0s16.png differ diff --git a/paredge/results/g1s02.png b/paredge/results/g1s02.png new file mode 100755 index 0000000..e12ed86 Binary files /dev/null and b/paredge/results/g1s02.png differ diff --git a/paredge/results/g2s02.png b/paredge/results/g2s02.png new file mode 100755 index 0000000..6124314 Binary files /dev/null and b/paredge/results/g2s02.png differ diff --git a/paredge/rg.cuh b/paredge/rg.cuh new file mode 100755 index 0000000..4f668d9 --- /dev/null +++ b/paredge/rg.cuh @@ -0,0 +1,176 @@ +/** + * @file rg.cuh + * @brief Recursive Gaussian (Deriche) filter from the original CUDA sample + * @author Andre Maximo + * @date May, 2014 + * @copyright GE Proprietary + */ + +/** + * @brief Recursive Gaussian Deriche filter + * @param[in] id Input data + * @param[out] od Output data + * @param[in] w Image width + * @param[in] h Image height + * @param[in] a0-a3, b1, b2, coefp, coefn - filter parameters +*/ + +__global__ void +d_rg0(float *id, float *od, int w, int h, + float a0, float a1, float a2, float a3, + float b1, float b2, float coefp, float coefn) { + + int tx = threadIdx.x, ty = threadIdx.y, bx = blockIdx.x, bdx = blockDim.x, bdy = blockDim.y; + int bdxy = bdx*bdy; + unsigned int x = bx*(bdxy) + ty*bdx + tx; + + if (x >= w) return; + + id += x; // advance pointers to correct column + od += x; + + // forward pass + float xp = 0.f; // previous input + float yp = 0.f; // previous output + float yb = 0.f; // previous output by 2 +#if CLAMPTOEDGE + xp = *id; + yb = coefp*xp; + yp = yb; +#endif + + for (int y = 0; y < h; y++) { + float xc = *id; + float yc = a0*xc + a1*xp - b1*yp - b2*yb; + *od = yc; + id += w; + od += w; // move to next row + xp = xc; + yb = yp; + yp = yc; + } + + // reset pointers to point to last element in column + id -= w; + od -= w; + + // reverse pass + // ensures response is symmetrical + float xn = 0.f; + float xa = 0.f; + float yn = 0.f; + float ya = 0.f; +#if CLAMPTOEDGE + xn = xa = *id; + yn = coefn*xn; + ya = yn; +#endif + + for (int y = h-1; y >= 0; y--) { + float xc = *id; + float yc = a2*xn + a3*xa - b1*yn - b2*ya; + xa = xn; + xn = xc; + ya = yn; + yn = yc; + *od = *od + yc; + id -= w; + od -= w; // move to previous row + } +} + +__global__ void +d_rg1(float *id, float *od, float *td, int w, int h, + float a0, float a1, float a2, float a3, + float b1, float b2, float coefp, float coefn) { + + int tx = threadIdx.x, ty = threadIdx.y, bx = blockIdx.x, bdx = blockDim.x, bdy = blockDim.y; + int bdxy2 = bdx*bdy/2, bdy2 = bdy/2, tyby2 = ty % bdy2; + unsigned int x = bx*bdxy2 + tyby2*bdx + tx; + + if (x >= w) return; + + id += x; // advance pointers to correct column + od += x; + td += x; + + if (ty < bdy2) { + + // forward pass + float xp = 0.f; // previous input + float yp = 0.f; // previous output + float yb = 0.f; // previous output by 2 +#if CLAMPTOEDGE + xp = *id; + yb = coefp*xp; + yp = yb; +#endif + + for (int y = 0; y < h; y++) { + float xc = *id; + float yc = a0*xc + a1*xp - b1*yp - b2*yb; + *od = yc; + id += w; + od += w; // move to next row + xp = xc; + yb = yp; + yp = yc; + } + + od -= w; + td += w*(h-1); + + } else { + + id += w*(h-1); + td += w*(h-1); // advance pointers to last row + + // reverse pass + // does NOT by itself ensure the response is symmetrical; the merge below adds both passes + float xn = 0.f; + float xa = 0.f; + float yn = 0.f; + float ya = 0.f; +#if CLAMPTOEDGE + xn = xa = *id; + yn = coefn*xn; + ya = yn; +#endif + + for (int y = h-1; y >= 0; y--) { + float xc = *id; + float yc = a2*xn + a3*xa - b1*yn -
b2*ya; + xa = xn; + xn = xc; + ya = yn; + yn = yc; + *td = yc; + id -= w; + td -= w; // move to previous row + } + + td += w; + + } + + __syncthreads(); + + if (ty < bdy2) { + + for (int y = 0; y < (h+1)/2; y++) { + *od = *od + *td; + od -= w; + td -= w; + } + + } else { + + for (int y = 0; y < h/2; y++) { + *od = *od + *td; + od += w; + td += w; + } + + } + +} diff --git a/paredge/run.sh b/paredge/run.sh new file mode 100755 index 0000000..cb1ef66 --- /dev/null +++ b/paredge/run.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +for i in $(seq 1 64); +do + echo -n "$(($i * 64)) $(($i * 64)) " + $1 -width=$(($i * 64)) -height=$(($i * 64)) + echo +done diff --git a/paredge/sage_session.sobj b/paredge/sage_session.sobj new file mode 100755 index 0000000..aecb401 Binary files /dev/null and b/paredge/sage_session.sobj differ
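Note on the rg.cuh kernels: the two half-warp branches of d_rg1 compute the causal and anti-causal halves of the same Deriche recursion that d_rg0 runs sequentially, and the loops after __syncthreads() simply add the two partial results. The sketch below is a minimal host-side, single-column reference of that computation; it is not part of the patch, and the function name deriche_column_ref and the clampToEdge flag are illustrative only (clampToEdge stands in for the CLAMPTOEDGE compile-time switch, and id/od point to the top of one column of a row-major w-by-h float image). It can be used to check the GPU output column by column.

#include <vector>

// Host reference for one image column (stride w, h rows), mirroring d_rg0/d_rg1:
//   forward :  y[n] = a0*x[n] + a1*x[n-1] - b1*y[n-1] - b2*y[n-2]
//   backward:  r[n] = a2*x[n+1] + a3*x[n+2] - b1*r[n+1] - b2*r[n+2]
//   output  :  od[n] = y[n] + r[n]
void deriche_column_ref(const float *id, float *od, int w, int h,
                        float a0, float a1, float a2, float a3,
                        float b1, float b2, float coefp, float coefn,
                        bool clampToEdge)
{
    std::vector<float> fwd(h), bwd(h);

    // causal (top-to-bottom) pass
    float xp = 0.f, yp = 0.f, yb = 0.f;            // x[n-1], y[n-1], y[n-2]
    if (clampToEdge) { xp = id[0]; yb = coefp*xp; yp = yb; }
    for (int y = 0; y < h; ++y) {
        float xc = id[y*w];
        float yc = a0*xc + a1*xp - b1*yp - b2*yb;
        fwd[y] = yc;
        xp = xc; yb = yp; yp = yc;
    }

    // anti-causal (bottom-to-top) pass
    float xn = 0.f, xa = 0.f, yn = 0.f, ya = 0.f;  // x[n+1], x[n+2], r[n+1], r[n+2]
    if (clampToEdge) { xn = xa = id[(h-1)*w]; yn = coefn*xn; ya = yn; }
    for (int y = h-1; y >= 0; --y) {
        float xc = id[y*w];
        float yc = a2*xn + a3*xa - b1*yn - b2*ya;
        xa = xn; xn = xc; ya = yn; yn = yc;
        bwd[y] = yc;
    }

    // merge: the symmetric response is the sum of the two passes,
    // which is what the accumulation loops at the end of d_rg1 compute
    for (int y = 0; y < h; ++y)
        od[y*w] = fwd[y] + bwd[y];
}

Rows are handled the same way as in the rg1 host path earlier in this patch: filter along columns, transpose, filter the transposed image, and transpose back.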