diff --git a/cudaconv3/src/filter_acts.cu b/cudaconv3/src/filter_acts.cu index e8782cb..4f1b9e8 100644 --- a/cudaconv3/src/filter_acts.cu +++ b/cudaconv3/src/filter_acts.cu @@ -601,6 +601,18 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4(float* im } } +/*****************************Function Revision Record***************************** + * Author: Tencent BestImage Team(ankerguo@tencent.com) * + * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. * + *********************************************************************************/ + /* * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given * filters: (numFilterColors, filterPixels, numFilters) if conv @@ -611,7 +623,9 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4(float* im */ template -__global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaTextureObject_t images, cudaTextureObject_t filters, float* targets, +__global__ void +__launch_bounds__(128, 4) +filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaTextureObject_t images, cudaTextureObject_t filters, float* targets, const int numImages, const int numFilters, const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, const int moduleStride, @@ -619,8 +633,9 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT const int numGroups, const float scaleTargets, const float scaleOutputs, const bool conv/*, const bool noloads*/) { - __shared__ float shFilters[colorCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters - __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images + // avoid bank conflict by reorganizing the data structure and improve the band width by using 'float2' instead of 'float' + __shared__ float2 shFilters[colorCache / 2][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters + __shared__ float2 shImages[colorCache][B_X * imgsPerThread / 2]; // pre-load 1 pixel from B_X*imgsPerThread images const int imgPixels = imgSizeY * imgSizeX; const int filterPixels = filterSize * filterSize; const int numFilterColors = numImgColors / numGroups; @@ -636,19 +651,20 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT // in the range 0..31. It appears that this allows the compiler to optimize? 
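/* [Editorial aside, not part of the patch] The float2 reorganization above targets shared-memory bank
 * conflicts: in the old layout a warp reads shImages[ty][tx * imgsPerThread + i], a stride of imgsPerThread
 * words, so several lanes land in the same bank. Packing two values per float2 and indexing by the lane (tx)
 * makes consecutive lanes touch consecutive 8-byte words, which fall into distinct banks once the device is
 * switched to cudaSharedMemBankSizeEightByte (done in the host code further down). A minimal sketch of the
 * lane-indexed access pattern; the kernel below is hypothetical and assumes blockDim.x == 32. */
__global__ void bankFriendlyCopy(const float2* in, float2* out) {
    __shared__ float2 tile[2][32];              // [pair][lane]: one 8-byte word per lane per row
    const int lane = threadIdx.x;               // assumed 0..31
    #pragma unroll
    for (int i = 0; i < 2; i++) {
        tile[i][lane] = in[i * 32 + lane];      // lane-indexed store: conflict-free with 8-byte banks
    }
    __syncthreads();
    #pragma unroll
    for (int i = 0; i < 2; i++) {
        out[i * 32 + lane] = tile[i][lane];     // lane-indexed load: conflict-free as well
    }
}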
const int tx = threadIdx.x % B_X; const int ty = threadIdx.y % B_Y; - const int tidx = ty * B_X + threadIdx.x; + //const int tidx = ty * B_X + threadIdx.x; // reduce one register const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; - const int shFilterLoadY = tidx / (B_Y * filtersPerThread); - const int shFilterLoadX = tidx % (B_Y * filtersPerThread); - const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; - const int imgOffset = (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + // reduce two registers + //const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + //const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + tx; + const int imgOffset = (blockColorIdx + ty) * imgPixels * imgStride + myImgIdx; // images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; const int filterOffset = blockFilterIdx - + shFilterLoadY * numFilters * filterPixels + shFilterLoadX + (conv ? 0 : moduleIdx * numFilterColors * filterPixels * numFilters); + + ((ty * B_X + tx) / (B_Y * filtersPerThread)) * numFilters * filterPixels + ((ty * B_X + tx) % (B_Y * filtersPerThread)) + (conv ? 0 : moduleIdx * numFilterColors * filterPixels * numFilters); // filters +=blockFilterIdx // + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; // if (!conv) { @@ -659,6 +675,8 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + myImgIdx; + // combine two registers into one + const int numModImages = numModules * numImages; float prod[imgsPerThread][filtersPerThread]; // float fCache[filtersPerThread]; #pragma unroll @@ -682,19 +700,15 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgStartY, imgStartX, fPidx, iPidx); + // remove redundant conditions #pragma unroll for (int i = 0; i < imgsPerThread; i++) { - if (!checkImgBounds || myImgIdx + i * B_X < numImages) { - imPreload[i] = tex1Dfetch(images, imgOffset + imgStride * iPidx + i * B_X); - } else { - imPreload[i] = 0; - } + imPreload[i] = tex1Dfetch(images, imgOffset + imgStride * iPidx + i * B_X); } - if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < B_X/filtersPerThread) { // This if statement reduces reg usage.. 
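/* [Editorial aside, not part of the patch] The imPreload/fPreload registers form a small software pipeline:
 * while the FMAs consume the tile already sitting in shared memory, the texture fetches for the next tile
 * are issued into registers, and the bounds check is dropped because the callers only instantiate this
 * kernel with checkImgBounds == false. A reduced sketch of that preload-compute pattern; the kernel below is
 * hypothetical and assumes blockDim.x == 128 and a float texture object 'src'. */
__global__ void pipelinedAccumulate(cudaTextureObject_t src, float* dst, int numTiles) {
    __shared__ float cur[128];                               // tile currently being consumed
    float next = tex1Dfetch<float>(src, threadIdx.x);        // preload tile 0 into a register
    float acc = 0.0f;
    for (int k = 0; k < numTiles; k++) {
        cur[threadIdx.x] = next;                             // publish the preloaded value
        __syncthreads();
        if (k + 1 < numTiles) {                              // issue the next load before the math
            next = tex1Dfetch<float>(src, (k + 1) * 128 + threadIdx.x);
        }
        acc += cur[threadIdx.x];                             // compute on the current tile
        __syncthreads();                                     // keep 'cur' stable until everyone is done
    }
    dst[threadIdx.x] = acc;
}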
- #pragma unroll - for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { - fPreload[c*filtersPerThread/B_X] = tex1Dfetch(filters, filterOffset + (c * filterPixels + fPidx) * numFilters); - } + + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c*filtersPerThread/B_X] = tex1Dfetch(filters, filterOffset + (c * filterPixels + fPidx) * numFilters); } for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { // const int filterPxY = imgY - imgLoadModPosY; @@ -714,8 +728,14 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT } filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgYNext, imgXNext, fPidxNext, iPidxNext); for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) -// const float* ff = &filters[numFilters * ((oc + colorCache) * filterPixels + fPidx)]; -// const float* mm = &images[imgStride * ((oc + colorCache) * imgPixels + iPidx)]; + // store the preloaded pixel of filter and image into shared memory + shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)][(ty * B_X + tx) % (B_Y * filtersPerThread)].x = fPreload[0]; + shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)][(ty * B_X + tx) % (B_Y * filtersPerThread)].y = fPreload[1]; + shImages[ty][tx].x = imPreload[0]; + shImages[ty][tx].y = imPreload[1]; + shImages[ty][tx+B_X].x = imPreload[2]; + shImages[ty][tx+B_X].y = imPreload[3]; + int imgOffset2 = imgOffset + imgStride * ((oc + colorCache) * imgPixels + iPidx); int filterOffset2 = filterOffset + numFilters * ((oc + colorCache) * filterPixels + fPidx); if (oc == numFilterColors - colorCache) { @@ -725,57 +745,30 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT iPidx = iPidxNext; } - #pragma unroll - for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { - shFilters[c + shFilterLoadY][shFilterLoadX] = fPreload[c*filtersPerThread/B_X]; - } - - #pragma unroll - for (int i = 0; i < imgsPerThread; i++) { - // NOTE: bank conflicts here! - shImages[ty][tx * imgsPerThread + i] = imPreload[i]; - } - imPreload[0] = (checkImgBounds && myImgIdx + 0 * B_X >= numImages) ? 0 : tex1Dfetch(images, imgOffset2 + 0 * B_X); - imPreload[1] = (checkImgBounds && myImgIdx + 1 * B_X >= numImages) ? 0 : tex1Dfetch(images, imgOffset2 + 1 * B_X); - imPreload[2] = (checkImgBounds && myImgIdx + 2 * B_X >= numImages) ? 
0 : tex1Dfetch(images, imgOffset2 + 2 * B_X); + // preload one pixel of filter and image from texture, and no need to check 'checkImgBounds' with all callers setting it as false + imPreload[0] = tex1Dfetch(images, imgOffset2); + imPreload[1] = tex1Dfetch(images, imgOffset2 + B_X); + imPreload[2] = tex1Dfetch(images, imgOffset2 + 2 * B_X); + imPreload[3] = tex1Dfetch(images, imgOffset2 + 3 * B_X); + fPreload[0] = tex1Dfetch(filters, filterOffset2); + fPreload[1] = tex1Dfetch(filters, filterOffset2 + 2 * filterPixels * numFilters); __syncthreads(); - #pragma unroll - for(int i = 0; i < imgsPerThread; i++) { - #pragma unroll - for(int f = 0; f < filtersPerThread; f++) { - prod[i][f] += shImages[0][threadIdx.x * imgsPerThread + i] * shFilters[0][threadIdx.y * filtersPerThread + f]; - } - } - - fPreload[0] = tex1Dfetch(filters, filterOffset2 + 0); - - #pragma unroll - for(int i = 0; i < imgsPerThread; i++) { - #pragma unroll - for(int f = 0; f < filtersPerThread; f++) { - prod[i][f] += shImages[1][threadIdx.x * imgsPerThread + i] * shFilters[1][threadIdx.y * filtersPerThread + f]; - } - } - - fPreload[1] = tex1Dfetch(filters, filterOffset2 + (B_X/filtersPerThread * filterPixels) * numFilters); - - #pragma unroll - for(int i = 0; i < imgsPerThread; i++) { - #pragma unroll - for(int f = 0; f < filtersPerThread; f++) { - prod[i][f] += shImages[2][threadIdx.x * imgsPerThread + i] * shFilters[2][threadIdx.y * filtersPerThread + f]; - } - } - - imPreload[3] = (checkImgBounds && myImgIdx + 3 * B_X >= numImages) ? 0 : tex1Dfetch(images, imgOffset2 + 3 * B_X); - - #pragma unroll - for(int i = 0; i < imgsPerThread; i++) { + // put together the instructions with same type to improve instruction-level parallelism + // calculate the convolution between images and filters + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { #pragma unroll - for(int f = 0; f < filtersPerThread; f++) { - prod[i][f] += shImages[3][threadIdx.x * imgsPerThread + i] * shFilters[3][threadIdx.y * filtersPerThread + f]; + for (int r = 0; r < colorCache / 2; r++) { + prod[0][f] += shImages[r][tx].x * shFilters[r][ty*filtersPerThread+f].x; + prod[1][f] += shImages[r][tx].y * shFilters[r][ty*filtersPerThread+f].x; + prod[2][f] += shImages[r][tx+B_X].x * shFilters[r][ty*filtersPerThread+f].x; + prod[3][f] += shImages[r][tx+B_X].y * shFilters[r][ty*filtersPerThread+f].x; + prod[0][f] += shImages[r+2][tx].x * shFilters[r][ty*filtersPerThread+f].y; + prod[1][f] += shImages[r+2][tx].y * shFilters[r][ty*filtersPerThread+f].y; + prod[2][f] += shImages[r+2][tx+B_X].x * shFilters[r][ty*filtersPerThread+f].y; + prod[3][f] += shImages[r+2][tx+B_X].y * shFilters[r][ty*filtersPerThread+f].y; } } __syncthreads(); @@ -788,9 +781,8 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { - if (!checkImgBounds || myImgIdx + i * B_X < numImages) { - targets[i * B_X + f * numImages * numModules] = scaleTargets * targets[i * B_X + f * numImages * numModules] + scaleOutputs * prod[i][f]; - } + // remove the redundant condition for less registers + targets[i * B_X + f * numModImages] = scaleTargets * targets[i * B_X + f * numModImages] + scaleOutputs * prod[i][f]; } } } else { @@ -799,9 +791,8 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { - if (!checkImgBounds || myImgIdx + i * B_X < 
numImages) { - targets[i * B_X + f * numImages * numModules] = scaleOutputs * prod[i][f]; - } + // remove the redundant condition for less registers + targets[i * B_X + f * numModImages] = scaleOutputs * prod[i][f]; } } } @@ -1159,6 +1150,219 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } } + +/*****************************Function Revision Record***************************** + * Author: Tencent BestImage Team(ankerguo@tencent.com) * + * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * Note: This function can be used when each thread loads even number of filter * + * pixels(filtersPerThread * colorCache / B_X is even), and this can be * + * optimized more when the number of loaded image's pixel is even. * + *********************************************************************************/ +template +__global__ void filterActs_YxX_sparse2_f_vec(float* images, float* filters, float* targets, + const int numImages, const int numFilters, + const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, + const int moduleStride, + const int numModulesY, const int numModulesX, const int imgStride, const int numImgColors, + const int numGroups, + const float scaleTargets, const float scaleOutputs, + const bool conv) { + // improve shared memory's band width by using 'float2' instead of 'float' + __shared__ float2 shFilters[colorCache/2][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters + __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images + + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y*filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + + const int tidx = ty * B_X + tx; + + const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; + + // load position of filters' pixels for current thread + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + // load position of images' pixels for current thread + const int shImgLoadY = tidx / (B_X * imgsPerThread); + const int shImgLoadX = tidx % (B_X * imgsPerThread); + + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + shImgLoadX; + images += (blockColorIdx + shImgLoadY) * imgPixels * imgStride + myImgIdx; + + filters +=blockFilterIdx + + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx + ty) * numImages * numModules + + blockIdx.x * B_X * imgsPerThread + tx; + + float prod[filtersPerThread][imgsPerThread]; + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) 
{ + #pragma unroll + for(int g = 0; g < imgsPerThread; g++) { + prod[f][g] = 0; + } + } + + const int imgStartX = MAX(0, imgLoadModPosX); + const int imgStartY = MAX(0, imgLoadModPosY); + const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY); + + // temporary buffer to store the filter's loaded pixels during each loop + float fPreload[colorCache * filtersPerThread / B_X]; + // temporary buffer to store the image's loaded pixels during each loop + float iPreload[colorCache * imgsPerThread / B_Y]; + + // preload filter's pixels + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters[(c * filterPixels + (imgStartY - imgLoadModPosY) * filterSize + (imgStartX - imgLoadModPosX)) * numFilters]; + } + + // preload image's pixels + if (!checkImgBounds || myImgIdx < numImages) { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images[(c * imgPixels + imgStartY * imgSizeX + imgStartX) * imgStride]; + } + } else { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + //const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) + // store the preloaded filter's pixels into shared memory + #pragma unroll + for (int c = 0; c < colorCache / 2; c += B_X / filtersPerThread) { + shFilters[c + shFilterLoadY][shFilterLoadX].x = fPreload[c * filtersPerThread / B_X]; + shFilters[c + shFilterLoadY][shFilterLoadX].y = fPreload[(c + colorCache / 2) * filtersPerThread / B_X]; + } + + // store the preloaded image's pixels into shared memory + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + shImages[c + shImgLoadY][shImgLoadX] = iPreload[c * imgsPerThread / B_Y]; + } + /* + * Load a pixel from B_Y*filtersPerThread filters + * This condition covers the case when B_X is not divisible by filtersPerThread. + * In this case, not all of the threads will participate in the loading operation. + * This ensures that in each loop iteration, an integer number of rows of shFilters + * are filled, which makes indexing simple. + + * nvcc is behaving in a completely insane way: removing this condition under + * template parameters that guarantee it to be true actually slows down + * the computation. + * + */ + + /* preload image and filter pixels' data */ + if ((oc + colorCache) == numFilterColors) { // move to next pixel when all colors of current pixel have been finished + int imgXn = (imgX < (imgEndX - 1)) ? 
(imgX + 1) : imgStartX; + int imgYn = imgY + (imgXn != (imgX + 1)); + + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters[(c * filterPixels + (imgYn - imgLoadModPosY) * filterSize + (imgXn - imgLoadModPosX)) * numFilters]; + } + + if (!checkImgBounds || myImgIdx < numImages) { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images[(c * imgPixels + imgYn * imgSizeX + imgXn) * imgStride]; + } + } else { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + } else { // move next colorCache + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters[((c + oc + colorCache) * filterPixels + (imgY - imgLoadModPosY) * filterSize + (imgX - imgLoadModPosX)) * numFilters]; + } + + if (!checkImgBounds || myImgIdx < numImages) { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images[((c + oc + colorCache) * imgPixels + imgY * imgSizeX + imgX) * imgStride]; + } + } else { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + } + + __syncthreads(); + + // convolution + for (int c = 0; c < colorCache / 2; c++) { + #pragma unroll + for(int g = 0; g < imgsPerThread; g++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[f][g] += shImages[c][g * B_X + tx] * shFilters[c][ty + f * B_Y].x; + prod[f][g] += shImages[c + colorCache / 2][g * B_X + tx] * shFilters[c][ty + f * B_Y].y; + } + } + } + __syncthreads(); + } + } + } + + // write convolution result into global memory + if (scale) { + #pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModules] + scaleOutputs * prod[f][g]; + } + } + } + } else { + // Note: reversing order of these loops saves 2 registers, but costs time + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleOutputs * prod[f][g]; + } + } + } + } +} /* * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given * filters: (numFilterColors, filterPixels, numFilters) if conv @@ -1238,6 +1442,8 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } cudaStream_t stream = NVMatrix::getDefaultStream(); + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); // using wider band width + // Auto-generated calling code... // NOTE: The calling code is set up such that if checkImgBounds is true, then imgsPerThread = 1. // In principle it doesn't have to be this way, and you may want to optimize for that case. 
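/* [Editorial aside, not part of the patch] The cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)
 * call added above widens shared-memory banks to 8 bytes so the float2 tiles of the *_f_vec kernels stay
 * conflict-free; the patch restores cudaSharedMemBankSizeFourByte after the dispatch has issued the launch.
 * A host-side sketch of that set/launch/restore pattern; the wrapper name is hypothetical and the commented
 * launch only mirrors the calls that follow. */
static void launchWithEightByteBanks(dim3 blocks, dim3 threads, cudaStream_t stream) {
    checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); // wider banks for float2 tiles
    // filterActs_YxX_sparse2_f_vec< 4, 32, 4, 8, 8, false, false >
    //     <<< blocks, threads, 0, stream >>>(/* images, filters, targets, ... */);
    checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));  // restore the default afterwards
}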
@@ -1265,8 +1471,8 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 4, 8, false, false >, cudaFuncCachePreferShared); @@ -1275,16 +1481,16 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } else if (numImages % 64 == 0) { if (numFiltersPerGroup % 128 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 8, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 8, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 64 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + 
cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 2, 8, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 2, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 4, 8, false, false >, cudaFuncCachePreferShared); @@ -1293,16 +1499,16 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } else if (numImages % 32 == 0) { if (numFiltersPerGroup % 128 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 8, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 8, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 64 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 1, 8, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 1, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, false, false >, cudaFuncCachePreferShared); @@ -1661,8 +1867,8 
@@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 4, 8, true, false >, cudaFuncCachePreferShared); @@ -1671,16 +1877,16 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } else if (numImages % 64 == 0) { if (numFiltersPerGroup % 128 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 8, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 8, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 64 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 2, 
8, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 2, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 4, 8, true, false >, cudaFuncCachePreferShared); @@ -1689,16 +1895,16 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } else if (numImages % 32 == 0) { if (numFiltersPerGroup % 128 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 8, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 8, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 64 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 1, 8, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 1, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, true, false >, cudaFuncCachePreferShared); @@ -2034,6 +2240,7 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* 
tar } } } + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); getLastCudaError("filterActs: kernel execution failed"); } diff --git a/cudaconv3/src/img_acts.cu b/cudaconv3/src/img_acts.cu index ae1071a..b5763ec 100644 --- a/cudaconv3/src/img_acts.cu +++ b/cudaconv3/src/img_acts.cu @@ -684,6 +684,218 @@ __global__ void conv_img_acts_manycolor_kepler(const float* hidActs, const float } } +/*****************************Function Revision Record***************************** + * Author: Tencent BestImage Team(ankerguo@tencent.com) * + * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. dividing loaded filter pixels into 2 parts in each step * + * Note: This function can be used when each thread loads even number of filter * + * pixels(colorsPerThread * filterCacheF / B_X is even) and even number of* + * hidAct pixels(imgsPerThread * filterCacheH / B_Y), and ff is 32, fh is * + * 16 + *********************************************************************************/ +template +__global__ void conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16(const float* hidActs, const float* filters, float* targets, + const int numModulesY, const int numModulesX, const int numImages, const int numFilters, + const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride, + const int numImgColors, const int numGroups, + const float scaleTargets, const float scaleOutputs) { + // improve band width of shared memory by using 'float2' instead of 'float' + __shared__ float2 shFilters[filterCacheF/2][colorsPerThread*B_Y/2 + 2]; + __shared__ float2 shHidActs[filterCacheH/2][B_X*imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; + + const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + tx; + + const int hidActLoadY = tidx / (imgsPerThread * B_X), hidActLoadX = tidx % (imgsPerThread * B_X); + const int filtersLoadY = tidx / (filterCacheF / 2), filtersLoadX = tidx % (filterCacheF / 2); + const int numModules = numModulesY * numModulesX; + + hidActs += blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX; + filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + ty) * imgPixels * numImages + blockPixelIdx * numImages + blockCaseIdx + tx; + + float prod[colorsPerThread][imgsPerThread]; + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - 
paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + // temporary buffers to store preloaded pixel data + float fPreload[colorsPerThread * filterCacheF / B_X / 2]; + float hPreload[imgsPerThread * filterCacheH / B_Y]; + + // preload filter's data + const int pxStartIdxInFilter = (blockPixelIdxY - paddingStart - startY * moduleStride) * filterSize + blockPixelIdxX - paddingStart - startX * moduleStride; + const float *currFltData = conv ? &filters[pxStartIdxInFilter * numFilters] : &filters[pxStartIdxInFilter * numFilters + (startY * numModulesX + startX) * numFilterColors * filterPixels * numFilters] ; + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y / (filterCacheF/2)) { + fPreload[i * (filterCacheF/2) / (B_X * B_Y)] = currFltData[i * filterPixels * numFilters]; + } + + // preload hidAct's data + if (!checkCaseBounds || blockCaseIdx + hidActLoadX < numImages) { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = hidActs[(startY * numModulesX + startX) * numImages + i * numModules * numImages]; + } + } else { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = 0; + } + } + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + + for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time + for (int fh = f; fh < f + filterCacheF; fh += filterCacheH) { + // store the preloaded filter's data into shared memory + #pragma unroll + for (int c = 0; c < colorsPerThread * B_Y / 2; c += B_X*B_Y / (filterCacheF/2)) { + shFilters[filtersLoadX][c + filtersLoadY].x = fPreload[c * (filterCacheF / 2) / (B_X * B_Y)]; + shFilters[filtersLoadX][c + filtersLoadY].y = fPreload[(c + colorsPerThread * B_Y / 2) * (filterCacheF / 2) / (B_X * B_Y)]; + } + + // store the preloaded hidAct's data into shared memory + #pragma unroll + for (int c = 0; c < filterCacheH / 2; c += B_Y / imgsPerThread) { + shHidActs[c + hidActLoadY][hidActLoadX].x = hPreload[c * imgsPerThread/ B_Y]; + shHidActs[c + hidActLoadY][hidActLoadX].y = hPreload[(c + filterCacheH / 2) * imgsPerThread / B_Y]; + } + + /* preload filter and hidAct's data for next loop */ + if ((fh + filterCacheH) == numFiltersPerGroup) { // move to next pixel's position when all filters finished + const int mxN = (mx < (endX - 1)) ? (mx + 1) : startX; + const int myN = my + (mxN < mx); + const int pxIdxInFilterN = (blockPixelIdxY - paddingStart - myN * moduleStride) * filterSize + blockPixelIdxX - paddingStart - mxN * moduleStride; + const int moduleIdxN = myN * numModulesX + mxN; + const float *currFlt = conv ? 
&filters[pxIdxInFilterN * numFilters] + : &filters[pxIdxInFilterN * numFilters + moduleIdxN * numFilterColors * filterPixels * numFilters] ; + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y / (filterCacheF/2)) { + fPreload[i * (filterCacheF/2) / (B_X * B_Y)] = currFlt[i * filterPixels * numFilters]; + } + + + if (!checkCaseBounds || blockCaseIdx + hidActLoadX < numImages) { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = hidActs[moduleIdxN * numImages + i * numModules * numImages]; + } + } else { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = 0; + } + } + } else { // move to next filterCacheF + const float *currFlt = conv ? &filters[pxIdxInFilter * numFilters + fh + filterCacheH] + : &filters[pxIdxInFilter * numFilters + fh + filterCacheH + moduleIdx * numFilterColors * filterPixels * numFilters] ; + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y / (filterCacheF/2)) { + fPreload[i * (filterCacheF/2) / (B_X * B_Y)] = currFlt[i * filterPixels * numFilters]; + } + + + if (!checkCaseBounds || blockCaseIdx + hidActLoadX < numImages) { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = hidActs[(moduleIdx + (fh + filterCacheH) * numModules) * numImages + i * numModules * numImages]; + } + } else { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = 0; + } + } + } + __syncthreads(); + + // computation + #pragma unroll + for (int w = 0; w < filterCacheH/2; w++) { + #pragma unroll + for (int c = 0; c < colorsPerThread/2; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx + i * B_X].x; + prod[c][i] += shFilters[w + filterCacheH / 2][c * B_Y + ty].x * shHidActs[w][tx + i * B_X].y; + prod[c + colorsPerThread / 2][i] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx + i * B_X].x; + prod[c + colorsPerThread / 2][i] += shFilters[w + filterCacheH / 2][c * B_Y + ty].y * shHidActs[w][tx + i * B_X].y; + + } + } + } + __syncthreads(); + + } + } + } + } + + // store the result into global memory + if (scale) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + tx + i * B_X < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; + } + } + } + } else { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + tx + i * B_X < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i]; + } + } + } + } +} /* * New Titan-optimized stuff. */ @@ -736,9 +948,22 @@ for (int w = 0; w < filterCacheH; w++) { \ hPreload[y][x] = tex1Dfetch(hidActs, hidActsLoadOffset + (y) * B_Y * numModules * numImages + (x) * B_X); \ } +/*****************************Function Revision Record***************************** + * Author: Tencent BestImage Team(ankerguo@tencent.com) * + * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. 
using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. * + * 6. dividing the process into two steps with less data loaded. * + *********************************************************************************/ + template __global__ void -__launch_bounds__(256, 2) // 256 threads per block, 2 blocks per multiprocessor +__launch_bounds__(256, 3) // 256 threads per block, 3 blocks per multiprocessor // These launch bounds ensure 25% occupancy (128 registers used) // as oppposed to 13% (130 registers) achieved by defaults. conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObject_t hidActs, cudaTextureObject_t filters, float* targets, @@ -746,39 +971,43 @@ conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObje const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride, const int numImgColors, const int numGroups, const float scaleTargets, const float scaleOutputs) { - __shared__ float shFilters[colorsPerThread*B_Y][filterCacheF]; - __shared__ float shHidActs[filterCacheH][B_X*imgsPerThread]; + // avoid bank conflict by re-organizing the data structure, and improve band width by using 'float2' instead of 'float' + __shared__ float2 shFilters[filterCacheF/2][colorsPerThread*B_Y/2 + 2]; + __shared__ float2 shHidActs[filterCacheH][B_X*imgsPerThread/2]; + const int tx = threadIdx.x, ty = threadIdx.y; const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); - const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; - const int myCaseIdx = blockCaseIdx + threadIdx.x; + //const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; + //const int myCaseIdx = blockCaseIdx + threadIdx.x; - const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally + //const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally const int numFilterColors = numImgColors / numGroups; - const int blockGroupIdx = imgColorIdx / numFilterColors; - const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group + const int blockGroupIdx = (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread / numFilterColors; + const int filterColorIdx = ((blockIdx.x / numImgBlocks) * B_Y * colorsPerThread) % numFilterColors; // color idx within group const int numFiltersPerGroup = numFilters / numGroups; - const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + //const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; - const int blockPixelIdx = blockIdx.y; - const int blockPixelIdxX = blockPixelIdx % imgSizeX; - const int blockPixelIdxY = blockPixelIdx / imgSizeX; + //const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockIdx.y % imgSizeX; + const int blockPixelIdxY = blockIdx.y / imgSizeX; - const int filterPixels = filterSize * filterSize; - const int imgPixels = imgSizeY * imgSizeX; - const int tidx = threadIdx.y * B_X + threadIdx.x; + //const int filterPixels = filterSize * filterSize; + const int filterPxlAll = filterSize * filterSize * numFilters; + //const int imgPixels = imgSizeY * imgSizeX; + //const int tidx = threadIdx.y * B_X + threadIdx.x; // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x % B_X; //const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % (B_X*imgsPerThread); - const int filtersLoadY = tidx / filterCacheF, 
filtersLoadX = tidx % filterCacheF; + //const int filtersLoadY = tidx / filterCacheF, filtersLoadX = tidx % filterCacheF; // nvcc is behaving idiotically again, these useless declarations save registers //const int outputY = threadIdx.y, outputX = threadIdx.x; //const int ty = threadIdx.y, tx = threadIdx.x; - const int numModules = numModulesY * numModulesX; - const int hidActsOffset = (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; - const int filtersOffset = blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; + //const int numModules = numModulesY * numModulesX; + const int numImgMods = numModulesY * numModulesX * numImages; + const int hidActsOffset = (blockGroupIdx * numFiltersPerGroup + ty) * numImgMods + (blockIdx.x % numImgBlocks) * B_X * imgsPerThread + tx; + const int filtersOffset = blockGroupIdx * numFiltersPerGroup + (filterColorIdx + (ty * B_X + tx) / filterCacheH) * filterPxlAll + (ty * B_X + tx) % filterCacheH; // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; // filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; - targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + myCaseIdx; + targets += ((blockIdx.x / numImgBlocks) * B_Y * colorsPerThread + ty) * imgSizeY * imgSizeX * numImages + blockIdx.y * numImages + (blockIdx.x % numImgBlocks) * 128 + tx; float prod[colorsPerThread][imgsPerThread]; #pragma unroll @@ -796,15 +1025,16 @@ conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObje : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); - float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; - float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; + // reduce two registers + //float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + //float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; //const bool noFLoop = filterCacheF == filterCacheH; /* * Initial preload */ - float hPreload[filterCacheH/B_Y][imgsPerThread]; // [2][4] - float wPreload[filterCacheF*colorsPerThread/B_X]; // [8] + float hPreload[filterCacheH/B_Y * imgsPerThread]; // [2][4] + float wPreload[filterCacheF*colorsPerThread/B_X/2]; // [4] int moduleIdx, pxIdxInFilter; conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(startY, startX, numModulesX, paddingStart, moduleStride, blockPixelIdxY, @@ -812,145 +1042,106 @@ conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObje // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0] // : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + 0]; int filtersLoadOffset = filtersOffset + (conv ? 
pxIdxInFilter * numFilters + 0 - : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters); + : moduleIdx * numFilterColors * filterPxlAll + pxIdxInFilter * numFilters); + // preload the filter's pixel #pragma unroll - for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { - if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { - wPreload[i * filterCacheF/(B_X*B_Y)] = tex1Dfetch(filters, filtersLoadOffset + i * filterPixels * numFilters); - } + for (int i = 0; i < 4; i++) { + // discarding redundant conditions + wPreload[i] = tex1Dfetch(filters, filtersLoadOffset + i * 16 * filterPxlAll); } // const float* hLoad = &hidActs[(moduleIdx + 0 * numModules) * numImages]; - int hidActsLoadOffset = hidActsOffset + (moduleIdx + 0 * numModules) * numImages; + int hidActsLoadOffset = hidActsOffset + moduleIdx * numImages; + + // preload the hidAct's pixel #pragma unroll - for (int j = 0; j < filterCacheH; j += B_Y) { - if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { - #pragma unroll - for (int i = 0; i < imgsPerThread; i++) { - if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { - hPreload[j/B_Y][i] = tex1Dfetch(hidActs, hidActsLoadOffset + j * numModules * numImages + i * B_X); - } - } - } + for (int x = 0; x < 4; x++) { + hPreload[x] = tex1Dfetch(hidActs, hidActsLoadOffset + x * 32); + hPreload[x+4] = tex1Dfetch(hidActs, hidActsLoadOffset + 8 * numImgMods + x * 32); } for (int my = startY; my < endY; my++) { - const int moduleTop = paddingStart + my * moduleStride; - const int pxInFilterY = blockPixelIdxY - moduleTop; for (int mx = startX; mx < endX; mx++) { - moduleIdx = my * numModulesX + mx; - const int moduleLeft = paddingStart + mx * moduleStride; - const int pxInFilterX = blockPixelIdxX - moduleLeft; + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(my+(mx+1 == endX), (mx+1 == endX)?startX:(mx+1), numModulesX, paddingStart, moduleStride, blockPixelIdxY, blockPixelIdxX, filterSize, moduleIdx, pxIdxInFilter); - pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; - int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext; - const bool lastModule = my == endY - 1 && mx == endX - 1; - if (!lastModule) { - mxNext = mx + 1 == endX ? startX : mx + 1; - myNext = my + (mx + 1 == endX); - } - conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(myNext, mxNext, numModulesX, paddingStart, moduleStride, blockPixelIdxY, - blockPixelIdxX, filterSize, moduleIdxNext, pxIdxInFilterNext); for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time - #pragma unroll - for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { - if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { - shFilterLoad[i * filterCacheF] = wPreload[i * filterCacheF/(B_X*B_Y)]; - } - } - - filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilter * numFilters + f + filterCacheF - : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f + filterCacheF); - if (f == numFiltersPerGroup - filterCacheF) { - filtersLoadOffset = filtersOffset + (conv ? 
pxIdxInFilterNext * numFilters - : moduleIdxNext * numFilterColors * filterPixels * numFilters + pxIdxInFilterNext * numFilters); - } - - #pragma unroll - for (int j = 0; j < filterCacheH; j += B_Y) { - if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { - #pragma unroll - for (int i = 0; i < imgsPerThread; i++) { - // NOTE: bank conflicts here! - if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { - shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i]; - } - } + // since 'filterCacheF' is twice 'filterCacheH', the work is split into two steps that each load less data; + // this keeps the program from stalling on memory accesses and yields higher GPU occupancy + for (int k = 0; k < filterCacheF / filterCacheH; k++) { + // store the preloaded filter pixels into shared memory with 64-bit accesses + shFilters[(ty * B_X + tx) % 16][(ty * B_X + tx) / 16].x = wPreload[0]; + shFilters[(ty * B_X + tx) % 16][(ty * B_X + tx) / 16].y = wPreload[2]; + shFilters[(ty * B_X + tx) % 16][(ty * B_X + tx) / 16 + 16].x = wPreload[1]; + shFilters[(ty * B_X + tx) % 16][(ty * B_X + tx) / 16 + 16].y = wPreload[3]; + + // store the preloaded hidAct pixels into shared memory with 64-bit accesses + shHidActs[ty][tx].x = hPreload[0]; + shHidActs[ty][tx].y = hPreload[1]; + shHidActs[ty][tx + B_X].x = hPreload[2]; + shHidActs[ty][tx + B_X].y = hPreload[3]; + shHidActs[ty + filterCacheH/2][tx].x = hPreload[4]; + shHidActs[ty + filterCacheH/2][tx].y = hPreload[5]; + shHidActs[ty + filterCacheH/2][tx + B_X].x = hPreload[6]; + shHidActs[ty + filterCacheH/2][tx + B_X].y = hPreload[7]; + + // advance to the 2nd 'filterCacheH' + filtersLoadOffset += filterCacheH; + hidActsLoadOffset += filterCacheH * numImgMods; + + // move to the next pixel + if ((k == 1) && (f == numFiltersPerGroup - filterCacheF)) { + filtersLoadOffset = filtersOffset + (conv ?
pxIdxInFilter * numFilters + : moduleIdx * numFilterColors * filterPxlAll + pxIdxInFilter * numFilters); + hidActsLoadOffset = hidActsOffset + moduleIdx * numImages; + } + + // preloade filter's pixel + #pragma unroll 4 + for (int x = 0; x < 4; x++) { + wPreload[x] = tex1Dfetch(filters, filtersLoadOffset + x * 16 * filterPxlAll); } - } - - __syncthreads(); - - hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheH) * numModules) * numImages; - - #pragma unroll - for (int z = 0; z < 4; ++z) { - IA_PRELOAD_LOOP(z,0); - IA_PRELOAD_W_TX(z); - } - - #pragma unroll - for (int z = 4; z < 12; ++z) { - IA_PRELOAD_LOOP(z,0); - IA_PRELOAD_H_TX((z-4)/4,z%4); - } - #pragma unroll - for (int z = 12; z < 16; ++z) { - IA_PRELOAD_LOOP(z,0); - } + __syncthreads(); - __syncthreads(); - #pragma unroll - for (int j = 0; j < filterCacheH; j += B_Y) { - if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { + // put together the instructions with same type to improve instruction-level parallelism + // calculate the derivative of the hidAct with respect to image + #pragma unroll + for (int w = 0; w < filterCacheH; w++) { #pragma unroll - for (int i = 0; i < imgsPerThread; i++) { - if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { - shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i]; - } - } + for (int c = 0; c < colorsPerThread / 2; c++) { + prod[c][0] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx].x; + prod[c][1] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx].y; + prod[c][2] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx+B_X].x; + prod[c][3] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx+B_X].y; + prod[c + colorsPerThread / 2][0] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx].x; + prod[c + colorsPerThread / 2][1] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx].y; + prod[c + colorsPerThread / 2][2] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx+B_X].x; + prod[c + colorsPerThread / 2][3] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx+B_X].y; + } + } + + // preloade the hidAct's pixel + #pragma unroll + for (int x = 0; x < 4; x++) { + hPreload[x] = tex1Dfetch(hidActs, hidActsLoadOffset + x * B_X); + hPreload[x+4] = tex1Dfetch(hidActs, hidActsLoadOffset + 8 * numImgMods + x * B_X); } - } - - __syncthreads(); - - hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheF) * numModules) * numImages; - if (f == numFiltersPerGroup - filterCacheF) { - hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages; - } - - #pragma unroll - for (int z = 0; z < 4; ++z) { - IA_PRELOAD_LOOP(z,filterCacheH); - IA_PRELOAD_W_TX(z+4); - } - #pragma unroll - for (int z = 4; z < 12; ++z) { - IA_PRELOAD_LOOP(z,filterCacheH); - IA_PRELOAD_H_TX((z-4)/4, z%4); - } - - #pragma unroll - for (int z = 12; z < 16; ++z) { - IA_PRELOAD_LOOP(z,filterCacheH); + __syncthreads(); } - - __syncthreads(); } } } + + // remove redundant conditions for less registers if (scale) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { - if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { - targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; - } + targets[c * B_Y * imgSizeY * imgSizeX * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgSizeY * imgSizeX * numImages + i * B_X] + scaleOutputs * prod[c][i]; } } } else { @@ -958,9 +1149,7 @@ 
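The hunk above replaces the macro-driven inner loop with explicit float2 reads that feed eight independent multiply-adds per iteration. Below is a minimal standalone sketch of that pattern in isolation; it is illustrative only (the kernel name, tile sizes, and the 32x4 block shape are assumptions, not code from this patch).

// ilp_float2_sketch.cu -- illustrative sketch, not part of the patch
#include <cuda_runtime.h>

// A 32x4 block stages a 16x8 tile of float2 "filter" values and a 16x32 tile of
// float2 "hidAct" values, then each thread issues eight independent FMAs per
// iteration of the w-loop, mirroring the grouped-FMA structure used above.
__global__ void ilpFloat2Sketch(const float2* __restrict__ a,   // at least 16*8 float2
                                const float2* __restrict__ b,   // at least 16*32 float2
                                float* __restrict__ out) {      // at least 128*8 floats
    __shared__ float2 shA[16][8];
    __shared__ float2 shB[16][32];

    const int tx = threadIdx.x, ty = threadIdx.y;   // tx < 32, ty < 4
    const int tid = ty * 32 + tx;                   // 0..127

    shA[tid % 16][tid / 16] = a[tid];               // 128 threads fill the 16x8 tile
    #pragma unroll
    for (int i = 0; i < 4; i++) {                   // and the 16x32 tile, 4 entries per thread
        const int e = tid * 4 + i;
        shB[e / 32][e % 32] = b[e];
    }
    __syncthreads();

    float prod[8] = {0, 0, 0, 0, 0, 0, 0, 0};
    #pragma unroll
    for (int w = 0; w < 16; w++) {
        const float2 f0 = shA[w][ty];
        const float2 f1 = shA[w][ty + 4];
        const float2 h  = shB[w][tx];
        // eight independent multiply-adds with no branches between them,
        // so the scheduler can keep several of them in flight at once
        prod[0] += f0.x * h.x;  prod[1] += f0.x * h.y;
        prod[2] += f0.y * h.x;  prod[3] += f0.y * h.y;
        prod[4] += f1.x * h.x;  prod[5] += f1.x * h.y;
        prod[6] += f1.y * h.x;  prod[7] += f1.y * h.y;
    }

    #pragma unroll
    for (int i = 0; i < 8; i++) {
        out[tid * 8 + i] = prod[i];
    }
}

Launched as ilpFloat2Sketch<<<1, dim3(32, 4)>>>(dA, dB, dOut), the point is only the shape of the inner loop: each 64-bit shared-memory read feeds several FMAs, so arithmetic can hide the shared-memory latency.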
@@ -958,9 +1149,7 @@ conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObje
         for (int c = 0; c < colorsPerThread; c++) {
             #pragma unroll
             for (int i = 0; i < imgsPerThread; i++) {
-                if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) {
-                    targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i];
-                }
+                targets[c * B_Y * imgSizeY * imgSizeX * numImages + i * B_X] = scaleOutputs * prod[c][i];
             }
         }
     }
@@ -1286,6 +1475,8 @@ void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
     // printf("numImages: %d\n", numImages);
     // cudaStream_t stream = NVMatrix::getDefaultStream();
+    checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
+
     if (conv == true) {
         if (scale == false) {
             if (checkCaseBounds == false) {
@@ -1297,12 +1488,12 @@ void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
                     conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, false, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
                 } else if (numImages % 64 == 0) {
-                    cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared);
-                    conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
+                    cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 2, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared);
+                    conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 2, 8, 32, 16, false, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
                 } else if (numImages % 32 == 0) {
-                    cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared);
-                    conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
+                    cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared);
+                    conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 1, 8, 32, 16, false, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
                 } else if (numImages % 16 == 0) {
                     cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared);
@@ -1643,12 +1834,12 @@ void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
                     conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, true, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
                 } else if (numImages % 64 == 0) {
-                    cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared);
-                    conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, true, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
+                    cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 2, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared);
+                    conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 2, 8, 32, 16, true, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
                 } else if (numImages % 32 == 0) {
-                    cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared);
-                    conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
+                    cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared);
+                    conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 1, 8, 32, 16, true, false, true ><<<blocks, threads, 0, stream>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
                 } else if (numImages % 16 == 0) {
                     cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared);
@@ -2675,6 +2866,7 @@ void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
         }
     }

+    checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
     getLastCudaError("imgActs: kernel execution failed");
 }
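Both host entry points now switch the device to eight-byte shared-memory banks before dispatching and restore the default afterwards, which is what makes the float2 shared-memory layouts above conflict-free on Kepler-class GPUs. A minimal sketch of that bracketing pattern follows; launchKernels() is a hypothetical stand-in for the real dispatch code, while cudaDeviceSetSharedMemConfig is the actual CUDA runtime call used in the patch.

// bank_config_bracket.cu -- illustrative sketch; launchKernels() stands in for the real dispatch
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

static void check(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

__global__ void dummyKernel() { }            // stands in for the convolution kernels

static void launchKernels() {                // hypothetical dispatch helper
    dummyKernel<<<1, 128>>>();
}

int main() {
    // widen shared-memory banks to 8 bytes so each float2 access maps to a single bank
    check(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte), "set 8-byte banks");

    launchKernels();
    check(cudaGetLastError(), "kernel launch");
    check(cudaDeviceSynchronize(), "kernel execution");

    // restore the default so later 4-byte-oriented kernels keep their old behavior
    check(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte), "restore 4-byte banks");
    return 0;
}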
diff --git a/cudaconv3/src/weight_acts.cu b/cudaconv3/src/weight_acts.cu
index 20d7a58..09f7a90 100644
--- a/cudaconv3/src/weight_acts.cu
+++ b/cudaconv3/src/weight_acts.cu
@@ -1367,6 +1367,17 @@ __global__ void conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3(cudaTextureObj
     }
 }

+/*****************************Function Revision Record*****************************
+ * Author: Tencent BestImage Team(ankerguo@tencent.com)                           *
+ * Date: 2015-05-18                                                               *
+ * Reason: Optimizing kernel to get faster speed according to GPU features        *
+ * Method:                                                                        *
+ *   1. reorganizing data structure to avoid bank conflict;                       *
+ *   2. using vectorized data type;                                               *
+ *   3. improving instruction-level parallelism;                                  *
+ *   4. removing redundant 'if' branches;                                         *
+ *   5. removing local variables to save registers.                               *
+ *********************************************************************************/
 /*
  * images:      (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
@@ -1383,10 +1394,12 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu
                                        const int paddingStart, const int moduleStride, const int imgStride,
                                        const int numImgColors, const int numGroups, const int sumWidth,
                                        const float scaleTargets, const float scaleOutputs) {
-    __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases
-    __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts
+    // avoid bank conflicts by reorganizing the data structure, and improve bandwidth by using 'float2' instead of 'float'
+    __shared__ float2 shImages[preloadCases][colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases
+    __shared__ float2 shHidActs[preloadCases][filtersPerThread * B_X / 2 + 2]; // preload preloadCases cases of B_X hidacts

-    const int tidx = B_X * threadIdx.y + threadIdx.x;
+    const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y;
+    const int tidx = B_X * ty + tx;
     const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;

     const int filterPixels = filterSize * filterSize;
@@ -1428,24 +1441,25 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu
 //            + loadX;

     targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors
-            + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters
+            + (blockFilterColorIdx + ty) * filterPixels * numFilters
             + blockPixelOffset * numFilters
             + blockFilterIdx
-            + threadIdx.x;
-//    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;
+            + tx;
+    // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;

     const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride));
     const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride));
     const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride)));
     const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride)));

-//    if (mStartY == mEndY || mStartX == mEndX) {
-//        return;
-//    }
-//    const bool doWork = mStartY < mEndY && mStartX < mEndX;
+    // if (mStartY == mEndY || mStartX == mEndX) {
+    //     return;
+    // }
+    const bool doWork = mStartY < mEndY && mStartX < mEndX;

-    float* shHidActLoad = &shHidActs[loadY][loadX];
-    float* shImgLoad = &shImages[loadY][loadX];
+    // reduce 2 registers
+    //float* shHidActLoad = &shHidActs[loadY][loadX];
+    //float* shImgLoad = &shImages[loadY][loadX];

     float imPreload[preloadCases*colorsPerThread/B_X]; // [8]
     float haPreload[preloadCases*filtersPerThread/B_Y]; // [8]
@@ -1466,18 +1480,23 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu
             blockPixelY, blockPixelX, imgSizeX, imgStride,
             pixIdx, m);

+    if (doWork) {
         #pragma unroll
-    for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) {
-        // It's bizarre, but this is the fastest way I've found to get it not to load nonexistent pixels.
-        // All other ways cause crazy excessive register usage.
-        const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * imgPixels * imgStride + pixIdx);
-        imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch<float>(images, imgOffset + idx);
+        for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) {
+            // It's bizarre, but this is the fastest way I've found to get it not to load nonexistent pixels.
+            // All other ways cause crazy excessive register usage.
+            const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * imgPixels * imgStride + pixIdx);
+            imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch<float>(images, imgOffset + idx);
+        }
     }

-    #pragma unroll
-    for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) {
-        // Almost certainly not necessary here.
-        const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * numImages * numModules + m * numImages);
-        haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch<float>(hidActs, hidActsOffset + idx);
+    if (doWork) {
+        #pragma unroll
+        for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) {
+            // Almost certainly not necessary here.
+            const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * numImages * numModules + m * numImages);
+            haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch<float>(hidActs, hidActsOffset + idx);
+        }
     }
@@ -1497,13 +1516,14 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu
                     pixIdxNext, mNext);

             for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
-
+                // store the preloaded image pixels into shared memory
                 #pragma unroll
-                for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) {
-                    shImgLoad[(y) * preloadCases] = imPreload[y * preloadCases / (B_X * B_Y)];
+                for (int y = 0; y < 4; y++) {
+                    shImages[loadX][loadY+y*8].x = imPreload[y];
+                    shImages[loadX][loadY+y*8].y = imPreload[y+4];
                 }

-//                const float* im = &images[caseIdx + preloadCases + pixIdx];
-//                const float* ha = &hidActs[caseIdx + preloadCases + m * numImages];
+                //const float* im = &images[caseIdx + preloadCases + pixIdx];
+                //const float* ha = &hidActs[caseIdx + preloadCases + m * numImages];
                 int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx;
                 int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages;
                 if (caseIdx + preloadCases == numImages) {
@@ -1512,24 +1532,40 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu
                     imgOffset2 = imgOffset + pixIdxNext;
                     hidActsOffset2 = hidActsOffset + mNext * numImages;
                 }
+
+                // store the preloaded hidActs into shared memory
+                shHidActs[loadX][loadY].x    = haPreload[0];
+                shHidActs[loadX][loadY].y    = haPreload[2];
+                shHidActs[loadX][loadY+16].x = haPreload[4];
+                shHidActs[loadX][loadY+16].y = haPreload[6];
+                shHidActs[loadX][loadY+8].x  = haPreload[1];
+                shHidActs[loadX][loadY+8].y  = haPreload[3];
+                shHidActs[loadX][loadY+24].x = haPreload[5];
+                shHidActs[loadX][loadY+24].y = haPreload[7];
+
+                // preload the image and hidAct pixels for the next iteration
                 #pragma unroll
-                for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) {
-                    shHidActLoad[y * (preloadCases + 1)] = haPreload[y * preloadCases / (B_X * B_Y)];
+                for (int r = 0; r < 8; r++) {
+                    imPreload[r] = tex1Dfetch<float>(images, imgOffset2 + (r) * 8 * imgPixels * imgStride);
+                    haPreload[r] = tex1Dfetch<float>(hidActs, hidActsOffset2 + (r) * 8 * numImages * numModules);
                 }

                 __syncthreads();
-
+                // group instructions of the same type together to improve instruction-level parallelism
                 #pragma unroll
-                for (int z = 0; z < 8; ++z) {
-                    WA_IMLOAD_TX(z);
-                    WA_LOOP2(z);
+                for (int r = 0; r < 16; r++) {
+                    for (int c = 0; c < 4; c++) {
+                        prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].x;
+                        prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].y;
+                        prod[2][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].x;
+                        prod[3][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].y;
+                        prod[0][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].x;
+                        prod[1][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].y;
+                        prod[2][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].x;
+                        prod[3][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].y;
+                    }
                 }

-                #pragma unroll
-                for (int z = 0; z < 8; ++z) {
-                    WA_HALOAD_TX(z);
-                    WA_LOOP2(z+8);
-                }
                 __syncthreads();
             }
         }
@@ -1789,6 +1825,18 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32(cu
     }
 }

+/*****************************Function Revision Record*****************************
+ * Author: Tencent BestImage Team(ankerguo@tencent.com)                           *
+ * Date: 2015-05-18                                                               *
+ * Reason: Optimizing kernel to get faster speed according to GPU features        *
+ * Method:                                                                        *
+ *   1. reorganizing data structure to avoid bank conflict;                       *
+ *   2. using vectorized data type;                                               *
+ *   3. improving instruction-level parallelism;                                  *
+ *   4. removing redundant 'if' branches;                                         *
+ *   5. removing local variables to save registers.                               *
+ *********************************************************************************/
+
 /*
  * images:      (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
  * hidActs:     (numFilters, numModulesY, numModulesX, numImages)
@@ -1804,20 +1852,24 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu
                                        const int paddingStart, const int moduleStride, const int imgStride,
                                        const int numImgColors, const int numGroups, const int sumWidth,
                                        const float scaleTargets, const float scaleOutputs) {
-    __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases
-    __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts
-
-    const int tidx = B_X * threadIdx.y + threadIdx.x;
-    const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
-
-    const int filterPixels = filterSize * filterSize;
+    // avoid bank conflicts by re-organizing the data structure, and improve bandwidth by using 'float2' instead of 'float'
+    __shared__ float2 shImages[preloadCases][colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases
+    __shared__ float2 shHidActs[preloadCases][filtersPerThread * B_X / 2 + 2]; // preload preloadCases cases of B_X hidacts
+    const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y;
+    //const int tidx = B_X * threadIdx.y + threadIdx.x;
+    // reduce two registers
+    //const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
+
+    //const int filterPixels = filterSize * filterSize;
+    // reduce one register
+    const int filterPixelsAll = numFilters * filterSize * filterSize;
     const int imgPixels = imgSizeY * imgSizeX;

     const int numFilterBlocks = numFilters / (B_X * filtersPerThread);

     const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks;

     const int numModuleChunksX = DIVUP(numModulesX, sumWidth);
-//    const int numModuleChunksY = DIVUP(numModulesY, sumWidth);
+    // const int numModuleChunksY = DIVUP(numModulesY, sumWidth);

     const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX;
     const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX;
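The rewritten loops in both weightActs kernels keep the old double-buffering idea: while one preloadCases-wide chunk is consumed out of shared memory, the next chunk is already being fetched into registers. Below is a stripped-down sketch of that software-pipelining structure; it uses plain __ldg loads instead of the tex1Dfetch<float> path the patch relies on, and the kernel name, TILE size, and launch shape are assumptions.

// pipeline_sketch.cu -- illustrative only; __ldg replaces the patch's texture-object loads
#include <cuda_runtime.h>

#define TILE 128   // elements staged per iteration, one per thread; launch with 1 block of TILE threads

__global__ void pipelinedSum(const float* __restrict__ in, float* __restrict__ out, int n) {
    __shared__ float tile[TILE];
    const int tid = threadIdx.x;

    // preload the first chunk into a register before the loop starts
    float pre = (tid < n) ? __ldg(&in[tid]) : 0.0f;

    float acc = 0.0f;
    for (int base = 0; base < n; base += TILE) {
        tile[tid] = pre;                       // publish the previously fetched value
        __syncthreads();

        // kick off the next global load early so it overlaps the compute below
        const int nextIdx = base + TILE + tid;
        pre = (nextIdx < n) ? __ldg(&in[nextIdx]) : 0.0f;

        #pragma unroll
        for (int k = 0; k < TILE; k++) {       // consume the current chunk from shared memory
            acc += tile[k];
        }
        __syncthreads();
    }
    out[tid] = acc;                            // out must hold at least TILE floats
}

The patch does the same thing at a larger scale: the imPreload/haPreload registers are refilled from texture memory between the shared-memory stores and the FMA loop, so the next chunk's global-memory latency is hidden behind the current chunk's arithmetic.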
@@ -1825,7 +1877,7 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu
     const int blockModuleStartX = blockModuleChunkX * sumWidth;
     const int blockModuleStartY = blockModuleChunkY * sumWidth;

-//    const int moduleIdx = partialSum * outputModuleIdx;
+    // const int moduleIdx = partialSum * outputModuleIdx;

     const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
     const int numModules = numModulesY * numModulesX;
@@ -1837,33 +1889,37 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu
     const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize;
     const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread;
     const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors;
-    const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
-//    images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
+    const int imgOffset = (imgColorIdx + (ty * B_X + tx) / preloadCases) * imgPixels * imgStride + (ty * B_X + tx) % preloadCases;
+    // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX;
     const int hidActsOffset = blockFilterIdx * numImages * numModules
-                    + loadY * numImages * numModules
-                    + loadX;
-//
-//    hidActs +=
-//            blockFilterIdx * numImages * numModules
-//            + loadY * numImages * numModules
-//            + loadX;
-
-    targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors
-            + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters
+                    + ((ty * B_X + tx) / preloadCases) * numImages * numModules
+                    + ((ty * B_X + tx) % preloadCases);
+    //
+    // hidActs +=
+    //         blockFilterIdx * numImages * numModules
+    //         + loadY * numImages * numModules
+    //         + loadX;
+
+    // use one temporary register instead of multiple registers
+    const int pIdxBase = imgStride * ((paddingStart + blockPixelY) * imgSizeX + paddingStart + blockPixelX);
+
+    targets += blockModuleChunkIdx * numFilters * filterSize * filterSize * numFilterColors
+            + (blockFilterColorIdx + ty) * filterSize * filterSize * numFilters
             + blockPixelOffset * numFilters
             + blockFilterIdx
-            + threadIdx.x;
-//    if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;
+            + tx;
+    // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return;

     const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride));
     const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride));
     const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride)));
     const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride)));

+    // reduce 3 registers
     const bool doWork = mStartY < mEndY && mStartX < mEndX;

-    float* shHidActLoad = &shHidActs[loadY][loadX];
-    float* shImgLoad = &shImages[loadY][loadX];
+    //float* shHidActLoad = &shHidActs[loadY][loadX];
+    //float* shImgLoad = &shImages[loadY][loadX];

     float imPreload[preloadCases*colorsPerThread/B_X]; // [4]
     float haPreload[preloadCases*filtersPerThread/B_Y]; // [8]
@@ -1877,115 +1933,105 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu
             prod[f][c] = 0;
         }
     }
-    int pixIdx, pixIdxNext, m, mNext;
-
-    conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
-            mStartY, mStartX, paddingStart, numModulesX, moduleStride,
-            blockPixelY, blockPixelX, imgSizeX, imgStride,
-            pixIdx, m);
-
-    if (doWork && loadY < B_Y * colorsPerThread) {
+    //int pixIdx, pixIdxNext, m, mNext;
+
+    //conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
+    //        mStartY, mStartX, paddingStart, numModulesX, moduleStride,
+    //        blockPixelY, blockPixelX, imgSizeX, imgStride,
+    //        pixIdx, m);
+
+    const int pixIdx = pIdxBase + (mStartY * imgSizeX + mStartX) * moduleStride * imgStride;
+    const int m = (mStartY * numModulesX + mStartX);
+
+    // preload the image pixels
+    if (doWork && (ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) {
         #pragma unroll
-        for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) {
-            imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch<float>(images, imgOffset + y * imgPixels * imgStride + pixIdx);
+        for (int i = 0; i < 4; i++) {
+            imPreload[i] = tex1Dfetch<float>(images, imgOffset + 16 * i * imgPixels * imgStride + pixIdx);
         }
     }

-    if (doWork && loadY < B_X * filtersPerThread) {
+    // preload the hidAct pixels
+    if (doWork && (ty * B_X + tx) / preloadCases < (B_X * filtersPerThread) / 8) {
         #pragma unroll
-        for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) {
-            haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch<float>(hidActs, hidActsOffset + y * numImages * numModules + m * numImages);
+        for (int i = 0; i < 8; i++) {
+            haPreload[i] = tex1Dfetch<float>(hidActs, hidActsOffset + 16 * i * numImages * numModules + m * numImages);
         }
     }

     for (int my = mStartY; my < mEndY; my++) {
         for (int mx = mStartX; mx < mEndX; mx++) {
-            int myNext = my, mxNext = mx;
-            const bool lastModule = my == mEndY - 1 && mx == mEndX - 1;
-
-            if (!lastModule) {
-                mxNext = mx + 1 == mEndX ? mStartX : mx + 1;
-                myNext = my + (mx + 1 == mEndX);
-            }
-
-            conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords(
-                    myNext, mxNext, paddingStart, numModulesX, moduleStride,
-                    blockPixelY, blockPixelX, imgSizeX, imgStride,
-                    pixIdxNext, mNext);

             for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
-
-//                const float* im = &images[caseIdx + preloadCases + pixIdx];
-                int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx;
-                int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages;
-//                const float* ha = &hidActs[caseIdx + preloadCases + m * numImages];
+                int imgOffset2 = imgOffset + caseIdx + preloadCases + pIdxBase + (my * imgSizeX + mx) * moduleStride * imgStride;
+                int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + (my * numModulesX + mx) * numImages;

                 if (caseIdx + preloadCases == numImages) {
-                    pixIdx = pixIdxNext;
-                    m = mNext;
-//                    im = &images[pixIdxNext];
-                    imgOffset2 = imgOffset + pixIdxNext;
-                    hidActsOffset2 = hidActsOffset + mNext * numImages;
+                    const int mxNext = mx + 1 == mEndX ? mStartX : mx + 1;
+                    const int myNext = my + (mx + 1 == mEndX);
+
+                    imgOffset2 = imgOffset + pIdxBase + (myNext * imgSizeX + mxNext) * moduleStride * imgStride;
+                    hidActsOffset2 = hidActsOffset + (myNext * numModulesX + mxNext) * numImages;
+                }
+
+                if ((ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) {
+                    // store the previously preloaded image pixels into shared memory
+                    shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].x      = imPreload[0];
+                    shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].y      = imPreload[2];
+                    shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].x = imPreload[1];
+                    shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].y = imPreload[3];
+                }
+
+                if ((ty * B_X + tx) / preloadCases < (B_X * filtersPerThread / 8)) {
+                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].x      = haPreload[0];
+                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].y      = haPreload[2];
+                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 32].x = haPreload[4];
+                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 32].y = haPreload[6];
+                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].x = haPreload[1];
+                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].y = haPreload[3];
+                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 48].x = haPreload[5];
+                    shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 48].y = haPreload[7];
                 }

-//                    ha = &hidActs[mNext * numImages];
-                }
-
-                if (loadY < B_Y * colorsPerThread) {
-                    #pragma unroll
-                    for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) {
-                        shImgLoad[(y) * preloadCases] = imPreload[y * preloadCases / (B_X * B_Y)];
-                    }
+                #pragma unroll
+                for (int r = 0; r < 8; r++) {
+                    haPreload[r] = tex1Dfetch<float>(hidActs, hidActsOffset2 + r * 16 * numImages * numModules);
                 }

-                if (loadY < B_X * filtersPerThread) {
-                    #pragma unroll
-                    for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) {
-                        shHidActLoad[y * (preloadCases + 1)] = haPreload[y * preloadCases / (B_X * B_Y)];
-                    }
+                #pragma unroll
+                for (int r = 0; r < 4; r++) {
+                    imPreload[r] = tex1Dfetch<float>(images, imgOffset2 + r * 16 * imgPixels * imgStride);
                 }

-                __syncthreads();
-
-                WA_LOOP(0);
-                WA_IMLOAD_TX(0);
-                WA_LOOP(1);
-                WA_IMLOAD_TX(1);
-                WA_LOOP(2);
-                WA_IMLOAD_TX(2);
-                WA_LOOP(3);
-                WA_IMLOAD_TX(3);
-                WA_LOOP(4);
-                WA_HALOAD_TX(0);
-                WA_LOOP(5);
-                WA_HALOAD_TX(1);
-                WA_LOOP(6);
-                WA_HALOAD_TX(2);
-                WA_LOOP(7);
-                WA_HALOAD_TX(3);
-                WA_LOOP(8);
-                WA_HALOAD_TX(4);
-                WA_LOOP(9);
-                WA_HALOAD_TX(5);
-                WA_LOOP(10);
-                WA_HALOAD_TX(6);
-                WA_LOOP(11);
-                WA_HALOAD_TX(7);
-                WA_LOOP(12);
-                WA_LOOP(13);
-                WA_LOOP(14);
-                WA_LOOP(15);
+                // group instructions of the same type together to improve instruction-level parallelism:
+                // accumulate the derivative with respect to the weights
+                #pragma unroll
+                for (int r = 0; r < 16; r++) {
+                    #pragma unroll
+                    for (int c = 0; c < 4; c++) {
+                        prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].x;
+                        prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].y;
+                        prod[2][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].x;
+                        prod[3][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].y;
+                        prod[0][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].x;
+                        prod[1][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].y;
+                        prod[2][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].x;
+                        prod[3][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].y;
+                    }
                 }

                 __syncthreads();
-            }
-        }
-    }
+            }
+        }
+    }

     if (scale) {
         #pragma unroll
         for (int c = 0; c < colorsPerThread; c++) {
             #pragma unroll
             for (int f = 0; f < filtersPerThread; f++) {
-                targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[f][c];
+                targets[c * B_Y * filterPixelsAll + f * B_X] = scaleTargets * targets[c * B_Y * filterPixelsAll + f * B_X] + scaleOutputs * prod[f][c];
             }
         }
     } else {
@@ -1993,7 +2039,7 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu
         for (int c = 0; c < colorsPerThread; c++) {
             #pragma unroll
             for (int f = 0; f < filtersPerThread; f++) {
-                targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[f][c];
+                targets[c * B_Y * filterPixelsAll + f * B_X] = scaleOutputs * prod[f][c];
             }
         }
     }
@@ -2127,6 +2173,8 @@ void _weightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
         assert(targets.getNumCols() == targetSize.second);
     }

     cudaStream_t stream = NVMatrix::getDefaultStream();
+
+    checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));

     if (scale == false) {
         if (checkCaseBounds == false) {
@@ -2660,7 +2708,7 @@ void _weightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
             }
         }
     }
-
+    checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
     getLastCudaError("weightActs: kernel execution failed");
 }
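To verify that revisions like these actually pay off, it is enough to time the affected dispatch calls with CUDA events. The harness below is a minimal sketch for that purpose; the timeMs() helper, dummyKernel placeholder, and iteration count are assumptions, and the real measurement would wrap the _imgActs / _weightActs calls instead of the placeholder.

// timing_sketch.cu -- illustrative harness for comparing kernel variants
#include <cuda_runtime.h>
#include <cstdio>

// Times the average duration of whatever work the caller puts in runOnce().
template <typename F>
static float timeMs(F runOnce, int iters) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    runOnce();                       // warm-up launch
    cudaDeviceSynchronize();

    cudaEventRecord(start);
    for (int i = 0; i < iters; i++) {
        runOnce();
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms / iters;
}

__global__ void dummyKernel() { }    // placeholder for the convolution dispatch being measured

int main() {
    float ms = timeMs([] { dummyKernel<<<1, 128>>>(); }, 100);
    printf("avg per launch: %.3f ms\n", ms);
    return 0;
}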