From e7739c889226aab78f3568947b555e3d5b783ae3 Mon Sep 17 00:00:00 2001
From: bestimage-tencent
Date: Mon, 18 May 2015 14:10:21 +0800
Subject: [PATCH] Considerable speedup (VGG model: 1.5x, AlexNet: 1.1x)

Optimizations focus on GPU-related features such as avoiding bank
conflicts, employing the wider bandwidth of shared memory, and using
vectorized data types.
---
 cudaconv3/src/filter_acts.cu | 411 +++++++++++++++++++++--------
 cudaconv3/src/img_acts.cu    | 484 ++++++++++++++++++++++++-----------
 cudaconv3/src/weight_acts.cu | 340 +++++++++++++-----------
 3 files changed, 841 insertions(+), 394 deletions(-)

diff --git a/cudaconv3/src/filter_acts.cu b/cudaconv3/src/filter_acts.cu
index e8782cb..4f1b9e8 100644
--- a/cudaconv3/src/filter_acts.cu
+++ b/cudaconv3/src/filter_acts.cu
@@ -601,6 +601,18 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4(float* im
     }
 }
 
+/*****************************Function Revision Record*****************************
+ * Author: Tencent BestImage Team(ankerguo@tencent.com)                           *
+ * Date: 2015-05-18                                                               *
+ * Reason: Optimizing kernel to get faster speed according to GPU features        *
+ * Method:                                                                        *
+ *         1. reorganizing data structure to avoid bank conflict;                 *
+ *         2. using vectorized data type;                                         *
+ *         3. improving instruction-level parallelism;                            *
+ *         4. removing redundant 'if' branches;                                   *
+ *         5. removing local variables to save registers.                         *
+ *********************************************************************************/
+
 /*
  * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
  * filters: (numFilterColors, filterPixels, numFilters) if conv
@@ -611,7 +623,9 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4(float* im
  */
 template
-__global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaTextureObject_t images, cudaTextureObject_t filters, float* targets,
+__global__ void
+__launch_bounds__(128, 4)
+filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaTextureObject_t images, cudaTextureObject_t filters, float* targets,
                                        const int numImages, const int numFilters,
                                        const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart,
                                        const int moduleStride,
@@ -619,8 +633,9 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT
                                        const int numGroups,
                                        const float scaleTargets, const float scaleOutputs,
                                        const bool conv/*, const bool noloads*/) {
-    __shared__ float shFilters[colorCache][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters
-    __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images
+    // avoid bank conflict by reorganizing the data structure and improve the band width by using 'float2' instead of 'float'
+    __shared__ float2 shFilters[colorCache / 2][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters
+    __shared__ float2 shImages[colorCache][B_X * imgsPerThread / 2]; // pre-load 1 pixel from B_X*imgsPerThread images
     const int imgPixels = imgSizeY * imgSizeX;
     const int filterPixels = filterSize * filterSize;
     const int numFilterColors = numImgColors / numGroups;
@@ -636,19 +651,20 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT
     // in the range 0..31. It appears that this allows the compiler to optimize?
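Editor's aside, not part of the patch: the float2 re-packing that the revision record above describes (and that the new shFilters/shImages declarations implement) can be illustrated in isolation by the minimal sketch below. The kernel name and the COLORS/THREADS sizes are placeholders, not identifiers from cudaconv3; the only point is that packing two 32-bit values into one float2 halves the number of shared-memory transactions once the device is in 8-byte bank mode.

    // Minimal illustration (assumed sizes, hypothetical kernel name): two color
    // planes are packed into one float2, so each shared-memory access moves
    // 8 bytes instead of 4.
    #define COLORS  8     // stands in for colorCache
    #define THREADS 128   // assumed block size

    __global__ void float2SharedDemo(const float2 *in, float *out) {
        __shared__ float2 tile[COLORS / 2][THREADS];   // half the rows of an equivalent float tile
        const int t = threadIdx.x;
        #pragma unroll
        for (int c = 0; c < COLORS / 2; c++) {
            tile[c][t] = in[c * THREADS + t];          // one 64-bit transaction per color pair
        }
        __syncthreads();
        float acc = 0.0f;
        #pragma unroll
        for (int c = 0; c < COLORS / 2; c++) {
            acc += tile[c][t].x + tile[c][t].y;        // .x and .y hold the two packed colors
        }
        out[blockIdx.x * blockDim.x + t] = acc;
    }
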
const int tx = threadIdx.x % B_X; const int ty = threadIdx.y % B_Y; - const int tidx = ty * B_X + threadIdx.x; + //const int tidx = ty * B_X + threadIdx.x; // reduce one register const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; - const int shFilterLoadY = tidx / (B_Y * filtersPerThread); - const int shFilterLoadX = tidx % (B_Y * filtersPerThread); - const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x; - const int imgOffset = (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; + // reduce two registers + //const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + //const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + tx; + const int imgOffset = (blockColorIdx + ty) * imgPixels * imgStride + myImgIdx; // images += (blockColorIdx + threadIdx.y) * imgPixels * imgStride + myImgIdx; const int filterOffset = blockFilterIdx - + shFilterLoadY * numFilters * filterPixels + shFilterLoadX + (conv ? 0 : moduleIdx * numFilterColors * filterPixels * numFilters); + + ((ty * B_X + tx) / (B_Y * filtersPerThread)) * numFilters * filterPixels + ((ty * B_X + tx) % (B_Y * filtersPerThread)) + (conv ? 0 : moduleIdx * numFilterColors * filterPixels * numFilters); // filters +=blockFilterIdx // + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; // if (!conv) { @@ -659,6 +675,8 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT + (blockFilterIdx + threadIdx.y * filtersPerThread) * numImages * numModules + myImgIdx; + // combine two registers into one + const int numModImages = numModules * numImages; float prod[imgsPerThread][filtersPerThread]; // float fCache[filtersPerThread]; #pragma unroll @@ -682,19 +700,15 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgStartY, imgStartX, fPidx, iPidx); + // remove redundant conditions #pragma unroll for (int i = 0; i < imgsPerThread; i++) { - if (!checkImgBounds || myImgIdx + i * B_X < numImages) { - imPreload[i] = tex1Dfetch(images, imgOffset + imgStride * iPidx + i * B_X); - } else { - imPreload[i] = 0; - } + imPreload[i] = tex1Dfetch(images, imgOffset + imgStride * iPidx + i * B_X); } - if (/*B_X % filtersPerThread == 0 ||*/ shFilterLoadY < B_X/filtersPerThread) { // This if statement reduces reg usage.. 
- #pragma unroll - for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { - fPreload[c*filtersPerThread/B_X] = tex1Dfetch(filters, filterOffset + (c * filterPixels + fPidx) * numFilters); - } + + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c*filtersPerThread/B_X] = tex1Dfetch(filters, filterOffset + (c * filterPixels + fPidx) * numFilters); } for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { // const int filterPxY = imgY - imgLoadModPosY; @@ -714,8 +728,14 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT } filterActs_YxX_sparse2_preload_ty_4_tx_32_f_16_c_4_setPixelCoords(filterSize, imgSizeX, imgLoadModPosY, imgLoadModPosX, imgYNext, imgXNext, fPidxNext, iPidxNext); for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) -// const float* ff = &filters[numFilters * ((oc + colorCache) * filterPixels + fPidx)]; -// const float* mm = &images[imgStride * ((oc + colorCache) * imgPixels + iPidx)]; + // store the preloaded pixel of filter and image into shared memory + shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)][(ty * B_X + tx) % (B_Y * filtersPerThread)].x = fPreload[0]; + shFilters[(ty * B_X + tx) / (B_Y * filtersPerThread)][(ty * B_X + tx) % (B_Y * filtersPerThread)].y = fPreload[1]; + shImages[ty][tx].x = imPreload[0]; + shImages[ty][tx].y = imPreload[1]; + shImages[ty][tx+B_X].x = imPreload[2]; + shImages[ty][tx+B_X].y = imPreload[3]; + int imgOffset2 = imgOffset + imgStride * ((oc + colorCache) * imgPixels + iPidx); int filterOffset2 = filterOffset + numFilters * ((oc + colorCache) * filterPixels + fPidx); if (oc == numFilterColors - colorCache) { @@ -725,57 +745,30 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT iPidx = iPidxNext; } - #pragma unroll - for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { - shFilters[c + shFilterLoadY][shFilterLoadX] = fPreload[c*filtersPerThread/B_X]; - } - - #pragma unroll - for (int i = 0; i < imgsPerThread; i++) { - // NOTE: bank conflicts here! - shImages[ty][tx * imgsPerThread + i] = imPreload[i]; - } - imPreload[0] = (checkImgBounds && myImgIdx + 0 * B_X >= numImages) ? 0 : tex1Dfetch(images, imgOffset2 + 0 * B_X); - imPreload[1] = (checkImgBounds && myImgIdx + 1 * B_X >= numImages) ? 0 : tex1Dfetch(images, imgOffset2 + 1 * B_X); - imPreload[2] = (checkImgBounds && myImgIdx + 2 * B_X >= numImages) ? 
0 : tex1Dfetch(images, imgOffset2 + 2 * B_X); + // preload one pixel of filter and image from texture, and no need to check 'checkImgBounds' with all callers setting it as false + imPreload[0] = tex1Dfetch(images, imgOffset2); + imPreload[1] = tex1Dfetch(images, imgOffset2 + B_X); + imPreload[2] = tex1Dfetch(images, imgOffset2 + 2 * B_X); + imPreload[3] = tex1Dfetch(images, imgOffset2 + 3 * B_X); + fPreload[0] = tex1Dfetch(filters, filterOffset2); + fPreload[1] = tex1Dfetch(filters, filterOffset2 + 2 * filterPixels * numFilters); __syncthreads(); - #pragma unroll - for(int i = 0; i < imgsPerThread; i++) { - #pragma unroll - for(int f = 0; f < filtersPerThread; f++) { - prod[i][f] += shImages[0][threadIdx.x * imgsPerThread + i] * shFilters[0][threadIdx.y * filtersPerThread + f]; - } - } - - fPreload[0] = tex1Dfetch(filters, filterOffset2 + 0); - - #pragma unroll - for(int i = 0; i < imgsPerThread; i++) { - #pragma unroll - for(int f = 0; f < filtersPerThread; f++) { - prod[i][f] += shImages[1][threadIdx.x * imgsPerThread + i] * shFilters[1][threadIdx.y * filtersPerThread + f]; - } - } - - fPreload[1] = tex1Dfetch(filters, filterOffset2 + (B_X/filtersPerThread * filterPixels) * numFilters); - - #pragma unroll - for(int i = 0; i < imgsPerThread; i++) { - #pragma unroll - for(int f = 0; f < filtersPerThread; f++) { - prod[i][f] += shImages[2][threadIdx.x * imgsPerThread + i] * shFilters[2][threadIdx.y * filtersPerThread + f]; - } - } - - imPreload[3] = (checkImgBounds && myImgIdx + 3 * B_X >= numImages) ? 0 : tex1Dfetch(images, imgOffset2 + 3 * B_X); - - #pragma unroll - for(int i = 0; i < imgsPerThread; i++) { + // put together the instructions with same type to improve instruction-level parallelism + // calculate the convolution between images and filters + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { #pragma unroll - for(int f = 0; f < filtersPerThread; f++) { - prod[i][f] += shImages[3][threadIdx.x * imgsPerThread + i] * shFilters[3][threadIdx.y * filtersPerThread + f]; + for (int r = 0; r < colorCache / 2; r++) { + prod[0][f] += shImages[r][tx].x * shFilters[r][ty*filtersPerThread+f].x; + prod[1][f] += shImages[r][tx].y * shFilters[r][ty*filtersPerThread+f].x; + prod[2][f] += shImages[r][tx+B_X].x * shFilters[r][ty*filtersPerThread+f].x; + prod[3][f] += shImages[r][tx+B_X].y * shFilters[r][ty*filtersPerThread+f].x; + prod[0][f] += shImages[r+2][tx].x * shFilters[r][ty*filtersPerThread+f].y; + prod[1][f] += shImages[r+2][tx].y * shFilters[r][ty*filtersPerThread+f].y; + prod[2][f] += shImages[r+2][tx+B_X].x * shFilters[r][ty*filtersPerThread+f].y; + prod[3][f] += shImages[r+2][tx+B_X].y * shFilters[r][ty*filtersPerThread+f].y; } } __syncthreads(); @@ -788,9 +781,8 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT for (int f = 0; f < filtersPerThread; f++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { - if (!checkImgBounds || myImgIdx + i * B_X < numImages) { - targets[i * B_X + f * numImages * numModules] = scaleTargets * targets[i * B_X + f * numImages * numModules] + scaleOutputs * prod[i][f]; - } + // remove the redundant condition for less registers + targets[i * B_X + f * numModImages] = scaleTargets * targets[i * B_X + f * numModImages] + scaleOutputs * prod[i][f]; } } } else { @@ -799,9 +791,8 @@ __global__ void filterActs_YxX_sparse2_preload_ty_4_tx_32_i_4_f_16_c_4_tex(cudaT for (int i = 0; i < imgsPerThread; i++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { - if (!checkImgBounds || myImgIdx + i * B_X < 
numImages) { - targets[i * B_X + f * numImages * numModules] = scaleOutputs * prod[i][f]; - } + // remove the redundant condition for less registers + targets[i * B_X + f * numModImages] = scaleOutputs * prod[i][f]; } } } @@ -1159,6 +1150,219 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } } + +/*****************************Function Revision Record***************************** + * Author: Tencent BestImage Team(ankerguo@tencent.com) * + * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * Note: This function can be used when each thread loads even number of filter * + * pixels(filtersPerThread * colorCache / B_X is even), and this can be * + * optimized more when the number of loaded image's pixel is even. * + *********************************************************************************/ +template +__global__ void filterActs_YxX_sparse2_f_vec(float* images, float* filters, float* targets, + const int numImages, const int numFilters, + const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart, + const int moduleStride, + const int numModulesY, const int numModulesX, const int imgStride, const int numImgColors, + const int numGroups, + const float scaleTargets, const float scaleOutputs, + const bool conv) { + // improve shared memory's band width by using 'float2' instead of 'float' + __shared__ float2 shFilters[colorCache/2][B_Y * filtersPerThread]; // pre-load 1 pixel from B_Y*filtersPerThread filters + __shared__ float shImages[colorCache][B_X * imgsPerThread]; // pre-load 1 pixel from B_X*imgsPerThread images + + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + const int imgPixels = imgSizeY * imgSizeX; + const int filterPixels = filterSize * filterSize; + const int numFilterColors = numImgColors / numGroups; + const int blocksPerModule = numFilters / (B_Y*filtersPerThread); + const int moduleIdx = blockIdx.y / blocksPerModule; + const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule); + const int numFiltersPerGroup = numFilters / numGroups; + const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup; + + const int numModules = numModulesX * numModulesY; + const int blockColorIdx = numFilterColors * blockGroupIdx; + + const int tidx = ty * B_X + tx; + + const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride; + const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride; + + // load position of filters' pixels for current thread + const int shFilterLoadY = tidx / (B_Y * filtersPerThread); + const int shFilterLoadX = tidx % (B_Y * filtersPerThread); + // load position of images' pixels for current thread + const int shImgLoadY = tidx / (B_X * imgsPerThread); + const int shImgLoadX = tidx % (B_X * imgsPerThread); + + const int myImgIdx = blockIdx.x * B_X * imgsPerThread + shImgLoadX; + images += (blockColorIdx + shImgLoadY) * imgPixels * imgStride + myImgIdx; + + filters +=blockFilterIdx + + shFilterLoadY * numFilters * filterPixels + shFilterLoadX; + if (!conv) { + filters += moduleIdx * numFilterColors * filterPixels * numFilters; + } + + targets += moduleIdx * numImages + + (blockFilterIdx + ty) * numImages * numModules + + blockIdx.x * B_X * imgsPerThread + tx; + + float prod[filtersPerThread][imgsPerThread]; + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) 
{ + #pragma unroll + for(int g = 0; g < imgsPerThread; g++) { + prod[f][g] = 0; + } + } + + const int imgStartX = MAX(0, imgLoadModPosX); + const int imgStartY = MAX(0, imgLoadModPosY); + const int imgEndX = MIN(imgLoadModPosX + filterSize, imgSizeX); + const int imgEndY = MIN(imgLoadModPosY + filterSize, imgSizeY); + + // temporary buffer to store the filter's loaded pixels during each loop + float fPreload[colorCache * filtersPerThread / B_X]; + // temporary buffer to store the image's loaded pixels during each loop + float iPreload[colorCache * imgsPerThread / B_Y]; + + // preload filter's pixels + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters[(c * filterPixels + (imgStartY - imgLoadModPosY) * filterSize + (imgStartX - imgLoadModPosX)) * numFilters]; + } + + // preload image's pixels + if (!checkImgBounds || myImgIdx < numImages) { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images[(c * imgPixels + imgStartY * imgSizeX + imgStartX) * imgStride]; + } + } else { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + + for (int imgY = imgStartY; imgY < imgEndY; ++imgY) { + //const int filterPxY = imgY - imgLoadModPosY; + for (int imgX = imgStartX; imgX < imgEndX; ++imgX) { + for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop) + // store the preloaded filter's pixels into shared memory + #pragma unroll + for (int c = 0; c < colorCache / 2; c += B_X / filtersPerThread) { + shFilters[c + shFilterLoadY][shFilterLoadX].x = fPreload[c * filtersPerThread / B_X]; + shFilters[c + shFilterLoadY][shFilterLoadX].y = fPreload[(c + colorCache / 2) * filtersPerThread / B_X]; + } + + // store the preloaded image's pixels into shared memory + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + shImages[c + shImgLoadY][shImgLoadX] = iPreload[c * imgsPerThread / B_Y]; + } + /* + * Load a pixel from B_Y*filtersPerThread filters + * This condition covers the case when B_X is not divisible by filtersPerThread. + * In this case, not all of the threads will participate in the loading operation. + * This ensures that in each loop iteration, an integer number of rows of shFilters + * are filled, which makes indexing simple. + + * nvcc is behaving in a completely insane way: removing this condition under + * template parameters that guarantee it to be true actually slows down + * the computation. + * + */ + + /* preload image and filter pixels' data */ + if ((oc + colorCache) == numFilterColors) { // move to next pixel when all colors of current pixel have been finished + int imgXn = (imgX < (imgEndX - 1)) ? 
(imgX + 1) : imgStartX; + int imgYn = imgY + (imgXn != (imgX + 1)); + + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters[(c * filterPixels + (imgYn - imgLoadModPosY) * filterSize + (imgXn - imgLoadModPosX)) * numFilters]; + } + + if (!checkImgBounds || myImgIdx < numImages) { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images[(c * imgPixels + imgYn * imgSizeX + imgXn) * imgStride]; + } + } else { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + } else { // move next colorCache + #pragma unroll + for (int c = 0; c < colorCache; c += B_X/filtersPerThread) { + fPreload[c * filtersPerThread / B_X] = filters[((c + oc + colorCache) * filterPixels + (imgY - imgLoadModPosY) * filterSize + (imgX - imgLoadModPosX)) * numFilters]; + } + + if (!checkImgBounds || myImgIdx < numImages) { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = images[((c + oc + colorCache) * imgPixels + imgY * imgSizeX + imgX) * imgStride]; + } + } else { + #pragma unroll + for (int c = 0; c < colorCache; c += B_Y / imgsPerThread) { + iPreload[c * imgsPerThread / B_Y] = 0; + } + } + } + + __syncthreads(); + + // convolution + for (int c = 0; c < colorCache / 2; c++) { + #pragma unroll + for(int g = 0; g < imgsPerThread; g++) { + #pragma unroll + for(int f = 0; f < filtersPerThread; f++) { + prod[f][g] += shImages[c][g * B_X + tx] * shFilters[c][ty + f * B_Y].x; + prod[f][g] += shImages[c + colorCache / 2][g * B_X + tx] * shFilters[c][ty + f * B_Y].y; + } + } + } + __syncthreads(); + } + } + } + + // write convolution result into global memory + if (scale) { + #pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModules] + scaleOutputs * prod[f][g]; + } + } + } + } else { + // Note: reversing order of these loops saves 2 registers, but costs time + #pragma unroll + for (int f = 0; f < filtersPerThread; f++) { + #pragma unroll + for (int g = 0; g < imgsPerThread; g++) { + if (!checkImgBounds || myImgIdx + g * B_X < numImages) { + targets[g * B_X + f * B_Y * numImages * numModules] = scaleOutputs * prod[f][g]; + } + } + } + } +} /* * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given * filters: (numFilterColors, filterPixels, numFilters) if conv @@ -1238,6 +1442,8 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } cudaStream_t stream = NVMatrix::getDefaultStream(); + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); // using wider band width + // Auto-generated calling code... // NOTE: The calling code is set up such that if checkImgBounds is true, then imgsPerThread = 1. // In principle it doesn't have to be this way, and you may want to optimize for that case. 
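Editor's aside, not part of the patch: the cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte) call added above is paired with a cudaSharedMemBankSizeFourByte call at the end of this dispatch function, and the same pattern reappears later in _imgActs. Pulled out on its own, the intended set-then-restore pattern looks roughly like the sketch below; launchWithWideBanks is a hypothetical wrapper and the commented-out launch is only a placeholder for the real auto-generated calls that follow.

    // Host-side sketch of the set-then-restore pattern (hypothetical wrapper).
    #include <cuda_runtime.h>

    void launchWithWideBanks(dim3 blocks, dim3 threads, cudaStream_t stream) {
        // 8-byte banks: one bank transaction can serve a whole float2
        cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

        // ... launch the float2-based kernels here, e.g.
        // filterActs_YxX_sparse2_f_vec< 4, 32, 4, 8, 8, false, false >
        //     <<<blocks, threads, 0, stream>>>(/* arguments as in the dispatch code that follows */);

        // restore the 4-byte default so unrelated kernels are unaffected
        cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    }
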
@@ -1265,8 +1471,8 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 4, 8, false, false >, cudaFuncCachePreferShared); @@ -1275,16 +1481,16 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } else if (numImages % 64 == 0) { if (numFiltersPerGroup % 128 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 8, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 8, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 64 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 2, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 2, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + 
cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 2, 8, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 2, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 4, 8, false, false >, cudaFuncCachePreferShared); @@ -1293,16 +1499,16 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } else if (numImages % 32 == 0) { if (numFiltersPerGroup % 128 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 8, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 8, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 64 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 1, 16, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 1, 16, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, false, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 1, 8, 8, false, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 1, 8, 8, false, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, false, false >, cudaFuncCachePreferShared); @@ -1661,8 +1867,8 
@@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 4, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 4, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 4, 4, 8, true, false >, cudaFuncCachePreferShared); @@ -1671,16 +1877,16 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } else if (numImages % 64 == 0) { if (numFiltersPerGroup % 128 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 8, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 8, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 8, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 64 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 2, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 2, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 2, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 2, 
8, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 2, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 2, 4, 8, true, false >, cudaFuncCachePreferShared); @@ -1689,16 +1895,16 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* tar } else if (numImages % 32 == 0) { if (numFiltersPerGroup % 128 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 8, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 8, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 8, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 64 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 1, 16, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 1, 16, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 32 == 0) { - cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, true, false >, cudaFuncCachePreferShared); - filterActs_YxX_sparse2 < 4, 32, 1, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); + cudaFuncSetCacheConfig(filterActs_YxX_sparse2_f_vec < 4, 32, 1, 8, 8, true, false >, cudaFuncCachePreferShared); + filterActs_YxX_sparse2_f_vec < 4, 32, 1, 8, 8, true, false > <<>>(images.getDevData(), filters.getDevData(), targets.getDevData(), numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv); } else if (numFiltersPerGroup % 1 == 0) { cudaFuncSetCacheConfig(filterActs_YxX_sparse2 < 4, 32, 1, 4, 8, true, false >, cudaFuncCachePreferShared); @@ -2034,6 +2240,7 @@ __global__ void filterActs_YxX_sparse2(float* images, float* filters, float* 
tar } } } + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); getLastCudaError("filterActs: kernel execution failed"); } diff --git a/cudaconv3/src/img_acts.cu b/cudaconv3/src/img_acts.cu index ae1071a..b5763ec 100644 --- a/cudaconv3/src/img_acts.cu +++ b/cudaconv3/src/img_acts.cu @@ -684,6 +684,218 @@ __global__ void conv_img_acts_manycolor_kepler(const float* hidActs, const float } } +/*****************************Function Revision Record***************************** + * Author: Tencent BestImage Team(ankerguo@tencent.com) * + * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. dividing loaded filter pixels into 2 parts in each step * + * Note: This function can be used when each thread loads even number of filter * + * pixels(colorsPerThread * filterCacheF / B_X is even) and even number of* + * hidAct pixels(imgsPerThread * filterCacheH / B_Y), and ff is 32, fh is * + * 16 + *********************************************************************************/ +template +__global__ void conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16(const float* hidActs, const float* filters, float* targets, + const int numModulesY, const int numModulesX, const int numImages, const int numFilters, + const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride, + const int numImgColors, const int numGroups, + const float scaleTargets, const float scaleOutputs) { + // improve band width of shared memory by using 'float2' instead of 'float' + __shared__ float2 shFilters[filterCacheF/2][colorsPerThread*B_Y/2 + 2]; + __shared__ float2 shHidActs[filterCacheH/2][B_X*imgsPerThread]; + + const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); + const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; + + const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally + const int numFilterColors = numImgColors / numGroups; + const int blockGroupIdx = imgColorIdx / numFilterColors; + const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group + const int numFiltersPerGroup = numFilters / numGroups; + const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + + const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockPixelIdx % imgSizeX; + const int blockPixelIdxY = blockPixelIdx / imgSizeX; + + const int filterPixels = filterSize * filterSize; + const int imgPixels = imgSizeY * imgSizeX; + + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + const int tidx = ty * B_X + tx; + + const int hidActLoadY = tidx / (imgsPerThread * B_X), hidActLoadX = tidx % (imgsPerThread * B_X); + const int filtersLoadY = tidx / (filterCacheF / 2), filtersLoadX = tidx % (filterCacheF / 2); + const int numModules = numModulesY * numModulesX; + + hidActs += blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX; + filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; + targets += (imgColorIdx + ty) * imgPixels * numImages + blockPixelIdx * numImages + blockCaseIdx + tx; + + float prod[colorsPerThread][imgsPerThread]; + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] = 0; + } + } + + const int startY = blockPixelIdxY - 
paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride; + const int endY = min(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride); + const int startX = blockPixelIdxX - paddingStart < filterSize ? 0 + : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; + const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); + + // temporary buffers to store preloaded pixel data + float fPreload[colorsPerThread * filterCacheF / B_X / 2]; + float hPreload[imgsPerThread * filterCacheH / B_Y]; + + // preload filter's data + const int pxStartIdxInFilter = (blockPixelIdxY - paddingStart - startY * moduleStride) * filterSize + blockPixelIdxX - paddingStart - startX * moduleStride; + const float *currFltData = conv ? &filters[pxStartIdxInFilter * numFilters] : &filters[pxStartIdxInFilter * numFilters + (startY * numModulesX + startX) * numFilterColors * filterPixels * numFilters] ; + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y / (filterCacheF/2)) { + fPreload[i * (filterCacheF/2) / (B_X * B_Y)] = currFltData[i * filterPixels * numFilters]; + } + + // preload hidAct's data + if (!checkCaseBounds || blockCaseIdx + hidActLoadX < numImages) { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = hidActs[(startY * numModulesX + startX) * numImages + i * numModules * numImages]; + } + } else { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = 0; + } + } + + for (int my = startY; my < endY; my++) { + const int moduleTop = paddingStart + my * moduleStride; + const int pxInFilterY = blockPixelIdxY - moduleTop; + + for (int mx = startX; mx < endX; mx++) { + const int moduleIdx = my * numModulesX + mx; + const int moduleLeft = paddingStart + mx * moduleStride; + const int pxInFilterX = blockPixelIdxX - moduleLeft; + + const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; + + for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time + for (int fh = f; fh < f + filterCacheF; fh += filterCacheH) { + // store the preloaded filter's data into shared memory + #pragma unroll + for (int c = 0; c < colorsPerThread * B_Y / 2; c += B_X*B_Y / (filterCacheF/2)) { + shFilters[filtersLoadX][c + filtersLoadY].x = fPreload[c * (filterCacheF / 2) / (B_X * B_Y)]; + shFilters[filtersLoadX][c + filtersLoadY].y = fPreload[(c + colorsPerThread * B_Y / 2) * (filterCacheF / 2) / (B_X * B_Y)]; + } + + // store the preloaded hidAct's data into shared memory + #pragma unroll + for (int c = 0; c < filterCacheH / 2; c += B_Y / imgsPerThread) { + shHidActs[c + hidActLoadY][hidActLoadX].x = hPreload[c * imgsPerThread/ B_Y]; + shHidActs[c + hidActLoadY][hidActLoadX].y = hPreload[(c + filterCacheH / 2) * imgsPerThread / B_Y]; + } + + /* preload filter and hidAct's data for next loop */ + if ((fh + filterCacheH) == numFiltersPerGroup) { // move to next pixel's position when all filters finished + const int mxN = (mx < (endX - 1)) ? (mx + 1) : startX; + const int myN = my + (mxN < mx); + const int pxIdxInFilterN = (blockPixelIdxY - paddingStart - myN * moduleStride) * filterSize + blockPixelIdxX - paddingStart - mxN * moduleStride; + const int moduleIdxN = myN * numModulesX + mxN; + const float *currFlt = conv ? 
&filters[pxIdxInFilterN * numFilters] + : &filters[pxIdxInFilterN * numFilters + moduleIdxN * numFilterColors * filterPixels * numFilters] ; + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y / (filterCacheF/2)) { + fPreload[i * (filterCacheF/2) / (B_X * B_Y)] = currFlt[i * filterPixels * numFilters]; + } + + + if (!checkCaseBounds || blockCaseIdx + hidActLoadX < numImages) { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = hidActs[moduleIdxN * numImages + i * numModules * numImages]; + } + } else { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = 0; + } + } + } else { // move to next filterCacheF + const float *currFlt = conv ? &filters[pxIdxInFilter * numFilters + fh + filterCacheH] + : &filters[pxIdxInFilter * numFilters + fh + filterCacheH + moduleIdx * numFilterColors * filterPixels * numFilters] ; + #pragma unroll + for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y / (filterCacheF/2)) { + fPreload[i * (filterCacheF/2) / (B_X * B_Y)] = currFlt[i * filterPixels * numFilters]; + } + + + if (!checkCaseBounds || blockCaseIdx + hidActLoadX < numImages) { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = hidActs[(moduleIdx + (fh + filterCacheH) * numModules) * numImages + i * numModules * numImages]; + } + } else { + #pragma unroll + for (int i = 0; i < filterCacheH; i += B_Y / imgsPerThread) { + hPreload[i * imgsPerThread / B_Y] = 0; + } + } + } + __syncthreads(); + + // computation + #pragma unroll + for (int w = 0; w < filterCacheH/2; w++) { + #pragma unroll + for (int c = 0; c < colorsPerThread/2; c++) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + prod[c][i] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx + i * B_X].x; + prod[c][i] += shFilters[w + filterCacheH / 2][c * B_Y + ty].x * shHidActs[w][tx + i * B_X].y; + prod[c + colorsPerThread / 2][i] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx + i * B_X].x; + prod[c + colorsPerThread / 2][i] += shFilters[w + filterCacheH / 2][c * B_Y + ty].y * shHidActs[w][tx + i * B_X].y; + + } + } + } + __syncthreads(); + + } + } + } + } + + // store the result into global memory + if (scale) { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + tx + i * B_X < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; + } + } + } + } else { + #pragma unroll + for (int i = 0; i < imgsPerThread; i++) { + if (!checkCaseBounds || blockCaseIdx + tx + i * B_X < numImages) { + #pragma unroll + for (int c = 0; c < colorsPerThread; c++) { + targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i]; + } + } + } + } +} /* * New Titan-optimized stuff. */ @@ -736,9 +948,22 @@ for (int w = 0; w < filterCacheH; w++) { \ hPreload[y][x] = tex1Dfetch(hidActs, hidActsLoadOffset + (y) * B_Y * numModules * numImages + (x) * B_X); \ } +/*****************************Function Revision Record***************************** + * Author: Tencent BestImage Team(ankerguo@tencent.com) * + * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. 
using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. * + * 6. dividing the process into two steps with less data loaded. * + *********************************************************************************/ + template __global__ void -__launch_bounds__(256, 2) // 256 threads per block, 2 blocks per multiprocessor +__launch_bounds__(256, 3) // 256 threads per block, 3 blocks per multiprocessor // These launch bounds ensure 25% occupancy (128 registers used) // as oppposed to 13% (130 registers) achieved by defaults. conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObject_t hidActs, cudaTextureObject_t filters, float* targets, @@ -746,39 +971,43 @@ conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObje const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride, const int numImgColors, const int numGroups, const float scaleTargets, const float scaleOutputs) { - __shared__ float shFilters[colorsPerThread*B_Y][filterCacheF]; - __shared__ float shHidActs[filterCacheH][B_X*imgsPerThread]; + // avoid bank conflict by re-organizing the data structure, and improve band width by using 'float2' instead of 'float' + __shared__ float2 shFilters[filterCacheF/2][colorsPerThread*B_Y/2 + 2]; + __shared__ float2 shHidActs[filterCacheH][B_X*imgsPerThread/2]; + const int tx = threadIdx.x, ty = threadIdx.y; const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread); - const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; - const int myCaseIdx = blockCaseIdx + threadIdx.x; + //const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread; + //const int myCaseIdx = blockCaseIdx + threadIdx.x; - const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally + //const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally const int numFilterColors = numImgColors / numGroups; - const int blockGroupIdx = imgColorIdx / numFilterColors; - const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group + const int blockGroupIdx = (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread / numFilterColors; + const int filterColorIdx = ((blockIdx.x / numImgBlocks) * B_Y * colorsPerThread) % numFilterColors; // color idx within group const int numFiltersPerGroup = numFilters / numGroups; - const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; + //const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup; - const int blockPixelIdx = blockIdx.y; - const int blockPixelIdxX = blockPixelIdx % imgSizeX; - const int blockPixelIdxY = blockPixelIdx / imgSizeX; + //const int blockPixelIdx = blockIdx.y; + const int blockPixelIdxX = blockIdx.y % imgSizeX; + const int blockPixelIdxY = blockIdx.y / imgSizeX; - const int filterPixels = filterSize * filterSize; - const int imgPixels = imgSizeY * imgSizeX; - const int tidx = threadIdx.y * B_X + threadIdx.x; + //const int filterPixels = filterSize * filterSize; + const int filterPxlAll = filterSize * filterSize * numFilters; + //const int imgPixels = imgSizeY * imgSizeX; + //const int tidx = threadIdx.y * B_X + threadIdx.x; // const int hidActLoadY = threadIdx.y % B_Y, hidActLoadX = threadIdx.x % B_X; //const int hidActLoadY = tidx / (B_X*imgsPerThread), hidActLoadX = tidx % (B_X*imgsPerThread); - const int filtersLoadY = tidx / filterCacheF, 
filtersLoadX = tidx % filterCacheF; + //const int filtersLoadY = tidx / filterCacheF, filtersLoadX = tidx % filterCacheF; // nvcc is behaving idiotically again, these useless declarations save registers //const int outputY = threadIdx.y, outputX = threadIdx.x; //const int ty = threadIdx.y, tx = threadIdx.x; - const int numModules = numModulesY * numModulesX; - const int hidActsOffset = (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; - const int filtersOffset = blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; + //const int numModules = numModulesY * numModulesX; + const int numImgMods = numModulesY * numModulesX * numImages; + const int hidActsOffset = (blockGroupIdx * numFiltersPerGroup + ty) * numImgMods + (blockIdx.x % numImgBlocks) * B_X * imgsPerThread + tx; + const int filtersOffset = blockGroupIdx * numFiltersPerGroup + (filterColorIdx + (ty * B_X + tx) / filterCacheH) * filterPxlAll + (ty * B_X + tx) % filterCacheH; // hidActs += (blockFilterIdx + threadIdx.y) * numImages * numModules + myCaseIdx; // filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX; - targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + myCaseIdx; + targets += ((blockIdx.x / numImgBlocks) * B_Y * colorsPerThread + ty) * imgSizeY * imgSizeX * numImages + blockIdx.y * numImages + (blockIdx.x % numImgBlocks) * 128 + tx; float prod[colorsPerThread][imgsPerThread]; #pragma unroll @@ -796,15 +1025,16 @@ conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObje : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride; const int endX = min(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride); - float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; - float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; + // reduce two registers + //float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX]; + //float* shHidActLoad = &shHidActs[threadIdx.y][threadIdx.x * imgsPerThread]; //const bool noFLoop = filterCacheF == filterCacheH; /* * Initial preload */ - float hPreload[filterCacheH/B_Y][imgsPerThread]; // [2][4] - float wPreload[filterCacheF*colorsPerThread/B_X]; // [8] + float hPreload[filterCacheH/B_Y * imgsPerThread]; // [2][4] + float wPreload[filterCacheF*colorsPerThread/B_X/2]; // [4] int moduleIdx, pxIdxInFilter; conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(startY, startX, numModulesX, paddingStart, moduleStride, blockPixelIdxY, @@ -812,145 +1042,106 @@ conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObje // const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + 0] // : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + 0]; int filtersLoadOffset = filtersOffset + (conv ? 
pxIdxInFilter * numFilters + 0 - : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters); + : moduleIdx * numFilterColors * filterPxlAll + pxIdxInFilter * numFilters); + // preload the filter's pixel #pragma unroll - for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { - if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { - wPreload[i * filterCacheF/(B_X*B_Y)] = tex1Dfetch(filters, filtersLoadOffset + i * filterPixels * numFilters); - } + for (int i = 0; i < 4; i++) { + // discarding redundant conditions + wPreload[i] = tex1Dfetch(filters, filtersLoadOffset + i * 16 * filterPxlAll); } // const float* hLoad = &hidActs[(moduleIdx + 0 * numModules) * numImages]; - int hidActsLoadOffset = hidActsOffset + (moduleIdx + 0 * numModules) * numImages; + int hidActsLoadOffset = hidActsOffset + moduleIdx * numImages; + + // preload the hidAct's pixel #pragma unroll - for (int j = 0; j < filterCacheH; j += B_Y) { - if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { - #pragma unroll - for (int i = 0; i < imgsPerThread; i++) { - if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { - hPreload[j/B_Y][i] = tex1Dfetch(hidActs, hidActsLoadOffset + j * numModules * numImages + i * B_X); - } - } - } + for (int x = 0; x < 4; x++) { + hPreload[x] = tex1Dfetch(hidActs, hidActsLoadOffset + x * 32); + hPreload[x+4] = tex1Dfetch(hidActs, hidActsLoadOffset + 8 * numImgMods + x * 32); } for (int my = startY; my < endY; my++) { - const int moduleTop = paddingStart + my * moduleStride; - const int pxInFilterY = blockPixelIdxY - moduleTop; for (int mx = startX; mx < endX; mx++) { - moduleIdx = my * numModulesX + mx; - const int moduleLeft = paddingStart + mx * moduleStride; - const int pxInFilterX = blockPixelIdxX - moduleLeft; + conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(my+(mx+1 == endX), (mx+1 == endX)?startX:(mx+1), numModulesX, paddingStart, moduleStride, blockPixelIdxY, blockPixelIdxX, filterSize, moduleIdx, pxIdxInFilter); - pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX; - int myNext = my, mxNext = mx, moduleIdxNext, pxIdxInFilterNext; - const bool lastModule = my == endY - 1 && mx == endX - 1; - if (!lastModule) { - mxNext = mx + 1 == endX ? startX : mx + 1; - myNext = my + (mx + 1 == endX); - } - conv_img_acts_manycolor_preload_ty_8_tx_32_c_8_ff_32_fh_16_setCoords(myNext, mxNext, numModulesX, paddingStart, moduleStride, blockPixelIdxY, - blockPixelIdxX, filterSize, moduleIdxNext, pxIdxInFilterNext); for (int f = 0; f < numFiltersPerGroup; f += filterCacheF) { // multiply with filterCacheF filters at a time - #pragma unroll - for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/filterCacheF) { - if ((colorsPerThread*B_Y) % (B_X*B_Y/filterCacheF) == 0 || i + filtersLoadY < colorsPerThread*B_Y) { - shFilterLoad[i * filterCacheF] = wPreload[i * filterCacheF/(B_X*B_Y)]; - } - } - - filtersLoadOffset = filtersOffset + (conv ? pxIdxInFilter * numFilters + f + filterCacheF - : moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f + filterCacheF); - if (f == numFiltersPerGroup - filterCacheF) { - filtersLoadOffset = filtersOffset + (conv ? 
pxIdxInFilterNext * numFilters - : moduleIdxNext * numFilterColors * filterPixels * numFilters + pxIdxInFilterNext * numFilters); - } - - #pragma unroll - for (int j = 0; j < filterCacheH; j += B_Y) { - if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) { - #pragma unroll - for (int i = 0; i < imgsPerThread; i++) { - // NOTE: bank conflicts here! - if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { - shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i]; - } - } + // As 'filterCacheF' is two times of 'filterCacheH', so divide the procedure into two steps, which loads less data during each step + // with this division, program would not stall at memory access, and get higher gpu occupancy + for (int k = 0; k < filterCacheF / filterCacheH; k++) { + // store the preloaded filters's pixel into shared memory with 64 bit width + shFilters[(ty * B_X + tx) % 16][(ty * B_X + tx) / 16].x = wPreload[0]; + shFilters[(ty * B_X + tx) % 16][(ty * B_X + tx) / 16].y = wPreload[2]; + shFilters[(ty * B_X + tx) % 16][(ty * B_X + tx) / 16 + 16].x = wPreload[1]; + shFilters[(ty * B_X + tx) % 16][(ty * B_X + tx) / 16 + 16].y = wPreload[3]; + + // store the preloaded hidacts' pixel into shared memory with 64 bit width + shHidActs[ty][tx].x = hPreload[0]; + shHidActs[ty][tx].y = hPreload[1]; + shHidActs[ty][tx + B_X].x = hPreload[2]; + shHidActs[ty][tx + B_X].y = hPreload[3]; + shHidActs[ty + filterCacheH/2][tx].x = hPreload[4]; + shHidActs[ty + filterCacheH/2][tx].y = hPreload[5]; + shHidActs[ty + filterCacheH/2][tx + B_X].x = hPreload[6]; + shHidActs[ty + filterCacheH/2][tx + B_X].y = hPreload[7]; + + // update to the 2nd 'filterCacheH' + filtersLoadOffset += filterCacheH; + hidActsLoadOffset += filterCacheH * numImgMods; + + // move to the next pixel + if ((k == 1) && (f == numFiltersPerGroup - filterCacheF)) { + filtersLoadOffset = filtersOffset + (conv ? 
pxIdxInFilter * numFilters + : moduleIdx * numFilterColors * filterPxlAll + pxIdxInFilter * numFilters); + hidActsLoadOffset = hidActsOffset + moduleIdx * numImages; + } + + // preload the filter's pixel + #pragma unroll 4 + for (int x = 0; x < 4; x++) { + wPreload[x] = tex1Dfetch(filters, filtersLoadOffset + x * 16 * filterPxlAll); } - } - - __syncthreads(); - - hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheH) * numModules) * numImages; - - #pragma unroll - for (int z = 0; z < 4; ++z) { - IA_PRELOAD_LOOP(z,0); - IA_PRELOAD_W_TX(z); - } - - #pragma unroll - for (int z = 4; z < 12; ++z) { - IA_PRELOAD_LOOP(z,0); - IA_PRELOAD_H_TX((z-4)/4,z%4); - } - #pragma unroll - for (int z = 12; z < 16; ++z) { - IA_PRELOAD_LOOP(z,0); - } + __syncthreads(); - __syncthreads(); - #pragma unroll - for (int j = 0; j < filterCacheH; j += B_Y) { - if (filterCacheH % B_Y == 0 || threadIdx.y + j < filterCacheH) {
+ // group instructions of the same type to improve instruction-level parallelism
+ // calculate the derivative of the hidAct with respect to the image
+ #pragma unroll
+ for (int w = 0; w < filterCacheH; w++) { #pragma unroll - for (int i = 0; i < imgsPerThread; i++) { - if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { - shHidActLoad[j * B_X * imgsPerThread + i] = hPreload[j/B_Y][i]; - } - }
+ for (int c = 0; c < colorsPerThread / 2; c++) {
+ prod[c][0] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx].x;
+ prod[c][1] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx].y;
+ prod[c][2] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx+B_X].x;
+ prod[c][3] += shFilters[w][c * B_Y + ty].x * shHidActs[w][tx+B_X].y;
+ prod[c + colorsPerThread / 2][0] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx].x;
+ prod[c + colorsPerThread / 2][1] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx].y;
+ prod[c + colorsPerThread / 2][2] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx+B_X].x;
+ prod[c + colorsPerThread / 2][3] += shFilters[w][c * B_Y + ty].y * shHidActs[w][tx+B_X].y;
+ }
+ }
+
+ // preload the hidAct's pixel
+ #pragma unroll
+ for (int x = 0; x < 4; x++) {
+ hPreload[x] = tex1Dfetch(hidActs, hidActsLoadOffset + x * B_X);
+ hPreload[x+4] = tex1Dfetch(hidActs, hidActsLoadOffset + 8 * numImgMods + x * B_X); } - } - - __syncthreads(); - - hidActsLoadOffset = hidActsOffset + (moduleIdx + (f + filterCacheF) * numModules) * numImages; - if (f == numFiltersPerGroup - filterCacheF) { - hidActsLoadOffset = hidActsOffset + moduleIdxNext * numImages; - } - - #pragma unroll - for (int z = 0; z < 4; ++z) { - IA_PRELOAD_LOOP(z,filterCacheH); - IA_PRELOAD_W_TX(z+4); - } - #pragma unroll - for (int z = 4; z < 12; ++z) { - IA_PRELOAD_LOOP(z,filterCacheH); - IA_PRELOAD_H_TX((z-4)/4, z%4); - } - - #pragma unroll - for (int z = 12; z < 16; ++z) { - IA_PRELOAD_LOOP(z,filterCacheH); + __syncthreads(); } - - __syncthreads(); } } } + + // remove redundant conditions to save registers if (scale) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { - if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { - targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i]; - } + targets[c * B_Y * imgSizeY * imgSizeX * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgSizeY * imgSizeX * numImages + i * B_X] + scaleOutputs * prod[c][i]; } } } else { @@ -958,9 +1149,7 @@ 
conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex(cudaTextureObje for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int i = 0; i < imgsPerThread; i++) { - if (!checkCaseBounds || myCaseIdx + i * B_X < numImages) { - targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i]; - } + targets[c * B_Y * imgSizeY * imgSizeX * numImages + i * B_X] = scaleOutputs * prod[c][i]; } } } @@ -1286,6 +1475,8 @@ void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, // printf("numImages: %d\n", numImages); // cudaStream_t stream = NVMatrix::getDefaultStream(); + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); + if (conv == true) { if (scale == false) { if (checkCaseBounds == false) { @@ -1297,12 +1488,12 @@ void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, false, false, true ><<>>(hidActs.getTextureObject(), filters.getTextureObject(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { - cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); - conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, false, false, true ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 2, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 2, 8, 32, 16, false, false, true ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { - cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); - conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 1, 8, 32, 16, false, false, true ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, false, false, true >, cudaFuncCachePreferShared); @@ -1643,12 +1834,12 @@ void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, conv_img_acts_manycolor_preloadfh_ty_8_tx_32_c_8_ff_32_fh_16_tex< 8, 32, 4, 8, 32, 16, true, false, true ><<>>(hidActs.getTextureObject(), 
filters.getTextureObject(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 64 == 0) { - cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); - conv_img_acts_manycolor_kepler < 8, 32, 2, 8, 32, 16, true, false, true ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 2, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 2, 8, 32, 16, true, false, true ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 32 == 0) { - cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); - conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); + cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); + conv_img_acts_manycolor_kepler_f_vec_h_vec_ff_32_fh_16 < 8, 32, 1, 8, 32, 16, true, false, true ><<>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput); } else if (numImages % 16 == 0) { cudaFuncSetCacheConfig(conv_img_acts_manycolor_kepler < 8, 32, 1, 8, 32, 16, true, false, true >, cudaFuncCachePreferShared); @@ -2675,6 +2866,7 @@ void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, } } + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); getLastCudaError("imgActs: kernel execution failed"); } diff --git a/cudaconv3/src/weight_acts.cu b/cudaconv3/src/weight_acts.cu index 20d7a58..09f7a90 100644 --- a/cudaconv3/src/weight_acts.cu +++ b/cudaconv3/src/weight_acts.cu @@ -1367,6 +1367,17 @@ __global__ void conv_weight_acts_c_preload_pc_2_pt_4_f_3_r_32_c_3(cudaTextureObj } } +/*****************************Function Revision Record***************************** + * Author: Tencent BestImage Team(ankerguo@tencent.com) * + * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. 
* + *********************************************************************************/ /* * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given @@ -1383,10 +1394,12 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu const int paddingStart, const int moduleStride, const int imgStride, const int numImgColors, const int numGroups, const int sumWidth, const float scaleTargets, const float scaleOutputs) { - __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases - __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts + // avoid bank conflict by reorganizing the data structure, and improve the band width by using 'float2' instead of 'float' + __shared__ float2 shImages[preloadCases][colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases + __shared__ float2 shHidActs[preloadCases][filtersPerThread * B_X / 2 + 2]; // preload preloadCases cases of B_X hidacts - const int tidx = B_X * threadIdx.y + threadIdx.x; + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + const int tidx = B_X * ty + tx; const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; const int filterPixels = filterSize * filterSize; @@ -1428,24 +1441,25 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu // + loadX; targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors - + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + (blockFilterColorIdx + ty) * filterPixels * numFilters + blockPixelOffset * numFilters + blockFilterIdx - + threadIdx.x; -// if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + tx; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); -// if (mStartY == mEndY || mStartX == mEndX) { -// return; -// } -// const bool doWork = mStartY < mEndY && mStartX < mEndX; + // if (mStartY == mEndY || mStartX == mEndX) { + // return; + // } + const bool doWork = mStartY < mEndY && mStartX < mEndX; - float* shHidActLoad = &shHidActs[loadY][loadX]; - float* shImgLoad = &shImages[loadY][loadX]; + // reduce 2 registers + //float* shHidActLoad = &shHidActs[loadY][loadX]; + //float* shImgLoad = &shImages[loadY][loadX]; float imPreload[preloadCases*colorsPerThread/B_X]; // [8] float haPreload[preloadCases*filtersPerThread/B_Y]; // [8] @@ -1466,18 +1480,23 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu blockPixelY, blockPixelX, imgSizeX, imgStride, pixIdx, m); + if (doWork) { #pragma unroll - for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { - // It's bizarre, but this is the fastest way I've found to get it not to load nonexistent pixels. - // All other ways cause crazy excessive register usage. 
- const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * imgPixels * imgStride + pixIdx); - imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch(images, imgOffset + idx); + for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { + // It's bizarre, but this is the fastest way I've found to get it not to load nonexistent pixels. + // All other ways cause crazy excessive register usage. + const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * imgPixels * imgStride + pixIdx); + imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch(images, imgOffset + idx); + } } - #pragma unroll - for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { - // Almost certainly not necessary here. - const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * numImages * numModules + m * numImages); - haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch(hidActs, hidActsOffset + idx); + + if (doWork) { + #pragma unroll + for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { + // Almost certainly not necessary here. + const int idx = (mStartY < mEndY && mStartX < mEndX) * (0 + y * numImages * numModules + m * numImages); + haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch(hidActs, hidActsOffset + idx); + } } @@ -1497,13 +1516,14 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu pixIdxNext, mNext); for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { - + // store the preloaded image's pixel into shared memory #pragma unroll - for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { - shImgLoad[(y) * preloadCases] = imPreload[y * preloadCases / (B_X * B_Y)];
+ for (int y = 0; y < 4; y++) {
+ shImages[loadX][loadY+y*8].x = imPreload[y];
+ shImages[loadX][loadY+y*8].y = imPreload[y+4];
}
-// const float* im = &images[caseIdx + preloadCases + pixIdx];
-// const float* ha = &hidActs[caseIdx + preloadCases + m * numImages];
+ //const float* im = &images[caseIdx + preloadCases + pixIdx];
+ //const float* ha = &hidActs[caseIdx + preloadCases + m * numImages];
int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages; if (caseIdx + preloadCases == numImages) { @@ -1512,24 +1532,40 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_16_f_4_c_8_r_16(cu imgOffset2 = imgOffset + pixIdxNext; hidActsOffset2 = hidActsOffset + mNext * numImages; }
+
+ // store the preloaded hidActs' pixels into shared memory
+ shHidActs[loadX][loadY].x = haPreload[0];
+ shHidActs[loadX][loadY].y = haPreload[2];
+ shHidActs[loadX][loadY+16].x = haPreload[4];
+ shHidActs[loadX][loadY+16].y = haPreload[6];
+ shHidActs[loadX][loadY+8].x = haPreload[1];
+ shHidActs[loadX][loadY+8].y = haPreload[3];
+ shHidActs[loadX][loadY+24].x = haPreload[5];
+ shHidActs[loadX][loadY+24].y = haPreload[7];
+
+ // preload the image's and hidAct's pixels
#pragma unroll - for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { - shHidActLoad[y * (preloadCases + 1)] = haPreload[y * preloadCases / (B_X * B_Y)];
+ for (int r = 0; r < 8; r++) {
+ imPreload[r] = tex1Dfetch(images, imgOffset2 + (r) * 8 * imgPixels * imgStride);
+ haPreload[r] = tex1Dfetch(hidActs, hidActsOffset2 + (r) * 8 * numImages * numModules);
}
__syncthreads();
- + // group instructions of the same type to improve instruction-level parallelism
#pragma unroll - for (int z = 0; z < 8; ++z) { - WA_IMLOAD_TX(z); - WA_LOOP2(z); + for (int r = 0; r < 
16; r++) { + for (int c = 0; c < 4; c++) { + prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].x; + prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx].y; + prod[2][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].x; + prod[3][c] += shImages[r][ty + c * B_Y].x * shHidActs[(r)][tx + B_X].y; + prod[0][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].x; + prod[1][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx].y; + prod[2][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].x; + prod[3][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[(r)][tx + B_X].y; + } } - #pragma unroll - for (int z = 0; z < 8; ++z) { - WA_HALOAD_TX(z); - WA_LOOP2(z+8); - } __syncthreads(); } } @@ -1789,6 +1825,18 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_6_r_32(cu } } +/*****************************Function Revision Record***************************** + * Author: Tencent BestImage Team(ankerguo@tencent.com) * + * Date: 2015-05-18 * + * Reason: Optimizing kernel to get faster speed according to GPU features * + * Method: * + * 1. reorganizing data structure to avoid bank conflict; * + * 2. using vectorized data type; * + * 3. improving instruction-level parallelism; * + * 4. removing redundant 'if' branches; * + * 5. removing local variables to save registers. * + *********************************************************************************/ + /* * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given * hidActs: (numFilters, numModulesY, numModulesX, numImages) @@ -1804,20 +1852,24 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu const int paddingStart, const int moduleStride, const int imgStride, const int numImgColors, const int numGroups, const int sumWidth, const float scaleTargets, const float scaleOutputs) { - __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases - __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts - - const int tidx = B_X * threadIdx.y + threadIdx.x; - const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; - - const int filterPixels = filterSize * filterSize; + // avoid bank conflict by re-organizing the data structure, and improve band width by using 'float2' instead of 'float' + __shared__ float2 shImages[preloadCases][colorsPerThread * B_Y / 2 + 2]; // preload preloadCases cases + __shared__ float2 shHidActs[preloadCases][filtersPerThread * B_X / 2 + 2]; // preload preloadCases cases of B_X hidacts + const int tx = threadIdx.x % B_X, ty = threadIdx.y % B_Y; + //const int tidx = B_X * threadIdx.y + threadIdx.x; + // reduce two registers + //const int loadY = tidx / preloadCases, loadX = tidx % preloadCases; + + //const int filterPixels = filterSize * filterSize; + // reduce one register + const int filterPixelsAll = numFilters * filterSize * filterSize; const int imgPixels = imgSizeY * imgSizeX; const int numFilterBlocks = numFilters / (B_X * filtersPerThread); const int blockModuleChunkIdx = blockIdx.x / numFilterBlocks; const int numModuleChunksX = DIVUP(numModulesX, sumWidth); -// const int numModuleChunksY = DIVUP(numModulesY, sumWidth); + // const int numModuleChunksY = DIVUP(numModulesY, sumWidth); const int blockModuleChunkX = blockModuleChunkIdx % numModuleChunksX; const int blockModuleChunkY = blockModuleChunkIdx / numModuleChunksX; @@ -1825,7 +1877,7 @@ __global__ void 
conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu const int blockModuleStartX = blockModuleChunkX * sumWidth; const int blockModuleStartY = blockModuleChunkY * sumWidth; -// const int moduleIdx = partialSum * outputModuleIdx; + // const int moduleIdx = partialSum * outputModuleIdx; const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks); const int numModules = numModulesY * numModulesX; @@ -1837,33 +1889,37 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu const int blockPixelY = blockPixelOffset / filterSize, blockPixelX = blockPixelOffset % filterSize; const int blockFilterColorIdx = blockIdx.y * B_Y * colorsPerThread; const int imgColorIdx = blockFilterColorIdx + blockGroupIdx * numFilterColors; - const int imgOffset = (imgColorIdx + loadY) * imgPixels * imgStride + loadX; -// images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; + const int imgOffset = (imgColorIdx + (ty * B_X + tx) / preloadCases) * imgPixels * imgStride + (ty * B_X + tx) % preloadCases; + // images += (imgColorIdx + loadY) * imgPixels * imgStride + loadX; const int hidActsOffset = blockFilterIdx * numImages * numModules - + loadY * numImages * numModules - + loadX; -// -// hidActs += -// blockFilterIdx * numImages * numModules -// + loadY * numImages * numModules -// + loadX; - - targets += blockModuleChunkIdx * numFilters * filterPixels * numFilterColors - + (blockFilterColorIdx + threadIdx.y) * filterPixels * numFilters + + ((ty * B_X + tx) / preloadCases) * numImages * numModules + + ((ty * B_X + tx) % preloadCases); + // + // hidActs += + // blockFilterIdx * numImages * numModules + // + loadY * numImages * numModules + // + loadX; + + // use one temporary register instead of multiple registers + const int pIdxBase = imgStride * ((paddingStart + blockPixelY) * imgSizeX + paddingStart + blockPixelX); + + targets += blockModuleChunkIdx * numFilters * filterSize * filterSize * numFilterColors + + (blockFilterColorIdx + ty) * filterSize * filterSize * numFilters + blockPixelOffset * numFilters + blockFilterIdx - + threadIdx.x; -// if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; + + tx; + // if (blockIdx.x != 0 || blockIdx.y != 0 || blockIdx.z != 0) return; const int mStartX = max(blockModuleStartX, DIVUP(-blockPixelX - paddingStart, moduleStride)); const int mStartY = max(blockModuleStartY, DIVUP(-blockPixelY - paddingStart, moduleStride)); const int mEndX = min(numModulesX, min(blockModuleStartX + sumWidth, DIVUP(imgSizeX - blockPixelX - paddingStart, moduleStride))); const int mEndY = min(numModulesY, min(blockModuleStartY + sumWidth, DIVUP(imgSizeY - blockPixelY - paddingStart, moduleStride))); + // reduce 3 registers const bool doWork = mStartY < mEndY && mStartX < mEndX; - float* shHidActLoad = &shHidActs[loadY][loadX]; - float* shImgLoad = &shImages[loadY][loadX]; + //float* shHidActLoad = &shHidActs[loadY][loadX]; + //float* shImgLoad = &shImages[loadY][loadX]; float imPreload[preloadCases*colorsPerThread/B_X]; // [4] float haPreload[preloadCases*filtersPerThread/B_Y]; // [8] @@ -1877,115 +1933,105 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu prod[f][c] = 0; } } - int pixIdx, pixIdxNext, m, mNext; - - conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( - mStartY, mStartX, paddingStart, numModulesX, moduleStride, - blockPixelY, blockPixelX, imgSizeX, imgStride, - pixIdx, m); - - if (doWork && loadY < B_Y * colorsPerThread) { + //int pixIdx, 
pixIdxNext, m, mNext; + + //conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( + // mStartY, mStartX, paddingStart, numModulesX, moduleStride, + // blockPixelY, blockPixelX, imgSizeX, imgStride, + // pixIdx, m); + + const int pixIdx = pIdxBase + (mStartY * imgSizeX + mStartX) * moduleStride * imgStride; + const int m = (mStartY * numModulesX + mStartX); + + // preload the image's pixel + if (doWork && (ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) { #pragma unroll - for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { - imPreload[y * preloadCases/(B_X * B_Y)] = tex1Dfetch(images, imgOffset + y * imgPixels * imgStride + pixIdx); + for (int i = 0; i < 4; i++) { + imPreload[i] = tex1Dfetch(images, imgOffset + 16 * i * imgPixels * imgStride + pixIdx); } } - if (doWork && loadY < B_X * filtersPerThread) { + // preload the hidAct's pixel + if (doWork && (ty * B_X + tx) / preloadCases < (B_X * filtersPerThread) / 8) { #pragma unroll - for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { - haPreload[y * preloadCases / (B_X * B_Y)] = tex1Dfetch(hidActs, hidActsOffset + y * numImages * numModules + m * numImages); + for (int i = 0; i < 8; i++) { + haPreload[i] = tex1Dfetch(hidActs, hidActsOffset + 16 * i * numImages * numModules + m * numImages); } } for (int my = mStartY; my < mEndY; my++) { for (int mx = mStartX; mx < mEndX; mx++) { - int myNext = my, mxNext = mx; - const bool lastModule = my == mEndY - 1 && mx == mEndX - 1; - - if (!lastModule) { - mxNext = mx + 1 == mEndX ? mStartX : mx + 1; - myNext = my + (mx + 1 == mEndX); - } - - conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16_setCoords( - myNext, mxNext, paddingStart, numModulesX, moduleStride, - blockPixelY, blockPixelX, imgSizeX, imgStride, - pixIdxNext, mNext); for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) { - -// const float* im = &images[caseIdx + preloadCases + pixIdx]; - int imgOffset2 = imgOffset + caseIdx + preloadCases + pixIdx; - int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + m * numImages; -// const float* ha = &hidActs[caseIdx + preloadCases + m * numImages]; + int imgOffset2 = imgOffset + caseIdx + preloadCases + pIdxBase + (my * imgSizeX + mx) * moduleStride * imgStride; + int hidActsOffset2 = hidActsOffset + caseIdx + preloadCases + (my * numModulesX + mx) * numImages; if (caseIdx + preloadCases == numImages) { - pixIdx = pixIdxNext; - m = mNext; -// im = &images[pixIdxNext]; - imgOffset2 = imgOffset + pixIdxNext; - hidActsOffset2 = hidActsOffset + mNext * numImages; + const int mxNext = mx + 1 == mEndX ? 
mStartX : mx + 1; + const int myNext = my + (mx + 1 == mEndX); + + imgOffset2 = imgOffset + + pIdxBase + (myNext * imgSizeX + mxNext) * moduleStride * imgStride; + hidActsOffset2 = hidActsOffset + (myNext * numModulesX + mxNext) * numImages; + } + + if ((ty * B_X + tx) / preloadCases < (B_Y * colorsPerThread / 4)) {
+ // store the previously preloaded pixel into shared memory
+ shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].x = imPreload[0];
+ shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].y = imPreload[2];
+ shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].x = imPreload[1];
+ shImages[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].y = imPreload[3];
+ }
+
+ if ((ty * B_X + tx) / preloadCases < (B_X * filtersPerThread / 8)) {
+ shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].x = haPreload[0];
+ shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases].y = haPreload[2];
+ shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 32].x = haPreload[4];
+ shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 32].y = haPreload[6];
+ shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].x = haPreload[1];
+ shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 16].y = haPreload[3];
+ shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 48].x = haPreload[5];
+ shHidActs[(ty * B_X + tx) % preloadCases][(ty * B_X + tx) / preloadCases + 48].y = haPreload[7];
+ }
-// ha = &hidActs[mNext * numImages]; - } - - if (loadY < B_Y * colorsPerThread) { - #pragma unroll - for (int y = 0; y < B_Y * colorsPerThread; y += (B_X * B_Y) / preloadCases) { - shImgLoad[(y) * preloadCases] = imPreload[y * preloadCases / (B_X * B_Y)]; - }
+ #pragma unroll
+ for (int r = 0; r < 8; r++) {
+ haPreload[r] = tex1Dfetch(hidActs, hidActsOffset2 + r * 16 * numImages * numModules);
}
- if (loadY < B_X * filtersPerThread) { - #pragma unroll - for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) { - shHidActLoad[y * (preloadCases + 1)] = haPreload[y * preloadCases / (B_X * B_Y)]; - }
+ #pragma unroll
+ for (int r = 0; r < 4; r++) {
+ imPreload[r] = tex1Dfetch(images, imgOffset2 + r * 16 * imgPixels * imgStride);
}
- __syncthreads(); - WA_LOOP(0); - WA_IMLOAD_TX(0); - WA_LOOP(1); - WA_IMLOAD_TX(1); - WA_LOOP(2); - WA_IMLOAD_TX(2); - WA_LOOP(3); - WA_IMLOAD_TX(3); - WA_LOOP(4); - WA_HALOAD_TX(0); - WA_LOOP(5); - WA_HALOAD_TX(1); - WA_LOOP(6); - WA_HALOAD_TX(2); - WA_LOOP(7); - WA_HALOAD_TX(3); - WA_LOOP(8); - WA_HALOAD_TX(4); - WA_LOOP(9); - WA_HALOAD_TX(5); - WA_LOOP(10); - WA_HALOAD_TX(6); - WA_LOOP(11); - WA_HALOAD_TX(7); - WA_LOOP(12); - WA_LOOP(13); - WA_LOOP(14); - WA_LOOP(15);
+ // group instructions of the same type to improve instruction-level parallelism
+ // calculate the derivative of the hidAct with respect to the weights
+ #pragma unroll
+ for (int r = 0; r < 16; r++) {
+ #pragma unroll
+ for (int c = 0; c < 4; c++) {
+ prod[0][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].x;
+ prod[1][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx].y;
+ prod[2][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].x;
+ prod[3][c] += shImages[r][ty + c * B_Y].x * shHidActs[r][tx + B_X].y;
+ prod[0][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].x;
+ prod[1][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx].y;
+ prod[2][c+4] += 
shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].x; + prod[3][c+4] += shImages[r][ty + c * B_Y].y * shHidActs[r][tx + B_X].y; + } + } __syncthreads(); - } - } - } + } + } + } if (scale) { #pragma unroll for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { - targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * B_Y * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[f][c]; + targets[c * B_Y * filterPixelsAll + f * B_X] = scaleTargets * targets[c * B_Y * filterPixelsAll + f * B_X] + scaleOutputs * prod[f][c]; } } } else { @@ -1993,7 +2039,7 @@ __global__ void conv_weight_acts_mc_mf_kepler_preload_ty_8_tx_32_f_4_c_8_r_16(cu for (int c = 0; c < colorsPerThread; c++) { #pragma unroll for (int f = 0; f < filtersPerThread; f++) { - targets[c * B_Y * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[f][c]; + targets[c * B_Y * filterPixelsAll + f * B_X] = scaleOutputs * prod[f][c]; } } } @@ -2127,6 +2173,8 @@ void _weightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, assert(targets.getNumCols() == targetSize.second); } cudaStream_t stream = NVMatrix::getDefaultStream(); + + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); if (scale == false) { if (checkCaseBounds == false) { @@ -2660,7 +2708,7 @@ void _weightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, } } } - + checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); getLastCudaError("weightActs: kernel execution failed"); }
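
For reference, below is a minimal, self-contained sketch (not taken from this patch; the kernel name shiftAdd, the TILE constant, and the access pattern are made up for illustration) of the shared-memory idea the rewritten kernels lean on: widening shared memory to float2 so every access is a 64-bit word, and bracketing the launches with cudaDeviceSetSharedMemConfig() so the eight-byte bank mode is active while the float2 kernels run, the same bracketing now done in _imgActs() and _weightActs():

#include <cuda_runtime.h>
#include <cstdio>

#define TILE 32

// Each thread stores one float2 (a 64-bit word) into shared memory and reads its
// neighbour's word back.  With the eight-byte bank mode enabled, the 32 consecutive
// float2 words touched by a warp map to 32 distinct banks, so both the store and
// the load are conflict-free.
__global__ void shiftAdd(const float2* __restrict__ in, float* __restrict__ out) {
    __shared__ float2 tile[TILE][TILE];

    const int tx = threadIdx.x, ty = threadIdx.y;
    tile[ty][tx] = in[ty * TILE + tx];
    __syncthreads();

    const float2 right = tile[ty][(tx + 1) % TILE];
    out[ty * TILE + tx] = tile[ty][tx].x + tile[ty][tx].y + right.x + right.y;
}

int main() {
    // eight-byte banks while the float2 kernel runs, four-byte banks afterwards
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    float2* dIn;
    float*  dOut;
    cudaMalloc(&dIn,  TILE * TILE * sizeof(float2));
    cudaMalloc(&dOut, TILE * TILE * sizeof(float));
    cudaMemset(dIn, 0, TILE * TILE * sizeof(float2));

    shiftAdd<<<1, dim3(TILE, TILE)>>>(dIn, dOut);
    cudaDeviceSynchronize();

    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    printf("shiftAdd: %s\n", cudaGetErrorString(cudaGetLastError()));

    cudaFree(dIn);
    cudaFree(dOut);
    return 0;
}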
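
A stripped-down sketch of the register-preload pipeline used in the rewritten inner loops may also help: the next tile is fetched into registers while the tile already sitting in shared memory is consumed, and the multiply-adds are grouped so independent instructions can issue back to back. Plain global loads stand in for the tex1Dfetch() calls of the real kernels; the kernel name pipelinedDot and all parameters are illustrative, not taken from the patch:

#include <cuda_runtime.h>

__global__ void pipelinedDot(const float* __restrict__ a,
                             const float* __restrict__ b,
                             float* __restrict__ out, int nTiles) {
    __shared__ float shA[128];
    const int t = threadIdx.x;            // launched with blockDim.x == 128
    float pre = a[t];                     // preload tile 0 into a register
    float acc = 0.f;

    for (int tile = 0; tile < nTiles; tile++) {
        shA[t] = pre;                     // publish the value preloaded last iteration
        __syncthreads();

        // issue the next tile's load early so it overlaps the arithmetic below
        if (tile + 1 < nTiles) {
            pre = a[(tile + 1) * 128 + t];
        }

        // grouped, independent multiply-adds: no load sits between them
        #pragma unroll
        for (int k = 0; k < 128; k += 32) {
            acc += shA[(t + k) & 127] * b[tile * 128 + t];
        }
        __syncthreads();
    }
    out[t] = acc;
}

int main() {
    const int nTiles = 4;
    float *dA, *dB, *dOut;
    cudaMalloc(&dA, nTiles * 128 * sizeof(float));
    cudaMalloc(&dB, nTiles * 128 * sizeof(float));
    cudaMalloc(&dOut, 128 * sizeof(float));
    cudaMemset(dA, 0, nTiles * 128 * sizeof(float));
    cudaMemset(dB, 0, nTiles * 128 * sizeof(float));

    pipelinedDot<<<1, 128>>>(dA, dB, dOut, nTiles);
    cudaDeviceSynchronize();

    cudaFree(dA); cudaFree(dB); cudaFree(dOut);
    return 0;
}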