From ba6bf014a22c6e1ac284606fb61e0972150cbf6a Mon Sep 17 00:00:00 2001 From: Paco Zamora Martinez Date: Sun, 20 Jan 2013 16:59:09 +0100 Subject: [PATCH] Added parameter transpose to dot product action, which makes possible to do BP computation using the transposed of a weights matrix --- AUTHORS.txt | 6 + .../ann/ann_base/binding/bind_ann_base.lua.cc | 16 +- .../ann/ann_base/c_src/all_all_connection.cc | 3 +- .../ann/ann_base/c_src/all_all_connection.h | 1 - .../ann/ann_base/c_src/bias_connection.cc | 14 +- packages/ann/ann_base/c_src/connection.cc | 4 +- packages/ann/ann_base/c_src/connection.h | 10 +- .../ann/ann_base/c_src/dot_product_action.cc | 187 ++-- .../ann/ann_base/c_src/dot_product_action.h | 4 +- packages/ann/mlp/binding/bind_mlp.lua.cc | 10 + packages/ann/mlp/c_src/all_all_mlp.cc | 10 +- packages/ann/mlp/c_src/mlp.cc | 12 +- packages/basics/math/c_src/wrapper.cu | 968 +++++++++--------- 13 files changed, 680 insertions(+), 565 deletions(-) create mode 100644 AUTHORS.txt diff --git a/AUTHORS.txt b/AUTHORS.txt new file mode 100644 index 000000000..4b7e7ddf4 --- /dev/null +++ b/AUTHORS.txt @@ -0,0 +1,6 @@ +In this project has been worked: + - Salvador España Boquera + - Jorge Gorbe Moya + - Adrián Palacios Corella + - Joan Pastor Pellicer + - Francisco Zamora Martínez diff --git a/packages/ann/ann_base/binding/bind_ann_base.lua.cc b/packages/ann/ann_base/binding/bind_ann_base.lua.cc index 278300b0d..9ea6f318a 100644 --- a/packages/ann/ann_base/binding/bind_ann_base.lua.cc +++ b/packages/ann/ann_base/binding/bind_ann_base.lua.cc @@ -107,6 +107,8 @@ using namespace Functions; obj = new RealActivationUnits(size, ann->getConfReference(), strcmp(type, "inputs") != 0); ann->registerActivationUnits(obj); + if (strcmp(type, "inputs") == 0) ann->registerInput(obj); + else if (strcmp(type, "outputs") == 0) ann->registerOutput(obj); LUABIND_RETURN(RealActivationUnits, obj); } //BIND_END @@ -242,6 +244,7 @@ using namespace Functions; obj = new ForwardBiasAction(ann->getConfReference(), output, conn); + ann->registerAction(obj); LUABIND_RETURN(ForwardBiasAction, obj); } //BIND_END @@ -255,20 +258,25 @@ using namespace Functions; //BIND_CONSTRUCTOR DotProductAction { LUABIND_CHECK_ARGN(==,1); - check_table_fields(L, 1, "ann", "input", "output", "connections", 0); + check_table_fields(L, 1, "ann", "input", "output", "connections", + "transpose", 0); ActivationUnits *input; ActivationUnits *output; Connections *conn; ANNBase *ann; + bool transpose; LUABIND_GET_TABLE_PARAMETER(1, input, ActivationUnits, input); LUABIND_GET_TABLE_PARAMETER(1, output, ActivationUnits, output); LUABIND_GET_TABLE_PARAMETER(1, connections, Connections, conn); LUABIND_GET_TABLE_PARAMETER(1, ann, ANNBase, ann); + LUABIND_GET_TABLE_OPTIONAL_PARAMETER(1, transpose, bool, transpose, false); obj = new DotProductAction(ann->getConfReference(), - input, output, conn); + input, output, conn, + transpose); + ann->registerAction(obj); LUABIND_RETURN(DotProductAction, obj); } //BIND_END @@ -294,6 +302,7 @@ using namespace Functions; obj = new ActivationsAction(ann->getConfReference(), output, actfunc); + ann->registerAction(obj); LUABIND_RETURN(ActivationsAction, obj); } //BIND_END @@ -386,7 +395,8 @@ using namespace Functions; LUABIND_GET_TABLE_PARAMETER(1, w, MatrixFloat, w); LUABIND_GET_TABLE_OPTIONAL_PARAMETER(1, oldw, MatrixFloat, oldw, w); LUABIND_GET_TABLE_OPTIONAL_PARAMETER(1, first_pos, uint, first_pos, 0); - LUABIND_GET_TABLE_OPTIONAL_PARAMETER(1, column_size, uint, column_size, 1); + LUABIND_GET_TABLE_OPTIONAL_PARAMETER(1, column_size, uint, column_size, + obj->getNumInputs()); LUABIND_RETURN(uint, obj->loadWeights(w, oldw, first_pos, column_size)); } diff --git a/packages/ann/ann_base/c_src/all_all_connection.cc b/packages/ann/ann_base/c_src/all_all_connection.cc index 18fb57212..ee13fc28f 100644 --- a/packages/ann/ann_base/c_src/all_all_connection.cc +++ b/packages/ann/ann_base/c_src/all_all_connection.cc @@ -25,8 +25,7 @@ namespace ANN { AllAllConnections::AllAllConnections(unsigned int num_inputs, unsigned int num_outputs) : - Connections(num_inputs*num_outputs), - num_inputs(num_inputs), num_outputs(num_outputs) { + Connections(num_inputs*num_outputs, num_inputs, num_outputs) { } bool AllAllConnections::checkInputOutputSizes(ActivationUnits *input, diff --git a/packages/ann/ann_base/c_src/all_all_connection.h b/packages/ann/ann_base/c_src/all_all_connection.h index 8f6793877..d99efd31b 100644 --- a/packages/ann/ann_base/c_src/all_all_connection.h +++ b/packages/ann/ann_base/c_src/all_all_connection.h @@ -26,7 +26,6 @@ namespace ANN { class AllAllConnections : public Connections { - unsigned int num_inputs, num_outputs; public: AllAllConnections(unsigned int num_inputs, unsigned int num_outputs); diff --git a/packages/ann/ann_base/c_src/bias_connection.cc b/packages/ann/ann_base/c_src/bias_connection.cc index 1c95e6d44..8952b0ec3 100644 --- a/packages/ann/ann_base/c_src/bias_connection.cc +++ b/packages/ann/ann_base/c_src/bias_connection.cc @@ -25,7 +25,7 @@ namespace ANN { BiasConnections::BiasConnections(unsigned int bias_size) : - Connections(bias_size) { + Connections(bias_size, 1, bias_size) { } bool BiasConnections::checkInputOutputSizes(ActivationUnits *input, @@ -57,14 +57,10 @@ namespace ANN { float *w = weights->getPPALForReadAndWrite(); float *prev_w = prev_weights->getPPALForReadAndWrite(); - - for (unsigned int j=0; jnumNeurons()), @@ -40,9 +41,15 @@ namespace ANN { learning_rate(-1.0f), momentum(0.0f), weight_decay(0.0f), - c_weight_decay(1.0f) { - if (!weights_matrix->checkInputOutputSizes(inputs, outputs)) - ERROR_EXIT(256, "The input/output sizes are not correct.\n"); + c_weight_decay(1.0f), + transpose_weights(transpose_weights) { + if (!transpose_weights) { + if (!weights_matrix->checkInputOutputSizes(inputs, outputs)) + ERROR_EXIT(256, "The input/output sizes are not correct.\n"); + } + else + if (!weights_matrix->checkInputOutputSizes(outputs, inputs)) + ERROR_EXIT(256, "The input/output sizes are not correct.\n"); weights_matrix->countReference(); IncRef(inputs); IncRef(outputs); @@ -61,28 +68,51 @@ namespace ANN { FloatGPUMirroredMemoryBlock *output_ptr = outputs->getPtr(); FloatGPUMirroredMemoryBlock *weights_mat_ptr = weights_matrix->getPtr(); - if (conf.cur_bunch_size == 1) + if (conf.cur_bunch_size == 1) { // vector x matrix product - doSgemv(CblasColMajor, CblasNoTrans, - num_outputs, num_inputs, - 1.0f, weights_mat_ptr, num_outputs, - input_ptr, conf.max_bunch_size, - 1.0f, output_ptr, conf.max_bunch_size, - 0, inputs->getOffset(), outputs->getOffset(), - conf.use_cuda_flag); - else + if (!transpose_weights) + doSgemv(CblasColMajor, CblasNoTrans, + num_outputs, num_inputs, + 1.0f, weights_mat_ptr, num_outputs, + input_ptr, conf.max_bunch_size, + 1.0f, output_ptr, conf.max_bunch_size, + 0, inputs->getOffset(), outputs->getOffset(), + conf.use_cuda_flag); + else + doSgemv(CblasColMajor, CblasTrans, + num_inputs, num_outputs, + 1.0f, weights_mat_ptr, num_inputs, + input_ptr, conf.max_bunch_size, + 1.0f, output_ptr, conf.max_bunch_size, + 0, inputs->getOffset(), outputs->getOffset(), + conf.use_cuda_flag); + } + else { // matrix x matrix product // C = \alpha op(A) op(B) + \beta C // input * weights = output - doSgemm(CblasColMajor, CblasNoTrans, CblasTrans, - conf.cur_bunch_size, num_outputs, num_inputs, - 1.0f, input_ptr, conf.max_bunch_size, - weights_mat_ptr, num_outputs, - // beta = 1.0f, C matrix contains BIAS and probably other layer - // computations - 1.0f, output_ptr, conf.max_bunch_size, - inputs->getOffset(), 0, outputs->getOffset(), - conf.use_cuda_flag); + if (!transpose_weights) + doSgemm(CblasColMajor, CblasNoTrans, CblasTrans, + conf.cur_bunch_size, num_outputs, num_inputs, + 1.0f, input_ptr, conf.max_bunch_size, + weights_mat_ptr, num_outputs, + // beta = 1.0f, C matrix contains BIAS and probably other layer + // computations + 1.0f, output_ptr, conf.max_bunch_size, + inputs->getOffset(), 0, outputs->getOffset(), + conf.use_cuda_flag); + else + doSgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + conf.cur_bunch_size, num_outputs, num_inputs, + 1.0f, + input_ptr, conf.max_bunch_size, + weights_mat_ptr, num_inputs, + // beta = 1.0f, C matrix contains BIAS and probably other layer + // computations + 1.0f, output_ptr, conf.max_bunch_size, + inputs->getOffset(), 0, outputs->getOffset(), + conf.use_cuda_flag); + } } void DotProductAction:: @@ -94,22 +124,42 @@ namespace ANN { if (output_error != 0) { if (conf.cur_bunch_size > 1) { // C = alpha * A * B + beta * C - doSgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, - conf.cur_bunch_size, num_inputs, num_outputs, - 1.0f, input_error, conf.max_bunch_size, - weights_mat_ptr, num_outputs, - 1.0f, output_error, conf.max_bunch_size, - input_error_shift, 0, output_error_shift, - conf.use_cuda_flag); + if (!transpose_weights) + doSgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, + conf.cur_bunch_size, num_inputs, num_outputs, + 1.0f, input_error, conf.max_bunch_size, + weights_mat_ptr, num_outputs, + 1.0f, output_error, conf.max_bunch_size, + input_error_shift, 0, output_error_shift, + conf.use_cuda_flag); + else + doSgemm(CblasColMajor, CblasNoTrans, CblasTrans, + conf.cur_bunch_size, num_inputs, num_outputs, + 1.0f, input_error, conf.max_bunch_size, + weights_mat_ptr, num_inputs, + 1.0f, output_error, conf.max_bunch_size, + input_error_shift, 0, output_error_shift, + conf.use_cuda_flag); } else { - doSgemv(CblasColMajor, CblasNoTrans, - num_inputs, num_outputs, - 1.0f, weights_mat_ptr, num_inputs, - input_error, conf.max_bunch_size, - 1.0f, output_error, conf.max_bunch_size, - 0, input_error_shift, output_error_shift, - conf.use_cuda_flag); + // FIXME: I'm not sure of this two calls... please review it + if (!transpose_weights) + doSgemv(CblasColMajor, CblasTrans, + num_outputs, num_inputs, + 1.0f, weights_mat_ptr, num_outputs, + input_error, conf.max_bunch_size, + 1.0f, output_error, conf.max_bunch_size, + 0, input_error_shift, output_error_shift, + conf.use_cuda_flag); + else { + doSgemv(CblasColMajor, CblasNoTrans, + num_inputs, num_outputs, + 1.0f, weights_mat_ptr, num_inputs, + input_error, conf.max_bunch_size, + 1.0f, output_error, conf.max_bunch_size, + 0, input_error_shift, output_error_shift, + conf.use_cuda_flag); + } } } } @@ -130,33 +180,56 @@ namespace ANN { -(1.0f/sqrtf(static_cast(references))) * learning_rate; - if (conf.cur_bunch_size > 1) - doSgemm(CblasColMajor, CblasTrans, CblasNoTrans, // transposicones - num_outputs, num_inputs, conf.cur_bunch_size, // dimensiones - norm_learn_rate, // alpha - input_error, // A - conf.max_bunch_size, // A stride - input, // B - conf.max_bunch_size, // B stride - beta, // beta - prev_weights_mat_ptr, // C - num_outputs, // C stride - input_error_shift, input_shift, 0, // desplazamientos - conf.use_cuda_flag); + if (conf.cur_bunch_size > 1) { + if (!transpose_weights) + doSgemm(CblasColMajor, CblasTrans, CblasNoTrans, + num_outputs, num_inputs, conf.cur_bunch_size, // dimensiones + norm_learn_rate, // alpha + input_error, // A + conf.max_bunch_size, // A stride + input, // B + conf.max_bunch_size, // B stride + beta, // beta + prev_weights_mat_ptr, // C + num_outputs, // C stride + input_error_shift, input_shift, 0, // desplazamientos + conf.use_cuda_flag); + else + doSgemm(CblasColMajor, CblasTrans, CblasNoTrans, + num_inputs, num_outputs, conf.cur_bunch_size, // dimensiones + norm_learn_rate, // alpha + input, // B + conf.max_bunch_size, // B stride + input_error, // A + conf.max_bunch_size, // A stride + beta, // beta + prev_weights_mat_ptr, // C + num_inputs, // C stride + input_shift, input_error_shift, 0, // desplazamientos + conf.use_cuda_flag); + } else { if (beta < 1.0f) doSscal((num_inputs * num_outputs), beta, prev_weights_mat_ptr, 0, 1, conf.use_cuda_flag); - - doSger(CblasColMajor, - num_outputs, num_inputs, - norm_learn_rate, - input_error, input_error_shift, conf.max_bunch_size, - input, input_shift, conf.max_bunch_size, - prev_weights_mat_ptr, 0, num_outputs, - conf.use_cuda_flag); + if (!transpose_weights) + doSger(CblasColMajor, + num_outputs, num_inputs, + norm_learn_rate, + input_error, input_error_shift, conf.max_bunch_size, + input, input_shift, conf.max_bunch_size, + prev_weights_mat_ptr, 0, num_outputs, + conf.use_cuda_flag); + else + doSger(CblasColMajor, + num_inputs, num_outputs, + norm_learn_rate, + input, input_shift, conf.max_bunch_size, + input_error, input_error_shift, conf.max_bunch_size, + prev_weights_mat_ptr, 0, num_inputs, + conf.use_cuda_flag); } } diff --git a/packages/ann/ann_base/c_src/dot_product_action.h b/packages/ann/ann_base/c_src/dot_product_action.h index d6d9a0fb4..6f98a0db6 100644 --- a/packages/ann/ann_base/c_src/dot_product_action.h +++ b/packages/ann/ann_base/c_src/dot_product_action.h @@ -34,6 +34,7 @@ namespace ANN { const unsigned int num_inputs, num_outputs; const ANNConfiguration &conf; float learning_rate, momentum, weight_decay, c_weight_decay; + bool transpose_weights; void backpropagateErrors(FloatGPUMirroredMemoryBlock *weights_mat_ptr, @@ -54,7 +55,8 @@ namespace ANN { DotProductAction(const ANNConfiguration &conf, ActivationUnits *inputs, ActivationUnits *outputs, - Connections *weights_matrix); + Connections *weights_matrix, + bool transpose_weights=false); virtual ~DotProductAction(); virtual void doForward(); virtual void doBackward(); diff --git a/packages/ann/mlp/binding/bind_mlp.lua.cc b/packages/ann/mlp/binding/bind_mlp.lua.cc index 6f9fbac0f..3559def7a 100644 --- a/packages/ann/mlp/binding/bind_mlp.lua.cc +++ b/packages/ann/mlp/binding/bind_mlp.lua.cc @@ -78,6 +78,16 @@ using namespace ANN; } //BIND_END +//BIND_METHOD MLP show_weights +//DOC_BEGIN +// void show_weights() +/// Show at stdout weights values: for debugging +//DOC_END +{ + obj->showWeights(); +} +//BIND_END + //BIND_METHOD MLP randomize_weights //DOC_BEGIN // void randomize_weights({ random = random(...), [inf = number], [sup = number] }) diff --git a/packages/ann/mlp/c_src/all_all_mlp.cc b/packages/ann/mlp/c_src/all_all_mlp.cc index d864b7eb2..0dae00e66 100644 --- a/packages/ann/mlp/c_src/all_all_mlp.cc +++ b/packages/ann/mlp/c_src/all_all_mlp.cc @@ -68,7 +68,8 @@ namespace ANN { if (activations.size() < 2) ERROR_EXIT(128, "Impossible to generate a zero layer AllAllMLP\n"); - // We register the input and the output layers of the network + // We register the input and the output layers of the network. The + // activations vector is a property of superclass registerInput(activations[0]); registerOutput(activations.back()); @@ -127,13 +128,20 @@ namespace ANN { MatrixFloat *old_weights_mat) { generateActionsAllAll(str); unsigned int pos = 0; + if (static_cast(weights_mat->size) != getNumberOfWeights()) + ERROR_EXIT(255, "Incorrect number of weights at matrix!!!\n"); + if (static_cast(old_weights_mat->size) != getNumberOfWeights()) + ERROR_EXIT(255, "Incorrect number of weights at old matrix!!!\n"); // step +=2 because connections are stored in groups of two: bias and the // rest of weights for (unsigned int i=0,k=0; inumNeurons()+1; // ATTENTION: The loadWeights function returns the next pos value + + // bias connections connections[i]->loadWeights(weights_mat, old_weights_mat, pos, colsize); + // rest of weights connections pos = connections[i+1]->loadWeights(weights_mat, old_weights_mat, pos+1, colsize) - 1; } diff --git a/packages/ann/mlp/c_src/mlp.cc b/packages/ann/mlp/c_src/mlp.cc index 96c3ef5b1..6bae6f7ab 100644 --- a/packages/ann/mlp/c_src/mlp.cc +++ b/packages/ann/mlp/c_src/mlp.cc @@ -19,6 +19,7 @@ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ +#include #include #include "constString.h" #include "error_print.h" @@ -97,12 +98,13 @@ namespace ANN { void MLP::showWeights() { - /* - printf("Weights:\n"); - for (unsigned int i = 0; i < connections.size(); i++) - connections[i]->showWeights(); + for (unsigned int i=0; i 30.0f) @@ -475,7 +475,7 @@ __global__ void applyFullCrossEntropyKernel(const float *output, float epsilon, float inf, unsigned int max_x, - unsigned int lda_x, + unsigned int lda_x, unsigned int max_y) { unsigned int matrix_x_pos, matrix_y_pos; getColumnMajorBunchMatrixPositions(blockIdx, @@ -657,7 +657,7 @@ void doApplySoftmaxActivation(FloatGPUMirroredMemoryBlock *units, unsigned int top_reduction = units_top; dim3 block, grid; computeBlockAndGridSizesForARowMajorBunch(conf, units_size, - block, grid); + block, grid); minMaxFirstReduction<<>> (units_ptr, @@ -669,7 +669,7 @@ void doApplySoftmaxActivation(FloatGPUMirroredMemoryBlock *units, conf.max_bunch_size); for (top_reduction >>= 1; top_reduction != 1; top_reduction >>= 1) { computeBlockAndGridSizesForARowMajorBunch(conf, top_reduction, - block, grid); + block, grid); minMaxNextReduction<<>> (minimums_ptr, maximums_ptr, @@ -685,7 +685,7 @@ void doApplySoftmaxActivation(FloatGPUMirroredMemoryBlock *units, conf.cur_bunch_size); computeBlockAndGridSizesForARowMajorBunch(conf, units_size, - block, grid); + block, grid); applyExpMinusMinimum<<>> (units_ptr, @@ -706,7 +706,7 @@ void doApplySoftmaxActivation(FloatGPUMirroredMemoryBlock *units, conf.max_bunch_size); for (top_reduction >>= 1; top_reduction != 1; top_reduction >>= 1) { computeBlockAndGridSizesForARowMajorBunch(conf, top_reduction, - block, grid); + block, grid); sumNextReduction<<>> (sums_ptr, top_reduction, @@ -720,7 +720,7 @@ void doApplySoftmaxActivation(FloatGPUMirroredMemoryBlock *units, conf.cur_bunch_size); computeBlockAndGridSizesForARowMajorBunch(conf, units_size, - block, grid); + block, grid); applyRatio<<>> (units_ptr, @@ -885,41 +885,41 @@ void doCalculateTanh(FloatGPUMirroredMemoryBlock *output, } /* -void doCalculateMixtureCrossEntropy(FloatGPUMirroredMemoryBlock *output, - FloatGPUMirroredMemoryBlock *target_output, - FloatGPUMirroredMemoryBlock *output_error, - FloatGPUMirroredMemoryBlock *pattern_errors, - float EPSILON, - float INF, - unsigned int output_size, - const ANNConfiguration &conf, - bool use_gpu) { + void doCalculateMixtureCrossEntropy(FloatGPUMirroredMemoryBlock *output, + FloatGPUMirroredMemoryBlock *target_output, + FloatGPUMirroredMemoryBlock *output_error, + FloatGPUMirroredMemoryBlock *pattern_errors, + float EPSILON, + float INF, + unsigned int output_size, + const ANNConfiguration &conf, + bool use_gpu) { const float *output_ptr = output->getPPALForRead(); const float *target_output_ptr = target_output->getPPALForRead(); float *output_error_ptr = output_error->getPPALForWrite(); float *pattern_errors_ptr = pattern_errors->getGPUForReadAndWrite(); for (unsigned int b=0; b EPSILON) ? logf(prob) : INF); + float Z = 0.0f; + unsigned int ipos = b; + for (unsigned int i=0; i EPSILON) ? logf(prob) : INF); } return s; -} + } */ // F'(a,b)/a_i = ( 2 b_i H(a,b) - G(a,b) ) / H^2(a,b) @@ -975,158 +975,158 @@ void doCalculateLocalFMeasure(float alpha, } /* - float doCalculateGA(FloatGPUMirroredMemoryBlock *output, - FloatGPUMirroredMemoryBlock *target_output, - FloatGPUMirroredMemoryBlock *output_error, - FloatGPUMirroredMemoryBlock *pattern_errors, - unsigned int output_size, - const ANNConfiguration &conf, - bool use_gpu) { - const float *output_ptr = output->getPPALForRead(); - const float *target_output_ptr = target_output->getPPALForRead(); - float *output_error_ptr = output_error->getPPALForWrite(); - - for (unsigned int b=0; bgetPPALForRead(); + const float *target_output_ptr = target_output->getPPALForRead(); + float *output_error_ptr = output_error->getPPALForWrite(); - */ + for (unsigned int b=0; bgetGPUForRead(); - const float *target_output_ptr = target_output->getGPUForRead(); - float *output_error_ptr = output_error->getGPUForWrite(); - float *pattern_errors_ptr = pattern_errors->getGPUForReadAndWrite(); - dim3 block, grid; - computeBlockAndGridSizesForAColumnMajorBunch(conf, output_size, - block, grid); - - applyCrossEntropyKernel<<>> - (output_ptr, - target_output_ptr, - output_error_ptr, - pattern_errors_ptr, - EPSILON, - INF, - conf.cur_bunch_size, - conf.max_bunch_size, - output_size); - } - else { + if (use_gpu) { + const float *output_ptr = output->getGPUForRead(); + const float *target_output_ptr = target_output->getGPUForRead(); + float *output_error_ptr = output_error->getGPUForWrite(); + float *pattern_errors_ptr = pattern_errors->getGPUForReadAndWrite(); + dim3 block, grid; + computeBlockAndGridSizesForAColumnMajorBunch(conf, output_size, + block, grid); + + applyCrossEntropyKernel<<>> + (output_ptr, + target_output_ptr, + output_error_ptr, + pattern_errors_ptr, + EPSILON, + INF, + conf.cur_bunch_size, + conf.max_bunch_size, + output_size); + } + else { #endif - const float *output_ptr = output->getPPALForRead(); - const float *target_output_ptr = target_output->getPPALForRead(); - float *output_error_ptr = output_error->getPPALForWrite(); - float *pattern_errors_ptr = pattern_errors->getPPALForReadAndWrite(); - - for (unsigned int i = 0; i < output_size; i++) { - for (unsigned int b=0; b EPSILON) - pattern_errors_ptr[b] += target_output_ptr[b] * ((fabs(output_ptr[b]) > EPSILON) ? - logf(output_ptr[b]) : INF); - } - output_ptr += conf.max_bunch_size; - target_output_ptr += conf.max_bunch_size; - output_error_ptr += conf.max_bunch_size; - pattern_errors_ptr += conf.max_bunch_size; - } -#ifdef USE_CUDA + const float *output_ptr = output->getPPALForRead(); + const float *target_output_ptr = target_output->getPPALForRead(); + float *output_error_ptr = output_error->getPPALForWrite(); + float *pattern_errors_ptr = pattern_errors->getPPALForReadAndWrite(); + + for (unsigned int i = 0; i < output_size; i++) { + for (unsigned int b=0; b EPSILON) + pattern_errors_ptr[b] += target_output_ptr[b] * ((fabs(output_ptr[b]) > EPSILON) ? + logf(output_ptr[b]) : INF); + } + output_ptr += conf.max_bunch_size; + target_output_ptr += conf.max_bunch_size; + output_error_ptr += conf.max_bunch_size; + pattern_errors_ptr += conf.max_bunch_size; } +#ifdef USE_CUDA + } #endif } void doCalculateFullCrossEntropy(FloatGPUMirroredMemoryBlock *output, - FloatGPUMirroredMemoryBlock *target_output, - FloatGPUMirroredMemoryBlock *output_error, - FloatGPUMirroredMemoryBlock *pattern_errors, - float EPSILON, - float INF, - unsigned int output_size, - const ANNConfiguration &conf, - bool use_gpu) { + FloatGPUMirroredMemoryBlock *target_output, + FloatGPUMirroredMemoryBlock *output_error, + FloatGPUMirroredMemoryBlock *pattern_errors, + float EPSILON, + float INF, + unsigned int output_size, + const ANNConfiguration &conf, + bool use_gpu) { #ifdef USE_CUDA - if (use_gpu) { - const float *output_ptr = output->getGPUForRead(); - const float *target_output_ptr = target_output->getGPUForRead(); - float *output_error_ptr = output_error->getGPUForWrite(); - float *pattern_errors_ptr = pattern_errors->getGPUForReadAndWrite(); - dim3 block, grid; - computeBlockAndGridSizesForAColumnMajorBunch(conf, output_size, - block, grid); - - applyFullCrossEntropyKernel<<>> - (output_ptr, - target_output_ptr, - output_error_ptr, - pattern_errors_ptr, - EPSILON, - INF, - conf.cur_bunch_size, - conf.max_bunch_size, - output_size); + if (use_gpu) { + const float *output_ptr = output->getGPUForRead(); + const float *target_output_ptr = target_output->getGPUForRead(); + float *output_error_ptr = output_error->getGPUForWrite(); + float *pattern_errors_ptr = pattern_errors->getGPUForReadAndWrite(); + dim3 block, grid; + computeBlockAndGridSizesForAColumnMajorBunch(conf, output_size, + block, grid); - } - else { + applyFullCrossEntropyKernel<<>> + (output_ptr, + target_output_ptr, + output_error_ptr, + pattern_errors_ptr, + EPSILON, + INF, + conf.cur_bunch_size, + conf.max_bunch_size, + output_size); + + } + else { #endif - const float *output_ptr = output->getPPALForRead(); - const float *target_output_ptr = target_output->getPPALForRead(); - float *output_error_ptr = output_error->getPPALForWrite(); - float *pattern_errors_ptr = pattern_errors->getPPALForReadAndWrite(); - - for (unsigned int i = 0; i < output_size; i++) { - for (unsigned int b=0; b EPSILON) { - pattern_errors_ptr[b] += aux_target * ((fabs(aux_out) > EPSILON) ? - logf(aux_out) : INF); - } - if (fabs(1.0f - aux_target) > EPSILON) { - pattern_errors_ptr[b] += (1.0f - aux_target) * ((fabs(1.0f - aux_out) > EPSILON) ? - logf(1.0f - aux_out) : INF); - } - } - output_ptr += conf.max_bunch_size; - target_output_ptr += conf.max_bunch_size; - output_error_ptr += conf.max_bunch_size; - pattern_errors_ptr += conf.max_bunch_size; - } -#ifdef USE_CUDA + const float *output_ptr = output->getPPALForRead(); + const float *target_output_ptr = target_output->getPPALForRead(); + float *output_error_ptr = output_error->getPPALForWrite(); + float *pattern_errors_ptr = pattern_errors->getPPALForReadAndWrite(); + + for (unsigned int i = 0; i < output_size; i++) { + for (unsigned int b=0; b EPSILON) { + pattern_errors_ptr[b] += aux_target * ((fabs(aux_out) > EPSILON) ? + logf(aux_out) : INF); + } + if (fabs(1.0f - aux_target) > EPSILON) { + pattern_errors_ptr[b] += (1.0f - aux_target) * ((fabs(1.0f - aux_out) > EPSILON) ? + logf(1.0f - aux_out) : INF); + } + } + output_ptr += conf.max_bunch_size; + target_output_ptr += conf.max_bunch_size; + output_error_ptr += conf.max_bunch_size; + pattern_errors_ptr += conf.max_bunch_size; } +#ifdef USE_CUDA + } #endif } @@ -1136,138 +1136,138 @@ void doCalculateFullCrossEntropy(FloatGPUMirroredMemoryBlock *output, /////////////////////////////////////////////////////////// void doSgemv(CBLAS_ORDER major_type, CBLAS_TRANSPOSE a_transpose, - int m, int n, - float alpha, FloatGPUMirroredMemoryBlock *a, unsigned int a_inc, - FloatGPUMirroredMemoryBlock *x, unsigned int x_inc, - float beta, FloatGPUMirroredMemoryBlock *y, unsigned int y_inc, - unsigned int a_shift, unsigned int x_shift, unsigned int y_shift, - bool use_gpu) { - const float *a_mem, *x_mem; - float *y_mem; + int m, int n, + float alpha, FloatGPUMirroredMemoryBlock *a, unsigned int a_inc, + FloatGPUMirroredMemoryBlock *x, unsigned int x_inc, + float beta, FloatGPUMirroredMemoryBlock *y, unsigned int y_inc, + unsigned int a_shift, unsigned int x_shift, unsigned int y_shift, + bool use_gpu) { + const float *a_mem, *x_mem; + float *y_mem; #ifdef USE_CUDA - if (use_gpu) { - cublasStatus_t status; - cublasHandle_t handle = GPUHelper::getHandler(); - assert(major_type == CblasColMajor); - cublasOperation_t cublas_a_transpose = getCublasOperation(a_transpose); - a_mem = a->getGPUForRead() + a_shift; - x_mem = x->getGPUForRead() + x_shift; - y_mem = y->getGPUForReadAndWrite() + y_shift; - - status = cublasSetStream(handle, GPUHelper::getCurrentStream()); - checkCublasError(status); - - status = cublasSgemv(handle, cublas_a_transpose, - m, n, - &alpha, a_mem, a_inc, - x_mem, x_inc, - &beta, y_mem, y_inc); - - checkCublasError(status); - } - else { + if (use_gpu) { + cublasStatus_t status; + cublasHandle_t handle = GPUHelper::getHandler(); + assert(major_type == CblasColMajor); + cublasOperation_t cublas_a_transpose = getCublasOperation(a_transpose); + a_mem = a->getGPUForRead() + a_shift; + x_mem = x->getGPUForRead() + x_shift; + y_mem = y->getGPUForReadAndWrite() + y_shift; + + status = cublasSetStream(handle, GPUHelper::getCurrentStream()); + checkCublasError(status); + + status = cublasSgemv(handle, cublas_a_transpose, + m, n, + &alpha, a_mem, a_inc, + x_mem, x_inc, + &beta, y_mem, y_inc); + + checkCublasError(status); + } + else { #endif - a_mem = a->getPPALForRead() + a_shift; - x_mem = x->getPPALForRead() + x_shift; - y_mem = y->getPPALForReadAndWrite() + y_shift; - cblas_sgemv(major_type, a_transpose, + a_mem = a->getPPALForRead() + a_shift; + x_mem = x->getPPALForRead() + x_shift; + y_mem = y->getPPALForReadAndWrite() + y_shift; + cblas_sgemv(major_type, a_transpose, m, n, alpha, a_mem, a_inc, x_mem, x_inc, beta, y_mem, y_inc); #ifdef USE_CUDA - } + } #endif } void doScopy(int N, FloatGPUMirroredMemoryBlock* x, - unsigned int x_shift, - unsigned int x_inc, - FloatGPUMirroredMemoryBlock* y, - unsigned int y_shift, - unsigned int y_inc, - bool use_gpu) + unsigned int x_shift, + unsigned int x_inc, + FloatGPUMirroredMemoryBlock* y, + unsigned int y_shift, + unsigned int y_inc, + bool use_gpu) { - const float *x_mem; - float *y_mem; + const float *x_mem; + float *y_mem; #ifdef USE_CUDA - if (use_gpu) + if (use_gpu) { - cublasStatus_t status; - cublasHandle_t handle = GPUHelper::getHandler(); - //printf("Doing a scopy with comp=1 & cuda=1\n"); - x_mem = x->getGPUForRead() + x_shift; - y_mem = y->getGPUForWrite() + y_shift; + cublasStatus_t status; + cublasHandle_t handle = GPUHelper::getHandler(); + //printf("Doing a scopy with comp=1 & cuda=1\n"); + x_mem = x->getGPUForRead() + x_shift; + y_mem = y->getGPUForWrite() + y_shift; - status = cublasSetStream(handle, GPUHelper::getCurrentStream()); - checkCublasError(status); + status = cublasSetStream(handle, GPUHelper::getCurrentStream()); + checkCublasError(status); - status = cublasScopy(handle, N, x_mem, x_inc, y_mem, y_inc); + status = cublasScopy(handle, N, x_mem, x_inc, y_mem, y_inc); - checkCublasError(status); + checkCublasError(status); } - else + else { - //printf("Doing a scopy with comp=1 & cuda=0\n"); + //printf("Doing a scopy with comp=1 & cuda=0\n"); #endif #ifndef USE_CUDA - //printf("Doing a scopy with comp=0 & cuda=0\n"); + //printf("Doing a scopy with comp=0 & cuda=0\n"); #endif - x_mem = x->getPPALForRead() + x_shift; - y_mem = y->getPPALForWrite() + y_shift; + x_mem = x->getPPALForRead() + x_shift; + y_mem = y->getPPALForWrite() + y_shift; - cblas_scopy(N, x_mem, x_inc, y_mem, y_inc); + cblas_scopy(N, x_mem, x_inc, y_mem, y_inc); #ifdef USE_CUDA } #endif } void doScopyLoop(int N, - FloatGPUMirroredMemoryBlock* x, - unsigned int x_inc, - FloatGPUMirroredMemoryBlock* y, - unsigned int y_inc, - unsigned int times, - const unsigned int stride, - bool use_gpu) + FloatGPUMirroredMemoryBlock* x, + unsigned int x_inc, + FloatGPUMirroredMemoryBlock* y, + unsigned int y_inc, + unsigned int times, + const unsigned int stride, + bool use_gpu) { - const float *x_mem; - float *y_mem; + const float *x_mem; + float *y_mem; #ifdef USE_CUDA - if (use_gpu) + if (use_gpu) { - //printf("Doing a scopy with comp=1 & cuda=1\n"); - x_mem = x->getGPUForRead(); - y_mem = y->getGPUForWrite(); - - const unsigned int MAX_THREADS = GPUHelper::getMaxThreadsPerBlock(); - dim3 block, grid; - // Number of threads on each block dimension - block.x = min(MAX_THREADS, times); - block.y = min(MAX_THREADS/block.x, N); - block.z = 1; - - grid.x = (times/block.x + + //printf("Doing a scopy with comp=1 & cuda=1\n"); + x_mem = x->getGPUForRead(); + y_mem = y->getGPUForWrite(); + + const unsigned int MAX_THREADS = GPUHelper::getMaxThreadsPerBlock(); + dim3 block, grid; + // Number of threads on each block dimension + block.x = min(MAX_THREADS, times); + block.y = min(MAX_THREADS/block.x, N); + block.z = 1; + + grid.x = (times/block.x + (times % block.x ? 1 : 0)); - grid.y = (N/block.y + (N % block.y ? 1 : 0)); - grid.z = 1; + grid.y = (N/block.y + (N % block.y ? 1 : 0)); + grid.z = 1; - scopyLoopKernel<<>> - (N, x_mem, x_inc, y_mem, y_inc, times, stride); + scopyLoopKernel<<>> + (N, x_mem, x_inc, y_mem, y_inc, times, stride); } - else + else { - //printf("Doing a scopy with comp=1 & cuda=0\n"); + //printf("Doing a scopy with comp=1 & cuda=0\n"); #endif #ifndef USE_CUDA - //printf("Doing a scopy with comp=0 & cuda=0\n"); + //printf("Doing a scopy with comp=0 & cuda=0\n"); #endif - x_mem = x->getPPALForRead(); - y_mem = y->getPPALForWrite(); + x_mem = x->getPPALForRead(); + y_mem = y->getPPALForWrite(); - for (unsigned int i = 0; i < times; i++) - cblas_scopy(N, + for (unsigned int i = 0; i < times; i++) + cblas_scopy(N, x_mem, x_inc, y_mem + i * stride , y_inc); #ifdef USE_CUDA @@ -1276,67 +1276,67 @@ void doScopyLoop(int N, } void doSaxpy(int N, - float alpha, - FloatGPUMirroredMemoryBlock* x, - unsigned int x_shift, - unsigned int x_inc, - FloatGPUMirroredMemoryBlock* y, - unsigned int y_shift, - unsigned int y_inc, - bool use_gpu) + float alpha, + FloatGPUMirroredMemoryBlock* x, + unsigned int x_shift, + unsigned int x_inc, + FloatGPUMirroredMemoryBlock* y, + unsigned int y_shift, + unsigned int y_inc, + bool use_gpu) { - const float *x_mem; - float *y_mem; + const float *x_mem; + float *y_mem; #ifdef USE_CUDA - if (use_gpu) + if (use_gpu) { - cublasStatus_t status; - cublasHandle_t handle = GPUHelper::getHandler(); - //printf("Doing a saxpy with comp=1 & cuda=1\n"); - x_mem = x->getGPUForRead() + x_shift; - y_mem = y->getGPUForReadAndWrite() + y_shift; + cublasStatus_t status; + cublasHandle_t handle = GPUHelper::getHandler(); + //printf("Doing a saxpy with comp=1 & cuda=1\n"); + x_mem = x->getGPUForRead() + x_shift; + y_mem = y->getGPUForReadAndWrite() + y_shift; - status = cublasSetStream(handle, GPUHelper::getCurrentStream()); - checkCublasError(status); + status = cublasSetStream(handle, GPUHelper::getCurrentStream()); + checkCublasError(status); - status = cublasSaxpy(handle, N, &alpha, x_mem, x_inc, y_mem, y_inc); + status = cublasSaxpy(handle, N, &alpha, x_mem, x_inc, y_mem, y_inc); - checkCublasError(status); + checkCublasError(status); } - else + else { - //printf("Doing a saxpy with comp=1 & cuda=0\n"); + //printf("Doing a saxpy with comp=1 & cuda=0\n"); #endif #ifndef USE_CUDA - //printf("Doing a saxpy with comp=0 & cuda=0\n"); + //printf("Doing a saxpy with comp=0 & cuda=0\n"); #endif - x_mem = x->getPPALForRead() + x_shift; - y_mem = y->getPPALForReadAndWrite() + y_shift; + x_mem = x->getPPALForRead() + x_shift; + y_mem = y->getPPALForReadAndWrite() + y_shift; - cblas_saxpy(N, alpha, x_mem, x_inc, y_mem, y_inc); + cblas_saxpy(N, alpha, x_mem, x_inc, y_mem, y_inc); #ifdef USE_CUDA } #endif } void doSaxpyLoop(int N, - float alpha, - FloatGPUMirroredMemoryBlock* x, - unsigned int x_inc, - FloatGPUMirroredMemoryBlock* y, - unsigned int y_inc, - unsigned int times, - const unsigned int stride, - bool use_gpu) + float alpha, + FloatGPUMirroredMemoryBlock* x, + unsigned int x_inc, + FloatGPUMirroredMemoryBlock* y, + unsigned int y_inc, + unsigned int times, + const unsigned int stride, + bool use_gpu) { - const float *x_mem; - float *y_mem; + const float *x_mem; + float *y_mem; #ifdef USE_CUDA - if (use_gpu) + if (use_gpu) { - /* - cublasStatus_t status; - cublasHandle_t handle = GPUHelper::getHandler(); + /* + cublasStatus_t status; + cublasHandle_t handle = GPUHelper::getHandler(); //printf("Doing a saxpy loop with comp=1 & cuda=1\n"); x_mem = x->getGPUForRead(); y_mem = y->getGPUForReadAndWrite(); @@ -1351,37 +1351,37 @@ void doSaxpyLoop(int N, checkCublasError(status); } - */ - x_mem = x->getGPUForRead(); - y_mem = y->getGPUForWrite(); - - const unsigned int MAX_THREADS = GPUHelper::getMaxThreadsPerBlock(); - dim3 block, grid; - // Number of threads on each block dimension - block.x = min(MAX_THREADS, times); - block.y = min(MAX_THREADS/block.x, N); - block.z = 1; - - grid.x = (times/block.x + + */ + x_mem = x->getGPUForRead(); + y_mem = y->getGPUForWrite(); + + const unsigned int MAX_THREADS = GPUHelper::getMaxThreadsPerBlock(); + dim3 block, grid; + // Number of threads on each block dimension + block.x = min(MAX_THREADS, times); + block.y = min(MAX_THREADS/block.x, N); + block.z = 1; + + grid.x = (times/block.x + (times % block.x ? 1 : 0)); - grid.y = (N/block.y + (N % block.y ? 1 : 0)); - grid.z = 1; + grid.y = (N/block.y + (N % block.y ? 1 : 0)); + grid.z = 1; - saxpyLoopKernel<<>> - (N, alpha, x_mem, x_inc, y_mem, y_inc, times, stride); + saxpyLoopKernel<<>> + (N, alpha, x_mem, x_inc, y_mem, y_inc, times, stride); } - else + else { - //printf("Doing a saxpy loop with comp=1 & cuda=0\n"); + //printf("Doing a saxpy loop with comp=1 & cuda=0\n"); #endif #ifndef USE_CUDA - //printf("Doing a saxpy loop with comp=0 & cuda=0\n"); + //printf("Doing a saxpy loop with comp=0 & cuda=0\n"); #endif - x_mem = x->getPPALForRead(); - y_mem = y->getPPALForReadAndWrite(); + x_mem = x->getPPALForRead(); + y_mem = y->getPPALForReadAndWrite(); - for (unsigned int i = 0; i < times; i++) - cblas_saxpy(N, alpha, + for (unsigned int i = 0; i < times; i++) + cblas_saxpy(N, alpha, x_mem + i * stride, x_inc, y_mem, y_inc); #ifdef USE_CUDA @@ -1390,198 +1390,198 @@ void doSaxpyLoop(int N, } void doSgemm(CBLAS_ORDER major_type, - CBLAS_TRANSPOSE a_transpose, - CBLAS_TRANSPOSE b_transpose, - int m, - int n, - int k, - float alpha, - FloatGPUMirroredMemoryBlock* a, - unsigned int a_inc, - FloatGPUMirroredMemoryBlock* b, - unsigned int b_inc, - float beta, - FloatGPUMirroredMemoryBlock* c, - unsigned int c_inc, - unsigned int a_shift, - unsigned int b_shift, - unsigned int c_shift, - bool use_gpu) + CBLAS_TRANSPOSE a_transpose, + CBLAS_TRANSPOSE b_transpose, + int m, + int n, + int k, + float alpha, + FloatGPUMirroredMemoryBlock* a, + unsigned int a_inc, + FloatGPUMirroredMemoryBlock* b, + unsigned int b_inc, + float beta, + FloatGPUMirroredMemoryBlock* c, + unsigned int c_inc, + unsigned int a_shift, + unsigned int b_shift, + unsigned int c_shift, + bool use_gpu) { - const float *a_mem, *b_mem; - float *c_mem; + const float *a_mem, *b_mem; + float *c_mem; #ifdef USE_CUDA - if (use_gpu) + if (use_gpu) { - cublasStatus_t status; - cublasHandle_t handle = GPUHelper::getHandler(); - assert(major_type == CblasColMajor); - //printf("Doing a sgemm with comp=1 & cuda=1\n"); - a_mem = a->getGPUForRead() + a_shift; - b_mem = b->getGPUForRead() + b_shift; - c_mem = c->getGPUForReadAndWrite() + c_shift; - cublasOperation_t cublas_a_transpose = getCublasOperation(a_transpose); - cublasOperation_t cublas_b_transpose = getCublasOperation(b_transpose); - - status = cublasSetStream(handle, GPUHelper::getCurrentStream()); - checkCublasError(status); - - status = cublasSgemm(handle, cublas_a_transpose, cublas_b_transpose, - m, n, k, - &alpha, a_mem, a_inc, - b_mem, b_inc, - &beta, c_mem, c_inc); - - checkCublasError(status); + cublasStatus_t status; + cublasHandle_t handle = GPUHelper::getHandler(); + assert(major_type == CblasColMajor); + //printf("Doing a sgemm with comp=1 & cuda=1\n"); + a_mem = a->getGPUForRead() + a_shift; + b_mem = b->getGPUForRead() + b_shift; + c_mem = c->getGPUForReadAndWrite() + c_shift; + cublasOperation_t cublas_a_transpose = getCublasOperation(a_transpose); + cublasOperation_t cublas_b_transpose = getCublasOperation(b_transpose); + + status = cublasSetStream(handle, GPUHelper::getCurrentStream()); + checkCublasError(status); + + status = cublasSgemm(handle, cublas_a_transpose, cublas_b_transpose, + m, n, k, + &alpha, a_mem, a_inc, + b_mem, b_inc, + &beta, c_mem, c_inc); + + checkCublasError(status); } - else + else { - //printf("Doing a sgemm with comp=1 & cuda=0\n"); + //printf("Doing a sgemm with comp=1 & cuda=0\n"); #endif - //printf("Doing a sgemm with comp=0 & cuda=0\n"); - a_mem = a->getPPALForRead() + a_shift; - b_mem = b->getPPALForRead() + b_shift; - c_mem = c->getPPALForReadAndWrite() + c_shift; - - // matrix matrix product: C = \alpha op(A) op(B) + \beta C - cblas_sgemm(major_type, // Row or Col Major - a_transpose, // Transpose or not A - b_transpose, // Transpose or not B - m, // num rows of A (before transpose) - n, // num rows at B (before transpose) - k, // Common dimension between A and B - alpha, // Alpha value - a_mem, // A matrix - a_inc, // A matrix stride - b_mem, // B matrix - b_inc, // B matrix stride - beta, // Beta value - c_mem, // C matrix - c_inc); // C matrix stride + //printf("Doing a sgemm with comp=0 & cuda=0\n"); + a_mem = a->getPPALForRead() + a_shift; + b_mem = b->getPPALForRead() + b_shift; + c_mem = c->getPPALForReadAndWrite() + c_shift; + + // matrix matrix product: C = \alpha op(A) op(B) + \beta C + cblas_sgemm(major_type, // Row or Col Major + a_transpose, // Transpose or not A + b_transpose, // Transpose or not B + m, // num rows of A (before transpose) + n, // num rows at B (before transpose) + k, // Common dimension between A and B + alpha, // Alpha value + a_mem, // A matrix + a_inc, // A matrix stride + b_mem, // B matrix + b_inc, // B matrix stride + beta, // Beta value + c_mem, // C matrix + c_inc); // C matrix stride #ifdef USE_CUDA } #endif } void doVectorSetToZero(FloatGPUMirroredMemoryBlock *v, - unsigned int v_size, - unsigned int inc, - unsigned int shift, - bool use_gpu) { + unsigned int v_size, + unsigned int inc, + unsigned int shift, + bool use_gpu) { #ifdef USE_CUDA - if (use_gpu) { - cublasStatus_t status; - cublasHandle_t handle = GPUHelper::getHandler(); - float *ptr = v->getGPUForWrite() + shift; - float value = 0.0f; + if (use_gpu) { + cublasStatus_t status; + cublasHandle_t handle = GPUHelper::getHandler(); + float *ptr = v->getGPUForWrite() + shift; + float value = 0.0f; - status = cublasSetStream(handle, GPUHelper::getCurrentStream()); - checkCublasError(status); + status = cublasSetStream(handle, GPUHelper::getCurrentStream()); + checkCublasError(status); - status = cublasSscal(handle, v_size, &value, ptr, inc); - // FIXME: To use cuMemsetD32 instead of cublasSscal - checkCublasError(status); - } - else { + status = cublasSscal(handle, v_size, &value, ptr, inc); + // FIXME: To use cuMemsetD32 instead of cublasSscal + checkCublasError(status); + } + else { #endif - float *ptr = v->getPPALForWrite() + shift; - VECTOR_SSET(v_size, 0.0f, ptr, inc); + float *ptr = v->getPPALForWrite() + shift; + VECTOR_SSET(v_size, 0.0f, ptr, inc); #ifdef USE_CUDA - } + } #endif } void doVectorSet(FloatGPUMirroredMemoryBlock *v, - float value, - unsigned int v_size, - unsigned int inc, - unsigned int shift, - bool use_gpu) { - float *ptr = v->getPPALForWrite() + shift; - VECTOR_SSET(v_size, value, ptr, inc); + float value, + unsigned int v_size, + unsigned int inc, + unsigned int shift, + bool use_gpu) { + float *ptr = v->getPPALForWrite() + shift; + VECTOR_SSET(v_size, value, ptr, inc); } void doSscal(unsigned int size, - float alpha, - FloatGPUMirroredMemoryBlock *x, - unsigned int shift, - unsigned int inc, - bool use_gpu) { - float *x_mem; + float alpha, + FloatGPUMirroredMemoryBlock *x, + unsigned int shift, + unsigned int inc, + bool use_gpu) { + float *x_mem; #ifdef USE_CUDA - if (use_gpu) { - cublasStatus_t status; - cublasHandle_t handle = GPUHelper::getHandler(); - x_mem = x->getGPUForReadAndWrite() + shift; + if (use_gpu) { + cublasStatus_t status; + cublasHandle_t handle = GPUHelper::getHandler(); + x_mem = x->getGPUForReadAndWrite() + shift; - status = cublasSetStream(handle, GPUHelper::getCurrentStream()); - checkCublasError(status); + status = cublasSetStream(handle, GPUHelper::getCurrentStream()); + checkCublasError(status); - status = cublasSscal(handle, size, &alpha, x_mem, inc); + status = cublasSscal(handle, size, &alpha, x_mem, inc); - checkCublasError(status); - } - else { + checkCublasError(status); + } + else { #endif - x_mem = x->getPPALForReadAndWrite() + shift; - cblas_sscal(size, alpha, x_mem, inc); + x_mem = x->getPPALForReadAndWrite() + shift; + cblas_sscal(size, alpha, x_mem, inc); #ifdef USE_CUDA - } + } #endif } void doSger(CBLAS_ORDER major_type, - unsigned int m, - unsigned int n, - float alpha, - FloatGPUMirroredMemoryBlock *x, - unsigned int x_shift, - unsigned int x_inc, - FloatGPUMirroredMemoryBlock *y, - unsigned int y_shift, - unsigned int y_inc, - FloatGPUMirroredMemoryBlock *a, - unsigned int a_shift, - unsigned int a_inc, - bool use_gpu) { - const float *x_mem; - const float *y_mem; - float *a_mem; + unsigned int m, + unsigned int n, + float alpha, + FloatGPUMirroredMemoryBlock *x, + unsigned int x_shift, + unsigned int x_inc, + FloatGPUMirroredMemoryBlock *y, + unsigned int y_shift, + unsigned int y_inc, + FloatGPUMirroredMemoryBlock *a, + unsigned int a_shift, + unsigned int a_inc, + bool use_gpu) { + const float *x_mem; + const float *y_mem; + float *a_mem; #ifdef USE_CUDA - if (use_gpu) { - cublasStatus_t status; - cublasHandle_t handle = GPUHelper::getHandler(); - x_mem = x->getGPUForRead() + x_shift; - y_mem = y->getGPUForRead() + y_shift; - a_mem = a->getGPUForReadAndWrite() + a_shift; - - status = cublasSetStream(handle, GPUHelper::getCurrentStream()); - checkCublasError(status); - - status = cublasSger(handle, - m, n, - &alpha, - x_mem, x_inc, - y_mem, y_inc, - a_mem, a_inc); - - checkCublasError(status); - } - else { + if (use_gpu) { + cublasStatus_t status; + cublasHandle_t handle = GPUHelper::getHandler(); + x_mem = x->getGPUForRead() + x_shift; + y_mem = y->getGPUForRead() + y_shift; + a_mem = a->getGPUForReadAndWrite() + a_shift; + + status = cublasSetStream(handle, GPUHelper::getCurrentStream()); + checkCublasError(status); + + status = cublasSger(handle, + m, n, + &alpha, + x_mem, x_inc, + y_mem, y_inc, + a_mem, a_inc); + + checkCublasError(status); + } + else { #endif - x_mem = x->getPPALForRead() + x_shift; - y_mem = y->getPPALForRead() + y_shift; - a_mem = a->getPPALForReadAndWrite() + a_shift; - - cblas_sger(major_type, - m, n, - alpha, - x_mem, x_inc, - y_mem, y_inc, - a_mem, a_inc); + x_mem = x->getPPALForRead() + x_shift; + y_mem = y->getPPALForRead() + y_shift; + a_mem = a->getPPALForReadAndWrite() + a_shift; + + cblas_sger(major_type, + m, n, + alpha, + x_mem, x_inc, + y_mem, y_inc, + a_mem, a_inc); #ifdef USE_CUDA - } + } #endif }