From 5a9b30176da0167d80f9614fd8034c0d1cf029e8 Mon Sep 17 00:00:00 2001
From: Jakub Szuppe <j.szuppe@gmail.com>
Date: Sat, 25 Jul 2015 13:36:40 +0200
Subject: [PATCH 1/3] Add more tests for sort

---
 test/test_sort.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
diff --git a/test/test_sort.cpp b/test/test_sort.cpp
index ce76e1d48..0ee58c016 100644
--- a/test/test_sort.cpp
+++ b/test/test_sort.cpp
@@ -15,6 +15,19 @@
 #include <boost/compute/algorithm/sort.hpp>
 #include <boost/compute/algorithm/is_sorted.hpp>
 #include <boost/compute/container/vector.hpp>
+#include <boost/compute/types/struct.hpp>
+
+struct Particle
+{
+    Particle(): x(0.f), y(0.f) { }
+    Particle(float _x, float _y): x(_x), y(_y) { }
+
+    float x;
+    float y;
+};
+
+// adapt struct for OpenCL
+BOOST_COMPUTE_ADAPT_STRUCT(Particle, Particle, (x, y))
 
 #include "check_macros.hpp"
 #include "context_setup.hpp"
@@ -277,4 +290,74 @@ BOOST_AUTO_TEST_CASE(sort_host_vector)
     CHECK_RANGE_EQUAL(int, 8, vector, (0, 1, 2, 3, 4, 5, 6, 7));
 }
 
+BOOST_AUTO_TEST_CASE(sort_custom_struct)
+{
+    // function to compare particles by their x-coordinate
+    BOOST_COMPUTE_FUNCTION(bool, sort_by_x, (Particle a, Particle b),
+    {
+        return a.x < b.x;
+    });
+
+    std::vector<Particle> particles;
+    particles.push_back(Particle(0.1f, 0.f));
+    particles.push_back(Particle(-0.4f, 0.f));
+    particles.push_back(Particle(10.0f, 0.f));
+    particles.push_back(Particle(0.001f, 0.f));
+
+    boost::compute::vector<Particle> vector(4, context);
+    boost::compute::copy(particles.begin(), particles.end(), vector.begin(), queue);
+    BOOST_CHECK_EQUAL(vector.size(), size_t(4));
+    BOOST_CHECK(
+        boost::compute::is_sorted(vector.begin(), vector.end(),
+                                  sort_by_x, queue) == false
+    );
+
+    boost::compute::sort(vector.begin(), vector.end(), sort_by_x, queue);
+    BOOST_CHECK(
+        boost::compute::is_sorted(vector.begin(), vector.end(),
+                                  sort_by_x, queue) == true
+    );
+    boost::compute::copy(vector.begin(), vector.end(), particles.begin(), queue);
+    BOOST_CHECK_CLOSE(particles[0].x, -0.4f, 0.1);
+    BOOST_CHECK_CLOSE(particles[1].x, 0.001f, 0.1);
+    BOOST_CHECK_CLOSE(particles[2].x, 0.1f, 0.1);
+    BOOST_CHECK_CLOSE(particles[3].x, 10.0f, 0.1);
+}
+
+BOOST_AUTO_TEST_CASE(sort_int2)
+{
+    using bc::int2_;
+
+    BOOST_COMPUTE_FUNCTION(bool, sort_int2, (int2_ a, int2_ b),
+    {
+        return a.x < b.x;
+    });
+
+    const size_t size = 100;
+    std::vector<int2_> host(size, int2_(0, 0));
+    host[0] = int2_(100.f, 0.f);
+    host[size/4] = int2_(20.f, 0.f);
+    host[(size*3)/4] = int2_(9.f, 0.f);
+    host[size-3] = int2_(-10.0f, 0.f);
+
+    boost::compute::vector<int2_> vector(size, context);
+    boost::compute::copy(host.begin(), host.end(), vector.begin(), queue);
+    BOOST_CHECK_EQUAL(vector.size(), size);
+    BOOST_CHECK(
+        boost::compute::is_sorted(vector.begin(), vector.end(),
+                                  sort_int2, queue) == false
+    );
+
+    boost::compute::sort(vector.begin(), vector.end(), sort_int2, queue);
+    BOOST_CHECK(
+        boost::compute::is_sorted(vector.begin(), vector.end(),
+                                  sort_int2, queue) == true
+    );
+    boost::compute::copy(vector.begin(), vector.end(), host.begin(), queue);
+    BOOST_CHECK_CLOSE(host[0][0], -10.f, 0.1);
+    BOOST_CHECK_CLOSE(host[(size - 3)][0], 9.f, 0.1);
+    BOOST_CHECK_CLOSE(host[(size - 2)][0], 20.f, 0.1);
+    BOOST_CHECK_CLOSE(host[(size - 1)][0], 100.f, 0.1);
+}
+
 BOOST_AUTO_TEST_SUITE_END()

From b5bef712cdcb5e2037da3772c66d86ad3ced50c9 Mon Sep 17 00:00:00 2001
From: Jakub Szuppe <j.szuppe@gmail.com>
Date: Tue, 28 Jul 2015 18:37:11 +0200
Subject: [PATCH 2/3] Merge sort for CPU devices

---
 .../algorithm/detail/merge_sort_on_cpu.hpp    | 176 ++++++++++++++++++
 include/boost/compute/algorithm/sort.hpp      |  45 +++--
 .../boost/compute/algorithm/stable_sort.hpp   |  34 ++--
 3 files changed, 220 insertions(+), 35 deletions(-)
 create mode 100644 include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp

diff --git a/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
new file mode 100644
index 000000000..0af6eae92
--- /dev/null
+++ b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
@@ -0,0 +1,176 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+template<class Iterator, class Compare>
+inline void merge_blocks(Iterator first,
+                         Iterator result,
+                         Compare compare,
+                         size_t count,
+                         const size_t block_size,
+                         command_queue &queue)
+{
+    meta_kernel k("merge_sort_on_cpu_merge_blocks");
+    size_t count_arg = k.add_arg<const uint_>("count");
+    size_t block_size_arg = k.add_arg<uint_>("block_size");
+
+    k <<
+        k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
+        k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
+        k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
+        k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
+        k.decl<uint_>("result_idx") << " = b1_start;\n" <<
+
+        // merging block 1 and block 2 (stable)
+        "while(b1_start < b1_end && b2_start < b2_end){\n" <<
+        "    if( " << compare(first[k.var<uint_>("b2_start")],
+                              first[k.var<uint_>("b1_start")]) << "){\n" <<
+        "        " << result[k.var<uint_>("result_idx")] <<  " = " <<
+                      first[k.var<uint_>("b2_start")] << ";\n" <<
+        "        b2_start++;\n" <<
+        "    }\n" <<
+        "    else {\n" <<
+        "        " << result[k.var<uint_>("result_idx")] <<  " = " <<
+                      first[k.var<uint_>("b1_start")] << ";\n" <<
+        "        b1_start++;\n" <<
+        "    }\n" <<
+        "    result_idx++;\n" <<
+        "}\n" <<
+        "while(b1_start < b1_end){\n" <<
+        "   " << result[k.var<uint_>("result_idx")] <<  " = " <<
+                 first[k.var<uint_>("b1_start")] << ";\n" <<
+        "    b1_start++;\n" <<
+        "    result_idx++;\n" <<
+        "}\n" <<
+        "while(b2_start < b2_end){\n" <<
+        "   " << result[k.var<uint_>("result_idx")] <<  " = " <<
+                 first[k.var<uint_>("b2_start")] << ";\n" <<
+        "    b2_start++;\n" <<
+        "    result_idx++;\n" <<
+        "}\n";
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+    kernel.set_arg(count_arg, static_cast<const uint_>(count));
+    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+
+    const size_t global_size = static_cast<size_t>(
+        std::ceil(float(count) / (2 * block_size))
+    );
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
+}
+
+template<class Iterator, class Compare>
+inline void block_insertion_sort(Iterator first,
+                                 Compare compare,
+                                 const size_t count,
+                                 const size_t block_size,
+                                 command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type T;
+
+    meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t block_size_arg = k.add_arg<uint_>("block_size");
+
+    k <<
+        k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
+        k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<
+
+        // block insertion sort (stable)
+        "for(uint i = start+1; i < end; i++){\n" <<
+        "    " << k.decl<const T>("value") << " = " << first[k.var<uint_>("i")] << ";\n" <<
+        "    uint pos = i;\n" <<
+        "    while(pos > start && " <<
+                   compare(k.var<const T>("value"),
+                           first[k.var<uint_>("pos-1")]) << "){\n" <<
+        "        " << first[k.var<uint_>("pos")] << " = " << first[k.var<uint_>("pos-1")] << ";\n" <<
+        "        pos--;\n" <<
+        "    }\n" <<
+        "    " << first[k.var<uint_>("pos")] << " = value;\n" <<
+        "}\n"; // block insertion sort
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+
+    const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size));
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
+}
+
+template<class Iterator, class Compare>
+inline void merge_sort_on_cpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+    size_t count = iterator_range_size(first, last);
+    if(count < 2){
+        return;
+    }
+    // for small input size only insertion sort is performed
+    else if(count <= 512){
+        block_insertion_sort(first, compare, count, count, queue);
+        return;
+    }
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    // loading parameters
+    std::string cache_key =
+        std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>();
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    const size_t block_size =
+        parameters->get(cache_key, "insertion_sort_block_size", 64);
+    block_insertion_sort(first, compare, count, block_size, queue);
+
+    // temporary buffer for merge result
+    vector<value_type> temp(count, context);
+    bool result_in_temp = false;
+
+    for(size_t i = block_size; i < count; i *= 2){
+        result_in_temp = !result_in_temp;
+        if(result_in_temp) {
+            merge_blocks(first, temp.begin(), compare, count, i, queue);
+        } else {
+            merge_blocks(temp.begin(), first, compare, count, i, queue);
+        }
+    }
+
+    // if the result is in temp buffer we need to copy it to input
+    if(result_in_temp) {
+        copy(temp.begin(), temp.end(), first, queue);
+    }
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
diff --git a/include/boost/compute/algorithm/sort.hpp b/include/boost/compute/algorithm/sort.hpp
index bed01c8d8..b2730b3e2 100644
--- a/include/boost/compute/algorithm/sort.hpp
+++ b/include/boost/compute/algorithm/sort.hpp
@@ -17,6 +17,7 @@
 
 #include <boost/compute/system.hpp>
 #include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
 #include <boost/compute/algorithm/detail/radix_sort.hpp>
 #include <boost/compute/algorithm/detail/insertion_sort.hpp>
 #include <boost/compute/algorithm/reverse.hpp>
@@ -30,13 +31,13 @@ namespace compute {
 namespace detail {
 
 template<class T>
-inline void dispatch_device_sort(buffer_iterator<T> first,
-                                 buffer_iterator<T> last,
-                                 less<T>,
-                                 command_queue &queue,
-                                 typename boost::enable_if_c<
-                                     is_radix_sortable<T>::value
-                                 >::type* = 0)
+inline void dispatch_gpu_sort(buffer_iterator<T> first,
+                              buffer_iterator<T> last,
+                              less<T>,
+                              command_queue &queue,
+                              typename boost::enable_if_c<
+                                  is_radix_sortable<T>::value
+                              >::type* = 0)
 {
     size_t count = detail::iterator_range_size(first, last);
 
@@ -53,13 +54,13 @@ inline void dispatch_device_sort(buffer_iterator<T> first,
 }
 
 template<class T>
-inline void dispatch_device_sort(buffer_iterator<T> first,
-                                 buffer_iterator<T> last,
-                                 greater<T> compare,
-                                 command_queue &queue,
-                                 typename boost::enable_if_c<
-                                     is_radix_sortable<T>::value
-                                 >::type* = 0)
+inline void dispatch_gpu_sort(buffer_iterator<T> first,
+                              buffer_iterator<T> last,
+                              greater<T> compare,
+                              command_queue &queue,
+                              typename boost::enable_if_c<
+                                  is_radix_sortable<T>::value
+                              >::type* = 0)
 {
     size_t count = detail::iterator_range_size(first, last);
 
@@ -82,10 +83,10 @@ inline void dispatch_device_sort(buffer_iterator<T> first,
 }
 
 template<class Iterator, class Compare>
-inline void dispatch_device_sort(Iterator first,
-                                 Iterator last,
-                                 Compare compare,
-                                 command_queue &queue)
+inline void dispatch_gpu_sort(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
 {
     ::boost::compute::detail::serial_insertion_sort(
         first, last, compare, queue
@@ -102,7 +103,11 @@ inline void dispatch_sort(Iterator first,
                               is_device_iterator<Iterator>
                           >::type* = 0)
 {
-    dispatch_device_sort(first, last, compare, queue);
+    if(queue.get_device().type() & device::gpu) {
+        dispatch_gpu_sort(first, last, compare, queue);
+        return;
+    }
+    ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue);
 }
 
 // sort() for host iterators
@@ -125,7 +130,7 @@ inline void dispatch_sort(Iterator first,
     );
 
     // sort mapped buffer
-    dispatch_device_sort(view.begin(), view.end(), compare, queue);
+    dispatch_sort(view.begin(), view.end(), compare, queue);
 
     // return results to host
     view.map(queue);
diff --git a/include/boost/compute/algorithm/stable_sort.hpp b/include/boost/compute/algorithm/stable_sort.hpp
index 71769c0e0..cd82a0a60 100644
--- a/include/boost/compute/algorithm/stable_sort.hpp
+++ b/include/boost/compute/algorithm/stable_sort.hpp
@@ -15,6 +15,7 @@
 
 #include <boost/compute/system.hpp>
 #include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
 #include <boost/compute/algorithm/detail/radix_sort.hpp>
 #include <boost/compute/algorithm/detail/insertion_sort.hpp>
 #include <boost/compute/algorithm/reverse.hpp>
@@ -25,10 +26,10 @@ namespace compute {
 namespace detail {
 
 template<class Iterator, class Compare>
-inline void dispatch_stable_sort(Iterator first,
-                                 Iterator last,
-                                 Compare compare,
-                                 command_queue &queue)
+inline void dispatch_gpu_stable_sort(Iterator first,
+                                     Iterator last,
+                                     Compare compare,
+                                     command_queue &queue)
 {
     ::boost::compute::detail::serial_insertion_sort(
         first, last, compare, queue
@@ -37,20 +38,20 @@ inline void dispatch_stable_sort(Iterator first,
 
 template<class T>
 inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type
-dispatch_stable_sort(buffer_iterator<T> first,
-                     buffer_iterator<T> last,
-                     less<T>,
-                     command_queue &queue)
+dispatch_gpu_stable_sort(buffer_iterator<T> first,
+                         buffer_iterator<T> last,
+                         less<T>,
+                         command_queue &queue)
 {
     ::boost::compute::detail::radix_sort(first, last, queue);
 }
 
 template<class T>
 inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type
-dispatch_stable_sort(buffer_iterator<T> first,
-                     buffer_iterator<T> last,
-                     greater<T>,
-                     command_queue &queue)
+dispatch_gpu_stable_sort(buffer_iterator<T> first,
+                         buffer_iterator<T> last,
+                         greater<T>,
+                         command_queue &queue)
 {
     // radix sort in ascending order
     ::boost::compute::detail::radix_sort(first, last, queue);
@@ -71,9 +72,12 @@ inline void stable_sort(Iterator first,
                         Compare compare,
                         command_queue &queue = system::default_queue())
 {
-    ::boost::compute::detail::dispatch_stable_sort(
-        first, last, compare, queue
-    );
+    if(queue.get_device().type() & device::gpu) {
+        ::boost::compute::detail::dispatch_gpu_stable_sort(
+            first, last, compare, queue
+        );
+    }
+    ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue);
 }
 
 /// \overload

From b6c2e71ba3fdccd0658e9d2fd8ff5edfbcec56b0 Mon Sep 17 00:00:00 2001
From: Jakub Szuppe <j.szuppe@gmail.com>
Date: Thu, 30 Jul 2015 18:52:52 +0200
Subject: [PATCH 3/3] Using merge with merge path algorithm in merge sort

Merge with merge path is used in merge part of merge sort
algorithm as it's seems to be more efficient than block merge
when there are a few big sorted blocks left to be merged.
---
 .../algorithm/detail/merge_sort_on_cpu.hpp    | 63 ++++++++++++++++---
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
index 0af6eae92..a58104a54 100644
--- a/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
+++ b/include/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
@@ -14,6 +14,7 @@
 #include <boost/compute/kernel.hpp>
 #include <boost/compute/program.hpp>
 #include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
 #include <boost/compute/container/vector.hpp>
 #include <boost/compute/detail/meta_kernel.hpp>
 #include <boost/compute/detail/iterator_range_size.hpp>
@@ -80,6 +81,39 @@ inline void merge_blocks(Iterator first,
     queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
 }
 
+template<class Iterator, class Compare>
+inline void dispatch_merge_blocks(Iterator first,
+                                  Iterator result,
+                                  Compare compare,
+                                  size_t count,
+                                  const size_t block_size,
+                                  const size_t input_size_threshold,
+                                  const size_t blocks_no_threshold,
+                                  command_queue &queue)
+{
+    const size_t blocks_no = static_cast<size_t>(
+        std::ceil(float(count) / block_size)
+    );
+    // merge with merge path should used only for the large arrays and at the
+    // end of merging part when there are only a few big blocks left to be merged
+    if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
+        Iterator last = first + count;
+        for(size_t i = 0; i < count; i+= 2*block_size)
+        {
+            Iterator first1 = (std::min)(first + i, last);
+            Iterator last1 = (std::min)(first1 + block_size, last);
+            Iterator first2 = last1;
+            Iterator last2 = (std::min)(first2 + block_size, last);
+            Iterator block_result = (std::min)(result + i, result + count);
+            merge_with_merge_path(first1, last1, first2, last2,
+                                  block_result, compare, queue);
+        }
+    }
+    else {
+        merge_blocks(first, result, compare, count, block_size, queue);
+    }
+}
+
 template<class Iterator, class Compare>
 inline void block_insertion_sort(Iterator first,
                                  Compare compare,
@@ -119,6 +153,7 @@ inline void block_insertion_sort(Iterator first,
     queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
 }
 
+// This sort is stable.
 template<class Iterator, class Compare>
 inline void merge_sort_on_cpu(Iterator first,
                               Iterator last,
@@ -146,25 +181,39 @@ inline void merge_sort_on_cpu(Iterator first,
     boost::shared_ptr<parameter_cache> parameters =
         detail::parameter_cache::get_global_cache(device);
 
+    // When there is merge_with_path_blocks_no_threshold or less blocks left to
+    // merge AND input size is merge_with_merge_path_input_size_threshold or more
+    // merge_with_merge_path() algorithm is used to merge sorted blocks;
+    // otherwise merge_blocks() is used.
+    const size_t merge_with_path_blocks_no_threshold =
+        parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
+    const size_t merge_with_path_input_size_threshold =
+        parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);
+
     const size_t block_size =
         parameters->get(cache_key, "insertion_sort_block_size", 64);
     block_insertion_sort(first, compare, count, block_size, queue);
 
     // temporary buffer for merge result
     vector<value_type> temp(count, context);
-    bool result_in_temp = false;
+    bool result_in_temporary_buffer = false;
 
     for(size_t i = block_size; i < count; i *= 2){
-        result_in_temp = !result_in_temp;
-        if(result_in_temp) {
-            merge_blocks(first, temp.begin(), compare, count, i, queue);
+        result_in_temporary_buffer = !result_in_temporary_buffer;
+        if(result_in_temporary_buffer) {
+            dispatch_merge_blocks(first, temp.begin(), compare, count, i,
+                                  merge_with_path_input_size_threshold,
+                                  merge_with_path_blocks_no_threshold,
+                                  queue);
         } else {
-            merge_blocks(temp.begin(), first, compare, count, i, queue);
+            dispatch_merge_blocks(temp.begin(), first, compare, count, i,
+                                  merge_with_path_input_size_threshold,
+                                  merge_with_path_blocks_no_threshold,
+                                  queue);
         }
     }
 
-    // if the result is in temp buffer we need to copy it to input
-    if(result_in_temp) {
+    if(result_in_temporary_buffer) {
         copy(temp.begin(), temp.end(), first, queue);
     }
 }