diff --git a/doc/algorithm.qbk b/doc/algorithm.qbk index 66971d17c..9a13a7973 100644 --- a/doc/algorithm.qbk +++ b/doc/algorithm.qbk @@ -46,6 +46,10 @@ Thanks to all the people who have reviewed this library and made suggestions for [include knuth_morris_pratt.qbk] [endsect] +[section:Sorting Integer Sorting Algorithms] +[include integer_sort.qbk] +[endsect] + [section:CXX11 C++11 Algorithms] [include all_of.qbk] [include any_of.qbk] diff --git a/doc/integer_sort.qbk b/doc/integer_sort.qbk new file mode 100644 index 000000000..b2558f4b1 --- /dev/null +++ b/doc/integer_sort.qbk @@ -0,0 +1,88 @@ +[/ QuickBook Document version 1.5 ] + +[section:integer_sort Integer Sorting Algorithms] + +[/license +Copyright (C) 2014 Jeremy W. Murphy + +Distributed under the Boost Software License, Version 1.0. (See accompanying +file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +] + + +[heading Overview] +Integer sorting algorithms take advantage of the properties of integers to sort them using mechanisms other than comparison. +Counting sort algorithm literally counts the frequency of values in the input to form an intermediate representation of the data from which a stable, ordered sequence can be created. + +Least-significant digit (LSD) radix sort uses counting sort to order input data iteratively. With a default one-byte digit, radix sort runs counting sort on one digit of the input at a time. + + +[/ Counts are stored in an array indexed by the value. +/ The partial sum of the array of counts is calculated, calculating the right-most index of each value / in the output. +/ Values are then read from the end of the input, storing each in its position calculated from the array, which is decremented at each step. +/ It is limited to sorting types that can be projected in order onto an unsigned integral type. +] + +[heading Interface] +Requirements are for the input iterator to be bidirectional and for the output iterator to be random access. 
The basic interface requires the input type T to be an unsigned integral type. Radix and counting sort have an almost identical interface: counting sort has one additional parameter, digit, which radix sort calculates and passes to counting sort internally. + +`` +template <typename Input, typename Output> +Output stable_counting_sort(Input first, Input last, Output result); +template <typename Input, typename Output> +Output stable_radix_sort(Input first, Input last, Output result); +`` + +The next interface introduces customization of the conversion to allow user-defined types. +The output type of the conv function has the same requirements as T above. +`` +template <typename Input, typename Output, typename Conversion> +Output stable_counting_sort(Input first, Input last, Output result, Conversion conv); +`` +The next interface adds the option to specify min and max manually; here T is the result type of conv. +`` +template <typename Input, typename Output, typename Conversion> +Output stable_counting_sort(Input first, Input last, Output result, Conversion conv, T min, T max); +`` +Finally, the complete interface for total customization includes specifying the radix and digit. +`` +template <typename Input, typename Output, typename Conversion> +Output stable_counting_sort(Input first, Input last, Output result, Conversion conv, T min, T max, unsigned radix, char unsigned digit); +`` + +[heading Complexity] +Let k equal the range of the input (max - min). Counting sort runs in \Theta(k) space. If k = O(n), counting sort runs in \Theta(n) time; otherwise it runs in \Theta(n + k). + +If k = O(n), radix sort runs in \Theta(dn) time, where d is the number of digits; otherwise it runs in \Theta(d(n + k)). Even though this complexity is worse than that of counting sort, the performance characteristics more than make up for it in practice. + +Space complexity for radix sort depends on the width of the unsigned integral type divided by the radix, called digits in the algorithm: + + digits space complexity + 1 \Theta(k) + 2 \Theta(n + k) + >=3 \Theta(2n + k) + +If digits equals one, LSD radix sort is equivalent to stable counting sort. When digits equals two, one temporary buffer is required, and for greater than two digits, two temporary buffers are required. 
+ +[/ To guarantee the best linear complexity... ] + + +[heading Exception Safety] +Counting and radix sort take their parameters by value and have no global state. + +[heading Customization Points] +If UnsignedInteger(T) is false, a Conversion type is required to project T onto an unsigned integral type of appropriate size. + + +[heading Performance] +Radix sort running time is proportional to the size of T and to k. + +On x86_64, compared to std::sort, radix sort is approximately 20 times faster at sorting `char`, +10 times faster for `short`, 4 times faster for `int`, and almost 2 times faster for `long`. + + +[heading Notes] +It is typical for algorithms to treat empty input (n = 0) as a special case. These algorithms also treat n = 1 as a special case in which there is no sorting work to be done. This was largely motivated by the fact that the LSD radix sort algorithm calculates log(n) but does not expect zero; +however, it also makes logical sense for a sorting algorithm in general. + +[endsect] diff --git a/include/boost/algorithm/integer_sort.hpp b/include/boost/algorithm/integer_sort.hpp new file mode 100644 index 000000000..39fa2e9f8 --- /dev/null +++ b/include/boost/algorithm/integer_sort.hpp @@ -0,0 +1,12 @@ +// (C) Copyright Jeremy W. Murphy 2013. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef INTEGER_SORT +#define INTEGER_SORT + +#include +#include + +#endif diff --git a/include/boost/algorithm/integer_sort/counting-sort.hpp b/include/boost/algorithm/integer_sort/counting-sort.hpp new file mode 100644 index 000000000..7f7974efd --- /dev/null +++ b/include/boost/algorithm/integer_sort/counting-sort.hpp @@ -0,0 +1,177 @@ +// (C) Copyright Jeremy W. Murphy 2013. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +/** \file counting-sort.hpp + * \brief Stable counting sort. + */ + +#ifndef COUNTING_SORT +#define COUNTING_SORT + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace boost { +namespace algorithm { + namespace detail { + + template + inline Value count_index(Value const a, Shift const b, Value const c, Bitmask const d) + { + return ((a >> b) - c) & d; + } + } + + + namespace transformation + { + template + struct identity + { + typedef T result_type; + + identity() {} + + T const &operator()(T const &x) const + { + return x; + } + }; + + + // For types that are implicitly convertible to an unsigned integral type. + template + struct implicit + { + typedef T result_type; + + implicit() {} + + template + T operator()(U const &x) const + { + return x; + } + }; + } + + + /** + * Requires that client allocates space for result beforehand. + * + * @brief Generalized stable counting-sort. + * + * \c Input Bidirectional input iterator. + * \c Output Random access output iterator. + * + * \param first Input iterator that points to the first element of the unsorted data. + * \param last Input iterator that points past the last element of the unsorted data. + * \param result Output iterator that points to the first element where the sorted data will go. + * \param conv Function object that converts the input type to an unsigned integral type. + * \param min The smallest value present in the input >> r * d. + * \param max The largest value present in the input >> r * d. + * \param r The radix or width of digit to consider. + * \param d Which digit to consider. 
+ */ + template + BOOST_CONCEPT_REQUIRES( + ((BidirectionalIterator)) + ((Mutable_RandomAccessIterator)) + ((UnsignedInteger::value_type)>::type>)) + , (Output)) + stable_counting_sort(Input first, Input last, Output result, Conversion conv, + typename std::result_of::value_type)>::type const min, + typename std::result_of::value_type)>::type const max, + unsigned const radix, unsigned char const digit) + { + typedef std::reverse_iterator ReverseIterator; + + if(first != last) + { + if(std::next(first) == last) + *result++ = *first; + else + { + typedef typename std::result_of::value_type)>::type T; + assert(radix != 0); + // TODO: Maybe this next assertion should be an exception? + assert(max - min != std::numeric_limits::max()); // Because k - min + 1 == 0. + auto const shift = radix * digit; + uintmax_t const bitmask = (1ul << radix) - 1; + std::vector C(static_cast(max - min) + 1); + ReverseIterator rfirst(last); + ReverseIterator const rlast(first); + + // TODO: Could this be done faster by left-shifting _min and _bitmask once instead of right-shifting the value n times? 
+ std::for_each(first, last, [&](T const &x) + { + C[detail::count_index(conv(x), shift, min, bitmask)]++; + }); + + std::partial_sum(C.begin(), C.end(), C.begin()); + + for(; rfirst != rlast; rfirst++) + *(result + --C[detail::count_index(conv(*rfirst), shift, min, bitmask)]) = *rfirst; + } + } + return result; + } + + + template + BOOST_CONCEPT_REQUIRES( + ((BidirectionalIterator)) + ((Mutable_RandomAccessIterator)) + ((UnsignedInteger::value_type)>::type>)) + , (Output)) + stable_counting_sort(Input first, Input last, Output result, Conversion conv, + typename std::result_of::value_type)>::type const min, + typename std::result_of::value_type)>::type const max) + { + unsigned const radix(sizeof(typename std::result_of::value_type)>::type) * 8); + return stable_counting_sort(first, last, result, conv, min, max, radix, 0); + } + + + template + BOOST_CONCEPT_REQUIRES( + ((BidirectionalIterator)) + ((Mutable_RandomAccessIterator)) + ((UnsignedInteger::value_type)>::type>)) + , (Output)) + stable_counting_sort(Input first, Input last, Output result, Conversion conv) + { + if(first != last) + { + auto const bound(std::minmax_element(first, last)); + return stable_counting_sort(first, last, result, conv, *bound.first, *bound.second); + } + else + return result; + } + + + template + BOOST_CONCEPT_REQUIRES( + ((BidirectionalIterator)) + ((Mutable_RandomAccessIterator)) + , (Output)) + stable_counting_sort(Input first, Input last, Output result) + { + return stable_counting_sort(first, last, result, transformation::identity::value_type>()); + } +} +} +#endif diff --git a/include/boost/algorithm/integer_sort/radix-sort.hpp b/include/boost/algorithm/integer_sort/radix-sort.hpp new file mode 100644 index 000000000..f58bd1457 --- /dev/null +++ b/include/boost/algorithm/integer_sort/radix-sort.hpp @@ -0,0 +1,128 @@ +// (C) Copyright Jeremy W. Murphy 2013. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +/// \file radix-sort.hpp +/// \brief Stable LSD radix sort. + +#ifndef RADIX_SORT +#define RADIX_SORT + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* TODO: Calculate and use effective b? + * It's assumed that value_type is no larger than necessary. That is, k >= 2^(b/2). + * However, if k << 2^b, then there are a lot of unused significant bits that do not need sorting. + * Using effective b could reduce the number of passes required to sort. + */ + +namespace boost { +namespace algorithm { + /** + * \fn stable_radix_sort + * \brief Stable LSD radix sort. + * + * \c Input Bidirectional input iterator. + * \c Output Random access output iterator. + * + * \param first Input iterator that points to the first element of the unsorted data. + * \param last Input iterator that points past the last element of the unsorted data. + * \param result Output iterator that points to the first element where the sorted data will go. + * \param conv Function object that converts the input type to an unsigned integral type. + * \param min The smallest value present in the input. + * \param max The largest value present in the input. 
+ */ + template + BOOST_CONCEPT_REQUIRES(((BidirectionalIterator)) + ((Mutable_RandomAccessIterator)) + ((UnsignedInteger::value_type)>::type>)), + (Output)) + stable_radix_sort(Input first, Input last, Output result, Conversion conv, + typename std::result_of::value_type)>::type const min, + typename std::result_of::value_type)>::type const max, + unsigned const radix = 8u) + { + typedef typename std::iterator_traits::value_type value_type; + typedef typename std::result_of::value_type)>::type uint_type; + + if(first != last) + { + assert(max >= min); + if(std::next(first) == last) + *result++ = *first; + else + { + char unsigned const bits(sizeof(uint_type) * 8), + digits(ceil(static_cast(bits) / static_cast(radix))); + + assert(radix * digits >= bits); + + if(digits == 1) + stable_counting_sort(first, last, result, conv, min, max); + else + { + std::vector tmp1(first, last); + uint_type const dk = (uint_type(1) << radix) - 1; // TODO: This can be improved. + // NOTE: Is there an easy way to utilize minimum here? 
+ + if(digits == 2) + { + stable_counting_sort(first, last, tmp1.begin(), conv, 0, dk, radix, 0); + stable_counting_sort(tmp1.begin(), tmp1.end(), result, conv, 0, dk, radix, 1); + } + else + { + std::vector tmp2(tmp1.size()); + for(unsigned i = 0; i < digits; i++) + { + stable_counting_sort(tmp1.begin(), tmp1.end(), tmp2.begin(), conv, 0, dk, radix, i); + std::swap(tmp1, tmp2); + } + + std::copy(tmp1.begin(), tmp1.end(), result); + } + } + } + } + return result; + } + + + template + BOOST_CONCEPT_REQUIRES(((BidirectionalIterator)) + ((Mutable_RandomAccessIterator)) + ((UnsignedInteger::value_type)>::type>)), + (Output)) + stable_radix_sort(Input first, Input last, Output result, Conversion conv) + { + if(first != last) + { + auto const bound(std::minmax_element(first, last)); + return stable_radix_sort(first, last, result, conv, conv(*bound.first), conv(*bound.second)); + } + else + return result; + } + + + template + BOOST_CONCEPT_REQUIRES(((BidirectionalIterator)) + ((Mutable_RandomAccessIterator)) + ((UnsignedInteger::value_type>)), + (Output)) + stable_radix_sort(Input first, Input last, Output result) + { + return stable_radix_sort(first, last, result, transformation::identity::value_type>()); + } +} +} +#endif diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index bb4dff653..a4c30e387 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -65,6 +65,8 @@ alias unit_test_framework [ run gather_test1.cpp unit_test_framework : : : : gather_test1 ] [ compile-fail gather_fail1.cpp ] - ; +# Integer sort tests + [ run stable_counting_test.cpp unit_test_framework : : : : stable_counting_test ] + [ run stable_radix_test.cpp unit_test_framework : : : : stable_radix_test ] + ; } - diff --git a/test/stable_counting_test.cpp b/test/stable_counting_test.cpp new file mode 100644 index 000000000..e06432bbe --- /dev/null +++ b/test/stable_counting_test.cpp @@ -0,0 +1,188 @@ + +#define BOOST_TEST_MAIN +#include + +#include +#include +#include +#include + +#include +#include 
+#include +#include +#include +#include +#include + +using namespace boost::algorithm; + +typedef boost::mpl::list all_unsigned_types; + + +BOOST_AUTO_TEST_CASE_TEMPLATE(empty, T, all_unsigned_types) +{ + // So empty, it would be a crime to dereference it. + T *const p = NULL, * const result = NULL; + stable_counting_sort(p, p, result); + // No BOOST_REQUIRE... we just make sure the program does not crash/throw. +} + + +template +struct count_one +{ + std::vector result; + + count_one() + { + result.resize(1); + } + + template + void operator()(Container) + { + result[0] = 9; + Container const c(1, 0); + stable_counting_sort(c.begin(), c.end(), result.begin()); + BOOST_CHECK(std::equal(result.begin(), result.end(), c.begin())); + } +}; + + +/* n == 1 is a special case; fundamentally, there is no problem, the data is + * already in order. + */ +BOOST_AUTO_TEST_CASE_TEMPLATE(n_equals_one, T, all_unsigned_types) +{ + typedef typename boost::mpl::transform, std::deque, std::list >, boost::mpl::apply1 >::type sequence_containers; + boost::mpl::for_each(count_one()); +} + + +template +class one_1000_foo +{ +private: + std::vector input; + std::vector result; + +public: + one_1000_foo(std::vector const &input) : input(input) + { + result.resize(input.size()); + } + + template + void operator()(Container) + { + std::fill(result.begin(), result.end(), 9); + Container const c(input.begin(), input.end()); + stable_counting_sort(c.begin(), c.end(), result.begin()); + BOOST_CHECK(std::equal(c.begin(), c.end(), result.begin())); + } +}; + + +BOOST_AUTO_TEST_CASE_TEMPLATE(one_1000, T, all_unsigned_types) +{ + typedef typename boost::mpl::transform, std::deque, std::list >, boost::mpl::apply1 >::type test_containers; + + unsigned int const n = 1000; + std::vector const input(n, 1); + + boost::mpl::for_each(one_1000_foo(input)); +} + + +template +struct random_k +{ + std::vector input; + + random_k(T k, unsigned n) + { + boost::random::mt19937 gen(0); + 
boost::random::uniform_int_distribution dist(0, k); + std::generate_n(std::back_inserter(input), n, boost::bind(dist, gen)); + } + + template + void operator()(Container) + { + Container const c(input.begin(), input.end()); + std::vector result1; + result1.resize(input.size()); + stable_counting_sort(c.begin(), c.end(), result1.begin()); + std::vector result2(input.begin(), input.end()); + std::stable_sort(result2.begin(), result2.end()); + BOOST_CHECK(std::equal(result1.begin(), result1.end(), result2.begin())); + } +}; + + +BOOST_AUTO_TEST_CASE_TEMPLATE(random_small_k, T, all_unsigned_types) +{ + typedef typename boost::mpl::transform, std::deque, std::list >, boost::mpl::apply1 >::type test_containers; + + T const k = ~T(0); + unsigned n = 1000; + try + { + boost::mpl::for_each(random_k(k, n)); + } + catch(std::bad_alloc const &e) + { + BOOST_TEST_MESSAGE("std::bad_alloc thrown but not a failure."); + } +} + + +template +struct convert +{ + struct foo + { + T key; + + foo() {} + foo(T key) : key(key) {} + + operator T() const { return key; } + }; + + std::vector input; + + convert(unsigned n) + { + boost::random::mt19937 gen(0); + boost::random::uniform_int_distribution dist(0, ~T(0)); + std::generate_n(std::back_inserter(input), n, boost::bind(dist, gen)); + } + + template + void operator()(Container) + { + Container const c(input.begin(), input.end()); + std::vector result1; + result1.resize(input.size()); + stable_counting_sort(c.begin(), c.end(), result1.begin(), transformation::implicit()); + std::vector result2(input.begin(), input.end()); + std::stable_sort(result2.begin(), result2.end()); + BOOST_CHECK(std::equal(result1.begin(), result1.end(), result2.begin())); + } +}; + + +BOOST_AUTO_TEST_CASE_TEMPLATE(conversion, T, all_unsigned_types) +{ + typedef typename boost::mpl::transform, std::deque, std::list >, boost::mpl::apply1::foo> >::type test_containers; + try + { + boost::mpl::for_each(convert(1000)); + } + catch(std::bad_alloc const &e) + { + 
BOOST_TEST_MESSAGE("std::bad_alloc thrown but not a failure."); + } +} diff --git a/test/stable_radix_test.cpp b/test/stable_radix_test.cpp new file mode 100644 index 000000000..88e389666 --- /dev/null +++ b/test/stable_radix_test.cpp @@ -0,0 +1,175 @@ + +#define BOOST_TEST_MAIN +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace boost::algorithm; + +typedef boost::mpl::list all_unsigned_types; + + +BOOST_AUTO_TEST_CASE_TEMPLATE(empty, T, all_unsigned_types) +{ + // So empty, it would be a crime to dereference it. + T *const p = NULL, * const result = NULL; + stable_radix_sort(p, p, result); + // No BOOST_REQUIRE... we just make sure the program does not crash/throw. +} + + +template +struct count_one +{ + std::vector result; + + count_one() + { + result.resize(1); + } + + template + void operator()(Container) + { + result[0] = 9; + Container const c(1, 0); + stable_radix_sort(c.begin(), c.end(), result.begin()); + BOOST_CHECK(std::equal(result.begin(), result.end(), c.begin())); + } +}; + + +/* n == 1 is a special case; fundamentally, there is no problem, the data is + * already in order. 
+ */ +BOOST_AUTO_TEST_CASE_TEMPLATE(n_equals_one, T, all_unsigned_types) +{ + typedef typename boost::mpl::transform, std::deque, std::list >, boost::mpl::apply1 >::type sequence_containers; + boost::mpl::for_each(count_one()); +} + + +template +class one_1000_foo +{ +private: + std::vector input; + std::vector result; + +public: + one_1000_foo(std::vector const &input) : input(input) + { + result.resize(input.size()); + } + + template + void operator()(Container) + { + std::fill(result.begin(), result.end(), 9); + Container const c(input.begin(), input.end()); + stable_radix_sort(c.begin(), c.end(), result.begin()); + BOOST_CHECK(std::equal(c.begin(), c.end(), result.begin())); + } +}; + + + +BOOST_AUTO_TEST_CASE_TEMPLATE(one_1000, T, all_unsigned_types) +{ + typedef typename boost::mpl::transform, std::deque, std::list >, boost::mpl::apply1 >::type test_containers; + + unsigned int const n = 1000; + std::vector const input(n, 1); + + boost::mpl::for_each(one_1000_foo(input)); +} + + +template +struct random_k +{ + std::vector input; + + random_k(T k, unsigned n) + { + boost::random::mt19937 gen(0); + boost::random::uniform_int_distribution dist(0, k); + std::generate_n(std::back_inserter(input), n, boost::bind(dist, gen)); + } + + template + void operator()(Container) + { + Container const c(input.begin(), input.end()); + std::vector result1; + result1.resize(input.size()); + stable_radix_sort(c.begin(), c.end(), result1.begin()); + std::vector result2(input.begin(), input.end()); + std::stable_sort(result2.begin(), result2.end()); + BOOST_CHECK(std::equal(result1.begin(), result1.end(), result2.begin())); + } +}; + + +BOOST_AUTO_TEST_CASE_TEMPLATE(random_small_k, T, all_unsigned_types) +{ + typedef typename boost::mpl::transform, std::deque, std::list >, boost::mpl::apply1 >::type test_containers; + + T const k = ~T(0); + unsigned n = 1000; + boost::mpl::for_each(random_k(k, n)); +} + + +template +struct convert +{ + struct foo + { + T key; + + foo() {} + foo(T 
key) : key(key) {} + + operator T() const { return key; } + }; + + std::vector input; + + convert(unsigned n) + { + boost::random::mt19937 gen(0); + boost::random::uniform_int_distribution dist(0, ~T(0)); + std::generate_n(std::back_inserter(input), n, boost::bind(dist, gen)); + } + + template + void operator()(Container) + { + Container const c(input.begin(), input.end()); + std::vector result1; + result1.resize(input.size()); + stable_radix_sort(c.begin(), c.end(), result1.begin(), transformation::implicit()); + std::vector result2(input.begin(), input.end()); + std::stable_sort(result2.begin(), result2.end()); + BOOST_CHECK(std::equal(result1.begin(), result1.end(), result2.begin())); + } +}; + + +BOOST_AUTO_TEST_CASE_TEMPLATE(conversion, T, all_unsigned_types) +{ + typedef typename boost::mpl::transform, std::deque, std::list >, boost::mpl::apply1::foo> >::type test_containers; + boost::mpl::for_each(convert(1000)); +}