diff --git a/perf/CMakeLists.txt b/perf/CMakeLists.txt
index 9e604db0c..99dddfedf 100644
--- a/perf/CMakeLists.txt
+++ b/perf/CMakeLists.txt
@@ -117,11 +117,17 @@ if(${BOOST_COMPUTE_HAVE_CUDA})
   set(CUDA_BENCHMARKS
     thrust_accumulate
     thrust_count
+    thrust_exclusive_scan
     thrust_inner_product
+    thrust_merge
     thrust_partial_sum
+    thrust_partition
+    thrust_reverse
+    thrust_rotate
     thrust_saxpy
+    thrust_set_difference
     thrust_sort
-    thrust_exclusive_scan
+    thrust_unique
   )
 
   foreach(BENCHMARK ${CUDA_BENCHMARKS})
diff --git a/perf/perf.py b/perf/perf.py
index 94bf0d4f9..dafd49277 100755
--- a/perf/perf.py
+++ b/perf/perf.py
@@ -114,43 +114,55 @@ def run_benchmark(name, sizes, vs=[]):
         report.add_sample("compute", size, time)
 
     competitors = {
-        "thrust" : ["accumulate",
-                    "count",
-                    "inner_product",
-                    "partial_sum",
-                    "sort",
-                    "saxpy"],
-        "tbb": ["accumulate",
-                "merge",
-                "sort"],
-        "stl": ["accumulate",
-                "count",
-                "find_end",
-                "includes",
-                "inner_product",
-                "is_permutation",
-                "max_element",
-                "merge",
-                "next_permutation",
-                "nth_element",
-                "partial_sum",
-                "partition",
-                "partition_point",
-                "prev_permutation",
-                "reverse",
-                "rotate",
-                "rotate_copy",
-                "saxpy",
-                "search",
-                "search_n",
-                "set_difference",
-                "set_intersection",
-                "set_symmetric_difference",
-                "set_union",
-                "sort",
-                "stable_partition",
-                "unique",
-                "unique_copy"]
+        "thrust" : [
+            "accumulate",
+            "count",
+            "exclusive_scan",
+            "inner_product",
+            "merge",
+            "partial_sum",
+            "partition",
+            "reverse",
+            "rotate",
+            "saxpy",
+            "sort",
+            "unique"
+        ],
+        "tbb": [
+            "accumulate",
+            "merge",
+            "sort"
+        ],
+        "stl": [
+            "accumulate",
+            "count",
+            "find_end",
+            "includes",
+            "inner_product",
+            "is_permutation",
+            "max_element",
+            "merge",
+            "next_permutation",
+            "nth_element",
+            "partial_sum",
+            "partition",
+            "partition_point",
+            "prev_permutation",
+            "reverse",
+            "rotate",
+            "rotate_copy",
+            "saxpy",
+            "search",
+            "search_n",
+            "set_difference",
+            "set_intersection",
+            "set_symmetric_difference",
+            "set_union",
+            "sort",
+            "stable_partition",
+            "unique",
+            "unique_copy"
+        ]
     }
 
     for other in vs:
diff --git a/perf/perf_partition.cpp b/perf/perf_partition.cpp
index 311b473f7..68488d6b9 100644
--- a/perf/perf_partition.cpp
+++ b/perf/perf_partition.cpp
@@ -49,6 +49,10 @@ int main(int argc, char *argv[])
 
     perf_timer t;
     for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        boost::compute::copy(
+            host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+        );
+
         t.start();
         boost::compute::partition(
             device_vector.begin(), device_vector.end(), _1 < 10, queue
diff --git a/perf/perf_set_difference.cpp b/perf/perf_set_difference.cpp
index c113e5a7f..8bb1cdc5d 100644
--- a/perf/perf_set_difference.cpp
+++ b/perf/perf_set_difference.cpp
@@ -36,39 +36,40 @@ int main(int argc, char *argv[])
     std::cout << "device: " << device.name() << std::endl;
 
     // create vectors of random numbers on the host
-    std::vector<int> host_vector(PERF_N);
-    std::generate(host_vector.begin(), host_vector.end(), rand_int);
-    std::sort(host_vector.begin(), host_vector.end());
-
-    std::vector<int> host_vector2(PERF_N);
-    std::generate(host_vector2.begin(), host_vector2.end(), rand_int);
-    std::sort(host_vector2.begin(), host_vector2.end());
+    std::vector<int> v1(std::floor(PERF_N / 2.0));
+    std::vector<int> v2(std::ceil(PERF_N / 2.0));
+    std::generate(v1.begin(), v1.end(), rand_int);
+    std::generate(v2.begin(), v2.end(), rand_int);
+    std::sort(v1.begin(), v1.end());
+    std::sort(v2.begin(), v2.end());
 
     // create vectors on the device and copy the data
-    boost::compute::vector<int> device_vector(PERF_N, context);
+    // size each device vector like its host input: no uninitialized tail gets compared
+    boost::compute::vector<int> gpu_v1(v1.size(), context);
+    boost::compute::vector<int> gpu_v2(v2.size(), context);
     boost::compute::copy(
-        host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+        v1.begin(), v1.end(), gpu_v1.begin(), queue
     );
-
-    boost::compute::vector<int> device_vector2(PERF_N, context);
     boost::compute::copy(
-        host_vector2.begin(), host_vector2.end(), device_vector2.begin(), queue
+        v2.begin(), v2.end(), gpu_v2.begin(), queue
     );
 
-    boost::compute::vector<int> result(PERF_N, context);
+    boost::compute::vector<int> gpu_v3(PERF_N, context);
+    boost::compute::vector<int>::iterator gpu_v3_end;
 
     perf_timer t;
     for(size_t trial = 0; trial < PERF_TRIALS; trial++){
         t.start();
-        boost::compute::set_difference(
-            device_vector.begin(), device_vector.begin(),
-            device_vector2.begin(), device_vector2.end(),
-            result.begin(), queue
+        gpu_v3_end = boost::compute::set_difference(
+            gpu_v1.begin(), gpu_v1.end(),
+            gpu_v2.begin(), gpu_v2.end(),
+            gpu_v3.begin(), queue
         );
         queue.finish();
         t.stop();
     }
     std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "size: " << std::distance(gpu_v3.begin(), gpu_v3_end) << std::endl;
 
     return 0;
 }
diff --git a/perf/perf_stl_set_difference.cpp b/perf/perf_stl_set_difference.cpp
index 7172f243c..bf436b48a 100644
--- a/perf/perf_stl_set_difference.cpp
+++ b/perf/perf_stl_set_difference.cpp
@@ -25,21 +25,22 @@ int main(int argc, char *argv[])
 
     std::cout << "size: " << PERF_N << std::endl;
 
-    std::vector<int> v1(PERF_N);
-    std::generate(v1.begin(), v1.end(), rand_int);
+    std::vector<int> v1(std::floor(PERF_N / 2.0));
+    std::vector<int> v2(std::ceil(PERF_N / 2.0));
 
-    std::vector<int> v2(PERF_N);
+    std::generate(v1.begin(), v1.end(), rand_int);
     std::generate(v2.begin(), v2.end(), rand_int);
 
-    std::vector<int> v3(PERF_N);
-
     std::sort(v1.begin(), v1.end());
     std::sort(v2.begin(), v2.end());
 
+    std::vector<int> v3(PERF_N);
+    std::vector<int>::iterator v3_end;
+
     perf_timer t;
     for(size_t trial = 0; trial < PERF_TRIALS; trial++){
         t.start();
-        std::set_difference(
+        v3_end = std::set_difference(
             v1.begin(), v1.end(),
             v2.begin(), v2.end(),
             v3.begin()
@@ -47,6 +48,7 @@ int main(int argc, char *argv[])
         t.stop();
     }
     std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "size: " << std::distance(v3.begin(), v3_end) << std::endl;
 
     return 0;
 }
diff --git a/perf/perf_thrust_merge.cu b/perf/perf_thrust_merge.cu
new file mode 100644
index 000000000..4bded4674
--- /dev/null
+++ b/perf/perf_thrust_merge.cu
@@ -0,0 +1,63 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://kylelutz.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/merge.h>
+#include <thrust/sort.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // two sorted host inputs of floor(PERF_N / 2) and ceil(PERF_N / 2) random values
+    thrust::host_vector<int> host_lhs(std::floor(PERF_N / 2.0));
+    thrust::host_vector<int> host_rhs(std::ceil(PERF_N / 2.0));
+    std::generate(host_lhs.begin(), host_lhs.end(), rand);
+    std::generate(host_rhs.begin(), host_rhs.end(), rand);
+    std::sort(host_lhs.begin(), host_lhs.end());
+    std::sort(host_rhs.begin(), host_rhs.end());
+
+    // device copies of the inputs plus a PERF_N-element output buffer
+    thrust::device_vector<int> dev_lhs = host_lhs;
+    thrust::device_vector<int> dev_rhs = host_rhs;
+    thrust::device_vector<int> dev_merged(PERF_N);
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        thrust::merge(
+            dev_lhs.begin(), dev_lhs.end(), dev_rhs.begin(), dev_rhs.end(), dev_merged.begin()
+        );
+        cudaDeviceSynchronize(); // synchronize before stopping the timer
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // compare the device result against std::merge run on the host inputs
+    thrust::host_vector<int> expected(PERF_N);
+    std::merge(
+        host_lhs.begin(), host_lhs.end(), host_rhs.begin(), host_rhs.end(), expected.begin()
+    );
+    thrust::host_vector<int> actual = dev_merged;
+    if(!std::equal(actual.begin(), actual.end(), expected.begin())){
+        std::cerr << "ERROR: merged ranges different" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/perf/perf_thrust_partition.cu b/perf/perf_thrust_partition.cu
new file mode 100644
index 000000000..d259ec52e
--- /dev/null
+++ b/perf/perf_thrust_partition.cu
@@ -0,0 +1,59 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://kylelutz.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/partition.h>
+
+#include "perf.hpp"
+
+int rand_int() // scales rand() / RAND_MAX by 25.0 and converts the result to int
+{
+    return static_cast<int>(25.0 * (rand() / static_cast<double>(RAND_MAX)));
+}
+
+struct less_than_ten : public thrust::unary_function<int, bool> // <Argument, Result>
+{
+    __device__ bool operator()(int x) const
+    {
+        return x < 10; // predicate handed to thrust::partition(): true when x is below 10
+    }
+};
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // host-side input: PERF_N values produced by rand_int()
+    thrust::host_vector<int> input(PERF_N);
+    std::generate(input.begin(), input.end(), rand_int);
+
+    thrust::device_vector<int> work(PERF_N);
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        // re-upload the host data (untimed) so each trial partitions the same input
+        work = input;
+
+        timer.start();
+        thrust::partition(work.begin(), work.end(), less_than_ten());
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/perf/perf_thrust_reverse.cu b/perf/perf_thrust_reverse.cu
new file mode 100644
index 000000000..bf5f9053d
--- /dev/null
+++ b/perf/perf_thrust_reverse.cu
@@ -0,0 +1,47 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://kylelutz.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/reverse.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    thrust::host_vector<int> host_data = generate_random_vector<int>(PERF_N);
+
+    // device buffer; filled from host_data at the top of every trial
+    thrust::device_vector<int> dev_data;
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        dev_data = host_data; // untimed upload
+
+        timer.start();
+        thrust::reverse(dev_data.begin(), dev_data.end());
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    // copy the device contents back into the host vector
+    thrust::copy(dev_data.begin(), dev_data.end(), host_data.begin());
+
+    return 0;
+}
diff --git a/perf/perf_thrust_rotate.cu b/perf/perf_thrust_rotate.cu
new file mode 100644
index 000000000..b9ee36dd2
--- /dev/null
+++ b/perf/perf_thrust_rotate.cu
@@ -0,0 +1,50 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://kylelutz.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+#include "perf.hpp"
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+    thrust::host_vector<int> h_vec = generate_random_vector<int>(PERF_N);
+
+    // device buffer; re-filled from h_vec at the start of every trial
+    thrust::device_vector<int> d_vec;
+
+    const size_t rotate_distance = PERF_N / 2;
+
+    perf_timer t;
+    for(size_t trial = 0; trial < PERF_TRIALS; trial++){
+        d_vec = h_vec;
+
+        t.start();
+        // no thrust::rotate(): save the head, shift the tail to the front, then put
+        // the head at end() - rotate_distance (the right spot for odd PERF_N too)
+        thrust::device_vector<int> tmp(d_vec.begin(), d_vec.begin() + rotate_distance);
+        thrust::copy(d_vec.begin() + rotate_distance, d_vec.end(), d_vec.begin());
+        thrust::copy(tmp.begin(), tmp.end(), d_vec.end() - rotate_distance);
+        cudaDeviceSynchronize();
+        t.stop();
+    }
+    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
+
+    // transfer data back to host
+    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
+
+    return 0;
+}
diff --git a/perf/perf_thrust_set_difference.cu b/perf/perf_thrust_set_difference.cu
new file mode 100644
index 000000000..6b23b61a2
--- /dev/null
+++ b/perf/perf_thrust_set_difference.cu
@@ -0,0 +1,61 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://kylelutz.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <iostream>
+#include <iterator>
+#include <algorithm>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/set_operations.h>
+#include <thrust/sort.h>
+
+#include "perf.hpp"
+
+int rand_int() // scales rand() / RAND_MAX by 25.0 and converts the result to int
+{
+    return static_cast<int>(25.0 * (rand() / static_cast<double>(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    // two sorted host inputs of floor(PERF_N / 2) and ceil(PERF_N / 2) elements
+    thrust::host_vector<int> host_a(std::floor(PERF_N / 2.0));
+    thrust::host_vector<int> host_b(std::ceil(PERF_N / 2.0));
+    std::generate(host_a.begin(), host_a.end(), rand_int);
+    std::generate(host_b.begin(), host_b.end(), rand_int);
+    std::sort(host_a.begin(), host_a.end());
+    std::sort(host_b.begin(), host_b.end());
+
+    // device copies of both inputs and a PERF_N-element output buffer
+    thrust::device_vector<int> dev_a = host_a;
+    thrust::device_vector<int> dev_b = host_b;
+    thrust::device_vector<int> dev_out(PERF_N);
+
+    // end of the output range, as returned by the most recent timed call
+    thrust::device_vector<int>::iterator dev_out_end;
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        timer.start();
+        dev_out_end = thrust::set_difference(
+            dev_a.begin(), dev_a.end(), dev_b.begin(), dev_b.end(), dev_out.begin()
+        );
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+    std::cout << "size: " << thrust::distance(dev_out.begin(), dev_out_end) << std::endl;
+
+    return 0;
+}
diff --git a/perf/perf_thrust_unique.cu b/perf/perf_thrust_unique.cu
new file mode 100644
index 000000000..2fd7f6e95
--- /dev/null
+++ b/perf/perf_thrust_unique.cu
@@ -0,0 +1,49 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://kylelutz.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <cstdlib>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/host_vector.h>
+#include <thrust/unique.h>
+
+#include "perf.hpp"
+
+int rand_int() // scales rand() / RAND_MAX by 25.0 and converts the result to int
+{
+    return static_cast<int>(25.0 * (rand() / static_cast<double>(RAND_MAX)));
+}
+
+int main(int argc, char *argv[])
+{
+    perf_parse_args(argc, argv);
+    std::cout << "size: " << PERF_N << std::endl;
+
+    thrust::host_vector<int> source(PERF_N);
+    std::generate(source.begin(), source.end(), rand_int);
+    thrust::device_vector<int> scratch(PERF_N);
+
+    perf_timer timer;
+    for(size_t trial = 0; trial < PERF_TRIALS; ++trial){
+        // untimed: restore the original values before each unique() pass
+        scratch = source;
+
+        timer.start();
+        thrust::unique(scratch.begin(), scratch.end());
+        cudaDeviceSynchronize();
+        timer.stop();
+    }
+    std::cout << "time: " << timer.min_time() / 1e6 << " ms" << std::endl;
+
+    return 0;
+}
diff --git a/perf/perfdoc.py b/perf/perfdoc.py
index a3c9cd14d..2b53092b6 100755
--- a/perf/perfdoc.py
+++ b/perf/perfdoc.py
@@ -46,7 +46,6 @@ def plot_to_file(report, filename):
         "reverse",
         "rotate",
         "saxpy",
-        "set_difference",
         "sort",
         "unique",
     ]