diff --git a/doc/math.qbk b/doc/math.qbk index 71da94bf04..2909ea3a60 100644 --- a/doc/math.qbk +++ b/doc/math.qbk @@ -159,6 +159,7 @@ and use the function's name as the link text.] [def __double_factorial [link math_toolkit.factorials.sf_double_factorial double_factorial]] [def __rising_factorial [link math_toolkit.factorials.sf_rising_factorial rising_factorial]] [def __falling_factorial [link math_toolkit.factorials.sf_falling_factorial falling_factorial]] +[def __binomial_coefficient [link math_toolkit.factorials.sf_binomial binomial_coefficient]] [/error functions] [def __erf [link math_toolkit.sf_erf.error_function erf]] @@ -185,6 +186,7 @@ and use the function's name as the link text.] [def __ellint_rf [link math_toolkit.ellint.ellint_carlson ellint_rf]] [def __ellint_rc [link math_toolkit.ellint.ellint_carlson ellint_rc]] [def __ellint_rd [link math_toolkit.ellint.ellint_carlson ellint_rd]] +[def __ellint_rg [link math_toolkit.ellint.ellint_carlson ellint_rg]] [def __ellint_1 [link math_toolkit.ellint.ellint_1 ellint_1]] [def __ellint_2 [link math_toolkit.ellint.ellint_2 ellint_2]] [def __ellint_3 [link math_toolkit.ellint.ellint_3 ellint_3]] @@ -240,6 +242,8 @@ and use the function's name as the link text.] [/sinus cardinals] [def __sinc_pi [link math_toolkit.sinc.sinc_pi sinc_pi]] [def __sinhc_pi [link math_toolkit.sinc.sinhc_pi sinhc_pi]] +[def __sin_pi [link math_toolkit.powers.sin_pi sin_pi]] +[def __cos_pi [link math_toolkit.powers.cos_pi cos_pi]] [/Inverse hyperbolics] [def __acosh [link math_toolkit.inv_hyper.acosh acosh]] @@ -260,6 +264,9 @@ and use the function's name as the link text.] 
[def __nextafter [link math_toolkit.next_float.nextafter nextafter]] [def __float_advance [link math_toolkit.next_float.float_advance float_advance]] [def __ulp [link math_toolkit.next_float.ulp ulp]] +[def __signbit [link math_toolkit.sign_functions signbit]] +[def __sign [link math_toolkit.sign_functions sign]] +[def __changesign [link math_toolkit.sign_functions changesign]] [/powers etc] [def __expm1 [link math_toolkit.powers.expm1 expm1]] @@ -324,7 +331,7 @@ and use the function's name as the link text.] [def __algorithms [link math_toolkit.dist_ref.dist_algorithms algorithms]] [/ distribution def names end in distrib to avoid clashes] -[def __arcsine_distrib [link math_toolkit.dist_ref.dists.arcsine_dist Arcsine Distribution]] +[def __arcsine_distrib [link math_toolkit.dist_ref.dists.arcsine_dist Arcsine Distribution]] [def __beta_distrib [link math_toolkit.dist_ref.dists.beta_dist Beta Distribution]] [def __binomial_distrib [link math_toolkit.dist_ref.dists.binomial_dist Binomial Distribution]] [def __cauchy_distrib [link math_toolkit.dist_ref.dists.cauchy_dist Cauchy Distribution]] @@ -351,8 +358,11 @@ and use the function's name as the link text.] 
[def __normal_distrib [link math_toolkit.dist_ref.dists.normal_dist Normal Distribution]] [def __poisson_distrib [link math_toolkit.dist_ref.dists.poisson_dist Poisson Distribution]] [def __pareto_distrib [link math_toolkit.dist_ref.dists.pareto Pareto Distribution]] +[def __rayleigh_distrib [link math_toolkit.dist_ref.dists.rayleigh Rayleigh Distribution]] [def __students_t_distrib [link math_toolkit.dist_ref.dists.students_t_dist Students t Distribution]] [def __skew_normal_distrib [link math_toolkit.dist_ref.dists.skew_normal_dist Skew Normal Distribution]] +[def __triangular_distrib [link math_toolkit.dist_ref.dists.triangular_dist Triangular Distribution]] +[def __uniform_distrib [link math_toolkit.dist_ref.dists.uniform_dist Uniform Distribution]] [def __weibull_distrib [link math_toolkit.dist_ref.dists.weibull_dist Weibull Distribution]] [/links to policy] @@ -487,6 +497,7 @@ and as a CD ISBN 0-9504833-2-X 978-0-9504833-2-0, Classification 519.2-dc22. [include overview/structure.qbk] [/getting about, directory and file structure.] [include overview/result_type_calc.qbk] [include overview/error_handling.qbk] +[include overview/cuda.qbk] [section:compilers_overview Compilers] [compilers_overview] @@ -637,6 +648,7 @@ and as a CD ISBN 0-9504833-2-X 978-0-9504833-2-0, Classification 519.2-dc22. [include quadrature/gauss_kronrod.qbk] [include quadrature/double_exponential.qbk] [include quadrature/naive_monte_carlo.qbk] +[include quadrature/cuda_naive_monte_carlo.qbk] [include differentiation/numerical_differentiation.qbk] [endmathpart] diff --git a/doc/overview/cuda.qbk b/doc/overview/cuda.qbk new file mode 100644 index 0000000000..c4012f97fb --- /dev/null +++ b/doc/overview/cuda.qbk @@ -0,0 +1,57 @@ +[section:cuda CUDA support] + +This library does not (yet) use CUDA internally, however there are a number of functions which are +marked up for use on CUDA devices. 
 +
+Currently nearly all of our internal helper functions, plus the mathematical __constants and the __boost_math_fp
+are marked as safe for use on CUDA devices.
+
+In addition the following special functions are all safe to use in CUDA device code:
+
+__beta, __ibeta_derivative, __binomial_coefficient, __erf, __erfc, __erf_inv, __erfc_inv,
+__ellint_rf, __ellint_rd, __ellint_rc, __ellint_rj, __ellint_rg, __ellint_1, __ellint_2, __ellint_3,
+__ellint_d, __jacobi_zeta, __heuman_lambda, __factorial, __unchecked_factorial, __double_factorial,
+__falling_factorial, __rising_factorial, __tgamma, __tgamma1pm1, __lgamma, __tgamma_lower,
+__gamma_q, __gamma_p, __tgamma_delta_ratio, __tgamma_ratio, __gamma_p_derivative, __cbrt, __log1p,
+__expm1, __powm1, __sinc_pi, __sinhc_pi, __asinh, __acosh, __atanh, __sin_pi, __cos_pi,
+__fpclassify, __isnan, __isinf, __isnormal, __isfinite, __signbit, __sign, __changesign, __pow,
+__nextafter, __float_next, __float_prior, __float_distance, __float_advance. 
+ +The following distribution functions are also supported: + +[table +[[Distribution][pdf][cdf][quantile]] +[[__arcsine_distrib][Yes][Yes][Yes]] +[[__cauchy_distrib][Yes][Yes][Yes]] +[[__chi_squared_distrib][Yes][Yes][No]] +[[__extreme_distrib][Yes][Yes][Yes]] +[[__exp_distrib][Yes][Yes][Yes]] +[[__gamma_distrib][Yes][Yes][No]] +[[__geometric_distrib][Yes][Yes][No]] +[[__inverse_chi_squared_distrib][Yes][Yes][No]] +[[__inverse_gamma_distrib][Yes][Yes][No]] +[[__inverse_gaussian_distrib][Yes][Yes][No]] +[[__laplace_distrib][Yes][Yes][Yes]] +[[__logistic_distrib][Yes][Yes][Yes]] +[[__lognormal_distrib][Yes][Yes][Yes]] +[[__normal_distrib][Yes][Yes][Yes]] +[[__pareto_distrib][Yes][Yes][Yes]] +[[__poisson_distrib][Yes][Yes][No]] +[[__rayleigh_distrib][Yes][Yes][Yes]] +[[__triangular_distrib][Yes][Yes][Yes]] +[[__uniform_distrib][Yes][Yes][Yes]] +[[__weibull_distrib][Yes][Yes][Yes]] +] + +If you see something that's not listed, then either we haven't got round to it yet, or else +the function is too large for a CUDA device (the incomplete beta for example) or depends on +non-CUDA usable standard library code. Pull requests or issues are welcome. + +[endsect] + +[/ cuda.qbk + Copyright 2018 John Maddock. + Distributed under the Boost Software License, Version 1.0. + (See accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt). +] diff --git a/doc/quadrature/cuda_naive_monte_carlo.qbk b/doc/quadrature/cuda_naive_monte_carlo.qbk new file mode 100644 index 0000000000..074f1c7a7f --- /dev/null +++ b/doc/quadrature/cuda_naive_monte_carlo.qbk @@ -0,0 +1,119 @@ +[/ +Copyright (c) 2018 Nick Thompson +Use, modification and distribution are subject to the +Boost Software License, Version 1.0. 
(See accompanying file +LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +] + + +[section:cuda_naive_monte_carlo CUDA Accelerated Naive Monte Carlo Integration] + +[heading Synopsis] + + #include <boost/math/quadrature/cuda_naive_monte_carlo.hpp> + namespace boost { namespace math { namespace quadrature { + + template <class F, class Real, class ThreadGen, class MasterGen = std::random_device> + struct cuda_naive_monte_carlo + { + public: + cuda_naive_monte_carlo( + const F& integrand, + std::vector<std::pair<Real, Real>> const & bounds, + const MasterGen& seed); + cuda_naive_monte_carlo( + const F& integrand, + std::vector<std::pair<Real, Real>> const & bounds); + + Real integrate( + Real error_request, + boost::uintmax_t calls_per_thread = 1024, + boost::uintmax_t max_calls_per_thread = 250000, + bool is_compensated = true); + + Real variance() const; + Real current_error_estimate() const; + uint64_t calls() const; + }; + }}} // namespaces + +[heading Description] + +The class `cuda_naive_monte_carlo` performs Monte-Carlo integration on a square integrable function /f/ on a domain [Omega]. +The theoretical background of Monte-Carlo integration is nicely discussed at [@https://en.wikipedia.org/wiki/Monte_Carlo_integration Wikipedia], +and as such will not be discussed here. +However, despite being "naive", +it is a mistake to assume that naive Monte-Carlo integration is not powerful, +as the simplicity of the method affords a robustness not easily provided by more sophisticated tools. +The multithreaded nature of the routine allows us to compute a large number of sample points with great speed, +and hence the slow convergence is mitigated by exploiting the full power of modern hardware. + +This class provides proof-of-principle CUDA-accelerated Monte-Carlo integration which will utilize all available +CUDA threads to provide a significant performance advantage over non-CUDA code. Since CUDA is used, only +types `float` and `double` are supported. 
Note that there are two random number generators specified in the +template type-list: type `ThreadGen` is the per-thread random number generator, and must be a type which is +usable on the CUDA device. These per-thread generators are initialized via random seeds generated by an object +of type `MasterGen`: this defaults to `std::random_device` but could equally be a pseudo-random number generator. +It is important though to ensure that the per-thread random generators do not become correlated, or generate +overlapping sequences. For this reason type `ThreadGen` should have a long repeat period, and at the very least +be a different type from `MasterGen`. + +A call to member function `integrate` performs the actual integration, and also controls how the CUDA device is used: + + Real integrate( + Real error_request, + boost::uintmax_t calls_per_thread = 1024, + boost::uintmax_t max_calls_per_thread = 250000, + bool is_compensated = true); + +The parameters are as follows: + +* `error_request`: the desired absolute error in the result. +* `calls_per_thread`: the number of calls to the integrand made by each thread in the first pass - +this first pass is used to get an estimate of the variance and calculate how many calls will +be required by the second pass to reach the desired error bound. +* `max_calls_per_thread`: the maximum number of calls of the integrand by each thread in any single CUDA +invocation. This parameter is used to prevent operating system timeouts from terminating the application. +For example on Windows if the CUDA accelerated display driver is unresponsive for more than 2 seconds, then +the application using it will simply be terminated. +* `is_compensated`: when `true` each thread will use Kahan-style compensated addition to ensure accuracy. When +`false` then regular addition will be used for improved performance. 
+ +For example: + + // Define a function to integrate: + auto g = [] __device__ (const double* x) + { + constexpr const double pi = boost::math::constants::pi<double>(); + constexpr const double A = 1.0 / (pi * pi * pi); + return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2])); + }; + std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } }; + double error_goal = 0.001; + cuda_naive_monte_carlo mc(g, bounds); + + double result = mc.integrate(error_goal); + + std::cout << "Integration result is: " << result << std::endl; + +First off, we define the function we wish to integrate. +This function must accept a `const Real*`, and return a `Real`. +In comparison to the regular non-CUDA code we lose a good deal of type safety +as CUDA deals in raw pointers, not vectors - so it's up to the client to +ensure that the number of elements accessed matches the number of dimensions passed +to the integrator's constructor. +Also note that we've used a lambda expression for the integrand: this currently requires +compilation with `-expt-extended-lambda`. + +Next, we define the domain of integration as a vector of pairs - in this case +the domain is [0, PI] over 3 dimensions. + + std::vector<std::pair<double, double>> bounds{ { 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() },{ 0, boost::math::constants::pi<double>() } }; + +The call + + cuda_naive_monte_carlo mc(g, bounds); + +creates an instance of the Monte-Carlo integrator, and the call to `integrate` then returns the result. 
+ +[endsect] diff --git a/doc/roots/elliptic_table_100_msvc_X86_SSE2.qbk b/doc/roots/elliptic_table_100_msvc_X86_SSE2.qbk index a5210e7fdf..915d4e393b 100644 --- a/doc/roots/elliptic_table_100_msvc_X86_SSE2.qbk +++ b/doc/roots/elliptic_table_100_msvc_X86_SSE2.qbk @@ -13,8 +13,8 @@ Compiled in optimise mode., _X86_SSE2] [table:elliptic root with radius 28 and arc length 300) for float, double, long double and cpp_bin_float_50 types, using _X86_SSE2 [[][float][][][] [][double][][][] [][long d][][][] [][cpp50][][]] [[Algo ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ]] -[[TOMS748 ][ 6][ 906][2.07][ 0][ ][ 9][ 1312][1.79][ 1][ ][ 9][ 1281][1.75][ 1][ ][ 11][1690625][1.52][ -3][ ]] -[[Newton ][ 3][ 640][1.46][ -1][ ][ 4][ 875][1.19][ 1][ ][ 4][ 843][1.15][ 1][ ][ 5][1368750][1.23][ 0][ ]] -[[Halley ][ 2][ 437][[role blue 1.00]][ 0][ ][ 3][ 734][[role blue 1.00]][ 3][ ][ 3][ 734][[role blue 1.00]][ 3][ ][ 4][1109375][[role blue 1.00]][ 0][ ]] -[[Schr'''ö'''der][ 3][ 671][1.54][ -1][ ][ 6][ 1296][1.77][ 1][ ][ 6][ 1406][1.92][ 1][ ][ 5][1462500][1.32][ -2][ ]] +[[TOMS748 ][ 5][ 656][1.36][ -1][ ][ 9][ 1125][2.12][ 1][ ][ 9][ 1000][1.83][ 1][ ][ 11][881250][1.52][ -3][ ]] +[[Newton ][ 3][ 656][1.36][ -1][ ][ 4][ 625][1.18][ 1][ ][ 4][ 640][1.17][ 1][ ][ 5][717187][1.24][ 0][ ]] +[[Halley ][ 2][ 484][[role blue 1.00]][ 0][ ][ 3][ 531][[role blue 1.00]][ 3][ ][ 3][ 546][[role blue 1.00]][ 3][ ][ 4][579687][[role blue 1.00]][ 0][ ]] +[[Schr'''ö'''der][ 3][ 703][1.45][ -1][ ][ 6][ 1031][1.94][ 1][ ][ 6][ 1015][1.86][ 1][ ][ 5][740625][1.28][ -2][ ]] ] [/end of table root] diff --git a/doc/roots/root_comparison_tables_msvc.qbk b/doc/roots/root_comparison_tables_msvc.qbk index 9a4846b0fc..7f411f3eae 100644 --- a/doc/roots/root_comparison_tables_msvc.qbk +++ b/doc/roots/root_comparison_tables_msvc.qbk @@ -11,9 +11,9 @@ http://www.boost.org/LICENSE_1_0.txt). 
[table:cbrt_4 Cube root(28) for float, double, long double and cpp_bin_float_50 [[][float][][][] [][double][][][] [][long d][][][] [][cpp50][][]] [[Algorithm][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ]] -[[cbrt ][ 0][78125][[role blue 1.0]][ 0][ ][ 0][62500][[role blue 1.0]][ 1][ ][ 0][93750][[role blue 1.0]][ 1][ ][ 0][11890625][1.1][ 0][ ]] -[[TOMS748 ][ 8][468750][[role red 6.0]][ -1][ ][ 11][906250][[role red 15.]][ 2][ ][ 11][906250][[role red 9.7]][ 2][ ][ 6][80859375][[role red 7.6]][ -2][ ]] -[[Newton ][ 5][203125][2.6][ 0][ ][ 6][234375][3.8][ 0][ ][ 6][187500][2.0][ 0][ ][ 2][10640625][[role blue 1.0]][ 0][ ]] -[[Halley ][ 3][234375][3.0][ 0][ ][ 4][265625][[role red 4.3]][ 0][ ][ 4][234375][2.5][ 0][ ][ 2][26250000][2.5][ 0][ ]] -[[Schr'''ö'''der][ 4][296875][3.8][ 0][ ][ 5][281250][[role red 4.5]][ 0][ ][ 5][234375][2.5][ 0][ ][ 2][32437500][3.0][ 0][ ]] +[[cbrt ][ 0][62500][[role blue 1.0]][ 0][ ][ 0][78125][[role blue 1.0]][ 1][ ][ 0][62500][[role blue 1.0]][ 1][ ][ 0][6375000][1.3][ 0][ ]] +[[TOMS748 ][ 8][421875][[role red 6.8]][ -1][ ][ 11][734375][[role red 9.4]][ 2][ ][ 11][687500][[role red 11.]][ 2][ ][ 6][39953125][[role red 8.2]][ -2][ ]] +[[Newton ][ 5][171875][2.8][ 0][ ][ 6][171875][2.2][ 0][ ][ 6][140625][2.3][ 0][ ][ 2][4859375][[role blue 1.0]][ 0][ ]] +[[Halley ][ 3][156250][2.5][ 0][ ][ 4][203125][2.6][ 0][ ][ 4][171875][2.8][ 0][ ][ 2][12296875][2.5][ 0][ ]] +[[Schr'''ö'''der][ 4][187500][3.0][ 0][ ][ 5][250000][3.2][ 0][ ][ 5][218750][3.5][ 0][ ][ 2][16406250][3.4][ 0][ ]] ] [/end of table cbrt_4] diff --git a/doc/roots/roots_table_100_msvc_X86_SSE2.qbk b/doc/roots/roots_table_100_msvc_X86_SSE2.qbk index dee6062ac5..deeba0f5b2 100644 --- a/doc/roots/roots_table_100_msvc_X86_SSE2.qbk +++ b/doc/roots/roots_table_100_msvc_X86_SSE2.qbk @@ -14,24 +14,24 @@ Fraction of full accuracy 1 [table:root_5 5th root(28) for float, double, long double and cpp_bin_float_50 types, using 
_X86_SSE2 [[][float][][][] [][double][][][] [][long d][][][] [][cpp50][][]] [[Algo ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ]] -[[TOMS748 ][ 7][ 457][2.00][ 0][ ][ 11][ 860][3.54][ 1][ ][ 11][ 806][3.02][ 1][ ][ 12][226875][[role red 8.11]][ 0][ ]] -[[Newton ][ 3][ 228][[role blue 1.00]][ 0][ ][ 4][ 243][[role blue 1.00]][ -1][ ][ 4][ 298][1.12][ -1][ ][ 6][27968][[role blue 1.00]][ 0][ ]] -[[Halley ][ 2][ 250][1.10][ 0][ ][ 3][ 268][1.10][ 0][ ][ 3][ 267][[role blue 1.00]][ 0][ ][ 4][52812][1.89][ 0][ ]] -[[Schr'''ö'''der][ 2][ 256][1.12][ 0][ ][ 3][ 271][1.12][ -1][ ][ 3][ 270][[role blue 1.01]][ -1][ ][ 4][61406][2.20][ 0][ ]] +[[TOMS748 ][ 7][ 296][1.76][ 0][ ][ 11][ 576][3.16][ 1][ ][ 11][ 568][3.17][ 1][ ][ 12][104531][[role red 7.78]][ 0][ ]] +[[Newton ][ 3][ 168][[role blue 1.00]][ 0][ ][ 4][ 182][[role blue 1.00]][ -1][ ][ 4][ 179][[role blue 1.00]][ -1][ ][ 6][13437][[role blue 1.00]][ 0][ ]] +[[Halley ][ 2][ 179][1.07][ 0][ ][ 3][ 204][1.12][ 0][ ][ 3][ 212][1.18][ 0][ ][ 4][23125][1.72][ 0][ ]] +[[Schr'''ö'''der][ 2][ 178][1.06][ 0][ ][ 3][ 218][1.20][ -1][ ][ 3][ 214][1.20][ -1][ ][ 4][28437][2.12][ 0][ ]] ] [/end of table root] [table:root_7 7th root(28) for float, double, long double and cpp_bin_float_50 types, using _X86_SSE2 [[][float][][][] [][double][][][] [][long d][][][] [][cpp50][][]] [[Algo ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ]] -[[TOMS748 ][ 12][ 825][3.06][ 1][ ][ 15][ 1145][[role red 4.06]][ 2][ ][ 15][ 1159][[role red 4.17]][ 2][ ][ 14][295781][[role red 8.12]][ 0][ ]] -[[Newton ][ 5][ 270][[role blue 1.00]][ 0][ ][ 6][ 282][[role blue 1.00]][ 0][ ][ 6][ 278][[role blue 1.00]][ 0][ ][ 8][36406][[role blue 1.00]][ 0][ ]] -[[Halley ][ 4][ 303][1.12][ 0][ ][ 5][ 329][1.17][ 0][ ][ 5][ 335][1.21][ 0][ ][ 6][78281][2.15][ 0][ ]] -[[Schr'''ö'''der][ 5][ 340][1.26][ 0][ ][ 6][ 432][1.53][ 0][ ][ 6][ 367][1.32][ 0][ 
][ 7][85156][2.34][ 0][ ]] +[[TOMS748 ][ 12][ 439][2.45][ 1][ ][ 15][ 715][3.65][ 2][ ][ 15][ 729][3.84][ 2][ ][ 14][126250][[role red 7.28]][ 0][ ]] +[[Newton ][ 5][ 179][[role blue 1.00]][ 0][ ][ 6][ 196][[role blue 1.00]][ 0][ ][ 6][ 190][[role blue 1.00]][ 0][ ][ 8][17343][[role blue 1.00]][ 0][ ]] +[[Halley ][ 4][ 200][1.12][ 0][ ][ 5][ 273][1.39][ 0][ ][ 5][ 229][1.21][ 0][ ][ 6][35468][2.05][ 0][ ]] +[[Schr'''ö'''der][ 5][ 229][1.28][ 0][ ][ 6][ 314][1.60][ 0][ ][ 6][ 275][1.45][ 0][ ][ 7][49375][2.85][ 0][ ]] ] [/end of table root] [table:root_11 11th root(28) for float, double, long double and cpp_bin_float_50 types, using _X86_SSE2 [[][float][][][] [][double][][][] [][long d][][][] [][cpp50][][]] [[Algo ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ][Its][Times][Norm][Dis][ ]] -[[TOMS748 ][ 12][ 714][3.16][ -2][ ][ 14][ 909][[role red 4.19]][ 2][ ][ 14][ 793][3.69][ 2][ ][ 17][211718][[role red 9.28]][ 2][ ]] -[[Newton ][ 6][ 226][[role blue 1.00]][ 0][ ][ 7][ 217][[role blue 1.00]][ 0][ ][ 7][ 215][[role blue 1.00]][ 0][ ][ 9][22812][[role blue 1.00]][ 0][ ]] -[[Halley ][ 4][ 262][1.16][ -1][ ][ 5][ 260][1.20][ 0][ ][ 5][ 260][1.21][ 0][ ][ 6][40781][1.79][ 0][ ]] -[[Schr'''ö'''der][ 6][ 332][1.47][ 0][ ][ 7][ 314][1.45][ 0][ ][ 7][ 310][1.44][ 0][ ][ 8][67187][2.95][ 0][ ]] +[[TOMS748 ][ 12][ 529][2.80][ -2][ ][ 14][ 734][3.50][ 2][ ][ 14][ 729][3.47][ 2][ ][ 17][190937][[role red 9.33]][ 2][ ]] +[[Newton ][ 6][ 189][[role blue 1.00]][ 0][ ][ 7][ 210][[role blue 1.00]][ 0][ ][ 7][ 210][[role blue 1.00]][ 0][ ][ 9][20468][[role blue 1.00]][ 0][ ]] +[[Halley ][ 4][ 204][1.08][ -1][ ][ 5][ 246][1.17][ 0][ ][ 5][ 246][1.17][ 0][ ][ 6][36562][1.79][ 0][ ]] +[[Schr'''ö'''der][ 6][ 245][1.30][ 0][ ][ 7][ 292][1.39][ 0][ ][ 7][ 292][1.39][ 0][ ][ 8][60625][2.96][ 0][ ]] ] [/end of table root] diff --git a/include/boost/math/bindings/rr.hpp b/include/boost/math/bindings/rr.hpp index 6ec79f953d..dbd5cb3673 100644 --- 
a/include/boost/math/bindings/rr.hpp +++ b/include/boost/math/bindings/rr.hpp @@ -395,8 +395,8 @@ inline RR tanh(RR a) *exp -= 1; r.value().e += 1; } - BOOST_ASSERT(r < 1); - BOOST_ASSERT(r >= 0.5); + BOOST_MATH_ASSERT(r < 1); + BOOST_MATH_ASSERT(r >= 0.5); return r; } inline RR ldexp(RR r, int exp) diff --git a/include/boost/math/complex/acos.hpp b/include/boost/math/complex/acos.hpp index a911756949..92c096c621 100644 --- a/include/boost/math/complex/acos.hpp +++ b/include/boost/math/complex/acos.hpp @@ -202,7 +202,7 @@ std::complex acos(const std::complex& z) // but we have no way to test that here, so for now just assert // on the assumption: // - BOOST_ASSERT(x == 1); + BOOST_MATH_ASSERT(x == 1); real = std::sqrt(y); imag = std::sqrt(y); } diff --git a/include/boost/math/complex/asin.hpp b/include/boost/math/complex/asin.hpp index 087c3b51ef..f1b07e869b 100644 --- a/include/boost/math/complex/asin.hpp +++ b/include/boost/math/complex/asin.hpp @@ -209,7 +209,7 @@ inline std::complex asin(const std::complex& z) // but we have no way to test that here, so for now just assert // on the assumption: // - BOOST_ASSERT(x == 1); + BOOST_MATH_ASSERT(x == 1); real = half_pi - std::sqrt(y); imag = std::sqrt(y); } diff --git a/include/boost/math/constants/constants.hpp b/include/boost/math/constants/constants.hpp index 8c5c4105d4..5daef01503 100644 --- a/include/boost/math/constants/constants.hpp +++ b/include/boost/math/constants/constants.hpp @@ -228,11 +228,11 @@ namespace boost{ namespace math constant_initializer::get_from_string >::force_instantiate();\ return get_from_string();\ }\ - static inline BOOST_CONSTEXPR T get(const mpl::int_) BOOST_NOEXCEPT\ + static inline BOOST_GPU_ENABLED BOOST_CONSTEXPR T get(const mpl::int_) BOOST_NOEXCEPT\ { return BOOST_JOIN(x, F); }\ - static inline BOOST_CONSTEXPR T get(const mpl::int_&) BOOST_NOEXCEPT\ + static inline BOOST_GPU_ENABLED BOOST_CONSTEXPR T get(const mpl::int_&) BOOST_NOEXCEPT\ { return x; }\ - static inline 
BOOST_CONSTEXPR T get(const mpl::int_&) BOOST_NOEXCEPT\ + static inline BOOST_GPU_ENABLED BOOST_CONSTEXPR T get(const mpl::int_&) BOOST_NOEXCEPT\ { return BOOST_JOIN(x, L); }\ BOOST_MATH_FLOAT128_CONSTANT_OVERLOAD(x) \ template static inline const T& get(const mpl::int_&)\ @@ -250,9 +250,9 @@ namespace boost{ namespace math \ \ /* The actual forwarding function: */ \ - template inline BOOST_CONSTEXPR typename detail::constant_return::type name(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T) BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(Policy)) BOOST_MATH_NOEXCEPT(T)\ + template inline BOOST_GPU_ENABLED BOOST_CONSTEXPR typename detail::constant_return::type name(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T) BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(Policy)) BOOST_MATH_NOEXCEPT(T)\ { return detail:: BOOST_JOIN(constant_, name)::get(typename construction_traits::type()); }\ - template inline BOOST_CONSTEXPR typename detail::constant_return::type name(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) BOOST_MATH_NOEXCEPT(T)\ + template inline BOOST_GPU_ENABLED BOOST_CONSTEXPR typename detail::constant_return::type name(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) BOOST_MATH_NOEXCEPT(T)\ { return name >(); }\ \ \ diff --git a/include/boost/math/distributions/arcsine.hpp b/include/boost/math/distributions/arcsine.hpp index 8aad5b2d0b..ac5d6e5658 100644 --- a/include/boost/math/distributions/arcsine.hpp +++ b/include/boost/math/distributions/arcsine.hpp @@ -54,7 +54,7 @@ namespace boost // Common error checking routines for arcsine distribution functions: // Duplicating for x_min and x_max provides specific error messages. 
template - inline bool check_x_min(const char* function, const RealType& x, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_x_min(const char* function, const RealType& x, RealType* result, const Policy& pol) { if (!(boost::math::isfinite)(x)) { @@ -67,7 +67,7 @@ namespace boost } // bool check_x_min template - inline bool check_x_max(const char* function, const RealType& x, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_x_max(const char* function, const RealType& x, RealType* result, const Policy& pol) { if (!(boost::math::isfinite)(x)) { @@ -81,11 +81,16 @@ namespace boost template - inline bool check_x_minmax(const char* function, const RealType& x_min, const RealType& x_max, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_x_minmax(const char* function, const RealType& x_min, const RealType& x_max, RealType* result, const Policy& pol) { // Check x_min < x_max if (x_min >= x_max) { - std::string msg = "x_max argument is %1%, but must be > x_min = " + lexical_cast(x_min) + "!"; +#ifdef __CUDA_ARCH__ + *result = policies::raise_domain_error( + function, + "x_max value %1% is out of range", x_max, pol); +#else + std::string msg = "x_max argument is %1%, but must be > x_min = " + lexical_cast(x_min) + "!"; *result = policies::raise_domain_error( function, msg.c_str(), x_max, pol); @@ -94,13 +99,14 @@ namespace boost // But would require replication of all helpers functions in /policies/error_handling.hpp for two values, // as well as two value versions of raise_error, raise_domain_error and do_format ... // so use slightly hacky lexical_cast to string instead. 
+#endif return false; } return true; } // bool check_x_minmax template - inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol) { if ((p < 0) || (p > 1) || !(boost::math::isfinite)(p)) { @@ -113,7 +119,7 @@ namespace boost } // bool check_prob template - inline bool check_x(const char* function, const RealType& x_min, const RealType& x_max, const RealType& x, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_x(const char* function, const RealType& x_min, const RealType& x_max, const RealType& x, RealType* result, const Policy& pol) { // Check x finite and x_min < x < x_max. if (!(boost::math::isfinite)(x)) { @@ -137,7 +143,7 @@ namespace boost } // bool check_x template - inline bool check_dist(const char* function, const RealType& x_min, const RealType& x_max, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_dist(const char* function, const RealType& x_min, const RealType& x_max, RealType* result, const Policy& pol) { // Check both x_min and x_max finite, and x_min < x_max. 
return check_x_min(function, x_min, result, pol) && check_x_max(function, x_max, result, pol) @@ -145,14 +151,14 @@ namespace boost } // bool check_dist template - inline bool check_dist_and_x(const char* function, const RealType& x_min, const RealType& x_max, RealType x, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_dist_and_x(const char* function, const RealType& x_min, const RealType& x_max, RealType x, RealType* result, const Policy& pol) { return check_dist(function, x_min, x_max, result, pol) && arcsine_detail::check_x(function, x_min, x_max, x, result, pol); } // bool check_dist_and_x template - inline bool check_dist_and_prob(const char* function, const RealType& x_min, const RealType& x_max, RealType p, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_dist_and_prob(const char* function, const RealType& x_min, const RealType& x_max, RealType p, RealType* result, const Policy& pol) { return check_dist(function, x_min, x_max, result, pol) && check_prob(function, p, result, pol); @@ -167,7 +173,7 @@ namespace boost typedef RealType value_type; typedef Policy policy_type; - arcsine_distribution(RealType x_min = 0, RealType x_max = 1) : m_x_min(x_min), m_x_max(x_max) + BOOST_GPU_ENABLED arcsine_distribution(RealType x_min = 0, RealType x_max = 1) : m_x_min(x_min), m_x_max(x_max) { // Default beta (alpha = beta = 0.5) is standard arcsine with x_min = 0, x_max = 1. // Generalized to allow x_min and x_max to be specified. RealType result; @@ -178,11 +184,11 @@ namespace boost &result, Policy()); } // arcsine_distribution constructor. 
// Accessor functions: - RealType x_min() const + BOOST_GPU_ENABLED RealType x_min() const { return m_x_min; } - RealType x_max() const + BOOST_GPU_ENABLED RealType x_max() const { return m_x_max; } @@ -197,21 +203,21 @@ namespace boost template - inline const std::pair range(const arcsine_distribution& dist) + inline BOOST_GPU_ENABLED const std::pair range(const arcsine_distribution& dist) { // Range of permissible values for random variable x. using boost::math::tools::max_value; return std::pair(static_cast(dist.x_min()), static_cast(dist.x_max())); } template - inline const std::pair support(const arcsine_distribution& dist) + inline BOOST_GPU_ENABLED const std::pair support(const arcsine_distribution& dist) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. return std::pair(static_cast(dist.x_min()), static_cast(dist.x_max())); } template - inline RealType mean(const arcsine_distribution& dist) + inline BOOST_GPU_ENABLED RealType mean(const arcsine_distribution& dist) { // Mean of arcsine distribution . RealType result; RealType x_min = dist.x_min(); @@ -230,7 +236,7 @@ namespace boost } // mean template - inline RealType variance(const arcsine_distribution& dist) + inline BOOST_GPU_ENABLED RealType variance(const arcsine_distribution& dist) { // Variance of standard arcsine distribution = (1-0)/8 = 0.125. RealType result; RealType x_min = dist.x_min(); @@ -248,7 +254,7 @@ namespace boost } // variance template - inline RealType mode(const arcsine_distribution& /* dist */) + inline BOOST_GPU_ENABLED RealType mode(const arcsine_distribution& /* dist */) { //There are always [*two] values for the mode, at ['x_min] and at ['x_max], default 0 and 1, // so instead we raise the exception domain_error. 
return policies::raise_domain_error( @@ -259,7 +265,7 @@ namespace boost } // mode template - inline RealType median(const arcsine_distribution& dist) + inline BOOST_GPU_ENABLED RealType median(const arcsine_distribution& dist) { // Median of arcsine distribution (a + b) / 2 == mean. RealType x_min = dist.x_min(); RealType x_max = dist.x_max(); @@ -277,7 +283,7 @@ namespace boost } template - inline RealType skewness(const arcsine_distribution& dist) + inline BOOST_GPU_ENABLED RealType skewness(const arcsine_distribution& dist) { RealType result; RealType x_min = dist.x_min(); @@ -296,7 +302,7 @@ namespace boost } // skewness template - inline RealType kurtosis_excess(const arcsine_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis_excess(const arcsine_distribution& dist) { RealType result; RealType x_min = dist.x_min(); @@ -316,7 +322,7 @@ namespace boost } // kurtosis_excess template - inline RealType kurtosis(const arcsine_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis(const arcsine_distribution& dist) { RealType result; RealType x_min = dist.x_min(); @@ -336,12 +342,12 @@ namespace boost } // kurtosis template - inline RealType pdf(const arcsine_distribution& dist, const RealType& xx) + inline BOOST_GPU_ENABLED RealType pdf(const arcsine_distribution& dist, const RealType& xx) { // Probability Density/Mass Function arcsine. BOOST_FPU_EXCEPTION_GUARD BOOST_MATH_STD_USING // For ADL of std functions. - static const char* function = "boost::math::pdf(arcsine_distribution<%1%> const&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(arcsine_distribution<%1%> const&, %1%)"; RealType lo = dist.x_min(); RealType hi = dist.x_max(); @@ -362,11 +368,11 @@ namespace boost } // pdf template - inline RealType cdf(const arcsine_distribution& dist, const RealType& x) + inline BOOST_GPU_ENABLED RealType cdf(const arcsine_distribution& dist, const RealType& x) { // Cumulative Distribution Function arcsine. 
BOOST_MATH_STD_USING // For ADL of std functions. - static const char* function = "boost::math::cdf(arcsine_distribution<%1%> const&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(arcsine_distribution<%1%> const&, %1%)"; RealType x_min = dist.x_min(); RealType x_max = dist.x_max(); @@ -395,10 +401,10 @@ namespace boost } // arcsine cdf template - inline RealType cdf(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { // Complemented Cumulative Distribution Function arcsine. BOOST_MATH_STD_USING // For ADL of std functions. - static const char* function = "boost::math::cdf(arcsine_distribution<%1%> const&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(arcsine_distribution<%1%> const&, %1%)"; RealType x = c.param; arcsine_distribution const& dist = c.dist; @@ -431,7 +437,7 @@ namespace boost } // arcine ccdf template - inline RealType quantile(const arcsine_distribution& dist, const RealType& p) + inline BOOST_GPU_ENABLED RealType quantile(const arcsine_distribution& dist, const RealType& p) { // Quantile or Percent Point arcsine function or // Inverse Cumulative probability distribution function CDF. @@ -445,7 +451,7 @@ namespace boost using boost::math::constants::half_pi; - static const char* function = "boost::math::quantile(arcsine_distribution<%1%> const&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(arcsine_distribution<%1%> const&, %1%)"; RealType result = 0; // of argument checks: RealType x_min = dist.x_min(); @@ -475,7 +481,7 @@ namespace boost } // quantile template - inline RealType quantile(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { // Complement Quantile or Percent Point arcsine function. // Return the number of expected x for a given @@ -483,7 +489,7 @@ namespace boost BOOST_MATH_STD_USING // For ADL of std functions. 
using boost::math::constants::half_pi; - static const char* function = "boost::math::quantile(arcsine_distribution<%1%> const&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(arcsine_distribution<%1%> const&, %1%)"; // Error checks: RealType q = c.param; diff --git a/include/boost/math/distributions/cauchy.hpp b/include/boost/math/distributions/cauchy.hpp index 5a3a64f0f2..74a052bf95 100644 --- a/include/boost/math/distributions/cauchy.hpp +++ b/include/boost/math/distributions/cauchy.hpp @@ -31,7 +31,7 @@ namespace detail { template -RealType cdf_imp(const cauchy_distribution& dist, const RealType& x, bool complement) +BOOST_GPU_ENABLED RealType cdf_imp(const cauchy_distribution& dist, const RealType& x, bool complement) { // // This calculates the cdf of the Cauchy distribution and/or its complement. @@ -55,7 +55,7 @@ RealType cdf_imp(const cauchy_distribution& dist, const RealTy // to get the result. // BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(cauchy<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(cauchy<%1%>&, %1%)"; RealType result = 0; RealType location = dist.location(); RealType scale = dist.scale(); @@ -67,6 +67,12 @@ RealType cdf_imp(const cauchy_distribution& dist, const RealTy { return result; } +#ifdef __CUDA_ARCH__ + if(x > tools::max_value()) + return static_cast((complement) ? 0 : 1); + if(x < -tools::max_value()) + return static_cast((complement) ? 1 : 0); +#else if(std::numeric_limits::has_infinity && x == std::numeric_limits::infinity()) { // cdf +infinity is unity. return static_cast((complement) ? 0 : 1); @@ -75,6 +81,7 @@ RealType cdf_imp(const cauchy_distribution& dist, const RealTy { // cdf -infinity is zero. return static_cast((complement) ? 
1 : 0); } +#endif if(false == detail::check_x(function, x, &result, Policy())) { // Catches x == NaN return result; @@ -89,7 +96,7 @@ RealType cdf_imp(const cauchy_distribution& dist, const RealTy } // cdf template -RealType quantile_imp( +BOOST_GPU_ENABLED RealType quantile_imp( const cauchy_distribution& dist, const RealType& p, bool complement) @@ -102,7 +109,7 @@ RealType quantile_imp( // mid-point of the distribution. This is either added or subtracted // from the location parameter depending on whether `complement` is true. // - static const char* function = "boost::math::quantile(cauchy<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(cauchy<%1%>&, %1%)"; BOOST_MATH_STD_USING // for ADL of std functions RealType result = 0; @@ -152,20 +159,20 @@ class cauchy_distribution typedef RealType value_type; typedef Policy policy_type; - cauchy_distribution(RealType l_location = 0, RealType l_scale = 1) + BOOST_GPU_ENABLED cauchy_distribution(RealType l_location = 0, RealType l_scale = 1) : m_a(l_location), m_hg(l_scale) { - static const char* function = "boost::math::cauchy_distribution<%1%>::cauchy_distribution"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cauchy_distribution<%1%>::cauchy_distribution"; RealType result; detail::check_location(function, l_location, &result, Policy()); detail::check_scale(function, l_scale, &result, Policy()); } // cauchy_distribution - RealType location()const + BOOST_GPU_ENABLED RealType location()const { return m_a; } - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { return m_hg; } @@ -178,7 +185,7 @@ class cauchy_distribution typedef cauchy_distribution cauchy; template -inline const std::pair range(const cauchy_distribution&) +inline BOOST_GPU_ENABLED const std::pair range(const cauchy_distribution&) { // Range of permissible values for random variable x. 
if (std::numeric_limits::has_infinity) { @@ -192,7 +199,7 @@ inline const std::pair range(const cauchy_distribution -inline const std::pair support(const cauchy_distribution& ) +inline BOOST_GPU_ENABLED const std::pair support(const cauchy_distribution& ) { // Range of supported values for random variable x. // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero. if (std::numeric_limits::has_infinity) @@ -207,11 +214,11 @@ inline const std::pair support(const cauchy_distribution -inline RealType pdf(const cauchy_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const cauchy_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(cauchy<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(cauchy<%1%>&, %1%)"; RealType result = 0; RealType location = dist.location(); RealType scale = dist.scale(); @@ -244,31 +251,31 @@ inline RealType pdf(const cauchy_distribution& dist, const Rea } // pdf template -inline RealType cdf(const cauchy_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const cauchy_distribution& dist, const RealType& x) { return detail::cdf_imp(dist, x, false); } // cdf template -inline RealType quantile(const cauchy_distribution& dist, const RealType& p) +inline BOOST_GPU_ENABLED RealType quantile(const cauchy_distribution& dist, const RealType& p) { return detail::quantile_imp(dist, p, false); } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { return detail::cdf_imp(c.dist, c.param, true); } // cdf complement template -inline RealType quantile(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { return detail::quantile_imp(c.dist, c.param, true); } // quantile complement template 
-inline RealType mean(const cauchy_distribution&) +inline BOOST_GPU_ENABLED RealType mean(const cauchy_distribution&) { // There is no mean: typedef typename Policy::assert_undefined_type assert_type; BOOST_STATIC_ASSERT(assert_type::value == 0); @@ -281,7 +288,7 @@ inline RealType mean(const cauchy_distribution&) } template -inline RealType variance(const cauchy_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType variance(const cauchy_distribution& /*dist*/) { // There is no variance: typedef typename Policy::assert_undefined_type assert_type; @@ -295,18 +302,18 @@ inline RealType variance(const cauchy_distribution& /*dist*/) } template -inline RealType mode(const cauchy_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const cauchy_distribution& dist) { return dist.location(); } template -inline RealType median(const cauchy_distribution& dist) +inline BOOST_GPU_ENABLED RealType median(const cauchy_distribution& dist) { return dist.location(); } template -inline RealType skewness(const cauchy_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType skewness(const cauchy_distribution& /*dist*/) { // There is no skewness: typedef typename Policy::assert_undefined_type assert_type; @@ -320,7 +327,7 @@ inline RealType skewness(const cauchy_distribution& /*dist*/) } template -inline RealType kurtosis(const cauchy_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis(const cauchy_distribution& /*dist*/) { // There is no kurtosis: typedef typename Policy::assert_undefined_type assert_type; @@ -334,7 +341,7 @@ inline RealType kurtosis(const cauchy_distribution& /*dist*/) } template -inline RealType kurtosis_excess(const cauchy_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const cauchy_distribution& /*dist*/) { // There is no kurtosis excess: typedef typename Policy::assert_undefined_type assert_type; diff --git a/include/boost/math/distributions/chi_squared.hpp b/include/boost/math/distributions/chi_squared.hpp 
index 071c7756f4..34be912265 100644 --- a/include/boost/math/distributions/chi_squared.hpp +++ b/include/boost/math/distributions/chi_squared.hpp @@ -26,14 +26,14 @@ class chi_squared_distribution typedef RealType value_type; typedef Policy policy_type; - chi_squared_distribution(RealType i) : m_df(i) + BOOST_GPU_ENABLED chi_squared_distribution(RealType i) : m_df(i) { RealType result; detail::check_df( "boost::math::chi_squared_distribution<%1%>::chi_squared_distribution", m_df, &result, Policy()); } // chi_squared_distribution - RealType degrees_of_freedom()const + BOOST_GPU_ENABLED RealType degrees_of_freedom()const { return m_df; } @@ -86,14 +86,14 @@ inline const std::pair support(const chi_squared_distributio } template -RealType pdf(const chi_squared_distribution& dist, const RealType& chi_square) +BOOST_GPU_ENABLED RealType pdf(const chi_squared_distribution& dist, const RealType& chi_square) { BOOST_MATH_STD_USING // for ADL of std functions RealType degrees_of_freedom = dist.degrees_of_freedom(); // Error check: RealType error_result; - static const char* function = "boost::math::pdf(const chi_squared_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const chi_squared_distribution<%1%>&, %1%)"; if(false == detail::check_df( function, degrees_of_freedom, &error_result, Policy())) @@ -127,12 +127,12 @@ RealType pdf(const chi_squared_distribution& dist, const RealT } // pdf template -inline RealType cdf(const chi_squared_distribution& dist, const RealType& chi_square) +inline BOOST_GPU_ENABLED RealType cdf(const chi_squared_distribution& dist, const RealType& chi_square) { RealType degrees_of_freedom = dist.degrees_of_freedom(); // Error check: RealType error_result; - static const char* function = "boost::math::cdf(const chi_squared_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const chi_squared_distribution<%1%>&, %1%)"; if(false == detail::check_df( function, 
degrees_of_freedom, &error_result, Policy())) @@ -165,11 +165,11 @@ inline RealType quantile(const chi_squared_distribution& dist, } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { RealType const& degrees_of_freedom = c.dist.degrees_of_freedom(); RealType const& chi_square = c.param; - static const char* function = "boost::math::cdf(const chi_squared_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == detail::check_df( @@ -203,22 +203,22 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const chi_squared_distribution& dist) { // Mean of Chi-Squared distribution = v. return dist.degrees_of_freedom(); } // mean template -inline RealType variance(const chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType variance(const chi_squared_distribution& dist) { // Variance of Chi-Squared distribution = 2v. return 2 * dist.degrees_of_freedom(); } // variance template -inline RealType mode(const chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::mode(const chi_squared_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mode(const chi_squared_distribution<%1%>&)"; // Most sources only define mode for df >= 2, // but for 0 <= df <= 2, the pdf maximum actually occurs at random variate = 0; // So one could extend the definition of mode thus: @@ -253,7 +253,7 @@ inline RealType mode(const chi_squared_distribution& dist) // Now implemented via quantile(half) in derived accessors. 
template -inline RealType skewness(const chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType skewness(const chi_squared_distribution& dist) { BOOST_MATH_STD_USING // For ADL RealType df = dist.degrees_of_freedom(); @@ -261,14 +261,14 @@ inline RealType skewness(const chi_squared_distribution& dist) } template -inline RealType kurtosis(const chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis(const chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); return 3 + 12 / df; } template -inline RealType kurtosis_excess(const chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); return 12 / df; diff --git a/include/boost/math/distributions/complement.hpp b/include/boost/math/distributions/complement.hpp index 26d0d49e6d..371a44f297 100644 --- a/include/boost/math/distributions/complement.hpp +++ b/include/boost/math/distributions/complement.hpp @@ -7,6 +7,8 @@ #ifndef BOOST_STATS_COMPLEMENT_HPP #define BOOST_STATS_COMPLEMENT_HPP +#include + // // This code really defines our own tuple type. 
// It would be nice to reuse boost::math::tuple @@ -19,7 +21,7 @@ namespace boost{ namespace math{ template struct complemented2_type { - complemented2_type( + BOOST_GPU_ENABLED complemented2_type( const Dist& d, const RealType& p1) : dist(d), @@ -35,7 +37,7 @@ struct complemented2_type template struct complemented3_type { - complemented3_type( + BOOST_GPU_ENABLED complemented3_type( const Dist& d, const RealType1& p1, const RealType2& p2) @@ -53,7 +55,7 @@ struct complemented3_type template struct complemented4_type { - complemented4_type( + BOOST_GPU_ENABLED complemented4_type( const Dist& d, const RealType1& p1, const RealType2& p2, @@ -74,7 +76,7 @@ struct complemented4_type template struct complemented5_type { - complemented5_type( + BOOST_GPU_ENABLED complemented5_type( const Dist& d, const RealType1& p1, const RealType2& p2, @@ -98,7 +100,7 @@ struct complemented5_type template struct complemented6_type { - complemented6_type( + BOOST_GPU_ENABLED complemented6_type( const Dist& d, const RealType1& p1, const RealType2& p2, @@ -125,7 +127,7 @@ struct complemented6_type template struct complemented7_type { - complemented7_type( + BOOST_GPU_ENABLED complemented7_type( const Dist& d, const RealType1& p1, const RealType2& p2, @@ -153,37 +155,37 @@ struct complemented7_type }; template -inline complemented2_type complement(const Dist& d, const RealType& r) +inline BOOST_GPU_ENABLED complemented2_type complement(const Dist& d, const RealType& r) { return complemented2_type(d, r); } template -inline complemented3_type complement(const Dist& d, const RealType1& r1, const RealType2& r2) +inline BOOST_GPU_ENABLED complemented3_type complement(const Dist& d, const RealType1& r1, const RealType2& r2) { return complemented3_type(d, r1, r2); } template -inline complemented4_type complement(const Dist& d, const RealType1& r1, const RealType2& r2, const RealType3& r3) +inline BOOST_GPU_ENABLED complemented4_type complement(const Dist& d, const RealType1& r1, const RealType2& 
r2, const RealType3& r3) { return complemented4_type(d, r1, r2, r3); } template -inline complemented5_type complement(const Dist& d, const RealType1& r1, const RealType2& r2, const RealType3& r3, const RealType4& r4) +inline BOOST_GPU_ENABLED complemented5_type complement(const Dist& d, const RealType1& r1, const RealType2& r2, const RealType3& r3, const RealType4& r4) { return complemented5_type(d, r1, r2, r3, r4); } template -inline complemented6_type complement(const Dist& d, const RealType1& r1, const RealType2& r2, const RealType3& r3, const RealType4& r4, const RealType5& r5) +inline BOOST_GPU_ENABLED complemented6_type complement(const Dist& d, const RealType1& r1, const RealType2& r2, const RealType3& r3, const RealType4& r4, const RealType5& r5) { return complemented6_type(d, r1, r2, r3, r4, r5); } template -inline complemented7_type complement(const Dist& d, const RealType1& r1, const RealType2& r2, const RealType3& r3, const RealType4& r4, const RealType5& r5, const RealType6& r6) +inline BOOST_GPU_ENABLED complemented7_type complement(const Dist& d, const RealType1& r1, const RealType2& r2, const RealType3& r3, const RealType4& r4, const RealType5& r5, const RealType6& r6) { return complemented7_type(d, r1, r2, r3, r4, r5, r6); } diff --git a/include/boost/math/distributions/detail/common_error_handling.hpp b/include/boost/math/distributions/detail/common_error_handling.hpp index 486fb0b5c8..328ad388db 100644 --- a/include/boost/math/distributions/detail/common_error_handling.hpp +++ b/include/boost/math/distributions/detail/common_error_handling.hpp @@ -23,7 +23,7 @@ namespace boost{ namespace math{ namespace detail { template -inline bool check_probability(const char* function, RealType const& prob, RealType* result, const Policy& pol) +inline BOOST_GPU_ENABLED bool check_probability(const char* function, RealType const& prob, RealType* result, const Policy& pol) { if((prob < 0) || (prob > 1) || !(boost::math::isfinite)(prob)) { @@ -36,7 +36,7 @@ 
inline bool check_probability(const char* function, RealType const& prob, RealTy } template -inline bool check_df(const char* function, RealType const& df, RealType* result, const Policy& pol) +inline BOOST_GPU_ENABLED bool check_df(const char* function, RealType const& df, RealType* result, const Policy& pol) { // df > 0 but NOT +infinity allowed. if((df <= 0) || !(boost::math::isfinite)(df)) { @@ -49,7 +49,7 @@ inline bool check_df(const char* function, RealType const& df, RealType* result, } template -inline bool check_df_gt0_to_inf(const char* function, RealType const& df, RealType* result, const Policy& pol) +inline BOOST_GPU_ENABLED bool check_df_gt0_to_inf(const char* function, RealType const& df, RealType* result, const Policy& pol) { // df > 0 or +infinity are allowed. if( (df <= 0) || (boost::math::isnan)(df) ) { // is bad df <= 0 or NaN or -infinity. @@ -63,7 +63,7 @@ inline bool check_df_gt0_to_inf(const char* function, RealType const& df, RealTy template -inline bool check_scale( +inline BOOST_GPU_ENABLED bool check_scale( const char* function, RealType scale, RealType* result, @@ -80,7 +80,7 @@ inline bool check_scale( } template -inline bool check_location( +inline BOOST_GPU_ENABLED bool check_location( const char* function, RealType location, RealType* result, @@ -97,7 +97,7 @@ inline bool check_location( } template -inline bool check_x( +inline BOOST_GPU_ENABLED bool check_x( const char* function, RealType x, RealType* result, @@ -118,7 +118,7 @@ inline bool check_x( } // bool check_x template -inline bool check_x_not_NaN( +inline BOOST_GPU_ENABLED bool check_x_not_NaN( const char* function, RealType x, RealType* result, @@ -138,7 +138,7 @@ inline bool check_x_not_NaN( } // bool check_x_not_NaN template -inline bool check_x_gt0( +inline BOOST_GPU_ENABLED bool check_x_gt0( const char* function, RealType x, RealType* result, @@ -159,7 +159,7 @@ inline bool check_x_gt0( } // bool check_x_gt0 template -inline bool check_positive_x( +inline 
BOOST_GPU_ENABLED bool check_positive_x( const char* function, RealType x, RealType* result, @@ -179,7 +179,7 @@ inline bool check_positive_x( } template -inline bool check_non_centrality( +inline BOOST_GPU_ENABLED bool check_non_centrality( const char* function, RealType ncp, RealType* result, @@ -196,7 +196,7 @@ inline bool check_non_centrality( } template -inline bool check_finite( +inline BOOST_GPU_ENABLED bool check_finite( const char* function, RealType x, RealType* result, diff --git a/include/boost/math/distributions/detail/derived_accessors.hpp b/include/boost/math/distributions/detail/derived_accessors.hpp index 00f5a93258..f9620677e8 100644 --- a/include/boost/math/distributions/detail/derived_accessors.hpp +++ b/include/boost/math/distributions/detail/derived_accessors.hpp @@ -39,24 +39,24 @@ namespace boost{ namespace math{ template -typename Distribution::value_type variance(const Distribution& dist); +BOOST_GPU_ENABLED typename Distribution::value_type variance(const Distribution& dist); template -inline typename Distribution::value_type standard_deviation(const Distribution& dist) +inline BOOST_GPU_ENABLED typename Distribution::value_type standard_deviation(const Distribution& dist) { BOOST_MATH_STD_USING // ADL of sqrt. 
return sqrt(variance(dist)); } template -inline typename Distribution::value_type variance(const Distribution& dist) +inline BOOST_GPU_ENABLED typename Distribution::value_type variance(const Distribution& dist) { typename Distribution::value_type result = standard_deviation(dist); return result * result; } template -inline typename Distribution::value_type hazard(const Distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED typename Distribution::value_type hazard(const Distribution& dist, const RealType& x) { // hazard function // http://www.itl.nist.gov/div898/handbook/eda/section3/eda362.htm#HAZ typedef typename Distribution::value_type value_type; @@ -75,7 +75,7 @@ inline typename Distribution::value_type hazard(const Distribution& dist, const } template -inline typename Distribution::value_type chf(const Distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED typename Distribution::value_type chf(const Distribution& dist, const RealType& x) { // cumulative hazard function. 
// http://www.itl.nist.gov/div898/handbook/eda/section3/eda362.htm#HAZ BOOST_MATH_STD_USING @@ -83,7 +83,7 @@ inline typename Distribution::value_type chf(const Distribution& dist, const Rea } template -inline typename Distribution::value_type coefficient_of_variation(const Distribution& dist) +inline BOOST_GPU_ENABLED typename Distribution::value_type coefficient_of_variation(const Distribution& dist) { typedef typename Distribution::value_type value_type; typedef typename Distribution::policy_type policy_type; @@ -104,19 +104,19 @@ inline typename Distribution::value_type coefficient_of_variation(const Distribu // implementation with all arguments of the same type: // template -inline typename Distribution::value_type pdf(const Distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED typename Distribution::value_type pdf(const Distribution& dist, const RealType& x) { typedef typename Distribution::value_type value_type; return pdf(dist, static_cast(x)); } template -inline typename Distribution::value_type cdf(const Distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED typename Distribution::value_type cdf(const Distribution& dist, const RealType& x) { typedef typename Distribution::value_type value_type; return cdf(dist, static_cast(x)); } template -inline typename Distribution::value_type quantile(const Distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED typename Distribution::value_type quantile(const Distribution& dist, const RealType& x) { typedef typename Distribution::value_type value_type; return quantile(dist, static_cast(x)); @@ -130,21 +130,21 @@ inline typename Distribution::value_type chf(const Distribution& dist, const Rea } */ template -inline typename Distribution::value_type cdf(const complemented2_type& c) +inline BOOST_GPU_ENABLED typename Distribution::value_type cdf(const complemented2_type& c) { typedef typename Distribution::value_type value_type; return cdf(complement(c.dist, static_cast(c.param))); } 
template -inline typename Distribution::value_type quantile(const complemented2_type& c) +inline BOOST_GPU_ENABLED typename Distribution::value_type quantile(const complemented2_type& c) { typedef typename Distribution::value_type value_type; return quantile(complement(c.dist, static_cast(c.param))); } template -inline typename Dist::value_type median(const Dist& d) +inline BOOST_GPU_ENABLED typename Dist::value_type median(const Dist& d) { // median - default definition for those distributions for which a // simple closed form is not known, // and for which a domain_error and/or NaN generating function is NOT defined. diff --git a/include/boost/math/distributions/detail/hypergeometric_pdf.hpp b/include/boost/math/distributions/detail/hypergeometric_pdf.hpp index 4364266514..2e4e6c7042 100644 --- a/include/boost/math/distributions/detail/hypergeometric_pdf.hpp +++ b/include/boost/math/distributions/detail/hypergeometric_pdf.hpp @@ -392,7 +392,7 @@ template T hypergeometric_pdf_factorial_imp(unsigned x, unsigned r, unsigned n, unsigned N, const Policy&) { BOOST_MATH_STD_USING - BOOST_ASSERT(N <= boost::math::max_factorial::value); + BOOST_MATH_ASSERT(N <= boost::math::max_factorial::value); T result = boost::math::unchecked_factorial(n); T num[3] = { boost::math::unchecked_factorial(r), diff --git a/include/boost/math/distributions/exponential.hpp b/include/boost/math/distributions/exponential.hpp index 05c49374ed..a50d5e1a50 100644 --- a/include/boost/math/distributions/exponential.hpp +++ b/include/boost/math/distributions/exponential.hpp @@ -29,7 +29,7 @@ namespace detail{ // Error check: // template -inline bool verify_lambda(const char* function, RealType l, RealType* presult, const Policy& pol) +inline BOOST_GPU_ENABLED bool verify_lambda(const char* function, RealType l, RealType* presult, const Policy& pol) { if((l <= 0) || !(boost::math::isfinite)(l)) { @@ -42,7 +42,7 @@ inline bool verify_lambda(const char* function, RealType l, RealType* presult, c } 
template -inline bool verify_exp_x(const char* function, RealType x, RealType* presult, const Policy& pol) +inline BOOST_GPU_ENABLED bool verify_exp_x(const char* function, RealType x, RealType* presult, const Policy& pol) { if((x < 0) || (boost::math::isnan)(x)) { @@ -63,14 +63,14 @@ class exponential_distribution typedef RealType value_type; typedef Policy policy_type; - exponential_distribution(RealType l_lambda = 1) + BOOST_GPU_ENABLED exponential_distribution(RealType l_lambda = 1) : m_lambda(l_lambda) { RealType err; detail::verify_lambda("boost::math::exponential_distribution<%1%>::exponential_distribution", l_lambda, &err, Policy()); } // exponential_distribution - RealType lambda()const { return m_lambda; } + BOOST_GPU_ENABLED RealType lambda()const { return m_lambda; } private: RealType m_lambda; @@ -103,11 +103,11 @@ inline const std::pair support(const exponential_distributio } template -inline RealType pdf(const exponential_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const exponential_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const exponential_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const exponential_distribution<%1%>&, %1%)"; RealType lambda = dist.lambda(); RealType result = 0; @@ -123,11 +123,11 @@ inline RealType pdf(const exponential_distribution& dist, cons } // pdf template -inline RealType cdf(const exponential_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const exponential_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const exponential_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const exponential_distribution<%1%>&, %1%)"; RealType result = 0; RealType lambda = dist.lambda(); @@ -141,11 
+141,11 @@ inline RealType cdf(const exponential_distribution& dist, cons } // cdf template -inline RealType quantile(const exponential_distribution& dist, const RealType& p) +inline BOOST_GPU_ENABLED RealType quantile(const exponential_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const exponential_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const exponential_distribution<%1%>&, %1%)"; RealType result = 0; RealType lambda = dist.lambda(); @@ -164,11 +164,11 @@ inline RealType quantile(const exponential_distribution& dist, } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const exponential_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const exponential_distribution<%1%>&, %1%)"; RealType result = 0; RealType lambda = c.dist.lambda(); @@ -185,11 +185,11 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const exponential_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const exponential_distribution<%1%>&, %1%)"; RealType result = 0; RealType lambda = c.dist.lambda(); @@ -210,7 +210,7 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const exponential_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const exponential_distribution& dist) { RealType result = 0; RealType lambda = dist.lambda(); @@ -220,7 +220,7 @@ inline RealType 
mean(const exponential_distribution& dist) } template -inline RealType standard_deviation(const exponential_distribution& dist) +inline BOOST_GPU_ENABLED RealType standard_deviation(const exponential_distribution& dist) { RealType result = 0; RealType lambda = dist.lambda(); @@ -230,32 +230,32 @@ inline RealType standard_deviation(const exponential_distribution -inline RealType mode(const exponential_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType mode(const exponential_distribution& /*dist*/) { return 0; } template -inline RealType median(const exponential_distribution& dist) +inline BOOST_GPU_ENABLED RealType median(const exponential_distribution& dist) { using boost::math::constants::ln_two; return ln_two() / dist.lambda(); // ln(2) / lambda } template -inline RealType skewness(const exponential_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType skewness(const exponential_distribution& /*dist*/) { return 2; } template -inline RealType kurtosis(const exponential_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis(const exponential_distribution& /*dist*/) { return 9; } template -inline RealType kurtosis_excess(const exponential_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const exponential_distribution& /*dist*/) { return 6; } diff --git a/include/boost/math/distributions/extreme_value.hpp b/include/boost/math/distributions/extreme_value.hpp index cb86de6612..cfc3928970 100644 --- a/include/boost/math/distributions/extreme_value.hpp +++ b/include/boost/math/distributions/extreme_value.hpp @@ -35,7 +35,7 @@ namespace detail{ // Error check: // template -inline bool verify_scale_b(const char* function, RealType b, RealType* presult, const Policy& pol) +inline BOOST_GPU_ENABLED bool verify_scale_b(const char* function, RealType b, RealType* presult, const Policy& pol) { if((b <= 0) || !(boost::math::isfinite)(b)) { @@ -56,7 +56,7 @@ class extreme_value_distribution typedef RealType value_type; typedef 
Policy policy_type; - extreme_value_distribution(RealType a = 0, RealType b = 1) + BOOST_GPU_ENABLED extreme_value_distribution(RealType a = 0, RealType b = 1) : m_a(a), m_b(b) { RealType err; @@ -64,8 +64,8 @@ class extreme_value_distribution detail::check_finite("boost::math::extreme_value_distribution<%1%>::extreme_value_distribution", a, &err, Policy()); } // extreme_value_distribution - RealType location()const { return m_a; } - RealType scale()const { return m_b; } + BOOST_GPU_ENABLED RealType location()const { return m_a; } + BOOST_GPU_ENABLED RealType scale()const { return m_b; } private: RealType m_a, m_b; @@ -91,11 +91,11 @@ inline const std::pair support(const extreme_value_distribut } template -inline RealType pdf(const extreme_value_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const extreme_value_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const extreme_value_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const extreme_value_distribution<%1%>&, %1%)"; RealType a = dist.location(); RealType b = dist.scale(); @@ -116,11 +116,11 @@ inline RealType pdf(const extreme_value_distribution& dist, co } // pdf template -inline RealType cdf(const extreme_value_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const extreme_value_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const extreme_value_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const extreme_value_distribution<%1%>&, %1%)"; if((boost::math::isinf)(x)) return x < 0 ? 
0.0f : 1.0f; @@ -142,11 +142,11 @@ inline RealType cdf(const extreme_value_distribution& dist, co } // cdf template -RealType quantile(const extreme_value_distribution& dist, const RealType& p) +BOOST_GPU_ENABLED RealType quantile(const extreme_value_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const extreme_value_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const extreme_value_distribution<%1%>&, %1%)"; RealType a = dist.location(); RealType b = dist.scale(); @@ -169,11 +169,11 @@ RealType quantile(const extreme_value_distribution& dist, cons } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const extreme_value_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const extreme_value_distribution<%1%>&, %1%)"; if((boost::math::isinf)(c.param)) return c.param < 0 ? 
1.0f : 0.0f; @@ -193,11 +193,11 @@ inline RealType cdf(const complemented2_type -RealType quantile(const complemented2_type, RealType>& c) +BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const extreme_value_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const extreme_value_distribution<%1%>&, %1%)"; RealType a = c.dist.location(); RealType b = c.dist.scale(); @@ -221,7 +221,7 @@ RealType quantile(const complemented2_type -inline RealType mean(const extreme_value_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const extreme_value_distribution& dist) { RealType a = dist.location(); RealType b = dist.scale(); @@ -234,7 +234,7 @@ inline RealType mean(const extreme_value_distribution& dist) } template -inline RealType standard_deviation(const extreme_value_distribution& dist) +inline BOOST_GPU_ENABLED RealType standard_deviation(const extreme_value_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions. 
@@ -248,20 +248,20 @@ inline RealType standard_deviation(const extreme_value_distribution -inline RealType mode(const extreme_value_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const extreme_value_distribution& dist) { return dist.location(); } template -inline RealType median(const extreme_value_distribution& dist) +inline BOOST_GPU_ENABLED RealType median(const extreme_value_distribution& dist) { using constants::ln_ln_two; return dist.location() - dist.scale() * ln_ln_two(); } template -inline RealType skewness(const extreme_value_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType skewness(const extreme_value_distribution& /*dist*/) { // // This is 12 * sqrt(6) * zeta(3) / pi^3: @@ -271,14 +271,14 @@ inline RealType skewness(const extreme_value_distribution& /*d } template -inline RealType kurtosis(const extreme_value_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis(const extreme_value_distribution& /*dist*/) { // See http://mathworld.wolfram.com/ExtremeValueDistribution.html return RealType(27) / 5; } template -inline RealType kurtosis_excess(const extreme_value_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const extreme_value_distribution& /*dist*/) { // See http://mathworld.wolfram.com/ExtremeValueDistribution.html return RealType(12) / 5; diff --git a/include/boost/math/distributions/gamma.hpp b/include/boost/math/distributions/gamma.hpp index 9a9e2a4f52..28c87b2795 100644 --- a/include/boost/math/distributions/gamma.hpp +++ b/include/boost/math/distributions/gamma.hpp @@ -23,7 +23,7 @@ namespace detail { template -inline bool check_gamma_shape( +inline BOOST_GPU_ENABLED bool check_gamma_shape( const char* function, RealType shape, RealType* result, const Policy& pol) @@ -39,7 +39,7 @@ inline bool check_gamma_shape( } template -inline bool check_gamma_x( +inline BOOST_GPU_ENABLED bool check_gamma_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -55,7 
+55,7 @@ inline bool check_gamma_x( } template -inline bool check_gamma( +inline BOOST_GPU_ENABLED bool check_gamma( const char* function, RealType scale, RealType shape, @@ -73,19 +73,19 @@ class gamma_distribution typedef RealType value_type; typedef Policy policy_type; - gamma_distribution(RealType l_shape, RealType l_scale = 1) + BOOST_GPU_ENABLED gamma_distribution(RealType l_shape, RealType l_scale = 1) : m_shape(l_shape), m_scale(l_scale) { RealType result; detail::check_gamma("boost::math::gamma_distribution<%1%>::gamma_distribution", l_scale, l_shape, &result, Policy()); } - RealType shape()const + BOOST_GPU_ENABLED RealType shape()const { return m_shape; } - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { return m_scale; } @@ -116,11 +116,11 @@ inline const std::pair support(const gamma_distribution -inline RealType pdf(const gamma_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -140,11 +140,11 @@ inline RealType pdf(const gamma_distribution& dist, const Real } // pdf template -inline RealType cdf(const gamma_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -164,7 +164,7 @@ inline RealType quantile(const gamma_distribution& dist, const { BOOST_MATH_STD_USING // for ADL of 
std functions - static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -184,11 +184,11 @@ inline RealType quantile(const gamma_distribution& dist, const } template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -209,7 +209,7 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mean(const gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mean(const gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -248,11 +248,11 @@ inline RealType mean(const gamma_distribution& dist) } template -inline RealType variance(const gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType variance(const gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::variance(const gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::variance(const gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -266,11 +266,11 @@ inline RealType variance(const gamma_distribution& dist) } template -inline RealType mode(const gamma_distribution& dist) 
+inline BOOST_GPU_ENABLED RealType mode(const gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mode(const gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mode(const gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -295,11 +295,11 @@ inline RealType mode(const gamma_distribution& dist) //} template -inline RealType skewness(const gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType skewness(const gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::skewness(const gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::skewness(const gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -313,11 +313,11 @@ inline RealType skewness(const gamma_distribution& dist) } template -inline RealType kurtosis_excess(const gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::kurtosis_excess(const gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -331,7 +331,7 @@ inline RealType kurtosis_excess(const gamma_distribution& dist } template -inline RealType kurtosis(const gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis(const gamma_distribution& dist) { return kurtosis_excess(dist) + 3; } diff --git a/include/boost/math/distributions/geometric.hpp b/include/boost/math/distributions/geometric.hpp index 6c9713eadd..eae2575246 100644 --- a/include/boost/math/distributions/geometric.hpp +++ b/include/boost/math/distributions/geometric.hpp @@ -67,7 +67,7 @@ 
namespace boost { // Common error checking routines for geometric distribution function: template - inline bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_success_fraction(const char* function, const RealType& p, RealType* result, const Policy& pol) { if( !(boost::math::isfinite)(p) || (p < 0) || (p > 1) ) { @@ -80,13 +80,13 @@ namespace boost } template - inline bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_dist(const char* function, const RealType& p, RealType* result, const Policy& pol) { return check_success_fraction(function, p, result, pol); } template - inline bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_dist_and_k(const char* function, const RealType& p, RealType k, RealType* result, const Policy& pol) { if(check_dist(function, p, result, pol) == false) { @@ -103,7 +103,7 @@ namespace boost } // Check_dist_and_k template - inline bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_dist_and_prob(const char* function, RealType p, RealType prob, RealType* result, const Policy& pol) { if((check_dist(function, p, result, pol) && detail::check_probability(function, prob, result, pol)) == false) { @@ -120,7 +120,7 @@ namespace boost typedef RealType value_type; typedef Policy policy_type; - geometric_distribution(RealType p) : m_p(p) + BOOST_GPU_ENABLED geometric_distribution(RealType p) : m_p(p) { // Constructor stores success_fraction p. RealType result; geometric_detail::check_dist( @@ -130,11 +130,11 @@ namespace boost } // geometric_distribution constructor. // Private data getter class member functions. 
- RealType success_fraction() const + BOOST_GPU_ENABLED RealType success_fraction() const { // Probability of success as fraction in range 0 to 1. return m_p; } - RealType successes() const + BOOST_GPU_ENABLED RealType successes() const { // Total number of successes r = 1 (for compatibility with negative binomial?). return 1; } @@ -145,7 +145,7 @@ namespace boost RealType trials, RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test. { - static const char* function = "boost::math::geometric<%1%>::find_lower_bound_on_p"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::geometric<%1%>::find_lower_bound_on_p"; RealType result = 0; // of error checks. RealType successes = 1; RealType failures = trials - successes; @@ -170,7 +170,7 @@ namespace boost RealType trials, RealType alpha) // alpha 0.05 equivalent to 95% for one-sided test. { - static const char* function = "boost::math::geometric<%1%>::find_upper_bound_on_p"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::geometric<%1%>::find_upper_bound_on_p"; RealType result = 0; // of error checks. RealType successes = 1; RealType failures = trials - successes; @@ -203,7 +203,7 @@ namespace boost RealType p, // success fraction 0 <= p <= 1. RealType alpha) // risk level threshold 0 <= alpha <= 1. { - static const char* function = "boost::math::geometric<%1%>::find_minimum_number_of_trials"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::geometric<%1%>::find_minimum_number_of_trials"; // Error checks: RealType result = 0; if(false == geometric_detail::check_dist_and_k( @@ -221,7 +221,7 @@ namespace boost RealType p, // success fraction 0 <= p <= 1. RealType alpha) // risk level threshold 0 <= alpha <= 1. 
{ - static const char* function = "boost::math::geometric<%1%>::find_maximum_number_of_trials"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::geometric<%1%>::find_maximum_number_of_trials"; // Error checks: RealType result = 0; if(false == geometric_detail::check_dist_and_k( @@ -257,7 +257,7 @@ namespace boost } template - inline RealType mean(const geometric_distribution& dist) + inline BOOST_GPU_ENABLED RealType mean(const geometric_distribution& dist) { // Mean of geometric distribution = (1-p)/p. return (1 - dist.success_fraction() ) / dist.success_fraction(); } // mean @@ -265,21 +265,21 @@ namespace boost // median implemented via quantile(half) in derived accessors. template - inline RealType mode(const geometric_distribution&) + inline BOOST_GPU_ENABLED RealType mode(const geometric_distribution&) { // Mode of geometric distribution = zero. BOOST_MATH_STD_USING // ADL of std functions. return 0; } // mode template - inline RealType variance(const geometric_distribution& dist) + inline BOOST_GPU_ENABLED RealType variance(const geometric_distribution& dist) { // Variance of Binomial distribution = (1-p) / p^2. return (1 - dist.success_fraction()) / (dist.success_fraction() * dist.success_fraction()); } // variance template - inline RealType skewness(const geometric_distribution& dist) + inline BOOST_GPU_ENABLED RealType skewness(const geometric_distribution& dist) { // skewness of geometric distribution = 2-p / (sqrt(r(1-p)) BOOST_MATH_STD_USING // ADL of std functions. 
RealType p = dist.success_fraction(); @@ -287,7 +287,7 @@ namespace boost } // skewness template - inline RealType kurtosis(const geometric_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis(const geometric_distribution& dist) { // kurtosis of geometric distribution // http://en.wikipedia.org/wiki/geometric is kurtosis_excess so add 3 RealType p = dist.success_fraction(); @@ -295,7 +295,7 @@ namespace boost } // kurtosis template - inline RealType kurtosis_excess(const geometric_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis_excess(const geometric_distribution& dist) { // kurtosis excess of geometric distribution // http://mathworld.wolfram.com/Kurtosis.html table of kurtosis_excess RealType p = dist.success_fraction(); @@ -310,11 +310,11 @@ namespace boost // chf of geometric distribution provided by derived accessors. template - inline RealType pdf(const geometric_distribution& dist, const RealType& k) + inline BOOST_GPU_ENABLED RealType pdf(const geometric_distribution& dist, const RealType& k) { // Probability Density/Mass Function. BOOST_FPU_EXCEPTION_GUARD BOOST_MATH_STD_USING // For ADL of math functions. - static const char* function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const geometric_distribution<%1%>&, %1%)"; RealType p = dist.success_fraction(); RealType result = 0; @@ -348,9 +348,9 @@ namespace boost } // geometric_pdf template - inline RealType cdf(const geometric_distribution& dist, const RealType& k) + inline BOOST_GPU_ENABLED RealType cdf(const geometric_distribution& dist, const RealType& k) { // Cumulative Distribution Function of geometric. - static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)"; // k argument may be integral, signed, or unsigned, or floating point. 
// If necessary, it has already been promoted from an integral type. @@ -379,10 +379,10 @@ namespace boost } // cdf Cumulative Distribution Function geometric. template - inline RealType cdf(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { // Complemented Cumulative Distribution Function geometric. BOOST_MATH_STD_USING - static const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const geometric_distribution<%1%>&, %1%)"; // k argument may be integral, signed, or unsigned, or floating point. // If necessary, it has already been promoted from an integral type. RealType const& k = c.param; @@ -404,14 +404,14 @@ namespace boost } // cdf Complemented Cumulative Distribution Function geometric. template - inline RealType quantile(const geometric_distribution& dist, const RealType& x) + inline BOOST_GPU_ENABLED RealType quantile(const geometric_distribution& dist, const RealType& x) { // Quantile, percentile/100 or Percent Point geometric function. // Return the number of expected failures k for a given probability p. // Inverse cumulative Distribution Function or Quantile (percentile / 100) of geometric Probability. // k argument may be integral, signed, or unsigned, or floating point. - static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)"; BOOST_MATH_STD_USING // ADL of std functions. RealType success_fraction = dist.success_fraction(); @@ -455,11 +455,11 @@ namespace boost } // RealType quantile(const geometric_distribution dist, p) template - inline RealType quantile(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { // Quantile or Percent Point Binomial function. 
// Return the number of expected failures k for a given // complement of the probability Q = 1 - P. - static const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const geometric_distribution<%1%>&, %1%)"; BOOST_MATH_STD_USING // Error checks: RealType x = c.param; diff --git a/include/boost/math/distributions/inverse_chi_squared.hpp b/include/boost/math/distributions/inverse_chi_squared.hpp index c1e54905da..a2b58b2801 100644 --- a/include/boost/math/distributions/inverse_chi_squared.hpp +++ b/include/boost/math/distributions/inverse_chi_squared.hpp @@ -31,7 +31,7 @@ namespace boost{ namespace math{ namespace detail { template - inline bool check_inverse_chi_squared( // Check both distribution parameters. + inline BOOST_GPU_ENABLED bool check_inverse_chi_squared( // Check both distribution parameters. const char* function, RealType degrees_of_freedom, // degrees_of_freedom (aka nu). RealType scale, // scale (aka sigma^2) @@ -51,7 +51,7 @@ class inverse_chi_squared_distribution typedef RealType value_type; typedef Policy policy_type; - inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale) + BOOST_GPU_ENABLED inverse_chi_squared_distribution(RealType df, RealType l_scale) : m_df(df), m_scale (l_scale) { RealType result; detail::check_df( @@ -62,7 +62,7 @@ class inverse_chi_squared_distribution m_scale, &result, Policy()); } // inverse_chi_squared_distribution constructor - inverse_chi_squared_distribution(RealType df = 1) : m_df(df) + BOOST_GPU_ENABLED inverse_chi_squared_distribution(RealType df = 1) : m_df(df) { RealType result; m_scale = 1 / m_df ; // Default scale = 1 / degrees of freedom (Wikipedia definition 1). 
@@ -71,11 +71,11 @@ class inverse_chi_squared_distribution m_df, &result, Policy()); } // inverse_chi_squared_distribution - RealType degrees_of_freedom()const + BOOST_GPU_ENABLED RealType degrees_of_freedom()const { return m_df; // aka nu } - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { return m_scale; // aka xi } @@ -112,14 +112,14 @@ inline const std::pair support(const inverse_chi_squared_dis } template -RealType pdf(const inverse_chi_squared_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const inverse_chi_squared_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); RealType error_result; - static const char* function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; if(false == detail::check_inverse_chi_squared (function, df, scale, &error_result, Policy()) @@ -152,9 +152,9 @@ RealType pdf(const inverse_chi_squared_distribution& dist, con } // pdf template -inline RealType cdf(const inverse_chi_squared_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const inverse_chi_squared_distribution& dist, const RealType& x) { - static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); RealType error_result; @@ -187,7 +187,7 @@ inline RealType quantile(const inverse_chi_squared_distribution -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { using boost::math::gamma_q_inv; RealType const& df = c.dist.degrees_of_freedom(); 
RealType const& scale = c.dist.scale(); RealType const& x = c.param; - static const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const inverse_chi_squared_distribution<%1%>&, %1%)"; // Error check: RealType error_result; if(false == detail::check_df( @@ -251,7 +251,7 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const inverse_chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const inverse_chi_squared_distribution& dist) { // Mean of inverse Chi-Squared distribution. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mean(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 2) return policies::raise_domain_error( function, @@ -288,11 +288,11 @@ inline RealType mean(const inverse_chi_squared_distribution& d } // mean template -inline RealType variance(const inverse_chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType variance(const inverse_chi_squared_distribution& dist) { // Variance of inverse Chi-Squared distribution. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::variance(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 4) { return policies::raise_domain_error( @@ -304,14 +304,14 @@ inline RealType variance(const inverse_chi_squared_distribution -inline RealType mode(const inverse_chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const inverse_chi_squared_distribution& dist) { // mode is not defined in Mathematica. 
// See Discussion section http://en.wikipedia.org/wiki/Talk:Scaled-inverse-chi-square_distribution // for origin of the formula used below. RealType df = dist.degrees_of_freedom(); RealType scale = dist.scale(); - static const char* function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mode(const inverse_chi_squared_distribution<%1%>&)"; if(df < 0) return policies::raise_domain_error( function, @@ -334,11 +334,11 @@ inline RealType mode(const inverse_chi_squared_distribution& d // Now implemented via quantile(half) in derived accessors. template -inline RealType skewness(const inverse_chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType skewness(const inverse_chi_squared_distribution& dist) { BOOST_MATH_STD_USING // For ADL RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::skewness(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 6) return policies::raise_domain_error( function, @@ -349,10 +349,10 @@ inline RealType skewness(const inverse_chi_squared_distribution -inline RealType kurtosis(const inverse_chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis(const inverse_chi_squared_distribution& dist) { RealType df = dist.degrees_of_freedom(); - static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 8) return policies::raise_domain_error( function, @@ -363,10 +363,10 @@ inline RealType kurtosis(const inverse_chi_squared_distribution -inline RealType kurtosis_excess(const inverse_chi_squared_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const inverse_chi_squared_distribution& dist) { RealType 
df = dist.degrees_of_freedom(); - static const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::kurtosis(const inverse_chi_squared_distribution<%1%>&)"; if(df <= 8) return policies::raise_domain_error( function, diff --git a/include/boost/math/distributions/inverse_gamma.hpp b/include/boost/math/distributions/inverse_gamma.hpp index fa5d357ac7..0cf14b7812 100644 --- a/include/boost/math/distributions/inverse_gamma.hpp +++ b/include/boost/math/distributions/inverse_gamma.hpp @@ -35,7 +35,7 @@ namespace detail { template -inline bool check_inverse_gamma_shape( +inline BOOST_GPU_ENABLED bool check_inverse_gamma_shape( const char* function, // inverse_gamma RealType shape, // shape aka alpha RealType* result, // to update, perhaps with NaN @@ -56,7 +56,7 @@ inline bool check_inverse_gamma_shape( } //bool check_inverse_gamma_shape template -inline bool check_inverse_gamma_x( +inline BOOST_GPU_ENABLED bool check_inverse_gamma_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -72,7 +72,7 @@ inline bool check_inverse_gamma_x( } template -inline bool check_inverse_gamma( +inline BOOST_GPU_ENABLED bool check_inverse_gamma( const char* function, // TODO swap these over, so shape is first. 
RealType scale, // scale aka beta RealType shape, // shape aka alpha @@ -91,7 +91,7 @@ class inverse_gamma_distribution typedef RealType value_type; typedef Policy policy_type; - inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) + BOOST_GPU_ENABLED inverse_gamma_distribution(RealType l_shape = 1, RealType l_scale = 1) : m_shape(l_shape), m_scale(l_scale) { RealType result; @@ -100,12 +100,12 @@ class inverse_gamma_distribution l_scale, l_shape, &result, Policy()); } - RealType shape()const + BOOST_GPU_ENABLED RealType shape()const { return m_shape; } - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { return m_scale; } @@ -141,11 +141,11 @@ inline const std::pair support(const inverse_gamma_distribut } template -inline RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -189,11 +189,11 @@ inline RealType pdf(const inverse_gamma_distribution& dist, co } // pdf template -inline RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const inverse_gamma_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -222,7 +222,7 @@ inline RealType quantile(const inverse_gamma_distribution& dis BOOST_MATH_STD_USING // for ADL of std functions using 
boost::math::gamma_q_inv; - static const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const inverse_gamma_distribution<%1%>&, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -244,11 +244,11 @@ inline RealType quantile(const inverse_gamma_distribution& dis } template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const gamma_distribution<%1%>&, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -272,7 +272,7 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const inverse_gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mean(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -323,11 +323,11 @@ inline RealType mean(const inverse_gamma_distribution& dist) } // mean template -inline RealType variance(const inverse_gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType variance(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::variance(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -349,11 +349,11 @@ inline 
RealType variance(const inverse_gamma_distribution& dis } template -inline RealType mode(const inverse_gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mode(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -376,11 +376,11 @@ inline RealType mode(const inverse_gamma_distribution& dist) //} template -inline RealType skewness(const inverse_gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType skewness(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::skewness(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -402,11 +402,11 @@ inline RealType skewness(const inverse_gamma_distribution& dis } template -inline RealType kurtosis_excess(const inverse_gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const inverse_gamma_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::kurtosis_excess(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -428,9 +428,9 @@ inline RealType kurtosis_excess(const inverse_gamma_distribution -inline RealType kurtosis(const inverse_gamma_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis(const inverse_gamma_distribution& dist) { - static const char* function = "boost::math::kurtosis(const 
inverse_gamma_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::kurtosis(const inverse_gamma_distribution<%1%>&)"; RealType shape = dist.shape(); RealType scale = dist.scale(); diff --git a/include/boost/math/distributions/inverse_gaussian.hpp b/include/boost/math/distributions/inverse_gaussian.hpp index e3aa4e0650..7f8e487873 100644 --- a/include/boost/math/distributions/inverse_gaussian.hpp +++ b/include/boost/math/distributions/inverse_gaussian.hpp @@ -74,10 +74,10 @@ class inverse_gaussian_distribution typedef RealType value_type; typedef Policy policy_type; - inverse_gaussian_distribution(RealType l_mean = 1, RealType l_scale = 1) + BOOST_GPU_ENABLED inverse_gaussian_distribution(RealType l_mean = 1, RealType l_scale = 1) : m_mean(l_mean), m_scale(l_scale) { // Default is a 1,1 inverse_gaussian distribution. - static const char* function = "boost::math::inverse_gaussian_distribution<%1%>::inverse_gaussian_distribution"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::inverse_gaussian_distribution<%1%>::inverse_gaussian_distribution"; RealType result; detail::check_scale(function, l_scale, &result, Policy()); @@ -85,22 +85,22 @@ class inverse_gaussian_distribution detail::check_x_gt0(function, l_mean, &result, Policy()); } - RealType mean()const + BOOST_GPU_ENABLED RealType mean()const { // alias for location. return m_mean; // aka mu } // Synonyms, provided to allow generic use of find_location and find_scale. - RealType location()const + BOOST_GPU_ENABLED RealType location()const { // location, aka mu. return m_mean; } - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { // scale, aka lambda. return m_scale; } - RealType shape()const + BOOST_GPU_ENABLED RealType shape()const { // shape, aka phi = lambda/mu. 
return m_scale / m_mean; } @@ -131,14 +131,14 @@ inline const std::pair support(const inverse_gaussian_distri } template -inline RealType pdf(const inverse_gaussian_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const inverse_gaussian_distribution& dist, const RealType& x) { // Probability Density Function BOOST_MATH_STD_USING // for ADL of std functions RealType scale = dist.scale(); RealType mean = dist.mean(); RealType result = 0; - static const char* function = "boost::math::pdf(const inverse_gaussian_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const inverse_gaussian_distribution<%1%>&, %1%)"; if(false == detail::check_scale(function, scale, &result, Policy())) { return result; @@ -168,13 +168,13 @@ inline RealType pdf(const inverse_gaussian_distribution& dist, } // pdf template -inline RealType cdf(const inverse_gaussian_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const inverse_gaussian_distribution& dist, const RealType& x) { // Cumulative Density Function. BOOST_MATH_STD_USING // for ADL of std functions. RealType scale = dist.scale(); RealType mean = dist.mean(); - static const char* function = "boost::math::cdf(const inverse_gaussian_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const inverse_gaussian_distribution<%1%>&, %1%)"; RealType result = 0; if(false == detail::check_scale(function, scale, &result, Policy())) { @@ -363,14 +363,14 @@ inline RealType quantile(const inverse_gaussian_distribution& } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions. 
RealType scale = c.dist.scale(); RealType mean = c.dist.mean(); RealType x = c.param; - static const char* function = "boost::math::cdf(const complement(inverse_gaussian_distribution<%1%>&), %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const complement(inverse_gaussian_distribution<%1%>&), %1%)"; // infinite arguments not supported. //if((boost::math::isinf)(x)) //{ @@ -448,25 +448,25 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const inverse_gaussian_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const inverse_gaussian_distribution& dist) { // aka mu return dist.mean(); } template -inline RealType scale(const inverse_gaussian_distribution& dist) +inline BOOST_GPU_ENABLED RealType scale(const inverse_gaussian_distribution& dist) { // aka lambda return dist.scale(); } template -inline RealType shape(const inverse_gaussian_distribution& dist) +inline BOOST_GPU_ENABLED RealType shape(const inverse_gaussian_distribution& dist) { // aka phi return dist.shape(); } template -inline RealType standard_deviation(const inverse_gaussian_distribution& dist) +inline BOOST_GPU_ENABLED RealType standard_deviation(const inverse_gaussian_distribution& dist) { BOOST_MATH_STD_USING RealType scale = dist.scale(); @@ -476,7 +476,7 @@ inline RealType standard_deviation(const inverse_gaussian_distribution -inline RealType mode(const inverse_gaussian_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const inverse_gaussian_distribution& dist) { BOOST_MATH_STD_USING RealType scale = dist.scale(); @@ -487,7 +487,7 @@ inline RealType mode(const inverse_gaussian_distribution& dist } template -inline RealType skewness(const inverse_gaussian_distribution& dist) +inline BOOST_GPU_ENABLED RealType skewness(const inverse_gaussian_distribution& dist) { BOOST_MATH_STD_USING RealType scale = dist.scale(); @@ -497,7 +497,7 @@ inline RealType skewness(const inverse_gaussian_distribution& } template -inline RealType 
kurtosis(const inverse_gaussian_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis(const inverse_gaussian_distribution& dist) { RealType scale = dist.scale(); RealType mean = dist.mean(); @@ -506,7 +506,7 @@ inline RealType kurtosis(const inverse_gaussian_distribution& } template -inline RealType kurtosis_excess(const inverse_gaussian_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const inverse_gaussian_distribution& dist) { RealType scale = dist.scale(); RealType mean = dist.mean(); diff --git a/include/boost/math/distributions/laplace.hpp b/include/boost/math/distributions/laplace.hpp index 09b24c868b..e7b723775d 100644 --- a/include/boost/math/distributions/laplace.hpp +++ b/include/boost/math/distributions/laplace.hpp @@ -42,7 +42,7 @@ class laplace_distribution // ---------------------------------- // Constructor(s) // ---------------------------------- - laplace_distribution(RealType l_location = 0, RealType l_scale = 1) + BOOST_GPU_ENABLED laplace_distribution(RealType l_location = 0, RealType l_scale = 1) : m_location(l_location), m_scale(l_scale) { RealType result; @@ -54,17 +54,17 @@ class laplace_distribution // Public functions // ---------------------------------- - RealType location() const + BOOST_GPU_ENABLED RealType location() const { return m_location; } - RealType scale() const + BOOST_GPU_ENABLED RealType scale() const { return m_scale; } - bool check_parameters(const char* function, RealType* result) const + BOOST_GPU_ENABLED bool check_parameters(const char* function, RealType* result) const { if(false == detail::check_scale(function, m_scale, result, Policy())) return false; if(false == detail::check_location(function, m_location, result, Policy())) return false; @@ -112,7 +112,7 @@ inline const std::pair support(const laplace_distribution -inline RealType pdf(const laplace_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const laplace_distribution& dist, const RealType& x) { 
BOOST_MATH_STD_USING // for ADL of std functions @@ -144,7 +144,7 @@ inline RealType pdf(const laplace_distribution& dist, const Re } // pdf template -inline RealType cdf(const laplace_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const laplace_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // For ADL of std functions. @@ -179,7 +179,7 @@ inline RealType cdf(const laplace_distribution& dist, const Re template -inline RealType quantile(const laplace_distribution& dist, const RealType& p) +inline BOOST_GPU_ENABLED RealType quantile(const laplace_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions. @@ -217,7 +217,7 @@ inline RealType quantile(const laplace_distribution& dist, con template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { // Calculate complement of cdf. BOOST_MATH_STD_USING // for ADL of std functions @@ -257,7 +257,7 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions. 
@@ -292,43 +292,43 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const laplace_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const laplace_distribution& dist) { return dist.location(); } template -inline RealType standard_deviation(const laplace_distribution& dist) +inline BOOST_GPU_ENABLED RealType standard_deviation(const laplace_distribution& dist) { return constants::root_two() * dist.scale(); } template -inline RealType mode(const laplace_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const laplace_distribution& dist) { return dist.location(); } template -inline RealType median(const laplace_distribution& dist) +inline BOOST_GPU_ENABLED RealType median(const laplace_distribution& dist) { return dist.location(); } template -inline RealType skewness(const laplace_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType skewness(const laplace_distribution& /*dist*/) { return 0; } template -inline RealType kurtosis(const laplace_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis(const laplace_distribution& /*dist*/) { return 6; } template -inline RealType kurtosis_excess(const laplace_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const laplace_distribution& /*dist*/) { return 3; } diff --git a/include/boost/math/distributions/logistic.hpp b/include/boost/math/distributions/logistic.hpp index b3d16b7197..fbe02d8961 100644 --- a/include/boost/math/distributions/logistic.hpp +++ b/include/boost/math/distributions/logistic.hpp @@ -24,22 +24,22 @@ namespace boost { namespace math { typedef RealType value_type; typedef Policy policy_type; - logistic_distribution(RealType l_location=0, RealType l_scale=1) // Constructor. + BOOST_GPU_ENABLED logistic_distribution(RealType l_location=0, RealType l_scale=1) // Constructor. 
: m_location(l_location), m_scale(l_scale) { - static const char* function = "boost::math::logistic_distribution<%1%>::logistic_distribution"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::logistic_distribution<%1%>::logistic_distribution"; RealType result; detail::check_scale(function, l_scale, &result, Policy()); detail::check_location(function, l_location, &result, Policy()); } // Accessor functions. - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { return m_scale; } - RealType location()const + BOOST_GPU_ENABLED RealType location()const { return m_location; } @@ -70,9 +70,9 @@ namespace boost { namespace math { } template - inline RealType pdf(const logistic_distribution& dist, const RealType& x) + inline BOOST_GPU_ENABLED RealType pdf(const logistic_distribution& dist, const RealType& x) { - static const char* function = "boost::math::pdf(const logistic_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const logistic_distribution<%1%>&, %1%)"; RealType scale = dist.scale(); RealType location = dist.location(); RealType result = 0; @@ -107,12 +107,12 @@ namespace boost { namespace math { } template - inline RealType cdf(const logistic_distribution& dist, const RealType& x) + inline BOOST_GPU_ENABLED RealType cdf(const logistic_distribution& dist, const RealType& x) { RealType scale = dist.scale(); RealType location = dist.location(); RealType result = 0; // of checks. 
- static const char* function = "boost::math::cdf(const logistic_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const logistic_distribution<%1%>&, %1%)"; if(false == detail::check_scale(function, scale, &result, Policy())) { return result; @@ -142,13 +142,13 @@ namespace boost { namespace math { } template - inline RealType quantile(const logistic_distribution& dist, const RealType& p) + inline BOOST_GPU_ENABLED RealType quantile(const logistic_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING RealType location = dist.location(); RealType scale = dist.scale(); - static const char* function = "boost::math::quantile(const logistic_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const logistic_distribution<%1%>&, %1%)"; RealType result = 0; if(false == detail::check_scale(function, scale, &result, Policy())) @@ -178,13 +178,13 @@ namespace boost { namespace math { } // RealType quantile(const logistic_distribution& dist, const RealType& p) template - inline RealType cdf(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING RealType location = c.dist.location(); RealType scale = c.dist.scale(); RealType x = c.param; - static const char* function = "boost::math::cdf(const complement(logistic_distribution<%1%>&), %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const complement(logistic_distribution<%1%>&), %1%)"; RealType result = 0; if(false == detail::check_scale(function, scale, &result, Policy())) @@ -213,12 +213,12 @@ namespace boost { namespace math { } template - inline RealType quantile(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING RealType scale = c.dist.scale(); RealType location = c.dist.location(); - static const char* function = 
"boost::math::quantile(const complement(logistic_distribution<%1%>&), %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const complement(logistic_distribution<%1%>&), %1%)"; RealType result = 0; if(false == detail::check_scale(function, scale, &result, Policy())) return result; @@ -249,13 +249,13 @@ namespace boost { namespace math { } template - inline RealType mean(const logistic_distribution& dist) + inline BOOST_GPU_ENABLED RealType mean(const logistic_distribution& dist) { return dist.location(); } // RealType mean(const logistic_distribution& dist) template - inline RealType variance(const logistic_distribution& dist) + inline BOOST_GPU_ENABLED RealType variance(const logistic_distribution& dist) { BOOST_MATH_STD_USING RealType scale = dist.scale(); @@ -263,30 +263,30 @@ namespace boost { namespace math { } // RealType variance(const logistic_distribution& dist) template - inline RealType mode(const logistic_distribution& dist) + inline BOOST_GPU_ENABLED RealType mode(const logistic_distribution& dist) { return dist.location(); } template - inline RealType median(const logistic_distribution& dist) + inline BOOST_GPU_ENABLED RealType median(const logistic_distribution& dist) { return dist.location(); } template - inline RealType skewness(const logistic_distribution& /*dist*/) + inline BOOST_GPU_ENABLED RealType skewness(const logistic_distribution& /*dist*/) { return 0; } // RealType skewness(const logistic_distribution& dist) template - inline RealType kurtosis_excess(const logistic_distribution& /*dist*/) + inline BOOST_GPU_ENABLED RealType kurtosis_excess(const logistic_distribution& /*dist*/) { return static_cast(6)/5; } // RealType kurtosis_excess(const logistic_distribution& dist) template - inline RealType kurtosis(const logistic_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis(const logistic_distribution& dist) { return kurtosis_excess(dist) + 3; } // RealType kurtosis_excess(const logistic_distribution& dist) 
diff --git a/include/boost/math/distributions/lognormal.hpp b/include/boost/math/distributions/lognormal.hpp index 4e6c0610d4..b8a3ce525b 100644 --- a/include/boost/math/distributions/lognormal.hpp +++ b/include/boost/math/distributions/lognormal.hpp @@ -23,7 +23,7 @@ namespace detail { template - inline bool check_lognormal_x( + inline BOOST_GPU_ENABLED bool check_lognormal_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -48,7 +48,7 @@ class lognormal_distribution typedef RealType value_type; typedef Policy policy_type; - lognormal_distribution(RealType l_location = 0, RealType l_scale = 1) + BOOST_GPU_ENABLED lognormal_distribution(RealType l_location = 0, RealType l_scale = 1) : m_location(l_location), m_scale(l_scale) { RealType result; @@ -56,12 +56,12 @@ class lognormal_distribution detail::check_location("boost::math::lognormal_distribution<%1%>::lognormal_distribution", l_location, &result, Policy()); } - RealType location()const + BOOST_GPU_ENABLED RealType location()const { return m_location; } - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { return m_scale; } @@ -91,14 +91,14 @@ inline const std::pair support(const lognormal_distribution< } template -RealType pdf(const lognormal_distribution& dist, const RealType& x) +BOOST_GPU_ENABLED RealType pdf(const lognormal_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions RealType mu = dist.location(); RealType sigma = dist.scale(); - static const char* function = "boost::math::pdf(const lognormal_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const lognormal_distribution<%1%>&, %1%)"; RealType result = 0; if(0 == detail::check_scale(function, sigma, &result, Policy())) @@ -122,11 +122,11 @@ RealType pdf(const lognormal_distribution& dist, const RealTyp } template -inline RealType cdf(const lognormal_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType 
cdf(const lognormal_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const lognormal_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const lognormal_distribution<%1%>&, %1%)"; RealType result = 0; if(0 == detail::check_scale(function, dist.scale(), &result, Policy())) @@ -144,11 +144,11 @@ inline RealType cdf(const lognormal_distribution& dist, const } template -inline RealType quantile(const lognormal_distribution& dist, const RealType& p) +inline BOOST_GPU_ENABLED RealType quantile(const lognormal_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const lognormal_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const lognormal_distribution<%1%>&, %1%)"; RealType result = 0; if(0 == detail::check_scale(function, dist.scale(), &result, Policy())) @@ -168,11 +168,11 @@ inline RealType quantile(const lognormal_distribution& dist, c } template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const lognormal_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const lognormal_distribution<%1%>&, %1%)"; RealType result = 0; if(0 == detail::check_scale(function, c.dist.scale(), &result, Policy())) @@ -190,11 +190,11 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const lognormal_distribution<%1%>&, %1%)"; + 
BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const lognormal_distribution<%1%>&, %1%)"; RealType result = 0; if(0 == detail::check_scale(function, c.dist.scale(), &result, Policy())) @@ -214,7 +214,7 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const lognormal_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const lognormal_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions @@ -231,7 +231,7 @@ inline RealType mean(const lognormal_distribution& dist) } template -inline RealType variance(const lognormal_distribution& dist) +inline BOOST_GPU_ENABLED RealType variance(const lognormal_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions @@ -248,7 +248,7 @@ inline RealType variance(const lognormal_distribution& dist) } template -inline RealType mode(const lognormal_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const lognormal_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions @@ -265,7 +265,7 @@ inline RealType mode(const lognormal_distribution& dist) } template -inline RealType median(const lognormal_distribution& dist) +inline BOOST_GPU_ENABLED RealType median(const lognormal_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions RealType mu = dist.location(); @@ -273,7 +273,7 @@ inline RealType median(const lognormal_distribution& dist) } template -inline RealType skewness(const lognormal_distribution& dist) +inline BOOST_GPU_ENABLED RealType skewness(const lognormal_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions @@ -293,7 +293,7 @@ inline RealType skewness(const lognormal_distribution& dist) } template -inline RealType kurtosis(const lognormal_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis(const lognormal_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions @@ -311,7 +311,7 @@ inline RealType kurtosis(const lognormal_distribution& dist) } template -inline 
RealType kurtosis_excess(const lognormal_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const lognormal_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions diff --git a/include/boost/math/distributions/normal.hpp b/include/boost/math/distributions/normal.hpp index 32cf66e3ef..5a7a2e0673 100644 --- a/include/boost/math/distributions/normal.hpp +++ b/include/boost/math/distributions/normal.hpp @@ -31,32 +31,32 @@ class normal_distribution typedef RealType value_type; typedef Policy policy_type; - normal_distribution(RealType l_mean = 0, RealType sd = 1) + BOOST_GPU_ENABLED normal_distribution(RealType l_mean = 0, RealType sd = 1) : m_mean(l_mean), m_sd(sd) { // Default is a 'standard' normal distribution N01. - static const char* function = "boost::math::normal_distribution<%1%>::normal_distribution"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::normal_distribution<%1%>::normal_distribution"; RealType result; detail::check_scale(function, sd, &result, Policy()); detail::check_location(function, l_mean, &result, Policy()); } - RealType mean()const + BOOST_GPU_ENABLED RealType mean()const { // alias for location. return m_mean; } - RealType standard_deviation()const + BOOST_GPU_ENABLED RealType standard_deviation()const { // alias for scale. return m_sd; } // Synonyms, provided to allow generic use of find_location and find_scale. - RealType location()const + BOOST_GPU_ENABLED RealType location()const { // location. return m_mean; } - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { // scale. 
return m_sd; } @@ -109,14 +109,14 @@ inline const std::pair support(const normal_distribution -inline RealType pdf(const normal_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const normal_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions RealType sd = dist.standard_deviation(); RealType mean = dist.mean(); - static const char* function = "boost::math::pdf(const normal_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const normal_distribution<%1%>&, %1%)"; RealType result = 0; if(false == detail::check_scale(function, sd, &result, Policy())) @@ -152,13 +152,13 @@ inline RealType pdf(const normal_distribution& dist, const Rea } // pdf template -inline RealType cdf(const normal_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const normal_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions RealType sd = dist.standard_deviation(); RealType mean = dist.mean(); - static const char* function = "boost::math::cdf(const normal_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const normal_distribution<%1%>&, %1%)"; RealType result = 0; if(false == detail::check_scale(function, sd, &result, Policy())) { @@ -192,13 +192,13 @@ inline RealType cdf(const normal_distribution& dist, const Rea } // cdf template -inline RealType quantile(const normal_distribution& dist, const RealType& p) +inline BOOST_GPU_ENABLED RealType quantile(const normal_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions RealType sd = dist.standard_deviation(); RealType mean = dist.mean(); - static const char* function = "boost::math::quantile(const normal_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const normal_distribution<%1%>&, %1%)"; RealType result = 0; if(false == detail::check_scale(function, 
sd, &result, Policy())) @@ -216,14 +216,14 @@ inline RealType quantile(const normal_distribution& dist, cons } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions RealType sd = c.dist.standard_deviation(); RealType mean = c.dist.mean(); RealType x = c.param; - static const char* function = "boost::math::cdf(const complement(normal_distribution<%1%>&), %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const complement(normal_distribution<%1%>&), %1%)"; RealType result = 0; if(false == detail::check_scale(function, sd, &result, Policy())) @@ -253,13 +253,13 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions RealType sd = c.dist.standard_deviation(); RealType mean = c.dist.mean(); - static const char* function = "boost::math::quantile(const complement(normal_distribution<%1%>&), %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const complement(normal_distribution<%1%>&), %1%)"; RealType result = 0; if(false == detail::check_scale(function, sd, &result, Policy())) return result; @@ -275,43 +275,43 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const normal_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const normal_distribution& dist) { return dist.mean(); } template -inline RealType standard_deviation(const normal_distribution& dist) +inline BOOST_GPU_ENABLED RealType standard_deviation(const normal_distribution& dist) { return dist.standard_deviation(); } template -inline RealType mode(const normal_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const normal_distribution& dist) { return dist.mean(); } template 
-inline RealType median(const normal_distribution& dist) +inline BOOST_GPU_ENABLED RealType median(const normal_distribution& dist) { return dist.mean(); } template -inline RealType skewness(const normal_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType skewness(const normal_distribution& /*dist*/) { return 0; } template -inline RealType kurtosis(const normal_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis(const normal_distribution& /*dist*/) { return 3; } template -inline RealType kurtosis_excess(const normal_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const normal_distribution& /*dist*/) { return 0; } diff --git a/include/boost/math/distributions/pareto.hpp b/include/boost/math/distributions/pareto.hpp index 1c6cf350f8..0e2b1fe507 100644 --- a/include/boost/math/distributions/pareto.hpp +++ b/include/boost/math/distributions/pareto.hpp @@ -30,7 +30,7 @@ namespace boost namespace detail { // Parameter checking. template - inline bool check_pareto_scale( + inline BOOST_GPU_ENABLED bool check_pareto_scale( const char* function, RealType scale, RealType* result, const Policy& pol) @@ -59,7 +59,7 @@ namespace boost } // bool check_pareto_scale template - inline bool check_pareto_shape( + inline BOOST_GPU_ENABLED bool check_pareto_shape( const char* function, RealType shape, RealType* result, const Policy& pol) @@ -88,7 +88,7 @@ namespace boost } // bool check_pareto_shape( template - inline bool check_pareto_x( + inline BOOST_GPU_ENABLED bool check_pareto_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -117,7 +117,7 @@ namespace boost } // bool check_pareto_x template - inline bool check_pareto( // distribution parameters. + inline BOOST_GPU_ENABLED bool check_pareto( // distribution parameters. 
const char* function, RealType scale, RealType shape, @@ -136,19 +136,19 @@ namespace boost typedef RealType value_type; typedef Policy policy_type; - pareto_distribution(RealType l_scale = 1, RealType l_shape = 1) + BOOST_GPU_ENABLED pareto_distribution(RealType l_scale = 1, RealType l_shape = 1) : m_scale(l_scale), m_shape(l_shape) { // Constructor. RealType result = 0; detail::check_pareto("boost::math::pareto_distribution<%1%>::pareto_distribution", l_scale, l_shape, &result, Policy()); } - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { // AKA Xm and Wolfram b and beta return m_scale; } - RealType shape()const + BOOST_GPU_ENABLED RealType shape()const { // AKA k and Wolfram a and alpha return m_shape; } @@ -176,10 +176,10 @@ namespace boost } // support template - inline RealType pdf(const pareto_distribution& dist, const RealType& x) + inline BOOST_GPU_ENABLED RealType pdf(const pareto_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std function pow. - static const char* function = "boost::math::pdf(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const pareto_distribution<%1%>&, %1%)"; RealType scale = dist.scale(); RealType shape = dist.shape(); RealType result = 0; @@ -195,10 +195,10 @@ namespace boost } // pdf template - inline RealType cdf(const pareto_distribution& dist, const RealType& x) + inline BOOST_GPU_ENABLED RealType cdf(const pareto_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std function pow. 
- static const char* function = "boost::math::cdf(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const pareto_distribution<%1%>&, %1%)"; RealType scale = dist.scale(); RealType shape = dist.shape(); RealType result = 0; @@ -218,10 +218,10 @@ namespace boost } // cdf template - inline RealType quantile(const pareto_distribution& dist, const RealType& p) + inline BOOST_GPU_ENABLED RealType quantile(const pareto_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std function pow. - static const char* function = "boost::math::quantile(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const pareto_distribution<%1%>&, %1%)"; RealType result = 0; RealType scale = dist.scale(); RealType shape = dist.shape(); @@ -245,10 +245,10 @@ namespace boost } // quantile template - inline RealType cdf(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std function pow. - static const char* function = "boost::math::cdf(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const pareto_distribution<%1%>&, %1%)"; RealType result = 0; RealType x = c.param; RealType scale = c.dist.scale(); @@ -267,10 +267,10 @@ namespace boost } // cdf complement template - inline RealType quantile(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std function pow. 
- static const char* function = "boost::math::quantile(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const pareto_distribution<%1%>&, %1%)"; RealType result = 0; RealType q = c.param; RealType scale = c.dist.scale(); @@ -294,10 +294,10 @@ namespace boost } // quantile complement template - inline RealType mean(const pareto_distribution& dist) + inline BOOST_GPU_ENABLED RealType mean(const pareto_distribution& dist) { RealType result = 0; - static const char* function = "boost::math::mean(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mean(const pareto_distribution<%1%>&, %1%)"; if(false == detail::check_pareto(function, dist.scale(), dist.shape(), &result, Policy())) { return result; @@ -314,16 +314,16 @@ namespace boost } // mean template - inline RealType mode(const pareto_distribution& dist) + inline BOOST_GPU_ENABLED RealType mode(const pareto_distribution& dist) { return dist.scale(); } // mode template - inline RealType median(const pareto_distribution& dist) + inline BOOST_GPU_ENABLED RealType median(const pareto_distribution& dist) { RealType result = 0; - static const char* function = "boost::math::median(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::median(const pareto_distribution<%1%>&, %1%)"; if(false == detail::check_pareto(function, dist.scale(), dist.shape(), &result, Policy())) { return result; @@ -333,12 +333,12 @@ namespace boost } // median template - inline RealType variance(const pareto_distribution& dist) + inline BOOST_GPU_ENABLED RealType variance(const pareto_distribution& dist) { RealType result = 0; RealType scale = dist.scale(); RealType shape = dist.shape(); - static const char* function = "boost::math::variance(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::variance(const pareto_distribution<%1%>&, %1%)"; 
if(false == detail::check_pareto(function, scale, shape, &result, Policy())) { return result; @@ -358,12 +358,12 @@ namespace boost } // variance template - inline RealType skewness(const pareto_distribution& dist) + inline BOOST_GPU_ENABLED RealType skewness(const pareto_distribution& dist) { BOOST_MATH_STD_USING RealType result = 0; RealType shape = dist.shape(); - static const char* function = "boost::math::pdf(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const pareto_distribution<%1%>&, %1%)"; if(false == detail::check_pareto(function, dist.scale(), shape, &result, Policy())) { return result; @@ -384,11 +384,11 @@ namespace boost } // skewness template - inline RealType kurtosis(const pareto_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis(const pareto_distribution& dist) { RealType result = 0; RealType shape = dist.shape(); - static const char* function = "boost::math::pdf(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const pareto_distribution<%1%>&, %1%)"; if(false == detail::check_pareto(function, dist.scale(), shape, &result, Policy())) { return result; @@ -408,11 +408,11 @@ namespace boost } // kurtosis template - inline RealType kurtosis_excess(const pareto_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis_excess(const pareto_distribution& dist) { RealType result = 0; RealType shape = dist.shape(); - static const char* function = "boost::math::pdf(const pareto_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const pareto_distribution<%1%>&, %1%)"; if(false == detail::check_pareto(function, dist.scale(), shape, &result, Policy())) { return result; diff --git a/include/boost/math/distributions/poisson.hpp b/include/boost/math/distributions/poisson.hpp index e4665bff69..827d455638 100644 --- a/include/boost/math/distributions/poisson.hpp +++ 
b/include/boost/math/distributions/poisson.hpp @@ -59,7 +59,7 @@ namespace boost // checks are always performed, even if exceptions are not enabled. template - inline bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_mean(const char* function, const RealType& mean, RealType* result, const Policy& pol) { if(!(boost::math::isfinite)(mean) || (mean < 0)) { @@ -72,7 +72,7 @@ namespace boost } // bool check_mean template - inline bool check_mean_NZ(const char* function, const RealType& mean, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_mean_NZ(const char* function, const RealType& mean, RealType* result, const Policy& pol) { // mean == 0 is considered an error. if( !(boost::math::isfinite)(mean) || (mean <= 0)) { @@ -85,13 +85,13 @@ namespace boost } // bool check_mean_NZ template - inline bool check_dist(const char* function, const RealType& mean, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_dist(const char* function, const RealType& mean, RealType* result, const Policy& pol) { // Only one check, so this is redundant really but should be optimized away. 
return check_mean_NZ(function, mean, result, pol); } // bool check_dist template - inline bool check_k(const char* function, const RealType& k, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_k(const char* function, const RealType& k, RealType* result, const Policy& pol) { if((k < 0) || !(boost::math::isfinite)(k)) { @@ -104,7 +104,7 @@ namespace boost } // bool check_k template - inline bool check_dist_and_k(const char* function, RealType mean, RealType k, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_dist_and_k(const char* function, RealType mean, RealType k, RealType* result, const Policy& pol) { if((check_dist(function, mean, result, pol) == false) || (check_k(function, k, result, pol) == false)) @@ -115,7 +115,7 @@ namespace boost } // bool check_dist_and_k template - inline bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_prob(const char* function, const RealType& p, RealType* result, const Policy& pol) { // Check 0 <= p <= 1 if(!(boost::math::isfinite)(p) || (p < 0) || (p > 1)) { @@ -128,7 +128,7 @@ namespace boost } // bool check_prob template - inline bool check_dist_and_prob(const char* function, RealType mean, RealType p, RealType* result, const Policy& pol) + inline BOOST_GPU_ENABLED bool check_dist_and_prob(const char* function, RealType mean, RealType p, RealType* result, const Policy& pol) { if((check_dist(function, mean, result, pol) == false) || (check_prob(function, p, result, pol) == false)) @@ -147,7 +147,7 @@ namespace boost typedef RealType value_type; typedef Policy policy_type; - poisson_distribution(RealType l_mean = 1) : m_l(l_mean) // mean (lambda). + BOOST_GPU_ENABLED poisson_distribution(RealType l_mean = 1) : m_l(l_mean) // mean (lambda). { // Expected mean number of events that occur during the given interval. 
RealType r; poisson_detail::check_dist( @@ -156,7 +156,7 @@ namespace boost &r, Policy()); } // poisson_distribution constructor. - RealType mean() const + BOOST_GPU_ENABLED RealType mean() const { // Private data getter function. return m_l; } @@ -185,13 +185,13 @@ namespace boost } template - inline RealType mean(const poisson_distribution& dist) + inline BOOST_GPU_ENABLED RealType mean(const poisson_distribution& dist) { // Mean of poisson distribution = lambda. return dist.mean(); } // mean template - inline RealType mode(const poisson_distribution& dist) + inline BOOST_GPU_ENABLED RealType mode(const poisson_distribution& dist) { // mode. BOOST_MATH_STD_USING // ADL of std functions. return floor(dist.mean()); @@ -208,7 +208,7 @@ namespace boost // Now implemented via quantile(half) in derived accessors. template - inline RealType variance(const poisson_distribution& dist) + inline BOOST_GPU_ENABLED RealType variance(const poisson_distribution& dist) { // variance. return dist.mean(); } @@ -217,14 +217,14 @@ namespace boost // standard_deviation provided by derived accessors. template - inline RealType skewness(const poisson_distribution& dist) + inline BOOST_GPU_ENABLED RealType skewness(const poisson_distribution& dist) { // skewness = sqrt(l). BOOST_MATH_STD_USING // ADL of std functions. return 1 / sqrt(dist.mean()); } template - inline RealType kurtosis_excess(const poisson_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis_excess(const poisson_distribution& dist) { // skewness = sqrt(l). return 1 / dist.mean(); // kurtosis_excess 1/mean from Wiki & MathWorld eq 31. 
// http://mathworld.wolfram.com/Kurtosis.html explains that the kurtosis excess @@ -233,7 +233,7 @@ namespace boost } // RealType kurtosis_excess template - inline RealType kurtosis(const poisson_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis(const poisson_distribution& dist) { // kurtosis is 4th moment about the mean = u4 / sd ^ 4 // http://en.wikipedia.org/wiki/Curtosis // kurtosis can range from -2 (flat top) to +infinity (sharp peak & heavy tails). @@ -245,7 +245,7 @@ namespace boost } // RealType kurtosis template - RealType pdf(const poisson_distribution& dist, const RealType& k) + BOOST_GPU_ENABLED RealType pdf(const poisson_distribution& dist, const RealType& k) { // Probability Density/Mass Function. // Probability that there are EXACTLY k occurrences (or arrivals). BOOST_FPU_EXCEPTION_GUARD @@ -277,7 +277,7 @@ namespace boost } // pdf template - RealType cdf(const poisson_distribution& dist, const RealType& k) + BOOST_GPU_ENABLED RealType cdf(const poisson_distribution& dist, const RealType& k) { // Cumulative Distribution Function Poisson. // The random variate k is the number of occurrences(or arrivals) // k argument may be integral, signed, or unsigned, or floating point. @@ -328,7 +328,7 @@ namespace boost } // binomial cdf template - RealType cdf(const complemented2_type, RealType>& c) + BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { // Complemented Cumulative Distribution Function Poisson // The random variate k is the number of events, occurrences or arrivals. // k argument may be integral, signed, or unsigned, or floating point. 
diff --git a/include/boost/math/distributions/rayleigh.hpp b/include/boost/math/distributions/rayleigh.hpp index 744733a9fa..4de2016431 100644 --- a/include/boost/math/distributions/rayleigh.hpp +++ b/include/boost/math/distributions/rayleigh.hpp @@ -26,7 +26,7 @@ namespace boost{ namespace math{ namespace detail { // Error checks: template - inline bool verify_sigma(const char* function, RealType sigma, RealType* presult, const Policy& pol) + inline BOOST_GPU_ENABLED bool verify_sigma(const char* function, RealType sigma, RealType* presult, const Policy& pol) { if((sigma <= 0) || (!(boost::math::isfinite)(sigma))) { @@ -39,7 +39,7 @@ namespace detail } // bool verify_sigma template - inline bool verify_rayleigh_x(const char* function, RealType x, RealType* presult, const Policy& pol) + inline BOOST_GPU_ENABLED bool verify_rayleigh_x(const char* function, RealType x, RealType* presult, const Policy& pol) { if((x < 0) || (boost::math::isnan)(x)) { @@ -59,14 +59,14 @@ class rayleigh_distribution typedef RealType value_type; typedef Policy policy_type; - rayleigh_distribution(RealType l_sigma = 1) + BOOST_GPU_ENABLED rayleigh_distribution(RealType l_sigma = 1) : m_sigma(l_sigma) { RealType err; detail::verify_sigma("boost::math::rayleigh_distribution<%1%>::rayleigh_distribution", l_sigma, &err, Policy()); } // rayleigh_distribution - RealType sigma()const + BOOST_GPU_ENABLED RealType sigma()const { // Accessor. return m_sigma; } @@ -93,13 +93,13 @@ inline const std::pair support(const rayleigh_distribution -inline RealType pdf(const rayleigh_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const rayleigh_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std function exp. 
RealType sigma = dist.sigma(); RealType result = 0; - static const char* function = "boost::math::pdf(const rayleigh_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const rayleigh_distribution<%1%>&, %1%)"; if(false == detail::verify_sigma(function, sigma, &result, Policy())) { return result; @@ -118,13 +118,13 @@ inline RealType pdf(const rayleigh_distribution& dist, const R } // pdf template -inline RealType cdf(const rayleigh_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const rayleigh_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions RealType result = 0; RealType sigma = dist.sigma(); - static const char* function = "boost::math::cdf(const rayleigh_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const rayleigh_distribution<%1%>&, %1%)"; if(false == detail::verify_sigma(function, sigma, &result, Policy())) { return result; @@ -138,13 +138,13 @@ inline RealType cdf(const rayleigh_distribution& dist, const R } // cdf template -inline RealType quantile(const rayleigh_distribution& dist, const RealType& p) +inline BOOST_GPU_ENABLED RealType quantile(const rayleigh_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions RealType result = 0; RealType sigma = dist.sigma(); - static const char* function = "boost::math::quantile(const rayleigh_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const rayleigh_distribution<%1%>&, %1%)"; if(false == detail::verify_sigma(function, sigma, &result, Policy())) return result; if(false == detail::check_probability(function, p, &result, Policy())) @@ -163,13 +163,13 @@ inline RealType quantile(const rayleigh_distribution& dist, co } // quantile template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { 
BOOST_MATH_STD_USING // for ADL of std functions RealType result = 0; RealType sigma = c.dist.sigma(); - static const char* function = "boost::math::cdf(const rayleigh_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const rayleigh_distribution<%1%>&, %1%)"; if(false == detail::verify_sigma(function, sigma, &result, Policy())) { return result; @@ -188,13 +188,13 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions, log & sqrt. RealType result = 0; RealType sigma = c.dist.sigma(); - static const char* function = "boost::math::quantile(const rayleigh_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const rayleigh_distribution<%1%>&, %1%)"; if(false == detail::verify_sigma(function, sigma, &result, Policy())) { return result; @@ -217,11 +217,11 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const rayleigh_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const rayleigh_distribution& dist) { RealType result = 0; RealType sigma = dist.sigma(); - static const char* function = "boost::math::mean(const rayleigh_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mean(const rayleigh_distribution<%1%>&, %1%)"; if(false == detail::verify_sigma(function, sigma, &result, Policy())) { return result; @@ -231,11 +231,11 @@ inline RealType mean(const rayleigh_distribution& dist) } // mean template -inline RealType variance(const rayleigh_distribution& dist) +inline BOOST_GPU_ENABLED RealType variance(const rayleigh_distribution& dist) { RealType result = 0; RealType sigma = dist.sigma(); - static const char* function = "boost::math::variance(const rayleigh_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function 
= "boost::math::variance(const rayleigh_distribution<%1%>&, %1%)"; if(false == detail::verify_sigma(function, sigma, &result, Policy())) { return result; @@ -245,20 +245,20 @@ inline RealType variance(const rayleigh_distribution& dist) } // variance template -inline RealType mode(const rayleigh_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const rayleigh_distribution& dist) { return dist.sigma(); } template -inline RealType median(const rayleigh_distribution& dist) +inline BOOST_GPU_ENABLED RealType median(const rayleigh_distribution& dist) { using boost::math::constants::root_ln_four; return root_ln_four() * dist.sigma(); } template -inline RealType skewness(const rayleigh_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType skewness(const rayleigh_distribution& /*dist*/) { // using namespace boost::math::constants; return static_cast(0.63111065781893713819189935154422777984404221106391L); @@ -267,7 +267,7 @@ inline RealType skewness(const rayleigh_distribution& /*dist*/ } template -inline RealType kurtosis(const rayleigh_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis(const rayleigh_distribution& /*dist*/) { // using namespace boost::math::constants; return static_cast(3.2450893006876380628486604106197544154170667057995L); @@ -277,7 +277,7 @@ inline RealType kurtosis(const rayleigh_distribution& /*dist*/ } template -inline RealType kurtosis_excess(const rayleigh_distribution& /*dist*/) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const rayleigh_distribution& /*dist*/) { //using namespace boost::math::constants; // Computed using NTL at 150 bit, about 50 decimal digits. 
diff --git a/include/boost/math/distributions/skew_normal.hpp b/include/boost/math/distributions/skew_normal.hpp index f348347ede..1a2dcf37a0 100644 --- a/include/boost/math/distributions/skew_normal.hpp +++ b/include/boost/math/distributions/skew_normal.hpp @@ -392,7 +392,7 @@ namespace boost{ namespace math{ const diff_type d = std::distance(shapes, result_ptr); - BOOST_ASSERT(d > static_cast(0)); + BOOST_MATH_ASSERT(d > static_cast(0)); // refine if(d < static_cast(21)) // shape smaller 100 @@ -531,7 +531,7 @@ namespace boost{ namespace math{ const diff_type d = std::distance(shapes, result_ptr); - BOOST_ASSERT(d > static_cast(0)); + BOOST_MATH_ASSERT(d > static_cast(0)); // TODO: make the search bounds smarter, depending on the shape parameter RealType search_min = 0; // below zero was caught above diff --git a/include/boost/math/distributions/triangular.hpp b/include/boost/math/distributions/triangular.hpp index 1e49a38faf..92d8abef19 100644 --- a/include/boost/math/distributions/triangular.hpp +++ b/include/boost/math/distributions/triangular.hpp @@ -29,7 +29,7 @@ namespace boost{ namespace math namespace detail { template - inline bool check_triangular_lower( + inline BOOST_GPU_ENABLED bool check_triangular_lower( const char* function, RealType lower, RealType* result, const Policy& pol) @@ -48,7 +48,7 @@ namespace boost{ namespace math } // bool check_triangular_lower( template - inline bool check_triangular_mode( + inline BOOST_GPU_ENABLED bool check_triangular_mode( const char* function, RealType mode, RealType* result, const Policy& pol) @@ -67,7 +67,7 @@ namespace boost{ namespace math } // bool check_triangular_mode( template - inline bool check_triangular_upper( + inline BOOST_GPU_ENABLED bool check_triangular_upper( const char* function, RealType upper, RealType* result, const Policy& pol) @@ -86,7 +86,7 @@ namespace boost{ namespace math } // bool check_triangular_upper( template - inline bool check_triangular_x( + inline BOOST_GPU_ENABLED bool 
check_triangular_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -105,7 +105,7 @@ namespace boost{ namespace math } // bool check_triangular_x template - inline bool check_triangular( + inline BOOST_GPU_ENABLED bool check_triangular( const char* function, RealType lower, RealType mode, @@ -153,7 +153,7 @@ namespace boost{ namespace math typedef RealType value_type; typedef Policy policy_type; - triangular_distribution(RealType l_lower = -1, RealType l_mode = 0, RealType l_upper = 1) + BOOST_GPU_ENABLED triangular_distribution(RealType l_lower = -1, RealType l_mode = 0, RealType l_upper = 1) : m_lower(l_lower), m_mode(l_mode), m_upper(l_upper) // Constructor. { // Evans says 'standard triangular' is lower 0, mode 1/2, upper 1, // has median sqrt(c/2) for c <=1/2 and 1 - sqrt(1-c)/2 for c >= 1/2 @@ -163,15 +163,15 @@ namespace boost{ namespace math detail::check_triangular("boost::math::triangular_distribution<%1%>::triangular_distribution",l_lower, l_mode, l_upper, &result, Policy()); } // Accessor functions. 
- RealType lower()const + BOOST_GPU_ENABLED RealType lower()const { return m_lower; } - RealType mode()const + BOOST_GPU_ENABLED RealType mode()const { return m_mode; } - RealType upper()const + BOOST_GPU_ENABLED RealType upper()const { return m_upper; } @@ -199,9 +199,9 @@ namespace boost{ namespace math } template - RealType pdf(const triangular_distribution& dist, const RealType& x) + BOOST_GPU_ENABLED RealType pdf(const triangular_distribution& dist, const RealType& x) { - static const char* function = "boost::math::pdf(const triangular_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const triangular_distribution<%1%>&, %1%)"; RealType lower = dist.lower(); RealType mode = dist.mode(); RealType upper = dist.upper(); @@ -237,9 +237,9 @@ namespace boost{ namespace math } // RealType pdf(const triangular_distribution& dist, const RealType& x) template - inline RealType cdf(const triangular_distribution& dist, const RealType& x) + inline BOOST_GPU_ENABLED RealType cdf(const triangular_distribution& dist, const RealType& x) { - static const char* function = "boost::math::cdf(const triangular_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const triangular_distribution<%1%>&, %1%)"; RealType lower = dist.lower(); RealType mode = dist.mode(); RealType upper = dist.upper(); @@ -272,10 +272,10 @@ namespace boost{ namespace math } // RealType cdf(const triangular_distribution& dist, const RealType& x) template - RealType quantile(const triangular_distribution& dist, const RealType& p) + BOOST_GPU_ENABLED RealType quantile(const triangular_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions (sqrt). 
- static const char* function = "boost::math::quantile(const triangular_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const triangular_distribution<%1%>&, %1%)"; RealType lower = dist.lower(); RealType mode = dist.mode(); RealType upper = dist.upper(); @@ -315,9 +315,9 @@ namespace boost{ namespace math } // RealType quantile(const triangular_distribution& dist, const RealType& q) template - RealType cdf(const complemented2_type, RealType>& c) + BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { - static const char* function = "boost::math::cdf(const triangular_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const triangular_distribution<%1%>&, %1%)"; RealType lower = c.dist.lower(); RealType mode = c.dist.mode(); RealType upper = c.dist.upper(); @@ -350,10 +350,10 @@ namespace boost{ namespace math } // RealType cdf(const complemented2_type, RealType>& c) template - RealType quantile(const complemented2_type, RealType>& c) + BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // Aid ADL for sqrt. 
- static const char* function = "boost::math::quantile(const triangular_distribution<%1%>&, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const triangular_distribution<%1%>&, %1%)"; RealType l = c.dist.lower(); RealType m = c.dist.mode(); RealType u = c.dist.upper(); @@ -399,9 +399,9 @@ namespace boost{ namespace math } // RealType quantile(const complemented2_type, RealType>& c) template - inline RealType mean(const triangular_distribution& dist) + inline BOOST_GPU_ENABLED RealType mean(const triangular_distribution& dist) { - static const char* function = "boost::math::mean(const triangular_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mean(const triangular_distribution<%1%>&)"; RealType lower = dist.lower(); RealType mode = dist.mode(); RealType upper = dist.upper(); @@ -415,9 +415,9 @@ namespace boost{ namespace math template - inline RealType variance(const triangular_distribution& dist) + inline BOOST_GPU_ENABLED RealType variance(const triangular_distribution& dist) { - static const char* function = "boost::math::mean(const triangular_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mean(const triangular_distribution<%1%>&)"; RealType lower = dist.lower(); RealType mode = dist.mode(); RealType upper = dist.upper(); @@ -430,9 +430,9 @@ namespace boost{ namespace math } // RealType variance(const triangular_distribution& dist) template - inline RealType mode(const triangular_distribution& dist) + inline BOOST_GPU_ENABLED RealType mode(const triangular_distribution& dist) { - static const char* function = "boost::math::mode(const triangular_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mode(const triangular_distribution<%1%>&)"; RealType mode = dist.mode(); RealType result = 0; // of checks. 
if(false == detail::check_triangular_mode(function, mode, &result, Policy())) @@ -443,10 +443,10 @@ namespace boost{ namespace math } // RealType mode template - inline RealType median(const triangular_distribution& dist) + inline BOOST_GPU_ENABLED RealType median(const triangular_distribution& dist) { BOOST_MATH_STD_USING // ADL of std functions. - static const char* function = "boost::math::median(const triangular_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::median(const triangular_distribution<%1%>&)"; RealType mode = dist.mode(); RealType result = 0; // of checks. if(false == detail::check_triangular_mode(function, mode, &result, Policy())) @@ -466,11 +466,11 @@ namespace boost{ namespace math } // RealType mode template - inline RealType skewness(const triangular_distribution& dist) + inline BOOST_GPU_ENABLED RealType skewness(const triangular_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions using namespace boost::math::constants; // for root_two - static const char* function = "boost::math::skewness(const triangular_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::skewness(const triangular_distribution<%1%>&)"; RealType lower = dist.lower(); RealType mode = dist.mode(); @@ -487,9 +487,9 @@ namespace boost{ namespace math } // RealType skewness(const triangular_distribution& dist) template - inline RealType kurtosis(const triangular_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis(const triangular_distribution& dist) { // These checks may be belt and braces as should have been checked on construction? 
- static const char* function = "boost::math::kurtosis(const triangular_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::kurtosis(const triangular_distribution<%1%>&)"; RealType lower = dist.lower(); RealType upper = dist.upper(); RealType mode = dist.mode(); @@ -502,9 +502,9 @@ namespace boost{ namespace math } // RealType kurtosis_excess(const triangular_distribution& dist) template - inline RealType kurtosis_excess(const triangular_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis_excess(const triangular_distribution& dist) { // These checks may be belt and braces as should have been checked on construction? - static const char* function = "boost::math::kurtosis_excess(const triangular_distribution<%1%>&)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::kurtosis_excess(const triangular_distribution<%1%>&)"; RealType lower = dist.lower(); RealType upper = dist.upper(); RealType mode = dist.mode(); diff --git a/include/boost/math/distributions/uniform.hpp b/include/boost/math/distributions/uniform.hpp index 856c144e36..c4dc77e783 100644 --- a/include/boost/math/distributions/uniform.hpp +++ b/include/boost/math/distributions/uniform.hpp @@ -26,7 +26,7 @@ namespace boost{ namespace math namespace detail { template - inline bool check_uniform_lower( + inline BOOST_GPU_ENABLED bool check_uniform_lower( const char* function, RealType lower, RealType* result, const Policy& pol) @@ -45,7 +45,7 @@ namespace boost{ namespace math } // bool check_uniform_lower( template - inline bool check_uniform_upper( + inline BOOST_GPU_ENABLED bool check_uniform_upper( const char* function, RealType upper, RealType* result, const Policy& pol) @@ -64,7 +64,7 @@ namespace boost{ namespace math } // bool check_uniform_upper( template - inline bool check_uniform_x( + inline BOOST_GPU_ENABLED bool check_uniform_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -83,7 +83,7 @@ namespace boost{ 
namespace math } // bool check_uniform_x template - inline bool check_uniform( + inline BOOST_GPU_ENABLED bool check_uniform( const char* function, RealType lower, RealType upper, @@ -116,19 +116,19 @@ namespace boost{ namespace math typedef RealType value_type; typedef Policy policy_type; - uniform_distribution(RealType l_lower = 0, RealType l_upper = 1) // Constructor. + BOOST_GPU_ENABLED uniform_distribution(RealType l_lower = 0, RealType l_upper = 1) // Constructor. : m_lower(l_lower), m_upper(l_upper) // Default is standard uniform distribution. { RealType result; detail::check_uniform("boost::math::uniform_distribution<%1%>::uniform_distribution", l_lower, l_upper, &result, Policy()); } // Accessor functions. - RealType lower()const + BOOST_GPU_ENABLED RealType lower()const { return m_lower; } - RealType upper()const + BOOST_GPU_ENABLED RealType upper()const { return m_upper; } @@ -157,7 +157,7 @@ namespace boost{ namespace math } template - inline RealType pdf(const uniform_distribution& dist, const RealType& x) + inline BOOST_GPU_ENABLED RealType pdf(const uniform_distribution& dist, const RealType& x) { RealType lower = dist.lower(); RealType upper = dist.upper(); @@ -182,7 +182,7 @@ namespace boost{ namespace math } // RealType pdf(const uniform_distribution& dist, const RealType& x) template - inline RealType cdf(const uniform_distribution& dist, const RealType& x) + inline BOOST_GPU_ENABLED RealType cdf(const uniform_distribution& dist, const RealType& x) { RealType lower = dist.lower(); RealType upper = dist.upper(); @@ -207,7 +207,7 @@ namespace boost{ namespace math } // RealType cdf(const uniform_distribution& dist, const RealType& x) template - inline RealType quantile(const uniform_distribution& dist, const RealType& p) + inline BOOST_GPU_ENABLED RealType quantile(const uniform_distribution& dist, const RealType& p) { RealType lower = dist.lower(); RealType upper = dist.upper(); @@ -232,7 +232,7 @@ namespace boost{ namespace math } // RealType 
quantile(const uniform_distribution& dist, const RealType& p) template - inline RealType cdf(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { RealType lower = c.dist.lower(); RealType upper = c.dist.upper(); @@ -258,7 +258,7 @@ namespace boost{ namespace math } // RealType cdf(const complemented2_type, RealType>& c) template - inline RealType quantile(const complemented2_type, RealType>& c) + inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { RealType lower = c.dist.lower(); RealType upper = c.dist.upper(); @@ -284,7 +284,7 @@ namespace boost{ namespace math } // RealType quantile(const complemented2_type, RealType>& c) template - inline RealType mean(const uniform_distribution& dist) + inline BOOST_GPU_ENABLED RealType mean(const uniform_distribution& dist) { RealType lower = dist.lower(); RealType upper = dist.upper(); @@ -297,7 +297,7 @@ namespace boost{ namespace math } // RealType mean(const uniform_distribution& dist) template - inline RealType variance(const uniform_distribution& dist) + inline BOOST_GPU_ENABLED RealType variance(const uniform_distribution& dist) { RealType lower = dist.lower(); RealType upper = dist.upper(); @@ -311,7 +311,7 @@ namespace boost{ namespace math } // RealType variance(const uniform_distribution& dist) template - inline RealType mode(const uniform_distribution& dist) + inline BOOST_GPU_ENABLED RealType mode(const uniform_distribution& dist) { RealType lower = dist.lower(); RealType upper = dist.upper(); @@ -325,7 +325,7 @@ namespace boost{ namespace math } template - inline RealType median(const uniform_distribution& dist) + inline BOOST_GPU_ENABLED RealType median(const uniform_distribution& dist) { RealType lower = dist.lower(); RealType upper = dist.upper(); @@ -337,7 +337,7 @@ namespace boost{ namespace math return (lower + upper) / 2; // } template - inline RealType skewness(const uniform_distribution& dist) + inline 
BOOST_GPU_ENABLED RealType skewness(const uniform_distribution& dist) { RealType lower = dist.lower(); RealType upper = dist.upper(); @@ -350,7 +350,7 @@ namespace boost{ namespace math } // RealType skewness(const uniform_distribution& dist) template - inline RealType kurtosis_excess(const uniform_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis_excess(const uniform_distribution& dist) { RealType lower = dist.lower(); RealType upper = dist.upper(); @@ -363,7 +363,7 @@ namespace boost{ namespace math } // RealType kurtosis_excess(const uniform_distribution& dist) template - inline RealType kurtosis(const uniform_distribution& dist) + inline BOOST_GPU_ENABLED RealType kurtosis(const uniform_distribution& dist) { return kurtosis_excess(dist) + 3; } diff --git a/include/boost/math/distributions/weibull.hpp b/include/boost/math/distributions/weibull.hpp index da1189090c..11ac291f3a 100644 --- a/include/boost/math/distributions/weibull.hpp +++ b/include/boost/math/distributions/weibull.hpp @@ -23,7 +23,7 @@ namespace boost{ namespace math namespace detail{ template -inline bool check_weibull_shape( +inline BOOST_GPU_ENABLED bool check_weibull_shape( const char* function, RealType shape, RealType* result, const Policy& pol) @@ -39,7 +39,7 @@ inline bool check_weibull_shape( } template -inline bool check_weibull_x( +inline BOOST_GPU_ENABLED bool check_weibull_x( const char* function, RealType const& x, RealType* result, const Policy& pol) @@ -55,7 +55,7 @@ inline bool check_weibull_x( } template -inline bool check_weibull( +inline BOOST_GPU_ENABLED bool check_weibull( const char* function, RealType scale, RealType shape, @@ -73,19 +73,19 @@ class weibull_distribution typedef RealType value_type; typedef Policy policy_type; - weibull_distribution(RealType l_shape, RealType l_scale = 1) + BOOST_GPU_ENABLED weibull_distribution(RealType l_shape, RealType l_scale = 1) : m_shape(l_shape), m_scale(l_scale) { RealType result; 
detail::check_weibull("boost::math::weibull_distribution<%1%>::weibull_distribution", l_scale, l_shape, &result, Policy()); } - RealType shape()const + BOOST_GPU_ENABLED RealType shape()const { return m_shape; } - RealType scale()const + BOOST_GPU_ENABLED RealType scale()const { return m_scale; } @@ -117,11 +117,11 @@ inline const std::pair support(const weibull_distribution -inline RealType pdf(const weibull_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType pdf(const weibull_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::pdf(const weibull_distribution<%1%>, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::pdf(const weibull_distribution<%1%>, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -151,11 +151,11 @@ inline RealType pdf(const weibull_distribution& dist, const Re } template -inline RealType cdf(const weibull_distribution& dist, const RealType& x) +inline BOOST_GPU_ENABLED RealType cdf(const weibull_distribution& dist, const RealType& x) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const weibull_distribution<%1%>, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const weibull_distribution<%1%>, %1%)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -172,11 +172,11 @@ inline RealType cdf(const weibull_distribution& dist, const Re } template -inline RealType quantile(const weibull_distribution& dist, const RealType& p) +inline BOOST_GPU_ENABLED RealType quantile(const weibull_distribution& dist, const RealType& p) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const weibull_distribution<%1%>, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const weibull_distribution<%1%>, %1%)"; RealType shape = dist.shape(); RealType scale = 
dist.scale(); @@ -196,11 +196,11 @@ inline RealType quantile(const weibull_distribution& dist, con } template -inline RealType cdf(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType cdf(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::cdf(const weibull_distribution<%1%>, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::cdf(const weibull_distribution<%1%>, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -217,11 +217,11 @@ inline RealType cdf(const complemented2_type -inline RealType quantile(const complemented2_type, RealType>& c) +inline BOOST_GPU_ENABLED RealType quantile(const complemented2_type, RealType>& c) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::quantile(const weibull_distribution<%1%>, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::quantile(const weibull_distribution<%1%>, %1%)"; RealType shape = c.dist.shape(); RealType scale = c.dist.scale(); @@ -242,11 +242,11 @@ inline RealType quantile(const complemented2_type -inline RealType mean(const weibull_distribution& dist) +inline BOOST_GPU_ENABLED RealType mean(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::mean(const weibull_distribution<%1%>)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mean(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -260,12 +260,12 @@ inline RealType mean(const weibull_distribution& dist) } template -inline RealType variance(const weibull_distribution& dist) +inline BOOST_GPU_ENABLED RealType variance(const weibull_distribution& dist) { RealType shape = dist.shape(); RealType scale = dist.scale(); - static const char* function = "boost::math::variance(const weibull_distribution<%1%>)"; + 
BOOST_MATH_GPU_STATIC const char* function = "boost::math::variance(const weibull_distribution<%1%>)"; RealType result = 0; if(false == detail::check_weibull(function, scale, shape, &result, Policy())) @@ -280,11 +280,11 @@ inline RealType variance(const weibull_distribution& dist) } template -inline RealType mode(const weibull_distribution& dist) +inline BOOST_GPU_ENABLED RealType mode(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std function pow. - static const char* function = "boost::math::mode(const weibull_distribution<%1%>)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::mode(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -301,11 +301,11 @@ inline RealType mode(const weibull_distribution& dist) } template -inline RealType median(const weibull_distribution& dist) +inline BOOST_GPU_ENABLED RealType median(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std function pow. 
- static const char* function = "boost::math::median(const weibull_distribution<%1%>)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::median(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); // Wikipedia k RealType scale = dist.scale(); // Wikipedia lambda @@ -321,11 +321,11 @@ inline RealType median(const weibull_distribution& dist) } template -inline RealType skewness(const weibull_distribution& dist) +inline BOOST_GPU_ENABLED RealType skewness(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::skewness(const weibull_distribution<%1%>)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::skewness(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -347,11 +347,11 @@ inline RealType skewness(const weibull_distribution& dist) } template -inline RealType kurtosis_excess(const weibull_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis_excess(const weibull_distribution& dist) { BOOST_MATH_STD_USING // for ADL of std functions - static const char* function = "boost::math::kurtosis_excess(const weibull_distribution<%1%>)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::kurtosis_excess(const weibull_distribution<%1%>)"; RealType shape = dist.shape(); RealType scale = dist.scale(); @@ -377,7 +377,7 @@ inline RealType kurtosis_excess(const weibull_distribution& di } template -inline RealType kurtosis(const weibull_distribution& dist) +inline BOOST_GPU_ENABLED RealType kurtosis(const weibull_distribution& dist) { return kurtosis_excess(dist) + 3; } diff --git a/include/boost/math/policies/error_handling.hpp b/include/boost/math/policies/error_handling.hpp index 124337ee87..d23af3d43f 100644 --- a/include/boost/math/policies/error_handling.hpp +++ b/include/boost/math/policies/error_handling.hpp @@ -34,6 +34,10 @@ #endif #include +#ifdef __CUDACC__ +#include +#endif + namespace 
boost{ namespace math{ class evaluation_error : public std::runtime_error @@ -183,7 +187,7 @@ inline T raise_domain_error( } template -inline BOOST_MATH_CONSTEXPR T raise_domain_error( +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_domain_error( const char* , const char* , const T& , @@ -191,11 +195,15 @@ inline BOOST_MATH_CONSTEXPR T raise_domain_error( { // This may or may not do the right thing, but the user asked for the error // to be ignored so here we go anyway: +#ifdef __CUDA_ARCH__ + return CUDART_INF_F; +#else return std::numeric_limits::quiet_NaN(); +#endif } template -inline T raise_domain_error( +inline BOOST_GPU_ENABLED T raise_domain_error( const char* , const char* , const T& , @@ -228,7 +236,7 @@ inline T raise_pole_error( } template -inline BOOST_MATH_CONSTEXPR T raise_pole_error( +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_pole_error( const char* function, const char* message, const T& val, @@ -282,18 +290,22 @@ inline T raise_overflow_error( } template -inline BOOST_MATH_CONSTEXPR T raise_overflow_error( +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_overflow_error( const char* , const char* , const ::boost::math::policies::overflow_error< ::boost::math::policies::ignore_error>&) BOOST_MATH_NOEXCEPT(T) { // This may or may not do the right thing, but the user asked for the error // to be ignored so here we go anyway: +#ifdef __CUDA_ARCH__ + return CUDART_INF; +#else return std::numeric_limits::has_infinity ? 
std::numeric_limits::infinity() : boost::math::tools::max_value(); +#endif } template -inline BOOST_MATH_CONSTEXPR T raise_overflow_error( +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_overflow_error( const char* , const char* , const T&, @@ -301,7 +313,11 @@ inline BOOST_MATH_CONSTEXPR T raise_overflow_error( { // This may or may not do the right thing, but the user asked for the error // to be ignored so here we go anyway: +#ifdef __CUDA_ARCH__ + return CUDART_INF; +#else return std::numeric_limits::has_infinity ? std::numeric_limits::infinity() : boost::math::tools::max_value(); +#endif } template @@ -364,7 +380,7 @@ inline T raise_underflow_error( } template -inline BOOST_MATH_CONSTEXPR T raise_underflow_error( +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_underflow_error( const char* , const char* , const ::boost::math::policies::underflow_error< ::boost::math::policies::ignore_error>&) BOOST_MATH_NOEXCEPT(T) @@ -408,7 +424,7 @@ inline T raise_denorm_error( } template -inline BOOST_MATH_CONSTEXPR T raise_denorm_error( +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_denorm_error( const char* , const char* , const T& val, @@ -455,7 +471,7 @@ inline T raise_evaluation_error( } template -inline BOOST_MATH_CONSTEXPR T raise_evaluation_error( +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_evaluation_error( const char* , const char* , const T& val, @@ -503,7 +519,7 @@ inline TargetType raise_rounding_error( } template -inline BOOST_MATH_CONSTEXPR TargetType raise_rounding_error( +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR TargetType raise_rounding_error( const char* , const char* , const T& val, @@ -516,6 +532,59 @@ inline BOOST_MATH_CONSTEXPR TargetType raise_rounding_error( return val > 0 ? (std::numeric_limits::max)() : (std::numeric_limits::is_integer ? 
(std::numeric_limits::min)() : -(std::numeric_limits::max)()); } +#ifdef __CUDA_ARCH__ +template +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR float raise_rounding_error( + const char*, + const char*, + const T& val, + const float&, + const ::boost::math::policies::rounding_error< ::boost::math::policies::ignore_error>&) BOOST_MATH_NOEXCEPT(T) +{ + return val > 0 ? FLT_MAX : -FLT_MAX; +} +template +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR double raise_rounding_error( + const char*, + const char*, + const T& val, + const double&, + const ::boost::math::policies::rounding_error< ::boost::math::policies::ignore_error>&) BOOST_MATH_NOEXCEPT(T) +{ + return val > 0 ? DBL_MAX : -DBL_MAX; +} +template +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR int raise_rounding_error( + const char*, + const char*, + const T& val, + const int&, + const ::boost::math::policies::rounding_error< ::boost::math::policies::ignore_error>&) BOOST_MATH_NOEXCEPT(T) +{ + return val > 0 ? INT_MAX : INT_MIN; +} +template +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR long raise_rounding_error( + const char*, + const char*, + const T& val, + const long&, + const ::boost::math::policies::rounding_error< ::boost::math::policies::ignore_error>&) BOOST_MATH_NOEXCEPT(T) +{ + return val > 0 ? LONG_MAX : LONG_MIN; +} +template +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR long long raise_rounding_error( + const char*, + const char*, + const T& val, + const long long&, + const ::boost::math::policies::rounding_error< ::boost::math::policies::ignore_error>&) BOOST_MATH_NOEXCEPT(T) +{ + return val > 0 ? 
LLONG_MAX : LLONG_MIN; +} +#endif + template inline TargetType raise_rounding_error( const char* , @@ -570,7 +639,7 @@ inline T raise_indeterminate_result_error( } template -inline BOOST_MATH_CONSTEXPR T raise_indeterminate_result_error( +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_indeterminate_result_error( const char* , const char* , const T& , @@ -610,7 +679,7 @@ inline T raise_indeterminate_result_error( } // namespace detail template -inline BOOST_MATH_CONSTEXPR T raise_domain_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_domain_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) { typedef typename Policy::domain_error_type policy_type; return detail::raise_domain_error( @@ -619,7 +688,7 @@ inline BOOST_MATH_CONSTEXPR T raise_domain_error(const char* function, const cha } template -inline BOOST_MATH_CONSTEXPR T raise_pole_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_pole_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) { typedef typename Policy::pole_error_type policy_type; return detail::raise_pole_error( @@ -628,7 +697,7 @@ inline BOOST_MATH_CONSTEXPR T raise_pole_error(const char* function, const char* } template -inline BOOST_MATH_CONSTEXPR T raise_overflow_error(const char* function, const char* message, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_overflow_error(const char* function, const char* message, const 
Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) { typedef typename Policy::overflow_error_type policy_type; return detail::raise_overflow_error( @@ -637,7 +706,7 @@ inline BOOST_MATH_CONSTEXPR T raise_overflow_error(const char* function, const c } template -inline BOOST_MATH_CONSTEXPR T raise_overflow_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_overflow_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) { typedef typename Policy::overflow_error_type policy_type; return detail::raise_overflow_error( @@ -646,7 +715,7 @@ inline BOOST_MATH_CONSTEXPR T raise_overflow_error(const char* function, const c } template -inline BOOST_MATH_CONSTEXPR T raise_underflow_error(const char* function, const char* message, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_underflow_error(const char* function, const char* message, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) { typedef typename Policy::underflow_error_type policy_type; return detail::raise_underflow_error( @@ -655,7 +724,7 @@ inline BOOST_MATH_CONSTEXPR T raise_underflow_error(const char* function, const } template -inline BOOST_MATH_CONSTEXPR T raise_denorm_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_denorm_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) { typedef typename Policy::denorm_error_type 
policy_type; return detail::raise_denorm_error( @@ -665,7 +734,7 @@ inline BOOST_MATH_CONSTEXPR T raise_denorm_error(const char* function, const cha } template -inline BOOST_MATH_CONSTEXPR T raise_evaluation_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_evaluation_error(const char* function, const char* message, const T& val, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) { typedef typename Policy::evaluation_error_type policy_type; return detail::raise_evaluation_error( @@ -674,7 +743,7 @@ inline BOOST_MATH_CONSTEXPR T raise_evaluation_error(const char* function, const } template -inline BOOST_MATH_CONSTEXPR TargetType raise_rounding_error(const char* function, const char* message, const T& val, const TargetType& t, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR TargetType raise_rounding_error(const char* function, const char* message, const T& val, const TargetType& t, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) { typedef typename Policy::rounding_error_type policy_type; return detail::raise_rounding_error( @@ -683,7 +752,7 @@ inline BOOST_MATH_CONSTEXPR TargetType raise_rounding_error(const char* function } template -inline BOOST_MATH_CONSTEXPR T raise_indeterminate_result_error(const char* function, const char* message, const T& val, const R& result, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR T raise_indeterminate_result_error(const char* function, const char* message, const T& val, const R& result, const Policy&) BOOST_NOEXCEPT_IF(is_noexcept_error_policy::value && BOOST_MATH_IS_FLOAT(T)) { typedef typename 
Policy::indeterminate_result_error_type policy_type; return detail::raise_indeterminate_result_error( @@ -763,28 +832,28 @@ inline bool check_denorm(std::complex val, R* result, const char* function, c // Default instantiations with ignore_error policy. template -inline BOOST_MATH_CONSTEXPR bool check_overflow(T /* val */, R* /* result */, const char* /* function */, const overflow_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR bool check_overflow(T /* val */, R* /* result */, const char* /* function */, const overflow_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) { return false; } template -inline BOOST_MATH_CONSTEXPR bool check_overflow(std::complex /* val */, R* /* result */, const char* /* function */, const overflow_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR bool check_overflow(std::complex /* val */, R* /* result */, const char* /* function */, const overflow_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) { return false; } template -inline BOOST_MATH_CONSTEXPR bool check_underflow(T /* val */, R* /* result */, const char* /* function */, const underflow_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR bool check_underflow(T /* val */, R* /* result */, const char* /* function */, const underflow_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) { return false; } template -inline BOOST_MATH_CONSTEXPR bool check_underflow(std::complex /* val */, R* /* result */, const char* /* function */, const underflow_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR bool check_underflow(std::complex /* val */, R* /* result */, const char* /* function */, const underflow_error&) 
BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) { return false; } template -inline BOOST_MATH_CONSTEXPR bool check_denorm(T /* val */, R* /* result*/, const char* /* function */, const denorm_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR bool check_denorm(T /* val */, R* /* result*/, const char* /* function */, const denorm_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) { return false; } template -inline BOOST_MATH_CONSTEXPR bool check_denorm(std::complex /* val */, R* /* result*/, const char* /* function */, const denorm_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR bool check_denorm(std::complex /* val */, R* /* result*/, const char* /* function */, const denorm_error&) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T)) { return false; } } // namespace detail template -inline R checked_narrowing_cast(T val, const char* function) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T) && is_noexcept_error_policy::value) +inline BOOST_GPU_ENABLED R checked_narrowing_cast(T val, const char* function) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(R) && BOOST_MATH_IS_FLOAT(T) && is_noexcept_error_policy::value) { typedef typename Policy::overflow_error_type overflow_type; typedef typename Policy::underflow_error_type underflow_type; @@ -804,7 +873,7 @@ inline R checked_narrowing_cast(T val, const char* function) BOOST_NOEXCEPT_IF(B } template -inline void check_series_iterations(const char* function, boost::uintmax_t max_iter, const Policy& pol) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && is_noexcept_error_policy::value) +inline BOOST_GPU_ENABLED void check_series_iterations(const char* function, boost::uintmax_t max_iter, const Policy& pol) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && is_noexcept_error_policy::value) { if(max_iter >= 
policies::get_max_series_iterations()) raise_evaluation_error( @@ -813,7 +882,7 @@ inline void check_series_iterations(const char* function, boost::uintmax_t max_i } template -inline void check_root_iterations(const char* function, boost::uintmax_t max_iter, const Policy& pol) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && is_noexcept_error_policy::value) +inline BOOST_GPU_ENABLED void check_root_iterations(const char* function, boost::uintmax_t max_iter, const Policy& pol) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && is_noexcept_error_policy::value) { if(max_iter >= policies::get_max_root_iterations()) raise_evaluation_error( diff --git a/include/boost/math/policies/policy.hpp b/include/boost/math/policies/policy.hpp index c1e1a7be4a..a4b7149f04 100644 --- a/include/boost/math/policies/policy.hpp +++ b/include/boost/math/policies/policy.hpp @@ -33,9 +33,9 @@ namespace boost{ namespace math{ namespace tools{ template -BOOST_MATH_CONSTEXPR int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_NOEXCEPT; +BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_NOEXCEPT; template -BOOST_MATH_CONSTEXPR T epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T); +BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T); } @@ -46,7 +46,7 @@ namespace policies{ // // Special cases for exceptions disabled first: // -#ifdef BOOST_NO_EXCEPTIONS +#if defined(BOOST_NO_EXCEPTIONS) # ifndef BOOST_MATH_DOMAIN_ERROR_POLICY # define BOOST_MATH_DOMAIN_ERROR_POLICY errno_on_error # endif @@ -63,6 +63,35 @@ namespace policies{ # define BOOST_MATH_ROUNDING_ERROR_POLICY errno_on_error # endif #endif + + // + // Special cases for CUDA devices: + // +#ifdef __CUDA_ARCH__ +# ifndef BOOST_MATH_DOMAIN_ERROR_POLICY +# define BOOST_MATH_DOMAIN_ERROR_POLICY ignore_error +# endif +# ifndef BOOST_MATH_POLE_ERROR_POLICY +# define BOOST_MATH_POLE_ERROR_POLICY ignore_error +# endif +# ifndef 
BOOST_MATH_OVERFLOW_ERROR_POLICY +# define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +# endif +# ifndef BOOST_MATH_EVALUATION_ERROR_POLICY +# define BOOST_MATH_EVALUATION_ERROR_POLICY ignore_error +# endif +# ifndef BOOST_MATH_ROUNDING_ERROR_POLICY +# define BOOST_MATH_ROUNDING_ERROR_POLICY ignore_error +# endif +# ifndef BOOST_MATH_PROMOTE_FLOAT_POLICY +# define BOOST_MATH_PROMOTE_FLOAT_POLICY false +# endif +# ifndef BOOST_MATH_PROMOTE_DOUBLE_POLICY +# define BOOST_MATH_PROMOTE_DOUBLE_POLICY false +# endif +#endif + + // // Then the regular cases: // @@ -378,8 +407,12 @@ struct default_args typedef default_policy arg2; }; +template struct fake_default_arg { typedef default_policy type; }; +template <> struct fake_default_arg { typedef digits10<0> type; }; + typedef default_args::arg1 forwarding_arg1; typedef default_args::arg2 forwarding_arg2; +typedef fake_default_arg::type fake_default_arg_placeholder; } // detail // @@ -490,7 +523,7 @@ struct policy -struct policy +struct policy { public: typedef domain_error<> domain_error_type; @@ -639,7 +672,7 @@ struct normalise, default_policy, default_policy, default_policy, - default_policy> + detail::fake_default_arg_placeholder> { typedef policy type; }; @@ -847,19 +880,19 @@ struct precision namespace detail{ template -inline BOOST_MATH_CONSTEXPR int digits_imp(mpl::true_ const&) BOOST_NOEXCEPT +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED int digits_imp(mpl::true_ const&) BOOST_NOEXCEPT { #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS BOOST_STATIC_ASSERT( ::std::numeric_limits::is_specialized); #else - BOOST_ASSERT(::std::numeric_limits::is_specialized); + BOOST_MATH_ASSERT(::std::numeric_limits::is_specialized); #endif typedef typename boost::math::policies::precision::type p_t; return p_t::value; } template -inline BOOST_MATH_CONSTEXPR int digits_imp(mpl::false_ const&) BOOST_NOEXCEPT +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED int digits_imp(mpl::false_ const&) BOOST_NOEXCEPT { return tools::digits(); } 
@@ -867,26 +900,26 @@ inline BOOST_MATH_CONSTEXPR int digits_imp(mpl::false_ const&) BOOST_NOEXCEPT } // namespace detail template -inline BOOST_MATH_CONSTEXPR int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_NOEXCEPT +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_NOEXCEPT { typedef mpl::bool_< std::numeric_limits::is_specialized > tag_type; return detail::digits_imp(tag_type()); } template -inline BOOST_MATH_CONSTEXPR int digits_base10(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_NOEXCEPT +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED int digits_base10(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_NOEXCEPT { return boost::math::policies::digits() * 301 / 1000L; } template -inline BOOST_MATH_CONSTEXPR unsigned long get_max_series_iterations() BOOST_NOEXCEPT +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED unsigned long get_max_series_iterations() BOOST_NOEXCEPT { typedef typename Policy::max_series_iterations_type iter_type; return iter_type::value; } template -inline BOOST_MATH_CONSTEXPR unsigned long get_max_root_iterations() BOOST_NOEXCEPT +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED unsigned long get_max_root_iterations() BOOST_NOEXCEPT { typedef typename Policy::max_root_iterations_type iter_type; return iter_type::value; @@ -897,7 +930,7 @@ namespace detail{ template struct series_factor_calc { - static T get() BOOST_MATH_NOEXCEPT(T) + static BOOST_GPU_ENABLED T get() BOOST_MATH_NOEXCEPT(T) { return ldexp(T(1.0), 1 - Digits::value); } @@ -906,7 +939,7 @@ struct series_factor_calc template struct series_factor_calc { - static BOOST_MATH_CONSTEXPR T get() BOOST_MATH_NOEXCEPT(T) + static BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T get() BOOST_MATH_NOEXCEPT(T) { return boost::math::tools::epsilon(); } @@ -914,7 +947,7 @@ struct series_factor_calc template struct series_factor_calc { - static BOOST_MATH_CONSTEXPR T get() BOOST_MATH_NOEXCEPT(T) + static BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T get() 
BOOST_MATH_NOEXCEPT(T) { return 1 / static_cast(static_cast(1u) << (Digits::value - 1)); } @@ -922,21 +955,21 @@ struct series_factor_calc template struct series_factor_calc { - static BOOST_MATH_CONSTEXPR T get() BOOST_MATH_NOEXCEPT(T) + static BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T get() BOOST_MATH_NOEXCEPT(T) { return boost::math::tools::epsilon(); } }; template -inline BOOST_MATH_CONSTEXPR T get_epsilon_imp(mpl::true_ const&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T get_epsilon_imp(mpl::true_ const&) BOOST_MATH_NOEXCEPT(T) { #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS BOOST_STATIC_ASSERT( ::std::numeric_limits::is_specialized); BOOST_STATIC_ASSERT( ::std::numeric_limits::radix == 2); #else - BOOST_ASSERT(::std::numeric_limits::is_specialized); - BOOST_ASSERT(::std::numeric_limits::radix == 2); + BOOST_MATH_ASSERT(::std::numeric_limits::is_specialized); + BOOST_MATH_ASSERT(::std::numeric_limits::radix == 2); #endif typedef typename boost::math::policies::precision::type p_t; typedef mpl::bool_::digits> is_small_int; @@ -945,7 +978,7 @@ inline BOOST_MATH_CONSTEXPR T get_epsilon_imp(mpl::true_ const&) BOOST_MATH_NOEX } template -inline BOOST_MATH_CONSTEXPR T get_epsilon_imp(mpl::false_ const&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T get_epsilon_imp(mpl::false_ const&) BOOST_MATH_NOEXCEPT(T) { return tools::epsilon(); } @@ -953,7 +986,7 @@ inline BOOST_MATH_CONSTEXPR T get_epsilon_imp(mpl::false_ const&) BOOST_MATH_NOE } // namespace detail template -inline BOOST_MATH_CONSTEXPR T get_epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T get_epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) { typedef mpl::bool_< (std::numeric_limits::is_specialized && (std::numeric_limits::radix == 2)) > tag_type; return detail::get_epsilon_imp(tag_type()); diff --git a/include/boost/math/quadrature/cuda_naive_monte_carlo.hpp 
b/include/boost/math/quadrature/cuda_naive_monte_carlo.hpp new file mode 100644 index 0000000000..37258ea9de --- /dev/null +++ b/include/boost/math/quadrature/cuda_naive_monte_carlo.hpp @@ -0,0 +1,239 @@ +/* + * Copyright Nick Thompson, 2018 + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + */ +#ifndef BOOST_MATH_QUADRATURE_NAIVE_MONTE_CARLO_HPP +#define BOOST_MATH_QUADRATURE_NAIVE_MONTE_CARLO_HPP +#include +#include +#include +#include +#include +#include + // For the CUDA runtime routines (prefixed with "cuda_") +#include +#include +#include +#include + +namespace boost { namespace math { namespace quadrature { namespace detail{ + + template + struct pair_add + { + __host__ __device__ thrust::pair operator()(const thrust::pair& a, const thrust::pair& b)const + { + return thrust::pair(a.first + b.first, a.second + b.second); + } + }; + + template + void __global__ cuda_naive_monte_carlo_device_proc(F f, Gen* seeds, thrust::pair* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first) + { + int id = blockIdx.x * blockDim.x + threadIdx.x; + + Gen gen(seeds[id]); + Real* storage_base = p_storage + id * n_dimentions; + thrust::uniform_real_distribution dist(0, 1); + Real sum(0); + Real sigma_squared(0); + Real c(0); + for (unsigned i = 0; i < n_calls; ++i) + { + for (unsigned j = 0; j < n_dimentions; ++j) + storage_base[j] = p_start_locations[j] + p_scales[j] * dist(gen); + Real fv = (f)(storage_base); + Real y = fv - c; + Real t = sum + y; + c = (t - sum) - y; + sum = t; + sigma_squared += fv * fv; + } + seeds[id] = gen; + if (is_first) + sums[id] = thrust::pair(sum, sigma_squared); + else + { + sums[id].first += sum; + sums[id].second += sigma_squared; + } + } + + template + void __global__ cuda_naive_monte_carlo_fast_device_proc(F f, Gen* seeds, 
thrust::pair* sums, const Real* p_start_locations, const Real* p_scales, unsigned n_calls, Real* p_storage, unsigned n_dimentions, bool is_first) + { + int id = blockIdx.x * blockDim.x + threadIdx.x; + + Gen gen(seeds[id]); + Real* storage_base = p_storage + id * n_dimentions; + thrust::uniform_real_distribution dist(0, 1); + Real sum(0); + Real sigma_squared(0); + for (unsigned i = 0; i < n_calls; ++i) + { + for (unsigned j = 0; j < n_dimentions; ++j) + storage_base[j] = p_start_locations[j] + p_scales[j] * dist(gen); + Real fv = (f)(storage_base); + sum += fv; + sigma_squared += fv * fv; + } + seeds[id] = gen; + if (is_first) + sums[id] = thrust::pair(sum, sigma_squared); + else + { + sums[id].first += sum; + sums[id].second += sigma_squared; + } + } + +} // namespace detail + + template + struct cuda_naive_monte_carlo + { + cuda_naive_monte_carlo(const F& integrand, + std::vector> const & bounds, + const MasterGen& seed) : m_f(integrand), m_gen(seed), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0) + { + auto it = bounds.begin(); + while (it != bounds.end()) + { + m_start_locations.push_back(it->first); + m_scale_factors.push_back(it->second - it->first); + m_volume *= (it->second - it->first); + ++it; + } + } + cuda_naive_monte_carlo(const F& integrand, + std::vector> const & bounds) + : m_f(integrand), m_volume(1), m_sigma(0), m_sigma_squares(0), m_calls(0) + { + auto it = bounds.begin(); + while (it != bounds.end()) + { + m_start_locations.push_back(it->first); + m_scale_factors.push_back(it->second - it->first); + m_volume *= (it->second - it->first); + ++it; + } + } + + template + static T* to_pointer(T* p) { return p; } + template + static T* to_pointer(thrust::device_ptr p) { return p.get(); } + + Real integrate(Real error_request, boost::uintmax_t calls_per_thread = 1024, boost::uintmax_t max_calls_per_thread = 250000, bool is_compensated = true) + { + boost::uintmax_t threads; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 
0); + + threads = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; + + thrust::device_vector starts = m_start_locations; + thrust::device_vector scales = m_scale_factors; + thrust::device_vector storage(threads * m_start_locations.size()); + thrust::device_vector > sums(threads); + thrust::device_vector seeds(threads); + thrust::host_vector host_seeds(threads); + typedef typename ThreadGen::result_type seed_type; + std::uniform_int_distribution ui_dist((std::numeric_limits::min)(), (std::numeric_limits::max)()); + for (unsigned i = 0; i < host_seeds.size(); ++i) + host_seeds[i].seed(ui_dist(m_gen)); + seeds = host_seeds; + bool first_call = true; + bool have_variance = false; + Real sample_variance = 0; + do { + if (m_calls) + { + // If we've been called before, adjust calls_per_thread according + // to the sample variance: + Real err = sqrt(variance() / m_calls); + boost::uintmax_t target_total = variance() > 0 ? (boost::uintmax_t)std::floor(1.05 * m_calls * err * err / (error_request * error_request)) : 2 * m_calls; + calls_per_thread = 1 + target_total / threads; + have_variance = true; + sample_variance = variance(); + } + do + { + if (!first_call) + { + // Update how many calls per thread to try based on our variance estimate: + boost::uintmax_t target_total; + if (sample_variance > 0) + target_total = static_cast(1.05 * m_calls * boost::math::pow<2>(sqrt(sample_variance / m_calls)) / (error_request * error_request) - m_calls); + else + target_total = 2 * m_calls; + calls_per_thread = 1 + target_total / threads; + } + if (calls_per_thread > max_calls_per_thread) + calls_per_thread = max_calls_per_thread; + // std::cout << "Executing with calls_per_thread = " << calls_per_thread << std::endl; + if(is_compensated) + detail::cuda_naive_monte_carlo_device_proc << > > (m_f, to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), 
first_call); + else + detail::cuda_naive_monte_carlo_fast_device_proc << > > (m_f, to_pointer(seeds.data()), to_pointer(sums.data()), to_pointer(starts.data()), to_pointer(scales.data()), calls_per_thread, to_pointer(storage.data()), scales.size(), first_call); + first_call = false; + m_calls += threads * calls_per_thread; + // If we haven't been called before then get an estimate of the sample + // variance based on the first sum, this removes the need to reduce + // the whole thing which is relatively expensive: + if (!have_variance) + { + std::pair first_result; + cudaMemcpy(&first_result, to_pointer(sums.data()), sizeof(first_result), cudaMemcpyDeviceToHost); + // It's tempting to use calls_per_thread as the sample size for the sums: but we may + // have been through this loop more than once and calls_per_thread is just the "extra" + // calls from the last run through. So divide the total number of calls by the number + // of threads to get the actual sample size for each sub-sum: + sample_variance = m_volume * m_volume * (first_result.second - first_result.first * first_result.first / (m_calls / threads)) / ((m_calls / threads) - 1); + } + // std::cout << "Estimated error = " << sqrt(sample_variance / m_calls) << std::endl; + } while ((sample_variance < 0) || (sqrt(sample_variance / m_calls) > error_request)); + + // Reduce the results: + thrust::pair sum = thrust::reduce(sums.begin(), sums.end(), thrust::pair(0, 0), detail::pair_add()); + m_sigma += sum.first; + m_sigma_squares += sum.second; + first_call = true; + // + // Since we used an estimate for the variance in the code above, now that + // we have the true variance after a full reduction, we had better double check + // the termination condition: + // + // std::cout << "Estimated error after reduce = " << sqrt(variance() / m_calls) << std::endl; + } while ((variance() < 0) || (sqrt(variance() / m_calls) > error_request)); + + return m_volume * m_sigma / m_calls; + } + + Real variance()const + { + 
return m_volume * m_volume * (m_sigma_squares - m_sigma * m_sigma / m_calls) / (m_calls - 1); + } + Real current_error_estimate() const + { + using std::sqrt; + return sqrt(variance() / m_calls); + } + uint64_t calls() const + { + return m_calls; + } + + private: + MasterGen m_gen; + Real m_volume, m_sigma, m_sigma_squares; + boost::uintmax_t m_calls; + F m_f; + thrust::host_vector m_start_locations; + thrust::host_vector m_scale_factors; + }; + +}}} +#endif diff --git a/include/boost/math/special_functions/acosh.hpp b/include/boost/math/special_functions/acosh.hpp index 3b7edbad3f..34b514cef7 100644 --- a/include/boost/math/special_functions/acosh.hpp +++ b/include/boost/math/special_functions/acosh.hpp @@ -32,7 +32,7 @@ namespace boost namespace detail { template - inline T acosh_imp(const T x, const Policy& pol) + inline BOOST_GPU_ENABLED T acosh_imp(const T x, const Policy& pol) { BOOST_MATH_STD_USING @@ -76,7 +76,7 @@ namespace boost } template - inline typename tools::promote_args::type acosh(T x, const Policy&) + inline BOOST_GPU_ENABLED typename tools::promote_args::type acosh(T x, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -91,7 +91,7 @@ namespace boost "boost::math::acosh<%1%>(%1%)"); } template - inline typename tools::promote_args::type acosh(T x) + inline BOOST_GPU_ENABLED typename tools::promote_args::type acosh(T x) { return boost::math::acosh(x, policies::policy<>()); } diff --git a/include/boost/math/special_functions/asinh.hpp b/include/boost/math/special_functions/asinh.hpp index e55a356284..abb74f9d44 100644 --- a/include/boost/math/special_functions/asinh.hpp +++ b/include/boost/math/special_functions/asinh.hpp @@ -32,7 +32,7 @@ namespace boost { namespace detail{ template - inline T asinh_imp(const T x, const Policy& pol) + inline BOOST_GPU_ENABLED T asinh_imp(const T x, const Policy& pol) { BOOST_MATH_STD_USING @@ -85,12 +85,12 @@ namespace boost } 
template - inline typename tools::promote_args::type asinh(T x) + inline BOOST_GPU_ENABLED typename tools::promote_args::type asinh(T x) { return boost::math::asinh(x, policies::policy<>()); } template - inline typename tools::promote_args::type asinh(T x, const Policy&) + inline BOOST_GPU_ENABLED typename tools::promote_args::type asinh(T x, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; diff --git a/include/boost/math/special_functions/atanh.hpp b/include/boost/math/special_functions/atanh.hpp index 871a465a8c..977e07ae48 100644 --- a/include/boost/math/special_functions/atanh.hpp +++ b/include/boost/math/special_functions/atanh.hpp @@ -34,7 +34,7 @@ namespace boost // This is the main fare template - inline T atanh_imp(const T x, const Policy& pol) + inline BOOST_GPU_ENABLED T atanh_imp(const T x, const Policy& pol) { BOOST_MATH_STD_USING static const char* function = "boost::math::atanh<%1%>(%1%)"; @@ -94,7 +94,7 @@ namespace boost } template - inline typename tools::promote_args::type atanh(T x, const Policy&) + inline BOOST_GPU_ENABLED typename tools::promote_args::type atanh(T x, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -109,7 +109,7 @@ namespace boost "boost::math::atanh<%1%>(%1%)"); } template - inline typename tools::promote_args::type atanh(T x) + inline BOOST_GPU_ENABLED typename tools::promote_args::type atanh(T x) { return boost::math::atanh(x, policies::policy<>()); } diff --git a/include/boost/math/special_functions/beta.hpp b/include/boost/math/special_functions/beta.hpp index 35b114ef15..5902db2c76 100644 --- a/include/boost/math/special_functions/beta.hpp +++ b/include/boost/math/special_functions/beta.hpp @@ -31,7 +31,7 @@ namespace detail{ // Implementation of Beta(a,b) using the Lanczos approximation: // template -T beta_imp(T a, T b, const Lanczos&, const Policy& pol) 
+BOOST_GPU_ENABLED T beta_imp(T a, T b, const Lanczos&, const Policy& pol) { BOOST_MATH_STD_USING // for ADL of std names @@ -83,9 +83,10 @@ T beta_imp(T a, T b, const Lanczos&, const Policy& pol) b += 1; } */ + using std::swap; if(a < b) - std::swap(a, b); + BOOST_MATH_CUDA_SAFE_SWAP(a, b); // Lanczos calculation: T agh = static_cast(a + Lanczos::g() - 0.5f); @@ -199,7 +200,7 @@ T beta_imp(T a, T b, const lanczos::undefined_lanczos& /* l */, const Policy& po // horrendous cancellation errors. // template -T ibeta_power_terms(T a, +BOOST_GPU_ENABLED T ibeta_power_terms(T a, T b, T x, T y, @@ -234,11 +235,11 @@ T ibeta_power_terms(T a, // l1 and l2 are the base of the exponents minus one: T l1 = (x * b - y * agh) / agh; T l2 = (y * a - x * bgh) / bgh; - if(((std::min)(fabs(l1), fabs(l2)) < 0.2)) + if((BOOST_MATH_CUDA_SAFE_MIN(fabs(l1), fabs(l2)) < 0.2)) { // when the base of the exponent is very near 1 we get really // gross errors unless extra care is taken: - if((l1 * l2 > 0) || ((std::min)(a, b) < 1)) + if((l1 * l2 > 0) || (BOOST_MATH_CUDA_SAFE_MIN(a, b) < 1)) { // // This first branch handles the simple cases where either: @@ -274,7 +275,7 @@ T ibeta_power_terms(T a, BOOST_MATH_INSTRUMENT_VARIABLE(result); } } - else if((std::max)(fabs(l1), fabs(l2)) < 0.5) + else if(BOOST_MATH_CUDA_SAFE_MAX(fabs(l1), fabs(l2)) < 0.5) { // // Both exponents are near one and both the exponents are @@ -508,8 +509,8 @@ template struct ibeta_series_t { typedef T result_type; - ibeta_series_t(T a_, T b_, T x_, T mult) : result(mult), x(x_), apn(a_), poch(1-b_), n(1) {} - T operator()() + BOOST_GPU_ENABLED ibeta_series_t(T a_, T b_, T x_, T mult) : result(mult), x(x_), apn(a_), poch(1-b_), n(1) {} + BOOST_GPU_ENABLED T operator()() { T r = result / apn; apn += 1; @@ -524,13 +525,13 @@ struct ibeta_series_t }; template -T ibeta_series(T a, T b, T x, T s0, const Lanczos&, bool normalised, T* p_derivative, T y, const Policy& pol) +BOOST_GPU_ENABLED T ibeta_series(T a, T b, T x, T s0, 
const Lanczos&, bool normalised, T* p_derivative, T y, const Policy& pol) { BOOST_MATH_STD_USING T result; - BOOST_ASSERT((p_derivative == 0) || normalised); + BOOST_MATH_ASSERT((p_derivative == 0) || normalised); if(normalised) { @@ -562,7 +563,7 @@ T ibeta_series(T a, T b, T x, T s0, const Lanczos&, bool normalised, T* p_deriva if(p_derivative) { *p_derivative = result * pow(y, b); - BOOST_ASSERT(*p_derivative >= 0); + BOOST_MATH_ASSERT(*p_derivative >= 0); } } else @@ -598,7 +599,7 @@ T ibeta_series(T a, T b, T x, T s0, const boost::math::lanczos::undefined_lanczo BOOST_MATH_STD_USING T result; - BOOST_ASSERT((p_derivative == 0) || normalised); + BOOST_MATH_ASSERT((p_derivative == 0) || normalised); if(normalised) { @@ -652,7 +653,7 @@ T ibeta_series(T a, T b, T x, T s0, const boost::math::lanczos::undefined_lanczo if(p_derivative) { *p_derivative = result * pow(y, b); - BOOST_ASSERT(*p_derivative >= 0); + BOOST_MATH_ASSERT(*p_derivative >= 0); } } else @@ -677,9 +678,9 @@ struct ibeta_fraction2_t { typedef std::pair result_type; - ibeta_fraction2_t(T a_, T b_, T x_, T y_) : a(a_), b(b_), x(x_), y(y_), m(0) {} + BOOST_GPU_ENABLED ibeta_fraction2_t(T a_, T b_, T x_, T y_) : a(a_), b(b_), x(x_), y(y_), m(0) {} - result_type operator()() + BOOST_GPU_ENABLED result_type operator()() { T aN = (a + m - 1) * (a + b + m - 1) * m * (b - m) * x * x; T denom = (a + 2 * m - 1); @@ -702,7 +703,7 @@ struct ibeta_fraction2_t // Evaluate the incomplete beta via the continued fraction representation: // template -inline T ibeta_fraction2(T a, T b, T x, T y, const Policy& pol, bool normalised, T* p_derivative) +inline BOOST_GPU_ENABLED T ibeta_fraction2(T a, T b, T x, T y, const Policy& pol, bool normalised, T* p_derivative) { typedef typename lanczos::lanczos::type lanczos_type; BOOST_MATH_STD_USING @@ -710,7 +711,7 @@ inline T ibeta_fraction2(T a, T b, T x, T y, const Policy& pol, bool normalised, if(p_derivative) { *p_derivative = result; - BOOST_ASSERT(*p_derivative >= 0); + 
BOOST_MATH_ASSERT(*p_derivative >= 0); } if(result == 0) return result; @@ -725,7 +726,7 @@ inline T ibeta_fraction2(T a, T b, T x, T y, const Policy& pol, bool normalised, // Computes the difference between ibeta(a,b,x) and ibeta(a+k,b,x): // template -T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& pol, bool normalised, T* p_derivative) +BOOST_GPU_ENABLED T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& pol, bool normalised, T* p_derivative) { typedef typename lanczos::lanczos::type lanczos_type; @@ -735,7 +736,7 @@ T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& pol, bool normalised, T* if(p_derivative) { *p_derivative = prefix; - BOOST_ASSERT(*p_derivative >= 0); + BOOST_MATH_ASSERT(*p_derivative >= 0); } prefix /= a; if(prefix == 0) @@ -759,7 +760,7 @@ T ibeta_a_step(T a, T b, T x, T y, int k, const Policy& pol, bool normalised, T* // it is currently only called for small k. // template -inline T rising_factorial_ratio(T a, T b, int k) +inline BOOST_GPU_ENABLED T rising_factorial_ratio(T a, T b, int k) { // calculate: // (a)(a+1)(a+2)...(a+k-1) @@ -803,17 +804,21 @@ template <> struct Pn_size { BOOST_STATIC_CONSTANT(unsigned, value = 30); // 16-20 digit accuracy +#ifndef __CUDA_ARCH__ BOOST_STATIC_ASSERT(::boost::math::max_factorial::value >= 60); +#endif }; template <> struct Pn_size { BOOST_STATIC_CONSTANT(unsigned, value = 50); // ~35-50 digit accuracy +#ifndef __CUDA_ARCH__ BOOST_STATIC_ASSERT(::boost::math::max_factorial::value >= 100); +#endif }; template -T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& pol, bool normalised) +BOOST_GPU_ENABLED T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& pol, bool normalised) { typedef typename lanczos::lanczos::type lanczos_type; BOOST_MATH_STD_USING @@ -925,7 +930,7 @@ T beta_small_b_large_a_series(T a, T b, T x, T y, T s0, T mult, const Policy& po // complement of the binomial distribution cdf and use this finite sum. 
// template -T binomial_ccdf(T n, T k, T x, T y) +BOOST_GPU_ENABLED T binomial_ccdf(T n, T k, T x, T y) { BOOST_MATH_STD_USING // ADL of std names @@ -986,7 +991,7 @@ T binomial_ccdf(T n, T k, T x, T y) // each domain: // template -T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_derivative) +BOOST_GPU_ENABLED T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_derivative) { static const char* function = "boost::math::ibeta<%1%>(%1%, %1%, %1%)"; typedef typename lanczos::lanczos::type lanczos_type; @@ -1002,7 +1007,7 @@ T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_de T fract; T y = 1 - x; - BOOST_ASSERT((p_derivative == 0) || normalised); + BOOST_MATH_ASSERT((p_derivative == 0) || normalised); if(p_derivative) *p_derivative = -1; // value not set. @@ -1098,19 +1103,19 @@ T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_de return p; } - if((std::min)(a, b) <= 1) + if(BOOST_MATH_CUDA_SAFE_MIN(a, b) <= 1) { if(x > 0.5) { - std::swap(a, b); - std::swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(a, b); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); invert = !invert; BOOST_MATH_INSTRUMENT_VARIABLE(invert); } - if((std::max)(a, b) <= 1) + if(BOOST_MATH_CUDA_SAFE_MAX(a, b) <= 1) { // Both a,b < 1: - if((a >= (std::min)(T(0.2), b)) || (pow(x, a) <= 0.9)) + if((a >= BOOST_MATH_CUDA_SAFE_MIN(T(0.2), b)) || (pow(x, a) <= 0.9)) { if(!invert) { @@ -1127,8 +1132,8 @@ T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_de } else { - std::swap(a, b); - std::swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(a, b); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); invert = !invert; if(y >= 0.3) { @@ -1193,8 +1198,8 @@ T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_de } else { - std::swap(a, b); - std::swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(a, b); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); invert = !invert; if(y >= 0.3) @@ -1271,8 +1276,8 @@ T ibeta_imp(T a, T b, T 
x, const Policy& pol, bool inv, bool normalised, T* p_de } if(lambda < 0) { - std::swap(a, b); - std::swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(a, b); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); invert = !invert; BOOST_MATH_INSTRUMENT_VARIABLE(invert); } @@ -1386,15 +1391,15 @@ T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised, T* p_de } // template T ibeta_imp(T a, T b, T x, const Lanczos& l, bool inv, bool normalised) template -inline T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised) +BOOST_GPU_ENABLED inline T ibeta_imp(T a, T b, T x, const Policy& pol, bool inv, bool normalised) { return ibeta_imp(a, b, x, pol, inv, normalised, static_cast(0)); } template -T ibeta_derivative_imp(T a, T b, T x, const Policy& pol) +BOOST_GPU_ENABLED T ibeta_derivative_imp(T a, T b, T x, const Policy& pol) { - static const char* function = "ibeta_derivative<%1%>(%1%,%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "ibeta_derivative<%1%>(%1%,%1%,%1%)"; // // start with the usual error checks: // @@ -1429,7 +1434,7 @@ T ibeta_derivative_imp(T a, T b, T x, const Policy& pol) // Some forwarding functions that dis-ambiguate the third argument type: // template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type beta(RT1 a, RT2 b, const Policy&, const mpl::true_*) { BOOST_FPU_EXCEPTION_GUARD @@ -1446,7 +1451,7 @@ inline typename tools::promote_args::type return policies::checked_narrowing_cast(detail::beta_imp(static_cast(a), static_cast(b), evaluation_type(), forwarding_policy()), "boost::math::beta<%1%>(%1%,%1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type beta(RT1 a, RT2 b, RT3 x, const mpl::false_*) { return boost::math::beta(a, b, x, policies::policy<>()); @@ -1459,7 +1464,7 @@ inline typename tools::promote_args::type // and forward to the implementation functions: // template -inline typename 
tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type beta(RT1 a, RT2 b, A arg) { typedef typename policies::is_policy::type tag; @@ -1467,14 +1472,14 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type beta(RT1 a, RT2 b) { return boost::math::beta(a, b, policies::policy<>()); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type beta(RT1 a, RT2 b, RT3 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -1491,7 +1496,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type betac(RT1 a, RT2 b, RT3 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -1507,14 +1512,14 @@ inline typename tools::promote_args::type return policies::checked_narrowing_cast(detail::ibeta_imp(static_cast(a), static_cast(b), static_cast(x), forwarding_policy(), true, false), "boost::math::betac<%1%>(%1%,%1%,%1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type betac(RT1 a, RT2 b, RT3 x) { return boost::math::betac(a, b, x, policies::policy<>()); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ibeta(RT1 a, RT2 b, RT3 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -1530,14 +1535,14 @@ inline typename tools::promote_args::type return policies::checked_narrowing_cast(detail::ibeta_imp(static_cast(a), static_cast(b), static_cast(x), forwarding_policy(), false, true), "boost::math::ibeta<%1%>(%1%,%1%,%1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ibeta(RT1 a, RT2 b, RT3 x) { return boost::math::ibeta(a, b, x, policies::policy<>()); } template -inline typename tools::promote_args::type +inline 
BOOST_GPU_ENABLED typename tools::promote_args::type ibetac(RT1 a, RT2 b, RT3 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -1553,14 +1558,14 @@ inline typename tools::promote_args::type return policies::checked_narrowing_cast(detail::ibeta_imp(static_cast(a), static_cast(b), static_cast(x), forwarding_policy(), true, true), "boost::math::ibetac<%1%>(%1%,%1%,%1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ibetac(RT1 a, RT2 b, RT3 x) { return boost::math::ibetac(a, b, x, policies::policy<>()); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ibeta_derivative(RT1 a, RT2 b, RT3 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -1576,7 +1581,7 @@ inline typename tools::promote_args::type return policies::checked_narrowing_cast(detail::ibeta_derivative_imp(static_cast(a), static_cast(b), static_cast(x), forwarding_policy()), "boost::math::ibeta_derivative<%1%>(%1%,%1%,%1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ibeta_derivative(RT1 a, RT2 b, RT3 x) { return boost::math::ibeta_derivative(a, b, x, policies::policy<>()); diff --git a/include/boost/math/special_functions/binomial.hpp b/include/boost/math/special_functions/binomial.hpp index 9a24fc15bb..558c9fb894 100644 --- a/include/boost/math/special_functions/binomial.hpp +++ b/include/boost/math/special_functions/binomial.hpp @@ -18,11 +18,11 @@ namespace boost{ namespace math{ template -T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) +BOOST_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) { BOOST_STATIC_ASSERT(!boost::is_integral::value); BOOST_MATH_STD_USING - static const char* function = "boost::math::binomial_coefficient<%1%>(unsigned, unsigned)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::binomial_coefficient<%1%>(unsigned, 
unsigned)"; if(k > n) return policies::raise_domain_error( function, @@ -61,13 +61,13 @@ T binomial_coefficient(unsigned n, unsigned k, const Policy& pol) // we'll promote to double: // template <> -inline float binomial_coefficient >(unsigned n, unsigned k, const policies::policy<>& pol) +inline BOOST_GPU_ENABLED float binomial_coefficient >(unsigned n, unsigned k, const policies::policy<>& pol) { return policies::checked_narrowing_cast >(binomial_coefficient(n, k, pol), "boost::math::binomial_coefficient<%1%>(unsigned,unsigned)"); } template -inline T binomial_coefficient(unsigned n, unsigned k) +inline BOOST_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k) { return binomial_coefficient(n, k, policies::policy<>()); } diff --git a/include/boost/math/special_functions/cbrt.hpp b/include/boost/math/special_functions/cbrt.hpp index c34ad39949..9c417a9847 100644 --- a/include/boost/math/special_functions/cbrt.hpp +++ b/include/boost/math/special_functions/cbrt.hpp @@ -40,7 +40,7 @@ struct largest_cbrt_int_type }; template -T cbrt_imp(T z, const Policy& pol) +BOOST_GPU_ENABLED T cbrt_imp(T z, const Policy& pol) { BOOST_MATH_STD_USING // @@ -53,7 +53,7 @@ T cbrt_imp(T z, const Policy& pol) // Expected Error Term: -1.231e-006 // Maximum Relative Change in Control Points: 5.982e-004 // - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { static_cast(0.37568269008611818), static_cast(1.3304968705558024), static_cast(-1.4897101632445036), @@ -61,7 +61,7 @@ T cbrt_imp(T z, const Policy& pol) static_cast(-0.6398703759826468), static_cast(0.13584489959258635), }; - static const T correction[] = { + BOOST_MATH_GPU_STATIC const T correction[] = { static_cast(0.62996052494743658238360530363911), // 2^-2/3 static_cast(0.79370052598409973737585281963615), // 2^-1/3 static_cast(1), @@ -156,7 +156,7 @@ T cbrt_imp(T z, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type cbrt(T z, const Policy& pol) +inline 
BOOST_GPU_ENABLED typename tools::promote_args::type cbrt(T z, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -164,7 +164,7 @@ inline typename tools::promote_args::type cbrt(T z, const Policy& pol) } template -inline typename tools::promote_args::type cbrt(T z) +inline BOOST_GPU_ENABLED typename tools::promote_args::type cbrt(T z) { return cbrt(z, policies::policy<>()); } diff --git a/include/boost/math/special_functions/cos_pi.hpp b/include/boost/math/special_functions/cos_pi.hpp index 669a2c87ae..89681a9ce3 100644 --- a/include/boost/math/special_functions/cos_pi.hpp +++ b/include/boost/math/special_functions/cos_pi.hpp @@ -20,7 +20,7 @@ namespace boost{ namespace math{ namespace detail{ template -T cos_pi_imp(T x, const Policy& pol) +BOOST_GPU_ENABLED T cos_pi_imp(T x, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names // cos of pi*x: @@ -57,7 +57,7 @@ T cos_pi_imp(T x, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type cos_pi(T x, const Policy&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type cos_pi(T x, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -71,7 +71,7 @@ inline typename tools::promote_args::type cos_pi(T x, const Policy&) } template -inline typename tools::promote_args::type cos_pi(T x) +inline BOOST_GPU_ENABLED typename tools::promote_args::type cos_pi(T x) { return boost::math::cos_pi(x, policies::policy<>()); } diff --git a/include/boost/math/special_functions/detail/bernoulli_details.hpp b/include/boost/math/special_functions/detail/bernoulli_details.hpp index 75fadbf34a..bc5edf2051 100644 --- a/include/boost/math/special_functions/detail/bernoulli_details.hpp +++ b/include/boost/math/special_functions/detail/bernoulli_details.hpp @@ -205,8 +205,8 @@ struct fixed_vector : private std::allocator 
allocator_traits::deallocate(alloc, m_data, m_capacity); #endif } - T& operator[](unsigned n) { BOOST_ASSERT(n < m_used); return m_data[n]; } - const T& operator[](unsigned n)const { BOOST_ASSERT(n < m_used); return m_data[n]; } + T& operator[](unsigned n) { BOOST_MATH_ASSERT(n < m_used); return m_data[n]; } + const T& operator[](unsigned n)const { BOOST_MATH_ASSERT(n < m_used); return m_data[n]; } unsigned size()const { return m_used; } unsigned size() { return m_used; } void resize(unsigned n, const T& val) diff --git a/include/boost/math/special_functions/detail/bessel_ik.hpp b/include/boost/math/special_functions/detail/bessel_ik.hpp index aac1781e10..c19956294c 100644 --- a/include/boost/math/special_functions/detail/bessel_ik.hpp +++ b/include/boost/math/special_functions/detail/bessel_ik.hpp @@ -94,8 +94,8 @@ int temme_ik(T v, T x, T* K, T* K1, const Policy& pol) // |x| <= 2, Temme series converge rapidly // |x| > 2, the larger the |x|, the slower the convergence - BOOST_ASSERT(abs(x) <= 2); - BOOST_ASSERT(abs(v) <= 0.5f); + BOOST_MATH_ASSERT(abs(x) <= 2); + BOOST_MATH_ASSERT(abs(v) <= 0.5f); T gp = boost::math::tgamma1pm1(v, pol); T gm = boost::math::tgamma1pm1(-v, pol); @@ -216,7 +216,7 @@ int CF2_ik(T v, T x, T* Kv, T* Kv1, const Policy& pol) // |x| >= |v|, CF2_ik converges rapidly // |x| -> 0, CF2_ik fails to converge - BOOST_ASSERT(abs(x) > 1); + BOOST_MATH_ASSERT(abs(x) > 1); // Steed's algorithm, see Thompson and Barnett, // Journal of Computational Physics, vol 64, 490 (1986) diff --git a/include/boost/math/special_functions/detail/bessel_j0.hpp b/include/boost/math/special_functions/detail/bessel_j0.hpp index ebcab17240..7a30f66ecd 100644 --- a/include/boost/math/special_functions/detail/bessel_j0.hpp +++ b/include/boost/math/special_functions/detail/bessel_j0.hpp @@ -148,7 +148,7 @@ T bessel_j0(T x) if (x <= 4) // x in (0, 4] { T y = x * x; - BOOST_ASSERT(sizeof(P1) == sizeof(Q1)); + BOOST_MATH_ASSERT(sizeof(P1) == sizeof(Q1)); r = 
evaluate_rational(P1, Q1, y); factor = (x + x1) * ((x - x11/256) - x12); value = factor * r; @@ -156,7 +156,7 @@ T bessel_j0(T x) else if (x <= 8.0) // x in (4, 8] { T y = 1 - (x * x)/64; - BOOST_ASSERT(sizeof(P2) == sizeof(Q2)); + BOOST_MATH_ASSERT(sizeof(P2) == sizeof(Q2)); r = evaluate_rational(P2, Q2, y); factor = (x + x2) * ((x - x21/256) - x22); value = factor * r; @@ -165,8 +165,8 @@ T bessel_j0(T x) { T y = 8 / x; T y2 = y * y; - BOOST_ASSERT(sizeof(PC) == sizeof(QC)); - BOOST_ASSERT(sizeof(PS) == sizeof(QS)); + BOOST_MATH_ASSERT(sizeof(PC) == sizeof(QC)); + BOOST_MATH_ASSERT(sizeof(PS) == sizeof(QS)); rc = evaluate_rational(PC, QC, y2); rs = evaluate_rational(PS, QS, y2); factor = constants::one_div_root_pi() / sqrt(x); diff --git a/include/boost/math/special_functions/detail/bessel_j1.hpp b/include/boost/math/special_functions/detail/bessel_j1.hpp index 91ecd2832d..a9e0baa92d 100644 --- a/include/boost/math/special_functions/detail/bessel_j1.hpp +++ b/include/boost/math/special_functions/detail/bessel_j1.hpp @@ -149,7 +149,7 @@ T bessel_j1(T x) if (w <= 4) // w in (0, 4] { T y = x * x; - BOOST_ASSERT(sizeof(P1) == sizeof(Q1)); + BOOST_MATH_ASSERT(sizeof(P1) == sizeof(Q1)); r = evaluate_rational(P1, Q1, y); factor = w * (w + x1) * ((w - x11/256) - x12); value = factor * r; @@ -157,7 +157,7 @@ T bessel_j1(T x) else if (w <= 8) // w in (4, 8] { T y = x * x; - BOOST_ASSERT(sizeof(P2) == sizeof(Q2)); + BOOST_MATH_ASSERT(sizeof(P2) == sizeof(Q2)); r = evaluate_rational(P2, Q2, y); factor = w * (w + x2) * ((w - x21/256) - x22); value = factor * r; @@ -166,8 +166,8 @@ T bessel_j1(T x) { T y = 8 / w; T y2 = y * y; - BOOST_ASSERT(sizeof(PC) == sizeof(QC)); - BOOST_ASSERT(sizeof(PS) == sizeof(QS)); + BOOST_MATH_ASSERT(sizeof(PC) == sizeof(QC)); + BOOST_MATH_ASSERT(sizeof(PS) == sizeof(QS)); rc = evaluate_rational(PC, QC, y2); rs = evaluate_rational(PS, QS, y2); factor = 1 / (sqrt(w) * constants::root_pi()); diff --git 
a/include/boost/math/special_functions/detail/bessel_jn.hpp b/include/boost/math/special_functions/detail/bessel_jn.hpp index 2413630637..b3e452d6f5 100644 --- a/include/boost/math/special_functions/detail/bessel_jn.hpp +++ b/include/boost/math/special_functions/detail/bessel_jn.hpp @@ -66,7 +66,7 @@ T bessel_jn(int n, T x, const Policy& pol) return static_cast(0); } - BOOST_ASSERT(n > 1); + BOOST_MATH_ASSERT(n > 1); T scale = 1; if (n < abs(x)) // forward recurrence { diff --git a/include/boost/math/special_functions/detail/bessel_jy.hpp b/include/boost/math/special_functions/detail/bessel_jy.hpp index b67d989b68..152d999237 100644 --- a/include/boost/math/special_functions/detail/bessel_jy.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy.hpp @@ -82,7 +82,7 @@ namespace boost { namespace math { using namespace boost::math::tools; using namespace boost::math::constants; - BOOST_ASSERT(fabs(v) <= 0.5f); // precondition for using this routine + BOOST_MATH_ASSERT(fabs(v) <= 0.5f); // precondition for using this routine T gp = boost::math::tgamma1pm1(v, pol); T gm = boost::math::tgamma1pm1(-v, pol); @@ -197,7 +197,7 @@ namespace boost { namespace math { // |x| >= |v|, CF2_jy converges rapidly // |x| -> 0, CF2_jy fails to converge - BOOST_ASSERT(fabs(x) > 1); + BOOST_MATH_ASSERT(fabs(x) > 1); // modified Lentz's method, complex numbers involved, see // Lentz, Applied Optics, vol 15, 668 (1976) @@ -264,7 +264,7 @@ namespace boost { namespace math { template int bessel_jy(T v, T x, T* J, T* Y, int kind, const Policy& pol) { - BOOST_ASSERT(x >= 0); + BOOST_MATH_ASSERT(x >= 0); T u, Jv, Ju, Yv, Yv1, Yu, Yu1(0), fv, fu; T W, p, q, gamma, current, prev, next; diff --git a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp index 4d7ac485ad..63cf10c5d0 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_asym.hpp +++ 
b/include/boost/math/special_functions/detail/bessel_jy_asym.hpp @@ -133,7 +133,7 @@ inline bool asymptotic_bessel_large_x_limit(int v, const T& x) // error rates either side of the divide for v < 10000. // At double precision eps^1/8 ~= 0.01. // - BOOST_ASSERT(v >= 0); + BOOST_MATH_ASSERT(v >= 0); return (v ? v : 1) < x * 0.004f; } diff --git a/include/boost/math/special_functions/detail/bessel_jy_series.hpp b/include/boost/math/special_functions/detail/bessel_jy_series.hpp index d50bef84e8..6864d45782 100644 --- a/include/boost/math/special_functions/detail/bessel_jy_series.hpp +++ b/include/boost/math/special_functions/detail/bessel_jy_series.hpp @@ -218,8 +218,8 @@ T bessel_yn_small_z(int n, T z, T* scale, const Policy& pol) // Note that when called we assume that x < epsilon and n is a positive integer. // BOOST_MATH_STD_USING - BOOST_ASSERT(n >= 0); - BOOST_ASSERT((z < policies::get_epsilon())); + BOOST_MATH_ASSERT(n >= 0); + BOOST_MATH_ASSERT((z < policies::get_epsilon())); if(n == 0) { diff --git a/include/boost/math/special_functions/detail/bessel_kn.hpp b/include/boost/math/special_functions/detail/bessel_kn.hpp index 54c4a1cfa7..486bcd138a 100644 --- a/include/boost/math/special_functions/detail/bessel_kn.hpp +++ b/include/boost/math/special_functions/detail/bessel_kn.hpp @@ -56,7 +56,7 @@ T bessel_kn(int n, T x, const Policy& pol) prev = bessel_k0(x); current = bessel_k1(x); int k = 1; - BOOST_ASSERT(k < n); + BOOST_MATH_ASSERT(k < n); T scale = 1; do { diff --git a/include/boost/math/special_functions/detail/bessel_yn.hpp b/include/boost/math/special_functions/detail/bessel_yn.hpp index 62d7377e4f..e27f8e1201 100644 --- a/include/boost/math/special_functions/detail/bessel_yn.hpp +++ b/include/boost/math/special_functions/detail/bessel_yn.hpp @@ -77,7 +77,7 @@ T bessel_yn(int n, T x, const Policy& pol) prev = bessel_y0(x, pol); current = bessel_y1(x, pol); int k = 1; - BOOST_ASSERT(k < n); + BOOST_MATH_ASSERT(k < n); 
policies::check_series_iterations("boost::math::bessel_y_n<%1%>(%1%,%1%)", n, pol); T mult = 2 * k / x; value = mult * current - prev; diff --git a/include/boost/math/special_functions/detail/erf_inv.hpp b/include/boost/math/special_functions/detail/erf_inv.hpp index 4e48300521..3fe6966d0f 100644 --- a/include/boost/math/special_functions/detail/erf_inv.hpp +++ b/include/boost/math/special_functions/detail/erf_inv.hpp @@ -21,7 +21,7 @@ namespace detail{ // this version is for 80-bit long double's and smaller: // template -T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>*) +BOOST_GPU_ENABLED T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>*) { BOOST_MATH_STD_USING // for ADL of std names. @@ -41,8 +41,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* // long double: Max error found: 1.017064e-20 // Maximum Deviation Found (actual error term at infinite precision) 8.030e-21 // - static const float Y = 0.0891314744949340820313f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const float Y = 0.0891314744949340820313f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.000508781949658280665617), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00836874819741736770379), BOOST_MATH_BIG_CONSTANT(T, 64, 0.0334806625409744615033), @@ -52,7 +52,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* BOOST_MATH_BIG_CONSTANT(T, 64, 0.00822687874676915743155), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00538772965071242932965) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, -0.970005043303290640362), BOOST_MATH_BIG_CONSTANT(T, 64, -1.56574558234175846809), @@ -82,8 +82,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* // long double : Max error found: 6.084616e-20 // Maximum Deviation Found (error term) 4.811e-20 // - static const 
float Y = 2.249481201171875f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const float Y = 2.249481201171875f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.202433508355938759655), BOOST_MATH_BIG_CONSTANT(T, 64, 0.105264680699391713268), BOOST_MATH_BIG_CONSTANT(T, 64, 8.37050328343119927838), @@ -94,7 +94,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* BOOST_MATH_BIG_CONSTANT(T, 64, 21.1294655448340526258), BOOST_MATH_BIG_CONSTANT(T, 64, -3.67192254707729348546) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 6.24264124854247537712), BOOST_MATH_BIG_CONSTANT(T, 64, 3.9713437953343869095), @@ -135,8 +135,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* if(x < 3) { // Max error found: 1.089051e-20 - static const float Y = 0.807220458984375f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const float Y = 0.807220458984375f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.131102781679951906451), BOOST_MATH_BIG_CONSTANT(T, 64, -0.163794047193317060787), BOOST_MATH_BIG_CONSTANT(T, 64, 0.117030156341995252019), @@ -149,7 +149,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* BOOST_MATH_BIG_CONSTANT(T, 64, 0.285225331782217055858e-7), BOOST_MATH_BIG_CONSTANT(T, 64, -0.681149956853776992068e-9) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 3.46625407242567245975), BOOST_MATH_BIG_CONSTANT(T, 64, 5.38168345707006855425), @@ -166,8 +166,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* else if(x < 6) { // Max error found: 8.389174e-21 - static const float Y = 0.93995571136474609375f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const float Y = 0.93995571136474609375f; + BOOST_MATH_GPU_STATIC 
const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.0350353787183177984712), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00222426529213447927281), BOOST_MATH_BIG_CONSTANT(T, 64, 0.0185573306514231072324), @@ -178,7 +178,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* BOOST_MATH_BIG_CONSTANT(T, 64, -0.230404776911882601748e-9), BOOST_MATH_BIG_CONSTANT(T, 64, 0.266339227425782031962e-11) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 1.3653349817554063097), BOOST_MATH_BIG_CONSTANT(T, 64, 0.762059164553623404043), @@ -194,8 +194,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* else if(x < 18) { // Max error found: 1.481312e-19 - static const float Y = 0.98362827301025390625f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const float Y = 0.98362827301025390625f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.0167431005076633737133), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00112951438745580278863), BOOST_MATH_BIG_CONSTANT(T, 64, 0.00105628862152492910091), @@ -206,7 +206,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* BOOST_MATH_BIG_CONSTANT(T, 64, -0.281128735628831791805e-13), BOOST_MATH_BIG_CONSTANT(T, 64, 0.99055709973310326855e-16) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 0.591429344886417493481), BOOST_MATH_BIG_CONSTANT(T, 64, 0.138151865749083321638), @@ -222,8 +222,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* else if(x < 44) { // Max error found: 5.697761e-20 - static const float Y = 0.99714565277099609375f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const float Y = 0.99714565277099609375f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.0024978212791898131227), 
BOOST_MATH_BIG_CONSTANT(T, 64, -0.779190719229053954292e-5), BOOST_MATH_BIG_CONSTANT(T, 64, 0.254723037413027451751e-4), @@ -233,7 +233,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* BOOST_MATH_BIG_CONSTANT(T, 64, 0.145596286718675035587e-11), BOOST_MATH_BIG_CONSTANT(T, 64, -0.116765012397184275695e-17) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 0.207123112214422517181), BOOST_MATH_BIG_CONSTANT(T, 64, 0.0169410838120975906478), @@ -249,8 +249,8 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* else { // Max error found: 1.279746e-20 - static const float Y = 0.99941349029541015625f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const float Y = 0.99941349029541015625f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.000539042911019078575891), BOOST_MATH_BIG_CONSTANT(T, 64, -0.28398759004727721098e-6), BOOST_MATH_BIG_CONSTANT(T, 64, 0.899465114892291446442e-6), @@ -260,7 +260,7 @@ T erf_inv_imp(const T& p, const T& q, const Policy&, const boost::mpl::int_<64>* BOOST_MATH_BIG_CONSTANT(T, 64, 0.135880130108924861008e-14), BOOST_MATH_BIG_CONSTANT(T, 64, -0.348890393399948882918e-21) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 0.0845746234001899436914), BOOST_MATH_BIG_CONSTANT(T, 64, 0.00282092984726264681981), @@ -372,9 +372,11 @@ struct erf_inv_initializer void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + static BOOST_GPU_ENABLED void force_instantiate() { +#ifndef __CUDA_ARCH__ initializer.force_instantiate(); +#endif } }; @@ -393,14 +395,14 @@ bool erf_inv_initializer::init::is_value_non_zero(T v) } // namespace detail template -typename tools::promote_args::type erfc_inv(T z, const Policy& pol) +BOOST_GPU_ENABLED typename 
tools::promote_args::type erfc_inv(T z, const Policy& pol) { typedef typename tools::promote_args::type result_type; // // Begin by testing for domain errors, and other special cases: // - static const char* function = "boost::math::erfc_inv<%1%>(%1%, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::erfc_inv<%1%>(%1%, %1%)"; if((z < 0) || (z > 2)) return policies::raise_domain_error(function, "Argument outside range [0,2] in inverse erfc function (got p=%1%).", z, pol); if(z == 0) @@ -457,14 +459,14 @@ typename tools::promote_args::type erfc_inv(T z, const Policy& pol) } template -typename tools::promote_args::type erf_inv(T z, const Policy& pol) +BOOST_GPU_ENABLED typename tools::promote_args::type erf_inv(T z, const Policy& pol) { typedef typename tools::promote_args::type result_type; // // Begin by testing for domain errors, and other special cases: // - static const char* function = "boost::math::erf_inv<%1%>(%1%, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::erf_inv<%1%>(%1%, %1%)"; if((z < -1) || (z > 1)) return policies::raise_domain_error(function, "Argument outside range [-1, 1] in inverse erf function (got p=%1%).", z, pol); if(z == 1) @@ -527,13 +529,13 @@ typename tools::promote_args::type erf_inv(T z, const Policy& pol) } template -inline typename tools::promote_args::type erfc_inv(T z) +BOOST_GPU_ENABLED inline typename tools::promote_args::type erfc_inv(T z) { return erfc_inv(z, policies::policy<>()); } template -inline typename tools::promote_args::type erf_inv(T z) +BOOST_GPU_ENABLED inline typename tools::promote_args::type erf_inv(T z) { return erf_inv(z, policies::policy<>()); } diff --git a/include/boost/math/special_functions/detail/fp_traits.hpp b/include/boost/math/special_functions/detail/fp_traits.hpp index c957022223..79a231858b 100644 --- a/include/boost/math/special_functions/detail/fp_traits.hpp +++ b/include/boost/math/special_functions/detail/fp_traits.hpp @@ -184,8 +184,8 @@ template<> 
struct fp_traits_non_native BOOST_STATIC_CONSTANT(uint32_t, significand = 0x007fffff); typedef uint32_t bits; - static void get_bits(float x, uint32_t& a) { std::memcpy(&a, &x, 4); } - static void set_bits(float& x, uint32_t a) { std::memcpy(&x, &a, 4); } + static BOOST_GPU_ENABLED void get_bits(float x, uint32_t& a) { std::memcpy(&a, &x, 4); } + static BOOST_GPU_ENABLED void set_bits(float& x, uint32_t a) { std::memcpy(&x, &a, 4); } }; // ieee_tag version, double (64 bits) ---------------------------------------------- @@ -204,12 +204,12 @@ template<> struct fp_traits_non_native typedef uint32_t bits; - static void get_bits(double x, uint32_t& a) + static BOOST_GPU_ENABLED void get_bits(double x, uint32_t& a) { std::memcpy(&a, reinterpret_cast(&x) + offset_, 4); } - static void set_bits(double& x, uint32_t a) + static BOOST_GPU_ENABLED void set_bits(double& x, uint32_t a) { std::memcpy(reinterpret_cast(&x) + offset_, &a, 4); } @@ -240,8 +240,8 @@ template<> struct fp_traits_non_native = (((uint64_t)0x000fffff) << 32) + ((uint64_t)0xffffffffu); typedef uint64_t bits; - static void get_bits(double x, uint64_t& a) { std::memcpy(&a, &x, 8); } - static void set_bits(double& x, uint64_t a) { std::memcpy(&x, &a, 8); } + static BOOST_GPU_ENABLED void get_bits(double x, uint64_t& a) { std::memcpy(&a, &x, 8); } + static BOOST_GPU_ENABLED void set_bits(double& x, uint64_t a) { std::memcpy(&x, &a, 8); } }; #endif @@ -264,12 +264,12 @@ template<> struct fp_traits_non_native typedef uint32_t bits; - static void get_bits(long double x, uint32_t& a) + static BOOST_GPU_ENABLED void get_bits(long double x, uint32_t& a) { std::memcpy(&a, reinterpret_cast(&x) + offset_, 4); } - static void set_bits(long double& x, uint32_t a) + static BOOST_GPU_ENABLED void set_bits(long double& x, uint32_t a) { std::memcpy(reinterpret_cast(&x) + offset_, &a, 4); } @@ -300,8 +300,8 @@ template<> struct fp_traits_non_native = ((uint64_t)0x000fffff << 32) + (uint64_t)0xffffffffu; typedef uint64_t bits; 
- static void get_bits(long double x, uint64_t& a) { std::memcpy(&a, &x, 8); } - static void set_bits(long double& x, uint64_t a) { std::memcpy(&x, &a, 8); } + static BOOST_GPU_ENABLED void get_bits(long double x, uint64_t& a) { std::memcpy(&a, &x, 8); } + static BOOST_GPU_ENABLED void set_bits(long double& x, uint64_t a) { std::memcpy(&x, &a, 8); } }; #endif @@ -327,12 +327,12 @@ struct fp_traits_non_native typedef uint32_t bits; - static void get_bits(long double x, uint32_t& a) + static BOOST_GPU_ENABLED void get_bits(long double x, uint32_t& a) { std::memcpy(&a, reinterpret_cast(&x) + 6, 4); } - static void set_bits(long double& x, uint32_t a) + static BOOST_GPU_ENABLED void set_bits(long double& x, uint32_t a) { std::memcpy(reinterpret_cast(&x) + 6, &a, 4); } @@ -378,12 +378,12 @@ struct fp_traits_non_native typedef uint32_t bits; - static void get_bits(long double x, uint32_t& a) + static BOOST_GPU_ENABLED void get_bits(long double x, uint32_t& a) { std::memcpy(&a, reinterpret_cast(&x) + offset_, 4); } - static void set_bits(long double& x, uint32_t a) + static BOOST_GPU_ENABLED void set_bits(long double& x, uint32_t a) { std::memcpy(reinterpret_cast(&x) + offset_, &a, 4); } @@ -425,14 +425,14 @@ struct fp_traits_non_native typedef uint32_t bits; - static void get_bits(long double x, uint32_t& a) + static BOOST_GPU_ENABLED void get_bits(long double x, uint32_t& a) { std::memcpy(&a, &x, 2); std::memcpy(reinterpret_cast(&a) + 2, reinterpret_cast(&x) + 4, 2); } - static void set_bits(long double& x, uint32_t a) + static BOOST_GPU_ENABLED void set_bits(long double& x, uint32_t a) { std::memcpy(&x, &a, 2); std::memcpy(reinterpret_cast(&x) + 4, @@ -459,12 +459,12 @@ struct fp_traits_non_native typedef uint32_t bits; - static void get_bits(long double x, uint32_t& a) + static BOOST_GPU_ENABLED void get_bits(long double x, uint32_t& a) { std::memcpy(&a, reinterpret_cast(&x) + offset_, 4); } - static void set_bits(long double& x, uint32_t a) + static BOOST_GPU_ENABLED 
void set_bits(long double& x, uint32_t a) { std::memcpy(reinterpret_cast(&x) + offset_, &a, 4); } diff --git a/include/boost/math/special_functions/detail/ibeta_inverse.hpp b/include/boost/math/special_functions/detail/ibeta_inverse.hpp index a9fe8cd49c..3e136cddef 100644 --- a/include/boost/math/special_functions/detail/ibeta_inverse.hpp +++ b/include/boost/math/special_functions/detail/ibeta_inverse.hpp @@ -119,9 +119,9 @@ T temme_method_1_ibeta_inverse(T a, T b, T z, const Policy& pol) else x = (1 + eta * sqrt((1 + c) / eta_2)) / 2; - BOOST_ASSERT(x >= 0); - BOOST_ASSERT(x <= 1); - BOOST_ASSERT(eta * (x - 0.5) >= 0); + BOOST_MATH_ASSERT(x >= 0); + BOOST_MATH_ASSERT(x <= 1); + BOOST_MATH_ASSERT(eta * (x - 0.5) >= 0); #ifdef BOOST_INSTRUMENT std::cout << "Estimating x with Temme method 1: " << x << std::endl; #endif @@ -853,8 +853,8 @@ T ibeta_inv_imp(T a, T b, T p, T q, const Policy& pol, T* py) // We don't really want these asserts here, but they are useful for sanity // checking that we have the limits right, uncomment if you suspect bugs *only*. 
// - //BOOST_ASSERT(x != upper); - //BOOST_ASSERT((x != lower) || (x == boost::math::tools::min_value()) || (x == boost::math::tools::epsilon())); + //BOOST_MATH_ASSERT(x != upper); + //BOOST_MATH_ASSERT((x != lower) || (x == boost::math::tools::min_value()) || (x == boost::math::tools::epsilon())); // // Tidy up, if we "lower" was too high then zero is the best answer we have: // diff --git a/include/boost/math/special_functions/detail/igamma_large.hpp b/include/boost/math/special_functions/detail/igamma_large.hpp index eb3d4ba93e..105c941564 100644 --- a/include/boost/math/special_functions/detail/igamma_large.hpp +++ b/include/boost/math/special_functions/detail/igamma_large.hpp @@ -55,10 +55,10 @@ namespace boost{ namespace math{ namespace detail{ // when T is unsuitable to be passed to these routines: // template -inline T igamma_temme_large(T, T, const Policy& /* pol */, mpl::int_<0> const *) +inline BOOST_GPU_ENABLED T igamma_temme_large(T, T, const Policy& /* pol */, mpl::int_<0> const *) { // stub function, should never actually be called - BOOST_ASSERT(0); + BOOST_MATH_ASSERT(0); return 0; } // @@ -66,7 +66,7 @@ inline T igamma_temme_large(T, T, const Policy& /* pol */, mpl::int_<0> const *) // (80-bit long double, or 10^-20). 
// template -T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) +BOOST_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -78,7 +78,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) T workspace[13]; - static const T C0[] = { + BOOST_MATH_GPU_STATIC const T C0[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.333333333333333333333), BOOST_MATH_BIG_CONSTANT(T, 64, 0.0833333333333333333333), BOOST_MATH_BIG_CONSTANT(T, 64, -0.0148148148148148148148), @@ -101,7 +101,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[0] = tools::evaluate_polynomial(C0, z); - static const T C1[] = { + BOOST_MATH_GPU_STATIC const T C1[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.00185185185185185185185), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00347222222222222222222), BOOST_MATH_BIG_CONSTANT(T, 64, 0.00264550264550264550265), @@ -122,7 +122,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[1] = tools::evaluate_polynomial(C1, z); - static const T C2[] = { + BOOST_MATH_GPU_STATIC const T C2[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 0.00413359788359788359788), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00268132716049382716049), BOOST_MATH_BIG_CONSTANT(T, 64, 0.000771604938271604938272), @@ -141,7 +141,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[2] = tools::evaluate_polynomial(C2, z); - static const T C3[] = { + BOOST_MATH_GPU_STATIC const T C3[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 0.000649434156378600823045), BOOST_MATH_BIG_CONSTANT(T, 64, 0.000229472093621399176955), BOOST_MATH_BIG_CONSTANT(T, 64, -0.000469189494395255712128), @@ -158,7 +158,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[3] = tools::evaluate_polynomial(C3, z); - static const T C4[] = { + BOOST_MATH_GPU_STATIC const T C4[] = { 
BOOST_MATH_BIG_CONSTANT(T, 64, -0.000861888290916711698605), BOOST_MATH_BIG_CONSTANT(T, 64, 0.000784039221720066627474), BOOST_MATH_BIG_CONSTANT(T, 64, -0.000299072480303190179733), @@ -173,7 +173,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[4] = tools::evaluate_polynomial(C4, z); - static const T C5[] = { + BOOST_MATH_GPU_STATIC const T C5[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.000336798553366358150309), BOOST_MATH_BIG_CONSTANT(T, 64, -0.697281375836585777429e-4), BOOST_MATH_BIG_CONSTANT(T, 64, 0.000277275324495939207873), @@ -186,7 +186,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[5] = tools::evaluate_polynomial(C5, z); - static const T C6[] = { + BOOST_MATH_GPU_STATIC const T C6[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 0.000531307936463992223166), BOOST_MATH_BIG_CONSTANT(T, 64, -0.000592166437353693882865), BOOST_MATH_BIG_CONSTANT(T, 64, 0.000270878209671804482771), @@ -201,7 +201,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[6] = tools::evaluate_polynomial(C6, z); - static const T C7[] = { + BOOST_MATH_GPU_STATIC const T C7[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 0.000344367606892377671254), BOOST_MATH_BIG_CONSTANT(T, 64, 0.517179090826059219337e-4), BOOST_MATH_BIG_CONSTANT(T, 64, -0.000334931610811422363117), @@ -214,7 +214,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[7] = tools::evaluate_polynomial(C7, z); - static const T C8[] = { + BOOST_MATH_GPU_STATIC const T C8[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.000652623918595309418922), BOOST_MATH_BIG_CONSTANT(T, 64, 0.000839498720672087279993), BOOST_MATH_BIG_CONSTANT(T, 64, -0.000438297098541721005061), @@ -225,7 +225,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[8] = tools::evaluate_polynomial(C8, z); - static const T C9[] = { + BOOST_MATH_GPU_STATIC const T C9[] = { BOOST_MATH_BIG_CONSTANT(T, 
64, -0.000596761290192746250124), BOOST_MATH_BIG_CONSTANT(T, 64, -0.720489541602001055909e-4), BOOST_MATH_BIG_CONSTANT(T, 64, 0.000678230883766732836162), @@ -234,14 +234,14 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[9] = tools::evaluate_polynomial(C9, z); - static const T C10[] = { + BOOST_MATH_GPU_STATIC const T C10[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 0.00133244544948006563713), BOOST_MATH_BIG_CONSTANT(T, 64, -0.0019144384985654775265), BOOST_MATH_BIG_CONSTANT(T, 64, 0.00110893691345966373396), }; workspace[10] = tools::evaluate_polynomial(C10, z); - static const T C11[] = { + BOOST_MATH_GPU_STATIC const T C11[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 0.00157972766073083495909), BOOST_MATH_BIG_CONSTANT(T, 64, 0.000162516262783915816899), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00206334210355432762645), @@ -250,7 +250,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) }; workspace[11] = tools::evaluate_polynomial(C11, z); - static const T C12[] = { + BOOST_MATH_GPU_STATIC const T C12[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.00407251211951401664727), BOOST_MATH_BIG_CONSTANT(T, 64, 0.00640336283380806979482), BOOST_MATH_BIG_CONSTANT(T, 64, -0.00404101610816766177474), @@ -271,7 +271,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<64> const *) // (IEEE double precision or 10^-17). 
// template -T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) +BOOST_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -283,7 +283,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) T workspace[10]; - static const T C0[] = { + BOOST_MATH_GPU_STATIC const T C0[] = { static_cast(-0.33333333333333333L), static_cast(0.083333333333333333L), static_cast(-0.014814814814814815L), @@ -302,7 +302,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) }; workspace[0] = tools::evaluate_polynomial(C0, z); - static const T C1[] = { + BOOST_MATH_GPU_STATIC const T C1[] = { static_cast(-0.0018518518518518519L), static_cast(-0.0034722222222222222L), static_cast(0.0026455026455026455L), @@ -319,7 +319,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) }; workspace[1] = tools::evaluate_polynomial(C1, z); - static const T C2[] = { + BOOST_MATH_GPU_STATIC const T C2[] = { static_cast(0.0041335978835978836L), static_cast(-0.0026813271604938272L), static_cast(0.00077160493827160494L), @@ -334,7 +334,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) }; workspace[2] = tools::evaluate_polynomial(C2, z); - static const T C3[] = { + BOOST_MATH_GPU_STATIC const T C3[] = { static_cast(0.00064943415637860082L), static_cast(0.00022947209362139918L), static_cast(-0.00046918949439525571L), @@ -347,7 +347,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) }; workspace[3] = tools::evaluate_polynomial(C3, z); - static const T C4[] = { + BOOST_MATH_GPU_STATIC const T C4[] = { static_cast(-0.0008618882909167117L), static_cast(0.00078403922172006663L), static_cast(-0.00029907248030319018L), @@ -358,7 +358,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) }; workspace[4] = tools::evaluate_polynomial(C4, z); - static 
const T C5[] = { + BOOST_MATH_GPU_STATIC const T C5[] = { static_cast(-0.00033679855336635815L), static_cast(-0.69728137583658578e-4L), static_cast(0.00027727532449593921L), @@ -371,7 +371,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) }; workspace[5] = tools::evaluate_polynomial(C5, z); - static const T C6[] = { + BOOST_MATH_GPU_STATIC const T C6[] = { static_cast(0.00053130793646399222L), static_cast(-0.00059216643735369388L), static_cast(0.00027087820967180448L), @@ -382,7 +382,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) }; workspace[6] = tools::evaluate_polynomial(C6, z); - static const T C7[] = { + BOOST_MATH_GPU_STATIC const T C7[] = { static_cast(0.00034436760689237767L), static_cast(0.51717909082605922e-4L), static_cast(-0.00033493161081142236L), @@ -391,7 +391,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) }; workspace[7] = tools::evaluate_polynomial(C7, z); - static const T C8[] = { + BOOST_MATH_GPU_STATIC const T C8[] = { static_cast(-0.00065262391859530942L), static_cast(0.00083949872067208728L), static_cast(-0.00043829709854172101L), @@ -413,7 +413,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<53> const *) // (IEEE float precision, or 10^-8) // template -T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<24> const *) +BOOST_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<24> const *) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -425,7 +425,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<24> const *) T workspace[3]; - static const T C0[] = { + BOOST_MATH_GPU_STATIC const T C0[] = { static_cast(-0.333333333L), static_cast(0.0833333333L), static_cast(-0.0148148148L), @@ -436,7 +436,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<24> const *) }; workspace[0] = tools::evaluate_polynomial(C0, z); - static const T C1[] = { + BOOST_MATH_GPU_STATIC const 
T C1[] = { static_cast(-0.00185185185L), static_cast(-0.00347222222L), static_cast(0.00264550265L), @@ -445,7 +445,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<24> const *) }; workspace[1] = tools::evaluate_polynomial(C1, z); - static const T C2[] = { + BOOST_MATH_GPU_STATIC const T C2[] = { static_cast(0.00413359788L), static_cast(-0.00268132716L), static_cast(0.000771604938L), @@ -469,7 +469,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<24> const *) // require many more terms in the polynomials. // template -T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) +BOOST_GPU_ENABLED T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) { BOOST_MATH_STD_USING // ADL of std functions T sigma = (x - a) / a; @@ -481,7 +481,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) T workspace[14]; - static const T C0[] = { + BOOST_MATH_GPU_STATIC const T C0[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.333333333333333333333333333333333333), BOOST_MATH_BIG_CONSTANT(T, 113, 0.0833333333333333333333333333333333333), BOOST_MATH_BIG_CONSTANT(T, 113, -0.0148148148148148148148148148148148148), @@ -516,7 +516,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[0] = tools::evaluate_polynomial(C0, z); - static const T C1[] = { + BOOST_MATH_GPU_STATIC const T C1[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.00185185185185185185185185185185185185), BOOST_MATH_BIG_CONSTANT(T, 113, -0.00347222222222222222222222222222222222), BOOST_MATH_BIG_CONSTANT(T, 113, 0.0026455026455026455026455026455026455), @@ -549,7 +549,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[1] = tools::evaluate_polynomial(C1, z); - static const T C2[] = { + BOOST_MATH_GPU_STATIC const T C2[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 0.0041335978835978835978835978835978836), BOOST_MATH_BIG_CONSTANT(T, 113, -0.00268132716049382716049382716049382716), 
BOOST_MATH_BIG_CONSTANT(T, 113, 0.000771604938271604938271604938271604938), @@ -580,7 +580,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[2] = tools::evaluate_polynomial(C2, z); - static const T C3[] = { + BOOST_MATH_GPU_STATIC const T C3[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 0.000649434156378600823045267489711934156), BOOST_MATH_BIG_CONSTANT(T, 113, 0.000229472093621399176954732510288065844), BOOST_MATH_BIG_CONSTANT(T, 113, -0.000469189494395255712128140111679206329), @@ -609,7 +609,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[3] = tools::evaluate_polynomial(C3, z); - static const T C4[] = { + BOOST_MATH_GPU_STATIC const T C4[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.000861888290916711698604702719929057378), BOOST_MATH_BIG_CONSTANT(T, 113, 0.00078403922172006662747403488144228885), BOOST_MATH_BIG_CONSTANT(T, 113, -0.000299072480303190179733389609932819809), @@ -636,7 +636,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[4] = tools::evaluate_polynomial(C4, z); - static const T C5[] = { + BOOST_MATH_GPU_STATIC const T C5[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.000336798553366358150308767592718210002), BOOST_MATH_BIG_CONSTANT(T, 113, -0.697281375836585777429398828575783308e-4), BOOST_MATH_BIG_CONSTANT(T, 113, 0.00027727532449593920787336425196507501), @@ -657,7 +657,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[5] = tools::evaluate_polynomial(C5, z); - static const T C6[] = { + BOOST_MATH_GPU_STATIC const T C6[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 0.00053130793646399222316574854297762391), BOOST_MATH_BIG_CONSTANT(T, 113, -0.000592166437353693882864836225604401187), BOOST_MATH_BIG_CONSTANT(T, 113, 0.000270878209671804482771279183488328692), @@ -676,7 +676,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[6] = tools::evaluate_polynomial(C6, z); - 
static const T C7[] = { + BOOST_MATH_GPU_STATIC const T C7[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 0.000344367606892377671254279625108523655), BOOST_MATH_BIG_CONSTANT(T, 113, 0.517179090826059219337057843002058823e-4), BOOST_MATH_BIG_CONSTANT(T, 113, -0.000334931610811422363116635090580012327), @@ -693,7 +693,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[7] = tools::evaluate_polynomial(C7, z); - static const T C8[] = { + BOOST_MATH_GPU_STATIC const T C8[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.000652623918595309418922034919726622692), BOOST_MATH_BIG_CONSTANT(T, 113, 0.000839498720672087279993357516764983445), BOOST_MATH_BIG_CONSTANT(T, 113, -0.000438297098541721005061087953050560377), @@ -708,7 +708,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[8] = tools::evaluate_polynomial(C8, z); - static const T C9[] = { + BOOST_MATH_GPU_STATIC const T C9[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.000596761290192746250124390067179459605), BOOST_MATH_BIG_CONSTANT(T, 113, -0.720489541602001055908571930225015052e-4), BOOST_MATH_BIG_CONSTANT(T, 113, 0.000678230883766732836161951166000673426), @@ -721,7 +721,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[9] = tools::evaluate_polynomial(C9, z); - static const T C10[] = { + BOOST_MATH_GPU_STATIC const T C10[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 0.00133244544948006563712694993432717968), BOOST_MATH_BIG_CONSTANT(T, 113, -0.00191443849856547752650089885832852254), BOOST_MATH_BIG_CONSTANT(T, 113, 0.0011089369134596637339607446329267522), @@ -732,7 +732,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[10] = tools::evaluate_polynomial(C10, z); - static const T C11[] = { + BOOST_MATH_GPU_STATIC const T C11[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 0.00157972766073083495908785631307733022), BOOST_MATH_BIG_CONSTANT(T, 113, 0.000162516262783915816898635123980270998), 
BOOST_MATH_BIG_CONSTANT(T, 113, -0.00206334210355432762645284467690276817), @@ -741,7 +741,7 @@ T igamma_temme_large(T a, T x, const Policy& pol, mpl::int_<113> const *) }; workspace[11] = tools::evaluate_polynomial(C11, z); - static const T C12[] = { + BOOST_MATH_GPU_STATIC const T C12[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.00407251211951401664727281097914544601), BOOST_MATH_BIG_CONSTANT(T, 113, 0.00640336283380806979482363809026579583), BOOST_MATH_BIG_CONSTANT(T, 113, -0.00404101610816766177473974858518094879), diff --git a/include/boost/math/special_functions/detail/lgamma_small.hpp b/include/boost/math/special_functions/detail/lgamma_small.hpp index e65f8b7e98..ec1eb47176 100644 --- a/include/boost/math/special_functions/detail/lgamma_small.hpp +++ b/include/boost/math/special_functions/detail/lgamma_small.hpp @@ -18,15 +18,15 @@ namespace boost{ namespace math{ namespace detail{ // These need forward declaring to keep GCC happy: // template -T gamma_imp(T z, const Policy& pol, const Lanczos& l); +BOOST_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l); template -T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l); +BOOST_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos& l); // // lgamma for small arguments: // template -T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l */, const Lanczos&) +BOOST_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l */, const Lanczos&) { // This version uses rational approximations for small // values of z accurate enough for 64-bit mantissas @@ -77,7 +77,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l * // At long double: Max error found: 1.987e-21 // Maximum Deviation Found (approximation error): 5.900e-24 // - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.180355685678449379109e-1)), 
static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.25126649619989678683e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.494103151567532234274e-1)), @@ -86,7 +86,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l * static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.541009869215204396339e-3)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.324588649825948492091e-4)) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.196202987197795200688e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.148019669424231326694e1)), @@ -97,7 +97,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l * static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.223352763208617092964e-6)) }; - static const float Y = 0.158963680267333984375e0f; + BOOST_MATH_GPU_STATIC const float Y = 0.158963680267333984375e0f; T r = zm2 * (z + 1); T R = tools::evaluate_polynomial(P, zm2); @@ -142,9 +142,9 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l * // Expected Error Term: 3.139e-021 // - static const float Y = 0.52815341949462890625f; + BOOST_MATH_GPU_STATIC const float Y = 0.52815341949462890625f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.490622454069039543534e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.969117530159521214579e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.414983358359495381969e0)), @@ -153,7 +153,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l * static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.240149820648571559892e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.100346687696279557415e-2)) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.302349829846463038743e1)), 
static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.348739585360723852576e1)), @@ -187,9 +187,9 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l * // Maximum Deviation Found: 2.151e-021 // Expected Error Term: 2.150e-021 // - static const float Y = 0.452017307281494140625f; + BOOST_MATH_GPU_STATIC const float Y = 0.452017307281494140625f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.292329721830270012337e-1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.144216267757192309184e0)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.142440390738631274135e0)), @@ -197,7 +197,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l * static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.850535976868336437746e-2)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.431171342679297331241e-3)) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -0.150169356054485044494e1)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.846973248876495016101e0)), @@ -215,7 +215,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<64>&, const Policy& /* l * return result; } template -T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l */, const Lanczos&) +BOOST_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l */, const Lanczos&) { // // This version uses rational approximations for small @@ -264,7 +264,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l // // Maximum Deviation Found (approximation error) 3.73e-37 - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.018035568567844937910504030027467476655), BOOST_MATH_BIG_CONSTANT(T, 113, 0.013841458273109517271750705401202404195), BOOST_MATH_BIG_CONSTANT(T, 113, 
0.062031842739486600078866923383017722399), @@ -277,7 +277,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l BOOST_MATH_BIG_CONSTANT(T, 113, -0.49999811718089980992888533630523892389e-6), BOOST_MATH_BIG_CONSTANT(T, 113, -0.70529798686542184668416911331718963364e-8) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.0), BOOST_MATH_BIG_CONSTANT(T, 113, 2.5877485070422317542808137697939233685), BOOST_MATH_BIG_CONSTANT(T, 113, 2.8797959228352591788629602533153837126), @@ -296,7 +296,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l T R = tools::evaluate_polynomial(P, zm2); R /= tools::evaluate_polynomial(Q, zm2); - static const float Y = 0.158963680267333984375F; + BOOST_MATH_GPU_STATIC const float Y = 0.158963680267333984375F; T r = zm2 * (z + 1); @@ -340,9 +340,9 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l // Expected Error Term (theoretical error) 1.343e-36 // Max error found at 128-bit long double precision 1.007e-35 // - static const float Y = 0.54076099395751953125f; + BOOST_MATH_GPU_STATIC const float Y = 0.54076099395751953125f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 0.036454670944013329356512090082402429697), BOOST_MATH_BIG_CONSTANT(T, 113, -0.066235835556476033710068679907798799959), BOOST_MATH_BIG_CONSTANT(T, 113, -0.67492399795577182387312206593595565371), @@ -356,7 +356,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l BOOST_MATH_BIG_CONSTANT(T, 113, -0.10164985672213178500790406939467614498e-6), BOOST_MATH_BIG_CONSTANT(T, 113, 0.13680157145361387405588201461036338274e-8) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.0), BOOST_MATH_BIG_CONSTANT(T, 113, 4.9106336261005990534095838574132225599), BOOST_MATH_BIG_CONSTANT(T, 113, 
10.258804800866438510889341082793078432), @@ -393,9 +393,9 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l // Maximum Deviation Found (approximation error) 1.538e-37 // Expected Error Term (theoretical error) 2.350e-38 // - static const float Y = 0.483787059783935546875f; + BOOST_MATH_GPU_STATIC const float Y = 0.483787059783935546875f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.017977422421608624353488126610933005432), BOOST_MATH_BIG_CONSTANT(T, 113, 0.18484528905298309555089509029244135703), BOOST_MATH_BIG_CONSTANT(T, 113, -0.40401251514859546989565001431430884082), @@ -407,7 +407,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l BOOST_MATH_BIG_CONSTANT(T, 113, -0.57058739515423112045108068834668269608e-4), BOOST_MATH_BIG_CONSTANT(T, 113, 0.8207548771933585614380644961342925976e-6) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.0), BOOST_MATH_BIG_CONSTANT(T, 113, -2.9629552288944259229543137757200262073), BOOST_MATH_BIG_CONSTANT(T, 113, 3.7118380799042118987185957298964772755), @@ -434,9 +434,9 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l // Maximum Deviation Found (approximation error) 8.588e-36 // Expected Error Term (theoretical error) 1.458e-36 // - static const float Y = 0.443811893463134765625f; + BOOST_MATH_GPU_STATIC const float Y = 0.443811893463134765625f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.021027558364667626231512090082402429494), BOOST_MATH_BIG_CONSTANT(T, 113, 0.15128811104498736604523586803722368377), BOOST_MATH_BIG_CONSTANT(T, 113, -0.26249631480066246699388544451126410278), @@ -448,7 +448,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l BOOST_MATH_BIG_CONSTANT(T, 113, -0.11088589183158123733132268042570710338e-4), 
BOOST_MATH_BIG_CONSTANT(T, 113, 0.13240510580220763969511741896361984162e-6) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.0), BOOST_MATH_BIG_CONSTANT(T, 113, -2.4240003754444040525462170802796471996), BOOST_MATH_BIG_CONSTANT(T, 113, 2.4868383476933178722203278602342786002), @@ -472,7 +472,7 @@ T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<113>&, const Policy& /* l return result; } template -T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<0>&, const Policy& pol, const Lanczos&) +BOOST_GPU_ENABLED T lgamma_small_imp(T z, T zm1, T zm2, const mpl::int_<0>&, const Policy& pol, const Lanczos&) { // // No rational approximations are available because either diff --git a/include/boost/math/special_functions/detail/polygamma.hpp b/include/boost/math/special_functions/detail/polygamma.hpp index c0e4932907..0847d2decb 100644 --- a/include/boost/math/special_functions/detail/polygamma.hpp +++ b/include/boost/math/special_functions/detail/polygamma.hpp @@ -437,8 +437,8 @@ namespace boost { namespace math { namespace detail{ for(int column = 0; column <= max_columns; ++column) { int cos_order = 2 * column + offset; // order of the cosine term in entry "column" - BOOST_ASSERT(column < (int)table[i].size()); - BOOST_ASSERT((cos_order + 1) / 2 < (int)table[i + 1].size()); + BOOST_MATH_ASSERT(column < (int)table[i].size()); + BOOST_MATH_ASSERT((cos_order + 1) / 2 < (int)table[i + 1].size()); table[i + 1][(cos_order + 1) / 2] += ((cos_order - sin_order) * table[i][column]) / (sin_order - 1); if(cos_order) table[i + 1][(cos_order - 1) / 2] += (-cos_order * table[i][column]) / (sin_order - 1); diff --git a/include/boost/math/special_functions/detail/round_fwd.hpp b/include/boost/math/special_functions/detail/round_fwd.hpp index 8c45a7d75a..cf58793f92 100644 --- a/include/boost/math/special_functions/detail/round_fwd.hpp +++ b/include/boost/math/special_functions/detail/round_fwd.hpp @@ -21,58 +21,58 @@ namespace 
boost { template - typename tools::promote_args::type trunc(const T& v, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type trunc(const T& v, const Policy& pol); template - typename tools::promote_args::type trunc(const T& v); + BOOST_GPU_ENABLED typename tools::promote_args::type trunc(const T& v); template - int itrunc(const T& v, const Policy& pol); + BOOST_GPU_ENABLED int itrunc(const T& v, const Policy& pol); template - int itrunc(const T& v); + BOOST_GPU_ENABLED int itrunc(const T& v); template - long ltrunc(const T& v, const Policy& pol); + BOOST_GPU_ENABLED long ltrunc(const T& v, const Policy& pol); template - long ltrunc(const T& v); + BOOST_GPU_ENABLED long ltrunc(const T& v); #ifdef BOOST_HAS_LONG_LONG template - boost::long_long_type lltrunc(const T& v, const Policy& pol); + BOOST_GPU_ENABLED boost::long_long_type lltrunc(const T& v, const Policy& pol); template - boost::long_long_type lltrunc(const T& v); + BOOST_GPU_ENABLED boost::long_long_type lltrunc(const T& v); #endif template - typename tools::promote_args::type round(const T& v, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type round(const T& v, const Policy& pol); template - typename tools::promote_args::type round(const T& v); + BOOST_GPU_ENABLED typename tools::promote_args::type round(const T& v); template - int iround(const T& v, const Policy& pol); + BOOST_GPU_ENABLED int iround(const T& v, const Policy& pol); template - int iround(const T& v); + BOOST_GPU_ENABLED int iround(const T& v); template - long lround(const T& v, const Policy& pol); + BOOST_GPU_ENABLED long lround(const T& v, const Policy& pol); template - long lround(const T& v); + BOOST_GPU_ENABLED long lround(const T& v); #ifdef BOOST_HAS_LONG_LONG template - boost::long_long_type llround(const T& v, const Policy& pol); + BOOST_GPU_ENABLED boost::long_long_type llround(const T& v, const Policy& pol); template - boost::long_long_type llround(const T& v); + BOOST_GPU_ENABLED 
boost::long_long_type llround(const T& v); #endif template - T modf(const T& v, T* ipart, const Policy& pol); + BOOST_GPU_ENABLED T modf(const T& v, T* ipart, const Policy& pol); template - T modf(const T& v, T* ipart); + BOOST_GPU_ENABLED T modf(const T& v, T* ipart); template - T modf(const T& v, int* ipart, const Policy& pol); + BOOST_GPU_ENABLED T modf(const T& v, int* ipart, const Policy& pol); template - T modf(const T& v, int* ipart); + BOOST_GPU_ENABLED T modf(const T& v, int* ipart); template - T modf(const T& v, long* ipart, const Policy& pol); + BOOST_GPU_ENABLED T modf(const T& v, long* ipart, const Policy& pol); template - T modf(const T& v, long* ipart); + BOOST_GPU_ENABLED T modf(const T& v, long* ipart); #ifdef BOOST_HAS_LONG_LONG template - T modf(const T& v, boost::long_long_type* ipart, const Policy& pol); + BOOST_GPU_ENABLED T modf(const T& v, boost::long_long_type* ipart, const Policy& pol); template - T modf(const T& v, boost::long_long_type* ipart); + BOOST_GPU_ENABLED T modf(const T& v, boost::long_long_type* ipart); #endif } diff --git a/include/boost/math/special_functions/detail/t_distribution_inv.hpp b/include/boost/math/special_functions/detail/t_distribution_inv.hpp index ab5a8fbca6..32740609d6 100644 --- a/include/boost/math/special_functions/detail/t_distribution_inv.hpp +++ b/include/boost/math/special_functions/detail/t_distribution_inv.hpp @@ -27,7 +27,7 @@ template T inverse_students_t_hill(T ndf, T u, const Policy& pol) { BOOST_MATH_STD_USING - BOOST_ASSERT(u <= 0.5); + BOOST_MATH_ASSERT(u <= 0.5); T a, b, c, d, q, x, y; diff --git a/include/boost/math/special_functions/detail/unchecked_factorial.hpp b/include/boost/math/special_functions/detail/unchecked_factorial.hpp index 17366742c4..1d5c2ab653 100644 --- a/include/boost/math/special_functions/detail/unchecked_factorial.hpp +++ b/include/boost/math/special_functions/detail/unchecked_factorial.hpp @@ -35,6 +35,8 @@ namespace boost { namespace math template struct 
max_factorial; +#ifndef __CUDA_ARCH__ + // Definitions: template <> inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION float unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(float)) @@ -42,7 +44,7 @@ inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION float unchecked_factorial(unsi #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES constexpr std::array factorials = { { #else - static const boost::array factorials = {{ + BOOST_MATH_GPU_STATIC const boost::array factorials = {{ #endif 1.0F, 1.0F, @@ -84,6 +86,52 @@ inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION float unchecked_factorial(unsi return factorials[i]; } +#else +template <> +inline BOOST_GPU_ENABLED float unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(float)) +{ + BOOST_MATH_GPU_STATIC const float factorials[] = { + 1.0F, + 1.0F, + 2.0F, + 6.0F, + 24.0F, + 120.0F, + 720.0F, + 5040.0F, + 40320.0F, + 362880.0F, + 3628800.0F, + 39916800.0F, + 479001600.0F, + 6227020800.0F, + 87178291200.0F, + 1307674368000.0F, + 20922789888000.0F, + 355687428096000.0F, + 6402373705728000.0F, + 121645100408832000.0F, + 0.243290200817664e19F, + 0.5109094217170944e20F, + 0.112400072777760768e22F, + 0.2585201673888497664e23F, + 0.62044840173323943936e24F, + 0.15511210043330985984e26F, + 0.403291461126605635584e27F, + 0.10888869450418352160768e29F, + 0.304888344611713860501504e30F, + 0.8841761993739701954543616e31F, + 0.26525285981219105863630848e33F, + 0.822283865417792281772556288e34F, + 0.26313083693369353016721801216e36F, + 0.868331761881188649551819440128e37F, + 0.29523279903960414084761860964352e39F, + }; + + return factorials[i]; +} +#endif + template <> struct max_factorial { @@ -91,13 +139,15 @@ struct max_factorial }; +#ifndef __CUDA_ARCH__ + template <> -inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION long double unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(long double)) +inline BOOST_GPU_ENABLED BOOST_MATH_CONSTEXPR_TABLE_FUNCTION long double 
unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(long double)) { #ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES constexpr std::array factorials = { { #else - static const boost::array factorials = {{ + BOOST_MATH_GPU_STATIC const boost::array factorials = {{ #endif 1L, 1L, @@ -281,6 +331,7 @@ struct max_factorial BOOST_STATIC_CONSTANT(unsigned, value = 170); }; +#endif #ifdef BOOST_MATH_USE_FLOAT128 template <> @@ -475,6 +526,8 @@ struct max_factorial #endif +#ifndef __CUDA_ARCH__ + template <> inline BOOST_MATH_CONSTEXPR_TABLE_FUNCTION double unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(double)) { @@ -488,6 +541,65 @@ struct max_factorial value = ::boost::math::max_factorial::value); }; +#else + +template <> +inline BOOST_GPU_ENABLED double unchecked_factorial(unsigned i BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE_SPEC(double)) +{ +#ifdef BOOST_MATH_HAVE_CONSTEXPR_TABLES + constexpr const double factorials[] = { +#else + BOOST_MATH_GPU_STATIC const double factorials[] = { +#endif + 1, + 1, + 2, + 6, + 24, + 120, + 720, + 5040, + 40320, + 362880.0, + 3628800.0, + 39916800.0, + 479001600.0, + 6227020800.0, + 87178291200.0, + 1307674368000.0, + 20922789888000.0, + 355687428096000.0, + 6402373705728000.0, + 121645100408832000.0, + 0.243290200817664e19, + 0.5109094217170944e20, + 0.112400072777760768e22, + 0.2585201673888497664e23, + 0.62044840173323943936e24, + 0.15511210043330985984e26, + 0.403291461126605635584e27, + 0.10888869450418352160768e29, + 0.304888344611713860501504e30, + 0.8841761993739701954543616e31, + 0.26525285981219105863630848e33, + 0.822283865417792281772556288e34, + 0.26313083693369353016721801216e36, + 0.868331761881188649551819440128e37, + 0.29523279903960414084761860964352e39, + }; + + return factorials[i]; +} + +template <> +struct max_factorial +{ + BOOST_STATIC_CONSTANT(unsigned, + value = 34); +}; + +#endif + #ifndef BOOST_MATH_NO_LEXICAL_CAST template diff --git 
a/include/boost/math/special_functions/ellint_1.hpp b/include/boost/math/special_functions/ellint_1.hpp index d1d9d72e30..c6a09a4390 100644 --- a/include/boost/math/special_functions/ellint_1.hpp +++ b/include/boost/math/special_functions/ellint_1.hpp @@ -31,22 +31,22 @@ namespace boost { namespace math { template -typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol); +BOOST_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol); namespace detail{ template -T ellint_k_imp(T k, const Policy& pol); +BOOST_GPU_ENABLED T ellint_k_imp(T k, const Policy& pol); // Elliptic integral (Legendre form) of the first kind template -T ellint_f_imp(T phi, T k, const Policy& pol) +BOOST_GPU_ENABLED T ellint_f_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; using namespace boost::math::constants; - static const char* function = "boost::math::ellint_f<%1%>(%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_f<%1%>(%1%,%1%)"; BOOST_MATH_INSTRUMENT_VARIABLE(phi); BOOST_MATH_INSTRUMENT_VARIABLE(k); BOOST_MATH_INSTRUMENT_VARIABLE(function); @@ -131,12 +131,12 @@ T ellint_f_imp(T phi, T k, const Policy& pol) // Complete elliptic integral (Legendre form) of the first kind template -T ellint_k_imp(T k, const Policy& pol) +BOOST_GPU_ENABLED T ellint_k_imp(T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; - static const char* function = "boost::math::ellint_k<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_k<%1%>(%1%)"; if (abs(k) > 1) { @@ -157,7 +157,7 @@ T ellint_k_imp(T k, const Policy& pol) } template -inline typename tools::promote_args::type ellint_1(T k, const Policy& pol, const mpl::true_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_1(T k, const Policy& pol, const mpl::true_&) { typedef typename tools::promote_args::type result_type; typedef typename 
policies::evaluation::type value_type; @@ -165,7 +165,7 @@ inline typename tools::promote_args::type ellint_1(T k, const Policy& pol, co } template -inline typename tools::promote_args::type ellint_1(T1 k, T2 phi, const mpl::false_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const mpl::false_&) { return boost::math::ellint_1(k, phi, policies::policy<>()); } @@ -174,14 +174,14 @@ inline typename tools::promote_args::type ellint_1(T1 k, T2 phi, const m // Complete elliptic integral (Legendre form) of the first kind template -inline typename tools::promote_args::type ellint_1(T k) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_1(T k) { return ellint_1(k, policies::policy<>()); } // Elliptic integral (Legendre form) of the first kind template -inline typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -189,7 +189,7 @@ inline typename tools::promote_args::type ellint_1(T1 k, T2 phi, const P } template -inline typename tools::promote_args::type ellint_1(T1 k, T2 phi) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_1(k, phi, tag_type()); diff --git a/include/boost/math/special_functions/ellint_2.hpp b/include/boost/math/special_functions/ellint_2.hpp index 9ee6b63821..56a40685e5 100644 --- a/include/boost/math/special_functions/ellint_2.hpp +++ b/include/boost/math/special_functions/ellint_2.hpp @@ -33,16 +33,16 @@ namespace boost { namespace math { template -typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol); +BOOST_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol); namespace detail{ template 
-T ellint_e_imp(T k, const Policy& pol); +BOOST_GPU_ENABLED T ellint_e_imp(T k, const Policy& pol); // Elliptic integral (Legendre form) of the second kind template -T ellint_e_imp(T phi, T k, const Policy& pol) +BOOST_GPU_ENABLED T ellint_e_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -121,7 +121,7 @@ T ellint_e_imp(T phi, T k, const Policy& pol) // Complete elliptic integral (Legendre form) of the second kind template -T ellint_e_imp(T k, const Policy& pol) +BOOST_GPU_ENABLED T ellint_e_imp(T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -146,7 +146,7 @@ T ellint_e_imp(T k, const Policy& pol) } template -inline typename tools::promote_args::type ellint_2(T k, const Policy& pol, const mpl::true_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_2(T k, const Policy& pol, const mpl::true_&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -155,7 +155,7 @@ inline typename tools::promote_args::type ellint_2(T k, const Policy& pol, co // Elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_2(T1 k, T2 phi, const mpl::false_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const mpl::false_&) { return boost::math::ellint_2(k, phi, policies::policy<>()); } @@ -164,21 +164,21 @@ inline typename tools::promote_args::type ellint_2(T1 k, T2 phi, const m // Complete elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_2(T k) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_2(T k) { return ellint_2(k, policies::policy<>()); } // Elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_2(T1 k, T2 phi) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 
phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_2(k, phi, tag_type()); } template -inline typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; diff --git a/include/boost/math/special_functions/ellint_3.hpp b/include/boost/math/special_functions/ellint_3.hpp index b8b36729cf..0d0f26a4d7 100644 --- a/include/boost/math/special_functions/ellint_3.hpp +++ b/include/boost/math/special_functions/ellint_3.hpp @@ -38,16 +38,16 @@ namespace boost { namespace math { namespace detail{ template -T ellint_pi_imp(T v, T k, T vc, const Policy& pol); +BOOST_GPU_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol); // Elliptic integral (Legendre form) of the third kind template -T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) +BOOST_GPU_ENABLED T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) { // Note vc = 1-v presumably without cancellation error. BOOST_MATH_STD_USING - static const char* function = "boost::math::ellint_3<%1%>(%1%,%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_3<%1%>(%1%,%1%,%1%)"; if(abs(k) > 1) { @@ -254,8 +254,8 @@ T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) // by the time we get here phi should already have been // normalised above. 
// - BOOST_ASSERT(fabs(phi) < constants::half_pi()); - BOOST_ASSERT(phi >= 0); + BOOST_MATH_ASSERT(fabs(phi) < constants::half_pi()); + BOOST_MATH_ASSERT(phi >= 0); T x, y, z, p, t; T cosp = cos(phi); x = cosp * cosp; @@ -273,13 +273,13 @@ T ellint_pi_imp(T v, T phi, T k, T vc, const Policy& pol) // Complete elliptic integral (Legendre form) of the third kind template -T ellint_pi_imp(T v, T k, T vc, const Policy& pol) +BOOST_GPU_ENABLED T ellint_pi_imp(T v, T k, T vc, const Policy& pol) { // Note arg vc = 1-v, possibly without cancellation errors BOOST_MATH_STD_USING using namespace boost::math::tools; - static const char* function = "boost::math::ellint_pi<%1%>(%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_pi<%1%>(%1%,%1%)"; if (abs(k) >= 1) { @@ -323,13 +323,13 @@ T ellint_pi_imp(T v, T k, T vc, const Policy& pol) } template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const mpl::false_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const mpl::false_&) { return boost::math::ellint_3(k, v, phi, policies::policy<>()); } template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Policy& pol, const mpl::true_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_3(T1 k, T2 v, const Policy& pol, const mpl::true_&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -344,7 +344,7 @@ inline typename tools::promote_args::type ellint_3(T1 k, T2 v, const Pol } // namespace detail template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -358,14 +358,14 @@ inline typename tools::promote_args::type 
ellint_3(T1 k, T2 v, T3 ph } template -typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi) +inline BOOST_GPU_ENABLED typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_3(k, v, phi, tag_type()); } template -inline typename tools::promote_args::type ellint_3(T1 k, T2 v) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_3(T1 k, T2 v) { return ellint_3(k, v, policies::policy<>()); } diff --git a/include/boost/math/special_functions/ellint_d.hpp b/include/boost/math/special_functions/ellint_d.hpp index fa5c53db18..05afbb6b0a 100644 --- a/include/boost/math/special_functions/ellint_d.hpp +++ b/include/boost/math/special_functions/ellint_d.hpp @@ -33,16 +33,16 @@ namespace boost { namespace math { template -typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol); +BOOST_GPU_ENABLED typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol); namespace detail{ template -T ellint_d_imp(T k, const Policy& pol); +BOOST_GPU_ENABLED T ellint_d_imp(T k, const Policy& pol); // Elliptic integral (Legendre form) of the second kind template -T ellint_d_imp(T phi, T k, const Policy& pol) +BOOST_GPU_ENABLED T ellint_d_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -113,7 +113,7 @@ T ellint_d_imp(T phi, T k, const Policy& pol) // Complete elliptic integral (Legendre form) of the second kind template -T ellint_d_imp(T k, const Policy& pol) +BOOST_GPU_ENABLED T ellint_d_imp(T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -136,7 +136,7 @@ T ellint_d_imp(T k, const Policy& pol) } template -inline typename tools::promote_args::type ellint_d(T k, const Policy& pol, const mpl::true_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_d(T k, const Policy& pol, const mpl::true_&) { typedef typename tools::promote_args::type 
result_type; typedef typename policies::evaluation::type value_type; @@ -145,7 +145,7 @@ inline typename tools::promote_args::type ellint_d(T k, const Policy& pol, co // Elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const mpl::false_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_d(T1 k, T2 phi, const mpl::false_&) { return boost::math::ellint_d(k, phi, policies::policy<>()); } @@ -154,21 +154,21 @@ inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const m // Complete elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_d(T k) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_d(T k) { return ellint_d(k, policies::policy<>()); } // Elliptic integral (Legendre form) of the second kind template -inline typename tools::promote_args::type ellint_d(T1 k, T2 phi) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_d(T1 k, T2 phi) { typedef typename policies::is_policy::type tag_type; return detail::ellint_d(k, phi, tag_type()); } template -inline typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; diff --git a/include/boost/math/special_functions/ellint_rc.hpp b/include/boost/math/special_functions/ellint_rc.hpp index 846c752a14..b6a81a4ea2 100644 --- a/include/boost/math/special_functions/ellint_rc.hpp +++ b/include/boost/math/special_functions/ellint_rc.hpp @@ -32,11 +32,11 @@ namespace boost { namespace math { namespace detail{ template -T ellint_rc_imp(T x, T y, const Policy& pol) +BOOST_GPU_ENABLED T ellint_rc_imp(T x, T y, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = 
"boost::math::ellint_rc<%1%>(%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_rc<%1%>(%1%,%1%)"; if(x < 0) { @@ -90,7 +90,7 @@ T ellint_rc_imp(T x, T y, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rc(T1 x, T2 y, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -102,7 +102,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rc(T1 x, T2 y) { return ellint_rc(x, y, policies::policy<>()); diff --git a/include/boost/math/special_functions/ellint_rd.hpp b/include/boost/math/special_functions/ellint_rd.hpp index c08430d545..4faee57dd2 100644 --- a/include/boost/math/special_functions/ellint_rd.hpp +++ b/include/boost/math/special_functions/ellint_rd.hpp @@ -29,12 +29,12 @@ namespace boost { namespace math { namespace detail{ template -T ellint_rd_imp(T x, T y, T z, const Policy& pol) +BOOST_GPU_ENABLED T ellint_rd_imp(T x, T y, T z, const Policy& pol) { BOOST_MATH_STD_USING using std::swap; - static const char* function = "boost::math::ellint_rd<%1%>(%1%,%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_rd<%1%>(%1%,%1%,%1%)"; if(x < 0) { @@ -61,7 +61,7 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) // using std::swap; if(x == z) - swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); if(y == z) { if(x == y) @@ -74,19 +74,19 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) } else { - if((std::min)(x, y) / (std::max)(x, y) > 1.3) + if(BOOST_MATH_CUDA_SAFE_MIN(x, y) / BOOST_MATH_CUDA_SAFE_MAX(x, y) > 1.3) return 3 * (ellint_rc_imp(x, y, pol) - sqrt(x) / y) / (2 * (y - x)); // Otherwise fall through to avoid cancellation in the above (RC(x,y) -> 1/x^0.5 as x -> y) } } if(x == y) { - if((std::min)(x, z) / (std::max)(x, z) > 1.3) + 
if(BOOST_MATH_CUDA_SAFE_MIN(x, z) / BOOST_MATH_CUDA_SAFE_MAX(x, z) > 1.3) return 3 * (ellint_rc_imp(z, x, pol) - 1 / sqrt(z)) / (z - x); // Otherwise fall through to avoid cancellation in the above (RC(x,y) -> 1/x^0.5 as x -> y) } if(y == 0) - swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); if(x == 0) { // @@ -132,7 +132,7 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) T An = (x + y + 3 * z) / 5; T A0 = An; // This has an extra 1.2 fudge factor which is really only needed when x, y and z are close in magnitude: - T Q = pow(tools::epsilon() / 4, -T(1) / 8) * (std::max)((std::max)(An - x, An - y), An - z) * 1.2f; + T Q = pow(tools::epsilon() / 4, -T(1) / 8) * BOOST_MATH_CUDA_SAFE_MAX(BOOST_MATH_CUDA_SAFE_MAX(An - x, An - y), An - z) * 1.2f; BOOST_MATH_INSTRUMENT_VARIABLE(Q); T lambda, rx, ry, rz; unsigned k = 0; @@ -181,7 +181,7 @@ T ellint_rd_imp(T x, T y, T z, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rd(T1 x, T2 y, T3 z, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -194,7 +194,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rd(T1 x, T2 y, T3 z) { return ellint_rd(x, y, z, policies::policy<>()); diff --git a/include/boost/math/special_functions/ellint_rf.hpp b/include/boost/math/special_functions/ellint_rf.hpp index a8a7b4b217..26c4824545 100644 --- a/include/boost/math/special_functions/ellint_rf.hpp +++ b/include/boost/math/special_functions/ellint_rf.hpp @@ -23,6 +23,10 @@ #include #include +#ifdef __CUDA_ARCH__ +#include +#endif + // Carlson's elliptic integral of the first kind // R_F(x, y, z) = 0.5 * \int_{0}^{\infty} [(t+x)(t+y)(t+z)]^{-1/2} dt // Carlson, Numerische Mathematik, vol 33, 1 (1979) @@ -30,27 +34,34 @@ namespace boost { namespace math { namespace detail{ template - T 
ellint_rf_imp(T x, T y, T z, const Policy& pol) + BOOST_GPU_ENABLED T ellint_rf_imp(T x, T y, T z, const Policy& pol) { BOOST_MATH_STD_USING - using namespace boost::math; using std::swap; - static const char* function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)"; if(x < 0 || y < 0 || z < 0) { return policies::raise_domain_error(function, "domain error, all arguments must be non-negative, " "only sensible result is %1%.", +#ifdef __CUDA_ARCH__ + static_cast(CUDART_NAN), pol); +#else std::numeric_limits::quiet_NaN(), pol); +#endif } if(x + y == 0 || y + z == 0 || z + x == 0) { return policies::raise_domain_error(function, "domain error, at most one argument can be zero, " "only sensible result is %1%.", +#ifdef __CUDA_ARCH__ + static_cast(CUDART_NAN), pol); +#else std::numeric_limits::quiet_NaN(), pol); +#endif } // // Special cases from http://dlmf.nist.gov/19.20#i @@ -86,9 +97,9 @@ namespace boost { namespace math { namespace detail{ return ellint_rc_imp(x, y, pol); } if(x == 0) - swap(x, z); + BOOST_MATH_CUDA_SAFE_SWAP(x, z); else if(y == 0) - swap(y, z); + BOOST_MATH_CUDA_SAFE_SWAP(y, z); if(z == 0) { // @@ -111,7 +122,7 @@ namespace boost { namespace math { namespace detail{ T zn = z; T An = (x + y + z) / 3; T A0 = An; - T Q = pow(3 * boost::math::tools::epsilon(), T(-1) / 8) * (std::max)((std::max)(fabs(An - xn), fabs(An - yn)), fabs(An - zn)); + T Q = pow(3 * boost::math::tools::epsilon(), T(-1) / 8) * BOOST_MATH_CUDA_SAFE_MAX(BOOST_MATH_CUDA_SAFE_MAX(fabs(An - xn), fabs(An - yn)), fabs(An - zn)); T fn = 1; @@ -149,7 +160,7 @@ namespace boost { namespace math { namespace detail{ } // namespace detail template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rf(T1 x, T2 y, T3 z, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -162,7 +173,7 @@ inline typename 
tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rf(T1 x, T2 y, T3 z) { return ellint_rf(x, y, z, policies::policy<>()); diff --git a/include/boost/math/special_functions/ellint_rg.hpp b/include/boost/math/special_functions/ellint_rg.hpp index bb5b7c376a..67497a2c43 100644 --- a/include/boost/math/special_functions/ellint_rg.hpp +++ b/include/boost/math/special_functions/ellint_rg.hpp @@ -18,20 +18,28 @@ #include #include +#ifdef __CUDA_ARCH__ +#include +#endif + namespace boost { namespace math { namespace detail{ template - T ellint_rg_imp(T x, T y, T z, const Policy& pol) + BOOST_GPU_ENABLED T ellint_rg_imp(T x, T y, T z, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_rf<%1%>(%1%,%1%,%1%)"; if(x < 0 || y < 0 || z < 0) { return policies::raise_domain_error(function, "domain error, all arguments must be non-negative, " "only sensible result is %1%.", +#ifdef __CUDA_ARCH__ + static_cast(CUDART_NAN), pol); +#else std::numeric_limits::quiet_NaN(), pol); +#endif } // // Function is symmetric in x, y and z, but we require @@ -40,14 +48,14 @@ namespace boost { namespace math { namespace detail{ // using std::swap; if(x < y) - swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); if(x < z) - swap(x, z); + BOOST_MATH_CUDA_SAFE_SWAP(x, z); if(y > z) - swap(y, z); + BOOST_MATH_CUDA_SAFE_SWAP(y, z); - BOOST_ASSERT(x >= z); - BOOST_ASSERT(z >= y); + BOOST_MATH_ASSERT(x >= z); + BOOST_MATH_ASSERT(z >= y); // // Special cases from http://dlmf.nist.gov/19.20#ii // @@ -67,7 +75,7 @@ namespace boost { namespace math { namespace detail{ else { // x = z, y != 0 - swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); return (x == 0) ? 
T(sqrt(z) / 2) : T((z * ellint_rc_imp(x, z, pol) + sqrt(x)) / 2); } } @@ -80,7 +88,7 @@ namespace boost { namespace math { namespace detail{ } else if(y == 0) { - swap(y, z); + BOOST_MATH_CUDA_SAFE_SWAP(y, z); // // Special handling for common case, from // Numerical Computation of Real or Complex Elliptic Integrals, eq.46 @@ -111,7 +119,7 @@ namespace boost { namespace math { namespace detail{ } // namespace detail template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rg(T1 x, T2 y, T3 z, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -124,7 +132,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rg(T1 x, T2 y, T3 z) { return ellint_rg(x, y, z, policies::policy<>()); diff --git a/include/boost/math/special_functions/ellint_rj.hpp b/include/boost/math/special_functions/ellint_rj.hpp index ac39bed678..bfded1a004 100644 --- a/include/boost/math/special_functions/ellint_rj.hpp +++ b/include/boost/math/special_functions/ellint_rj.hpp @@ -25,6 +25,10 @@ #include #include +#ifdef __CUDA_ARCH__ +#include +#endif + // Carlson's elliptic integral of the third kind // R_J(x, y, z, p) = 1.5 * \int_{0}^{\infty} (t+p)^{-1} [(t+x)(t+y)(t+z)]^{-1/2} dt // Carlson, Numerische Mathematik, vol 33, 1 (1979) @@ -32,13 +36,13 @@ namespace boost { namespace math { namespace detail{ template -T ellint_rc1p_imp(T y, const Policy& pol) +BOOST_GPU_ENABLED T ellint_rc1p_imp(T y, const Policy& pol) { using namespace boost::math; // Calculate RC(1, 1 + x) BOOST_MATH_STD_USING - static const char* function = "boost::math::ellint_rc<%1%>(%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_rc<%1%>(%1%,%1%)"; if(y == -1) { @@ -76,11 +80,11 @@ T ellint_rc1p_imp(T y, const Policy& pol) } template -T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) 
+BOOST_GPU_ENABLED T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::ellint_rj<%1%>(%1%,%1%,%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::ellint_rj<%1%>(%1%,%1%,%1%)"; if(x < 0) { @@ -106,7 +110,12 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) { return policies::raise_domain_error(function, "At most one argument can be zero, " - "only possible result is %1%.", std::numeric_limits::quiet_NaN(), pol); + "only possible result is %1%.", +#ifdef __CUDA_ARCH__ + static_cast(CUDART_NAN), pol); +#else + std::numeric_limits::quiet_NaN(), pol); +#endif } // for p < 0, the integral is singular, return Cauchy principal value @@ -117,20 +126,21 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) // Since the integral is symmetrical in x, y and z // we can just permute the values: // + using std::swap; if(x > y) - std::swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); if(y > z) - std::swap(y, z); + BOOST_MATH_CUDA_SAFE_SWAP(y, z); if(x > y) - std::swap(x, y); + BOOST_MATH_CUDA_SAFE_SWAP(x, y); - BOOST_ASSERT(x <= y); - BOOST_ASSERT(y <= z); + BOOST_MATH_ASSERT(x <= y); + BOOST_MATH_ASSERT(y <= z); T q = -p; p = (z * (x + y + q) - x * y) / (z + q); - BOOST_ASSERT(p >= 0); + BOOST_MATH_ASSERT(p >= 0); T value = (p - z) * ellint_rj_imp(x, y, z, p, pol); value -= 3 * ellint_rf_imp(x, y, z, pol); @@ -161,12 +171,12 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) { // x = y only, permute so y = z: using std::swap; - swap(x, z); + BOOST_MATH_CUDA_SAFE_SWAP(x, z); if(y == p) { return ellint_rd_imp(x, y, y, pol); } - else if((std::max)(y, p) / (std::min)(y, p) > 1.2) + else if(BOOST_MATH_CUDA_SAFE_MAX(y, p) / BOOST_MATH_CUDA_SAFE_MIN(y, p) > 1.2) { return 3 * (ellint_rc_imp(x, y, pol) - ellint_rc_imp(x, p, pol)) / (p - y); } @@ -180,7 +190,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) // y = z = p: return ellint_rd_imp(x, y, y, pol); } - else 
if((std::max)(y, p) / (std::min)(y, p) > 1.2) + else if(BOOST_MATH_CUDA_SAFE_MAX(y, p) / BOOST_MATH_CUDA_SAFE_MIN(y, p) > 1.2) { // y = z: return 3 * (ellint_rc_imp(x, y, pol) - ellint_rc_imp(x, p, pol)) / (p - y); @@ -199,7 +209,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) T An = (x + y + z + 2 * p) / 5; T A0 = An; T delta = (p - x) * (p - y) * (p - z); - T Q = pow(tools::epsilon() / 5, -T(1) / 8) * (std::max)((std::max)(fabs(An - x), fabs(An - y)), (std::max)(fabs(An - z), fabs(An - p))); + T Q = pow(tools::epsilon() / 5, -T(1) / 8) * BOOST_MATH_CUDA_SAFE_MAX(BOOST_MATH_CUDA_SAFE_MAX(fabs(An - x), fabs(An - y)), BOOST_MATH_CUDA_SAFE_MAX(fabs(An - z), fabs(An - p))); unsigned n; T lambda; @@ -275,7 +285,7 @@ T ellint_rj_imp(T x, T y, T z, T p, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rj(T1 x, T2 y, T3 z, T4 p, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -290,7 +300,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rj(T1 x, T2 y, T3 z, T4 p) { return ellint_rj(x, y, z, p, policies::policy<>()); diff --git a/include/boost/math/special_functions/erf.hpp b/include/boost/math/special_functions/erf.hpp index eda223b5d2..c61d4b75c2 100644 --- a/include/boost/math/special_functions/erf.hpp +++ b/include/boost/math/special_functions/erf.hpp @@ -28,7 +28,7 @@ namespace detail template struct erf_asympt_series_t { - erf_asympt_series_t(T z) : xx(2 * -z * z), tk(1) + BOOST_GPU_ENABLED erf_asympt_series_t(T z) : xx(2 * -z * z), tk(1) { BOOST_MATH_STD_USING result = -exp(-z * z) / sqrt(boost::math::constants::pi()); @@ -37,7 +37,7 @@ struct erf_asympt_series_t typedef T result_type; - T operator()() + BOOST_GPU_ENABLED T operator()() { BOOST_MATH_STD_USING T r = result; @@ -56,33 +56,33 @@ 
struct erf_asympt_series_t // How large z has to be in order to ensure that the series converges: // template -inline float erf_asymptotic_limit_N(const T&) +inline BOOST_GPU_ENABLED float erf_asymptotic_limit_N(const T&) { return (std::numeric_limits::max)(); } -inline float erf_asymptotic_limit_N(const mpl::int_<24>&) +inline BOOST_GPU_ENABLED float erf_asymptotic_limit_N(const mpl::int_<24>&) { return 2.8F; } -inline float erf_asymptotic_limit_N(const mpl::int_<53>&) +inline BOOST_GPU_ENABLED float erf_asymptotic_limit_N(const mpl::int_<53>&) { return 4.3F; } -inline float erf_asymptotic_limit_N(const mpl::int_<64>&) +inline BOOST_GPU_ENABLED float erf_asymptotic_limit_N(const mpl::int_<64>&) { return 4.8F; } -inline float erf_asymptotic_limit_N(const mpl::int_<106>&) +inline BOOST_GPU_ENABLED float erf_asymptotic_limit_N(const mpl::int_<106>&) { return 6.5F; } -inline float erf_asymptotic_limit_N(const mpl::int_<113>&) +inline BOOST_GPU_ENABLED float erf_asymptotic_limit_N(const mpl::int_<113>&) { return 6.8F; } template -inline T erf_asymptotic_limit() +inline BOOST_GPU_ENABLED T erf_asymptotic_limit() { typedef typename policies::precision::type precision_type; typedef typename mpl::if_< @@ -176,20 +176,36 @@ T erf_imp(T z, bool invert, const Policy& pol, const Tag& t) } template -T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) +BOOST_GPU_ENABLED T erf_imp(T z, bool invert, const Policy&, const mpl::int_<53>&) { BOOST_MATH_STD_USING BOOST_MATH_INSTRUMENT_CODE("53-bit precision erf_imp called"); + int prefix_multiplier = 1; + int prefix_adder = 0; + if(z < 0) { + // Recursion is logically simpler here, but confuses static analyzers that need to be + // able to calculate the maximum program stack size at compile time (ie CUDA).
+ z = -z; if(!invert) - return -erf_imp(T(-z), invert, pol, t); + { + prefix_multiplier = -1; + // return -erf_imp(T(-z), invert, pol, t); + } else if(z < -0.5) - return 2 - erf_imp(T(-z), invert, pol, t); + { + prefix_adder = 2; + // return 2 - erf_imp(T(-z), invert, pol, t); + } else - return 1 + erf_imp(T(-z), false, pol, t); + { + invert = false; + prefix_adder = 1; + // return 1 + erf_imp(T(-z), false, pol, t); + } } T result; @@ -212,7 +228,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) } else { - static const T c = BOOST_MATH_BIG_CONSTANT(T, 53, 0.003379167095512573896158903121545171688); + BOOST_MATH_GPU_STATIC const T c = BOOST_MATH_BIG_CONSTANT(T, 53, 0.003379167095512573896158903121545171688); result = static_cast(z * 1.125f + z * c); } } @@ -223,15 +239,15 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) // Maximum Relative Change in Control Points: 1.155e-04 // Max Error found at double precision = 2.961182e-17 - static const T Y = 1.044948577880859375f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const T Y = 1.044948577880859375f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 53, 0.0834305892146531832907), BOOST_MATH_BIG_CONSTANT(T, 53, -0.338165134459360935041), BOOST_MATH_BIG_CONSTANT(T, 53, -0.0509990735146777432841), BOOST_MATH_BIG_CONSTANT(T, 53, -0.00772758345802133288487), BOOST_MATH_BIG_CONSTANT(T, 53, -0.000322780120964605683831), }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 0.455004033050794024546), BOOST_MATH_BIG_CONSTANT(T, 53, 0.0875222600142252549554), @@ -254,8 +270,8 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) // Expected Error Term: 3.702e-17 // Maximum Relative Change in Control Points: 2.845e-04 // Max Error found at double precision = 4.841816e-17 - static const T Y = 0.405935764312744140625f; - static const T P[] = { + 
BOOST_MATH_GPU_STATIC const T Y = 0.405935764312744140625f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 53, -0.098090592216281240205), BOOST_MATH_BIG_CONSTANT(T, 53, 0.178114665841120341155), BOOST_MATH_BIG_CONSTANT(T, 53, 0.191003695796775433986), @@ -263,7 +279,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) BOOST_MATH_BIG_CONSTANT(T, 53, 0.0195049001251218801359), BOOST_MATH_BIG_CONSTANT(T, 53, 0.00180424538297014223957), }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 1.84759070983002217845), BOOST_MATH_BIG_CONSTANT(T, 53, 1.42628004845511324508), @@ -287,8 +303,8 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) // Maximum Deviation Found: 3.909e-18 // Expected Error Term: 3.909e-18 // Maximum Relative Change in Control Points: 9.886e-05 - static const T Y = 0.50672817230224609375f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const T Y = 0.50672817230224609375f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 53, -0.0243500476207698441272), BOOST_MATH_BIG_CONSTANT(T, 53, 0.0386540375035707201728), BOOST_MATH_BIG_CONSTANT(T, 53, 0.04394818964209516296), @@ -296,7 +312,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) BOOST_MATH_BIG_CONSTANT(T, 53, 0.00323962406290842133584), BOOST_MATH_BIG_CONSTANT(T, 53, 0.000235839115596880717416), }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 1.53991494948552447182), BOOST_MATH_BIG_CONSTANT(T, 53, 0.982403709157920235114), @@ -320,8 +336,8 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) // Expected Error Term: 1.512e-17 // Maximum Relative Change in Control Points: 2.222e-04 // Max Error found at double precision = 2.062515e-17 - static const T Y = 0.5405750274658203125f; - static const T P[] = { + 
BOOST_MATH_GPU_STATIC const T Y = 0.5405750274658203125f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 53, 0.00295276716530971662634), BOOST_MATH_BIG_CONSTANT(T, 53, 0.0137384425896355332126), BOOST_MATH_BIG_CONSTANT(T, 53, 0.00840807615555585383007), @@ -329,7 +345,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) BOOST_MATH_BIG_CONSTANT(T, 53, 0.000250269961544794627958), BOOST_MATH_BIG_CONSTANT(T, 53, 0.113212406648847561139e-4), }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 1.04217814166938418171), BOOST_MATH_BIG_CONSTANT(T, 53, 0.442597659481563127003), @@ -353,8 +369,8 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) // Maximum Deviation Found: 2.860e-17 // Expected Error Term: 2.859e-17 // Maximum Relative Change in Control Points: 1.357e-05 - static const T Y = 0.5579090118408203125f; - static const T P[] = { + BOOST_MATH_GPU_STATIC const T Y = 0.5579090118408203125f; + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 53, 0.00628057170626964891937), BOOST_MATH_BIG_CONSTANT(T, 53, 0.0175389834052493308818), BOOST_MATH_BIG_CONSTANT(T, 53, -0.212652252872804219852), @@ -363,7 +379,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) BOOST_MATH_BIG_CONSTANT(T, 53, -3.22729451764143718517), BOOST_MATH_BIG_CONSTANT(T, 53, -2.8175401114513378771), }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 53, 1.0), BOOST_MATH_BIG_CONSTANT(T, 53, 2.79257750980575282228), BOOST_MATH_BIG_CONSTANT(T, 53, 11.0567237927800161565), @@ -394,15 +410,16 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<53>& t) if(invert) { - result = 1 - result; + prefix_adder += prefix_multiplier * 1; + prefix_multiplier = -prefix_multiplier; } - return result; + return prefix_adder + prefix_multiplier * result; } // template T erf_imp(T z, 
bool invert, const Lanczos& l, const mpl::int_<53>& t) template -T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<64>& t) +BOOST_GPU_ENABLED T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<64>& t) { BOOST_MATH_STD_USING @@ -636,7 +653,7 @@ T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<64>& t) template -T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<113>& t) +BOOST_GPU_ENABLED T erf_imp(T z, bool invert, const Policy& pol, const mpl::int_<113>& t) { BOOST_MATH_STD_USING @@ -1146,9 +1163,11 @@ struct erf_initializer void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + static BOOST_GPU_ENABLED void force_instantiate() { +#ifndef __CUDA_ARCH__ initializer.force_instantiate(); +#endif } }; @@ -1158,7 +1177,7 @@ const typename erf_initializer::init erf_initializer -inline typename tools::promote_args::type erf(T z, const Policy& /* pol */) +inline BOOST_GPU_ENABLED typename tools::promote_args::type erf(T z, const Policy& /* pol */) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -1204,7 +1223,7 @@ inline typename tools::promote_args::type erf(T z, const Policy& /* pol */) } template -inline typename tools::promote_args::type erfc(T z, const Policy& /* pol */) +inline BOOST_GPU_ENABLED typename tools::promote_args::type erfc(T z, const Policy& /* pol */) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -1250,13 +1269,13 @@ inline typename tools::promote_args::type erfc(T z, const Policy& /* pol */) } template -inline typename tools::promote_args::type erf(T z) +inline BOOST_GPU_ENABLED typename tools::promote_args::type erf(T z) { return boost::math::erf(z, policies::policy<>()); } template -inline typename tools::promote_args::type erfc(T z) +inline BOOST_GPU_ENABLED typename tools::promote_args::type erfc(T z) { return 
boost::math::erfc(z, policies::policy<>()); } diff --git a/include/boost/math/special_functions/expint.hpp b/include/boost/math/special_functions/expint.hpp index fc656f3355..da6c7796fa 100644 --- a/include/boost/math/special_functions/expint.hpp +++ b/include/boost/math/special_functions/expint.hpp @@ -34,7 +34,7 @@ template inline T expint_1_rational(const T& z, const mpl::int_<0>&) { // this function is never actually called - BOOST_ASSERT(0); + BOOST_MATH_ASSERT(0); return z; } diff --git a/include/boost/math/special_functions/expm1.hpp b/include/boost/math/special_functions/expm1.hpp index 9cecc89756..c65d7b9b4b 100644 --- a/include/boost/math/special_functions/expm1.hpp +++ b/include/boost/math/special_functions/expm1.hpp @@ -41,10 +41,10 @@ namespace detail { typedef T result_type; - expm1_series(T x) + BOOST_GPU_ENABLED expm1_series(T x) : k(0), m_x(x), m_term(1) {} - T operator()() + BOOST_GPU_ENABLED T operator()() { ++k; m_term *= m_x; @@ -52,7 +52,7 @@ namespace detail return m_term; } - int count()const + BOOST_GPU_ENABLED int count()const { return k; } @@ -87,9 +87,11 @@ struct expm1_initializer void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + static BOOST_GPU_ENABLED void force_instantiate() { +#ifndef __CUDA_ARCH__ initializer.force_instantiate(); +#endif } }; @@ -102,7 +104,7 @@ const typename expm1_initializer::init expm1_initializer |x| > epsilon. 
// template -T expm1_imp(T x, const mpl::int_<0>&, const Policy& pol) +BOOST_GPU_ENABLED T expm1_imp(T x, const mpl::int_<0>&, const Policy& pol) { BOOST_MATH_STD_USING @@ -136,7 +138,7 @@ T expm1_imp(T x, const mpl::int_<0>&, const Policy& pol) } template -T expm1_imp(T x, const mpl::int_<53>&, const P& pol) +BOOST_GPU_ENABLED T expm1_imp(T x, const mpl::int_<53>&, const P& pol) { BOOST_MATH_STD_USING @@ -154,16 +156,16 @@ T expm1_imp(T x, const mpl::int_<53>&, const P& pol) if(a < tools::epsilon()) return x; - static const float Y = 0.10281276702880859e1f; - static const T n[] = { static_cast(-0.28127670288085937e-1), static_cast(0.51278186299064534e0), static_cast(-0.6310029069350198e-1), static_cast(0.11638457975729296e-1), static_cast(-0.52143390687521003e-3), static_cast(0.21491399776965688e-4) }; - static const T d[] = { 1, static_cast(-0.45442309511354755e0), static_cast(0.90850389570911714e-1), static_cast(-0.10088963629815502e-1), static_cast(0.63003407478692265e-3), static_cast(-0.17976570003654402e-4) }; + BOOST_MATH_GPU_STATIC const float Y = 0.10281276702880859e1f; + BOOST_MATH_GPU_STATIC const T n[] = { static_cast(-0.28127670288085937e-1), static_cast(0.51278186299064534e0), static_cast(-0.6310029069350198e-1), static_cast(0.11638457975729296e-1), static_cast(-0.52143390687521003e-3), static_cast(0.21491399776965688e-4) }; + BOOST_MATH_GPU_STATIC const T d[] = { 1, static_cast(-0.45442309511354755e0), static_cast(0.90850389570911714e-1), static_cast(-0.10088963629815502e-1), static_cast(0.63003407478692265e-3), static_cast(-0.17976570003654402e-4) }; T result = x * Y + x * tools::evaluate_polynomial(n, x) / tools::evaluate_polynomial(d, x); return result; } template -T expm1_imp(T x, const mpl::int_<64>&, const P& pol) +BOOST_GPU_ENABLED T expm1_imp(T x, const mpl::int_<64>&, const P& pol) { BOOST_MATH_STD_USING @@ -181,8 +183,8 @@ T expm1_imp(T x, const mpl::int_<64>&, const P& pol) if(a < tools::epsilon()) return x; - static const float Y = 
0.10281276702880859375e1f; - static const T n[] = { + BOOST_MATH_GPU_STATIC const float Y = 0.10281276702880859375e1f; + BOOST_MATH_GPU_STATIC const T n[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.281276702880859375e-1), BOOST_MATH_BIG_CONSTANT(T, 64, 0.512980290285154286358e0), BOOST_MATH_BIG_CONSTANT(T, 64, -0.667758794592881019644e-1), @@ -191,7 +193,7 @@ T expm1_imp(T x, const mpl::int_<64>&, const P& pol) BOOST_MATH_BIG_CONSTANT(T, 64, 0.447441185192951335042e-4), BOOST_MATH_BIG_CONSTANT(T, 64, -0.714539134024984593011e-6) }; - static const T d[] = { + BOOST_MATH_GPU_STATIC const T d[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, -0.461477618025562520389e0), BOOST_MATH_BIG_CONSTANT(T, 64, 0.961237488025708540713e-1), @@ -206,7 +208,7 @@ T expm1_imp(T x, const mpl::int_<64>&, const P& pol) } template -T expm1_imp(T x, const mpl::int_<113>&, const P& pol) +BOOST_GPU_ENABLED T expm1_imp(T x, const mpl::int_<113>&, const P& pol) { BOOST_MATH_STD_USING @@ -224,8 +226,8 @@ T expm1_imp(T x, const mpl::int_<113>&, const P& pol) if(a < tools::epsilon()) return x; - static const float Y = 0.10281276702880859375e1f; - static const T n[] = { + BOOST_MATH_GPU_STATIC const float Y = 0.10281276702880859375e1f; + BOOST_MATH_GPU_STATIC const T n[] = { BOOST_MATH_BIG_CONSTANT(T, 113, -0.28127670288085937499999999999999999854e-1), BOOST_MATH_BIG_CONSTANT(T, 113, 0.51278156911210477556524452177540792214e0), BOOST_MATH_BIG_CONSTANT(T, 113, -0.63263178520747096729500254678819588223e-1), @@ -237,7 +239,7 @@ T expm1_imp(T x, const mpl::int_<113>&, const P& pol) BOOST_MATH_BIG_CONSTANT(T, 113, -0.15995603306536496772374181066765665596e-8), BOOST_MATH_BIG_CONSTANT(T, 113, 0.45261820069007790520447958280473183582e-10) }; - static const T d[] = { + BOOST_MATH_GPU_STATIC const T d[] = { BOOST_MATH_BIG_CONSTANT(T, 113, 1.0), BOOST_MATH_BIG_CONSTANT(T, 113, -0.45441264709074310514348137469214538853e0), BOOST_MATH_BIG_CONSTANT(T, 113, 
0.96827131936192217313133611655555298106e-1), @@ -258,7 +260,7 @@ T expm1_imp(T x, const mpl::int_<113>&, const P& pol) } // namespace detail template -inline typename tools::promote_args::type expm1(T x, const Policy& /* pol */) +inline BOOST_GPU_ENABLED typename tools::promote_args::type expm1(T x, const Policy& /* pol */) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -304,18 +306,18 @@ inline typename tools::promote_args::type expm1(T x, const Policy& /* pol */) #if defined(BOOST_HAS_EXPM1) && !(defined(__osf__) && defined(__DECCXX_VER)) # ifdef BOOST_MATH_USE_C99 -inline float expm1(float x, const policies::policy<>&){ return ::expm1f(x); } +inline BOOST_GPU_ENABLED float expm1(float x, const policies::policy<>&){ return ::expm1f(x); } # ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS -inline long double expm1(long double x, const policies::policy<>&){ return ::expm1l(x); } +inline BOOST_GPU_ENABLED long double expm1(long double x, const policies::policy<>&){ return ::expm1l(x); } # endif # else -inline float expm1(float x, const policies::policy<>&){ return static_cast(::expm1(x)); } +inline BOOST_GPU_ENABLED float expm1(float x, const policies::policy<>&){ return static_cast(::expm1(x)); } # endif -inline double expm1(double x, const policies::policy<>&){ return ::expm1(x); } +inline BOOST_GPU_ENABLED double expm1(double x, const policies::policy<>&){ return ::expm1(x); } #endif template -inline typename tools::promote_args::type expm1(T x) +inline BOOST_GPU_ENABLED typename tools::promote_args::type expm1(T x) { return expm1(x, policies::policy<>()); } diff --git a/include/boost/math/special_functions/factorials.hpp b/include/boost/math/special_functions/factorials.hpp index e36a098bb6..92cf55ffea 100644 --- a/include/boost/math/special_functions/factorials.hpp +++ b/include/boost/math/special_functions/factorials.hpp @@ -27,7 +27,7 @@ namespace boost { namespace math { template -inline T 
factorial(unsigned i, const Policy& pol) +inline BOOST_GPU_ENABLED T factorial(unsigned i, const Policy& pol) { BOOST_STATIC_ASSERT(!boost::is_integral::value); // factorial(n) is not implemented @@ -48,7 +48,7 @@ inline T factorial(unsigned i, const Policy& pol) } template -inline T factorial(unsigned i) +inline BOOST_GPU_ENABLED T factorial(unsigned i) { return factorial(i, policies::policy<>()); } @@ -71,7 +71,7 @@ inline double factorial(unsigned i) } */ template -T double_factorial(unsigned i, const Policy& pol) +BOOST_GPU_ENABLED T double_factorial(unsigned i, const Policy& pol) { BOOST_STATIC_ASSERT(!boost::is_integral::value); BOOST_MATH_STD_USING // ADL lookup of std names @@ -106,7 +106,7 @@ T double_factorial(unsigned i, const Policy& pol) } template -inline T double_factorial(unsigned i) +inline BOOST_GPU_ENABLED T double_factorial(unsigned i) { return double_factorial(i, policies::policy<>()); } @@ -114,7 +114,7 @@ inline T double_factorial(unsigned i) namespace detail{ template -T rising_factorial_imp(T x, int n, const Policy& pol) +BOOST_GPU_ENABLED T rising_factorial_imp(T x, int n, const Policy& pol) { BOOST_STATIC_ASSERT(!boost::is_integral::value); if(x < 0) @@ -162,11 +162,11 @@ T rising_factorial_imp(T x, int n, const Policy& pol) } template -inline T falling_factorial_imp(T x, unsigned n, const Policy& pol) +inline BOOST_GPU_ENABLED T falling_factorial_imp(T x, unsigned n, const Policy& pol) { BOOST_STATIC_ASSERT(!boost::is_integral::value); BOOST_MATH_STD_USING // ADL of std names - if((x == 0) && (n >= 0)) + if(x == 0) return 0; if(x < 0) { @@ -226,7 +226,7 @@ inline T falling_factorial_imp(T x, unsigned n, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type falling_factorial(RT x, unsigned n) { typedef typename tools::promote_args::type result_type; @@ -235,7 +235,7 @@ inline typename tools::promote_args::type } template -inline typename 
tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type falling_factorial(RT x, unsigned n, const Policy& pol) { typedef typename tools::promote_args::type result_type; @@ -244,7 +244,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type rising_factorial(RT x, int n) { typedef typename tools::promote_args::type result_type; @@ -253,7 +253,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type rising_factorial(RT x, int n, const Policy& pol) { typedef typename tools::promote_args::type result_type; diff --git a/include/boost/math/special_functions/fpclassify.hpp b/include/boost/math/special_functions/fpclassify.hpp index d83e111c48..b93f76efe6 100644 --- a/include/boost/math/special_functions/fpclassify.hpp +++ b/include/boost/math/special_functions/fpclassify.hpp @@ -18,6 +18,11 @@ #include #include #include + +#ifdef __CUDA_ARCH__ +#include +#endif + /*! \file fpclassify.hpp \brief Classify floating-point value as normal, subnormal, zero, infinite, or NaN. 
@@ -633,6 +638,78 @@ inline bool (isnan)(__float128 x) } #endif +#ifdef __CUDA_ARCH__ + +template<> inline BOOST_GPU_ENABLED bool (isfinite)(float x) { return ::isfinite(x); } +template<> inline BOOST_GPU_ENABLED bool (isfinite)(double x) { return ::isfinite(x); } + +template<> inline BOOST_GPU_ENABLED bool (isnan)(float x) { return ::isnan(x); } +template<> inline BOOST_GPU_ENABLED bool (isnan)(double x) { return ::isnan(x); } + +template<> inline BOOST_GPU_ENABLED bool (isinf)(float x) { return ::isinf(x); } +template<> inline BOOST_GPU_ENABLED bool (isinf)(double x) { return ::isinf(x); } + +template<> inline BOOST_GPU_ENABLED bool (isnormal)(float x) +{ + if(x < 0) x = -x; + return (x >= FLT_MIN) && (x <= FLT_MAX); +} +template<> inline BOOST_GPU_ENABLED bool (isnormal)(double x) +{ + if(x < 0) x = -x; + return (x >= DBL_MIN) && (x <= DBL_MAX); +} + +template<> inline BOOST_GPU_ENABLED int (fpclassify)(float t) +{ + if((boost::math::isnan)(t)) + return FP_NAN; + // std::fabs broken on a few systems especially for long long!!!! + float at = (t < 0.0f) ? -t : t; + + // Use a process of exclusion to figure out + // what kind of type we have, this relies on + // IEEE conforming reals that will treat + // Nan's as unordered. Some compilers + // don't do this once optimisations are + // turned on, hence the check for nan's above. + if(at <= FLT_MAX) + { + if(at >= FLT_MIN) + return FP_NORMAL; + return (at != 0) ? FP_SUBNORMAL : FP_ZERO; + } + else if(at > FLT_MAX) + return FP_INFINITE; + return FP_NAN; +} + +template<> inline BOOST_GPU_ENABLED int (fpclassify)(double t) +{ + if((boost::math::isnan)(t)) + return FP_NAN; + // std::fabs broken on a few systems especially for long long!!!! + double at = (t < 0.0) ? -t : t; + + // Use a process of exclusion to figure out + // what kind of type we have, this relies on + // IEEE conforming reals that will treat + // Nan's as unordered. 
Some compilers + // don't do this once optimisations are + // turned on, hence the check for nan's above. + if(at <= DBL_MAX) + { + if(at >= DBL_MIN) + return FP_NORMAL; + return (at != 0) ? FP_SUBNORMAL : FP_ZERO; + } + else if(at > DBL_MAX) + return FP_INFINITE; + return FP_NAN; +} + +#endif + } // namespace math } // namespace boost diff --git a/include/boost/math/special_functions/gamma.hpp b/include/boost/math/special_functions/gamma.hpp index 1903b997cb..1203f33d46 100644 --- a/include/boost/math/special_functions/gamma.hpp +++ b/include/boost/math/special_functions/gamma.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -59,13 +60,13 @@ namespace boost{ namespace math{ namespace detail{ template -inline bool is_odd(T v, const boost::true_type&) +inline BOOST_GPU_ENABLED bool is_odd(T v, const boost::true_type&) { int i = static_cast(v); return i&1; } template -inline bool is_odd(T v, const boost::false_type&) +inline BOOST_GPU_ENABLED bool is_odd(T v, const boost::false_type&) { // Oh dear can't cast T to int! BOOST_MATH_STD_USING @@ -73,13 +74,13 @@ inline bool is_odd(T v, const boost::false_type&) return static_cast(modulus != 0); } template -inline bool is_odd(T v) +inline BOOST_GPU_ENABLED bool is_odd(T v) { return is_odd(v, ::boost::is_convertible()); } template -T sinpx(T z) +BOOST_GPU_ENABLED T sinpx(T z) { // Ad hoc function calculates x * sin(pi * x), // taking extra care near when x is near a whole number. 
@@ -101,7 +102,7 @@ T sinpx(T z) { dist = z - fl; } - BOOST_ASSERT(fl >= 0); + BOOST_MATH_ASSERT(fl >= 0); if(dist > 0.5) dist = 1 - dist; T result = sin(dist*boost::math::constants::pi()); @@ -111,57 +112,19 @@ T sinpx(T z) // tgamma(z), with Lanczos support: // template -T gamma_imp(T z, const Policy& pol, const Lanczos& l) +BOOST_GPU_ENABLED T gamma_positive_imp(T z, const Policy& pol, const Lanczos& l, const char* function) { BOOST_MATH_STD_USING - T result = 1; - -#ifdef BOOST_MATH_INSTRUMENT - static bool b = false; - if(!b) - { - std::cout << "tgamma_imp called with " << typeid(z).name() << " " << typeid(l).name() << std::endl; - b = true; - } -#endif - static const char* function = "boost::math::tgamma<%1%>(%1%)"; - - if(z <= 0) - { - if(floor(z) == z) - return policies::raise_pole_error(function, "Evaluation of tgamma at a negative integer %1%.", z, pol); - if(z <= -20) - { - result = gamma_imp(T(-z), pol, l) * sinpx(z); - BOOST_MATH_INSTRUMENT_VARIABLE(result); - if((fabs(result) < 1) && (tools::max_value() * fabs(result) < boost::math::constants::pi())) - return -boost::math::sign(result) * policies::raise_overflow_error(function, "Result of tgamma is too large to represent.", pol); - result = -boost::math::constants::pi() / result; - if(result == 0) - return policies::raise_underflow_error(function, "Result of tgamma is too small to represent.", pol); - if((boost::math::fpclassify)(result) == (int)FP_SUBNORMAL) - return policies::raise_denorm_error(function, "Result of tgamma is denormalized.", result, pol); - BOOST_MATH_INSTRUMENT_VARIABLE(result); - return result; - } - - // shift z to > 1: - while(z < 0) - { - result /= z; - z += 1; - } - } BOOST_MATH_INSTRUMENT_VARIABLE(result); if((floor(z) == z) && (z < max_factorial::value)) { result *= unchecked_factorial(itrunc(z, pol) - 1); BOOST_MATH_INSTRUMENT_VARIABLE(result); } - else if (z < tools::root_epsilon()) + else if(z < tools::root_epsilon()) { - if (z < 1 / tools::max_value()) + if(z < 1 / 
tools::max_value()) result = policies::raise_overflow_error(function, 0, pol); result *= 1 / z - constants::euler(); } @@ -198,14 +161,60 @@ T gamma_imp(T z, const Policy& pol, const Lanczos& l) } return result; } + +template +BOOST_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const Lanczos& l) +{ + BOOST_MATH_STD_USING + + T result = 1; + +#ifdef BOOST_MATH_INSTRUMENT + BOOST_MATH_GPU_STATIC bool b = false; + if(!b) + { + std::cout << "tgamma_imp called with " << typeid(z).name() << " " << typeid(l).name() << std::endl; + b = true; + } +#endif + BOOST_MATH_GPU_STATIC const char* function = "boost::math::tgamma<%1%>(%1%)"; + + if(z <= 0) + { + if(floor(z) == z) + return policies::raise_pole_error(function, "Evaluation of tgamma at a negative integer %1%.", z, pol); + if(z <= -20) + { + result = gamma_positive_imp(T(-z), pol, l, function); + result *= sinpx(z); + BOOST_MATH_INSTRUMENT_VARIABLE(result); + if((fabs(result) < 1) && (tools::max_value() * fabs(result) < boost::math::constants::pi())) + return -boost::math::sign(result) * policies::raise_overflow_error(function, "Result of tgamma is too large to represent.", pol); + result = -boost::math::constants::pi() / result; + if(result == 0) + return policies::raise_underflow_error(function, "Result of tgamma is too small to represent.", pol); + if((boost::math::fpclassify)(result) == (int)FP_SUBNORMAL) + return policies::raise_denorm_error(function, "Result of tgamma is denormalized.", result, pol); + BOOST_MATH_INSTRUMENT_VARIABLE(result); + return result; + } + // shift z to > 1: + while(z < 0) + { + result /= z; + z += 1; + } + } + return result * gamma_positive_imp(z, pol, l, function); +} // // lgamma(z) with Lanczos support: // template -T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = 0) +BOOST_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = 0) { #ifdef BOOST_MATH_INSTRUMENT - static bool b = false; + BOOST_MATH_GPU_STATIC bool b = false; if(!b) { 
std::cout << "lgamma_imp called with " << typeid(z).name() << " " << typeid(l).name() << std::endl; @@ -215,7 +224,7 @@ T lgamma_imp(T z, const Policy& pol, const Lanczos& l, int* sign = 0) BOOST_MATH_STD_USING - static const char* function = "boost::math::lgamma<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::lgamma<%1%>(%1%)"; T result = 0; int sresult = 1; @@ -295,14 +304,14 @@ struct upper_incomplete_gamma_fract T z, a; int k; public: - typedef std::pair result_type; + typedef boost::math::pair result_type; - upper_incomplete_gamma_fract(T a1, T z1) + BOOST_GPU_ENABLED upper_incomplete_gamma_fract(T a1, T z1) : z(z1-a1+1), a(a1), k(0) { } - result_type operator()() + BOOST_GPU_ENABLED result_type operator()() { ++k; z += 2; @@ -311,7 +320,7 @@ struct upper_incomplete_gamma_fract }; template -inline T upper_gamma_fraction(T a, T z, T eps) +inline BOOST_GPU_ENABLED T upper_gamma_fraction(T a, T z, T eps) { // Multiply result by z^a * e^-z to get the full // upper incomplete integral. Divide by tgamma(z) @@ -327,9 +336,9 @@ struct lower_incomplete_gamma_series T a, z, result; public: typedef T result_type; - lower_incomplete_gamma_series(T a1, T z1) : a(a1), z(z1), result(1){} + BOOST_GPU_ENABLED lower_incomplete_gamma_series(T a1, T z1) : a(a1), z(z1), result(1){} - T operator()() + BOOST_GPU_ENABLED T operator()() { T r = result; a += 1; @@ -339,7 +348,7 @@ struct lower_incomplete_gamma_series }; template -inline T lower_gamma_series(T a, T z, const Policy& pol, T init_value = 0) +inline BOOST_GPU_ENABLED T lower_gamma_series(T a, T z, const Policy& pol, T init_value = 0) { // Multiply result by ((z^a) * (e^-z) / a) to get the full // lower incomplete integral. Then divide by tgamma(a) @@ -357,7 +366,7 @@ inline T lower_gamma_series(T a, T z, const Policy& pol, T init_value = 0) // with Bernoulli numbers. 
// template -std::size_t highest_bernoulli_index() +BOOST_GPU_ENABLED std::size_t highest_bernoulli_index() { const float digits10_of_type = (std::numeric_limits::is_specialized ? static_cast(std::numeric_limits::digits10) @@ -368,7 +377,7 @@ std::size_t highest_bernoulli_index() } template -T minimum_argument_for_bernoulli_recursion() +BOOST_GPU_ENABLED T minimum_argument_for_bernoulli_recursion() { const float digits10_of_type = (std::numeric_limits::is_specialized ? static_cast(std::numeric_limits::digits10) @@ -379,14 +388,14 @@ T minimum_argument_for_bernoulli_recursion() // Forward declaration of the lgamma_imp template specialization. template -T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sign = 0); +BOOST_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sign = 0); template -T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&) +BOOST_GPU_ENABLED T gamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&) { BOOST_MATH_STD_USING - static const char* function = "boost::math::tgamma<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::tgamma<%1%>(%1%)"; // Check if the argument of tgamma is identically zero. const bool is_at_zero = (z == 0); @@ -527,11 +536,11 @@ inline T log_gamma_near_1(const T& z, Policy const& pol) } template -T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sign) +BOOST_GPU_ENABLED T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sign) { BOOST_MATH_STD_USING - static const char* function = "boost::math::lgamma<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::lgamma<%1%>(%1%)"; // Check if the argument of lgamma is identically zero. 
const bool is_at_zero = (z == 0); @@ -666,7 +675,7 @@ T lgamma_imp(T z, const Policy& pol, const lanczos::undefined_lanczos&, int* sig // used by the upper incomplete gamma with z < 1: // template -T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& l) +BOOST_GPU_ENABLED T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& l) { BOOST_MATH_STD_USING @@ -724,8 +733,8 @@ T tgammap1m1_imp(T dz, Policy const& pol, const Lanczos& l) } template -inline T tgammap1m1_imp(T z, Policy const& pol, - const ::boost::math::lanczos::undefined_lanczos&) +inline BOOST_GPU_ENABLED T tgammap1m1_imp(T z, Policy const& pol, + const ::boost::math::lanczos::undefined_lanczos& l) { BOOST_MATH_STD_USING // ADL of std names @@ -744,9 +753,9 @@ struct small_gamma2_series { typedef T result_type; - small_gamma2_series(T a_, T x_) : result(-x_), x(-x_), apn(a_+1), n(1){} + BOOST_GPU_ENABLED small_gamma2_series(T a_, T x_) : result(-x_), x(-x_), apn(a_+1), n(1){} - T operator()() + BOOST_GPU_ENABLED T operator()() { T r = result / (apn); result *= x; @@ -764,7 +773,7 @@ struct small_gamma2_series // incomplete gammas: // template -T full_igamma_prefix(T a, T z, const Policy& pol) +BOOST_GPU_ENABLED T full_igamma_prefix(T a, T z, const Policy& pol) { BOOST_MATH_STD_USING @@ -815,7 +824,7 @@ T full_igamma_prefix(T a, T z, const Policy& pol) // most if the error occurs in this function: // template -T regularised_gamma_prefix(T a, T z, const Policy& pol, const Lanczos& l) +BOOST_GPU_ENABLED T regularised_gamma_prefix(T a, T z, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING T agh = a + static_cast(Lanczos::g()) - T(0.5); @@ -836,7 +845,13 @@ T regularised_gamma_prefix(T a, T z, const Policy& pol, const Lanczos& l) if(z <= tools::log_min_value()) { // Oh dear, have to use logs, should be free of cancellation errors though: +#ifdef __CUDA_ARCH__ + // Even though boost::math::lgamma is definitely not recursive, using it here + // causes CUDA to think that this function is :( + 
return exp(a * log(z) - z - std::lgamma(a)); +#else return exp(a * log(z) - z - lgamma_imp(a, pol, l)); +#endif } else { @@ -860,16 +875,16 @@ T regularised_gamma_prefix(T a, T z, const Policy& pol, const Lanczos& l) // T alz = a * log(z / agh); T amz = a - z; - if(((std::min)(alz, amz) <= tools::log_min_value()) || ((std::max)(alz, amz) >= tools::log_max_value())) + if((BOOST_MATH_CUDA_SAFE_MIN(alz, amz) <= tools::log_min_value()) || (BOOST_MATH_CUDA_SAFE_MAX(alz, amz) >= tools::log_max_value())) { T amza = amz / a; - if(((std::min)(alz, amz)/2 > tools::log_min_value()) && ((std::max)(alz, amz)/2 < tools::log_max_value())) + if((BOOST_MATH_CUDA_SAFE_MIN(alz, amz)/2 > tools::log_min_value()) && (BOOST_MATH_CUDA_SAFE_MAX(alz, amz)/2 < tools::log_max_value())) { // compute square root of the result and then square it: T sq = pow(z / agh, a / 2) * exp(amz / 2); prefix = sq * sq; } - else if(((std::min)(alz, amz)/4 > tools::log_min_value()) && ((std::max)(alz, amz)/4 < tools::log_max_value()) && (z > a)) + else if((BOOST_MATH_CUDA_SAFE_MIN(alz, amz)/4 > tools::log_min_value()) && (BOOST_MATH_CUDA_SAFE_MAX(alz, amz)/4 < tools::log_max_value()) && (z > a)) { // compute the 4th root of the result then square it twice: T sq = pow(z / agh, a / 4) * exp(amz / 4); @@ -897,11 +912,11 @@ T regularised_gamma_prefix(T a, T z, const Policy& pol, const Lanczos& l) // And again, without Lanczos support: // template -T regularised_gamma_prefix(T a, T z, const Policy& pol, const lanczos::undefined_lanczos&) +BOOST_GPU_ENABLED T regularised_gamma_prefix(T a, T z, const Policy& pol, const lanczos::undefined_lanczos&) { BOOST_MATH_STD_USING - T limit = (std::max)(T(10), a); + T limit = BOOST_MATH_CUDA_SAFE_MAX(T(10), a); T sum = detail::lower_gamma_series(a, limit, pol) / a; sum += detail::upper_gamma_fraction(a, limit, ::boost::math::policies::get_epsilon()); @@ -922,7 +937,7 @@ T regularised_gamma_prefix(T a, T z, const Policy& pol, const lanczos::undefined T amz = a - z; T alzoa = a * 
log(zoa); T prefix; - if(((std::min)(alzoa, amz) <= tools::log_min_value()) || ((std::max)(alzoa, amz) >= tools::log_max_value())) + if((BOOST_MATH_CUDA_SAFE_MIN(alzoa, amz) <= tools::log_min_value()) || (BOOST_MATH_CUDA_SAFE_MAX(alzoa, amz) >= tools::log_max_value())) { T amza = amz / a; if((amza <= tools::log_min_value()) || (amza >= tools::log_max_value())) @@ -945,7 +960,7 @@ T regularised_gamma_prefix(T a, T z, const Policy& pol, const lanczos::undefined // Upper gamma fraction for very small a: // template -inline T tgamma_small_upper_part(T a, T x, const Policy& pol, T* pgam = 0, bool invert = false, T* pderivative = 0) +BOOST_GPU_ENABLED inline T tgamma_small_upper_part(T a, T x, const Policy& pol, T* pgam = 0, bool invert = false, T* pderivative = 0) { BOOST_MATH_STD_USING // ADL of std functions. // @@ -974,7 +989,7 @@ inline T tgamma_small_upper_part(T a, T x, const Policy& pol, T* pgam = 0, bool // Upper gamma fraction for integer a: // template -inline T finite_gamma_q(T a, T x, Policy const& pol, T* pderivative = 0) +BOOST_GPU_ENABLED inline T finite_gamma_q(T a, T x, Policy const& pol, T* pderivative = 0) { // // Calculates normalised Q when a is an integer: @@ -1002,7 +1017,7 @@ inline T finite_gamma_q(T a, T x, Policy const& pol, T* pderivative = 0) // Upper gamma fraction for half integer a: // template -T finite_half_gamma_q(T a, T x, T* p_derivative, const Policy& pol) +BOOST_GPU_ENABLED T finite_half_gamma_q(T a, T x, T* p_derivative, const Policy& pol) { // // Calculates normalised Q when a is a half-integer: @@ -1013,7 +1028,7 @@ T finite_half_gamma_q(T a, T x, T* p_derivative, const Policy& pol) { T term = exp(-x) / sqrt(constants::pi() * x); term *= x; - static const T half = T(1) / 2; + BOOST_MATH_GPU_STATIC const T half = T(1) / 2; term /= half; T sum = term; for(unsigned n = 2; n < a; ++n) @@ -1039,10 +1054,10 @@ T finite_half_gamma_q(T a, T x, T* p_derivative, const Policy& pol) // Main incomplete gamma entry point, handles all four 
incomplete gamma's: // template -T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, +BOOST_GPU_ENABLED T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, const Policy& pol, T* p_derivative) { - static const char* function = "boost::math::gamma_p<%1%>(%1%, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::gamma_p<%1%>(%1%, %1%)"; if(a <= 0) return policies::raise_domain_error(function, "Argument a to the incomplete gamma function must be greater than zero (got a=%1%).", a, pol); if(x < 0) @@ -1054,6 +1069,7 @@ T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, T result = 0; // Just to avoid warning C4701: potentially uninitialized local variable 'result' used +#ifndef __CUDA_ARCH__ // this branch causes recursion on CUDA if(a >= max_factorial::value && !normalised) { // @@ -1117,8 +1133,9 @@ T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, return policies::raise_overflow_error(function, 0, pol); return exp(result); } +#endif // CUDA - BOOST_ASSERT((p_derivative == 0) || (normalised == true)); + BOOST_MATH_ASSERT((p_derivative == 0) || (normalised == true)); bool is_int, is_half_int; bool is_small_a = (a < 30) && (a <= x + 1) && (x < tools::log_max_value()); @@ -1396,7 +1413,7 @@ T gamma_incomplete_imp(T a, T x, bool normalised, bool invert, // Ratios of two gamma functions: // template -T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l) +BOOST_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& l) { BOOST_MATH_STD_USING if(z < tools::epsilon()) @@ -1451,7 +1468,7 @@ T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const Lanczos& // And again without Lanczos support this time: // template -T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const lanczos::undefined_lanczos&) +BOOST_GPU_ENABLED T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const lanczos::undefined_lanczos&) { 
BOOST_MATH_STD_USING // @@ -1488,7 +1505,7 @@ T tgamma_delta_ratio_imp_lanczos(T z, T delta, const Policy& pol, const lanczos: } template -T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol) +BOOST_GPU_ENABLED T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol) { BOOST_MATH_STD_USING @@ -1546,7 +1563,7 @@ T tgamma_delta_ratio_imp(T z, T delta, const Policy& pol) } template -T tgamma_ratio_imp(T x, T y, const Policy& pol) +BOOST_GPU_ENABLED T tgamma_ratio_imp(T x, T y, const Policy& pol) { BOOST_MATH_STD_USING @@ -1615,7 +1632,7 @@ T tgamma_ratio_imp(T x, T y, const Policy& pol) } template -T gamma_p_derivative_imp(T a, T x, const Policy& pol) +BOOST_GPU_ENABLED T gamma_p_derivative_imp(T a, T x, const Policy& pol) { BOOST_MATH_STD_USING // @@ -1656,7 +1673,7 @@ T gamma_p_derivative_imp(T a, T x, const Policy& pol) } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma(T z, const Policy& /* pol */, const mpl::true_) { BOOST_FPU_EXCEPTION_GUARD @@ -1714,9 +1731,11 @@ struct igamma_initializer void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + static BOOST_GPU_ENABLED void force_instantiate() { +#ifndef __CUDA_ARCH__ initializer.force_instantiate(); +#endif } }; @@ -1765,9 +1784,11 @@ struct lgamma_initializer void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + static BOOST_GPU_ENABLED void force_instantiate() { +#ifndef __CUDA_ARCH__ initializer.force_instantiate(); +#endif } }; @@ -1775,7 +1796,7 @@ template const typename lgamma_initializer::init lgamma_initializer::initializer; template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma(T1 a, T2 z, const Policy&, const mpl::false_) { BOOST_FPU_EXCEPTION_GUARD @@ -1798,7 +1819,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type 
+inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma(T1 a, T2 z, const mpl::false_ tag) { return tgamma(a, z, policies::policy<>(), tag); @@ -1808,14 +1829,14 @@ inline typename tools::promote_args::type } // namespace detail template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma(T z) { return tgamma(z, policies::policy<>()); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type lgamma(T z, int* sign, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -1835,28 +1856,28 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type lgamma(T z, int* sign) { return lgamma(z, sign, policies::policy<>()); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type lgamma(T x, const Policy& pol) { return ::boost::math::lgamma(x, 0, pol); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type lgamma(T x) { return ::boost::math::lgamma(x, 0, policies::policy<>()); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma1pm1(T z, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD @@ -1874,7 +1895,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma1pm1(T z) { return tgamma1pm1(z, policies::policy<>()); @@ -1884,7 +1905,7 @@ inline typename tools::promote_args::type // Full upper incomplete gamma: // template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma(T1 a, T2 z) { // @@ -1895,7 +1916,7 @@ inline typename tools::promote_args::type return detail::tgamma(a, z, maybe_policy()); } template 
-inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma(T1 a, T2 z, const Policy& pol) { return detail::tgamma(a, z, pol, mpl::false_()); @@ -1904,7 +1925,7 @@ inline typename tools::promote_args::type // Full lower incomplete gamma: // template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_lower(T1 a, T2 z, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -1926,7 +1947,7 @@ inline typename tools::promote_args::type forwarding_policy(), static_cast(0)), "tgamma_lower<%1%>(%1%, %1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_lower(T1 a, T2 z) { return tgamma_lower(a, z, policies::policy<>()); @@ -1935,7 +1956,7 @@ inline typename tools::promote_args::type // Regularised upper incomplete gamma: // template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type gamma_q(T1 a, T2 z, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD @@ -1957,7 +1978,7 @@ inline typename tools::promote_args::type forwarding_policy(), static_cast(0)), "gamma_q<%1%>(%1%, %1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type gamma_q(T1 a, T2 z) { return gamma_q(a, z, policies::policy<>()); @@ -1966,7 +1987,7 @@ inline typename tools::promote_args::type // Regularised lower incomplete gamma: // template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type gamma_p(T1 a, T2 z, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -1988,7 +2009,7 @@ inline typename tools::promote_args::type forwarding_policy(), static_cast(0)), "gamma_p<%1%>(%1%, %1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type gamma_p(T1 a, T2 z) { return gamma_p(a, z, policies::policy<>()); @@ 
-1996,7 +2017,7 @@ inline typename tools::promote_args::type // ratios of gamma functions: template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_delta_ratio(T1 z, T2 delta, const Policy& /* pol */) { BOOST_FPU_EXCEPTION_GUARD @@ -2012,13 +2033,13 @@ inline typename tools::promote_args::type return policies::checked_narrowing_cast(detail::tgamma_delta_ratio_imp(static_cast(z), static_cast(delta), forwarding_policy()), "boost::math::tgamma_delta_ratio<%1%>(%1%, %1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_delta_ratio(T1 z, T2 delta) { return tgamma_delta_ratio(z, delta, policies::policy<>()); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_ratio(T1 a, T2 b, const Policy&) { typedef typename tools::promote_args::type result_type; @@ -2033,14 +2054,14 @@ inline typename tools::promote_args::type return policies::checked_narrowing_cast(detail::tgamma_ratio_imp(static_cast(a), static_cast(b), forwarding_policy()), "boost::math::tgamma_delta_ratio<%1%>(%1%, %1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_ratio(T1 a, T2 b) { return tgamma_ratio(a, b, policies::policy<>()); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type gamma_p_derivative(T1 a, T2 x, const Policy&) { BOOST_FPU_EXCEPTION_GUARD @@ -2056,7 +2077,7 @@ inline typename tools::promote_args::type return policies::checked_narrowing_cast(detail::gamma_p_derivative_imp(static_cast(a), static_cast(x), forwarding_policy()), "boost::math::gamma_p_derivative<%1%>(%1%, %1%)"); } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type gamma_p_derivative(T1 a, T2 x) { return gamma_p_derivative(a, x, 
policies::policy<>()); diff --git a/include/boost/math/special_functions/heuman_lambda.hpp b/include/boost/math/special_functions/heuman_lambda.hpp index 6389443ee4..383d2e5275 100644 --- a/include/boost/math/special_functions/heuman_lambda.hpp +++ b/include/boost/math/special_functions/heuman_lambda.hpp @@ -27,13 +27,13 @@ namespace detail{ // Elliptic integral - Jacobi Zeta template -T heuman_lambda_imp(T phi, T k, const Policy& pol) +BOOST_GPU_ENABLED T heuman_lambda_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; using namespace boost::math::constants; - const char* function = "boost::math::heuman_lambda<%1%>(%1%, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::heuman_lambda<%1%>(%1%, %1%)"; if(fabs(k) > 1) return policies::raise_domain_error(function, "We require |k| <= 1 but got k = %1%", k, pol); @@ -68,7 +68,7 @@ T heuman_lambda_imp(T phi, T k, const Policy& pol) } // detail template -inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -76,7 +76,7 @@ inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, co } template -inline typename tools::promote_args::type heuman_lambda(T1 k, T2 phi) +inline BOOST_GPU_ENABLED typename tools::promote_args::type heuman_lambda(T1 k, T2 phi) { return boost::math::heuman_lambda(k, phi, policies::policy<>()); } diff --git a/include/boost/math/special_functions/jacobi_zeta.hpp b/include/boost/math/special_functions/jacobi_zeta.hpp index a3fa54746e..50837c3ac7 100644 --- a/include/boost/math/special_functions/jacobi_zeta.hpp +++ b/include/boost/math/special_functions/jacobi_zeta.hpp @@ -26,7 +26,7 @@ namespace detail{ // Elliptic integral - Jacobi Zeta template -T jacobi_zeta_imp(T phi, T k, 
const Policy& pol) +BOOST_GPU_ENABLED T jacobi_zeta_imp(T phi, T k, const Policy& pol) { BOOST_MATH_STD_USING using namespace boost::math::tools; @@ -55,7 +55,7 @@ T jacobi_zeta_imp(T phi, T k, const Policy& pol) } // detail template -inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, const Policy& pol) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -63,7 +63,7 @@ inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, cons } template -inline typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi) +inline BOOST_GPU_ENABLED typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi) { return boost::math::jacobi_zeta(k, phi, policies::policy<>()); } diff --git a/include/boost/math/special_functions/lanczos.hpp b/include/boost/math/special_functions/lanczos.hpp index c1ff86930b..30a873677b 100644 --- a/include/boost/math/special_functions/lanczos.hpp +++ b/include/boost/math/special_functions/lanczos.hpp @@ -74,10 +74,10 @@ struct lanczos6 : public mpl::int_<35> // double precision: // template - static T lanczos_sum(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[6] = { + BOOST_MATH_GPU_STATIC const T num[6] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 8706.349592549009182288174442774377925882)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 8523.650341121874633477483696775067709735)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 3338.029219476423550899999750161289306564)), @@ -85,7 +85,7 @@ struct lanczos6 : public mpl::int_<35> static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 63.99951844938187085666201263218840287667)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 2.506628274631006311133031631822390264407)) }; - static const 
BOOST_MATH_INT_TABLE_TYPE(T, boost::uint16_t) denom[6] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint16_t) denom[6] = { static_cast(0u), static_cast(24u), static_cast(50u), @@ -97,10 +97,10 @@ struct lanczos6 : public mpl::int_<35> } template - static T lanczos_sum_expG_scaled(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum_expG_scaled(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[6] = { + BOOST_MATH_GPU_STATIC const T num[6] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 32.81244541029783471623665933780748627823)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 32.12388941444332003446077108933558534361)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 12.58034729455216106950851080138931470954)), @@ -108,7 +108,7 @@ struct lanczos6 : public mpl::int_<35> static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 0.2412010548258800231126240760264822486599)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 0.009446967704539249494420221613134244048319)) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint16_t) denom[6] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint16_t) denom[6] = { static_cast(0u), static_cast(24u), static_cast(50u), @@ -121,10 +121,10 @@ struct lanczos6 : public mpl::int_<35> template - static T lanczos_sum_near_1(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_1(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[5] = { + BOOST_MATH_GPU_STATIC const T d[5] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 2.044879010930422922760429926121241330235)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, -2.751366405578505366591317846728753993668)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 1.02282965224225004296750609604264824677)), @@ -140,10 +140,10 @@ struct lanczos6 : public mpl::int_<35> } template - static T lanczos_sum_near_2(const T& dz) + static 
BOOST_GPU_ENABLED T lanczos_sum_near_2(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[5] = { + BOOST_MATH_GPU_STATIC const T d[5] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 5.748142489536043490764289256167080091892)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, -7.734074268282457156081021756682138251825)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 35, 2.875167944990511006997713242805893543947)), @@ -159,7 +159,7 @@ struct lanczos6 : public mpl::int_<35> return result; } - static double g(){ return 5.581000000000000405009359383257105946541; } + static BOOST_GPU_ENABLED double g(){ return 5.581000000000000405009359383257105946541; } }; // @@ -174,10 +174,10 @@ struct lanczos11 : public mpl::int_<60> // extended-double precision: // template - static T lanczos_sum(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[11] = { + BOOST_MATH_GPU_STATIC const T num[11] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 38474670393.31776828316099004518914832218)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 36857665043.51950660081971227404959150474)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 15889202453.72942008945006665994637853242)), @@ -190,7 +190,7 @@ struct lanczos11 : public mpl::int_<60> static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 261.6140441641668190791708576058805625502)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 2.506628274631000502415573855452633787834)) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[11] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[11] = { static_cast(0u), static_cast(362880u), static_cast(1026576u), @@ -207,10 +207,10 @@ struct lanczos11 : public mpl::int_<60> } template - static T lanczos_sum_expG_scaled(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum_expG_scaled(const 
T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[11] = { + BOOST_MATH_GPU_STATIC const T num[11] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 709811.662581657956893540610814842699825)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 679979.847415722640161734319823103390728)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 293136.785721159725251629480984140341656)), @@ -223,7 +223,7 @@ struct lanczos11 : public mpl::int_<60> static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 0.004826466289237661857584712046231435101741)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 0.4624429436045378766270459638520555557321e-4)) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[11] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[11] = { static_cast(0u), static_cast(362880u), static_cast(1026576u), @@ -241,10 +241,10 @@ struct lanczos11 : public mpl::int_<60> template - static T lanczos_sum_near_1(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_1(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[10] = { + BOOST_MATH_GPU_STATIC const T d[10] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 4.005853070677940377969080796551266387954)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, -13.17044315127646469834125159673527183164)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 17.19146865350790353683895137079288129318)), @@ -265,10 +265,10 @@ struct lanczos11 : public mpl::int_<60> } template - static T lanczos_sum_near_2(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_2(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[10] = { + BOOST_MATH_GPU_STATIC const T d[10] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 19.05889633808148715159575716844556056056)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 
-62.66183664701721716960978577959655644762)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 60, 81.7929198065004751699057192860287512027)), @@ -289,7 +289,7 @@ struct lanczos11 : public mpl::int_<60> return result; } - static double g(){ return 10.90051099999999983936049829935654997826; } + static BOOST_GPU_ENABLED double g(){ return 10.90051099999999983936049829935654997826; } }; // @@ -304,10 +304,10 @@ struct lanczos13 : public mpl::int_<72> // higher precision: // template - static T lanczos_sum(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[13] = { + BOOST_MATH_GPU_STATIC const T num[13] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 44012138428004.60895436261759919070125699)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 41590453358593.20051581730723108131357995)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 18013842787117.99677796276038389462742949)), @@ -322,7 +322,7 @@ struct lanczos13 : public mpl::int_<72> static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 381.8801248632926870394389468349331394196)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 2.506628274631000502415763426076722427007)) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[13] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[13] = { static_cast(0u), static_cast(39916800u), static_cast(120543840u), @@ -341,10 +341,10 @@ struct lanczos13 : public mpl::int_<72> } template - static T lanczos_sum_expG_scaled(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum_expG_scaled(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[13] = { + BOOST_MATH_GPU_STATIC const T num[13] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 86091529.53418537217994842267760536134841)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 81354505.17858011242874285785316135398567)), 
static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 35236626.38815461910817650960734605416521)), @@ -359,7 +359,7 @@ struct lanczos13 : public mpl::int_<72> static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 0.0007469903808915448316510079585999893674101)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 0.4903180573459871862552197089738373164184e-5)) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[13] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[13] = { static_cast(0u), static_cast(39916800u), static_cast(120543840u), @@ -379,10 +379,10 @@ struct lanczos13 : public mpl::int_<72> template - static T lanczos_sum_near_1(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_1(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[12] = { + BOOST_MATH_GPU_STATIC const T d[12] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 4.832115561461656947793029596285626840312)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, -19.86441536140337740383120735104359034688)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 33.9927422807443239927197864963170585331)), @@ -405,10 +405,10 @@ struct lanczos13 : public mpl::int_<72> } template - static T lanczos_sum_near_2(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_2(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[12] = { + BOOST_MATH_GPU_STATIC const T d[12] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 26.96979819614830698367887026728396466395)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, -110.8705424709385114023884328797900204863)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 72, 189.7258846119231466417015694690434770085)), @@ -431,7 +431,7 @@ struct lanczos13 : public mpl::int_<72> return result; } - static double g(){ return 13.1445650000000000545696821063756942749; } + static BOOST_GPU_ENABLED double g(){ return 
13.1445650000000000545696821063756942749; } }; // @@ -446,10 +446,10 @@ struct lanczos22 : public mpl::int_<120> // evaluated at higher precision: // template - static T lanczos_sum(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[22] = { + BOOST_MATH_GPU_STATIC const T num[22] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 46198410803245094237463011094.12173081986)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 43735859291852324413622037436.321513777)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 19716607234435171720534556386.97481377748)), @@ -473,7 +473,7 @@ struct lanczos22 : public mpl::int_<120> static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 1167.501919472435718934219997431551246996)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 2.50662827463100050241576528481104525333)) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint64_t) denom[22] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint64_t) denom[22] = { BOOST_MATH_INT_VALUE_SUFFIX(0, uLL), BOOST_MATH_INT_VALUE_SUFFIX(2432902008176640000, uLL), BOOST_MATH_INT_VALUE_SUFFIX(8752948036761600000, uLL), @@ -501,10 +501,10 @@ struct lanczos22 : public mpl::int_<120> } template - static T lanczos_sum_expG_scaled(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum_expG_scaled(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[22] = { + BOOST_MATH_GPU_STATIC const T num[22] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 6939996264376682180.277485395074954356211)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 6570067992110214451.87201438870245659384)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 2961859037444440551.986724631496417064121)), @@ -528,7 +528,7 @@ struct lanczos22 : public mpl::int_<120> static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 
0.1753839324538447655939518484052327068859e-6)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 0.3765495513732730583386223384116545391759e-9)) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint64_t) denom[22] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint64_t) denom[22] = { BOOST_MATH_INT_VALUE_SUFFIX(0, uLL), BOOST_MATH_INT_VALUE_SUFFIX(2432902008176640000, uLL), BOOST_MATH_INT_VALUE_SUFFIX(8752948036761600000, uLL), @@ -557,10 +557,10 @@ struct lanczos22 : public mpl::int_<120> template - static T lanczos_sum_near_1(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_1(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[21] = { + BOOST_MATH_GPU_STATIC const T d[21] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 8.318998691953337183034781139546384476554)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, -63.15415991415959158214140353299240638675)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 217.3108224383632868591462242669081540163)), @@ -592,10 +592,10 @@ struct lanczos22 : public mpl::int_<120> } template - static T lanczos_sum_near_2(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_2(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[21] = { + BOOST_MATH_GPU_STATIC const T d[21] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 75.39272007105208086018421070699575462226)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, -572.3481967049935412452681346759966390319)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 120, 1969.426202741555335078065370698955484358)), @@ -627,7 +627,7 @@ struct lanczos22 : public mpl::int_<120> return result; } - static double g(){ return 22.61890999999999962710717227309942245483; } + static BOOST_GPU_ENABLED double g(){ return 22.61890999999999962710717227309942245483; } }; // @@ -641,9 +641,9 @@ struct lanczos6m24 : public mpl::int_<24> // Use 
for float precision, when evaluated as a float: // template - static T lanczos_sum(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum(const T& z) { - static const T num[6] = { + BOOST_MATH_GPU_STATIC const T num[6] = { static_cast(58.52061591769095910314047740215847630266L), static_cast(182.5248962595894264831189414768236280862L), static_cast(211.0971093028510041839168287718170827259L), @@ -651,7 +651,7 @@ struct lanczos6m24 : public mpl::int_<24> static_cast(27.5192015197455403062503721613097825345L), static_cast(2.50662858515256974113978724717473206342L) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint16_t) denom[6] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint16_t) denom[6] = { static_cast(0u), static_cast(24u), static_cast(50u), @@ -663,9 +663,9 @@ struct lanczos6m24 : public mpl::int_<24> } template - static T lanczos_sum_expG_scaled(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum_expG_scaled(const T& z) { - static const T num[6] = { + BOOST_MATH_GPU_STATIC const T num[6] = { static_cast(14.0261432874996476619570577285003839357L), static_cast(43.74732405540314316089531289293124360129L), static_cast(50.59547402616588964511581430025589038612L), @@ -673,7 +673,7 @@ struct lanczos6m24 : public mpl::int_<24> static_cast(6.595765571169314946316366571954421695196L), static_cast(0.6007854010515290065101128585795542383721L) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint16_t) denom[6] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint16_t) denom[6] = { static_cast(0u), static_cast(24u), static_cast(50u), @@ -686,9 +686,9 @@ struct lanczos6m24 : public mpl::int_<24> template - static T lanczos_sum_near_1(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_1(const T& dz) { - static const T d[5] = { + BOOST_MATH_GPU_STATIC const T d[5] = { static_cast(0.4922488055204602807654354732674868442106L), static_cast(0.004954497451132152436631238060933905650346L), 
static_cast(-0.003374784572167105840686977985330859371848L), @@ -704,9 +704,9 @@ struct lanczos6m24 : public mpl::int_<24> } template - static T lanczos_sum_near_2(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_2(const T& dz) { - static const T d[5] = { + BOOST_MATH_GPU_STATIC const T d[5] = { static_cast(0.6534966888520080645505805298901130485464L), static_cast(0.006577461728560758362509168026049182707101L), static_cast(-0.004480276069269967207178373559014835978161L), @@ -722,7 +722,7 @@ struct lanczos6m24 : public mpl::int_<24> return result; } - static double g(){ return 1.428456135094165802001953125; } + static BOOST_GPU_ENABLED double g(){ return 1.428456135094165802001953125; } }; // @@ -736,9 +736,9 @@ struct lanczos13m53 : public mpl::int_<53> // Use for double precision, when evaluated as a double: // template - static T lanczos_sum(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum(const T& z) { - static const T num[13] = { + BOOST_MATH_GPU_STATIC const T num[13] = { static_cast(23531376880.41075968857200767445163675473L), static_cast(42919803642.64909876895789904700198885093L), static_cast(35711959237.35566804944018545154716670596L), @@ -753,7 +753,7 @@ struct lanczos13m53 : public mpl::int_<53> static_cast(210.8242777515793458725097339207133627117L), static_cast(2.506628274631000270164908177133837338626L) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[13] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[13] = { static_cast(0u), static_cast(39916800u), static_cast(120543840u), @@ -772,9 +772,9 @@ struct lanczos13m53 : public mpl::int_<53> } template - static T lanczos_sum_expG_scaled(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum_expG_scaled(const T& z) { - static const T num[13] = { + BOOST_MATH_GPU_STATIC const T num[13] = { static_cast(56906521.91347156388090791033559122686859L), static_cast(103794043.1163445451906271053616070238554L), 
static_cast(86363131.28813859145546927288977868422342L), @@ -789,7 +789,7 @@ struct lanczos13m53 : public mpl::int_<53> static_cast(0.5098416655656676188125178644804694509993L), static_cast(0.006061842346248906525783753964555936883222L) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[13] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint32_t) denom[13] = { static_cast(0u), static_cast(39916800u), static_cast(120543840u), @@ -809,9 +809,9 @@ struct lanczos13m53 : public mpl::int_<53> template - static T lanczos_sum_near_1(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_1(const T& dz) { - static const T d[12] = { + BOOST_MATH_GPU_STATIC const T d[12] = { static_cast(2.208709979316623790862569924861841433016L), static_cast(-3.327150580651624233553677113928873034916L), static_cast(1.483082862367253753040442933770164111678L), @@ -834,9 +834,9 @@ struct lanczos13m53 : public mpl::int_<53> } template - static T lanczos_sum_near_2(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_2(const T& dz) { - static const T d[12] = { + BOOST_MATH_GPU_STATIC const T d[12] = { static_cast(6.565936202082889535528455955485877361223L), static_cast(-9.8907772644920670589288081640128194231L), static_cast(4.408830289125943377923077727900630927902L), @@ -859,7 +859,7 @@ struct lanczos13m53 : public mpl::int_<53> return result; } - static double g(){ return 6.024680040776729583740234375; } + static BOOST_GPU_ENABLED double g(){ return 6.024680040776729583740234375; } }; // @@ -873,10 +873,10 @@ struct lanczos17m64 : public mpl::int_<64> // Use for extended-double precision, when evaluated as an extended-double: // template - static T lanczos_sum(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[17] = { + BOOST_MATH_GPU_STATIC const T num[17] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 
553681095419291969.2230556393350368550504)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 731918863887667017.2511276782146694632234)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 453393234285807339.4627124634539085143364)), @@ -895,7 +895,7 @@ struct lanczos17m64 : public mpl::int_<64> static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 488.0063567520005730476791712814838113252)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2.50662827463100050241576877135758834683)) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint64_t) denom[17] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint64_t) denom[17] = { BOOST_MATH_INT_VALUE_SUFFIX(0, uLL), BOOST_MATH_INT_VALUE_SUFFIX(1307674368000, uLL), BOOST_MATH_INT_VALUE_SUFFIX(4339163001600, uLL), @@ -918,10 +918,10 @@ struct lanczos17m64 : public mpl::int_<64> } template - static T lanczos_sum_expG_scaled(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum_expG_scaled(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[17] = { + BOOST_MATH_GPU_STATIC const T num[17] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2715894658327.717377557655133124376674911)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 3590179526097.912105038525528721129550434)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 2223966599737.814969312127353235818710172)), @@ -940,7 +940,7 @@ struct lanczos17m64 : public mpl::int_<64> static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.002393749522058449186690627996063983095463)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 0.1229541408909435212800785616808830746135e-4)) }; - static const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint64_t) denom[17] = { + BOOST_MATH_GPU_STATIC const BOOST_MATH_INT_TABLE_TYPE(T, boost::uint64_t) denom[17] = { BOOST_MATH_INT_VALUE_SUFFIX(0, uLL), BOOST_MATH_INT_VALUE_SUFFIX(1307674368000, uLL), BOOST_MATH_INT_VALUE_SUFFIX(4339163001600, uLL), @@ -964,10 +964,10 @@ struct lanczos17m64 : public mpl::int_<64> 
template - static T lanczos_sum_near_1(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_1(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[16] = { + BOOST_MATH_GPU_STATIC const T d[16] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 4.493645054286536365763334986866616581265)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -16.95716370392468543800733966378143997694)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 26.19196892983737527836811770970479846644)), @@ -994,10 +994,10 @@ struct lanczos17m64 : public mpl::int_<64> } template - static T lanczos_sum_near_2(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_2(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[16] = { + BOOST_MATH_GPU_STATIC const T d[16] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 23.56409085052261327114594781581930373708)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, -88.92116338946308797946237246006238652361)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 64, 137.3472822086847596961177383569603988797)), @@ -1024,7 +1024,7 @@ struct lanczos17m64 : public mpl::int_<64> return result; } - static double g(){ return 12.2252227365970611572265625; } + static BOOST_GPU_ENABLED double g(){ return 12.2252227365970611572265625; } }; // @@ -1038,10 +1038,10 @@ struct lanczos24m113 : public mpl::int_<113> // Use for long-double precision, when evaluated as an long-double: // template - static T lanczos_sum(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[24] = { + BOOST_MATH_GPU_STATIC const T num[24] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 2029889364934367661624137213253.22102954656825019111612712252027267955023987678816620961507)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 
2338599599286656537526273232565.2727349714338768161421882478417543004440597874814359063158)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 1288527989493833400335117708406.3953711906175960449186720680201425446299360322830739180195)), @@ -1067,7 +1067,7 @@ struct lanczos24m113 : public mpl::int_<113> static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 1151.61895453463992438325318456328526085882924197763140514450975619271382783957699017875304)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 2.50662827463100050241576528481104515966515623051532908941425544355490413900497467936202516)) }; - static const T denom[24] = { + BOOST_MATH_GPU_STATIC const T denom[24] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 0.0)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 0.112400072777760768e22)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 0.414847677933545472e22)), @@ -1097,10 +1097,10 @@ struct lanczos24m113 : public mpl::int_<113> } template - static T lanczos_sum_expG_scaled(const T& z) + static BOOST_GPU_ENABLED T lanczos_sum_expG_scaled(const T& z) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T num[24] = { + BOOST_MATH_GPU_STATIC const T num[24] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 3035162425359883494754.02878223286972654682199012688209026810841953293372712802258398358538)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 3496756894406430103600.16057175075063458536101374170860226963245118484234495645518505519827)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 1926652656689320888654.01954015145958293168365236755537645929361841917596501251362171653478)), @@ -1126,7 +1126,7 @@ struct lanczos24m113 : public mpl::int_<113> static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 0.172194142179211139195966608011235161516824700287310869949928393345257114743230967204370963e-5)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 0.374799931707148855771381263542708435935402853962736029347951399323367765509988401336565436e-8)) }; - static const 
T denom[24] = { + BOOST_MATH_GPU_STATIC const T denom[24] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 0.0)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 0.112400072777760768e22)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 0.414847677933545472e22)), @@ -1157,10 +1157,10 @@ struct lanczos24m113 : public mpl::int_<113> template - static T lanczos_sum_near_1(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_1(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[23] = { + BOOST_MATH_GPU_STATIC const T d[23] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 7.4734083002469026177867421609938203388868806387315406134072298925733950040583068760685908)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, -50.4225805042247530267317342133388132970816607563062253708655085754357843064134941138154171)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 152.288200621747008570784082624444625293884063492396162110698238568311211546361189979357019)), @@ -1194,10 +1194,10 @@ struct lanczos24m113 : public mpl::int_<113> } template - static T lanczos_sum_near_2(const T& dz) + static BOOST_GPU_ENABLED T lanczos_sum_near_2(const T& dz) { lanczos_initializer::force_instantiate(); // Ensure our constants get initialized before main() - static const T d[23] = { + BOOST_MATH_GPU_STATIC const T d[23] = { static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 61.4165001061101455341808888883960361969557848005400286332291451422461117307237198559485365)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, -414.372973678657049667308134761613915623353625332248315105320470271523320700386200587519147)), static_cast(BOOST_MATH_BIG_CONSTANT(T, 113, 1251.50505818554680171298972755376376836161706773644771875668053742215217922228357204561873)), @@ -1231,7 +1231,7 @@ struct lanczos24m113 : public mpl::int_<113> return result; } - static double g(){ return 20.3209821879863739013671875; } + static BOOST_GPU_ENABLED double g(){ return 
20.3209821879863739013671875; } }; diff --git a/include/boost/math/special_functions/log1p.hpp b/include/boost/math/special_functions/log1p.hpp index 7fa1eb8de9..5a24735fbe 100644 --- a/include/boost/math/special_functions/log1p.hpp +++ b/include/boost/math/special_functions/log1p.hpp @@ -41,16 +41,16 @@ namespace detail { typedef T result_type; - log1p_series(T x) + BOOST_GPU_ENABLED log1p_series(T x) : k(0), m_mult(-x), m_prod(-1){} - T operator()() + BOOST_GPU_ENABLED T operator()() { m_prod *= m_mult; return m_prod / ++k; } - int count()const + BOOST_GPU_ENABLED int count()const { return k; } @@ -73,12 +73,12 @@ namespace detail // it performs no better than log(1+x): which is to say not very well at all. // template -T log1p_imp(T const & x, const Policy& pol, const mpl::int_<0>&) +BOOST_GPU_ENABLED T log1p_imp(T const & x, const Policy& pol, const mpl::int_<0>&) { // The function returns the natural logarithm of 1 + x. typedef typename tools::promote_args::type result_type; BOOST_MATH_STD_USING - static const char* function = "boost::math::log1p<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::log1p<%1%>(%1%)"; if((x < -1) || (boost::math::isnan)(x)) return policies::raise_domain_error( @@ -107,11 +107,11 @@ T log1p_imp(T const & x, const Policy& pol, const mpl::int_<0>&) } template -T log1p_imp(T const& x, const Policy& pol, const mpl::int_<53>&) +BOOST_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const mpl::int_<53>&) { // The function returns the natural logarithm of 1 + x. 
BOOST_MATH_STD_USING - static const char* function = "boost::math::log1p<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::log1p<%1%>(%1%)"; if(x < -1) return policies::raise_domain_error( @@ -132,7 +132,7 @@ T log1p_imp(T const& x, const Policy& pol, const mpl::int_<53>&) // Expected Error Term: 1.843e-017 // Maximum Relative Change in Control Points: 8.138e-004 // Max Error found at double precision = 3.250766e-016 - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { 0.15141069795941984e-16L, 0.35495104378055055e-15L, 0.33333333333332835L, @@ -142,7 +142,7 @@ T log1p_imp(T const& x, const Policy& pol, const mpl::int_<53>&) 0.13703234928513215L, 0.011294864812099712L }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { 1L, 3.7274719063011499L, 5.5387948649720334L, @@ -160,11 +160,11 @@ T log1p_imp(T const& x, const Policy& pol, const mpl::int_<53>&) } template -T log1p_imp(T const& x, const Policy& pol, const mpl::int_<64>&) +BOOST_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const mpl::int_<64>&) { // The function returns the natural logarithm of 1 + x. 
BOOST_MATH_STD_USING - static const char* function = "boost::math::log1p<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::log1p<%1%>(%1%)"; if(x < -1) return policies::raise_domain_error( @@ -185,7 +185,7 @@ T log1p_imp(T const& x, const Policy& pol, const mpl::int_<64>&) // Expected Error Term: 8.088e-20 // Maximum Relative Change in Control Points: 9.648e-05 // Max Error found at long double precision = 2.242324e-19 - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { BOOST_MATH_BIG_CONSTANT(T, 64, -0.807533446680736736712e-19), BOOST_MATH_BIG_CONSTANT(T, 64, -0.490881544804798926426e-18), BOOST_MATH_BIG_CONSTANT(T, 64, 0.333333333333333373941), @@ -196,7 +196,7 @@ T log1p_imp(T const& x, const Policy& pol, const mpl::int_<64>&) BOOST_MATH_BIG_CONSTANT(T, 64, 0.0706537026422828914622), BOOST_MATH_BIG_CONSTANT(T, 64, 0.00441709903782239229447) }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { BOOST_MATH_BIG_CONSTANT(T, 64, 1.0), BOOST_MATH_BIG_CONSTANT(T, 64, 4.26423872346263928361), BOOST_MATH_BIG_CONSTANT(T, 64, 7.48189472704477708962), @@ -215,11 +215,11 @@ T log1p_imp(T const& x, const Policy& pol, const mpl::int_<64>&) } template -T log1p_imp(T const& x, const Policy& pol, const mpl::int_<24>&) +BOOST_GPU_ENABLED T log1p_imp(T const& x, const Policy& pol, const mpl::int_<24>&) { // The function returns the natural logarithm of 1 + x. 
BOOST_MATH_STD_USING - static const char* function = "boost::math::log1p<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::log1p<%1%>(%1%)"; if(x < -1) return policies::raise_domain_error( @@ -241,13 +241,13 @@ T log1p_imp(T const& x, const Policy& pol, const mpl::int_<24>&) // Maximum Relative Change in Control Points: 2.509e-04 // Max Error found at double precision = 6.910422e-08 // Max Error found at float precision = 8.357242e-08 - static const T P[] = { + BOOST_MATH_GPU_STATIC const T P[] = { -0.671192866803148236519e-7L, 0.119670999140731844725e-6L, 0.333339469182083148598L, 0.237827183019664122066L }; - static const T Q[] = { + BOOST_MATH_GPU_STATIC const T Q[] = { 1L, 1.46348272586988539733L, 0.497859871350117338894L, @@ -278,9 +278,11 @@ struct log1p_initializer void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + static BOOST_GPU_ENABLED void force_instantiate() { +#ifndef __CUDA_ARCH__ initializer.force_instantiate(); +#endif } }; @@ -291,7 +293,7 @@ const typename log1p_initializer::init log1p_initializer -inline typename tools::promote_args::type log1p(T x, const Policy&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type log1p(T x, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -351,7 +353,7 @@ inline long double log1p(long double z) #if defined(BOOST_HAS_LOG1P) && !(defined(__osf__) && defined(__DECCXX_VER)) # ifdef BOOST_MATH_USE_C99 template -inline float log1p(float x, const Policy& pol) +inline BOOST_GPU_ENABLED float log1p(float x, const Policy& pol) { if(x < -1) return policies::raise_domain_error( @@ -363,7 +365,7 @@ inline float log1p(float x, const Policy& pol) } #ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS template -inline long double log1p(long double x, const Policy& pol) +inline BOOST_GPU_ENABLED long double log1p(long double x, const Policy& pol) { if(x < -1) return 
policies::raise_domain_error( @@ -376,7 +378,7 @@ inline long double log1p(long double x, const Policy& pol) #endif #else template -inline float log1p(float x, const Policy& pol) +inline BOOST_GPU_ENABLED float log1p(float x, const Policy& pol) { if(x < -1) return policies::raise_domain_error( @@ -388,7 +390,7 @@ inline float log1p(float x, const Policy& pol) } #endif template -inline double log1p(double x, const Policy& pol) +inline BOOST_GPU_ENABLED double log1p(double x, const Policy& pol) { if(x < -1) return policies::raise_domain_error( @@ -405,7 +407,7 @@ inline double log1p(double x, const Policy& pol) // Currently tested with VC8 and Intel 9.1. // template -inline double log1p(double x, const Policy& pol) +inline BOOST_GPU_ENABLED double log1p(double x, const Policy& pol) { if(x < -1) return policies::raise_domain_error( @@ -420,7 +422,7 @@ inline double log1p(double x, const Policy& pol) return ::log(u)*(x/(u-1.0)); } template -inline float log1p(float x, const Policy& pol) +inline BOOST_GPU_ENABLED float log1p(float x, const Policy& pol) { return static_cast(boost::math::log1p(static_cast(x), pol)); } @@ -430,7 +432,7 @@ inline float log1p(float x, const Policy& pol) // Needs more investigation. 
// template -inline long double log1p(long double x, const Policy& pol) +inline BOOST_GPU_ENABLED long double log1p(long double x, const Policy& pol) { if(x < -1) return policies::raise_domain_error( @@ -448,7 +450,7 @@ inline long double log1p(long double x, const Policy& pol) #endif template -inline typename tools::promote_args::type log1p(T x) +inline BOOST_GPU_ENABLED typename tools::promote_args::type log1p(T x) { return boost::math::log1p(x, policies::policy<>()); } @@ -456,12 +458,12 @@ inline typename tools::promote_args::type log1p(T x) // Compute log(1+x)-x: // template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type log1pmx(T x, const Policy& pol) { typedef typename tools::promote_args::type result_type; BOOST_MATH_STD_USING - static const char* function = "boost::math::log1pmx<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::log1pmx<%1%>(%1%)"; if(x < -1) return policies::raise_domain_error( @@ -491,7 +493,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type log1pmx(T x) +inline BOOST_GPU_ENABLED typename tools::promote_args::type log1pmx(T x) { return log1pmx(x, policies::policy<>()); } diff --git a/include/boost/math/special_functions/math_fwd.hpp b/include/boost/math/special_functions/math_fwd.hpp index 4f44f56113..610927ea83 100644 --- a/include/boost/math/special_functions/math_fwd.hpp +++ b/include/boost/math/special_functions/math_fwd.hpp @@ -40,39 +40,39 @@ namespace boost // Beta functions. template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type beta(RT1 a, RT2 b); // Beta function (2 arguments). template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type beta(RT1 a, RT2 b, A x); // Beta function (3 arguments). 
template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type beta(RT1 a, RT2 b, RT3 x, const Policy& pol); // Beta function (3 arguments). template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type betac(RT1 a, RT2 b, RT3 x); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type betac(RT1 a, RT2 b, RT3 x, const Policy& pol); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ibeta(RT1 a, RT2 b, RT3 x); // Incomplete beta function. template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ibeta(RT1 a, RT2 b, RT3 x, const Policy& pol); // Incomplete beta function. template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ibetac(RT1 a, RT2 b, RT3 x); // Incomplete beta complement function. template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ibetac(RT1 a, RT2 b, RT3 x, const Policy& pol); // Incomplete beta complement function. template @@ -140,39 +140,39 @@ namespace boost ibetac_invb(RT1 a, RT2 b, RT3 q, const Policy&); // Incomplete beta complement inverse function. template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ibeta_derivative(RT1 a, RT2 b, RT3 x); // derivative of incomplete beta template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ibeta_derivative(RT1 a, RT2 b, RT3 x, const Policy& pol); // derivative of incomplete beta // Binomial: template - T binomial_coefficient(unsigned n, unsigned k, const Policy& pol); + BOOST_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k, const Policy& pol); template - T binomial_coefficient(unsigned n, unsigned k); + BOOST_GPU_ENABLED T binomial_coefficient(unsigned n, unsigned k); // erf & erfc error functions. 
template // Error function. - typename tools::promote_args::type erf(RT z); + BOOST_GPU_ENABLED typename tools::promote_args::type erf(RT z); template // Error function. - typename tools::promote_args::type erf(RT z, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type erf(RT z, const Policy&); template // Error function complement. - typename tools::promote_args::type erfc(RT z); + BOOST_GPU_ENABLED typename tools::promote_args::type erfc(RT z); template // Error function complement. - typename tools::promote_args::type erfc(RT z, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type erfc(RT z, const Policy&); template // Error function inverse. - typename tools::promote_args::type erf_inv(RT z); + BOOST_GPU_ENABLED typename tools::promote_args::type erf_inv(RT z); template // Error function inverse. - typename tools::promote_args::type erf_inv(RT z, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type erf_inv(RT z, const Policy& pol); template // Error function complement inverse. - typename tools::promote_args::type erfc_inv(RT z); + BOOST_GPU_ENABLED typename tools::promote_args::type erfc_inv(RT z); template // Error function complement inverse. 
- typename tools::promote_args::type erfc_inv(RT z, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type erfc_inv(RT z, const Policy& pol); // Polynomials: template @@ -313,83 +313,83 @@ namespace boost // Elliptic integrals: template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rf(T1 x, T2 y, T3 z); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rf(T1 x, T2 y, T3 z, const Policy& pol); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rd(T1 x, T2 y, T3 z); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rd(T1 x, T2 y, T3 z, const Policy& pol); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rc(T1 x, T2 y); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rc(T1 x, T2 y, const Policy& pol); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rj(T1 x, T2 y, T3 z, T4 p); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rj(T1 x, T2 y, T3 z, T4 p, const Policy& pol); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rg(T1 x, T2 y, T3 z); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_rg(T1 x, T2 y, T3 z, const Policy& pol); template - typename tools::promote_args::type ellint_2(T k); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_2(T k); template - typename tools::promote_args::type ellint_2(T1 k, T2 phi); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi); template - typename tools::promote_args::type ellint_2(T1 k, T2 phi, const 
Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_2(T1 k, T2 phi, const Policy& pol); template - typename tools::promote_args::type ellint_1(T k); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_1(T k); template - typename tools::promote_args::type ellint_1(T1 k, T2 phi); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi); template - typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_1(T1 k, T2 phi, const Policy& pol); template - typename tools::promote_args::type ellint_d(T k); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_d(T k); template - typename tools::promote_args::type ellint_d(T1 k, T2 phi); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_d(T1 k, T2 phi); template - typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_d(T1 k, T2 phi, const Policy& pol); template - typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi); + BOOST_GPU_ENABLED typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi); template - typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type jacobi_zeta(T1 k, T2 phi, const Policy& pol); template - typename tools::promote_args::type heuman_lambda(T1 k, T2 phi); + BOOST_GPU_ENABLED typename tools::promote_args::type heuman_lambda(T1 k, T2 phi); template - typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type heuman_lambda(T1 k, T2 phi, const Policy& pol); namespace detail{ @@ -407,104 +407,104 @@ namespace boost template - typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi); + BOOST_GPU_ENABLED typename detail::ellint_3_result::type ellint_3(T1 k, T2 v, T3 phi); template - typename 
tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_3(T1 k, T2 v, T3 phi, const Policy& pol); template - typename tools::promote_args::type ellint_3(T1 k, T2 v); + BOOST_GPU_ENABLED typename tools::promote_args::type ellint_3(T1 k, T2 v); // Factorial functions. // Note: not for integral types, at present. template struct max_factorial; template - RT factorial(unsigned int); + BOOST_GPU_ENABLED RT factorial(unsigned int); template - RT factorial(unsigned int, const Policy& pol); + BOOST_GPU_ENABLED RT factorial(unsigned int, const Policy& pol); template - RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT)); + BOOST_GPU_ENABLED RT unchecked_factorial(unsigned int BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(RT)); template - RT double_factorial(unsigned i); + BOOST_GPU_ENABLED RT double_factorial(unsigned i); template - RT double_factorial(unsigned i, const Policy& pol); + BOOST_GPU_ENABLED RT double_factorial(unsigned i, const Policy& pol); template - typename tools::promote_args::type falling_factorial(RT x, unsigned n); + BOOST_GPU_ENABLED typename tools::promote_args::type falling_factorial(RT x, unsigned n); template - typename tools::promote_args::type falling_factorial(RT x, unsigned n, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type falling_factorial(RT x, unsigned n, const Policy& pol); template - typename tools::promote_args::type rising_factorial(RT x, int n); + BOOST_GPU_ENABLED typename tools::promote_args::type rising_factorial(RT x, int n); template - typename tools::promote_args::type rising_factorial(RT x, int n, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type rising_factorial(RT x, int n, const Policy& pol); // Gamma functions. 
template - typename tools::promote_args::type tgamma(RT z); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma(RT z); template - typename tools::promote_args::type tgamma1pm1(RT z); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma1pm1(RT z); template - typename tools::promote_args::type tgamma1pm1(RT z, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma1pm1(RT z, const Policy& pol); template - typename tools::promote_args::type tgamma(RT1 a, RT2 z); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma(RT1 a, RT2 z); template - typename tools::promote_args::type tgamma(RT1 a, RT2 z, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma(RT1 a, RT2 z, const Policy& pol); template - typename tools::promote_args::type lgamma(RT z, int* sign); + BOOST_GPU_ENABLED typename tools::promote_args::type lgamma(RT z, int* sign); template - typename tools::promote_args::type lgamma(RT z, int* sign, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type lgamma(RT z, int* sign, const Policy& pol); template - typename tools::promote_args::type lgamma(RT x); + BOOST_GPU_ENABLED typename tools::promote_args::type lgamma(RT x); template - typename tools::promote_args::type lgamma(RT x, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type lgamma(RT x, const Policy& pol); template - typename tools::promote_args::type tgamma_lower(RT1 a, RT2 z); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_lower(RT1 a, RT2 z); template - typename tools::promote_args::type tgamma_lower(RT1 a, RT2 z, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_lower(RT1 a, RT2 z, const Policy&); template - typename tools::promote_args::type gamma_q(RT1 a, RT2 z); + BOOST_GPU_ENABLED typename tools::promote_args::type gamma_q(RT1 a, RT2 z); template - typename tools::promote_args::type gamma_q(RT1 a, RT2 z, const Policy&); + BOOST_GPU_ENABLED 
typename tools::promote_args::type gamma_q(RT1 a, RT2 z, const Policy&); template - typename tools::promote_args::type gamma_p(RT1 a, RT2 z); + BOOST_GPU_ENABLED typename tools::promote_args::type gamma_p(RT1 a, RT2 z); template - typename tools::promote_args::type gamma_p(RT1 a, RT2 z, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type gamma_p(RT1 a, RT2 z, const Policy&); template - typename tools::promote_args::type tgamma_delta_ratio(T1 z, T2 delta); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_delta_ratio(T1 z, T2 delta); template - typename tools::promote_args::type tgamma_delta_ratio(T1 z, T2 delta, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_delta_ratio(T1 z, T2 delta, const Policy&); template - typename tools::promote_args::type tgamma_ratio(T1 a, T2 b); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_ratio(T1 a, T2 b); template - typename tools::promote_args::type tgamma_ratio(T1 a, T2 b, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type tgamma_ratio(T1 a, T2 b, const Policy&); template - typename tools::promote_args::type gamma_p_derivative(T1 a, T2 x); + BOOST_GPU_ENABLED typename tools::promote_args::type gamma_p_derivative(T1 a, T2 x); template - typename tools::promote_args::type gamma_p_derivative(T1 a, T2 x, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type gamma_p_derivative(T1 a, T2 x, const Policy&); // gamma inverse. template @@ -563,39 +563,39 @@ namespace boost // cbrt - cube root. 
template - typename tools::promote_args::type cbrt(RT z); + BOOST_GPU_ENABLED typename tools::promote_args::type cbrt(RT z); template - typename tools::promote_args::type cbrt(RT z, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type cbrt(RT z, const Policy&); // log1p is log(x + 1) template - typename tools::promote_args::type log1p(T); + BOOST_GPU_ENABLED typename tools::promote_args::type log1p(T); template - typename tools::promote_args::type log1p(T, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type log1p(T, const Policy&); // log1pmx is log(x + 1) - x template - typename tools::promote_args::type log1pmx(T); + BOOST_GPU_ENABLED typename tools::promote_args::type log1pmx(T); template - typename tools::promote_args::type log1pmx(T, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type log1pmx(T, const Policy&); // Exp (x) minus 1 functions. template - typename tools::promote_args::type expm1(T); + BOOST_GPU_ENABLED typename tools::promote_args::type expm1(T); template - typename tools::promote_args::type expm1(T, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type expm1(T, const Policy&); // Power - 1 template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type powm1(const T1 a, const T2 z); template - typename tools::promote_args::type + BOOST_GPU_ENABLED typename tools::promote_args::type powm1(const T1 a, const T2 z, const Policy&); // sqrt(1+x) - 1 @@ -607,10 +607,10 @@ namespace boost // sinus cardinals: template - typename tools::promote_args::type sinc_pi(T x); + BOOST_GPU_ENABLED typename tools::promote_args::type sinc_pi(T x); template - typename tools::promote_args::type sinc_pi(T x, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type sinc_pi(T x, const Policy&); template typename tools::promote_args::type sinhc_pi(T x); @@ -620,22 +620,22 @@ namespace boost // inverse hyperbolics: template - typename 
tools::promote_args::type asinh(T x); + BOOST_GPU_ENABLED typename tools::promote_args::type asinh(T x); template - typename tools::promote_args::type asinh(T x, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type asinh(T x, const Policy&); template - typename tools::promote_args::type acosh(T x); + BOOST_GPU_ENABLED typename tools::promote_args::type acosh(T x); template - typename tools::promote_args::type acosh(T x, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type acosh(T x, const Policy&); template - typename tools::promote_args::type atanh(T x); + BOOST_GPU_ENABLED typename tools::promote_args::type atanh(T x); template - typename tools::promote_args::type atanh(T x, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type atanh(T x, const Policy&); namespace detail{ @@ -861,43 +861,43 @@ namespace boost const Policy&); template - typename tools::promote_args::type sin_pi(T x, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type sin_pi(T x, const Policy&); template - typename tools::promote_args::type sin_pi(T x); + BOOST_GPU_ENABLED typename tools::promote_args::type sin_pi(T x); template - typename tools::promote_args::type cos_pi(T x, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type cos_pi(T x, const Policy&); template - typename tools::promote_args::type cos_pi(T x); + BOOST_GPU_ENABLED typename tools::promote_args::type cos_pi(T x); template - int fpclassify BOOST_NO_MACRO_EXPAND(T t); + BOOST_GPU_ENABLED int fpclassify BOOST_NO_MACRO_EXPAND(T t); template - bool isfinite BOOST_NO_MACRO_EXPAND(T z); + BOOST_GPU_ENABLED bool isfinite BOOST_NO_MACRO_EXPAND(T z); template - bool isinf BOOST_NO_MACRO_EXPAND(T t); + BOOST_GPU_ENABLED bool isinf BOOST_NO_MACRO_EXPAND(T t); template - bool isnan BOOST_NO_MACRO_EXPAND(T t); + BOOST_GPU_ENABLED bool isnan BOOST_NO_MACRO_EXPAND(T t); template - bool isnormal BOOST_NO_MACRO_EXPAND(T t); + BOOST_GPU_ENABLED bool isnormal 
BOOST_NO_MACRO_EXPAND(T t); template - int signbit BOOST_NO_MACRO_EXPAND(T x); + BOOST_GPU_ENABLED int signbit BOOST_NO_MACRO_EXPAND(T x); template - int sign BOOST_NO_MACRO_EXPAND(const T& z); + BOOST_GPU_ENABLED int sign BOOST_NO_MACRO_EXPAND(const T& z); template - typename tools::promote_args_permissive::type copysign BOOST_NO_MACRO_EXPAND(const T& x, const U& y); + BOOST_GPU_ENABLED typename tools::promote_args_permissive::type copysign BOOST_NO_MACRO_EXPAND(const T& x, const U& y); template - typename tools::promote_args_permissive::type changesign BOOST_NO_MACRO_EXPAND(const T& z); + BOOST_GPU_ENABLED typename tools::promote_args_permissive::type changesign BOOST_NO_MACRO_EXPAND(const T& z); // Exponential integrals: namespace detail{ @@ -1019,32 +1019,32 @@ namespace boost // pow: template - typename tools::promote_args::type pow(T base, const Policy& policy); + BOOST_GPU_ENABLED BOOST_CONSTEXPR typename tools::promote_args::type pow(T base, const Policy& policy); template - typename tools::promote_args::type pow(T base); + BOOST_GPU_ENABLED BOOST_CONSTEXPR typename tools::promote_args::type pow(T base); // next: template - typename tools::promote_args::type nextafter(const T&, const U&, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type nextafter(const T&, const U&, const Policy&); template - typename tools::promote_args::type nextafter(const T&, const U&); + BOOST_GPU_ENABLED typename tools::promote_args::type nextafter(const T&, const U&); template - typename tools::promote_args::type float_next(const T&, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type float_next(const T&, const Policy&); template - typename tools::promote_args::type float_next(const T&); + BOOST_GPU_ENABLED typename tools::promote_args::type float_next(const T&); template - typename tools::promote_args::type float_prior(const T&, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type float_prior(const T&, const Policy&); 
template - typename tools::promote_args::type float_prior(const T&); + BOOST_GPU_ENABLED typename tools::promote_args::type float_prior(const T&); template - typename tools::promote_args::type float_distance(const T&, const U&, const Policy&); + BOOST_GPU_ENABLED typename tools::promote_args::type float_distance(const T&, const U&, const Policy&); template - typename tools::promote_args::type float_distance(const T&, const U&); + BOOST_GPU_ENABLED typename tools::promote_args::type float_distance(const T&, const U&); template - typename tools::promote_args::type float_advance(T val, int distance, const Policy& pol); + BOOST_GPU_ENABLED typename tools::promote_args::type float_advance(T val, int distance, const Policy& pol); template - typename tools::promote_args::type float_advance(const T& val, int distance); + BOOST_GPU_ENABLED typename tools::promote_args::type float_advance(const T& val, int distance); template typename tools::promote_args::type ulp(const T& val, const Policy& pol); diff --git a/include/boost/math/special_functions/modf.hpp b/include/boost/math/special_functions/modf.hpp index 3ce74e7aa3..20321664d7 100644 --- a/include/boost/math/special_functions/modf.hpp +++ b/include/boost/math/special_functions/modf.hpp @@ -17,50 +17,50 @@ namespace boost{ namespace math{ template -inline T modf(const T& v, T* ipart, const Policy& pol) +inline BOOST_GPU_ENABLED T modf(const T& v, T* ipart, const Policy& pol) { *ipart = trunc(v, pol); return v - *ipart; } template -inline T modf(const T& v, T* ipart) +inline BOOST_GPU_ENABLED T modf(const T& v, T* ipart) { return modf(v, ipart, policies::policy<>()); } template -inline T modf(const T& v, int* ipart, const Policy& pol) +inline BOOST_GPU_ENABLED T modf(const T& v, int* ipart, const Policy& pol) { *ipart = itrunc(v, pol); return v - *ipart; } template -inline T modf(const T& v, int* ipart) +inline BOOST_GPU_ENABLED T modf(const T& v, int* ipart) { return modf(v, ipart, policies::policy<>()); } template 
-inline T modf(const T& v, long* ipart, const Policy& pol) +inline BOOST_GPU_ENABLED T modf(const T& v, long* ipart, const Policy& pol) { *ipart = ltrunc(v, pol); return v - *ipart; } template -inline T modf(const T& v, long* ipart) +inline BOOST_GPU_ENABLED T modf(const T& v, long* ipart) { return modf(v, ipart, policies::policy<>()); } #ifdef BOOST_HAS_LONG_LONG template -inline T modf(const T& v, boost::long_long_type* ipart, const Policy& pol) +inline BOOST_GPU_ENABLED T modf(const T& v, boost::long_long_type* ipart, const Policy& pol) { *ipart = lltrunc(v, pol); return v - *ipart; } template -inline T modf(const T& v, boost::long_long_type* ipart) +inline BOOST_GPU_ENABLED T modf(const T& v, boost::long_long_type* ipart) { return modf(v, ipart, policies::policy<>()); } diff --git a/include/boost/math/special_functions/next.hpp b/include/boost/math/special_functions/next.hpp index a63983e1c3..5b1fd0f682 100644 --- a/include/boost/math/special_functions/next.hpp +++ b/include/boost/math/special_functions/next.hpp @@ -80,14 +80,14 @@ inline T normalize_value(const T& val, const mpl::true_&) } template -inline T get_smallest_value(mpl::true_ const&) +inline BOOST_GPU_ENABLED T get_smallest_value(mpl::true_ const&) { // // numeric_limits lies about denorms being present - particularly // when this can be turned on or off at runtime, as is the case // when using the SSE2 registers in DAZ or FTZ mode. // - static const T m = std::numeric_limits::denorm_min(); + BOOST_MATH_GPU_STATIC const T m = std::numeric_limits::denorm_min(); #ifdef BOOST_MATH_CHECK_SSE2 return (_mm_getcsr() & (_MM_FLUSH_ZERO_ON | 0x40)) ? 
tools::min_value() : m;; #else @@ -96,15 +96,17 @@ inline T get_smallest_value(mpl::true_ const&) } template -inline T get_smallest_value(mpl::false_ const&) +inline BOOST_GPU_ENABLED T get_smallest_value(mpl::false_ const&) { return tools::min_value(); } template -inline T get_smallest_value() +inline BOOST_GPU_ENABLED T get_smallest_value() { -#if defined(BOOST_MSVC) && (BOOST_MSVC <= 1310) +#ifdef __CUDA_ARCH__ + return get_smallest_value(mpl::bool_()); +#elif defined(BOOST_MSVC) && (BOOST_MSVC <= 1310) return get_smallest_value(mpl::bool_::is_specialized && (std::numeric_limits::has_denorm == 1)>()); #else return get_smallest_value(mpl::bool_::is_specialized && (std::numeric_limits::has_denorm == std::denorm_present)>()); @@ -116,25 +118,25 @@ inline T get_smallest_value() // we calculate the value of the least-significant-bit: // template -T get_min_shift_value(); +BOOST_GPU_ENABLED T get_min_shift_value(); template struct min_shift_initializer { struct init { - init() + BOOST_GPU_ENABLED init() { do_init(); } - static void do_init() + static BOOST_GPU_ENABLED void do_init() { get_min_shift_value(); } - void force_instantiate()const{} + BOOST_GPU_ENABLED void force_instantiate()const{} }; static const init initializer; - static void force_instantiate() + static BOOST_GPU_ENABLED void force_instantiate() { initializer.force_instantiate(); } @@ -144,13 +146,13 @@ template const typename min_shift_initializer::init min_shift_initializer::initializer; template -inline T calc_min_shifted(const mpl::true_&) +inline BOOST_GPU_ENABLED T calc_min_shifted(const mpl::true_&) { BOOST_MATH_STD_USING return ldexp(tools::min_value(), tools::digits() + 1); } template -inline T calc_min_shifted(const mpl::false_&) +inline BOOST_GPU_ENABLED T calc_min_shifted(const mpl::false_&) { BOOST_STATIC_ASSERT(std::numeric_limits::is_specialized); BOOST_STATIC_ASSERT(std::numeric_limits::radix != 2); @@ -160,7 +162,7 @@ inline T calc_min_shifted(const mpl::false_&) template -inline T 
get_min_shift_value() +inline BOOST_GPU_ENABLED T get_min_shift_value() { static const T val = calc_min_shifted(mpl::bool_::is_specialized || std::numeric_limits::radix == 2>()); min_shift_initializer::force_instantiate(); @@ -169,11 +171,11 @@ inline T get_min_shift_value() } template -T float_next_imp(const T& val, const mpl::true_&, const Policy& pol) +BOOST_GPU_ENABLED T float_next_imp(const T& val, const mpl::true_&, const Policy& pol) { BOOST_MATH_STD_USING int expon; - static const char* function = "float_next<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "float_next<%1%>(%1%)"; int fpclass = (boost::math::fpclassify)(val); @@ -213,7 +215,7 @@ T float_next_imp(const T& val, const mpl::true_&, const Policy& pol) // Special version for some base other than 2: // template -T float_next_imp(const T& val, const mpl::false_&, const Policy& pol) +BOOST_GPU_ENABLED T float_next_imp(const T& val, const mpl::false_&, const Policy& pol) { BOOST_STATIC_ASSERT(std::numeric_limits::is_specialized); BOOST_STATIC_ASSERT(std::numeric_limits::radix != 2); @@ -231,7 +233,7 @@ T float_next_imp(const T& val, const mpl::false_&, const Policy& pol) return policies::raise_domain_error( function, "Argument must be finite, but got %1%", val, pol); - } +} if(val >= tools::max_value()) return policies::raise_overflow_error(function, 0, pol); @@ -247,7 +249,7 @@ T float_next_imp(const T& val, const mpl::false_&, const Policy& pol) // This avoids issues with the Intel SSE2 registers when the FTZ or DAZ flags are set. 
// return scalbn(float_next(T(scalbn(val, 2 * std::numeric_limits::digits)), pol), -2 * std::numeric_limits::digits); - } +} expon = 1 + ilogb(val); if(-1 == scalbn(val, -expon) * std::numeric_limits::radix) @@ -261,7 +263,7 @@ T float_next_imp(const T& val, const mpl::false_&, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type float_next(const T& val, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type float_next(const T& val, const Policy& pol) { typedef typename tools::promote_args::type result_type; return detail::float_next_imp(detail::normalize_value(static_cast(val), typename detail::has_hidden_guard_digits::type()), mpl::bool_::is_specialized || (std::numeric_limits::radix == 2)>(), pol); @@ -276,7 +278,7 @@ inline typename tools::promote_args::type float_next(const T& val, const Poli template inline double float_next(const double& val, const Policy& pol) { - static const char* function = "float_next<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "float_next<%1%>(%1%)"; if(!(boost::math::isfinite)(val) && (val > 0)) return policies::raise_domain_error( @@ -291,7 +293,7 @@ inline double float_next(const double& val, const Policy& pol) #endif template -inline typename tools::promote_args::type float_next(const T& val) +inline BOOST_GPU_ENABLED typename tools::promote_args::type float_next(const T& val) { return float_next(val, policies::policy<>()); } @@ -299,11 +301,11 @@ inline typename tools::promote_args::type float_next(const T& val) namespace detail{ template -T float_prior_imp(const T& val, const mpl::true_&, const Policy& pol) +BOOST_GPU_ENABLED T float_prior_imp(const T& val, const mpl::true_&, const Policy& pol) { BOOST_MATH_STD_USING int expon; - static const char* function = "float_prior<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "float_prior<%1%>(%1%)"; int fpclass = (boost::math::fpclassify)(val); @@ -344,7 +346,7 @@ T float_prior_imp(const T& val, 
const mpl::true_&, const Policy& pol) // Special version for bases other than 2: // template -T float_prior_imp(const T& val, const mpl::false_&, const Policy& pol) +BOOST_GPU_ENABLED T float_prior_imp(const T& val, const mpl::false_&, const Policy& pol) { BOOST_STATIC_ASSERT(std::numeric_limits::is_specialized); BOOST_STATIC_ASSERT(std::numeric_limits::radix != 2); @@ -362,7 +364,7 @@ T float_prior_imp(const T& val, const mpl::false_&, const Policy& pol) return policies::raise_domain_error( function, "Argument must be finite, but got %1%", val, pol); - } +} if(val <= -tools::max_value()) return -policies::raise_overflow_error(function, 0, pol); @@ -378,7 +380,7 @@ T float_prior_imp(const T& val, const mpl::false_&, const Policy& pol) // This avoids issues with the Intel SSE2 registers when the FTZ or DAZ flags are set. // return scalbn(float_prior(T(scalbn(val, 2 * std::numeric_limits::digits)), pol), -2 * std::numeric_limits::digits); - } +} expon = 1 + ilogb(val); T remain = scalbn(val, -expon); @@ -393,7 +395,7 @@ T float_prior_imp(const T& val, const mpl::false_&, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type float_prior(const T& val, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type float_prior(const T& val, const Policy& pol) { typedef typename tools::promote_args::type result_type; return detail::float_prior_imp(detail::normalize_value(static_cast(val), typename detail::has_hidden_guard_digits::type()), mpl::bool_::is_specialized || (std::numeric_limits::radix == 2)>(), pol); @@ -408,7 +410,7 @@ inline typename tools::promote_args::type float_prior(const T& val, const Pol template inline double float_prior(const double& val, const Policy& pol) { - static const char* function = "float_prior<%1%>(%1%)"; + BOOST_MATH_GPU_STATIC const char* function = "float_prior<%1%>(%1%)"; if(!(boost::math::isfinite)(val) && (val < 0)) return policies::raise_domain_error( @@ -423,20 +425,20 @@ 
inline double float_prior(const double& val, const Policy& pol) #endif template -inline typename tools::promote_args::type float_prior(const T& val) +inline BOOST_GPU_ENABLED typename tools::promote_args::type float_prior(const T& val) { return float_prior(val, policies::policy<>()); } template -inline typename tools::promote_args::type nextafter(const T& val, const U& direction, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type nextafter(const T& val, const U& direction, const Policy& pol) { typedef typename tools::promote_args::type result_type; return val < direction ? boost::math::float_next(val, pol) : val == direction ? val : boost::math::float_prior(val, pol); } template -inline typename tools::promote_args::type nextafter(const T& val, const U& direction) +inline BOOST_GPU_ENABLED typename tools::promote_args::type nextafter(const T& val, const U& direction) { return nextafter(val, direction, policies::policy<>()); } @@ -444,13 +446,13 @@ inline typename tools::promote_args::type nextafter(const T& val, const U& namespace detail{ template -T float_distance_imp(const T& a, const T& b, const mpl::true_&, const Policy& pol) +BOOST_GPU_ENABLED T float_distance_imp(const T& a, const T& b, const mpl::true_&, const Policy& pol) { BOOST_MATH_STD_USING // // Error handling: // - static const char* function = "float_distance<%1%>(%1%, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "float_distance<%1%>(%1%, %1%)"; if(!(boost::math::isfinite)(a)) return policies::raise_domain_error( function, @@ -480,8 +482,8 @@ T float_distance_imp(const T& a, const T& b, const mpl::true_&, const Policy& po if(a < 0) return float_distance(static_cast(-b), static_cast(-a), pol); - BOOST_ASSERT(a >= 0); - BOOST_ASSERT(b >= a); + BOOST_MATH_ASSERT(a >= 0); + BOOST_MATH_ASSERT(b >= a); int expon; // @@ -519,7 +521,7 @@ T float_distance_imp(const T& a, const T& b, const mpl::true_&, const Policy& po // T a2 = ldexp(a, tools::digits()); T b2 = ldexp(b, 
tools::digits()); - mb = -(std::min)(T(ldexp(upper, tools::digits())), b2); + mb = -BOOST_MATH_CUDA_SAFE_MIN(T(ldexp(upper, tools::digits())), b2); x = a2 + mb; z = x - a2; y = (a2 - (x - z)) + (mb - z); @@ -528,7 +530,7 @@ T float_distance_imp(const T& a, const T& b, const mpl::true_&, const Policy& po } else { - mb = -(std::min)(upper, b); + mb = -BOOST_MATH_CUDA_SAFE_MIN(upper, b); x = a + mb; z = x - a; y = (a - (x - z)) + (mb - z); @@ -542,14 +544,14 @@ T float_distance_imp(const T& a, const T& b, const mpl::true_&, const Policy& po // // Result must be an integer: // - BOOST_ASSERT(result == floor(result)); + BOOST_MATH_ASSERT(result == floor(result)); return result; } // float_distance_imp // // Special versions for bases other than 2: // template -T float_distance_imp(const T& a, const T& b, const mpl::false_&, const Policy& pol) +BOOST_GPU_ENABLED T float_distance_imp(const T& a, const T& b, const mpl::false_&, const Policy& pol) { BOOST_STATIC_ASSERT(std::numeric_limits::is_specialized); BOOST_STATIC_ASSERT(std::numeric_limits::radix != 2); @@ -610,7 +612,7 @@ T float_distance_imp(const T& a, const T& b, const mpl::false_&, const Policy& p T upper2 = scalbn(T(1), expon2 - 1); result = float_distance(upper2, b); result += (expon2 - expon - 1) * scalbn(T(1), std::numeric_limits::digits - 1); - } +} // // Use compensated double-double addition to avoid rounding // errors in the subtraction: @@ -626,7 +628,7 @@ T float_distance_imp(const T& a, const T& b, const mpl::false_&, const Policy& p // T a2 = scalbn(a, std::numeric_limits::digits); T b2 = scalbn(b, std::numeric_limits::digits); - mb = -(std::min)(T(scalbn(upper, std::numeric_limits::digits)), b2); + mb = -BOOST_MATH_CUDA_SAFE_MIN(T(scalbn(upper, std::numeric_limits::digits)), b2); x = a2 + mb; z = x - a2; y = (a2 - (x - z)) + (mb - z); @@ -635,7 +637,7 @@ T float_distance_imp(const T& a, const T& b, const mpl::false_&, const Policy& p } else { - mb = -(std::min)(upper, b); + mb = 
-BOOST_MATH_CUDA_SAFE_MIN(upper, b); x = a + mb; z = x - a; y = (a - (x - z)) + (mb - z); @@ -656,14 +658,14 @@ T float_distance_imp(const T& a, const T& b, const mpl::false_&, const Policy& p } // namespace detail template -inline typename tools::promote_args::type float_distance(const T& a, const U& b, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type float_distance(const T& a, const U& b, const Policy& pol) { typedef typename tools::promote_args::type result_type; return detail::float_distance_imp(detail::normalize_value(static_cast(a), typename detail::has_hidden_guard_digits::type()), detail::normalize_value(static_cast(b), typename detail::has_hidden_guard_digits::type()), mpl::bool_::is_specialized || (std::numeric_limits::radix == 2)>(), pol); } template -typename tools::promote_args::type float_distance(const T& a, const U& b) +inline BOOST_GPU_ENABLED typename tools::promote_args::type float_distance(const T& a, const U& b) { return boost::math::float_distance(a, b, policies::policy<>()); } @@ -671,13 +673,13 @@ typename tools::promote_args::type float_distance(const T& a, const U& b) namespace detail{ template -T float_advance_imp(T val, int distance, const mpl::true_&, const Policy& pol) +BOOST_GPU_ENABLED T float_advance_imp(T val, int distance, const mpl::true_&, const Policy& pol) { BOOST_MATH_STD_USING // // Error handling: // - static const char* function = "float_advance<%1%>(%1%, int)"; + BOOST_MATH_GPU_STATIC const char* function = "float_advance<%1%>(%1%, int)"; int fpclass = (boost::math::fpclassify)(val); @@ -754,7 +756,7 @@ T float_advance_imp(T val, int distance, const mpl::true_&, const Policy& pol) // Special version for bases other than 2: // template -T float_advance_imp(T val, int distance, const mpl::false_&, const Policy& pol) +BOOST_GPU_ENABLED T float_advance_imp(T val, int distance, const mpl::false_&, const Policy& pol) { BOOST_STATIC_ASSERT(std::numeric_limits::is_specialized); 
BOOST_STATIC_ASSERT(std::numeric_limits::radix != 2); @@ -791,7 +793,7 @@ T float_advance_imp(T val, int distance, const mpl::false_&, const Policy& pol) if(distance > 0) { do{ val = float_next(val, pol); } while(--distance); - } +} else { do{ val = float_prior(val, pol); } while(++distance); @@ -804,7 +806,7 @@ T float_advance_imp(T val, int distance, const mpl::false_&, const Policy& pol) if(val <= tools::min_value()) { limit = sign(T(distance)) * tools::min_value(); - } +} T limit_distance = float_distance(val, limit); while(fabs(limit_distance) < abs(distance)) { @@ -840,14 +842,14 @@ T float_advance_imp(T val, int distance, const mpl::false_&, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type float_advance(T val, int distance, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type float_advance(T val, int distance, const Policy& pol) { typedef typename tools::promote_args::type result_type; return detail::float_advance_imp(detail::normalize_value(static_cast(val), typename detail::has_hidden_guard_digits::type()), distance, mpl::bool_::is_specialized || (std::numeric_limits::radix == 2)>(), pol); } template -inline typename tools::promote_args::type float_advance(const T& val, int distance) +inline BOOST_GPU_ENABLED typename tools::promote_args::type float_advance(const T& val, int distance) { return boost::math::float_advance(val, distance, policies::policy<>()); } diff --git a/include/boost/math/special_functions/owens_t.hpp b/include/boost/math/special_functions/owens_t.hpp index 7fbd8918c0..a6c31e68c5 100644 --- a/include/boost/math/special_functions/owens_t.hpp +++ b/include/boost/math/special_functions/owens_t.hpp @@ -117,7 +117,7 @@ namespace boost { static const unsigned short ord[] = {2, 3, 4, 5, 7, 10, 12, 18, 10, 20, 30, 0, 4, 7, 8, 20, 0, 0}; // 18 entries - BOOST_ASSERT(icode<18); + BOOST_MATH_ASSERT(icode<18); return ord[icode]; } // unsigned short owens_t_get_order(const 
unsigned short icode, RealType, mpl::int<53> const&) @@ -128,7 +128,7 @@ namespace boost // method ================>>> {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 4, 4, 4, 4, 5, 6} static const unsigned short ord[] = {3, 4, 5, 6, 8, 11, 13, 19, 10, 20, 30, 0, 7, 10, 11, 23, 0, 0}; // 18 entries - BOOST_ASSERT(icode<18); + BOOST_MATH_ASSERT(icode<18); return ord[icode]; } // unsigned short owens_t_get_order(const unsigned short icode, RealType, mpl::int<64> const&) @@ -254,7 +254,7 @@ namespace boost while( true ) { - BOOST_ASSERT(i < 21); + BOOST_MATH_ASSERT(i < 21); val += zi*c2[i]; if( m <= i ) // if( m < i+1 ) { @@ -326,7 +326,7 @@ namespace boost while( true ) { - BOOST_ASSERT(i < 31); + BOOST_MATH_ASSERT(i < 31); val += zi*c2[i]; if( m <= i ) // if( m < i+1 ) { @@ -425,7 +425,7 @@ namespace boost RealType val = 0; for(unsigned short i = 0; i < m; ++i) { - BOOST_ASSERT(i < 13); + BOOST_MATH_ASSERT(i < 13); const RealType r = static_cast(1) + as*pts[i]; val += wts[i] * exp( hs*r ) / r; } // for(unsigned short i = 0; i < m; ++i) @@ -497,7 +497,7 @@ namespace boost RealType val = 0; for(unsigned short i = 0; i < m; ++i) { - BOOST_ASSERT(i < 19); + BOOST_MATH_ASSERT(i < 19); const RealType r = 1 + as*pts[i]; val += wts[i] * exp( hs*r ) / r; } // for(unsigned short i = 0; i < m; ++i) diff --git a/include/boost/math/special_functions/pow.hpp b/include/boost/math/special_functions/pow.hpp index 494f721d05..07f4b24241 100644 --- a/include/boost/math/special_functions/pow.hpp +++ b/include/boost/math/special_functions/pow.hpp @@ -30,15 +30,17 @@ namespace math { namespace detail { + template + inline BOOST_GPU_ENABLED BOOST_CONSTEXPR T square(const T& val) { return val * val; } + template struct positive_power { template - static T result(T base) + static BOOST_GPU_ENABLED BOOST_CXX14_CONSTEXPR T result(T base) { - T power = positive_power::result(base); - return power * power; + return square(positive_power::result(base)); } }; @@ -46,10 +48,9 @@ template struct 
positive_power { template - static T result(T base) + static BOOST_GPU_ENABLED BOOST_CONSTEXPR T result(T base) { - T power = positive_power::result(base); - return base * power * power; + return base * square(positive_power::result(base)); } }; @@ -57,7 +58,7 @@ template <> struct positive_power<1, 1> { template - static T result(T base){ return base; } + static BOOST_GPU_ENABLED BOOST_CONSTEXPR T result(T base){ return base; } }; @@ -65,7 +66,7 @@ template struct power_if_positive { template - static T result(T base, const Policy&) + static BOOST_GPU_ENABLED BOOST_CONSTEXPR T result(T base, const Policy&) { return positive_power::result(base); } }; @@ -73,18 +74,13 @@ template struct power_if_positive { template - static T result(T base, const Policy& policy) + static BOOST_GPU_ENABLED BOOST_CONSTEXPR T result(T base, const Policy& policy) { - if (base == 0) - { - return policies::raise_overflow_error( + return base == 0 ? policies::raise_overflow_error( "boost::math::pow(%1%)", "Attempted to compute a negative power of 0", policy - ); - } - - return T(1) / positive_power<-N>::result(base); + ) : T(1) / positive_power<-N>::result(base); } }; @@ -92,20 +88,15 @@ template <> struct power_if_positive<0, true> { template - static T result(T base, const Policy& policy) + static BOOST_GPU_ENABLED BOOST_CONSTEXPR T result(T base, const Policy& policy) { - if (base == 0) - { - return policies::raise_indeterminate_result_error( + return base == 0 ? 
policies::raise_indeterminate_result_error( "boost::math::pow(%1%)", "The result of pow<0>(%1%) is undetermined", base, T(1), policy - ); - } - - return T(1); + ) : T(1); } }; @@ -126,15 +117,14 @@ struct select_power_if_positive template -inline typename tools::promote_args::type pow(T base, const Policy& policy) +inline BOOST_GPU_ENABLED BOOST_CONSTEXPR typename tools::promote_args::type pow(T base, const Policy& policy) { - typedef typename tools::promote_args::type result_type; - return detail::select_power_if_positive::type::result(static_cast(base), policy); + return detail::select_power_if_positive::type::result(static_cast::type>(base), policy); } template -inline typename tools::promote_args::type pow(T base) +inline BOOST_GPU_ENABLED BOOST_CONSTEXPR typename tools::promote_args::type pow(T base) { return pow(base, policies::policy<>()); } #ifdef BOOST_MSVC diff --git a/include/boost/math/special_functions/powm1.hpp b/include/boost/math/special_functions/powm1.hpp index 37db8746c8..71512af05d 100644 --- a/include/boost/math/special_functions/powm1.hpp +++ b/include/boost/math/special_functions/powm1.hpp @@ -21,10 +21,10 @@ namespace boost{ namespace math{ namespace detail{ template -inline T powm1_imp(const T x, const T y, const Policy& pol) +inline BOOST_GPU_ENABLED T powm1_imp(const T x, const T y, const Policy& pol) { BOOST_MATH_STD_USING - static const char* function = "boost::math::powm1<%1%>(%1%, %1%)"; + BOOST_MATH_GPU_STATIC const char* function = "boost::math::powm1<%1%>(%1%, %1%)"; if (x > 0) { @@ -54,7 +54,7 @@ inline T powm1_imp(const T x, const T y, const Policy& pol) } // detail template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename tools::promote_args::type powm1(const T1 a, const T2 z) { typedef typename tools::promote_args::type result_type; @@ -62,7 +62,7 @@ inline typename tools::promote_args::type } template -inline typename tools::promote_args::type +inline BOOST_GPU_ENABLED typename 
tools::promote_args::type powm1(const T1 a, const T2 z, const Policy& pol) { typedef typename tools::promote_args::type result_type; diff --git a/include/boost/math/special_functions/round.hpp b/include/boost/math/special_functions/round.hpp index e21f7185d1..cb5990a675 100644 --- a/include/boost/math/special_functions/round.hpp +++ b/include/boost/math/special_functions/round.hpp @@ -20,7 +20,7 @@ namespace boost{ namespace math{ namespace detail{ template -inline typename tools::promote_args::type round(const T& v, const Policy& pol, const mpl::false_) +inline BOOST_GPU_ENABLED typename tools::promote_args::type round(const T& v, const Policy& pol, const mpl::false_) { BOOST_MATH_STD_USING typedef typename tools::promote_args::type result_type; @@ -52,7 +52,7 @@ inline typename tools::promote_args::type round(const T& v, const Policy& pol } } template -inline typename tools::promote_args::type round(const T& v, const Policy&, const mpl::true_) +inline BOOST_GPU_ENABLED typename tools::promote_args::type round(const T& v, const Policy&, const mpl::true_) { return v; } @@ -60,12 +60,12 @@ inline typename tools::promote_args::type round(const T& v, const Policy&, co } // namespace detail template -inline typename tools::promote_args::type round(const T& v, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type round(const T& v, const Policy& pol) { return detail::round(v, pol, mpl::bool_::value>()); } template -inline typename tools::promote_args::type round(const T& v) +inline BOOST_GPU_ENABLED typename tools::promote_args::type round(const T& v) { return round(v, policies::policy<>()); } @@ -79,31 +79,39 @@ inline typename tools::promote_args::type round(const T& v) // dependent lookup. See our concept archetypes for examples. 
// template -inline int iround(const T& v, const Policy& pol) +inline BOOST_GPU_ENABLED int iround(const T& v, const Policy& pol) { BOOST_MATH_STD_USING T r = boost::math::round(v, pol); +#ifdef __CUDA_ARCH__ + if((r > INT_MAX) || (r < INT_MIN)) +#else if((r > (std::numeric_limits::max)()) || (r < (std::numeric_limits::min)())) +#endif return static_cast(policies::raise_rounding_error("boost::math::iround<%1%>(%1%)", 0, v, 0, pol)); return static_cast(r); } template -inline int iround(const T& v) +inline BOOST_GPU_ENABLED int iround(const T& v) { return iround(v, policies::policy<>()); } template -inline long lround(const T& v, const Policy& pol) +inline BOOST_GPU_ENABLED long lround(const T& v, const Policy& pol) { BOOST_MATH_STD_USING T r = boost::math::round(v, pol); +#ifdef __CUDA_ARCH__ + if((r > LONG_MAX) || (r < LONG_MIN)) +#else if((r > (std::numeric_limits::max)()) || (r < (std::numeric_limits::min)())) +#endif return static_cast(policies::raise_rounding_error("boost::math::lround<%1%>(%1%)", 0, v, 0L, pol)); return static_cast(r); } template -inline long lround(const T& v) +inline BOOST_GPU_ENABLED long lround(const T& v) { return lround(v, policies::policy<>()); } @@ -111,16 +119,20 @@ inline long lround(const T& v) #ifdef BOOST_HAS_LONG_LONG template -inline boost::long_long_type llround(const T& v, const Policy& pol) +inline BOOST_GPU_ENABLED boost::long_long_type llround(const T& v, const Policy& pol) { BOOST_MATH_STD_USING T r = boost::math::round(v, pol); +#ifdef __CUDA_ARCH__ + if((r > LLONG_MAX) || (r < LLONG_MIN)) +#else if((r > (std::numeric_limits::max)()) || (r < (std::numeric_limits::min)())) +#endif return static_cast(policies::raise_rounding_error("boost::math::llround<%1%>(%1%)", 0, v, static_cast(0), pol)); return static_cast(r); } template -inline boost::long_long_type llround(const T& v) +inline BOOST_GPU_ENABLED boost::long_long_type llround(const T& v) { return llround(v, policies::policy<>()); } diff --git 
a/include/boost/math/special_functions/sign.hpp b/include/boost/math/special_functions/sign.hpp index 5cb21bac54..969c977603 100644 --- a/include/boost/math/special_functions/sign.hpp +++ b/include/boost/math/special_functions/sign.hpp @@ -25,7 +25,7 @@ namespace detail { #ifdef BOOST_MATH_USE_STD_FPCLASSIFY template - inline int signbit_impl(T x, native_tag const&) + inline BOOST_GPU_ENABLED int signbit_impl(T x, native_tag const&) { return (std::signbit)(x) ? 1 : 0; } @@ -35,13 +35,13 @@ namespace detail { // signed zero or NaN. template - inline int signbit_impl(T x, generic_tag const&) + inline BOOST_GPU_ENABLED int signbit_impl(T x, generic_tag const&) { return x < 0; } template - inline int signbit_impl(T x, generic_tag const&) + inline BOOST_GPU_ENABLED int signbit_impl(T x, generic_tag const&) { return x < 0; } @@ -54,18 +54,18 @@ namespace detail { // can occur since the exponents are the same magnitude // for the two types: // - inline int signbit_impl(long double x, generic_tag const&) + inline BOOST_GPU_ENABLED int signbit_impl(long double x, generic_tag const&) { return (boost::math::signbit)(static_cast(x)); } - inline int signbit_impl(long double x, generic_tag const&) + inline BOOST_GPU_ENABLED int signbit_impl(long double x, generic_tag const&) { return (boost::math::signbit)(static_cast(x)); } #endif template - inline int signbit_impl(T x, ieee_copy_all_bits_tag const&) + inline BOOST_GPU_ENABLED int signbit_impl(T x, ieee_copy_all_bits_tag const&) { typedef BOOST_DEDUCED_TYPENAME fp_traits::type traits; @@ -75,7 +75,7 @@ namespace detail { } template - inline int signbit_impl(T x, ieee_copy_leading_bits_tag const&) + inline BOOST_GPU_ENABLED int signbit_impl(T x, ieee_copy_leading_bits_tag const&) { typedef BOOST_DEDUCED_TYPENAME fp_traits::type traits; @@ -91,13 +91,13 @@ namespace detail { // signed zero or NaN. 
template - inline T (changesign_impl)(T x, generic_tag const&) + inline BOOST_GPU_ENABLED T (changesign_impl)(T x, generic_tag const&) { return -x; } template - inline T (changesign_impl)(T x, generic_tag const&) + inline BOOST_GPU_ENABLED T (changesign_impl)(T x, generic_tag const&) { return -x; } @@ -107,14 +107,14 @@ namespace detail { // in this case we need to change the sign of both // components of the "double double": // - inline long double (changesign_impl)(long double x, generic_tag const&) + inline BOOST_GPU_ENABLED long double (changesign_impl)(long double x, generic_tag const&) { double* pd = reinterpret_cast(&x); pd[0] = boost::math::changesign(pd[0]); pd[1] = boost::math::changesign(pd[1]); return x; } - inline long double (changesign_impl)(long double x, generic_tag const&) + inline BOOST_GPU_ENABLED long double (changesign_impl)(long double x, generic_tag const&) { double* pd = reinterpret_cast(&x); pd[0] = boost::math::changesign(pd[0]); @@ -124,7 +124,7 @@ namespace detail { #endif template - inline T changesign_impl(T x, ieee_copy_all_bits_tag const&) + inline BOOST_GPU_ENABLED T changesign_impl(T x, ieee_copy_all_bits_tag const&) { typedef BOOST_DEDUCED_TYPENAME fp_traits::sign_change_type traits; @@ -136,7 +136,7 @@ namespace detail { } template - inline T (changesign_impl)(T x, ieee_copy_leading_bits_tag const&) + inline BOOST_GPU_ENABLED T (changesign_impl)(T x, ieee_copy_leading_bits_tag const&) { typedef BOOST_DEDUCED_TYPENAME fp_traits::sign_change_type traits; @@ -150,7 +150,7 @@ namespace detail { } // namespace detail -template int (signbit)(T x) +template BOOST_GPU_ENABLED int (signbit)(T x) { typedef typename detail::fp_traits::type traits; typedef typename traits::method method; @@ -160,12 +160,12 @@ template int (signbit)(T x) } template -inline int sign BOOST_NO_MACRO_EXPAND(const T& z) +inline BOOST_GPU_ENABLED int sign BOOST_NO_MACRO_EXPAND(const T& z) { return (z == 0) ? 0 : (boost::math::signbit)(z) ? 
-1 : 1; } -template typename tools::promote_args_permissive::type (changesign)(const T& x) +template BOOST_GPU_ENABLED typename tools::promote_args_permissive::type (changesign)(const T& x) { //!< \brief return unchanged binary pattern of x, except for change of sign bit. typedef typename detail::fp_traits::sign_change_type traits; typedef typename traits::method method; @@ -176,7 +176,7 @@ template typename tools::promote_args_permissive::type (changesign) } template -inline typename tools::promote_args_permissive::type +inline BOOST_GPU_ENABLED typename tools::promote_args_permissive::type copysign BOOST_NO_MACRO_EXPAND(const T& x, const U& y) { BOOST_MATH_STD_USING diff --git a/include/boost/math/special_functions/sin_pi.hpp b/include/boost/math/special_functions/sin_pi.hpp index ae6b3e7442..e77efdf919 100644 --- a/include/boost/math/special_functions/sin_pi.hpp +++ b/include/boost/math/special_functions/sin_pi.hpp @@ -20,7 +20,7 @@ namespace boost{ namespace math{ namespace detail{ template -T sin_pi_imp(T x, const Policy& pol) +BOOST_GPU_ENABLED T sin_pi_imp(T x, const Policy& pol) { BOOST_MATH_STD_USING // ADL of std names if(x < 0) @@ -53,7 +53,7 @@ T sin_pi_imp(T x, const Policy& pol) } // namespace detail template -inline typename tools::promote_args::type sin_pi(T x, const Policy&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type sin_pi(T x, const Policy&) { typedef typename tools::promote_args::type result_type; typedef typename policies::evaluation::type value_type; @@ -67,7 +67,7 @@ inline typename tools::promote_args::type sin_pi(T x, const Policy&) } template -inline typename tools::promote_args::type sin_pi(T x) +inline BOOST_GPU_ENABLED typename tools::promote_args::type sin_pi(T x) { return boost::math::sin_pi(x, policies::policy<>()); } diff --git a/include/boost/math/special_functions/sinc.hpp b/include/boost/math/special_functions/sinc.hpp index 84fbf0e324..90792cc059 100644 --- a/include/boost/math/special_functions/sinc.hpp +++ 
b/include/boost/math/special_functions/sinc.hpp @@ -39,7 +39,7 @@ namespace boost // This is the "Sinus Cardinal" of index Pi. template - inline T sinc_pi_imp(const T x) + inline BOOST_GPU_ENABLED T sinc_pi_imp(const T x) { BOOST_MATH_STD_USING @@ -77,14 +77,14 @@ namespace boost } // namespace detail template - inline typename tools::promote_args::type sinc_pi(T x) + inline BOOST_GPU_ENABLED typename tools::promote_args::type sinc_pi(T x) { typedef typename tools::promote_args::type result_type; return detail::sinc_pi_imp(static_cast(x)); } template - inline typename tools::promote_args::type sinc_pi(T x, const Policy&) + inline BOOST_GPU_ENABLED typename tools::promote_args::type sinc_pi(T x, const Policy&) { typedef typename tools::promote_args::type result_type; return detail::sinc_pi_imp(static_cast(x)); @@ -92,7 +92,7 @@ namespace boost #ifndef BOOST_NO_TEMPLATE_TEMPLATES template class U> - inline U sinc_pi(const U x) + inline BOOST_GPU_ENABLED U sinc_pi(const U x) { BOOST_MATH_STD_USING using ::std::numeric_limits; @@ -133,7 +133,7 @@ namespace boost } template class U, class Policy> - inline U sinc_pi(const U x, const Policy&) + inline BOOST_GPU_ENABLED U sinc_pi(const U x, const Policy&) { return sinc_pi(x); } diff --git a/include/boost/math/special_functions/trunc.hpp b/include/boost/math/special_functions/trunc.hpp index 3f80c96fee..4adefaab35 100644 --- a/include/boost/math/special_functions/trunc.hpp +++ b/include/boost/math/special_functions/trunc.hpp @@ -18,7 +18,7 @@ namespace boost{ namespace math{ namespace detail{ template -inline typename tools::promote_args::type trunc(const T& v, const Policy& pol, const mpl::false_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type trunc(const T& v, const Policy& pol, const mpl::false_&) { BOOST_MATH_STD_USING typedef typename tools::promote_args::type result_type; @@ -28,7 +28,7 @@ inline typename tools::promote_args::type trunc(const T& v, const Policy& pol } template -inline typename 
tools::promote_args::type trunc(const T& v, const Policy&, const mpl::true_&) +inline BOOST_GPU_ENABLED typename tools::promote_args::type trunc(const T& v, const Policy&, const mpl::true_&) { return v; } @@ -36,12 +36,12 @@ inline typename tools::promote_args::type trunc(const T& v, const Policy&, co } template -inline typename tools::promote_args::type trunc(const T& v, const Policy& pol) +inline BOOST_GPU_ENABLED typename tools::promote_args::type trunc(const T& v, const Policy& pol) { return detail::trunc(v, pol, mpl::bool_::value>()); } template -inline typename tools::promote_args::type trunc(const T& v) +inline BOOST_GPU_ENABLED typename tools::promote_args::type trunc(const T& v) { return trunc(v, policies::policy<>()); } @@ -55,33 +55,41 @@ inline typename tools::promote_args::type trunc(const T& v) // dependent lookup. See our concept archetypes for examples. // template -inline int itrunc(const T& v, const Policy& pol) +inline BOOST_GPU_ENABLED int itrunc(const T& v, const Policy& pol) { BOOST_MATH_STD_USING typedef typename tools::promote_args::type result_type; result_type r = boost::math::trunc(v, pol); +#ifdef __CUDA_ARCH__ + if((r > INT_MAX) || (r < INT_MIN)) +#else if((r > (std::numeric_limits::max)()) || (r < (std::numeric_limits::min)())) +#endif return static_cast(policies::raise_rounding_error("boost::math::itrunc<%1%>(%1%)", 0, static_cast(v), 0, pol)); return static_cast(r); } template -inline int itrunc(const T& v) +inline BOOST_GPU_ENABLED int itrunc(const T& v) { return itrunc(v, policies::policy<>()); } template -inline long ltrunc(const T& v, const Policy& pol) +inline BOOST_GPU_ENABLED long ltrunc(const T& v, const Policy& pol) { BOOST_MATH_STD_USING typedef typename tools::promote_args::type result_type; result_type r = boost::math::trunc(v, pol); +#ifdef __CUDA_ARCH__ + if((r > LONG_MAX) || (r < LONG_MIN)) +#else if((r > (std::numeric_limits::max)()) || (r < (std::numeric_limits::min)())) +#endif return 
static_cast(policies::raise_rounding_error("boost::math::ltrunc<%1%>(%1%)", 0, static_cast(v), 0L, pol)); return static_cast(r); } template -inline long ltrunc(const T& v) +inline BOOST_GPU_ENABLED long ltrunc(const T& v) { return ltrunc(v, policies::policy<>()); } @@ -89,17 +97,21 @@ inline long ltrunc(const T& v) #ifdef BOOST_HAS_LONG_LONG template -inline boost::long_long_type lltrunc(const T& v, const Policy& pol) +inline BOOST_GPU_ENABLED boost::long_long_type lltrunc(const T& v, const Policy& pol) { BOOST_MATH_STD_USING typedef typename tools::promote_args::type result_type; result_type r = boost::math::trunc(v, pol); +#ifdef __CUDA_ARCH__ + if((r > LLONG_MAX) || (r < LLONG_MIN)) +#else if((r > (std::numeric_limits::max)()) || (r < (std::numeric_limits::min)())) +#endif return static_cast(policies::raise_rounding_error("boost::math::lltrunc<%1%>(%1%)", 0, v, static_cast(0), pol)); return static_cast(r); } template -inline boost::long_long_type lltrunc(const T& v) +inline BOOST_GPU_ENABLED boost::long_long_type lltrunc(const T& v) { return lltrunc(v, policies::policy<>()); } diff --git a/include/boost/math/tools/big_constant.hpp b/include/boost/math/tools/big_constant.hpp index a6f7bc0f1a..4f39bf07c3 100644 --- a/include/boost/math/tools/big_constant.hpp +++ b/include/boost/math/tools/big_constant.hpp @@ -62,6 +62,10 @@ inline BOOST_MATH_CONSTEXPR T make_big_value(largest_float, const char* s, mpl:: // // For constants which might fit in a long double (if it's big enough): // +#ifdef __CUDA_ARCH__ +// We don't support anything except float and double: +#define BOOST_MATH_BIG_CONSTANT(T, D, x) static_cast(x) +#else #define BOOST_MATH_BIG_CONSTANT(T, D, x)\ boost::math::tools::make_big_value(\ BOOST_MATH_LARGEST_FLOAT_C(x), \ @@ -72,6 +76,7 @@ inline BOOST_MATH_CONSTEXPR T make_big_value(largest_float, const char* s, mpl:: || (boost::math::tools::numeric_traits::is_specialized && \ (boost::math::tools::numeric_traits::digits10 <= 
boost::math::tools::numeric_traits::digits10))) >(), \ boost::is_convertible()) +#endif // // For constants too huge for any conceivable long double (and which generate compiler errors if we try and declare them as such): // diff --git a/include/boost/math/tools/config.hpp b/include/boost/math/tools/config.hpp index 17bfec16fe..3ce6b70655 100644 --- a/include/boost/math/tools/config.hpp +++ b/include/boost/math/tools/config.hpp @@ -265,6 +265,43 @@ # define BOOST_MATH_INT_VALUE_SUFFIX(RV, SUF) RV##SUF #endif // +// function scope static variables aren't allowed on CUDA device code: +// +#if defined(__CUDA_ARCH__) && defined(BOOST_NO_CXX11_CONSTEXPR) +#define BOOST_MATH_GPU_STATIC +#elif defined(__CUDA_ARCH__) +#define BOOST_MATH_GPU_STATIC constexpr +#else +#define BOOST_MATH_GPU_STATIC static +#endif +// +// Asserts don't work on CUDA either: +// +#ifdef __CUDA_ARCH__ +#define BOOST_MATH_ASSERT(x) +#else +#define BOOST_MATH_ASSERT(x) BOOST_ASSERT(x) +#endif +// +// And std lib functions swap/min/max aren't marked up as device safe: +// +#ifdef __CUDA_ARCH__ +template +inline BOOST_GPU_ENABLED void cuda_safe_swap(T& a, T& b) { T t(a); a = b; b = t; } +template +inline BOOST_GPU_ENABLED T cuda_safe_min(const T& a, const T& b) { return a < b ? a : b; } +template +inline BOOST_GPU_ENABLED T cuda_safe_max(const T& a, const T& b) { return a > b ? 
a : b; } + +# define BOOST_MATH_CUDA_SAFE_SWAP(a, b) cuda_safe_swap(a, b) +# define BOOST_MATH_CUDA_SAFE_MIN(a, b) cuda_safe_min(a, b) +# define BOOST_MATH_CUDA_SAFE_MAX(a, b) cuda_safe_max(a, b) +#else +# define BOOST_MATH_CUDA_SAFE_SWAP(a, b) swap(a, b) +# define BOOST_MATH_CUDA_SAFE_MIN(a, b) (std::min)(a, b) +# define BOOST_MATH_CUDA_SAFE_MAX(a, b) (std::max)(a, b) +#endif +// // And then the actual configuration: // #if defined(_GLIBCXX_USE_FLOAT128) && defined(BOOST_GCC) && !defined(__STRICT_ANSI__) \ diff --git a/include/boost/math/tools/detail/polynomial_horner1_10.hpp b/include/boost/math/tools/detail/polynomial_horner1_10.hpp index 48cf9e39b3..5cc32bfc23 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_10.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_10.hpp @@ -12,67 +12,67 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template 
-inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_11.hpp b/include/boost/math/tools/detail/polynomial_horner1_11.hpp index e52595429e..a54773aad4 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_11.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_11.hpp @@ -12,73 +12,73 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_12.hpp b/include/boost/math/tools/detail/polynomial_horner1_12.hpp index 10cd178975..54bd8f755d 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_12.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_12.hpp @@ -12,79 +12,79 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) 
BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_13.hpp b/include/boost/math/tools/detail/polynomial_horner1_13.hpp index 90fa9ec43d..60d85776cc 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_13.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_13.hpp @@ -12,85 +12,85 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return 
static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + 
a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_14.hpp b/include/boost/math/tools/detail/polynomial_horner1_14.hpp index 389c8063b0..8786979bcb 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_14.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_14.hpp @@ -12,91 +12,91 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git 
a/include/boost/math/tools/detail/polynomial_horner1_15.hpp b/include/boost/math/tools/detail/polynomial_horner1_15.hpp index d196a41603..572562d051 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_15.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_15.hpp @@ -12,97 +12,97 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return 
static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git 
a/include/boost/math/tools/detail/polynomial_horner1_16.hpp b/include/boost/math/tools/detail/polynomial_horner1_16.hpp index fa48c6614c..5ddfe42eb7 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_16.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_16.hpp @@ -12,103 +12,103 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return 
static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_17.hpp b/include/boost/math/tools/detail/polynomial_horner1_17.hpp index 4575aeac7b..aef785ab7d 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_17.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_17.hpp @@ -12,109 +12,109 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& 
x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template 
-inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((((a[16] * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_18.hpp b/include/boost/math/tools/detail/polynomial_horner1_18.hpp index 95dd400416..70ad76b306 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_18.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_18.hpp @@ -12,115 +12,115 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) 
BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, 
const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x 
+ a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((((a[16] * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((((((a[17] * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_19.hpp b/include/boost/math/tools/detail/polynomial_horner1_19.hpp index 6d73eb8ffc..eff44d8e1b 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_19.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_19.hpp @@ -12,121 +12,121 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + 
a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, 
const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((((a[16] * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { 
return static_cast(((((((((((((((((a[17] * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((((((a[18] * x + a[17]) * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_2.hpp b/include/boost/math/tools/detail/polynomial_horner1_2.hpp index 85640edf67..9502347742 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_2.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_2.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } diff --git 
a/include/boost/math/tools/detail/polynomial_horner1_20.hpp b/include/boost/math/tools/detail/polynomial_horner1_20.hpp index f8b7f34384..d5fe038f91 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_20.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_20.hpp @@ -12,127 +12,127 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return 
static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((((a[16] * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((((((a[17] * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((((((((((((a[18] * x + a[17]) * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, 
const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((((((((((((((a[19] * x + a[18]) * x + a[17]) * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_3.hpp b/include/boost/math/tools/detail/polynomial_horner1_3.hpp index c0ad4660e9..4ae37d858c 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_3.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_3.hpp @@ -12,25 +12,25 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_4.hpp 
b/include/boost/math/tools/detail/polynomial_horner1_4.hpp index 27ad74eb95..8bb5cdf43b 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_4.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_4.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_5.hpp b/include/boost/math/tools/detail/polynomial_horner1_5.hpp index 5cfafb4052..8f8ec45a74 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_5.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_5.hpp @@ -12,37 +12,37 
@@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_6.hpp b/include/boost/math/tools/detail/polynomial_horner1_6.hpp index f7f4bee480..faf61aa84c 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_6.hpp +++ 
b/include/boost/math/tools/detail/polynomial_horner1_6.hpp @@ -12,43 +12,43 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_7.hpp b/include/boost/math/tools/detail/polynomial_horner1_7.hpp index c612822fc0..3102726e84 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_7.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_7.hpp @@ -12,49 +12,49 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const 
T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_8.hpp b/include/boost/math/tools/detail/polynomial_horner1_8.hpp index 5d9b453b9d..9a91bc4ce3 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_8.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_8.hpp @@ -12,55 +12,55 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const 
T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + 
a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner1_9.hpp b/include/boost/math/tools/detail/polynomial_horner1_9.hpp index 84977fd13b..d9f280f31c 100644 --- a/include/boost/math/tools/detail/polynomial_horner1_9.hpp +++ b/include/boost/math/tools/detail/polynomial_horner1_9.hpp @@ -12,61 +12,61 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner2_10.hpp b/include/boost/math/tools/detail/polynomial_horner2_10.hpp index f10c5d0492..2647f9d42e 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_10.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_10.hpp @@ -12,72 +12,72 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) 
+inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const 
T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); diff --git a/include/boost/math/tools/detail/polynomial_horner2_11.hpp b/include/boost/math/tools/detail/polynomial_horner2_11.hpp index 757357addf..0db7c32f48 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_11.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_11.hpp @@ -12,79 +12,79 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); diff --git a/include/boost/math/tools/detail/polynomial_horner2_12.hpp b/include/boost/math/tools/detail/polynomial_horner2_12.hpp index 
e0fc8a9df7..429967a66f 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_12.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_12.hpp @@ -12,86 +12,86 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); diff --git a/include/boost/math/tools/detail/polynomial_horner2_13.hpp b/include/boost/math/tools/detail/polynomial_horner2_13.hpp index c13a0a7cc2..5b5a88efc5 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_13.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_13.hpp @@ -12,93 +12,93 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); diff --git 
a/include/boost/math/tools/detail/polynomial_horner2_14.hpp b/include/boost/math/tools/detail/polynomial_horner2_14.hpp index ccc6e7d8da..fca4cc8f4e 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_14.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_14.hpp @@ -12,100 +12,100 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = 
x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const 
T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); diff --git a/include/boost/math/tools/detail/polynomial_horner2_15.hpp b/include/boost/math/tools/detail/polynomial_horner2_15.hpp index a409ba7fdb..0de1a1f37b 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_15.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_15.hpp 
@@ -12,107 +12,107 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) 
* x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); diff --git a/include/boost/math/tools/detail/polynomial_horner2_16.hpp 
b/include/boost/math/tools/detail/polynomial_horner2_16.hpp index a69c1a3619..8ff354a73a 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_16.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_16.hpp @@ -12,114 +12,114 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + 
(a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + 
a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((a[15] * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); diff --git a/include/boost/math/tools/detail/polynomial_horner2_17.hpp b/include/boost/math/tools/detail/polynomial_horner2_17.hpp index 9e7c05d007..965601ef3b 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_17.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_17.hpp @@ -12,121 +12,121 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& 
x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, 
const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((a[15] * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((a[16] * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((((a[15] * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); diff --git a/include/boost/math/tools/detail/polynomial_horner2_18.hpp b/include/boost/math/tools/detail/polynomial_horner2_18.hpp index 
1ea0e052c0..4d16725d63 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_18.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_18.hpp @@ -12,128 +12,128 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + 
((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((a[15] * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((a[16] * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((((a[15] * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((((a[17] * x2 + a[15]) * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((((a[16] * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); diff --git a/include/boost/math/tools/detail/polynomial_horner2_19.hpp b/include/boost/math/tools/detail/polynomial_horner2_19.hpp index 45c125c871..21519449d8 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_19.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_19.hpp @@ -12,135 +12,135 @@ namespace boost{ namespace math{ namespace tools{ namespace 
detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * 
x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } 
template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, 
const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((a[15] * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((a[16] * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((((a[15] * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((((a[17] * x2 + a[15]) * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((((a[16] * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((((a[18] * x2 + a[16]) * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((((((a[17] * x2 + a[15]) * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); diff --git a/include/boost/math/tools/detail/polynomial_horner2_2.hpp 
b/include/boost/math/tools/detail/polynomial_horner2_2.hpp index 20da483c33..188e80a397 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_2.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_2.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner2_20.hpp b/include/boost/math/tools/detail/polynomial_horner2_20.hpp index 956c6dba4f..0f098bc527 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_20.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_20.hpp @@ -12,142 
+12,142 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((a[8] * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) 
* x2 + a[0] + ((((a[9] * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((a[10] * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((a[11] * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((a[12] * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((((a[13] * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((a[15] * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((((a[14] * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((a[16] * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((((((a[15] * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((((a[17] * x2 + a[15]) * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + (((((((a[16] * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((((((((a[18] * x2 + a[16]) * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((((((((a[17] * x2 + a[15]) * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((((((((a[19] * x2 + a[17]) * x2 + a[15]) * x2 + a[13]) * x2 + a[11]) * x2 + a[9]) * x2 + a[7]) * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((((((((a[18] * x2 + a[16]) * x2 + a[14]) * x2 + a[12]) * x2 + a[10]) * x2 + a[8]) * x2 + a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); diff --git a/include/boost/math/tools/detail/polynomial_horner2_3.hpp b/include/boost/math/tools/detail/polynomial_horner2_3.hpp index 58b290c7c9..fb76d418d7 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_3.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_3.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template 
-inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner2_4.hpp b/include/boost/math/tools/detail/polynomial_horner2_4.hpp index 74f969b749..ea4f9a6439 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_4.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_4.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return 
static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner2_5.hpp b/include/boost/math/tools/detail/polynomial_horner2_5.hpp index 134cbc8ef9..0fd24c1311 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_5.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_5.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, 
const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); diff --git a/include/boost/math/tools/detail/polynomial_horner2_6.hpp b/include/boost/math/tools/detail/polynomial_horner2_6.hpp index 7cb75d75c8..47fb294db5 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_6.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_6.hpp @@ -12,44 +12,44 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); diff --git a/include/boost/math/tools/detail/polynomial_horner2_7.hpp b/include/boost/math/tools/detail/polynomial_horner2_7.hpp index 327639deb6..52b55d78c8 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_7.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_7.hpp @@ -12,51 +12,51 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return 
static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); diff --git a/include/boost/math/tools/detail/polynomial_horner2_8.hpp b/include/boost/math/tools/detail/polynomial_horner2_8.hpp index 2145ad6dcd..4ef422b083 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_8.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_8.hpp @@ -12,58 +12,58 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return 
static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, 
const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); diff --git a/include/boost/math/tools/detail/polynomial_horner2_9.hpp b/include/boost/math/tools/detail/polynomial_horner2_9.hpp index 08f6336e49..c7e4ddc66b 100644 --- a/include/boost/math/tools/detail/polynomial_horner2_9.hpp +++ b/include/boost/math/tools/detail/polynomial_horner2_9.hpp @@ -12,65 +12,65 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + 
a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((a[4] * x2 + a[2]) * x2 + a[0] + (a[3] * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[5] * x2 + a[3]) * x2 + a[1]) * x + (a[4] * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast(((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + ((a[5] * x2 + a[3]) * x2 + a[1]) * x); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x + ((a[6] * x2 + a[4]) * x2 + a[2]) * x2 + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; return static_cast((((a[8] * x2 + 
a[6]) * x2 + a[4]) * x2 + a[2]) * x2 + a[0] + (((a[7] * x2 + a[5]) * x2 + a[3]) * x2 + a[1]) * x); diff --git a/include/boost/math/tools/detail/polynomial_horner3_10.hpp b/include/boost/math/tools/detail/polynomial_horner3_10.hpp index 05ad20cc30..ec744e7cfa 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_10.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_10.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git 
a/include/boost/math/tools/detail/polynomial_horner3_11.hpp b/include/boost/math/tools/detail/polynomial_horner3_11.hpp index 9f8628e750..0d81fddd2f 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_11.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_11.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x 
* x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_12.hpp b/include/boost/math/tools/detail/polynomial_horner3_12.hpp index ccb3ec4d10..61fae85f05 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_12.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_12.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) 
BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -175,7 +175,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_13.hpp b/include/boost/math/tools/detail/polynomial_horner3_13.hpp index a8cdf2c508..464b874f5c 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_13.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_13.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const 
T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -175,7 +175,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -202,7 +202,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_14.hpp b/include/boost/math/tools/detail/polynomial_horner3_14.hpp index 5eb4ef1f28..d1f51b9f01 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_14.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_14.hpp @@ -12,37 
+12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, 
const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -175,7 +175,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) } template 
-inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -202,7 +202,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -231,7 +231,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_15.hpp b/include/boost/math/tools/detail/polynomial_horner3_15.hpp index 7e8edcd8aa..95692b2499 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_15.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_15.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* 
a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -175,7 +175,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -202,7 +202,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -231,7 +231,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) 
} template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -262,7 +262,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_16.hpp b/include/boost/math/tools/detail/polynomial_horner3_16.hpp index 58bea1581f..ce1dc45d0e 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_16.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_16.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -175,7 +175,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -202,7 +202,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -231,7 +231,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -262,7 +262,7 @@ inline V evaluate_polynomial_c_imp(const 
T* a, const V& x, const mpl::int_<14>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -295,7 +295,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_17.hpp b/include/boost/math/tools/detail/polynomial_horner3_17.hpp index 007b8a4918..81bc012f86 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_17.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_17.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -175,7 +175,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -202,7 +202,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -231,7 +231,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -262,7 
+262,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -295,7 +295,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -330,7 +330,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_18.hpp b/include/boost/math/tools/detail/polynomial_horner3_18.hpp index 68935adba1..4bcfe2066f 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_18.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_18.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -175,7 +175,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -202,7 +202,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; 
@@ -231,7 +231,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -262,7 +262,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -295,7 +295,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -330,7 +330,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -367,7 +367,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_19.hpp b/include/boost/math/tools/detail/polynomial_horner3_19.hpp index acffba6b47..67f4a3dedc 100644 --- 
a/include/boost/math/tools/detail/polynomial_horner3_19.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_19.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) 
BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -175,7 +175,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -202,7 +202,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -231,7 +231,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -262,7 +262,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -295,7 +295,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -330,7 +330,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -367,7 +367,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -406,7 +406,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_2.hpp b/include/boost/math/tools/detail/polynomial_horner3_2.hpp index 6b8d9a71f8..abf0326c81 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_2.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_2.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner3_20.hpp b/include/boost/math/tools/detail/polynomial_horner3_20.hpp index b3a170c095..c49bab24c1 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_20.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_20.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -127,7 +127,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -150,7 +150,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<10>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -175,7 +175,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<11>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -202,7 +202,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<12>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -231,7 +231,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<13>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const 
mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -262,7 +262,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<14>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -295,7 +295,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<15>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -330,7 +330,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<16>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -367,7 +367,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<17>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -406,7 +406,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<18>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -447,7 +447,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<19>*) } template 
-inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_3.hpp b/include/boost/math/tools/detail/polynomial_horner3_3.hpp index 05fe88fccc..41a5588d0a 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_3.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_3.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * 
x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner3_4.hpp b/include/boost/math/tools/detail/polynomial_horner3_4.hpp index b98d6f678d..a437914475 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_4.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_4.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } diff --git a/include/boost/math/tools/detail/polynomial_horner3_5.hpp b/include/boost/math/tools/detail/polynomial_horner3_5.hpp index 12e639cef3..84a6ea87a1 100644 --- 
a/include/boost/math/tools/detail/polynomial_horner3_5.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_5.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_6.hpp b/include/boost/math/tools/detail/polynomial_horner3_6.hpp index 
b645cb5bbc..d925d3e53f 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_6.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_6.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template 
-inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_7.hpp b/include/boost/math/tools/detail/polynomial_horner3_7.hpp index 3df4b5b4ef..c4500e7e8e 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_7.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_7.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x 
+ a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_8.hpp b/include/boost/math/tools/detail/polynomial_horner3_8.hpp index 9a49d2555e..885d3955e2 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_8.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_8.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) 
+inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/polynomial_horner3_9.hpp b/include/boost/math/tools/detail/polynomial_horner3_9.hpp index 3507d37604..efdf7af30b 100644 --- a/include/boost/math/tools/detail/polynomial_horner3_9.hpp +++ b/include/boost/math/tools/detail/polynomial_horner3_9.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[1] * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[2] * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[3] * x + a[2]) * x + a[1]) * x + a[0]); } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -55,7 +55,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<5>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -70,7 +70,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<6>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -87,7 +87,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<7>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; @@ -106,7 +106,7 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<8>*) } template -inline V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { V x2 = x * x; V t[2]; diff --git a/include/boost/math/tools/detail/rational_horner1_10.hpp b/include/boost/math/tools/detail/rational_horner1_10.hpp index e670853869..4ee20eb817 100644 --- a/include/boost/math/tools/detail/rational_horner1_10.hpp +++ b/include/boost/math/tools/detail/rational_horner1_10.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const 
T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ 
inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_11.hpp b/include/boost/math/tools/detail/rational_horner1_11.hpp index 65e17598ff..05fddffff6 100644 --- a/include/boost/math/tools/detail/rational_horner1_11.hpp +++ b/include/boost/math/tools/detail/rational_horner1_11.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) 
+inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) 
BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_12.hpp b/include/boost/math/tools/detail/rational_horner1_12.hpp index de33af0e07..632b0b110c 100644 --- a/include/boost/math/tools/detail/rational_horner1_12.hpp +++ b/include/boost/math/tools/detail/rational_horner1_12.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V 
evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ 
-108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -144,7 +144,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const 
V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((b[11] * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_13.hpp b/include/boost/math/tools/detail/rational_horner1_13.hpp index ed4ac1af8b..9004ff5bbe 100644 --- a/include/boost/math/tools/detail/rational_horner1_13.hpp +++ b/include/boost/math/tools/detail/rational_horner1_13.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template 
-inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { 
if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -144,7 +144,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((b[11] * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -156,7 +156,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) 
BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((b[12] * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_14.hpp b/include/boost/math/tools/detail/rational_horner1_14.hpp index a3222f8212..1a712f8b4b 100644 --- a/include/boost/math/tools/detail/rational_horner1_14.hpp +++ b/include/boost/math/tools/detail/rational_horner1_14.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) 
BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { 
if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * 
x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -144,7 +144,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((b[11] * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -156,7 +156,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((b[12] * x + b[11]) * x + 
b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -168,7 +168,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((b[13] * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_15.hpp b/include/boost/math/tools/detail/rational_horner1_15.hpp index c8cd691573..7efdb95720 100644 --- a/include/boost/math/tools/detail/rational_horner1_15.hpp +++ b/include/boost/math/tools/detail/rational_horner1_15.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return 
static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, 
const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -144,7 +144,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((b[11] * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -156,7 +156,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const 
V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((b[12] * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -168,7 +168,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((b[13] * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((b[14] 
* x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_16.hpp b/include/boost/math/tools/detail/rational_horner1_16.hpp index 8003c82db7..33b6360c00 100644 --- a/include/boost/math/tools/detail/rational_horner1_16.hpp +++ b/include/boost/math/tools/detail/rational_horner1_16.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: 
} template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ 
inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, 
const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -144,7 +144,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((b[11] * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -156,7 +156,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((b[12] * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -168,7 +168,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, 
const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((b[13] * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((b[14] * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -192,7 +192,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + 
a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((((b[15] * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_17.hpp b/include/boost/math/tools/detail/rational_horner1_17.hpp index 294c3f175d..811c9b0ee7 100644 --- a/include/boost/math/tools/detail/rational_horner1_17.hpp +++ b/include/boost/math/tools/detail/rational_horner1_17.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ 
inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x 
+ b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } 
template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -144,7 +144,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((b[11] * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -156,7 +156,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((b[12] * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -168,7 
+168,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((b[13] * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((b[14] * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -192,7 +192,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) 
* x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((((b[15] * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -204,7 +204,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((((a[16] * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((((b[16] * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_18.hpp b/include/boost/math/tools/detail/rational_horner1_18.hpp index 1a48bba097..20954a66d7 100644 --- a/include/boost/math/tools/detail/rational_horner1_18.hpp +++ b/include/boost/math/tools/detail/rational_horner1_18.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, 
const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template 
-inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + 
a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -144,7 +144,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + 
a[2]) * x + a[1]) * x + a[0]) / (((((((((((b[11] * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -156,7 +156,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((b[12] * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -168,7 +168,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((b[13] * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) 
BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((b[14] * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -192,7 +192,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((((b[15] * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -204,7 +204,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((((a[16] * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((((b[16] * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + 
b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -216,7 +216,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((((((a[17] * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((((((b[17] * x + b[16]) * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_19.hpp b/include/boost/math/tools/detail/rational_horner1_19.hpp index 12fd75cf1b..46439b6084 100644 --- a/include/boost/math/tools/detail/rational_horner1_19.hpp +++ b/include/boost/math/tools/detail/rational_horner1_19.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -144,7 +144,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((b[11] * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -156,7 +156,7 @@ 
inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((b[12] * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -168,7 +168,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((b[13] * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + 
a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((b[14] * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -192,7 +192,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((((b[15] * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -204,7 +204,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((((a[16] * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((((b[16] * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -216,7 +216,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template 
-inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((((((a[17] * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((((((b[17] * x + b[16]) * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -228,7 +228,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((((((a[18] * x + a[17]) * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((((((b[18] * x + b[17]) * x + b[16]) * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_2.hpp b/include/boost/math/tools/detail/rational_horner1_2.hpp index c838f2a2cd..eb2da6caf5 100644 --- a/include/boost/math/tools/detail/rational_horner1_2.hpp +++ b/include/boost/math/tools/detail/rational_horner1_2.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ 
namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_20.hpp b/include/boost/math/tools/detail/rational_horner1_20.hpp index 3ee3e966e6..caece95794 100644 --- a/include/boost/math/tools/detail/rational_horner1_20.hpp +++ b/include/boost/math/tools/detail/rational_horner1_20.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x 
<= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -120,7 +120,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, 
const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((a[9] * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((b[9] * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -132,7 +132,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((a[10] * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((b[10] * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -144,7 +144,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((a[11] * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((b[11] * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -156,7 +156,7 @@ inline V evaluate_rational_c_imp(const T* a, const 
U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((a[12] * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((b[12] * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -168,7 +168,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((a[13] * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((b[13] * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((a[14] * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / 
((((((((((((((b[14] * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -192,7 +192,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((((a[15] * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((((b[15] * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -204,7 +204,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((((a[16] * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((((b[16] * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -216,7 +216,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, 
const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((((((((((((a[17] * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((((((b[17] * x + b[16]) * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -228,7 +228,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((((((((((((a[18] * x + a[17]) * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((((((((((((b[18] * x + b[17]) * x + b[16]) * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -240,7 +240,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return 
static_cast((((((((((((((((((((a[19] * x + a[18]) * x + a[17]) * x + a[16]) * x + a[15]) * x + a[14]) * x + a[13]) * x + a[12]) * x + a[11]) * x + a[10]) * x + a[9]) * x + a[8]) * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((((((((((((((b[19] * x + b[18]) * x + b[17]) * x + b[16]) * x + b[15]) * x + b[14]) * x + b[13]) * x + b[12]) * x + b[11]) * x + b[10]) * x + b[9]) * x + b[8]) * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_3.hpp b/include/boost/math/tools/detail/rational_horner1_3.hpp index 034ead3f66..1290078e37 100644 --- a/include/boost/math/tools/detail/rational_horner1_3.hpp +++ b/include/boost/math/tools/detail/rational_horner1_3.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_4.hpp b/include/boost/math/tools/detail/rational_horner1_4.hpp index de2972f4c4..6e99d93d23 100644 --- a/include/boost/math/tools/detail/rational_horner1_4.hpp +++ b/include/boost/math/tools/detail/rational_horner1_4.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 
+48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_5.hpp b/include/boost/math/tools/detail/rational_horner1_5.hpp index a59ff114d1..ada4dc55a1 100644 --- a/include/boost/math/tools/detail/rational_horner1_5.hpp +++ b/include/boost/math/tools/detail/rational_horner1_5.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_6.hpp b/include/boost/math/tools/detail/rational_horner1_6.hpp index c5000c5db4..eca7f5e0f5 100644 --- a/include/boost/math/tools/detail/rational_horner1_6.hpp +++ b/include/boost/math/tools/detail/rational_horner1_6.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const 
V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_7.hpp b/include/boost/math/tools/detail/rational_horner1_7.hpp index bc860f3bf7..21dd256964 100644 --- a/include/boost/math/tools/detail/rational_horner1_7.hpp +++ b/include/boost/math/tools/detail/rational_horner1_7.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_8.hpp b/include/boost/math/tools/detail/rational_horner1_8.hpp index 69368978b6..012b3ab71c 100644 --- a/include/boost/math/tools/detail/rational_horner1_8.hpp +++ b/include/boost/math/tools/detail/rational_horner1_8.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / 
((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * 
x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner1_9.hpp b/include/boost/math/tools/detail/rational_horner1_9.hpp index 0aafea15fd..06991281f1 100644 --- a/include/boost/math/tools/detail/rational_horner1_9.hpp +++ b/include/boost/math/tools/detail/rational_horner1_9.hpp @@ -12,19 +12,19 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); @@ -36,7 +36,7 @@ inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); @@ -48,7 +48,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); @@ -60,7 +60,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((a[4] * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((b[4] * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -72,7 +72,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((a[5] * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((b[5] * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -84,7 +84,7 @@ inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((a[6] * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((b[6] * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -96,7 +96,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast((((((((a[7] * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / (((((((b[7] * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); @@ -108,7 +108,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) return static_cast(((((((((a[8] * x + a[7]) * x + a[6]) * x + a[5]) * x + a[4]) * x + a[3]) * x + a[2]) * x + a[1]) * x + a[0]) / ((((((((b[8] * x + b[7]) * x + b[6]) * x + b[5]) * x + b[4]) * x + b[3]) * x + b[2]) * x + b[1]) * x + b[0])); diff --git a/include/boost/math/tools/detail/rational_horner2_10.hpp b/include/boost/math/tools/detail/rational_horner2_10.hpp index 127777bc2a..28fa196157 100644 --- a/include/boost/math/tools/detail/rational_horner2_10.hpp +++ 
b/include/boost/math/tools/detail/rational_horner2_10.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) 
BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_11.hpp 
b/include/boost/math/tools/detail/rational_horner2_11.hpp index 53983ed4ad..7f562d062d 100644 --- a/include/boost/math/tools/detail/rational_horner2_11.hpp +++ b/include/boost/math/tools/detail/rational_horner2_11.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_12.hpp b/include/boost/math/tools/detail/rational_horner2_12.hpp index 4ad0856b06..dbcec7003b 100644 --- a/include/boost/math/tools/detail/rational_horner2_12.hpp +++ b/include/boost/math/tools/detail/rational_horner2_12.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) 
/ ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -154,7 +154,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_13.hpp b/include/boost/math/tools/detail/rational_horner2_13.hpp index 5f9303d0c3..a73935f666 100644 --- a/include/boost/math/tools/detail/rational_horner2_13.hpp +++ b/include/boost/math/tools/detail/rational_horner2_13.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, 
const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const 
V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -154,7 +154,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -170,7 +170,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_14.hpp b/include/boost/math/tools/detail/rational_horner2_14.hpp index dc512f393f..3b99df73d8 100644 --- a/include/boost/math/tools/detail/rational_horner2_14.hpp +++ b/include/boost/math/tools/detail/rational_horner2_14.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, 
const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, 
const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -154,7 +154,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -170,7 +170,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -186,7 +186,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_15.hpp b/include/boost/math/tools/detail/rational_horner2_15.hpp index cf084e8be0..b9a0e0a5e6 100644 --- a/include/boost/math/tools/detail/rational_horner2_15.hpp +++ b/include/boost/math/tools/detail/rational_horner2_15.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, 
const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -154,7 +154,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -170,7 +170,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -186,7 +186,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -202,7 +202,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_16.hpp b/include/boost/math/tools/detail/rational_horner2_16.hpp index 3d13db2553..748fb127ac 100644 --- a/include/boost/math/tools/detail/rational_horner2_16.hpp +++ b/include/boost/math/tools/detail/rational_horner2_16.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, 
const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -154,7 +154,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -170,7 +170,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -186,7 +186,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -202,7 +202,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -218,7 +218,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_17.hpp b/include/boost/math/tools/detail/rational_horner2_17.hpp index 3adf4053bd..f06f178975 100644 --- a/include/boost/math/tools/detail/rational_horner2_17.hpp +++ b/include/boost/math/tools/detail/rational_horner2_17.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, 
const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, 
const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -154,7 +154,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -170,7 +170,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -186,7 +186,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -202,7 +202,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -218,7 +218,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -234,7 +234,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_18.hpp b/include/boost/math/tools/detail/rational_horner2_18.hpp index 607609fd8e..8f7c8f1f5d 100644 --- a/include/boost/math/tools/detail/rational_horner2_18.hpp +++ b/include/boost/math/tools/detail/rational_horner2_18.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, 
const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const 
V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -154,7 +154,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -170,7 +170,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -186,7 +186,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -202,7 +202,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -218,7 +218,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -234,7 +234,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -250,7 +250,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_19.hpp b/include/boost/math/tools/detail/rational_horner2_19.hpp index bc324c3be3..ab515ab96b 100644 --- a/include/boost/math/tools/detail/rational_horner2_19.hpp +++ b/include/boost/math/tools/detail/rational_horner2_19.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) 
/ ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -154,7 +154,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -170,7 +170,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -186,7 +186,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -202,7 +202,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -218,7 +218,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -234,7 +234,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -250,7 +250,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -266,7 +266,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_2.hpp 
b/include/boost/math/tools/detail/rational_horner2_2.hpp index 95ec0251d5..71c4ebae20 100644 --- a/include/boost/math/tools/detail/rational_horner2_2.hpp +++ b/include/boost/math/tools/detail/rational_horner2_2.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } diff --git a/include/boost/math/tools/detail/rational_horner2_20.hpp 
b/include/boost/math/tools/detail/rational_horner2_20.hpp index cf1211b61f..9914426a24 100644 --- a/include/boost/math/tools/detail/rational_horner2_20.hpp +++ b/include/boost/math/tools/detail/rational_horner2_20.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -122,7 +122,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -138,7 +138,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -154,7 +154,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -170,7 +170,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -186,7 +186,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -202,7 +202,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -218,7 +218,7 @@ inline V evaluate_rational_c_imp(const T* a, const 
U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -234,7 +234,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -250,7 +250,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -266,7 +266,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -282,7 +282,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_3.hpp b/include/boost/math/tools/detail/rational_horner2_3.hpp index 9ce437b41f..7b50dc4c40 100644 --- 
a/include/boost/math/tools/detail/rational_horner2_3.hpp +++ b/include/boost/math/tools/detail/rational_horner2_3.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } diff --git a/include/boost/math/tools/detail/rational_horner2_4.hpp b/include/boost/math/tools/detail/rational_horner2_4.hpp index 00543ede85..5e3105d680 100644 --- 
a/include/boost/math/tools/detail/rational_horner2_4.hpp +++ b/include/boost/math/tools/detail/rational_horner2_4.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } diff --git a/include/boost/math/tools/detail/rational_horner2_5.hpp b/include/boost/math/tools/detail/rational_horner2_5.hpp index d117b66633..d073e51168 100644 --- 
a/include/boost/math/tools/detail/rational_horner2_5.hpp +++ b/include/boost/math/tools/detail/rational_horner2_5.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, 
const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_6.hpp b/include/boost/math/tools/detail/rational_horner2_6.hpp index c431d16344..be7fb6ca7a 100644 --- a/include/boost/math/tools/detail/rational_horner2_6.hpp +++ b/include/boost/math/tools/detail/rational_horner2_6.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x 
+ a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_7.hpp b/include/boost/math/tools/detail/rational_horner2_7.hpp index 2104302472..c1ce46889d 100644 --- a/include/boost/math/tools/detail/rational_horner2_7.hpp +++ b/include/boost/math/tools/detail/rational_horner2_7.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_8.hpp b/include/boost/math/tools/detail/rational_horner2_8.hpp index fd98289b99..9133066891 100644 --- a/include/boost/math/tools/detail/rational_horner2_8.hpp +++ b/include/boost/math/tools/detail/rational_horner2_8.hpp @@ -12,37 
+12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner2_9.hpp b/include/boost/math/tools/detail/rational_horner2_9.hpp index 1081ab2f8b..286514ff4a 100644 --- a/include/boost/math/tools/detail/rational_horner2_9.hpp +++ b/include/boost/math/tools/detail/rational_horner2_9.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return 
static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -58,7 +58,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -74,7 +74,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* 
a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -90,7 +90,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -106,7 +106,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_10.hpp b/include/boost/math/tools/detail/rational_horner3_10.hpp index 7da05875f3..de2381d079 100644 --- a/include/boost/math/tools/detail/rational_horner3_10.hpp +++ b/include/boost/math/tools/detail/rational_horner3_10.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, 
const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_11.hpp b/include/boost/math/tools/detail/rational_horner3_11.hpp index df971197a3..e292111f04 100644 --- a/include/boost/math/tools/detail/rational_horner3_11.hpp +++ b/include/boost/math/tools/detail/rational_horner3_11.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_12.hpp b/include/boost/math/tools/detail/rational_horner3_12.hpp index 668f76684f..d2203a10b1 100644 --- a/include/boost/math/tools/detail/rational_horner3_12.hpp +++ b/include/boost/math/tools/detail/rational_horner3_12.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -476,7 
+476,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_13.hpp b/include/boost/math/tools/detail/rational_horner3_13.hpp index b0b4c2ac58..0bfbb3a4db 100644 --- a/include/boost/math/tools/detail/rational_horner3_13.hpp +++ b/include/boost/math/tools/detail/rational_horner3_13.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -476,7 +476,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -570,7 +570,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_14.hpp b/include/boost/math/tools/detail/rational_horner3_14.hpp index 92035ef806..4af1fa7dde 100644 --- a/include/boost/math/tools/detail/rational_horner3_14.hpp +++ b/include/boost/math/tools/detail/rational_horner3_14.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ 
template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -476,7 +476,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -570,7 +570,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -672,7 +672,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_15.hpp b/include/boost/math/tools/detail/rational_horner3_15.hpp index 9536ecd844..3c88bbd3e4 100644 --- a/include/boost/math/tools/detail/rational_horner3_15.hpp +++ b/include/boost/math/tools/detail/rational_horner3_15.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -476,7 +476,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED 
V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -570,7 +570,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -672,7 +672,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -782,7 +782,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_16.hpp b/include/boost/math/tools/detail/rational_horner3_16.hpp index 7ccf8f6e7a..bb02b5cb97 100644 --- a/include/boost/math/tools/detail/rational_horner3_16.hpp +++ b/include/boost/math/tools/detail/rational_horner3_16.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -476,7 +476,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED 
V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -570,7 +570,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -672,7 +672,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -782,7 +782,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -900,7 +900,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_17.hpp b/include/boost/math/tools/detail/rational_horner3_17.hpp index 1a35c47397..167bfd5904 100644 --- a/include/boost/math/tools/detail/rational_horner3_17.hpp +++ b/include/boost/math/tools/detail/rational_horner3_17.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ 
template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -476,7 +476,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -570,7 +570,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -672,7 +672,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -782,7 +782,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -900,7 +900,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ 
-1026,7 +1026,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_18.hpp b/include/boost/math/tools/detail/rational_horner3_18.hpp index 8a1c16eb2f..79b92a87e6 100644 --- a/include/boost/math/tools/detail/rational_horner3_18.hpp +++ b/include/boost/math/tools/detail/rational_horner3_18.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline 
V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -476,7 +476,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -570,7 +570,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -672,7 +672,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ 
-782,7 +782,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -900,7 +900,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -1026,7 +1026,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -1160,7 +1160,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_19.hpp b/include/boost/math/tools/detail/rational_horner3_19.hpp index 15d16bafc3..2403b77c2c 100644 --- a/include/boost/math/tools/detail/rational_horner3_19.hpp +++ b/include/boost/math/tools/detail/rational_horner3_19.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) 
+inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x 
<= 1) { @@ -476,7 +476,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -570,7 +570,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -672,7 +672,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -782,7 +782,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -900,7 +900,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -1026,7 +1026,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template 
-inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -1160,7 +1160,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -1302,7 +1302,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_2.hpp b/include/boost/math/tools/detail/rational_horner3_2.hpp index 95ec0251d5..71c4ebae20 100644 --- a/include/boost/math/tools/detail/rational_horner3_2.hpp +++ b/include/boost/math/tools/detail/rational_horner3_2.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } diff --git a/include/boost/math/tools/detail/rational_horner3_20.hpp b/include/boost/math/tools/detail/rational_horner3_20.hpp index 78233214d8..a4fab5745d 100644 --- a/include/boost/math/tools/detail/rational_horner3_20.hpp +++ b/include/boost/math/tools/detail/rational_horner3_20.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) 
BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -312,7 +312,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<10>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -390,7 +390,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<11>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -476,7 +476,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<12>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -570,7 +570,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, 
const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<13>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -672,7 +672,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<14>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -782,7 +782,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<15>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -900,7 +900,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<16>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -1026,7 +1026,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<17>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -1160,7 +1160,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<18>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -1302,7 +1302,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<19>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -1452,7 +1452,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<20>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_3.hpp b/include/boost/math/tools/detail/rational_horner3_3.hpp index 9ce437b41f..7b50dc4c40 100644 --- a/include/boost/math/tools/detail/rational_horner3_3.hpp +++ b/include/boost/math/tools/detail/rational_horner3_3.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline 
BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } diff --git a/include/boost/math/tools/detail/rational_horner3_4.hpp b/include/boost/math/tools/detail/rational_horner3_4.hpp index 00543ede85..5e3105d680 100644 --- a/include/boost/math/tools/detail/rational_horner3_4.hpp +++ b/include/boost/math/tools/detail/rational_horner3_4.hpp @@ -12,31 +12,31 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } diff --git a/include/boost/math/tools/detail/rational_horner3_5.hpp b/include/boost/math/tools/detail/rational_horner3_5.hpp index 35dce45a80..f5138c06aa 100644 --- a/include/boost/math/tools/detail/rational_horner3_5.hpp +++ b/include/boost/math/tools/detail/rational_horner3_5.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V 
evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_6.hpp b/include/boost/math/tools/detail/rational_horner3_6.hpp index b9361ba07a..a1f7199828 100644 --- a/include/boost/math/tools/detail/rational_horner3_6.hpp +++ b/include/boost/math/tools/detail/rational_horner3_6.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, 
const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_7.hpp b/include/boost/math/tools/detail/rational_horner3_7.hpp index 92b00b3a15..4c5487ecba 100644 --- a/include/boost/math/tools/detail/rational_horner3_7.hpp +++ 
b/include/boost/math/tools/detail/rational_horner3_7.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) 
BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_8.hpp b/include/boost/math/tools/detail/rational_horner3_8.hpp index 197b6c0550..f1f0710ac0 100644 --- a/include/boost/math/tools/detail/rational_horner3_8.hpp +++ b/include/boost/math/tools/detail/rational_horner3_8.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return 
static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) 
BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/detail/rational_horner3_9.hpp b/include/boost/math/tools/detail/rational_horner3_9.hpp index 5aad957c37..695fc2d1d1 100644 --- a/include/boost/math/tools/detail/rational_horner3_9.hpp +++ b/include/boost/math/tools/detail/rational_horner3_9.hpp @@ -12,37 +12,37 @@ namespace boost{ namespace math{ namespace tools{ namespace detail{ template -inline V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T*, const U*, const V&, const mpl::int_<0>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(0); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V&, const mpl::int_<1>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(a[0]) / static_cast(b[0]); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<2>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((a[1] * x + a[0]) / (b[1] * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<3>*) BOOST_MATH_NOEXCEPT(V) { return static_cast(((a[2] * x + a[1]) * x + a[0]) / ((b[2] * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const 
mpl::int_<4>*) BOOST_MATH_NOEXCEPT(V) { return static_cast((((a[3] * x + a[2]) * x + a[1]) * x + a[0]) / (((b[3] * x + b[2]) * x + b[1]) * x + b[0])); } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<5>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -80,7 +80,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<6>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -126,7 +126,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<7>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -180,7 +180,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<8>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { @@ -242,7 +242,7 @@ inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl:: } template -inline V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* a, const U* b, const V& x, const mpl::int_<9>*) BOOST_MATH_NOEXCEPT(V) { if(x <= 1) { diff --git a/include/boost/math/tools/fraction.hpp 
b/include/boost/math/tools/fraction.hpp index a787c603f3..a91f8d3009 100644 --- a/include/boost/math/tools/fraction.hpp +++ b/include/boost/math/tools/fraction.hpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace boost{ namespace math{ namespace tools{ @@ -27,17 +28,22 @@ namespace detail template struct is_pair > : public boost::true_type{}; +#ifdef __CUDA_ARCH__ + template + struct is_pair > : public boost::true_type {}; +#endif + template struct fraction_traits_simple { typedef typename Gen::result_type result_type; typedef typename Gen::result_type value_type; - static result_type a(const value_type&) BOOST_MATH_NOEXCEPT(value_type) + static BOOST_GPU_ENABLED result_type a(const value_type&) BOOST_MATH_NOEXCEPT(value_type) { return 1; } - static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) + static BOOST_GPU_ENABLED result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) { return v; } @@ -49,11 +55,11 @@ namespace detail typedef typename Gen::result_type value_type; typedef typename value_type::first_type result_type; - static result_type a(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) + static BOOST_GPU_ENABLED result_type a(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) { return v.first; } - static result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) + static BOOST_GPU_ENABLED result_type b(const value_type& v) BOOST_MATH_NOEXCEPT(value_type) { return v.second; } @@ -85,7 +91,7 @@ namespace detail // Note that the first a0 returned by generator Gen is disarded. 
// template -inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor, boost::uintmax_t& max_terms) +inline BOOST_GPU_ENABLED typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor, boost::uintmax_t& max_terms) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING // ADL of std names @@ -126,7 +132,7 @@ inline typename detail::fraction_traits::result_type continued_fraction_b(G } template -inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor) +inline BOOST_GPU_ENABLED typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, const U& factor) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) { boost::uintmax_t max_terms = (std::numeric_limits::max)(); @@ -134,7 +140,7 @@ inline typename detail::fraction_traits::result_type continued_fraction_b(G } template -inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits) +inline BOOST_GPU_ENABLED typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING // ADL of std names @@ -148,7 +154,7 @@ inline typename detail::fraction_traits::result_type continued_fraction_b(G } template -inline typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits, boost::uintmax_t& max_terms) +inline BOOST_GPU_ENABLED typename detail::fraction_traits::result_type continued_fraction_b(Gen& g, int bits, boost::uintmax_t& max_terms) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING // ADL of std names @@ -175,7 +181,7 @@ inline typename 
detail::fraction_traits::result_type continued_fraction_b(G // Note that the first a1 and b1 returned by generator Gen are both used. // template -inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor, boost::uintmax_t& max_terms) +inline BOOST_GPU_ENABLED typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor, boost::uintmax_t& max_terms) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING // ADL of std names @@ -217,15 +223,19 @@ inline typename detail::fraction_traits::result_type continued_fraction_a(G } template -inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor) +inline BOOST_GPU_ENABLED typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, const U& factor) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) { +#ifdef __CUDA_ARCH__ + boost::uintmax_t max_iter = ULONG_MAX; +#else boost::uintmax_t max_iter = (std::numeric_limits::max)(); +#endif return continued_fraction_a(g, factor, max_iter); } template -inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits) +inline BOOST_GPU_ENABLED typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING // ADL of std names @@ -240,7 +250,7 @@ inline typename detail::fraction_traits::result_type continued_fraction_a(G } template -inline typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits, boost::uintmax_t& max_terms) +inline BOOST_GPU_ENABLED typename detail::fraction_traits::result_type continued_fraction_a(Gen& g, int bits, boost::uintmax_t& max_terms) 
BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename detail::fraction_traits::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING // ADL of std names diff --git a/include/boost/math/tools/polynomial.hpp b/include/boost/math/tools/polynomial.hpp index 1f4b1f6fe6..95365a0bac 100644 --- a/include/boost/math/tools/polynomial.hpp +++ b/include/boost/math/tools/polynomial.hpp @@ -47,7 +47,7 @@ T chebyshev_coefficient(unsigned n, unsigned m) unsigned r = n - m; r /= 2; - BOOST_ASSERT(n - 2 * r == m); + BOOST_MATH_ASSERT(n - 2 * r == m); if(r & 1) result = -result; @@ -192,9 +192,9 @@ template std::pair< polynomial, polynomial > division(polynomial u, const polynomial& v) { - BOOST_ASSERT(v.size() <= u.size()); - BOOST_ASSERT(v); - BOOST_ASSERT(u); + BOOST_MATH_ASSERT(v.size() <= u.size()); + BOOST_MATH_ASSERT(v); + BOOST_MATH_ASSERT(u); typedef typename polynomial::size_type N; @@ -277,7 +277,7 @@ template std::pair< polynomial, polynomial > quotient_remainder(const polynomial& dividend, const polynomial& divisor) { - BOOST_ASSERT(divisor); + BOOST_MATH_ASSERT(divisor); if (dividend.size() < divisor.size()) return std::make_pair(polynomial(), dividend); return detail::division(dividend, divisor); diff --git a/include/boost/math/tools/precision.hpp b/include/boost/math/tools/precision.hpp index 6538083b99..bda9932b79 100644 --- a/include/boost/math/tools/precision.hpp +++ b/include/boost/math/tools/precision.hpp @@ -24,58 +24,80 @@ namespace boost{ namespace math { -namespace tools -{ -// If T is not specialized, the functions digits, max_value and min_value, -// all get synthesised automatically from std::numeric_limits. -// However, if numeric_limits is not specialised for type RealType, -// for example with NTL::RR type, then you will get a compiler error -// when code tries to use these functions, unless you explicitly specialise them. 
- -// For example if the precision of RealType varies at runtime, -// then numeric_limits support may not be appropriate, -// see boost/math/tools/ntl.hpp for examples like -// template <> NTL::RR max_value ... -// See Conceptual Requirements for Real Number Types. - -template -inline BOOST_MATH_CONSTEXPR int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) BOOST_NOEXCEPT -{ + namespace tools + { + // If T is not specialized, the functions digits, max_value and min_value, + // all get synthesised automatically from std::numeric_limits. + // However, if numeric_limits is not specialised for type RealType, + // for example with NTL::RR type, then you will get a compiler error + // when code tries to use these functions, unless you explicitly specialise them. + + // For example if the precision of RealType varies at runtime, + // then numeric_limits support may not be appropriate, + // see boost/math/tools/ntl.hpp for examples like + // template <> NTL::RR max_value ... + // See Conceptual Requirements for Real Number Types. + + template + inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED int digits(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) BOOST_NOEXCEPT + { #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS - BOOST_STATIC_ASSERT( ::std::numeric_limits::is_specialized); - BOOST_STATIC_ASSERT( ::std::numeric_limits::radix == 2 || ::std::numeric_limits::radix == 10); + BOOST_STATIC_ASSERT(::std::numeric_limits::is_specialized); + BOOST_STATIC_ASSERT(::std::numeric_limits::radix == 2 || ::std::numeric_limits::radix == 10); #else - BOOST_ASSERT(::std::numeric_limits::is_specialized); - BOOST_ASSERT(::std::numeric_limits::radix == 2 || ::std::numeric_limits::radix == 10); + BOOST_MATH_ASSERT(::std::numeric_limits::is_specialized); + BOOST_MATH_ASSERT(::std::numeric_limits::radix == 2 || ::std::numeric_limits::radix == 10); #endif - return std::numeric_limits::radix == 2 - ? 
std::numeric_limits::digits - : ((std::numeric_limits::digits + 1) * 1000L) / 301L; -} - -template -inline BOOST_MATH_CONSTEXPR T max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) -{ + return std::numeric_limits::radix == 2 + ? std::numeric_limits::digits + : ((std::numeric_limits::digits + 1) * 1000L) / 301L; + } + + template + inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) + { #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS - BOOST_STATIC_ASSERT( ::std::numeric_limits::is_specialized); + BOOST_STATIC_ASSERT(::std::numeric_limits::is_specialized); #else - BOOST_ASSERT(::std::numeric_limits::is_specialized); + BOOST_MATH_ASSERT(::std::numeric_limits::is_specialized); +#endif + return (std::numeric_limits::max)(); + } // Also used as a finite 'infinite' value for - and +infinity, for example: + // -max_value = -1.79769e+308, max_value = 1.79769e+308. + +#ifdef __CUDA_ARCH__ + template <> inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED float max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(float)) BOOST_MATH_NOEXCEPT(float) + { + return FLT_MAX; + } + template <> inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED double max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(double)) BOOST_MATH_NOEXCEPT(double) + { + return DBL_MAX; + } #endif - return (std::numeric_limits::max)(); -} // Also used as a finite 'infinite' value for - and +infinity, for example: -// -max_value = -1.79769e+308, max_value = 1.79769e+308. 
template -inline BOOST_MATH_CONSTEXPR T min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) { #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS BOOST_STATIC_ASSERT( ::std::numeric_limits::is_specialized); #else - BOOST_ASSERT(::std::numeric_limits::is_specialized); + BOOST_MATH_ASSERT(::std::numeric_limits::is_specialized); #endif return (std::numeric_limits::min)(); } +#ifdef __CUDA_ARCH__ +template <> inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED float min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(float)) BOOST_MATH_NOEXCEPT(float) +{ + return FLT_MIN; +} +template <> inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED double min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(double)) BOOST_MATH_NOEXCEPT(double) +{ + return DBL_MIN; +} +#endif + namespace detail{ // // Logarithmic limits come next, note that although @@ -86,13 +108,13 @@ namespace detail{ // For type float first: // template -inline BOOST_MATH_CONSTEXPR T log_max_value(const mpl::int_<128>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T log_max_value(const mpl::int_<128>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) { return 88.0f; } template -inline BOOST_MATH_CONSTEXPR T log_min_value(const mpl::int_<128>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T log_min_value(const mpl::int_<128>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) { return -87.0f; } @@ -100,13 +122,13 @@ inline BOOST_MATH_CONSTEXPR T log_min_value(const mpl::int_<128>& BOOST_MATH_APP // Now double: // template -inline BOOST_MATH_CONSTEXPR T log_max_value(const mpl::int_<1024>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T log_max_value(const 
mpl::int_<1024>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) { return 709.0; } template -inline BOOST_MATH_CONSTEXPR T log_min_value(const mpl::int_<1024>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T log_min_value(const mpl::int_<1024>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) { return -708.0; } @@ -114,19 +136,19 @@ inline BOOST_MATH_CONSTEXPR T log_min_value(const mpl::int_<1024>& BOOST_MATH_AP // 80 and 128-bit long doubles: // template -inline BOOST_MATH_CONSTEXPR T log_max_value(const mpl::int_<16384>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T log_max_value(const mpl::int_<16384>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) { return 11356.0L; } template -inline BOOST_MATH_CONSTEXPR T log_min_value(const mpl::int_<16384>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T log_min_value(const mpl::int_<16384>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) { return -11355.0L; } template -inline T log_max_value(const mpl::int_<0>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) +inline BOOST_GPU_ENABLED T log_max_value(const mpl::int_<0>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) { BOOST_MATH_STD_USING #ifdef __SUNPRO_CC @@ -139,7 +161,7 @@ inline T log_max_value(const mpl::int_<0>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_T } template -inline T log_min_value(const mpl::int_<0>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) +inline BOOST_GPU_ENABLED T log_min_value(const mpl::int_<0>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) { BOOST_MATH_STD_USING #ifdef __SUNPRO_CC @@ -152,14 +174,25 @@ inline T log_min_value(const mpl::int_<0>& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_T } template -inline BOOST_MATH_CONSTEXPR T epsilon(const mpl::true_& 
BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T epsilon(const mpl::true_& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_MATH_NOEXCEPT(T) { return std::numeric_limits::epsilon(); } +#ifdef __CUDA_ARCH__ +template <> inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED float epsilon(const mpl::true_& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(float)) BOOST_MATH_NOEXCEPT(float) +{ + return FLT_EPSILON; +} +template <> inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED double epsilon(const mpl::true_& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(double)) BOOST_MATH_NOEXCEPT(double) +{ + return DBL_EPSILON; +} +#endif + #if defined(__GNUC__) && ((LDBL_MANT_DIG == 106) || (__LDBL_MANT_DIG__ == 106)) template <> -inline BOOST_MATH_CONSTEXPR long double epsilon(const mpl::true_& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(long double)) BOOST_MATH_NOEXCEPT(long double) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED long double epsilon(const mpl::true_& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(long double)) BOOST_MATH_NOEXCEPT(long double) { // numeric_limits on Darwin (and elsewhere) tells lies here: // the issue is that long double on a few platforms is @@ -178,7 +211,7 @@ inline BOOST_MATH_CONSTEXPR long double epsilon(const mpl::true_& B #endif template -inline T epsilon(const mpl::false_& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) +inline BOOST_GPU_ENABLED T epsilon(const mpl::false_& BOOST_MATH_APPEND_EXPLICIT_TEMPLATE_TYPE(T)) { // Note: don't cache result as precision may vary at runtime: BOOST_MATH_STD_USING // for ADL of std names @@ -214,12 +247,12 @@ struct log_limit_noexcept_traits : public log_limit_noexcept_traits_imp -inline BOOST_MATH_CONSTEXPR T log_max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_NOEXCEPT_IF(detail::log_limit_noexcept_traits::value) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T log_max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) 
BOOST_NOEXCEPT_IF(detail::log_limit_noexcept_traits::value) { #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS return detail::log_max_value(typename detail::log_limit_traits::tag_type()); #else - BOOST_ASSERT(::std::numeric_limits::is_specialized); + BOOST_MATH_ASSERT(::std::numeric_limits::is_specialized); BOOST_MATH_STD_USING static const T val = log((std::numeric_limits::max)()); return val; @@ -227,12 +260,12 @@ inline BOOST_MATH_CONSTEXPR T log_max_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T) } template -inline BOOST_MATH_CONSTEXPR T log_min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_NOEXCEPT_IF(detail::log_limit_noexcept_traits::value) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T log_min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T)) BOOST_NOEXCEPT_IF(detail::log_limit_noexcept_traits::value) { #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS return detail::log_min_value(typename detail::log_limit_traits::tag_type()); #else - BOOST_ASSERT(::std::numeric_limits::is_specialized); + BOOST_MATH_ASSERT(::std::numeric_limits::is_specialized); BOOST_MATH_STD_USING static const T val = log((std::numeric_limits::min)()); return val; @@ -244,7 +277,7 @@ inline BOOST_MATH_CONSTEXPR T log_min_value(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE(T) #endif template -inline BOOST_MATH_CONSTEXPR T epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) BOOST_MATH_NOEXCEPT(T) { #ifndef BOOST_NO_LIMITS_COMPILE_TIME_CONSTANTS return detail::epsilon(mpl::bool_< ::std::numeric_limits::is_specialized>()); @@ -258,31 +291,31 @@ inline BOOST_MATH_CONSTEXPR T epsilon(BOOST_MATH_EXPLICIT_TEMPLATE_TYPE_SPEC(T)) namespace detail{ template -inline BOOST_MATH_CONSTEXPR T root_epsilon_imp(const mpl::int_<24>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T root_epsilon_imp(const T*, const mpl::int_<24>&) BOOST_MATH_NOEXCEPT(T) { return 
static_cast(0.00034526698300124390839884978618400831996329879769945L); } template -inline BOOST_MATH_CONSTEXPR T root_epsilon_imp(const T*, const mpl::int_<53>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T root_epsilon_imp(const T*, const mpl::int_<53>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(0.1490116119384765625e-7L); } template -inline BOOST_MATH_CONSTEXPR T root_epsilon_imp(const T*, const mpl::int_<64>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T root_epsilon_imp(const T*, const mpl::int_<64>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(0.32927225399135962333569506281281311031656150598474e-9L); } template -inline BOOST_MATH_CONSTEXPR T root_epsilon_imp(const T*, const mpl::int_<113>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T root_epsilon_imp(const T*, const mpl::int_<113>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(0.1387778780781445675529539585113525390625e-16L); } template -inline T root_epsilon_imp(const T*, const Tag&) +inline BOOST_GPU_ENABLED T root_epsilon_imp(const T*, const Tag&) { BOOST_MATH_STD_USING static const T r_eps = sqrt(tools::epsilon()); @@ -297,31 +330,31 @@ inline T root_epsilon_imp(const T*, const mpl::int_<0>&) } template -inline BOOST_MATH_CONSTEXPR T cbrt_epsilon_imp(const mpl::int_<24>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T cbrt_epsilon_imp(const mpl::int_<24>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(0.0049215666011518482998719164346805794944150447839903L); } template -inline BOOST_MATH_CONSTEXPR T cbrt_epsilon_imp(const T*, const mpl::int_<53>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T cbrt_epsilon_imp(const T*, const mpl::int_<53>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(6.05545445239333906078989272793696693569753008995e-6L); } template -inline BOOST_MATH_CONSTEXPR T cbrt_epsilon_imp(const T*, const mpl::int_<64>&) BOOST_MATH_NOEXCEPT(T) +inline 
BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T cbrt_epsilon_imp(const T*, const mpl::int_<64>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(4.76837158203125e-7L); } template -inline BOOST_MATH_CONSTEXPR T cbrt_epsilon_imp(const T*, const mpl::int_<113>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T cbrt_epsilon_imp(const T*, const mpl::int_<113>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(5.7749313854154005630396773604745549542403508090496e-12L); } template -inline T cbrt_epsilon_imp(const T*, const Tag&) +inline BOOST_GPU_ENABLED T cbrt_epsilon_imp(const T*, const Tag&) { BOOST_MATH_STD_USING; static const T cbrt_eps = pow(tools::epsilon(), T(1) / 3); @@ -336,31 +369,31 @@ inline T cbrt_epsilon_imp(const T*, const mpl::int_<0>&) } template -inline BOOST_MATH_CONSTEXPR T forth_root_epsilon_imp(const T*, const mpl::int_<24>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T forth_root_epsilon_imp(const T*, const mpl::int_<24>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(0.018581361171917516667460937040007436176452688944747L); } template -inline BOOST_MATH_CONSTEXPR T forth_root_epsilon_imp(const T*, const mpl::int_<53>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T forth_root_epsilon_imp(const T*, const mpl::int_<53>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(0.0001220703125L); } template -inline BOOST_MATH_CONSTEXPR T forth_root_epsilon_imp(const T*, const mpl::int_<64>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T forth_root_epsilon_imp(const T*, const mpl::int_<64>&) BOOST_MATH_NOEXCEPT(T) { return static_cast(0.18145860519450699870567321328132261891067079047605e-4L); } template -inline BOOST_MATH_CONSTEXPR T forth_root_epsilon_imp(const T*, const mpl::int_<113>&) BOOST_MATH_NOEXCEPT(T) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T forth_root_epsilon_imp(const T*, const mpl::int_<113>&) BOOST_MATH_NOEXCEPT(T) { return 
static_cast(0.37252902984619140625e-8L); } template -inline T forth_root_epsilon_imp(const T*, const Tag&) +inline BOOST_GPU_ENABLED T forth_root_epsilon_imp(const T*, const Tag&) { BOOST_MATH_STD_USING static const T r_eps = sqrt(sqrt(tools::epsilon())); @@ -384,19 +417,19 @@ struct root_epsilon_traits } template -inline BOOST_MATH_CONSTEXPR T root_epsilon() BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && detail::root_epsilon_traits::has_noexcept) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T root_epsilon() BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && detail::root_epsilon_traits::has_noexcept) { return detail::root_epsilon_imp(static_cast(0), typename detail::root_epsilon_traits::tag_type()); } template -inline BOOST_MATH_CONSTEXPR T cbrt_epsilon() BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && detail::root_epsilon_traits::has_noexcept) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T cbrt_epsilon() BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && detail::root_epsilon_traits::has_noexcept) { return detail::cbrt_epsilon_imp(static_cast(0), typename detail::root_epsilon_traits::tag_type()); } template -inline BOOST_MATH_CONSTEXPR T forth_root_epsilon() BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && detail::root_epsilon_traits::has_noexcept) +inline BOOST_MATH_CONSTEXPR BOOST_GPU_ENABLED T forth_root_epsilon() BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(T) && detail::root_epsilon_traits::has_noexcept) { return detail::forth_root_epsilon_imp(static_cast(0), typename detail::root_epsilon_traits::tag_type()); } diff --git a/include/boost/math/tools/rational.hpp b/include/boost/math/tools/rational.hpp index d8bd4a73aa..4a31c7959e 100644 --- a/include/boost/math/tools/rational.hpp +++ b/include/boost/math/tools/rational.hpp @@ -168,12 +168,12 @@ namespace boost{ namespace math{ namespace tools{ // Forward declaration to keep two phase lookup happy: // template -U evaluate_polynomial(const T* poly, U const& z, std::size_t count) BOOST_MATH_NOEXCEPT(U); +BOOST_GPU_ENABLED U 
evaluate_polynomial(const T* poly, U const& z, std::size_t count) BOOST_MATH_NOEXCEPT(U); namespace detail{ template -inline V evaluate_polynomial_c_imp(const T* a, const V& val, const Tag*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial_c_imp(const T* a, const V& val, const Tag*) BOOST_MATH_NOEXCEPT(V) { return evaluate_polynomial(a, val, Tag::value); } @@ -186,9 +186,9 @@ inline V evaluate_polynomial_c_imp(const T* a, const V& val, const Tag*) BOOST_M // the loop expanded versions above: // template -inline U evaluate_polynomial(const T* poly, U const& z, std::size_t count) BOOST_MATH_NOEXCEPT(U) +inline BOOST_GPU_ENABLED U evaluate_polynomial(const T* poly, U const& z, std::size_t count) BOOST_MATH_NOEXCEPT(U) { - BOOST_ASSERT(count > 0); + BOOST_MATH_ASSERT(count > 0); U sum = static_cast(poly[count - 1]); for(int i = static_cast(count) - 2; i >= 0; --i) { @@ -202,14 +202,14 @@ inline U evaluate_polynomial(const T* poly, U const& z, std::size_t count) BOOST // implementations above: // template -inline V evaluate_polynomial(const T(&a)[N], const V& val) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial(const T(&a)[N], const V& val) BOOST_MATH_NOEXCEPT(V) { typedef mpl::int_ tag_type; return detail::evaluate_polynomial_c_imp(static_cast(a), val, static_cast(0)); } template -inline V evaluate_polynomial(const boost::array& a, const V& val) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_polynomial(const boost::array& a, const V& val) BOOST_MATH_NOEXCEPT(V) { typedef mpl::int_ tag_type; return detail::evaluate_polynomial_c_imp(static_cast(a.data()), val, static_cast(0)); @@ -218,19 +218,19 @@ inline V evaluate_polynomial(const boost::array& a, const V& val) BOOST_MAT // Even polynomials are trivial: just square the argument! 
// template -inline U evaluate_even_polynomial(const T* poly, U z, std::size_t count) BOOST_MATH_NOEXCEPT(U) +inline BOOST_GPU_ENABLED U evaluate_even_polynomial(const T* poly, U z, std::size_t count) BOOST_MATH_NOEXCEPT(U) { return evaluate_polynomial(poly, U(z*z), count); } template -inline V evaluate_even_polynomial(const T(&a)[N], const V& z) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_even_polynomial(const T(&a)[N], const V& z) BOOST_MATH_NOEXCEPT(V) { return evaluate_polynomial(a, V(z*z)); } template -inline V evaluate_even_polynomial(const boost::array& a, const V& z) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_even_polynomial(const boost::array& a, const V& z) BOOST_MATH_NOEXCEPT(V) { return evaluate_polynomial(a, V(z*z)); } @@ -238,32 +238,32 @@ inline V evaluate_even_polynomial(const boost::array& a, const V& z) BOOST_ // Odd polynomials come next: // template -inline U evaluate_odd_polynomial(const T* poly, U z, std::size_t count) BOOST_MATH_NOEXCEPT(U) +inline BOOST_GPU_ENABLED U evaluate_odd_polynomial(const T* poly, U z, std::size_t count) BOOST_MATH_NOEXCEPT(U) { return poly[0] + z * evaluate_polynomial(poly+1, U(z*z), count-1); } template -inline V evaluate_odd_polynomial(const T(&a)[N], const V& z) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_odd_polynomial(const T(&a)[N], const V& z) BOOST_MATH_NOEXCEPT(V) { typedef mpl::int_ tag_type; return a[0] + z * detail::evaluate_polynomial_c_imp(static_cast(a) + 1, V(z*z), static_cast(0)); } template -inline V evaluate_odd_polynomial(const boost::array& a, const V& z) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_odd_polynomial(const boost::array& a, const V& z) BOOST_MATH_NOEXCEPT(V) { typedef mpl::int_ tag_type; return a[0] + z * detail::evaluate_polynomial_c_imp(static_cast(a.data()) + 1, V(z*z), static_cast(0)); } template -V evaluate_rational(const T* num, const U* denom, const V& z_, std::size_t count) BOOST_MATH_NOEXCEPT(V); 
+BOOST_GPU_ENABLED V evaluate_rational(const T* num, const U* denom, const V& z_, std::size_t count) BOOST_MATH_NOEXCEPT(V); namespace detail{ template -inline V evaluate_rational_c_imp(const T* num, const U* denom, const V& z, const Tag*) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational_c_imp(const T* num, const U* denom, const V& z, const Tag*) BOOST_MATH_NOEXCEPT(V) { return boost::math::tools::evaluate_rational(num, denom, z, Tag::value); } @@ -278,7 +278,7 @@ inline V evaluate_rational_c_imp(const T* num, const U* denom, const V& z, const // in our Lanczos code for example. // template -V evaluate_rational(const T* num, const U* denom, const V& z_, std::size_t count) BOOST_MATH_NOEXCEPT(V) +BOOST_GPU_ENABLED V evaluate_rational(const T* num, const U* denom, const V& z_, std::size_t count) BOOST_MATH_NOEXCEPT(V) { V z(z_); V s1, s2; @@ -311,13 +311,13 @@ V evaluate_rational(const T* num, const U* denom, const V& z_, std::size_t count } template -inline V evaluate_rational(const T(&a)[N], const U(&b)[N], const V& z) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational(const T(&a)[N], const U(&b)[N], const V& z) BOOST_MATH_NOEXCEPT(V) { return detail::evaluate_rational_c_imp(a, b, z, static_cast*>(0)); } template -inline V evaluate_rational(const boost::array& a, const boost::array& b, const V& z) BOOST_MATH_NOEXCEPT(V) +inline BOOST_GPU_ENABLED V evaluate_rational(const boost::array& a, const boost::array& b, const V& z) BOOST_MATH_NOEXCEPT(V) { return detail::evaluate_rational_c_imp(a.data(), b.data(), z, static_cast*>(0)); } diff --git a/include/boost/math/tools/series.hpp b/include/boost/math/tools/series.hpp index ab01549a2d..f5c8276aa4 100644 --- a/include/boost/math/tools/series.hpp +++ b/include/boost/math/tools/series.hpp @@ -21,7 +21,7 @@ namespace boost{ namespace math{ namespace tools{ // Simple series summation come first: // template -inline typename Functor::result_type sum_series(Functor& func, const U& factor, 
boost::uintmax_t& max_terms, const V& init_value) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +inline BOOST_GPU_ENABLED typename Functor::result_type sum_series(Functor& func, const U& factor, boost::uintmax_t& max_terms, const V& init_value) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING @@ -44,14 +44,14 @@ inline typename Functor::result_type sum_series(Functor& func, const U& factor, } template -inline typename Functor::result_type sum_series(Functor& func, const U& factor, boost::uintmax_t& max_terms) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +inline BOOST_GPU_ENABLED typename Functor::result_type sum_series(Functor& func, const U& factor, boost::uintmax_t& max_terms) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) { typename Functor::result_type init_value = 0; return sum_series(func, factor, max_terms, init_value); } template -inline typename Functor::result_type sum_series(Functor& func, int bits, boost::uintmax_t& max_terms, const U& init_value) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +inline BOOST_GPU_ENABLED typename Functor::result_type sum_series(Functor& func, int bits, boost::uintmax_t& max_terms, const U& init_value) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; @@ -60,7 +60,7 @@ inline typename Functor::result_type sum_series(Functor& func, int bits, boost:: } template -inline typename Functor::result_type sum_series(Functor& func, int bits) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +inline BOOST_GPU_ENABLED typename Functor::result_type sum_series(Functor& func, int 
bits) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; @@ -70,7 +70,7 @@ inline typename Functor::result_type sum_series(Functor& func, int bits) BOOST_N } template -inline typename Functor::result_type sum_series(Functor& func, int bits, boost::uintmax_t& max_terms) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +inline BOOST_GPU_ENABLED typename Functor::result_type sum_series(Functor& func, int bits, boost::uintmax_t& max_terms) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING typedef typename Functor::result_type result_type; @@ -79,7 +79,7 @@ inline typename Functor::result_type sum_series(Functor& func, int bits, boost:: } template -inline typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +inline BOOST_GPU_ENABLED typename Functor::result_type sum_series(Functor& func, int bits, const U& init_value) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING boost::uintmax_t iters = (std::numeric_limits::max)(); @@ -99,7 +99,7 @@ inline typename Functor::result_type sum_series(Functor& func, int bits, const U // in any case the result is still much better than a naive summation. 
// template -inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +inline BOOST_GPU_ENABLED typename Functor::result_type kahan_sum_series(Functor& func, int bits) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING @@ -122,7 +122,7 @@ inline typename Functor::result_type kahan_sum_series(Functor& func, int bits) B } template -inline typename Functor::result_type kahan_sum_series(Functor& func, int bits, boost::uintmax_t& max_terms) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) +inline BOOST_GPU_ENABLED typename Functor::result_type kahan_sum_series(Functor& func, int bits, boost::uintmax_t& max_terms) BOOST_NOEXCEPT_IF(BOOST_MATH_IS_FLOAT(typename Functor::result_type) && noexcept(std::declval()())) { BOOST_MATH_STD_USING diff --git a/include/boost/math/tools/tuple.hpp b/include/boost/math/tools/tuple.hpp index 81de59fa3a..5daab83972 100644 --- a/include/boost/math/tools/tuple.hpp +++ b/include/boost/math/tools/tuple.hpp @@ -8,6 +8,29 @@ # include # include +#ifdef __CUDA_ARCH__ +#include + +namespace boost { + namespace math { + + using thrust::pair; + + } +} + +#else + +#include + +namespace boost { + namespace math { + using std::pair; +} } + +#endif + + #if !defined(BOOST_NO_CXX11_HDR_TUPLE) && !BOOST_WORKAROUND(BOOST_GCC_VERSION, < 40500) #include diff --git a/include/boost/math/tools/workaround.hpp b/include/boost/math/tools/workaround.hpp index 29ce8b1c04..e6eb82eba5 100644 --- a/include/boost/math/tools/workaround.hpp +++ b/include/boost/math/tools/workaround.hpp @@ -19,14 +19,14 @@ namespace boost{ namespace math{ namespace tools{ // std::fmod(1185.0L, 1.5L); // template -inline T fmod_workaround(T a, T b) BOOST_MATH_NOEXCEPT(T) +inline BOOST_GPU_ENABLED T fmod_workaround(T a, T b) BOOST_MATH_NOEXCEPT(T) { 
BOOST_MATH_STD_USING return fmod(a, b); } #if (defined(macintosh) || defined(__APPLE__) || defined(__APPLE_CC__)) && ((LDBL_MANT_DIG == 106) || (__LDBL_MANT_DIG__ == 106)) template <> -inline long double fmod_workaround(long double a, long double b) BOOST_NOEXCEPT +inline BOOST_GPU_ENABLED long double fmod_workaround(long double a, long double b) BOOST_NOEXCEPT { return ::fmodl(a, b); } diff --git a/test/cuda/Jamfile.v2 b/test/cuda/Jamfile.v2 new file mode 100644 index 0000000000..35233df73e --- /dev/null +++ b/test/cuda/Jamfile.v2 @@ -0,0 +1,19 @@ + +import testing ; +import cu ; +import modules ; + +cuda = [ modules.peek : CUDA_PATH ] ; + +searched-lib cuda : : cuda $(cuda)/lib/x64 ; +searched-lib cudart : : cudart $(cuda)/lib/x64 ; + +exe verify_cuda : cuda_check.cu cuda cudart ../../../chrono/build//boost_chrono ../../../system/build//boost_system : BOOST_ALL_NO_LIB release 64 msvc:static ; +explicit verify_cuda ; + +for local source in [ glob test*.cu ] +{ + run $(source) cuda cudart ../../../chrono/build//boost_chrono ../../../system/build//boost_system : : : release BOOST_ALL_NO_LIB 64 msvc:static [ check-target-builds .//verify_cuda "CUDA compiler support" : : no ] ; +} + +run misc/test_naive_monte_carlo.cu misc/test_naive_monte_carlo_host.cpp cuda cudart ../../../chrono/build//boost_chrono ../../../system/build//boost_system : : : release BOOST_ALL_NO_LIB 64 msvc:static [ check-target-builds .//verify_cuda "CUDA compiler support" : : no ] ; diff --git a/test/cuda/cu.jam b/test/cuda/cu.jam new file mode 100644 index 0000000000..f2efd1f82c --- /dev/null +++ b/test/cuda/cu.jam @@ -0,0 +1,15 @@ + + +import os ; +import type ; +type.register CU : cu ; +import generators ; +generators.register-standard cu.inline-file : CU : OBJ ; + +#rule cuda { } + +actions inline-file +{ + nvcc -c -O3 -DBOOST_PP_VARIADICS=0 -expt-extended-lambda -I../../../.. 
-DBOOST_ALL_NO_LIB -o $(<) $(>) +} + diff --git a/test/cuda/cuda_check.cu b/test/cuda/cuda_check.cu new file mode 100644 index 0000000000..6059c5c686 --- /dev/null +++ b/test/cuda/cuda_check.cu @@ -0,0 +1,107 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = in1[i] * in1[i]; + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(-10000, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), 
output_vector.get(), numElements); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(input_vector1[i] * input_vector1[i]); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/cuda_managed_ptr.hpp b/test/cuda/cuda_managed_ptr.hpp new file mode 100644 index 0000000000..6ec7f3f7f6 --- /dev/null +++ b/test/cuda/cuda_managed_ptr.hpp @@ -0,0 +1,138 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_CUDA_MANAGED_PTR_HPP +#define BOOST_MATH_CUDA_MANAGED_PTR_HPP + +#ifdef _MSC_VER +#pragma once +#endif + +class managed_holder_base +{ +protected: + static int count; + managed_holder_base() { ++count; } + ~managed_holder_base() + { + if(0 == --count) + cudaDeviceSynchronize(); + } +}; + +int managed_holder_base::count = 0; + +// +// Reset the device and exit: +// cudaDeviceReset causes the driver to clean up all state. While +// not mandatory in normal operation, it is good practice. It is also +// needed to ensure correct operation when the application is being +// profiled. Calling cudaDeviceReset causes all profile data to be +// flushed before the application exits. +// +// We have a global instance of this class, plus instances for each +// managed pointer. Last one out the door switches the lights off. +// +class cudaResetter +{ + static int count; +public: + cudaResetter() { ++count; } + ~cudaResetter() + { + if(--count == 0) + { + cudaError_t err = cudaDeviceReset(); + if(err != cudaSuccess) + { + std::cerr << "Failed to deinitialize the device! error=" << cudaGetErrorString(err) << std::endl; + } + } + } +}; + +int cudaResetter::count = 0; + +cudaResetter global_resetter; + +template +class cuda_managed_ptr +{ + T* data; + static const cudaResetter resetter; + cuda_managed_ptr(const cuda_managed_ptr&) = delete; + cuda_managed_ptr& operator=(cuda_managed_ptr const&) = delete; + void free() + { + if(data) + { + cudaDeviceSynchronize(); + cudaError_t err = cudaFree(data); + if(err != cudaSuccess) + { + std::cerr << "Failed to deinitialize the device! 
error=" << cudaGetErrorString(err) << std::endl; + } + } + } +public: + cuda_managed_ptr() : data(0) {} + cuda_managed_ptr(std::size_t n) + { + cudaError_t err = cudaSuccess; + void *ptr; + err = cudaMallocManaged(&ptr, n * sizeof(T)); + if(err != cudaSuccess) + throw std::runtime_error(cudaGetErrorString(err)); + cudaDeviceSynchronize(); + data = static_cast(ptr); + } + cuda_managed_ptr(cuda_managed_ptr&& o) + { + data = o.data; + o.data = 0; + } + cuda_managed_ptr& operator=(cuda_managed_ptr&& o) + { + free(); + data = o.data; + o.data = 0; + return *this; + } + ~cuda_managed_ptr() + { + free(); + } + + class managed_holder : managed_holder_base + { + T* pdata; + public: + managed_holder(T* p) : managed_holder_base(), pdata(p) {} + managed_holder(const managed_holder& o) : managed_holder_base(), pdata(o.pdata) {} + operator T* () { return pdata; } + T& operator[] (std::size_t n) { return pdata[n]; } + }; + class const_managed_holder : managed_holder_base + { + const T* pdata; + public: + const_managed_holder(T* p) : managed_holder_base(), pdata(p) {} + const_managed_holder(const managed_holder& o) : managed_holder_base(), pdata(o.pdata) {} + operator const T* () { return pdata; } + const T& operator[] (std::size_t n) { return pdata[n]; } + }; + + managed_holder get() { return managed_holder(data); } + const_managed_holder get()const { return data; } + T& operator[](std::size_t n) { return data[n]; } + const T& operator[](std::size_t n)const { return data[n]; } +}; + +template +cudaResetter const cuda_managed_ptr::resetter; + +#endif + + diff --git a/test/cuda/misc/test_naive_monte_carlo.cu b/test/cuda/misc/test_naive_monte_carlo.cu new file mode 100644 index 0000000000..9ebaf93c96 --- /dev/null +++ b/test/cuda/misc/test_naive_monte_carlo.cu @@ -0,0 +1,198 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#ifdef __CUDACC__ +#define BOOST_PP_VARIADICS 0 +#endif + +#include +#include +#include +#include "../stopwatch.hpp" +#include +#include +#include +#include +#include + + +typedef double float_type; + + template + struct pi_calculator + { + __host__ __device__ Real operator()(const Real* abscissa)const + { + return abscissa[0] * abscissa[0] + abscissa[1] * abscissa[1] <= 1 ? 1 : 0; + } + }; + + template + struct hypersphere + { + __host__ __device__ Real operator()(const Real* abscissa)const + { + Real location = 0; + for (unsigned i = 0; i < N; ++i) + location += abscissa[i] * abscissa[i]; + return location <= 1 ? 1 : 0; + } + }; + + void test_host_pi(double); + void test_host_hypersphere_10(); + +/** + * Host main routine + */ +int main(void) +{ + + using boost::math::quadrature::cuda_naive_monte_carlo; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + boost::uintmax_t max_threads = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; + + std::cout << "maxThreadsPerBlock = " << deviceProp.maxThreadsPerBlock << std::endl; + std::cout << "maxThreadsPerMultiProcessor = " << deviceProp.maxThreadsPerMultiProcessor << std::endl; + std::cout << "multiProcessorCount = " << deviceProp.multiProcessorCount << std::endl << std::endl; + std::cout << "Total max threads = " << max_threads << std::endl; + + std::cout << "Testing Pi calculation for circle formula.\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + std::cout << std::right << std::setw(15); + std::cout << "Threads" << std::right << std::setw(15) << "Init #/thread" + << std::right << std::setw(15) << "Total Points" + << std::right << std::setw(15) << "Time" + << std::right << std::setw(15) << "Error Goal" + << std::right << std::setw(15) << "Variance" + << std::right << std::setw(15) << "Error Est." 
+ << std::right << std::setw(15) << "Error Actual" << std::endl; + + // + // Do something to initialize the CUDA device: + // + do { + std::vector > bounds = { { -1, 1 },{ -1, 1 } }; + cuda_naive_monte_carlo, thrust::random::taus88> init(pi_calculator(), bounds); + init.integrate(0.01); + } while (0); + + + for (double error_goal = 1e-3; error_goal > 1e-5; error_goal /= 2) + { + for (boost::uintmax_t calls_per_thread = 128; calls_per_thread < 2048; calls_per_thread *= 2) + { + try { + std::vector > bounds = { {-1, 1}, {-1, 1} }; + + watch w; + cuda_naive_monte_carlo, thrust::random::taus88> integrator(pi_calculator(), bounds); + double val = integrator.integrate(error_goal, calls_per_thread); + double elapsed = w.elapsed(); + double err = fabs(val - boost::math::constants::pi()); + std::cout << std::right << std::setw(15) << std::fixed << max_threads + << std::right << std::setw(15) << calls_per_thread + << std::right << std::setw(15) << integrator.calls() + << std::right << std::setw(15) << std::fixed << elapsed + << std::right << std::setw(15) << std::scientific << error_goal + << std::right << std::setw(15) << std::scientific << integrator.variance() + << std::right << std::setw(15) << std::scientific << integrator.current_error_estimate() + << std::right << std::setw(15) << std::scientific << err << std::endl; + } + catch (const std::exception& e) + { + std::cout << "Found exception: " << e.what() << std::endl; + } + } + } + + for (double error_goal = 1e-3; error_goal > 1e-5; error_goal /= 2) + { + test_host_pi(error_goal); + } + + std::cout << "Testing Hypersphere volume.\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + std::cout << std::right << std::setw(15); + std::cout << "Threads" << std::right << std::setw(15) << "Init #/thread" + << std::right << std::setw(15) << "Total Points" + << std::right << std::setw(15) << "Time" + << std::right << std::setw(15) << "Error Goal" + << std::right << std::setw(15) << "Variance" + << std::right << std::setw(15) << "Error 
Est." + << std::right << std::setw(15) << "Error Actual" << std::endl; + + double hypersphere10 = std::pow(boost::math::constants::pi(), 5) / boost::math::tgamma(6.0); + + // initialized CUDA device code: + { + std::vector > bounds; + std::pair point = { -1.0, 1.0 }; + for (unsigned i = 0; i < 10; ++i) + bounds.push_back(point); + + cuda_naive_monte_carlo, thrust::random::taus88> integrator(hypersphere(), bounds); + double val = integrator.integrate(1e-2); + } + + for (boost::uintmax_t calls_per_thread = 64; calls_per_thread < 4086; calls_per_thread *= 2) + { + try { + double error_goal = 0.02; + + std::vector > bounds; + std::pair point = { -1.0, 1.0 }; + for (unsigned i = 0; i < 10; ++i) + bounds.push_back(point); + + watch w; + cuda_naive_monte_carlo, thrust::random::taus88> integrator(hypersphere(), bounds); + double val = integrator.integrate(error_goal, calls_per_thread); + double elapsed = w.elapsed(); + double err = fabs(val - hypersphere10); + std::cout << std::right << std::setw(15) << std::fixed << max_threads + << std::right << std::setw(15) << calls_per_thread + << std::right << std::setw(15) << integrator.calls() + << std::right << std::setw(15) << std::fixed << elapsed + << std::right << std::setw(15) << std::scientific << error_goal + << std::right << std::setw(15) << std::scientific << integrator.variance() + << std::right << std::setw(15) << std::scientific << integrator.current_error_estimate() + << std::right << std::setw(15) << std::scientific << err << std::endl; + } + catch (const std::exception& e) + { + std::cout << "Found exception: " << e.what() << std::endl; + } + } + + test_host_hypersphere_10(); + + /* Example code from docs */ + + { + + // Define a function to integrate: + auto g = [] __device__ (const double* x) + { + constexpr const double pi = boost::math::constants::pi(); + constexpr const double A = 1.0 / (pi * pi * pi); + return A / (1.0 - cos(x[0])*cos(x[1])*cos(x[2])); + }; + std::vector> bounds{ { 0, 
boost::math::constants::pi() },{ 0, boost::math::constants::pi() },{ 0, boost::math::constants::pi() } }; + double error_goal = 0.001; + cuda_naive_monte_carlo mc(g, bounds); + + double result = mc.integrate(error_goal); + + std::cout << "Integration result is: " << result << std::endl; + + + } + + return 0; +} + diff --git a/test/cuda/misc/test_naive_monte_carlo_host.cpp b/test/cuda/misc/test_naive_monte_carlo_host.cpp new file mode 100644 index 0000000000..5bd819a19e --- /dev/null +++ b/test/cuda/misc/test_naive_monte_carlo_host.cpp @@ -0,0 +1,98 @@ + + +#include +#include +#include +#include +#include "../stopwatch.hpp" +#include +#include +#include +#include + + + +void test_host_pi(double error_goal) +{ + using boost::math::quadrature::naive_monte_carlo; + + + watch w; + auto g = [](std::vector const & x)->double + { + double r = x[0] * x[0] + x[1] * x[1]; + if (r <= 1) + { + return 1; + } + return 0; + }; + + std::cout << "Regular host code:\n"; + + std::vector> bounds2{ { -1.0, 1.0 },{ -1., 1. 
} }; + + for (unsigned threads = 1; threads < 9; ++threads) + { + w.reset(); + naive_monte_carlo mc(g, bounds2, error_goal, + /*singular =*/ false,/* threads = */ threads, /* seed = */ 128402); + auto task = mc.integrate(); + double val = task.get(); + double elapsed = w.elapsed(); + boost::uintmax_t points = mc.calls(); + double err = fabs(val - boost::math::constants::pi()); + std::cout << std::right << std::setw(15) << threads + << std::right << std::setw(15) << "-" + << std::right << std::setw(15) << points + << std::right << std::setw(15) << std::fixed << elapsed + << std::right << std::setw(15) << std::scientific << error_goal + << std::right << std::setw(15) << std::scientific << mc.variance() + << std::right << std::setw(15) << std::scientific << mc.current_error_estimate() + << std::right << std::setw(15) << std::scientific << err << std::endl; + } +} + +void test_host_hypersphere_10() +{ + using boost::math::quadrature::naive_monte_carlo; + + + watch w; + auto g = [](std::vector const & x)->double + { + double location = 0; + for (unsigned i = 0; i < 10; ++i) + location += x[i] * x[i]; + return location <= 1 ? 
1 : 0; + }; + + std::cout << "Regular host code:\n"; + + double hypersphere10 = std::pow(boost::math::constants::pi(), 5) / boost::math::tgamma(6.0); + + std::vector> bounds2; + std::pair point = { -1.0, 1.0 }; + for (unsigned i = 0; i < 10; ++i) + bounds2.push_back(point); + double error_goal = 0.02; + for (unsigned threads = 1; threads < 9; ++threads) + { + w.reset(); + naive_monte_carlo mc(g, bounds2, error_goal, + /*singular =*/ false,/* threads = */ threads, /* seed = */ 128402); + auto task = mc.integrate(); + double val = task.get(); + double elapsed = w.elapsed(); + boost::uintmax_t points = mc.calls(); + double err = fabs(val - hypersphere10); + std::cout << std::right << std::setw(15) << threads + << std::right << std::setw(15) << "-" + << std::right << std::setw(15) << points + << std::right << std::setw(15) << std::fixed << elapsed + << std::right << std::setw(15) << std::scientific << error_goal + << std::right << std::setw(15) << std::scientific << mc.variance() + << std::right << std::setw(15) << std::scientific << mc.current_error_estimate() + << std::right << std::setw(15) << std::scientific << err << std::endl; + } +} diff --git a/test/cuda/misc/test_naive_monte_carlo_output.txt b/test/cuda/misc/test_naive_monte_carlo_output.txt new file mode 100644 index 0000000000..71d7b62737 --- /dev/null +++ b/test/cuda/misc/test_naive_monte_carlo_output.txt @@ -0,0 +1,117 @@ +maxThreadsPerBlock = 1024 +maxThreadsPerMultiProcessor = 2048 +multiProcessorCount = 5 + +Total max threads = 10240 +Testing Pi calculation for circle formula. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Threads Init #/thread Total Points Time Error Goal Variance Error Est. 
Error Actual + 10240 128 3481600 0.008367 1.000000e-03 2.698975e+00 8.804608e-04 9.676536e-04 + 10240 256 3031040 0.008578 1.000000e-03 2.698542e+00 9.435583e-04 7.776198e-04 + 10240 512 5242880 0.011217 1.000000e-03 2.698333e+00 7.174024e-04 6.861289e-04 + 10240 1024 10485760 0.016828 1.000000e-03 2.696115e+00 5.070715e-04 2.854745e-04 + 10240 128 12154880 0.019123 5.000000e-04 2.697501e+00 4.710917e-04 3.216579e-04 + 10240 256 22097920 0.030329 5.000000e-04 2.696124e+00 3.492965e-04 2.815138e-04 + 10240 512 12226560 0.018789 5.000000e-04 2.695405e+00 4.695262e-04 5.961550e-04 + 10240 1024 21821440 0.030160 5.000000e-04 2.696328e+00 3.515157e-04 1.917566e-04 + 10240 128 47482880 0.056661 2.500000e-04 2.696911e+00 2.383224e-04 6.349613e-05 + 10240 256 45527040 0.052038 2.500000e-04 2.697649e+00 2.434210e-04 3.866362e-04 + 10240 512 46909440 0.051340 2.500000e-04 2.697758e+00 2.398123e-04 4.345413e-04 + 10240 1024 45086720 0.049912 2.500000e-04 2.696520e+00 2.445555e-04 1.079625e-04 + 10240 128 194457600 0.180079 1.250000e-04 2.696297e+00 1.177528e-04 2.055790e-04 + 10240 256 179671040 0.148807 1.250000e-04 2.695943e+00 1.224944e-04 3.606851e-04 + 10240 512 185292800 0.142422 1.250000e-04 2.696969e+00 1.206448e-04 8.891464e-05 + 10240 1024 178534400 0.128817 1.250000e-04 2.696871e+00 1.229049e-04 4.579203e-05 + 10240 128 812759040 0.530420 6.250000e-05 2.696687e+00 5.760158e-05 3.456719e-05 + 10240 256 765757440 0.498838 6.250000e-05 2.696707e+00 5.934324e-05 2.588295e-05 + 10240 512 712366080 0.464361 6.250000e-05 2.696697e+00 6.152682e-05 3.040628e-05 + 10240 1024 723793920 0.471714 6.250000e-05 2.696916e+00 6.104165e-05 6.568415e-05 + 10240 128 2899230720 1.873548 3.125000e-05 2.696716e+00 3.049834e-05 2.181716e-05 + 10240 256 2908538880 1.879682 3.125000e-05 2.696710e+00 3.044947e-05 2.464258e-05 + 10240 512 2887147520 1.867332 3.125000e-05 2.696759e+00 3.056234e-05 3.253501e-06 + 10240 1024 2905436160 1.877813 3.125000e-05 2.696726e+00 3.046581e-05 1.766629e-05 
+ 10240 128 11596021760 7.482193 1.562500e-05 2.696714e+00 1.524976e-05 2.306342e-05 + 10240 256 11597219840 7.485470 1.562500e-05 2.696743e+00 1.524905e-05 1.027976e-05 + 10240 512 11603496960 7.489412 1.562500e-05 2.696769e+00 1.524500e-05 1.019131e-06 + 10240 1024 11591219200 7.483967 1.562500e-05 2.696742e+00 1.525299e-05 1.074012e-05 +Regular host code: + 1 - 4550657 0.101999 1.000000e-03 2.697200e+00 7.698737e-04 1.895551e-04 + 2 - 9011202 0.101099 1.000000e-03 2.697681e+00 5.471469e-04 4.006128e-04 + 3 - 13193219 0.101715 1.000000e-03 2.697069e+00 4.521376e-04 1.324838e-04 + 4 - 15540228 0.101911 1.000000e-03 2.696240e+00 4.165341e-04 2.304908e-04 + 5 - 17301509 0.101999 1.000000e-03 2.695919e+00 3.947402e-04 3.705127e-04 + 6 - 17262598 0.101882 1.000000e-03 2.696000e+00 3.951908e-04 3.351142e-04 + 7 - 15417351 0.101948 1.000000e-03 2.696114e+00 4.181810e-04 2.850918e-04 + 8 - 22349832 0.128935 1.000000e-03 2.696685e+00 3.473585e-04 3.525663e-05 +Regular host code: + 1 - 13709313 0.302718 5.000000e-04 2.695696e+00 4.434326e-04 4.688049e-04 + 2 - 18317314 0.201833 5.000000e-04 2.696751e+00 3.836980e-04 6.600537e-06 + 3 - 13318147 0.101908 5.000000e-04 2.697159e+00 4.500196e-04 1.721542e-04 + 4 - 17143812 0.101902 5.000000e-04 2.696372e+00 3.965849e-04 1.727834e-04 + 5 - 17346565 0.101924 5.000000e-04 2.696311e+00 3.942559e-04 1.989345e-04 + 6 - 16046086 0.101931 5.000000e-04 2.696458e+00 4.099324e-04 1.344941e-04 + 7 - 17283079 0.101943 5.000000e-04 2.696471e+00 3.949911e-04 1.285640e-04 + 8 - 21749768 0.128923 5.000000e-04 2.696677e+00 3.521171e-04 3.865025e-05 +Regular host code: + 1 - 46278657 1.009327 2.500000e-04 2.696945e+00 2.414047e-04 7.806729e-05 + 2 - 45809666 0.504654 2.500000e-04 2.696625e+00 2.426228e-04 6.203562e-05 + 3 - 52557827 0.403961 2.500000e-04 2.697215e+00 2.265370e-04 1.967964e-04 + 4 - 50624516 0.605888 2.500000e-04 2.696779e+00 2.308034e-04 5.759208e-06 + 5 - 47114245 0.303742 2.500000e-04 2.696477e+00 2.392336e-04 1.267988e-04 + 6 
- 46374918 0.303819 2.500000e-04 2.696350e+00 2.411274e-04 1.822170e-04 + 7 - 51236871 0.312092 2.500000e-04 2.696683e+00 2.294160e-04 3.619412e-05 + 8 - 54386696 0.331619 2.500000e-04 2.696812e+00 2.226789e-04 2.016314e-05 +Regular host code: + 1 - 175192065 3.836674 1.250000e-04 2.696896e+00 1.240723e-04 5.692251e-05 + 2 - 175951874 2.020645 1.250000e-04 2.696651e+00 1.237985e-04 5.033908e-05 + 3 - 181583875 1.414251 1.250000e-04 2.696676e+00 1.218641e-04 3.934430e-05 + 4 - 183513092 1.212304 1.250000e-04 2.696707e+00 1.212225e-04 2.579835e-05 + 5 - 175282181 1.212331 1.250000e-04 2.696732e+00 1.240367e-04 1.478682e-05 + 6 - 173148166 1.313198 1.250000e-04 2.696545e+00 1.247943e-04 9.671319e-05 + 7 - 175978503 1.111325 1.250000e-04 2.696631e+00 1.237887e-04 5.943786e-05 + 8 - 173408264 1.080426 1.250000e-04 2.696845e+00 1.247077e-04 3.453269e-05 +Regular host code: + 1 - 693923841 16.151669 6.250000e-05 2.696759e+00 6.233976e-05 3.232291e-06 + 2 - 694083586 7.671572 6.250000e-05 2.696786e+00 6.233290e-05 8.717617e-06 + 3 - 699369475 5.249936 6.250000e-05 2.696698e+00 6.209588e-05 3.006593e-05 + 4 - 699097092 4.240581 6.250000e-05 2.696848e+00 6.210971e-05 3.592687e-05 + 5 - 698124293 4.442437 6.250000e-05 2.696688e+00 6.215113e-05 3.411114e-05 + 6 - 691200006 4.341502 6.250000e-05 2.696544e+00 6.245999e-05 9.729599e-05 + 7 - 700665863 4.251577 6.250000e-05 2.696596e+00 6.203724e-05 7.446656e-05 + 8 - 691546120 4.298594 6.250000e-05 2.696597e+00 6.244496e-05 7.427674e-05 +Regular host code: + 1 - 2762876929 59.859699 3.125000e-05 2.696780e+00 3.124223e-05 6.067197e-06 + 2 - 2763421698 30.383588 3.125000e-05 2.696766e+00 3.123906e-05 2.528722e-07 + 3 - 2772873219 20.996881 3.125000e-05 2.696811e+00 3.118604e-05 1.943188e-05 + 4 - 2762248196 17.363063 3.125000e-05 2.696818e+00 3.124600e-05 2.281328e-05 + 5 - 2764449797 17.665886 3.125000e-05 2.696765e+00 3.123325e-05 3.973764e-07 + 6 - 2768607238 17.766842 3.125000e-05 2.696751e+00 3.120971e-05 6.553629e-06 + 7 - 
2766036999 17.587942 3.125000e-05 2.696786e+00 3.122441e-05 8.847176e-06 + 8 - 2766690312 17.630970 3.125000e-05 2.696802e+00 3.122082e-05 1.577008e-05 +Regular host code: + 1 - 11046782977 255.202993 1.562500e-05 2.696767e+00 1.562442e-05 4.741963e-07 + 2 - 11051421698 122.614502 1.562500e-05 2.696776e+00 1.562116e-05 4.096320e-06 + 3 - 11048861699 86.396803 1.562500e-05 2.696747e+00 1.562289e-05 8.501137e-06 + 4 - 11060627460 77.351707 1.562500e-05 2.696739e+00 1.561455e-05 1.196637e-05 + 5 - 11048994821 175.726520 1.562500e-05 2.696776e+00 1.562288e-05 4.087902e-06 + 6 - 11050817542 76.486873 1.562500e-05 2.696752e+00 1.562152e-05 6.182881e-06 + 7 - 11058628615 74.262706 1.562500e-05 2.696773e+00 1.561606e-05 2.913170e-06 + 8 - 11047886856 83.430636 1.562500e-05 2.696755e+00 1.562360e-05 5.093417e-06 +Testing Hypersphere volume. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Threads Init #/thread Total Points Time Error Goal Variance Error Est. Error Actual + 10240 64 7311360 0.053767 2.000000e-02 2.574587e+03 1.876526e-02 2.971586e-02 + 10240 128 8253440 0.060574 2.000000e-02 2.598377e+03 1.774327e-02 6.367514e-03 + 10240 256 9318400 0.065937 2.000000e-02 2.581856e+03 1.664544e-02 2.258162e-02 + 10240 512 12103680 0.076164 2.000000e-02 2.619352e+03 1.471086e-02 1.421836e-02 + 10240 1024 10485760 0.065995 2.000000e-02 2.608479e+03 1.577225e-02 3.546898e-03 + 10240 2048 20971520 0.115725 2.000000e-02 2.609822e+03 1.115554e-02 4.865257e-03 +Regular host code: + 1 - 7063553 0.504751 2.000000e-02 2.616529e+03 1.924647e-02 1.144766e-02 + 2 - 8331266 0.301304 2.000000e-02 2.603470e+03 1.767750e-02 1.369175e-03 + 3 - 8808451 0.302716 2.000000e-02 2.596392e+03 1.716862e-02 8.314173e-03 + 4 - 9146372 0.202616 2.000000e-02 2.630406e+03 1.695848e-02 2.506994e-02 + 5 - 7051269 0.202599 2.000000e-02 2.593405e+03 1.917792e-02 1.124629e-02 + 6 - 7208966 0.202971 2.000000e-02 2.610192e+03 1.902829e-02 5.231561e-03 + 7 - 9146375 0.223786 2.000000e-02 2.604055e+03 1.687332e-02 7.929502e-04 
+ 8 - 10758152 0.258991 2.000000e-02 2.602338e+03 1.555296e-02 2.478898e-03 diff --git a/test/cuda/stopwatch.hpp b/test/cuda/stopwatch.hpp new file mode 100644 index 0000000000..028fe25fa4 --- /dev/null +++ b/test/cuda/stopwatch.hpp @@ -0,0 +1,41 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_CUDA_STOPWATCH_HPP +#define BOOST_MATH_CUDA_STOPWATCH_HPP + +#ifdef _MSC_VER +#pragma once +#endif + +#include + +template +struct stopwatch +{ + typedef typename Clock::duration duration; + stopwatch() + { + m_start = Clock::now(); + } + double elapsed() + { + duration t = Clock::now() - m_start; + return boost::chrono::duration_cast>(t).count(); + } + void reset() + { + m_start = Clock::now(); + } + +private: + typename Clock::time_point m_start; +}; + +typedef stopwatch watch; + +#endif + + diff --git a/test/cuda/test_arcsine_cdf_double.cu b/test/cuda/test_arcsine_cdf_double.cu new file mode 100644 index 0000000000..7b93a830cc --- /dev/null +++ b/test/cuda/test_arcsine_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::arcsine_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::arcsine_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_arcsine_cdf_float.cu b/test/cuda/test_arcsine_cdf_float.cu new file mode 100644 index 0000000000..d42084cd0a --- /dev/null +++ b/test/cuda/test_arcsine_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::arcsine_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::arcsine_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_arcsine_pdf_double.cu b/test/cuda/test_arcsine_pdf_double.cu new file mode 100644 index 0000000000..3d61ce6f07 --- /dev/null +++ b/test/cuda/test_arcsine_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::arcsine_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::arcsine_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_arcsine_pdf_float.cu b/test/cuda/test_arcsine_pdf_float.cu new file mode 100644 index 0000000000..2dd5d6dbdf --- /dev/null +++ b/test/cuda/test_arcsine_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::arcsine_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::arcsine_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_arcsine_quan_double.cu b/test/cuda/test_arcsine_quan_double.cu new file mode 100644 index 0000000000..46b3ab4c9a --- /dev/null +++ b/test/cuda/test_arcsine_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::arcsine_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::arcsine_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_arcsine_quan_float.cu b/test/cuda/test_arcsine_quan_float.cu new file mode 100644 index 0000000000..39ff89041c --- /dev/null +++ b/test/cuda/test_arcsine_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::arcsine_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::arcsine_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_beta_double.cu b/test/cuda/test_beta_double.cu new file mode 100644 index 0000000000..2114a75698 --- /dev/null +++ b/test/cuda/test_beta_double.cu @@ -0,0 +1,130 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type * in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::beta(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../beta_med_data.ipp" +#include "../beta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2; + + for(unsigned i = 0; i < beta_med_data.size(); ++i) + { + v1.push_back(beta_med_data[i][0]); + v2.push_back(beta_med_data[i][1]); + } + for(unsigned i = 0; i < beta_small_data.size(); ++i) + { + v1.push_back(beta_small_data[i][0]); + v2.push_back(beta_small_data[i][1]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int 
blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::beta(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_beta_float.cu b/test/cuda/test_beta_float.cu new file mode 100644 index 0000000000..326b32a68a --- /dev/null +++ b/test/cuda/test_beta_float.cu @@ -0,0 +1,130 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type * in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::beta(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../beta_med_data.ipp" +#include "../beta_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2; + + for(unsigned i = 0; i < beta_med_data.size(); ++i) + { + v1.push_back(beta_med_data[i][0]); + v2.push_back(beta_med_data[i][1]); + } + for(unsigned i = 0; i < beta_small_data.size(); ++i) + { + v1.push_back(beta_small_data[i][0]); + v2.push_back(beta_small_data[i][1]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int 
blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::beta(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_cauchy_cdf_double.cu b/test/cuda/test_cauchy_cdf_double.cu new file mode 100644 index 0000000000..4b346f0809 --- /dev/null +++ b/test/cuda/test_cauchy_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::cauchy_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(-10000, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << 
")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::cauchy_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_cauchy_cdf_float.cu b/test/cuda/test_cauchy_cdf_float.cu new file mode 100644 index 0000000000..d839097dec --- /dev/null +++ b/test/cuda/test_cauchy_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::cauchy_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(-10000, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << 
")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::cauchy_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_cauchy_pdf_double.cu b/test/cuda/test_cauchy_pdf_double.cu new file mode 100644 index 0000000000..a6e1c7a400 --- /dev/null +++ b/test/cuda/test_cauchy_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::cauchy_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(-10000, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << 
")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::cauchy_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_cauchy_pdf_float.cu b/test/cuda/test_cauchy_pdf_float.cu new file mode 100644 index 0000000000..d3555f375d --- /dev/null +++ b/test/cuda/test_cauchy_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::cauchy_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(-10000, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << 
")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::cauchy_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_cauchy_quan_double.cu b/test/cuda/test_cauchy_quan_double.cu new file mode 100644 index 0000000000..e0b0080908 --- /dev/null +++ b/test/cuda/test_cauchy_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::cauchy_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::cauchy_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_cauchy_quan_float.cu b/test/cuda/test_cauchy_quan_float.cu new file mode 100644 index 0000000000..3d4ecb5b37 --- /dev/null +++ b/test/cuda/test_cauchy_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::cauchy_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::cauchy_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_cbrt_double.cu b/test/cuda/test_cbrt_double.cu new file mode 100644 index 0000000000..0e7034e897 --- /dev/null +++ b/test/cuda/test_cbrt_double.cu @@ -0,0 +1,99 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cbrt(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::cbrt(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_cbrt_float.cu b/test/cuda/test_cbrt_float.cu new file mode 100644 index 0000000000..c1bd6208b1 --- /dev/null +++ b/test/cuda/test_cbrt_float.cu @@ -0,0 +1,98 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + + +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::cbrt(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + 
cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::cbrt(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED in " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_changesign_double.cu b/test/cuda/test_changesign_double.cu new file mode 100644 index 0000000000..b95a867a59 --- /dev/null +++ b/test/cuda/test_changesign_double.cu @@ -0,0 +1,112 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::changesign(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + switch(i % 55) + { + case 1: + h_A[i] = 0; + break; + case 2: + h_A[i] = std::numeric_limits::infinity(); + break; + case 3: + h_A[i] = -std::numeric_limits::infinity(); + break; + } + if(i % 1) + h_A[i] = -h_A[i]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::changesign(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (h_C[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_chi_sq_cdf_double.cu b/test/cuda/test_chi_sq_cdf_double.cu new file mode 100644 index 0000000000..9e52388f73 --- /dev/null +++ b/test/cuda/test_chi_sq_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::chi_squared_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector 
A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::chi_squared_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_chi_sq_cdf_float.cu b/test/cuda/test_chi_sq_cdf_float.cu new file mode 100644 index 0000000000..44bc07746d --- /dev/null +++ b/test/cuda/test_chi_sq_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::chi_squared_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + 
boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::chi_squared_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_chi_sq_pdf_double.cu b/test/cuda/test_chi_sq_pdf_double.cu new file mode 100644 index 0000000000..f114f6aba3 --- /dev/null +++ b/test/cuda/test_chi_sq_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::chi_squared_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != 
cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::chi_squared_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_chi_sq_pdf_float.cu b/test/cuda/test_chi_sq_pdf_float.cu new file mode 100644 index 0000000000..6c83af21a5 --- /dev/null +++ b/test/cuda/test_chi_sq_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::chi_squared_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::chi_squared_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_copysign_double.cu b/test/cuda/test_copysign_double.cu new file mode 100644 index 0000000000..0ddcdfa395 --- /dev/null +++ b/test/cuda/test_copysign_double.cu @@ -0,0 +1,112 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::copysign(in[i], float_type(-1.0)); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + switch(i % 55) + { + case 1: + h_A[i] = 0; + break; + case 2: + h_A[i] = std::numeric_limits::infinity(); + break; + case 3: + h_A[i] = -std::numeric_limits::infinity(); + break; + } + if(i % 1) + h_A[i] = -h_A[i]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::copysign(h_A[i], float_type(-1.0))); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (h_C[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_1E_double.cu b/test/cuda/test_ellint_1E_double.cu new file mode 100644 index 0000000000..4d4116aea7 --- /dev/null +++ b/test/cuda/test_ellint_1E_double.cu @@ -0,0 +1,117 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_1(in[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_k_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v; + + for(unsigned i = 0; i < ellint_k_data.size(); ++i) + v.push_back(ellint_k_data[i][0]); + cudaError_t err = cudaSuccess; + + // 
Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v.size(); + input_vector[i] = v[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_1(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ellint_1_double.cu b/test/cuda/test_ellint_1_double.cu new file mode 100644 index 0000000000..406e8e1040 --- /dev/null +++ b/test/cuda/test_ellint_1_double.cu @@ -0,0 +1,117 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_1(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_f_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + std::vector v1, v2; + + for(unsigned i = 0; i < ellint_f_data.size(); ++i) + { + v1.push_back(ellint_f_data[i][1]); + v2.push_back(ellint_f_data[i][0]); + } + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input 
vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_1(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_2_double.cu b/test/cuda/test_ellint_2_double.cu new file mode 100644 index 0000000000..b7e4d42919 --- /dev/null +++ b/test/cuda/test_ellint_2_double.cu @@ -0,0 +1,117 @@ +// Copyright John Maddock 2016. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_2(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_e2_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + std::vector v1, v2; + + for(unsigned i = 0; i < ellint_e2_data.size(); ++i) + { + v1.push_back(ellint_e2_data[i][1]); + v2.push_back(ellint_e2_data[i][0]); + } + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" 
<< std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_2(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_2c_double.cu b/test/cuda/test_ellint_2c_double.cu new file mode 100644 index 0000000000..6f8dd25608 --- /dev/null +++ b/test/cuda/test_ellint_2c_double.cu @@ -0,0 +1,117 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_2(in[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_e_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v; + + for(unsigned i = 0; i < ellint_e_data.size(); ++i) + v.push_back(ellint_e_data[i][0]); + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v.size(); + input_vector[i] = v[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = 
cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_2(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ellint_3_double.cu b/test/cuda/test_ellint_3_double.cu new file mode 100644 index 0000000000..04fcca0b03 --- /dev/null +++ b/test/cuda/test_ellint_3_double.cu @@ -0,0 +1,120 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type* in2, const float_type* in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_3(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_pi3_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ellint_pi3_data.size(); ++i) + { + v1.push_back(ellint_pi3_data[i][2]); + v2.push_back(ellint_pi3_data[i][0]); + v3.push_back(ellint_pi3_data[i][1]); + } + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << 
blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_3(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_3c_double.cu b/test/cuda/test_ellint_3c_double.cu new file mode 100644 index 0000000000..1c7928c9a0 --- /dev/null +++ b/test/cuda/test_ellint_3c_double.cu @@ -0,0 +1,117 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_3(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_pi2_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + std::vector v1, v2; + + for(unsigned i = 0; i < ellint_pi2_data.size(); ++i) + { + v1.push_back(ellint_pi2_data[i][1]); + v2.push_back(ellint_pi2_data[i][0]); + } + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), 
output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_3(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_d_double.cu b/test/cuda/test_ellint_d_double.cu new file mode 100644 index 0000000000..b067ceefcf --- /dev/null +++ b/test/cuda/test_ellint_d_double.cu @@ -0,0 +1,122 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_d(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_d2_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2; + + for(unsigned i = 0; i < ellint_d2_data.size(); ++i) + { + v1.push_back(ellint_d2_data[i][1]); + v2.push_back(ellint_d2_data[i][0]); + } + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + 
cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_d(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ellint_dc_double.cu b/test/cuda/test_ellint_dc_double.cu new file mode 100644 index 0000000000..42ec1f28d0 --- /dev/null +++ b/test/cuda/test_ellint_dc_double.cu @@ -0,0 +1,117 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_d(in[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_d_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v; + + for(unsigned i = 0; i < ellint_d_data.size(); ++i) + v.push_back(ellint_d_data[i][0]); + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v.size(); + input_vector[i] = v[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = 
cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_d(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ellint_lambda_double.cu b/test/cuda/test_ellint_lambda_double.cu new file mode 100644 index 0000000000..a50ddbb17e --- /dev/null +++ b/test/cuda/test_ellint_lambda_double.cu @@ -0,0 +1,122 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::heuman_lambda(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../heuman_lambda_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2; + + for(unsigned i = 0; i < heuman_lambda_data.size(); ++i) + { + v1.push_back(heuman_lambda_data[i][1]); + v2.push_back(heuman_lambda_data[i][0]); + } + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 128; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + 
cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::heuman_lambda(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10000) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ellint_rc_double.cu b/test/cuda/test_ellint_rc_double.cu new file mode 100644 index 0000000000..5b8a395216 --- /dev/null +++ b/test/cuda/test_ellint_rc_double.cu @@ -0,0 +1,117 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type* in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_rc(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_rc_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + std::vector v1, v2; + + for(unsigned i = 0; i < ellint_rc_data.size(); ++i) + { + v1.push_back(ellint_rc_data[i][0]); + v2.push_back(ellint_rc_data[i][1]); + } + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), 
output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_rc(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_rd_double.cu b/test/cuda/test_ellint_rd_double.cu new file mode 100644 index 0000000000..334fe5339b --- /dev/null +++ b/test/cuda/test_ellint_rd_double.cu @@ -0,0 +1,120 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type* in2, const float_type* in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_rd(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_rd_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ellint_rd_data.size(); ++i) + { + v1.push_back(ellint_rd_data[i][0]); + v2.push_back(ellint_rd_data[i][1]); + v3.push_back(ellint_rd_data[i][2]); + } + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << 
blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_rd(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_rf_double.cu b/test/cuda/test_ellint_rf_double.cu new file mode 100644 index 0000000000..19e7604ba8 --- /dev/null +++ b/test/cuda/test_ellint_rf_double.cu @@ -0,0 +1,120 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type* in2, const float_type* in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_rf(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_rf_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ellint_rf_data.size(); ++i) + { + v1.push_back(ellint_rf_data[i][0]); + v2.push_back(ellint_rf_data[i][1]); + v3.push_back(ellint_rf_data[i][2]); + } + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << 
blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_rf(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_rg_double.cu b/test/cuda/test_ellint_rg_double.cu new file mode 100644 index 0000000000..107c719227 --- /dev/null +++ b/test/cuda/test_ellint_rg_double.cu @@ -0,0 +1,120 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type* in2, const float_type* in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_rg(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_rg.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ellint_rg.size(); ++i) + { + v1.push_back(ellint_rg[i][0]); + v2.push_back(ellint_rg[i][1]); + v3.push_back(ellint_rg[i][2]); + } + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " 
<< threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_rg(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_rj_double.cu b/test/cuda/test_ellint_rj_double.cu new file mode 100644 index 0000000000..d18c5ea6da --- /dev/null +++ b/test/cuda/test_ellint_rj_double.cu @@ -0,0 +1,124 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type* in2, const float_type* in3, const float_type* in4, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ellint_rj(in1[i], in2[i], in3[i], in4[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ellint_rj_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + std::vector v1, v2, v3, v4; + + for(unsigned i = 0; i < ellint_rj_data.size(); ++i) + { + v1.push_back(ellint_rj_data[i][0]); + v2.push_back(ellint_rj_data[i][1]); + v3.push_back(ellint_rj_data[i][2]); + v4.push_back(ellint_rj_data[i][4]); + } + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + cuda_managed_ptr input_vector4(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + input_vector4[i] = v4[table_id]; + } + + // Launch the Vector Add CUDA Kernel + 
int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), input_vector4.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ellint_rj(input_vector1[i], input_vector2[i], input_vector3[i], input_vector4[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 15) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error found was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ellint_zeta_double.cu b/test/cuda/test_ellint_zeta_double.cu new file mode 100644 index 0000000000..dbab7a75aa --- /dev/null +++ b/test/cuda/test_ellint_zeta_double.cu @@ -0,0 +1,122 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::jacobi_zeta(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../jacobi_zeta_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2; + + for(unsigned i = 0; i < jacobi_zeta_data.size(); ++i) + { + v1.push_back(jacobi_zeta_data[i][1]); + v2.push_back(jacobi_zeta_data[i][0]); + } + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 128; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + 
cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::jacobi_zeta(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_erf_double.cu b/test/cuda/test_erf_double.cu new file mode 100644 index 0000000000..78dc132266 --- /dev/null +++ b/test/cuda/test_erf_double.cu @@ -0,0 +1,98 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erf(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erf(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(h_C[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_erf_float.cu b/test/cuda/test_erf_float.cu new file mode 100644 index 0000000000..c91175fdb7 --- /dev/null +++ b/test/cuda/test_erf_float.cu @@ -0,0 +1,99 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erf(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr 
h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 256; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erf(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(h_C[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED in " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_erf_inv_double.cu b/test/cuda/test_erf_inv_double.cu new file mode 100644 index 0000000000..26220f7291 --- /dev/null +++ b/test/cuda/test_erf_inv_double.cu @@ -0,0 +1,105 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erf_inv(in[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) x + +#include "../erf_inv_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % erf_inv_data.size(); + input_vector[i] = erf_inv_data[table_id][0]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erf_inv(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_erf_inv_float.cu b/test/cuda/test_erf_inv_float.cu new file mode 100644 index 0000000000..9e6356f36f --- /dev/null +++ b/test/cuda/test_erf_inv_float.cu @@ -0,0 +1,105 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::erf_inv(in[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) x + +#include "../erf_inv_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << 
std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % erf_inv_data.size(); + input_vector[i] = erf_inv_data[table_id][0]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erf_inv(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_ex_val_cdf_double.cu b/test/cuda/test_ex_val_cdf_double.cu new file mode 100644 index 0000000000..db8ae2aace --- /dev/null +++ b/test/cuda/test_ex_val_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::extreme_value_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::extreme_value_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ex_val_cdf_float.cu b/test/cuda/test_ex_val_cdf_float.cu new file mode 100644 index 0000000000..faa35e8155 --- /dev/null +++ b/test/cuda/test_ex_val_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::extreme_value_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::extreme_value_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ex_val_pdf_double.cu b/test/cuda/test_ex_val_pdf_double.cu new file mode 100644 index 0000000000..bc1666c3d2 --- /dev/null +++ b/test/cuda/test_ex_val_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::extreme_value_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::extreme_value_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ex_val_pdf_float.cu b/test/cuda/test_ex_val_pdf_float.cu new file mode 100644 index 0000000000..aa6baf659c --- /dev/null +++ b/test/cuda/test_ex_val_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::extreme_value_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::extreme_value_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ex_val_quan_double.cu b/test/cuda/test_ex_val_quan_double.cu new file mode 100644 index 0000000000..f3d2cf946f --- /dev/null +++ b/test/cuda/test_ex_val_quan_double.cu @@ -0,0 +1,109 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::extreme_value_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::extreme_value_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 3000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ex_val_quan_float.cu b/test/cuda/test_ex_val_quan_float.cu new file mode 100644 index 0000000000..e622c9b291 --- /dev/null +++ b/test/cuda/test_ex_val_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::extreme_value_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::extreme_value_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 3000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_exp_cdf_double.cu b/test/cuda/test_exp_cdf_double.cu new file mode 100644 index 0000000000..55c4cc0bc8 --- /dev/null +++ b/test/cuda/test_exp_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::exponential_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::exponential_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_exp_cdf_float.cu b/test/cuda/test_exp_cdf_float.cu new file mode 100644 index 0000000000..682e7d8d24 --- /dev/null +++ b/test/cuda/test_exp_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::exponential_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::exponential_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_exp_pdf_double.cu b/test/cuda/test_exp_pdf_double.cu new file mode 100644 index 0000000000..8cddeae2f8 --- /dev/null +++ b/test/cuda/test_exp_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::exponential_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::exponential_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_exp_pdf_float.cu b/test/cuda/test_exp_pdf_float.cu new file mode 100644 index 0000000000..b2d63b4cab --- /dev/null +++ b/test/cuda/test_exp_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::exponential_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::exponential_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_exponential_quan_double.cu b/test/cuda/test_exponential_quan_double.cu new file mode 100644 index 0000000000..5f00a056c8 --- /dev/null +++ b/test/cuda/test_exponential_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::exponential_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::exponential_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_exponential_quan_float.cu b/test/cuda/test_exponential_quan_float.cu new file mode 100644 index 0000000000..7790e220d3 --- /dev/null +++ b/test/cuda/test_exponential_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::exponential_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::exponential_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_fpclassify_double.cu b/test/cuda/test_fpclassify_double.cu new file mode 100644 index 0000000000..880dfeb62d --- /dev/null +++ b/test/cuda/test_fpclassify_double.cu @@ -0,0 +1,113 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, int *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::fpclassify(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + switch(i % 55) + { + case 1: + h_A[i] = 0; + break; + case 2: + h_A[i] = std::numeric_limits::infinity(); + break; + case 3: + h_A[i] = -std::numeric_limits::infinity(); + break; + case 4: + h_A[i] = std::numeric_limits::quiet_NaN(); + break; + } + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::fpclassify(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (h_C[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_gamma_cdf_double.cu b/test/cuda/test_gamma_cdf_double.cu new file mode 100644 index 0000000000..dd319ad35f --- /dev/null +++ b/test/cuda/test_gamma_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate 
the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::gamma_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_gamma_cdf_float.cu b/test/cuda/test_gamma_cdf_float.cu new file mode 100644 index 0000000000..9f3847aa79 --- /dev/null +++ b/test/cuda/test_gamma_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::gamma_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution 
dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::gamma_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_gamma_p_deriv_double.cu b/test/cuda/test_gamma_p_deriv_double.cu new file mode 100644 index 0000000000..30b2eac393 --- /dev/null +++ b/test/cuda/test_gamma_p_deriv_double.cu @@ -0,0 +1,124 @@ +// Copyright John Maddock 2016. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type * in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gamma_p_derivative(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../igamma_med_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2; + + for(unsigned i = 0; i < igamma_med_data.size(); ++i) + { + v1.push_back(igamma_med_data[i][0]); + v2.push_back(igamma_med_data[i][1]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / 
threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::gamma_p_derivative(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_gamma_p_deriv_float.cu b/test/cuda/test_gamma_p_deriv_float.cu new file mode 100644 index 0000000000..2ba810bd6c --- /dev/null +++ b/test/cuda/test_gamma_p_deriv_float.cu @@ -0,0 +1,136 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type * in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gamma_p_derivative(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../igamma_med_data.ipp" +#include "../igamma_big_data.ipp" +#include "../igamma_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2; + + for(unsigned i = 0; i < igamma_med_data.size(); ++i) + { + v1.push_back(igamma_med_data[i][0]); + v2.push_back(igamma_med_data[i][1]); + } + for(unsigned i = 0; i < igamma_big_data.size(); ++i) + { + v1.push_back(igamma_big_data[i][0]); + v2.push_back(igamma_big_data[i][1]); + } + for(unsigned i = 0; i < igamma_small_data.size(); ++i) + { + v1.push_back(igamma_small_data[i][0]); + v2.push_back(igamma_small_data[i][1]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < 
numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::gamma_p_derivative(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_gamma_p_double.cu b/test/cuda/test_gamma_p_double.cu new file mode 100644 index 0000000000..a243582fb8 --- /dev/null +++ b/test/cuda/test_gamma_p_double.cu @@ -0,0 +1,136 @@ +// Copyright John Maddock 2016. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type * in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gamma_p(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../igamma_med_data.ipp" +#include "../igamma_big_data.ipp" +#include "../igamma_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2; + + for(unsigned i = 0; i < igamma_med_data.size(); ++i) + { + v1.push_back(igamma_med_data[i][0]); + v2.push_back(igamma_med_data[i][1]); + } + for(unsigned i = 0; i < igamma_big_data.size(); ++i) + { + v1.push_back(igamma_big_data[i][0]); + v2.push_back(igamma_big_data[i][1]); + } + for(unsigned i = 0; i < igamma_small_data.size(); ++i) + { + v1.push_back(igamma_small_data[i][0]); + v2.push_back(igamma_small_data[i][1]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr 
output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::gamma_p(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_gamma_p_float.cu b/test/cuda/test_gamma_p_float.cu new file mode 100644 index 0000000000..f9510c33df --- /dev/null +++ b/test/cuda/test_gamma_p_float.cu @@ -0,0 +1,136 @@ +// Copyright John Maddock 2016. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type * in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::gamma_p(in1[i], in2[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../igamma_med_data.ipp" +#include "../igamma_big_data.ipp" +#include "../igamma_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2; + + for(unsigned i = 0; i < igamma_med_data.size(); ++i) + { + v1.push_back(igamma_med_data[i][0]); + v2.push_back(igamma_med_data[i][1]); + } + for(unsigned i = 0; i < igamma_big_data.size(); ++i) + { + v1.push_back(igamma_big_data[i][0]); + v2.push_back(igamma_big_data[i][1]); + } + for(unsigned i = 0; i < igamma_small_data.size(); ++i) + { + v1.push_back(igamma_small_data[i][0]); + v2.push_back(igamma_small_data[i][1]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr 
output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::gamma_p(input_vector1[i], input_vector2[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_gamma_pdf_double.cu b/test/cuda/test_gamma_pdf_double.cu new file mode 100644 index 0000000000..f8d67d4974 --- /dev/null +++ b/test/cuda/test_gamma_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution 
dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::gamma_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_gamma_pdf_float.cu b/test/cuda/test_gamma_pdf_float.cu new file mode 100644 index 0000000000..205e8221e0 --- /dev/null +++ b/test/cuda/test_gamma_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::gamma_distribution(2), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) 
+ { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::gamma_distribution(2), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_geo_cdf_double.cu b/test/cuda/test_geo_cdf_double.cu new file mode 100644 index 0000000000..adc9de66bb --- /dev/null +++ b/test/cuda/test_geo_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::geometric_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_geo_cdf_float.cu b/test/cuda/test_geo_cdf_float.cu new file mode 100644 index 0000000000..eeac8f7774 --- /dev/null +++ b/test/cuda/test_geo_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::geometric_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::geometric_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_geo_pdf_double.cu b/test/cuda/test_geo_pdf_double.cu new file mode 100644 index 0000000000..6bd3da0f66 --- /dev/null +++ b/test/cuda/test_geo_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::geometric_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::geometric_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_geo_pdf_float.cu b/test/cuda/test_geo_pdf_float.cu new file mode 100644 index 0000000000..9160fc73a9 --- /dev/null +++ b/test/cuda/test_geo_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::geometric_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::geometric_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_geo_quan_double.cu b/test/cuda/test_geo_quan_double.cu new file mode 100644 index 0000000000..55e8d76934 --- /dev/null +++ b/test/cuda/test_geo_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::geometric_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::geometric_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_geo_quan_float.cu b/test/cuda/test_geo_quan_float.cu new file mode 100644 index 0000000000..e368406970 --- /dev/null +++ b/test/cuda/test_geo_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::geometric_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::geometric_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ibeta_derivative_double.cu b/test/cuda/test_ibeta_derivative_double.cu new file mode 100644 index 0000000000..cdb6a7a8b9 --- /dev/null +++ b/test/cuda/test_ibeta_derivative_double.cu @@ -0,0 +1,148 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type * in2, const float_type* in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ibeta_derivative_data.ipp" +#include "../ibeta_derivative_int_data.ipp" +#include "../ibeta_derivative_large_data.ipp" +#include "../ibeta_derivative_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_derivative_data.size(); ++i) + { + v1.push_back(ibeta_derivative_data[i][0]); + v2.push_back(ibeta_derivative_data[i][1]); + v3.push_back(ibeta_derivative_data[i][2]); + } + for(unsigned i = 0; i < ibeta_derivative_int_data.size(); ++i) + { + v1.push_back(ibeta_derivative_int_data[i][0]); + v2.push_back(ibeta_derivative_int_data[i][1]); + v3.push_back(ibeta_derivative_int_data[i][2]); + } + for(unsigned i = 0; i < ibeta_derivative_large_data.size(); ++i) + { + v1.push_back(ibeta_derivative_large_data[i][0]); + v2.push_back(ibeta_derivative_large_data[i][1]); + v3.push_back(ibeta_derivative_large_data[i][2]); + } + for(unsigned i = 0; i < ibeta_derivative_small_data.size(); ++i) + { + v1.push_back(ibeta_derivative_small_data[i][0]); + v2.push_back(ibeta_derivative_small_data[i][1]); + v3.push_back(ibeta_derivative_small_data[i][2]); 
+ } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_ibeta_derivative_float.cu b/test/cuda/test_ibeta_derivative_float.cu new file mode 100644 index 0000000000..9ef7d7b6e1 --- /dev/null +++ b/test/cuda/test_ibeta_derivative_float.cu @@ -0,0 +1,151 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type * in2, const float_type* in3, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::ibeta_derivative(in1[i], in2[i], in3[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../ibeta_derivative_data.ipp" +#include "../ibeta_derivative_int_data.ipp" +/* +#include "../ibeta_derivative_large_data.ipp" +*/ +#include "../ibeta_derivative_small_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v1, v2, v3; + + for(unsigned i = 0; i < ibeta_derivative_data.size(); ++i) + { + 
v1.push_back(ibeta_derivative_data[i][0]); + v2.push_back(ibeta_derivative_data[i][1]); + v3.push_back(ibeta_derivative_data[i][2]); + } + for(unsigned i = 0; i < ibeta_derivative_int_data.size(); ++i) + { + v1.push_back(ibeta_derivative_int_data[i][0]); + v2.push_back(ibeta_derivative_int_data[i][1]); + v3.push_back(ibeta_derivative_int_data[i][2]); + } + /* + for(unsigned i = 0; i < ibeta_derivative_large_data.size(); ++i) + { + v1.push_back(ibeta_derivative_large_data[i][0]); + v2.push_back(ibeta_derivative_large_data[i][1]); + v3.push_back(ibeta_derivative_large_data[i][2]); + }*/ + for(unsigned i = 0; i < ibeta_derivative_small_data.size(); ++i) + { + v1.push_back(ibeta_derivative_small_data[i][0]); + v2.push_back(ibeta_derivative_small_data[i][1]); + v3.push_back(ibeta_derivative_small_data[i][2]); + } + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr input_vector3(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v1.size(); + input_vector1[i] = v1[table_id]; + input_vector2[i] = v2[table_id]; + input_vector3[i] = v3[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), input_vector2.get(), input_vector3.get(), output_vector.get(), numElements); + 
cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::ibeta_derivative(input_vector1[i], input_vector2[i], input_vector3[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_chi_sq_cdf_double.cu b/test/cuda/test_inv_chi_sq_cdf_double.cu new file mode 100644 index 0000000000..cac6cbd9cc --- /dev/null +++ b/test/cuda/test_inv_chi_sq_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_chi_squared_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_chi_squared_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_chi_sq_cdf_float.cu b/test/cuda/test_inv_chi_sq_cdf_float.cu new file mode 100644 index 0000000000..d720eb7a47 --- /dev/null +++ b/test/cuda/test_inv_chi_sq_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_chi_squared_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_chi_squared_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_chi_sq_pdf_double.cu b/test/cuda/test_inv_chi_sq_pdf_double.cu new file mode 100644 index 0000000000..0bb40af3af --- /dev/null +++ b/test/cuda/test_inv_chi_sq_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::inverse_chi_squared_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_chi_sq_pdf_float.cu b/test/cuda/test_inv_chi_sq_pdf_float.cu new file mode 100644 index 0000000000..95b6868d92 --- /dev/null +++ b/test/cuda/test_inv_chi_sq_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_chi_squared_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::inverse_chi_squared_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_gam_cdf_double.cu b/test/cuda/test_inv_gam_cdf_double.cu new file mode 100644 index 0000000000..847675252f --- /dev/null +++ b/test/cuda/test_inv_gam_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_gamma_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_gam_cdf_float.cu b/test/cuda/test_inv_gam_cdf_float.cu new file mode 100644 index 0000000000..5f11a6f8ed --- /dev/null +++ b/test/cuda/test_inv_gam_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gamma_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_gamma_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_gam_pdf_double.cu b/test/cuda/test_inv_gam_pdf_double.cu new file mode 100644 index 0000000000..77020cfffb --- /dev/null +++ b/test/cuda/test_inv_gam_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_gamma_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::inverse_gamma_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_gam_pdf_float.cu b/test/cuda/test_inv_gam_pdf_float.cu new file mode 100644 index 0000000000..b6fbe1838e --- /dev/null +++ b/test/cuda/test_inv_gam_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_gamma_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::inverse_gamma_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_gaus_cdf_double.cu b/test/cuda/test_inv_gaus_cdf_double.cu new file mode 100644 index 0000000000..68b5f2f262 --- /dev/null +++ b/test/cuda/test_inv_gaus_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gaussian_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_gaussian_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_gaus_cdf_float.cu b/test/cuda/test_inv_gaus_cdf_float.cu new file mode 100644 index 0000000000..d498aeeec7 --- /dev/null +++ b/test/cuda/test_inv_gaus_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::inverse_gaussian_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::inverse_gaussian_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_gaus_pdf_double.cu b/test/cuda/test_inv_gaus_pdf_double.cu new file mode 100644 index 0000000000..404d569e25 --- /dev/null +++ b/test/cuda/test_inv_gaus_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_gaussian_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::inverse_gaussian_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_inv_gaus_pdf_float.cu b/test/cuda/test_inv_gaus_pdf_float.cu new file mode 100644 index 0000000000..4e319d66ae --- /dev/null +++ b/test/cuda/test_inv_gaus_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::inverse_gaussian_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::inverse_gaussian_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_isfinite_double.cu b/test/cuda/test_isfinite_double.cu new file mode 100644 index 0000000000..96673cf86a --- /dev/null +++ b/test/cuda/test_isfinite_double.cu @@ -0,0 +1,113 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, bool *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::isfinite(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + switch(i % 55) + { + case 1: + h_A[i] = 0; + break; + case 2: + h_A[i] = std::numeric_limits::infinity(); + break; + case 3: + h_A[i] = -std::numeric_limits::infinity(); + break; + case 4: + h_A[i] = std::numeric_limits::quiet_NaN(); + break; + } + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::isfinite(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (h_C[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_isinf_double.cu b/test/cuda/test_isinf_double.cu new file mode 100644 index 0000000000..1f9358dbc9 --- /dev/null +++ b/test/cuda/test_isinf_double.cu @@ -0,0 +1,113 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, bool *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::isinf(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr 
h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + switch(i % 55) + { + case 1: + h_A[i] = 0; + break; + case 2: + h_A[i] = std::numeric_limits::infinity(); + break; + case 3: + h_A[i] = -std::numeric_limits::infinity(); + break; + case 4: + h_A[i] = std::numeric_limits::quiet_NaN(); + break; + } + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::isinf(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (h_C[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_isnan_double.cu b/test/cuda/test_isnan_double.cu new file mode 100644 index 0000000000..149307e327 --- /dev/null +++ b/test/cuda/test_isnan_double.cu @@ -0,0 +1,113 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, bool *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::isnan(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + switch(i % 55) + { + case 1: + h_A[i] = 0; + break; + case 2: + h_A[i] = std::numeric_limits::infinity(); + break; + case 3: + h_A[i] = -std::numeric_limits::infinity(); + break; + case 4: + h_A[i] = std::numeric_limits::quiet_NaN(); + break; + } + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::isnan(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (h_C[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_isnormal_double.cu b/test/cuda/test_isnormal_double.cu new file mode 100644 index 0000000000..8a5ef262a2 --- /dev/null +++ b/test/cuda/test_isnormal_double.cu @@ -0,0 +1,113 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, bool *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::isnormal(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + 
cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + switch(i % 55) + { + case 1: + h_A[i] = 0; + break; + case 2: + h_A[i] = std::numeric_limits::infinity(); + break; + case 3: + h_A[i] = -std::numeric_limits::infinity(); + break; + case 4: + h_A[i] = std::numeric_limits::quiet_NaN(); + break; + } + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::isnormal(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (h_C[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_laplace_cdf_double.cu b/test/cuda/test_laplace_cdf_double.cu new file mode 100644 index 0000000000..3e3f013a2e --- /dev/null +++ b/test/cuda/test_laplace_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::laplace_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::laplace_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_laplace_cdf_float.cu b/test/cuda/test_laplace_cdf_float.cu new file mode 100644 index 0000000000..3478d53523 --- /dev/null +++ b/test/cuda/test_laplace_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::laplace_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::laplace_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_laplace_pdf_double.cu b/test/cuda/test_laplace_pdf_double.cu new file mode 100644 index 0000000000..f65fbb0d60 --- /dev/null +++ b/test/cuda/test_laplace_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::laplace_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::laplace_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_laplace_pdf_float.cu b/test/cuda/test_laplace_pdf_float.cu new file mode 100644 index 0000000000..c4d52380c1 --- /dev/null +++ b/test/cuda/test_laplace_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::laplace_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::laplace_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_laplace_quan_double.cu b/test/cuda/test_laplace_quan_double.cu new file mode 100644 index 0000000000..c22f53d895 --- /dev/null +++ b/test/cuda/test_laplace_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::laplace_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::laplace_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_laplace_quan_float.cu b/test/cuda/test_laplace_quan_float.cu new file mode 100644 index 0000000000..0133d6d7f6 --- /dev/null +++ b/test/cuda/test_laplace_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::laplace_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::laplace_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_lgamma_double.cu b/test/cuda/test_lgamma_double.cu new file mode 100644 index 0000000000..d1ddf645ee --- /dev/null +++ b/test/cuda/test_lgamma_double.cu @@ -0,0 +1,128 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = ::lgamma(in[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../test_gamma_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v; + + for(unsigned i = 0; i < factorials.size(); ++i) + v.push_back(factorials[i][0]); + for(unsigned i = 0; i < near_1.size(); ++i) + v.push_back(near_1[i][0]); + for(unsigned i = 0; i < near_2.size(); ++i) + v.push_back(near_2[i][0]); + for(unsigned i = 0; i < near_0.size(); ++i) + v.push_back(near_0[i][0]); + for(unsigned i = 0; i < near_m10.size(); ++i) + v.push_back(near_m10[i][0]); + for(unsigned i = 0; i < near_m55.size(); ++i) + v.push_back(near_m55[i][0]); + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v.size(); + input_vector[i] = v[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int 
threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::lgamma(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_lgamma_float.cu b/test/cuda/test_lgamma_float.cu new file mode 100644 index 0000000000..a038d200d6 --- /dev/null +++ b/test/cuda/test_lgamma_float.cu @@ -0,0 +1,128 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::lgamma(in[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../test_gamma_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v; + + for(unsigned i = 0; i < factorials.size(); ++i) + v.push_back(factorials[i][0]); + for(unsigned i = 0; i < near_1.size(); ++i) + v.push_back(near_1[i][0]); + for(unsigned i = 0; i < near_2.size(); ++i) + v.push_back(near_2[i][0]); + for(unsigned i = 0; i < near_0.size(); ++i) + v.push_back(near_0[i][0]); + for(unsigned i = 0; i < near_m10.size(); ++i) + v.push_back(near_m10[i][0]); + for(unsigned i = 0; i < near_m55.size(); ++i) + v.push_back(near_m55[i][0]); + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v.size(); + input_vector[i] = v[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int 
threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::lgamma(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_logistic_cdf_double.cu b/test/cuda/test_logistic_cdf_double.cu new file mode 100644 index 0000000000..0fb31cc25a --- /dev/null +++ b/test/cuda/test_logistic_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::logistic_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::logistic_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_logistic_cdf_float.cu b/test/cuda/test_logistic_cdf_float.cu new file mode 100644 index 0000000000..9250121f05 --- /dev/null +++ b/test/cuda/test_logistic_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::logistic_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::logistic_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_logistic_pdf_double.cu b/test/cuda/test_logistic_pdf_double.cu new file mode 100644 index 0000000000..14aca5110b --- /dev/null +++ b/test/cuda/test_logistic_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::logistic_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::logistic_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_logistic_pdf_float.cu b/test/cuda/test_logistic_pdf_float.cu new file mode 100644 index 0000000000..03c4965ce0 --- /dev/null +++ b/test/cuda/test_logistic_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::logistic_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::logistic_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_logistic_quan_double.cu b/test/cuda/test_logistic_quan_double.cu new file mode 100644 index 0000000000..002cb3c3e2 --- /dev/null +++ b/test/cuda/test_logistic_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::logistic_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::logistic_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_logistic_quan_float.cu b/test/cuda/test_logistic_quan_float.cu new file mode 100644 index 0000000000..a8e7d6e37c --- /dev/null +++ b/test/cuda/test_logistic_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::logistic_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::logistic_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_lognorm_cdf_double.cu b/test/cuda/test_lognorm_cdf_double.cu new file mode 100644 index 0000000000..06e7a2bee2 --- /dev/null +++ b/test/cuda/test_lognorm_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::lognormal_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::lognormal_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_lognorm_cdf_float.cu b/test/cuda/test_lognorm_cdf_float.cu new file mode 100644 index 0000000000..6ee85de268 --- /dev/null +++ b/test/cuda/test_lognorm_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::lognormal_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::lognormal_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_lognorm_pdf_double.cu b/test/cuda/test_lognorm_pdf_double.cu new file mode 100644 index 0000000000..2eaf6f9923 --- /dev/null +++ b/test/cuda/test_lognorm_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::lognormal_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << 
cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::lognormal_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_lognorm_pdf_float.cu b/test/cuda/test_lognorm_pdf_float.cu new file mode 100644 index 0000000000..b5574b0f1f --- /dev/null +++ b/test/cuda/test_lognorm_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::lognormal_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::lognormal_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_lognorm_quan_double.cu b/test/cuda/test_lognorm_quan_double.cu new file mode 100644 index 0000000000..014fdd53cf --- /dev/null +++ b/test/cuda/test_lognorm_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::lognormal_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::lognormal_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_lognorm_quan_float.cu b/test/cuda/test_lognorm_quan_float.cu new file mode 100644 index 0000000000..88069c2439 --- /dev/null +++ b/test/cuda/test_lognorm_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::lognormal_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::lognormal_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_modf_double.cu b/test/cuda/test_modf_double.cu new file mode 100644 index 0000000000..0a99316011 --- /dev/null +++ b/test/cuda/test_modf_double.cu @@ -0,0 +1,104 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + float_type fract; + int i_part; + long l_part; + long long ll_part; + + if (i < numElements) + { + out[i] = boost::math::modf(in[i], &fract) + boost::math::modf(in[i], &i_part) + boost::math::modf(in[i], &l_part) + boost::math::modf(in[i], &ll_part); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + float_type fract; + for(int i = 0; i < numElements; ++i) + results.push_back(4 * boost::math::modf(h_A[i], &fract)); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(h_C[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_normal_cdf_double.cu b/test/cuda/test_normal_cdf_double.cu new file mode 100644 index 0000000000..f1d2fe0334 --- /dev/null +++ b/test/cuda/test_normal_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::normal_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::normal_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_normal_cdf_float.cu b/test/cuda/test_normal_cdf_float.cu new file mode 100644 index 0000000000..dc1485ab8f --- /dev/null +++ b/test/cuda/test_normal_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::normal_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << 
")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::normal_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_normal_pdf_double.cu b/test/cuda/test_normal_pdf_double.cu new file mode 100644 index 0000000000..ea83e2ac91 --- /dev/null +++ b/test/cuda/test_normal_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::normal_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::normal_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_normal_pdf_float.cu b/test/cuda/test_normal_pdf_float.cu new file mode 100644 index 0000000000..5738b43b20 --- /dev/null +++ b/test/cuda/test_normal_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::normal_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << 
")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::normal_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 100.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_normal_quan_double.cu b/test/cuda/test_normal_quan_double.cu new file mode 100644 index 0000000000..2f01bfe513 --- /dev/null +++ b/test/cuda/test_normal_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::normal_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::normal_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_normal_quan_float.cu b/test/cuda/test_normal_quan_float.cu new file mode 100644 index 0000000000..f8de25f4b0 --- /dev/null +++ b/test/cuda/test_normal_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::normal_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::normal_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_pareto_cdf_double.cu b/test/cuda/test_pareto_cdf_double.cu new file mode 100644 index 0000000000..4908095d7c --- /dev/null +++ b/test/cuda/test_pareto_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::pareto_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::pareto_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_pareto_cdf_float.cu b/test/cuda/test_pareto_cdf_float.cu new file mode 100644 index 0000000000..0d1709ba28 --- /dev/null +++ b/test/cuda/test_pareto_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::pareto_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << 
")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::pareto_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_pareto_pdf_double.cu b/test/cuda/test_pareto_pdf_double.cu new file mode 100644 index 0000000000..fb1e8526a6 --- /dev/null +++ b/test/cuda/test_pareto_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::pareto_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::pareto_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_pareto_pdf_float.cu b/test/cuda/test_pareto_pdf_float.cu new file mode 100644 index 0000000000..4ea1df02c9 --- /dev/null +++ b/test/cuda/test_pareto_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::pareto_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << 
")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::pareto_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_pareto_quan_double.cu b/test/cuda/test_pareto_quan_double.cu new file mode 100644 index 0000000000..f336a3884e --- /dev/null +++ b/test/cuda/test_pareto_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::pareto_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::pareto_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_pareto_quan_float.cu b/test/cuda/test_pareto_quan_float.cu new file mode 100644 index 0000000000..efe644f592 --- /dev/null +++ b/test/cuda/test_pareto_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::pareto_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::pareto_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_poisson_cdf_double.cu b/test/cuda/test_poisson_cdf_double.cu new file mode 100644 index 0000000000..957878c318 --- /dev/null +++ b/test/cuda/test_poisson_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::poisson_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::poisson_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_poisson_cdf_float.cu b/test/cuda/test_poisson_cdf_float.cu new file mode 100644 index 0000000000..ff91fc8939 --- /dev/null +++ b/test/cuda/test_poisson_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::poisson_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::poisson_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_poisson_pdf_double.cu b/test/cuda/test_poisson_pdf_double.cu new file mode 100644 index 0000000000..5322467f38 --- /dev/null +++ b/test/cuda/test_poisson_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::poisson_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::poisson_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_poisson_pdf_float.cu b/test/cuda/test_poisson_pdf_float.cu new file mode 100644 index 0000000000..bc6b50f3c6 --- /dev/null +++ b/test/cuda/test_poisson_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::poisson_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::poisson_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_rayleigh_cdf_double.cu b/test/cuda/test_rayleigh_cdf_double.cu new file mode 100644 index 0000000000..624c3b9620 --- /dev/null +++ b/test/cuda/test_rayleigh_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::rayleigh_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::rayleigh_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_rayleigh_cdf_float.cu b/test/cuda/test_rayleigh_cdf_float.cu new file mode 100644 index 0000000000..49eb228228 --- /dev/null +++ b/test/cuda/test_rayleigh_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::rayleigh_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::rayleigh_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_rayleigh_pdf_double.cu b/test/cuda/test_rayleigh_pdf_double.cu new file mode 100644 index 0000000000..2c01a45d02 --- /dev/null +++ b/test/cuda/test_rayleigh_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::rayleigh_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::rayleigh_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_rayleigh_pdf_float.cu b/test/cuda/test_rayleigh_pdf_float.cu new file mode 100644 index 0000000000..2a2fccc8f2 --- /dev/null +++ b/test/cuda/test_rayleigh_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::rayleigh_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::rayleigh_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_rayleigh_quan_double.cu b/test/cuda/test_rayleigh_quan_double.cu new file mode 100644 index 0000000000..bbb13da04e --- /dev/null +++ b/test/cuda/test_rayleigh_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::rayleigh_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::rayleigh_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_rayleigh_quan_float.cu b/test/cuda/test_rayleigh_quan_float.cu new file mode 100644 index 0000000000..0e84b6a844 --- /dev/null +++ b/test/cuda/test_rayleigh_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::rayleigh_distribution(0.25), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::rayleigh_distribution(0.25), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_round_double.cu b/test/cuda/test_round_double.cu new file mode 100644 index 0000000000..b72f977a21 --- /dev/null +++ b/test/cuda/test_round_double.cu @@ -0,0 +1,98 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::round(in[i]) + boost::math::iround(in[i]) + boost::math::lround(in[i]) + boost::math::llround(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(4 * boost::math::round(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(h_C[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_sign_double.cu b/test/cuda/test_sign_double.cu new file mode 100644 index 0000000000..0266171239 --- /dev/null +++ b/test/cuda/test_sign_double.cu @@ -0,0 +1,115 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, int *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::sign(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr 
<int> h_C(numElements);
+
+    // Initialize the input vectors
+    for (int i = 0; i < numElements; ++i)
+    {
+        h_A[i] = rand()/(float_type)RAND_MAX;
+        switch(i % 55)
+        {
+        case 1:
+            h_A[i] = 0;
+            break;
+        case 2:
+            h_A[i] = std::numeric_limits<float_type>::infinity();
+            break;
+        case 3:
+            h_A[i] = -std::numeric_limits<float_type>::infinity();
+            break;
+        case 4:
+            h_A[i] = std::numeric_limits<float_type>::quiet_NaN();
+            break;
+        }
+        if(i % 1)
+            h_A[i] = -h_A[i];
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(h_A.get(), h_C.get(), numElements);
+    cudaDeviceSynchronize();
+    std::cout << "CUDA kernel done in " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+        results.push_back(boost::math::sign(h_A[i]));
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (h_C[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
+
diff --git a/test/cuda/test_signbit_double.cu b/test/cuda/test_signbit_double.cu
new file mode 100644
index 0000000000..8f82825175
--- /dev/null
+++ b/test/cuda/test_signbit_double.cu
@@ -0,0 +1,115 @@
+// Copyright John Maddock 2016.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, int *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::signbit(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + switch(i % 55) + { + case 1: + h_A[i] = 0; + break; + case 2: + h_A[i] = std::numeric_limits::infinity(); + break; + case 3: + h_A[i] = -std::numeric_limits::infinity(); + break; + case 4: + h_A[i] = std::numeric_limits::quiet_NaN(); + break; + } + if(i % 1) + h_A[i] = -h_A[i]; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel 
(error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::signbit(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (h_C[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_std_erf_double.cu b/test/cuda/test_std_erf_double.cu new file mode 100644 index 0000000000..2a4427d834 --- /dev/null +++ b/test/cuda/test_std_erf_double.cu @@ -0,0 +1,99 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = ::erf(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + 
cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + double t = w.elapsed(); + std::cout << "CUDA kernal done in " << t << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::erf(h_A[i])); + t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(h_C[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_tgamma_double.cu b/test/cuda/test_tgamma_double.cu new file mode 100644 index 0000000000..9d34f81c68 --- /dev/null +++ b/test/cuda/test_tgamma_double.cu @@ -0,0 +1,128 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::tgamma(in[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../test_gamma_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v; + + for(unsigned i = 0; i < factorials.size(); ++i) + v.push_back(factorials[i][0]); + for(unsigned i = 0; i < near_1.size(); ++i) + v.push_back(near_1[i][0]); + for(unsigned i = 0; i < near_2.size(); ++i) + v.push_back(near_2[i][0]); + for(unsigned i = 0; i < near_0.size(); ++i) + v.push_back(near_0[i][0]); + for(unsigned i = 0; i < near_m10.size(); ++i) + v.push_back(near_m10[i][0]); + for(unsigned i = 0; i < near_m55.size(); ++i) + v.push_back(near_m55[i][0]); + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v.size(); + input_vector[i] = v[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int 
threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::tgamma(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_tgamma_float.cu b/test/cuda/test_tgamma_float.cu new file mode 100644 index 0000000000..49804ef439 --- /dev/null +++ b/test/cuda/test_tgamma_float.cu @@ -0,0 +1,128 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::tgamma(in[i]); + } +} + +template struct table_type { typedef T type; }; +typedef float_type T; +#define SC_(x) static_cast(x) + +#include "../test_gamma_data.ipp" + +/** + * Host main routine + */ +int main(void) +{ + try{ + // Consolidate the test data: + std::vector v; + + for(unsigned i = 0; i < factorials.size(); ++i) + v.push_back(factorials[i][0]); + for(unsigned i = 0; i < near_1.size(); ++i) + v.push_back(near_1[i][0]); + for(unsigned i = 0; i < near_2.size(); ++i) + v.push_back(near_2[i][0]); + for(unsigned i = 0; i < near_0.size(); ++i) + v.push_back(near_0[i][0]); + for(unsigned i = 0; i < near_m10.size(); ++i) + v.push_back(near_m10[i][0]); + for(unsigned i = 0; i < near_m55.size(); ++i) + v.push_back(near_m55[i][0]); + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + int table_id = i % v.size(); + input_vector[i] = v[table_id]; + } + + // Launch the Vector Add CUDA Kernel + int 
threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(boost::math::tgamma(input_vector[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 300) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_triangle_cdf_double.cu b/test/cuda/test_triangle_cdf_double.cu new file mode 100644 index 0000000000..9c71b02d0b --- /dev/null +++ b/test/cuda/test_triangle_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::triangular_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::triangular_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_triangle_cdf_float.cu b/test/cuda/test_triangle_cdf_float.cu new file mode 100644 index 0000000000..50bd41cb8d --- /dev/null +++ b/test/cuda/test_triangle_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::triangular_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::triangular_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_triangle_pdf_double.cu b/test/cuda/test_triangle_pdf_double.cu new file mode 100644 index 0000000000..aff61e7d2d --- /dev/null +++ b/test/cuda/test_triangle_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::triangular_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::triangular_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_triangle_pdf_float.cu b/test/cuda/test_triangle_pdf_float.cu new file mode 100644 index 0000000000..6fbc9ba5d1 --- /dev/null +++ b/test/cuda/test_triangle_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::triangular_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::triangular_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_triangle_quan_double.cu b/test/cuda/test_triangle_quan_double.cu new file mode 100644 index 0000000000..1880ed2a3a --- /dev/null +++ b/test/cuda/test_triangle_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::triangular_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::triangular_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_triangle_quan_float.cu b/test/cuda/test_triangle_quan_float.cu new file mode 100644 index 0000000000..8d0e701cfb --- /dev/null +++ b/test/cuda/test_triangle_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::triangular_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::triangular_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_trunc_double.cu b/test/cuda/test_trunc_double.cu new file mode 100644 index 0000000000..6d0bfde217 --- /dev/null +++ b/test/cuda/test_trunc_double.cu @@ -0,0 +1,98 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::trunc(in[i]) + boost::math::itrunc(in[i]) + boost::math::ltrunc(in[i]) + boost::math::lltrunc(in[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr h_A(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr h_C(numElements); + + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + h_A[i] = rand()/(float_type)RAND_MAX; + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(h_A.get(), h_C.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(4 * boost::math::trunc(h_A[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(h_C[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} + diff --git a/test/cuda/test_uniform_cdf_double.cu b/test/cuda/test_uniform_cdf_double.cu new file mode 100644 index 0000000000..c7c411914c --- /dev/null +++ b/test/cuda/test_uniform_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::uniform_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // 
Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::uniform_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" 
<< std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_uniform_cdf_float.cu b/test/cuda/test_uniform_cdf_float.cu new file mode 100644 index 0000000000..9ed322941d --- /dev/null +++ b/test/cuda/test_uniform_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::uniform_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution 
dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::uniform_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_uniform_pdf_double.cu b/test/cuda/test_uniform_pdf_double.cu new file mode 100644 index 0000000000..dab9a47500 --- /dev/null +++ b/test/cuda/test_uniform_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. 
+// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::uniform_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != 
cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::uniform_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_uniform_pdf_float.cu b/test/cuda/test_uniform_pdf_float.cu new file mode 100644 index 0000000000..01c835003c --- /dev/null +++ b/test/cuda/test_uniform_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::uniform_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << 
")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::uniform_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 500.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_uniform_quan_double.cu b/test/cuda/test_uniform_quan_double.cu new file mode 100644 index 0000000000..3925ed3d36 --- /dev/null +++ b/test/cuda/test_uniform_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::uniform_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::uniform_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_uniform_quan_float.cu b/test/cuda/test_uniform_quan_float.cu new file mode 100644 index 0000000000..37dda9e76a --- /dev/null +++ b/test/cuda/test_uniform_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::uniform_distribution(), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::uniform_distribution(), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_weibull_cdf_double.cu b/test/cuda/test_weibull_cdf_double.cu new file mode 100644 index 0000000000..c8b040db86 --- /dev/null +++ b/test/cuda/test_weibull_cdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::weibull_distribution(1.5), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::weibull_distribution(1.5), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_weibull_cdf_float.cu b/test/cuda/test_weibull_cdf_float.cu new file mode 100644 index 0000000000..c7524a6c86 --- /dev/null +++ b/test/cuda/test_weibull_cdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = cdf(boost::math::weibull_distribution(1.5), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(cdf(boost::math::weibull_distribution(1.5), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_weibull_pdf_double.cu b/test/cuda/test_weibull_pdf_double.cu new file mode 100644 index 0000000000..3d310ff839 --- /dev/null +++ b/test/cuda/test_weibull_pdf_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::weibull_distribution(23), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::weibull_distribution(23), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_weibull_pdf_float.cu b/test/cuda/test_weibull_pdf_float.cu new file mode 100644 index 0000000000..8910f4dea1 --- /dev/null +++ b/test/cuda/test_weibull_pdf_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = pdf(boost::math::weibull_distribution(1.5), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist(0.00001, 10000); + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 512; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) 
<< ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(pdf(boost::math::weibull_distribution(1.5), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 5000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_weibull_quan_double.cu b/test/cuda/test_weibull_quan_double.cu new file mode 100644 index 0000000000..3757b640bc --- /dev/null +++ b/test/cuda/test_weibull_quan_double.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::weibull_distribution(1.5), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::weibull_distribution(1.5), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} + diff --git a/test/cuda/test_weibull_quan_float.cu b/test/cuda/test_weibull_quan_float.cu new file mode 100644 index 0000000000..7968f1475a --- /dev/null +++ b/test/cuda/test_weibull_quan_float.cu @@ -0,0 +1,110 @@ +// Copyright John Maddock 2016. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. 
(See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error + +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = quantile(boost::math::weibull_distribution(1.5), in1[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + try{ + + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + boost::random::mt19937 gen; + boost::random::uniform_real_distribution dist; + // Initialize the input vectors + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = dist(gen); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 32; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + cuda_test<<>>(input_vector1.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + std::cout << "CUDA kernal done in " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" 
<< std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + results.push_back(quantile(boost::math::weibull_distribution(1.5), input_vector1[i])); + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 6000.0) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + std::cerr << "Error rate was: " << boost::math::epsilon_difference(output_vector[i], results[i]) << "eps" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED with calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + } + return 0; +} +