Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Zipfian distribution. #171

Merged
merged 1 commit into from Sep 12, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 15 additions & 0 deletions README.md
Expand Up @@ -25,6 +25,7 @@
- [Usage](#usage)
- [General syntax](#general-syntax)
- [General command line options](#general-command-line-options)
- [Random numbers options](#random-numbers-options)

<!-- markdown-toc end -->

Expand Down Expand Up @@ -259,6 +260,20 @@ The table below lists the supported common options, their descriptions and defau

Note that numerical values for all *size* options (like `--thread-stack-size` in this table) may be specified by appending the corresponding multiplicative suffix (K for kilobytes, M for megabytes, G for gigabytes and T for terabytes).

## Random numbers options

sysbench provides a number of algorithms to generate random numbers that are distributed according to a given probability distribution. The table below lists options that can be used to control those algorithms.

*Option* | *Description* | *Default value*
----------------------|---------------|----------------
`--rand-type` | random numbers distribution {uniform, gaussian, special, pareto, zipfian} to use by default. Benchmark scripts may choose to use either the default distribution, or specify it explictly, i.e. override the default. | special
`--rand-seed` | seed for random number generator. When 0, the current time is used as an RNG seed. | 0
`--rand-spec-iter` | number of iterations for the special distribution | 12
`--rand-spec-pct` | percentage of the entire range where 'special' values will fall in the special distribution | 1
`--rand-spec-res` | percentage of 'special' values to use for the special distribution | 75
`--rand-pareto-h` | shape parameter for the Pareto distribution | 0.2
`--rand-zipfian-exp` | shape parameter (theta) for the Zipfian distribution | 0.8`

[coveralls-badge]: https://coveralls.io/repos/github/akopytov/sysbench/badge.svg?branch=master
[coveralls-url]: https://coveralls.io/github/akopytov/sysbench?branch=master
[travis-badge]: https://travis-ci.org/akopytov/sysbench.svg?branch=master
Expand Down
5 changes: 5 additions & 0 deletions src/lua/internal/sysbench.rand.lua
Expand Up @@ -29,6 +29,7 @@ uint32_t sb_rand_uniform(uint32_t, uint32_t);
uint32_t sb_rand_gaussian(uint32_t, uint32_t);
uint32_t sb_rand_special(uint32_t, uint32_t);
uint32_t sb_rand_pareto(uint32_t, uint32_t);
uint32_t sb_rand_zipfian(uint32_t, uint32_t);
uint32_t sb_rand_unique(void);
void sb_rand_str(const char *, char *);
double sb_rand_uniform_double(void);
Expand Down Expand Up @@ -58,6 +59,10 @@ function sysbench.rand.pareto(a, b)
return ffi.C.sb_rand_pareto(a, b)
end

function sysbench.rand.zipfian(a, b)
return ffi.C.sb_rand_zipfian(a, b)
end

function sysbench.rand.unique()
return ffi.C.sb_rand_unique()
end
Expand Down
259 changes: 247 additions & 12 deletions src/sb_rand.c
Expand Up @@ -16,6 +16,25 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

/*
* This file incorporates work covered by the following copyright and
* permission notice:
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
Expand Down Expand Up @@ -47,20 +66,25 @@ int sb_rand_seed; /* optional seed set on the command line */
static sb_arg_t rand_args[] =
{
SB_OPT("rand-type",
"random numbers distribution {uniform,gaussian,special,pareto}",
"special", STRING),
"random numbers distribution {uniform, gaussian, special, pareto, "
"zipfian} to use by default", "special", STRING),
SB_OPT("rand-seed",
"seed for random number generator. When 0, the current time is "
"used as an RNG seed.", "0", INT),
SB_OPT("rand-spec-iter",
"number of iterations used for numbers generation", "12", INT),
"number of iterations for the special distribution", "12", INT),
SB_OPT("rand-spec-pct",
"percentage of values to be treated as 'special' "
"(for special distribution)", "1", INT),
"percentage of the entire range where 'special' values will fall "
"in the special distribution",
"1", INT),
SB_OPT("rand-spec-res",
"percentage of 'special' values to use "
"(for special distribution)", "75", INT),
SB_OPT("rand-seed",
"seed for random number generator. When 0, the current time is "
"used as a RNG seed.", "0", INT),
SB_OPT("rand-pareto-h", "parameter h for pareto distribution", "0.2", DOUBLE),
"percentage of 'special' values to use for the special distribution",
"75", INT),
SB_OPT("rand-pareto-h", "shape parameter for the Pareto distribution",
"0.2", DOUBLE),
SB_OPT("rand-zipfian-exp",
"shape parameter (exponent, theta) for the Zipfian distribution",
"0.8", DOUBLE),

SB_OPT_END
};
Expand All @@ -85,6 +109,11 @@ static double rand_res_mult;
static double pareto_h; /* parameter h */
static double pareto_power; /* parameter pre-calculated by h */

/* parameter and precomputed values for the Zipfian distribution */
static double zipf_exp;
static double zipf_s;
static double zipf_hIntegralX1;

/* Unique sequence generator state */
static uint32_t rand_unique_index CK_CC_CACHELINE;
static uint32_t rand_unique_offset;
Expand All @@ -96,6 +125,13 @@ extern inline uint64_t xoroshiro_next(uint64_t s[2]);

static void rand_unique_seed(uint32_t index, uint32_t offset);

/* Helper functions for the Zipfian distribution */
static double hIntegral(double x, double e);
static double hIntegralInverse(double x, double e);
static double h(double x, double e);
static double helper1(double x);
static double helper2(double x);

int sb_rand_register(void)
{
sb_register_arg_set(rand_args);
Expand Down Expand Up @@ -132,6 +168,11 @@ int sb_rand_init(void)
rand_type = DIST_TYPE_PARETO;
rand_func = &sb_rand_pareto;
}
else if (!strcmp(s, "zipfian"))
{
rand_type = DIST_TYPE_ZIPFIAN;
rand_func = &sb_rand_zipfian;
}
else
{
log_text(LOG_FATAL, "Invalid random numbers distribution: %s.", s);
Expand All @@ -151,7 +192,18 @@ int sb_rand_init(void)
pareto_h = sb_get_value_double("rand-pareto-h");
pareto_power = log(pareto_h) / log(1.0-pareto_h);

/* Seed PRNG for the main thread. Worker thread do their own seeding */
zipf_exp = sb_get_value_double("rand-zipfian-exp");
if (zipf_exp < 0)
{
log_text(LOG_FATAL, "--rand-zipfian-exp must be >= 0");
return 1;
}

zipf_s = 2 - hIntegralInverse(hIntegral(2.5, zipf_exp) - h(2, zipf_exp),
zipf_exp);
zipf_hIntegralX1 = hIntegral(1.5, zipf_exp) - 1;

/* Seed PRNG for the main thread. Worker threads do their own seeding */
sb_rand_thread_init();

/* Seed the unique sequence generator */
Expand Down Expand Up @@ -325,3 +377,186 @@ uint32_t sb_rand_unique(void)
return rand_unique_permute((rand_unique_permute(index) + rand_unique_offset) ^
0x5bf03635);
}

/*
Implementation of the Zipf distribution is based on
RejectionInversionZipfSampler.java from the Apache Commons RNG project
(https://commons.apache.org/proper/commons-rng/) implementing the rejection
inversion sampling method for a descrete, bounded Zipf distribution that is in
turn based on the method described in:

Wolfgang Hörmann and Gerhard Derflinger. "Rejection-inversion to generate
variates from monotone discrete distributions", ACM Transactions on Modeling
and Computer Simulation, (TOMACS) 6.3 (1996): 169-184.
*/

static uint32_t sb_rand_zipfian_int(uint32_t n, double e, double s,
double hIntegralX1)
{
/*
The paper describes an algorithm for exponents larger than 1 (Algorithm
ZRI).

The original method uses

H(x) = (v + x)^(1 - q) / (1 - q)

as the integral of the hat function. This function is undefined for q = 1,
which is the reason for the limitation of the exponent. If instead the
integral function

H(x) = ((v + x)^(1 - q) - 1) / (1 - q)

is used, for which a meaningful limit exists for q = 1, the method works for
all positive exponents. The following implementation uses v = 0 and
generates integral number in the range [1, numberOfElements]. This is
different to the original method where v is defined to be positive and
numbers are taken from [0, i_max]. This explains why the implementation
looks slightly different.
*/
const double hIntegralNumberOfElements = hIntegral(n + 0.5, e);

for (;;)
{
double u = hIntegralNumberOfElements + sb_rand_uniform_double() *
(hIntegralX1 - hIntegralNumberOfElements);
/* u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements] */

double x = hIntegralInverse(u, e);
uint32_t k = (uint32_t) (x + 0.5);

/*
Limit k to the range [1, numberOfElements] if it would be outside due to
numerical inaccuracies.
*/
if (SB_UNLIKELY(k < 1))
k = 1;
else if (SB_UNLIKELY(k > n))
k = n;

/*
Here, the distribution of k is given by:

P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C
P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2

where C = 1 / (hIntegralNumberOfElements - hIntegralX1)
*/

if (k - x <= s || u >= hIntegral(k + 0.5, e) - h(k, e))
{
/*
Case k = 1:

The right inequality is always true, because replacing k by 1 gives u >=
hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from (hIntegralX1,
hIntegralNumberOfElements].

Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1 and
the probability that 1 is returned as random value is P(k = 1 and
accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent

Case k >= 2:

The left inequality (k - x <= s) is just a short cut to avoid the more
expensive evaluation of the right inequality (u >= hIntegral(k + 0.5) -
h(k)) in many cases.

If the left inequality is true, the right inequality is also true:
Theorem 2 in the paper is valid for all positive exponents, because
the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and
(-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0 are both
fulfilled. Therefore, f(x) = x - hIntegralInverse(hIntegral(x + 0.5)
- h(x)) is a non-decreasing function. If k - x <= s holds, k - x <= s
+ f(k) - f(2) is obviously also true which is equivalent to -x <=
-hIntegralInverse(hIntegral(k + 0.5) - h(k)), -hIntegralInverse(u) <=
-hIntegralInverse(hIntegral(k + 0.5) - h(k)), and finally u >=
hIntegral(k + 0.5) - h(k).

Hence, the right inequality determines the acceptance rate: P(accepted |
k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2)) The
probability that m is returned is given by P(k = m and accepted) =
P(accepted | k = m) * P(k = m) = C * h(m) = C / m^exponent.

In both cases the probabilities are proportional to the probability mass
function of the Zipf distribution.
*/

return k;
}
}
}

uint32_t sb_rand_zipfian(uint32_t a, uint32_t b)
{
/* sb_rand_zipfian_int() returns a number in the range [1, b - a + 1] */
return a +
sb_rand_zipfian_int(b - a + 1, zipf_exp, zipf_s, zipf_hIntegralX1) - 1;
}

/*
H(x) is defined as

(x^(1 - exponent) - 1) / (1 - exponent), exponent != 1
log(x), if exponent == 1

H(x) is an integral function of h(x), the derivative of H(x) is h(x).
*/

static double hIntegral(double x, double e)
{
const double logX = log(x);
return helper2((1 - e) * logX) * logX;
}

/* h(x) = 1 / x^exponent */

static double h(double x, double e)
{
return exp(-e * log(x));
}

/* The inverse function of H(x) */

static double hIntegralInverse(double x, double e)
{
double t = x * (1 -e);
if (t < -1)
{
/*
Limit value to the range [-1, +inf).
t could be smaller than -1 in some rare cases due to numerical errors.
*/
t = -1;
}

return exp(helper1(t) * x);
}

/*
Helper function that calculates log(1 + x) / x.

A Taylor series expansion is used, if x is close to 0.
*/

static double helper1(double x)
{
if (fabs(x) > 1e-8)
return log1p(x) / x;
else
return 1 - x * (0.5 - x * (0.33333333333333333 - 0.25 * x));
}

/*
Helper function that calculates (exp(x) - 1) / x.

A Taylor series expansion is used, if x is close to 0.
*/

static double helper2(double x)
{
if (fabs(x) > 1e-8)
return expm1(x) / x;
else
return 1 + x * 0.5 * (1 + x * 0.33333333333333333 * (1 + 0.25 * x));
}
4 changes: 3 additions & 1 deletion src/sb_rand.h
Expand Up @@ -29,7 +29,8 @@ typedef enum
DIST_TYPE_UNIFORM,
DIST_TYPE_GAUSSIAN,
DIST_TYPE_SPECIAL,
DIST_TYPE_PARETO
DIST_TYPE_PARETO,
DIST_TYPE_ZIPFIAN
} rand_dist_t;

typedef uint64_t sb_rng_state_t [2];
Expand Down Expand Up @@ -67,6 +68,7 @@ uint32_t sb_rand_uniform(uint32_t, uint32_t);
uint32_t sb_rand_gaussian(uint32_t, uint32_t);
uint32_t sb_rand_special(uint32_t, uint32_t);
uint32_t sb_rand_pareto(uint32_t, uint32_t);
uint32_t sb_rand_zipfian(uint32_t, uint32_t);
uint32_t sb_rand_unique(void);
void sb_rand_str(const char *, char *);

Expand Down
2 changes: 1 addition & 1 deletion tests/t/api_legacy_rand.t
Expand Up @@ -28,7 +28,7 @@ Legacy PRNG Lua API tests
sb_rand_special\(0, 9\) = [0-9]{1} (re)

########################################################################
issue #96: sb_rand_uniq(1, oltp_table_size) generate duplicate value
GH-96: sb_rand_uniq(1, oltp_table_size) generate duplicate value
########################################################################
$ cat >$CRAMTMP/api_rand_uniq.lua <<EOF
> function event()
Expand Down