From 419d7245e3c9d884f37b1d559630431ec8a13c2f Mon Sep 17 00:00:00 2001 From: Stephen Mallette Date: Wed, 8 Jul 2020 06:42:44 -0400 Subject: [PATCH] TINKERPOP-2376 Improved probability distribution on global sample() --- CHANGELOG.asciidoc | 1 + .../process/traversal/step/filter/SampleGlobalStep.java | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index f41839bf0d6..a89a91c9538 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -26,6 +26,7 @@ image::https://raw.githubusercontent.com/apache/tinkerpop/master/docs/static/ima * Fixed bug in `has(T,Traversal)` where results were not being returned. * Fixed bug in `select(Traversal)` where side-effects were getting lost if accessed from the child traversal. * Fixed authorization bug when using `WsAndHttpChannelizerHandler` with keep-alive enabled. +* Improved sampling distribution for global scope `sample()` operations. [[release-3-4-7]] === TinkerPop 3.4.7 (Release Date: June 1, 2020) diff --git a/gremlin-core/src/main/java/org/apache/tinkerpop/gremlin/process/traversal/step/filter/SampleGlobalStep.java b/gremlin-core/src/main/java/org/apache/tinkerpop/gremlin/process/traversal/step/filter/SampleGlobalStep.java index 28d2fb49b42..5290f971150 100644 --- a/gremlin-core/src/main/java/org/apache/tinkerpop/gremlin/process/traversal/step/filter/SampleGlobalStep.java +++ b/gremlin-core/src/main/java/org/apache/tinkerpop/gremlin/process/traversal/step/filter/SampleGlobalStep.java @@ -86,22 +86,21 @@ public void barrierConsumer(final TraverserSet traverserSet) { int runningAmountToSample = 0; while (runningAmountToSample < this.amountToSample) { boolean reSample = false; - double runningWeight = 0.0d; + double runningTotalWeight = totalWeight; for (final Traverser.Admin s : traverserSet) { long sampleBulk = sampledSet.contains(s) ? sampledSet.get(s).bulk() : 0; if (sampleBulk < s.bulk()) { final double currentWeight = ((ProjectedTraverser) s).getProjections().get(0).doubleValue(); for (int i = 0; i < (s.bulk() - sampleBulk); i++) { - runningWeight = runningWeight + currentWeight; - if (RANDOM.nextDouble() <= ((runningWeight / totalWeight))) { + if (RANDOM.nextDouble() <= ((currentWeight / runningTotalWeight))) { final Traverser.Admin split = s.split(); split.setBulk(1L); sampledSet.add(split); runningAmountToSample++; - totalWeight = totalWeight - currentWeight; reSample = true; break; } + runningTotalWeight = runningTotalWeight - currentWeight; } if (reSample || (runningAmountToSample >= this.amountToSample)) break; @@ -139,4 +138,4 @@ public void setTraversal(final Traversal.Admin parentTraversal) { public int hashCode() { return super.hashCode() ^ this.amountToSample ^ this.probabilityTraversal.hashCode(); } -} +} \ No newline at end of file