From 4e02e8212eea663d7b2a3c236f4d607b4e8ffac2 Mon Sep 17 00:00:00 2001 From: Adam Ashenfelter Date: Tue, 23 Oct 2012 20:24:26 -0700 Subject: [PATCH 1/2] Uses absolute min and max for histogram limits, adds percentiles and sampling --- project.clj | 3 +- src/clj/histogram/core.clj | 34 +++--- src/java/com/bigml/histogram/Histogram.java | 123 ++++++++++++-------- test/histogram/test/core.clj | 14 +-- test/histogram/test/examples.clj | 6 +- 5 files changed, 105 insertions(+), 75 deletions(-) diff --git a/project.clj b/project.clj index bc3e944..85d4648 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,5 @@ -(defproject histogram "2.0.1" + +(defproject histogram "2.1.0" :description "Dynamic/streaming histograms" :source-path "src/clj" :java-source-path "src/java" diff --git a/src/clj/histogram/core.clj b/src/clj/histogram/core.clj index ac92754..9672a26 100644 --- a/src/clj/histogram/core.clj +++ b/src/clj/histogram/core.clj @@ -194,12 +194,25 @@ (first (uniform hist 2))) (defn mean - [^Histogram hist] "Returns the mean for the points inserted into the histogram." + [^Histogram hist] (when-not (empty? (.getBins hist)) (.getMean ^Bin (reduce (fn [^Bin b1 ^Bin b2] (.combine b1 b2)) (.getBins hist))))) +(defn percentiles + "Returns a map of percentiles and their associated locations." + [^Histogram hist & percentiles] + (into (sorted-map) + (.percentiles hist (into-array (map double percentiles))))) + +(defn sample + "Returns a sequence of samples from the distribution approximated by + the histogram." + [hist & [sample-size]] + (repeatedly (or sample-size 1) + #(second (first (percentiles hist (rand)))))) + (defn extended-sum "Returns the approximate number of points occuring in the histogram equal or less than the given point, along with the sum of the @@ -251,20 +264,11 @@ (.getMaximum hist)) (defn bounds - "Returns the bounds of the histogram, nil if the histogram is empty. - An optional parameter may be supplied to enable a small buffer to - the bounds (true or false - default false)." - ([^Histogram hist] - (bounds hist false)) - ([^Histogram hist buffer?] - (when-let [bins (seq (bins hist))] - (let [l-mean (:mean (last bins)) - f-mean (:mean (first bins))] - (if (and buffer? (second bins)) - {:min (- f-mean (apply - (map :mean (reverse (take 2 bins))))) - :max (+ l-mean (apply - (map :mean (reverse (take-last 2 bins)))))} - {:min f-mean - :max l-mean}))))) + "Returns the bounds of the histogram, nil if the histogram is empty." + [^Histogram hist] + (when-let [bins (seq (bins hist))] + {:min (minimum hist) + :max (maximum hist)})) (defn hist-to-clj "Transforms a Histogram object into a Clojure map representing the diff --git a/src/java/com/bigml/histogram/Histogram.java b/src/java/com/bigml/histogram/Histogram.java index 03e1f47..5af1dbe 100644 --- a/src/java/com/bigml/histogram/Histogram.java +++ b/src/java/com/bigml/histogram/Histogram.java @@ -295,22 +295,22 @@ public SumResult extendedSum(double p) throws SumOutOfRangeException { throw new SumOutOfRangeException("Cannot sum with an empty histogram."); } - double min = _bins.firstKey(); - double max = _bins.lastKey(); + double binMin = _bins.firstKey(); + double binMax = _bins.lastKey(); - if (p < min) { + if (p < _minimum) { + result = new SumResult(0, (T) _bins.firstEntry().getValue().getTarget().init()); + } else if (p >= _maximum) { + result = new SumResult(getTotalCount(), getTotalTargetSum()); + } else if (p < binMin) { Bin bin = _bins.firstEntry().getValue(); - double distanceRatio = (1 - (bin.getMean() - p) / binGapRange(p, bin)) / 2; - if (distanceRatio > 0) { - double countSum = distanceRatio * bin.getCount(); - T targetSum = (T) bin.getTarget().clone().mult(distanceRatio); - result = new SumResult(countSum, targetSum); - } else { - result = new SumResult(0, (T) bin.getTarget().init()); - } - } else if (p > max) { + double distanceRatio = ((p - _minimum ) / (bin.getMean() - _minimum)) / 2; + double countSum = distanceRatio * bin.getCount(); + T targetSum = (T) bin.getTarget().clone().mult(distanceRatio); + result = new SumResult(countSum, targetSum); + } else if (p > binMax) { Bin bin = _bins.lastEntry().getValue(); - double distanceRatio = (1 - (p - bin.getMean()) / binGapRange(p, bin)) / 2; + double distanceRatio = ((_maximum - p) / (_maximum - bin.getMean())) / 2; if (distanceRatio > 0) { double countSum = getTotalCount() - distanceRatio * bin.getCount(); T targetSum = (T) getTotalTargetSum().sum(bin.getTarget().clone().mult(-distanceRatio)); @@ -318,14 +318,12 @@ public SumResult extendedSum(double p) throws SumOutOfRangeException { } else { result = new SumResult(getTotalCount(), getTotalTargetSum()); } - } else if (p == max) { + } else if (p == binMax) { Bin lastBin = _bins.lastEntry().getValue(); double totalCount = this.getTotalCount(); double count = totalCount - (lastBin.getCount() / 2d); - T targetSum = (T) getTotalTargetSum().sum(lastBin.getTarget().clone().mult(-0.5d)); - Entry> prevEntry = _bins.lowerEntry(lastBin.getMean()); result = new SumResult(count, targetSum); } else { @@ -406,7 +404,7 @@ public SumResult extendedDensity(double p) { targetDensity = (T) bin.getTarget().clone().mult(countDensity); } else { countDensity = 0; - targetDensity = null; + targetDensity = (T) bin.getTarget().init(); } } else { Bin hBin = higher.getValue(); @@ -508,6 +506,26 @@ public ArrayList uniform(int numberOfBins) { } return uniformBinSplits; } + + /** + * Returns a map of percentiles and their associated locations. + * + * @param percentiles the desired percentiles + */ + public HashMap percentiles(Double... percentiles) { + HashMap results = new HashMap(); + double totalCount = getTotalCount(); + + if (totalCount > 0) { + TreeMap> binSumMap = createBinSumMap(); + + for (double percentile : percentiles) { + double targetSum = (double) percentile * totalCount; + results.put(percentile, findPointForSum(targetSum, binSumMap)); + } + } + return results; + } /** * Merges a histogram into the current histogram. @@ -722,6 +740,11 @@ private void updateBins(Bin bin) { private TreeMap> createBinSumMap() { TreeMap> binSumMap = new TreeMap>(); + Bin minBin = new Bin(_minimum, 0d, _bins.firstEntry().getValue().getTarget().init()); + Bin maxBin = new Bin(_maximum, 0d, _bins.firstEntry().getValue().getTarget().init()); + binSumMap.put(0d, minBin); + binSumMap.put((double) _totalCount, maxBin); + for (Bin bin : _bins.values()) { try { double sum = sum(bin.getMean()); @@ -737,12 +760,10 @@ private double binGapRange(double p, Bin bin) { Entry> higher = _bins.higherEntry(bin.getMean()); double range; - if (lower == null && higher == null) { - range = 0; - } else if (lower == null) { - range = higher.getValue().getMean() - bin.getMean(); + if (lower == null) { + range = bin.getMean() - _minimum; } else if (higher == null) { - range = bin.getMean() - lower.getValue().getMean(); + range = _maximum - bin.getMean(); } else { if (p < bin.getMean()) { range = bin.getMean() - lower.getValue().getMean(); @@ -777,36 +798,40 @@ private Target computeSum(double r, U p, U i, U i1) { } private double findPointForSum(double s, TreeMap> binSumMap) { - Entry> sumEntry = binSumMap.floorEntry(s); - double sumP_i = sumEntry.getKey(); - Bin bin_i = sumEntry.getValue(); - double p_i = bin_i.getMean(); - double m_i = bin_i.getCount(); - - Double sumP_i1 = binSumMap.navigableKeySet().higher(sumP_i); - if (sumP_i1 == null) { - sumP_i1 = binSumMap.navigableKeySet().floor(sumP_i); - } - - Bin bin_i1 = binSumMap.get(sumP_i1); - double p_i1 = bin_i1.getMean(); - double m_i1 = bin_i1.getCount(); - - double d = s - sumP_i; - double a = m_i1 - m_i; - - double u; - if (a == 0) { - double offset = d / ((m_i + m_i1) / 2); - u = p_i + (offset * (p_i1 - p_i)); + double result; + if (s <= 0) { + result = _minimum; + } else if (s >= _totalCount) { + result = _maximum; } else { - double b = 2 * m_i; - double c = -2 * d; - double z = findZ(a, b, c); - u = (p_i + (p_i1 - p_i) * z); + Entry> sumEntry = binSumMap.floorEntry(s); + double sumP_i = sumEntry.getKey(); + Bin bin_i = sumEntry.getValue(); + double p_i = bin_i.getMean(); + double m_i = bin_i.getCount(); + + Double sumP_i1 = binSumMap.navigableKeySet().higher(sumP_i); + Bin bin_i1 = binSumMap.get(sumP_i1); + double p_i1 = bin_i1.getMean(); + double m_i1 = bin_i1.getCount(); + + double d = s - sumP_i; + double a = m_i1 - m_i; + + double u; + if (a == 0 || m_i == 0 || m_i1 == 0) { + double offset = d / ((m_i + m_i1) / 2); + u = p_i + (offset * (p_i1 - p_i)); + } else { + double b = 2 * m_i; + double c = -2 * d; + double z = findZ(a, b, c); + u = (p_i + (p_i1 - p_i) * z); + } + result = u; } - return u; + return result; } private void updateGaps(Bin newBin) { diff --git a/test/histogram/test/core.clj b/test/histogram/test/core.clj index a6d428c..1d3624c 100644 --- a/test/histogram/test/core.clj +++ b/test/histogram/test/core.clj @@ -74,13 +74,13 @@ (deftest density-test (let [hist (reduce insert! (create) [1 2 2 3])] (is (= 0.0 (density hist 0.0))) - (is (= 0.5 (density hist 0.5))) - (is (= 1.0 (density hist 1.0))) - (is (= 1.5 (density hist 1.5))) - (is (= 1.5 (density hist 2.0))) - (is (= 1.5 (density hist 2.5))) - (is (= 1.0 (density hist 3.0))) - (is (= 0.5 (density hist 3.5))) + (is (= 0.0 (density hist 0.5))) + (is (= 0.5 (density hist 1.0))) + (is (= 1.0 (density hist 1.5))) + (is (= 1.0 (density hist 2.0))) + (is (= 1.0 (density hist 2.5))) + (is (= 0.5 (density hist 3.0))) + (is (= 0.0 (density hist 3.5))) (is (= 0.0 (density hist 4.0))))) (deftest categorical-test diff --git a/test/histogram/test/examples.clj b/test/histogram/test/examples.clj index 638e673..75b16a7 100644 --- a/test/histogram/test/examples.clj +++ b/test/histogram/test/examples.clj @@ -26,17 +26,17 @@ (next hists))))) (defn sum-density-chart [hist] - (let [{:keys [min max]} (hst/bounds hist true)] + (let [{:keys [min max]} (hst/bounds hist)] (core/view (-> (charts/function-plot #(hst/sum hist %) min max) (charts/add-function #(hst/density hist %) min max))))) (defn cdf-pdf-chart [hist] - (let [{:keys [min max]} (hst/bounds hist true)] + (let [{:keys [min max]} (hst/bounds hist)] (core/view (-> (charts/function-plot (hst/cdf hist) min max) (charts/add-function (hst/pdf hist) min max))))) (defn pdf-target-chart [hist] - (let [{:keys [min max]} (hst/bounds hist true)] + (let [{:keys [min max]} (hst/bounds hist)] (core/view (-> (charts/function-plot (hst/pdf hist) min max) (charts/add-function #(:sum (hst/average-target hist %)) min max))))) From e39421b8f862fdd8798b9b1687898d4b515ee4df Mon Sep 17 00:00:00 2001 From: Adam Ashenfelter Date: Mon, 12 Nov 2012 02:12:36 -0800 Subject: [PATCH 2/2] Fixes total count bug when merging histograms with missing counts --- project.clj | 2 +- src/java/com/bigml/histogram/Histogram.java | 1 - test/histogram/test/core.clj | 9 ++++++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/project.clj b/project.clj index 85d4648..3bb37c1 100644 --- a/project.clj +++ b/project.clj @@ -1,5 +1,5 @@ -(defproject histogram "2.1.0" +(defproject histogram "2.1.1" :description "Dynamic/streaming histograms" :source-path "src/clj" :java-source-path "src/java" diff --git a/src/java/com/bigml/histogram/Histogram.java b/src/java/com/bigml/histogram/Histogram.java index 5af1dbe..f927d1a 100644 --- a/src/java/com/bigml/histogram/Histogram.java +++ b/src/java/com/bigml/histogram/Histogram.java @@ -574,7 +574,6 @@ public Histogram merge(Histogram histogram) throws MixedInsertException { _missingTarget.sum(histogram.getMissingTarget()); } _missingCount += histogram.getMissingCount(); - _totalCount += histogram.getMissingCount(); return this; } diff --git a/test/histogram/test/core.clj b/test/histogram/test/core.clj index 1d3624c..f412b3a 100644 --- a/test/histogram/test/core.clj +++ b/test/histogram/test/core.clj @@ -58,7 +58,14 @@ merged-hist (reduce merge! hists)] (is (about= (sum merged-hist 0) (/ (* points hist-count) 2) - (/ (* points hist-count) 50))))) + (/ (* points hist-count) 50)))) + (let [h1 (-> (create) + (insert! 1 1) + (insert! nil 1)) + h2 (-> (create) + (insert! 2 2) + (insert! nil 2))] + (is (== 2 (total-count (merge! h1 h2)))))) (deftest mixed-test (let [insert-pair #(apply insert! (apply insert! (create) %1) %2)