Refactors and adds bin reservoirs for better performance

commit 3adfd7319618589907bb1761d8ebbe29da6c7cc2 (parent 56c647f)
@ashenfad authored
25 README.md
@@ -404,16 +404,33 @@ the histogram can suffer if the `:freeze` parameter is too small.
```clojure
examples> (time (reduce insert! (create) ex/normal-data))
-"Elapsed time: 391.857 msecs"
+"Elapsed time: 333.5 msecs"
examples> (time (reduce insert! (create :freeze 1024) ex/normal-data))
-"Elapsed time: 99.92 msecs"
+"Elapsed time: 166.9 msecs"
```
# Performance
-Insert time scales `log(n)` with respect to the number of bins in the
-histogram.
+There are two implementations of bin reservoirs (which support the
+`insert!` and `merge!` functions). Either implementation, `:tree` or
+`:array`, can be selected explicitly with the `:reservoir` parameter.
+The `:tree` option is useful for histograms with many bins, as insert
+time scales as `O(log n)` with respect to the number of bins. The
+`:array` option is good for a small number of bins: inserts are `O(n)`,
+but the constant overhead is smaller. If `:reservoir` is left
+unspecified, `:array` is used for histograms with <= 256 bins and
+`:tree` is used for anything larger.
+```clojure
+examples> (time (reduce insert! (create :bins 16 :reservoir :tree)
+                        ex/normal-data))
+"Elapsed time: 554.478 msecs"
+examples> (time (reduce insert! (create :bins 16 :reservoir :array)
+                        ex/normal-data))
+"Elapsed time: 183.532 msecs"
+```
+
+Insert times using reservoir defaults:
![timing chart]
(https://docs.google.com/spreadsheet/oimg?key=0Ah2oAcudnjP4dG1CLUluRS1rcHVqU05DQ2Z4UVZnbmc&oid=2&zx=mppmmoe214jm)
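
To make the default selection concrete, here is a hypothetical REPL sketch in the same `examples` namespace as above (no timings shown, since they vary by machine):

```clojure
;; :reservoir left unspecified, so the bin count picks the implementation:
examples> (reduce insert! (create :bins 64) ex/normal-data)   ; <= 256 bins, :array
examples> (reduce insert! (create :bins 512) ex/normal-data)  ; > 256 bins, :tree
```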
18 src/clj/bigml/histogram/core.clj
@@ -7,9 +7,14 @@
Target SimpleTarget NumericTarget
ArrayCategoricalTarget GroupTarget
MapCategoricalTarget SumResult
- MixedInsertException)
+ MixedInsertException
+ Histogram$BinReservoirType)
(java.util HashMap ArrayList)))
+(def ^:private clj-to-reservoir-types
+ {:array Histogram$BinReservoirType/array
+ :tree Histogram$BinReservoirType/tree})
+
(def ^:private clj-to-java-types
{:none Histogram$TargetType/none
:numeric Histogram$TargetType/numeric
@@ -30,11 +35,14 @@
:group-types - A sequence of types (:numeric or :categorical) that
describe a group target.
:freeze - After this # of inserts, bin locations will 'freeze',
- improving the performance of future inserts."
- [& {:keys [bins gap-weighted? categories group-types freeze]
+ improving the performance of future inserts.
+ :reservoir - Selects the bin reservoir type (:array or :tree).
+ Defaults to :array for <= 256 bins, otherwise :tree."
+ [& {:keys [bins gap-weighted? categories group-types freeze reservoir]
:or {bins 64 gap-weighted? false}}]
- (let [group-types (seq (map clj-to-java-types group-types))]
- (Histogram. bins gap-weighted? categories group-types freeze)))
+ (let [group-types (seq (map clj-to-java-types group-types))
+ reservoir (clj-to-reservoir-types reservoir)]
+ (Histogram. bins gap-weighted? categories group-types freeze reservoir)))
(defn histogram?
"Returns true if the input is a histogram."
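
The net effect of the core.clj change above: `create` translates the `:reservoir` keyword through the private `clj-to-reservoir-types` map, and an absent keyword simply maps to `nil`, letting the `Histogram` constructor fall back to its bin-count default. A small sketch, assuming it is evaluated inside the `bigml.histogram.core` namespace where the private map is visible:

```clojure
(clj-to-reservoir-types :tree)   ;=> Histogram$BinReservoirType/tree
(clj-to-reservoir-types :array)  ;=> Histogram$BinReservoirType/array
(clj-to-reservoir-types nil)     ;=> nil, so the Java constructor applies its default
```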
141 src/java/com/bigml/histogram/ArrayBinReservoir.java
@@ -0,0 +1,141 @@
+/**
+ * Copyright 2013 BigML
+ * Licensed under the Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ */
+package com.bigml.histogram;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+
+/**
+ * This class implements bin operations (insertions, merges, etc.) for a histogram.
+ * This implementation is best for histograms with a small (<=256) number of bins.
+ * It uses an ArrayList to give O(N) insert performance with regard to the number
+ * of bins in the histogram. For histograms with more bins, the TreeBinReservoir
+ * class offers faster insert performance.
+ */
+public class ArrayBinReservoir <T extends Target> extends BinReservoir<T> {
+
+ public ArrayBinReservoir(int maxBins, boolean weightGaps, Long freezeThreshold) {
+ super(maxBins, weightGaps, freezeThreshold);
+ _bins = new ArrayList<Bin<T>>();
+ }
+
+ @Override
+ public void insert(Bin<T> bin) {
+ addTotalCount(bin);
+ int index = Collections.binarySearch(_bins, bin);
+ if (index >= 0) {
+ _bins.get(index).sumUpdate(bin);
+ } else {
+ if (isFrozen()) {
+ int prevIndex = Math.abs(index) - 2;
+ int nextIndex = prevIndex + 1;
+ double prevDist = (prevIndex >= 0) ?
+ bin.getMean() - _bins.get(prevIndex).getMean() : Double.MAX_VALUE;
+ double nextDist = (nextIndex < _bins.size()) ?
+ _bins.get(nextIndex).getMean() - bin.getMean() : Double.MAX_VALUE;
+ if (prevDist < nextDist) {
+ _bins.get(prevIndex).sumUpdate(bin);
+ } else {
+ _bins.get(nextIndex).sumUpdate(bin);
+ }
+ } else {
+ _bins.add(Math.abs(index) - 1, bin);
+ }
+ }
+ }
+
+ @Override
+ public Bin<T> first() {
+ return _bins.get(0);
+ }
+
+ @Override
+ public Bin<T> last() {
+ return _bins.get(_bins.size() - 1);
+ }
+
+ @Override
+ public Bin<T> get(double p) {
+ int index = Collections.binarySearch(_bins, new Bin(p, 0, null));
+ if (index >= 0) {
+ return _bins.get(index);
+ } else {
+ return null;
+ }
+ }
+
+ @Override
+ public Bin<T> floor(double p) {
+ int index = Collections.binarySearch(_bins, new Bin(p, 0, null));
+ if (index >= 0) {
+ return _bins.get(index);
+ } else {
+ index = Math.abs(index) - 2;
+ return (index >= 0) ? _bins.get(index) : null;
+ }
+ }
+
+ @Override
+ public Bin<T> ceiling(double p) {
+ int index = Collections.binarySearch(_bins, new Bin(p, 0, null));
+ if (index >= 0) {
+ return _bins.get(index);
+ } else {
+ index = Math.abs(index) - 1;
+ return (index < _bins.size()) ? _bins.get(index) : null;
+ }
+ }
+
+ @Override
+ public Bin<T> lower(double p) {
+ int index = Collections.binarySearch(_bins, new Bin(p, 0, null));
+ if (index >= 0) {
+ index--;
+ return (index >= 0) ? _bins.get(index) : null;
+ } else {
+ index = Math.abs(index) - 2;
+ return (index >= 0) ? _bins.get(index) : null;
+ }
+ }
+
+ @Override
+ public Bin<T> higher(double p) {
+ int index = Collections.binarySearch(_bins, new Bin(p, 0, null));
+ if (index >= 0) {
+ index++;
+ return (index < _bins.size()) ? _bins.get(index) : null;
+ } else {
+ index = Math.abs(index) - 1;
+ return (index < _bins.size()) ? _bins.get(index) : null;
+ }
+ }
+
+ @Override
+ public Collection<Bin<T>> getBins() {
+ return _bins;
+ }
+
+ @Override
+ public void merge() {
+ while (_bins.size() > getMaxBins()) {
+ int minGapIndex = -1;
+ double minGap = Double.MAX_VALUE;
+ for (int i = 0; i < _bins.size() - 1; i++) {
+ double gap = gapWeight(_bins.get(i), _bins.get(i + 1));
+ if (minGap > gap) {
+ minGap = gap;
+ minGapIndex = i;
+ }
+ }
+ Bin<T> prev = _bins.get(minGapIndex);
+ Bin<T> next = _bins.remove(minGapIndex + 1);
+ _bins.set(minGapIndex, prev.combine(next));
+ }
+ }
+
+ private ArrayList<Bin<T>> _bins;
+}
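
The insert, floor, and ceiling logic above leans on the `Collections.binarySearch` contract: for a key that is not present it returns `-(insertion point) - 1`, so `Math.abs(index) - 1` recovers the insertion point and `Math.abs(index) - 2` the position of the bin just below it. A standalone interop sketch of that arithmetic (not part of the library):

```clojure
(let [bins (java.util.ArrayList. [1.0 3.0 5.0])
      idx  (java.util.Collections/binarySearch bins 4.0)]  ; idx = -3
  {:insertion-point (- (Math/abs idx) 1)    ;=> 2, where a bin with mean 4.0 would go
   :previous-index  (- (Math/abs idx) 2)})  ;=> 1, the bin with mean 3.0
```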
5 src/java/com/bigml/histogram/Bin.java
@@ -8,7 +8,7 @@
import java.text.DecimalFormat;
import org.json.simple.JSONArray;
-public class Bin<T extends Target> {
+public class Bin<T extends Target> implements Comparable<Bin> {
public Bin(double mean, double count, T target) {
/* Hack to avoid Java's negative zero */
@@ -100,4 +100,7 @@ public int hashCode() {
private final double _mean;
private double _count;
+ public int compareTo(Bin o) {
+ return Double.compare(getMean(), o.getMean());
+ }
}
64 src/java/com/bigml/histogram/BinReservoir.java
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2013 BigML
+ * Licensed under the Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ */
+package com.bigml.histogram;
+
+import java.util.Collection;
+
+public abstract class BinReservoir<T extends Target> {
+ public BinReservoir(int maxBins, boolean weightGaps, Long freezeThreshold) {
+ _maxBins = maxBins;
+ _weightGaps = weightGaps;
+ _freezeThreshold = freezeThreshold;
+ _totalCount = 0;
+ }
+
+ public int getMaxBins() {
+ return _maxBins;
+ }
+
+ public boolean isWeightGaps() {
+ return _weightGaps;
+ }
+
+ public Long getFreezeThreshold() {
+ return _freezeThreshold;
+ }
+
+ public boolean isFrozen() {
+ return _freezeThreshold != null && _totalCount > _freezeThreshold;
+ }
+ public long getTotalCount() {
+ return _totalCount;
+ }
+
+ public void addTotalCount(Bin<T> bin) {
+ _totalCount += bin.getCount();
+ }
+
+ public abstract void insert(Bin<T> bin);
+ public abstract Bin<T> first();
+ public abstract Bin<T> last();
+ public abstract Bin<T> get(double p);
+ public abstract Bin<T> floor(double p);
+ public abstract Bin<T> ceiling(double p);
+ public abstract Bin<T> higher(double p);
+ public abstract Bin<T> lower(double p);
+ public abstract Collection<Bin<T>> getBins();
+ public abstract void merge();
+
+ protected double gapWeight(Bin<T> prev, Bin<T> next) {
+ double diff = next.getMean() - prev.getMean();
+ if (isWeightGaps()) {
+ diff *= Math.log(Math.E + Math.min(prev.getCount(), next.getCount()));
+ }
+ return diff;
+ }
+
+ private final int _maxBins;
+ private final boolean _weightGaps;
+ private final Long _freezeThreshold;
+ private long _totalCount;
+}
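
When count-weighted gaps are enabled, `gapWeight` multiplies the distance between neighboring bins by `ln(e + min(count))`, so gaps flanked by lightly populated bins stay cheap to merge while gaps between heavy bins are preserved. A standalone sketch of the same formula (hypothetical helper, not part of the library):

```clojure
(defn gap-weight [prev-mean prev-count next-mean next-count]
  (* (- next-mean prev-mean)
     (Math/log (+ Math/E (min prev-count next-count)))))

(gap-weight 1.0 1  2.0 1)    ;=> ~1.31, light neighbors merge early
(gap-weight 1.0 50 2.0 80)   ;=> ~3.96, heavy neighbors merge last
```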
188 src/java/com/bigml/histogram/Histogram.java
@@ -13,7 +13,6 @@
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.TreeMap;
-import java.util.TreeSet;
import org.json.simple.JSONArray;
/**
@@ -37,6 +36,7 @@
public class Histogram<T extends Target> {
public static final String DEFAULT_FORMAT_STRING = "#.#####";
+ public static final int RESERVOIR_THRESHOLD = 256;
/**
* Creates an empty Histogram with the defined number of bins.
@@ -50,21 +50,22 @@
* then a collection group target types may be provided
* @param freezeThreshold after this # of inserts, bin locations
* will 'freeze', increasing the performance of future inserts
+ * @param reservoirType selects the bin reservoir implementation,
+ * defaults to 'array' when # bins <= 256 and 'tree' otherwise
*/
public Histogram(int maxBins, boolean countWeightedGaps,
Collection<Object> categories, Collection<TargetType> groupTypes,
- Long freezeThreshold) {
- _maxBins = maxBins;
- _bins = new TreeMap<Double, Bin<T>>();
- _gaps = new TreeSet<Gap<T>>();
- _binsToGaps = new HashMap<Double, Gap<T>>();
+ Long freezeThreshold, BinReservoirType reservoirType) {
+ if (reservoirType == BinReservoirType.tree ||
+ (reservoirType == null && maxBins > RESERVOIR_THRESHOLD)) {
+ _bins = new TreeBinReservoir<T>(maxBins, countWeightedGaps, freezeThreshold);
+ } else {
+ _bins = new ArrayBinReservoir<T>(maxBins, countWeightedGaps, freezeThreshold);
+ }
_decimalFormat = new DecimalFormat(DEFAULT_FORMAT_STRING);
- _countWeightedGaps = countWeightedGaps;
- _totalCount = 0;
_missingCount = 0;
_minimum = null;
_maximum = null;
- _freezeThreshold = freezeThreshold;
if (categories != null && !categories.isEmpty()) {
_targetType = TargetType.categorical;
@@ -91,7 +92,7 @@ public Histogram(int maxBins, boolean countWeightedGaps,
* @param countWeightedGaps true if count weighted gaps are desired
*/
public Histogram(int maxBins, boolean countWeightedGaps) {
- this(maxBins, countWeightedGaps, null, null, null);
+ this(maxBins, countWeightedGaps, null, null, null, null);
}
/**
@@ -220,8 +221,8 @@ public Histogram(int maxBins) {
* @param bin the new bin
*/
public Histogram<T> insertBin(Bin<T> bin) {
- updateBins(bin);
- mergeBins();
+ _bins.insert(bin);
+ _bins.merge();
return this;
}
@@ -243,21 +244,21 @@ public TargetType getTargetType() {
* Returns the maximum number of allowed bins.
*/
public int getMaxBins() {
- return _maxBins;
+ return _bins.getMaxBins();
}
/**
* Returns the freeze threshold.
*/
public Long getFreezeThreshold() {
- return _freezeThreshold;
+ return _bins.getFreezeThreshold();
}
/**
* Returns whether gaps are count weighted.
*/
public boolean isCountWeightedGaps() {
- return _countWeightedGaps;
+ return _bins.isWeightGaps();
}
/**
@@ -296,7 +297,7 @@ public double sum(double p) throws SumOutOfRangeException {
public SumResult<T> extendedSum(double p) throws SumOutOfRangeException {
SumResult<T> result;
- if (_bins.isEmpty()) {
+ if (_bins.getBins().isEmpty()) {
throw new SumOutOfRangeException("Cannot sum with an empty histogram.");
}
@@ -304,14 +305,14 @@ public double sum(double p) throws SumOutOfRangeException {
throw new SumOutOfRangeException("Cannot compute a histogram sum for NaN");
}
- double binMax = _bins.lastKey();
+ double binMax = _bins.last().getMean();
if (p < _minimum) {
- result = new SumResult<T>(0, (T) _bins.firstEntry().getValue().getTarget().init());
+ result = new SumResult<T>(0, (T) _bins.first().getTarget().init());
} else if (p >= _maximum) {
result = new SumResult<T>(getTotalCount(), getTotalTargetSum());
} else if (p == binMax) {
- Bin<T> lastBin = _bins.lastEntry().getValue();
+ Bin<T> lastBin = _bins.last();
double totalCount = this.getTotalCount();
double count = totalCount - (lastBin.getCount() / 2d);
@@ -319,27 +320,21 @@ public double sum(double p) throws SumOutOfRangeException {
result = new SumResult<T>(count, targetSum);
} else {
- T emptyTarget = (T) _bins.firstEntry().getValue().getTarget().init();
- Entry<Double,Bin<T>> bin_iEntry = _bins.floorEntry(p);
- Bin<T> bin_i;
- if (bin_iEntry == null) {
+ T emptyTarget = (T) _bins.first().getTarget().init();
+ Bin<T> bin_i = _bins.floor(p);
+ if (bin_i == null) {
bin_i = new Bin(_minimum, 0, emptyTarget.clone());
- } else {
- bin_i = bin_iEntry.getValue();
}
- Entry<Double,Bin<T>> bin_i1Entry = _bins.higherEntry(p);
- Bin<T> bin_i1;
- if (bin_i1Entry == null) {
+ Bin<T> bin_i1 = _bins.higher(p);
+ if (bin_i1 == null) {
bin_i1 = new Bin(_maximum, 0, emptyTarget.clone());
- } else {
- bin_i1 = bin_i1Entry.getValue();
}
double prevCount = 0;
T prevTargetSum = (T) emptyTarget.clone();
- for (Bin<T> bin : _bins.values()) {
+ for (Bin<T> bin : getBins()) {
if (bin.equals(bin_i) || bin_i.getMean() == _minimum) {
break;
}
@@ -381,7 +376,7 @@ public double density(double p) {
* @param p the density estimate point
*/
public SumResult<T> extendedDensity(double p) {
- T emptyTarget = (T) _bins.firstEntry().getValue().getTarget().init();
+ T emptyTarget = (T) _bins.first().getTarget().init();
double countDensity;
T targetDensity;
@@ -401,20 +396,14 @@ public double density(double p) {
countDensity = (lowerResult.getCount() + higherResult.getCount()) / 2;
targetDensity = (T) lowerResult.getTargetSum().clone().sum(higherResult.getTargetSum()).mult(0.5);
} else {
- Entry<Double, Bin<T>> lowerEntry = _bins.lowerEntry(p);
- Bin<T> lowerBin;
- if (lowerEntry == null) {
+ Bin<T> lowerBin = _bins.lower(p);
+ if (lowerBin == null) {
lowerBin = new Bin(_minimum, 0, emptyTarget.clone());
- } else {
- lowerBin = lowerEntry.getValue();
}
- Entry<Double, Bin<T>> higherEntry = _bins.higherEntry(p);
- Bin<T> higherBin;
- if (higherEntry == null) {
+ Bin<T> higherBin = _bins.higher(p);
+ if (higherBin == null) {
higherBin = new Bin(_maximum, 0, emptyTarget.clone());
- } else {
- higherBin = higherEntry.getValue();
}
double bDiff = p - lowerBin.getMean();
@@ -459,8 +448,8 @@ public T averageTarget(double p) {
TreeMap<Double, Bin<T>> binSumMap = createBinSumMap();
double gapSize = totalCount / (double) numberOfBins;
- double minGapSize = Math.max(_bins.firstEntry().getValue().getCount(),
- _bins.lastEntry().getValue().getCount()) / 2;
+ double minGapSize = Math.max(_bins.first().getCount(),
+ _bins.last().getCount()) / 2;
int splits = numberOfBins;
if (gapSize < minGapSize) {
@@ -521,9 +510,9 @@ public Histogram merge(Histogram<T> histogram) throws MixedInsertException {
if (_indexMap != null) {
((ArrayCategoricalTarget) newBin.getTarget()).setIndexMap(_indexMap);
}
- updateBins(new Bin<T>(bin));
+ _bins.insert(new Bin<T>(bin));
}
- mergeBins();
+ _bins.merge();
}
if (_minimum == null) {
@@ -551,14 +540,14 @@ public Histogram merge(Histogram<T> histogram) throws MixedInsertException {
* Returns the total number of points in the histogram.
*/
public double getTotalCount() {
- return _totalCount;
+ return _bins.getTotalCount();
}
/**
* Returns the collection of bins that form the histogram.
*/
public Collection<Bin<T>> getBins() {
- return _bins.values();
+ return _bins.getBins();
}
public JSONArray toJSON(DecimalFormat format) {
@@ -580,7 +569,7 @@ public String toString() {
public T getTotalTargetSum() {
T target = null;
- for (Bin<T> bin : _bins.values()) {
+ for (Bin<T> bin : getBins()) {
if (target == null) {
target = (T) bin.getTarget().init();
}
@@ -675,46 +664,14 @@ private void processPointTarget(Double point, Target target) {
}
}
- private void updateBins(Bin<T> bin) {
- _totalCount += bin.getCount();
- Bin<T> existingBin = _bins.get(bin.getMean());
- if (_freezeThreshold != null
- && _totalCount > _freezeThreshold
- && _bins.size() == _maxBins) {
- Double floorDiff = Double.MAX_VALUE;
- Entry<Double, Bin<T>> floorEntry = _bins.floorEntry(bin.getMean());
- if (floorEntry != null) {
- floorDiff = Math.abs(floorEntry.getValue().getMean() - bin.getMean());
- }
- Double ceilDiff = Double.MAX_VALUE;
- Entry<Double, Bin<T>> ceilEntry = _bins.ceilingEntry(bin.getMean());
- if (ceilEntry != null) {
- ceilDiff = Math.abs(ceilEntry.getValue().getMean() - bin.getMean());
- }
- if (floorDiff <= ceilDiff) {
- floorEntry.getValue().sumUpdate(bin);
- } else {
- ceilEntry.getValue().sumUpdate(bin);
- }
- } else if (existingBin != null) {
- existingBin.sumUpdate(bin);
- if (_countWeightedGaps) {
- updateGaps(existingBin);
- }
- } else {
- updateGaps(bin);
- _bins.put(bin.getMean(), bin);
- }
- }
-
private TreeMap<Double, Bin<T>> createBinSumMap() {
TreeMap<Double, Bin<T>> binSumMap = new TreeMap<Double, Bin<T>>();
- Bin<T> minBin = new Bin(_minimum, 0d, _bins.firstEntry().getValue().getTarget().init());
- Bin<T> maxBin = new Bin(_maximum, 0d, _bins.firstEntry().getValue().getTarget().init());
+ Bin<T> minBin = new Bin(_minimum, 0d, _bins.first().getTarget().init());
+ Bin<T> maxBin = new Bin(_maximum, 0d, _bins.first().getTarget().init());
binSumMap.put(0d, minBin);
- binSumMap.put((double) _totalCount, maxBin);
+ binSumMap.put((double) getTotalCount(), maxBin);
- for (Bin<T> bin : _bins.values()) {
+ for (Bin<T> bin : getBins()) {
try {
double sum = sum(bin.getMean());
binSumMap.put(sum, bin);
@@ -756,7 +713,7 @@ private double findPointForSum(double s, TreeMap<Double, Bin<T>> binSumMap) {
double result;
if (s <= 0) {
result = _minimum;
- } else if (s >= _totalCount) {
+ } else if (s >= _bins.getTotalCount()) {
result = _maximum;
} else {
Entry<Double, Bin<T>> sumEntry = binSumMap.floorEntry(s);
@@ -789,56 +746,6 @@ private double findPointForSum(double s, TreeMap<Double, Bin<T>> binSumMap) {
return result;
}
- private void updateGaps(Bin<T> newBin) {
- Entry<Double, Bin<T>> prevEntry = _bins.lowerEntry(newBin.getMean());
- if (prevEntry != null) {
- updateGaps(prevEntry.getValue(), newBin);
- }
-
- Entry<Double, Bin<T>> nextEntry = _bins.higherEntry(newBin.getMean());
- if (nextEntry != null) {
- updateGaps(newBin, nextEntry.getValue());
- }
- }
-
- private void updateGaps(Bin<T> prevBin, Bin<T> nextBin) {
- double gapWeight = nextBin.getMean() - prevBin.getMean();
- if (_countWeightedGaps) {
- gapWeight *= Math.log(Math.E + Math.min(prevBin.getCount(), nextBin.getCount()));
- }
-
- Gap<T> newGap = new Gap<T>(prevBin, nextBin, gapWeight);
-
- Gap<T> prevGap = _binsToGaps.get(prevBin.getMean());
- if (prevGap != null) {
- _gaps.remove(prevGap);
- }
-
- _binsToGaps.put(prevBin.getMean(), newGap);
- _gaps.add(newGap);
- }
-
- private void mergeBins() {
- while (_bins.size() > _maxBins) {
- Gap<T> smallestGap = _gaps.pollFirst();
- Bin<T> newBin = smallestGap.getStartBin().combine(smallestGap.getEndBin());
-
- Gap<T> followingGap = _binsToGaps.get(smallestGap.getEndBin().getMean());
- if (followingGap != null) {
- _gaps.remove(followingGap);
- }
-
- _bins.remove(smallestGap.getStartBin().getMean());
- _bins.remove(smallestGap.getEndBin().getMean());
-
- _binsToGaps.remove(smallestGap.getStartBin().getMean());
- _binsToGaps.remove(smallestGap.getEndBin().getMean());
-
- updateGaps(newBin);
- _bins.put(newBin.getMean(), newBin);
- }
- }
-
private static Double findZ(double a, double b, double c) {
Double resultRoot = null;
ArrayList<Double> candidateRoots = solveQuadratic(a, b, c);
@@ -864,20 +771,15 @@ private static Double findZ(double a, double b, double c) {
return roots;
}
+ public enum BinReservoirType {tree, array};
public enum TargetType {none, numeric, categorical, group, histogram};
private TargetType _targetType;
- private final int _maxBins;
- private final TreeMap<Double, Bin<T>> _bins;
- private final TreeSet<Gap<T>> _gaps;
- private final HashMap<Double, Gap<T>> _binsToGaps;
+ private final BinReservoir<T> _bins;
private final DecimalFormat _decimalFormat;
- private final boolean _countWeightedGaps;
private ArrayList<TargetType> _groupTypes;
private HashMap<Object, Integer> _indexMap;
- private long _totalCount;
private long _missingCount;
private T _missingTarget;
private Double _minimum;
private Double _maximum;
- private Long _freezeThreshold;
}
157 src/java/com/bigml/histogram/TreeBinReservoir.java
@@ -0,0 +1,157 @@
+/**
+ * Copyright 2013 BigML
+ * Licensed under the Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ */
+package com.bigml.histogram;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+/**
+ * This class implements bin operations (insertions, merges, etc.) for a histogram.
+ * This implementation is best for histograms with a large (>256) number of bins.
+ * It uses tree data structures to give O(log N) insert performance with regard to
+ * the number of bins in the histogram. For histograms with fewer bins, the
+ * ArrayBinReservoir class offers faster insert performance.
+ */
+public class TreeBinReservoir<T extends Target> extends BinReservoir<T> {
+
+ public TreeBinReservoir(int maxBins, boolean weightGaps, Long freezeThreshold) {
+ super(maxBins, weightGaps, freezeThreshold);
+ _bins = new TreeMap<Double, Bin<T>>();
+ _gaps = new TreeSet<Gap<T>>();
+ _binsToGaps = new HashMap<Double, Gap<T>>();
+ }
+
+ @Override
+ public void insert(Bin<T> bin) {
+ addTotalCount(bin);
+ Bin<T> existingBin = get(bin.getMean());
+ if (isFrozen() && getBins().size() == getMaxBins()) {
+ Double floorDiff = Double.MAX_VALUE;
+ Bin<T> floorBin = floor(bin.getMean());
+ if (floorBin != null) {
+ floorDiff = Math.abs(floorBin.getMean() - bin.getMean());
+ }
+ Double ceilDiff = Double.MAX_VALUE;
+ Bin<T> ceilBin = ceiling(bin.getMean());
+ if (ceilBin != null) {
+ ceilDiff = Math.abs(ceilBin.getMean() - bin.getMean());
+ }
+ if (floorDiff <= ceilDiff) {
+ floorBin.sumUpdate(bin);
+ } else {
+ ceilBin.sumUpdate(bin);
+ }
+ } else if (existingBin != null) {
+ existingBin.sumUpdate(bin);
+ if (isWeightGaps()) {
+ updateGaps(existingBin);
+ }
+ } else {
+ updateGaps(bin);
+ _bins.put(bin.getMean(), bin);
+ }
+ }
+
+ @Override
+ public Bin<T> first() {
+ return binFromEntry(_bins.firstEntry());
+ }
+
+ @Override
+ public Bin<T> last() {
+ return binFromEntry(_bins.lastEntry());
+ }
+
+ @Override
+ public Bin<T> get(double p) {
+ return _bins.get(p);
+ }
+
+ @Override
+ public Bin<T> floor(double p) {
+ return binFromEntry(_bins.floorEntry(p));
+ }
+
+ @Override
+ public Bin<T> ceiling(double p) {
+ return binFromEntry(_bins.ceilingEntry(p));
+ }
+
+ @Override
+ public Bin<T> higher(double p) {
+ return binFromEntry(_bins.higherEntry(p));
+ }
+
+ @Override
+ public Bin<T> lower(double p) {
+ return binFromEntry(_bins.lowerEntry(p));
+ }
+
+ @Override
+ public Collection<Bin<T>> getBins() {
+ return _bins.values();
+ }
+
+ @Override
+ public void merge() {
+ while (_bins.size() > getMaxBins()) {
+ Gap<T> smallestGap = _gaps.pollFirst();
+ Bin<T> newBin = smallestGap.getStartBin().combine(smallestGap.getEndBin());
+
+ Gap<T> followingGap = _binsToGaps.get(smallestGap.getEndBin().getMean());
+ if (followingGap != null) {
+ _gaps.remove(followingGap);
+ }
+
+ _bins.remove(smallestGap.getStartBin().getMean());
+ _bins.remove(smallestGap.getEndBin().getMean());
+ _binsToGaps.remove(smallestGap.getStartBin().getMean());
+ _binsToGaps.remove(smallestGap.getEndBin().getMean());
+
+ updateGaps(newBin);
+ _bins.put(newBin.getMean(), newBin);
+ }
+ }
+
+ private void updateGaps(Bin<T> newBin) {
+ Bin<T> prev = lower(newBin.getMean());
+ if (prev != null) {
+ updateGaps(prev, newBin);
+ }
+
+ Bin<T> next = higher(newBin.getMean());
+ if (next != null) {
+ updateGaps(newBin, next);
+ }
+ }
+
+ private void updateGaps(Bin<T> prev, Bin<T> next) {
+ Gap<T> newGap = new Gap<T>(prev, next, gapWeight(prev, next));
+
+ Gap<T> prevGap = _binsToGaps.get(prev.getMean());
+ if (prevGap != null) {
+ _gaps.remove(prevGap);
+ }
+
+ _binsToGaps.put(prev.getMean(), newGap);
+ _gaps.add(newGap);
+ }
+
+ private Bin<T> binFromEntry(Entry<Double, Bin<T>> entry) {
+ if (entry == null) {
+ return null;
+ } else {
+ return entry.getValue();
+ }
+ }
+
+ private final TreeMap<Double, Bin<T>> _bins;
+ private final TreeSet<Gap<T>> _gaps;
+ private final HashMap<Double, Gap<T>> _binsToGaps;
+}
7 test/bigml/histogram/test/core.clj
@@ -323,3 +323,10 @@
(-> (create)
(insert! 0)
(density 0)))))
+
+(deftest reservoir-type
+ (let [data (normal-data 1000)
+ array (reduce insert! (create :reservoir :array) data)
+ tree (reduce insert! (create :reservoir :tree) data)]
+ (= (bins array) (bins tree))
+ (= (uniform array 4) (uniform tree 4))))
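
Note that the bare `=` forms in `reservoir-type` evaluate but do not assert anything under `clojure.test`; a sketch of the same checks wrapped in `is` (assuming the `bins`, `uniform`, and `normal-data` helpers already used in this namespace):

```clojure
(deftest reservoir-types-agree
  (let [data (normal-data 1000)
        array (reduce insert! (create :reservoir :array) data)
        tree (reduce insert! (create :reservoir :tree) data)]
    (is (= (bins array) (bins tree)))
    (is (= (uniform array 4) (uniform tree 4)))))
```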