Browse files

Adds sum-of-squares to histograms with numeric targets

  • Loading branch information...
1 parent ba73316 commit 962437b744d49a9551a052a3fbcf34ab46f61191 @ashenfad ashenfad committed Oct 11, 2012
View
64 README.md
@@ -165,8 +165,9 @@ variable called the *target*.
The target may be either numeric or categorical. The `insert!` fn is
overloaded to accept either type of target. Each histogram bin will
-contain information summarizing the target. For numerics the targets
-sums are tracked. For categoricals a map of counts is maintained.
+contain information summarizing the target. For numeric targets, the
+sum and sum-of-squares are tracked. For categorical targets, a map of
+counts is maintained.
```clojure
examples> (-> (create)
@@ -175,18 +176,30 @@ examples> (-> (create)
(insert! 3 7)
(insert! 3 6)
(bins))
-({:target {:sum 9.0, :missing-count 0.0}, :mean 1.0, :count 1}
- {:target {:sum 8.0, :missing-count 0.0}, :mean 2.0, :count 1}
- {:target {:sum 13.0, :missing-count 0.0}, :mean 3.0, :count 2})
+({:target {:sum 9.0, :sum-squares 81.0, :missing-count 0.0},
+ :mean 1.0,
+ :count 1}
+ {:target {:sum 8.0, :sum-squares 64.0, :missing-count 0.0},
+ :mean 2.0,
+ :count 1}
+ {:target {:sum 13.0, :sum-squares 85.0, :missing-count 0.0},
+ :mean 3.0,
+ :count 2})
examples> (-> (create)
(insert! 1 :a)
(insert! 2 :b)
(insert! 3 :c)
(insert! 3 :d)
(bins))
-({:target {:counts {:a 1.0}, :missing-count 0.0}, :mean 1.0, :count 1}
- {:target {:counts {:b 1.0}, :missing-count 0.0}, :mean 2.0, :count 1}
- {:target {:counts {:d 1.0, :c 1.0}, :missing-count 0.0}, :mean 3.0, :count 2})
+({:target {:counts {:a 1.0}, :missing-count 0.0},
+ :mean 1.0,
+ :count 1}
+ {:target {:counts {:b 1.0}, :missing-count 0.0},
+ :mean 2.0,
+ :count 1}
+ {:target {:counts {:d 1.0, :c 1.0}, :missing-count 0.0},
+ :mean 3.0,
+ :count 2})
```
Mixing target types isn't allowed:
@@ -246,13 +259,14 @@ produces values close to original target:
```clojure
examples> (def view-target (fn [x] {:actual (make-y x)
- :approx (average-target hist x)}))
+ :approx (:sum (average-target hist x))}))
+#'examples/view-target
examples> (view-target 0)
-{:actual 0.0, :approx {:sum -0.04696, :missing-count 0.0}}
-examples> (view-target (/ Math/PI 2))
-{:actual 1.0, :approx {:sum 0.99698, :missing-count 0.0}}
+{:actual 0.0, :approx -0.04261679840707788}
+examples> (view-target (/ Math/PI 2))
+{:actual 1.0, :approx 0.9968169965429206}
examples> (view-target Math/PI)
-{:actual 1.22464E-16, :approx {:sum -0.04881, :missing-count 0.0}}
+{:actual 0.0, :approx 0.021364059655214544}
```
# Missing Values
@@ -331,15 +345,21 @@ examples> (-> (create :group-types [:categorical :numeric])
(insert! 3 [:c 7])
(insert! 1 [:d 6])
(bins))
-({:target ({:counts {:a 1.0, :d 1.0}, :missing-count 0.0}
- {:sum 6.0, :missing-count 1.0}),
- :mean 1.0, :count 2}
- {:target ({:counts {:b 1.0}, :missing-count 0.0}
- {:sum 8.0, :missing-count 0.0}),
- :mean 2.0, :count 1}
- {:target ({:counts {:c 1.0}, :missing-count 0.0}
- {:sum 7.0, :missing-count 0.0}),
- :mean 3.0, :count 1})
+({:target
+ ({:counts {:d 1.0, :a 1.0}, :missing-count 0.0}
+ {:sum 6.0, :sum-squares 36.0, :missing-count 1.0}),
+ :mean 1.0,
+ :count 2}
+ {:target
+ ({:counts {:b 1.0}, :missing-count 0.0}
+ {:sum 8.0, :sum-squares 64.0, :missing-count 0.0}),
+ :mean 2.0,
+ :count 1}
+ {:target
+ ({:counts {:c 1.0}, :missing-count 0.0}
+ {:sum 7.0, :sum-squares 49.0, :missing-count 0.0}),
+ :mean 3.0,
+ :count 1})
```
# Freezing a Histogram
View
2 project.clj
@@ -1,4 +1,4 @@
-(defproject histogram "1.9.9"
+(defproject histogram "2.0.0"
:description "Dynamic/streaming histograms"
:source-path "src/clj"
:java-source-path "src/java"
View
3 src/clj/histogram/core.clj
@@ -140,7 +140,8 @@
nil)
(defmethod scrub-target NumericTarget [^NumericTarget target]
- {:sum (.getTarget target)
+ {:sum (.getSum target)
+ :sum-squares (.getSumSquares target)
:missing-count (.getMissingCount target)})
(defmethod scrub-target MapCategoricalTarget [^MapCategoricalTarget target]
View
2 src/java/com/bigml/histogram/Histogram.java
@@ -349,7 +349,7 @@ public double sum(double p) throws SumOutOfRangeException {
NumericTarget countTarget = (NumericTarget) computeSum(bpRatio, new NumericTarget(prevCount),
new NumericTarget(bin_i.getCount()), new NumericTarget(bin_i1.getCount()));
- double countSum = countTarget.getTarget();
+ double countSum = countTarget.getSum();
T targetSum = (T) computeSum(bpRatio, prevTargetSum, bin_i.getTarget(), bin_i1.getTarget());
View
24 src/java/com/bigml/histogram/MapCategoricalTarget.java
@@ -10,26 +10,26 @@
public class MapCategoricalTarget extends Target<MapCategoricalTarget> implements CategoricalTarget {
public MapCategoricalTarget(Object category) {
- _target = new HashMap<Object, Double>(1,1);
- _target.put(category, 1d);
+ _counts = new HashMap<Object, Double>(1,1);
+ _counts.put(category, 1d);
}
public MapCategoricalTarget(HashMap<Object, Double> targetCounts, double missingCount) {
- _target = targetCounts;
- _target.put(null, missingCount);
+ _counts = targetCounts;
+ _counts.put(null, missingCount);
}
public MapCategoricalTarget(HashMap<Object, Double> targetCounts) {
- _target = targetCounts;
+ _counts = targetCounts;
}
public HashMap<Object, Double> getCounts() {
- return _target;
+ return _counts;
}
@Override
public double getMissingCount() {
- Double missingCount = _target.get(null);
+ Double missingCount = _counts.get(null);
return missingCount == null ? 0 : missingCount;
}
@@ -41,7 +41,7 @@ public TargetType getTargetType() {
@Override
protected void addJSON(JSONArray binJSON, DecimalFormat format) {
JSONObject counts = new JSONObject();
- for (Entry<Object,Double> categoryCount : _target.entrySet()) {
+ for (Entry<Object,Double> categoryCount : _counts.entrySet()) {
Object category = categoryCount.getKey();
double count = categoryCount.getValue();
counts.put(category, Double.valueOf(format.format(count)));
@@ -54,11 +54,11 @@ protected MapCategoricalTarget sum(MapCategoricalTarget target) {
for (Entry<Object, Double> categoryCount : target.getCounts().entrySet()) {
Object category = categoryCount.getKey();
- Double oldCount = _target.get(category);
+ Double oldCount = _counts.get(category);
oldCount = (oldCount == null) ? 0 : oldCount;
double newCount = oldCount + categoryCount.getValue();
- _target.put(category, newCount);
+ _counts.put(category, newCount);
}
return this;
@@ -75,13 +75,13 @@ protected MapCategoricalTarget mult(double multiplier) {
@Override
protected MapCategoricalTarget clone() {
- return new MapCategoricalTarget(new HashMap<Object, Double>(_target));
+ return new MapCategoricalTarget(new HashMap<Object, Double>(_counts));
}
@Override
protected MapCategoricalTarget init() {
return new MapCategoricalTarget(new HashMap<Object, Double>());
}
- private HashMap<Object, Double> _target;
+ private HashMap<Object, Double> _counts;
}
View
48 src/java/com/bigml/histogram/NumericTarget.java
@@ -6,19 +6,32 @@
public class NumericTarget extends Target<NumericTarget> {
+ private NumericTarget(Double target, Double sumSquares, double missingCount) {
+ _sum = target;
+ _sumSquares = sumSquares;
+ _missingCount = missingCount;
+ }
+
public NumericTarget(Double target, double missingCount) {
- _target = target;
+ _sum = target;
+ if (target != null) {
+ _sumSquares = target * target;
+ }
_missingCount = missingCount;
}
public NumericTarget(Double target) {
this(target, target == null ? 1 : 0);
}
- public Double getTarget() {
- return _target;
+ public Double getSum() {
+ return _sum;
}
-
+
+ public Double getSumSquares() {
+ return _sumSquares;
+ }
+
@Override
public double getMissingCount() {
return _missingCount;
@@ -31,15 +44,16 @@ public TargetType getTargetType() {
@Override
public String toString() {
- return String.valueOf(_target);
+ return String.valueOf(_sum) + "," + String.valueOf(_sumSquares);
}
@Override
protected void addJSON(JSONArray binJSON, DecimalFormat format) {
- if (_target == null) {
+ if (_sum == null) {
binJSON.add(null);
} else {
- binJSON.add(Double.valueOf(format.format(_target)));
+ binJSON.add(Double.valueOf(format.format(_sum)));
+ binJSON.add(Double.valueOf(format.format(_sumSquares)));
}
}
@@ -50,27 +64,31 @@ protected NumericTarget init() {
@Override
protected NumericTarget clone() {
- return new NumericTarget(_target, _missingCount);
+ return new NumericTarget(_sum, _sumSquares, _missingCount);
}
- private Double _target;
+ private Double _sum;
+ private Double _sumSquares;
private double _missingCount;
@Override
protected NumericTarget sum(NumericTarget target) {
- if (_target == null && target.getTarget() != null) {
- _target = target.getTarget();
- } else if (_target != null && target.getTarget() != null){
- this._target += target.getTarget();
+ if (_sum == null && target.getSum() != null) {
+ _sum = target.getSum();
+ _sumSquares = target.getSumSquares();
+ } else if (_sum != null && target.getSum() != null){
+ _sum += target.getSum();
+ _sumSquares += target.getSumSquares();
}
_missingCount += target.getMissingCount();
return this;
}
@Override
protected NumericTarget mult(double multiplier) {
- if (_target != null) {
- _target *= multiplier;
+ if (_sum != null) {
+ _sum *= multiplier;
+ _sumSquares *= multiplier;
}
_missingCount *= multiplier;
return this;
View
22 test/histogram/test/core.clj
@@ -114,18 +114,24 @@
(deftest group-test
(let [points 10000
+ data (group-data points false)
hist (reduce (fn [h [x y]] (insert! h x y))
(create)
- (group-data points false))
- ext-sum (extended-sum hist 0.5)]
+ data)
+ target (:target (extended-sum hist 0.5))]
(is (= (target-type hist) :group))
(is (= (group-types hist) '(:numeric :categorical)))
- (is (about= (:sum (first (:target ext-sum)))
+ (is (about= (:sum (first target))
(/ points 4)
- (/ points 100)))
- (is (about= (:orange (:counts (second (:target ext-sum))))
+ (/ points 50)))
+ (is (about= (:sum-squares (first target))
+ (reduce + (map #(* % %)
+ (take (int (/ (count data) 2))
+ (map first data))))
+ 150))
+ (is (about= (:orange (:counts (second target)))
(/ points 6)
- (/ points 100)))))
+ (/ points 50)))))
(deftest weighted-gap-test
;; Histograms using weighted gaps are less eager to merge bins with
@@ -172,10 +178,10 @@
(is (= result
'({:mean 1.0
:count 2
- :target {:sum 1.0 :missing-count 1.0}}
+ :target {:sum 1.0 :sum-squares 1.0 :missing-count 1.0}}
{:mean 5.0
:count 2
- :target {:sum 2.0 :missing-count 1.0}})))))
+ :target {:sum 2.0 :sum-squares 4.0 :missing-count 1.0}})))))
(deftest categorical-missing-test
(let [data [[1 :foo] [1 nil] [4 :bar] [6 nil]]

0 comments on commit 962437b

Please sign in to comment.