Permalink
Browse files

Adds variance, mean, and pdf/cdf functions, plus 'flexible' histogram edges (sum points beyond the first/last bin mean are estimated rather than rejected)

  • Loading branch information...
1 parent 3c8606f commit ddb8fa614253a1ef54bd8acc2f33fb968bcc3148 @ashenfad ashenfad committed Jun 1, 2012
View
2 project.clj
@@ -1,4 +1,4 @@
-(defproject histogram "1.9.4"
+(defproject histogram "1.9.5"
:description "Dynamic/streaming histograms"
:source-path "src/clj"
:java-source-path "src/java"
View
36 src/clj/histogram/core.clj
@@ -259,8 +259,8 @@
(let [l-mean (:mean (last bins))
f-mean (:mean (first bins))]
(if (and buffer? (second bins))
- {:min (- f-mean (* 1.1 (- (:mean (second bins)) f-mean)))
- :max (+ l-mean (* 1.1 (- l-mean (:mean (last (drop-last bins))))))}
+ {:min (- f-mean (apply - (map :mean (reverse (take 2 bins)))))
+ :max (+ l-mean (apply - (map :mean (reverse (take-last 2 bins)))))}
{:min f-mean
:max l-mean})))))
@@ -293,3 +293,35 @@
(when maximum (.setMaximum hist maximum))
(when missing-bin (insert-bin! hist missing-bin))
hist))
+
+(defn mean
+ "Returns the mean over the points inserted into the histogram."
+ [^Histogram hist]
+ (:mean (first (-> (create :bins 1)
+ (merge! hist)
+ (bins)))))
+
+(defn cdf
+ "Returns the cumulative distribution function for histogram."
+ [^Histogram hist]
+ (let [total (total-count hist)]
+ #(/ (sum hist %) total)))
+
+(defn pdf
+ "Returns the probability density function for the histogram."
+ [^Histogram hist]
+ (let [total (total-count hist)]
+ #(/ (density hist %) total)))
+
+(defn variance
+ "Returns an estimate of the variance for the histogram."
+ [^Histogram hist]
+ (let [h-mean (mean hist)
+ h-count (total-count hist)]
+ (when (pos? h-count)
+ (/ (reduce (fn [v {:keys [mean count]}]
+ (let [diff (- mean h-mean)]
+ (+ v (* count diff diff))))
+ 0
+ (bins hist))
+ h-count))))
View
29 src/java/com/bigml/histogram/Histogram.java
@@ -275,14 +275,35 @@ public double sum(double p) throws SumOutOfRangeException {
* @param p the sum point
*/
public SumResult<T> extendedSum(double p) throws SumOutOfRangeException {
- SumResult<T> result = null;
+ SumResult<T> result;
+
+ if (_bins.isEmpty()) {
+ throw new SumOutOfRangeException("Cannot sum with an empty histogram.");
+ }
double min = _bins.firstKey();
double max = _bins.lastKey();
- if (p < min || p > max) {
- throw new SumOutOfRangeException("Sum point " + p + " should be between "
- + min + " and " + max);
+ if (p < min) {
+ Bin<T> bin = _bins.firstEntry().getValue();
+ double distanceRatio = (1 - (bin.getMean() - p) / binGapRange(p, bin)) / 2;
+ if (distanceRatio > 0) {
+ double countSum = distanceRatio * bin.getCount();
+ T targetSum = (T) bin.getTarget().clone().mult(distanceRatio);
+ result = new SumResult<T>(countSum, targetSum);
+ } else {
+ result = new SumResult<T>(0, (T) bin.getTarget().init());
+ }
+ } else if (p > max) {
+ Bin<T> bin = _bins.lastEntry().getValue();
+ double distanceRatio = (1 - (p - bin.getMean()) / binGapRange(p, bin)) / 2;
+ if (distanceRatio > 0) {
+ double countSum = getTotalCount() - distanceRatio * bin.getCount();
+ T targetSum = (T) getTotalTargetSum().sum(bin.getTarget().clone().mult(-distanceRatio));
+ result = new SumResult<T>(countSum, targetSum);
+ } else {
+ result = new SumResult<T>(getTotalCount(), getTotalTargetSum());
+ }
} else if (p == max) {
Bin<T> lastBin = _bins.lastEntry().getValue();
View
8 test/histogram/test/core.clj
@@ -32,11 +32,13 @@
(/ points 2)
(/ points 50)))))
-(deftest median-test
+(deftest median-mean-test
(let [points 10000]
(is (about= (median (reduce insert! (create) (rand-data points)))
0.5 0.05))
(is (about= (median (reduce insert! (create) (normal-data points)))
+ 0 0.05))
+ (is (about= (mean (reduce insert! (create) (normal-data points)))
0 0.05))))
(deftest mean-test
@@ -258,3 +260,7 @@
(is (= (missing-bin hist1) (missing-bin hist2)))
(is (= (minimum hist1) (minimum hist2)))
(is (= (maximum hist1) (maximum hist2)))))
+
+(deftest variance-test
+ (is (about= (variance (reduce insert! (create) (normal-data 10000)))
+ 1 0.05)))
View
60 test/histogram/test/examples.clj
@@ -7,71 +7,55 @@
;; 100K samples from a normal distribution (mean 0 and variance 1)
(def normal-data (repeatedly 100000 #(dst/draw (dst/normal-distribution))))
-(defn multi-density-chart [hists]
- (let [bounds (hst/bounds (first hists) true)]
+(defn multi-pdf-chart [hists]
+ (let [min (reduce min (map (comp :min hst/bounds) hists))
+ max (reduce max (map (comp :max hst/bounds) hists))]
(core/view
(reduce (fn [c h]
- (charts/add-function c #(hst/density h %)
- (:min bounds)
- (:max bounds)))
- (charts/function-plot #(hst/density (first hists) %)
- (:min bounds)
- (:max bounds))
+ (charts/add-function c (hst/pdf h) min max))
+ (charts/function-plot (hst/pdf (first hists)) min max)
(next hists)))))
(defn sum-density-chart [hist]
- (let [bounds (hst/bounds hist)]
- (core/view
- (-> (charts/function-plot #(hst/sum hist %)
- (:min bounds)
- (:max bounds))
- (charts/add-function #(hst/density hist %)
- (:min bounds)
- (:max bounds))))))
+ (let [{:keys [min max]} (hst/bounds hist true)]
+ (core/view (-> (charts/function-plot #(hst/sum hist %) min max)
+ (charts/add-function #(hst/density hist %) min max)))))
-(defn extended-sum-chart [hist]
- (let [bounds (hst/bounds hist)]
- (core/view
- (-> (charts/function-plot #(:sum (hst/extended-sum hist %))
- (:min bounds)
- (:max bounds))
- (charts/add-function #(:sum (:target (hst/extended-sum hist %)))
- (:min bounds)
- (:max bounds))))))
+(defn cdf-pdf-chart [hist]
+ (let [{:keys [min max]} (hst/bounds hist true)]
+ (core/view (-> (charts/function-plot (hst/cdf hist) min max)
+ (charts/add-function (hst/pdf hist) min max)))))
-(defn density-target-chart [hist]
- (let [bounds (hst/bounds hist)]
+(defn pdf-target-chart [hist]
+ (let [{:keys [min max]} (hst/bounds hist true)]
(core/view
- (-> (charts/function-plot #(hst/density hist %)
- (:min bounds)
- (:max bounds))
- (charts/add-function #(:sum (hst/average-target hist %))
- (:min bounds)
- (:max bounds))))))
+ (-> (charts/function-plot (hst/pdf hist) min max)
+ (charts/add-function #(:sum (hst/average-target hist %)) min max)))))
;; Builds and charts a histogram for the normal distribution.
(defn- normal-example []
(let [hist (reduce hst/insert! (hst/create) normal-data)]
(println "Total sum of points less than 0:" (hst/sum hist 0))
(println "Quartile splits:" (hst/uniform hist 4))
- (sum-density-chart hist)))
+ (sum-density-chart hist)
+ (cdf-pdf-chart hist)))
(defn- varying-bins-example []
- (multi-density-chart
+ (multi-pdf-chart
[(reduce hst/insert! (hst/create :bins 16) normal-data)
(reduce hst/insert! (hst/create :bins 64) normal-data)]))
(defn- gap-weighted-example []
- (multi-density-chart
+ (multi-pdf-chart
[(reduce hst/insert! (hst/create :bins 16 :gap-weighted? true) normal-data)
(reduce hst/insert! (hst/create :bins 16 :gap-weighted? false) normal-data)]))
(defn- numeric-target-example []
- (let [target-data (map (fn [x] [x (+ 10000 (* 10000 (Math/sin x)))])
+ (let [target-data (map (fn [x] [x (Math/sin x)])
normal-data)
hist (reduce #(apply hst/insert! %1 %2)
(hst/create) target-data)]
- (density-target-chart hist)))
+ (pdf-target-chart hist)))
(defn -main [& args]
(normal-example)

0 comments on commit ddb8fa6

Please sign in to comment.