/
test_1.clj
53 lines (39 loc) · 1.89 KB
/
test_1.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
(ns vrs.test-1
(:require [incanter.core :as ic]
[incanter.charts :as icharts]
[clojure.pprint :as pp]))
; Uses Vitter's reservoir sampling algorithm to take n items
; without replacement, from a iterable of items of unknown,
; arbitrary length. Inspired by
; https://www.omniref.com/ruby/gems/stream_sampler/0.0.1/symbols/StreamSampler
; Given some large pool of data, we'd like to confirm that over several runs the
; distribution of randomly selected samples is uniform.
(defn- replace-in-output [sample-size output [idx item]]
(let [rnd (rand-int idx)]
(if (< idx sample-size)
; Pick any item if we don't yet have enough samples
(assoc output idx item)
; Otherwise pick current item if rnd is less than sample-size
(if (< rnd sample-size)
(assoc output rnd item)
; Finally, if rnd >= sample-size: Keep output as-is
output))))
(defn get-random-samples [sample-size data]
(reduce (partial replace-in-output sample-size) [] (map #(vec [%1 %2]) (range) data)))
; The below data can be of any kind, but we'll stick to a simple
; sequence of numbers for the example:
(def data (range 1 1000))
; How many samples to randomly fetch
(def sample-size 25)
;(println (get-random-samples sample-size data))
; Ok, so above we have the get-random-samples function we can now use..
; Next we want to get the histogram data we need to look at the distribution.
(def sample-count 10000)
(defn- update-map-count [output item]
(assoc output item (inc (get output item 0))))
(def sample-seq (map (fn [& _] (get-random-samples sample-size data)) (range sample-count)))
;(pp/pprint sample-seq)
(def distribution-map (reduce update-map-count (sorted-map) (mapcat identity sample-seq)))
;(pp/pprint distribution-map)
(ic/view (icharts/histogram (mapcat identity sample-seq) :nbins 1000))
; Formally testing that the distribution is uniform is left to the reader..