Merge pull request #26 from elazarl/master
Explain logarithmic scale
jehiah committed Jul 5, 2015
2 parents 136ac9f + c9bfb89 commit e9bc1ad
141 changes: 86 additions & 55 deletions data_hacks/histogram.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#
# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -31,16 +31,17 @@
from optparse import OptionParser
from collections import namedtuple


class MVSD(object):
""" A class that calculates a running Mean / Variance / Standard Deviation"""
"A class that calculates a running Mean / Variance / Standard Deviation"
def __init__(self):
self.is_started = False
self.ss = Decimal(0) # (running) sum of square deviations from mean
self.m = Decimal(0) # (running) mean
self.total_w = Decimal(0) # weight of items seen
self.ss = Decimal(0) # (running) sum of square deviations from mean
self.m = Decimal(0) # (running) mean
self.total_w = Decimal(0) # weight of items seen

def add(self, x, w=1):
""" add another datapoint to the Mean / Variance / Standard Deviation"""
"add another datapoint to the Mean / Variance / Standard Deviation"
if not isinstance(x, Decimal):
x = Decimal(x)
if not self.is_started:
@@ -50,32 +51,33 @@ def add(self, x, w=1):
self.is_started = True
else:
temp_w = self.total_w + w
self.ss += (self.total_w * w * (x - self.m) * (x - self.m )) / temp_w
self.m += (x - self.m) / temp_w
self.ss += (self.total_w * w * (x - self.m) *
(x - self.m)) / temp_w
self.m += (x - self.m) / temp_w
self.total_w = temp_w

# print "added %-2d mean=%0.2f var=%0.2f std=%0.2f" % (x, self.mean(), self.var(), self.sd())


def var(self):
return self.ss / self.total_w

def sd(self):
return math.sqrt(self.var())

def mean(self):
return self.m

DataPoint = namedtuple('DataPoint', ['value', 'count'])


def test_mvsd():
mvsd = MVSD()
for x in range(10):
mvsd.add(x)

assert '%.2f' % mvsd.mean() == "4.50"
assert '%.2f' % mvsd.var() == "8.25"
assert '%.14f' % mvsd.sd() == "2.87228132326901"


def load_stream(input_stream, agg_value_key, agg_key_value):
for line in input_stream:
clean_line = line.strip()
@@ -97,37 +99,42 @@ def load_stream(input_stream, agg_value_key, agg_key_value):
logging.exception('failed %r', line)
print >>sys.stderr, "invalid line %r" % line


def median(values, key=None):
if not key:
key= lambda x: x
key = None # map and sort accept None as identity
length = len(values)
if length%2:
if length % 2:
median_indeces = [length/2]
else:
median_indeces = [length/2-1, length/2]

values = sorted(values, key=key)
return sum(map(key, [values[i] for i in median_indeces])) / len(median_indeces)
return sum(map(key,
[values[i] for i in median_indeces])) / len(median_indeces)


def test_median():
assert 6 == median([8,7,9,1,2,6,3]) # odd-sized list
assert 4 == median([4,5,2,1,9,10]) # even-sized int list. (4+5)/2 = 4
assert "4.50" == "%.2f" % median([4.0,5,2,1,9,10]) #even-sized float list. (4.0+5)/2 = 4.5
assert 6 == median([8, 7, 9, 1, 2, 6, 3]) # odd-sized list
assert 4 == median([4, 5, 2, 1, 9, 10]) # even-sized int list. (4+5)/2 = 4
# even-sized float list. (4.0+5)/2 = 4.5
assert "4.50" == "%.2f" % median([4.0, 5, 2, 1, 9, 10])


def histogram(stream, options):
"""
Loop over the stream and add each entry to the dataset, printing out at the end
stream yields Decimal()
Loop over the stream and add each entry to the dataset, printing out at the
end.
stream yields Decimal()
"""
if not options.min or not options.max:
# glob the iterator here so we can do min/max on it
data = list(stream)
else:
data = stream
bucket_scale = 1

if options.min:
min_v = Decimal(options.min)
else:
@@ -154,7 +161,7 @@ def histogram(stream, options):
# if the last value is smaller than the maximum, replace it
if bound_sort[-1] < max_v:
bound_sort[-1] = max_v

# iterate through the sorted list and append to boundaries
for x in bound_sort:
if x >= min_v and x <= max_v:
@@ -163,17 +170,30 @@
boundaries.append(max_v)
break

# beware: the min_v is not included in the boundaries, so no need to do a -1!
# beware: the min_v is not included in the boundaries,
# so no need to do a -1!
bucket_counts = [0 for x in range(len(boundaries))]
buckets = len(boundaries)
elif options.logscale:
buckets = options.buckets and int(options.buckets) or 10
if buckets <= 0:
raise ValueError('# of buckets must be > 0')
fx = lambda k, n: n/(2**(k+1)-1)

def first_bucket_size(k, n):
"""Logarithmic buckets means, the size of bucket i+1 is twice
the size of bucket i.
For k+1 buckets whose sum is n, we have
(note, k+1 buckets, since 0 is counted as well):
\sum_{i=0}^{k} x*2^i = n
x * \sum_{i=0}^{k} 2^i = n
x * (2^{k+1} - 1) = n
x = n/(2^{k+1} - 1)
"""
return n/(2**(k+1)-1)

def log_steps(k, n):
"k logarithmic steps whose sum is n"
x = fx(k-1, n)
x = first_bucket_size(k-1, n)
sum = 0
for i in range(k):
sum += 2**i * x
@@ -195,7 +215,7 @@ def log_steps(k, n):
mvsd = MVSD()
accepted_data = []
for record in data:
samples += record.count
samples += record.count
if options.mvsd:
mvsd.add(record.value, record.count)
accepted_data.append(record)
@@ -207,16 +227,20 @@ def log_steps(k, n):
if record.value <= boundary:
bucket_counts[bucket_postion] += record.count
break

# auto-pick the hash scale
if max(bucket_counts) > 75:
bucket_scale = int(max(bucket_counts) / 75)

print "# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v)

print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" %
(samples, min_v, max_v))
if skipped:
print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '')
print("# %d value%s outside of min/max" %
(skipped, skipped > 1 and 's' or ''))
if options.mvsd:
print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data, key=lambda x: x.value))
print("# Mean = %f; Variance = %f; SD = %f; Median %f" %
(mvsd.mean(), mvsd.var(), mvsd.sd(),
median(accepted_data, key=lambda x: x.value)))
print "# each ∎ represents a count of %d" % bucket_scale
bucket_min = min_v
bucket_max = min_v
@@ -230,39 +254,46 @@ def log_steps(k, n):
if bucket_count:
star_count = bucket_count / bucket_scale
if options.percentage:
percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples))
print format_string % (bucket_min, bucket_max, bucket_count, '∎' * star_count, percentage)

percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) /
Decimal(samples))
print format_string % (bucket_min, bucket_max, bucket_count, '∎' *
star_count, percentage)


if __name__ == "__main__":
parser = OptionParser()
parser.usage = "cat data | %prog [options]"
parser.add_option("-a", "--agg", dest="agg_value_key", default=False, action="store_true",
help="Two column input format, space seperated with value<space>key")
parser.add_option("-A", "--agg-key-value", dest="agg_key_value", default=False, action="store_true",
help="Two column input format, space seperated with key<space>value")
parser.add_option("-a", "--agg", dest="agg_value_key", default=False,
action="store_true", help="Two column input format, " +
"space seperated with value<space>key")
parser.add_option("-A", "--agg-key-value", dest="agg_key_value",
default=False, action="store_true", help="Two column " +
"input format, space seperated with key<space>value")
parser.add_option("-m", "--min", dest="min",
help="minimum value for graph")
help="minimum value for graph")
parser.add_option("-x", "--max", dest="max",
help="maximum value for graph")
help="maximum value for graph")
parser.add_option("-b", "--buckets", dest="buckets",
help="Number of buckets to use for the histogram")
parser.add_option("-l", "--logscale", dest="logscale", default=False, action="store_true",
help="Buckets grow in logarithmic scale")
help="Number of buckets to use for the histogram")
parser.add_option("-l", "--logscale", dest="logscale", default=False,
action="store_true",
help="Buckets grow in logarithmic scale")
parser.add_option("-B", "--custom-buckets", dest="custbuckets",
help="Comma seperated list of bucket edges for the histogram")
parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True,
help="Disable the calculation of Mean, Variance and SD (improves performance)")
help="Comma seperated list of bucket " +
"edges for the histogram")
parser.add_option("--no-mvsd", dest="mvsd", action="store_false",
default=True, help="Disable the calculation of Mean, " +
"Variance and SD (improves performance)")
parser.add_option("-f", "--bucket-format", dest="format", default="%10.4f",
help="format for bucket numbers")
parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true",
help="List percentage for each bar")
help="format for bucket numbers")
parser.add_option("-p", "--percentage", dest="percentage", default=False,
action="store_true", help="List percentage for each bar")

(options, args) = parser.parse_args()
if sys.stdin.isatty():
# if isatty() that means it's run without anything piped into it
parser.print_usage()
print "for more help use --help"
sys.exit(1)
histogram(load_stream(sys.stdin, options.agg_value_key, options.agg_key_value), options)

histogram(load_stream(sys.stdin, options.agg_value_key,
options.agg_key_value), options)
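
A note on the derivation this commit adds: the new first_bucket_size docstring works out that if k buckets cover a range of size n and each bucket is twice as wide as the previous one, the smallest width is x = n/(2^k - 1). Below is a minimal standalone sketch of that calculation (illustrative only; log_bucket_widths is not a function in the commit, though the real code divides Decimal values in the same way).

# -*- coding: utf-8 -*-
# Illustrative sketch (not part of the diff): logarithmic bucket widths as
# derived in the first_bucket_size() docstring. For k doubling buckets that
# together cover a range of size n, the first width x satisfies
#   x * (2**k - 1) == n.
from decimal import Decimal


def log_bucket_widths(k, n):
    """Return the k widths x, 2x, 4x, ... whose sum is n."""
    n = Decimal(n)
    x = n / (2 ** k - 1)                 # width of the first (smallest) bucket
    return [x * 2 ** i for i in range(k)]


if __name__ == "__main__":
    widths = log_bucket_widths(4, 15)    # 4 buckets over a range of size 15
    print([float(w) for w in widths])    # [1.0, 2.0, 4.0, 8.0]
    print(float(sum(widths)))            # 15.0 -- the buckets span the range

The running sums of these widths (1, 3, 7, 15 in the example) are what log_steps accumulates in its loop, so the final bucket boundary lands exactly on the end of the range.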
