Skip to content
Browse files

adding command line args

  • Loading branch information...
1 parent 32118e8 commit 1db934e826ce502d2fa739f677b513a7fa081063 @jehiah jehiah committed Oct 14, 2010
Showing with 90 additions and 25 deletions.
  1. +54 −10 data_hacks/histogram.py
  2. +4 −0 data_hacks/nintey_five_percent.py
  3. +14 −4 data_hacks/run_for.py
  4. +17 −11 data_hacks/sample.py
  5. +1 −0 setup.py
View
64 data_hacks/histogram.py
@@ -15,6 +15,7 @@
import sys
from decimal import Decimal
import math
+from optparse import OptionParser
class MVSD(object):
""" A class that calculates a running Mean / Variance / Standard Deviation"""
@@ -76,36 +77,61 @@ def load_stream(input_stream):
except:
print >>sys.stderr, "invalid line %r" % line
-def histogram(stream):
+def histogram(stream, options):
# we can't iterate on stream because we need to get min/max first and then put it into buckets
- data = list(stream)
- buckets = 10
+ if not options.min or not options.max:
+ # glob the data here so we can do min/max on it
+ data = list(stream)
+ else:
+ data = stream
bucket_scale = 1
- min_v = min(data)
- max_v = max(data)
+ if options.min:
+ min_v = Decimal(options.min)
+ else:
+ min_v = min(data)
+ if options.max:
+ max_v = Decimal(options.max)
+ else:
+ max_v = max(data)
+ buckets = options.buckets and int(options.buckets) or 10
+ if buckets <= 0:
+ raise ValueError('# of buckets must be > 0')
+ if not max_v > min_v:
+ raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
+
diff = max_v - min_v
step = diff / buckets
bucket_counts = [0 for x in range(buckets)]
boundaries = []
for x in range(buckets):
boundaries.append(min_v + (step * (x + 1)))
+ skipped = 0
+ samples = 0
mvsd = MVSD()
for value in data:
- mvsd.add(value)
+ samples +=1
+ if options.mvsd:
+ mvsd.add(value)
# find the bucket this goes in
+ if value < min_v or value > max_v:
+ skipped +=1
+ continue
for bucket_postion, boundary in enumerate(boundaries):
if value <= boundary:
bucket_counts[bucket_postion] +=1
break
- # auto-pick the bucket size
+ # auto-pick the hash scale
if max(bucket_counts) > 75:
bucket_scale = int(max(bucket_counts) / 75)
- print "# NumSamples = %d; Max = %0.2f; Min = %0.2f" % (len(data), max_v, min_v)
- print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd())
+ print "# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v)
+ if skipped:
+ print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '')
+ if options.mvsd:
+ print "# Mean = %f; Variance = %f; SD = %f" % (mvsd.mean(), mvsd.var(), mvsd.sd())
print "# each * represents a count of %d" % bucket_scale
bucket_min = min_v
bucket_max = min_v
@@ -120,4 +146,22 @@ def histogram(stream):
if __name__ == "__main__":
- histogram(load_stream(sys.stdin))
+ # all options are optional; unset values are None and histogram() falls back to the data
+ parser = OptionParser()
+ parser.usage = "cat data | %prog [options]"
+ parser.add_option("-m", "--min", dest="min",
+ help="minimum value for graph")
+ parser.add_option("-x", "--max", dest="max",
+ help="maximum value for graph")
+ parser.add_option("-b", "--buckets", dest="buckets",
+ help="Number of buckets to use for the histogram")
+ parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True,
+ help="Disable the calculation of Mean, Variance and SD. (improves performance)")
+
+ (options, args) = parser.parse_args()
+ if sys.stdin.isatty():
+ # if isatty() that means it's run without anything piped into it
+ parser.print_usage()
+ print "for more help use --help"
+ sys.exit(1)
+ histogram(load_stream(sys.stdin), options)
+
View
4 data_hacks/nintey_five_percent.py
@@ -8,6 +8,7 @@
http://github.com/bitly/data_hacks
"""
import sys
+import os
from decimal import Decimal
def run():
@@ -42,4 +43,7 @@ def calc_95(data, count):
return t
if __name__ == "__main__":
+ # stdin is a terminal, i.e. nothing was piped in: show usage and exit
+ if sys.stdin.isatty():
+ # plain %s: the right-hand operand is a string, so a %(prog)s mapping key would raise TypeError
+ print "Usage: cat data | %s" % os.path.basename(sys.argv[0])
+ sys.exit(1)
run()
View
18 data_hacks/run_for.py
@@ -9,6 +9,7 @@
"""
import time
import sys
+import os
def getruntime(arg):
if not arg:
@@ -17,12 +18,14 @@ def getruntime(arg):
base = int(arg[:-1])
if suffix == "s":
return base
- elif suffix == "h":
+ elif suffix == "m":
return base * 60
+ elif suffix == "h":
+ return base * 60 * 60
elif suffix == "d":
- return base * 60 * 24
+ return base * 60 * 60 * 24
else:
- print >>sys.stderr, "invalid time suffix %r" % arg
+ print >>sys.stderr, "invalid time suffix %r. must be one of s,m,h,d" % arg
def run(runtime):
end = time.time() + runtime
@@ -35,8 +38,15 @@ def run(runtime):
return
if __name__ == "__main__":
+ # plain %s: the right-hand operand is a string, so a %(prog)s mapping key would raise TypeError
+ usage = "Usage: tail -f access.log | %s [time] | ..." % os.path.basename(sys.argv[0])
+ help = "time can be in the format 10s 10m 10h etc"
+ if sys.stdin.isatty():
+ # nothing was piped in: show usage and exit
+ print usage
+ print help
+ sys.exit(1)
+
runtime = getruntime(sys.argv[-1])
if not runtime:
- print >>sys.stderr, "usage: tail -f access.log | run_for.py 10s | wc -l"
+ print usage
sys.exit(1)
run(runtime)
View
28 data_hacks/sample.py
@@ -10,6 +10,7 @@
import sys
import random
+from optparse import OptionParser
from decimal import Decimal
def usage():
@@ -37,22 +38,27 @@ def get_sample_rate(rate_string):
rate = Decimal(x) / (Decimal(y) * Decimal('1.0'))
rate = int(rate * 100)
else:
- raise Exception("rate %r is invalid rate format must be '10%%' or '1/10'" % rate_string)
+ raise ValueError("rate %r is invalid rate format must be '10%%' or '1/10'" % rate_string)
if rate < 1 or rate > 100:
- raise Exception('rate %r must be 1%% <= rate <= 100%% ' % rate_string)
+ raise ValueError('rate %r must be 1%% <= rate <= 100%% ' % rate_string)
return rate
if __name__ == "__main__":
- debug = '--debug' in sys.argv
- try:
- sys.argv.remove('--debug')
- except ValueError:
- pass
- if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv) != 2:
- usage()
+ parser = OptionParser()
+ parser.usage = "cat data | %prog [options] [sample_rate]"
+ parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true")
+ (options, args) = parser.parse_args()
+
+ if not args or sys.stdin.isatty():
+ parser.print_usage()
sys.exit(1)
- sample_rate = get_sample_rate(sys.argv[-1])
- if debug:
+ try:
+ # read the rate from the parsed positionals, not sys.argv: with "sample.py 10% -v"
+ # sys.argv[-1] is the option flag, while args holds only the rate
+ sample_rate = get_sample_rate(args[-1])
+ except ValueError, e:
+ print >>sys.stderr, e
+ parser.print_usage()
+ sys.exit(1)
+ if options.verbose:
print >>sys.stderr, "Sample rate is %d%%" % sample_rate
run(sample_rate)
View
1 setup.py
@@ -9,5 +9,6 @@
# packages=['data_hacks'],
scripts = ['data_hacks/histogram.py',
'data_hacks/nintey_five_percent.py',
+ 'data_hacks/run_for.py',
'data_hacks/sample.py']
)

0 comments on commit 1db934e

Please sign in to comment.
Something went wrong with that request. Please try again.