Initial commit

commit 7d4530a1018d879485ebc3e65f3d33be7c5eca3a 0 parents
@barnybug authored
Showing with 209 additions and 0 deletions.
  1. +1 −0  .gitignore
  2. +49 −0 README.md
  3. +148 −0 s3grep
  4. +11 −0 setup.py
1  .gitignore
@@ -0,0 +1 @@
+build
49 README.md
@@ -0,0 +1,49 @@
+# s3grep
+
+grep for Amazon S3.
+
+Searches text files in an S3 bucket for specific content. The search is
+parallelised across multiple connections to S3 to give good performance.
+
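+A condensed sketch of the pattern, with hypothetical helper names (the real
+implementation is the s3grep script in this repository): a multiprocessing
+pool where each worker process holds its own boto connection:
+
+    from multiprocessing import Pool
+    import re
+    import boto
+
+    def init(bucket_name, pattern):
+        # each pooled process opens its own S3 connection
+        global bucket, regex
+        bucket = boto.connect_s3().get_bucket(bucket_name)
+        regex = re.compile(pattern)
+
+    def scan(key_name):
+        # fetch one key and return its matching lines
+        contents = bucket.get_key(key_name).get_contents_as_string()
+        return [l for l in contents.split('\n') if regex.search(l)]
+
+    if __name__ == '__main__':
+        pool = Pool(processes=6, initializer=init,
+                    initargs=('mybucket', 'straw'))
+        keys = boto.connect_s3().get_bucket('mybucket').list(prefix='prefix')
+        for lines in pool.imap(scan, (k.name for k in keys)):
+            for line in lines:
+                print line
+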
+## Usage
+
+Search for 'straw' in the bucket 'mybucket', under keys starting with 'prefix':
+
+    $ s3grep s3://mybucket/prefix straw
+
+Supports regular expressions:
+
+    $ s3grep s3://mybucket/prefix a.+b
+
+To see all options:
+
+    $ s3grep -h
+
+## Installation
+
+    $ pip install s3grep
+or:
+
+    $ easy_install s3grep
+
+Or from source, the usual:
+
+    $ sudo python setup.py install
+
+## AWS Credentials
+
+Your credentials can be set through environment variables:
+
+    AWS_ACCESS_KEY_ID - Your AWS Access Key ID
+    AWS_SECRET_ACCESS_KEY - Your AWS Secret Access Key
+
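+For example, in your shell (placeholder values):
+
+    $ export AWS_ACCESS_KEY_ID=<your access key>
+    $ export AWS_SECRET_ACCESS_KEY=<your secret key>
+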
+Alternatively, they can be configured in the boto configuration file:
+create ~/.boto with the content:
+
+    [Credentials]
+    aws_access_key_id = <your access key>
+    aws_secret_access_key = <your secret key>
+
+## TODO
+
+ * support for compressed content
148 s3grep
@@ -0,0 +1,148 @@
+#!/usr/bin/python
+
+import sys
+import re
+import time
+import argparse
+import boto.s3
+from multiprocessing import Pool
+
+def S3Path(p):
+    m = re.match(r's3://([^/]+)/(.+)', p)
+    if not m:
+        raise ValueError('invalid s3 url - should be "s3://bucketname/prefix"')
+    return m.groups()
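+# example (illustrative): S3Path('s3://mybucket/prefix') -> ('mybucket', 'prefix')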
+
+def Regex(r):
+    try:
+        return re.compile(r)
+    except re.error as ex:
+        raise ValueError(str(ex))
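+# example (illustrative): Regex('a.+b') returns a compiled pattern;
+# Regex('(') raises ValueError (invalid regular expression)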
+
+class LineBuffer(object):
+    """Add readline() line buffering to a file handle only supporting read()"""
+    def __init__(self, fin):
+        self.fin = fin
+        self.buf = []
+        self.eof = False
+
+    def __iter__(self):
+        return self
+
+    def readline(self):
+        # refill the buffer until a complete line is available or EOF is hit
+        while len(self.buf) <= 1 and not self.eof:
+            data = self.fin.read()
+            if not data:
+                self.eof = True
+                # drop the empty tail left by a trailing newline
+                if self.buf and self.buf[-1] == '':
+                    self.buf.pop()
+                break
+            more = data.split('\n')
+            if self.buf:
+                # join the partial line across the read boundary
+                self.buf[0] = self.buf[0] + more.pop(0)
+            self.buf.extend(more)
+
+        if self.buf:
+            return self.buf.pop(0)
+        return None  # EOF; None rather than '' so blank lines iterate correctly
+
+    def next(self):
+        line = self.readline()
+        if line is None:
+            raise StopIteration
+        return line
+
+class Worker(object):
+    def __init__(self, bucket_name, regex, args):
+        s3 = boto.connect_s3()
+        self.bucket = s3.get_bucket(bucket_name)
+        self.regex = regex
+        self.args = args
+
+    def __call__(self, key_name):
+        key = self.bucket.get_key(key_name)
+
+        nbytes = nlines = nmatches = 0
+        if self.args.verbose:
+            print >> sys.stderr, 'Scanning: %s' % (key.name)
+
+        if self.regex.pattern == '.' and not (self.args.invert or self.args.with_filename):
+            # optimise the special case '.': dump the file as-is (handy for
+            # piping to conventional grep, for example); only valid when
+            # neither --invert nor --with-filename is in effect
+            key.get_contents_to_file(sys.stdout)
+        else:
+            for line in LineBuffer(key):
+                nlines += 1
+                nbytes += len(line)+1 # +1 for the stripped \n
+                m = self.regex.search(line)
+                if bool(self.args.invert) != bool(m): # XOR
+                    if self.args.with_filename:
+                        print '%s:%s' % (key.name, line)
+                    else:
+                        print line
+                    nmatches += 1
+
+        return nbytes, nlines, nmatches
+
+def initialize_worker(*args):
+    global worker
+    try:
+        worker = Worker(*args)
+    except KeyboardInterrupt:
+        # leave cleanly on keyboard interrupt
+        return
+
+def call_worker(args):
+    global worker
+    try:
+        return worker(args)
+    except KeyboardInterrupt:
+        # multiprocessing hangs on KeyboardInterrupt - workaround
+        raise Exception
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', nargs=1, type=S3Path, help="eg. s3://bucketname/prefix")
+    parser.add_argument('regex', nargs=1, type=Regex, help="regular expression to search for")
+    parser.add_argument('--verbose', action='store_true', help="verbose output")
+    parser.add_argument('-p', '--processes', type=int, default=6, help="number of processes to run")
+    # grep options
+    parser.add_argument('-v', '--invert', action='store_true', help="invert match (ie. return lines that do not match)")
+    parser.add_argument('-H', '--with-filename', action='store_true', help="print the file name for each match")
+
+    args = parser.parse_args()
+
+    # setup
+    nfiles = nlines = nbytes = nmatches = 0
+    p = args.path[0]
+    regex = args.regex[0]
+    pool = Pool(processes=args.processes,
+                initializer=initialize_worker,
+                initargs=(p[0], regex, args))
+    start = time.time()
+    try:
+        if args.verbose:
+            print >> sys.stderr, 'Searching bucket: %s' % (p[0])
+        s3 = boto.connect_s3()
+        bucket = s3.get_bucket(p[0])
+
+        keys = bucket.list(prefix=p[1])
+        jobs = (k.name for k in keys)
+        # do the actual work
+        for b, l, m in pool.imap(call_worker, jobs):
+            nfiles += 1
+            nlines += l
+            nbytes += b
+            nmatches += m
+    except KeyboardInterrupt:
+        pool.terminate()
+        pool.join()
+    else:
+        pool.close()
+        pool.join()
+
+    end = time.time()
+    print >> sys.stderr, 'Scanned: %d files, %d lines, %d bytes, %d matches, took %dms' % (nfiles, nlines, nbytes, nmatches, (end-start)*1000)
+
+if __name__ == '__main__':
+    main()
11 setup.py
@@ -0,0 +1,11 @@
+from distutils.core import setup
+
+setup(name='s3grep',
+      version='0.1',
+      description='grep for Amazon S3',
+      author='Barnaby Gray',
+      author_email='barnaby@pickle.me.uk',
+      url='http://github.com/barnybug/s3grep/',
+      requires=['boto'],
+      scripts=['s3grep'],
+      )