Permalink
Browse files

added support for decompression, content in HAR

There is currently a hacky method of handling unpleasant characters in the
stream. This will be fixed.
  • Loading branch information...
1 parent 0e55c43 commit 771b3f7471bb7bf9efac424ad8524728d252f865 Andrew Fleenor committed Aug 26, 2010
Showing with 77 additions and 6 deletions.
  1. +8 −4 har.py
  2. +68 −1 http.py
  3. +1 −1 main.py
View
12 har.py
@@ -44,6 +44,13 @@ def HTTPRequestJsonRepr(self):
http.Request.json_repr = HTTPRequestJsonRepr
def HTTPResponseJsonRepr(self):
+ content = {
+ 'size': len(self.body),
+ 'compression': len(self.body) - len(self.raw_body),
+ 'mimeType': self.mimeType,
+ }
+ if self.istext:
+ content['text'] = self.body.decode('iso-8859-1').encode('utf8') # must transcode to utf8
return {
'status': self.msg.status,
'statusText': self.msg.reason,
@@ -53,10 +60,7 @@ def HTTPResponseJsonRepr(self):
'bodySize': len(self.msg.body),
'redirectURL': self.msg.headers['location'] if 'location' in self.msg.headers else '',
'headers': header_json_repr(self.msg.headers),
- 'content': {
- 'size': len(self.msg.body), # should really be uncompressed length
- 'mimeType': self.mimeType
- },
+ 'content': content,
}
http.Response.json_repr = HTTPResponseJsonRepr
View
69 http.py
@@ -1,5 +1,9 @@
import dpkt
import urlparse
+import gzip
+import zlib
+import cStringIO
+import re
def find_index(f, seq):
'''
@@ -13,7 +17,13 @@ def find_index(f, seq):
class HTTPError(Exception):
'''
- Thrown when HTTP cannot be parsed from the given data.
+ Raised when HTTP cannot be parsed from the given data.
+ '''
+ pass
+
+class DecodingError(HTTPError):
+ '''
+ Raised when encoded HTTP data cannot be decompressed or decoded.
'''
pass
@@ -68,6 +78,7 @@ class Message:
* seq_end: first sequence number past Message's data (slice-style indices)
* ts_start: when Message started arriving (dpkt timestamp)
* ts_end: when Message had fully arrived (dpkt timestamp)
+ * raw_body: body before compression is taken into account
'''
def __init__(self, tcpdir, pointer, msgclass):
'''
@@ -86,6 +97,8 @@ def __init__(self, tcpdir, pointer, msgclass):
# calculate arrival_times
self.ts_start = tcpdir.seq_final_arrival(self.seq_start)
self.ts_end = tcpdir.seq_final_arrival(self.seq_end - 1)
+ # get raw body
+ self.raw_body = self.msg.body
class Request(Message):
'''
@@ -106,11 +119,17 @@ def __init__(self, tcpdir, pointer):
self.url, frag = urlparse.urldefrag(self.fullurl)
self.query = urlparse.parse_qs(uri.query)
+# RE's for use on mime types
+mimetype_text = re.compile('text/.+')
+mimetype_image = re.compile('image/.+')
+
class Response(Message):
'''
HTTP response.
Members:
* mimeType: string mime type of returned data
+ * body: http decoded body data
+ * compression: string, compression type
'''
def __init__(self, tcpdir, pointer):
Message.__init__(self, tcpdir, pointer, dpkt.http.Response)
@@ -119,6 +138,54 @@ def __init__(self, tcpdir, pointer):
self.mimeType= self.msg.headers['content-type']
else:
self.mimeType = ''
+ self.handle_compression()
+ # determine whether this is text
+ self.istext = bool(mimetype_text.match(self.mimeType))
+ def handle_compression(self):
+ '''
+ Sets self.body to the http decoded response data. Sets compression to
+ the name of the compression type.
+ '''
+ # if content-encoding is found
+ if 'content-encoding' in self.msg.headers:
+ encoding = self.msg.headers['content-encoding'].lower()
+ self.compression = encoding
+ # handle gzip
+ if encoding == 'gzip' or encoding == 'x-gzip':
+ try:
+ gzipfile = gzip.GzipFile(
+ fileobj = cStringIO.StringIO(self.raw_body)
+ )
+ self.body = gzipfile.read()
+ except zlib.error:
+ raise DecodingError('zlib failed to gunzip HTTP data')
+ except:
+ # who knows what else it might raise
+ raise DecodingError("failed to gunzip HTTP data, don't know why")
+ # handle deflate
+ elif encoding == 'deflate':
+ try:
+ # NOTE: wbits = -15 is an undocumented feature in python (it's
+ # documented in zlib) that gets rid of the header so we can
+ # do raw deflate. See: http://bugs.python.org/issue5784
+ self.body = zlib.decompress(self.raw_body, -15)
+ except zlib.error:
+ raise DecodingError('zlib failed to undeflate HTTP data')
+ elif encoding == 'compress' or encoding == 'x-compress':
+ # apparently nobody uses this, so basically just ignore it
+ self.body = self.raw_body
+ elif encoding == 'identity':
+ # no compression
+ self.body = self.raw_body
+ else:
+ # I'm pretty sure the above are the only allowed encoding types
+ # see RFC 2616 sec 3.5 (http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.5)
+ raise DecodingError('unknown content-encoding token: ' + encoding)
+ else:
+ # no compression
+ self.compression = 'identity'
+ self.body = self.raw_body
+
class MessagePair:
'''
View
@@ -48,6 +48,6 @@ def combine_pairs(pairs, flow):
session = httpsession.HTTPSession(pairs)
with open(outputfile, 'w') as f:
- json.dump(session, f, cls=har.JsonReprEncoder, indent=2)
+ json.dump(session, f, cls=har.JsonReprEncoder, indent=2, encoding='utf8')
pass

0 comments on commit 771b3f7

Please sign in to comment.