Merge remote branch 'github/addhar' into addhar

commit ff060139968abc0130cb098bbddb951db6bbd187 (2 parents: dcacbba + 5fe2d84)
Andrew Fleenor committed Aug 25, 2010
Showing with 248 additions and 31 deletions.
  1. +42 −18 har.py
  2. +30 −12 http.py
  3. +144 −0 httpsession.py
  4. +14 −0 main.py
  5. +16 −0 pcaputil.py
  6. +2 −1 tcp/direction.py
60 har.py
@@ -1,26 +1,50 @@
+import http
+import json
+
'''
-Parses a list of HTTPFlows into data suitable for writing to a HAR file.
+functions and classes for generating HAR data from parsed http data
'''
-class Page:
- def __init__(self, title, startedDateTime):
- self.title = title
- self.startedDateTime = startedDateTime
-
-class Entry:
- def __init__(self, request, response):
- self.request = request
- self.response = response
- self.total_time = (response.end_time - request.start_time) + startup_time
+# json_repr for HTTP header dicts
+def header_json_repr(d):
+ return [
+ {
+ 'name': k,
+ 'value': v
+ } for k, v in d.iteritems()
+ ]
-def extract_data(httpdata):
+# add json_repr methods to http classes
+def HTTPRequestJsonRepr(self):
'''
- Extracts http data from the httpdata and converts it into a python dict
- suitable for writing straight out as a HAR.
+ self is an http.Request
+ '''
+ return {
+ 'method': self.msg.method,
+ 'url': self.msg.uri,
+ 'httpVersion': self.msg.version,
+ 'cookies': [],
+ 'headers': header_json_repr(self.msg.headers),
+ }
+http.Request.json_repr = HTTPRequestJsonRepr
- Args:
- httpflows = [http.MessagePair]
+def HTTPResponseJsonRepr(self):
+ return {
+ 'status': self.msg.status,
+ 'statusText': self.msg.reason,
+ 'httpVersion': self.msg.version,
+ 'cookies': [],
+ 'headers': header_json_repr(self.msg.headers)
+ }
+http.Response.json_repr = HTTPResponseJsonRepr
- Returns:
- {} = HAR data
+# custom json encoder
+class JsonReprEncoder(json.JSONEncoder):
+ '''
+ Custom Json Encoder that attempts to call json_repr on every object it
+ encounters.
'''
+ def default(self, obj):
+ if hasattr(obj, 'json_repr'):
+ return obj.json_repr()
+ return json.JSONEncoder.default(self, obj) # should call super instead?
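For orientation, a minimal sketch of how the json_repr convention and the encoder fit together; Point is a made-up class, only JsonReprEncoder comes from this file:

    import json

    class Point(object):
        def __init__(self, x, y):
            self.x, self.y = x, y
        def json_repr(self):
            # same hook the patch attaches to http.Request/Response
            return {'x': self.x, 'y': self.y}

    # the encoder falls back to json_repr for otherwise unserializable objects
    json.dumps(Point(1, 2), cls=JsonReprEncoder)  # '{"x": 1, "y": 2}' (key order may vary)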
42 http.py
@@ -1,5 +1,15 @@
import dpkt
+def find_index(f, seq):
+ '''
+ returns the index of the first item in seq for which predicate f returns
+ True. If no matching item is found, LookupError is raised.
+ '''
+ for i, item in enumerate(seq):
+ if f(item):
+ return i
+ raise LookupError('no item was found in the sequence that matched the predicate')
+
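A quick usage sketch for find_index (values are illustrative):

    find_index(lambda x: x > 2, [1, 2, 3, 4])  # returns 2 (the item 3)
    find_index(lambda x: x > 9, [1, 2, 3])     # raises LookupError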
class HTTPError(Exception):
'''
Thrown when HTTP cannot be parsed from the given data.
@@ -9,7 +19,7 @@ class HTTPError(Exception):
class HTTPFlow:
'''
Parses a TCPFlow into HTTP request/response pairs. Or not, depending on the
- integrity of the flow.
+ integrity of the flow. After __init__, self.pairs is a list of MessagePair objects (possibly empty).
'''
def __init__(self, tcpflow):
# try parsing it with forward as request dir
@@ -19,17 +29,25 @@ def __init__(self, tcpflow):
if not success:
# flow is not HTTP
raise HTTPError('TCPFlow does not contain HTTP')
- # we have requests/responses. store them
- self.requests = requests
- self.responses = responses
- if len(requests) == len(responses):
- self.pairs = zip(requests, responses)
- elif len(requests) > len(responses):
- #pad responses with None
- responses += [None for i in range(len(requests) - len(responses))]
- self.pairs = zip(requests, responses)
- else:
- self.pairs = None
+ # match up requests with the nearest response that occurred after them
+ # first request is the benchmark; responses before that are irrelevant for now
+ self.pairs = []
+ try:
+ # find the first response to a request we know about, that is, the first response after the first request
+ first_response_index = find_index(lambda response: response.ts_start > requests[0].ts_start, responses)
+ # these are responses that match up with our requests
+ pairable_responses = responses[first_response_index:]
+ if len(requests) > len(pairable_responses): # if there are more requests than responses
+ # pad responses with None
+ pairable_responses.extend( [None for i in range(len(requests) - len(pairable_responses))] )
+ # if there are more responses, we would just ignore them anyway, which zip does for us
+ # create MessagePairs
+ for req, resp in zip(requests, pairable_responses):
+ self.pairs.append(MessagePair(req, resp))
+ except LookupError:
+ # there were no responses after the first request
+ # there's nothing we can do
+ pass
class Message:
'''
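To make the new pairing rule concrete, here is a hedged sketch with made-up timestamps; Msg stands in for a parsed request or response with a ts_start attribute:

    from collections import namedtuple
    Msg = namedtuple('Msg', 'ts_start')

    requests  = [Msg(1.0), Msg(3.0)]
    responses = [Msg(0.5), Msg(2.0), Msg(4.0)]
    # first response after the first request is at index 1 (ts 2.0);
    # the ts 0.5 response predates any known request and is dropped
    i = find_index(lambda r: r.ts_start > requests[0].ts_start, responses)
    pairs = zip(requests, responses[i:])  # pairs: (req@1.0, resp@2.0), (req@3.0, resp@4.0)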
144 httpsession.py
@@ -0,0 +1,144 @@
+'''
+Parses a list of HTTPFlows into data suitable for writing to a HAR file.
+'''
+
+from datetime import datetime
+from pcaputil import ms_from_timedelta, ms_from_dpkt_time
+
+class Page:
+ def __init__(self, title, startedDateTime):
+ self.title = title
+ self.startedDateTime = startedDateTime # python datetime
+
+class Entry:
+ '''
+ represents an HTTP request/response in a form suitable for writing to a HAR
+ file.
+ Members:
+ * request = http.Request
+ * response = http.Response
+ * page_ref = string
+ * startedDateTime = python datetime
+ * total_time = from sending of request to end of response, milliseconds
+ * time_blocked
+ * time_dnsing
+ * time_connecting
+ * time_sending
+ * time_waiting
+ * time_receiving
+ '''
+ def __init__(self, request, response):
+ self.request = request
+ self.response = response
+ self.page_ref = ''
+ self.startedDateTime = datetime.fromtimestamp(request.ts_start)
+ endedDateTime = datetime.fromtimestamp(response.ts_end)
+ self.total_time = ms_from_timedelta(
+ endedDateTime - self.startedDateTime # plus connection time, someday
+ )
+ # calculate other timings
+ self.time_blocked = -1
+ self.time_dnsing = -1
+ self.time_connecting = -1
+ self.time_sending = \
+ ms_from_dpkt_time(request.ts_end - request.ts_start)
+ self.time_waiting = \
+ ms_from_dpkt_time(response.ts_start - request.ts_end)
+ self.time_receiving = \
+ ms_from_dpkt_time(response.ts_end - response.ts_start)
+ # sanity check: the parts should sum to the total, but rounding can break it
+ if self.time_sending + self.time_waiting + self.time_receiving != self.total_time:
+ pass # total_time comes from datetimes, the parts from float timestamps
+ def json_repr(self):
+ '''
+ return a JSON serializable python object representation of self.
+ '''
+ return {
+ 'page_ref': self.page_ref,
+ 'startedDateTime': self.startedDateTime.isoformat(),
+ 'time': self.total_time,
+ 'request': self.request,
+ 'response': self.response,
+ 'timings': {
+ 'blocked': self.time_blocked,
+ 'dns': self.time_dnsing,
+ 'connect': self.time_connecting,
+ 'send': self.time_sending,
+ 'wait': self.time_waiting,
+ 'receive': self.time_receiving
+ }
+ }
+
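To see why the consistency check in __init__ cannot simply demand equality, a sketch with made-up float timestamps (as dpkt supplies them):

    req_start, req_end   = 10.000, 10.010
    resp_start, resp_end = 10.050, 10.110
    sending   = ms_from_dpkt_time(req_end - req_start)    # likely 9, not 10: int() truncates the inexact float delta
    waiting   = ms_from_dpkt_time(resp_start - req_end)   # ~40 ms
    receiving = ms_from_dpkt_time(resp_end - resp_start)  # ~60 ms
    # total_time is derived from datetime objects instead (110 ms), so
    # truncation can leave the three parts summing short of the total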
+class UserAgentTracker:
+ def __init__(self):
+ self.data = {} # {user-agent string: number of uses}
+ def add(self, string):
+ '''
+ either increments the use-count, or creates a new entry
+ '''
+ if string in self.data:
+ self.data[string] += 1
+ else:
+ self.data[string] = 1
+ def dominant_user_agent(self):
+ '''
+ The agent string with the most uses
+ '''
+ if not len(self.data):
+ return None
+ elif len(self.data) == 1:
+ return self.data.keys()[0]
+ else:
+ # iterating over the dict yields its keys (the user-agent
+ # strings); the key function looks up each string's use-count,
+ # so max returns the most-used agent string
+ return max(self.data, key=lambda k: self.data[k])
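Usage sketch for UserAgentTracker (strings are illustrative):

    tracker = UserAgentTracker()
    tracker.add('Mozilla/5.0')
    tracker.add('Mozilla/5.0')
    tracker.add('curl/7.19.7')
    tracker.dominant_user_agent()  # 'Mozilla/5.0'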
+
+class HTTPSession(object):
+ '''
+ Represents all http traffic from within a pcap.
+
+ Members:
+ * user_agent = most-used user-agent in the flow
+ * referers = referers/page-loads
+ '''
+ def __init__(self, messages):
+ '''
+ Parses http.MessagePairs to get http info out, in preparation for
+ writing it to a HAR file.
+ '''
+ # set-up
+ self.user_agents = UserAgentTracker()
+ self.entries = []
+ # iter through messages
+ for msg in messages:
+ # if msg.request has a user-agent, add it to our list
+ if 'user-agent' in msg.request.msg.headers:
+ self.user_agents.add(msg.request.msg.headers['user-agent'])
+ # if msg.request has a referer, keep track of that, too
+ if 'referer' in msg.request.msg.headers:
+ # referer tracking is not implemented yet
+ pass
+ # parse basic data in the pair, add it to the list
+ self.entries.append(Entry(msg.request, msg.response))
+ # finish calculating data
+ self.user_agent = self.user_agents.dominant_user_agent()
+ def json_repr(self):
+ '''
+ return a JSON serializable python object representation of self.
+ '''
+ return {
+ 'log': {
+ 'version' : '1.1',
+ 'creator': {
+ 'name': 'pcap2har',
+ 'version': '0.1'
+ },
+ 'browser': {
+ 'name': self.user_agent,
+ 'version': 'mumble'
+ },
+ 'pages': [],
+ 'entries': self.entries
+ }
+ }
14 main.py
@@ -6,6 +6,9 @@
import logging
import sys
import http
+import httpsession
+import har
+import json
# get cmdline args/options
parser = optparse.OptionParser(usage='usage: %prog inputfile outputfile [options]')
@@ -32,4 +35,15 @@
except http.HTTPError as e:
pass
+# put all message pairs in one list
+def combine_pairs(pairs, flow):
+ return pairs + flow.pairs
+pairs = reduce(combine_pairs, httpflows, [])
+
+# parse HAR stuff
+session = httpsession.HTTPSession(pairs)
+
+with open(outputfile, 'w') as f:
+ json.dump(session, f, cls=har.JsonReprEncoder, indent=2)
+
pass
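The reduce above rebuilds the list on every call, which is quadratic in the number of flows. A sketch of an equivalent linear flattening, in case flow counts ever get large (purely illustrative, not part of the commit):

    pairs = []
    for flow in httpflows:
        pairs.extend(flow.pairs)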
16 pcaputil.py
@@ -22,6 +22,22 @@ def friendly_socket(sock):
def friendly_data(str):
return `str`
+def ms_from_timedelta(td):
+ '''
+ gets the number of ms in td, which is datetime.timedelta.
+ Modified from here:
+ http://docs.python.org/library/datetime.html#datetime.timedelta, near the
+ end of the section.
+ '''
+ return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**3
+
+def ms_from_dpkt_time(td):
+ '''
+ Get milliseconds from a dpkt timestamp. This should probably only really be
+ done on a number gotten from subtracting two dpkt timestamps.
+ '''
+ return int(td * 1000) # dpkt timestamps are float seconds, so this truncates to ms
+
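Quick sanity checks for the two conversion helpers (values are illustrative):

    from datetime import timedelta
    ms_from_timedelta(timedelta(seconds=1, microseconds=500000))  # 1500
    ms_from_dpkt_time(1.5)                                        # 1500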
class ModifiedReader(object):
"""
A copy of the dpkt pcap Reader. The only change is that the iterator
3 tcp/direction.py
@@ -1,4 +1,5 @@
from sortedcollection import SortedCollection
+import tcp
class Direction:
def __init__(self, flow):
@@ -86,7 +87,7 @@ def byte_to_seq(self, byte):
is assumed to be zero-based.
'''
if self.seq_start:
- return byte + self.seq_start + 1
+ return byte + self.seq_start
else:
return byte + self.flow.first_packet.seq
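A quick check of the off-by-one fixed here, assuming seq_start is the sequence number of the stream's first data byte (numbers made up):

    # with seq_start == 1000 (hypothetical):
    # byte_to_seq(0) -> 1000 after this fix; the old "+ 1" gave 1001,
    # pointing one byte past the data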
