
Merge remote branch 'origin/master'

Update test cases to handle new sorting.
2 parents 2825ec3 + 938787e commit 219d002918d12e778a919f07612999369c07323d Andrew Fleenor committed Dec 14, 2012
Showing with 2,720 additions and 908 deletions.
  1. +1 −0 .gitignore
  2. +1 −1 LICENSE
  3. +34 −14 main.py
  4. +0 −160 pcap2har.psproj
  5. 0 { → pcap2har}/BeautifulSoup.py
  6. 0 pcap2har/__init__.py
  7. +19 −6 { → pcap2har}/dns.py
  8. +67 −18 { → pcap2har}/dpkt_http_replacement.py
  9. +15 −8 { → pcap2har}/har.py
  10. +5 −0 pcap2har/http/__init__.py
  11. +1 −5 http/__init__.py → pcap2har/http/common.py
  12. +64 −39 { → pcap2har}/http/flow.py
  13. +19 −4 { → pcap2har}/http/message.py
  14. +6 −4 { → pcap2har}/http/request.py
  15. +76 −25 { → pcap2har}/http/response.py
  16. +42 −27 { → pcap2har}/httpsession.py
  17. +23 −5 { → pcap2har}/mediatype.py
  18. +5 −1 { → pcap2har}/packetdispatcher.py
  19. +24 −11 { → pcap2har}/pagetracker.py
  20. +29 −18 { → pcap2har}/pcap.py
  21. +67 −7 { → pcap2har}/pcaputil.py
  22. +13 −0 pcap2har/settings.py
  23. +5 −5 { → pcap2har}/sortedcollection.py
  24. +12 −0 pcap2har/tcp/__init__.py
  25. +4 −2 { → pcap2har}/tcp/chunk.py
  26. +1 −13 tcp/__init__.py → pcap2har/tcp/common.py
  27. +76 −19 { → pcap2har}/tcp/direction.py
  28. +44 −15 { → pcap2har}/tcp/flow.py
  29. +83 −0 pcap2har/tcp/flowbuilder.py
  30. +33 −4 { → pcap2har}/tcp/packet.py
  31. +12 −8 { → pcap2har}/tcp/seq.py
  32. +8 −5 { → pcap2har}/udp.py
  33. +0 −1 settings.py
  34. +0 −47 tcp/flowbuilder.py
  35. +6 −0 tests/README.txt
  36. +1,102 −0 tests/fhs.pcap.dropped.har
  37. +10 −10 tests/fhs.pcap.har
  38. +10 −10 tests/fhs_ncomp.pcap.har
  39. +4 −4 tests/github.pcap.har
  40. +3 −3 tests/http.pcap.har
  41. BIN tests/missing_response.pcap
  42. +215 −0 tests/missing_response.pcap.har
  43. +363 −295 tests/out-of-order.pcap.har
  44. +45 −45 tests/pcapr.net.pcap.har
  45. BIN tests/request_only.pcap
  46. +84 −0 tests/request_only.pcap.har
  47. +27 −7 tests/run_tests.sh
  48. +62 −62 tests/sajal.pcap.har
1 .gitignore
@@ -3,3 +3,4 @@
pcap2har.log
tests/*.har
tests/*.log
+*.sw[op]
2 LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2009 Andrew Fleenor, Ryan C. Witt and Jake Holland
+Copyright (c) 2009 Andrew Fleenor, Ryan C. Witt, Jake Holland, and Google, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
48 main.py
@@ -4,31 +4,50 @@
Main program that converts pcaps to HAR's.
'''
-import pcap
import os
import optparse
import logging
import sys
-import http
-import httpsession
-import har
import json
-import tcp
-import settings
-from packetdispatcher import PacketDispatcher
+
+from pcap2har import pcap
+from pcap2har import http
+from pcap2har import httpsession
+from pcap2har import har
+from pcap2har import tcp
+from pcap2har import settings
+from pcap2har.packetdispatcher import PacketDispatcher
+from pcap2har.pcaputil import print_rusage
+
# get cmdline args/options
parser = optparse.OptionParser(
usage='usage: %prog inputfile outputfile'
)
-parser.add_option('--no-pages', action="store_false", dest="pages", default=True)
+parser.add_option('--no-pages', action='store_false',
+ dest='pages', default=True)
+parser.add_option('-d', '--drop-bodies', action='store_true',
+ dest='drop_bodies', default=False)
+parser.add_option('-k', '--keep-unfulfilled-requests', action='store_true',
+ dest='keep_unfulfilled', default=False)
+parser.add_option('-r', '--resource-usage', action='store_true',
+ dest='resource_usage', default=False)
+parser.add_option('--pad_missing_tcp_data', action='store_true',
+ dest='pad_missing_tcp_data', default=False)
+parser.add_option('--strict-http-parsing', action='store_true',
+ dest='strict_http_parsing', default=False)
+parser.add_option('-l', '--log', dest='logfile', default='pcap2har.log')
options, args = parser.parse_args()
# copy options to settings module
settings.process_pages = options.pages
+settings.drop_bodies = options.drop_bodies
+settings.keep_unfulfilled_requests = options.keep_unfulfilled
+settings.pad_missing_tcp_data = options.pad_missing_tcp_data
+settings.strict_http_parse_body = options.strict_http_parsing
# setup logs
-logging.basicConfig(filename='pcap2har.log', level=logging.INFO)
+logging.basicConfig(filename=options.logfile, level=logging.INFO)
# get filenames, or bail out with usage error
if len(args) == 2:
@@ -40,18 +59,19 @@
parser.print_help()
sys.exit()
-logging.info("Processing %s", inputfile)
+logging.info('Processing %s', inputfile)
# parse pcap file
-dispatcher = PacketDispatcher()
-pcap.ParsePcap(dispatcher, filename=inputfile)
-dispatcher.finish()
+dispatcher = pcap.EasyParsePcap(filename=inputfile)
# parse HAR stuff
session = httpsession.HttpSession(dispatcher)
-logging.info("Flows=%d. HTTP pairs=%d" % (len(session.flows),len(session.entries)))
+logging.info('Flows=%d. HTTP pairs=%d' % (len(session.flows), len(session.entries)))
#write the HAR file
with open(outputfile, 'w') as f:
json.dump(session, f, cls=har.JsonReprEncoder, indent=2, encoding='utf8', sort_keys=True)
+
+if options.resource_usage:
+ print_rusage()
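
With this change, main.py is a thin CLI wrapper around the pcap2har package. A minimal sketch of driving the same pipeline as a library, taken directly from the code above (the input/output paths are placeholders):

    import json
    from pcap2har import pcap, httpsession, har

    # parse the capture into flows, then build and serialize the HAR
    dispatcher = pcap.EasyParsePcap(filename='input.pcap')
    session = httpsession.HttpSession(dispatcher)
    with open('output.har', 'w') as f:
        json.dump(session, f, cls=har.JsonReprEncoder, indent=2,
                  encoding='utf8', sort_keys=True)
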
160 pcap2har.psproj
@@ -1,160 +0,0 @@
-[PyScripter]
-Version=2.4.1.0
-
-[Project]
-ClassName=TProjectRootNode
-StoreRelativePaths=TRUE
-ShowFileExtensions=FALSE
-
-[Project\ChildNodes\Node0]
-ClassName=TProjectFilesNode
-
-[Project\ChildNodes\Node0\ChildNodes\Node0]
-ClassName=TProjectFolderNode
-Name=http
-
-[Project\ChildNodes\Node0\ChildNodes\Node0\ChildNodes\Node0]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]http\__init__.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node0\ChildNodes\Node1]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]http\flow.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node0\ChildNodes\Node2]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]http\message.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node0\ChildNodes\Node3]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]http\request.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node0\ChildNodes\Node4]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]http\response.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node0\ChildNodes]
-Count=5
-
-[Project\ChildNodes\Node0\ChildNodes\Node1]
-ClassName=TProjectFolderNode
-Name=tcp
-
-[Project\ChildNodes\Node0\ChildNodes\Node1\ChildNodes\Node0]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]tcp\__init__.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node1\ChildNodes\Node1]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]tcp\chunk.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node1\ChildNodes\Node2]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]tcp\direction.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node1\ChildNodes\Node3]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]tcp\flow.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node1\ChildNodes\Node4]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]tcp\flowbuilder.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node1\ChildNodes\Node5]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]tcp\packet.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node1\ChildNodes\Node6]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]tcp\seq.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node1\ChildNodes]
-Count=7
-
-[Project\ChildNodes\Node0\ChildNodes\Node2]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]BeautifulSoup.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node3]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]dns.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node4]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]dpkt_http_replacement.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node5]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]har.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node6]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]httpsession.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node7]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]main.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node8]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]mediatype.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node9]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]packetdispatcher.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node10]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]pcap.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node11]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]pcaputil.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node12]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]sortedcollection.py
-
-[Project\ChildNodes\Node0\ChildNodes\Node13]
-ClassName=TProjectFileNode
-FileName=$[Project-Path]udp.py
-
-[Project\ChildNodes\Node0\ChildNodes]
-Count=14
-
-[Project\ChildNodes\Node1]
-ClassName=TProjectRunConfiguationsNode
-
-[Project\ChildNodes\Node1\ChildNodes\Node0]
-ClassName=TProjectRunConfiguationNode
-Name=main
-
-[Project\ChildNodes\Node1\ChildNodes\Node0\RunConfig]
-ScriptName=main.py
-Description=Current main test program
-EngineType=peRemote
-ReinitializeBeforeRun=TRUE
-Parameters=../pcap2har/dns_requests.pcap output.har
-WorkingDir=$[ActiveScript-Dir]
-WriteOutputToFile=FALSE
-OutputFileName=$[ActiveScript-NoExt].log
-AppendToFile=FALSE
-
-[Project\ChildNodes\Node1\ChildNodes\Node0\RunConfig\ExternalRun]
-Caption=External Run
-Description=Run script using an external Python Interpreter
-ApplicationName=$[PythonExe-Short]
-Parameters=$[ActiveScript-Short]
-WorkingDirectory=$[ActiveScript-Dir]
-ShortCut=0
-MessagesFormat=$[FileName] $[LineNumber]
-
-[Project\ChildNodes\Node1\ChildNodes]
-Count=1
-
-[Project\ChildNodes]
-Count=2
-
-[Project\ExtraPythonPath]
-Count=0
-
0 BeautifulSoup.py → pcap2har/BeautifulSoup.py
File renamed without changes.
0 pcap2har/__init__.py
No changes.
25 dns.py → pcap2har/dns.py
@@ -1,6 +1,7 @@
-import logging as log
+import logging
-class Packet:
+
+class Packet(object):
'''
A DNS packet, wrapped for convenience and with the pcap timestamp
@@ -12,6 +13,7 @@ class Packet:
names = list of names asked about
dns = dpkt.dns.DNS
'''
+
def __init__(self, ts, pkt):
'''
ts = pcap timestamp
@@ -22,11 +24,13 @@ def __init__(self, ts, pkt):
self.txid = pkt.id
self.names = [q.name for q in pkt.qd]
if len(self.names) > 1:
- log.warning('DNS packet with multiple questions')
+ logging.warning('DNS packet with multiple questions')
+
def name(self):
return self.names[0]
-class Query:
+
+class Query(object):
'''
A DNS question/answer conversation with a single ID
@@ -37,6 +41,7 @@ class Query:
name = domain name being discussed
resolved = Bool, whether the question has been answered
'''
+
def __init__(self, initial_packet):
'''
initial_packet = dns.Packet, simply the first one on the wire with
@@ -47,19 +52,22 @@ def __init__(self, initial_packet):
self.last_ts = initial_packet.ts
self.resolved = False
self.name = initial_packet.name()
+
def add(self, pkt):
'''
pkt = dns.Packet
'''
- assert(pkt.txid == self.txid)
+ assert pkt.txid == self.txid
self.last_ts = max(pkt.ts, self.last_ts)
# see if this resolves the query
if len(pkt.dns.an) > 0:
self.resolved = True
+
def duration(self):
return self.last_ts - self.started_time
-class Processor:
+
+class Processor(object):
'''
Processes and interprets DNS packets.
@@ -69,9 +77,11 @@ class Processor:
queries = {txid: Query}
by_hostname = {string: [Query]}
'''
+
def __init__(self):
self.queries = {}
self.by_hostname = {}
+
def add(self, pkt):
'''
adds the packet to a Query object by id, and makes sure that Queries
@@ -86,12 +96,14 @@ def add(self, pkt):
new_query = Query(pkt)
self.queries[pkt.txid] = new_query
self.add_by_name(new_query)
+
def add_by_name(self, query):
name = query.name
if name in self.by_hostname:
self.by_hostname[name].append(query)
else:
self.by_hostname[name] = [query]
+
def get_resolution_time(self, hostname):
'''
Returns the last time it took to resolve the hostname.
@@ -104,6 +116,7 @@ def get_resolution_time(self, hostname):
return self.by_hostname[hostname][-1].duration()
except KeyError:
return None
+
def num_queries(self, hostname):
'''
Returns the number of DNS requests for that name
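
With the module now living in the package, a hedged sketch of feeding the DNS processor; captured_udp_payloads is a hypothetical (timestamp, bytes) iterable, everything else comes from this module and dpkt:

    import dpkt
    from pcap2har import dns

    processor = dns.Processor()
    for ts, payload in captured_udp_payloads:  # hypothetical packet source
        try:
            processor.add(dns.Packet(ts, dpkt.dns.DNS(payload)))
        except dpkt.Error:
            continue  # payload was not parseable as DNS
    # duration of the last query for the name, or None if never seen
    print processor.get_resolution_time('example.com')
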
85 dpkt_http_replacement.py → pcap2har/dpkt_http_replacement.py
@@ -7,14 +7,16 @@
import cStringIO
import dpkt
+import logging
+import settings
def parse_headers(f):
"""Return dict of HTTP headers parsed from a file object."""
d = {}
while 1:
line = f.readline()
- if not line:
- raise dpkt.NeedData('premature end of headers')
+ # regular dpkt checks for premature end of headers
+ # but that's too picky
line = line.strip()
if not line:
break
@@ -29,7 +31,22 @@ def parse_headers(f):
d[k] = v
return d
-def parse_body(f, headers):
+
+def parse_length(s, base=10):
+ """Take a string and convert to int (not long), returning 0 if invalid"""
+ try:
+ n = int(s, base)
+ # int() can actually return long, which can't be used in file.read()
+ if isinstance(n, int):
+ return n
+ except ValueError:
+ pass
+ # if s was invalid or too big (that is, int returned long)...
+ logging.warn('Invalid HTTP content/chunk length "%s", assuming 0' % s)
+ return 0
+
+
+def parse_body(f, version, headers):
"""Return HTTP body parsed from a file object, given HTTP header dict."""
if headers.get('transfer-encoding', '').lower() == 'chunked':
l = []
@@ -39,8 +56,8 @@ def parse_body(f, headers):
sz = f.readline().split(None, 1)[0]
except IndexError:
raise dpkt.UnpackError('missing chunk size')
- n = int(sz, 16)
- if n == 0:
+ n = parse_length(sz, 16)
+ if n == 0: # may happen if sz is invalid
found_end = True
buf = f.read(n)
if f.readline().strip():
@@ -49,19 +66,56 @@ def parse_body(f, headers):
l.append(buf)
else:
break
- if not found_end:
+ if settings.strict_http_parse_body and not found_end:
raise dpkt.NeedData('premature end of chunked body')
body = ''.join(l)
elif 'content-length' in headers:
- n = int(headers['content-length'])
+ # Ethan K B: Have observed malformed 0,0 content lengths
+ n = parse_length(headers['content-length'])
body = f.read(n)
if len(body) != n:
- raise dpkt.NeedData('short body (missing %d bytes)' % (n - len(body)))
+ logging.warn('HTTP content-length mismatch: expected %d, got %d', n,
+ len(body))
+ if settings.strict_http_parse_body:
+ raise dpkt.NeedData('short body (missing %d bytes)' % (n - len(body)))
else:
# XXX - need to handle HTTP/0.9
- body = ''
+ # BTW, this function is not called if status code is 204 or 304
+ if version == '1.0':
+ # we can assume that there are no further
+ # responses on this stream, since 1.0 doesn't
+ # support keepalive
+ body = f.read()
+ elif (version == '1.1' and
+ headers.get('connection', None) == 'close'):
+ # sender has said they won't send anything else.
+ body = f.read()
+ # there's also the case where other end sends connection: close,
+ # but we don't have the architecture to handle that.
+ else:
+ # we don't really know what to do
+ #print 'returning body as empty string:', version, headers
+ body = ''
return body
+def parse_message(message, f):
+ """
+ Unpack headers and optionally body from the passed file-like object.
+
+ Args:
+ message: Request or Response to which to add data.
+ f: file-like object, probably StringIO.
+ """
+ # Parse headers
+ message.headers = parse_headers(f)
+ # Parse body, unless we know there isn't one
+ if not (getattr(message, 'status', None) in ('204', '304')):
+ message.body = parse_body(f, message.version, message.headers)
+ else:
+ message.body = ''
+ # Save the rest
+ message.data = f.read()
+
class Message(dpkt.Packet):
"""Hypertext Transfer Protocol headers + body."""
__metaclass__ = type
@@ -82,12 +136,7 @@ def __init__(self, *args, **kwargs):
def unpack(self, buf):
f = cStringIO.StringIO(buf)
- # Parse headers
- self.headers = parse_headers(f)
- # Parse body
- self.body = parse_body(f, self.headers)
- # Save the rest
- self.data = f.read()
+ parse_message(self, f)
def pack_hdr(self):
return ''.join([ '%s: %s\r\n' % t for t in self.headers.iteritems() ])
@@ -130,7 +179,7 @@ def unpack(self, buf):
self.method = l[0]
self.uri = l[1]
self.version = l[2][len(self.__proto)+1:]
- Message.unpack(self, f.read())
+ parse_message(self, f)
def __str__(self):
return '%s %s %s/%s\r\n' % (self.method, self.uri, self.__proto,
@@ -149,12 +198,12 @@ def unpack(self, buf):
f = cStringIO.StringIO(buf)
line = f.readline()
l = line.strip().split(None, 2)
- if len(l) < 2 or not l[0].startswith(self.__proto) or not l[1].isdigit():
+ if len(l) < 3 or not l[0].startswith(self.__proto) or not l[1].isdigit():
raise dpkt.UnpackError('invalid response: %r' % line)
self.version = l[0][len(self.__proto)+1:]
self.status = l[1]
self.reason = l[2]
- Message.unpack(self, f.read())
+ parse_message(self, f)
def __str__(self):
return '%s/%s %s %s\r\n' % (self.__proto, self.version, self.status,
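
The behavior of the new parse_length helper, spelled out; each result follows directly from the implementation above:

    parse_length('1234')      # 1234
    parse_length('1a2b', 16)  # 6699; chunk-size lines are hex
    parse_length('0,0')       # 0, with a warning (malformed length)
    parse_length('9' * 30)    # 0, since int() would return a long
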
23 har.py → pcap2har/har.py
@@ -1,10 +1,11 @@
-import http
-import json
-
'''
functions and classes for generating HAR data from parsed http data
'''
+import http
+import json
+
+
# json_repr for HTTP header dicts
def header_json_repr(d):
return [
@@ -14,6 +15,7 @@ def header_json_repr(d):
} for k, v in sorted(d.iteritems())
]
+
def query_json_repr(d):
# d = {string: [string]}
# we need to print all values of the list
@@ -26,6 +28,7 @@ def query_json_repr(d):
})
return output
+
# add json_repr methods to http classes
def HTTPRequestJsonRepr(self):
'''
@@ -43,37 +46,41 @@ def HTTPRequestJsonRepr(self):
}
http.Request.json_repr = HTTPRequestJsonRepr
+
def HTTPResponseJsonRepr(self):
- content = {
- 'size': len(self.body),
- 'compression': len(self.body) - len(self.raw_body),
+ content = {
+ 'size': self.body_length,
'mimeType': self.mimeType
}
+ if self.compression_amount is not None:
+ content['compression'] = self.compression_amount
if self.text:
if self.encoding:
content['text'] = self.text
content['encoding'] = self.encoding
else:
- content['text'] = self.text.encode('utf8') # must transcode to utf-8
+ content['text'] = self.text.encode('utf8') # must transcode to utf-8
return {
'status': int(self.msg.status),
'statusText': self.msg.reason,
'httpVersion': self.msg.version,
'cookies': [],
'headersSize': -1,
- 'bodySize': len(self.msg.body),
+ 'bodySize': self.raw_body_length,
'redirectURL': self.msg.headers['location'] if 'location' in self.msg.headers else '',
'headers': header_json_repr(self.msg.headers),
'content': content,
}
http.Response.json_repr = HTTPResponseJsonRepr
+
# custom json encoder
class JsonReprEncoder(json.JSONEncoder):
'''
Custom Json Encoder that attempts to call json_repr on every object it
encounters.
'''
+
def default(self, obj):
if hasattr(obj, 'json_repr'):
return obj.json_repr()
5 pcap2har/http/__init__.py
@@ -0,0 +1,5 @@
+from message import Message
+from request import Request
+from response import Response
+from flow import Flow
+from common import Error
6 http/__init__.py → pcap2har/http/common.py
@@ -1,14 +1,10 @@
-from message import Message
-from request import Request
-from response import Response
-from flow import Flow
-
class Error(Exception):
'''
Raised when HTTP cannot be parsed from the given data.
'''
pass
+
class DecodingError(Error):
'''
Raised when encoded HTTP data cannot be decompressed/decoded/whatever.
103 http/flow.py → pcap2har/http/flow.py
@@ -1,20 +1,25 @@
import logging
import dpkt
-import http
-from http import Request, Response
-class Flow:
+import common as http
+from request import Request
+from response import Response
+from .. import settings
+
+
+class Flow(object):
'''
- Parses a TCPFlow into HTTP request/response pairs. Or not, depending on the
- integrity of the flow. After __init__, self.pairs contains a list of
- MessagePair's. Requests are paired up with the first response that occured
- after them which has not already been paired with a previous request. Responses
- that don't match up with a request are ignored. Requests with no response are
- paired with None.
+ Parses a TCPFlow into HTTP request/response pairs. Or not, depending
+ on the integrity of the flow. After __init__, self.pairs contains a
+ list of MessagePair's. Requests are paired up with the first response
+    that occurred after them which has not already been paired with a
+ previous request. Responses that don't match up with a request are
+ ignored. Requests with no response are paired with None.
Members:
- pairs = [MessagePair], where ei
+ pairs = [MessagePair], where either request or response might be None
'''
+
def __init__(self, tcpflow):
'''
tcpflow = tcp.Flow
@@ -26,44 +31,62 @@ def __init__(self, tcpflow):
if not success:
# flow is not HTTP
raise HTTPError('TCP Flow does not contain HTTP')
+ # now optionally clear the data on tcpflow
+ if settings.drop_bodies:
+ tcpflow.fwd.clear_data()
+ tcpflow.rev.clear_data()
# match up requests with nearest response that occurred after them
- # first request is the benchmark; responses before that are irrelevant for now
+ # first request is the benchmark; responses before that
+ # are irrelevant for now
self.pairs = []
+ # determine a list of responses that we can match up with requests,
+ # padding the list with None where necessary.
try:
- # find the first response to a request we know about, that is, the first response after the first request
- first_response_index = find_index(lambda response: response.ts_start > requests[0].ts_start, responses)
+ # find the first response to a request we know about,
+ # that is, the first response after the first request
+ first_response_index = find_index(
+ lambda response: response.ts_start > requests[0].ts_start,
+ responses
+ )
+ except LookupError:
+ # no responses at all
+ pairable_responses = [None for i in requests]
+ else:
# these are responses that match up with our requests
pairable_responses = responses[first_response_index:]
- if len(requests) > len(pairable_responses): # if there are more requests than responses
+ # if there are more requests than responses...
+ if len(requests) > len(pairable_responses):
# pad responses with None
- pairable_responses.extend( [None for i in range(len(requests) - len(pairable_responses))] )
- # if there are more responses, we would just ignore them anyway, which zip does for us
- # create MessagePair's
- connected = False # whether connection timing has been taken into account in a request yet
- for req, resp in zip(requests, responses):
- if not req:
- logging.warning("Request is missing.")
- continue
- if not connected and tcpflow.handshake:
- req.ts_connect = tcpflow.handshake[0].ts
- connected = True
- else:
- req.ts_connect = req.ts_start
- self.pairs.append(MessagePair(req, resp))
- except LookupError:
- # there were no responses after the first request
- # there's nothing we can do
- logging.warning("Request has no reponse.")
+ pairable_responses.extend(
+ [None for i in range(len(requests) - len(pairable_responses))]
+ )
+ # if there are more responses, we would just ignore them anyway,
+ # which zip does for us
+ # create MessagePair's
+ connected = False # if conn. timing has been added to a request yet
+ for req, resp in zip(requests, pairable_responses):
+ if not req:
+ logging.warning('Request is missing.')
+ continue
+ if not connected and tcpflow.handshake:
+ req.ts_connect = tcpflow.handshake[0].ts
+ connected = True
+ else:
+ req.ts_connect = req.ts_start
+ self.pairs.append(MessagePair(req, resp))
-class MessagePair:
+
+class MessagePair(object):
'''
An HTTP Request/Response pair/transaction/whatever. Loosely corresponds to
a HAR entry.
'''
+
def __init__(self, request, response):
self.request = request
self.response = response
+
def gather_messages(MessageClass, tcpdir):
'''
Attempts to construct a series of MessageClass objects from the data. The
@@ -84,22 +107,23 @@ def gather_messages(MessageClass, tcpdir):
pointer = 0 # starting index of data that MessageClass should look at
# while there's data left
while pointer < len(tcpdir.data):
- curr_data = tcpdir.data[pointer:pointer+200] # debug var
+ #curr_data = tcpdir.data[pointer:pointer+200] # debug var
try:
msg = MessageClass(tcpdir, pointer)
- except dpkt.Error as error: # if the message failed
- if pointer == 0: # if this is the first message
+ except dpkt.Error as error: # if the message failed
+ if pointer == 0: # if this is the first message
raise http.Error('Invalid http')
- else: # we're done parsing messages
- logging.warning("We got a dpkt.Error %s, but we are done." % error)
- break # out of the loop
+ else: # we're done parsing messages
+ logging.warning('We got a dpkt.Error %s, but we are done.' % error)
+ break # out of the loop
except:
raise
# ok, all good
messages.append(msg)
pointer += msg.data_consumed
return messages
+
def parse_streams(request_stream, response_stream):
'''
attempts to construct http.Request/Response's from the corresponding
@@ -121,6 +145,7 @@ def parse_streams(request_stream, response_stream):
else:
return True, requests, responses
+
def find_index(f, seq):
'''
returns the index of the first item in seq for which predicate f returns
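
The pairing rule that Flow.__init__ now implements, distilled into a standalone sketch; the request/response objects only need a ts_start attribute here, and find_index is the helper defined above:

    def pair_up(requests, responses):
        # find the first response after the first request; anything
        # earlier answers a request we never saw
        try:
            first = find_index(
                lambda r: r.ts_start > requests[0].ts_start, responses)
        except LookupError:
            pairable = [None] * len(requests)  # no responses at all
        else:
            pairable = responses[first:]
            # pad with None so unanswered requests still pair up;
            # zip drops any surplus responses for us
            pairable += [None] * (len(requests) - len(pairable))
        return zip(requests, pairable)
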
23 http/message.py → pcap2har/http/message.py
@@ -1,4 +1,4 @@
-class Message:
+class Message(object):
'''
Contains a dpkt.http.Request/Response, as well as other data required to
build a HAR, including (mostly) start and end time.
@@ -9,9 +9,10 @@ class Message:
* seq_end: first sequence number past Message's data (slice-style indices)
* ts_start: when Message started arriving (dpkt timestamp)
* ts_end: when Message had fully arrived (dpkt timestamp)
- * body_raw: body before compression is taken into account
+ * raw_body: body before compression is taken into account
* tcpdir: The tcp.Direction corresponding to the HTTP message
'''
+
def __init__(self, tcpdir, pointer, msgclass):
'''
Args:
@@ -22,8 +23,9 @@ def __init__(self, tcpdir, pointer, msgclass):
self.tcpdir = tcpdir
# attempt to parse as http. let exception fall out to caller
self.msg = msgclass(tcpdir.data[pointer:])
- self.data = self.msg.data
- self.data_consumed = (len(tcpdir.data) - pointer) - len(self.data)
+ self.data_consumed = (len(tcpdir.data) - pointer) - len(self.msg.data)
+ # save memory by deleting data attribute; it's useless
+ self.msg.data = None
# calculate sequence numbers of data
self.seq_start = tcpdir.byte_to_seq(pointer)
self.seq_end = tcpdir.byte_to_seq(pointer + self.data_consumed) # past-the-end
@@ -32,3 +34,16 @@ def __init__(self, tcpdir, pointer, msgclass):
self.ts_end = tcpdir.seq_final_arrival(self.seq_end - 1)
# get raw body
self.raw_body = self.msg.body
+ self.__pointer = pointer
+ # Access self.__raw_msg via raw_msg @property, which will set it if None
+ self.__raw_msg = None
+
+ @property
+ def raw_msg(self):
+ '''
+ Returns the message (including header) as a byte string.
+ '''
+ if not self.__raw_msg:
+ self.__raw_msg = self.tcpdir.data[
+ self.__pointer:(self.__pointer+self.data_consumed)]
+ return self.__raw_msg
10 http/request.py → pcap2har/http/request.py
@@ -1,7 +1,9 @@
import urlparse
-#import dpkt.http this is buggy
-import dpkt_http_replacement as dpkt_http
-import http
+
+# dpkt.http is buggy, so we use our modified replacement
+from .. import dpkt_http_replacement as dpkt_http
+import message as http
+
class Request(http.Message):
'''
@@ -12,6 +14,7 @@ class Request(http.Message):
* fullurl: Full URL, with all components.
* url: Full URL, but without fragments. (that's what HAR wants)
'''
+
def __init__(self, tcpdir, pointer):
http.Message.__init__(self, tcpdir, pointer, dpkt_http.Request)
# get query string. it's the URL after the first '?'
@@ -21,4 +24,3 @@ def __init__(self, tcpdir, pointer):
self.fullurl = fullurl.geturl()
self.url, frag = urlparse.urldefrag(self.fullurl)
self.query = urlparse.parse_qs(uri.query, keep_blank_values=True)
-
101 http/response.py → pcap2har/http/response.py
@@ -1,21 +1,30 @@
import gzip
import zlib
import cStringIO
-import dpkt_http_replacement as dpkt_http
-import http
-from mediatype import MediaType
-import logging as log
-#from http import DecodingError # exception class from parent module
from base64 import encodestring as b64encode
+import logging
-# try to import UnicodeDammit from BeautifulSoup
+from .. import dpkt_http_replacement as dpkt_http
+from ..mediatype import MediaType
+from .. import settings
+
+import common as http
+import message
+
+# try to import UnicodeDammit from BeautifulSoup,
+# starting with system and defaulting to included version
# otherwise, set the name to None
try:
- from BeautifulSoup import UnicodeDammit
+ try:
+ from BeautifulSoup import UnicodeDammit
+ except ImportError:
+ from ..BeautifulSoup import UnicodeDammit
except ImportError:
UnicodeDammit = None
+    logging.warning('Can\'t find BeautifulSoup, unicode is more likely to be '
+                    'misinterpreted')
-class Response(http.Message):
+class Response(message.Message):
'''
HTTP response.
Members:
@@ -26,19 +35,43 @@ class Response(http.Message):
* encoding: 'base64' if self.text is base64 encoded binary data, else None
* compression: string, compression type
* original_encoding: string, original text encoding/charset/whatever
+ * body_length: int, length of body, uncompressed if possible/applicable
+ * compression_amount: int or None, difference between lengths of
+ uncompressed data and raw data. None if no compression or we're not sure
'''
+
def __init__(self, tcpdir, pointer):
- http.Message.__init__(self, tcpdir, pointer, dpkt_http.Response)
- # uncompress body if necessary
- self.handle_compression()
+ message.Message.__init__(self, tcpdir, pointer, dpkt_http.Response)
# get mime type
if 'content-type' in self.msg.headers:
self.mediaType = MediaType(self.msg.headers['content-type'])
else:
self.mediaType = MediaType('application/x-unknown-content-type')
self.mimeType = self.mediaType.mimeType()
- # try to get out unicode
- self.handle_text()
+ # first guess at body size. handle_compression might
+ # modify it, but this has to be before clear_body
+ self.body_length = len(self.msg.body)
+ self.compression_amount = None
+ self.text = None
+ # handle body stuff
+ if settings.drop_bodies:
+ self.clear_body()
+ else:
+ # uncompress body if necessary
+ self.handle_compression()
+ # try to get out unicode
+ self.handle_text()
+
+ def clear_body(self):
+ '''
+ Clear response body to save memory
+
+ http.Flow has to do most of the work (after any other responses are
+ parsed), here we just want to get rid of any references.
+ '''
+ self.body = self.raw_body = None
+ self.msg.body = None
+
def handle_compression(self):
'''
Sets self.body to the http decoded response data. Sets compression to
@@ -59,7 +92,8 @@ def handle_compression(self):
raise http.DecodingError('zlib failed to gunzip HTTP data')
except:
# who knows what else it might raise
- raise http.DecodingError("failed to gunzip HTTP data, don't know why")
+ raise http.DecodingError(
+ 'failed to gunzip HTTP data, don\'t know why')
# handle deflate
elif encoding == 'deflate':
try:
@@ -68,13 +102,18 @@ def handle_compression(self):
# do raw deflate. See: http://bugs.python.org/issue5784
self.body = zlib.decompress(self.raw_body, -15)
except zlib.error:
- raise http.DecodingError('zlib failed to undeflate HTTP data')
+ raise http.DecodingError(
+ 'zlib failed to undeflate HTTP data')
elif encoding == 'compress' or encoding == 'x-compress':
# apparently nobody uses this, so basically just ignore it
self.body = self.raw_body
elif encoding == 'identity':
# no compression
self.body = self.raw_body
+ elif 'sdch' in encoding:
+ # ignore sdch, a Google proposed modification to HTTP/1.1
+ # not in RFC 2616.
+ self.body = self.raw_body
else:
# I'm pretty sure the above are the only allowed encoding types
# see RFC 2616 sec 3.5 (http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.5)
@@ -83,6 +122,9 @@ def handle_compression(self):
# no compression
self.compression = 'identity'
self.body = self.raw_body
+ self.body_length = len(self.body)
+ # comp_amount is 0 when no compression, which may or may not be to spec
+ self.compression_amount = self.body_length - len(self.raw_body)
def handle_text(self):
'''
@@ -91,7 +133,6 @@ def handle_text(self):
to unicode if possible. Must come after handle_compression, and after
self.mediaType is valid.
'''
- self.text = None
self.encoding = None
# if the body is text
if (self.mediaType and
@@ -103,41 +144,51 @@ def handle_text(self):
override_encodings = [self.mediaType.params['charset']]
else:
override_encodings = []
- # if there even is data (otherwise, dammit.originalEncoding might be None)
+ # if there even is data (otherwise,
+ # dammit.originalEncoding might be None)
if self.body != '':
if UnicodeDammit:
- # honestly, I don't mind not abiding by RFC 2023. UnicodeDammit just
- # does what makes sense, and if the content is remotely standards-
- # compliant, it will do the right thing.
+ # honestly, I don't mind not abiding by RFC 2023.
+ # UnicodeDammit just does what makes sense, and if the
+ # content is remotely standards-compliant, it will do the
+ # right thing.
dammit = UnicodeDammit(self.body, override_encodings)
# if unicode was found
if dammit.unicode:
self.text = dammit.unicode
self.originalEncoding = dammit.originalEncoding
else:
# unicode could not be decoded, at all
- # HAR can't write data, but body might still be useful as-is
+ # HAR can't write data, but body might still
+ # be useful as-is
pass
else:
- # try the braindead version, just guess content-type or utf-8
+ # try the stupid version, just guess content-type or utf-8
u = None
# try our list of encodings + utf8 with strict errors
for e in override_encodings + ['utf8', 'iso-8859-1']:
try:
u = self.body.decode(e, 'strict')
self.originalEncoding = e
- break # if ^^ didn't throw, we're done
+ break # if ^^ didn't throw, we're done
except UnicodeError:
pass
- # if none of those worked, try utf8 with 'replace' error mode
+ # if none of those worked, try utf8
+ # with 'replace' error mode
if not u:
# unicode has failed
u = self.body.decode('utf8', 'replace')
- self.originalEncoding = None # ???
+ self.originalEncoding = None # ???
self.text = u or None
else:
# body is not text
# base64 encode it and set self.encoding
# TODO: check with list that this is right
self.text = b64encode(self.body)
self.encoding = 'base64'
+
+ @property
+ def raw_body_length(self):
+ if self.compression_amount is None:
+ return self.body_length
+ return self.body_length - self.compression_amount
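
For reference, the content-encoding cases handle_compression walks through, condensed into a standalone Python 2 sketch (without the class's DecodingError wrapping):

    import cStringIO
    import gzip
    import zlib

    def decode_body(raw, encoding):
        if encoding in ('gzip', 'x-gzip'):
            return gzip.GzipFile(fileobj=cStringIO.StringIO(raw)).read()
        elif encoding == 'deflate':
            try:
                return zlib.decompress(raw)
            except zlib.error:
                # raw deflate without a zlib header; see
                # http://bugs.python.org/issue5784
                return zlib.decompress(raw, -15)
        # compress, x-compress, identity, and sdch all pass through
        return raw
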
69 httpsession.py → pcap2har/httpsession.py
@@ -5,14 +5,15 @@
from datetime import datetime
import dpkt
-import logging as log
+import logging
from pcaputil import ms_from_timedelta, ms_from_dpkt_time
from pagetracker import PageTracker
import http
import settings
-class Entry:
+
+class Entry(object):
'''
represents an HTTP request/response in a form suitable for writing to a HAR
file.
@@ -29,36 +30,42 @@ class Entry:
* time_waiting
* time_receiving
'''
+
def __init__(self, request, response):
self.request = request
self.response = response
self.pageref = None
self.ts_start = int(request.ts_connect*1000)
- self.startedDateTime = datetime.fromtimestamp(request.ts_connect)
- endedDateTime = datetime.fromtimestamp(response.ts_end)
- self.total_time = ms_from_timedelta(
- endedDateTime - self.startedDateTime # plus connection time, someday
- )
+ self.startedDateTime = datetime.utcfromtimestamp(request.ts_connect)
# calculate other timings
self.time_blocked = -1
self.time_dnsing = -1
- self.time_connecting = ms_from_dpkt_time(request.ts_start -
- request.ts_connect)
- self.time_sending = \
- ms_from_dpkt_time(request.ts_end - request.ts_start)
- self.time_waiting = \
- ms_from_dpkt_time(response.ts_start - request.ts_end)
- self.time_receiving = \
- ms_from_dpkt_time(response.ts_end - response.ts_start)
- # check if timing calculations are consistent
- if self.time_sending + self.time_waiting + self.time_receiving != self.total_time:
- pass
+ self.time_connecting = (
+ ms_from_dpkt_time(request.ts_start - request.ts_connect))
+ self.time_sending = (
+ ms_from_dpkt_time(request.ts_end - request.ts_start))
+ if response is not None:
+ self.time_waiting = (
+ ms_from_dpkt_time(response.ts_start - request.ts_end))
+ self.time_receiving = (
+ ms_from_dpkt_time(response.ts_end - response.ts_start))
+ endedDateTime = datetime.utcfromtimestamp(response.ts_end)
+ self.total_time = ms_from_timedelta(
+ endedDateTime - self.startedDateTime
+ )
+ else:
+ # this can happen if the request never gets a response
+ self.time_waiting = -1
+ self.time_receiving = -1
+ self.total_time = -1
+
def json_repr(self):
'''
return a JSON serializable python object representation of self.
'''
d = {
- 'startedDateTime': self.startedDateTime.isoformat() + 'Z', # assume time is in UTC
+ # Z means time is in UTC
+ 'startedDateTime': self.startedDateTime.isoformat() + 'Z',
'time': self.total_time,
'request': self.request,
'response': self.response,
@@ -75,6 +82,7 @@ def json_repr(self):
if self.pageref:
d['pageref'] = self.pageref
return d
+
def add_dns(self, dns_query):
'''
Adds the info from the dns.Query to this entry
@@ -84,13 +92,16 @@ def add_dns(self, dns_query):
'''
self.time_dnsing = ms_from_dpkt_time(dns_query.duration())
+
class UserAgentTracker(object):
'''
Keeps track of how many uses each user-agent header receives, and provides
a function for finding the most-used one.
'''
+
def __init__(self):
- self.data = {} # {user-agent string: number of uses}
+ self.data = {} # {user-agent string: number of uses}
+
def add(self, ua_string):
'''
Either increments the use-count for the user-agent string, or creates a
@@ -100,6 +111,7 @@ def add(self, ua_string):
self.data[ua_string] += 1
else:
self.data[ua_string] = 1
+
def dominant_user_agent(self):
'''
Returns the agent string with the most uses.
@@ -112,6 +124,7 @@ def dominant_user_agent(self):
# return the string from the key-value pair with the biggest value
return max(self.data.iteritems(), key=lambda v: v[1])[0]
+
class HttpSession(object):
'''
Represents all http traffic from within a pcap.
@@ -122,19 +135,20 @@ class HttpSession(object):
* flows = [http.Flow]
* entries = [Entry], all http request/response pairs
'''
+
def __init__(self, packetdispatcher):
'''
parses http.flows from packetdispatcher, and parses those for HAR info
'''
# parse http flows
- self.flows= []
- for flow in packetdispatcher.tcp.flowdict.itervalues():
+ self.flows = []
+ for flow in packetdispatcher.tcp.flows():
try:
self.flows.append(http.Flow(flow))
except http.Error as error:
- log.warning(error)
+ logging.warning(error)
except dpkt.dpkt.Error as error:
- log.warning(error)
+ logging.warning(error)
# combine the messages into a list
pairs = reduce(lambda p, f: p+f.pairs, self.flows, [])
# set-up
@@ -157,8 +171,9 @@ def __init__(self, packetdispatcher):
# if msg.request has a referer, keep track of that, too
if self.page_tracker:
entry.pageref = self.page_tracker.getref(entry)
- # add it to the list
- self.entries.append(entry)
+ # add it to the list, if we're supposed to keep it.
+ if entry.response or settings.keep_unfulfilled_requests:
+ self.entries.append(entry)
self.user_agent = self.user_agents.dominant_user_agent()
# handle DNS AFTER sorting
# this algo depends on first appearance of a name
@@ -180,7 +195,7 @@ def json_repr(self):
'''
d = {
'log': {
- 'version' : '1.1',
+ 'version': '1.1',
'creator': {
'name': 'pcap2har',
'version': '0.1'
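
How the Entry timings decompose, with illustrative numbers:

    # request.ts_connect = 10.000  (start of TCP handshake)
    # request.ts_start   = 10.050  (first byte of request)
    # request.ts_end     = 10.060  (last byte of request)
    # response.ts_start  = 10.200  (first byte of response)
    # response.ts_end    = 10.450  (last byte of response)
    #
    # via ms_from_dpkt_time: time_connecting = 50, time_sending = 10,
    # time_waiting = 140, time_receiving = 250 (all ms), and total_time
    # spans ts_connect..response.ts_end = 450 ms. With no response,
    # time_waiting, time_receiving and total_time are all -1.
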
28 mediatype.py → pcap2har/mediatype.py
@@ -1,4 +1,6 @@
import re
+import logging
+
class MediaType(object):
'''
@@ -10,26 +12,33 @@ class MediaType(object):
* subtype: string, the mime subtype
* params: {string: string}. Maybe should be {string: [string]}?
'''
+
# RE for parsing media types. type and subtype are alpha-numeric strings
# possibly with '-'s. Then the optional parameter list: names are same type
# of string as the types above, values are pretty much anything but another
# semicolon
mediatype_re = re.compile(
- r'^([\w\-+.]+)/([\w\-+.]+)((?:\s*;\s*[\w\-]+=[^;]+)*)\s*$'
+ r'^([\w\-+.]+)/([\w\-+.]+)((?:\s*;\s*[\w\-]+=[^;]+)*);?\s*$'
)
+
# RE for parsing name-value pairs
nvpair_re = re.compile(r'^\s*([\w\-]+)=([^;\s]+)\s*$')
- # constructor
+
def __init__(self, data):
'''
Args:
data = string, the media type string
'''
+ if not data:
+ logging.warning(
+ 'Setting empty media type to x-unknown-content-type')
+ self.set_unknown()
+ return
match = self.mediatype_re.match(data)
if match:
# get type/subtype
self.type = match.group(1).lower()
- self.subtype= match.group(2).lower()
+ self.subtype = match.group(2).lower()
# params
self.params = {}
param_str = match.group(3) # we know this is well-formed, except for extra whitespace
@@ -41,14 +50,23 @@ def __init__(self, data):
self.params[pairmatch.group(1)] = pairmatch.group(2)
pass
else:
- raise ValueError('invalid media type string: ' + data)
+ logging.warning('Invalid media type string: "%s"' % data)
+ self.set_unknown()
+
+ def set_unknown(self):
+ self.type = 'application'
+ self.subtype = 'x-unknown-content-type'
+ self.params = {}
+
def mimeType(self):
return '%s/%s' % (self.type, self.subtype)
+
def __str__(self):
result = self.mimeType()
- for n,v in self.params.iteritems():
+ for n, v in self.params.iteritems():
result += '; %s=%s' % (n, v)
return result
+
def __repr__(self):
return 'MediaType(%s)' % self.__str__()
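
What the loosened regex and the new fallback accept, by example:

    MediaType('text/html; charset=utf-8')   # type 'text', subtype 'html',
                                            # params {'charset': 'utf-8'}
    MediaType('text/html; charset=utf-8;')  # trailing ';' now tolerated
    MediaType('not a media type')           # warns and falls back to
                                            # application/x-unknown-content-type
    MediaType('')                           # same fallback, where the old
                                            # code raised ValueError
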
6 packetdispatcher.py → pcap2har/packetdispatcher.py
@@ -2,6 +2,7 @@
import tcp
import udp
+
class PacketDispatcher:
'''
takes a series of dpkt.Packet's and calls callbacks based on their type
@@ -14,9 +15,11 @@ class PacketDispatcher:
* flowbuilder = tcp.FlowBuilder
* udp = udp.Processor
'''
+
def __init__(self):
self.tcp = tcp.FlowBuilder()
self.udp = udp.Processor()
+
def add(self, ts, buf, eth):
'''
ts = dpkt timestamp
@@ -26,7 +29,7 @@ def add(self, ts, buf, eth):
#decide based on pkt.data
# if it's IP...
if (isinstance(eth.data, dpkt.ip.IP) or
- isinstance(eth.data, dpkt.ip6.IP6)):
+ isinstance(eth.data, dpkt.ip6.IP6)):
ip = eth.data
# if it's TCP
if isinstance(ip.data, dpkt.tcp.TCP):
@@ -35,6 +38,7 @@ def add(self, ts, buf, eth):
# if it's UDP...
elif isinstance(ip.data, dpkt.udp.UDP):
self.udp.add(ts, ip.data)
+
def finish(self):
#This is a hack, until tcp.Flow no longer has to be `finish()`ed
self.tcp.finish()
35 pagetracker.py → pcap2har/pagetracker.py
@@ -3,14 +3,15 @@ class Page(object):
Members:
* pageref
* url = string or None
- * root_document = entry or None
+ * root_document = entry or None
* startedDateTime
* user_agent = string, UA of program requesting page
* title = url
* referrers = set([string]), urls that have referred to this page, directly
or indirectly. If anything refers to them, they also belong on this page
- * last_entry = entry, the last entry to be added
+ * last_entry = entry, the last entry to be added
'''
+
def __init__(self, pageref, entry, is_root_doc=True):
'''
Creates new page with passed ref and data from entry
@@ -33,17 +34,20 @@ def __init__(self, pageref, entry, is_root_doc=True):
self.referrers.add(entry.request.msg.headers['referer'])
self.url = None # can't guarantee it's the referrer
self.title = 'unknown title'
+
def has_referrer(self, ref):
'''
Returns whether the passed ref might be referring to an url in this page
'''
return ref == self.url or ref in self.referrers
+
def add(self, entry):
'''
Adds the entry to the page's data, whether it likes it or not
'''
self.last_entry = entry
self.referrers.add(entry.request.url)
+
def json_repr(self):
return {
'id': self.pageref,
@@ -58,18 +62,22 @@ def json_repr(self):
'onLoad': -1
}
+
def is_root_document(entry):
'''
guesses whether the entry is from the root document of a web page
'''
# guess based on media type
- mt = entry.response.mediaType
- if mt.type == 'text':
- if mt.subtype in ['html', 'xhtml', 'xml']:
- # probably...
- return True
+ if entry.response: # might be None
+ mt = entry.response.mediaType
+ if mt.type == 'text':
+ if mt.subtype in ['html', 'xhtml', 'xml']:
+ # probably...
+ return True
+ # else, guess by request url?
return False
+
class PageTracker(object):
'''
Groups http entries into pages.
@@ -79,20 +87,22 @@ class PageTracker(object):
locality). Basically all it has to do is sort entries into buckets by any
means available.
'''
+
def __init__(self):
- self.page_number = 0 # used for generating pageids
- self.pages = [] # [Page]
+ self.page_number = 0 # used for generating pageids
+ self.pages = [] # [Page]
+
def getref(self, entry):
'''
takes an Entry and returns a pageref.
Entries must be passed in by order of arrival
'''
# extract interesting information all at once
- req = entry.request # all the interesting stuff is in the request
+ req = entry.request # all the interesting stuff is in the request
referrer = req.msg.headers.get('referer')
user_agent = req.msg.headers.get('user-agent')
- matched_page = None # page we added the request to
+ matched_page = None # page we added the request to
# look through pages for matches
for page in self.pages:
# check user agent
@@ -110,6 +120,7 @@ def getref(self, entry):
else:
# make a new page
return self.new_ref(entry)
+
def new_ref(self, entry):
'''
Internal. Wraps creating a new pages entry. Returns the new ref
@@ -120,9 +131,11 @@ def new_ref(self, entry):
is_root_document(entry))
self.pages.append(new_page)
return new_page.pageref
+
def new_id(self):
result = 'page_%d' % self.page_number
self.page_number += 1
return result
+
def json_repr(self):
return sorted(self.pages)
47 pcap.py → pcap2har/pcap.py
@@ -1,10 +1,8 @@
+import logging
+
import dpkt
-from pcaputil import *
-from socket import inet_ntoa
-import logging as log
-import os
-import shutil
+from pcaputil import *
import tcp
from packetdispatcher import PacketDispatcher
@@ -28,24 +26,26 @@ def ParsePcap(dispatcher, filename=None, reader=None):
try:
pcap = ModifiedReader(f)
except dpkt.dpkt.Error as e:
- log.warning('failed to parse pcap file %s' % filename)
+ logging.warning('failed to parse pcap file %s' % filename)
return
elif reader:
pcap = reader
else:
raise 'function ParsePcap needs either a filename or pcap reader'
- #now we have the reader; read from it
- packet_count = 1 # start from 1 like Wireshark
+ # now we have the reader; read from it
+ packet_count = 1 # start from 1 like Wireshark
errors = [] # store errors for later inspection
try:
for packet in pcap:
- ts = packet[0] # timestamp
- buf = packet[1] # frame data
- hdr = packet[2] # libpcap header
+ ts = packet[0] # timestamp
+ buf = packet[1] # frame data
+ hdr = packet[2] # libpcap header
# discard incomplete packets
if hdr.caplen != hdr.len:
# log packet number so user can diagnose issue in wireshark
- log.warning('ParsePcap: discarding incomplete packet, # %d' % packet_count)
+ logging.warning(
+ 'ParsePcap: discarding incomplete packet, #%d' %
+ packet_count)
continue
# parse packet
try:
@@ -60,12 +60,23 @@ def ParsePcap(dispatcher, filename=None, reader=None):
# catch errors from this packet
except dpkt.Error as e:
errors.append((packet, e, packet_count))
- log.warning('Error parsing packet: %s. On packet #%s' %
- (e, packet_count))
+ logging.warning(
+ 'Error parsing packet: %s. On packet #%d' %
+ (e, packet_count))
packet_count += 1
except dpkt.dpkt.NeedData as error:
- log.warning(error)
- log.warning('A packet in the pcap file was too short, '
- 'debug_pkt_count=%d' % debug_pkt_count)
+ logging.warning(error)
+ logging.warning(
+ 'A packet in the pcap file was too short, packet_count=%d' %
+ packet_count)
errors.append((None, error))
-
+
+
+def EasyParsePcap(filename=None, reader=None):
+ '''
+ Like ParsePcap, but makes and returns a PacketDispatcher for you.
+ '''
+ dispatcher = PacketDispatcher()
+ ParsePcap(dispatcher, filename=filename, reader=reader)
+ dispatcher.finish()
+ return dispatcher
74 pcaputil.py → pcap2har/pcaputil.py
@@ -3,19 +3,38 @@
'''
import dpkt
-from socket import inet_ntoa
+import resource
+import sys
+
+# Re-implemented here only because it's missing on AppEngine.
+def inet_ntoa(packed):
+ '''Custom implementation of inet_ntoa'''
+ if not isinstance(packed, str) or len(packed) != 4:
+        raise ValueError('Argument to inet_ntoa must be a string of length 4')
+ return '.'.join(str(ord(c)) for c in packed)
+
def friendly_tcp_flags(flags):
'''
returns a string containing a user-friendly representation of the tcp flags
'''
# create mapping of flags to string repr's
- d = {dpkt.tcp.TH_FIN:'FIN', dpkt.tcp.TH_SYN:'SYN', dpkt.tcp.TH_RST:'RST', dpkt.tcp.TH_PUSH:'PUSH', dpkt.tcp.TH_ACK:'ACK', dpkt.tcp.TH_URG:'URG', dpkt.tcp.TH_ECE:'ECE', dpkt.tcp.TH_CWR:'CWR'}
+ d = {
+ dpkt.tcp.TH_FIN: 'FIN',
+ dpkt.tcp.TH_SYN: 'SYN',
+ dpkt.tcp.TH_RST: 'RST',
+ dpkt.tcp.TH_PUSH: 'PUSH',
+ dpkt.tcp.TH_ACK: 'ACK',
+ dpkt.tcp.TH_URG: 'URG',
+ dpkt.tcp.TH_ECE: 'ECE',
+ dpkt.tcp.TH_CWR: 'CWR'
+ }
#make a list of the flags that are activated
active_flags = filter(lambda t: t[0] & flags, d.iteritems())
#join all their string representations with '|'
return '|'.join(t[1] for t in active_flags)
+
def friendly_socket(sock):
'''
returns a socket where the addresses are converted by inet_ntoa into
@@ -29,11 +48,13 @@ def friendly_socket(sock):
sock[1][1]
)
-def friendly_data(str):
+
+def friendly_data(data):
'''
convert (possibly binary) data into a form readable by people on terminals
'''
- return `str`
+ return `data`
+
def ms_from_timedelta(td):
'''
@@ -44,12 +65,14 @@ def ms_from_timedelta(td):
'''
return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**3
+
def ms_from_dpkt_time(td):
'''
Get milliseconds from a dpkt timestamp. This should probably only really be
done on a number gotten from subtracting two dpkt timestamps.
'''
- return int(td * 1000) # um, I guess
+ return int(td * 1000)
+
class ModifiedReader(object):
'''
@@ -61,8 +84,16 @@ class ModifiedReader(object):
'''
def __init__(self, fileobj):
- self.name = fileobj.name
- self.fd = fileobj.fileno()
+ if hasattr(fileobj, 'name'):
+ self.name = fileobj.name
+ else:
+ self.name = '<unknown>'
+
+ if hasattr(fileobj, 'fileno'):
+ self.fd = fileobj.fileno()
+ else:
+ self.fd = None
+
self.__f = fileobj
buf = self.__f.read(dpkt.pcap.FileHdr.__hdr_len__)
self.__fh = dpkt.pcap.FileHdr(buf)
@@ -108,3 +139,32 @@ def __iter__(self):
hdr = self.__ph(buf)
buf = self.__f.read(hdr.caplen)
yield (hdr.tv_sec + (hdr.tv_usec / 1000000.0), buf, hdr)
+
+
+class FakeStream(object):
+ '''
+ Emulates a tcp.Direction with a predetermined data stream.
+
+ Useful for debugging http message classes.
+ '''
+ def __init__(self, data):
+ self.data = data
+ def byte_to_seq(self, n):
+ return n
+ def seq_final_arrival(self, n):
+ return None
+
+
+class FakeFlow(object):
+ '''
+ Emulates a tcp.Flow, with two FakeStream's.
+ '''
+ def __init__(self, fwd, rev):
+ self.fwd = fwd
+ self.rev = rev
+
+def print_rusage():
+ rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+ if sys.platform == 'darwin':
+ rss /= 1024 # Mac OSX returns rss in bytes, not KiB
+ print 'max_rss:', rss, 'KiB'
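
The reimplemented inet_ntoa matches socket.inet_ntoa for well-formed input:

    inet_ntoa('\x7f\x00\x00\x01')  # '127.0.0.1'
    inet_ntoa('\xc0\xa8\x00\x01')  # '192.168.0.1'
    inet_ntoa('12345')             # ValueError: wrong length
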
13 pcap2har/settings.py
@@ -0,0 +1,13 @@
+process_pages = True
+drop_bodies = False # bodies of http responses, that is
+
+# Whether HTTP parsing should care whether the content length matches the
+# content-length header.
+strict_http_parse_body = False
+
+# Whether to pad missing data in TCP flows with 0 bytes
+pad_missing_tcp_data = False
+
+# Whether to keep requests with missing responses. Could break consumers
+# that assume every request has a response.
+keep_unfulfilled_requests = False
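
Since these are plain module globals, library users set them the same way main.py copies its options over, before any parsing happens (a sketch; the path is a placeholder):

    from pcap2har import pcap, settings

    settings.drop_bodies = True               # discard response bodies
    settings.keep_unfulfilled_requests = True
    dispatcher = pcap.EasyParsePcap(filename='input.pcap')
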
10 sortedcollection.py → pcap2har/sortedcollection.py
@@ -127,21 +127,21 @@ def index(self, item):
if self._items[i] == item:
return i
i += 1
- raise ValueError('No item found with key equal to: %r' % (key,))
+ raise ValueError('No item found with key equal to: %r' % (key,))
def insert(self, item):
'Insert a new item. If equal keys are found, add to the left'
key = self._key(item)
i = bisect_left(self._keys, key)
self._keys.insert(i, key)
self._items.insert(i, item)
-
+
def remove(self, index):
'Remove the item at the passed index'
# lets IndexError fall out if indices are invalid
del self._items[index]
del self._keys[index]
-
+
def insert_right(self, item):
'Insert a new item. If equal keys are found, add to the right'
key = self._key(item)
@@ -191,7 +191,7 @@ def find_ge(self, key):
return self._items[i]
except IndexError:
raise ValueError('No item found with key at or above: %r' % (key,))
-
+
if __name__ == '__main__':
sd = SortedCollection('The quick Brown Fox jumped'.split(), key=str.lower)
@@ -230,7 +230,7 @@ def find_ge(self, key):
pass
else:
print('Oops, failed to notify of missing value')
-
+
import doctest
print(doctest.testmod())
12 pcap2har/tcp/__init__.py
@@ -0,0 +1,12 @@
+'''
+Objects for parsing TCP streams and packets.
+'''
+
+import dpkt
+
+# make tcp.Flow, tcp.Packet, etc. valid
+from packet import Packet
+from flow import Flow
+from chunk import Chunk
+from direction import Direction
+from flowbuilder import FlowBuilder
6 tcp/chunk.py → pcap2har/tcp/chunk.py
@@ -1,11 +1,13 @@
import seq
-class Chunk:
+
+class Chunk(object):
'''
A chunk of data from a TCP stream in the process of being merged. Takes the
place of the data tuples, ((begin, end), data, logger) in the old algorithm.
Adds member functions that encapsulate the main merging logic.
'''
+
def __init__(self):
'''
Basic initialization on the chunk.
@@ -14,7 +16,7 @@ def __init__(self):
self.seq_start = None
self.seq_end = None
- def merge(self, new, new_seq_callback = None):
+ def merge(self, new, new_seq_callback=None):
'''
Attempts to merge the packet or chunk with the existing data. Returns
details of the operation's success or failure.
14 tcp/__init__.py → pcap2har/tcp/common.py
@@ -1,17 +1,5 @@
-'''
-Objects for parsing TCP streams and packets.
-'''
-
import dpkt
-# make tcp.Flow, tcp.Packet, etc. valid
-from packet import Packet
-from flow import Flow
-from chunk import Chunk
-from direction import Direction
-from flowbuilder import FlowBuilder
-
-# util functions
def detect_handshake(packets):
'''
@@ -29,7 +17,7 @@ def detect_handshake(packets):
rev_seq = None
if syn.tcp.flags & dpkt.tcp.TH_SYN and not syn.tcp.flags & dpkt.tcp.TH_ACK:
# have syn
- fwd_seq = syn.seq # start_seq is the seq field of the segment
+ fwd_seq = syn.seq # start_seq is the seq field of the segment
if (synack.flags & dpkt.tcp.TH_SYN and
synack.flags & dpkt.tcp.TH_ACK and
synack.ack == fwd_seq + 1):
95 tcp/direction.py → pcap2har/tcp/direction.py
@@ -1,22 +1,30 @@
-from sortedcollection import SortedCollection
-import tcp
from operator import itemgetter, attrgetter
+import logging
-class Direction:
+from ..sortedcollection import SortedCollection
+
+import packet
+import chunk as tcp
+from .. import settings
+
+
+class Direction(object):
'''
Represents data moving in one direction in a TCP flow.
Members:
* finished = bool. Indicates whether more packets should be expected.
- * chunks = [tcp.Chunk], sorted by seq_start
+ * chunks = [tcp.Chunk] or None, sorted by seq_start. None iff data
+ has been cleared.
* flow = tcp.Flow, the flow to which the direction belongs
* arrival_data = SortedCollection([(seq_num, pkt)])
* final_arrival_data = SortedCollection([(seq_num, ts)])
* final_data_chunk = Chunk or None, the chunk that contains the final data,
- only after seq_start is valid
+ only after seq_start is valid and before clear_data
* final_arrival_pointer = the end sequence number of data that has
completely arrived
'''
+
def __init__(self, flow):
'''
Sets things up for adding packets.
@@ -31,6 +39,7 @@ def __init__(self, flow):
self.final_arrival_pointer = None
self.chunks = SortedCollection(key=attrgetter('seq_start'))
self.final_data_chunk = None
+
def add(self, pkt):
'''
Merge the packet into the first chunk it overlaps with. If data was
@@ -43,76 +52,102 @@ def add(self, pkt):
'''
if self.finished:
raise RuntimeError('tried to add packets to a finished tcp.Direction')
+ if self.chunks is None:
+            raise RuntimeError('Tried to add packet to a tcp.Direction '
+                               'that has been cleared')
# discard packets with no payload. we don't care about them here
if pkt.data == '':
return
# attempt to merge packet with existing chunks
merged = False
for i, chunk in enumerate(self.chunks):
- overlapped, (front, back) = chunk.merge(pkt,
- self.create_merge_callback(pkt))
+ overlapped, (front, back) = chunk.merge(
+ pkt, self.create_merge_callback(pkt))
if overlapped:
# check if this packet bridged the gap between two chunks
if back and i < (len(self.chunks)-1):
overlapped2, result2 = chunk.merge(self.chunks[i+1])
+ # if the gap was bridged, the later chunk is obsolete
+ # so get rid of it.
if overlapped2:
- assert( (not result2[0]) and (result2[1]))
self.chunks.remove(i+1)
# if this is the main data chunk, calc final arrival
if self.seq_start and chunk.seq_start == self.seq_start:
- if front: # packet was first in stream but just now arriving
+ if front:
+ # packet was first in stream but is just now arriving
self.final_arrival_data.insert((self.seq_start, pkt.ts))
- if back: # usual case
- self.final_arrival_data.insert((self.final_arrival_pointer, pkt.ts))
+ if back: # usual case
+ self.final_arrival_data.insert(
+ (self.final_arrival_pointer, pkt.ts))
if not self.final_data_chunk:
self.final_data_chunk = chunk
self.final_arrival_pointer = self.final_data_chunk.seq_end
merged = True
- break # skip further chunks
+ break # skip further chunks
if not merged:
# nothing overlapped with the packet
# we need a new chunk
self.new_chunk(pkt)
+
@property
def data(self):
'''
returns the TCP data, as far as it has been determined.
'''
+ if self.chunks is None:
+ return None
if self.final_data_chunk:
return self.final_data_chunk.data
else:
if self.finished:
- return '' # no data was ever added
+ return '' # no data was ever added
else:
- return None # just don't know at all
+ return None # just don't know at all
+
+ def clear_data(self):
+ '''
+ Drop data to save memory
+ '''
+ # we need to make sure we've grabbed any timing info we can
+ if not self.finished:
+ logging.warn('tried to clear data on an unfinished tcp.Direction')
+ # clear the list, to make sure all chunks are orphaned to make it
+ # easier for GC. hopefully.
+ self.chunks.clear()
+ self.chunks = None
+ self.final_data_chunk = None
+
@property
def seq_start(self):
'''
starting sequence number, as far as we can tell now.
'''
if self.flow.handshake:
+ assert(self in (self.flow.fwd, self.flow.rev))
if self is self.flow.fwd:
return self.flow.handshake[2].seq
- elif self is self.flow.rev:
- return self.flow.handshake[1].seq + 1
else:
- raise RuntimeError(
- "holy crap, tcp.Direction has a flow it doesn't belong to")
+ return self.flow.handshake[1].seq + 1
elif self.finished:
if self.chunks:
return self.chunks[0].seq_start
else:
- log.warning('getting seq_start from finished tcp.Direction '
+ # this will also occur when a Direction with no handshake
+ # has been cleared.
+ logging.warning('getting seq_start from finished tcp.Direction '
'with no handshake and no data')
return None
else:
return None
+
def finish(self):
'''
Notifies the direction that there are no more packets coming. This means
that self.data can be decided upon. Also calculates final_arrival for
any packets that arrived while seq_start was None
'''
+ if settings.pad_missing_tcp_data:
+ self.pad_missing_data()
self.finished = True
# calculate final_arrival
if not self.final_arrival_data:
@@ -121,8 +156,10 @@ def finish(self):
if vertex[1].ts > peak_time:
peak_time = vertex[1].ts
self.final_arrival_data.insert((vertex[0], vertex[1].ts))
+
if self.chunks and not self.final_data_chunk:
self.final_data_chunk = self.chunks[0]
+
def new_chunk(self, pkt):
'''
creates a new tcp.Chunk for the pkt to live in. Only called if an
@@ -135,6 +172,7 @@ def new_chunk(self, pkt):
self.final_arrival_pointer = chunk.seq_end
self.final_arrival_data.insert((pkt.seq, pkt.ts))
self.chunks.insert(chunk)
+
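
The chunk bookkeeping in Direction.add, traced on a toy example (sequence ranges only):

    # add packet covering [0, 100)   -> chunks: [0,100)
    # add packet covering [200, 300) -> chunks: [0,100), [200,300)
    # add packet covering [100, 200) -> merges into [0,100), bridges the
    #     gap, and the now-redundant [200,300) chunk is removed
    #     -> chunks: [0,300)
    # final_arrival_pointer tracks how far the contiguous run starting
    # at seq_start has completely arrived; final_arrival_data records
    # when each stretch of that run finally landed.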