Skip to content
This repository was archived by the owner on Aug 8, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 50 additions & 43 deletions apache_log_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class LineDoesntMatchException(ApacheLogParserException):
def __init__(self, log_line=None, regex=None, *args, **kwargs):
self.log_line = log_line
self.regex = regex

def __repr__(self):
return u"LineDoesntMatchException(log_line={0!r}, regex={1!r})".format(self.log_line, self.regex)

Expand Down Expand Up @@ -65,16 +65,16 @@ def apachetime(s):
Given a string representation of a datetime in apache format (e.g.
"01/Sep/2012:06:05:11 +0000"), return the python datetime for that string
"""
month_map = {'Jan': 1, 'Feb': 2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7,
month_map = {'Jan': 1, 'Feb': 2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7,
'Aug':8, 'Sep': 9, 'Oct':10, 'Nov': 11, 'Dec': 12}
s = s[1:-1]
return datetime(int(s[7:11]), month_map[s[3:6]], int(s[0:2]), \
int(s[12:14]), int(s[15:17]), int(s[18:20]))

def format_time(matched_strings):
time_recieved = matched_strings['time_recieved']
obj = apachetime(time_recieved)
return {'time_recieved':time_recieved, 'time_recieved_datetimeobj': obj, 'time_recieved_isoformat': obj.isoformat()}
time_received = matched_strings['time_received']
obj = apachetime(time_received)
return {'time_received':time_received, 'time_received_datetimeobj': obj, 'time_received_isoformat': obj.isoformat()}


FORMAT_STRINGS = [
Expand Down Expand Up @@ -107,7 +107,7 @@ def format_time(matched_strings):
[make_regex('%r'), '.*?', lambda match: 'request_first_line', extra_request_from_first_line], # First line of request
[make_regex('%R'), '.*?', lambda match: 'handler', lambda matched_strings: matched_strings], # The handler generating the response (if any).
[make_regex('%s'), '[0-9]+?', lambda match: 'status', lambda matched_strings: matched_strings], # Status. For requests that got internally redirected, this is the status of the *original* request --- %>s for the last.
[make_regex('%t'), '.*?', lambda match: 'time_recieved', format_time], # Time the request was received (standard english format)
[make_regex('%t'), '.*?', lambda match: 'time_received', format_time], # Time the request was received (standard english format)
[make_regex('%\{[^\]]+?\}t'), '.*?', extract_inner_value("time_", "t") , lambda matched_strings: matched_strings], # The time, in the form given by format, which should be in strftime(3) format. (potentially localized)
[make_regex('%T'), '.*?', lambda match: 'time_s', lambda matched_strings: matched_strings], # The time taken to serve the request, in seconds.
[make_regex('%u'), '.*?', lambda match: 'remote_user', lambda matched_strings: matched_strings], # Remote user (from auth; may be bogus if return status (%s) is 401)
Expand All @@ -123,47 +123,54 @@ def format_time(matched_strings):
[make_regex('%O'), '.*?', lambda match: 'bytes_tx', lambda matched_strings: matched_strings], # Bytes sent, including headers, cannot be zero. You need to enable mod_logio to use this.
]

def make_parser(format_string):
pattern = "("+"|".join(x[0] for x in FORMAT_STRINGS)+")"
parts = re.split(pattern, format_string)


functions_to_parse = {}

log_line_regex = ""
while True:
if len(parts) == 0:
break
if len(parts) == 1:
raw, regex = parts.pop(0), None
elif len(parts) >= 2:
raw, regex = parts.pop(0), parts.pop(0)
if len(raw) > 0:
log_line_regex += re.escape(raw)
if regex is not None:
for format_spec in FORMAT_STRINGS:
pattern_regex, log_part_regex, name_func, values_func = format_spec
match = re.match("^"+pattern_regex+"$", regex)
if match:
name = name_func(match.group())
functions_to_parse[name] = values_func
log_line_regex += "(?P<"+name+">"+log_part_regex+")"
break

log_line_regex = re.compile(log_line_regex)

def matcher(log_line):
match = log_line_regex.match(log_line)
class Parser:
def __init__(self, format_string):
self.names = []

self.pattern = "("+"|".join(x[0] for x in FORMAT_STRINGS)+")"
self.parts = re.split(self.pattern, format_string)

self.functions_to_parse = {}

self.log_line_regex = ""
while True:
if len(self.parts) == 0:
break
if len(self.parts) == 1:
raw, regex = self.parts.pop(0), None
elif len(self.parts) >= 2:
raw, regex = self.parts.pop(0), self.parts.pop(0)
if len(raw) > 0:
self.log_line_regex += re.escape(raw)
if regex is not None:
for format_spec in FORMAT_STRINGS:
pattern_regex, log_part_regex, name_func, values_func = format_spec
match = re.match("^"+pattern_regex+"$", regex)
if match:
name = name_func(match.group())
self.names.append(name)
self.functions_to_parse[name] = values_func
self.log_line_regex += "(?P<"+name+">"+log_part_regex+")"
break

self.log_line_regex = re.compile(self.log_line_regex)
self.names = tuple(self.names)

def parse(self, log_line):
match = self.log_line_regex.match(log_line)
if match is None:
raise LineDoesntMatchException(log_line=log_line, regex=log_line_regex.pattern)
raise LineDoesntMatchException(log_line=log_line, regex=self.log_line_regex.pattern)
else:
results = {}
for name in functions_to_parse:
for name in self.functions_to_parse:
values = {name: match.groupdict()[name]}
values = functions_to_parse[name](values)
values = self.functions_to_parse[name](values)
results.update(values)

return results
return matcher



def make_parser(format_string):
return Parser(format_string).parse

def get_fieldnames(format_string):
return Parser(format_string).names
5 changes: 4 additions & 1 deletion apache_log_parser/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

class ApacheLogParserTestCase(unittest.TestCase):
def test_simple(self):
parser = apache_log_parser.make_parser("%h <<%P>> %t %Dus \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %l %u")
format_string = "%h <<%P>> %t %Dus \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %l %u"
parser = apache_log_parser.make_parser(format_string)
sample = '127.0.0.1 <<6113>> [16/Aug/2013:15:45:34 +0000] 1966093us "GET / HTTP/1.1" 200 3478 "https://example.com/" "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18)" - -'
log_data = parser(sample)
self.assertNotEqual(log_data, None)
Expand All @@ -13,11 +14,13 @@ def test_simple(self):
self.assertEqual(log_data['request_first_line'], 'GET / HTTP/1.1')
self.assertEqual(log_data['request_method'], 'GET')
self.assertEqual(log_data['request_url'], '/')
self.assertEqual(log_data['request_header_referer'], 'https://example.com/')

self.assertEqual(log_data['request_header_user_agent'], 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.18)')

self.assertEqual(log_data['request_header_user_agent__os__family'], 'Linux')

self.assertEqual(apache_log_parser.get_fieldnames(format_string), ('remote_host', 'pid', 'time_received', 'time_us', 'request_first_line', 'status', 'response_bytes_clf', 'request_header_referer', 'request_header_user_agent', 'remote_logname', 'remote_user'))

if __name__ == '__main__':
unittest.main()