# Month abbreviations as written in Apache timestamps, mapped to month numbers.
_MONTH_NUMBERS = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                  'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}


def apachetime(s):
    """
    Given a string representation of a datetime in apache format (e.g.
    "[01/Sep/2012:06:05:11 +0000]" or the bare "01/Sep/2012:06:05:11 +0000"),
    return the python datetime for that string.

    The fields are read from fixed character offsets. Anything after the
    seconds field (including the "+0000" offset) is not interpreted, so the
    returned datetime is naive.

    Raises KeyError if the month abbreviation is not one of Jan..Dec, and
    ValueError if a numeric field cannot be converted.
    """
    # A timestamp proper starts with the day digits. Any other leading
    # character is a wrapper (such as "["), so drop the first and last
    # character exactly as before. Bare timestamps are left untouched so the
    # fixed offsets below line up for both forms.
    if not s[:1].isdigit():
        s = s[1:-1]
    return datetime(int(s[7:11]), _MONTH_NUMBERS[s[3:6]], int(s[0:2]),
                    int(s[12:14]), int(s[15:17]), int(s[18:20]))


def format_time(matched_strings):
    """
    Expand the raw 'time_received' capture into its derived values.

    matched_strings is a dict holding the captured timestamp text under the
    key 'time_received'. Returns a new dict with the raw text
    ('time_received'), the parsed datetime ('time_received_datetimeobj') and
    its ISO 8601 rendering ('time_received_isoformat').
    """
    time_received = matched_strings['time_received']
    obj = apachetime(time_received)
    return {
        'time_received': time_received,
        'time_received_datetimeobj': obj,
        'time_received_isoformat': obj.isoformat(),
    }
- [make_regex('%t'), '.*?', lambda match: 'time_recieved', format_time], # Time the request was received (standard english format) + [make_regex('%t'), '.*?', lambda match: 'time_received', format_time], # Time the request was received (standard english format) [make_regex('%\{[^\]]+?\}t'), '.*?', extract_inner_value("time_", "t") , lambda matched_strings: matched_strings], # The time, in the form given by format, which should be in strftime(3) format. (potentially localized) [make_regex('%T'), '.*?', lambda match: 'time_s', lambda matched_strings: matched_strings], # The time taken to serve the request, in seconds. [make_regex('%u'), '.*?', lambda match: 'remote_user', lambda matched_strings: matched_strings], # Remote user (from auth; may be bogus if return status (%s) is 401) @@ -123,47 +123,54 @@ def format_time(matched_strings): [make_regex('%O'), '.*?', lambda match: 'bytes_tx', lambda matched_strings: matched_strings], # Bytes sent, including headers, cannot be zero. You need to enable mod_logio to use this. 
class Parser(object):
    """
    Compiles an Apache log format string into a regex and parses log lines.

    Attributes set by the constructor:
      names              -- tuple of capture-group names, in the order their
                            directives appear in the format string
      pattern            -- regex source used to split the format string
                            into literal text and directives
      parts              -- work list of split pieces; it is consumed while
                            the regex is built and is empty afterwards
      functions_to_parse -- dict mapping each capture-group name to the
                            callable that post-processes its matched text
      log_line_regex     -- compiled regex matching one whole log line
    """

    def __init__(self, format_string):
        """
        Build the log-line regex for format_string.

        Literal text between directives is regex-escaped. Each directive is
        looked up in FORMAT_STRINGS and becomes a named group.
        """
        names = []
        regex_pieces = []

        self.pattern = "(" + "|".join(x[0] for x in FORMAT_STRINGS) + ")"
        self.parts = re.split(self.pattern, format_string)
        self.functions_to_parse = {}

        # The split pieces are consumed two at a time: literal text, then the
        # directive that followed it. The final literal has no directive.
        while self.parts:
            raw = self.parts.pop(0)
            directive = self.parts.pop(0) if self.parts else None

            if len(raw) > 0:
                regex_pieces.append(re.escape(raw))
            if directive is None:
                continue

            # First FORMAT_STRINGS entry whose pattern matches the whole
            # directive wins.
            for pattern_regex, log_part_regex, name_func, values_func in FORMAT_STRINGS:
                match = re.match("^" + pattern_regex + "$", directive)
                if match:
                    name = name_func(match.group())
                    names.append(name)
                    self.functions_to_parse[name] = values_func
                    regex_pieces.append("(?P<" + name + ">" + log_part_regex + ")")
                    break

        self.log_line_regex = re.compile("".join(regex_pieces))
        self.names = tuple(names)

    def parse(self, log_line):
        """
        Parse one log line and return a dict of extracted values.

        Each named group's text is passed, as a one-entry dict, to the
        callable registered for that name; the dicts those callables return
        are merged into the result.

        Raises LineDoesntMatchException if log_line does not match the
        compiled regex.
        """
        match = self.log_line_regex.match(log_line)
        if match is None:
            raise LineDoesntMatchException(log_line=log_line, regex=self.log_line_regex.pattern)

        results = {}
        for name in self.functions_to_parse:
            values = {name: match.group(name)}
            values = self.functions_to_parse[name](values)
            results.update(values)
        # Without this return every successful parse yielded None.
        return results


def make_parser(format_string):
    """Return a callable that parses one log line written in format_string."""
    return Parser(format_string).parse


def get_fieldnames(format_string):
    """Return the tuple of capture-group names for format_string, in order."""
    return Parser(format_string).names
'remote_logname', 'remote_user')) if __name__ == '__main__': unittest.main()