Skip to content

Commit

Permalink
ENH+FIX: This commit:
Browse files Browse the repository at this point in the history
* Adds a new `TimeFormat` class for the `time_format` bot parameter. It improves performance, as it validates the parameter only once on instantiation of the bot class and not every time a datetime is parsed (looking at you, HTML Table parser). Also removes some code duplication.
* Changes CSV Parser time conversions. For some reason the CSV parser had its own `TIME_CONVERSIONS` and it was very limited. This PR changes it to use `DateTime.TIME_CONVERSIONS`. Now the CSV parser uses the `TimeFormat` class for the `time_format` parameter.
* Changes HTML Table parser to use `TimeFormat` class for `time_format` parameter as well.
* Changes `DateTime` conversion function names to consistent naming scheme starting with `from_`. Changes function signature to be consistent. Backwards compatible.
* Updates some docstrings.
* Fixes a bug in `InvalidArgument` exception.
  • Loading branch information
gethvi committed May 17, 2023
1 parent 64abf29 commit 11fd27f
Show file tree
Hide file tree
Showing 10 changed files with 276 additions and 88 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ CHANGELOG
- Fixed not resetting destination path statistics in the stats cache after restarting bot (Fixes [#2331](https://github.com/certtools/intelmq/issues/2331))
- Force flushing statistics if bot will sleep longer than flushing delay (Fixes [#2336](https://github.com/certtools/intelmq/issues/2336))
- `intelmq.lib.upgrades`: Fix a bug in the upgrade function for version 3.1.0 which caused an exception if a generic csv parser instance had no parameter `type` (PR#2319 by Filip Pokorný).
- `intelmq.lib.datatypes`: Adds `TimeFormat` class to be used for the `time_format` bot parameter (PR#2329 by Filip Pokorný).
- `intelmq.lib.exceptions`: Fixes a bug in `InvalidArgument` exception (PR#2329 by Filip Pokorný).
- `intelmq.lib.harmonization`: Changes signature and names of `DateTime` conversion functions for consistency, backwards compatible (PR#2329 by Filip Pokorný).

### Development

Expand All @@ -31,6 +34,10 @@ CHANGELOG
- Added 'Accessible-SIP' report. (PR#2348)
- Added 'IPv6-Open-HTTP-Proxy' and 'IPv6-Accessible-HTTP-Proxy' aliases. (PR#2348)
- Removed duplicate mappings from the 'Spam-URL' report. (PR#2348)
- Added 'IPv6-Vulnerable-Exchange' alias and 'Accessible-WS-Discovery-Service' report. (PR#2338 by elsif2)
- Removed unused 'p0f_genre' and 'p0f_detail' from the 'DNS-Open-Resolvers' report. (PR#2338 by elsif2)
- `intelmq.bots.parsers.generic.parser_csv`: Changes `time_format` parameter to use new `TimeFormat` class (PR#2329 by Filip Pokorný).
- `intelmq.bots.parsers.html_table.parser`: Changes `time_format` parameter to use new `TimeFormat` class (PR#2329 by Filip Pokorný).

#### Experts
- `intelmq.bots.experts.sieve`:
Expand Down
2 changes: 1 addition & 1 deletion intelmq/bots/experts/sieve/expert.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def process_bool_match(self, key, op, value, event):
return self._bool_op_map[op](event[key], value)

def compute_basic_math(self, action, event) -> str:
date = DateTime.parse_utc_isoformat(event[action.key], True)
date = DateTime.from_isoformat(event[action.key], True)
delta = datetime.timedelta(minutes=parse_relative(action.value))

return self._basic_math_op_map[action.operator](date, delta).isoformat()
Expand Down
4 changes: 2 additions & 2 deletions intelmq/bots/parsers/abusech/parser_feodotracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def parse_line(self, line, report):
if line.get("first_seen"):
try:
event.add("time.source",
str(DateTime.convert_from_format(value=line.get("first_seen"), format="%Y-%m-%d %H:%M:%S")),
str(DateTime.from_format(value=line.get("first_seen"), format="%Y-%m-%d %H:%M:%S")),
raise_failure=False)

except ValueError:
Expand All @@ -48,7 +48,7 @@ def parse_line(self, line, report):
elif line.get("last_online"):
try:
event.add("time.source",
str(DateTime.convert_from_format_midnight(line.get("last_online"), format="%Y-%m-%d")),
str(DateTime.from_format_midnight(line.get("last_online"), format="%Y-%m-%d")),
raise_failure=False)
except ValueError:
self.logger.warning("Failed to parse '%s' to DateTime.", line.get('last_online'))
Expand Down
22 changes: 4 additions & 18 deletions intelmq/bots/parsers/generic/parser_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,11 @@
import re
from typing import Optional, Union, Iterable

from dateutil.parser import parse

from intelmq.lib import utils
from intelmq.lib.bot import ParserBot
from intelmq.lib.exceptions import InvalidArgument, InvalidValue
from intelmq.lib.harmonization import DateTime
from intelmq.lib.utils import RewindableFileHandle

TIME_CONVERSIONS = {'timestamp': DateTime.from_timestamp,
'windows_nt': DateTime.from_windows_nt,
'epoch_millis': DateTime.from_epoch_millis,
None: lambda value: parse(value, fuzzy=True).isoformat() + " UTC"}
from intelmq.lib.datatypes import TimeFormat

DATA_CONVERSIONS = {'json': lambda data: json.loads(data)}
DOCS = "https://intelmq.readthedocs.io/en/latest/guides/Bots.html#generic-csv-parser"
Expand All @@ -49,7 +42,7 @@ class GenericCsvParserBot(ParserBot):
delimiter: str = ','
filter_text = None
filter_type = None
time_format = None
time_format: Optional[TimeFormat] = None
type: Optional[str] = None
type_translation = {}
skip_header: Union[bool, int] = False
Expand All @@ -67,14 +60,8 @@ def init(self):

# prevents empty strings:
self.column_regex_search = self.column_regex_search or {}
self.time_format = TimeFormat(self.time_format)

# handle empty strings, false etc.
if not self.time_format:
self.time_format = None
if self.time_format not in TIME_CONVERSIONS.keys():
raise InvalidArgument('time_format', got=self.time_format,
expected=list(TIME_CONVERSIONS.keys()),
docs=DOCS)
if self.filter_type and self.filter_type not in ('blacklist', 'whitelist'):
raise InvalidArgument('filter_type', got=self.filter_type,
expected=("blacklist", "whitelist"),
Expand Down Expand Up @@ -137,7 +124,6 @@ def parse_line(self, row: list, report):
if search:
value = search.group(0)
else:
type = None
value = None

if key in ("__IGNORE__", ""):
Expand All @@ -147,7 +133,7 @@ def parse_line(self, row: list, report):
value = DATA_CONVERSIONS[self.data_type[key]](value)

if key in ("time.source", "time.destination"):
value = TIME_CONVERSIONS[self.time_format](value)
value = self.time_format.parse_datetime(value)
elif key.endswith('.url'):
if not value:
continue
Expand Down
14 changes: 5 additions & 9 deletions intelmq/bots/parsers/html_table/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
time_format: string
type: string
"""
from typing import Optional

from intelmq.lib import utils
from intelmq.lib.bot import ParserBot
from intelmq.lib.exceptions import InvalidArgument
from intelmq.lib.harmonization import DateTime
from intelmq.lib.exceptions import MissingDependencyError
from intelmq.lib.datatypes import TimeFormat


try:
Expand All @@ -46,7 +46,7 @@ class HTMLTableParserBot(ParserBot):
split_index = 0
split_separator = None
table_index = 0
time_format = None
time_format: Optional[TimeFormat] = None
type = "c2-server"
_parser = 'html.parser'

Expand All @@ -69,11 +69,7 @@ def init(self):
self.attr_value = self.attribute_value
self.skip_head = self.skip_table_head
self.skip_row = 1 if self.skip_head else 0

if self.time_format and self.time_format.split('|')[0] not in DateTime.TIME_CONVERSIONS.keys():
raise InvalidArgument('time_format', got=self.time_format,
expected=list(DateTime.TIME_CONVERSIONS.keys()),
docs='https://intelmq.readthedocs.io/en/latest/guides/Bots.html#html-table-parser')
self.time_format = TimeFormat(self.time_format)

def process(self):
report = self.receive_message()
Expand Down Expand Up @@ -119,7 +115,7 @@ def process(self):
data = int(data)
except ValueError:
pass
data = DateTime.convert(data, format=self.time_format)
data = self.time_format.parse_datetime(data)

elif key.endswith('.url'):
if not data:
Expand Down
106 changes: 92 additions & 14 deletions intelmq/lib/datatypes.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,31 @@
# SPDX-FileCopyrightText: 2021 Birger Schacht
#
# SPDX-License-Identifier: AGPL-3.0-or-later

import json
from datetime import datetime
from enum import Enum
from inspect import signature
from typing import Callable, List, Optional, Tuple, Union

from termstyle import green

from intelmq.lib.exceptions import InvalidArgument
from intelmq.lib.harmonization import DateTime


class BotType(str, Enum):
COLLECTOR = "Collector"
PARSER = "Parser"
EXPERT = "Expert"
OUTPUT = "Output"
COLLECTOR = "Collector"
PARSER = "Parser"
EXPERT = "Expert"
OUTPUT = "Output"

def toJson(self):
return self.value


class ReturnType(str, Enum):
TEXT = "Text"
JSON = "Json"
PYTHON = "Python"
TEXT = "Text"
JSON = "Json"
PYTHON = "Python"

def toJson(self):
return self.value
Expand All @@ -40,7 +45,6 @@ def toJson(self):
'restarting': 'Restarting %s...',
}


ERROR_MESSAGES = {
'starting': 'Bot %s failed to START.',
'running': 'Bot %s is still running.',
Expand All @@ -54,8 +58,82 @@ def toJson(self):


class LogLevel(Enum):
DEBUG = 0
INFO = 1
WARNING = 2
ERROR = 3
DEBUG = 0
INFO = 1
WARNING = 2
ERROR = 3
CRITICAL = 4


class TimeFormat(str):
    """
    Pydantic style Field Type class for bot parameter time_format. Used for validation.

    The string value has the form ``CONVERSION`` or ``CONVERSION|FORMAT_STRING``, where
    ``CONVERSION`` must be a key of ``DateTime.TIME_CONVERSIONS``. A missing or empty
    value falls back to ``"fuzzy"``. The value is validated once, on instantiation, and
    the resolved conversion function and format string are kept on the instance.
    """

    def __new__(cls, value: Optional[str] = None):
        """
        Because str is immutable and we want to manipulate it, it must be done before the object is instantiated.
        Therefore it is necessary to overload __new__ method.
        """
        # None, empty string and other falsy values all select the default conversion
        return super().__new__(cls, value or "fuzzy")

    def __init__(self, value: Optional[str] = None):
        """
        Resolve the conversion function and the optional format string.

        :param value: bot parameter for datetime conversion, or an already validated TimeFormat
        :raises InvalidArgument: if the parameter value does not pass validation
        """
        super().__init__()

        self.convert: Callable
        self.format_string: Optional[str] = None

        if isinstance(value, TimeFormat):
            # already validated, reuse the resolved attributes instead of validating again
            self.convert = value.convert
            self.format_string = value.format_string
        else:
            # validate self, not value: __new__ has already replaced falsy input with the default
            self.convert, self.format_string = TimeFormat.validate(self)

    def parse_datetime(self, value: str, return_datetime: bool = False) -> Union[datetime, str]:
        """
        This function uses the selected conversion function to parse the datetime value.

        :param value: external datetime string
        :param return_datetime: whether to return string or datetime object
        :return: parsed datetime or string
        """
        if self.format_string:
            return self.convert(value=value, format=self.format_string, return_datetime=return_datetime)
        return self.convert(value=value, return_datetime=return_datetime)

    @staticmethod
    def validate(value: str) -> Tuple[Callable, Optional[str]]:
        """
        This function validates the time_format parameter value.

        :param value: bot parameter for datetime conversion
        :return: correct time conversion function and the format string
        :raises InvalidArgument: if the conversion name is unknown, if a format string is missing
                                 for a conversion that takes one, or if a format string is given
                                 for a conversion that does not take one
        """
        # Split on the first '|' only, so that a format string containing '|' is kept whole.
        conversion_name, separator, remainder = value.partition('|')
        format_string: Optional[str] = remainder if separator else None

        # validation of the conversion name
        if conversion_name not in DateTime.TIME_CONVERSIONS.keys():
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=list(DateTime.TIME_CONVERSIONS.keys()))
        conversion: Callable = DateTime.TIME_CONVERSIONS[conversion_name]

        # a conversion function takes a format string if it has a parameter named "format"
        expects_format: bool = "format" in signature(conversion).parameters

        # validate that we have format_string when the conversion function expects it
        if expects_format and not format_string:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=f"{conversion_name}|FORMAT_STRING")

        # validate that we do not have format_string when the conversion function doesn't expect it
        if format_string and not expects_format:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=conversion_name)

        return conversion, format_string
4 changes: 2 additions & 2 deletions intelmq/lib/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class InvalidArgument(IntelMQException):
def __init__(self, argument: Any, got: Any = None, expected=None,
docs: str = None):
message = f"Argument {repr(argument)} is invalid."
if expected is list:
message += f" Should be one of: {list}."
if isinstance(expected, list):
message += f" Should be one of: {expected}."
elif expected: # not None
message += f" Should be of type: {expected}."
if got:
Expand Down
Loading

0 comments on commit 11fd27f

Please sign in to comment.