Skip to content

Commit

Permalink
ENH+FIX: This commit:
Browse files Browse the repository at this point in the history
* Adds new `TimeFormat` class for `time_format` bot parameter. It improves performance, as it validates the parameter only once on instantiation of the bot class and not every time a datetime is parsed (looking at you, HTML Table parser). Also removes some code duplication.
* Changes CSV Parser time conversions. For some reason the CSV parser had its own `TIME_CONVERSIONS` and it was very limited. This PR changes it to use `DateTime.TIME_CONVERSIONS`. Now CSV parser uses `TimeFormat` class for `time_format` parameter.
* Changes HTML Table parser to use `TimeFormat` class for `time_format` parameter as well.
* Changes `DateTime` conversion function names to consistent naming scheme starting with `from_`. Changes function signature to be consistent. Backwards compatible.
* Updates some docstrings.
* Fixes a bug in `InvalidArgument` exception.
  • Loading branch information
gethvi committed Mar 16, 2023
1 parent 8bf1dd0 commit fc197fa
Show file tree
Hide file tree
Showing 10 changed files with 277 additions and 88 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ CHANGELOG

### Core
- Fixed not resetting destination path statistics in the stats cache after restarting bot (Fixes [#2331](https://github.com/certtools/intelmq/issues/2331))
- `intelmq.lib.datatypes`: Adds `TimeFormat` class to be used for the `time_format` bot parameter (PR#2329 by Filip Pokorný).
- `intelmq.lib.exceptions`: Fixes a bug in `InvalidArgument` exception (PR#2329 by Filip Pokorný).
- `intelmq.lib.harmonization`: Changes signature and names of `DateTime` conversion functions for consistency, backwards compatible (PR#2329 by Filip Pokorný).

### Development

Expand All @@ -19,6 +22,8 @@ CHANGELOG
#### Collectors

#### Parsers
- `intelmq.bots.parsers.generic.parser_csv`: Changes `time_format` parameter to use new `TimeFormat` class (PR#2329 by Filip Pokorný).
- `intelmq.bots.parsers.html_table.parser`: Changes `time_format` parameter to use new `TimeFormat` class (PR#2329 by Filip Pokorný).

#### Experts

Expand Down
2 changes: 1 addition & 1 deletion intelmq/bots/experts/sieve/expert.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def process_bool_match(self, key, op, value, event):
return self._bool_op_map[op](event[key], value)

def compute_basic_math(self, action, event) -> str:
date = DateTime.parse_utc_isoformat(event[action.key], True)
date = DateTime.from_isoformat(event[action.key], True)
delta = datetime.timedelta(minutes=parse_relative(action.value))

return self._basic_math_op_map[action.operator](date, delta).isoformat()
Expand Down
4 changes: 2 additions & 2 deletions intelmq/bots/parsers/abusech/parser_feodotracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def parse_line(self, line, report):
if line.get("first_seen"):
try:
event.add("time.source",
str(DateTime.convert_from_format(value=line.get("first_seen"), format="%Y-%m-%d %H:%M:%S")),
str(DateTime.from_format(value=line.get("first_seen"), format="%Y-%m-%d %H:%M:%S")),
raise_failure=False)

except ValueError:
Expand All @@ -48,7 +48,7 @@ def parse_line(self, line, report):
elif line.get("last_online"):
try:
event.add("time.source",
str(DateTime.convert_from_format_midnight(line.get("last_online"), format="%Y-%m-%d")),
str(DateTime.from_format_midnight(line.get("last_online"), format="%Y-%m-%d")),
raise_failure=False)
except ValueError:
self.logger.warning("Failed to parse '%s' to DateTime.", line.get('last_online'))
Expand Down
22 changes: 4 additions & 18 deletions intelmq/bots/parsers/generic/parser_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,11 @@
import re
from typing import Optional, Union, Iterable

from dateutil.parser import parse

from intelmq.lib import utils
from intelmq.lib.bot import ParserBot
from intelmq.lib.exceptions import InvalidArgument, InvalidValue
from intelmq.lib.harmonization import DateTime
from intelmq.lib.utils import RewindableFileHandle

TIME_CONVERSIONS = {'timestamp': DateTime.from_timestamp,
'windows_nt': DateTime.from_windows_nt,
'epoch_millis': DateTime.from_epoch_millis,
None: lambda value: parse(value, fuzzy=True).isoformat() + " UTC"}
from intelmq.lib.datatypes import TimeFormat

DATA_CONVERSIONS = {'json': lambda data: json.loads(data)}
DOCS = "https://intelmq.readthedocs.io/en/latest/guides/Bots.html#generic-csv-parser"
Expand All @@ -49,7 +42,7 @@ class GenericCsvParserBot(ParserBot):
delimiter: str = ','
filter_text = None
filter_type = None
time_format = None
time_format: Optional[TimeFormat] = None
type: Optional[str] = None
type_translation = {}
skip_header: Union[bool, int] = False
Expand All @@ -67,14 +60,8 @@ def init(self):

# prevents empty strings:
self.column_regex_search = self.column_regex_search or {}
self.time_format = TimeFormat(self.time_format)

# handle empty strings, false etc.
if not self.time_format:
self.time_format = None
if self.time_format not in TIME_CONVERSIONS.keys():
raise InvalidArgument('time_format', got=self.time_format,
expected=list(TIME_CONVERSIONS.keys()),
docs=DOCS)
if self.filter_type and self.filter_type not in ('blacklist', 'whitelist'):
raise InvalidArgument('filter_type', got=self.filter_type,
expected=("blacklist", "whitelist"),
Expand Down Expand Up @@ -137,7 +124,6 @@ def parse_line(self, row: list, report):
if search:
value = search.group(0)
else:
type = None
value = None

if key in ("__IGNORE__", ""):
Expand All @@ -147,7 +133,7 @@ def parse_line(self, row: list, report):
value = DATA_CONVERSIONS[self.data_type[key]](value)

if key in ("time.source", "time.destination"):
value = TIME_CONVERSIONS[self.time_format](value)
value = self.time_format.parse_datetime(value)
elif key.endswith('.url'):
if not value:
continue
Expand Down
14 changes: 5 additions & 9 deletions intelmq/bots/parsers/html_table/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
time_format: string
type: string
"""
from typing import Optional

from intelmq.lib import utils
from intelmq.lib.bot import ParserBot
from intelmq.lib.exceptions import InvalidArgument
from intelmq.lib.harmonization import DateTime
from intelmq.lib.exceptions import MissingDependencyError
from intelmq.lib.datatypes import TimeFormat


try:
Expand All @@ -46,7 +46,7 @@ class HTMLTableParserBot(ParserBot):
split_index = 0
split_separator = None
table_index = 0
time_format = None
time_format: Optional[TimeFormat] = None
type = "c2-server"
_parser = 'html.parser'

Expand All @@ -69,11 +69,7 @@ def init(self):
self.attr_value = self.attribute_value
self.skip_head = self.skip_table_head
self.skip_row = 1 if self.skip_head else 0

if self.time_format and self.time_format.split('|')[0] not in DateTime.TIME_CONVERSIONS.keys():
raise InvalidArgument('time_format', got=self.time_format,
expected=list(DateTime.TIME_CONVERSIONS.keys()),
docs='https://intelmq.readthedocs.io/en/latest/guides/Bots.html#html-table-parser')
self.time_format = TimeFormat(self.time_format)

def process(self):
report = self.receive_message()
Expand Down Expand Up @@ -119,7 +115,7 @@ def process(self):
data = int(data)
except ValueError:
pass
data = DateTime.convert(data, format=self.time_format)
data = self.time_format.parse_datetime(data)

elif key.endswith('.url'):
if not data:
Expand Down
109 changes: 95 additions & 14 deletions intelmq/lib/datatypes.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,31 @@
# SPDX-FileCopyrightText: 2021 Birger Schacht
#
# SPDX-License-Identifier: AGPL-3.0-or-later

from datetime import datetime
from enum import Enum
from inspect import signature
import json
from typing import Optional, Callable, Union, List, Tuple

from termstyle import green

from intelmq.lib.exceptions import InvalidArgument
from intelmq.lib.harmonization import DateTime


class BotType(str, Enum):
COLLECTOR = "Collector"
PARSER = "Parser"
EXPERT = "Expert"
OUTPUT = "Output"
COLLECTOR = "Collector"
PARSER = "Parser"
EXPERT = "Expert"
OUTPUT = "Output"

def toJson(self):
return self.value


class ReturnType(str, Enum):
TEXT = "Text"
JSON = "Json"
PYTHON = "Python"
TEXT = "Text"
JSON = "Json"
PYTHON = "Python"

def toJson(self):
return self.value
Expand All @@ -40,7 +45,6 @@ def toJson(self):
'restarting': 'Restarting %s...',
}


ERROR_MESSAGES = {
'starting': 'Bot %s failed to START.',
'running': 'Bot %s is still running.',
Expand All @@ -54,8 +58,85 @@ def toJson(self):


class LogLevel(Enum):
DEBUG = 0
INFO = 1
WARNING = 2
ERROR = 3
DEBUG = 0
INFO = 1
WARNING = 2
ERROR = 3
CRITICAL = 4


class TimeFormat(str):
    """
    Pydantic style Field Type class for bot parameter time_format. Used for validation.

    The string value has the form ``CONVERSION_NAME`` or
    ``CONVERSION_NAME|FORMAT_STRING``. The conversion name must be a key of
    ``DateTime.TIME_CONVERSIONS``. An unset parameter (``None``, empty string
    or any other falsy value) is stored as ``"fuzzy"``.

    The value is validated once, on instantiation; afterwards
    :meth:`parse_datetime` only calls the already resolved conversion function.
    """

    def __new__(cls, value: Optional[str] = None):
        # Any falsy configuration value (None, "", False, ...) means
        # "not configured"; without this, e.g. False would be stored as the
        # string "False" and be rejected by the validation.
        return super().__new__(cls, value or "fuzzy")

    def __init__(self, value: Optional[str] = None):
        """
        Resolve and store the conversion function and the optional format string.
        :param value: bot parameter for datetime conversion, or an existing TimeFormat
        :raises InvalidArgument: if the value is not a valid time_format
        """
        super().__init__()

        self.convert: Callable
        self.format_string: Optional[str] = None

        if isinstance(value, TimeFormat):
            # Already validated: reuse the resolved conversion instead of validating again.
            self.convert = value.convert
            self.format_string = value.format_string
        else:
            # Validate the stored string (self), which already has the default applied.
            self.convert, self.format_string = TimeFormat.validate(self)

    def parse_datetime(self, value: str, return_datetime: bool = False) -> Union[datetime, str]:
        """
        This function uses the selected conversion function to parse the datetime value.
        :param value: external datetime string
        :param return_datetime: whether to return string or datetime object
        :return: parsed datetime or string
        """
        if self.format_string:
            return self.convert(value=value, format=self.format_string, return_datetime=return_datetime)
        return self.convert(value=value, return_datetime=return_datetime)

    @classmethod
    def __get_validators__(cls):
        """
        This function is for Pydantic compatibility.
        """
        yield cls

    @staticmethod
    def validate(value: str) -> Tuple[Callable, Optional[str]]:
        """
        This function validates the time_format parameter value.
        :param value: bot parameter for datetime conversion
        :return: correct time conversion function and the format string
        :raises InvalidArgument: if the conversion name is unknown, if a format string
            is missing for a conversion that expects one, or if a format string is given
            for a conversion that does not accept one
        """
        # Split on the first '|' only, so a format string which itself
        # contains '|' is kept whole instead of being silently truncated.
        conversion_name, _, raw_format = value.partition('|')
        format_string: Optional[str] = raw_format or None

        # validation of the conversion name
        if conversion_name not in DateTime.TIME_CONVERSIONS:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=list(DateTime.TIME_CONVERSIONS))
        conversion: Callable = DateTime.TIME_CONVERSIONS[conversion_name]

        # Inspect the conversion function once: does it take a "format" parameter?
        expects_format = "format" in signature(conversion).parameters

        # validate that we have format_string when the conversion function expects it
        if expects_format and not format_string:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=f"{conversion_name}|FORMAT_STRING")

        # validate that we do not have format_string when the conversion function doesn't expect it
        if format_string and not expects_format:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=conversion_name)

        return conversion, format_string
4 changes: 2 additions & 2 deletions intelmq/lib/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class InvalidArgument(IntelMQException):
def __init__(self, argument: Any, got: Any = None, expected=None,
docs: str = None):
message = f"Argument {repr(argument)} is invalid."
if expected is list:
message += f" Should be one of: {list}."
if isinstance(expected, list):
message += f" Should be one of: {expected}."
elif expected: # not None
message += f" Should be of type: {expected}."
if got:
Expand Down
Loading

0 comments on commit fc197fa

Please sign in to comment.