Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX+ENH: Time conversions for CSV parser and HTML table parser #2329

Merged
merged 1 commit into from
May 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ CHANGELOG
- Fixed not resetting destination path statistics in the stats cache after restarting bot (Fixes [#2331](https://github.com/certtools/intelmq/issues/2331))
- Force flushing statistics if bot will sleep longer than flushing delay (Fixes [#2336](https://github.com/certtools/intelmq/issues/2336))
- `intelmq.lib.upgrades`: Fix a bug in the upgrade function for version 3.1.0 which caused an exception if a generic csv parser instance had no parameter `type` (PR#2319 by Filip Pokorný).
- `intelmq.lib.datatypes`: Adds `TimeFormat` class to be used for the `time_format` bot parameter (PR#2329 by Filip Pokorný).
- `intelmq.lib.exceptions`: Fixes a bug in `InvalidArgument` exception (PR#2329 by Filip Pokorný).
- `intelmq.lib.harmonization`: Changes signature and names of `DateTime` conversion functions for consistency; the change is backwards compatible (PR#2329 by Filip Pokorný).

### Development

Expand All @@ -31,6 +34,8 @@ CHANGELOG
- Added 'Accessible-SIP' report. (PR#2348)
- Added 'IPv6-Open-HTTP-Proxy' and 'IPv6-Accessible-HTTP-Proxy' aliases. (PR#2348)
- Removed duplicate mappings from the 'Spam-URL' report. (PR#2348)
- `intelmq.bots.parsers.generic.parser_csv`: Changes `time_format` parameter to use new `TimeFormat` class (PR#2329 by Filip Pokorný).
- `intelmq.bots.parsers.html_table.parser`: Changes `time_format` parameter to use new `TimeFormat` class (PR#2329 by Filip Pokorný).

#### Experts
- `intelmq.bots.experts.sieve`:
Expand Down
2 changes: 1 addition & 1 deletion intelmq/bots/experts/sieve/expert.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def process_bool_match(self, key, op, value, event):
return self._bool_op_map[op](event[key], value)

def compute_basic_math(self, action, event) -> str:
date = DateTime.parse_utc_isoformat(event[action.key], True)
date = DateTime.from_isoformat(event[action.key], True)
delta = datetime.timedelta(minutes=parse_relative(action.value))

return self._basic_math_op_map[action.operator](date, delta).isoformat()
Expand Down
4 changes: 2 additions & 2 deletions intelmq/bots/parsers/abusech/parser_feodotracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def parse_line(self, line, report):
if line.get("first_seen"):
try:
event.add("time.source",
str(DateTime.convert_from_format(value=line.get("first_seen"), format="%Y-%m-%d %H:%M:%S")),
str(DateTime.from_format(value=line.get("first_seen"), format="%Y-%m-%d %H:%M:%S")),
raise_failure=False)

except ValueError:
Expand All @@ -48,7 +48,7 @@ def parse_line(self, line, report):
elif line.get("last_online"):
try:
event.add("time.source",
str(DateTime.convert_from_format_midnight(line.get("last_online"), format="%Y-%m-%d")),
str(DateTime.from_format_midnight(line.get("last_online"), format="%Y-%m-%d")),
raise_failure=False)
except ValueError:
self.logger.warning("Failed to parse '%s' to DateTime.", line.get('last_online'))
Expand Down
22 changes: 4 additions & 18 deletions intelmq/bots/parsers/generic/parser_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,11 @@
import re
from typing import Optional, Union, Iterable

from dateutil.parser import parse

from intelmq.lib import utils
from intelmq.lib.bot import ParserBot
from intelmq.lib.exceptions import InvalidArgument, InvalidValue
from intelmq.lib.harmonization import DateTime
from intelmq.lib.utils import RewindableFileHandle

TIME_CONVERSIONS = {'timestamp': DateTime.from_timestamp,
'windows_nt': DateTime.from_windows_nt,
'epoch_millis': DateTime.from_epoch_millis,
None: lambda value: parse(value, fuzzy=True).isoformat() + " UTC"}
from intelmq.lib.datatypes import TimeFormat

DATA_CONVERSIONS = {'json': lambda data: json.loads(data)}
DOCS = "https://intelmq.readthedocs.io/en/latest/guides/Bots.html#generic-csv-parser"
Expand All @@ -49,7 +42,7 @@ class GenericCsvParserBot(ParserBot):
delimiter: str = ','
filter_text = None
filter_type = None
time_format = None
time_format: Optional[TimeFormat] = None
type: Optional[str] = None
type_translation = {}
skip_header: Union[bool, int] = False
Expand All @@ -67,14 +60,8 @@ def init(self):

# prevents empty strings:
self.column_regex_search = self.column_regex_search or {}
self.time_format = TimeFormat(self.time_format)

# handle empty strings, false etc.
if not self.time_format:
self.time_format = None
if self.time_format not in TIME_CONVERSIONS.keys():
raise InvalidArgument('time_format', got=self.time_format,
expected=list(TIME_CONVERSIONS.keys()),
docs=DOCS)
if self.filter_type and self.filter_type not in ('blacklist', 'whitelist'):
raise InvalidArgument('filter_type', got=self.filter_type,
expected=("blacklist", "whitelist"),
Expand Down Expand Up @@ -137,7 +124,6 @@ def parse_line(self, row: list, report):
if search:
value = search.group(0)
else:
type = None
value = None

if key in ("__IGNORE__", ""):
Expand All @@ -147,7 +133,7 @@ def parse_line(self, row: list, report):
value = DATA_CONVERSIONS[self.data_type[key]](value)

if key in ("time.source", "time.destination"):
value = TIME_CONVERSIONS[self.time_format](value)
value = self.time_format.parse_datetime(value)
elif key.endswith('.url'):
if not value:
continue
Expand Down
14 changes: 5 additions & 9 deletions intelmq/bots/parsers/html_table/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
time_format: string
type: string
"""
from typing import Optional

from intelmq.lib import utils
from intelmq.lib.bot import ParserBot
from intelmq.lib.exceptions import InvalidArgument
from intelmq.lib.harmonization import DateTime
from intelmq.lib.exceptions import MissingDependencyError
from intelmq.lib.datatypes import TimeFormat


try:
Expand All @@ -46,7 +46,7 @@ class HTMLTableParserBot(ParserBot):
split_index = 0
split_separator = None
table_index = 0
time_format = None
time_format: Optional[TimeFormat] = None
type = "c2-server"
_parser = 'html.parser'

Expand All @@ -69,11 +69,7 @@ def init(self):
self.attr_value = self.attribute_value
self.skip_head = self.skip_table_head
self.skip_row = 1 if self.skip_head else 0

if self.time_format and self.time_format.split('|')[0] not in DateTime.TIME_CONVERSIONS.keys():
raise InvalidArgument('time_format', got=self.time_format,
expected=list(DateTime.TIME_CONVERSIONS.keys()),
docs='https://intelmq.readthedocs.io/en/latest/guides/Bots.html#html-table-parser')
self.time_format = TimeFormat(self.time_format)

def process(self):
report = self.receive_message()
Expand Down Expand Up @@ -119,7 +115,7 @@ def process(self):
data = int(data)
except ValueError:
pass
data = DateTime.convert(data, format=self.time_format)
data = self.time_format.parse_datetime(data)

elif key.endswith('.url'):
if not data:
Expand Down
84 changes: 81 additions & 3 deletions intelmq/lib/datatypes.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
# SPDX-FileCopyrightText: 2021 Birger Schacht
#
# SPDX-License-Identifier: AGPL-3.0-or-later

import json
from datetime import datetime
from enum import Enum
from inspect import signature
from typing import Callable, List, Optional, Tuple, Union

from termstyle import green

from intelmq.lib.exceptions import InvalidArgument
from intelmq.lib.harmonization import DateTime


class BotType(str, Enum):
Expand Down Expand Up @@ -40,7 +45,6 @@ def toJson(self):
'restarting': 'Restarting %s...',
}


ERROR_MESSAGES = {
'starting': 'Bot %s failed to START.',
'running': 'Bot %s is still running.',
Expand All @@ -59,3 +63,77 @@ class LogLevel(Enum):
WARNING = 2
ERROR = 3
CRITICAL = 4


class TimeFormat(str):
    """
    Pydantic style Field Type class for bot parameter time_format. Used for validation.

    The string value has the form ``CONVERSION`` or ``CONVERSION|FORMAT_STRING``,
    where ``CONVERSION`` must be a key of ``DateTime.TIME_CONVERSIONS``.
    An empty or missing value is replaced by ``"fuzzy"``
    (NOTE(review): presumably a key of ``DateTime.TIME_CONVERSIONS`` - confirm).

    Attributes set by ``__init__``:
        convert: the conversion callable looked up in ``DateTime.TIME_CONVERSIONS``
        format_string: the part after the first ``|``, or None if there is no ``|``
    """

    def __new__(cls, value: Optional[str] = None):
        """
        Because str is immutable and we want to manipulate it, it must be done before the object is instantiated.
        Therefore it is necessary to overload __new__ method.

        :param value: the time_format bot parameter; falsy values become "fuzzy"
        """
        value = value or "fuzzy"
        return super().__new__(cls, value)

    def __init__(self, value: Optional[str] = None):
        """
        Resolve the conversion function and format string for this value.

        :param value: the time_format bot parameter or an existing TimeFormat instance
        :raises InvalidArgument: if the value fails validation (see ``validate``)
        """
        self.convert: Callable
        self.format_string: Optional[str] = None

        super().__init__()

        if isinstance(value, TimeFormat):
            # Already validated: reuse the resolved attributes instead of validating again.
            self.convert = value.convert
            self.format_string = value.format_string
        else:
            # Validate ``self`` rather than ``value``: __new__ has already applied the default.
            self.convert, self.format_string = TimeFormat.validate(self)

    def parse_datetime(self, value: str, return_datetime: bool = False) -> Union[datetime, str]:
        """
        This function uses the selected conversion function to parse the datetime value.

        :param value: external datetime string
        :param return_datetime: whether to return string or datetime object
        :return: parsed datetime or string
        """
        if self.format_string:
            return self.convert(value=value, format=self.format_string, return_datetime=return_datetime)
        return self.convert(value=value, return_datetime=return_datetime)

    @staticmethod
    def validate(value: str) -> Tuple[Callable, Optional[str]]:
        """
        This function validates the time_format parameter value.

        :param value: bot parameter for datetime conversion
        :return: correct time conversion function and the format string
        :raises InvalidArgument: if the conversion name is unknown, if the conversion
            function takes a ``format`` parameter but no format string was given, or if a
            format string was given to a conversion function without a ``format`` parameter
        """
        # Split on the first '|' only, so a format string that itself contains '|'
        # is kept whole instead of being silently truncated.
        conversion_name, separator, remainder = value.partition('|')
        format_string: Optional[str] = remainder if separator else None

        # validation of the conversion name
        if conversion_name not in DateTime.TIME_CONVERSIONS:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=list(DateTime.TIME_CONVERSIONS))
        conversion: Callable = DateTime.TIME_CONVERSIONS[conversion_name]

        expects_format = "format" in signature(conversion).parameters

        # validate that we have format_string when the conversion function expects it
        if expects_format and not format_string:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=f"{conversion_name}|FORMAT_STRING")

        # validate that we do not have format_string when the conversion function doesn't expect it
        if format_string and not expects_format:
            raise InvalidArgument(argument="time_format", got=value,
                                  expected=conversion_name)

        return conversion, format_string
4 changes: 2 additions & 2 deletions intelmq/lib/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class InvalidArgument(IntelMQException):
def __init__(self, argument: Any, got: Any = None, expected=None,
docs: str = None):
message = f"Argument {repr(argument)} is invalid."
if expected is list:
message += f" Should be one of: {list}."
if isinstance(expected, list):
message += f" Should be one of: {expected}."
elif expected: # not None
message += f" Should be of type: {expected}."
if got:
Expand Down
Loading
Loading