Permalink
Browse files

Merge branch 'maintenance' into develop

  • Loading branch information...
wagner-certat committed Jan 9, 2019
2 parents 7eb5beb + c844fa2 commit 9c984d1a4c1fa10b87bc4ecb19f1f70acc60a21f
@@ -85,7 +85,8 @@ CHANGELOG
- `lib/pipeline.py` (`Redis.receive`): Wait in 1s steps if redis is busy loading its snapshot from disk (#1334).

### Default configuration
- Set `error_dump_message` to true by default.
- Set `error_dump_message` to true by default in `defaults.conf`.
- Fixed typo in `defaults.conf`: `proccess_manager` -> `process_manager`

### Development
- `bin/rewrite_config_files.py`: Fix ordering of BOTS file (#1327).
@@ -131,6 +132,7 @@ CHANGELOG
- Add support for `Darknet` (#1353).
- `intelmq.bots.parsers.generic.parser_csv`: If the `skip_header` parameter was set to `True`, the header was not part of the `raw` field as returned by the `recover_line` method. The header is now saved and handled correctly by the fixed recovery method.
- `intelmq.bots.parsers.cleanmx.parser`: Use field `first` instead of `firsttime` for `time.source` (#1329, #1348).
- `intelmq.bots.parsers.twitter.parser`: Support for `url-normalize` >= 1.4.1 and recommend it. Added new optional parameter `default_scheme`, passed to `url-normalize` (#1356).

#### Experts
- `intelmq.bots.experts.national_cert_contact_certat.expert`:
@@ -843,6 +843,13 @@ http://www.team-cymru.com/bogon-reference.html
* `domain_whitelist`: domains to be filtered out
* `substitutions`: semicolon delimited list of even length of pairs of substitutions (for example: '[.];.;,;.' substitutes '[.]' for '.' and ',' for '.')
* `classification_type`: string with a valid classification type as defined in data harmonization
* `default_scheme`: Default scheme for URLs if not given. See also the next section.
##### Default scheme
The dependency `url-normalize` changed its behavior in version 1.4.0 from using `http://` as default scheme to `https://`. Version 1.4.1 added the possibility to specify it. Thus you can only use the `default_scheme` parameter with a current version of this library >= 1.4.1. With 1.4.0 you will always get `https://` as default scheme, and for older versions < 1.4.0 `http://` is used.
This does not affect URLs which already include the scheme.
### Shodan
@@ -289,6 +289,7 @@
"access_token_secret": "",
"consumer_key": "",
"consumer_secret": "",
"default_scheme": "http",
"exclude_replies": "false",
"follow_urls": "",
"include_rts": "true",
@@ -17,6 +17,7 @@
classification_type : string with a valid classificationtype
"""
import pkg_resources

from intelmq.lib.bot import ParserBot
from intelmq.lib.bot import utils
@@ -30,6 +31,7 @@
url_normalize = None

try:
import tld.exceptions
from tld import get_tld
from tld.utils import update_tld_names
except ImportError:
@@ -41,9 +43,16 @@ class TwitterParserBot(ParserBot):
def init(self):
if url_normalize is None:
raise ValueError("Could not import 'url-normalize'. Please install it.")
url_version = pkg_resources.get_distribution("url-normalize").version
if tuple(int(v) for v in url_version.split('.')) < (1, 4, 1) and hasattr(self.parameters, 'default_scheme'):
raise ValueError("Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
"Get at least version '1.4.1'." % url_version)
if get_tld is None:
raise ValueError("Could not import 'tld'. Please install it.")
update_tld_names()
try:
update_tld_names()
except tld.exceptions.TldIOError:
self.logger.info("Could not update TLD names cache.")
self.domain_whitelist = []
if getattr(self.parameters, "domain_whitelist", '') != '':
self.domain_whitelist.extend(self.parameters.domain_whitelist.split(','))
@@ -61,12 +70,17 @@ def init(self):
if not ClassificationType.is_valid(self.classification_type):
self.classification_type = 'unknown'

if hasattr(self.parameters, 'default_scheme'):
self.url_kwargs = {'default_scheme': self.parameters.default_scheme}
else:
self.url_kwargs = {}

def get_domain(self, address):
try:
dom = re.search(r'(//|^)([a-z0-9.-]*[a-z]\.[a-z][a-z-]*?(?:[/:].*|$))', address).group(2)
if not self.in_whitelist(dom):
if get_tld(url_normalize(dom), fail_silently=True):
return url_normalize(dom)
if get_tld(url_normalize(dom, **self.url_kwargs), fail_silently=True):
return url_normalize(dom, **self.url_kwargs)
return None
except AttributeError:
return None
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-

import os
import sys
import unittest

import intelmq.lib.test as test
@@ -64,7 +65,12 @@ class TestTwitterParserBot(test.BotTestCase, unittest.TestCase):
def set_bot(cls):
cls.bot_reference = TwitterParserBot
cls.sysconfig = {"substitutions" : " .net;.net;[.];.;,;.",
"classification_type": "blacklist"}
"classification_type": "blacklist",
}
if sys.version_info >= (3, 6, 0):
# url-normalize 1.4.1 supporting this parameter is only available for 3.6
cls.sysconfig["default_scheme"] = "http"

def test_parse(self):
self.input_message = REPORT
self.run_bot()

0 comments on commit 9c984d1

Please sign in to comment.