Commit

introduce MAX_REDIRECTS config setting and fix urllib3 redirect handling (#461)

* introduce `MAX_REDIRECTS` config setting and fix urllib3 redirect handling

Fixes issue #450

After setting `MAX_REDIRECTS` to 5, I could fetch the original URL from the issue:
`trafilatura -u https://www.hydrogeninsight.com/production/breaking-us-reveals-the-seven-regional-hydrogen-hubs-to-receive-7bn-of-government-funding/2-1-1534596`

I also fixed this old issue: #128
The underlying urllib3 bug has not been fixed: urllib3/urllib3#2475

I had to pass the retry strategy to the actual request method: it doesn't propagate from the pool manager.
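
For illustration, a minimal standalone sketch of the workaround (placeholder URL and hard-coded limit; the real code reads `MAX_REDIRECTS` from the config): the `Retry` object has to be handed to every `request()` call, because the strategy registered on the `PoolManager` is not applied, per the urllib3 bug above.

```python
import urllib3

max_redirects = 5  # stands in for config.getint('DEFAULT', 'MAX_REDIRECTS')
retry_strategy = urllib3.util.Retry(
    total=max_redirects,
    redirect=max_redirects,  # cap the number of followed redirections
    connect=0,
)
pool = urllib3.PoolManager(retries=retry_strategy)
# Workaround: pass the strategy to the request itself, since the
# PoolManager-level setting does not propagate (urllib3 issue #2475)
response = pool.request(
    'GET',
    'http://httpbin.org/redirect/2',  # placeholder test URL
    retries=retry_strategy,
)
```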

* use httpbin everywhere instead of httpbun

* restore MAX_REDIRECTS default config value to 2

* reset global objects after test_fetch to avoid side-effects

* pin lxml to < 5

* test no_ssl True & False to fix coverage

* move _reset_global_objects() to the test file

* rewrite assignment in multiple lines for readability

---------

Co-authored-by: Adrien Barbaresi <adbar@users.noreply.github.com>
vbarbaresi and adbar committed Jan 4, 2024
1 parent de57ac1 commit 6a79511
Showing 4 changed files with 33 additions and 9 deletions.
1 change: 1 addition & 0 deletions docs/settings.rst
@@ -43,6 +43,7 @@ The default file included in the package is `settings.cfg <https://github.com/ad
 * ``EXTENSIVE_DATE_SEARCH = on`` set to ``off`` to deactivate ``htmldate``'s opportunistic search (lower recall, higher precision)
 - Navigation
 * ``EXTERNAL_URLS = off`` do not take URLs from other websites in feeds and sitemaps (CLI mode)
+* ``MAX_REDIRECTS = 2``: maximum number of `URL redirections <https://en.wikipedia.org/wiki/URL_redirection>`_ to be followed. Set to 0 to not follow any redirection.


 Using a custom file on the command-line
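
As a usage sketch (not part of the commit, mirroring the test below), the new setting can be overridden through a custom config object before fetching:

```python
from trafilatura import fetch_url
from trafilatura.settings import use_config

config = use_config()  # load a fresh instance of the default configuration
config.set('DEFAULT', 'MAX_REDIRECTS', '5')  # allow up to 5 redirections
result = fetch_url('http://httpbin.org/redirect/2', config=config)
```

Note that the pool managers and retry strategy are module-level objects created on first use, which is why the tests below reset them after changing the value.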
28 changes: 25 additions & 3 deletions tests/downloads_tests.py
@@ -27,6 +27,7 @@
 from trafilatura.cli_utils import (download_queue_processing,
                                    url_processing_pipeline)
 from trafilatura.core import extract
+import trafilatura.downloads
 from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT,
                                    _determine_headers, _handle_response,
                                    _parse_config, _pycurl_is_live_page,
@@ -47,6 +48,14 @@
 UA_CONFIG = use_config(filename=os.path.join(RESOURCES_DIR, 'newsettings.cfg'))


+def _reset_downloads_global_objects():
+    """
+    Force global objects to be re-created
+    """
+    trafilatura.downloads.HTTP_POOL = None
+    trafilatura.downloads.NO_CERT_POOL = None
+    trafilatura.downloads.RETRY_STRATEGY = None
+
 def test_fetch():
     '''Test URL fetching.'''
     # logic: empty request?
@@ -70,8 +79,9 @@ def test_fetch():
     assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None
     # no SSL, no decoding
     url = 'https://httpbun.com/status/200'
-    response = _send_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
-    assert response.data == b''
+    for no_ssl in (True, False):
+        response = _send_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG)
+        assert response.data == b''
     if pycurl is not None:
         response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG)
         assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG)
@@ -94,7 +104,19 @@ def test_fetch():
     assert load_html(response) is not None
     # nothing to see here
     assert extract(response, url=response.url, config=ZERO_CONFIG) is None
-
+    # test handling redirects
+    res = fetch_url('http://httpbin.org/redirect/2')
+    assert len(res) > 100  # we followed redirects and downloaded something in the end
+    new_config = use_config()  # get a new config instance to avoid mutating the default one
+    # patch max redirects: limit to 0, so no page will be fetched as a result
+    new_config.set('DEFAULT', 'MAX_REDIRECTS', '0')
+    _reset_downloads_global_objects()  # force Retry strategy and PoolManager to be recreated with the new config value
+    res = fetch_url('http://httpbin.org/redirect/1', config=new_config)
+    assert res is None
+    # also test the max redirect implementation on pycurl if available
+    if pycurl is not None:
+        assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None
+    _reset_downloads_global_objects()  # reset global objects again to avoid affecting other tests

 def test_config():
     '''Test how configuration options are read and stored.'''
11 changes: 5 additions & 6 deletions trafilatura/downloads.py
@@ -43,7 +43,6 @@
 PKG_VERSION = version("trafilatura")

 NUM_CONNECTIONS = 50
-MAX_REDIRECTS = 2

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 HTTP_POOL = None
@@ -90,8 +89,8 @@ def _send_request(url, no_ssl, config):
     global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
     if not RETRY_STRATEGY:
         RETRY_STRATEGY = urllib3.util.Retry(
-            total=0,
-            redirect=MAX_REDIRECTS,  # raise_on_redirect=False,
+            total=config.getint("DEFAULT", "MAX_REDIRECTS"),
+            redirect=config.getint("DEFAULT", "MAX_REDIRECTS"),  # raise_on_redirect=False,
             connect=0,
             backoff_factor=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')/2,
             status_forcelist=[
@@ -107,13 +106,13 @@ def _send_request(url, no_ssl, config):
             if not HTTP_POOL:
                 HTTP_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), ca_certs=certifi.where(), num_pools=NUM_CONNECTIONS)  # cert_reqs='CERT_REQUIRED'
             # execute request
-            response = HTTP_POOL.request('GET', url, headers=_determine_headers(config))
+            response = HTTP_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
         else:
             # define pool
             if not NO_CERT_POOL:
                 NO_CERT_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), cert_reqs='CERT_NONE', num_pools=NUM_CONNECTIONS)
             # execute request
-            response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config))
+            response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
     except urllib3.exceptions.SSLError:
         LOGGER.warning('retrying after SSLError: %s', url)
         return _send_request(url, True, config)
@@ -275,7 +274,7 @@ def _send_pycurl_request(url, no_ssl, config):
     curl.setopt(pycurl.HTTPHEADER, headerlist)
     # curl.setopt(pycurl.USERAGENT, '')
     curl.setopt(pycurl.FOLLOWLOCATION, 1)
-    curl.setopt(pycurl.MAXREDIRS, MAX_REDIRECTS)
+    curl.setopt(pycurl.MAXREDIRS, config.getint('DEFAULT', 'MAX_REDIRECTS'))
     curl.setopt(pycurl.CONNECTTIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
     curl.setopt(pycurl.TIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
     curl.setopt(pycurl.NOSIGNAL, 1)
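
For comparison, libcurl enforces the cap natively; a minimal standalone sketch of the two options involved (assuming pycurl is installed, placeholder URL):

```python
from io import BytesIO

import pycurl

buffer = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'http://httpbin.org/redirect/2')  # placeholder URL
curl.setopt(pycurl.FOLLOWLOCATION, 1)  # follow redirects at all
curl.setopt(pycurl.MAXREDIRS, 2)       # but no more than this many
curl.setopt(pycurl.WRITEDATA, buffer)
curl.perform()  # raises pycurl.error once the chain exceeds MAXREDIRS
curl.close()
```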
2 changes: 2 additions & 0 deletions trafilatura/settings.cfg
@@ -12,6 +12,8 @@ SLEEP_TIME = 5
 USER_AGENTS =
 # cookie for HTTP requests
 COOKIE =
+# Maximum number of redirects that we will follow
+MAX_REDIRECTS = 2

 # Extraction
 MIN_EXTRACTED_SIZE = 250
