Commit

added download error collection and archive.org option
adbar committed Dec 8, 2020
1 parent 44f5247 commit 7036193
Showing 3 changed files with 40 additions and 23 deletions.
2 changes: 1 addition & 1 deletion tests/cli_tests.py
@@ -156,7 +156,7 @@ def test_download():
     # multiprocessing
     domain_dict = dict()
     domain_dict['httpbin.org'] = ['https://httpbin.org/status/301', 'https://httpbin.org/status/304', 'https://httpbin.org/status/200', 'https://httpbin.org/status/300', 'https://httpbin.org/status/400', 'https://httpbin.org/status/505']
-    assert cli_utils.multi_threaded_processing(domain_dict, args, 0.25, None) is None
+    assert cli_utils.multi_threaded_processing(domain_dict, args, 0.25, None) == (['https://httpbin.org/status/301'], None)
     # test backoff algorithm
     testdict = dict()
     backoffdict = dict()
3 changes: 3 additions & 0 deletions trafilatura/cli.py
@@ -89,6 +89,9 @@ def parse_args(args):
                         help="""Use file content hash as output file name (for deduplication)
                                 instead of random default""",
                         action="store_true")
+    group1.add_argument('--archived',
+                        help='Try to fetch URLs from the Internet Archive if downloads fail',
+                        action="store_true")
 
     # https://docs.python.org/3/library/argparse.html#argparse.ArgumentParser.add_mutually_exclusive_group
     group2.add_argument('-out', '--output-format',
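Note: since the option is registered with action="store_true", it defaults to False and flips to True whenever --archived is passed on the command line. A minimal sketch of checking that behaviour through the parse_args() helper defined in this file (the import path and the empty remaining argument list are assumptions of this sketch, not part of the commit):

# Minimal sketch: exercise the new flag through trafilatura's argument parser
from trafilatura.cli import parse_args

args = parse_args(['--archived'])
print(args.archived)   # True; omitting the flag would leave it at False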
58 changes: 36 additions & 22 deletions trafilatura/cli_utils.py
@@ -123,7 +123,7 @@ def determine_output_path(args, orig_filename, content, counter=None, new_filena
         extension = '.json'
     # use cryptographic hash on file contents to define name
     if args.hash_as_name is True:
-        new_filename = content_fingerprint(content)[:27].replace('/', '-')
+        new_filename = content_fingerprint(content)[:27].replace('/', '-')
     # determine directory
     if args.keep_dirs is True:
         # strip directory
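Note: the line touched here (a whitespace-only change) belongs to the --hash-as-name feature, which names output files after a fingerprint of their content instead of a random slug. content_fingerprint() is trafilatura's own helper; the stand-in below only illustrates the naming scheme applied on this line (truncate the fingerprint and replace '/' so the name is path-safe) and is not the library's implementation:

# Hypothetical stand-in for the hash-based naming idea (not trafilatura's content_fingerprint)
import base64
import hashlib

def hash_based_name(content: str) -> str:
    digest = hashlib.sha256(content.encode('utf-8')).digest()
    fingerprint = base64.b64encode(digest).decode('utf-8')
    # mirror the post-processing above: truncate and strip '/' so the name is path-safe
    return fingerprint[:27].replace('/', '-')

print(hash_based_name('<html><body>example</body></html>'))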
@@ -200,21 +200,17 @@ def url_processing_checks(blacklist, input_urls):
 
 def process_result(htmlstring, args, url, counter):
     '''Extract text and metadata from a download webpage and eventually write out the result'''
-    if htmlstring is not None:
-        # backup option
-        if args.backup_dir:
-            fileslug = archive_html(htmlstring, args, counter)
-        else:
-            fileslug = None
-        # process
-        result = examine(htmlstring, args, url=url)
-        write_result(result, args, orig_filename=None, counter=None, new_filename=fileslug)
-        # increment written file counter
-        if counter is not None:
-            counter += 1
+    # backup option
+    if args.backup_dir:
+        fileslug = archive_html(htmlstring, args, counter)
     else:
-        # log the error
-        print('No result for URL: ' + url, file=sys.stderr)
+        fileslug = None
+    # process
+    result = examine(htmlstring, args, url=url)
+    write_result(result, args, orig_filename=None, counter=None, new_filename=fileslug)
+    # increment written file counter
+    if counter is not None:
+        counter += 1
     return counter
 
 
@@ -248,21 +248,27 @@ def single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, count
     '''Implement a single threaded processing algorithm'''
     # start with a higher level
     i = 3
+    errors = []
     while domain_dict:
         url, domain_dict, backoff_dict, i = draw_backoff_url(domain_dict, backoff_dict, sleeptime, i)
         htmlstring = fetch_url(url)
-        counter = process_result(htmlstring, args, url, counter)
+        if htmlstring is not None:
+            counter = process_result(htmlstring, args, url, counter)
+        else:
+            LOGGER.debug('No result for URL: %s', url)
+            errors.append(url)
+    return errors, counter
 
 
 def multi_threaded_processing(domain_dict, args, sleeptime, counter):
     '''Implement a multi-threaded processing algorithm'''
-    i, backoff_dict = 0, dict()
+    i, backoff_dict, errors = 0, dict(), []
     download_threads = args.parallel or DOWNLOAD_THREADS
     while domain_dict:
         # the remaining list is too small, process it differently
         if len({x for v in domain_dict.values() for x in v}) < download_threads:
-            single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, counter)
-            return
+            errors, counter = single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, counter)
+            return errors, counter
         # populate buffer
         bufferlist = []
         while len(bufferlist) < download_threads:
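Note: both download loops now follow the same contract: URLs whose download returns None are no longer only logged, they are collected and handed back to the caller together with the file counter. A self-contained sketch of that contract, with a stand-in fetcher in place of trafilatura's fetch_url():

# Sketch of the error-collection contract: process what downloads, collect what doesn't
from typing import Callable, List, Optional, Tuple

def process_batch(urls: List[str], fetch: Callable[[str], Optional[str]],
                  counter: Optional[int] = None) -> Tuple[List[str], Optional[int]]:
    errors = []
    for url in urls:
        htmlstring = fetch(url)
        if htmlstring is not None:
            # a real implementation would extract and write out the result here
            if counter is not None:
                counter += 1
        else:
            errors.append(url)   # kept for a later retry, e.g. via the Internet Archive
    return errors, counter

# usage: a fake fetcher that fails on one URL
errors, counter = process_batch(['https://a.example', 'https://b.example'],
                                lambda u: None if 'b.example' in u else '<html/>', 0)
print(errors, counter)   # ['https://b.example'] 1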
@@ -274,7 +274,12 @@ def multi_threaded_processing(domain_dict, args, sleeptime, counter):
             for future in as_completed(future_to_url):
                 url = future_to_url[future]
                 # handle result
-                counter = process_result(future.result(), args, url, counter)
+                if future.result() is not None:
+                    counter = process_result(future.result(), args, url, counter)
+                else:
+                    LOGGER.debug('No result for URL: %s', url)
+                    errors.append(url)
+    return errors, counter
 
 
 def url_processing_pipeline(args, input_urls, sleeptime):
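Note: the threaded branch uses the standard concurrent.futures idiom: submit one download per URL, keep a future-to-URL mapping, and sort the results into successes and failures as they complete. A self-contained sketch of that idiom with a dummy fetcher (dummy_fetch and the example URLs are illustrative only, not trafilatura code):

# Standard future_to_url pattern with error collection (dummy fetcher, for illustration)
from concurrent.futures import ThreadPoolExecutor, as_completed

def dummy_fetch(url):
    # stand-in for a real download; pretend every 'status/4xx' URL fails
    return None if '/status/4' in url else '<html>ok</html>'

urls = ['https://httpbin.org/status/200', 'https://httpbin.org/status/404']
errors = []
with ThreadPoolExecutor(max_workers=4) as executor:
    future_to_url = {executor.submit(dummy_fetch, url): url for url in urls}
    for future in as_completed(future_to_url):
        url = future_to_url[future]
        if future.result() is not None:
            print('downloaded:', url)
        else:
            errors.append(url)
print('failed:', errors)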
@@ -299,9 +306,16 @@ def url_processing_pipeline(args, input_urls, sleeptime):
     else:
         counter = None
     if len(domain_dict) <= 5:
-        single_threaded_processing(domain_dict, dict(), args, sleeptime, counter)
+        errors, counter = single_threaded_processing(domain_dict, dict(), args, sleeptime, counter)
     else:
-        multi_threaded_processing(domain_dict, args, sleeptime, counter)
+        errors, counter = multi_threaded_processing(domain_dict, args, sleeptime, counter)
+    LOGGER.debug('%s URLs could not be found', len(errors))
+    # option to retry
+    if args.archived is True:
+        domain_dict = dict()
+        domain_dict['archive.org'] = ['https://web.archive.org/web/20/' + e for e in errors]
+        archived_errors, _ = single_threaded_processing(domain_dict, dict(), args, sleeptime, counter)
+        LOGGER.debug('%s archived URLs out of %s could not be found', len(archived_errors), len(errors))
 
 
 def file_processing_pipeline(args):
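Note: the retry pass rewrites every failed URL as a Wayback Machine lookup; a prefix such as https://web.archive.org/web/20/ should make archive.org redirect to the capture closest to the partial timestamp '20', so each rewritten address points at an archived copy of the page. A small sketch of just the rewriting step (the function name and example URLs are illustrative):

# Sketch: build Wayback Machine fallback URLs for the URLs that could not be downloaded
ARCHIVE_PREFIX = 'https://web.archive.org/web/20/'

def archived_urls(errors):
    # mirrors the commit: one 'archive.org' bucket holding all rewritten URLs
    return {'archive.org': [ARCHIVE_PREFIX + url for url in errors]}

print(archived_urls(['https://example.org/page', 'https://example.net/missing']))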
