Commit

added download error collection and archive.org option
adbar committed Dec 8, 2020
1 parent 44f5247 commit 7036193
Showing 3 changed files with 40 additions and 23 deletions.
2 changes: 1 addition & 1 deletion tests/cli_tests.py
@@ -156,7 +156,7 @@ def test_download():
     # multiprocessing
     domain_dict = dict()
     domain_dict['httpbin.org'] = ['https://httpbin.org/status/301', 'https://httpbin.org/status/304', 'https://httpbin.org/status/200', 'https://httpbin.org/status/300', 'https://httpbin.org/status/400', 'https://httpbin.org/status/505']
-    assert cli_utils.multi_threaded_processing(domain_dict, args, 0.25, None) is None
+    assert cli_utils.multi_threaded_processing(domain_dict, args, 0.25, None) == (['https://httpbin.org/status/301'], None)
     # test backoff algorithm
     testdict = dict()
     backoffdict = dict()
3 changes: 3 additions & 0 deletions trafilatura/cli.py
@@ -89,6 +89,9 @@ def parse_args(args):
                         help="""Use file content hash as output file name (for deduplication)
                                 instead of random default""",
                         action="store_true")
+    group1.add_argument('--archived',
+                        help='Try to fetch URLs from the Internet Archive if downloads fail',
+                        action="store_true")
 
     # https://docs.python.org/3/library/argparse.html#argparse.ArgumentParser.add_mutually_exclusive_group
     group2.add_argument('-out', '--output-format',
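Note: since the option is registered with action="store_true", it defaults to False and flips to True whenever --archived is passed on the command line. A minimal sketch of checking that behaviour through the parse_args() helper defined in this file (the import path and the empty remaining argument list are assumptions of this sketch, not part of the commit):

# Minimal sketch: exercise the new flag through trafilatura's argument parser
from trafilatura.cli import parse_args

args = parse_args(['--archived'])
print(args.archived)   # True; omitting the flag would leave it at False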
58 changes: 36 additions & 22 deletions trafilatura/cli_utils.py
@@ -123,7 +123,7 @@ def determine_output_path(args, orig_filename, content, counter=None, new_filena
         extension = '.json'
     # use cryptographic hash on file contents to define name
     if args.hash_as_name is True:
-        new_filename = content_fingerprint(content)[:27].replace('/', '-')
+        new_filename = content_fingerprint(content)[:27].replace('/', '-')
     # determine directory
     if args.keep_dirs is True:
         # strip directory
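Note: the line touched here (a whitespace-only change) belongs to the --hash-as-name feature, which names output files after a fingerprint of their content instead of a random slug. content_fingerprint() is trafilatura's own helper; the stand-in below only illustrates the naming scheme applied on this line (truncate the fingerprint and replace '/' so the name is path-safe) and is not the library's implementation:

# Hypothetical stand-in for the hash-based naming idea (not trafilatura's content_fingerprint)
import base64
import hashlib

def hash_based_name(content: str) -> str:
    digest = hashlib.sha256(content.encode('utf-8')).digest()
    fingerprint = base64.b64encode(digest).decode('utf-8')
    # mirror the post-processing above: truncate and strip '/' so the name is path-safe
    return fingerprint[:27].replace('/', '-')

print(hash_based_name('<html><body>example</body></html>'))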
@@ -200,21 +200,17 @@ def url_processing_checks(blacklist, input_urls):
 
 def process_result(htmlstring, args, url, counter):
     '''Extract text and metadata from a download webpage and eventually write out the result'''
-    if htmlstring is not None:
-        # backup option
-        if args.backup_dir:
-            fileslug = archive_html(htmlstring, args, counter)
-        else:
-            fileslug = None
-        # process
-        result = examine(htmlstring, args, url=url)
-        write_result(result, args, orig_filename=None, counter=None, new_filename=fileslug)
-        # increment written file counter
-        if counter is not None:
-            counter += 1
+    # backup option
+    if args.backup_dir:
+        fileslug = archive_html(htmlstring, args, counter)
     else:
-        # log the error
-        print('No result for URL: ' + url, file=sys.stderr)
+        fileslug = None
+    # process
+    result = examine(htmlstring, args, url=url)
+    write_result(result, args, orig_filename=None, counter=None, new_filename=fileslug)
+    # increment written file counter
+    if counter is not None:
+        counter += 1
     return counter
 
 
@@ -248,21 +248,27 @@ def single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, count
     '''Implement a single threaded processing algorithm'''
     # start with a higher level
     i = 3
+    errors = []
     while domain_dict:
         url, domain_dict, backoff_dict, i = draw_backoff_url(domain_dict, backoff_dict, sleeptime, i)
         htmlstring = fetch_url(url)
-        counter = process_result(htmlstring, args, url, counter)
+        if htmlstring is not None:
+            counter = process_result(htmlstring, args, url, counter)
+        else:
+            LOGGER.debug('No result for URL: %s', url)
+            errors.append(url)
+    return errors, counter
 
 
 def multi_threaded_processing(domain_dict, args, sleeptime, counter):
     '''Implement a multi-threaded processing algorithm'''
-    i, backoff_dict = 0, dict()
+    i, backoff_dict, errors = 0, dict(), []
     download_threads = args.parallel or DOWNLOAD_THREADS
     while domain_dict:
         # the remaining list is too small, process it differently
         if len({x for v in domain_dict.values() for x in v}) < download_threads:
-            single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, counter)
-            return
+            errors, counter = single_threaded_processing(domain_dict, backoff_dict, args, sleeptime, counter)
+            return errors, counter
         # populate buffer
         bufferlist = []
         while len(bufferlist) < download_threads:
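Note: both download loops now follow the same contract: URLs whose download returns None are no longer only logged, they are collected and handed back to the caller together with the file counter. A self-contained sketch of that contract, with a stand-in fetcher in place of trafilatura's fetch_url():

# Sketch of the error-collection contract: process what downloads, collect what doesn't
from typing import Callable, List, Optional, Tuple

def process_batch(urls: List[str], fetch: Callable[[str], Optional[str]],
                  counter: Optional[int] = None) -> Tuple[List[str], Optional[int]]:
    errors = []
    for url in urls:
        htmlstring = fetch(url)
        if htmlstring is not None:
            # a real implementation would extract and write out the result here
            if counter is not None:
                counter += 1
        else:
            errors.append(url)   # kept for a later retry, e.g. via the Internet Archive
    return errors, counter

# usage: a fake fetcher that fails on one URL
errors, counter = process_batch(['https://a.example', 'https://b.example'],
                                lambda u: None if 'b.example' in u else '<html/>', 0)
print(errors, counter)   # ['https://b.example'] 1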
@@ -274,7 +274,12 @@ def multi_threaded_processing(domain_dict, args, sleeptime, counter):
             for future in as_completed(future_to_url):
                 url = future_to_url[future]
                 # handle result
-                counter = process_result(future.result(), args, url, counter)
+                if future.result() is not None:
+                    counter = process_result(future.result(), args, url, counter)
+                else:
+                    LOGGER.debug('No result for URL: %s', url)
+                    errors.append(url)
+    return errors, counter
 
 
 def url_processing_pipeline(args, input_urls, sleeptime):
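Note: the threaded branch uses the standard concurrent.futures idiom: submit one download per URL, keep a future-to-URL mapping, and sort the results into successes and failures as they complete. A self-contained sketch of that idiom with a dummy fetcher (dummy_fetch and the example URLs are illustrative only, not trafilatura code):

# Standard future_to_url pattern with error collection (dummy fetcher, for illustration)
from concurrent.futures import ThreadPoolExecutor, as_completed

def dummy_fetch(url):
    # stand-in for a real download; pretend every 'status/4xx' URL fails
    return None if '/status/4' in url else '<html>ok</html>'

urls = ['https://httpbin.org/status/200', 'https://httpbin.org/status/404']
errors = []
with ThreadPoolExecutor(max_workers=4) as executor:
    future_to_url = {executor.submit(dummy_fetch, url): url for url in urls}
    for future in as_completed(future_to_url):
        url = future_to_url[future]
        if future.result() is not None:
            print('downloaded:', url)
        else:
            errors.append(url)
print('failed:', errors)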
@@ -299,9 +306,16 @@ def url_processing_pipeline(args, input_urls, sleeptime):
     else:
         counter = None
     if len(domain_dict) <= 5:
-        single_threaded_processing(domain_dict, dict(), args, sleeptime, counter)
+        errors, counter = single_threaded_processing(domain_dict, dict(), args, sleeptime, counter)
     else:
-        multi_threaded_processing(domain_dict, args, sleeptime, counter)
+        errors, counter = multi_threaded_processing(domain_dict, args, sleeptime, counter)
+    LOGGER.debug('%s URLs could not be found', len(errors))
+    # option to retry
+    if args.archived is True:
+        domain_dict = dict()
+        domain_dict['archive.org'] = ['https://web.archive.org/web/20/' + e for e in errors]
+        archived_errors, _ = single_threaded_processing(domain_dict, dict(), args, sleeptime, counter)
+        LOGGER.debug('%s archived URLs out of %s could not be found', len(archived_errors), len(errors))
 
 
 def file_processing_pipeline(args):
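Note: the retry pass rewrites every failed URL as a Wayback Machine lookup; a prefix such as https://web.archive.org/web/20/ should make archive.org redirect to the capture closest to the partial timestamp '20', so each rewritten address points at an archived copy of the page. A small sketch of just the rewriting step (the function name and example URLs are illustrative):

# Sketch: build Wayback Machine fallback URLs for the URLs that could not be downloaded
ARCHIVE_PREFIX = 'https://web.archive.org/web/20/'

def archived_urls(errors):
    # mirrors the commit: one 'archive.org' bucket holding all rewritten URLs
    return {'archive.org': [ARCHIVE_PREFIX + url for url in errors]}

print(archived_urls(['https://example.org/page', 'https://example.net/missing']))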
