adbar · adbar · Jul 18, 2024 · Jul 17, 2024 · Jul 18, 2024
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -809,7 +809,9 @@ def test_precision_recall():
     result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, no_fallback=True)
     assert '1' not in result
 
-    my_document = html.fromstring('<html><body><div class="article-body"><p>content</p><h2>Test</h2></div></body></html>')
+    my_document = html.fromstring('<html><body><div class="article-body"><p>content</p><p class="link">Test</p></div></body></html>')
+    result = extract(copy(my_document), favor_precision=False, config=ZERO_CONFIG, no_fallback=True)
+    assert 'content' in result and 'Test' in result
     result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, no_fallback=True)
     assert 'content' in result and 'Test' not in result
 
@@ -823,6 +825,10 @@ def test_precision_recall():
     result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, no_fallback=False)
     assert len(result) > 0
 
+    my_document = html.fromstring('<html><body><div><span>Text.</span></div></body></html>')
+    assert extract(copy(my_document), favor_precision=True, no_fallback=True) == ""
+    assert extract(copy(my_document), favor_recall=True, no_fallback=True) == "Text."
+
 
 def test_table_processing():
     options = DEFAULT_OPTIONS

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -68,6 +68,23 @@ def determine_returnstring(document, options):
     return normalize_unicode(returnstring)
 
 
+def trafilatura_sequence(cleaned_tree, cleaned_tree_backup, tree_backup, options):
+    """Execute the standard cascade of extractors used by Trafilatura.
+
+    Returns the (postbody, temp_text, len_text) triple produced by the final stage that ran.
+    """
+    # stage 1: Trafilatura's main extractor, applied to the cleaned tree
+    postbody, temp_text, len_text = extract_content(cleaned_tree, options)
+    # stage 2: comparison with external extractors, skipped when options.fast is set
+    if not options.fast:
+        postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, deepcopy(tree_backup), postbody, temp_text, len_text, options)
+    # stage 3: rescue with baseline extraction on a copy of the original/dirty tree, never in precision mode
+    if len_text < options.min_extracted_size and options.focus != "precision":
+        postbody, temp_text, len_text = baseline(deepcopy(tree_backup))
+        LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)
+    return postbody, temp_text, len_text
+
+
 def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
                     favor_precision=False, favor_recall=False,
                     include_comments=True, output_format="python", target_language=None,
@@ -176,11 +193,8 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
                 prune_xpath = [prune_xpath]
             tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath])
 
-        # backup for further processing
-        tree_backup = copy(tree)
-
-        # clean
-        cleaned_tree = tree_cleaning(tree, options)
+        # clean and backup for further processing
+        cleaned_tree = tree_cleaning(copy(tree), options)
         cleaned_tree_backup = copy(cleaned_tree)
 
         # convert tags, the rest does not work without conversion
@@ -194,17 +208,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False,  # fast=False,
         if options.focus == "precision":
             cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)
 
-        # extract content
-        postbody, temp_text, len_text = extract_content(cleaned_tree, options)
-
-        # compare if necessary
-        if not options.fast:
-            postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, deepcopy(tree_backup), postbody, temp_text, len_text, options)
-        # add baseline as additional fallback
-        # rescue: try to use original/dirty tree # and favor_precision is False=?
-        if len_text < options.min_extracted_size:
-            postbody, temp_text, len_text = baseline(deepcopy(tree_backup))
-            LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)
+        postbody, temp_text, len_text = trafilatura_sequence(cleaned_tree, cleaned_tree_backup, tree, options)
 
         # tree size sanity check
         if options.max_tree_size: