Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

precision fix: do not use baseline as backup extraction #646

Merged
merged 2 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,9 @@ def test_precision_recall():
result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, no_fallback=True)
assert '1' not in result

my_document = html.fromstring('<html><body><div class="article-body"><p>content</p><h2>Test</h2></div></body></html>')
my_document = html.fromstring('<html><body><div class="article-body"><p>content</p><p class="link">Test</p></div></body></html>')
result = extract(copy(my_document), favor_precision=False, config=ZERO_CONFIG, no_fallback=True)
assert 'content' in result and 'Test' in result
result = extract(copy(my_document), favor_precision=True, config=ZERO_CONFIG, no_fallback=True)
assert 'content' in result and 'Test' not in result

Expand All @@ -823,6 +825,10 @@ def test_precision_recall():
result = extract(copy(my_document), favor_recall=True, config=ZERO_CONFIG, no_fallback=False)
assert len(result) > 0

my_document = html.fromstring('<html><body><div><span>Text.</span></div></body></html>')
assert extract(copy(my_document), favor_precision=True, no_fallback=True) == ""
assert extract(copy(my_document), favor_recall=True, no_fallback=True) == "Text."


def test_table_processing():
options = DEFAULT_OPTIONS
Expand Down
36 changes: 20 additions & 16 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,23 @@ def determine_returnstring(document, options):
return normalize_unicode(returnstring)


def trafilatura_sequence(cleaned_tree, cleaned_tree_backup, tree_backup, options):
    """Run the standard cascade of extractors used by Trafilatura.

    The steps are applied in order:

    1. ``extract_content`` on ``cleaned_tree`` (the main extractor).
    2. Unless ``options.fast`` is set, ``compare_extraction`` is given
       ``cleaned_tree_backup``, a deep copy of ``tree_backup`` and the
       current result, and its return value replaces that result.
    3. If the extracted length is still below ``options.min_extracted_size``
       and ``options.focus`` is not ``"precision"``, ``baseline`` is run on a
       deep copy of ``tree_backup`` (the original/dirty tree) as a rescue.

    Returns:
        The ``(postbody, temp_text, len_text)`` triple produced by the last
        extractor that ran.
    """
    # step 1: Trafilatura's main extractor
    body, text, text_length = extract_content(cleaned_tree, options)

    # step 2: comparison with external extractors (skipped in fast mode)
    if not options.fast:
        body, text, text_length = compare_extraction(
            cleaned_tree_backup,
            deepcopy(tree_backup),
            body,
            text,
            text_length,
            options,
        )

    # no rescue needed when enough text was found, and none wanted when
    # precision is favored
    if text_length >= options.min_extracted_size or options.focus == "precision":
        return body, text, text_length

    # step 3: rescue with baseline extraction on the original/dirty tree
    body, text, text_length = baseline(deepcopy(tree_backup))
    LOGGER.debug('non-clean extracted length: %s (extraction)', text_length)
    return body, text, text_length


def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
favor_precision=False, favor_recall=False,
include_comments=True, output_format="python", target_language=None,
Expand Down Expand Up @@ -176,11 +193,8 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
prune_xpath = [prune_xpath]
tree = prune_unwanted_nodes(tree, [XPath(x) for x in prune_xpath])

# backup for further processing
tree_backup = copy(tree)

# clean
cleaned_tree = tree_cleaning(tree, options)
# clean and backup for further processing
cleaned_tree = tree_cleaning(copy(tree), options)
cleaned_tree_backup = copy(cleaned_tree)

# convert tags, the rest does not work without conversion
Expand All @@ -194,17 +208,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
if options.focus == "precision":
cleaned_tree = prune_unwanted_nodes(cleaned_tree, REMOVE_COMMENTS_XPATH)

# extract content
postbody, temp_text, len_text = extract_content(cleaned_tree, options)

# compare if necessary
if not options.fast:
postbody, temp_text, len_text = compare_extraction(cleaned_tree_backup, deepcopy(tree_backup), postbody, temp_text, len_text, options)
# add baseline as additional fallback
# rescue: try to use original/dirty tree # and favor_precision is False=?
if len_text < options.min_extracted_size:
postbody, temp_text, len_text = baseline(deepcopy(tree_backup))
LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)
postbody, temp_text, len_text = trafilatura_sequence(cleaned_tree, cleaned_tree_backup, tree, options)

# tree size sanity check
if options.max_tree_size:
Expand Down
Loading