
Commit

extraction fix: div with only lb (#4)
adbar committed May 11, 2022
1 parent 1f4605c commit 14d6205
Showing 2 changed files with 8 additions and 2 deletions.
2 changes: 2 additions & 0 deletions tests/unit_tests.py
@@ -171,6 +171,8 @@ def test_exotic_tags(xmloutput=False):
     element.append(etree.Element('lb'))
     converted = handle_paragraphs(element, ['p'], False, ZERO_CONFIG)
     assert etree.tostring(converted) == b'<p>1st part. 2nd part.</p>'
+    # naked div with <lb>
+    assert '1.\n2.\n3.' in extract('<html><body><main><div>1.<br/>2.<br/>3.<br/></div></main></body></html>', no_fallback=True, config=ZERO_CONFIG)
     # malformed lists (common error)
     result = etree.tostring(handle_lists(etree.fromstring('<list>Description of the list:<item>List item 1</item><item>List item 2</item><item>List item 3</item></list>'), False, ZERO_CONFIG))
     assert result.count(b'List item') == 3
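
For illustration, a minimal sketch of the scenario the new test asserts, using the public trafilatura.extract entry point (assumed here to accept the same no_fallback keyword as the extract call in the test above):

    from trafilatura import extract

    # A div whose only child elements are line breaks: the text sits in the
    # div's .text and in the tails of the <br/> (internally <lb>) elements.
    html = '<html><body><main><div>1.<br/>2.<br/>3.<br/></div></main></body></html>'
    result = extract(html, no_fallback=True)
    print(result)  # after this fix, expected to contain '1.\n2.\n3.'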
8 changes: 6 additions & 2 deletions trafilatura/core.py
@@ -537,10 +537,14 @@ def extract_content(tree, favor_precision=False, favor_recall=False, include_tab
         if 'span' not in potential_tags:
             strip_tags(subtree, 'span')
         LOGGER.debug(sorted(potential_tags))
-        ##strip_tags(subtree, 'lb') # BoingBoing-Bug
+        # proper extraction
+        subelems = subtree.xpath('.//*')
+        # e.g. only lb-elems in a div
+        if set(e.tag for e in subelems) == {'lb'}:
+            subelems = [subtree]
         # extract content # list(filter(None.__ne__, processed_elems)) ?
         result_body.extend(e for e in
-                           [handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')]
+                           [handle_textelem(e, potential_tags, deduplicate, config) for e in subelems]
                            if e is not None)
         # remove trailing titles
         while len(result_body) > 0 and (result_body[-1].tag in NOT_AT_THE_END):
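
The gist of the change, as a standalone lxml sketch (independent of trafilatura's handle_textelem and result_body machinery): when every descendant of the subtree is a bare <lb> element, iterating over the descendants alone would drop the text held in the parent's .text and in the <lb> tails, so the parent element itself is processed instead.

    from lxml import etree

    subtree = etree.fromstring('<div>1.<lb/>2.<lb/>3.<lb/></div>')
    subelems = subtree.xpath('.//*')
    # Only <lb> descendants: their textual content lives on the div (.text)
    # and on the <lb> tails, so handle the div as a whole.
    if set(e.tag for e in subelems) == {'lb'}:
        subelems = [subtree]
    print([e.tag for e in subelems])  # ['div']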
