Skip to content

Commit

Permalink
fix: list spacing in TXT output (#598)
Browse files (browse the repository at this point in the history)
  • Loading branch information
adbar committed May 16, 2024
1 parent 9307d90 commit 1ce0e76
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
4 changes: 4 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,10 @@ def test_table_processing():

def test_list_processing():
options = DEFAULT_OPTIONS
# basic lists
my_doc = "<html><body><article><p>P 1</p><ul><li>Item 1</li><li>Item 2</li></ul><p>P 2</p></article></body></html>"
my_result = extract(my_doc, no_fallback=True, output_format='txt', config=ZERO_CONFIG)
assert my_result == "P 1\n- Item 1\n- Item 2\nP 2"
# malformed lists (common error)
result = etree.tostring(handle_lists(etree.fromstring('<list>Description of the list:<item>List item 1</item><item>List item 2</item><item>List item 3</item></list>'), options))
assert result.count(b'List item') == 3
Expand Down
4 changes: 3 additions & 1 deletion trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
CONTROL_PARSER = XMLParser(remove_blank_text=True)

NEWLINE_ELEMS = {
'item': '\n- ',
**{tag: '\n' for tag in ['code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table']}
}
SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'}
Expand Down Expand Up @@ -248,6 +247,9 @@ def replace_element_text(element, include_formatting):
elem_text = link_text
else:
LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
return elem_text


Expand Down

0 comments on commit 1ce0e76

Please sign in to comment.