From 1ce0e76ced5d3589a1fbe7aa17fc35a96c3c431a Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 16 May 2024 17:24:20 +0200 Subject: [PATCH] fix: list spacing in TXT output (#598) --- tests/unit_tests.py | 4 ++++ trafilatura/xml.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 70b45f94..8426655d 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1085,6 +1085,10 @@ def test_table_processing(): def test_list_processing(): options = DEFAULT_OPTIONS + # basic lists + my_doc = "

<html><body><article><p>P 1</p><ul><li>Item 1</li><li>Item 2</li></ul><p>P 2</p></article></body></html>

" + my_result = extract(my_doc, no_fallback=True, output_format='txt', config=ZERO_CONFIG) + assert my_result == "P 1\n- Item 1\n- Item 2\nP 2" # malformed lists (common error) result = etree.tostring(handle_lists(etree.fromstring('Description of the list:List item 1List item 2List item 3'), options)) assert result.count(b'List item') == 3 diff --git a/trafilatura/xml.py b/trafilatura/xml.py index 85962142..d30a22e2 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -39,7 +39,6 @@ CONTROL_PARSER = XMLParser(remove_blank_text=True) NEWLINE_ELEMS = { - 'item': '\n- ', **{tag: '\n' for tag in ['code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table']} } SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'} @@ -248,6 +247,9 @@ def replace_element_text(element, include_formatting): elem_text = link_text else: LOGGER.warning("empty link: %s %s", elem_text, element.attrib) + # lists + elif element.tag == "item" and elem_text: + elem_text = f"- {elem_text}\n" return elem_text