Skip to content

Commit

Permalink
fix behavior and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Jul 17, 2024
1 parent 365d365 commit 8d95eb9
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 9 deletions.
9 changes: 5 additions & 4 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,12 @@ def load_mock_page(url, xml_flag=False, langcheck=None, tei_output=False):
htmlstring = htmlbinary
else:
print('Encoding error')
- output_format = 'txt'
- if xml_flag is True:
+ if xml_flag:
output_format = 'xml'
- if tei_output is True:
- output_format = 'tei'
+ elif tei_output:
+ output_format = 'xmltei'
+ else:
+ output_format = 'txt'
return extract(htmlstring, url,
record_id='0000',
no_fallback=False,
Expand Down
6 changes: 3 additions & 3 deletions trafilatura/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
file_processing_pipeline, load_blacklist,
load_input_dict, probe_homepage,
url_processing_pipeline, write_result)
- from .settings import PARALLEL_CORES, SUPPORTED_FORMATS
+ from .settings import PARALLEL_CORES, SUPPORTED_FMT_CLI

# fix output encoding on some systems
try:
Expand Down Expand Up @@ -162,10 +162,10 @@ def add_args(parser):
# https://docs.python.org/3/library/argparse.html#argparse.ArgumentParser.add_mutually_exclusive_group
group5_ex.add_argument("-out",
help=argparse.SUPPRESS,
- choices=sorted(SUPPORTED_FORMATS))
+ choices=sorted(SUPPORTED_FMT_CLI))
group5_ex.add_argument('--output-format',
help="determine output format",
- choices=sorted(SUPPORTED_FORMATS),
+ choices=sorted(SUPPORTED_FMT_CLI),
default='txt')
group5_ex.add_argument("--csv",
help="shorthand for CSV output",
Expand Down
4 changes: 3 additions & 1 deletion trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

LOGGER = logging.getLogger(__name__)

TXT_FORMATS = {"markdown", "txt"}


def determine_returnstring(document, options):
'''Convert XML tree to chosen format, clean the result and output it as a string'''
Expand Down Expand Up @@ -347,7 +349,7 @@ def extract(filecontent, url=None, record_id=None, no_fallback=False,
if document is None:
return None

- if options.format not in ("markdown", "txt"):
+ if options.format not in TXT_FORMATS:
# add record ID to metadata
document.id = record_id
# calculate fingerprint
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
from .utils import line_processing


- SUPPORTED_FORMATS = {"csv", "json", "html", "markdown", "txt", "xml", "xmltei"}
+ SUPPORTED_FMT_CLI = ["csv", "json", "html", "markdown", "txt", "xml", "xmltei"]
+ SUPPORTED_FORMATS = set(SUPPORTED_FMT_CLI) | {"python"}  # the latter for bare_extraction() only
_SUPPORTED = ', '.join(sorted(SUPPORTED_FORMATS))


Expand Down

0 comments on commit 8d95eb9

Please sign in to comment.