
Commit

review logging levels (#347)
* review logging levels

* workflow: remove duplicate tests
adbar committed May 3, 2023
1 parent 075fe6d commit 6ff9ca2
Showing 10 changed files with 39 additions and 42 deletions.
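
Most of the messages touched below move from info, warning, or error down to debug, so they no longer appear with default logging settings. A minimal sketch of how a caller could still surface them, assuming only the standard-library logging module and the per-module loggers created with logging.getLogger(__name__) as in the files below:

    import logging

    # Send records to the console; keep other libraries at INFO but let the
    # trafilatura loggers (children of the "trafilatura" logger) emit DEBUG.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("trafilatura").setLevel(logging.DEBUG)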
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -17,7 +17,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: [3.7, 3.8, 3.9, "3.10", "3.11"] # "3.12-dev"
python-version: [3.8, 3.9, "3.10", "3.11"] # "3.12-dev"
env: [{ MINIMAL: "true" }, { MINIMAL: "false" }]
include:
# custom python versions
2 changes: 0 additions & 2 deletions trafilatura/cli.py
@@ -23,8 +23,6 @@
from .sitemaps import sitemap_search


LOGGER = logging.getLogger(__name__)

# fix output encoding on some systems
try:
# > Python 3.7
22 changes: 11 additions & 11 deletions trafilatura/core.py
@@ -233,7 +233,7 @@ def handle_other_elements(element, potential_tags, options):
'''Handle diverse or unknown elements in the scope of relevant tags'''
# delete unwanted
if element.tag not in potential_tags:
# LOGGER.debug('discarding: %s %s', element.tag, element.text)
LOGGER.debug('discarding element: %s %s', element.tag, element.text)
return None
if element.tag == 'div':
# make a copy and prune it in case it contains sub-elements handled on their own?
@@ -668,9 +668,9 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options):
# apply decision
if algo_flag:
body, text, len_text = temppost_algo, algo_text, len_algo
LOGGER.info('using generic algorithm: %s', url)
LOGGER.debug('using generic algorithm: %s', url)
else:
LOGGER.info('using custom extraction: %s', url)
LOGGER.debug('using custom extraction: %s', url)
# override faulty extraction: try with justext
if body.xpath(SANITIZED_XPATH) or len_text < min_target_length: # body.find(...)
# or options.recall is True ?
@@ -883,7 +883,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
document = extract_metadata(tree, url, date_extraction_params, no_fallback, author_blacklist)
# cut short if extracted URL in blacklist
if document.url in url_blacklist:
LOGGER.info('blacklisted URL: %s', url)
LOGGER.warning('blacklisted URL: %s', url)
raise ValueError
# cut short if core elements are missing
if only_with_metadata is True and any(
@@ -936,34 +936,34 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
if max_tree_size is not None:
# strip tags
if len(postbody) > max_tree_size:
LOGGER.warning('output tree too long: %s', len(postbody))
LOGGER.debug('output tree too long: %s', len(postbody))
strip_tags(postbody, 'hi')
# still too long, raise an error
if len(postbody) > max_tree_size:
LOGGER.error('output tree too long: %s, discarding file', len(postbody))
LOGGER.debug('output tree too long: %s, discarding file', len(postbody))
raise ValueError
# size checks
if len_comments < config.getint('DEFAULT', 'MIN_EXTRACTED_COMM_SIZE'):
LOGGER.info('not enough comments %s', url)
LOGGER.debug('not enough comments %s', url)
if len_text < config.getint('DEFAULT', 'MIN_OUTPUT_SIZE') and len_comments < config.getint('DEFAULT',
'MIN_OUTPUT_COMM_SIZE'):
LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments)
LOGGER.debug('text and comments not long enough: %s %s', len_text, len_comments)
raise ValueError

# check duplicates at body level
if deduplicate is True and duplicate_test(postbody, config) is True:
LOGGER.error('duplicate document for URL %s', url)
LOGGER.debug('discarding duplicate document for URL %s', url)
raise ValueError

# sanity check on language
if target_language is not None:
is_not_target_lang, document = language_filter(temp_text, temp_comments, target_language, document)
if is_not_target_lang is True:
LOGGER.error('wrong language for URL %s', url)
LOGGER.debug('wrong language for URL %s', url)
raise ValueError

except (TypeError, ValueError):
LOGGER.info('discarding data for url: %s', url) # document.url , record_id
LOGGER.warning('discarding data for url: %s', url) # document.url , record_id
return None

# special case: python variables
7 changes: 5 additions & 2 deletions trafilatura/downloads.py
@@ -110,7 +110,7 @@ def _send_request(url, no_ssl, config):
# execute request
response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config))
except urllib3.exceptions.SSLError:
LOGGER.error('retrying after SSLError: %s', url)
LOGGER.warning('retrying after SSLError: %s', url)
return _send_request(url, True, config)
except Exception as err:
LOGGER.error('download error: %s %s', url, err) # sys.exc_info()[0]
@@ -160,7 +160,7 @@ def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG):
return _handle_response(url, response, decode, config)
# return '' (useful to discard further processing?)
# return response
LOGGER.debug('no response: %s', url)
LOGGER.debug('request failed: %s', url)
return None


@@ -171,6 +171,9 @@ def _pycurl_is_live_page(url):
# Set the URL and HTTP method (HEAD)
curl.setopt(pycurl.URL, url.encode('utf-8'))
curl.setopt(pycurl.CONNECTTIMEOUT, 10)
# no SSL verification
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
# Set option to avoid getting the response body
curl.setopt(curl.NOBODY, True)
# Perform the request
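
For reference, a brief usage sketch of fetch_url as shown in the hunk above; the function name and its defaults come from the diff, while the URL is a placeholder:

    from trafilatura.downloads import fetch_url

    # A failed request is now reported at DEBUG ("request failed: ...") and
    # the function returns None, so callers should test the return value.
    html = fetch_url("https://example.org/")
    if html is None:
        print("download failed")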
6 changes: 3 additions & 3 deletions trafilatura/feeds.py
@@ -37,7 +37,7 @@ def handle_link_list(linklist, domainname, baseurl, target_lang=None):
checked = check_url(link, language=target_lang)
if checked is not None:
if not is_similar_domain(domainname, checked[1]) and not "feed" in link:
LOGGER.error('Rejected, diverging domain names: %s %s', domainname, checked[1])
LOGGER.warning('Rejected, diverging domain names: %s %s', domainname, checked[1])
else:
output_links.append(checked[0])
# Feedburner/Google feeds
@@ -182,7 +182,7 @@ def find_feed_urls(url, target_lang=None):
return feed_links
LOGGER.debug('No usable feed links found: %s', url)
else:
LOGGER.warning('Could not download web page: %s', url)
LOGGER.error('Could not download web page: %s', url)
if url.strip('/') != baseurl:
return try_homepage(baseurl, target_lang)
# try alternative: Google News
@@ -201,5 +201,5 @@ def find_feed_urls(url, target_lang=None):
def try_homepage(baseurl, target_lang):
'''Shift into reverse and try the homepage instead of the particular feed
page that was given as input.'''
LOGGER.info('Probing homepage for feeds instead: %s', baseurl)
LOGGER.debug('Probing homepage for feeds instead: %s', baseurl)
return find_feed_urls(baseurl, target_lang)
12 changes: 6 additions & 6 deletions trafilatura/filters.py
@@ -67,15 +67,15 @@ def check_html_lang(tree, target_language, strict=False):
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content').lower()):
return True
LOGGER.debug('HTML lang detection failed')
LOGGER.debug('HTML content-language failed')
return False
# locale
target_elements = tree.findall('.//meta[@property="og:locale"][@content]')
if target_elements:
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content').lower()):
return True
LOGGER.debug('HTML lang detection failed')
LOGGER.debug('HTML og:locale failed')
return False
# HTML lang attribute: sometimes a wrong indication
if strict is True:
@@ -84,9 +84,9 @@ def check_html_lang(tree, target_language, strict=False):
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('lang').lower()):
return True
LOGGER.debug('HTML lang detection failed')
LOGGER.debug('HTML lang failed')
return False
LOGGER.info('No relevant lang elements found')
LOGGER.debug('No relevant lang elements found')
return True


@@ -99,7 +99,7 @@ def language_classifier(temp_text, temp_comments):
else py3langid.classify(temp_comments)
)
else:
LOGGER.warning('Detector not installed, no language detection run')
LOGGER.warning('Language detector not installed, skipping detection')
result = None
return result

@@ -116,7 +116,7 @@ def language_filter(temp_text, temp_comments, target_language, docmeta):
# LOGGER.error('wrong HTML meta language for URL %s', url)
# raise ValueError
if docmeta.language is not None and docmeta.language != target_language:
LOGGER.warning('wrong language: %s %s %s', docmeta.language, docmeta.id, docmeta.url)
LOGGER.warning('wrong language: %s %s', docmeta.language, docmeta.url)
return True, docmeta
return False, docmeta

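
To show where these checks are triggered, a hedged sketch using bare_extraction and its target_language parameter (both visible in the core.py hunks above); the HTML string is a stand-in for real input:

    from trafilatura import bare_extraction

    # With a target language set, language_filter() may flag the document;
    # bare_extraction then logs the discard and returns None.
    document = bare_extraction('<html lang="de"><body><p>Beispieltext ...</p></body></html>',
                               target_language='en')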
5 changes: 2 additions & 3 deletions trafilatura/metadata.py
@@ -289,7 +289,6 @@ def extract_metainfo(tree, expressions, len_limit=200):
for elem in tree.xpath(expression):
content = trim(' '.join(elem.itertext()))
if content and 2 < len(content) < len_limit:
# LOGGER.debug('metadata found in: %s', expression)
return content
i += 1
if i > 1:
@@ -307,7 +306,7 @@ def examine_title_element(tree):
first = mymatch[1] or None
second = mymatch[2] or None
except IndexError:
LOGGER.warning('no main title found')
LOGGER.debug('no main title found')
return title, first, second


@@ -336,7 +335,7 @@ def extract_title(tree):
try:
title = tree.xpath('.//h2')[0].text_content()
except IndexError:
LOGGER.warning('no h2 title found')
LOGGER.debug('no h2 title found')
return title


8 changes: 4 additions & 4 deletions trafilatura/readability_lxml.py
@@ -268,9 +268,9 @@ def score_paragraphs(self):
for elem in ordered:
candidate = candidates[elem]
density = self.get_link_density(elem)
LOGGER.debug("Branch %6.3f link density %.3f -> %6.3f",
candidate.score, density, candidate.score * (1 - density)
)
# LOGGER.debug("Branch %6.3f link density %.3f -> %6.3f",
# candidate.score, density, candidate.score * (1 - density)
#)
candidate.score *= 1 - density

return candidates
@@ -307,7 +307,7 @@ def remove_unlikely_candidates(self):
and REGEXES["unlikelyCandidatesRe"].search(attrs)
and (not REGEXES["okMaybeItsACandidateRe"].search(attrs))
):
LOGGER.debug("Removing unlikely candidate: %s", elem.tag)
# LOGGER.debug("Removing unlikely candidate: %s", elem.tag)
elem.drop_tree()

def transform_misused_divs_into_paragraphs(self):
13 changes: 5 additions & 8 deletions trafilatura/sitemaps.py
@@ -128,7 +128,7 @@ def download_and_process_sitemap(url: str, domain: str, baseurl: str, target_lan
# variables init
sitemapurls, linklist = sitemapurls or [], linklist or []
# fetch and pre-process
LOGGER.info('fetching sitemap: %s', url)
LOGGER.debug('fetching sitemap: %s', url)
pagecontent = fetch_url(url)
sitemap = SitemapObject(baseurl, pagecontent, domain, url, target_lang)
add_sitemaps, add_links = process_sitemap(sitemap)
@@ -174,7 +174,7 @@ def handle_link(link: str, sitemap: SitemapObject) -> Tuple[str, str]:
# don't take links from another domain and make an exception for main platforms
# also bypass: subdomains vs. domains
elif not is_similar_domain(sitemap.domain, newdomain) and not WHITELISTED_PLATFORMS.search(newdomain):
LOGGER.error('link discarded, diverging domain names: %s %s', sitemap.domain, newdomain)
LOGGER.warning('link discarded, diverging domain names: %s %s', sitemap.domain, newdomain)
else:
state = 'sitemap' if DETECT_SITEMAP_LINK.search(link) else 'link'
return link, state
@@ -204,8 +204,7 @@ def extract_sitemap_langlinks(sitemap: SitemapObject) -> Tuple[List[str], List[s
if lang_match:
link, state = handle_link(lang_match[1], sitemap)
sitemapurls, linklist = store_sitemap_link(sitemapurls, linklist, link, state)
LOGGER.info('%s sitemaps and %s links with hreflang found for %s', len(sitemapurls), len(linklist), sitemap.sitemap_url)
LOGGER.debug('sitemaps found: %s', sitemapurls)
LOGGER.debug('%s sitemaps and %s links with hreflang found for %s', len(sitemapurls), len(linklist), sitemap.sitemap_url)
return sitemapurls, linklist


@@ -217,8 +216,7 @@ def extract_sitemap_links(sitemap: SitemapObject) -> Tuple[List[str], List[str]]
# process middle part of the match tuple
link, state = handle_link(match[1], sitemap)
sitemapurls, linklist = store_sitemap_link(sitemapurls, linklist, link, state)
LOGGER.info('%s sitemaps and %s links found for %s', len(sitemapurls), len(linklist), sitemap.sitemap_url)
LOGGER.debug('sitemaps found: %s', sitemapurls)
LOGGER.debug('%s sitemaps and %s links found for %s', len(sitemapurls), len(linklist), sitemap.sitemap_url)
return sitemapurls, linklist


@@ -251,6 +249,5 @@ def extract_robots_sitemaps(robotstxt: str, baseurl: str) -> List[str]:
# urllib.parse.unquote(line[1].strip())
candidate = fix_relative_urls(baseurl, line[1].strip())
sitemapurls.append(candidate)
LOGGER.info('%s sitemaps found in robots.txt', len(sitemapurls))
LOGGER.debug('sitemaps found in robots.txt: %s', sitemapurls)
LOGGER.debug('%s sitemaps found in robots.txt', len(sitemapurls))
return sitemapurls
4 changes: 2 additions & 2 deletions trafilatura/xml.py
@@ -107,7 +107,7 @@ def control_xml_output(output_tree, output_format, tei_validation, docmeta):
# validate
if output_format == 'xmltei' and tei_validation is True:
result = validate_tei(output_tree)
LOGGER.info('TEI validation result: %s %s %s', result, docmeta.id, docmeta.url)
LOGGER.debug('TEI validation result: %s %s %s', result, docmeta.id, docmeta.url)
return tostring(output_tree, pretty_print=True, encoding='unicode').strip()


@@ -241,7 +241,7 @@ def replace_element_text(element, include_formatting):
LOGGER.warning('missing link attribute: %s %s', element.text, element.attrib)
element.text = ''.join(['[', element.text, ']'])
else:
LOGGER.error('empty link: %s %s', element.text, element.attrib)
LOGGER.warning('empty link: %s %s', element.text, element.attrib)
# handle text
if element.text is not None and element.tail is not None:
full_text = ''.join([element.text, element.tail])
