
Commit

review logging levels (#347)
* review logging levels

* workflow: remove duplicate tests
adbar committed May 3, 2023
1 parent 075fe6d commit 6ff9ca2
Showing 10 changed files with 39 additions and 42 deletions.
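
Most of the messages touched below move from info, warning, or error down to debug, so they no longer appear with default logging settings. A minimal sketch of how a caller could still surface them, assuming only the standard-library logging module and the per-module loggers created with logging.getLogger(__name__) as in the files below:

    import logging

    # Send records to the console; keep other libraries at INFO but let the
    # trafilatura loggers (children of the "trafilatura" logger) emit DEBUG.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("trafilatura").setLevel(logging.DEBUG)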
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -17,7 +17,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: [3.7, 3.8, 3.9, "3.10", "3.11"] # "3.12-dev"
python-version: [3.8, 3.9, "3.10", "3.11"] # "3.12-dev"
env: [{ MINIMAL: "true" }, { MINIMAL: "false" }]
include:
# custom python versions
2 changes: 0 additions & 2 deletions trafilatura/cli.py
@@ -23,8 +23,6 @@
from .sitemaps import sitemap_search


LOGGER = logging.getLogger(__name__)

# fix output encoding on some systems
try:
# > Python 3.7
22 changes: 11 additions & 11 deletions trafilatura/core.py
@@ -233,7 +233,7 @@ def handle_other_elements(element, potential_tags, options):
'''Handle diverse or unknown elements in the scope of relevant tags'''
# delete unwanted
if element.tag not in potential_tags:
# LOGGER.debug('discarding: %s %s', element.tag, element.text)
LOGGER.debug('discarding element: %s %s', element.tag, element.text)
return None
if element.tag == 'div':
# make a copy and prune it in case it contains sub-elements handled on their own?
@@ -668,9 +668,9 @@ def compare_extraction(tree, backup_tree, url, body, text, len_text, options):
# apply decision
if algo_flag:
body, text, len_text = temppost_algo, algo_text, len_algo
LOGGER.info('using generic algorithm: %s', url)
LOGGER.debug('using generic algorithm: %s', url)
else:
LOGGER.info('using custom extraction: %s', url)
LOGGER.debug('using custom extraction: %s', url)
# override faulty extraction: try with justext
if body.xpath(SANITIZED_XPATH) or len_text < min_target_length: # body.find(...)
# or options.recall is True ?
@@ -883,7 +883,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
document = extract_metadata(tree, url, date_extraction_params, no_fallback, author_blacklist)
# cut short if extracted URL in blacklist
if document.url in url_blacklist:
LOGGER.info('blacklisted URL: %s', url)
LOGGER.warning('blacklisted URL: %s', url)
raise ValueError
# cut short if core elements are missing
if only_with_metadata is True and any(
@@ -936,34 +936,34 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
if max_tree_size is not None:
# strip tags
if len(postbody) > max_tree_size:
LOGGER.warning('output tree too long: %s', len(postbody))
LOGGER.debug('output tree too long: %s', len(postbody))
strip_tags(postbody, 'hi')
# still too long, raise an error
if len(postbody) > max_tree_size:
LOGGER.error('output tree too long: %s, discarding file', len(postbody))
LOGGER.debug('output tree too long: %s, discarding file', len(postbody))
raise ValueError
# size checks
if len_comments < config.getint('DEFAULT', 'MIN_EXTRACTED_COMM_SIZE'):
LOGGER.info('not enough comments %s', url)
LOGGER.debug('not enough comments %s', url)
if len_text < config.getint('DEFAULT', 'MIN_OUTPUT_SIZE') and len_comments < config.getint('DEFAULT',
'MIN_OUTPUT_COMM_SIZE'):
LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments)
LOGGER.debug('text and comments not long enough: %s %s', len_text, len_comments)
raise ValueError

# check duplicates at body level
if deduplicate is True and duplicate_test(postbody, config) is True:
LOGGER.error('duplicate document for URL %s', url)
LOGGER.debug('discarding duplicate document for URL %s', url)
raise ValueError

# sanity check on language
if target_language is not None:
is_not_target_lang, document = language_filter(temp_text, temp_comments, target_language, document)
if is_not_target_lang is True:
LOGGER.error('wrong language for URL %s', url)
LOGGER.debug('wrong language for URL %s', url)
raise ValueError

except (TypeError, ValueError):
LOGGER.info('discarding data for url: %s', url) # document.url , record_id
LOGGER.warning('discarding data for url: %s', url) # document.url , record_id
return None

# special case: python variables
7 changes: 5 additions & 2 deletions trafilatura/downloads.py
@@ -110,7 +110,7 @@ def _send_request(url, no_ssl, config):
# execute request
response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config))
except urllib3.exceptions.SSLError:
LOGGER.error('retrying after SSLError: %s', url)
LOGGER.warning('retrying after SSLError: %s', url)
return _send_request(url, True, config)
except Exception as err:
LOGGER.error('download error: %s %s', url, err) # sys.exc_info()[0]
@@ -160,7 +160,7 @@ def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG):
return _handle_response(url, response, decode, config)
# return '' (useful to discard further processing?)
# return response
LOGGER.debug('no response: %s', url)
LOGGER.debug('request failed: %s', url)
return None


@@ -171,6 +171,9 @@ def _pycurl_is_live_page(url):
# Set the URL and HTTP method (HEAD)
curl.setopt(pycurl.URL, url.encode('utf-8'))
curl.setopt(pycurl.CONNECTTIMEOUT, 10)
# no SSL verification
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
# Set option to avoid getting the response body
curl.setopt(curl.NOBODY, True)
# Perform the request
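
For reference, a brief usage sketch of fetch_url as shown in the hunk above; the function name and its defaults come from the diff, while the URL is a placeholder:

    from trafilatura.downloads import fetch_url

    # A failed request is now reported at DEBUG ("request failed: ...") and
    # the function returns None, so callers should test the return value.
    html = fetch_url("https://example.org/")
    if html is None:
        print("download failed")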
6 changes: 3 additions & 3 deletions trafilatura/feeds.py
@@ -37,7 +37,7 @@ def handle_link_list(linklist, domainname, baseurl, target_lang=None):
checked = check_url(link, language=target_lang)
if checked is not None:
if not is_similar_domain(domainname, checked[1]) and not "feed" in link:
LOGGER.error('Rejected, diverging domain names: %s %s', domainname, checked[1])
LOGGER.warning('Rejected, diverging domain names: %s %s', domainname, checked[1])
else:
output_links.append(checked[0])
# Feedburner/Google feeds
@@ -182,7 +182,7 @@ def find_feed_urls(url, target_lang=None):
return feed_links
LOGGER.debug('No usable feed links found: %s', url)
else:
LOGGER.warning('Could not download web page: %s', url)
LOGGER.error('Could not download web page: %s', url)
if url.strip('/') != baseurl:
return try_homepage(baseurl, target_lang)
# try alternative: Google News
@@ -201,5 +201,5 @@ def find_feed_urls(url, target_lang=None):
def try_homepage(baseurl, target_lang):
'''Shift into reverse and try the homepage instead of the particular feed
page that was given as input.'''
LOGGER.info('Probing homepage for feeds instead: %s', baseurl)
LOGGER.debug('Probing homepage for feeds instead: %s', baseurl)
return find_feed_urls(baseurl, target_lang)
12 changes: 6 additions & 6 deletions trafilatura/filters.py
@@ -67,15 +67,15 @@ def check_html_lang(tree, target_language, strict=False):
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content').lower()):
return True
LOGGER.debug('HTML lang detection failed')
LOGGER.debug('HTML content-language failed')
return False
# locale
target_elements = tree.findall('.//meta[@property="og:locale"][@content]')
if target_elements:
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('content').lower()):
return True
LOGGER.debug('HTML lang detection failed')
LOGGER.debug('HTML og:locale failed')
return False
# HTML lang attribute: sometimes a wrong indication
if strict is True:
@@ -84,9 +84,9 @@ def check_html_lang(tree, target_language, strict=False):
for elem in target_elements:
if target_language in RE_HTML_LANG.split(elem.get('lang').lower()):
return True
LOGGER.debug('HTML lang detection failed')
LOGGER.debug('HTML lang failed')
return False
LOGGER.info('No relevant lang elements found')
LOGGER.debug('No relevant lang elements found')
return True


@@ -99,7 +99,7 @@ def language_classifier(temp_text, temp_comments):
else py3langid.classify(temp_comments)
)
else:
LOGGER.warning('Detector not installed, no language detection run')
LOGGER.warning('Language detector not installed, skipping detection')
result = None
return result

@@ -116,7 +116,7 @@ def language_filter(temp_text, temp_comments, target_language, docmeta):
# LOGGER.error('wrong HTML meta language for URL %s', url)
# raise ValueError
if docmeta.language is not None and docmeta.language != target_language:
LOGGER.warning('wrong language: %s %s %s', docmeta.language, docmeta.id, docmeta.url)
LOGGER.warning('wrong language: %s %s', docmeta.language, docmeta.url)
return True, docmeta
return False, docmeta

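
To show where these checks are triggered, a hedged sketch using bare_extraction and its target_language parameter (both visible in the core.py hunks above); the HTML string is a stand-in for real input:

    from trafilatura import bare_extraction

    # With a target language set, language_filter() may flag the document;
    # bare_extraction then logs the discard and returns None.
    document = bare_extraction('<html lang="de"><body><p>Beispieltext ...</p></body></html>',
                               target_language='en')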
5 changes: 2 additions & 3 deletions trafilatura/metadata.py
@@ -289,7 +289,6 @@ def extract_metainfo(tree, expressions, len_limit=200):
for elem in tree.xpath(expression):
content = trim(' '.join(elem.itertext()))
if content and 2 < len(content) < len_limit:
# LOGGER.debug('metadata found in: %s', expression)
return content
i += 1
if i > 1:
@@ -307,7 +306,7 @@ def examine_title_element(tree):
first = mymatch[1] or None
second = mymatch[2] or None
except IndexError:
LOGGER.warning('no main title found')
LOGGER.debug('no main title found')
return title, first, second


@@ -336,7 +335,7 @@ def extract_title(tree):
try:
title = tree.xpath('.//h2')[0].text_content()
except IndexError:
LOGGER.warning('no h2 title found')
LOGGER.debug('no h2 title found')
return title


8 changes: 4 additions & 4 deletions trafilatura/readability_lxml.py
@@ -268,9 +268,9 @@ def score_paragraphs(self):
for elem in ordered:
candidate = candidates[elem]
density = self.get_link_density(elem)
LOGGER.debug("Branch %6.3f link density %.3f -> %6.3f",
candidate.score, density, candidate.score * (1 - density)
)
# LOGGER.debug("Branch %6.3f link density %.3f -> %6.3f",
# candidate.score, density, candidate.score * (1 - density)
#)
candidate.score *= 1 - density

return candidates
@@ -307,7 +307,7 @@ def remove_unlikely_candidates(self):
and REGEXES["unlikelyCandidatesRe"].search(attrs)
and (not REGEXES["okMaybeItsACandidateRe"].search(attrs))
):
LOGGER.debug("Removing unlikely candidate: %s", elem.tag)
# LOGGER.debug("Removing unlikely candidate: %s", elem.tag)
elem.drop_tree()

def transform_misused_divs_into_paragraphs(self):
13 changes: 5 additions & 8 deletions trafilatura/sitemaps.py
@@ -128,7 +128,7 @@ def download_and_process_sitemap(url: str, domain: str, baseurl: str, target_lan
# variables init
sitemapurls, linklist = sitemapurls or [], linklist or []
# fetch and pre-process
LOGGER.info('fetching sitemap: %s', url)
LOGGER.debug('fetching sitemap: %s', url)
pagecontent = fetch_url(url)
sitemap = SitemapObject(baseurl, pagecontent, domain, url, target_lang)
add_sitemaps, add_links = process_sitemap(sitemap)
@@ -174,7 +174,7 @@ def handle_link(link: str, sitemap: SitemapObject) -> Tuple[str, str]:
# don't take links from another domain and make an exception for main platforms
# also bypass: subdomains vs. domains
elif not is_similar_domain(sitemap.domain, newdomain) and not WHITELISTED_PLATFORMS.search(newdomain):
LOGGER.error('link discarded, diverging domain names: %s %s', sitemap.domain, newdomain)
LOGGER.warning('link discarded, diverging domain names: %s %s', sitemap.domain, newdomain)
else:
state = 'sitemap' if DETECT_SITEMAP_LINK.search(link) else 'link'
return link, state
@@ -204,8 +204,7 @@ def extract_sitemap_langlinks(sitemap: SitemapObject) -> Tuple[List[str], List[s
if lang_match:
link, state = handle_link(lang_match[1], sitemap)
sitemapurls, linklist = store_sitemap_link(sitemapurls, linklist, link, state)
LOGGER.info('%s sitemaps and %s links with hreflang found for %s', len(sitemapurls), len(linklist), sitemap.sitemap_url)
LOGGER.debug('sitemaps found: %s', sitemapurls)
LOGGER.debug('%s sitemaps and %s links with hreflang found for %s', len(sitemapurls), len(linklist), sitemap.sitemap_url)
return sitemapurls, linklist


@@ -217,8 +216,7 @@ def extract_sitemap_links(sitemap: SitemapObject) -> Tuple[List[str], List[str]]
# process middle part of the match tuple
link, state = handle_link(match[1], sitemap)
sitemapurls, linklist = store_sitemap_link(sitemapurls, linklist, link, state)
LOGGER.info('%s sitemaps and %s links found for %s', len(sitemapurls), len(linklist), sitemap.sitemap_url)
LOGGER.debug('sitemaps found: %s', sitemapurls)
LOGGER.debug('%s sitemaps and %s links found for %s', len(sitemapurls), len(linklist), sitemap.sitemap_url)
return sitemapurls, linklist


@@ -251,6 +249,5 @@ def extract_robots_sitemaps(robotstxt: str, baseurl: str) -> List[str]:
# urllib.parse.unquote(line[1].strip())
candidate = fix_relative_urls(baseurl, line[1].strip())
sitemapurls.append(candidate)
LOGGER.info('%s sitemaps found in robots.txt', len(sitemapurls))
LOGGER.debug('sitemaps found in robots.txt: %s', sitemapurls)
LOGGER.debug('%s sitemaps found in robots.txt', len(sitemapurls))
return sitemapurls
4 changes: 2 additions & 2 deletions trafilatura/xml.py
@@ -107,7 +107,7 @@ def control_xml_output(output_tree, output_format, tei_validation, docmeta):
# validate
if output_format == 'xmltei' and tei_validation is True:
result = validate_tei(output_tree)
LOGGER.info('TEI validation result: %s %s %s', result, docmeta.id, docmeta.url)
LOGGER.debug('TEI validation result: %s %s %s', result, docmeta.id, docmeta.url)
return tostring(output_tree, pretty_print=True, encoding='unicode').strip()


@@ -241,7 +241,7 @@ def replace_element_text(element, include_formatting):
LOGGER.warning('missing link attribute: %s %s', element.text, element.attrib)
element.text = ''.join(['[', element.text, ']'])
else:
LOGGER.error('empty link: %s %s', element.text, element.attrib)
LOGGER.warning('empty link: %s %s', element.text, element.attrib)
# handle text
if element.text is not None and element.tail is not None:
full_text = ''.join([element.text, element.tail])
