buriy · buriy · Oct 11, 2024 · Aug 15, 2024 · Aug 15, 2024 · Aug 15, 2024
diff --git a/.travis.yml b/.travis.yml
@@ -4,47 +4,16 @@ cache: pip
 
 matrix:
   include:
-    - name: "Python 2.7 on Linux"
-      python: 2.7
-      env: PIP=pip
-    - name: "Python 3.5 on Linux"
-      python: 3.5
-    - name: "Python 3.6 on Linux"
-      python: 3.6
-    - name: "Python 3.7 on Linux"
-      python: 3.7
     - name: "Python 3.8 on Linux"
       dist: xenial
       python: 3.8
     - name: "Python 3.9 Nightly on Linux"
       dist: bionic
       python: nightly
-    - name: "Pypy on Linux"
-      python: pypy
-      env: PIP=pip
     - name: "Pypy 3 on Linux"
       python: pypy3
-    - name: "Python 3.7 on older macOS"
-      os: osx
-      osx_image: xcode9.4
-      language: shell
-      env: TOXENV=py37
-      before_install:
-        - sw_vers
-        - python3 --version
-        - pip3 --version
-    - name: "Python 3.7 on macOS"
-      os: osx
-      osx_image: xcode11
-      language: shell
-      env: TOXENV=py37
-      before_install:
-        - sw_vers
-        - python3 --version
-        - pip3 --version
   allow_failures:
     - python: nightly
-    - python: pypy
     - python: pypy3
     - os: osx
 

diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 #
 # readability documentation build configuration file, created by
 # sphinx-quickstart on Thu Mar 23 16:29:38 2017.
@@ -38,7 +37,7 @@
     "sphinx.ext.doctest",
     "sphinx.ext.intersphinx",
     "sphinx.ext.todo",
-    "recommonmark",
+    "myst_parser",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -72,7 +71,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.

diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
diff --git a/readability/compat/three.py b/readability/compat/three.py
diff --git a/readability/compat/two.py b/readability/compat/two.py
diff --git a/readability/encoding.py b/readability/encoding.py
@@ -39,11 +39,10 @@ def get_encoding(page):
     for declared_encoding in declared_encodings:
         try:
             # Python3 only
-            if sys.version_info[0] == 3:
-                # declared_encoding will actually be bytes but .decode() only
-                # accepts `str` type. Decode blindly with ascii because no one should
-                # ever use non-ascii characters in the name of an encoding.
-                declared_encoding = declared_encoding.decode("ascii", "replace")
+            # declared_encoding will actually be bytes but .decode() only
+            # accepts `str` type. Decode blindly with ascii because no one should
+            # ever use non-ascii characters in the name of an encoding.
+            declared_encoding = declared_encoding.decode("ascii", "replace")
 
             encoding = fix_charset(declared_encoding)
             # Now let's decode the page

diff --git a/readability/htmls.py b/readability/htmls.py
@@ -4,13 +4,12 @@
 
 from .cleaners import normalize_spaces, clean_attributes
 from .encoding import get_encoding
-from .compat import str_
 
 utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
 
 
 def build_doc(page):
-    if isinstance(page, str_):
+    if isinstance(page, str):
         encoding = None
         decoded_page = page
     else:
@@ -30,14 +29,14 @@ def js_re(src, pattern, flags, repl):
 
 def normalize_entities(cur_title):
     entities = {
-        u"\u2014": "-",
-        u"\u2013": "-",
-        u"&mdash;": "-",
-        u"&ndash;": "-",
-        u"\u00A0": " ",
-        u"\u00AB": '"',
-        u"\u00BB": '"',
-        u"&quot;": '"',
+        "\u2014": "-",
+        "\u2013": "-",
+        "&mdash;": "-",
+        "&ndash;": "-",
+        "\u00A0": " ",
+        "\u00AB": '"',
+        "\u00BB": '"',
+        "&quot;": '"',
     }
     for c, r in entities.items():
         if c in cur_title:

diff --git a/readability/readability.py b/readability/readability.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python
-from __future__ import print_function
 import logging
 import re
 import sys
+import urllib.request
+import urllib.parse
+import urllib.error
 
+from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.etree import _ElementTree
 from lxml.html import document_fromstring
@@ -17,7 +20,6 @@
 from .htmls import get_title
 from .htmls import get_author
 from .htmls import shorten_title
-from .compat import str_, bytes_, tostring_, pattern_type
 from .debug import describe, text_content
 
 
@@ -80,16 +82,16 @@ def text_length(i):
 def compile_pattern(elements):
     if not elements:
         return None
-    elif isinstance(elements, pattern_type):
+    elif isinstance(elements, re.Pattern):
         return elements
-    elif isinstance(elements, (str_, bytes_)):
-        if isinstance(elements, bytes_):
-            elements = str_(elements, "utf-8")
-        elements = elements.split(u",")
+    elif isinstance(elements, (str, bytes)):
+        if isinstance(elements, bytes):
+            elements = str(elements, "utf-8")
+        elements = elements.split(",")
     if isinstance(elements, (list, tuple)):
-        return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
+        return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
     else:
-        raise Exception("Unknown type for the pattern: {}".format(type(elements)))
+        raise Exception(f"Unknown type for the pattern: {type(elements)}")
         # assume string or string like object
 
 
@@ -242,19 +244,15 @@ def summary(self, html_partial=False):
                         log.info("ruthless removal did not work. ")
                         ruthless = False
                         log.debug(
-                            (
                                 "ended up stripping too much - "
                                 "going for a safer _parse"
-                            )
                         )
                         # try again
                         continue
                     else:
                         log.debug(
-                            (
                                 "Ruthless and lenient parsing did not work. "
                                 "Returning raw html"
-                            )
                         )
                         article = self.html.find("body")
                         if article is None:
@@ -272,11 +270,7 @@ def summary(self, html_partial=False):
                     return cleaned_article
         except Exception as e:
             log.exception("error getting summary: ")
-            if sys.version_info[0] == 2:
-                from .compat.two import raise_with_traceback
-            else:
-                from .compat.three import raise_with_traceback
-            raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e))
+            raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
@@ -338,7 +332,7 @@ def select_best_candidate(self, candidates):
         )
         for candidate in sorted_candidates[:5]:
             elem = candidate["elem"]
-            log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem)))
+            log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem)))
 
         best_candidate = sorted_candidates[0]
         return best_candidate
@@ -454,7 +448,7 @@ def score_node(self, elem):
 
     def remove_unlikely_candidates(self):
         for elem in self.html.findall(".//*"):
-            s = "%s %s" % (elem.get("class", ""), elem.get("id", ""))
+            s = "{} {}".format(elem.get("class", ""), elem.get("id", ""))
             if len(s) < 2:
                 continue
             if (
@@ -474,7 +468,8 @@ def transform_misused_divs_into_paragraphs(self):
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES["divToPElementsRe"].search(
-                str_(b"".join(map(tostring_, list(elem))))
+                str(b"".join(tostring(s, encoding='utf-8') for s in elem))
+                # NOTE(review): str(bytes) gives the b'...' repr, not decoded text - verify.
             ):
                 # log.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
@@ -501,13 +496,11 @@ def transform_misused_divs_into_paragraphs(self):
 
     def tags(self, node, *tag_names):
         for tag_name in tag_names:
-            for e in node.findall(".//%s" % tag_name):
-                yield e
+            yield from node.findall(".//%s" % tag_name)
 
     def reverse_tags(self, node, *tag_names):
         for tag_name in tag_names:
-            for e in reversed(node.findall(".//%s" % tag_name)):
-                yield e
+            yield from reversed(node.findall(".//%s" % tag_name))
 
     def sanitize(self, node, candidates):
         MIN_LEN = self.min_text_length
@@ -594,13 +587,13 @@ def sanitize(self, node, candidates):
                     )
                     to_remove = True
                 elif weight < 25 and link_density > 0.2:
-                    reason = "too many links %.3f for its weight %s" % (
+                    reason = "too many links {:.3f} for its weight {}".format(
                         link_density,
                         weight,
                     )
                     to_remove = True
                 elif weight >= 25 and link_density > 0.5:
-                    reason = "too many links %.3f for its weight %s" % (
+                    reason = "too many links {:.3f} for its weight {}".format(
                         link_density,
                         weight,
                     )
@@ -726,18 +719,10 @@ def main():
     file = None
     if options.url:
         headers = {"User-Agent": "Mozilla/5.0"}
-        if sys.version_info[0] == 3:
-            import urllib.request, urllib.parse, urllib.error
-
-            request = urllib.request.Request(options.url, None, headers)
-            file = urllib.request.urlopen(request)
-        else:
-            import urllib2
-
-            request = urllib2.Request(options.url, None, headers)
-            file = urllib2.urlopen(request)
+        request = urllib.request.Request(options.url, None, headers)
+        file = urllib.request.urlopen(request)
     else:
-        file = open(args[0], "rt")
+        file = open(args[0])
     try:
         doc = Document(
             file.read(),
@@ -751,14 +736,8 @@ def main():
             result = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
             open_in_browser(result)
         else:
-            enc = (
-                sys.__stdout__.encoding or "utf-8"
-            )  # XXX: this hack could not always work, better to set PYTHONIOENCODING
             result = "Title:" + doc.short_title() + "\n" + doc.summary()
-            if sys.version_info[0] == 3:
-                print(result)
-            else:
-                print(result.encode(enc, "replace"))
+            print(result)
     finally:
         file.close()
 

diff --git a/setup.py b/setup.py
@@ -1,21 +1,9 @@
 #!/usr/bin/env python
 
-from __future__ import print_function
 import codecs
 import os
 import re
 from setuptools import setup
-import sys
-
-lxml_requirement = "lxml"
-if sys.platform == "darwin":
-    import platform
-
-    mac_ver = platform.mac_ver()[0]
-    mac_major, mac_minor = mac_ver.split('.')[:2]
-    if int(mac_major) == 10 and int(mac_minor) < 9:
-        print("Using lxml<2.4")
-        lxml_requirement = "lxml<2.4"
 
 speed_deps = [
      "cchardet",
@@ -59,8 +47,13 @@ def find_version(*file_paths):
     long_description_content_type='text/x-rst',
     license="Apache License 2.0",
     url="http://github.com/buriy/python-readability",
-    packages=["readability", "readability.compat"],
-    install_requires=["chardet", lxml_requirement, "cssselect"],
+    packages=["readability"],
+    install_requires=[
+        "chardet",
+        "lxml[html_clean]",
+        "lxml-html-clean; python_version < '3.11'",
+        "cssselect"
+    ],
     tests_require=test_deps,
     extras_require=extras,
     classifiers=[
@@ -72,12 +65,12 @@ def find_version(*file_paths):
         "Topic :: Internet",
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python",
-        "Programming Language :: Python :: 2",
-        "Programming Language :: Python :: 2.7",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: Implementation :: PyPy",
     ],
 )