diff --git a/.travis.yml b/.travis.yml index 21e1ce11..ea56f519 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,47 +4,16 @@ cache: pip matrix: include: - - name: "Python 2.7 on Linux" - python: 2.7 - env: PIP=pip - - name: "Python 3.5 on Linux" - python: 3.5 - - name: "Python 3.6 on Linux" - python: 3.6 - - name: "Python 3.7 on Linux" - python: 3.7 - name: "Python 3.8 on Linux" dist: xenial python: 3.8 - name: "Python 3.9 Nightly on Linux" dist: bionic python: nightly - - name: "Pypy on Linux" - python: pypy - env: PIP=pip - name: "Pypy 3 on Linux" python: pypy3 - - name: "Python 3.7 on older macOS" - os: osx - osx_image: xcode9.4 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version - - name: "Python 3.7 on macOS" - os: osx - osx_image: xcode11 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version allow_failures: - python: nightly - - python: pypy - python: pypy3 - os: osx diff --git a/doc/source/conf.py b/doc/source/conf.py index bb261349..e70cf9b3 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # readability documentation build configuration file, created by # sphinx-quickstart on Thu Mar 23 16:29:38 2017. @@ -38,7 +37,7 @@ "sphinx.ext.doctest", "sphinx.ext.intersphinx", "sphinx.ext.todo", - "recommonmark", + "myst_parser", ] # Add any paths that contain templates here, relative to this directory. @@ -72,7 +71,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py deleted file mode 100644 index caf0ea8f..00000000 --- a/readability/compat/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -This module contains compatibility helpers for Python 2/3 interoperability. - -It mainly exists because their are certain incompatibilities in the Python -syntax that can only be solved by conditionally importing different functions. -""" -import sys -from lxml.etree import tostring - -if sys.version_info[0] == 2: - bytes_ = str - str_ = unicode - def tostring_(s): - return tostring(s, encoding='utf-8').decode('utf-8') - -elif sys.version_info[0] == 3: - bytes_ = bytes - str_ = str - def tostring_(s): - return tostring(s, encoding='utf-8') - - -try: - from re import Pattern as pattern_type -except ImportError: - from re import _pattern_type as pattern_type diff --git a/readability/compat/three.py b/readability/compat/three.py deleted file mode 100644 index 26351575..00000000 --- a/readability/compat/three.py +++ /dev/null @@ -1,6 +0,0 @@ -def raise_with_traceback(exc_type, traceback, *args, **kwargs): - """ - Raise a new exception of type `exc_type` with an existing `traceback`. All - additional (keyword-)arguments are forwarded to `exc_type` - """ - raise exc_type(*args, **kwargs).with_traceback(traceback) diff --git a/readability/compat/two.py b/readability/compat/two.py deleted file mode 100644 index 642ecb75..00000000 --- a/readability/compat/two.py +++ /dev/null @@ -1,6 +0,0 @@ -def raise_with_traceback(exc_type, traceback, *args, **kwargs): - """ - Raise a new exception of type `exc_type` with an existing `traceback`. All - additional (keyword-)arguments are forwarded to `exc_type` - """ - raise exc_type(*args, **kwargs), None, traceback diff --git a/readability/encoding.py b/readability/encoding.py index 212ff929..c95cc14d 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -39,11 +39,10 @@ def get_encoding(page): for declared_encoding in declared_encodings: try: # Python3 only - if sys.version_info[0] == 3: - # declared_encoding will actually be bytes but .decode() only - # accepts `str` type. Decode blindly with ascii because no one should - # ever use non-ascii characters in the name of an encoding. - declared_encoding = declared_encoding.decode("ascii", "replace") + # declared_encoding will actually be bytes but .decode() only + # accepts `str` type. Decode blindly with ascii because no one should + # ever use non-ascii characters in the name of an encoding. + declared_encoding = declared_encoding.decode("ascii", "replace") encoding = fix_charset(declared_encoding) # Now let's decode the page diff --git a/readability/htmls.py b/readability/htmls.py index acacb5ab..87299f5a 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -4,13 +4,12 @@ from .cleaners import normalize_spaces, clean_attributes from .encoding import get_encoding -from .compat import str_ utf8_parser = lxml.html.HTMLParser(encoding="utf-8") def build_doc(page): - if isinstance(page, str_): + if isinstance(page, str): encoding = None decoded_page = page else: @@ -30,14 +29,14 @@ def js_re(src, pattern, flags, repl): def normalize_entities(cur_title): entities = { - u"\u2014": "-", - u"\u2013": "-", - u"—": "-", - u"–": "-", - u"\u00A0": " ", - u"\u00AB": '"', - u"\u00BB": '"', - u""": '"', + "\u2014": "-", + "\u2013": "-", + "—": "-", + "–": "-", + "\u00A0": " ", + "\u00AB": '"', + "\u00BB": '"', + """: '"', } for c, r in entities.items(): if c in cur_title: diff --git a/readability/readability.py b/readability/readability.py index f16b170a..c86e7d17 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -1,9 +1,12 @@ #!/usr/bin/env python -from __future__ import print_function import logging import re import sys +import urllib.request +import urllib.parse +import urllib.error +from lxml.etree import tostring from lxml.etree import tounicode from lxml.etree import _ElementTree from lxml.html import document_fromstring @@ -17,7 +20,6 @@ from .htmls import get_title from .htmls import get_author from .htmls import shorten_title -from .compat import str_, bytes_, tostring_, pattern_type from .debug import describe, text_content @@ -80,16 +82,16 @@ def text_length(i): def compile_pattern(elements): if not elements: return None - elif isinstance(elements, pattern_type): + elif isinstance(elements, re.Pattern): return elements - elif isinstance(elements, (str_, bytes_)): - if isinstance(elements, bytes_): - elements = str_(elements, "utf-8") - elements = elements.split(u",") + elif isinstance(elements, (str, bytes)): + if isinstance(elements, bytes): + elements = str(elements, "utf-8") + elements = elements.split(",") if isinstance(elements, (list, tuple)): - return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U) + return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U) else: - raise Exception("Unknown type for the pattern: {}".format(type(elements))) + raise Exception(f"Unknown type for the pattern: {type(elements)}") # assume string or string like object @@ -242,19 +244,15 @@ def summary(self, html_partial=False): log.info("ruthless removal did not work. ") ruthless = False log.debug( - ( "ended up stripping too much - " "going for a safer _parse" - ) ) # try again continue else: log.debug( - ( "Ruthless and lenient parsing did not work. " "Returning raw html" - ) ) article = self.html.find("body") if article is None: @@ -272,11 +270,7 @@ def summary(self, html_partial=False): return cleaned_article except Exception as e: log.exception("error getting summary: ") - if sys.version_info[0] == 2: - from .compat.two import raise_with_traceback - else: - from .compat.three import raise_with_traceback - raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e)) + raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for @@ -338,7 +332,7 @@ def select_best_candidate(self, candidates): ) for candidate in sorted_candidates[:5]: elem = candidate["elem"] - log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem))) + log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem))) best_candidate = sorted_candidates[0] return best_candidate @@ -454,7 +448,7 @@ def score_node(self, elem): def remove_unlikely_candidates(self): for elem in self.html.findall(".//*"): - s = "%s %s" % (elem.get("class", ""), elem.get("id", "")) + s = "{} {}".format(elem.get("class", ""), elem.get("id", "")) if len(s) < 2: continue if ( @@ -474,7 +468,8 @@ def transform_misused_divs_into_paragraphs(self): # This results in incorrect results in case there is an # buried within an for example if not REGEXES["divToPElementsRe"].search( - str_(b"".join(map(tostring_, list(elem)))) + str(b"".join(tostring(s, encoding='utf-8') for s in elem)) + # str(b"".join(map(tostring_, list(elem)))) ): # log.debug("Altering %s to p" % (describe(elem))) elem.tag = "p" @@ -501,13 +496,11 @@ def transform_misused_divs_into_paragraphs(self): def tags(self, node, *tag_names): for tag_name in tag_names: - for e in node.findall(".//%s" % tag_name): - yield e + yield from node.findall(".//%s" % tag_name) def reverse_tags(self, node, *tag_names): for tag_name in tag_names: - for e in reversed(node.findall(".//%s" % tag_name)): - yield e + yield from reversed(node.findall(".//%s" % tag_name)) def sanitize(self, node, candidates): MIN_LEN = self.min_text_length @@ -594,13 +587,13 @@ def sanitize(self, node, candidates): ) to_remove = True elif weight < 25 and link_density > 0.2: - reason = "too many links %.3f for its weight %s" % ( + reason = "too many links {:.3f} for its weight {}".format( link_density, weight, ) to_remove = True elif weight >= 25 and link_density > 0.5: - reason = "too many links %.3f for its weight %s" % ( + reason = "too many links {:.3f} for its weight {}".format( link_density, weight, ) @@ -726,18 +719,10 @@ def main(): file = None if options.url: headers = {"User-Agent": "Mozilla/5.0"} - if sys.version_info[0] == 3: - import urllib.request, urllib.parse, urllib.error - - request = urllib.request.Request(options.url, None, headers) - file = urllib.request.urlopen(request) - else: - import urllib2 - - request = urllib2.Request(options.url, None, headers) - file = urllib2.urlopen(request) + request = urllib.request.Request(options.url, None, headers) + file = urllib.request.urlopen(request) else: - file = open(args[0], "rt") + file = open(args[0]) try: doc = Document( file.read(), @@ -751,14 +736,8 @@ def main(): result = "

" + doc.short_title() + "


" + doc.summary() open_in_browser(result) else: - enc = ( - sys.__stdout__.encoding or "utf-8" - ) # XXX: this hack could not always work, better to set PYTHONIOENCODING result = "Title:" + doc.short_title() + "\n" + doc.summary() - if sys.version_info[0] == 3: - print(result) - else: - print(result.encode(enc, "replace")) + print(result) finally: file.close() diff --git a/setup.py b/setup.py index 2770abef..294572d7 100755 --- a/setup.py +++ b/setup.py @@ -1,21 +1,9 @@ #!/usr/bin/env python -from __future__ import print_function import codecs import os import re from setuptools import setup -import sys - -lxml_requirement = "lxml" -if sys.platform == "darwin": - import platform - - mac_ver = platform.mac_ver()[0] - mac_major, mac_minor = mac_ver.split('.')[:2] - if int(mac_major) == 10 and int(mac_minor) < 9: - print("Using lxml<2.4") - lxml_requirement = "lxml<2.4" speed_deps = [ "cchardet", @@ -59,8 +47,13 @@ def find_version(*file_paths): long_description_content_type='text/x-rst', license="Apache License 2.0", url="http://github.com/buriy/python-readability", - packages=["readability", "readability.compat"], - install_requires=["chardet", lxml_requirement, "cssselect"], + packages=["readability"], + install_requires=[ + "chardet", + "lxml[html_clean]", + "lxml-html-clean; python_version < '3.11'", + "cssselect" + ], tests_require=test_deps, extras_require=extras, classifiers=[ @@ -72,12 +65,12 @@ def find_version(*file_paths): "Topic :: Internet", "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: PyPy", ], ) diff --git a/tox.ini b/tox.ini index d6954339..3f03df82 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{27,35,36,37,38,39,310,py,py3}, doc + py{38,39,310,311,312,py3}, doc skip_missing_interpreters = True @@ -14,7 +14,7 @@ deps = pytest doc: sphinx doc: sphinx_rtd_theme - doc: recommonmark + doc: myst-parser # This creates the virtual envs with --site-packages so already packages # that are already installed will be reused. This is especially useful on @@ -30,4 +30,4 @@ commands = [testenv:doc] commands = - python setup.py build_sphinx + sphinx-build -b html doc/source/ build/