diff --git a/.travis.yml b/.travis.yml
index 21e1ce11..ea56f519 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,47 +4,16 @@ cache: pip
matrix:
include:
- - name: "Python 2.7 on Linux"
- python: 2.7
- env: PIP=pip
- - name: "Python 3.5 on Linux"
- python: 3.5
- - name: "Python 3.6 on Linux"
- python: 3.6
- - name: "Python 3.7 on Linux"
- python: 3.7
- name: "Python 3.8 on Linux"
dist: xenial
python: 3.8
- name: "Python 3.9 Nightly on Linux"
dist: bionic
python: nightly
- - name: "Pypy on Linux"
- python: pypy
- env: PIP=pip
- name: "Pypy 3 on Linux"
python: pypy3
- - name: "Python 3.7 on older macOS"
- os: osx
- osx_image: xcode9.4
- language: shell
- env: TOXENV=py37
- before_install:
- - sw_vers
- - python3 --version
- - pip3 --version
- - name: "Python 3.7 on macOS"
- os: osx
- osx_image: xcode11
- language: shell
- env: TOXENV=py37
- before_install:
- - sw_vers
- - python3 --version
- - pip3 --version
allow_failures:
- python: nightly
- - python: pypy
- python: pypy3
- os: osx
diff --git a/doc/source/conf.py b/doc/source/conf.py
index bb261349..e70cf9b3 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
#
# readability documentation build configuration file, created by
# sphinx-quickstart on Thu Mar 23 16:29:38 2017.
@@ -38,7 +37,7 @@
"sphinx.ext.doctest",
"sphinx.ext.intersphinx",
"sphinx.ext.todo",
- "recommonmark",
+ "myst_parser",
]
# Add any paths that contain templates here, relative to this directory.
@@ -72,7 +71,7 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
deleted file mode 100644
index caf0ea8f..00000000
--- a/readability/compat/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""
-This module contains compatibility helpers for Python 2/3 interoperability.
-
-It mainly exists because their are certain incompatibilities in the Python
-syntax that can only be solved by conditionally importing different functions.
-"""
-import sys
-from lxml.etree import tostring
-
-if sys.version_info[0] == 2:
- bytes_ = str
- str_ = unicode
- def tostring_(s):
- return tostring(s, encoding='utf-8').decode('utf-8')
-
-elif sys.version_info[0] == 3:
- bytes_ = bytes
- str_ = str
- def tostring_(s):
- return tostring(s, encoding='utf-8')
-
-
-try:
- from re import Pattern as pattern_type
-except ImportError:
- from re import _pattern_type as pattern_type
diff --git a/readability/compat/three.py b/readability/compat/three.py
deleted file mode 100644
index 26351575..00000000
--- a/readability/compat/three.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def raise_with_traceback(exc_type, traceback, *args, **kwargs):
- """
- Raise a new exception of type `exc_type` with an existing `traceback`. All
- additional (keyword-)arguments are forwarded to `exc_type`
- """
- raise exc_type(*args, **kwargs).with_traceback(traceback)
diff --git a/readability/compat/two.py b/readability/compat/two.py
deleted file mode 100644
index 642ecb75..00000000
--- a/readability/compat/two.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def raise_with_traceback(exc_type, traceback, *args, **kwargs):
- """
- Raise a new exception of type `exc_type` with an existing `traceback`. All
- additional (keyword-)arguments are forwarded to `exc_type`
- """
- raise exc_type(*args, **kwargs), None, traceback
diff --git a/readability/encoding.py b/readability/encoding.py
index 212ff929..c95cc14d 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -39,11 +39,10 @@ def get_encoding(page):
for declared_encoding in declared_encodings:
try:
# Python3 only
- if sys.version_info[0] == 3:
- # declared_encoding will actually be bytes but .decode() only
- # accepts `str` type. Decode blindly with ascii because no one should
- # ever use non-ascii characters in the name of an encoding.
- declared_encoding = declared_encoding.decode("ascii", "replace")
+ # declared_encoding will actually be bytes but .decode() only
+ # accepts `str` type. Decode blindly with ascii because no one should
+ # ever use non-ascii characters in the name of an encoding.
+ declared_encoding = declared_encoding.decode("ascii", "replace")
encoding = fix_charset(declared_encoding)
# Now let's decode the page
diff --git a/readability/htmls.py b/readability/htmls.py
index acacb5ab..87299f5a 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -4,13 +4,12 @@
from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
-from .compat import str_
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
def build_doc(page):
- if isinstance(page, str_):
+ if isinstance(page, str):
encoding = None
decoded_page = page
else:
@@ -30,14 +29,14 @@ def js_re(src, pattern, flags, repl):
def normalize_entities(cur_title):
entities = {
- u"\u2014": "-",
- u"\u2013": "-",
- u"—": "-",
- u"–": "-",
- u"\u00A0": " ",
- u"\u00AB": '"',
- u"\u00BB": '"',
- u""": '"',
+ "\u2014": "-",
+ "\u2013": "-",
+ "—": "-",
+ "–": "-",
+ "\u00A0": " ",
+ "\u00AB": '"',
+ "\u00BB": '"',
+ """: '"',
}
for c, r in entities.items():
if c in cur_title:
diff --git a/readability/readability.py b/readability/readability.py
index f16b170a..c86e7d17 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python
-from __future__ import print_function
import logging
import re
import sys
+import urllib.request
+import urllib.parse
+import urllib.error
+from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.etree import _ElementTree
from lxml.html import document_fromstring
@@ -17,7 +20,6 @@
from .htmls import get_title
from .htmls import get_author
from .htmls import shorten_title
-from .compat import str_, bytes_, tostring_, pattern_type
from .debug import describe, text_content
@@ -80,16 +82,16 @@ def text_length(i):
def compile_pattern(elements):
if not elements:
return None
- elif isinstance(elements, pattern_type):
+ elif isinstance(elements, re.Pattern):
return elements
- elif isinstance(elements, (str_, bytes_)):
- if isinstance(elements, bytes_):
- elements = str_(elements, "utf-8")
- elements = elements.split(u",")
+ elif isinstance(elements, (str, bytes)):
+ if isinstance(elements, bytes):
+ elements = str(elements, "utf-8")
+ elements = elements.split(",")
if isinstance(elements, (list, tuple)):
- return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
+ return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
else:
- raise Exception("Unknown type for the pattern: {}".format(type(elements)))
+ raise Exception(f"Unknown type for the pattern: {type(elements)}")
# assume string or string like object
@@ -242,19 +244,15 @@ def summary(self, html_partial=False):
log.info("ruthless removal did not work. ")
ruthless = False
log.debug(
- (
"ended up stripping too much - "
"going for a safer _parse"
- )
)
# try again
continue
else:
log.debug(
- (
"Ruthless and lenient parsing did not work. "
"Returning raw html"
- )
)
article = self.html.find("body")
if article is None:
@@ -272,11 +270,7 @@ def summary(self, html_partial=False):
return cleaned_article
except Exception as e:
log.exception("error getting summary: ")
- if sys.version_info[0] == 2:
- from .compat.two import raise_with_traceback
- else:
- from .compat.three import raise_with_traceback
- raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e))
+ raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
def get_article(self, candidates, best_candidate, html_partial=False):
# Now that we have the top candidate, look through its siblings for
@@ -338,7 +332,7 @@ def select_best_candidate(self, candidates):
)
for candidate in sorted_candidates[:5]:
elem = candidate["elem"]
- log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem)))
+ log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem)))
best_candidate = sorted_candidates[0]
return best_candidate
@@ -454,7 +448,7 @@ def score_node(self, elem):
def remove_unlikely_candidates(self):
for elem in self.html.findall(".//*"):
- s = "%s %s" % (elem.get("class", ""), elem.get("id", ""))
+ s = "{} {}".format(elem.get("class", ""), elem.get("id", ""))
if len(s) < 2:
continue
if (
@@ -474,7 +468,8 @@ def transform_misused_divs_into_paragraphs(self):
# This results in incorrect results in case there is an
# buried within an for example
if not REGEXES["divToPElementsRe"].search(
- str_(b"".join(map(tostring_, list(elem))))
+ str(b"".join(tostring(s, encoding='utf-8') for s in elem))
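+ # NOTE(review): str() on bytes yields the "b'...'" repr, not decoded text; "".join(tostring(s, encoding="unicode") for s in elem) would avoid that.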
):
# log.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
@@ -501,13 +496,11 @@ def transform_misused_divs_into_paragraphs(self):
def tags(self, node, *tag_names):
for tag_name in tag_names:
- for e in node.findall(".//%s" % tag_name):
- yield e
+ yield from node.findall(".//%s" % tag_name)
def reverse_tags(self, node, *tag_names):
for tag_name in tag_names:
- for e in reversed(node.findall(".//%s" % tag_name)):
- yield e
+ yield from reversed(node.findall(".//%s" % tag_name))
def sanitize(self, node, candidates):
MIN_LEN = self.min_text_length
@@ -594,13 +587,13 @@ def sanitize(self, node, candidates):
)
to_remove = True
elif weight < 25 and link_density > 0.2:
- reason = "too many links %.3f for its weight %s" % (
+ reason = "too many links {:.3f} for its weight {}".format(
link_density,
weight,
)
to_remove = True
elif weight >= 25 and link_density > 0.5:
- reason = "too many links %.3f for its weight %s" % (
+ reason = "too many links {:.3f} for its weight {}".format(
link_density,
weight,
)
@@ -726,18 +719,10 @@ def main():
file = None
if options.url:
headers = {"User-Agent": "Mozilla/5.0"}
- if sys.version_info[0] == 3:
- import urllib.request, urllib.parse, urllib.error
-
- request = urllib.request.Request(options.url, None, headers)
- file = urllib.request.urlopen(request)
- else:
- import urllib2
-
- request = urllib2.Request(options.url, None, headers)
- file = urllib2.urlopen(request)
+ request = urllib.request.Request(options.url, None, headers)
+ file = urllib.request.urlopen(request)
else:
- file = open(args[0], "rt")
+ file = open(args[0])
try:
doc = Document(
file.read(),
@@ -751,14 +736,8 @@ def main():
result = "" + doc.short_title() + "
" + doc.summary()
open_in_browser(result)
else:
- enc = (
- sys.__stdout__.encoding or "utf-8"
- ) # XXX: this hack could not always work, better to set PYTHONIOENCODING
result = "Title:" + doc.short_title() + "\n" + doc.summary()
- if sys.version_info[0] == 3:
- print(result)
- else:
- print(result.encode(enc, "replace"))
+ print(result)
finally:
file.close()
diff --git a/setup.py b/setup.py
index 2770abef..294572d7 100755
--- a/setup.py
+++ b/setup.py
@@ -1,21 +1,9 @@
#!/usr/bin/env python
-from __future__ import print_function
import codecs
import os
import re
from setuptools import setup
-import sys
-
-lxml_requirement = "lxml"
-if sys.platform == "darwin":
- import platform
-
- mac_ver = platform.mac_ver()[0]
- mac_major, mac_minor = mac_ver.split('.')[:2]
- if int(mac_major) == 10 and int(mac_minor) < 9:
- print("Using lxml<2.4")
- lxml_requirement = "lxml<2.4"
speed_deps = [
"cchardet",
@@ -59,8 +47,13 @@ def find_version(*file_paths):
long_description_content_type='text/x-rst',
license="Apache License 2.0",
url="http://github.com/buriy/python-readability",
- packages=["readability", "readability.compat"],
- install_requires=["chardet", lxml_requirement, "cssselect"],
+ packages=["readability"],
+ install_requires=[
+ "chardet",
+ "lxml[html_clean]",
+ "lxml-html-clean; python_version < '3.11'",
+ "cssselect"
+ ],
tests_require=test_deps,
extras_require=extras,
classifiers=[
@@ -72,12 +65,12 @@ def find_version(*file_paths):
"Topic :: Internet",
"Topic :: Software Development :: Libraries :: Python Modules",
"Programming Language :: Python",
- "Programming Language :: Python :: 2",
- "Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.5",
- "Programming Language :: Python :: 3.6",
- "Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: Implementation :: PyPy",
],
)
diff --git a/tox.ini b/tox.ini
index d6954339..3f03df82 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@
[tox]
envlist =
- py{27,35,36,37,38,39,310,py,py3}, doc
+ py{38,39,310,311,312,py3}, doc
skip_missing_interpreters =
True
@@ -14,7 +14,7 @@ deps =
pytest
doc: sphinx
doc: sphinx_rtd_theme
- doc: recommonmark
+ doc: myst-parser
# This creates the virtual envs with --site-packages so already packages
# that are already installed will be reused. This is especially useful on
@@ -30,4 +30,4 @@ commands =
[testenv:doc]
commands =
- python setup.py build_sphinx
+ sphinx-build -b html doc/source/ build/