Skip to content

Commit

Permalink
html fix for lxml v5+ (#127)
Browse files Browse the repository at this point in the history
* html fix for lxml v5+

* shorter code

* fix tests
  • Loading branch information
adbar committed Jan 16, 2024
1 parent 2c03f06 commit db76cfe
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 11 deletions.
17 changes: 12 additions & 5 deletions htmldate/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
## under GNU GPL v3 license


# standard
import logging
import re

Expand Down Expand Up @@ -46,6 +45,7 @@
)

# Matches a DOCTYPE declaration at the very start of the string that is
# written in self-closing form (ends with "/>"), ignoring case. A single
# optional space is tolerated after "<", after "!" and between "/" and ">".
DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I)
# Matches the text from "<html" through the next "/>" ("." does not cross
# newlines), ignoring case. Group 1 captures everything before the optional
# whitespace that precedes "/>", so substituting r"\1>" yields a regular
# opening tag.
FAULTY_HTML = re.compile(r"(<html.*?)\s*/>", re.I)


class Extractor:
Expand Down Expand Up @@ -167,12 +167,19 @@ def is_dubious_html(beginning: str) -> bool:
return "html" not in beginning


def strip_faulty_doctypes(htmlstring: str, beginning: str) -> str:
"Repair faulty doctype strings to make then palatable for libxml2."
def repair_faulty_html(htmlstring: str, beginning: str) -> str:
    """Repair faulty HTML strings to make them palatable for libxml2.

    Two independent repairs are applied, in this order:

    1. If ``beginning`` contains ``"doctype"``, the first match of
       ``DOCTYPE_TAG`` (a self-closed DOCTYPE declaration) is removed from
       the first line of the document.
    2. If one of the first four lines contains ``<html`` and ends with
       ``/>``, the first match of ``FAULTY_HTML`` in the document is
       rewritten as a regular opening tag (``<html ...>``).

    Args:
        htmlstring: The HTML document as a string.
        beginning: The lower-cased start of the document, as computed by
            the caller; only used to decide whether the DOCTYPE repair is
            attempted.

    Returns:
        The repaired document, or the input unchanged if no repair applies.
    """
    # libxml2/LXML issue: https://bugs.launchpad.net/lxml/+bug/1955915
    if "doctype" in beginning:
        # Only the first line is searched. The separator actually found is
        # reused so a document without any newline does not gain one.
        firstline, sep, rest = htmlstring.partition("\n")
        # Do not return here: the <html .../> repair below must still run.
        htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + sep + rest
    # other issue with malformed documents: a self-closed <html .../> tag.
    # Only the first four lines (indices 0-3) are inspected.
    for line in htmlstring.splitlines()[:4]:
        if "<html" in line and line.endswith("/>"):
            htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1)
            break
    return htmlstring


Expand Down Expand Up @@ -215,7 +222,7 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen
beginning = htmlobject[:50].lower()
check_flag = is_dubious_html(beginning)
# repair first
htmlobject = strip_faulty_doctypes(htmlobject, beginning)
htmlobject = repair_faulty_html(htmlobject, beginning)
# first pass: use Unicode string
fallback_parse = False
try:
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,8 @@ def get_version(package):
"charset_normalizer >= 3.3.2; python_version >= '3.7'",
"dateparser >= 1.1.2", # 1.1.3+ slower
# see tests on Github Actions
"lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
"lxml == 4.9.4 ; platform_system != 'Darwin' or python_version > '3.8'",
"lxml == 4.9.2; platform_system == 'Darwin' and python_version <= '3.8'",
"lxml >= 5.1.0, < 6; platform_system != 'Darwin' or python_version > '3.8'",
"python-dateutil >= 2.8.2",
"urllib3 >= 1.26, < 2; python_version < '3.7'",
"urllib3 >= 1.26, < 3; python_version >= '3.7'",
Expand Down
22 changes: 18 additions & 4 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
fetch_url,
is_dubious_html,
load_html,
strip_faulty_doctypes,
repair_faulty_html,
)
from htmldate.validators import (
convert_date,
Expand All @@ -83,12 +83,26 @@
def test_input():
"""test if loaded strings/trees are handled properly"""
assert is_dubious_html("This is a string.") is True
htmlstring = "<!DOCTYPE html PUBLIC />\n<html/>"

htmlstring = "<!DOCTYPE html PUBLIC />\n<html></html>"
beginning = htmlstring[:50].lower()
assert strip_faulty_doctypes(htmlstring, beginning) == "\n<html/>"
assert repair_faulty_html(htmlstring, beginning) == "\n<html></html>"

htmlstring = "<html>\n</html>"
beginning = htmlstring[:50].lower()
assert strip_faulty_doctypes(htmlstring, beginning) == htmlstring
assert repair_faulty_html(htmlstring, beginning) == htmlstring

htmlstring = "<html/>\n</html>"
beginning = htmlstring[:50].lower()
assert repair_faulty_html(htmlstring, beginning) == "<html>\n</html>"

htmlstring = '<!DOCTYPE html>\n<html lang="en-US"/>\n<head/>\n<body/>\n</html>'
beginning = htmlstring[:50].lower()
assert (
repair_faulty_html(htmlstring, beginning)
== '<!DOCTYPE html>\n<html lang="en-US">\n<head/>\n<body/>\n</html>'
)

with pytest.raises(TypeError) as err:
assert load_html(123) is None
assert "incompatible" in str(err.value)
Expand Down

0 comments on commit db76cfe

Please sign in to comment.