html fix for lxml v5+ (#127)

* html fix for lxml v5+
* shorter code
* fix tests
adbar · Jan 16, 2024 · db76cfe · db76cfe
1 parent 2c03f06
commit db76cfe
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 11 deletions.
diff --git a/htmldate/utils.py b/htmldate/utils.py
@@ -7,7 +7,6 @@
 ## under GNU GPL v3 license
 
 
-# standard
 import logging
 import re
 
@@ -46,6 +45,7 @@
 )
 
 DOCTYPE_TAG = re.compile("^< ?! ?DOCTYPE.+?/ ?>", re.I)
+FAULTY_HTML = re.compile(r"(<html.*?)\s*/>", re.I)
 
 
 class Extractor:
@@ -167,12 +167,19 @@ def is_dubious_html(beginning: str) -> bool:
     return "html" not in beginning
 
 
-def strip_faulty_doctypes(htmlstring: str, beginning: str) -> str:
-    "Repair faulty doctype strings to make then palatable for libxml2."
+def repair_faulty_html(htmlstring: str, beginning: str) -> str:
+    "Repair faulty HTML strings to make them palatable for libxml2."
     # libxml2/LXML issue: https://bugs.launchpad.net/lxml/+bug/1955915
     if "doctype" in beginning:
         firstline, _, rest = htmlstring.partition("\n")
-        return DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
+        htmlstring = DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest
+    # other issue with malformed documents: check the first four lines
+    for i, line in enumerate(iter(htmlstring.splitlines())):
+        if "<html" in line and line.endswith("/>"):
+            htmlstring = FAULTY_HTML.sub(r"\1>", htmlstring, count=1)
+            break
+        if i > 2:
+            break
     return htmlstring
 
 
@@ -215,7 +222,7 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen
     beginning = htmlobject[:50].lower()
     check_flag = is_dubious_html(beginning)
     # repair first
-    htmlobject = strip_faulty_doctypes(htmlobject, beginning)
+    htmlobject = repair_faulty_html(htmlobject, beginning)
     # first pass: use Unicode string
     fallback_parse = False
     try:

diff --git a/setup.py b/setup.py
@@ -119,8 +119,8 @@ def get_version(package):
         "charset_normalizer >= 3.3.2; python_version >= '3.7'",
         "dateparser >= 1.1.2",  # 1.1.3+ slower
         # see tests on Github Actions
-        "lxml == 4.9.2 ; platform_system == 'Darwin' and python_version <= '3.8'",
-        "lxml == 4.9.4 ; platform_system != 'Darwin' or python_version > '3.8'",
+        "lxml == 4.9.2; platform_system == 'Darwin' and python_version <= '3.8'",
+        "lxml >= 5.1.0, < 6; platform_system != 'Darwin' or python_version > '3.8'",
         "python-dateutil >= 2.8.2",
         "urllib3 >= 1.26, < 2; python_version < '3.7'",
         "urllib3 >= 1.26, < 3; python_version >= '3.7'",

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -59,7 +59,7 @@
     fetch_url,
     is_dubious_html,
     load_html,
-    strip_faulty_doctypes,
+    repair_faulty_html,
 )
 from htmldate.validators import (
     convert_date,
@@ -83,12 +83,26 @@
 def test_input():
     """test if loaded strings/trees are handled properly"""
     assert is_dubious_html("This is a string.") is True
-    htmlstring = "<!DOCTYPE html PUBLIC />\n<html/>"
+
+    htmlstring = "<!DOCTYPE html PUBLIC />\n<html></html>"
     beginning = htmlstring[:50].lower()
-    assert strip_faulty_doctypes(htmlstring, beginning) == "\n<html/>"
+    assert repair_faulty_html(htmlstring, beginning) == "\n<html></html>"
+
     htmlstring = "<html>\n</html>"
     beginning = htmlstring[:50].lower()
-    assert strip_faulty_doctypes(htmlstring, beginning) == htmlstring
+    assert repair_faulty_html(htmlstring, beginning) == htmlstring
+
+    htmlstring = "<html/>\n</html>"
+    beginning = htmlstring[:50].lower()
+    assert repair_faulty_html(htmlstring, beginning) == "<html>\n</html>"
+
+    htmlstring = '<!DOCTYPE html>\n<html lang="en-US"/>\n<head/>\n<body/>\n</html>'
+    beginning = htmlstring[:50].lower()
+    assert (
+        repair_faulty_html(htmlstring, beginning)
+        == '<!DOCTYPE html>\n<html lang="en-US">\n<head/>\n<body/>\n</html>'
+    )
+
     with pytest.raises(TypeError) as err:
         assert load_html(123) is None
     assert "incompatible" in str(err.value)