From 96de0ec4517e719c142def3640a813364ff51110 Mon Sep 17 00:00:00 2001
From: Dominik Geyer
Date: Thu, 18 Sep 2025 11:41:32 +0200
Subject: [PATCH] Fix Unicode handling in table diff by encoding non-ASCII chars as numeric entities
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, `TableDiff::createDocumentWithHtml()` attempted to
normalize text for `DOMDocument::loadHTML()` by passing it through
`htmlentities()` and `iconv('UTF-8', 'ISO-8859-1//IGNORE', ...)`.
This caused non-ASCII characters (e.g. emoji, Cyrillic, CJK) that are
not representable in ISO-8859-1 to be dropped entirely.

This patch replaces that logic with a call to
`mb_encode_numericentity()` to convert all non-ASCII Unicode
codepoints (U+0080 – U+10FFFF) into decimal HTML numeric entities.
This ensures that the full Unicode range is preserved and parsed
correctly by `DOMDocument`, which otherwise defaults to ISO-8859-1.
---
 lib/Caxy/HtmlDiff/Table/TableDiff.php | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/lib/Caxy/HtmlDiff/Table/TableDiff.php b/lib/Caxy/HtmlDiff/Table/TableDiff.php
index ed46d94..6fd4ec0 100644
--- a/lib/Caxy/HtmlDiff/Table/TableDiff.php
+++ b/lib/Caxy/HtmlDiff/Table/TableDiff.php
@@ -626,8 +626,18 @@ protected function buildTableDoms()
      */
     protected function createDocumentWithHtml($text)
     {
+        // As DOMDocument::loadHTML() does not support UTF-8 properly without specifying the encoding in the HTML,
+        // we convert all non-ASCII characters to numeric entities.
+        $convmap = [
+            0x80,     // Leave ASCII range intact
+            0x10FFFF, // Convert the rest of the Unicode range
+            0,
+            0xFFFFFF,
+        ];
+        $text = mb_encode_numericentity($text, $convmap, 'UTF-8');
+
         $dom = new \DOMDocument();
-        $dom->loadHTML(htmlspecialchars_decode(iconv('UTF-8', 'ISO-8859-1//IGNORE', htmlentities($text, ENT_COMPAT, 'UTF-8')), ENT_QUOTES));
+        $dom->loadHTML($text);
 
         return $dom;
     }