diff --git a/ext/java/nokogiri/internals/NokogiriHelpers.java b/ext/java/nokogiri/internals/NokogiriHelpers.java index 6123800d78..081570e933 100644 --- a/ext/java/nokogiri/internals/NokogiriHelpers.java +++ b/ext/java/nokogiri/internals/NokogiriHelpers.java @@ -40,7 +40,6 @@ import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.util.ArrayList; import java.util.List; @@ -724,21 +723,14 @@ public static boolean isUTF8(String encoding) { } public static byte[] convertEncoding(Charset output_charset, String input_string) throws CharacterCodingException { - Charset input = Charset.forName("UTF-8"); - CharsetDecoder decoder = input.newDecoder(); CharsetEncoder encoder = output_charset.newEncoder(); - decoder.reset(); - encoder.reset(); - ByteBuffer bbuf = ByteBuffer.wrap(input_string.getBytes()); - CharBuffer cbuf = decoder.decode(bbuf); - bbuf.clear(); - encoder.encode(cbuf, bbuf, true); - int length = bbuf.position(); - byte[] bytes = new byte[length]; - System.arraycopy(bbuf.array(), 0, bytes, 0, length); - return bytes; + CharBuffer charBuffer = CharBuffer.wrap(input_string); + ByteBuffer byteBuffer = encoder.encode(charBuffer); + byte[] buffer = new byte[byteBuffer.remaining()]; + byteBuffer.get(buffer); + return buffer; } - + public static String convertEncodingByNKFIfNecessary(Ruby runtime, XmlDocument doc, String thing) { if (!(doc instanceof HtmlDocument)) return thing; String parsed_encoding = ((HtmlDocument)doc).getPraedEncoding(); diff --git a/ext/java/nokogiri/internals/SaveContextVisitor.java b/ext/java/nokogiri/internals/SaveContextVisitor.java index cf79a53a67..c1b26be8ff 100644 --- a/ext/java/nokogiri/internals/SaveContextVisitor.java +++ b/ext/java/nokogiri/internals/SaveContextVisitor.java @@ -37,6 +37,8 @@ import static nokogiri.internals.NokogiriHelpers.isNamespace; import static 
nokogiri.internals.NokogiriHelpers.isWhitespaceText; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; @@ -746,9 +748,7 @@ public boolean enter(Text text) { textContent = encodeJavaString(textContent); } - if (getEncoding(text) == null) { - textContent = encodeStringToHtmlEntity(textContent); - } + textContent = encodeStringToHtmlEntity(textContent); buffer.append(textContent); return true; } @@ -760,12 +760,15 @@ private String getEncoding(Text text) { } private String encodeStringToHtmlEntity(String text) { + if (encoding == null) + return text; + CharsetEncoder encoder = Charset.forName(encoding).newEncoder(); int last = 126; // = U+007E. No need to encode under U+007E. StringBuffer sb = new StringBuffer(); - for (int i=0; i<text.length(); i++) { - int codePoint = text.codePointAt(i); - if (codePoint > last) sb.append("&#x" + Integer.toHexString(codePoint) + ";"); - else sb.append(text.charAt(i)); + for (int i = 0; i < text.length(); i++) { + char ch = text.charAt(i); + if (encoder.canEncode(ch)) sb.append(ch); + else sb.append("&#x" + Integer.toHexString(ch) + ";"); } return new String(sb); }