partial fix for sparklemotion#798.

It turns out that the missing closing tags were caused by an encoding exception and misuse of the Encoder API. The exception was due to the » entity, which isn't supported by the Shift_JIS encoding; this caused the encoder to stop halfway while converting the UTF-16 content of the document to Shift_JIS. To fix this, we convert all characters not supported by the current encoding to HTML entities.
bigfix · Jan 11, 2013 · 271bbaf · 271bbaf
1 parent 7e74c9c
commit 271bbaf
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 21 deletions.
diff --git a/ext/java/nokogiri/internals/NokogiriHelpers.java b/ext/java/nokogiri/internals/NokogiriHelpers.java
@@ -40,7 +40,6 @@
 import java.nio.CharBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.util.ArrayList;
 import java.util.List;
@@ -724,21 +723,14 @@ public static boolean isUTF8(String encoding) {
     }
 
     public static byte[] convertEncoding(Charset output_charset, String input_string) throws CharacterCodingException {
-        Charset input = Charset.forName("UTF-8");
-        CharsetDecoder decoder = input.newDecoder();
         CharsetEncoder encoder = output_charset.newEncoder();
-        decoder.reset();
-        encoder.reset();
-        ByteBuffer bbuf = ByteBuffer.wrap(input_string.getBytes());
-        CharBuffer cbuf = decoder.decode(bbuf);
-        bbuf.clear();
-        encoder.encode(cbuf, bbuf, true);
-        int length = bbuf.position();
-        byte[] bytes = new byte[length];
-        System.arraycopy(bbuf.array(), 0, bytes, 0, length);
-        return bytes;
+        CharBuffer charBuffer = CharBuffer.wrap(input_string);
+        ByteBuffer byteBuffer = encoder.encode(charBuffer);
+        byte[] buffer = new byte[byteBuffer.remaining()];
+        byteBuffer.get(buffer);
+        return buffer;
     }
-    
+
     public static String convertEncodingByNKFIfNecessary(Ruby runtime, XmlDocument doc, String thing) {
         if (!(doc instanceof HtmlDocument)) return thing;
         String parsed_encoding = ((HtmlDocument)doc).getPraedEncoding();

diff --git a/ext/java/nokogiri/internals/SaveContextVisitor.java b/ext/java/nokogiri/internals/SaveContextVisitor.java
@@ -37,6 +37,8 @@
 import static nokogiri.internals.NokogiriHelpers.isNamespace;
 import static nokogiri.internals.NokogiriHelpers.isWhitespaceText;
 
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
 import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -746,9 +748,7 @@ public boolean enter(Text text) {
             textContent = encodeJavaString(textContent);
         }
 
-        if (getEncoding(text) == null) {
-            textContent = encodeStringToHtmlEntity(textContent);
-        }
+        textContent = encodeStringToHtmlEntity(textContent);
         buffer.append(textContent);
         return true;
     }
@@ -760,12 +760,15 @@ private String getEncoding(Text text) {
     }
 
     private String encodeStringToHtmlEntity(String text) {
+        if (encoding == null)
+          return text;
+        CharsetEncoder encoder = Charset.forName(encoding).newEncoder();
         int last = 126; // = U+007E. No need to encode under U+007E.
         StringBuffer sb = new StringBuffer();
-        for (int i=0; i<text.length(); i++) {
-            int codePoint = text.codePointAt(i);
-            if (codePoint > last) sb.append("&#x" + Integer.toHexString(codePoint) + ";");
-            else sb.append(text.charAt(i));
+        for (int i = 0; i < text.length(); i++) {
+            char ch = text.charAt(i);
+            if (encoder.canEncode(ch)) sb.append(ch);
+            else sb.append("&#x" + Integer.toHexString(ch) + ";");
         }
         return new String(sb);
     }