Skip to content

Commit

Permalink
partial fix for sparklemotion#798.
Browse files Browse the repository at this point in the history
It turns out that the missing closing tags were caused by an encoding
exception and misuse of the Encoder API. The exception was due to
the » entity, which isn't supported by the Shift_JIS encoding;
this caused the encoder to stop halfway through converting the
UTF-16 content of the document to Shift_JIS. To fix this, we convert
all characters not supported by the current encoding to HTML entities.
  • Loading branch information
jvshahid committed Jan 11, 2013
1 parent 7e74c9c commit 271bbaf
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 21 deletions.
20 changes: 6 additions & 14 deletions ext/java/nokogiri/internals/NokogiriHelpers.java
Expand Up @@ -40,7 +40,6 @@
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -724,21 +723,14 @@ public static boolean isUTF8(String encoding) {
}

/**
 * Encodes a Java string into the bytes of the requested charset.
 *
 * The string is encoded in a single pass with a fresh encoder, so the
 * returned array holds exactly the encoded bytes. There is no round trip
 * through the platform default charset and no fixed-size scratch buffer
 * that could silently truncate the output.
 *
 * @param output_charset charset to encode into
 * @param input_string   text to encode
 * @return the encoded bytes, sized exactly to the encoded content
 * @throws CharacterCodingException if the input contains a character the
 *         target charset cannot represent, or malformed surrogate data
 */
public static byte[] convertEncoding(Charset output_charset, String input_string) throws CharacterCodingException {
    CharsetEncoder encoder = output_charset.newEncoder();
    CharBuffer charBuffer = CharBuffer.wrap(input_string);
    // encode(CharBuffer) runs the whole reset/encode/flush cycle and
    // allocates an output buffer that is large enough for the result.
    ByteBuffer byteBuffer = encoder.encode(charBuffer);
    byte[] buffer = new byte[byteBuffer.remaining()];
    byteBuffer.get(buffer);
    return buffer;
}

public static String convertEncodingByNKFIfNecessary(Ruby runtime, XmlDocument doc, String thing) {
if (!(doc instanceof HtmlDocument)) return thing;
String parsed_encoding = ((HtmlDocument)doc).getPraedEncoding();
Expand Down
17 changes: 10 additions & 7 deletions ext/java/nokogiri/internals/SaveContextVisitor.java
Expand Up @@ -37,6 +37,8 @@
import static nokogiri.internals.NokogiriHelpers.isNamespace;
import static nokogiri.internals.NokogiriHelpers.isWhitespaceText;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
Expand Down Expand Up @@ -746,9 +748,7 @@ public boolean enter(Text text) {
textContent = encodeJavaString(textContent);
}

if (getEncoding(text) == null) {
textContent = encodeStringToHtmlEntity(textContent);
}
textContent = encodeStringToHtmlEntity(textContent);
buffer.append(textContent);
return true;
}
Expand All @@ -760,12 +760,15 @@ private String getEncoding(Text text) {
}

private String encodeStringToHtmlEntity(String text) {
if (encoding == null)
return text;
CharsetEncoder encoder = Charset.forName(encoding).newEncoder();
int last = 126; // = U+007E. No need to encode under U+007E.
StringBuffer sb = new StringBuffer();
for (int i=0; i<text.length(); i++) {
int codePoint = text.codePointAt(i);
if (codePoint > last) sb.append("&#x" + Integer.toHexString(codePoint) + ";");
else sb.append(text.charAt(i));
for (int i = 0; i < text.length(); i++) {
char ch = text.charAt(i);
if (encoder.canEncode(ch)) sb.append(ch);
else sb.append("&#x" + Integer.toHexString(ch) + ";");
}
return new String(sb);
}
Expand Down

0 comments on commit 271bbaf

Please sign in to comment.