Permalink
Browse files

Clean up tokenizer code and strengthen unit tests

Clean up the tokenizer to be a bit clearer as to what's going on, and
strengthen the unit tests to better test handling of multi-byte Unicode
characters.

git-svn-id: https://svn.apache.org/repos/asf/mina/vysper/trunk@1389075 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent 1290edd commit ab7c90661fd41448e371715d057e1b07f2969fda Niklas Gustavsson committed Sep 23, 2012
@@ -19,6 +19,8 @@
*/
package org.apache.vysper.xml.sax.impl;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
@@ -59,76 +61,76 @@ public XMLTokenizer(TokenListener listeners) {
*/
public void parse(IoBuffer byteBuffer, CharsetDecoder decoder) throws SAXException {
while (byteBuffer.hasRemaining() && state != State.CLOSED) {
- char c = (char) byteBuffer.get();
+ byte c = byteBuffer.get();
if (state == State.START) {
if (c == '<') {
- emit(c, byteBuffer);
+ emit(c);
state = State.IN_TAG;
} else if (Character.isWhitespace(c)) {
// ignore
} else {
state = State.IN_TEXT;
- buffer.put((byte) c);
+ buffer.put(c);
}
} else if (state == State.IN_TEXT) {
if (c == '<') {
- emit(byteBuffer, decoder);
- emit(c, byteBuffer);
+ emit(decoder);
+ emit(c);
state = State.IN_TAG;
} else {
- buffer.put((byte) c);
+ buffer.put(c);
}
} else if (state == State.IN_TAG) {
if (c == '>') {
- emit(c, byteBuffer);
+ emit(c);
state = State.START;
} else if (c == '"') {
- emit(c, byteBuffer);
+ emit(c);
state = State.IN_DOUBLE_ATTRIBUTE_VALUE;
} else if (c == '\'') {
- emit(c, byteBuffer);
+ emit(c);
state = State.IN_SINGLE_ATTRIBUTE_VALUE;
} else if (c == '-') {
- emit(c, byteBuffer);
+ emit(c);
} else if (isControlChar(c)) {
- emit(c, byteBuffer);
+ emit(c);
} else if (Character.isWhitespace(c)) {
buffer.clear();
} else {
state = State.IN_STRING;
- buffer.put((byte) c);
+ buffer.put(c);
}
} else if (state == State.IN_STRING) {
if (c == '>') {
- emit(byteBuffer, CharsetUtil.UTF8_DECODER);
- emit(c, byteBuffer);
+ emit(CharsetUtil.UTF8_DECODER);
+ emit(c);
state = State.START;
} else if (isControlChar(c)) {
- emit(byteBuffer, CharsetUtil.UTF8_DECODER);
- emit(c, byteBuffer);
+ emit(CharsetUtil.UTF8_DECODER);
+ emit(c);
state = State.IN_TAG;
} else if (Character.isWhitespace(c)) {
- emit(byteBuffer, CharsetUtil.UTF8_DECODER);
+ emit(CharsetUtil.UTF8_DECODER);
state = State.IN_TAG;
} else {
- buffer.put((byte) c);
+ buffer.put(c);
}
} else if (state == State.IN_DOUBLE_ATTRIBUTE_VALUE) {
if (c == '"') {
- emit(byteBuffer, decoder);
- emit(c, byteBuffer);
+ emit(decoder);
+ emit(c);
state = State.IN_TAG;
} else {
- buffer.put((byte) c);
+ buffer.put(c);
}
} else if (state == State.IN_SINGLE_ATTRIBUTE_VALUE) {
if (c == '\'') {
- emit(byteBuffer, decoder);
- emit(c, byteBuffer);
+ emit(decoder);
+ emit(c);
state = State.IN_TAG;
} else {
- buffer.put((byte) c);
+ buffer.put(c);
}
}
}
@@ -143,18 +145,20 @@ public void restart() {
buffer.clear();
}
- private boolean isControlChar(char c) {
+ private boolean isControlChar(byte c) {
return c == '<' || c == '>' || c == '!' || c == '/' || c == '?' || c == '=';
}
- private void emit(char token, IoBuffer byteBuffer) throws SAXException {
- listener.token(token, null);
+ private void emit(byte token) throws SAXException {
+ // method will only be called for control chars, thus the cast to char should be safe
+ listener.token((char)token, null);
}
- private void emit(IoBuffer byteBuffer, CharsetDecoder decoder) throws SAXException {
+ private void emit(CharsetDecoder decoder) throws SAXException {
try {
buffer.flip();
- listener.token(NO_CHAR, buffer.getString(decoder));
+ CharBuffer charBuffer = decoder.decode(buffer.buf());
+ listener.token(NO_CHAR, charBuffer.toString());
buffer.clear();
} catch (CharacterCodingException e) {
throw new SAXException(e);
@@ -24,6 +24,7 @@
import org.apache.mina.core.buffer.IoBuffer;
import org.apache.vysper.charset.CharsetUtil;
import org.apache.vysper.xml.sax.NonBlockingXMLReader;
+import org.apache.vysper.xml.sax.impl.TestHandler.CharacterEvent;
import org.apache.vysper.xml.sax.impl.TestHandler.TestEvent;
/**
@@ -120,7 +121,7 @@ public void testDashInName() throws Exception {
assertNoMoreevents(events);
}
-
+
public void testInvalidUnicodeInName() throws Exception {
Iterator<TestEvent> events = parse("<r\u2190oot />").iterator();
@@ -169,24 +170,32 @@ public void testMixedXmlBeginName() throws Exception {
assertNoMoreevents(events);
}
-
+
public void testSplitBuffers() throws Exception {
TestHandler handler = new TestHandler();
NonBlockingXMLReader reader = new DefaultNonBlockingXMLReader();
reader.setContentHandler(handler);
reader.setErrorHandler(handler);
- String xml1 = "<root></r";
- String xml2 = "oot>";
-
- reader.parse(IoBuffer.wrap(xml1.getBytes("UTF-8")), CharsetUtil.UTF8_DECODER);
- reader.parse(IoBuffer.wrap(xml2.getBytes("UTF-8")), CharsetUtil.UTF8_DECODER);
+ String s = "<root>\u1251</root>";
+
+ // split in the middle of the Unicode char
+ byte[] xml = s.getBytes("UTF-8");
+ byte[] xml1 = new byte[8];
+ byte[] xml2 = new byte[8];
+
+ System.arraycopy(xml, 0, xml1, 0, 8);
+ System.arraycopy(xml, 8, xml2, 0, 8);
+
+ reader.parse(IoBuffer.wrap(xml1), CharsetUtil.UTF8_DECODER);
+ reader.parse(IoBuffer.wrap(xml2), CharsetUtil.UTF8_DECODER);
Iterator<TestEvent> events = handler.getEvents().iterator();
-
+
assertStartDocument(events.next());
assertStartElement("", "root", "root", events.next());
+ assertText("\u1251", events.next());
assertEndElement("", "root", "root", events.next());
assertEndDocument(events.next());
@@ -65,23 +65,23 @@ public void testDoubleEscapedAmp() throws Exception {
}
public void testUnicodeEscape() throws Exception {
- Iterator<TestEvent> events = parse("<root>t&#251;ext</root>").iterator();
+ Iterator<TestEvent> events = parse("<root>t&#4689;ext</root>").iterator();
assertStartDocument(events.next());
assertStartElement("", "root", "root", events.next());
- assertText("tûext", events.next());
+ assertText("t\u1251ext", events.next());
assertEndElement("", "root", "root", events.next());
assertEndDocument(events.next());
assertFalse(events.hasNext());
}
public void testUnicodeHexEscape() throws Exception {
- Iterator<TestEvent> events = parse("<root>t&#xFB;ext</root>").iterator();
+ Iterator<TestEvent> events = parse("<root>t&#x1251;ext</root>").iterator();
assertStartDocument(events.next());
assertStartElement("", "root", "root", events.next());
- assertText("tûext", events.next());
+ assertText("t\u1251ext", events.next());
assertEndElement("", "root", "root", events.next());
assertEndDocument(events.next());

0 comments on commit ab7c906

Please sign in to comment.