Skip to content

Commit

Permalink
improved parse from bytes performance
Browse files Browse the repository at this point in the history
  • Loading branch information
wenshao committed Apr 26, 2016
1 parent 84f3bfd commit 7816d04
Show file tree
Hide file tree
Showing 7 changed files with 238 additions and 624 deletions.
8 changes: 6 additions & 2 deletions src/main/java/com/alibaba/fastjson/JSON.java
Expand Up @@ -121,7 +121,9 @@ public static Object parse(String text, int features) {
} }


public static Object parse(byte[] input, Feature... features) { public static Object parse(byte[] input, Feature... features) {
return parse(input, 0, input.length, IOUtils.getUTF8Decoder(), features); char[] chars = IOUtils.getChars(input.length);
int len = IOUtils.decodeUTF8(input, 0, input.length, chars);
return parse(new String(chars, 0, len), features);
} }


public static Object parse(byte[] input, int off, int len, CharsetDecoder charsetDecoder, Feature... features) { public static Object parse(byte[] input, int off, int len, CharsetDecoder charsetDecoder, Feature... features) {
Expand Down Expand Up @@ -270,7 +272,9 @@ public static <T> T parseObject(String input, Type clazz, ParserConfig config, P


@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public static <T> T parseObject(byte[] input, Type clazz, Feature... features) { public static <T> T parseObject(byte[] input, Type clazz, Feature... features) {
return (T) parseObject(input, 0, input.length, IOUtils.getUTF8Decoder(), clazz, features); char[] chars = IOUtils.getChars(input.length);
int len = IOUtils.decodeUTF8(input, 0, input.length, chars);
return (T) parseObject(new String(chars, 0, len), clazz, features);
} }


@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
Expand Down
237 changes: 228 additions & 9 deletions src/main/java/com/alibaba/fastjson/util/IOUtils.java
Expand Up @@ -27,6 +27,7 @@
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult; import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException; import java.nio.charset.MalformedInputException;
import java.util.Arrays; import java.util.Arrays;


Expand Down Expand Up @@ -542,16 +543,7 @@ public static byte[] decodeFast(String s) {


private final static ThreadLocal<SoftReference<char[]>> charsBufLocal = new ThreadLocal<SoftReference<char[]>>(); private final static ThreadLocal<SoftReference<char[]>> charsBufLocal = new ThreadLocal<SoftReference<char[]>>();


private final static ThreadLocal<CharsetDecoder> decoderLocal = new ThreadLocal<CharsetDecoder>();


/**
 * Returns the UTF-8 decoder cached in {@code decoderLocal} for the calling
 * thread, creating and caching a new {@code UTF8Decoder} on first use.
 *
 * @return the decoder stored in the thread-local slot
 */
public static CharsetDecoder getUTF8Decoder() {
    CharsetDecoder cached = decoderLocal.get();
    if (cached != null) {
        return cached;
    }
    // First call on this thread: create the decoder and remember it.
    CharsetDecoder created = new UTF8Decoder();
    decoderLocal.set(created);
    return created;
}


public static void clearChars() { public static void clearChars() {
charsBufLocal.set(null); charsBufLocal.set(null);
Expand Down Expand Up @@ -682,4 +674,231 @@ public static int encodeUTF8(char[] sa, int sp, int len, byte[] da) {
} }
return dp; return dp;
} }

/**
 * Tells whether {@code b} lacks the {@code 10xxxxxx} bit pattern of a UTF-8
 * continuation byte. Only bits 6 and 7 are inspected, so the byte may be
 * passed sign-extended.
 *
 * @param b the byte value to inspect
 * @return {@code true} if the two inspected bits are not {@code 10}
 */
private static boolean isNotContinuation(int b) {
    final int topTwoBits = b & 0xc0;
    return topTwoBits != 0x80;
}

/**
 * Decodes the UTF-8 bytes in {@code sa[sp .. sp + len)} into {@code da},
 * writing from index 0.
 *
 * <p>Decoding is strict: a truncated sequence, a missing continuation byte,
 * an overlong encoding, an encoded surrogate, or a code point that is not a
 * valid supplementary code point aborts decoding and yields {@code -1}.
 * Callers must check for {@code -1} before using the result as a length.
 *
 * <p>Beyond the ASCII fast path no capacity check is performed on {@code da}.
 * Every input byte produces at most one char, so a destination of at least
 * {@code len} chars is always sufficient.
 *
 * @param sa  source bytes
 * @param sp  index of the first byte to decode
 * @param len number of bytes to decode
 * @param da  destination char array, filled from index 0
 * @return the number of chars written to {@code da}, or {@code -1} if the
 *         input is not well-formed UTF-8
 */
public static int decodeUTF8(byte[] sa, int sp, int len, char[] da) {
    final int sl = sp + len;
    int dp = 0;
    final int dlASCII = Math.min(len, da.length);

    // Fast path: copy the leading run of ASCII bytes without any branching
    // on sequence length.
    while (dp < dlASCII && sa[sp] >= 0) {
        da[dp++] = (char) sa[sp++];
    }

    while (sp < sl) {
        final int b1 = sa[sp++];
        if (b1 >= 0) {
            // 1 byte, 7 bits: 0xxxxxxx
            da[dp++] = (char) b1;
        } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
            // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
            // (the 0x1e test rejects the overlong lead bytes C0 and C1)
            if (sp >= sl) {
                return -1; // truncated sequence
            }
            final int b2 = sa[sp++];
            if (isNotContinuation(b2)) {
                return -1;
            }
            // XOR with the pre-shifted marker bits strips the 110/10 prefixes.
            da[dp++] = (char) (((b1 << 6) ^ b2)
                    ^ (((byte) 0xC0 << 6)
                    ^ ((byte) 0x80 << 0)));
        } else if ((b1 >> 4) == -2) {
            // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
            if (sp + 1 >= sl) {
                return -1; // truncated sequence
            }
            final int b2 = sa[sp++];
            final int b3 = sa[sp++];
            if (isMalformed3(b1, b2, b3)) {
                return -1;
            }
            final char c = (char) ((b1 << 12)
                    ^ (b2 << 6)
                    ^ (b3
                    ^ (((byte) 0xE0 << 12)
                    ^ ((byte) 0x80 << 6)
                    ^ ((byte) 0x80 << 0))));
            if (Character.isSurrogate(c)) {
                return -1; // surrogates must not be encoded directly
            }
            da[dp++] = c;
        } else if ((b1 >> 3) == -2) {
            // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (sp + 2 >= sl) {
                return -1; // truncated sequence
            }
            final int b2 = sa[sp++];
            final int b3 = sa[sp++];
            final int b4 = sa[sp++];
            final int uc = ((b1 << 18)
                    ^ (b2 << 12)
                    ^ (b3 << 6)
                    ^ (b4
                    ^ (((byte) 0xF0 << 18)
                    ^ ((byte) 0x80 << 12)
                    ^ ((byte) 0x80 << 6)
                    ^ ((byte) 0x80 << 0))));
            // The supplementary-code-point test doubles as the shortest-form
            // and upper-range check.
            if (isMalformed4(b2, b3, b4) || !Character.isSupplementaryCodePoint(uc)) {
                return -1;
            }
            da[dp++] = Character.highSurrogate(uc);
            da[dp++] = Character.lowSurrogate(uc);
        } else {
            // Stray continuation byte or an unsupported lead byte.
            return -1;
        }
    }
    return dp;
}

/**
 * Returns a buffer positioned at {@code sp}: {@code bb} itself when it is
 * non-null, otherwise a new buffer wrapping {@code ba}.
 *
 * @param bb a previously obtained buffer to reuse, or {@code null}
 * @param ba the backing byte array, wrapped only when {@code bb} is null
 * @param sp the position to set on the returned buffer
 * @return the reused or newly wrapped buffer, positioned at {@code sp}
 */
private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp) {
    final ByteBuffer target = (bb != null) ? bb : ByteBuffer.wrap(ba);
    target.position(sp);
    return target;
}

// Well-formed 3-byte sequences:
//   [E0]     [A0..BF] [80..BF]
//   [E1..EF] [80..BF] [80..BF]
// Returns true when b2 or b3 is not a continuation byte, or when the lead
// byte is E0 and b2 falls in 80..9F (an overlong encoding).
// The comparison against (byte) 0xe0 expects b1 as a sign-extended byte.
private static boolean isMalformed3(int b1, int b2, int b3) {
    if ((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80) {
        return true;
    }
    return b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80;
}

// Two-byte prefix check for a 3-byte sequence, for the case where only one
// byte follows the lead byte in the source.
// Returns true when b2 is not a continuation byte, or when the lead byte is
// E0 and b2 falls in 80..9F. Expects b1 as a sign-extended byte.
private static boolean isMalformed3_2(int b1, int b2) {
    if ((b2 & 0xc0) != 0x80) {
        return true;
    }
    return b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80;
}

// Well-formed 4-byte sequences:
//   [F0]     [90..BF] [80..BF] [80..BF]
//   [F1..F3] [80..BF] [80..BF] [80..BF]
//   [F4]     [80..8F] [80..BF] [80..BF]
// Only the continuation pattern of b2, b3 and b4 is checked here; the
// [F0, 80..8F] and [F4, 90..] exclusions are left to a
// Character.isSupplementaryCodePoint check on the decoded value.
private static boolean isMalformed4(int b2, int b3, int b4) {
    if ((b2 & 0xc0) != 0x80) {
        return true;
    }
    if ((b3 & 0xc0) != 0x80) {
        return true;
    }
    return (b4 & 0xc0) != 0x80;
}

// Two-byte prefix check for a 4-byte sequence, for the case where fewer than
// 4 bytes are left in the source.
// Both b1 and b2 must be masked with "& 0xff" before being passed in.
// Returns true when b2 is not a continuation byte, when the lead is F0 and
// b2 is outside 90..BF, or when the lead is F4 and b2 is outside 80..8F.
private static boolean isMalformed4_2(int b1, int b2) {
    if ((b2 & 0xc0) != 0x80) {
        return true;
    }
    if (b1 == 0xf0) {
        return b2 < 0x90 || b2 > 0xbf;
    }
    if (b1 == 0xf4) {
        return (b2 & 0xf0) != 0x80;
    }
    return false;
}

// Tests whether b3 fails the continuation-byte pattern 10xxxxxx required of
// the third byte of a 4-byte UTF-8 sequence.
// Meant for the case where fewer than 4 bytes are left in the source, once
// the first two bytes have passed isMalformed4_2.
private static boolean isMalformed4_3(int b3) {
    // Bits 7..6 must be exactly binary 10.
    return ((b3 >> 6) & 0x3) != 0x2;
}

/**
 * Reports how many bytes of a malformed UTF-8 sequence should be treated as
 * the malformed unit. Bytes are read from {@code src} at its current
 * position, and the position advances past every byte read.
 *
 * @param src buffer positioned at the lead byte of the offending sequence
 * @param nb  nominal sequence length implied by the lead byte (1 to 4)
 * @return a malformed-input result carrying the length, or
 *         {@link CoderResult#UNDERFLOW} when {@code nb} is 1 and a 5- or
 *         6-byte form has too few bytes remaining
 * @throws IllegalStateException if {@code nb} is not 1, 2, 3 or 4
 */
public static CoderResult malformedN(ByteBuffer src, int nb) {
    switch (nb) {
        case 1: {
            final int lead = src.get();
            final int seqLen;
            if ((lead >> 2) == -2) {
                // 5 bytes: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                seqLen = 5;
            } else if ((lead >> 1) == -2) {
                // 6 bytes: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                seqLen = 6;
            } else {
                return CoderResult.malformedForLength(1);
            }
            if (src.remaining() < seqLen - 1) {
                return CoderResult.UNDERFLOW;
            }
            // Malformed length ends at the first byte that is not a continuation.
            for (int i = 1; i < seqLen; i++) {
                if ((src.get() & 0xc0) != 0x80) {
                    return CoderResult.malformedForLength(i);
                }
            }
            return CoderResult.malformedForLength(seqLen);
        }
        case 2:
            // always 1
            return CoderResult.malformedForLength(1);
        case 3: {
            final int lead = src.get();
            final int second = src.get(); // the third byte is never needed
            final boolean secondIsBad = (lead == (byte) 0xe0 && (second & 0xe0) == 0x80)
                    || (second & 0xc0) != 0x80;
            return CoderResult.malformedForLength(secondIsBad ? 1 : 2);
        }
        case 4: {
            final int lead = src.get() & 0xff;
            final int second = src.get() & 0xff;
            final boolean secondIsBad = lead > 0xf4
                    || (lead == 0xf0 && (second < 0x90 || second > 0xbf))
                    || (lead == 0xf4 && (second & 0xf0) != 0x80)
                    || (second & 0xc0) != 0x80;
            if (secondIsBad) {
                return CoderResult.malformedForLength(1);
            }
            if ((src.get() & 0xc0) != 0x80) {
                return CoderResult.malformedForLength(2);
            }
            return CoderResult.malformedForLength(3);
        }
        default:
            throw new IllegalStateException();
    }
}
} }

0 comments on commit 7816d04

Please sign in to comment.