-
Notifications
You must be signed in to change notification settings - Fork 8.8k
/
Text.java
795 lines (725 loc) · 24.6 KB
/
Text.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io;
import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.nio.charset.StandardCharsets;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Arrays;
import org.apache.avro.reflect.Stringable;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
/** This class stores text using standard UTF8 encoding. It provides methods
* to serialize, deserialize, and compare texts at byte level. The type of
* length is integer and is serialized using zero-compressed format. <p>In
* addition, it provides methods for string traversal without converting the
* byte array to a string. <p>Also includes utilities for
* serializing/deserialing a string, coding/decoding a string, checking if a
* byte array contains valid UTF8 code, calculating the length of an encoded
* string.
*/
@Stringable
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Text extends BinaryComparable
implements WritableComparable<BinaryComparable> {
private static final ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
new ThreadLocal<CharsetEncoder>() {
@Override
protected CharsetEncoder initialValue() {
return StandardCharsets.UTF_8.newEncoder().
onMalformedInput(CodingErrorAction.REPORT).
onUnmappableCharacter(CodingErrorAction.REPORT);
}
};
private static final ThreadLocal<CharsetDecoder> DECODER_FACTORY =
new ThreadLocal<CharsetDecoder>() {
@Override
protected CharsetDecoder initialValue() {
return StandardCharsets.UTF_8.newDecoder().
onMalformedInput(CodingErrorAction.REPORT).
onUnmappableCharacter(CodingErrorAction.REPORT);
}
};
// max size of the byte array, seems to be a safe choice for multiple JVMs
// (see ArrayList.MAX_ARRAY_SIZE)
private static final int ARRAY_MAX_SIZE = Integer.MAX_VALUE - 8;
private static final byte[] EMPTY_BYTES = new byte[0];
private byte[] bytes = EMPTY_BYTES;
private int length = 0;
private int textLength = -1;
/**
* Construct an empty text string.
*/
public Text() {
}
/**
* Construct from a string.
* @param string input string.
*/
public Text(String string) {
set(string);
}
/**
* Construct from another text.
* @param utf8 input utf8.
*/
public Text(Text utf8) {
set(utf8);
}
/**
* Construct from a byte array.
*
* @param utf8 input utf8.
*/
public Text(byte[] utf8) {
set(utf8);
}
/**
* @return Get a copy of the bytes that is exactly the length of the data.
* See {@link #getBytes()} for faster access to the underlying array.
*/
public byte[] copyBytes() {
return Arrays.copyOf(bytes, length);
}
/**
* Returns the raw bytes; however, only data up to {@link #getLength()} is
* valid. Please use {@link #copyBytes()} if you
* need the returned array to be precisely the length of the data.
*/
@Override
public byte[] getBytes() {
return bytes;
}
/**
* Returns the number of bytes in the byte array.
*/
@Override
public int getLength() {
return length;
}
/**
* @return Returns the length of this text. The length is equal to the number of
* Unicode code units in the text.
*/
public int getTextLength() {
if (textLength < 0) {
textLength = toString().length();
}
return textLength;
}
/**
* Returns the Unicode Scalar Value (32-bit integer value)
* for the character at <code>position</code>. Note that this
* method avoids using the converter or doing String instantiation.
*
* @param position input position.
* @return the Unicode scalar value at position or -1
* if the position is invalid or points to a
* trailing byte
*/
public int charAt(int position) {
if (position > this.length) return -1; // too long
if (position < 0) return -1; // duh.
ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
return bytesToCodePoint(bb.slice());
}
public int find(String what) {
return find(what, 0);
}
/**
* Finds any occurrence of <code>what</code> in the backing
* buffer, starting as position <code>start</code>. The starting
* position is measured in bytes and the return value is in
* terms of byte position in the buffer. The backing buffer is
* not converted to a string for this operation.
*
* @param what input what.
* @param start input start.
* @return byte position of the first occurrence of the search
* string in the UTF-8 buffer or -1 if not found
*/
public int find(String what, int start) {
try {
ByteBuffer src = ByteBuffer.wrap(this.bytes, 0, this.length);
ByteBuffer tgt = encode(what);
byte b = tgt.get();
src.position(start);
while (src.hasRemaining()) {
if (b == src.get()) { // matching first byte
src.mark(); // save position in loop
tgt.mark(); // save position in target
boolean found = true;
int pos = src.position()-1;
while (tgt.hasRemaining()) {
if (!src.hasRemaining()) { // src expired first
tgt.reset();
src.reset();
found = false;
break;
}
if (!(tgt.get() == src.get())) {
tgt.reset();
src.reset();
found = false;
break; // no match
}
}
if (found) return pos;
}
}
return -1; // not found
} catch (CharacterCodingException e) {
throw new RuntimeException("Should not have happened", e);
}
}
/**
* Set to contain the contents of a string.
*
* @param string input string.
*/
public void set(String string) {
try {
ByteBuffer bb = encode(string, true);
bytes = bb.array();
length = bb.limit();
textLength = string.length();
} catch (CharacterCodingException e) {
throw new RuntimeException("Should not have happened", e);
}
}
/**
* Set to a utf8 byte array. If the length of <code>utf8</code> is
* <em>zero</em>, actually clear {@link #bytes} and any existing
* data is lost.
*
* @param utf8 input utf8.
*/
public void set(byte[] utf8) {
if (utf8.length == 0) {
bytes = EMPTY_BYTES;
length = 0;
textLength = -1;
} else {
set(utf8, 0, utf8.length);
}
}
/**
* Copy a text.
* @param other other.
*/
public void set(Text other) {
set(other.getBytes(), 0, other.getLength());
this.textLength = other.textLength;
}
/**
* Set the Text to range of bytes.
*
* @param utf8 the data to copy from
* @param start the first position of the new string
* @param len the number of bytes of the new string
*/
public void set(byte[] utf8, int start, int len) {
ensureCapacity(len);
System.arraycopy(utf8, start, bytes, 0, len);
this.length = len;
this.textLength = -1;
}
/**
* Append a range of bytes to the end of the given text.
*
* @param utf8 the data to copy from
* @param start the first position to append from utf8
* @param len the number of bytes to append
*/
public void append(byte[] utf8, int start, int len) {
byte[] original = bytes;
if (ensureCapacity(length + len)) {
System.arraycopy(original, 0, bytes, 0, length);
}
System.arraycopy(utf8, start, bytes, length, len);
length += len;
textLength = -1;
}
/**
* Clear the string to empty.
*
* <em>Note</em>: For performance reasons, this call does not clear the
* underlying byte array that is retrievable via {@link #getBytes()}.
* In order to free the byte-array memory, call {@link #set(byte[])}
* with an empty byte array (For example, <code>new byte[0]</code>).
*/
public void clear() {
length = 0;
textLength = -1;
}
/**
* Sets the capacity of this Text object to <em>at least</em>
* <code>capacity</code> bytes. If the current buffer is longer, then the
* capacity and existing content of the buffer are unchanged. If
* <code>capacity</code> is larger than the current capacity, the Text
* object's capacity is increased to match and any existing data is lost.
*
* @param capacity the number of bytes we need
* @return true if the internal array was resized or false otherwise
*/
private boolean ensureCapacity(final int capacity) {
if (bytes.length < capacity) {
// Try to expand the backing array by the factor of 1.5x
// (by taking the current size + diving it by half).
//
// If the calculated value is beyond the size
// limit, we cap it to ARRAY_MAX_SIZE
long targetSizeLong = bytes.length + (bytes.length >> 1);
int targetSize = (int)Math.min(targetSizeLong, ARRAY_MAX_SIZE);
targetSize = Math.max(capacity, targetSize);
bytes = new byte[targetSize];
return true;
}
return false;
}
@Override
public String toString() {
try {
return decode(bytes, 0, length);
} catch (CharacterCodingException e) {
throw new RuntimeException("Should not have happened", e);
}
}
@Override
public void readFields(DataInput in) throws IOException {
int newLength = WritableUtils.readVInt(in);
readWithKnownLength(in, newLength);
}
public void readFields(DataInput in, int maxLength) throws IOException {
int newLength = WritableUtils.readVInt(in);
if (newLength < 0) {
throw new IOException("tried to deserialize " + newLength +
" bytes of data! newLength must be non-negative.");
} else if (newLength >= maxLength) {
throw new IOException("tried to deserialize " + newLength +
" bytes of data, but maxLength = " + maxLength);
}
readWithKnownLength(in, newLength);
}
/**
* Skips over one Text in the input.
* @param in input in.
* @throws IOException raised on errors performing I/O.
*/
public static void skip(DataInput in) throws IOException {
int length = WritableUtils.readVInt(in);
WritableUtils.skipFully(in, length);
}
/**
* Read a Text object whose length is already known.
* This allows creating Text from a stream which uses a different serialization
* format.
*
* @param in input in.
* @param len input len.
* @throws IOException raised on errors performing I/O.
*/
public void readWithKnownLength(DataInput in, int len) throws IOException {
ensureCapacity(len);
in.readFully(bytes, 0, len);
length = len;
textLength = -1;
}
/**
* Serialize. Write this object to out length uses zero-compressed encoding.
*
* @see Writable#write(DataOutput)
*/
@Override
public void write(DataOutput out) throws IOException {
WritableUtils.writeVInt(out, length);
out.write(bytes, 0, length);
}
public void write(DataOutput out, int maxLength) throws IOException {
if (length > maxLength) {
throw new IOException("data was too long to write! Expected " +
"less than or equal to " + maxLength + " bytes, but got " +
length + " bytes.");
}
WritableUtils.writeVInt(out, length);
out.write(bytes, 0, length);
}
/**
* Returns true iff <code>o</code> is a Text with the same length and same
* contents.
*/
@Override
public boolean equals(Object o) {
if (o instanceof Text)
return super.equals(o);
return false;
}
@Override
public int hashCode() {
return super.hashCode();
}
/** A WritableComparator optimized for Text keys. */
public static class Comparator extends WritableComparator {
public Comparator() {
super(Text.class);
}
@Override
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
int n1 = WritableUtils.decodeVIntSize(b1[s1]);
int n2 = WritableUtils.decodeVIntSize(b2[s2]);
return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
}
}
static {
// register this comparator
WritableComparator.define(Text.class, new Comparator());
}
/// STATIC UTILITIES FROM HERE DOWN
/**
* @return Converts the provided byte array to a String using the
* UTF-8 encoding. If the input is malformed,
* replace by a default value.
*
* @param utf8 input utf8.
* @throws CharacterCodingException when a character
* encoding or decoding error occurs.
*/
public static String decode(byte[] utf8) throws CharacterCodingException {
return decode(ByteBuffer.wrap(utf8), true);
}
public static String decode(byte[] utf8, int start, int length)
throws CharacterCodingException {
return decode(ByteBuffer.wrap(utf8, start, length), true);
}
/**
* @return Converts the provided byte array to a String using the
* UTF-8 encoding. If <code>replace</code> is true, then
* malformed input is replaced with the
* substitution character, which is U+FFFD. Otherwise the
* method throws a MalformedInputException.
*
* @param utf8 input utf8.
* @param start input start.
* @param length input length.
* @param replace input replace.
* @throws CharacterCodingException when a character
* encoding or decoding error occurs.
*/
public static String decode(byte[] utf8, int start, int length, boolean replace)
throws CharacterCodingException {
return decode(ByteBuffer.wrap(utf8, start, length), replace);
}
private static String decode(ByteBuffer utf8, boolean replace)
throws CharacterCodingException {
CharsetDecoder decoder = DECODER_FACTORY.get();
if (replace) {
decoder.onMalformedInput(
java.nio.charset.CodingErrorAction.REPLACE);
decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
}
String str = decoder.decode(utf8).toString();
// set decoder back to its default value: REPORT
if (replace) {
decoder.onMalformedInput(CodingErrorAction.REPORT);
decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
}
return str;
}
/**
* Converts the provided String to bytes using the
* UTF-8 encoding. If the input is malformed,
* invalid chars are replaced by a default value.
*
* @param string input string.
* @return ByteBuffer: bytes stores at ByteBuffer.array()
* and length is ByteBuffer.limit()
* @throws CharacterCodingException when a character
* encoding or decoding error occurs.
*/
public static ByteBuffer encode(String string)
throws CharacterCodingException {
return encode(string, true);
}
/**
* Converts the provided String to bytes using the
* UTF-8 encoding. If <code>replace</code> is true, then
* malformed input is replaced with the
* substitution character, which is U+FFFD. Otherwise the
* method throws a MalformedInputException.
*
* @param string input string.
* @param replace input replace.
* @return ByteBuffer: bytes stores at ByteBuffer.array()
* and length is ByteBuffer.limit()
* @throws CharacterCodingException when a character
* encoding or decoding error occurs.
*/
public static ByteBuffer encode(String string, boolean replace)
throws CharacterCodingException {
CharsetEncoder encoder = ENCODER_FACTORY.get();
if (replace) {
encoder.onMalformedInput(CodingErrorAction.REPLACE);
encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
}
ByteBuffer bytes =
encoder.encode(CharBuffer.wrap(string.toCharArray()));
if (replace) {
encoder.onMalformedInput(CodingErrorAction.REPORT);
encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
}
return bytes;
}
static final public int DEFAULT_MAX_LEN = 1024 * 1024;
/**
* @return Read a UTF8 encoded string from in.
* @param in input in.
* @throws IOException raised on errors performing I/O.
*/
public static String readString(DataInput in) throws IOException {
return readString(in, Integer.MAX_VALUE);
}
/**
* @return Read a UTF8 encoded string with a maximum size.
* @param in input datainput.
* @param maxLength input maxLength.
* @throws IOException raised on errors performing I/O.
*/
public static String readString(DataInput in, int maxLength)
throws IOException {
int length = WritableUtils.readVIntInRange(in, 0, maxLength);
byte [] bytes = new byte[length];
in.readFully(bytes, 0, length);
return decode(bytes);
}
/**
* Write a UTF8 encoded string to out.
*
* @param out input out.
* @param s input s.
* @throws IOException raised on errors performing I/O.
* @return a UTF8 encoded string to out.
*/
public static int writeString(DataOutput out, String s) throws IOException {
ByteBuffer bytes = encode(s);
int length = bytes.limit();
WritableUtils.writeVInt(out, length);
out.write(bytes.array(), 0, length);
return length;
}
/**
* @return Write a UTF8 encoded string with a maximum size to out.
*
* @param out input out.
* @param s input s.
* @param maxLength input maxLength.
* @throws IOException raised on errors performing I/O.
*/
public static int writeString(DataOutput out, String s, int maxLength)
throws IOException {
ByteBuffer bytes = encode(s);
int length = bytes.limit();
if (length > maxLength) {
throw new IOException("string was too long to write! Expected " +
"less than or equal to " + maxLength + " bytes, but got " +
length + " bytes.");
}
WritableUtils.writeVInt(out, length);
out.write(bytes.array(), 0, length);
return length;
}
////// states for validateUTF8
private static final int LEAD_BYTE = 0;
private static final int TRAIL_BYTE_1 = 1;
private static final int TRAIL_BYTE = 2;
/**
* Check if a byte array contains valid UTF-8.
*
* @param utf8 byte array
* @throws MalformedInputException if the byte array contains invalid UTF-8
*/
public static void validateUTF8(byte[] utf8) throws MalformedInputException {
validateUTF8(utf8, 0, utf8.length);
}
/**
* Check to see if a byte array is valid UTF-8.
*
* @param utf8 the array of bytes
* @param start the offset of the first byte in the array
* @param len the length of the byte sequence
* @throws MalformedInputException if the byte array contains invalid bytes
*/
public static void validateUTF8(byte[] utf8, int start, int len)
throws MalformedInputException {
int count = start;
int leadByte = 0;
int length = 0;
int state = LEAD_BYTE;
while (count < start+len) {
int aByte = utf8[count] & 0xFF;
switch (state) {
case LEAD_BYTE:
leadByte = aByte;
length = bytesFromUTF8[aByte];
switch (length) {
case 0: // check for ASCII
if (leadByte > 0x7F)
throw new MalformedInputException(count);
break;
case 1:
if (leadByte < 0xC2 || leadByte > 0xDF)
throw new MalformedInputException(count);
state = TRAIL_BYTE_1;
break;
case 2:
if (leadByte < 0xE0 || leadByte > 0xEF)
throw new MalformedInputException(count);
state = TRAIL_BYTE_1;
break;
case 3:
if (leadByte < 0xF0 || leadByte > 0xF4)
throw new MalformedInputException(count);
state = TRAIL_BYTE_1;
break;
default:
// too long! Longest valid UTF-8 is 4 bytes (lead + three)
// or if < 0 we got a trail byte in the lead byte position
throw new MalformedInputException(count);
} // switch (length)
break;
case TRAIL_BYTE_1:
if (leadByte == 0xF0 && aByte < 0x90)
throw new MalformedInputException(count);
if (leadByte == 0xF4 && aByte > 0x8F)
throw new MalformedInputException(count);
if (leadByte == 0xE0 && aByte < 0xA0)
throw new MalformedInputException(count);
if (leadByte == 0xED && aByte > 0x9F)
throw new MalformedInputException(count);
// falls through to regular trail-byte test!!
case TRAIL_BYTE:
if (aByte < 0x80 || aByte > 0xBF)
throw new MalformedInputException(count);
if (--length == 0) {
state = LEAD_BYTE;
} else {
state = TRAIL_BYTE;
}
break;
default:
break;
} // switch (state)
count++;
}
}
/**
* Magic numbers for UTF-8. These are the number of bytes
* that <em>follow</em> a given lead byte. Trailing bytes
* have the value -1. The values 4 and 5 are presented in
* this table, even though valid UTF-8 cannot include the
* five and six byte sequences.
*/
static final int[] bytesFromUTF8 =
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
// trail bytes
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
/**
* @return Returns the next code point at the current position in
* the buffer. The buffer's position will be incremented.
* Any mark set on this buffer will be changed by this method!
*
* @param bytes input bytes.
*/
public static int bytesToCodePoint(ByteBuffer bytes) {
bytes.mark();
byte b = bytes.get();
bytes.reset();
int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
if (extraBytesToRead < 0) return -1; // trailing byte!
int ch = 0;
switch (extraBytesToRead) {
case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
case 0: ch += (bytes.get() & 0xFF);
}
ch -= offsetsFromUTF8[extraBytesToRead];
return ch;
}
static final int offsetsFromUTF8[] =
{ 0x00000000, 0x00003080,
0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
/**
* For the given string, returns the number of UTF-8 bytes
* required to encode the string.
* @param string text to encode
* @return number of UTF-8 bytes required to encode
*/
public static int utf8Length(String string) {
CharacterIterator iter = new StringCharacterIterator(string);
char ch = iter.first();
int size = 0;
while (ch != CharacterIterator.DONE) {
if ((ch >= 0xD800) && (ch < 0xDC00)) {
// surrogate pair?
char trail = iter.next();
if ((trail > 0xDBFF) && (trail < 0xE000)) {
// valid pair
size += 4;
} else {
// invalid pair
size += 3;
iter.previous(); // rewind one
}
} else if (ch < 0x80) {
size++;
} else if (ch < 0x800) {
size += 2;
} else {
// ch < 0x10000, that is, the largest char value
size += 3;
}
ch = iter.next();
}
return size;
}
}