/
utf8.js
executable file
·143 lines (121 loc) · 4.21 KB
/
utf8.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/**
 * UTF-8 encoder/decoder for JavaScript (UTF-16) strings.
 *
 * Exposes `utf8.stringToBytes` and `utf8.bytesToString`. Both throw
 * `Error('Invalid format')` when the input is malformed.
 *
 * Make sure the charset of the page using this script is
 * set to utf-8 or you will not get the correct results.
 */
var utf8 = (function () {
    var highSurrogateMin = 0xd800,
        highSurrogateMax = 0xdbff,
        lowSurrogateMin = 0xdc00,
        lowSurrogateMax = 0xdfff,
        surrogateBase = 0x10000,
        maxCodePoint = 0x10ffff;

    /**
     * @param {number} charCode UTF-16 code unit
     * @returns {boolean} true when charCode is in 0xD800..0xDBFF
     */
    function isHighSurrogate(charCode) {
        return highSurrogateMin <= charCode && charCode <= highSurrogateMax;
    }

    /**
     * @param {number} charCode UTF-16 code unit
     * @returns {boolean} true when charCode is in 0xDC00..0xDFFF
     */
    function isLowSurrogate(charCode) {
        return lowSurrogateMin <= charCode && charCode <= lowSurrogateMax;
    }

    /**
     * Combine a surrogate pair into the code point it represents.
     * @param {number} high high (leading) surrogate
     * @param {number} low low (trailing) surrogate
     * @returns {number} code point at or above 0x10000
     */
    function combineSurrogate(high, low) {
        return ((high - highSurrogateMin) << 10) + (low - lowSurrogateMin) + surrogateBase;
    }

    /**
     * Convert charCode to JavaScript String
     * handling UTF16 surrogate pair
     * @param {number} charCode code point
     * @returns {string} one code unit, or a surrogate pair when
     *     charCode is 0x10000 or above
     */
    function chr(charCode) {
        var offset, high, low;
        if (charCode < surrogateBase) {
            return String.fromCharCode(charCode);
        }
        // convert to UTF16 surrogate pair: the 20-bit offset above 0x10000
        // is split into two 10-bit halves
        offset = charCode - surrogateBase;
        high = (offset >> 10) + highSurrogateMin;
        low = (offset & 0x3ff) + lowSurrogateMin;
        return String.fromCharCode(high, low);
    }

    /**
     * Read the continuation byte at `index` and return its 6 payload bits.
     * @param {Array.<number>} bytes byte sequence being decoded
     * @param {number} index position of the expected continuation byte
     * @returns {number} low 6 bits of the byte
     * @throws {Error} 'Invalid format' when the sequence is truncated or
     *     the byte is not of the form 10xxxxxx
     */
    function continuationBits(bytes, index) {
        var byte;
        if (index >= bytes.length) {
            throw new Error('Invalid format');
        }
        byte = bytes[index];
        if ((byte & 0xc0) !== 0x80) {
            throw new Error('Invalid format');
        }
        return byte & 0x3f;
    }

    /**
     * Convert JavaScript String to an Array of
     * UTF8 bytes
     * @export
     * @param {string} str string to encode
     * @returns {Array.<number>} UTF-8 bytes, each in 0..255
     * @throws {Error} 'Invalid format' when str contains an unpaired
     *     surrogate (high without low, or low without preceding high)
     */
    function stringToBytes(str) {
        var bytes = [],
            strLength = str.length,
            strIndex = 0,
            charCode, charCode2;
        while (strIndex < strLength) {
            charCode = str.charCodeAt(strIndex++);
            // handle surrogate pair
            if (isHighSurrogate(charCode)) {
                if (strIndex === strLength) {
                    throw new Error('Invalid format');
                }
                charCode2 = str.charCodeAt(strIndex++);
                if (!isLowSurrogate(charCode2)) {
                    throw new Error('Invalid format');
                }
                charCode = combineSurrogate(charCode, charCode2);
            }
            else if (isLowSurrogate(charCode)) {
                // a low surrogate with no preceding high surrogate has no
                // valid UTF-8 encoding; reject it like a lone high surrogate
                throw new Error('Invalid format');
            }
            // convert charCode to UTF8 bytes
            if (charCode < 0x80) {
                // one byte
                bytes.push(charCode);
            }
            else if (charCode < 0x800) {
                // two bytes
                bytes.push(0xc0 | (charCode >> 6));
                bytes.push(0x80 | (charCode & 0x3f));
            }
            else if (charCode < 0x10000) {
                // three bytes
                bytes.push(0xe0 | (charCode >> 12));
                bytes.push(0x80 | ((charCode >> 6) & 0x3f));
                bytes.push(0x80 | (charCode & 0x3f));
            }
            else {
                // four bytes
                bytes.push(0xf0 | (charCode >> 18));
                bytes.push(0x80 | ((charCode >> 12) & 0x3f));
                bytes.push(0x80 | ((charCode >> 6) & 0x3f));
                bytes.push(0x80 | (charCode & 0x3f));
            }
        }
        return bytes;
    }

    /**
     * Convert an Array of UTF8 bytes to
     * a JavaScript String
     *
     * Overlong encodings and 3-byte encoded surrogates are still accepted,
     * as they were before validation was added.
     * @export
     * @param {Array.<number>} bytes UTF-8 byte sequence
     * @returns {string} decoded string
     * @throws {Error} 'Invalid format' on an invalid lead byte, a truncated
     *     sequence, a malformed continuation byte, or a code point above
     *     0x10FFFF
     */
    function bytesToString(bytes) {
        var str = '',
            length = bytes.length,
            index = 0,
            byte,
            charCode,
            extra,
            i;
        while (index < length) {
            // first byte: payload bits plus number of continuation bytes
            byte = bytes[index++];
            if (byte < 0x80) {
                // one byte
                charCode = byte;
                extra = 0;
            }
            else if ((byte >> 5) === 0x06) {
                // two bytes (110xxxxx)
                charCode = byte & 0x1f;
                extra = 1;
            }
            else if ((byte >> 4) === 0x0e) {
                // three bytes (1110xxxx)
                charCode = byte & 0x0f;
                extra = 2;
            }
            else if ((byte >> 3) === 0x1e) {
                // four bytes (11110xxx)
                charCode = byte & 0x07;
                extra = 3;
            }
            else {
                // stray continuation byte (10xxxxxx) or 0xF8..0xFF
                throw new Error('Invalid format');
            }
            for (i = 0; i < extra; i++) {
                charCode = (charCode << 6) | continuationBits(bytes, index++);
            }
            if (charCode > maxCodePoint) {
                // cannot be represented as a UTF-16 surrogate pair
                throw new Error('Invalid format');
            }
            str += chr(charCode);
        }
        return str;
    }

    return {
        stringToBytes: stringToBytes,
        bytesToString: bytesToString
    };
}());