Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(encoder): utf8-encode wo/ perf improvement
- Loading branch information
1 parent
b6981cd
commit b1432be
Showing
6 changed files
with
194 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
/**
 * My UTF-8 encoding implementation.
 * Follows the byte layout described at
 * https://zh.wikipedia.org/wiki/UTF-8#UTF-8%E7%9A%84%E7%B7%A8%E7%A2%BC%E6%96%B9%E5%BC%8F
 *
 * The string is walked code point by code point (surrogate pairs are combined
 * by the string iterator) and each code point is emitted as 1 to 4 bytes.
 * Unpaired surrogates cannot be represented in UTF-8, so they are encoded as
 * U+FFFD (REPLACEMENT CHARACTER), which is what `TextEncoder` does.
 *
 * @param str - The string to encode. An empty string yields an empty array.
 * @returns The UTF-8 bytes of `str`.
 */
export function utf8Encode(str: string): Uint8Array {
  const out: number[] = []

  // Low six payload bits carried by every continuation byte.
  const mask = 0b00111111
  // Marker bits (10xxxxxx) of a continuation byte.
  const prefix = 0x80
  // Substitute for code points that have no valid UTF-8 form.
  const replacement = 0xfffd

  for (const char of str) {
    let code = char.codePointAt(0)
    // Compare against undefined explicitly: a truthiness check would also
    // skip U+0000, which is a valid code point encoded as the single byte 0x00.
    if (code === undefined) {
      continue
    }
    // The iterator yields an unpaired surrogate as its own element.
    if (code >= 0xd800 && code <= 0xdfff) {
      code = replacement
    }
    if (code <= 0x7f) {
      // 1 byte: 0xxxxxxx
      out.push(code)
    } else if (code <= 0x7ff) {
      // 2 bytes: 110xxxxx 10xxxxxx
      out.push(0b11000000 | (code >>> 6), prefix | (code & mask))
    } else if (code <= 0xffff) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      out.push(
        0b11100000 | (code >>> 12),
        prefix | ((code >>> 6) & mask),
        prefix | (code & mask)
      )
    } else {
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      // Code points never exceed U+10FFFF, so no longer form is needed.
      out.push(
        0b11110000 | (code >>> 18),
        prefix | ((code >>> 12) & mask),
        prefix | ((code >>> 6) & mask),
        prefix | (code & mask)
      )
    }
  }
  // Copy from the array instead of spreading it into call arguments, which
  // overflows the call stack for long inputs.
  return Uint8Array.from(out)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import { utf8Decode } from "../decoder/utf8-decode" | ||
import { utf8Encode } from "./utf8-encode" | ||
import { performance } from "perf_hooks" | ||
|
||
// Sample text; each benchmark input is a prefix of it.
const sampleText =
  "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."

// Inputs keyed by the number of characters taken from the sample text.
const inputsBySize: Record<string, string> = {}
for (const size of [10, 100, 200, 300]) {
  inputsBySize[size] = sampleText.slice(0, size)
}

interface Benchmark {
  title: string
  run: (data: string) => void
}

// Each candidate performs 100 calls per measurement; any setup it needs
// (creating the coder, pre-encoding the input) happens inside `run` and is
// therefore part of the measured time.
const benchmarks: Benchmark[] = [
  {
    title: "TextEncoder",
    run: (data) => {
      const nativeEncoder = new TextEncoder()
      for (let round = 0; round < 100; round += 1) {
        nativeEncoder.encode(data)
      }
    },
  },
  {
    title: "TextDecoder",
    run: (data) => {
      const bytes = new TextEncoder().encode(data)
      const nativeDecoder = new TextDecoder()
      for (let round = 0; round < 100; round += 1) {
        nativeDecoder.decode(bytes)
      }
    },
  },
  {
    title: "utf8Encode",
    run: (data) => {
      for (let round = 0; round < 100; round += 1) {
        utf8Encode(data)
      }
    },
  },
  {
    title: "utf8Decode",
    run: (data) => {
      const bytes = new TextEncoder().encode(data)
      for (let round = 0; round < 100; round += 1) {
        utf8Decode(bytes)
      }
    },
  },
]

// Elapsed time per candidate (row) and input size (column): the
// performance.now() difference multiplied by 1000 and truncated to an integer.
const timings: Record<string, Record<string, number>> = {}

for (const benchmark of benchmarks) {
  const row: Record<string, number> = {}
  for (const [size, input] of Object.entries(inputsBySize)) {
    const startedAt = performance.now()
    benchmark.run(input)
    const finishedAt = performance.now()
    row[size] = Math.trunc((finishedAt - startedAt) * 1000)
  }
  timings[benchmark.title] = row
}

console.table(timings)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import assert from "assert" | ||
import { utf8Encode } from "../src/encoder/utf8-encode.js" | ||
import { utf8Decode } from "../src/decoder/utf8-decode.js" | ||
|
||
// Sample strings in a variety of scripts, plus one emoji outside the BMP.
// Source: https://zh.wikipedia.org/wiki/%E7%B6%AD%E5%9F%BA%E7%99%BE%E7%A7%91%E6%A8%99%E8%AA%8C
const textCases = [
  "Վ",
  "វិ",
  "উ",
  "वि",
  "ვ",
  "Ω",
  "維",
  "ವಿ",
  "ཝི",
  "ウィ",
  "W",
  "И",
  "ו",
  "வி",
  "ው",
  "و",
  "위",
  "วิ",
  "😋",
]

describe("UTF-8", () => {
  // The platform TextEncoder is the reference implementation for both suites.
  const reference = new TextEncoder()

  describe("Encode", () => {
    // utf8Encode must produce the same bytes as TextEncoder.
    textCases.forEach((sample) => {
      it(sample, () => {
        const expected = reference.encode(sample)
        assert.deepEqual(utf8Encode(sample), expected)
      })
    })
  })

  describe("Decode", () => {
    // utf8Decode must turn TextEncoder's bytes back into the original string.
    textCases.forEach((sample) => {
      it(sample, () => {
        const bytes = reference.encode(sample)
        assert.deepEqual(utf8Decode(bytes), sample)
      })
    })
  })
})