Skip to content

Commit

Permalink
feat(encoder): utf8-encode wo/ perf improvement
Browse files Browse the repository at this point in the history
  • Loading branch information
artyomliou committed Jan 8, 2023
1 parent b6981cd commit b1432be
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 45 deletions.
13 changes: 2 additions & 11 deletions src/decoder/utf8-decode.ts
Expand Up @@ -51,17 +51,8 @@ export function utf8Decode(
sum = (sum << 6) | (bytes[offset++] & 0b00111111)
sixBitsPadding--
}
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCharCode#returning_supplementary_characters
// https://zh.wikipedia.org/wiki/UTF-16#%E4%BB%8EU+10000%E5%88%B0U+10FFFF%E7%9A%84%E7%A0%81%E4%BD%8D
if (sum > 0xffff) {
sum -= 0x10000
const lead = (sum >>> 10) + 0xd800
const low = (sum & 0b1111111111) + 0xdc00
out.push(lead, low)
} else {
out.push(sum)
}
out.push(sum)
}

return String.fromCharCode(...out)
return String.fromCodePoint(...out)
}
59 changes: 59 additions & 0 deletions src/encoder/utf8-encode.ts
@@ -0,0 +1,59 @@
/**
 * Encodes a JavaScript string as UTF-8 bytes.
 *
 * The string is walked code point by code point (the string iterator joins
 * valid surrogate pairs), and every code point is written as 1 to 4 bytes.
 * U+0000 is encoded as a single 0x00 byte. An unpaired surrogate is written
 * as its own 3-byte sequence; it is not replaced with U+FFFD.
 *
 * Byte layout follows
 * https://zh.wikipedia.org/wiki/UTF-8#UTF-8%E7%9A%84%E7%B7%A8%E7%A2%BC%E6%96%B9%E5%BC%8F
 *
 * @param str - Text to encode.
 * @returns A new Uint8Array containing the UTF-8 encoding of `str`.
 */
export function utf8Encode(str: string): Uint8Array {
  const out: number[] = []

  // Low six payload bits of a continuation byte, and its 0b10xxxxxx marker.
  const mask = 0b00111111
  const prefix = 0x80

  for (const char of str) {
    const code = char.codePointAt(0)
    // Compare against undefined explicitly: code point 0 (NUL) is falsy but
    // is a valid character that must be emitted.
    if (code === undefined) {
      continue
    }
    if (code <= 0x7f) {
      // 1 byte: 0xxxxxxx
      out.push(code)
    } else if (code <= 0x7ff) {
      // 2 bytes: 110xxxxx 10xxxxxx
      out.push(0b11000000 + (code >>> 6), prefix + (code & mask))
    } else if (code <= 0xffff) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      out.push(
        0b11100000 + (code >>> 12),
        prefix + ((code >>> 6) & mask),
        prefix + (code & mask)
      )
    } else {
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      // Code points stop at U+10FFFF, so longer (5/6-byte) forms never occur.
      out.push(
        0b11110000 + (code >>> 18),
        prefix + ((code >>> 12) & mask),
        prefix + ((code >>> 6) & mask),
        prefix + (code & mask)
      )
    }
  }
  // Copy from the array instead of spreading it into call arguments, which
  // can exceed the engine's argument limit for long inputs.
  return Uint8Array.from(out)
}
87 changes: 87 additions & 0 deletions src/encoder/utf8.benchmark.ts
@@ -0,0 +1,87 @@
import { utf8Decode } from "../decoder/utf8-decode"
import { utf8Encode } from "./utf8-encode"
import { performance } from "perf_hooks"

// Source text for the benchmark inputs.
const testBase =
  "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."

// Benchmark inputs keyed by length: each value is the first N characters of
// `testBase`.
const testCases: Record<string, string> = {}
for (const length of [10, 100, 200, 300]) {
  testCases[length] = testBase.slice(0, length)
}
// Benchmark subjects. Every `run` calls its encoder or decoder 100 times on
// the same input. The decoder subjects first encode the input with
// TextEncoder, outside the repeated loop.
const types: Array<{ title: string; run: (data: string) => void }> = [
  {
    title: "TextEncoder",
    run: (data) => {
      const nativeEncoder = new TextEncoder()
      let remaining = 100
      while (remaining-- > 0) {
        nativeEncoder.encode(data)
      }
    },
  },
  {
    title: "TextDecoder",
    run: (data) => {
      const bytes = new TextEncoder().encode(data)
      const nativeDecoder = new TextDecoder()
      let remaining = 100
      while (remaining-- > 0) {
        nativeDecoder.decode(bytes)
      }
    },
  },
  {
    title: "utf8Encode",
    run: (data) => {
      let remaining = 100
      while (remaining-- > 0) {
        utf8Encode(data)
      }
    },
  },
  {
    title: "utf8Decode",
    run: (data) => {
      const bytes = new TextEncoder().encode(data)
      let remaining = 100
      while (remaining-- > 0) {
        utf8Decode(bytes)
      }
    },
  },
]
// Timing table: result[benchmark title][input length] = elapsed microseconds.
// Rows are created while measuring, so the table always matches `types` and
// `testCases` and never needs a hand-maintained list of keys.
const result: Record<string, Record<string, number>> = {}

for (const type of types) {
  const row: Record<string, number> = {}
  for (const [k, v] of Object.entries(testCases)) {
    const start = performance.now()
    type.run(v)
    const end = performance.now()
    // performance.now() reports milliseconds; scale to whole microseconds.
    row[k] = Math.trunc((end - start) * 1000)
  }
  result[type.title] = row
}

console.table(result)
2 changes: 1 addition & 1 deletion test/encoder-and-decoder.spec.ts
Expand Up @@ -245,7 +245,7 @@ describe("Encoder/Decoder integration test", () => {
it("map Map()", () => {
const values = [
[new Map(), {}],
[new Map([[1, 1]]), { 1: 1 }],
[new Map([["1", 1]]), { "1": 1 }],
]
for (const [map, obj] of values) {
assert.deepStrictEqual(decode(encode(map)), obj)
Expand Down
33 changes: 0 additions & 33 deletions test/utf8-decode.spec.ts

This file was deleted.

45 changes: 45 additions & 0 deletions test/utf8.spec.ts
@@ -0,0 +1,45 @@
import assert from "assert"
import { utf8Encode } from "../src/encoder/utf8-encode.js"
import { utf8Decode } from "../src/decoder/utf8-decode.js"

// Sample inputs shared by the encode and decode tests: short strings in many
// different scripts plus one emoji. "W" is the only plain ASCII entry, and the
// emoji (U+1F60B) is the only entry outside the Basic Multilingual Plane, so
// it is the one case that exercises the 4-byte UTF-8 form.
// NOTE(review): the characters appear to be taken from the page linked below — confirm.
// https://zh.wikipedia.org/wiki/%E7%B6%AD%E5%9F%BA%E7%99%BE%E7%A7%91%E6%A8%99%E8%AA%8C
const textCases = [
"Վ",
"វិ",
"উ",
"वि",
"ვ",
"Ω",
"維",
"ವಿ",
"ཝི",
"ウィ",
"W",
"И",
"ו",
"வி",
"ው",
"و",
"위",
"วิ",
"😋",
]

// Checks the hand-written UTF-8 encoder and decoder against the runtime's
// TextEncoder, which serves as the reference for the expected bytes.
describe("UTF-8", () => {
  const encoder = new TextEncoder()

  describe("Encode", () => {
    for (const text of textCases) {
      it(text, () => {
        // Strict comparison: the legacy deepEqual uses loose (==) equality.
        assert.deepStrictEqual(utf8Encode(text), encoder.encode(text))
      })
    }
  })
  describe("Decode", () => {
    for (const text of textCases) {
      it(text, () => {
        // The decoder returns a string, so compare with strict equality.
        assert.strictEqual(utf8Decode(encoder.encode(text)), text)
      })
    }
  })
})

0 comments on commit b1432be

Please sign in to comment.