feat(encoder): utf8-encode w/o perf improvement

artyomliou · Jan 8, 2023 · b1432be · b1432be
1 parent b6981cd
commit b1432be
Show file tree

Hide file tree

Showing 6 changed files with 194 additions and 45 deletions.
diff --git a/src/decoder/utf8-decode.ts b/src/decoder/utf8-decode.ts
@@ -51,17 +51,8 @@ export function utf8Decode(
       sum = (sum << 6) | (bytes[offset++] & 0b00111111)
       sixBitsPadding--
     }
-    // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCharCode#returning_supplementary_characters
-    // https://zh.wikipedia.org/wiki/UTF-16#%E4%BB%8EU+10000%E5%88%B0U+10FFFF%E7%9A%84%E7%A0%81%E4%BD%8D
-    if (sum > 0xffff) {
-      sum -= 0x10000
-      const lead = (sum >>> 10) + 0xd800
-      const low = (sum & 0b1111111111) + 0xdc00
-      out.push(lead, low)
-    } else {
-      out.push(sum)
-    }
+    out.push(sum)
   }
 
-  return String.fromCharCode(...out)
+  return String.fromCodePoint(...out)
 }
diff --git a/src/encoder/utf8-encode.ts b/src/encoder/utf8-encode.ts
@@ -0,0 +1,59 @@
+/**
+ * Encodes a JavaScript string as UTF-8 bytes.
+ *
+ * The string is walked code point by code point (the string iterator joins
+ * surrogate pairs) and every code point is written as a 1- to 4-byte sequence.
+ * A lone surrogate has no valid UTF-8 form, so it is replaced with U+FFFD,
+ * which is also what `TextEncoder` emits for such input.
+ *
+ * Byte layout reference:
+ * https://zh.wikipedia.org/wiki/UTF-8#UTF-8%E7%9A%84%E7%B7%A8%E7%A2%BC%E6%96%B9%E5%BC%8F
+ *
+ * @param str - Text to encode.
+ * @returns A new `Uint8Array` containing the UTF-8 encoding of `str`.
+ */
+export function utf8Encode(str: string): Uint8Array {
+  const out: Array<number> = []
+
+  // Low six payload bits carried by every continuation byte.
+  const mask = 0b00111111
+  // Marker bits (10xxxxxx) of a continuation byte.
+  const prefix = 0x80
+
+  for (const char of str) {
+    let code = char.codePointAt(0)
+    if (code === undefined) {
+      // Only narrows the type: an iterated chunk is never empty.
+      // Note that U+0000 is a valid code point and must NOT be skipped.
+      continue
+    }
+    if (code >= 0xd800 && code <= 0xdfff) {
+      // Unpaired surrogate: substitute the replacement character.
+      code = 0xfffd
+    }
+
+    if (code <= 0x7f) {
+      // 1 byte: 0xxxxxxx
+      out.push(code)
+    } else if (code <= 0x7ff) {
+      // 2 bytes: 110xxxxx 10xxxxxx
+      out.push(0b11000000 | (code >>> 6), prefix | (code & mask))
+    } else if (code <= 0xffff) {
+      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+      out.push(
+        0b11100000 | (code >>> 12),
+        prefix | ((code >>> 6) & mask),
+        prefix | (code & mask)
+      )
+    } else {
+      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      // Code points stop at U+10FFFF, so longer sequences can never occur.
+      out.push(
+        0b11110000 | (code >>> 18),
+        prefix | ((code >>> 12) & mask),
+        prefix | ((code >>> 6) & mask),
+        prefix | (code & mask)
+      )
+    }
+  }
+
+  // `from` copies the array directly; spreading it into `Uint8Array.of` would
+  // pass every byte as a call argument and overflow the stack on long input.
+  return Uint8Array.from(out)
+}
diff --git a/src/encoder/utf8.benchmark.ts b/src/encoder/utf8.benchmark.ts
@@ -0,0 +1,87 @@
+import { utf8Decode } from "../decoder/utf8-decode"
+import { utf8Encode } from "./utf8-encode"
+import { performance } from "perf_hooks"
+
+// Number of calls each candidate makes per measurement.
+const ROUNDS = 100
+
+// Source text that every benchmark input is sliced from.
+const loremIpsum =
+  "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
+
+// Benchmark inputs keyed by their length in characters.
+const inputsByLength: Record<string, string> = {}
+for (const length of [10, 100, 200, 300]) {
+  inputsByLength[String(length)] = loremIpsum.slice(0, length)
+}
+
+// Implementations under comparison. Each `run` does its own setup and then
+// performs ROUNDS calls; setup is part of the measured time.
+const candidates: Array<{ title: string; run: (data: string) => void }> = [
+  {
+    title: "TextEncoder",
+    run: (data) => {
+      const nativeEncoder = new TextEncoder()
+      let remaining = ROUNDS
+      while (remaining-- > 0) {
+        nativeEncoder.encode(data)
+      }
+    },
+  },
+  {
+    title: "TextDecoder",
+    run: (data) => {
+      const bytes = new TextEncoder().encode(data)
+      const nativeDecoder = new TextDecoder()
+      let remaining = ROUNDS
+      while (remaining-- > 0) {
+        nativeDecoder.decode(bytes)
+      }
+    },
+  },
+  {
+    title: "utf8Encode",
+    run: (data) => {
+      let remaining = ROUNDS
+      while (remaining-- > 0) {
+        utf8Encode(data)
+      }
+    },
+  },
+  {
+    title: "utf8Decode",
+    run: (data) => {
+      const bytes = new TextEncoder().encode(data)
+      let remaining = ROUNDS
+      while (remaining-- > 0) {
+        utf8Decode(bytes)
+      }
+    },
+  },
+]
+
+// Elapsed time per candidate and input length, in whole microseconds.
+const timings: Record<string, Record<string, number>> = {}
+
+for (const { title, run } of candidates) {
+  const row: Record<string, number> = {}
+  for (const [length, text] of Object.entries(inputsByLength)) {
+    const startedAt = performance.now()
+    run(text)
+    const finishedAt = performance.now()
+    // performance.now() reports milliseconds; scale to microseconds.
+    row[length] = Math.trunc((finishedAt - startedAt) * 1000)
+  }
+  timings[title] = row
+}
+
+console.table(timings)
diff --git a/test/encoder-and-decoder.spec.ts b/test/encoder-and-decoder.spec.ts
@@ -245,7 +245,7 @@ describe("Encoder/Decoder integration test", () => {
     it("map Map()", () => {
       const values = [
         [new Map(), {}],
-        [new Map([[1, 1]]), { 1: 1 }],
+        [new Map([["1", 1]]), { "1": 1 }],
       ]
       for (const [map, obj] of values) {
         assert.deepStrictEqual(decode(encode(map)), obj)

diff --git a/test/utf8-decode.spec.ts b/test/utf8-decode.spec.ts
diff --git a/test/utf8.spec.ts b/test/utf8.spec.ts
@@ -0,0 +1,45 @@
+import assert from "assert"
+import { utf8Encode } from "../src/encoder/utf8-encode.js"
+import { utf8Decode } from "../src/decoder/utf8-decode.js"
+
+// Sample characters from many scripts, spanning 1- to 4-byte UTF-8 sequences.
+// Source: https://zh.wikipedia.org/wiki/%E7%B6%AD%E5%9F%BA%E7%99%BE%E7%A7%91%E6%A8%99%E8%AA%8C
+const textCases = [
+  "Վ",
+  "វិ",
+  "উ",
+  "वि",
+  "ვ",
+  "Ω",
+  "維",
+  "ವಿ",
+  "ཝི",
+  "ウィ",
+  "W",
+  "И",
+  "ו",
+  "வி",
+  "ው",
+  "و",
+  "위",
+  "วิ",
+  "😋",
+]
+
+describe("UTF-8", () => {
+  // The platform encoder is the reference for both directions.
+  const encoder = new TextEncoder()
+
+  describe("Encode", () => {
+    for (const text of textCases) {
+      it(text, () => {
+        // Strict comparison: same type and identical bytes.
+        assert.deepStrictEqual(utf8Encode(text), encoder.encode(text))
+      })
+    }
+  })
+  describe("Decode", () => {
+    for (const text of textCases) {
+      it(text, () => {
+        // Strings are primitives, so compare them with strictEqual.
+        assert.strictEqual(utf8Decode(encoder.encode(text)), text)
+      })
+    }
+  })
+})