Browser Build #2
Merged 1 commit on Feb 15, 2022
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
.DS_Store
node_modules
dist
dist-browser
20 changes: 9 additions & 11 deletions package.json
@@ -1,29 +1,30 @@
{
"name": "gpt3-tokenizer",
"version": "1.0.2",
"version": "1.1.0",
"license": "MIT",
"author": "Simon Liang <simon@x-tech.io>",
"repository": {
"type": "git",
"url": "https://github.com/xanthous-tech/gpt3-tokenizer.git"
},
"main": "dist/index.js",
"browser": "dist-browser/gpt3-tokenizer.js",
"typings": "dist/index.d.ts",
"files": [
"dist",
"src"
],
"engines": {
"node": ">=10"
"node": ">=12"
},
"scripts": {
"start": "tsdx watch",
"build": "tsdx build",
"build": "npm run build:browser && tsdx build",
"build:browser": "rimraf dist-browser && tsdx build --target browser --format esm",
"test": "tsdx test",
"lint": "tsdx lint",
"prepare": "tsdx build",
"size": "size-limit",
"analyze": "size-limit --why"
"size": "size-limit"
},
"peerDependencies": {},
"husky": {
@@ -40,17 +41,14 @@
"module": "dist/gpt3-tokenizer.esm.js",
"size-limit": [
{
"path": "dist/gpt3-tokenizer.cjs.production.min.js",
"limit": "10 KB"
},
{
"path": "dist/gpt3-tokenizer.esm.js",
"limit": "10 KB"
"path": "dist-browser/gpt3-tokenizer.js",
"limit": "1024 KB"
}
],
"devDependencies": {
"@size-limit/preset-small-lib": "^7.0.5",
"husky": "^7.0.4",
"rimraf": "^3.0.2",
"size-limit": "^7.0.5",
"tsdx": "^0.14.1",
"tslib": "^2.3.1",
230 changes: 230 additions & 0 deletions src/index-browser.ts
@@ -0,0 +1,230 @@
import ArrayKeyedMap from 'array-keyed-map';

import TextEncoder from './text-encoder';
import TextDecoder from './text-decoder';

import bpeVocab from './bpe-vocab';
import bpeRegex from './bpe-regex';
import encodings from './encodings';

const range = (x: number, y: number) => {
const res = Array.from(Array(y).keys()).slice(x);
return res;
};

const ord = (x: string): number => {
return x.charCodeAt(0);
};

const chr = (n: number): string => {
return String.fromCharCode(n);
};

export default class GPT3Tokenizer {
private vocab: string;
private nMergedSpaces: number;
private nVocab: number;

private encodings: { [key: string]: number };
private decodings: { [key: number]: string };

private textEncoder: TextEncoder;
private textDecoder: TextDecoder;
private byteEncoder: Map<number, string>;
private byteDecoder: Map<string, number>;

private bpeRanks: ArrayKeyedMap<[string, string], number>;
private cache: { [key: string]: string };

constructor(options: { type: 'gpt3' | 'codex' }) {
this.encodings = encodings;
this.vocab = bpeVocab;
this.textEncoder = new TextEncoder();
this.textDecoder = new TextDecoder();
this.nMergedSpaces = options.type === 'codex' ? 24 : 0;
this.nVocab = 50257 + this.nMergedSpaces;
this.decodings = {};
this.bpeRanks = new ArrayKeyedMap<[string, string], number>();
this.byteEncoder = new Map();
this.byteDecoder = new Map();
this.cache = {};
this.initialize();
}

initialize() {
if (this.vocab.length < 100) {
throw new Error('Tokenizer vocab file did not load correctly');
}
const vocabLines = this.vocab.split('\n');
const bpeMerges: [string, string][] = vocabLines
.slice(1, vocabLines.length - 1)
.map((line: string) =>
line.split(/(\s+)/).filter((part: string) => part.trim().length > 0)
) as [string, string][];

// add merged spaces for codex tokenizer
if (this.nMergedSpaces > 0) {
for (let i = 1; i < this.nMergedSpaces; i++) {
for (let j = 1; j < this.nMergedSpaces; j++) {
if (i + j <= this.nMergedSpaces) {
bpeMerges.push(['\u0120'.repeat(i), '\u0120'.repeat(j)]);
}
}
}

for (let i = 0; i < this.nMergedSpaces; i++) {
this.encodings['\u0120'.repeat(i + 2)] =
this.nVocab - this.nMergedSpaces + i;
}
}

for (const key of Object.keys(this.encodings)) {
this.decodings[this.encodings[key]] = key;
}

this.byteEncoder = this.bytesToUnicode();

this.byteEncoder.forEach((value, key) => {
this.byteDecoder.set(value, key);
});

this.zip(this.bpeRanks, bpeMerges, range(0, bpeMerges.length));
}

zip<X, Y>(result: Map<X, Y>, x: X[], y: Y[]): Map<X, Y> {
x.forEach((_, idx) => {
result.set(x[idx], y[idx]);
});

return result;
}

bytesToUnicode(): Map<number, string> {
const bs = range(ord('!'), ord('~') + 1).concat(
range(ord('\xa1'), ord('\xac') + 1),
range(ord('\xae'), ord('\xff') + 1)
);

let cs: any[] = bs.slice();
let n = 0;

for (let b = 0; b < Math.pow(2, 8); b++) {
if (!bs.includes(b)) {
bs.push(b);
cs.push(Math.pow(2, 8) + n);
n = n + 1;
}
}

cs = cs.map((c: number) => chr(c));

const result = new Map<number, string>();
this.zip(result, bs, cs as string[]);
return result;
}

getPairs(word: string[]): Set<[string, string]> {
const pairs = new Set<[string, string]>();
let prevChar = word[0];

for (let i = 1; i < word.length; i++) {
const char = word[i];
pairs.add([prevChar, char]);
prevChar = char;
}

return pairs;
}

bpe(token: string) {
if (token in this.cache) {
return this.cache[token];
}

let word: string[] | string = token.split('');

let pairs = this.getPairs(word);

if (!pairs || pairs.size === 0) {
return token;
}

while (true) {
const minPairs: { [key: number]: [string, string] } = {};
for (const pair of Array.from(pairs)) {
const rank = this.bpeRanks.get(pair);
minPairs[(isNaN(rank as number) ? 1e11 : rank as number)] = pair;
}

const bigram = minPairs[Math.min(...Object.keys(minPairs).map(x => parseInt(x)))];

if (!this.bpeRanks.has(bigram)) {
break;
}

const first = bigram[0];
const second = bigram[1];
let newWord: string[] = [];
let i = 0;

while (i < word.length) {
const j = word.indexOf(first, i);
if (j === -1) {
newWord = newWord.concat(word.slice(i));
break;
}
newWord = newWord.concat(word.slice(i, j));
i = j;

if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
newWord.push(first + second);
i = i + 2;
} else {
newWord.push(word[i]);
i = i + 1;
}
}

word = newWord;
if (word.length === 1) {
break;
} else {
pairs = this.getPairs(word);
}
}

word = word.join(' ');
this.cache[token] = word;

return word;
}

encode(text: string): { bpe: number[]; text: string[] } {
let bpeTokens: number[] = [];
let texts: string[] = [];
const matches = text.match(bpeRegex) || [];

for (let token of matches) {
token = Array.from(this.textEncoder.encode(token)).map((x) => this.byteEncoder.get(x)).join('');
const newTokens = this.bpe(token).split(' ').map((x) => this.encodings[x]);
bpeTokens = bpeTokens.concat(newTokens);
texts = texts.concat(
newTokens.map((x) => this.decode([x])),
);
}

return {
bpe: bpeTokens,
text: texts,
};
}

decode(tokens: number[]): string {
const text = tokens.map((x) => this.decodings[x]).join('');
return this.textDecoder.decode(
new Uint8Array(
text.split('').map((x) => this.byteDecoder.get(x) as number),
),
);
}
}
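
For context, a minimal usage sketch of the browser entry point added above (the bare import path assumes the package's new "browser" field resolves to this build; the class API itself matches the diff):

import GPT3Tokenizer from 'gpt3-tokenizer';

const tokenizer = new GPT3Tokenizer({ type: 'gpt3' }); // or { type: 'codex' }
const { bpe, text } = tokenizer.encode('hello world'); // bpe: token ids, text: the string piece per id
const roundTrip = tokenizer.decode(bpe);               // should give back 'hello world'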
5 changes: 0 additions & 5 deletions src/index.ts
@@ -1,4 +1,3 @@
// TODO: make a version that works on browser
import { TextEncoder, TextDecoder } from 'util';
import ArrayKeyedMap from 'array-keyed-map';

@@ -226,8 +225,4 @@ export default class GPT3Tokenizer {
),
);
}

isSupported() {
return typeof TextEncoder !== 'undefined';
}
}
10 changes: 10 additions & 0 deletions src/text-decoder.ts
@@ -0,0 +1,10 @@
// import { TextDecoder } from 'util';

if (typeof TextDecoder === 'undefined') {
throw new Error(
'TextDecoder is required for this module to work in the browser'
);
}

// @ts-ignore
export default TextDecoder;
10 changes: 10 additions & 0 deletions src/text-encoder.ts
@@ -0,0 +1,10 @@
// import { TextEncoder } from 'util';

if (typeof TextEncoder === 'undefined') {
throw new Error(
'TextEncoder is required for this module to work in the browser'
);
}

// @ts-ignore
export default TextEncoder;
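
The two shim modules above do nothing beyond re-exporting the browser's built-in TextEncoder/TextDecoder (and failing fast if they are missing). A minimal sketch of those Web APIs as the tokenizer uses them — standard browser globals, not part of this PR:

const encoder = new TextEncoder();
const bytes = encoder.encode('hello');   // Uint8Array of UTF-8 bytes, fed through byteEncoder in encode()
const decoder = new TextDecoder();
const plain = decoder.decode(bytes);     // 'hello', mirroring the decode() path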
26 changes: 26 additions & 0 deletions tsdx.config.js
@@ -0,0 +1,26 @@
const path = require('path');

module.exports = {
rollup(config, options) {
if (options.target === 'browser') {
config.input = config.input.replace('index.ts', 'index-browser.ts');

config.output.file = config.output.file
.replace('dist', 'dist-browser')
.replace('.esm', '')
.replace('.umd', '');

config.output.globals = {
...config.output.globals,
[path.resolve(__dirname, 'src/text-encoder.ts')]: 'TextEncoder',
[path.resolve(__dirname, 'src/text-decoder.ts')]: 'TextDecoder',
}

console.log(config.output);

return config;
}

return config;
},
};
9 changes: 8 additions & 1 deletion yarn.lock
@@ -1141,6 +1141,13 @@
"@nodelib/fs.scandir" "2.1.5"
fastq "^1.6.0"

"@rollup/plugin-alias@^3.1.9":
version "3.1.9"
resolved "https://registry.yarnpkg.com/@rollup/plugin-alias/-/plugin-alias-3.1.9.tgz#a5d267548fe48441f34be8323fb64d1d4a1b3fdf"
integrity sha512-QI5fsEvm9bDzt32k39wpOwZhVzRcL5ydcffUHMyLVaVaLeC70I8TJZ17F1z1eMoLu4E/UOcH9BWVkKpIKdrfiw==
dependencies:
slash "^3.0.0"

"@rollup/plugin-babel@^5.1.0":
version "5.3.0"
resolved "https://registry.yarnpkg.com/@rollup/plugin-babel/-/plugin-babel-5.3.0.tgz#9cb1c5146ddd6a4968ad96f209c50c62f92f9879"
@@ -5399,7 +5406,7 @@ rimraf@2.6.3:
dependencies:
glob "^7.1.3"

rimraf@^3.0.0:
rimraf@^3.0.0, rimraf@^3.0.2:
version "3.0.2"
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a"
integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==