Merge pull request #2 from botisan-ai/browser
Browser Build
lhr0909 committed Feb 15, 2022
2 parents 8a29cfb + 2240efb commit 6429997
Showing 8 changed files with 294 additions and 17 deletions.
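
Since the pull request description above is only the title "Browser Build", here is a minimal, hypothetical usage sketch of what the change enables for consumers. The package name, the { type: 'gpt3' | 'codex' } constructor option, and the encode/decode methods come from the package.json and src/index-browser.ts diffs below; that a bundler actually resolves the new "browser" field to dist-browser/gpt3-tokenizer.js is an assumption about the consuming toolchain, not something this commit tests.

import GPT3Tokenizer from 'gpt3-tokenizer';

// With the new "browser" field, a bundler such as webpack or Rollup is expected
// to pick up dist-browser/gpt3-tokenizer.js instead of the Node entry in dist/.
const tokenizer = new GPT3Tokenizer({ type: 'gpt3' });
const { bpe, text } = tokenizer.encode('Hello, browser build!');
console.log(bpe);                   // numeric BPE token ids
console.log(text);                  // the decoded string for each token id
console.log(tokenizer.decode(bpe)); // reassembles the original input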
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
 .DS_Store
 node_modules
 dist
+dist-browser
20 changes: 9 additions & 11 deletions package.json
@@ -1,29 +1,30 @@
 {
   "name": "gpt3-tokenizer",
-  "version": "1.0.2",
+  "version": "1.1.0",
   "license": "MIT",
   "author": "Simon Liang <simon@x-tech.io>",
   "repository": {
     "type": "git",
     "url": "https://github.com/xanthous-tech/gpt3-tokenizer.git"
   },
   "main": "dist/index.js",
+  "browser": "dist-browser/gpt3-tokenizer.js",
   "typings": "dist/index.d.ts",
   "files": [
     "dist",
     "src"
   ],
   "engines": {
-    "node": ">=10"
+    "node": ">=12"
   },
   "scripts": {
     "start": "tsdx watch",
-    "build": "tsdx build",
+    "build": "npm run build:browser && tsdx build",
+    "build:browser": "rimraf dist-browser && tsdx build --target browser --format esm",
     "test": "tsdx test",
     "lint": "tsdx lint",
     "prepare": "tsdx build",
-    "size": "size-limit",
-    "analyze": "size-limit --why"
+    "size": "size-limit"
   },
   "peerDependencies": {},
   "husky": {
@@ -40,17 +41,14 @@
"module": "dist/gpt3-tokenizer.esm.js",
"size-limit": [
{
"path": "dist/gpt3-tokenizer.cjs.production.min.js",
"limit": "10 KB"
},
{
"path": "dist/gpt3-tokenizer.esm.js",
"limit": "10 KB"
"path": "dist-browser/gpt3-tokenizer.js",
"limit": "1024 KB"
}
],
"devDependencies": {
"@size-limit/preset-small-lib": "^7.0.5",
"husky": "^7.0.4",
"rimraf": "^3.0.2",
"size-limit": "^7.0.5",
"tsdx": "^0.14.1",
"tslib": "^2.3.1",
230 changes: 230 additions & 0 deletions src/index-browser.ts
@@ -0,0 +1,230 @@
import ArrayKeyedMap from 'array-keyed-map';

import TextEncoder from './text-encoder';
import TextDecoder from './text-decoder';

import bpeVocab from './bpe-vocab';
import bpeRegex from './bpe-regex';
import encodings from './encodings';

const range = (x: number, y: number) => {
  const res = Array.from(Array(y).keys()).slice(x);
  return res;
};

const ord = (x: string): number => {
  return x.charCodeAt(0);
};

const chr = (n: number): string => {
  return String.fromCharCode(n);
};

export default class GPT3Tokenizer {
  private vocab: string;
  private nMergedSpaces: number;
  private nVocab: number;

  private encodings: { [key: string]: number };
  private decodings: { [key: number]: string };

  private textEncoder: TextEncoder;
  private textDecoder: TextDecoder;
  private byteEncoder: Map<number, string>;
  private byteDecoder: Map<string, number>;

  private bpeRanks: ArrayKeyedMap<[string, string], number>;
  private cache: { [key: string]: string };

  constructor(options: { type: 'gpt3' | 'codex' }) {
    this.encodings = encodings;
    this.vocab = bpeVocab;
    this.textEncoder = new TextEncoder();
    this.textDecoder = new TextDecoder();
    this.nMergedSpaces = options.type === 'codex' ? 24 : 0;
    this.nVocab = 50257 + this.nMergedSpaces;
    this.decodings = {};
    this.bpeRanks = new ArrayKeyedMap<[string, string], number>();
    this.byteEncoder = new Map();
    this.byteDecoder = new Map();
    this.cache = {};
    this.initialize();
  }

  initialize() {
    if (this.vocab.length < 100) {
      throw new Error('Tokenizer vocab file did not load correctly');
    }
    const vocabLines = this.vocab.split('\n');
    const bpeMerges: [string, string][] = vocabLines
      .slice(1, vocabLines.length - 1)
      .map((line: string) =>
        line.split(/(\s+)/).filter((part: string) => part.trim().length > 0)
      ) as [string, string][];

    // add merged spaces for codex tokenizer
    if (this.nMergedSpaces > 0) {
      for (let i = 1; i < this.nMergedSpaces; i++) {
        for (let j = 1; j < this.nMergedSpaces; j++) {
          if (i + j <= this.nMergedSpaces) {
            bpeMerges.push(['\u0120'.repeat(i), '\u0120'.repeat(j)]);
          }
        }
      }

      for (let i = 0; i < this.nMergedSpaces; i++) {
        this.encodings['\u0120'.repeat(i + 2)] =
          this.nVocab - this.nMergedSpaces + i;
      }
    }

    for (const key of Object.keys(this.encodings)) {
      this.decodings[this.encodings[key]] = key;
    }

    this.byteEncoder = this.bytesToUnicode();

    this.byteEncoder.forEach((value, key) => {
      this.byteDecoder.set(value, key);
    });

    this.zip(this.bpeRanks, bpeMerges, range(0, bpeMerges.length));
  }

  zip<X, Y>(result: Map<X, Y>, x: X[], y: Y[]): Map<X, Y> {
    x.forEach((_, idx) => {
      result.set(x[idx], y[idx]);
    });

    return result;
  }

  bytesToUnicode(): Map<number, string> {
    const bs = range(ord('!'), ord('~') + 1).concat(
      range(ord('\xa1'), ord('\xac') + 1),
      range(ord('\xae'), ord('\xff') + 1)
    );

    let cs: any[] = bs.slice();
    let n = 0;

    for (let b = 0; b < Math.pow(2, 8); b++) {
      if (!bs.includes(b)) {
        bs.push(b);
        cs.push(Math.pow(2, 8) + n);
        n = n + 1;
      }
    }

    cs = cs.map((c: number) => chr(c));

    const result = new Map<number, string>();
    this.zip(result, bs, cs as string[]);
    return result;
  }

  getPairs(word: string[]): Set<[string, string]> {
    const pairs = new Set<[string, string]>();
    let prevChar = word[0];

    for (let i = 1; i < word.length; i++) {
      const char = word[i];
      pairs.add([prevChar, char]);
      prevChar = char;
    }

    return pairs;
  }

  bpe(token: string) {
    if (token in this.cache) {
      return this.cache[token];
    }

    let word: string[] | string = token.split('');

    let pairs = this.getPairs(word);

    if (!pairs || pairs.size === 0) {
      return token;
    }

    while (true) {
      const minPairs: { [key: number]: [string, string] } = {};
      for (const pair of Array.from(pairs)) {
        const rank = this.bpeRanks.get(pair);
        minPairs[(isNaN(rank as number) ? 1e11 : rank as number)] = pair;
      }

      const bigram = minPairs[Math.min(...Object.keys(minPairs).map(x => parseInt(x)))];

      if (!this.bpeRanks.has(bigram)) {
        break;
      }

      const first = bigram[0];
      const second = bigram[1];
      let newWord: string[] = [];
      let i = 0;

      while (i < word.length) {
        const j = word.indexOf(first, i);
        if (j === -1) {
          newWord = newWord.concat(word.slice(i));
          break;
        }
        newWord = newWord.concat(word.slice(i, j));
        i = j;

        if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
          newWord.push(first + second);
          i = i + 2;
        } else {
          newWord.push(word[i]);
          i = i + 1;
        }
      }

      word = newWord;
      if (word.length === 1) {
        break;
      } else {
        pairs = this.getPairs(word);
      }
    }

    word = word.join(' ');
    this.cache[token] = word;

    return word;
  }

  encode(text: string): { bpe: number[]; text: string[] } {
    let bpeTokens: number[] = [];
    let texts: string[] = [];
    const matches = text.match(bpeRegex) || [];

    for (let token of matches) {
      token = Array.from(this.textEncoder.encode(token)).map((x) => this.byteEncoder.get(x)).join('');
      const newTokens = this.bpe(token).split(' ').map((x) => this.encodings[x]);
      bpeTokens = bpeTokens.concat(newTokens);
      texts = texts.concat(
        newTokens.map((x) => this.decode([x])),
      );
    }

    return {
      bpe: bpeTokens,
      text: texts,
    };
  }

  decode(tokens: number[]): string {
    const text = tokens.map((x) => this.decodings[x]).join('');
    return this.textDecoder.decode(
      new Uint8Array(
        text.split('').map((x) => this.byteDecoder.get(x) as number),
      ),
    );
  }
}
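
As a quick illustration of the class defined above, a hypothetical round-trip sketch: the relative import path and the 'codex' option come from this file, while the sample string and the asserted equalities are illustrative expectations implied by the encode/decode pair, not output recorded from the library.

import GPT3Tokenizer from './index-browser';

const tokenizer = new GPT3Tokenizer({ type: 'codex' });
const sample = '    return a + b;'; // leading spaces exercise the merged-space tokens
const { bpe, text } = tokenizer.encode(sample);

// encode() returns parallel arrays: token ids (`bpe`) and the string each id
// decodes to (`text`), so joining `text` or decoding `bpe` should give back the input.
console.assert(text.join('') === sample);
console.assert(tokenizer.decode(bpe) === sample);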
5 changes: 0 additions & 5 deletions src/index.ts
@@ -1,4 +1,3 @@
-// TODO: make a version that works on browser
 import { TextEncoder, TextDecoder } from 'util';
 import ArrayKeyedMap from 'array-keyed-map';

@@ -226,8 +225,4 @@ export default class GPT3Tokenizer {
       ),
     );
   }
-
-  isSupported() {
-    return typeof TextEncoder !== 'undefined';
-  }
 }
10 changes: 10 additions & 0 deletions src/text-decoder.ts
@@ -0,0 +1,10 @@
// import { TextDecoder } from 'util';

if (typeof TextDecoder === 'undefined') {
  throw new Error(
    'TextDecoder is required for this module to work in the browser'
  );
}

// @ts-ignore
export default TextDecoder;
10 changes: 10 additions & 0 deletions src/text-encoder.ts
@@ -0,0 +1,10 @@
// import { TextEncoder } from 'util';

if (typeof TextEncoder === 'undefined') {
  throw new Error(
    'TextEncoder is required for this module to work in the browser'
  );
}

// @ts-ignore
export default TextEncoder;
26 changes: 26 additions & 0 deletions tsdx.config.js
@@ -0,0 +1,26 @@
const path = require('path');

module.exports = {
  rollup(config, options) {
    if (options.target === 'browser') {
      config.input = config.input.replace('index.ts', 'index-browser.ts');

      config.output.file = config.output.file
        .replace('dist', 'dist-browser')
        .replace('.esm', '')
        .replace('.umd', '');

      config.output.globals = {
        ...config.output.globals,
        [path.resolve(__dirname, 'src/text-encoder.ts')]: 'TextEncoder',
        [path.resolve(__dirname, 'src/text-decoder.ts')]: 'TextDecoder',
      }

      console.log(config.output);

      return config;
    }

    return config;
  },
};
9 changes: 8 additions & 1 deletion yarn.lock
@@ -1141,6 +1141,13 @@
"@nodelib/fs.scandir" "2.1.5"
fastq "^1.6.0"

"@rollup/plugin-alias@^3.1.9":
version "3.1.9"
resolved "https://registry.yarnpkg.com/@rollup/plugin-alias/-/plugin-alias-3.1.9.tgz#a5d267548fe48441f34be8323fb64d1d4a1b3fdf"
integrity sha512-QI5fsEvm9bDzt32k39wpOwZhVzRcL5ydcffUHMyLVaVaLeC70I8TJZ17F1z1eMoLu4E/UOcH9BWVkKpIKdrfiw==
dependencies:
slash "^3.0.0"

"@rollup/plugin-babel@^5.1.0":
version "5.3.0"
resolved "https://registry.yarnpkg.com/@rollup/plugin-babel/-/plugin-babel-5.3.0.tgz#9cb1c5146ddd6a4968ad96f209c50c62f92f9879"
@@ -5399,7 +5406,7 @@ rimraf@2.6.3:
   dependencies:
     glob "^7.1.3"
 
-rimraf@^3.0.0:
+rimraf@^3.0.0, rimraf@^3.0.2:
   version "3.0.2"
   resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a"
   integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==
