Browser Build #2
Merged 1 commit on Feb 15, 2022
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
.DS_Store
node_modules
dist
dist-browser
20 changes: 9 additions & 11 deletions package.json
@@ -1,29 +1,30 @@
{
"name": "gpt3-tokenizer",
"version": "1.0.2",
"version": "1.1.0",
"license": "MIT",
"author": "Simon Liang <simon@x-tech.io>",
"repository": {
"type": "git",
"url": "https://github.com/xanthous-tech/gpt3-tokenizer.git"
},
"main": "dist/index.js",
"browser": "dist-browser/gpt3-tokenizer.js",
"typings": "dist/index.d.ts",
"files": [
"dist",
"src"
],
"engines": {
"node": ">=10"
"node": ">=12"
},
"scripts": {
"start": "tsdx watch",
"build": "tsdx build",
"build": "npm run build:browser && tsdx build",
"build:browser": "rimraf dist-browser && tsdx build --target browser --format esm",
"test": "tsdx test",
"lint": "tsdx lint",
"prepare": "tsdx build",
"size": "size-limit",
"analyze": "size-limit --why"
"size": "size-limit"
},
"peerDependencies": {},
"husky": {
@@ -40,17 +41,14 @@
"module": "dist/gpt3-tokenizer.esm.js",
"size-limit": [
{
"path": "dist/gpt3-tokenizer.cjs.production.min.js",
"limit": "10 KB"
},
{
"path": "dist/gpt3-tokenizer.esm.js",
"limit": "10 KB"
"path": "dist-browser/gpt3-tokenizer.js",
"limit": "1024 KB"
}
],
"devDependencies": {
"@size-limit/preset-small-lib": "^7.0.5",
"husky": "^7.0.4",
"rimraf": "^3.0.2",
"size-limit": "^7.0.5",
"tsdx": "^0.14.1",
"tslib": "^2.3.1",
230 changes: 230 additions & 0 deletions src/index-browser.ts
@@ -0,0 +1,230 @@
import ArrayKeyedMap from 'array-keyed-map';

import TextEncoder from './text-encoder';
import TextDecoder from './text-decoder';

import bpeVocab from './bpe-vocab';
import bpeRegex from './bpe-regex';
import encodings from './encodings';

const range = (x: number, y: number) => {
const res = Array.from(Array(y).keys()).slice(x);
return res;
};

const ord = (x: string): number => {
return x.charCodeAt(0);
};

const chr = (n: number): string => {
return String.fromCharCode(n);
};

export default class GPT3Tokenizer {
private vocab: string;
private nMergedSpaces: number;
private nVocab: number;

private encodings: { [key: string]: number };
private decodings: { [key: number]: string };

private textEncoder: TextEncoder;
private textDecoder: TextDecoder;
private byteEncoder: Map<number, string>;
private byteDecoder: Map<string, number>;

private bpeRanks: ArrayKeyedMap<[string, string], number>;
private cache: { [key: string]: string };

constructor(options: { type: 'gpt3' | 'codex' }) {
this.encodings = encodings;
this.vocab = bpeVocab;
this.textEncoder = new TextEncoder();
this.textDecoder = new TextDecoder();
this.nMergedSpaces = options.type === 'codex' ? 24 : 0;
this.nVocab = 50257 + this.nMergedSpaces;
this.decodings = {};
this.bpeRanks = new ArrayKeyedMap<[string, string], number>();
this.byteEncoder = new Map();
this.byteDecoder = new Map();
this.cache = {};
this.initialize();
}

initialize() {
if (this.vocab.length < 100) {
throw new Error('Tokenizer vocab file did not load correctly');
}
const vocabLines = this.vocab.split('\n');
const bpeMerges: [string, string][] = vocabLines
.slice(1, vocabLines.length - 1)
.map((line: string) =>
line.split(/(\s+)/).filter((part: string) => part.trim().length > 0)
) as [string, string][];

// add merged spaces for codex tokenizer
if (this.nMergedSpaces > 0) {
for (let i = 1; i < this.nMergedSpaces; i++) {
for (let j = 1; j < this.nMergedSpaces; j++) {
if (i + j <= this.nMergedSpaces) {
bpeMerges.push(['\u0120'.repeat(i), '\u0120'.repeat(j)]);
}
}
}

for (let i = 0; i < this.nMergedSpaces; i++) {
this.encodings['\u0120'.repeat(i + 2)] =
this.nVocab - this.nMergedSpaces + i;
}
}

for (const key of Object.keys(this.encodings)) {
this.decodings[this.encodings[key]] = key;
}

this.byteEncoder = this.bytesToUnicode();

this.byteEncoder.forEach((value, key) => {
this.byteDecoder.set(value, key);
});

this.zip(this.bpeRanks, bpeMerges, range(0, bpeMerges.length));
}

zip<X, Y>(result: Map<X, Y>, x: X[], y: Y[]): Map<X, Y> {
x.forEach((_, idx) => {
result.set(x[idx], y[idx]);
});

return result;
}

bytesToUnicode(): Map<number, string> {
const bs = range(ord('!'), ord('~') + 1).concat(
range(ord('\xa1'), ord('\xac') + 1),
range(ord('\xae'), ord('\xff') + 1)
);

let cs: any[] = bs.slice();
let n = 0;

for (let b = 0; b < Math.pow(2, 8); b++) {
if (!bs.includes(b)) {
bs.push(b);
cs.push(Math.pow(2, 8) + n);
n = n + 1;
}
}

cs = cs.map((c: number) => chr(c));

const result = new Map<number, string>();
this.zip(result, bs, cs as string[]);
return result;
}

getPairs(word: string[]): Set<[string, string]> {
const pairs = new Set<[string, string]>();
let prevChar = word[0];

for (let i = 1; i < word.length; i++) {
const char = word[i];
pairs.add([prevChar, char]);
prevChar = char;
}

return pairs;
}

bpe(token: string) {
if (token in this.cache) {
return this.cache[token];
}

let word: string[] | string = token.split('');

let pairs = this.getPairs(word);

if (!pairs || pairs.size === 0) {
return token;
}

while (true) {
const minPairs: { [key: number]: [string, string] } = {};
for (const pair of Array.from(pairs)) {
const rank = this.bpeRanks.get(pair);
minPairs[(isNaN(rank as number) ? 1e11 : rank as number)] = pair;
}

const bigram = minPairs[Math.min(...Object.keys(minPairs).map(x => parseInt(x)))];

if (!this.bpeRanks.has(bigram)) {
break;
}

const first = bigram[0];
const second = bigram[1];
let newWord: string[] = [];
let i = 0;

while (i < word.length) {
const j = word.indexOf(first, i);
if (j === -1) {
newWord = newWord.concat(word.slice(i));
break;
}
newWord = newWord.concat(word.slice(i, j));
i = j;

if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
newWord.push(first + second);
i = i + 2;
} else {
newWord.push(word[i]);
i = i + 1;
}
}

word = newWord;
if (word.length === 1) {
break;
} else {
pairs = this.getPairs(word);
}
}

word = word.join(' ');
this.cache[token] = word;

return word;
}

encode(text: string): { bpe: number[]; text: string[] } {
let bpeTokens: number[] = [];
let texts: string[] = [];
const matches = text.match(bpeRegex) || [];

for (let token of matches) {
token = Array.from(this.textEncoder.encode(token)).map((x) => this.byteEncoder.get(x)).join('');
const newTokens = this.bpe(token).split(' ').map((x) => this.encodings[x]);
bpeTokens = bpeTokens.concat(newTokens);
texts = texts.concat(
newTokens.map((x) => this.decode([x])),
);
}

return {
bpe: bpeTokens,
text: texts,
};
}

decode(tokens: number[]): string {
const text = tokens.map((x) => this.decodings[x]).join('');
return this.textDecoder.decode(
new Uint8Array(
text.split('').map((x) => this.byteDecoder.get(x) as number),
),
);
}
}
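
For context, a minimal usage sketch of the browser entry point added above (the bare import path assumes the package's new "browser" field resolves to this build; the class API itself matches the diff):

import GPT3Tokenizer from 'gpt3-tokenizer';

const tokenizer = new GPT3Tokenizer({ type: 'gpt3' }); // or { type: 'codex' }
const { bpe, text } = tokenizer.encode('hello world'); // bpe: token ids, text: the string piece per id
const roundTrip = tokenizer.decode(bpe);               // should give back 'hello world'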
5 changes: 0 additions & 5 deletions src/index.ts
@@ -1,4 +1,3 @@
// TODO: make a version that works on browser
import { TextEncoder, TextDecoder } from 'util';
import ArrayKeyedMap from 'array-keyed-map';

@@ -226,8 +225,4 @@ export default class GPT3Tokenizer {
),
);
}

isSupported() {
return typeof TextEncoder !== 'undefined';
}
}
10 changes: 10 additions & 0 deletions src/text-decoder.ts
@@ -0,0 +1,10 @@
// import { TextDecoder } from 'util';

if (typeof TextDecoder === 'undefined') {
throw new Error(
'TextDecoder is required for this module to work in the browser'
);
}

// @ts-ignore
export default TextDecoder;
10 changes: 10 additions & 0 deletions src/text-encoder.ts
@@ -0,0 +1,10 @@
// import { TextEncoder } from 'util';

if (typeof TextEncoder === 'undefined') {
throw new Error(
'TextEncoder is required for this module to work in the browser'
);
}

// @ts-ignore
export default TextEncoder;
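
The two shim modules above do nothing beyond re-exporting the browser's built-in TextEncoder/TextDecoder (and failing fast if they are missing). A minimal sketch of those Web APIs as the tokenizer uses them — standard browser globals, not part of this PR:

const encoder = new TextEncoder();
const bytes = encoder.encode('hello');   // Uint8Array of UTF-8 bytes, fed through byteEncoder in encode()
const decoder = new TextDecoder();
const plain = decoder.decode(bytes);     // 'hello', mirroring the decode() path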
26 changes: 26 additions & 0 deletions tsdx.config.js
@@ -0,0 +1,26 @@
const path = require('path');

module.exports = {
rollup(config, options) {
if (options.target === 'browser') {
config.input = config.input.replace('index.ts', 'index-browser.ts');

config.output.file = config.output.file
.replace('dist', 'dist-browser')
.replace('.esm', '')
.replace('.umd', '');

config.output.globals = {
...config.output.globals,
[path.resolve(__dirname, 'src/text-encoder.ts')]: 'TextEncoder',
[path.resolve(__dirname, 'src/text-decoder.ts')]: 'TextDecoder',
}

console.log(config.output);

return config;
}

return config;
},
};
9 changes: 8 additions & 1 deletion yarn.lock
@@ -1141,6 +1141,13 @@
"@nodelib/fs.scandir" "2.1.5"
fastq "^1.6.0"

"@rollup/plugin-alias@^3.1.9":
version "3.1.9"
resolved "https://registry.yarnpkg.com/@rollup/plugin-alias/-/plugin-alias-3.1.9.tgz#a5d267548fe48441f34be8323fb64d1d4a1b3fdf"
integrity sha512-QI5fsEvm9bDzt32k39wpOwZhVzRcL5ydcffUHMyLVaVaLeC70I8TJZ17F1z1eMoLu4E/UOcH9BWVkKpIKdrfiw==
dependencies:
slash "^3.0.0"

"@rollup/plugin-babel@^5.1.0":
version "5.3.0"
resolved "https://registry.yarnpkg.com/@rollup/plugin-babel/-/plugin-babel-5.3.0.tgz#9cb1c5146ddd6a4968ad96f209c50c62f92f9879"
@@ -5399,7 +5406,7 @@ rimraf@2.6.3:
dependencies:
glob "^7.1.3"

rimraf@^3.0.0:
rimraf@^3.0.0, rimraf@^3.0.2:
version "3.0.2"
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a"
integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==