
Commit

Merge pull request #1231 from Apollon77/bertfix
fix: correct the search logic for BertWordpieceTokenizer
ericzon committed May 25, 2023
2 parents 71e29e2 + 9c33ff2 commit de6e3ed
Showing 2 changed files with 72 additions and 15 deletions.
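In short: the old logic searched for the best word piece at the end of the word (word.slice(-i)) and resolved '##' pieces against this.words, so longer words could be split into the wrong pieces. The fix separates the two lookups: getBestPrefix returns the longest leading piece found in this.words, getBestAffix returns the longest leading piece found in this.affixes, and tokenizeWord recurses over the remainder with an isInside flag so continuation pieces get the '##' marker and are resolved against this.affixes. A minimal usage sketch follows; it is not part of the commit, the package entry point and the vocabulary loading are assumptions, and the expected output is taken from the updated tests:

    const fs = require('fs');
    const { BertWordPieceTokenizer } = require('@nlpjs/bert-tokenizer'); // package name assumed from the monorepo layout

    // Hypothetical path: any plain-text BERT WordPiece vocabulary works here.
    const vocabEn = fs.readFileSync('./vocab-en.txt', 'utf8');
    const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });

    console.log(tokenizer.tokenizeWord('Supervised'));
    // -> { tokens: ['Super', '##vise', '##d'], ids: [3198, 16641, 1181] }
    // The previous logic split the same word as ['Super', '##v', '##ised'].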
packages/bert-tokenizer/src/bert-word-piece-tokenizer.js (39 additions, 12 deletions)

@@ -118,31 +118,56 @@ class BertWordPieceTokenizer extends Clonable {
     return result;
   }
 
-  getBestAffix(word) {
+  getBestPrefix(word) {
     const maxLength = Math.min(word.length - 1, this.affixMaxLength);
 
     // we try searching from shortest to longest.
     for (let i = maxLength; i > 0; i -= 1) {
-      const current = word.slice(-i);
+      const current = word.substring(0, i);
       if (this.words[current]) {
         return current;
       }
     }
     return undefined;
   }
 
+  getBestAffix(word) {
+    const maxLength = Math.min(word.length, this.affixMaxLength);
+
+    // we try searching from shortest to longest.
+    for (let i = maxLength; i > 0; i -= 1) {
+      const current = word.substring(0, i);
+      if (this.affixes[current]) {
+        return current;
+      }
+    }
+    return undefined;
+  }
+
-  tokenizeWord(srcWord, useExtra = false) {
+  tokenizeWord(srcWord, useExtra = false, isInside = false) {
     const word = this.lowercase ? srcWord.toLowerCase() : srcWord;
     const result = {
       tokens: [],
       ids: [],
     };
-    const wordIndex = this.words[word];
+
+    if (srcWord.length === 0) {
+      return result;
+    }
+
+    const wordIndex = isInside ? this.affixes[word] : this.words[word];
     if (wordIndex !== undefined) {
-      result.tokens.push(word);
+      result.tokens.push((isInside ? '##' : '') + word);
       result.ids.push(wordIndex);
       return result;
     }
-    const bestAffix = this.getBestAffix(word);
-    if (!bestAffix) {
+
+    // this might be in the prefixes part
+    const bestPart = isInside
+      ? this.getBestAffix(word)
+      : this.getBestPrefix(word);
+
+    if (!bestPart) {
       if (useExtra) {
         const index = this.numWords + this.numExtra;
         this.extra[word] = index;
@@ -155,15 +180,17 @@ class BertWordPieceTokenizer extends Clonable {
       }
       return result;
     }
-    const newWord = word.slice(0, -bestAffix.length);
-    const newWordTokens = this.tokenizeWord(newWord, useExtra);
+    const newWord = word.substring(bestPart.length);
+    const newWordTokens = this.tokenizeWord(newWord, useExtra, true);
+
+    const text = bestPart;
+    result.tokens.push((isInside ? '##' : '') + text);
+    result.ids.push(isInside ? this.affixes[text] : this.words[text]);
+
     for (let i = 0; i < newWordTokens.tokens.length; i += 1) {
       result.tokens.push(newWordTokens.tokens[i]);
       result.ids.push(newWordTokens.ids[i]);
     }
-    const text = `##${bestAffix}`;
-    result.tokens.push(text);
-    result.ids.push(this.words[text]);
     return result;
   }

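Taken together, the two helpers now behave as follows with the English vocabulary used in the tests below. This is an illustrative sketch, not part of the commit; tokenizer is constructed as in the tests, and the intermediate pieces are inferred from the expected outputs:

    tokenizer.getBestPrefix('Supervised'); // -> 'Super' (longest leading slice present in this.words)
    tokenizer.getBestAffix('Supervised');  // -> 'S' (longest leading slice present in this.affixes, i.e. '##S')

    // tokenizeWord('Supervised') then proceeds recursively:
    //   'Supervised' -> prefix 'Super', remainder 'vised' tokenized with isInside = true
    //   'vised'      -> affix 'vise' ('##vise'), remainder 'd'
    //   'd'          -> found directly in this.affixes as '##d'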
packages/bert-tokenizer/test/bert-word-piece-tokenizer.test.js (33 additions, 3 deletions)

@@ -56,22 +56,52 @@ describe('BertWordPieceTokenizer', () => {
     });
   });
 
+  describe('Get Best Prefix', () => {
+    test('Should calculate the best prefix for a word', () => {
+      const input = 'Supervised';
+      const expected = 'Super';
+      const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
+      const actual = tokenizer.getBestPrefix(input);
+      expect(actual).toEqual(expected);
+    });
+  });
+
   describe('Get Best Affix', () => {
     test('Should calculate the best affix for a word', () => {
       const input = 'Supervised';
-      const expected = 'ised';
+      const expected = 'S';
       const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
       const actual = tokenizer.getBestAffix(input);
      expect(actual).toEqual(expected);
    });
  });
 
   describe('Tokenize Word', () => {
+    test('Should return several tokens if word does not match - with isInside', () => {
+      const input = 'Supervised';
+      const expected = {
+        tokens: ['##S', '##upe', '##r', '##vise', '##d'],
+        ids: [1708, 26939, 1197, 16641, 1181],
+      };
+      const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
+      const actual = tokenizer.tokenizeWord(input, false, true);
+      expect(actual).toEqual(expected);
+    });
     test('Should return several tokens if word does not match and has affixes', () => {
       const input = 'Supervised';
       const expected = {
-        tokens: ['Super', '##v', '##ised'],
-        ids: [3198, 1964, 3673],
+        tokens: ['Super', '##vise', '##d'],
+        ids: [3198, 16641, 1181],
       };
       const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
       const actual = tokenizer.tokenizeWord(input);
       expect(actual).toEqual(expected);
     });
+    test('Should return several tokens if word does not match and has affixes #2', () => {
+      const input = 'vegan';
+      const expected = {
+        tokens: ['ve', '##gan'],
+        ids: [1396, 3820],
+      };
+      const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
+      const actual = tokenizer.tokenizeWord(input);
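The first new test exercises the isInside path directly: when a word is treated as the continuation of a previous piece, every lookup goes through this.affixes and every emitted token carries the '##' marker. A short sketch of the new cases, with the tokenizer constructed as in the tests (not part of the commit); compare with the plain tokenizeWord('Supervised') call shown earlier:

    tokenizer.tokenizeWord('Supervised', false, true); // isInside = true
    // -> { tokens: ['##S', '##upe', '##r', '##vise', '##d'], ids: [1708, 26939, 1197, 16641, 1181] }

    tokenizer.tokenizeWord('vegan');
    // -> { tokens: ['ve', '##gan'], ids: [1396, 3820] }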
