
Commit

Merge pull request #1231 from Apollon77/bertfix
fix: correct the search logic for BertWordpieceTokenizer
ericzon committed May 25, 2023
2 parents 71e29e2 + 9c33ff2 commit de6e3ed
Showing 2 changed files with 72 additions and 15 deletions.
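In short: the old logic searched for the best word piece at the end of the word (word.slice(-i)) and resolved '##' pieces against this.words, so longer words could be split into the wrong pieces. The fix separates the two lookups: getBestPrefix returns the longest leading piece found in this.words, getBestAffix returns the longest leading piece found in this.affixes, and tokenizeWord recurses over the remainder with an isInside flag so continuation pieces get the '##' marker and are resolved against this.affixes. A minimal usage sketch follows; it is not part of the commit, the package entry point and the vocabulary loading are assumptions, and the expected output is taken from the updated tests:

    const fs = require('fs');
    const { BertWordPieceTokenizer } = require('@nlpjs/bert-tokenizer'); // package name assumed from the monorepo layout

    // Hypothetical path: any plain-text BERT WordPiece vocabulary works here.
    const vocabEn = fs.readFileSync('./vocab-en.txt', 'utf8');
    const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });

    console.log(tokenizer.tokenizeWord('Supervised'));
    // -> { tokens: ['Super', '##vise', '##d'], ids: [3198, 16641, 1181] }
    // The previous logic split the same word as ['Super', '##v', '##ised'].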
packages/bert-tokenizer/src/bert-word-piece-tokenizer.js (39 additions, 12 deletions)

@@ -118,31 +118,56 @@ class BertWordPieceTokenizer extends Clonable {
     return result;
   }
 
-  getBestAffix(word) {
+  getBestPrefix(word) {
     const maxLength = Math.min(word.length - 1, this.affixMaxLength);
 
     // we try searching from shortest to longest.
     for (let i = maxLength; i > 0; i -= 1) {
-      const current = word.slice(-i);
+      const current = word.substring(0, i);
       if (this.words[current]) {
         return current;
       }
     }
     return undefined;
   }
 
+  getBestAffix(word) {
+    const maxLength = Math.min(word.length, this.affixMaxLength);
+
+    // we try searching from shortest to longest.
+    for (let i = maxLength; i > 0; i -= 1) {
+      const current = word.substring(0, i);
+      if (this.affixes[current]) {
+        return current;
+      }
+    }
+    return undefined;
+  }
+
-  tokenizeWord(srcWord, useExtra = false) {
+  tokenizeWord(srcWord, useExtra = false, isInside = false) {
     const word = this.lowercase ? srcWord.toLowerCase() : srcWord;
     const result = {
       tokens: [],
       ids: [],
     };
-    const wordIndex = this.words[word];
+
+    if (srcWord.length === 0) {
+      return result;
+    }
+
+    const wordIndex = isInside ? this.affixes[word] : this.words[word];
     if (wordIndex !== undefined) {
-      result.tokens.push(word);
+      result.tokens.push((isInside ? '##' : '') + word);
       result.ids.push(wordIndex);
       return result;
     }
-    const bestAffix = this.getBestAffix(word);
-    if (!bestAffix) {
+
+    // this might be in the prefixes part
+    const bestPart = isInside
+      ? this.getBestAffix(word)
+      : this.getBestPrefix(word);
+
+    if (!bestPart) {
       if (useExtra) {
         const index = this.numWords + this.numExtra;
         this.extra[word] = index;
@@ -155,15 +180,17 @@ class BertWordPieceTokenizer extends Clonable {
       }
       return result;
     }
-    const newWord = word.slice(0, -bestAffix.length);
-    const newWordTokens = this.tokenizeWord(newWord, useExtra);
+    const newWord = word.substring(bestPart.length);
+    const newWordTokens = this.tokenizeWord(newWord, useExtra, true);
+
+    const text = bestPart;
+    result.tokens.push((isInside ? '##' : '') + text);
+    result.ids.push(isInside ? this.affixes[text] : this.words[text]);
+
     for (let i = 0; i < newWordTokens.tokens.length; i += 1) {
       result.tokens.push(newWordTokens.tokens[i]);
       result.ids.push(newWordTokens.ids[i]);
     }
-    const text = `##${bestAffix}`;
-    result.tokens.push(text);
-    result.ids.push(this.words[text]);
     return result;
   }

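Taken together, the two helpers now behave as follows with the English vocabulary used in the tests below. This is an illustrative sketch, not part of the commit; tokenizer is constructed as in the tests, and the intermediate pieces are inferred from the expected outputs:

    tokenizer.getBestPrefix('Supervised'); // -> 'Super' (longest leading slice present in this.words)
    tokenizer.getBestAffix('Supervised');  // -> 'S' (longest leading slice present in this.affixes, i.e. '##S')

    // tokenizeWord('Supervised') then proceeds recursively:
    //   'Supervised' -> prefix 'Super', remainder 'vised' tokenized with isInside = true
    //   'vised'      -> affix 'vise' ('##vise'), remainder 'd'
    //   'd'          -> found directly in this.affixes as '##d'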
packages/bert-tokenizer/test/bert-word-piece-tokenizer.test.js (33 additions, 3 deletions)

@@ -56,22 +56,52 @@ describe('BertWordPieceTokenizer', () => {
     });
   });
 
+  describe('Get Best Prefix', () => {
+    test('Should calculate the best prefix for a word', () => {
+      const input = 'Supervised';
+      const expected = 'Super';
+      const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
+      const actual = tokenizer.getBestPrefix(input);
+      expect(actual).toEqual(expected);
+    });
+  });
+
   describe('Get Best Affix', () => {
     test('Should calculate the best affix for a word', () => {
       const input = 'Supervised';
-      const expected = 'ised';
+      const expected = 'S';
       const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
       const actual = tokenizer.getBestAffix(input);
      expect(actual).toEqual(expected);
    });
  });
 
   describe('Tokenize Word', () => {
+    test('Should return several tokens if word does not match - with isInside', () => {
+      const input = 'Supervised';
+      const expected = {
+        tokens: ['##S', '##upe', '##r', '##vise', '##d'],
+        ids: [1708, 26939, 1197, 16641, 1181],
+      };
+      const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
+      const actual = tokenizer.tokenizeWord(input, false, true);
+      expect(actual).toEqual(expected);
+    });
     test('Should return several tokens if word does not match and has affixes', () => {
       const input = 'Supervised';
       const expected = {
-        tokens: ['Super', '##v', '##ised'],
-        ids: [3198, 1964, 3673],
+        tokens: ['Super', '##vise', '##d'],
+        ids: [3198, 16641, 1181],
       };
       const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
       const actual = tokenizer.tokenizeWord(input);
       expect(actual).toEqual(expected);
     });
+    test('Should return several tokens if word does not match and has affixes #2', () => {
+      const input = 'vegan';
+      const expected = {
+        tokens: ['ve', '##gan'],
+        ids: [1396, 3820],
+      };
+      const tokenizer = new BertWordPieceTokenizer({ vocabContent: vocabEn });
+      const actual = tokenizer.tokenizeWord(input);
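The first new test exercises the isInside path directly: when a word is treated as the continuation of a previous piece, every lookup goes through this.affixes and every emitted token carries the '##' marker. A short sketch of the new cases, with the tokenizer constructed as in the tests (not part of the commit); compare with the plain tokenizeWord('Supervised') call shown earlier:

    tokenizer.tokenizeWord('Supervised', false, true); // isInside = true
    // -> { tokens: ['##S', '##upe', '##r', '##vise', '##d'], ids: [1708, 26939, 1197, 16641, 1181] }

    tokenizer.tokenizeWord('vegan');
    // -> { tokens: ['ve', '##gan'], ids: [1396, 3820] }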
