From d8c72b586105ff722b3b1f9687efabb91dd1ce59 Mon Sep 17 00:00:00 2001
From: "Dr. David A. Kunz"
Date: Thu, 14 Aug 2025 11:38:21 +0200
Subject: [PATCH] no need for tokenizer

---
 lib/tokenizer.js        | 244 ----------------------------------------
 tests/tokenizer.test.js | 187 ------------------------------
 2 files changed, 431 deletions(-)
 delete mode 100644 lib/tokenizer.js
 delete mode 100644 tests/tokenizer.test.js

diff --git a/lib/tokenizer.js b/lib/tokenizer.js
deleted file mode 100644
index d816dcf..0000000
--- a/lib/tokenizer.js
+++ /dev/null
@@ -1,244 +0,0 @@
-/**
- * Custom BERT-style tokenizer implementation
- * Extracted from @huggingface/transformers.js WordPiece tokenization logic
- */
-
-/**
- * Remove accents from text using Unicode normalization
- */
-function removeAccents(text) {
-  return text.normalize('NFD').replace(/\p{Mn}/gu, '')
-}
-
-/**
- * Check if character is a control character
- */
-function isControl(char) {
-  switch (char) {
-    case '\t':
-    case '\n':
-    case '\r':
-      return false
-    default:
-      return /^\p{Cc}|\p{Cf}|\p{Co}|\p{Cs}$/u.test(char)
-  }
-}
-
-/**
- * Clean and normalize text (BERT normalization)
- */
-function normalizeText(text, options = {}) {
-  const { cleanText = true, lowercase = true, stripAccents = true } = options
-
-  if (cleanText) {
-    // Remove control characters and normalize whitespace
-    const output = []
-    for (const char of text) {
-      const cp = char.charCodeAt(0)
-      if (cp === 0 || cp === 0xfffd || isControl(char)) {
-        continue
-      }
-      if (/^\s$/.test(char)) {
-        output.push(' ')
-      } else {
-        output.push(char)
-      }
-    }
-    text = output.join('')
-  }
-
-  if (lowercase) {
-    text = text.toLowerCase()
-    if (stripAccents) {
-      text = removeAccents(text)
-    }
-  } else if (stripAccents) {
-    text = removeAccents(text)
-  }
-
-  return text
-}
-
-/**
- * BERT pre-tokenization - split on whitespace and punctuation
- */
-function preTokenize(text) {
-  const punctuationRegex = '\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E'
-  const pattern = new RegExp(`[^\\s${punctuationRegex}]+|[${punctuationRegex}]`, 'gu')
-  return text.trim().match(pattern) || []
-}
-
-/**
- * WordPiece encoding implementation
- */
-function wordPieceEncode(
-  tokens,
-  vocab,
-  unkToken = '[UNK]',
-  continuingSubwordPrefix = '##',
-  maxInputCharsPerWord = 100
-) {
-  const outputTokens = []
-
-  for (const token of tokens) {
-    const chars = [...token]
-    if (chars.length > maxInputCharsPerWord) {
-      outputTokens.push(unkToken)
-      continue
-    }
-
-    let isUnknown = false
-    let start = 0
-    const subTokens = []
-
-    while (start < chars.length) {
-      let end = chars.length
-      let currentSubstring = null
-
-      while (start < end) {
-        let substr = chars.slice(start, end).join('')
-
-        if (start > 0) {
-          substr = continuingSubwordPrefix + substr
-        }
-
-        if (vocab.has(substr)) {
-          currentSubstring = substr
-          break
-        }
-
-        end--
-      }
-
-      if (currentSubstring === null) {
-        isUnknown = true
-        break
-      }
-
-      subTokens.push(currentSubstring)
-      start = end
-    }
-
-    if (isUnknown) {
-      outputTokens.push(unkToken)
-    } else {
-      outputTokens.push(...subTokens)
-    }
-  }
-
-  return outputTokens
-}
-
-/**
- * Post-processing - add special tokens
- */
-function addSpecialTokens(tokens, tokensPair = null, clsToken = '[CLS]', sepToken = '[SEP]') {
-  let result = [clsToken, ...tokens, sepToken]
-  let tokenTypeIds = new Array(result.length).fill(0)
-
-  if (tokensPair !== null) {
-    result = [...result, ...tokensPair, sepToken]
-    tokenTypeIds = [...tokenTypeIds, ...new Array(tokensPair.length + 1).fill(1)]
-  }
-
-  return { tokens: result, tokenTypeIds }
-}
-
-/**
- * Convert tokens to IDs using vocabulary
- */
-function convertTokensToIds(tokens, vocab, unkTokenId) {
-  return tokens.map(token => vocab.get(token) ?? unkTokenId)
-}
-
-/**
- * Main BERT tokenizer function
- * @param {string} text - Input text to tokenize
- * @param {Map} vocab - Vocabulary mapping tokens to IDs
- * @param {Object} options - Tokenization options
- * @returns {Object} Tokenization result with input_ids, attention_mask, token_type_ids
- */
-function bertTokenize(text, vocab, options = {}) {
-  const {
-    textPair = null,
-    addSpecialTokensFlag = true,
-    maxLength = 512,
-    padding = false,
-    truncation = false,
-    unkToken = '[UNK]',
-    clsToken = '[CLS]',
-    sepToken = '[SEP]',
-    padToken = '[PAD]',
-    continuingSubwordPrefix = '##',
-    normalizationOptions = {}
-  } = options
-
-  // Get token IDs
-  const unkTokenId = vocab.get(unkToken) ?? 100 // Default BERT UNK ID
-  const padTokenId = vocab.get(padToken) ?? 0 // Default BERT PAD ID
-
-  // Step 1: Normalize text
-  const normalizedText = normalizeText(text, normalizationOptions)
-  const normalizedTextPair = textPair ? normalizeText(textPair, normalizationOptions) : null
-
-  // Step 2: Pre-tokenize
-  const preTokens = preTokenize(normalizedText)
-  const preTokensPair = normalizedTextPair ? preTokenize(normalizedTextPair) : null
-
-  // Step 3: WordPiece encode
-  const tokens = wordPieceEncode(preTokens, vocab, unkToken, continuingSubwordPrefix)
-  const tokensPair = preTokensPair ? wordPieceEncode(preTokensPair, vocab, unkToken, continuingSubwordPrefix) : null
-
-  // Step 4: Add special tokens
-  let finalTokens = tokens
-  let tokenTypeIds = null
-
-  if (addSpecialTokensFlag) {
-    const result = addSpecialTokens(tokens, tokensPair, clsToken, sepToken)
-    finalTokens = result.tokens
-    tokenTypeIds = result.tokenTypeIds
-  } else if (tokensPair) {
-    finalTokens = [...tokens, ...tokensPair]
-    tokenTypeIds = [...new Array(tokens.length).fill(0), ...new Array(tokensPair.length).fill(1)]
-  }
-
-  // Step 5: Convert to IDs
-  let inputIds = convertTokensToIds(finalTokens, vocab, unkTokenId)
-
-  // Step 6: Handle truncation
-  if (truncation && inputIds.length > maxLength) {
-    inputIds = inputIds.slice(0, maxLength)
-    if (tokenTypeIds) tokenTypeIds = tokenTypeIds.slice(0, maxLength)
-  }
-
-  // Step 7: Handle padding
-  let attentionMask = new Array(inputIds.length).fill(1)
-  if (padding && inputIds.length < maxLength) {
-    const padLength = maxLength - inputIds.length
-    inputIds = [...inputIds, ...new Array(padLength).fill(padTokenId)]
-    attentionMask = [...attentionMask, ...new Array(padLength).fill(0)]
-    if (tokenTypeIds) tokenTypeIds = [...tokenTypeIds, ...new Array(padLength).fill(0)]
-  }
-
-  const result = {
-    input_ids: inputIds,
-    attention_mask: attentionMask
-  }
-
-  if (tokenTypeIds) {
-    result.token_type_ids = tokenTypeIds
-  }
-
-  return result
-}
-
-export {
-  bertTokenize,
-  normalizeText,
-  preTokenize,
-  wordPieceEncode,
-  addSpecialTokens,
-  convertTokensToIds,
-  removeAccents,
-  isControl
-}
diff --git a/tests/tokenizer.test.js b/tests/tokenizer.test.js
deleted file mode 100644
index f801e2f..0000000
--- a/tests/tokenizer.test.js
+++ /dev/null
@@ -1,187 +0,0 @@
-// Node.js test runner (test) for lib/tokenizer.js
-import { bertTokenize, normalizeText, preTokenize, wordPieceEncode } from '../lib/tokenizer.js'
-import assert from 'node:assert'
-import { test } from 'node:test'
-
-// Mock vocabulary for testing
-const mockVocab = new Map([
-  ['[CLS]', 101],
-  ['[SEP]', 102],
-  ['[UNK]', 100],
-  ['[PAD]', 0],
-  ['hello', 7592],
-  ['world', 2088],
-  ['!', 999],
-  ['this', 2023],
-  ['is', 2003],
-  ['a', 1037],
-  ['test', 3231],
-  ['sentence', 6251],
-  ['.', 1012],
-  [',', 1010],
-  ['how', 2129],
-  ['are', 2024],
-  ['you', 2017],
-  ['doing', 2509],
-  ['today', 2651],
-  ['?', 1029],
-  ['i', 1045],
-  ['love', 2293],
-  ['transform', 10938],
-  ['##ers', 2545], // WordPiece subword
-  ['the', 1996],
-  ['quick', 4248],
-  ['brown', 2829],
-  ['fox', 4419],
-  ['jumps', 14523],
-  ['over', 2058],
-  ['lazy', 13971],
-  ['dog', 3899]
-])
-
-test.describe('tokenizer', () => {
-  test('normalizeText: should clean and lowercase text', () => {
-    const text = 'Hello World!'
-    const normalized = normalizeText(text)
-    assert.strictEqual(normalized, 'hello world!')
-  })
-
-  test('normalizeText: should remove accents when requested', () => {
-    const text = 'café'
-    const normalized = normalizeText(text, { stripAccents: true })
-    assert.strictEqual(normalized, 'cafe')
-  })
-
-  test('preTokenize: should split on whitespace and punctuation', () => {
-    const text = 'Hello world!'
-    const preTokens = preTokenize(text)
-    assert.deepStrictEqual(preTokens, ['Hello', 'world', '!'])
-  })
-
-  test('preTokenize: should handle complex punctuation', () => {
-    const text = 'How are you doing today?'
-    const preTokens = preTokenize(text)
-    assert.deepStrictEqual(preTokens, ['How', 'are', 'you', 'doing', 'today', '?'])
-  })
-
-  test('bertTokenize: should tokenize simple text', () => {
-    const text = 'Hello world!'
-    const result = bertTokenize(text, mockVocab)
-
-    assert(Array.isArray(result.input_ids), 'Should return input_ids array')
-    assert(Array.isArray(result.attention_mask), 'Should return attention_mask array')
-    assert.strictEqual(result.input_ids.length, result.attention_mask.length, 'Arrays should have same length')
-
-    // Should include CLS token at start and SEP token at end
-    assert.strictEqual(result.input_ids[0], 101, 'Should start with CLS token')
-    assert.strictEqual(result.input_ids[result.input_ids.length - 1], 102, 'Should end with SEP token')
-
-    // All attention mask values should be 1 (no padding)
-    assert(
-      result.attention_mask.every(val => val === 1),
-      'All attention mask values should be 1'
-    )
-  })
-
-  test('bertTokenize: should handle unknown tokens', () => {
-    const text = 'unknownword'
-    const result = bertTokenize(text, mockVocab)
-
-    // Should contain UNK token (100)
-    assert(result.input_ids.includes(100), 'Should contain UNK token for unknown word')
-  })
-
-  test('bertTokenize: should handle padding', () => {
-    const text = 'Hello world!'
-    const result = bertTokenize(text, mockVocab, {
-      maxLength: 10,
-      padding: true,
-      truncation: false
-    })
-
-    assert.strictEqual(result.input_ids.length, 10, 'Should pad to maxLength')
-    assert.strictEqual(result.attention_mask.length, 10, 'Should pad attention mask to maxLength')
-
-    // Check that padding tokens (0) are present
-    assert(result.input_ids.includes(0), 'Should contain PAD tokens')
-
-    // Check that attention mask has 0s for padding
-    const paddingCount = result.attention_mask.filter(val => val === 0).length
-    assert(paddingCount > 0, 'Should have 0s in attention mask for padding')
-  })
-
-  test('bertTokenize: should handle truncation', () => {
-    const longText = 'The quick brown fox jumps over the lazy dog and runs very fast'
-    const result = bertTokenize(longText, mockVocab, {
-      maxLength: 8,
-      padding: false,
-      truncation: true
-    })
-
-    assert.strictEqual(result.input_ids.length, 8, 'Should truncate to maxLength')
-    assert.strictEqual(result.attention_mask.length, 8, 'Should truncate attention mask to maxLength')
-  })
-
-  test('bertTokenize: should handle both padding and truncation', () => {
-    const text = 'The quick brown fox jumps over the lazy dog'
-    const result = bertTokenize(text, mockVocab, {
-      maxLength: 15,
-      padding: true,
-      truncation: true
-    })
-
-    assert.strictEqual(result.input_ids.length, 15, 'Should be exactly maxLength')
-    assert.strictEqual(result.attention_mask.length, 15, 'Should be exactly maxLength')
-  })
-
-  test('bertTokenize: should handle sentence pairs', () => {
-    const text1 = 'Hello world!'
-    const text2 = 'How are you?'
-    const result = bertTokenize(text1, mockVocab, {
-      textPair: text2,
-      addSpecialTokensFlag: true
-    })
-
-    assert(Array.isArray(result.input_ids), 'Should return input_ids array')
-    assert(Array.isArray(result.attention_mask), 'Should return attention_mask array')
-    assert(Array.isArray(result.token_type_ids), 'Should return token_type_ids array for sentence pairs')
-
-    // Should have CLS at start and SEP tokens between and at end
-    assert.strictEqual(result.input_ids[0], 101, 'Should start with CLS token')
-
-    // Should have both 0s and 1s in token_type_ids
-    assert(result.token_type_ids.includes(0), 'Should have 0s for first sentence')
-    assert(result.token_type_ids.includes(1), 'Should have 1s for second sentence')
-  })
-
-  test('bertTokenize: should work without special tokens', () => {
-    const text = 'Hello world!'
-    const result = bertTokenize(text, mockVocab, {
-      addSpecialTokensFlag: false
-    })
-
-    // Should not start with CLS or end with SEP
-    assert.notStrictEqual(result.input_ids[0], 101, 'Should not start with CLS token')
-    assert.notStrictEqual(result.input_ids[result.input_ids.length - 1], 102, 'Should not end with SEP token')
-  })
-
-  test('wordPieceEncode: should handle subword tokenization', () => {
-    const tokens = ['transform', 'transformers']
-    const result = wordPieceEncode(tokens, mockVocab)
-
-    // 'transform' should be in vocab as-is
-    assert(result.includes('transform'), 'Should contain base word')
-
-    // 'transformers' should be split into 'transform' + '##ers'
-    if (result.includes('##ers')) {
-      assert(result.includes('transform'), 'Should split unknown word using subword tokens')
-    }
-  })
-
-  test('wordPieceEncode: should handle unknown words', () => {
-    const tokens = ['completelyunknownword']
-    const result = wordPieceEncode(tokens, mockVocab)
-
-    assert(result.includes('[UNK]'), 'Should return UNK token for completely unknown words')
-  })
-})