From d8c72b586105ff722b3b1f9687efabb91dd1ce59 Mon Sep 17 00:00:00 2001
From: "Dr. David A. Kunz"
Date: Thu, 14 Aug 2025 11:38:21 +0200
Subject: [PATCH] no need for tokenizer

---
 lib/tokenizer.js        | 244 ----------------------------------------
 tests/tokenizer.test.js | 187 ------------------------------
 2 files changed, 431 deletions(-)
 delete mode 100644 lib/tokenizer.js
 delete mode 100644 tests/tokenizer.test.js

diff --git a/lib/tokenizer.js b/lib/tokenizer.js
deleted file mode 100644
index d816dcf..0000000
--- a/lib/tokenizer.js
+++ /dev/null
@@ -1,244 +0,0 @@
-/**
- * Custom BERT-style tokenizer implementation
- * Extracted from @huggingface/transformers.js WordPiece tokenization logic
- */
-
-/**
- * Remove accents from text using Unicode normalization
- */
-function removeAccents(text) {
-  return text.normalize('NFD').replace(/\p{Mn}/gu, '')
-}
-
-/**
- * Check if character is a control character
- */
-function isControl(char) {
-  switch (char) {
-    case '\t':
-    case '\n':
-    case '\r':
-      return false
-    default:
-      return /^\p{Cc}|\p{Cf}|\p{Co}|\p{Cs}$/u.test(char)
-  }
-}
-
-/**
- * Clean and normalize text (BERT normalization)
- */
-function normalizeText(text, options = {}) {
-  const { cleanText = true, lowercase = true, stripAccents = true } = options
-
-  if (cleanText) {
-    // Remove control characters and normalize whitespace
-    const output = []
-    for (const char of text) {
-      const cp = char.charCodeAt(0)
-      if (cp === 0 || cp === 0xfffd || isControl(char)) {
-        continue
-      }
-      if (/^\s$/.test(char)) {
-        output.push(' ')
-      } else {
-        output.push(char)
-      }
-    }
-    text = output.join('')
-  }
-
-  if (lowercase) {
-    text = text.toLowerCase()
-    if (stripAccents) {
-      text = removeAccents(text)
-    }
-  } else if (stripAccents) {
-    text = removeAccents(text)
-  }
-
-  return text
-}
-
-/**
- * BERT pre-tokenization - split on whitespace and punctuation
- */
-function preTokenize(text) {
-  const punctuationRegex = '\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E'
-  const pattern = new RegExp(`[^\\s${punctuationRegex}]+|[${punctuationRegex}]`, 'gu')
-  return text.trim().match(pattern) || []
-}
-
-/**
- * WordPiece encoding implementation
- */
-function wordPieceEncode(
-  tokens,
-  vocab,
-  unkToken = '[UNK]',
-  continuingSubwordPrefix = '##',
-  maxInputCharsPerWord = 100
-) {
-  const outputTokens = []
-
-  for (const token of tokens) {
-    const chars = [...token]
-    if (chars.length > maxInputCharsPerWord) {
-      outputTokens.push(unkToken)
-      continue
-    }
-
-    let isUnknown = false
-    let start = 0
-    const subTokens = []
-
-    while (start < chars.length) {
-      let end = chars.length
-      let currentSubstring = null
-
-      while (start < end) {
-        let substr = chars.slice(start, end).join('')
-
-        if (start > 0) {
-          substr = continuingSubwordPrefix + substr
-        }
-
-        if (vocab.has(substr)) {
-          currentSubstring = substr
-          break
-        }
-
-        end--
-      }
-
-      if (currentSubstring === null) {
-        isUnknown = true
-        break
-      }
-
-      subTokens.push(currentSubstring)
-      start = end
-    }
-
-    if (isUnknown) {
-      outputTokens.push(unkToken)
-    } else {
-      outputTokens.push(...subTokens)
-    }
-  }
-
-  return outputTokens
-}
-
-/**
- * Post-processing - add special tokens
- */
-function addSpecialTokens(tokens, tokensPair = null, clsToken = '[CLS]', sepToken = '[SEP]') {
-  let result = [clsToken, ...tokens, sepToken]
-  let tokenTypeIds = new Array(result.length).fill(0)
-
-  if (tokensPair !== null) {
-    result = [...result, ...tokensPair, sepToken]
-    tokenTypeIds = [...tokenTypeIds, ...new Array(tokensPair.length + 1).fill(1)]
-  }
-
-  return { tokens: result, tokenTypeIds }
-}
-
-/**
- * Convert tokens to IDs using vocabulary
- */
-function convertTokensToIds(tokens, vocab, unkTokenId) {
-  return tokens.map(token => vocab.get(token) ?? unkTokenId)
-}
-
-/**
- * Main BERT tokenizer function
- * @param {string} text - Input text to tokenize
- * @param {Map} vocab - Vocabulary mapping tokens to IDs
- * @param {Object} options - Tokenization options
- * @returns {Object} Tokenization result with input_ids, attention_mask, token_type_ids
- */
-function bertTokenize(text, vocab, options = {}) {
-  const {
-    textPair = null,
-    addSpecialTokensFlag = true,
-    maxLength = 512,
-    padding = false,
-    truncation = false,
-    unkToken = '[UNK]',
-    clsToken = '[CLS]',
-    sepToken = '[SEP]',
-    padToken = '[PAD]',
-    continuingSubwordPrefix = '##',
-    normalizationOptions = {}
-  } = options
-
-  // Get token IDs
-  const unkTokenId = vocab.get(unkToken) ?? 100 // Default BERT UNK ID
-  const padTokenId = vocab.get(padToken) ?? 0 // Default BERT PAD ID
-
-  // Step 1: Normalize text
-  const normalizedText = normalizeText(text, normalizationOptions)
-  const normalizedTextPair = textPair ? normalizeText(textPair, normalizationOptions) : null
-
-  // Step 2: Pre-tokenize
-  const preTokens = preTokenize(normalizedText)
-  const preTokensPair = normalizedTextPair ? preTokenize(normalizedTextPair) : null
-
-  // Step 3: WordPiece encode
-  const tokens = wordPieceEncode(preTokens, vocab, unkToken, continuingSubwordPrefix)
-  const tokensPair = preTokensPair ? wordPieceEncode(preTokensPair, vocab, unkToken, continuingSubwordPrefix) : null
-
-  // Step 4: Add special tokens
-  let finalTokens = tokens
-  let tokenTypeIds = null
-
-  if (addSpecialTokensFlag) {
-    const result = addSpecialTokens(tokens, tokensPair, clsToken, sepToken)
-    finalTokens = result.tokens
-    tokenTypeIds = result.tokenTypeIds
-  } else if (tokensPair) {
-    finalTokens = [...tokens, ...tokensPair]
-    tokenTypeIds = [...new Array(tokens.length).fill(0), ...new Array(tokensPair.length).fill(1)]
-  }
-
-  // Step 5: Convert to IDs
-  let inputIds = convertTokensToIds(finalTokens, vocab, unkTokenId)
-
-  // Step 6: Handle truncation
-  if (truncation && inputIds.length > maxLength) {
-    inputIds = inputIds.slice(0, maxLength)
-    if (tokenTypeIds) tokenTypeIds = tokenTypeIds.slice(0, maxLength)
-  }
-
-  // Step 7: Handle padding
-  let attentionMask = new Array(inputIds.length).fill(1)
-  if (padding && inputIds.length < maxLength) {
-    const padLength = maxLength - inputIds.length
-    inputIds = [...inputIds, ...new Array(padLength).fill(padTokenId)]
-    attentionMask = [...attentionMask, ...new Array(padLength).fill(0)]
-    if (tokenTypeIds) tokenTypeIds = [...tokenTypeIds, ...new Array(padLength).fill(0)]
-  }
-
-  const result = {
-    input_ids: inputIds,
-    attention_mask: attentionMask
-  }
-
-  if (tokenTypeIds) {
-    result.token_type_ids = tokenTypeIds
-  }
-
-  return result
-}
-
-export {
-  bertTokenize,
-  normalizeText,
-  preTokenize,
-  wordPieceEncode,
-  addSpecialTokens,
-  convertTokensToIds,
-  removeAccents,
-  isControl
-}
diff --git a/tests/tokenizer.test.js b/tests/tokenizer.test.js
deleted file mode 100644
index f801e2f..0000000
--- a/tests/tokenizer.test.js
+++ /dev/null
@@ -1,187 +0,0 @@
-// Node.js test runner (test) for lib/tokenizer.js
-import { bertTokenize, normalizeText, preTokenize, wordPieceEncode } from '../lib/tokenizer.js'
-import assert from 'node:assert'
-import { test } from 'node:test'
-
-// Mock vocabulary for testing
-const mockVocab = new Map([
-  ['[CLS]', 101],
-  ['[SEP]', 102],
-  ['[UNK]', 100],
-  ['[PAD]', 0],
-  ['hello', 7592],
-  ['world', 2088],
-  ['!', 999],
-  ['this', 2023],
-  ['is', 2003],
-  ['a', 1037],
-  ['test', 3231],
-  ['sentence', 6251],
-  ['.', 1012],
-  [',', 1010],
-  ['how', 2129],
-  ['are', 2024],
-  ['you', 2017],
-  ['doing', 2509],
-  ['today', 2651],
-  ['?', 1029],
-  ['i', 1045],
-  ['love', 2293],
-  ['transform', 10938],
-  ['##ers', 2545], // WordPiece subword
-  ['the', 1996],
-  ['quick', 4248],
-  ['brown', 2829],
-  ['fox', 4419],
-  ['jumps', 14523],
-  ['over', 2058],
-  ['lazy', 13971],
-  ['dog', 3899]
-])
-
-test.describe('tokenizer', () => {
-  test('normalizeText: should clean and lowercase text', () => {
-    const text = 'Hello World!'
-    const normalized = normalizeText(text)
-    assert.strictEqual(normalized, 'hello world!')
-  })
-
-  test('normalizeText: should remove accents when requested', () => {
-    const text = 'café'
-    const normalized = normalizeText(text, { stripAccents: true })
-    assert.strictEqual(normalized, 'cafe')
-  })
-
-  test('preTokenize: should split on whitespace and punctuation', () => {
-    const text = 'Hello world!'
-    const preTokens = preTokenize(text)
-    assert.deepStrictEqual(preTokens, ['Hello', 'world', '!'])
-  })
-
-  test('preTokenize: should handle complex punctuation', () => {
-    const text = 'How are you doing today?'
-    const preTokens = preTokenize(text)
-    assert.deepStrictEqual(preTokens, ['How', 'are', 'you', 'doing', 'today', '?'])
-  })
-
-  test('bertTokenize: should tokenize simple text', () => {
-    const text = 'Hello world!'
-    const result = bertTokenize(text, mockVocab)
-
-    assert(Array.isArray(result.input_ids), 'Should return input_ids array')
-    assert(Array.isArray(result.attention_mask), 'Should return attention_mask array')
-    assert.strictEqual(result.input_ids.length, result.attention_mask.length, 'Arrays should have same length')
-
-    // Should include CLS token at start and SEP token at end
-    assert.strictEqual(result.input_ids[0], 101, 'Should start with CLS token')
-    assert.strictEqual(result.input_ids[result.input_ids.length - 1], 102, 'Should end with SEP token')
-
-    // All attention mask values should be 1 (no padding)
-    assert(
-      result.attention_mask.every(val => val === 1),
-      'All attention mask values should be 1'
-    )
-  })
-
-  test('bertTokenize: should handle unknown tokens', () => {
-    const text = 'unknownword'
-    const result = bertTokenize(text, mockVocab)
-
-    // Should contain UNK token (100)
-    assert(result.input_ids.includes(100), 'Should contain UNK token for unknown word')
-  })
-
-  test('bertTokenize: should handle padding', () => {
-    const text = 'Hello world!'
-    const result = bertTokenize(text, mockVocab, {
-      maxLength: 10,
-      padding: true,
-      truncation: false
-    })
-
-    assert.strictEqual(result.input_ids.length, 10, 'Should pad to maxLength')
-    assert.strictEqual(result.attention_mask.length, 10, 'Should pad attention mask to maxLength')
-
-    // Check that padding tokens (0) are present
-    assert(result.input_ids.includes(0), 'Should contain PAD tokens')
-
-    // Check that attention mask has 0s for padding
-    const paddingCount = result.attention_mask.filter(val => val === 0).length
-    assert(paddingCount > 0, 'Should have 0s in attention mask for padding')
-  })
-
-  test('bertTokenize: should handle truncation', () => {
-    const longText = 'The quick brown fox jumps over the lazy dog and runs very fast'
-    const result = bertTokenize(longText, mockVocab, {
-      maxLength: 8,
-      padding: false,
-      truncation: true
-    })
-
-    assert.strictEqual(result.input_ids.length, 8, 'Should truncate to maxLength')
-    assert.strictEqual(result.attention_mask.length, 8, 'Should truncate attention mask to maxLength')
-  })
-
-  test('bertTokenize: should handle both padding and truncation', () => {
-    const text = 'The quick brown fox jumps over the lazy dog'
-    const result = bertTokenize(text, mockVocab, {
-      maxLength: 15,
-      padding: true,
-      truncation: true
-    })
-
-    assert.strictEqual(result.input_ids.length, 15, 'Should be exactly maxLength')
-    assert.strictEqual(result.attention_mask.length, 15, 'Should be exactly maxLength')
-  })
-
-  test('bertTokenize: should handle sentence pairs', () => {
-    const text1 = 'Hello world!'
-    const text2 = 'How are you?'
-    const result = bertTokenize(text1, mockVocab, {
-      textPair: text2,
-      addSpecialTokensFlag: true
-    })
-
-    assert(Array.isArray(result.input_ids), 'Should return input_ids array')
-    assert(Array.isArray(result.attention_mask), 'Should return attention_mask array')
-    assert(Array.isArray(result.token_type_ids), 'Should return token_type_ids array for sentence pairs')
-
-    // Should have CLS at start and SEP tokens between and at end
-    assert.strictEqual(result.input_ids[0], 101, 'Should start with CLS token')
-
-    // Should have both 0s and 1s in token_type_ids
-    assert(result.token_type_ids.includes(0), 'Should have 0s for first sentence')
-    assert(result.token_type_ids.includes(1), 'Should have 1s for second sentence')
-  })
-
-  test('bertTokenize: should work without special tokens', () => {
-    const text = 'Hello world!'
-    const result = bertTokenize(text, mockVocab, {
-      addSpecialTokensFlag: false
-    })
-
-    // Should not start with CLS or end with SEP
-    assert.notStrictEqual(result.input_ids[0], 101, 'Should not start with CLS token')
-    assert.notStrictEqual(result.input_ids[result.input_ids.length - 1], 102, 'Should not end with SEP token')
-  })
-
-  test('wordPieceEncode: should handle subword tokenization', () => {
-    const tokens = ['transform', 'transformers']
-    const result = wordPieceEncode(tokens, mockVocab)
-
-    // 'transform' should be in vocab as-is
-    assert(result.includes('transform'), 'Should contain base word')
-
-    // 'transformers' should be split into 'transform' + '##ers'
-    if (result.includes('##ers')) {
-      assert(result.includes('transform'), 'Should split unknown word using subword tokens')
-    }
-  })
-
-  test('wordPieceEncode: should handle unknown words', () => {
-    const tokens = ['completelyunknownword']
-    const result = wordPieceEncode(tokens, mockVocab)
-
-    assert(result.includes('[UNK]'), 'Should return UNK token for completely unknown words')
-  })
-})