In [1]:
import os

# Project root. Defaults to the original workstation path, but can be redirected
# with the MODERN_HEBREW_BERT_ROOT environment variable so the notebook also runs
# on other machines without editing this cell.
PROJECT_ROOT = os.environ.get(
    "MODERN_HEBREW_BERT_ROOT",
    "/home/nlp/achimoa/workspace/ModernHebrewBERT",
)

# Later cells use paths relative to this directory (e.g. "tokenizer/spiece.model").
os.chdir(PROJECT_ROOT)
os.getcwd()

'/home/nlp/achimoa/workspace/ModernHebrewBERT'

In [10]:
import sentencepiece as spm

# Path of the trained SentencePiece model, relative to the current working directory.
model_file = "tokenizer/spiece.model"
# Raw SentencePiece processor. NOTE: the name `tokenizer` is re-bound to HuggingFace
# tokenizer objects by later cells, so its type depends on which cells have run.
tokenizer = spm.SentencePieceProcessor(model_file=model_file)

In [None]:
import sentencepiece as spm

# Sanity checks for the raw SentencePiece model: special tokens, encode/decode
# round trip, vocabulary size and out-of-vocabulary handling.
sp = tokenizer  # the SentencePieceProcessor created from tokenizer/spiece.model

# 1. Special tokens must be real vocabulary pieces, i.e. must not resolve to <unk>.
for special_piece in ("[CLS]", "[SEP]", "[MASK]"):
    assert sp.piece_to_id(special_piece) != sp.unk_id(), (
        f"{special_piece} is not in the SentencePiece vocabulary"
    )

# 2. Basic encode/decode on a Hebrew sentence ("Hello world").
text = "שלום עולם"
encoded = sp.encode(text, out_type=str)
decoded = sp.decode(sp.encode(text))
print("Original:", text)
print("Encoded:", encoded)
print("Decoded:", decoded)

# 3. Round-trip test. Whitespace is ignored because the tokenizer may normalise it,
#    but the remaining characters must match exactly. A substring test (`in`) would
#    also accept an empty or truncated decode.
assert decoded.replace(" ", "") == text.replace(" ", ""), (
    f"round trip changed the text: {text!r} -> {decoded!r}"
)

# 4. Vocabulary size (printed only: the trained size may differ slightly from the
#    requested 100_000, so no hard assertion is made here).
print("Vocab size:", sp.get_piece_size())

# 5. OOV handling: a character the model is not expected to know should produce <unk>.
#    Written as an escape (U+10348, GOTHIC LETTER HWAIR) so that re-encoding the
#    notebook file cannot silently corrupt the literal.
#    NOTE(review): assumes the model was trained without byte fallback — confirm.
unknown_text = "\U00010348"
unk_id = sp.unk_id()
encoded_unk = sp.encode(unknown_text)

assert unk_id in encoded_unk, (
    f"expected <unk> (id {unk_id}) when encoding {unknown_text!r}, got {encoded_unk}"
)


Original: שלום עולם
Encoded: ['▁שלום', '▁עולם']
Decoded: שלום עולם
Vocab size: 100000


In [32]:
import sentencepiece as spm
from transformers import AutoTokenizer

# Compare the raw SentencePiece model with the HuggingFace tokenizer saved from it:
# both must produce the same pieces and the same ids for the same text.

# Load the original SentencePiece tokenizer
sp = spm.SentencePieceProcessor()
sp.load("tokenizer/spiece.model")

# Load the saved HuggingFace tokenizer (slow, SentencePiece-backed implementation)
hf_tokenizer = AutoTokenizer.from_pretrained("tokenizer", use_fast=False)

# Test input: Hebrew for "Hello world"
text = "שלום עולם"

# --- Original SentencePiece ---
sp_tokens = sp.encode(text, out_type=str)
sp_ids = sp.encode(text, out_type=int)
print("🔹 SentencePiece tokens: ", sp_tokens)
print("🔹 SentencePiece IDs:    ", sp_ids)

# --- HuggingFace tokenizer ---
# add_special_tokens=False keeps the id list comparable to the raw SentencePiece ids.
hf_tokens = hf_tokenizer.tokenize(text)
hf_ids = hf_tokenizer.encode(text, add_special_tokens=False)
print("🔸 HF tokenizer tokens:   ", hf_tokens)
print("🔸 HF tokenizer IDs:      ", hf_ids)

# --- Check decoded text ---
print("🔹 SP decoded:", sp.decode(sp_ids))
print("🔸 HF decoded:", hf_tokenizer.decode(hf_ids))

# --- Fail loudly if the two tokenizers disagree (checked after printing so the
#     values above are visible when an assertion fires) ---
assert hf_tokens == sp_tokens, f"token mismatch: SP={sp_tokens} HF={hf_tokens}"
assert hf_ids == sp_ids, f"id mismatch: SP={sp_ids} HF={hf_ids}"


🔹 SentencePiece tokens:  ['▁שלום', '▁עולם']
🔹 SentencePiece IDs:     [3078, 3221]
🔸 HF tokenizer tokens:    ['▁שלום', '▁עולם']
🔸 HF tokenizer IDs:       [3078, 3221]
🔹 SP decoded: שלום עולם
🔸 HF decoded: שלום עולם


In [35]:
# Base vocabulary size of the HuggingFace tokenizer. `hf_tokenizer` (loaded in the
# comparison cell) is used instead of `tokenizer`, because in top-to-bottom order
# `tokenizer` is still the raw SentencePieceProcessor at this point in the notebook.
hf_tokenizer.vocab_size

100000

In [34]:
# Padding check on the HuggingFace tokenizer: encode "Hello world" (Hebrew) and pad
# to 1024 positions. `hf_tokenizer` is used because, in top-to-bottom order,
# `tokenizer` is still the raw SentencePieceProcessor here, which does not offer
# this padding/truncation call. Note that the cell displays all 1024 ids.
hf_tokenizer("שלום עולם", truncation=True, padding="max_length", max_length=1024)

{'input_ids': [2, 3078, 3221, 3, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001,

In [33]:
# Inspect the pad token object of the HuggingFace tokenizer. `hf_tokenizer` is used
# because, in top-to-bottom order, `tokenizer` is still the raw SentencePieceProcessor
# here. `_pad_token` is a private attribute; it is kept so the cell still shows the
# full AddedToken representation rather than just the token string.
hf_tokenizer._pad_token

AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)

In [15]:
# Encode a longer Hebrew sentence ("There are many animals on Earth") to token ids.
# The literal is restored to real Hebrew; the mis-encoded form would not reproduce
# the recorded five-id result.
output = tokenizer.encode("יש הרבה חיות בכדור ארץ")
print(output)


[217, 1212, 8376, 12480, 5399]


In [29]:
from transformers import AlbertTokenizer

# Special-token strings handed to the HuggingFace wrapper.
# NOTE(review): the recorded special-token listing shows [UNK] -> 100000 and
# [PAD] -> 100001 while vocab_size is 100000, so these two look like added tokens
# rather than pieces of the SentencePiece model — confirm this is intended.
special_tokens = dict(
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Wrap the trained SentencePiece model in an ALBERT-style tokenizer.
tokenizer = AlbertTokenizer(vocab_file="tokenizer/spiece.model", **special_tokens)

# Write the HuggingFace files (tokenizer_config.json, etc.) next to the model;
# the returned tuple of written paths is the cell's displayed output.
tokenizer.save_pretrained("tokenizer")


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json')

In [30]:
from transformers import AutoTokenizer
# Reload the tokenizer from the saved "tokenizer" directory; use_fast=False selects
# the slow, SentencePiece-backed implementation. This re-binds `tokenizer`.
# NOTE(review): the following cell repeats this exact load.
tokenizer = AutoTokenizer.from_pretrained("tokenizer", use_fast=False)


In [31]:
from transformers import AutoTokenizer

# Smoke test of the saved HuggingFace tokenizer: encode, decode, tokenize and
# list the special tokens with their ids.

# Load the tokenizer (slow version from SentencePiece)
tokenizer = AutoTokenizer.from_pretrained("tokenizer", use_fast=False)

# Sample input: Hebrew for "Hello world"
text = "שלום עולם"

# Encode the text into token IDs (special tokens included)
encoded = tokenizer.encode(text, add_special_tokens=True)
print("Encoded IDs:", encoded)

# Decode back to text
decoded = tokenizer.decode(encoded)
print("Decoded text:", decoded)

# Tokenize to individual tokens
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

# Show token → ID mapping
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)

# Special tokens check: print each special token with its id.
# NOTE(review): in the recorded output [UNK] and [PAD] have ids 100000 and 100001,
# i.e. not below vocab_size (100000) — presumably a model's embedding table must be
# sized with len(tokenizer) rather than tokenizer.vocab_size; verify.
print("Special tokens:")
for role in ("pad", "unk", "cls", "sep", "mask"):
    print(
        f"  [{role.upper()}]:",
        getattr(tokenizer, f"{role}_token"),
        getattr(tokenizer, f"{role}_token_id"),
    )


Encoded IDs: [2, 3078, 3221, 3]
Decoded text: [CLS] שלום עולם[SEP]
Tokens: ['▁שלום', '▁עולם']
Token IDs: [3078, 3221]
Special tokens:
  [PAD]: [PAD] 100001
  [UNK]: [UNK] 100000
  [CLS]: [CLS] 2
  [SEP]: [SEP] 3
  [MASK]: [MASK] 4


In [16]:
from transformers import T5Tokenizer

# Load the same "tokenizer/" directory through the T5 tokenizer class.
# This re-binds `tokenizer`, replacing whatever object earlier cells stored there.
tokenizer = T5Tokenizer.from_pretrained("tokenizer/")


In [17]:
# Inspect the pad token object of the T5 tokenizer via the private `_pad_token`
# attribute. The recorded value is "<pad>", not the "[PAD]" string passed to the
# ALBERT tokenizer in the earlier cell.
tokenizer._pad_token

AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)

In [12]:
# Tokenize Hebrew "Hello world" with the T5 tokenizer and return PyTorch tensors.
# The literal is restored to real Hebrew so the call matches the recorded ids.
output = tokenizer("שלום עולם", return_tensors="pt")
print(output)


{'input_ids': tensor([[  3078,   3221, 100100]]), 'attention_mask': tensor([[1, 1, 1]])}


In [13]:
# Vocabulary size reported by the T5 tokenizer.
# NOTE(review): the recorded value is 100000, yet the recorded encoding in the
# previous cell ends with id 100100, so this count apparently excludes some tokens —
# presumably len(tokenizer) gives the full size; verify before sizing embeddings.
tokenizer.vocab_size

100000