In [2]:
import os
import torch

# Redirect the Hugging Face caches and the temp directory onto /ephemeral.
# The mapping is applied to the process environment in one step, then every
# target directory is created so later writes do not hit a missing path.
# NOTE(review): presumably these variables are read when the Hugging Face
# libraries are imported, so this cell should stay ahead of those imports —
# confirm.
_cache_locations = {
    "HF_DATASETS_CACHE": "/ephemeral/hf_cache",
    "TRANSFORMERS_CACHE": "/ephemeral/transformers_cache",
    "TMPDIR": "/ephemeral/tmp",
}
os.environ.update(_cache_locations)

for _directory in _cache_locations.values():
    os.makedirs(_directory, exist_ok=True)

In [3]:
from transformers import LlamaTokenizer
import sentencepiece as spm
import os

# -----------------------------
# Load SentencePiece model
# -----------------------------
sentencepiece_model_path = "/workspace/mongolian_tokenizer.model"
output_dir = "/workspace/mongolian_llama_tokenizer"  # Directory to save Hugging Face tokenizer

# Load the raw SentencePiece model only to report its vocabulary size.
sp = spm.SentencePieceProcessor(model_file=sentencepiece_model_path)
print("Original SentencePiece Vocabulary Size:", sp.vocab_size())

# -----------------------------
# Wrap into Hugging Face format
# -----------------------------
# Use LlamaTokenizer or PreTrainedTokenizerFast
tokenizer = LlamaTokenizer(vocab_file=sentencepiece_model_path)

# -----------------------------
# Add Special Tokens
# -----------------------------
# Register the pad/eos/bos/unk strings as the tokenizer's special tokens.
tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>"
})

# Save the tokenizer in Hugging Face format.
# exist_ok=True avoids the check-then-create race and matches how the cache
# directories are created earlier in the notebook.
os.makedirs(output_dir, exist_ok=True)

tokenizer.save_pretrained(output_dir)
print(f"Tokenizer saved to {output_dir}")

# Smoke test: encode a short sentence and decode it back.
text = "сайн байна уу?"
encoded = tokenizer(text)
print("Encoded text:", encoded)
decoded = tokenizer.decode(encoded["input_ids"])
print("Decoded text:", decoded)

# Make a lossy round trip visible instead of leaving it buried in the printed
# ids: any unknown-token id means part of the input could not be represented.
if tokenizer.unk_token_id in encoded["input_ids"]:
    print(
        f"Warning: the sample text contains pieces mapped to {tokenizer.unk_token}; "
        "the tokenizer cannot round-trip it."
    )


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


Original SentencePiece Vocabulary Size: 32000
Tokenizer saved to /workspace/mongolian_llama_tokenizer
Encoded text: {'input_ids': [1, 63, 4, 36, 0], 'attention_mask': [1, 1, 1, 1, 1]}
Decoded text: <s> сайн байна уу<unk>


In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from transformers import LlamaTokenizer, LlamaTokenizerFast

# Log in to Hugging Face.
# The access token is taken from the HF_TOKEN environment variable, falling
# back to an interactive prompt, so no credential is stored in the notebook.
# NOTE(review): a token literal was previously embedded in this cell; treat it
# as leaked and revoke it in the Hugging Face account settings.
import os
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# Upload the tokenizer currently bound to `tokenizer` to the Hub repository.
tokenizer.push_to_hub("Billyyy/mongolian-tokenizer-unigram")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Billyyy/mongolian-tokenizer-unigram/commit/b54741a2e0308d38ba637879dc01bff916b3b737', commit_message='Upload tokenizer', commit_description='', oid='b54741a2e0308d38ba637879dc01bff916b3b737', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Billyyy/mongolian-tokenizer-unigram', endpoint='https://huggingface.co', repo_type='model', repo_id='Billyyy/mongolian-tokenizer-unigram'), pr_revision=None, pr_num=None)

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# The source tokenizer and the causal-LM weights are both loaded from the same
# Hub checkpoint, so the id is spelled once and shared by the two loads.
base_checkpoint = "meta-llama/Llama-3.2-1B"
source_tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
source_model = AutoModelForCausalLM.from_pretrained(base_checkpoint)

In [None]:
# Load a second copy of the base Llama-3.2-1B tokenizer. This rebinds the
# notebook-level name `tokenizer` (earlier bound to the SentencePiece-backed
# LlamaTokenizer); later cells add tokens to this object and alias it as
# `target_tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

In [6]:
from transformers import AutoTokenizer
import sentencepiece as spm

# Location of the SentencePiece model file.
sentencepiece_model_path = "/workspace/mongolian_tokenizer.model"

# Build the processor straight from that file.
sp = spm.SentencePieceProcessor(model_file=sentencepiece_model_path)

# Sanity check: report how many pieces the model holds.
piece_count = sp.vocab_size()
print("Vocabulary size:", piece_count)


Vocabulary size: 32000


In [7]:
# Collect every piece of the SentencePiece model, in id order.
sentencepiece_vocab = list(map(sp.id_to_piece, range(sp.vocab_size())))

# Keep only the pieces that are not already keys of the base tokenizer's
# vocabulary, preserving their order.
existing_vocab = tokenizer.get_vocab()
new_tokens = []
for _piece in sentencepiece_vocab:
    if _piece not in existing_vocab:
        new_tokens.append(_piece)

print(f"Number of new tokens to add: {len(new_tokens)}")
print("Sample new tokens:", new_tokens[:10])

# Register the missing pieces on the base tokenizer; the call's return value is
# the cell output.
# NOTE(review): this mutates `tokenizer` in place, so re-running the cell will
# presumably find nothing new to add. The printed sample also shows '<unk>',
# '<s>' and '</s>' among the additions — confirm they are meant to be added as
# ordinary tokens.
tokenizer.add_tokens(new_tokens)

Number of new tokens to add: 32000
Sample new tokens: ['<unk>', '<s>', '</s>', '▁нь', '▁байна', '▁юм', '▁энэ', '▁', '▁ч', '▁байгаа']


32000

In [8]:
# Alias the extended tokenizer under the name passed to FOCUS further down;
# both names refer to the same object, no copy is made.
target_tokenizer = tokenizer

In [9]:
# Display the size of the extended tokenizer; this value is later used as the
# new embedding row count when the model is resized.
len(target_tokenizer)

160256

In [13]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from deepfocus import FOCUS  # Ensure `deepfocus` library is installed
import sentencepiece as spm

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer loaded from the Hub repository and handed to FOCUS as
# `extend_tokenizer`.
extend_tokenizer = AutoTokenizer.from_pretrained("Billyyy/mongolian-tokenizer-unigram")

print("Running FOCUS to align embeddings...")
target_embeddings = FOCUS(
    source_embeddings=source_model.get_input_embeddings().weight,
    source_tokenizer=source_tokenizer,
    target_tokenizer=target_tokenizer,
    target_training_data_path="/ephemeral/cc100_mn_cleaned.txt",  # Path to your data
    fasttext_model_dim=2048,
    device="cpu",  # FOCUS is run on CPU here; DEVICE is not used for this call
    extend_tokenizer=extend_tokenizer,
    processes=27,
    fasttext_model_epochs=1,
    verbosity="silent",
    # Optional: Provide pre-trained FastText model for better alignment
    # fasttext_model_path="/path/to/fasttext.bin",
)

# Validate the result before touching the model: assigning through `.data`
# performs no shape checking, so a mismatch would otherwise go unnoticed until
# the model is used.
expected_shape = (
    len(target_tokenizer),
    source_model.get_input_embeddings().weight.shape[1],
)
if tuple(target_embeddings.shape) != expected_shape:
    raise ValueError(
        f"FOCUS returned embeddings of shape {tuple(target_embeddings.shape)}, "
        f"expected {expected_shape}"
    )

# Update input embeddings with aligned embeddings.
# The tensor is moved/cast to match the resized embedding matrix, which is a
# no-op when device and dtype already agree.
source_model.resize_token_embeddings(len(target_tokenizer))
input_embeddings = source_model.get_input_embeddings()
input_embeddings.weight.data = target_embeddings.to(
    device=input_embeddings.weight.device,
    dtype=input_embeddings.weight.dtype,
)

print("FOCUS alignment complete!")


Running FOCUS to align embeddings...


FOCUS initialization...: 100%|██████████| 31384/31384 [00:19<00:00, 1578.85it/s]                                     
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


FOCUS alignment complete!


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from transformers import LlamaTokenizer, LlamaTokenizerFast

# Log in to Hugging Face.
# The access token is taken from the HF_TOKEN environment variable, falling
# back to an interactive prompt, so no credential is stored in the notebook.
# NOTE(review): a token literal was previously embedded in this cell; treat it
# as leaked and revoke it in the Hugging Face account settings.
import os
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# Upload the extended tokenizer and the model with the re-initialised
# embeddings to the same Hub repository.
target_tokenizer.push_to_hub("Billyyy/extended_llama_mongolian")
source_model.push_to_hub("Billyyy/extended_llama_mongolian")

tokenizer.json:   0%|          | 0.00/23.4M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/218M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Billyyy/extended_llama_mongolian/commit/5f5a4aab00bc1cc699a472777ff0e7605c2a2589', commit_message='Upload LlamaForCausalLM', commit_description='', oid='5f5a4aab00bc1cc699a472777ff0e7605c2a2589', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Billyyy/extended_llama_mongolian', endpoint='https://huggingface.co', repo_type='model', repo_id='Billyyy/extended_llama_mongolian'), pr_revision=None, pr_num=None)