In [3]:
from datasets import load_dataset

ds = load_dataset('billingsmoore/Aggregated-bo-en', split='train')

## Train Tokenizer for Tibetan

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

Note that the tokenizer already contains some Tibetan tokens but not enough.

In [None]:
enc = tokenizer.encode('ཡུན་རིང་དུས་ནས་ཆོས་ཀྱིས་བསྐྱངས་བའི་བུ། ')
tokenizer.decode(enc)

In [6]:
from tokenizers import BertWordPieceTokenizer

new_tokenizer = BertWordPieceTokenizer(lowercase=False, strip_accents=False)

In [None]:
new_tokenizer.train_from_iterator(
    ds['bo'],
    vocab_size=len(tokenizer.get_vocab()),
    min_frequency=3,
    show_progress=True,
    limit_alphabet=500
)

### Convert New Tokenizer to AutoTokenizer Format

In [None]:
new_tokenizer.save_model('new_tokenizer')

In [None]:
from transformers import BertTokenizerFast

# Load the trained tokenizer
fast_tokenizer = BertTokenizerFast(
    vocab_file="new_tokenizer/vocab.txt",
    do_lower_case=False
)

# Save in Hugging Face format
fast_tokenizer.save_pretrained("fast_tokenizer")

In [10]:
from transformers import AutoTokenizer

# Load your new tokenizer
new_fast_tokenizer = AutoTokenizer.from_pretrained("fast_tokenizer")

In [None]:
enc = new_fast_tokenizer.encode('ཡུན་རིང་དུས་ནས་ཆོས་ཀྱིས་བསྐྱངས་བའི་བུ། ')
new_fast_tokenizer.decode(enc)

## Train Embedding Model

### Load the Models

In [1]:
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    models
)
from transformers import AutoTokenizer

# Load teacher model
teacher_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cuda')

# load student model
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2')
word_embedding_model.tokenizer = AutoTokenizer.from_pretrained("fast_tokenizer")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device='cuda')

In [2]:
from datasets import load_dataset

ds = load_dataset('billingsmoore/Aggregated-bo-en', split='train')

# create teacher embeddings
def create_teacher_embeddings(batch):
    batch['english'] = batch['en']
    batch['tibetan'] = batch['bo']
    batch['label'] = teacher_model.encode(batch['en'])
    return batch

ds = ds.map(create_teacher_embeddings, batched=True)

ds[0]

Map:   0%|          | 0/878004 [00:00<?, ? examples/s]

{'bo': '༄༅༅། །རྒྱ་གར་སྐད་དུ། ཨཱརྱ་སུ་བརྞྞ་བཱ་ལུ་ཀོ་པ་མ་ནཱ་མ་མ་ཧཱ་ཡཱ་ན་སཱུ་ཏྲ། བོད་སྐད་དུ།',
 'en': 'The Noble Mahāyāna Sūtra Like Gold Dust',
 'topic': 'Meditation, Ritual, Enlightenment',
 'source': '84000',
 'english': 'The Noble Mahāyāna Sūtra Like Gold Dust',
 'tibetan': '༄༅༅། །རྒྱ་གར་སྐད་དུ། ཨཱརྱ་སུ་བརྞྞ་བཱ་ལུ་ཀོ་པ་མ་ནཱ་མ་མ་ཧཱ་ཡཱ་ན་སཱུ་ཏྲ། བོད་སྐད་དུ།',
 'label': [-0.06716889888048172,
  -0.01000198908150196,
  -0.030257122591137886,
  0.05288662016391754,
  -0.024933090433478355,
  -0.011344384402036667,
  0.05050237104296684,
  -0.044951360672712326,
  -0.00341879203915596,
  0.033140622079372406,
  0.01606477051973343,
  0.04913238063454628,
  -0.07414509356021881,
  -0.03771483898162842,
  -0.03334968909621239,
  -0.00014761531201656908,
  0.05078834295272827,
  0.017935410141944885,
  -0.06188945844769478,
  0.05025920644402504,
  0.03822670876979828,
  0.07103323936462402,
  0.04260242357850075,
  0.0038686522748321295,
  -0.036233022809028625,
  -0.0034739854745566845,
  -0

In [3]:
ds = ds.remove_columns(['bo', 'en', 'topic', 'source'])
ds = ds.train_test_split(.1)
ds

DatasetDict({
    train: Dataset({
        features: ['english', 'tibetan', 'label'],
        num_rows: 790203
    })
    test: Dataset({
        features: ['english', 'tibetan', 'label'],
        num_rows: 87801
    })
})

### Define Loss Function

In [4]:
from sentence_transformers.losses import MSELoss

# 4. Define a loss function
loss = MSELoss(student_model)

### Specify Training Arguments

In [15]:
# 5. Specify training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="do-it-right",
    num_train_epochs=25,
    auto_find_batch_size=True,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    eval_strategy='epoch',
    save_strategy='epoch',
    save_safetensors=False
)

### Define Evaluator

In [6]:
from sentence_transformers.evaluation import MSEEvaluator
# Create an evaluator & evaluate the base model
dev_evaluator = MSEEvaluator(
    source_sentences=ds['test']['english'],
    target_sentences=ds['test']['tibetan'],
    teacher_model=teacher_model,
    name="stsb-dev",
    show_progress_bar=True
)
dev_evaluator(student_model)

Batches:   0%|          | 0/2744 [00:00<?, ?it/s]

Batches:   0%|          | 0/2744 [00:00<?, ?it/s]

{'stsb-dev_negative_mse': np.float32(-7.179603)}

### Train the Model

In [7]:
train_ds = ds['train'].remove_columns(['english'])

In [17]:
# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=student_model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=ds['test'],
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Stsb-dev Negative Mse
1,0.0017,0.002077,-0.17373772


Batches:   0%|          | 0/2744 [00:00<?, ?it/s]

TypeError: Object of type float32 is not JSON serializable

In [18]:
student_model.save('./fine-tuned-minilm3')

In [19]:
student_model.load('./fine-tuned-minilm3')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [20]:
student_model.push_to_hub('billingsmoore/minilm-bo')

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

'https://huggingface.co/billingsmoore/minilm-bo/commit/0b52743d41a3aea475e4bc338cb117b2b3e21770'