In [None]:
from datasets import load_dataset

# Training split of the aggregated corpus; its 'bo' column is what the new
# tokenizer is trained on further down.
ds = load_dataset(
    'billingsmoore/Aggregated-bo-en',
    split='train',
)

## Train Tokenizer for Tibetan

In [None]:
from transformers import AutoTokenizer

# Tokenizer shipped with the MiniLM checkpoint; its vocabulary size is reused
# later as the size limit for the new tokenizer.
base_checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

Note that the pretrained tokenizer's vocabulary already contains some Tibetan tokens, but not enough of them.

In [None]:
# Round-trip a Tibetan sample through the pretrained tokenizer; the decoded
# string is the value displayed by the cell.
sample_bo = 'ཡུན་རིང་དུས་ནས་ཆོས་ཀྱིས་བསྐྱངས་བའི་བུ། '
enc = tokenizer.encode(sample_bo)
tokenizer.decode(enc)

In [None]:
from tokenizers import BertWordPieceTokenizer

# Fresh (untrained) WordPiece tokenizer with lowercasing and accent stripping
# both switched off.
new_tokenizer = BertWordPieceTokenizer(
    lowercase=False,
    strip_accents=False,
)

In [None]:
# Cap the new vocabulary at the size of the pretrained tokenizer's vocabulary.
target_vocab_size = len(tokenizer.get_vocab())

# Learn the WordPiece vocabulary from the Tibetan column of the corpus.
new_tokenizer.train_from_iterator(
    ds['bo'],
    vocab_size=target_vocab_size,
    min_frequency=3,
    show_progress=True,
    limit_alphabet=500,
)

### Convert New Tokenizer to AutoTokenizer Format

In [None]:
from pathlib import Path

# `save_model` writes the vocabulary file into an existing directory, so
# create the directory first; otherwise this cell fails on a fresh checkout.
tokenizer_dir = Path('new_tokenizer')
tokenizer_dir.mkdir(parents=True, exist_ok=True)
new_tokenizer.save_model(str(tokenizer_dir))

In [None]:
from transformers import BertTokenizerFast

# Wrap the vocabulary written by the WordPiece trainer in a transformers
# fast tokenizer, leaving the case of the text untouched.
fast_tokenizer = BertTokenizerFast(vocab_file="new_tokenizer/vocab.txt", do_lower_case=False)

# Persist it in the Hugging Face layout so it can be reloaded by directory name.
fast_tokenizer.save_pretrained("fast_tokenizer")

In [None]:
from transformers import AutoTokenizer

# Reload the saved tokenizer through the generic Auto* entry point.
saved_tokenizer_dir = "fast_tokenizer"
new_fast_tokenizer = AutoTokenizer.from_pretrained(saved_tokenizer_dir)

In [None]:
# Same round trip as before, this time through the newly trained tokenizer;
# the decoded string is the value displayed by the cell.
sample_bo = 'ཡུན་རིང་དུས་ནས་ཆོས་ཀྱིས་བསྐྱངས་བའི་བུ། '
enc = new_fast_tokenizer.encode(sample_bo)
new_fast_tokenizer.decode(enc)

## Train Embedding Model

### Load the Models

In [None]:
import torch
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    models
)
from transformers import AutoTokenizer

# Use the GPU when one is present; otherwise fall back to the CPU so the cell
# still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Teacher: the pretrained sentence-embedding model, used unchanged to embed
# the English side of the corpus.
teacher_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

# Student: starts from the same pretrained transformer, but tokenizes with the
# newly trained tokenizer saved in "fast_tokenizer".
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2')
word_embedding_model.tokenizer = AutoTokenizer.from_pretrained("fast_tokenizer")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=device)



In [None]:
from datasets import load_dataset

ds = load_dataset('billingsmoore/Aggregated-bo-en', split='train')


def create_teacher_embeddings(batch):
    """Attach the teacher model's embeddings of the English sentences to a batch.

    Args:
        batch: A batch as passed by ``Dataset.map(..., batched=True)``; its
            'en' entry is encoded with ``teacher_model``.

    Returns:
        The same batch with an added 'teacher embeddings' entry.
    """
    batch['teacher embeddings'] = teacher_model.encode(batch['en'])
    return batch


ds = ds.map(create_teacher_embeddings, batched=True)

# `split='train'` yields a single Dataset, but the cells below index
# `ds['train']` and `ds['test']`. Hold out a test split explicitly, with a
# fixed seed so the split is reproducible.
# NOTE(review): the originally used split is not shown in this notebook;
# adjust `test_size` if a different hold-out size is wanted.
ds = ds.train_test_split(test_size=0.1, seed=42)

# Preview one training row (includes its teacher embedding).
ds['train'][0]

### Define Loss Function

In [7]:
from sentence_transformers.losses import MSELoss

# 4. Loss function: MSELoss constructed around the student model
loss = MSELoss(model=student_model)

### Specify Training Arguments

In [None]:
# 5. Training configuration, collected in one mapping and then unpacked
training_kwargs = dict(
    output_dir="do-it-right?",
    num_train_epochs=1,
    auto_find_batch_size=True,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # Mixed precision is off here; only turn it on if the GPU supports FP16.
    fp16=False,
    # Evaluate every 100 training steps.
    eval_strategy="steps",
    eval_steps=100,
)
args = SentenceTransformerTrainingArguments(**training_kwargs)

### Define Evaluator

In [None]:
from sentence_transformers.evaluation import MSEEvaluator

# Held-out sentence pairs: English as the source sentences, Tibetan as the
# target sentences, with the teacher model as reference.
test_split = ds['test']
dev_evaluator = MSEEvaluator(
    source_sentences=test_split['en'],
    target_sentences=test_split['bo'],
    teacher_model=teacher_model,
    name="stsb-dev",
    show_progress_bar=True,
)

# Score the student before any training; the result is the cell's output.
dev_evaluator(student_model)

{'stsb-dev_negative_mse': np.float32(-7.1650085)}

### Train the Model

In [None]:
from torch.utils.data import DataLoader
from sentence_transformers import InputExample

train_samples = [
    InputExample(texts=[sentence], label=label)
    for sentence, label in zip(ds['train']['bo'], ds['train']['teacher embeddings'])
    ]

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

In [None]:
# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=student_model,
    args=args,
    train_dataset=train_dataloader,
    eval_dataset=ds['test'],
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/98776 [00:00<?, ?it/s]

AttributeError: 'NoneType' object has no attribute 'repeat'

In [None]:
# Persist the trained student model; `student_model` is the object the
# trainer above was given.
student_model.save('./fine-tuned-minilm')