In [None]:
from datasets import load_dataset

ds = load_dataset('billingsmoore/Aggregated-bo-en', split='train')

## Train Tokenizer for Tibetan

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

Note that the tokenizer already contains some Tibetan tokens but not enough.

In [None]:
enc = tokenizer.encode('ཡུན་རིང་དུས་ནས་ཆོས་ཀྱིས་བསྐྱངས་བའི་བུ། ')
tokenizer.decode(enc)

In [None]:
from tokenizers import BertWordPieceTokenizer

new_tokenizer = BertWordPieceTokenizer(lowercase=False, strip_accents=False)

In [None]:
new_tokenizer.train_from_iterator(
    ds['bo'],
    vocab_size=len(tokenizer.get_vocab()),
    min_frequency=3,
    show_progress=True,
    limit_alphabet=500
)

### Convert New Tokenizer to AutoTokenizer Format

In [None]:
new_tokenizer.save_model('new_tokenizer')

In [None]:
from transformers import BertTokenizerFast

# Load the trained tokenizer
fast_tokenizer = BertTokenizerFast(
    vocab_file="new_tokenizer/vocab.txt",
    do_lower_case=False
)

# Save in Hugging Face format
fast_tokenizer.save_pretrained("fast_tokenizer")

In [None]:
from transformers import AutoTokenizer

# Load your new tokenizer
new_fast_tokenizer = AutoTokenizer.from_pretrained("fast_tokenizer")

In [None]:
enc = new_fast_tokenizer.encode('ཡུན་རིང་དུས་ནས་ཆོས་ཀྱིས་བསྐྱངས་བའི་བུ། ')
new_fast_tokenizer.decode(enc)

## Train Embedding Model

In [1]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

ds = load_dataset('billingsmoore/Aggregated-bo-en', split='train')

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cuda')

student_tokenizer = AutoTokenizer.from_pretrained("fast_tokenizer")



In [2]:
teacher_embeddings = model.encode(ds['en'],show_progress_bar=True)

Batches:   0%|          | 0/27438 [00:00<?, ?it/s]

In [3]:
from sentence_transformers import SentenceTransformer, models

# Load the pre-trained MiniLM model
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2')

# Replace the tokenizer with your custom tokenizer
word_embedding_model.tokenizer = student_tokenizer

# add a pooling layer
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Create the SentenceTransformer model
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device='cuda')

In [4]:
from sentence_transformers import losses

# Define the loss function
loss_fn = losses.MSELoss(model=student_model)

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, datasets, evaluation
from torch.utils.data import DataLoader
from torch.optim import AdamW


# Create InputExample objects for training
train_examples = [
    InputExample(texts=[sentence], label=teacher_embedding)
    for sentence, teacher_embedding in zip(ds['bo'], teacher_embeddings)
]

# Create a DataLoader for training
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Fine-tune the student model
student_model.fit(
    train_objectives=[(train_dataloader, loss_fn)],
    epochs=20,  # Adjust the number of epochs as needed
    output_path='./fine-tuned-minilm'  # Save the fine-tuned model
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/164628 [00:00<?, ?it/s]

{'loss': 0.0237, 'grad_norm': 0.03572463244199753, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 0.0034, 'grad_norm': 0.00758838327601552, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}
{'loss': 0.0028, 'grad_norm': 0.006931068375706673, 'learning_rate': 3e-06, 'epoch': 0.03}
{'loss': 0.0026, 'grad_norm': 0.004626153502613306, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.04}
{'loss': 0.0025, 'grad_norm': 0.004049560055136681, 'learning_rate': 5e-06, 'epoch': 0.05}
{'loss': 0.0025, 'grad_norm': 0.003877197625115514, 'learning_rate': 6e-06, 'epoch': 0.05}
{'loss': 0.0024, 'grad_norm': 0.003397856606170535, 'learning_rate': 7e-06, 'epoch': 0.06}
{'loss': 0.0024, 'grad_norm': 0.0036154408007860184, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.07}
{'loss': 0.0024, 'grad_norm': 0.003426718059927225, 'learning_rate': 9e-06, 'epoch': 0.08}
{'loss': 0.0024, 'grad_norm': 0.003442424815148115, 'learning_rate': 1e-05, 'epoch': 0.09}
{'loss': 0.0024, 'grad_no

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [7]:
model.save('./fine-tuned-minilm')

## Evaluate

In [4]:
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

ds = load_dataset('billingsmoore/Aggregated-bo-en', split='train')

teacher_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cuda')
student_model = SentenceTransformer('./fine-tuned-minilm', device='cuda')

In [6]:
import numpy as np

# Generate embeddings using the fine-tuned model
teacher_embeddings = teacher_model.encode(ds['en'][:100])
student_embeddings = student_model.encode(ds['bo'][:100])

# Compare the embeddings (e.g., using cosine similarity)
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(teacher_embeddings[:100], student_embeddings[:100])
print("Cosine Similarity between Teacher and Student Embeddings:")
print(np.mean(similarity))

Cosine Similarity between Teacher and Student Embeddings:
0.10575532


## Train More

In [8]:
teacher_embeddings = teacher_model.encode(ds['en'],show_progress_bar=True)

Batches:   0%|          | 0/27438 [00:00<?, ?it/s]

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, datasets, evaluation
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sentence_transformers import losses

# Define the loss function
loss_fn = losses.MSELoss(model=student_model)


# Create InputExample objects for training
train_examples = [
    InputExample(texts=[sentence], label=teacher_embedding)
    for sentence, teacher_embedding in zip(ds['bo'], teacher_embeddings)
]

# Create a DataLoader for training
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Fine-tune the student model
student_model.fit(
    train_objectives=[(train_dataloader, loss_fn)],
    epochs=20,  # Adjust the number of epochs as needed
    output_path='./fine-tuned-minilm'  # Save the fine-tuned model
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112291999999495, max=1.0…

  0%|          | 0/1097520 [00:00<?, ?it/s]

{'loss': 0.0042, 'grad_norm': 0.01006426103413105, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 0.0036, 'grad_norm': 0.007295185700058937, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}
{'loss': 0.0035, 'grad_norm': 0.006422760896384716, 'learning_rate': 3e-06, 'epoch': 0.03}
{'loss': 0.0034, 'grad_norm': 0.005309778731316328, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.04}
{'loss': 0.0034, 'grad_norm': 0.005954865366220474, 'learning_rate': 5e-06, 'epoch': 0.05}
{'loss': 0.0033, 'grad_norm': 0.005869138985872269, 'learning_rate': 6e-06, 'epoch': 0.05}
{'loss': 0.0033, 'grad_norm': 0.0063346978276968, 'learning_rate': 7e-06, 'epoch': 0.06}
{'loss': 0.0033, 'grad_norm': 0.005792014766484499, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.07}
{'loss': 0.0033, 'grad_norm': 0.006025457289069891, 'learning_rate': 9e-06, 'epoch': 0.08}
{'loss': 0.0032, 'grad_norm': 0.006847941782325506, 'learning_rate': 1e-05, 'epoch': 0.09}
{'loss': 0.0032, 'grad_norm