In [1]:
import torch
import datasets
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
from torch import nn

In [None]:
# Triplet Loss class
class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        distance_positive = (anchor - positive).pow(2).sum(1)
        distance_negative = (anchor - negative).pow(2).sum(1)
        losses = torch.relu(distance_positive - distance_negative + self.margin)
        return losses.mean()

# Move batch to device
def batch_to_device(batch, device):
    return {key: value.to(device) for key, value in batch.items()}

In [3]:
# Load model and dataset
model_name = "Alibaba-NLP/gte-multilingual-base"
dataset = datasets.load_dataset("jaeyong2/Ja-emb-PreView")
train_dataloader = DataLoader(dataset['train'], batch_size=8, shuffle=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
triplet_loss = TripletLoss(margin=1.0)

optimizer = AdamW(model.parameters(), lr=5e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Convert model to bfloat16 if supported
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    model = model.to(torch.bfloat16)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Alibaba-NLP/new-impl You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-multilingual-base.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository Alibaba-NLP/gte-multilingual-base references custom code contained in Alibaba-NLP/new-impl which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Alibaba-NLP/new-impl .
 You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-multilingual-base.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Training loop
for epoch in range(3):
    model.train()
    total_loss = 0
    count = 0

    print(f"\nEpoch {epoch + 1}/3")

    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()
        loss = None

        for index in range(len(batch["context"])):
            # Tokenize inputs
            anchor_encodings = tokenizer(
                [batch["context"][index]],
                truncation=True,
                padding="max_length",
                max_length=1024,
                return_tensors="pt"
            )
            positive_encodings = tokenizer(
                [batch["Title"][index]],
                truncation=True,
                padding="max_length",
                max_length=256,
                return_tensors="pt"
            )
            negative_encodings = tokenizer(
                [batch["Fake Title"][index]],
                truncation=True,
                padding="max_length",
                max_length=256,
                return_tensors="pt"
            )

            # Move to device
            anchor_encodings = batch_to_device(anchor_encodings, device)
            positive_encodings = batch_to_device(positive_encodings, device)
            negative_encodings = batch_to_device(negative_encodings, device)

            # Get embeddings (CLS token)
            anchor_output = model(**anchor_encodings)[0][:, 0, :]
            positive_output = model(**positive_encodings)[0][:, 0, :]
            negative_output = model(**negative_encodings)[0][:, 0, :]

            # Accumulate loss
            if loss == None:
                loss = triplet_loss(anchor_output, positive_output, negative_output)
            else:
                loss += triplet_loss(anchor_output, positive_output, negative_output)

        # Average loss over batch
        loss /= len(batch["context"])

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Track statistics
        total_loss += loss.item()
        count += 1

    # Print epoch statistics
    avg_loss = total_loss / count
    print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")


Epoch 1/3


Training Epoch 1:   0%|          | 63/17467 [04:22<20:17:14,  4.20s/it]

In [None]:
# Save the fine-tuned model
output_dir = "./fine_tuned_gte_multilingual_japanese"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"\nModel saved to {output_dir}")