In [1]:
import torch 
import datasets
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from torch import nn 
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [8]:
class InfoNCELoss(nn.Module):
    """In-batch contrastive (InfoNCE) loss between two aligned sets of embeddings.

    Row ``i`` of ``embeddings_a`` is scored against every row of
    ``embeddings_b``; the row with the same index is the positive and all
    other rows are negatives. Only the a -> b direction is scored.

    Args:
        temperature: Divisor applied to the cosine-similarity logits.
    """

    def __init__(self, temperature=0.07):
        super().__init__()
        self.temperature = temperature
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, embeddings_a, embeddings_b):
        """Compute the cross-entropy over temperature-scaled cosine similarities.

        Args:
            embeddings_a: Tensor normalised along dim 1; its dim-0 size sets
                the number of target classes.
            embeddings_b: Tensor normalised along dim 1, used as candidates.

        Returns:
            The value produced by ``self.criterion`` for the similarity
            logits against the diagonal targets.
        """
        # Unit-length rows turn the dot products below into cosine similarities.
        unit_a = F.normalize(embeddings_a, dim=1)
        unit_b = F.normalize(embeddings_b, dim=1)

        logits = torch.matmul(unit_a, unit_b.T) / self.temperature

        # The positive for row i sits in column i, so the target class is i.
        num_rows = unit_a.size(0)
        targets = torch.arange(num_rows, device=unit_a.device)

        return self.criterion(logits, targets)
    

def batch_to_device(batch, device):
    """Return a new plain dict with every value of ``batch`` passed through ``.to(device)``.

    Args:
        batch: Mapping exposing ``.items()`` whose values have a ``.to`` method.
        device: Target forwarded unchanged to each value's ``.to``.
    """
    moved = {}
    for name, value in batch.items():
        moved[name] = value.to(device)
    return moved
def mean_pooling(last_hidden_state, attention_mask):
    """Mask-weighted mean of ``last_hidden_state`` along dim 1.

    Args:
        last_hidden_state: Tensor of per-position vectors; dim 1 is the axis
            that gets averaged.
        attention_mask: Tensor with one weight per position. It is given a
            trailing singleton axis and cast to float so it broadcasts over
            the last dim of ``last_hidden_state``.

    Returns:
        The weighted sum along dim 1 divided by the summed mask. A row whose
        mask sums to zero yields zeros instead of NaN.
    """
    mask = attention_mask.unsqueeze(-1).float()
    summed = (last_hidden_state * mask).sum(1)
    # Clamp the denominator: an all-zero mask row would otherwise divide 0 by 0
    # and propagate NaN into whatever consumes the pooled vector.
    weight_total = mask.sum(1).clamp(min=1e-9)
    return summed / weight_total

In [4]:
# Load model and dataset
model_name = "Alibaba-NLP/gte-multilingual-base"
dataset = datasets.load_dataset("jaeyong2/Ja-emb-PreView")
train_dataloader = DataLoader(dataset['train'], batch_size=8, shuffle=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): trust_remote_code=True runs Python fetched from the Hub.
# Consider passing `revision=<reviewed commit>` so new upstream code is not
# picked up silently on the next run.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
infoloss = InfoNCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Assign the result so the cell does not end on an expression that echoes the
# whole module repr; nn.Module.to moves the module in place and returns it.
model = model.to(device)
# The weights are deliberately NOT cast to bfloat16 here. The training loop
# runs the forward pass under torch autocast, which picks bf16 per-op while the
# parameters the optimizer updates keep their loaded precision. PyTorch's AMP
# guidance is not to call .bfloat16()/.half() on the model when autocasting;
# with AdamW at lr=5e-5, updates applied directly to bf16 weights can be
# rounded away.

configuration.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/611M [00:00<?, ?B/s]

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Tunable constants: tokenizer truncation lengths and number of training epochs.
MAX_LEN_TITLE = 128
MAX_LEN_CONTEXT = 256
EPOCHS = 3

# Runtime switches: allow TF32 in CUDA matmuls and turn off the model's
# `use_cache` config flag.
torch.backends.cuda.matmul.allow_tf32 = True
model.config.use_cache = False

In [None]:
# `tqdm` and `F` are already imported in the notebook's first cell; autocast is
# the only name this cell needs in addition.
from torch.amp import autocast

criterion = InfoNCELoss(temperature=0.07)
model.train()

# Autocast follows the device actually in use. Hard-coding 'cuda' on a CPU-only
# machine only triggers a warning and disables autocast, so it is switched off
# explicitly there instead.
use_amp = device.type == "cuda"


def _tokenize_to_device(texts, max_length):
    """Tokenize a list of strings (padded, truncated to max_length) and move the tensors to `device`."""
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    return batch_to_device(encoded, device)


for epoch in range(EPOCHS):
    total_loss = 0.0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}"):

        optimizer.zero_grad()

        context_enc = _tokenize_to_device(batch["context"], MAX_LEN_CONTEXT)
        title_enc = _tokenize_to_device(batch["Title"], MAX_LEN_TITLE)

        with autocast(device_type=device.type, dtype=torch.bfloat16, enabled=use_amp):
            # Use the first token's hidden state as the sequence embedding.
            context_emb = model(**context_enc).last_hidden_state[:, 0, :]
            title_emb   = model(**title_enc).last_hidden_state[:, 0, :]

        # Compute the contrastive loss in float32, outside autocast: the
        # similarity logits are divided by a small temperature, so bf16 rounding
        # in the matmul would be amplified before the softmax.
        loss = criterion(context_emb.float(), title_emb.float())

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")

Epoch 1/3:   3%|▎         | 535/17467 [06:33<3:27:18,  1.36it/s]

In [None]:
!pip install huggingface_hub

In [None]:
import os

from huggingface_hub import login

# Never keep the access token in the notebook source. Read it from the
# environment instead; when neither variable is set, `token=None` makes
# `login` fall back to its interactive prompt.
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_TOKEN")
login(token=hf_token)

: 

: 

In [None]:
# NOTE: this rebinds `model_name` (earlier the base checkpoint id) to the
# destination repository on the Hub.
model_name = "chanchinn/jp_infoNCE_multilingual_embedding_model"
commit_message = "Fine-tuned with InfoNCE loss"

print(f"Uploading to {model_name}...")

try:
    # Push the model first, then the tokenizer, to the same repository.
    for artifact in (model, tokenizer):
        artifact.push_to_hub(model_name, commit_message=commit_message)
except Exception as upload_error:
    # Best-effort upload: report the failure instead of raising.
    print("✗ Upload failed!")
    print(f"Error: {type(upload_error).__name__}: {upload_error}")
else:
    print("✓ Model uploaded successfully!")
    print(f"View at: https://huggingface.co/{model_name}")
