# Training Hebrew Text Encoder

In [1]:
# Shell escape: list the GPUs on this machine with their current utilisation and memory use.
!nvidia-smi

Sun Oct 13 20:23:04 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:0E:00.0 Off |                    0 |
| N/A   51C    P0            340W /  400W |   69299MiB /  81920MiB |    100%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          Off |   00

In [2]:
import os

# Expose only GPU index 4 to this process. This cell runs before the cell that
# imports torch, so the restriction is in place when CUDA is first used.
os.environ.update({"CUDA_VISIBLE_DEVICES": "4"})

## Set up the environment

In [3]:
!pip install -q -U torch transformers bitsandbytes datasets huggingface_hub accelerate tqdm

In [4]:
from huggingface_hub import notebook_login
import os
import sys

In [5]:
# SECURITY: never write an access token into the notebook. It ends up in version
# control and in every shared copy. The token that used to be hardcoded here must
# be treated as leaked and revoked in the Hugging Face account settings.
#
# Authentication now comes from one of two places:
#   1. an HF_TOKEN environment variable exported before Jupyter was started
#      (huggingface_hub reads it on its own), or
#   2. the interactive login widget, shown only when no token is present.
if not os.environ.get("HF_TOKEN"):
    notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# The COLAB_GPU environment variable is used as the marker for a Google Colab runtime.
running_in_colab = 'COLAB_GPU' in os.environ

if not running_in_colab:
    print("Not running in Google Colab!")
else:
    # Import lazily: the google.colab package is only imported on this branch.
    print("Mounting google drive...")
    from google.colab import drive
    drive.mount('/content/drive')

Not running in Google Colab!


In [7]:
# Resolve the project root. When the notebook is launched from the `notebooks/`
# sub-directory the root is its parent; otherwise the current working directory
# is taken to be the root already.
# os.path is used instead of splitting on "/" so the result is correct on every
# platform and when `notebooks` sits directly under the filesystem root (the
# manual split produced an empty string there, which os.chdir rejects).
cwd = os.getcwd()
project_dir = os.path.dirname(cwd) if os.path.basename(cwd) == 'notebooks' else cwd
src_dir = os.path.join(project_dir, 'src')

# Relative paths used later in the notebook (./logs, ./data, checkpoints/...)
# are resolved against the project root.
os.chdir(project_dir)
print(f"Current working directory set to: {os.getcwd()}")


# Make the modules under `src/` importable. The membership check keeps the cell
# idempotent when it is re-run.
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)  # Add it to the front of PYTHONPATH
    print(f"PYTHONPATH updated with: {src_dir}")
else:
    print(f"PYTHONPATH already contains: {src_dir}")

Current working directory set to: /home/nlp/achimoa/projects/hebrew_text_encoder
PYTHONPATH updated with: /home/nlp/achimoa/projects/hebrew_text_encoder/src


In [8]:
%reload_ext autoreload
%autoreload 2
from transformers import AutoModel, AutoTokenizer
from datasets import concatenate_datasets
import torch
from torch.optim import AdamW
from datetime import datetime
from data import *
from loss import *
from trainings import *
from utils import *

In [9]:
from data import *

# Build the synthesized query/document dataset. The bare expression on the last
# line displays the resulting object (its splits, feature names and row counts).
dataset = build_dataset('synthesized_query_document')
dataset

Loading data files: 100%|██████████| 13/13 [00:01<00:00, 12.51it/s]


DatasetDict({
    train: Dataset({
        features: ['anchor_text', 'positive_text', 'negative_text'],
        num_rows: 73595
    })
    validation: Dataset({
        features: ['anchor_text', 'positive_text', 'negative_text'],
        num_rows: 9199
    })
})

In [11]:
# Inspect a single training example (index 42) to eyeball the anchor / positive /
# negative texts and the special-token prefixes they carry.
dataset['train'][42]

{'anchor_text': '[TASK_QUERY_DOC] [QUERY] אני מחפש דעות ודירוגים של משתמשים שמדברים על מצלמות DSLR איכותיות בטווח מחיר נגיש להפקה של סרטי יוטיוב בעלי קונספט של סרטי אימה עצמאיים.',
 'positive_text': "[DOCUMENT] צוות הליבה של בלוג הצילום 'מציאות מרובת מצלמות' פרסם לאחרונה סקירה מקיפה על מצלמות DSLR זולות יחסית עם ביצועים גבוהים שמתאימות במיוחד להפקת תכנים חזותיים לרשתות החברתיות. הם המליצו במיוחד על המצלמה המשובחת Canon EOS Rebel T6i, שלטענתם מציעה איכות וכלים מקצועיים שלא היו זמינים בעבר במחיר כה שפוי. המבקרים התרשמו מהפוקוס האוטומטי המהיר והיציב, מרזולוציית התמונה הגבוהה ומאיכות ההקלטה של הווידאו ברזולוציה גבוהה. הם גם הביעו התפעלות מכך שהמצלמה קלת המשקל ועמידה היטב בשימוש יום-יומי, ומהשגת יתרונות נוספים כמו הגנת אבק ומים בסיסית.",
 'negative_text': "[DOCUMENT] בשנים האחרונות, מגמת ייצור המצלמות הדיגיטליות קופצת כמעט מדי שנה על מנת להציע רזולוציית תמונה גבוהה יותר, מיקוד אוטומטי מהיר יותר ומכלולי חומרה משוכללים נוספים לשיפור הצילום והוידאו. חברות המצלמות הגדולות כמו ניקון, קנון ופוג'י

### Wiki40b

In [9]:
# Hyperparameters for the Wiki40B training stage.
MODEL_NAME = 'intfloat/multilingual-e5-large'  # Hugging Face model id loaded as the base encoder
BATCH_SIZE = 32  # batch size given to the train and validation DataLoaders
LEARNING_RATE = 5e-5  # `lr` of the AdamW optimizer
WEIGHT_DECAY = 1e-4  # `weight_decay` of the AdamW optimizer
CLIP_VALUE = 1.0  # forwarded to train() as `clip_value`
INFONCE_TEMPERATURE = 0.07  # `temperature` of the InfoNCE loss
EPOCHS = 10  # forwarded to train() as `epochs`

In [10]:
# Turn the model id into a filesystem-friendly slug by replacing "/" and "-"
# with underscores, then point the logger at this stage's log file.
model_name_slug = MODEL_NAME
for separator in ('/', '-'):
    model_name_slug = model_name_slug.replace(separator, '_')

log_file = f"./logs/hte_training_{model_name_slug}_01_wiki40b.log"
logger = setup_logger(log_file)

In [11]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
logger.info(f"Using device: {device}")

# Load the pretrained tokenizer and encoder, and move the encoder to the device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
logger.info(f"Start train base model: {MODEL_NAME}")

# Loss and optimizer for this stage, both configured from the constants above.
criterion = InfoNCELoss(temperature=INFONCE_TEMPERATURE)
optimizer = AdamW(model.parameters(), weight_decay=WEIGHT_DECAY, lr=LEARNING_RATE)

2024-10-09 16:44:25,789 - default - INFO - Using device: cuda
2024-10-09 16:44:33,273 - default - INFO - Start train base model: intfloat/multilingual-e5-large


In [12]:
%%time

start_datetime = datetime.now()
dataset_name = 'wiki40b'
logger.info(f"Switching to new dataset: {dataset_name}")
dataset = transform_dataset(dataset_name, splits=['train', 'validation'])


def _build_pair_dataloader(split_name, shuffle):
    """Tokenize the anchor/positive texts of one split and wrap them in a DataLoader.

    Args:
        split_name: key of the split inside `dataset` ('train' or 'validation').
        shuffle: whether the DataLoader shuffles the examples.

    Returns:
        A DataLoader whose batches hold, in order: anchor input ids, anchor
        attention mask, positive input ids, positive attention mask.
    """
    split = dataset[split_name]

    logger.info(f"Tokenizing {split_name} dataset")
    anchor_inputs = tokenizer(split['anchor_text'], return_tensors='pt', padding=True, truncation=True)
    positive_inputs = tokenizer(split['positive_text'], return_tensors='pt', padding=True, truncation=True)

    logger.info(f"Creating {split_name} dataloader")
    pair_dataset = TensorDataset(anchor_inputs['input_ids'], anchor_inputs['attention_mask'],
                                 positive_inputs['input_ids'], positive_inputs['attention_mask'])
    return DataLoader(pair_dataset, batch_size=BATCH_SIZE, shuffle=shuffle)


# Training examples are shuffled; validation order is kept fixed.
train_dataloader = _build_pair_dataloader('train', shuffle=True)
val_dataloader = _build_pair_dataloader('validation', shuffle=False)

# Load the latest checkpoint if available and resume training
logger.info(f"Loading checkpoint")
checkpoint_dir = f"checkpoints/{model_name_slug}/checkpoints_01_wiki40b"
start_epoch = load_checkpoint(model, optimizer, checkpoint_dir=checkpoint_dir, device=device)

# Train the model for this dataset
train_kwargs = dict(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    epochs=EPOCHS,
    start_epoch=start_epoch,
    checkpoint_dir=checkpoint_dir,
    clip_value=CLIP_VALUE,
)
train(**train_kwargs)

end_datetime = datetime.now()
elapsed_seconds = (end_datetime - start_datetime).total_seconds()
logger.info(f"Total training on {dataset_name} elapsed time is {elapsed_seconds} seconds")

logger.info(f"End train base model: {MODEL_NAME}")

2024-10-09 16:44:35,271 - default - INFO - Switching to new dataset: wiki40b
2024-10-09 16:44:35,272 - default - INFO - Transforming Wiki40B dataset
2024-10-09 16:44:35,274 - default - INFO - Loading Wiki40B dataset
2024-10-09 16:44:45,371 - default - INFO - Transforming train split
2024-10-09 16:44:45,383 - default - INFO - Transforming validation split
2024-10-09 16:44:45,402 - default - INFO - Done transforming Wiki40B dataset
2024-10-09 16:44:45,422 - default - INFO - Tokenizing train dataset
2024-10-09 16:46:07,094 - default - INFO - Creating train dataloader
2024-10-09 16:46:07,103 - default - INFO - Tokenizing validation dataset
2024-10-09 16:46:11,160 - default - INFO - Creating validation dataloader
2024-10-09 16:46:11,161 - default - INFO - Loading checkpoint
2024-10-09 16:46:11,170 - default - INFO - No checkpoint found. Starting from scratch.
2024-10-09 16:46:11,171 - default - INFO - Start training
                                                                           

### Synthesized data

In [20]:
# Hyperparameters for the synthesized-data training stage.
MODEL_NAME = 'intfloat/multilingual-e5-large'  # Hugging Face model id loaded as the base encoder
BATCH_SIZE = 32  # batch size given to the dataloader helper
LEARNING_RATE = 5e-5  # `lr` of the AdamW optimizer
WEIGHT_DECAY = 1e-4  # `weight_decay` of the AdamW optimizer
CLIP_VALUE = 1.0  # forwarded to train() as `clip_value`
INFONCE_TEMPERATURE = 0.07  # `temperature` of the InfoNCE loss
EPOCHS = 20  # forwarded to train() as `epochs`
TRAIN_ID = '02_synthesized'  # stage identifier used in the log file and checkpoint directory names

# Filesystem-friendly form of the model id ("/" and "-" replaced by "_").
model_name_slug = MODEL_NAME.replace('/', '_').replace('-', '_')

# Checkpoint directory named after the Wiki40B stage (used as the load source in a later cell).
SOURCE_CHECKPOINT_DIR = f'checkpoints/{model_name_slug}/checkpoints_01_wiki40b'
# Checkpoint directory of this stage: passed to load_checkpoint() and to train().
CHECKPOINT_DIR = f'checkpoints/{model_name_slug}/checkpoints_{TRAIN_ID}'

In [21]:
# Logger: this stage writes to its own file under ./logs/<model slug>/.
log_file = f"./logs/{model_name_slug}/{TRAIN_ID}.log"
logger = setup_logger(log_file)

# Get device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Define model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
logger.info(f"Loaded base model: {MODEL_NAME}")

# Add special tokens to the tokenizer.
# The vocabulary is fetched once up front: calling tokenizer.get_vocab() inside
# the comprehension rebuilt the whole vocabulary for every candidate token.
new_tokens = [QUERY_TOKEN, DOCUMENT_TOKEN, *TASK_TOKENS.values()]
existing_vocab = tokenizer.get_vocab()
additional_special_tokens = [token for token in new_tokens if token not in existing_vocab]
special_tokens = {
    "additional_special_tokens": additional_special_tokens
}
tokenizer.add_special_tokens(special_tokens)
# The embedding matrix must match the (possibly enlarged) tokenizer size.
model.resize_token_embeddings(len(tokenizer))

# Initialize the InfoNCE loss and the optimizer
criterion = InfoNCELoss(temperature=INFONCE_TEMPERATURE)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Load model at checkpoint. This happens after the embedding resize above, so the
# model already has the enlarged vocabulary when the checkpoint is loaded.
start_epoch = load_checkpoint(model=model, optimizer=optimizer, checkpoint_dir=CHECKPOINT_DIR, device=device)
logger.info(f"Loaded model from epoch {start_epoch}")

2024-10-05 08:18:57,842 - default - INFO - Using device: cuda
2024-10-05 08:19:00,910 - default - INFO - Start train base model: intfloat/multilingual-e5-large
2024-10-05 08:19:00,916 - default - INFO - Loading checkpoint checkpoint_epoch_9.pth
  checkpoint = torch.load(checkpoint_path, map_location=device)
2024-10-05 08:19:11,517 - default - INFO - Loaded model from epoch 9


In [22]:
dataset_name = 'synthesized_query_document'
logger.info(f"Switching to new dataset: {dataset_name}")
dataset = transform_dataset(dataset_name, data_folder_path='./data/synthetic_data_202409')

# Build one dataloader per split; only the training split is shuffled.
dataloaders = {}
for split_name, shuffle_split in (('train', True), ('validation', False)):
    logger.info(f"Tokenize the {split_name} dataset and creating the dataloader")
    dataloaders[split_name] = tokenize_inputs_and_create_dataloader(
        tokenizer, dataset[split_name], batch_size=BATCH_SIZE, shuffle=shuffle_split
    )

train_dataloader = dataloaders['train']
val_dataloader = dataloaders['validation']

2024-10-05 08:19:11,646 - default - INFO - Switching to new dataset: synthesized_query_document
2024-10-05 08:19:11,647 - default - INFO - Transforming synthesized dataset
2024-10-05 08:19:11,648 - default - INFO - Loading synthesize query document dataset from {data_folder_path}
Loading data files:   0%|                                                                                                   | 0/13 [00:00<?, ?it/s]2024-10-05 08:19:11,651 - default - DEBUG - Loading data from synthetic_data_20240906_0018.pkl
2024-10-05 08:19:11,720 - default - DEBUG - Loading data from synthetic_data_20240920_0557.pkl
2024-10-05 08:19:11,722 - default - DEBUG - Loading data from synthetic_data_20240924_1958.pkl
Loading data files:  23%|█████████████████████                                                                      | 3/13 [00:00<00:00, 26.08it/s]2024-10-05 08:19:11,767 - default - DEBUG - Loading data from synthetic_data_20240924_1959.pkl
2024-10-05 08:19:11,849 - default - DEBUG - L

In [16]:
import torch

# Toy illustration of in-batch negatives: 4 samples, 3-dimensional embeddings.
batch_size = 4

# Random stand-ins for the encoder outputs, each of shape (batch_size, embed_dim).
anchor_embeds = torch.randn(batch_size, 3)
positive_embeds = torch.randn(batch_size, 3)

# Read the embedding width from the data instead of repeating the literal.
embed_dim = anchor_embeds.size(1)

# The diagonal marks each sample's own positive; everything off the diagonal is
# a negative for that sample.
negatives_mask = torch.eye(batch_size, dtype=torch.bool)
off_diagonal = ~negatives_mask

# Tile the positives so that entry [i, j] is positive j as seen from sample i,
# then keep only the off-diagonal entries: (batch_size, batch_size - 1, embed_dim).
positive_embeds_reshaped = positive_embeds.unsqueeze(0)  # (1, batch_size, embed_dim)
negatives_embeds = (
    positive_embeds_reshaped
    .expand(batch_size, batch_size, embed_dim)[off_diagonal]
    .reshape(batch_size, batch_size - 1, embed_dim)
)

# When enabled, each sample's own anchor embedding is appended as one extra
# negative in the last slot, giving (batch_size, batch_size, embed_dim).
should_insert_anchor = True

negatives_embeds_with_anchors = (
    torch.cat([negatives_embeds, anchor_embeds.unsqueeze(1)], dim=1)
    if should_insert_anchor
    else negatives_embeds
)

for heading, tensor in (
    ("Anchor Embeddings:", anchor_embeds),
    ("Positive Embeddings:", positive_embeds),
    ("\nNegative Embeddings with Anchors appended (if should_insert_anchor=True):", negatives_embeds_with_anchors),
):
    print(heading)
    print(tensor)


Anchor Embeddings:
tensor([[-1.7846,  1.1002, -0.0540],
        [ 0.1024,  1.0096,  0.6717],
        [-2.2878,  0.5169,  2.0464],
        [-1.3198, -1.3308,  1.1557]])
Positive Embeddings:
tensor([[-0.2019,  0.1621,  0.0498],
        [-1.3455, -0.7849,  2.6280],
        [-1.2824, -0.9683,  1.1840],
        [ 2.3373, -0.8575,  1.1034]])

Negative Embeddings with Anchors appended (if should_insert_anchor=True):
tensor([[[-1.3455, -0.7849,  2.6280],
         [-1.2824, -0.9683,  1.1840],
         [ 2.3373, -0.8575,  1.1034],
         [-1.7846,  1.1002, -0.0540]],

        [[-0.2019,  0.1621,  0.0498],
         [-1.2824, -0.9683,  1.1840],
         [ 2.3373, -0.8575,  1.1034],
         [ 0.1024,  1.0096,  0.6717]],

        [[-0.2019,  0.1621,  0.0498],
         [-1.3455, -0.7849,  2.6280],
         [ 2.3373, -0.8575,  1.1034],
         [-2.2878,  0.5169,  2.0464]],

        [[-0.2019,  0.1621,  0.0498],
         [-1.3455, -0.7849,  2.6280],
         [-1.2824, -0.9683,  1.1840],
         [-

In [32]:
%%time

start_datetime = datetime.now()

# Train on the synthesized dataset, continuing from `start_epoch` and writing
# checkpoints to this stage's directory.
train_kwargs = dict(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    epochs=EPOCHS,
    start_epoch=start_epoch,
    checkpoint_dir=CHECKPOINT_DIR,
    clip_value=CLIP_VALUE,
)
train(**train_kwargs)

end_datetime = datetime.now()
elapsed_seconds = (end_datetime - start_datetime).total_seconds()
logger.info(f"Total training on {dataset_name} elapsed time is {elapsed_seconds} seconds")

logger.info(f"End train base model: {MODEL_NAME}")

2024-10-05 00:15:02,701 - default - INFO - Start training
2024-10-05 03:10:55,027 - default - INFO - Epoch 9, Train Loss: 0.5584008833895558                                                                 
2024-10-05 03:17:28,403 - default - INFO - Epoch 9, Validation Loss: 0.45115236167071593                                                                                                                                  
2024-10-05 03:17:40,814 - default - INFO - Checkpoint saved at checkpoints/intfloat_multilingual_e5_large/checkpoints_02_synthesized/checkpoint_epoch_8.pth
2024-10-05 06:10:43,442 - default - INFO - Epoch 10, Train Loss: 0.4135947355466044                                                                                                                                       
2024-10-05 06:17:16,750 - default - INFO - Epoch 10, Validation Loss: 0.438507239955167                                                                                                                 

CPU times: user 3h 3min 6s, sys: 2h 58min 46s, total: 6h 1min 53s
Wall time: 6h 2min 30s


In [24]:
%%time

# Same training call as the preceding cell; it uses whatever model, optimizer,
# loaders and `start_epoch` are currently in the kernel.
start_datetime = datetime.now()

train(**{
    "model": model,
    "optimizer": optimizer,
    "criterion": criterion,
    "train_dataloader": train_dataloader,
    "val_dataloader": val_dataloader,
    "device": device,
    "epochs": EPOCHS,
    "start_epoch": start_epoch,
    "checkpoint_dir": CHECKPOINT_DIR,
    "clip_value": CLIP_VALUE,
})

end_datetime = datetime.now()
elapsed = end_datetime - start_datetime
logger.info(f"Total training on {dataset_name} elapsed time is {elapsed.total_seconds()} seconds")

logger.info(f"End train base model: {MODEL_NAME}")

2024-10-05 08:21:59,409 - default - INFO - Start training
2024-10-05 11:17:21,156 - default - INFO - Epoch 10, Train Loss: 0.398862443935612                                                                 
2024-10-05 11:24:02,501 - default - INFO - Epoch 10, Validation Loss: 0.36916552473687464                                                          
2024-10-05 11:24:20,858 - default - INFO - Checkpoint saved at checkpoints/intfloat_multilingual_e5_large/checkpoints_02_synthesized/checkpoint_epoch_9.pth
2024-10-05 14:17:00,447 - default - INFO - Epoch 11, Train Loss: 0.315696266932494                                                                 
2024-10-05 14:23:41,496 - default - INFO - Epoch 11, Validation Loss: 0.36382425034470445                                                          
2024-10-05 14:23:53,827 - default - INFO - Checkpoint saved at checkpoints/intfloat_multilingual_e5_large/checkpoints_02_synthesized/checkpoint_epoch_10.pth
2024-10-05 17:16:32,272 - default - I

### Wiki40b + Synthesized data

In [114]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
logger.info(f"Using device: {device}")

# Fresh tokenizer and encoder from the pretrained base model.
# NOTE(review): unlike the synthesized-data setup cell, no special tokens are added
# to the tokenizer here - confirm that is intended for this stage.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
logger.info(f"Start train base model: {MODEL_NAME}")

# Loss and optimizer.
# NOTE(review): the other setup cells pass weight_decay=WEIGHT_DECAY to AdamW; it is
# omitted here, so the optimizer's default applies - confirm this is deliberate.
criterion = InfoNCELoss(temperature=INFONCE_TEMPERATURE)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Resume from the directory named after the Wiki40B stage. The log line below is
# emitted regardless of what load_checkpoint found; the recorded run shows it
# printing "epoch 0" right after "No checkpoint found".
start_epoch = load_checkpoint(model=model, optimizer=optimizer, checkpoint_dir=SOURCE_CHECKPOINT_DIR, device=device)
logger.info(f"Loaded model from epoch {start_epoch}")

2024-10-04 22:35:48,235 - default - INFO - Using device: cuda
2024-10-04 22:35:50,039 - default - INFO - Start train base model: intfloat/multilingual-e5-large
2024-10-04 22:35:50,051 - default - INFO - No checkpoint found. Starting from scratch.
2024-10-04 22:35:50,051 - default - INFO - Loaded model from epoch 0


In [115]:
%%time

# Build one dataloader per split of the `dataset` currently held by the kernel;
# only the training split is shuffled.
shuffle_by_split = {'train': True, 'validation': False}
loaders = {}
for split_name, shuffle_split in shuffle_by_split.items():
    logger.info(f"Tokenize the {split_name} dataset and creating the dataloader")
    loaders[split_name] = tokenize_inputs_and_create_dataloader(
        tokenizer, dataset[split_name], batch_size=BATCH_SIZE, shuffle=shuffle_split
    )
train_dataloader, val_dataloader = loaders['train'], loaders['validation']

start_datetime = datetime.now()

# Train the model for this dataset. Gradient clipping is switched off for this
# run: clip_value is None instead of CLIP_VALUE.
train_kwargs = dict(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    epochs=EPOCHS,
    start_epoch=start_epoch,
    checkpoint_dir=CHECKPOINT_DIR,
    clip_value=None,  # CLIP_VALUE
)
train(**train_kwargs)

end_datetime = datetime.now()
elapsed_seconds = (end_datetime - start_datetime).total_seconds()
logger.info(f"Total training for {MODEL_NAME} model elapsed time is {elapsed_seconds} seconds")

logger.info(f"End train base model: {MODEL_NAME}")

2024-10-04 22:35:54,252 - default - INFO - Tokenize the train dataset and creating the dataloader
2024-10-04 22:35:54,253 - default - INFO - Tokenizing dataset
2024-10-04 22:36:07,307 - default - INFO - Creating dataloader
2024-10-04 22:36:07,561 - default - INFO - Tokenize the validation dataset and creating the dataloader
2024-10-04 22:36:07,562 - default - INFO - Tokenizing dataset
2024-10-04 22:36:16,727 - default - INFO - Creating dataloader
2024-10-04 22:36:16,907 - default - INFO - Start training
                                                                                                                                                   

KeyboardInterrupt: 

In [120]:
from torch.utils.data import DataLoader, TensorDataset
import torch

# Scratch experiment: a hand-written training loop that mixes explicit
# (per-example) negatives with in-batch negatives before calling the InfoNCE loss.
# NOTE(review): the recorded run of this cell printed a finite loss for the first
# batch and `nan` for every following batch. Likely places to look (unconfirmed):
# rows whose attention mask is entirely zero being passed through the encoder, and
# the fact that gradient clipping is commented out in this loop.

# Define model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
logger.info(f"Start train base model: {MODEL_NAME}")

# Initialize the InfoNCE loss and the optimizer
criterion = InfoNCELoss(temperature=INFONCE_TEMPERATURE)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Relies on `dataset` (and `device`, `logger`) already existing in the kernel from earlier cells.

# Tokenize inputs for anchor, positive, and negative examples
anchor_inputs_train = tokenizer(dataset['train']['anchor_text'], return_tensors='pt', padding=True, truncation=True)
positive_inputs_train = tokenizer(dataset['train']['positive_text'], return_tensors='pt', padding=True, truncation=True)

# Handle missing negatives: the tokenizer cannot take None, so a missing negative is
# replaced by the example's own positive text as a placeholder and flagged afterwards.
# NOTE(review): `or` also replaces an empty-string negative, but the flagging loop
# below only handles None - confirm empty strings cannot occur.
positive_texts = dataset['train']['positive_text']
negative_texts = [negative_text or positive_texts[i] for i, negative_text in enumerate(dataset['train']['negative_text'])]
negative_inputs_train = tokenizer(negative_texts, return_tensors='pt', padding=True, truncation=True)
PAD_MASK = 0
for i, text in enumerate(dataset['train']['negative_text']):
    if text is None:
        # Zero the whole attention-mask row of a placeholder; an all-zero row is the
        # marker the training loop uses for "this example has no explicit negative".
        # negative_inputs_train['input_ids'][i] = PAD_TOKEN_ID
        negative_inputs_train['attention_mask'][i] = PAD_MASK

# Create DataLoader for the dataset (six tensors per example: ids + mask for
# anchor, positive and negative)
train_dataset = TensorDataset(anchor_inputs_train['input_ids'], anchor_inputs_train['attention_mask'],
                              positive_inputs_train['input_ids'], positive_inputs_train['attention_mask'],
                              negative_inputs_train['input_ids'], negative_inputs_train['attention_mask']
                              )
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

model.train()

# Running sum of the per-batch losses, used for the running mean printed below.
total_train_loss = 0

# Now the main loop to handle embeddings
for batch_idx, batch in enumerate(train_dataloader):
    anchor_ids, anchor_mask, positive_ids, positive_mask, negative_ids, negative_mask = [x.to(device) for x in batch]

    # Get embeddings for anchor, positive, and negative inputs
    anchor_outputs = model(input_ids=anchor_ids, attention_mask=anchor_mask)
    anchor_embeds = anchor_outputs.last_hidden_state[:, 0, :]  # CLS token embeddings

    positive_outputs = model(input_ids=positive_ids, attention_mask=positive_mask)
    positive_embeds = positive_outputs.last_hidden_state[:, 0, :]  # CLS token embeddings

    # Placeholder rows (all-zero attention mask) are encoded too; their embeddings
    # are never selected below.
    negative_outputs = model(input_ids=negative_ids, attention_mask=negative_mask)
    negative_embeds = negative_outputs.last_hidden_state[:, 0, :]  # CLS token embeddings

    # Actual size of this batch (the last batch may be smaller than BATCH_SIZE).
    batch_size = anchor_embeds.size(0)
    negatives_embeds_list = []

    PAD_MASK = 0
    for i in range(batch_size):
        if not (torch.all(negative_mask[i] == PAD_MASK)):
            # Sample i has an explicit negative. To keep every row at the same
            # length, one in-batch positive (the cyclically next one) is dropped in
            # exchange for the explicit negative placed first.
            additional_index = (i + 1) % batch_size  # Choose the next index in a cyclic manner
            if i < additional_index:
                negatives_embeds_i = torch.cat([
                    negative_embeds[i:i+1],  # Keep the negative embedding at index i
                    positive_embeds[:i],     # Keep all positives before i
                    positive_embeds[i+1:additional_index],  # Keep all positives between i and additional_index
                    positive_embeds[additional_index+1:]    # Keep all positives after additional_index
                ], dim=0)
            else:
                # Reached when the cyclic index wrapped around (i is the last sample
                # in the batch, or the batch holds a single sample).
                negatives_embeds_i = torch.cat([
                    negative_embeds[i:i+1],  # Keep the negative embedding at index i
                    positive_embeds[:additional_index],  # Keep all positives before additional_index
                    positive_embeds[additional_index+1:i],  # Keep all positives between additional_index and i
                    positive_embeds[i+1:]  # Keep all positives after i
                ], dim=0)
        else:
            # No explicit negative: use every other sample's positive as a negative.
            negatives_embeds_i = torch.cat([positive_embeds[:i], positive_embeds[i+1:]], dim=0)

        # Append the result to the list
        negatives_embeds_list.append(negatives_embeds_i)

    # Stack the negatives for each sample in the batch; torch.stack requires all
    # rows to have the same length.
    negatives_embeds = torch.stack(negatives_embeds_list)

    # Now you have anchor_embeds, positive_embeds, and negatives_embeds ready for loss calculation
    # print("Anchor Embeds:", anchor_embeds.shape)
    # print("Positive Embeds:", positive_embeds.shape)
    # print("Negatives Embeds:", negatives_embeds.shape)

    # Compute your contrastive or InfoNCE loss here
    loss = criterion(anchor_embeds, positive_embeds, negatives_embeds)
    
    # Backward pass and optimization (gradient clipping is disabled in this experiment)
    optimizer.zero_grad()
    loss.backward()
    # if clip_value is not None:
    #     torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
    optimizer.step()

    total_train_loss += loss.item()

    # Running mean of the training loss over the batches seen so far.
    print(total_train_loss / (batch_idx + 1))


2024-10-04 22:43:16,981 - default - INFO - Start train base model: intfloat/multilingual-e5-large


1.099416971206665
nan
nan
nan


KeyboardInterrupt: 

In [None]:
%%time


2024-10-04 20:36:20,757 - default - INFO - Training starting now
2024-10-04 20:36:20,758 - default - INFO - Start training
                                                                                                                                                           

new batch
0)
torch.Size([3, 1024])




AttributeError: 'Tensor' object has no attribute 'unqueeze'

In [None]:
# Release the Colab runtime once the notebook is done; outside Colab only a
# message is printed.
running_in_colab = 'COLAB_GPU' in os.environ

if not running_in_colab:
    print("Not running in Google Colab!")
else:
    # Import lazily: the google.colab package is only imported on this branch.
    print("Shutting down colab...")
    from google.colab import runtime
    runtime.unassign()