# H T E

In [6]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


## Setup the environment

In [2]:
!pip install -q -U torch transformers bitsandbytes datasets huggingface_hub accelerate

In [7]:
from huggingface_hub import notebook_login
import os
import sys
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
os.environ["HF_TOKEN"] = "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"
notebook_login()

ImportError: The `notebook_login` function can only be used in a notebook (Jupyter or Colab) and you need the `ipywidgets` module: `pip install ipywidgets`.

In [9]:
project_dir = os.getcwd() if not os.getcwd().split("/")[-1] == 'notebooks' else '/'.join(os.getcwd().split("/")[0:-1])
src_dir = os.path.join(project_dir, 'src')

os.chdir(project_dir)
print(f"Current working directory set to: {os.getcwd()}")


if src_dir not in sys.path:
    sys.path.insert(0, src_dir)  # Add it to the front of PYTHONPATH
    print(f"PYTHONPATH updated with: {src_dir}")
else:
    print(f"PYTHONPATH already contains: {src_dir}")

Current working directory set to: /Users/asafam/Workspace/biu/hebrew_text_encoder
PYTHONPATH updated with: /Users/asafam/Workspace/biu/hebrew_text_encoder/src


## Load the data and prepare it

In [6]:
dataset = load_dataset("wiki40b", "he")  # Specific version and Hebrew language code
dataset

  return _bootstrap._gcd_import(name[level:], package, level)


DatasetDict({
    train: Dataset({
        features: ['wikidata_id', 'text', 'version_id'],
        num_rows: 165359
    })
    validation: Dataset({
        features: ['wikidata_id', 'text', 'version_id'],
        num_rows: 9231
    })
    test: Dataset({
        features: ['wikidata_id', 'text', 'version_id'],
        num_rows: 9344
    })
})

In [7]:
import chardet

def decode_text(text):
    decoded_text = bytes(text, "utf-8").decode("unicode_escape").encode("latin1").decode("utf-8")
    return decoded_text

# Apply the decoding function to the dataset
decoded_dataset = dataset.map(lambda x: {'text': decode_text(x['text'])})

In [8]:
def parse_wiki_article(text):
    lines = text.strip().split('\n')

    PARAGRAPH_DIVIDER = '_NEWLINE_'

    # Initialize variables
    article_dict = {'title': '', 'abstract': '', 'sections': []}
    current_section = None
    abstract_parsed = False

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        if line == "_START_ARTICLE_":
            # The next line is the title
            article_dict['title'] = lines[i + 1].strip()
            i += 2  # Move to the next relevant line
        elif line == "_START_PARAGRAPH_":
            # If the abstract has not been parsed and the current section is None, this is the abstract
            paragraph = lines[i + 1].strip()
            if not abstract_parsed and not current_section:
                article_dict['abstract'] = paragraph.split(PARAGRAPH_DIVIDER)
                abstract_parsed = True
            elif current_section:
                current_section['paragraphs'] = paragraph.split(PARAGRAPH_DIVIDER)
            i += 2
        elif line == "_START_SECTION_":
            # The next line is the section name
            section_name = lines[i + 1].strip()
            current_section = {'section': section_name, 'paragraphs': ''}
            article_dict['sections'].append(current_section)
            i += 2
        else:
            i += 1  # Move to the next line if none of the cases match

    return article_dict


In [9]:
# Example usage
text = decoded_dataset['train'][12]['text']
parsed_article = parse_wiki_article(text)
parsed_article

{'title': 'הקמרילה של קרול השני, מלך רומניה',
 'abstract': ['הקמרילה של קרול השני, מלך רומניה (ברומנית: Camarila lui Carol al II lea) הוא השם המקובל של החוג הפנימי של קרול השני, מלך רומניה. הקמרילה השפיעה על הכלכלה והפוליטיקה הרומנית ותפסה חלק ניכר מהכיסוי התקשורתי של התקופה.',
  'הקמרילה עמדה במרכז התקפות האופוזיציה על ממשלו של המלך קרול השני ועם הדחתו הגיעה לקץ השפעתה וחלק מחבריה עזבו את רומניה יחד עם המלך.'],
 'sections': [{'section': 'חברי הקמרילה',
   'paragraphs': ['במרכז הקמרילה עמדו בעלי תפקידים בארמון, פילגש המלך, אלנה לופסקו, תעשיינים ובנקאים גדולים ותופסי עמדות מפתח בממשל. בין חברי הקמרילה היו אנטישמיים ויהודים, חלק פעלו בשיתוף פעולה וחלקם זממו האחד נגד חברו. היו חברי קמרילה שהודרו והיו אחרים שצורפו.',
    'פויו (קונסטנטין) דומיטרסקו (Puiu (Constantin) Dumitrescu), בנו של קולונל רומני, היה סטודנט בפריז בתקופה בה שהה שם הזוג קרול קאראימאן (השם של קרול השני בתקופה בה ויתר על ירושת המלוכה וגלה מארצו) ואלנה לופסקו. הוא ביצע עבור קרול שירותים אישיים וליווה אותו לבתי הימורים. עם ש

## Helper functions

In [10]:
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import logging
import os
from datasets import DatasetDict, Dataset

In [11]:
def transform_dataset_wiki40b(tokenizer):
    dataset = load_dataset("wiki40b", "he")
    decoded_dataset = dataset.map(lambda x: {'text': decode_text(x['text'])})

    def transform_entry(entry):
        # Process the 'text' using parse_wiki_article
        article = parse_wiki_article(entry['text'])

        # Extract anchor_text and positive_text based on the parsed output
        anchor_text = article['title']
        if 'sections' in article and len(article['sections']) > 0:
            anchor_text += " " + article['sections'][0]['section']
            positive_text = article['sections'][0]['paragraphs'][0]
            positive_text += tokenizer.eos_token
        else:
            positive_text = article['abstract'][0]
            positive_text += tokenizer.eos_token

        # Return the transformed data
        return {
            'anchor_text': anchor_text,
            'positive_text': positive_text
        }

    # Apply the transformation to the train, validation, and test subsets
    transformed_dataset = {}
    for subset in ['train', 'validation', 'test']:
        # Transform each subset of the dataset using map (this processes each 'text' entry)
        transformed_subset = decoded_dataset[subset].map(transform_entry)
        transformed_dataset[subset] = transformed_subset

    # Return the transformed dataset as a DatasetDict
    return DatasetDict(transformed_dataset)


def transform_dataset(dataset_name, **kwargs):
    if dataset_name == 'wiki40b':
        return transform_dataset_wiki40b(**kwargs)

In [12]:
# Setting up the logger
def setup_logger(log_file):
    log_dir = os.path.dirname(log_file)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Create handlers for both console and file output
    console_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(log_file)

    # Set up the format for logging
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    # Add the handlers to the logger
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger

In [13]:
class InfoNCELoss(torch.nn.Module):
    def __init__(self, temperature=0.07):
        """
        Parameters:
        - temperature: Scaling factor applied to the logits before applying the softmax function.
        """
        super(InfoNCELoss, self).__init__()
        self.temperature = temperature

    def forward(self, anchor, positive, negatives):
        """
        Compute the InfoNCE loss.

        Parameters:
        - anchor: Tensor of shape (batch_size, embedding_dim) - anchor samples
        - positive: Tensor of shape (batch_size, embedding_dim) - positive samples corresponding to each anchor
        - negatives: Tensor of shape (batch_size, num_negatives, embedding_dim) - negative samples

        Returns:
        - loss: Computed InfoNCE loss
        """
        batch_size = anchor.size(0)
        num_negatives = negatives.size(1)

        # Normalize embeddings to unit vectors
        anchor = F.normalize(anchor, dim=-1)
        positive = F.normalize(positive, dim=-1)
        negatives = F.normalize(negatives, dim=-1)

        # Calculate the positive logits (similarity between anchor and positive)
        positive_logits = torch.sum(anchor * positive, dim=-1, keepdim=True)  # Shape: (batch_size, 1)

        # Calculate the negative logits (similarity between anchor and negatives)
        negative_logits = torch.bmm(negatives, anchor.unsqueeze(2)).squeeze(2)  # Shape: (batch_size, num_negatives)

        # Concatenate positive and negative logits
        logits = torch.cat([positive_logits, negative_logits], dim=1)  # Shape: (batch_size, 1 + num_negatives)

        # Apply temperature scaling
        logits = logits / self.temperature

        # Create labels - 0 for the positive samples, as it is the first in the concatenated logits
        labels = torch.zeros(batch_size, dtype=torch.long, device=logits.device)

        # Compute the InfoNCE loss using cross-entropy
        loss = F.cross_entropy(logits, labels)

        return loss

In [14]:
def load_checkpoint(model, optimizer, checkpoint_dir):
    latest_checkpoint = None
    if os.path.exists(checkpoint_dir):
        checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.endswith(".pth")]
        if checkpoint_files:
            latest_checkpoint = sorted(checkpoint_files)[-1]  # Get the latest checkpoint

    if latest_checkpoint:
        logger.info(f"Loading checkpoint {latest_checkpoint}")
        checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        return checkpoint['epoch']  # return the epoch to resume from

    logger.info("No checkpoint found. Starting from scratch.")
    return 0  # Start from the first epoch if no checkpoint found


# Save model and optimizer state
def save_checkpoint(model, optimizer, epoch, checkpoint_dir):
    create_directory(checkpoint_dir)
    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pth")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    logger.info(f"Checkpoint saved at {checkpoint_path}")

In [15]:
def validate(model, val_loader, criterion):
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            anchor_ids, positive_ids, *negative_ids = batch
            
            # Forward pass to get embeddings for validation
            anchor_embeds = model(anchor_ids).last_hidden_state[:, 0, :]  # CLS token embeddings
            positive_embeds = model(positive_ids).last_hidden_state[:, 0, :]

            # Process multiple negatives
            negatives_embeds = torch.stack([
                model(negative_id_batch).last_hidden_state[:, 0, :] for negative_id_batch in negative_ids
            ], dim=1)

            # Compute the validation loss
            val_loss = criterion(anchor_embeds, positive_embeds, negatives_embeds)
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    return avg_val_loss


def train(
    model, 
    optimizer,
    criterion, 
    train_dataloader, 
    val_dataloader, 
    epochs, 
    start_epoch=0, 
    checkpoint_dir='checkpoints', 
):
    model.train()
    best_val_loss = float('inf')  # Initialize best validation loss to infinity
    checkpoint_dir = "checkpoints"

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    for epoch in range(start_epoch, epochs):
        total_train_loss = 0.0
        model.train()

        for batch in train_dataloader:
            anchor_ids, anchor_mask, positive_ids, positive_mask = batch

            # Forward pass to get the embeddings
            anchor_outputs = model(input_ids=anchor_ids, attention_mask=anchor_mask)
            anchor_embeds = anchor_outputs.last_hidden_state[:, 0, :]  # CLS token embeddings

            positive_outputs = model(input_ids=positive_ids, attention_mask=positive_mask)
            positive_embeds = positive_outputs.last_hidden_state[:, 0, :]  # CLS token embeddings

            # Set negatives as the other positives in the batch
            # Create a matrix where the negatives are shifted versions of positives
            batch_size = positive_embeds.size(0)
#             negatives_embeds = torch.stack([positive_embeds[i:] + positive_embeds[:i] for i in range(1, batch_size)], dim=0)
            # Create the negatives for each index `i` by excluding the positive embedding at index `i`
            negatives_embeds_list = []

            for i in range(batch_size):
                # Exclude the current index `i` using slicing
                negatives_embeds = torch.cat([positive_embeds[:i], positive_embeds[i+1:]], dim=0)

                # Append the result to the list
                negatives_embeds_list.append(negatives_embeds)

            # Stack the negatives for each sample in the batch
            # Each entry in the batch now has (batch_size - 1) negative embeddings
            negatives_embeds = torch.stack(negatives_embeds_list)

            # Compute the InfoNCE loss
            loss = criterion(anchor_embeds, positive_embeds, negatives_embeds)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        logger.info(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss}")

        # Compute validation loss after each epoch
        avg_val_loss = validate(model, val_dataloader, criterion)
        logger.info(f"Epoch {epoch + 1}, Validation Loss: {avg_val_loss}")

        # Save checkpoint after each epoch
        save_checkpoint(model, optimizer, epoch, "checkpoints")

## Train the model

In [16]:
from datetime import datetime
from torch.optim import AdamW

In [17]:
MODEL_NAME = 'intfloat/multilingual-e5-base'
BATCH_SIZE = 32
LEARNING_RATE = 5e-5
INFONCE_TEMPERATURE = 0.07

In [18]:
model_name_slug = MODEL_NAME.replace('/', '_').replace('-', '_')
log_file = f"logs/hte_training_{model_name_slug}.log"
logger = setup_logger(log_file)

In [None]:
%%time

# Log file setup
model_name_slug = MODEL_NAME.replace('/', '_').replace('-', '_')
log_file = f"logs/hte_training_{model_name_slug}.log"


# Define model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
logger.info(f"Start train base model: {MODEL_NAME}")

# Initialize the InfoNCE loss and the optimizer
criterion = InfoNCELoss(temperature=INFONCE_TEMPERATURE)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Datasets to train on
dataset_names = ['wiki40b']

# Iterate over datasets and train
for dataset_name in dataset_names:
    start_datetime = datetime.now()
    
    logger.info(f"Switching to new dataset: {dataset_name}")
    dataset = transform_dataset(dataset_name, tokenizer=tokenizer)
    
    # Tokenize the train dataset
    anchor_inputs_train = tokenizer(dataset['train']['anchor_text'], return_tensors='pt', padding=True, truncation=True)
    positive_inputs_train = tokenizer(dataset['train']['positive_text'], return_tensors='pt', padding=True, truncation=True)

    # Create DataLoader for training
    train_dataset = TensorDataset(anchor_inputs_train['input_ids'], anchor_inputs_train['attention_mask'], 
                                  positive_inputs_train['input_ids'], positive_inputs_train['attention_mask'])
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    # Tokenize the validation dataset
    anchor_inputs_val = tokenizer(dataset['validation']['anchor_text'], return_tensors='pt', padding=True, truncation=True)
    positive_inputs_val = tokenizer(dataset['validation']['positive_text'], return_tensors='pt', padding=True, truncation=True)
    
    # Create DataLoader for validation
    val_dataset = TensorDataset(anchor_inputs_val['input_ids'], anchor_inputs_val['attention_mask'], 
                                positive_inputs_val['input_ids'], positive_inputs_val['attention_mask'])
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Load the latest checkpoint if available and resume training
    checkpoint_dir = "checkpoints"
    start_epoch = load_checkpoint(model, optimizer, checkpoint_dir)

    # Train the model for this dataset
    train(
        model=model, 
        optimizer=optimizer,
        criterion=criterion,
        train_dataloader=train_dataloader, 
        val_dataloader=val_dataloader, 
        epochs=3, 
        start_epoch=start_epoch,
    )
    
    end_datetime = datetime.now()
    logger.info(f"Total training on {dataset_name} elapsed time is {(end_datetime - start_datetime).total_seconds()} seconds")
    
logger.info(f"End train base model: {MODEL_NAME}")

2024-09-05 16:13:39,350 - INFO - Start train base model: intfloat/multilingual-e5-base
2024-09-05 16:13:40,397 - INFO - Switching to new dataset: wiki40b
2024-09-05 16:14:50,239 - INFO - No checkpoint found. Starting from scratch.


In [None]:
!nvidia-smi

In [12]:
!cp ../data/synthetic_data_20240906_0018.pkl ./data.pkl

In [13]:
import pickle

with open('./data.pkl', 'rb') as f:
    data = pickle.load(f)

In [14]:
len(data)

1902

In [5]:
data[-1]

{'user_query': 'דוחות בדיקה ספרותיים לתקופה הנצרות',
 'positive_document': "הנצרות המוקדמת התפתחה בתוך הקהילה היהודית המתרחבת של אותה תקופה. בעוד שהיהדות המקורית התבססה על כתבי הקודש העבריים, הנצרות אימצה גם את הברית החדשה, הכוללת את הבשורות על חייו של ישו והאיגרות של שליחיו. עם זאת, דוחות מפורטים על חייו של ישו ועל התפתחות הנצרות המוקדמת בתקופה זו הם מועטים ביותר. מרבית הידע שלנו על התקופה נשען על האמונות והמסורות שהתפתחו מאוחר יותר בכנסייה הנוצרית. דוחות ספרותיים מאותה תקופה מתארים במקרים רבים את היחסים המורכבים בין הנצרים לבין הקהילות היהודיות והרומיות ששלטו בארץ ישראל. כמה דוחות משקפים גם את המאבקים הפנימיים בין קבוצות שונות של נוצרים על פרשנות התנ'ך החדש והאמונות האמיתיות של ישו.",
 'hard_negative_document': "פילוסופיה יהודית היא תחום עיון עתיק יומין, הנוגע בהיבטים המטאפיזיים והאתיים של המחשבה היהודית. תורת הקבלה, למשל, חוקרת את הטבע האלוהי ואת הקשרים המיסטיים בין האל לבריאה. בנוסף, חכמת המוסר העברית דנה ברעיונות מוסריים כמו חמלה, צדקה וחסד. עם זאת, בניגוד לפילוסופיה המערבית, המחש

In [1]:
import torch

# Check if CUDA is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Set the batch size and embedding dimension
batch_size = 1000
embed_dim = 512

# Create dummy positive embeddings (batch_size, embed_dim)
positive_embeds = torch.randn(batch_size, embed_dim, device=device)

# Measure initial memory usage
initial_allocated_memory = torch.cuda.memory_allocated(device)
initial_reserved_memory = torch.cuda.memory_reserved(device)

print(f"Initial allocated memory: {initial_allocated_memory / (1024 ** 2)} MB")
print(f"Initial reserved memory: {initial_reserved_memory / (1024 ** 2)} MB")

# Pre-allocate a tensor for negatives (shape: batch_size, batch_size - 1, embed_dim)
negatives_embeds = torch.zeros(batch_size, batch_size - 1, embed_dim, device=device)

# Create an identity mask to exclude diagonal elements (positives)
negatives_mask = torch.eye(batch_size, dtype=torch.bool).to(device)

# Fill the negatives_embeds in place, row by row
for i in range(batch_size):
    # Select all positive embeddings except the current index `i`
    negatives_i = positive_embeds[~negatives_mask[i]]  # Exclude diagonal
    negatives_embeds[i] = negatives_i  # In-place assignment

# Measure memory usage after in-place assignments
final_allocated_memory = torch.cuda.memory_allocated(device)
final_reserved_memory = torch.cuda.memory_reserved(device)

print(f"Final allocated memory: {final_allocated_memory / (1024 ** 2)} MB")
print(f"Final reserved memory: {final_reserved_memory / (1024 ** 2)} MB")

# Compare memory usage before and after assignments
print(f"Memory increase from initial to final (allocated): {(final_allocated_memory - initial_allocated_memory) / (1024 ** 2)} MB")


Initial allocated memory: 1.953125 MB
Initial reserved memory: 20.0 MB
Final allocated memory: 1956.8583984375 MB
Final reserved memory: 1974.0 MB
Memory increase from initial to final (allocated): 1954.9052734375 MB


In [1]:
import torch

# Check if CUDA is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Set the batch size and embedding dimension
batch_size = 1000
embed_dim = 512

# Create dummy positive embeddings (batch_size, embed_dim)
positive_embeds = torch.randn(batch_size, embed_dim, device=device)

# Measure initial memory usage
initial_allocated_memory = torch.cuda.memory_allocated(device)
initial_reserved_memory = torch.cuda.memory_reserved(device)

print(f"Initial allocated memory: {initial_allocated_memory / (1024 ** 2)} MB")
print(f"Initial reserved memory: {initial_reserved_memory / (1024 ** 2)} MB")

batch_size = positive_embeds.size(0)
negatives_mask = torch.eye(batch_size, dtype=torch.bool).to(device)  # Identity matrix to mask out positives
positive_embeds_reshaped = positive_embeds.unsqueeze(0)  # Shape: (1, batch_size, embed_dim)

# Use the mask to select negatives (all non-diagonal elements are negatives)
# negatives_embeds = positive_embeds_reshaped.masked_select(~negatives_mask.unsqueeze(-1))#.view(batch_size, batch_size - 1, -1)
# Pre-allocate a tensor for negatives (shape: batch_size, batch_size - 1, embed_dim)
negatives_embeds = torch.zeros(batch_size, batch_size - 1, embed_dim, device=device)

# Measure memory usage after in-place assignments
final_allocated_memory = torch.cuda.memory_allocated(device)
final_reserved_memory = torch.cuda.memory_reserved(device)

print(f"Final allocated memory: {final_allocated_memory / (1024 ** 2)} MB")
print(f"Final reserved memory: {final_reserved_memory / (1024 ** 2)} MB")

# Compare memory usage before and after assignments
print(f"Memory increase from initial to final (allocated): {(final_allocated_memory - initial_allocated_memory) / (1024 ** 2)} MB")

Initial allocated memory: 1.953125 MB
Initial reserved memory: 20.0 MB
Final allocated memory: 1954.9072265625 MB
Final reserved memory: 1974.0 MB
Memory increase from initial to final (allocated): 1952.9541015625 MB


In [1]:
import os
import sys

project_dir = os.getcwd() if not os.getcwd().split("/")[-1] == 'notebooks' else '/'.join(os.getcwd().split("/")[0:-1])
src_dir = os.path.join(project_dir, 'src')

os.chdir(project_dir)
print(f"Current working directory set to: {os.getcwd()}")


if src_dir not in sys.path:
    sys.path.insert(0, src_dir)  # Add it to the front of PYTHONPATH
    print(f"PYTHONPATH updated with: {src_dir}")
else:
    print(f"PYTHONPATH already contains: {src_dir}")

Current working directory set to: /home/nlp/achimoa/projects/hebrew_text_encoder
PYTHONPATH updated with: /home/nlp/achimoa/projects/hebrew_text_encoder/src


In [2]:
from train_model import main

main(
    model_name='onlplab/alephbert-base',
    dataset_name='heq',
    batch_size=32,
    cuda_visible_devices="0",
    source_checkpoint_dir='checkpoints/temp',
    checkpoint_dir='checkpoints/temp',
)

2024-10-11 01:27:13,063 - default - INFO - Arguments:
2024-10-11 01:27:13,064 - default - INFO - Dataset: heq
2024-10-11 01:27:13,065 - default - INFO - Model: onlplab/alephbert-base
2024-10-11 01:27:13,065 - default - INFO - Target checkpoint path: checkpoints/temp
2024-10-11 01:27:13,066 - default - INFO - Source checkpoint path: checkpoints/temp
2024-10-11 01:27:13,066 - default - INFO - Source checkpoint epoch: None
2024-10-11 01:27:13,067 - default - INFO - Learning rate: 5e-05
2024-10-11 01:27:13,067 - default - INFO - Weight decay: 0.0001
2024-10-11 01:27:13,068 - default - INFO - Clip value: 1.0
2024-10-11 01:27:13,069 - default - INFO - InfoNCE temperature: 0.07
2024-10-11 01:27:13,069 - default - INFO - Epochs: 10
2024-10-11 01:27:13,069 - default - INFO - CUDA_VISIBLE_DEVICES: 0
2024-10-11 01:27:13,463 - default - INFO - Using device: cuda
Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pool

url = https://raw.githubusercontent.com/NNLP-IL/Hebrew-Question-Answering-Dataset/main/data/train.json


2024-10-11 01:27:18,484 - default - INFO - Loading json file from https://raw.githubusercontent.com/NNLP-IL/Hebrew-Question-Answering-Dataset/main/data/val.json


url = https://raw.githubusercontent.com/NNLP-IL/Hebrew-Question-Answering-Dataset/main/data/val.json


2024-10-11 01:27:19,074 - default - INFO - Tokenizing train dataset
2024-10-11 01:27:20,425 - default - INFO - Creating train dataloader
2024-10-11 01:27:20,427 - default - INFO - Tokenizing validation dataset
2024-10-11 01:27:20,831 - default - INFO - Creating validation dataloader
2024-10-11 01:27:20,832 - default - INFO - Start training...
Epoch 1/10 [Train]:   0%|          | 0/140 [00:00<?, ?it/s]

Epoch 1, Batch 1: Start of Batch - Allocated memory: 0.47 GB
Epoch 1, Batch 1: Start of Batch - Reserved memory: 0.67 GB
Epoch 1, Batch 1: Start of Batch - Max allocated memory: 0.62 GB

Epoch 1, Batch 1: After Query Forward Pass - Allocated memory: 1.15 GB
Epoch 1, Batch 1: After Query Forward Pass - Reserved memory: 1.26 GB
Epoch 1, Batch 1: After Query Forward Pass - Max allocated memory: 1.15 GB

Epoch 1, Batch 1: After Positive Forward Pass - Allocated memory: 9.34 GB
Epoch 1, Batch 1: After Positive Forward Pass - Reserved memory: 9.72 GB
Epoch 1, Batch 1: After Positive Forward Pass - Max allocated memory: 9.40 GB

Epoch 1, Batch 1: After Loss Calculation - Allocated memory: 9.34 GB
Epoch 1, Batch 1: After Loss Calculation - Reserved memory: 9.72 GB
Epoch 1, Batch 1: After Loss Calculation - Max allocated memory: 9.40 GB



Epoch 1/10 [Train]:   0%|          | 0/140 [00:00<?, ?it/s, Batch=1, Train Loss=2.79]2024-10-11 01:27:21,642 - default - INFO - Epoch 1 / 10, Batch 1 / 140, Train Loss: 2.7930428981781006
Epoch 1/10 [Train]:   1%|          | 1/140 [00:00<01:51,  1.25it/s, Batch=1, Train Loss=2.79]

Epoch 1, Batch 1: After Backward Pass - Allocated memory: 1.01 GB
Epoch 1, Batch 1: After Backward Pass - Reserved memory: 10.00 GB
Epoch 1, Batch 1: After Backward Pass - Max allocated memory: 9.62 GB

Epoch 1, Batch 1: After Optimizer Step - Allocated memory: 1.96 GB
Epoch 1, Batch 1: After Optimizer Step - Reserved memory: 2.23 GB
Epoch 1, Batch 1: After Optimizer Step - Max allocated memory: 9.62 GB

Epoch 1, Batch 2: Start of Batch - Allocated memory: 1.96 GB
Epoch 1, Batch 2: Start of Batch - Reserved memory: 2.23 GB
Epoch 1, Batch 2: Start of Batch - Max allocated memory: 9.62 GB

Epoch 1, Batch 2: After Query Forward Pass - Allocated memory: 2.63 GB
Epoch 1, Batch 2: After Query Forward Pass - Reserved memory: 2.79 GB
Epoch 1, Batch 2: After Query Forward Pass - Max allocated memory: 9.62 GB

Epoch 1, Batch 2: After Positive Forward Pass - Allocated memory: 10.77 GB
Epoch 1, Batch 2: After Positive Forward Pass - Reserved memory: 11.25 GB
Epoch 1, Batch 2: After Positive Forwar

Epoch 1/10 [Train]:   1%|▏         | 2/140 [00:01<01:34,  1.46it/s, Batch=2, Train Loss=2.45]

Epoch 1, Batch 2: After Optimizer Step - Allocated memory: 1.96 GB
Epoch 1, Batch 2: After Optimizer Step - Reserved memory: 2.37 GB
Epoch 1, Batch 2: After Optimizer Step - Max allocated memory: 10.87 GB

Epoch 1, Batch 3: Start of Batch - Allocated memory: 1.96 GB
Epoch 1, Batch 3: Start of Batch - Reserved memory: 2.37 GB
Epoch 1, Batch 3: Start of Batch - Max allocated memory: 10.87 GB

Epoch 1, Batch 3: After Query Forward Pass - Allocated memory: 2.63 GB
Epoch 1, Batch 3: After Query Forward Pass - Reserved memory: 2.78 GB
Epoch 1, Batch 3: After Query Forward Pass - Max allocated memory: 10.87 GB

Epoch 1, Batch 3: After Positive Forward Pass - Allocated memory: 10.78 GB
Epoch 1, Batch 3: After Positive Forward Pass - Reserved memory: 11.24 GB
Epoch 1, Batch 3: After Positive Forward Pass - Max allocated memory: 10.88 GB

Epoch 1, Batch 3: After Loss Calculation - Allocated memory: 10.78 GB
Epoch 1, Batch 3: After Loss Calculation - Reserved memory: 11.24 GB
Epoch 1, Batch 3: Af

Epoch 1/10 [Train]:   2%|▏         | 3/140 [00:02<01:28,  1.55it/s, Batch=3, Train Loss=2.23]

Epoch 1, Batch 3: After Optimizer Step - Allocated memory: 1.96 GB
Epoch 1, Batch 3: After Optimizer Step - Reserved memory: 2.35 GB
Epoch 1, Batch 3: After Optimizer Step - Max allocated memory: 10.88 GB

Epoch 1, Batch 4: Start of Batch - Allocated memory: 1.96 GB
Epoch 1, Batch 4: Start of Batch - Reserved memory: 2.35 GB
Epoch 1, Batch 4: Start of Batch - Max allocated memory: 10.88 GB

Epoch 1, Batch 4: After Query Forward Pass - Allocated memory: 2.63 GB
Epoch 1, Batch 4: After Query Forward Pass - Reserved memory: 2.77 GB
Epoch 1, Batch 4: After Query Forward Pass - Max allocated memory: 10.88 GB

Epoch 1, Batch 4: After Positive Forward Pass - Allocated memory: 10.78 GB
Epoch 1, Batch 4: After Positive Forward Pass - Reserved memory: 11.24 GB
Epoch 1, Batch 4: After Positive Forward Pass - Max allocated memory: 10.88 GB

Epoch 1, Batch 4: After Loss Calculation - Allocated memory: 10.78 GB
Epoch 1, Batch 4: After Loss Calculation - Reserved memory: 11.24 GB
Epoch 1, Batch 4: Af

Epoch 1/10 [Train]:   3%|▎         | 4/140 [00:02<01:25,  1.59it/s, Batch=4, Train Loss=2.02]

Epoch 1, Batch 4: After Optimizer Step - Allocated memory: 1.96 GB
Epoch 1, Batch 4: After Optimizer Step - Reserved memory: 2.39 GB
Epoch 1, Batch 4: After Optimizer Step - Max allocated memory: 10.88 GB

Epoch 1, Batch 5: Start of Batch - Allocated memory: 1.96 GB
Epoch 1, Batch 5: Start of Batch - Reserved memory: 2.39 GB
Epoch 1, Batch 5: Start of Batch - Max allocated memory: 10.88 GB

Epoch 1, Batch 5: After Query Forward Pass - Allocated memory: 2.63 GB
Epoch 1, Batch 5: After Query Forward Pass - Reserved memory: 2.78 GB
Epoch 1, Batch 5: After Query Forward Pass - Max allocated memory: 10.88 GB

Epoch 1, Batch 5: After Positive Forward Pass - Allocated memory: 10.78 GB
Epoch 1, Batch 5: After Positive Forward Pass - Reserved memory: 11.24 GB
Epoch 1, Batch 5: After Positive Forward Pass - Max allocated memory: 10.88 GB

Epoch 1, Batch 5: After Loss Calculation - Allocated memory: 10.78 GB
Epoch 1, Batch 5: After Loss Calculation - Reserved memory: 11.24 GB
Epoch 1, Batch 5: Af

Epoch 1/10 [Train]:   4%|▎         | 5/140 [00:03<01:23,  1.62it/s, Batch=5, Train Loss=2.03]

Epoch 1, Batch 5: After Optimizer Step - Allocated memory: 1.96 GB
Epoch 1, Batch 5: After Optimizer Step - Reserved memory: 2.39 GB
Epoch 1, Batch 5: After Optimizer Step - Max allocated memory: 10.88 GB

Epoch 1, Batch 6: Start of Batch - Allocated memory: 1.96 GB
Epoch 1, Batch 6: Start of Batch - Reserved memory: 2.39 GB
Epoch 1, Batch 6: Start of Batch - Max allocated memory: 10.88 GB

Epoch 1, Batch 6: After Query Forward Pass - Allocated memory: 2.63 GB
Epoch 1, Batch 6: After Query Forward Pass - Reserved memory: 2.77 GB
Epoch 1, Batch 6: After Query Forward Pass - Max allocated memory: 10.88 GB



                                                                                             

Epoch 1, Batch 6: After Positive Forward Pass - Allocated memory: 10.78 GB
Epoch 1, Batch 6: After Positive Forward Pass - Reserved memory: 11.24 GB
Epoch 1, Batch 6: After Positive Forward Pass - Max allocated memory: 10.88 GB

Epoch 1, Batch 6: After Loss Calculation - Allocated memory: 10.78 GB
Epoch 1, Batch 6: After Loss Calculation - Reserved memory: 11.24 GB
Epoch 1, Batch 6: After Loss Calculation - Max allocated memory: 10.88 GB





KeyboardInterrupt: 

In [19]:
%reload_ext autoreload
%autoreload 2

from data import build_dataset

dataset = build_dataset(dataset_name='heq')
dataset

2024-10-11 00:11:56,620 - default - INFO - Building HeQ dataset
2024-10-11 00:11:56,622 - default - INFO - Loading json file from https://raw.githubusercontent.com/NNLP-IL/Hebrew-Question-Answering-Dataset/main/data/train.json


url = https://raw.githubusercontent.com/NNLP-IL/Hebrew-Question-Answering-Dataset/main/data/train.json


2024-10-11 00:11:58,115 - default - INFO - Loading json file from https://raw.githubusercontent.com/NNLP-IL/Hebrew-Question-Answering-Dataset/main/data/val.json


url = https://raw.githubusercontent.com/NNLP-IL/Hebrew-Question-Answering-Dataset/main/data/val.json


DatasetDict({
    train: Dataset({
        features: ['anchor_text', 'positive_text'],
        num_rows: 4462
    })
    val: Dataset({
        features: ['anchor_text', 'positive_text'],
        num_rows: 239
    })
})

In [2]:
%reload_ext autoreload
%autoreload 2

from data import build_dataset

dataset = build_dataset(dataset_name='synthesized_query_document')
dataset

Loading data files: 100%|██████████| 13/13 [00:03<00:00,  3.25it/s]


DatasetDict({
    train: Dataset({
        features: ['anchor_text', 'positive_text', 'negative_text'],
        num_rows: 73595
    })
    validation: Dataset({
        features: ['anchor_text', 'positive_text', 'negative_text'],
        num_rows: 9199
    })
})

In [11]:
!ls /Users/asafam/Workspace/biu/hebrew_text_encoder/data/synthetic_data_202409

322.50s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
ls: /Users/asafam/Workspace/biu/hebrew_text_encoder/data/synthetic_data_202409: No such file or directory


In [4]:
%reload_ext autoreload
%autoreload 2

from data import build_dataset

dataset = build_dataset(dataset_name='wiki40b')
dataset

ModuleNotFoundError: No module named 'data'

In [4]:
dataset['train'][0]

{'wikidata_id': "b'Q2336243'",
 'text': 'b\'\n_START_ARTICLE_\nהקרב על גבעת התחמושת\n_START_SECTION_\nתחילת המלחמה בירושלים\n_START_PARAGRAPH_\nבבוקר יום 5 ביוני 1967 פתחו כוחות הלגיון הירדני בהרעשה ארטילרית על ירושלים המערבית, וחיילי הלגיון הירדני השתלטו על מטה משקיפי האו"ם ששכן בארמון הנציב. מלך ירדן, חוסיין, שעד אותו בוקר היסס בשאלה האם להצטרף למלחמה נגד ישראל, הכריע בעד שיתוף הפעולה עם מצרים וסוריה, והחליט להעביר את צבאו לפיקוד מצרי. המלך הסתמך על ידיעות מצריות מוטעות, והשתכנע כי הקרב בסיני הוכרע לטובת הכוחות המצריים, ומכאן שיהיה זה לטובת האינטרסים הירדניים להיכנס למלחמה. יחד עם זאת, הוצבו לצבא הירדני מטרות מוגבלות בלבד, מתוך כוונה לבצע מחטפים מוצלחים בנקודות בעלות חשיבות אסטרטגית לאורך הקו העירוני._NEWLINE_הצד הישראלי הופתע מכניסתה של ירדן למלחמה. ישראל העבירה מסרים לירדן לפני המלחמה, בתקווה לשכנעה שלא להתערב בעימות, ולפי הערכת מטכ"ל צה"ל, פיקוד המרכז לא היה אמור כלל להשתתף בלחימה. עם תחילת הקרבות הסתבר שלפיקוד לא היו כוחות לביצוע מהלך התקפי. לרשות הפיקוד עמדו במרחב ירושלים שתי חט

In [None]:
dataset = build_dataset(dataset_name, splits=['train', 'validation'])