# HTE Evaluation

In [1]:
!nvidia-smi

Tue Oct  1 14:21:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              47W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Setup the environment

In [2]:
!pip install -q -U torch transformers bitsandbytes datasets huggingface_hub accelerate tqdm faiss-gpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m436.4/436.4 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from huggingface_hub import notebook_login
import os
import sys
from datasets import load_dataset

In [4]:
os.environ["HF_TOKEN"] = "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"
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!ls /content/drive/MyDrive/university/projects/research/hebrew_sentence_encoder

checkpoints  data  embeddings  eval  logs  notebooks  presentations


In [7]:
project_dir = '/content/drive/MyDrive/university/projects/research/hebrew_sentence_encoder'

os.chdir(project_dir)
print(f"Current working directory set to: {os.getcwd()}")


if project_dir not in sys.path:
    sys.path.insert(0, project_dir)  # Add it to the front of PYTHONPATH
    print(f"PYTHONPATH updated with: {project_dir}")

Current working directory set to: /content/drive/MyDrive/university/projects/research/hebrew_sentence_encoder
PYTHONPATH updated with: /content/drive/MyDrive/university/projects/research/hebrew_sentence_encoder


## Code library

In [37]:
from typing import Optional
import numpy as np
import logging
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics.pairwise import cosine_similarity
import os
from datasets import DatasetDict, Dataset
from tqdm import tqdm
import pickle
import chardet

### Misc

In [9]:
def setup_logger(file_path: str):
    # Create or retrieve the logger
    logger = logging.getLogger('default')

    # Remove all existing handlers
    if logger.hasHandlers():
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)

    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    # Stream Handler (for console output)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    # File Handler (for file output)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    file_handler = logging.FileHandler(file_path, delay=False)  # Log file name (you can specify the path)
    file_handler.setLevel(logging.DEBUG) # Set the log level for file handler
    file_handler.setFormatter(formatter) # Use the same formatter
    logger.addHandler(file_handler)

    return logger

In [42]:
def load_checkpoint(model, optimizer, checkpoint_dir, device, epoch=None):
    logger = logging.getLogger('default')

    if os.path.exists(checkpoint_dir):
        checkpoint_files = [f for f in os.listdir(checkpoint_dir) if f.endswith(".pth")]
        if checkpoint_files:
            checkpoint_id = sorted(checkpoint_files)[-1] if epoch is None else f"checkpoint_epoch_{epoch}.pth" # Get the latest checkpoint

    if checkpoint_id:
        logger.info(f"Loading checkpoint {checkpoint_id}")
        checkpoint_path = os.path.join(checkpoint_dir, checkpoint_id)
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        if optimizer is not None:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        return checkpoint['epoch']  # return the epoch to resume from

    logger.info("No checkpoint found. Starting from scratch.")
    return 0  # Start from the first epoch if no checkpoint found


# Save model and optimizer state
def save_checkpoint(model, optimizer, epoch, checkpoint_dir):
    logger = logging.getLogger('default')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pth")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    logger.info(f"Checkpoint saved at {checkpoint_path}")

### Dataset loading

In [43]:
def transform_dataset_wiki40b(tokenizer, subsets: list = ['train', 'validation', 'test']):
    logger = logging.getLogger('default')
    logger.info("Transforming Wiki40B dataset")

    dataset = load_dataset("wiki40b", "he")
    decoded_dataset = dataset.map(lambda x: {'text': decode_text(x['text'])})

    def transform_entry(entry):
        # Process the 'text' using parse_wiki_article
        article = parse_wiki_article(entry['text'])

        # Extract anchor_text and positive_text based on the parsed output
        anchor_text = article['title']
        if 'sections' in article and len(article['sections']) > 0:
            anchor_text += " " + article['sections'][0]['section']
            positive_text = article['sections'][0]['paragraphs'][0]
        else:
            positive_text = article['abstract'][0]

        # Return the transformed data
        return {
            'anchor_text': anchor_text,
            'positive_text': positive_text
        }

    # Apply the transformation to the train, validation, and test subsets
    transformed_dataset = {}
    for subset in subsets:
        # Transform each subset of the dataset using map (this processes each 'text' entry)
        logger.info(f"Transforming {subset} subset")
        transformed_subset = decoded_dataset[subset].map(transform_entry)
        transformed_dataset[subset] = transformed_subset

    # Return the transformed dataset as a DatasetDict
    logger.info("Done transforming Wiki40B dataset")
    return DatasetDict(transformed_dataset)


def decode_text(text):
    decoded_text = bytes(text, "utf-8").decode("unicode_escape").encode("latin1").decode("utf-8")
    return decoded_text


def parse_wiki_article(text):
    lines = text.strip().split('\n')

    PARAGRAPH_DIVIDER = '_NEWLINE_'

    # Initialize variables
    article_dict = {'title': '', 'abstract': '', 'sections': []}
    current_section = None
    abstract_parsed = False

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        if line == "_START_ARTICLE_":
            # The next line is the title
            article_dict['title'] = lines[i + 1].strip()
            i += 2  # Move to the next relevant line
        elif line == "_START_PARAGRAPH_":
            # If the abstract has not been parsed and the current section is None, this is the abstract
            paragraph = lines[i + 1].strip()
            if not abstract_parsed and not current_section:
                article_dict['abstract'] = paragraph.split(PARAGRAPH_DIVIDER)
                abstract_parsed = True
            elif current_section:
                current_section['paragraphs'] = paragraph.split(PARAGRAPH_DIVIDER)
            i += 2
        elif line == "_START_SECTION_":
            # The next line is the section name
            section_name = lines[i + 1].strip()
            current_section = {'section': section_name, 'paragraphs': ''}
            article_dict['sections'].append(current_section)
            i += 2
        else:
            i += 1  # Move to the next line if none of the cases match

    return article_dict


def transform_dataset_synthesized(tokenizer, file_path, test_size=0.2):
    logger.info("Transforming synthesized dataset")
    with open(file_path, 'rb') as f:
        data = pickle.load(f)

    def transform_entry(entry):
        # Return the transformed data
        return {
            'anchor_text': 'query: ' + entry['user_query'],
            'positive_text': 'document: ' + entry['positive_document'],
            'negative_text': 'document: ' + entry['hard_negative_document'],
        }

    # Apply the transformation to each entry
    transformed_data = list(map(transform_entry, data))

    # Convert the list of dictionaries to a Hugging Face Dataset
    dataset = Dataset.from_list(transformed_data)

    # Split the dataset into train and test sets
    train_test_dataset = dataset.train_test_split(test_size=test_size)
    train_validation_dataset = DatasetDict({
        'train': train_test_dataset['train'],  # Keep the 'train' split
        'validation': train_test_dataset['test']  # Rename 'test' to 'validation'
    })

    # Tokenize the dataset using the provided tokenizer
    def tokenize_function(examples):
        return tokenizer(examples['anchor_text'], examples['positive_text'], examples['negative_text'], truncation=True)

    # tokenized_train_validation_dataset = train_validation_dataset.map(tokenize_function, batched=True)

    logger.info("Done transforming synthesized dataset")
    return train_validation_dataset


def transform_dataset(dataset_name, **kwargs):
    if dataset_name == 'wiki40b':
        return transform_dataset_wiki40b(**kwargs)
    elif dataset_name == 'synthesized_dataset':
        return transform_dataset_synthesized(**kwargs)
    else:
        raise ValueError(f"Unknown dataset name: {dataset_name}")

### Evaluation code

In [44]:
def precision_at_k(relevant_index, retrieved_indices, k):
    """Calculate Precision@k."""
    top_k_retrieved = retrieved_indices[:k]
    relevant_in_top_k = 1 if relevant_index in top_k_retrieved else 0
    return relevant_in_top_k / k


def mean_reciprocal_rank(relevant_index, retrieved_indices):
    """Calculate MRR."""
    for rank, doc_id in enumerate(retrieved_indices, start=1):
        if doc_id == relevant_index:
            return 1.0 / rank
    return 0.0


def dcg_at_k(relevant_index, retrieved_indices, k):
    """Calculate DCG@k."""
    dcg = 0.0
    for i in range(min(k, len(retrieved_indices))):
        if retrieved_indices[i] == relevant_index:
            dcg += 1.0 / np.log2(i + 2)
    return dcg


def ndcg_at_k(relevant_index, retrieved_indices, k):
    """Calculate NDCG@k."""
    ideal_dcg = dcg_at_k(relevant_index, [relevant_index], k)
    if ideal_dcg == 0:
        return 0.0
    return dcg_at_k(relevant_index, retrieved_indices, k) / ideal_dcg

In [45]:
def encode_texts(texts, tokenizer, model, device, batch_size=128):
    # Create a DataLoader to batch the inputs
    dataloader = DataLoader(texts, batch_size=batch_size, shuffle=False)
    all_embeddings = []

    # Process each batch
    for batch in tqdm(dataloader, desc="Encoding batches"):
        # Tokenize the texts
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        # Get the embeddings
        with torch.no_grad():
            outputs = model(input_ids=inputs['input_ids'].to(device), attention_mask=inputs['attention_mask'].to(device))
        batch_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token embeddings

        # Move embeddings to CPU and convert to NumPy
        all_embeddings.append(batch_embeddings.cpu().numpy())

    # Concatenate the embeddings from all batches
    embeddings = np.vstack(all_embeddings).astype('float32')
    return embeddings


def get_embeddings(texts, tokenizer, model, device, embedding_file_path: Optional[str] = None, batch_size: int = 128):
    logger = logging.getLogger('default')

    if embedding_file_path and os.path.exists(embedding_file_path):
        logger.info(f"Loading embeddings from {embedding_file_path}")
        with open(embedding_file_path, 'rb') as f:
            embeddings = pickle.load(f)
    else:
        logger.info(f"Encode {len(texts)} texts to their embeddings")
        embeddings = encode_texts(texts, tokenizer, model, device, batch_size=batch_size)

        # Create the folder path if it does not exist
        if embedding_file_path:
            folder_path = os.path.dirname(embedding_file_path)
            os.makedirs(folder_path, exist_ok=True)

            logger.info(f"Save embeddings to {embedding_file_path}")
            with open(embedding_file_path, 'wb') as f:
                pickle.dump(embeddings, f)

    return embeddings

In [46]:
def evaluate(queries, documents, k: int = 10):
    logger = logging.getLogger('default')

    # Encode the queries and positive documents into embeddings
    logger.info(f"Encoding queries")
    query_embeddings = get_embeddings(
        queries,
        tokenizer,
        model,
        device,
        batch_size=1024,
    )
    logger.info(f"Encoding documents")
    doc_embeddings = get_embeddings(
        documents,
        tokenizer,
        model,
        device,
        batch_size=1024,
    )

    # Create a FAISS index for exact search (using cosine similarity with inner product)
    logger.info(f"Index {doc_embeddings.shape[0]} documents")
    index = faiss.IndexFlatIP(doc_embeddings.shape[1])  # Inner product for cosine similarity
    index.add(doc_embeddings)  # Add the document embeddings to the index
    logger.info(f"Total documents indexed: {index.ntotal}")

    # Retrieve top-k documents for each query
    distances, indices = index.search(query_embeddings, k)

    logger.info(f"Evaluating model with k={k}")

    # For each query, compute the evaluation metrics
    precision_scores = []
    mrr_scores = []
    ndcg_scores = []

    for i, (relevant_index, retrieved_indices) in enumerate(zip(range(len(documents)), indices)):
        precision = precision_at_k(relevant_index, retrieved_indices, k)
        mrr = mean_reciprocal_rank(relevant_index, retrieved_indices)
        ndcg = ndcg_at_k(relevant_index, retrieved_indices, k)

        precision_scores.append(precision)
        mrr_scores.append(mrr)
        ndcg_scores.append(ndcg)

    # Compute average metrics for the dataset
    avg_precision = np.mean(precision_scores)
    avg_mrr = np.mean(mrr_scores)
    avg_ndcg = np.mean(ndcg_scores)

    logger.info(f"Average Precision@{k}: {avg_precision}")
    logger.info(f"Average MRR: {avg_mrr}")
    logger.info(f"Average NDCG@{k}: {avg_ndcg}")

    return dict(
        precision=avg_precision,
        mrr=avg_mrr,
        ndcg=avg_ndcg
    )


## Evaluate the model

## Model evaluation

### Wiki40b

In [47]:
from datetime import datetime
import faiss

In [48]:
MODEL_NAME = 'intfloat/multilingual-e5-base'
DATASET_NAME = 'wiki40b'

In [49]:
model_name_slug = MODEL_NAME.replace('/', '_').replace('-', '_')
log_file = f"./logs/hte_evaluation_{model_name_slug}_01_wiki40b.log"
logger = setup_logger(log_file)

In [50]:
# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Define model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
logger.info(f"Start evaluation on base model: {MODEL_NAME}")

# Loading the dataset
logger.info(f"Switching to new dataset: {DATASET_NAME}")
dataset = transform_dataset(dataset_name=DATASET_NAME, tokenizer=tokenizer, subsets=['test'])

2024-10-01 15:09:38,506 - default - INFO - Using device: cuda
2024-10-01 15:09:40,946 - default - INFO - Start evaluation on base model: intfloat/multilingual-e5-base
2024-10-01 15:09:40,948 - default - INFO - Switching to new dataset: wiki40b
2024-10-01 15:09:40,949 - default - INFO - Transforming Wiki40B dataset
2024-10-01 15:09:48,239 - default - INFO - Transforming test subset
2024-10-01 15:09:48,245 - default - INFO - Done transforming Wiki40B dataset


In [51]:
%%time

# Get the queries and their related documents
queries = dataset['test']['anchor_text']
documents = dataset['test']['positive_text']

for epoch in range(10):
    # Load the latest checkpoint if available and resume training
    logger.info(f"Loading checkpoint at {epoch if epoch is not None else 'last'} epoch")
    checkpoint_dir = "checkpoints/checkpoints_01_wiki40b"
    start_epoch = load_checkpoint(model, optimizer=None, checkpoint_dir=checkpoint_dir, device=device, epoch=epoch)

    # Evaluate the model with k
    for k in [10]:
        logger.info(f"Evaluating model with k={k}")
        evaluate(queries, documents, k=k)

2024-10-01 15:09:48,321 - default - INFO - Loading checkpoint at 0 epoch
2024-10-01 15:09:48,324 - default - INFO - Loading checkpoint checkpoint_epoch_0.pth
  checkpoint = torch.load(checkpoint_path, map_location=device)
2024-10-01 15:10:26,867 - default - INFO - Evaluating model with k=10
2024-10-01 15:10:26,869 - default - INFO - Encoding queries
2024-10-01 15:10:26,871 - default - INFO - Encode 9344 texts to their embeddings
Encoding batches: 100%|██████████| 10/10 [00:03<00:00,  3.22it/s]
2024-10-01 15:10:29,991 - default - INFO - Encoding documents
2024-10-01 15:10:29,992 - default - INFO - Encode 9344 texts to their embeddings
Encoding batches: 100%|██████████| 10/10 [00:52<00:00,  5.28s/it]
2024-10-01 15:11:22,782 - default - INFO - Index 9344 documents
2024-10-01 15:11:22,793 - default - INFO - Total documents indexed: 9344
2024-10-01 15:11:24,785 - default - INFO - Evaluating model with k=10
2024-10-01 15:11:24,912 - default - INFO - Average Precision@10: 0.08351883561643836


CPU times: user 11min 5s, sys: 27.9 s, total: 11min 33s
Wall time: 16min 20s


In [25]:
%%time

k = 10  # Number of top results to retrieve



k = 10  # Evaluate top 10 results
precision_scores = []
mrr_scores = []
ndcg_scores = []

# For each query, compute the evaluation metrics
for i, (relevant_index, retrieved_indices) in enumerate(zip(range(len(documents)), indices)):
    precision = precision_at_k(relevant_index, retrieved_indices, k)
    mrr = mean_reciprocal_rank(relevant_index, retrieved_indices)
    ndcg = ndcg_at_k(relevant_index, retrieved_indices, k)

    precision_scores.append(precision)
    mrr_scores.append(mrr)
    ndcg_scores.append(ndcg)

# Compute average metrics for the dataset
avg_precision = np.mean(precision_scores)
avg_mrr = np.mean(mrr_scores)
avg_ndcg = np.mean(ndcg_scores)

print(f"Average Precision@{k}: {avg_precision}")
print(f"Average MRR: {avg_mrr}")
print(f"Average NDCG@{k}: {avg_ndcg}")


Average Precision@10: 0.08217037671232877
Average MRR: 0.6948312472820178
Average NDCG@10: 0.7253659796873694
CPU times: user 3.04 s, sys: 3.72 ms, total: 3.04 s
Wall time: 2.14 s
