In [None]:
import json
import logging
import os
from time import sleep

import pandas as pd
import torch
import voyageai
from tqdm.auto import tqdm


In [None]:
# Configurations
# Names used to build the input CSV and output JSON file names.
TASKS = ['TATQA', 'FinQA', 'ConvFinQA', 'MultiHiertt']
# Passed to the embedding call as the model and used as the output directory name.
MODEL_NAME = "voyage-3"
# Number of rows sent per embedding request.
BATCH_SIZE = 32
# Seconds slept after every batch.
DELAY_BETWEEN_BATCHES = 0.1
LOG_FILE = "process.log"

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Read the API key from the environment so that no credential has to be typed
# into (and accidentally committed with) the notebook. When VOYAGE_API_KEY is
# not set this falls back to the empty string the notebook used before.
vo_client = voyageai.Client(api_key=os.environ.get("VOYAGE_API_KEY", ""))


In [None]:
def generate_embeddings(task, attribute, version, client):
    """
    Generate embeddings for a given task and attribute (e.g., corpus or queries).

    Reads ``./data/{task}_{attribute}.csv`` (or the ``_convert`` variant), embeds
    the text column in batches of ``BATCH_SIZE`` and writes a JSON object mapping
    each ``_id`` to its embedding under ``./{MODEL_NAME}/embed/``.

    Args:
        task: Task name used to build the input and output file names.
        attribute: ``"queries"`` is embedded with ``input_type="query"``; any
            other value is embedded with ``input_type="document"``.
        version: ``"convert"`` reads the ``_convert`` CSV and embeds its
            ``convert_text`` column; any other value embeds the ``text`` column.
        client: Object exposing ``embed(texts, model=..., input_type=...)`` whose
            result has an ``embeddings`` attribute.

    Returns:
        None. The embeddings are written to disk.

    Notes:
        Embedding is best-effort: a batch whose request raises is logged and
        skipped, so the output file may contain fewer ids than the input.
    """
    file_suffix = "_convert" if version == "convert" else ""
    input_path = f"./data/{task}_{attribute}{file_suffix}.csv"
    output_file = f"{task}_{attribute}{file_suffix}.json"
    output_dir = f"./{MODEL_NAME}/embed"
    output_path = f"{output_dir}/{output_file}"

    # Create the destination up front; otherwise the final open() fails only
    # after every batch has already been sent to the API.
    os.makedirs(output_dir, exist_ok=True)

    text_column = "convert_text" if version == "convert" else "text"
    # Keep the original filter on "text" and additionally drop rows whose
    # embedded column is missing: a NaN in the batch would make the whole
    # request fail and lose every row of that batch.
    required_columns = ["text"] if text_column == "text" else ["text", text_column]
    data = pd.read_csv(input_path).dropna(subset=required_columns).reset_index(drop=True)

    embeddings = {}
    failed_batches = 0
    embed_type = "query" if attribute == "queries" else "document"

    for start_idx in tqdm(range(0, len(data), BATCH_SIZE), desc=f"Processing {attribute} - {version}", leave=False):
        batch = data.iloc[start_idx:start_idx + BATCH_SIZE]
        batch_ids = batch["_id"].tolist()
        batch_texts = batch[text_column].tolist()

        try:
            result = client.embed(batch_texts, model=MODEL_NAME, input_type=embed_type).embeddings
            embeddings.update(dict(zip(batch_ids, result)))
        except Exception as error:
            # Deliberately best-effort: record the failure and continue with the next batch.
            failed_batches += 1
            logging.error(f"Error in task '{task}' - {attribute}: Batch starting at {start_idx}: {error}")

        sleep(DELAY_BETWEEN_BATCHES)

    if failed_batches:
        # Make an incomplete output visible instead of leaving it buried in per-batch errors.
        logging.warning(
            f"{failed_batches} batch(es) failed for {task} - {attribute} ({version}); "
            f"{len(embeddings)} embeddings collected from {len(data)} rows"
        )

    with open(output_path, "w") as file:
        json.dump(embeddings, file)
    logging.info(f"Embeddings saved for {task} - {attribute} ({version}) at {output_path}")


In [None]:
# Embed every task: the corpus in both its converted and original form,
# the queries in their original form only.
for task in TASKS:
    for attribute in ("corpus", "queries"):
        if attribute == "corpus":
            versions = ["convert", "original"]
        else:
            versions = ["original"]
        for version in versions:
            generate_embeddings(task, attribute, version, client=vo_client)


In [None]:
def cosine_similarity(a, b):
    """Return the matrix of cosine similarities between the rows of ``a`` and the rows of ``b``.

    Both inputs are L2-normalized along dimension 1 and then multiplied, so
    entry ``[i, j]`` of the result compares row ``i`` of ``a`` with row ``j`` of ``b``.
    """
    l2_normalize = torch.nn.functional.normalize
    unit_a = l2_normalize(a, p=2, dim=1)
    unit_b = l2_normalize(b, p=2, dim=1)
    return torch.mm(unit_a, unit_b.t())

def calculate_scores(task):
    """Compute similarity scores and save top matches.

    Loads the query embeddings (``{task}_queries.json``) and the converted
    corpus embeddings (``{task}_corpus_convert.json``) from
    ``./{MODEL_NAME}/embed/``, scores every query against every corpus entry
    with ``cosine_similarity`` and writes, for each query id, a
    ``{corpus_id: score}`` mapping of its highest-scoring corpus entries to
    ``./{MODEL_NAME}/{task}_convert.json``.

    Args:
        task: Task name used to build the input and output file names. It also
            selects how many matches are kept per query (50 for 'FinQABench'
            and 'FinanceBench', 500 otherwise).

    Returns:
        None. The matches are written to disk.
    """
    with open(f"./{MODEL_NAME}/embed/{task}_queries.json", "r") as q_file:
        query_data = json.load(q_file)

    with open(f"./{MODEL_NAME}/embed/{task}_corpus_convert.json", "r") as c_file:
        corpus_data = json.load(c_file)

    query_ids = list(query_data.keys())
    corpus_ids = list(corpus_data.keys())

    matches = {}
    if query_ids and corpus_ids:
        query_embeddings = torch.tensor([query_data[qid] for qid in query_ids])
        corpus_embeddings = torch.tensor([corpus_data[cid] for cid in corpus_ids])

        similarity_matrix = cosine_similarity(query_embeddings, corpus_embeddings)
        top_k = 500 if task not in ['FinQABench', 'FinanceBench'] else 50
        # torch.topk raises when k exceeds the number of candidates, so never
        # ask for more matches than there are corpus entries.
        k = min(top_k, len(corpus_ids))

        for idx, query_id in enumerate(query_ids):
            top_scores, top_indices = torch.topk(similarity_matrix[idx], k)
            # Convert once to plain Python numbers so the result is JSON-serializable.
            matches[query_id] = {
                corpus_ids[i]: score
                for i, score in zip(top_indices.tolist(), top_scores.tolist())
            }
    else:
        # An empty embedding file would otherwise produce a 1-D tensor and fail
        # inside the normalization with an unrelated-looking error.
        logging.warning(
            f"No embeddings to score for {task}: "
            f"{len(query_ids)} queries, {len(corpus_ids)} corpus entries"
        )
        matches = {query_id: {} for query_id in query_ids}

    output_path = f"./{MODEL_NAME}/{task}_convert.json"
    with open(output_path, "w") as output_file:
        json.dump(matches, output_file)
    logging.info(f"Similarity scores saved for {task} at {output_path}")


In [None]:
# Score the queries against the converted corpus for each task in TASKS.
for task in TASKS:
    calculate_scores(task=task)
