## Generate Item Embeddings for Evaluating Diversity

In [None]:
import pandas as pd
from src.utils.embedding import setup_model_and_tokenizer, encode_in_batches, save_embeddings


def get_id2name():
    """Return a dict mapping each ``product_id`` to its ``product_title``.

    The mapping is read from the shopping-queries products parquet file at a
    fixed relative path. When a ``product_id`` appears in several rows, the
    title from the last such row is the one kept.
    """
    products = pd.read_parquet('../../dataset/shopping_queries_dataset/shopping_queries_dataset_products.parquet')
    return {
        product_id: title
        for product_id, title in zip(products['product_id'], products['product_title'])
    }


# Embed every product title and persist the vectors together with their ids.
id2name = get_id2name()
model, tokenizer, device = setup_model_and_tokenizer()

# Titles and ids are both taken from the same dict, so the id list handed to
# save_embeddings is in the same order as the corpus that was encoded.
corpus = [*id2name.values()]
embeddings = encode_in_batches(corpus, model, tokenizer, device)
save_embeddings(embeddings, [*id2name], 'dataset/esci-npos10-nneg5/embeddings.npy')

## Evaluate ESCI

In [None]:
import glob
import re
from tqdm import tqdm
import pandas as pd
from embedding_utils import load_embeddings
from src.utils.eval import calculate_diversity


def calculate_relevance(labels, k):
    """Compute Precision@k and NDCG@k for a ranked list of ESCI labels.

    Args:
        labels: Ranked sequence of labels, each one of ``'E'``, ``'S'``,
            ``'C'`` or ``'I'``. Only the first ``k`` entries are scored; the
            sequence may be shorter than ``k``.
        k: Cut-off rank. Must be a positive integer.

    Returns:
        A ``(precision_score, ndcg_score)`` tuple of floats.
        ``precision_score`` is the number of ``'E'`` labels in the top ``k``
        divided by ``k``. ``ndcg_score`` is the DCG of the top ``k`` labels
        (gain ``2 ** g - 1`` with the per-label gains below, discount
        ``log2(rank + 2)``) divided by the DCG of ``k`` consecutive ``'E'``
        labels.

    Raises:
        ValueError: If ``k`` is not positive.
        KeyError: If a scored label is not one of the four known labels.
    """
    # Function-local import: the surrounding file never imports numpy, so the
    # previous np.log2 call failed with NameError. math.log2 is sufficient.
    import math

    if k <= 0:
        raise ValueError(f'k must be a positive integer, got {k!r}')

    gain_map = {'E': 1, 'S': 0.2, 'C': 0.1, 'I': 0}  # Gain assigned to each label

    top_k = labels[:k]

    # Precision@k: only exact matches ('E') count as hits. The denominator is
    # always k, so lists shorter than k are penalised.
    precision_score = sum(label == 'E' for label in top_k) / k

    # DCG over the (at most k) labels actually present.
    dcg = sum(
        (2 ** gain_map[label] - 1) / math.log2(rank + 2)
        for rank, label in enumerate(top_k)
    )

    # The ideal ranking is taken to be k exact matches, regardless of how many
    # 'E' items are really available for the query.
    ideal_gain = 2 ** gain_map['E'] - 1
    idcg = sum(ideal_gain / math.log2(rank + 2) for rank in range(k))

    ndcg_score = dcg / idcg if idcg > 0 else 0.0
    return precision_score, ndcg_score


# NOTE(review): `category` is not used below (get_id2name takes no arguments);
# it is kept only in case other cells read it.
category = 'All_Beauty'
fn = 'output/esci/esci-dpo-iter3/test.json'
dataset = pd.read_json(fn, lines=True)

embeddings = load_embeddings(f'dataset/esci-npos10-nneg5/embeddings.npy')
# get_id2name() is defined without parameters; passing `category` raised TypeError.
id2name = get_id2name()
# Reverse lookup title -> product id. If several products share a title, the
# last one in id2name wins.
name2id = {v: k for k, v in id2name.items()}

# Extracts the text between "<number>. " and the following " -" on each ranked
# line of the model output. Compiled once instead of on every iteration.
pred_pattern = re.compile(r'\d+\.\s(.+?)\s-')

precision_scores = []
ndcg_scores = []
diversity_scores = []
k = 5
valid = 0
for _, example in tqdm(dataset.iterrows(), total=len(dataset)):
    pred = pred_pattern.findall(example['output'].strip())
    cans = example['itemIDList']
    label_mapping = dict(zip(cans, example['esciLabel']))

    # Predictions are titles while the candidates and label_mapping are keyed
    # by product id, so resolve each title to an id first and keep only the
    # ids that belong to this example's candidate list. (Comparing the raw
    # titles against the id list could never match consistently with the
    # id-based label lookup below.)
    resolved_ids = (name2id.get(p) for p in pred)
    valid_pred_ids = [pid for pid in resolved_ids if pid is not None and pid in label_mapping]
    if len(valid_pred_ids) != k:
        continue
    valid += 1

    label = [label_mapping[pid] for pid in valid_pred_ids]
    # calculate_relevance takes (labels, k); the extra positional argument
    # passed previously raised TypeError.
    precision_score, ndcg_score = calculate_relevance(label, k)
    diversity_score = calculate_diversity(valid_pred_ids, embeddings, k)

    precision_scores.append(precision_score)
    ndcg_scores.append(ndcg_score)
    diversity_scores.append(diversity_score)

# Scores are averaged over ALL examples, so examples without exactly k valid
# predictions contribute zero to every metric. `or 1` avoids a
# ZeroDivisionError when the dataset is empty.
denom = len(dataset) or 1

print(f'Evaluation Results:')
print(len(dataset))
print(f'Valid Fraction: {valid/denom:.4f}')
print(f'Hit@{k}: {sum(precision_scores)/denom:.4f}')
print(f'NDCG@{k}: {sum(ndcg_scores)/denom:.4f}')
print(f'Diversity@{k}: {sum(diversity_scores)/denom:.4f}')