## Generate Item Embedding for Evaluating Diversity

In [None]:
from src.utils.embedding import setup_model_and_tokenizer, encode_in_batches, save_embeddings


def get_id2name(data_path="dataset/lastfm/id2name.csv"):
    """Load the item-id -> item-name mapping from a comma-separated text file.

    Each non-empty line has the form ``<int id>,<name>``. Only the first comma
    separates the id from the name, so names may themselves contain commas.

    Args:
        data_path: Path of the mapping file. Defaults to the LastFM mapping.

    Returns:
        dict[int, str]: Item id mapped to its name with surrounding whitespace
        removed. A line without any comma maps its id to an empty name.

    Raises:
        ValueError: If the id part of a non-empty line is not an integer.
        OSError: If the file cannot be opened.
    """
    music_id_to_name = {}
    # Explicit UTF-8 so non-ASCII names decode the same on every platform.
    with open(data_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on int('').
                continue
            # Split on the first comma only; later commas belong to the name.
            music_id, _, name = line.partition(",")
            music_id_to_name[int(music_id)] = name.strip()
    return music_id_to_name


# Encode every item name and store the vectors together with their item ids.
id2name = get_id2name()
model, tokenizer, device = setup_model_and_tokenizer()

# A dict iterates keys and values in the same order, so corpus[i] is the name
# of item_ids[i].
item_ids = list(id2name)
corpus = [id2name[item_id] for item_id in item_ids]

embeddings = encode_in_batches(corpus, model, tokenizer, device)
# NOTE(review): names are read from dataset/lastfm/ while the embeddings are
# written under dataset/lastfm-cans20/ -- confirm this is the intended layout.
save_embeddings(embeddings, item_ids, f'dataset/lastfm-cans20/embeddings.npy')

## Evaluate LastFM

In [None]:
import glob
import re
from tqdm import tqdm
import pandas as pd
from embedding_utils import load_embeddings
from src.utils.eval import calculate_relevance, calculate_diversity


# files = glob.glob(f'output/lastfm_stage2/claude-3-sonnet/test/*.json')
# data = []
# for fn in files:
#     with open(fn) as f:
#         data.append(json.load(f))
# dataset = pd.DataFrame(data)


# Score one model's generated recommendation lists: parse the ranked items out
# of each output, keep only examples with exactly k in-candidate items, and
# report relevance and diversity averaged over the whole dataset.
fn = 'output/lastfm_stage2/lastfm-stage2-dpo-iter3/test.json'
dataset = pd.read_json(fn, lines=True)

# NOTE(review): load_embeddings is imported from `embedding_utils` in this cell
# while the first cell imports its helpers from `src.utils.embedding` -- confirm
# the module path.
embeddings = load_embeddings('dataset/lastfm-cans20/embeddings.npy')
id2name = get_id2name()
# Inverse lookup; if two ids share a name, the id seen last wins.
name2id = {v: k for k, v in id2name.items()}

# Captures the text between "<number>. " and the following " -" on each
# ranked line of the model output. Compiled once, outside the loop.
ranked_item_pattern = re.compile(r'\d+\.\s(.+?)\s-')

hit_at_1 = []
precision_scores = []
ndcg_scores = []
diversity_scores = []
k = 5
valid = 0
for _, example in tqdm(dataset.iterrows(), total=len(dataset)):
    pred = ranked_item_pattern.findall(example['output'])
    cans = example['item_list']
    label = example['true_selection']
    # Drop anything that is not one of the offered candidates.
    valid_pred = [p for p in pred if p in cans]
    if len(valid_pred) != k:
        # Malformed output: skipped here, but still counted in the
        # denominators below, so it contributes zero to every average.
        continue
    valid += 1
    hit_at_1.append(1 if label in valid_pred[:1] else 0)
    precision_score, ndcg_score = calculate_relevance(valid_pred, label, k)
    valid_pred_ids = [name2id[name] for name in valid_pred]
    diversity_score = calculate_diversity(valid_pred_ids, embeddings, k)

    precision_scores.append(precision_score)
    ndcg_scores.append(ndcg_score)
    diversity_scores.append(diversity_score)

num_examples = len(dataset)
# Avoid ZeroDivisionError on an empty result file; every sum is 0 in that
# case, so all reported values come out as 0.0000.
denom = max(num_examples, 1)

print('Evaluation Results:')
print(num_examples)
print(f'Valid Fraction: {valid/denom:.4f}')
print(f'Hit@1: {sum(hit_at_1)/denom:.4f}')
print(f'Hit@{k}: {sum(precision_scores)/denom:.4f}')
print(f'NDCG@{k}: {sum(ndcg_scores)/denom:.4f}')
print(f'Diversity@{k}: {sum(diversity_scores)/denom:.4f}')

## Generate Preference Pairs

In [None]:
from collections import Counter
import pandas as pd
import json
import glob
from tqdm import tqdm
import re
import numpy as np
import os
# Accumulators filled by the filtering loop at the bottom of this cell:
#   filtered -- row positions of the examples that are kept
#   prec     -- first score returned by eval() for each kept example
#   ndcg     -- NDCG score returned by eval() for each kept example
filtered, prec, ndcg = [], [], []


def eval(ground_truth, predictions, k=3):
    """Score a ranked prediction list against a single ground-truth item.

    NOTE: the name shadows the builtin ``eval``; it is kept because code
    elsewhere in this file calls it by this name.

    Args:
        ground_truth: The correct item.
        predictions: Ranked sequence of predicted items, best first. May be
            empty, in which case every score is 0.
        k: Cutoff used for the hit and NDCG scores. Defaults to 3.

    Returns:
        tuple: ``(hit_1, hit_k, ndcg_k)`` where ``hit_1`` and ``hit_k`` are
        0/1 ints and ``ndcg_k`` is a float in [0, 1].
    """
    # Hit@1 -- an empty prediction list is a miss rather than an IndexError.
    hit_1 = 1 if len(predictions) > 0 and predictions[0] == ground_truth else 0

    # Hit@k
    hit_k = 1 if ground_truth in predictions[:k] else 0

    def dcg_at_k(r, k):
        # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is
        # the equivalent that works on both old and new releases.
        r = np.asarray(r, dtype=float)[:k]
        if r.size == 0:
            return 0.0
        # Position i (0-based) is discounted by log2(i + 2).
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))

    def ndcg_at_k(r, k):
        # Ideal ordering is the same relevance list sorted best-first.
        dcg_max = dcg_at_k(sorted(r, reverse=True), k)
        if not dcg_max:
            return 0.0
        return dcg_at_k(r, k) / dcg_max

    # Relevance scores: 1 if correct, 0 otherwise
    relevance = [1 if pred == ground_truth else 0 for pred in predictions]
    ndcg_k = ndcg_at_k(relevance, k)

    return hit_1, hit_k, ndcg_k


def get_preference(output):
    """Return which option a judgement text selected.

    The text is searched for ``Chosen Option: X`` or
    ``Chosen Option: Option X``. Option A is checked before option B, so a
    text containing markers for both yields ``'A'``.

    Args:
        output: Judgement text to inspect.

    Returns:
        str: ``'A'``, ``'B'``, or the string ``'None'`` when no marker is found.
    """
    for option in ('A', 'B'):
        markers = (f'Chosen Option: {option}', f'Chosen Option: Option {option}')
        if any(marker in output for marker in markers):
            return option
    return 'None'


def assign_choices(df):
    """Add ``preference``, ``chosen`` and ``rejected`` columns to ``df``.

    ``preference`` is derived from the ``output`` column via
    ``get_preference``. When it is ``'A'``, ``output_1`` is the chosen text and
    ``output_2`` the rejected one; for every other value (``'B'`` and also
    ``'None'``) the roles are swapped.

    Args:
        df: DataFrame with ``output``, ``output_1`` and ``output_2`` columns.
            It is modified in place.

    Returns:
        The same DataFrame, with the three columns added.
    """
    df['preference'] = df['output'].apply(get_preference)

    def _winner(row):
        # output_1 wins only on an explicit 'A'.
        if row['preference'] == 'A':
            return row['output_1']
        return row['output_2']

    def _loser(row):
        # Mirror image of _winner.
        if row['preference'] == 'A':
            return row['output_2']
        return row['output_1']

    df['chosen'] = df.apply(_winner, axis=1)
    df['rejected'] = df.apply(_loser, axis=1)
    return df


# Load every judged pair for one model/split, then keep only the examples
# where the judged preference agrees with the NDCG ordering of the two
# candidate outputs.
split = 'test'
task = 'lastfm-stage2'
model_name = 'lastfm-stage2-dpo-iter3'
# NOTE(review): this resolves to output/lastfm-stage2/... (hyphen) whereas the
# evaluation cell reads from output/lastfm_stage2/... (underscore) -- confirm.
output_dir = os.path.join("output", task, model_name.split("/")[-1], split)
# glob returns files in arbitrary order; sorting makes the row order, and so
# the positions stored in `filtered`, reproducible between runs.
files = sorted(glob.glob(f'{output_dir}/*.json'))
if not files:
    # Fail with a clear message instead of an AttributeError further down.
    raise FileNotFoundError(f'No *.json files found in {output_dir}')
data = []
for fn in files:
    with open(fn, encoding='utf-8') as f:
        data.append(json.load(f))
df = pd.DataFrame(data)
df = assign_choices(df)

print(len(df))
print(Counter(df.preference.tolist()))

# Captures the text between "<number>. " and the following " -" on each
# ranked line. Compiled once, outside the loop.
ranked_item_pattern = re.compile(r'\d+\.\s(.+?)\s-')
# Both outputs must contain exactly this many in-candidate items.
num_items = 3

for i, (_, example) in enumerate(df.iterrows()):
    pred_1 = ranked_item_pattern.findall(example['output_1'].strip())
    pred_2 = ranked_item_pattern.findall(example['output_2'].strip())
    cans = example['itemIDList']
    label = example['trueSelectionID']
    valid_pred_1 = [p for p in pred_1 if p in cans]
    valid_pred_2 = [p for p in pred_2 if p in cans]
    if len(valid_pred_1) != num_items or len(valid_pred_2) != num_items:
        continue
    # The first value returned by eval() is the hit@1 score; it is what gets
    # reported as "Precision" below.
    prec_1, _, ndcg_1 = eval(ground_truth=label, predictions=valid_pred_1)
    prec_2, _, ndcg_2 = eval(ground_truth=label, predictions=valid_pred_2)
    if ndcg_1 == 0 and ndcg_2 == 0:
        # Neither output ranked the true item within the cutoff.
        continue
    # Keep the pair only when the judge preferred the strictly better output;
    # ties and 'None' preferences fall through and are dropped.
    if ndcg_1 > ndcg_2 and example['preference'] == 'A':
        filtered.append(i)
        prec.append(prec_1)
        ndcg.append(ndcg_1)
    elif ndcg_2 > ndcg_1 and example['preference'] == 'B':
        filtered.append(i)
        prec.append(prec_2)
        ndcg.append(ndcg_2)

df_filtered = df.iloc[filtered]
print(f'Filtered {len(df_filtered)} examples')
if filtered:
    print(f'Precision: {sum(prec)/len(prec)}')
    print(f'NDCG: {sum(ndcg)/len(ndcg)}')
else:
    # Nothing survived the filter; averages are undefined.
    print('Precision: n/a')
    print('NDCG: n/a')
print(Counter(df_filtered.preference.tolist()))