In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Read the ';'-separated interaction table (item ids kept as strings) and
# drop the title/description/features columns, which this script never reads
# and which would otherwise be carried into the exported CSV.
data = pd.read_csv(
    'Data/serenlens_with_features_cleaned_filtered.csv',
    sep=";",
    dtype={'item_id': str},
).drop(columns=['title', 'description', 'features'])

def load_embeddings(file_path):
    """Load a saved NumPy array from disk.

    Args:
        file_path: Path handed straight to ``numpy.load``.

    Returns:
        Whatever ``numpy.load`` returns for ``file_path``.
    """
    embeddings = np.load(file_path)
    return embeddings

# One precomputed embedding matrix per text-representation model.
all_mpnet_base_embeddings = load_embeddings('embeddings/all_mpnet_base_embeddings.npy')
bm25_embeddings = load_embeddings('embeddings/bm25_embeddings.npy')
bert_base_embeddings = load_embeddings('embeddings/bert_base_embeddings.npy')
distilroberta_embeddings = load_embeddings('embeddings/distilroberta_embeddings.npy')

# Map each distinct item id to its rank in order of first appearance in `data`.
# The rank is used further down as the row index into the embedding matrices.
# NOTE(review): this assumes the .npy rows were written in that same order --
# confirm against the script that generated the embeddings.
item_id_map = {
    item_id: position
    for position, item_id in enumerate(pd.unique(data['item_id']))
}

def calculate_serendipity(history_embeddings, item_embedding):
    """Score an item as one minus its mean cosine similarity to a history.

    Args:
        history_embeddings: Embeddings to compare against, passed to
            ``cosine_similarity`` as its first argument (one row per
            history entry).
        item_embedding: Embedding of the item being scored; it is reshaped
            into a single row before the comparison.

    Returns:
        ``1 - mean(similarities)``; larger values mean the item is, on
        average, less similar to the history.
    """
    item_row = item_embedding.reshape(1, -1)
    similarity_to_history = cosine_similarity(history_embeddings, item_row)
    return 1 - np.mean(similarity_to_history)

# Compute, for every (user, item) row, the serendipity of the item with respect
# to that user's full interaction history under each embedding model, then
# export the augmented table.
#
# The previous version rebuilt a boolean mask over the whole frame four times
# per row (plus once per user), so run time grew roughly quadratically with
# the number of rows. Rows are now grouped by user once and results are
# written by row position. The scores themselves are still produced by
# calculate_serendipity on the same inputs.

# Output column -> embedding matrix it is derived from. Insertion order
# determines the order in which the new columns are appended to `data`.
embedding_sources = {
    'serendipity_mpnet': all_mpnet_base_embeddings,
    'serendipity_bm25': bm25_embeddings,
    'serendipity_bert': bert_base_embeddings,
    'serendipity_distilroberta': distilroberta_embeddings,
}

# Embedding-matrix row for every row of `data`, resolved once up front.
row_item_indices = np.array(
    [item_id_map[item_id] for item_id in data['item_id']],
    dtype=np.intp,
)

# Scores are accumulated positionally; any row that is never visited
# (e.g. a missing user_id, which groupby skips) stays NaN.
scores = {column: np.full(len(data), np.nan) for column in embedding_sources}

# Group row *positions* (not index labels) by user so the write-back below is
# independent of whatever index `data` carries.
row_positions = pd.Series(np.arange(len(data)))
for _, user_rows in row_positions.groupby(data['user_id'].to_numpy(), sort=False):
    positions = user_rows.to_numpy()
    history_indices = row_item_indices[positions]

    for column, embeddings in embedding_sources.items():
        # The history is every item the user interacted with, including the
        # item currently being scored (so its self-similarity is part of the
        # average).
        history_embeddings = embeddings[history_indices]
        scores[column][positions] = [
            calculate_serendipity(history_embeddings, item_embedding)
            for item_embedding in history_embeddings
        ]

for column, values in scores.items():
    data[column] = values

data.to_csv('serendipity_comparison_with_embeddings.csv', index=False, sep=";")
print("Serendipity calculated and saved.")

Serendipity calculated and saved.
