In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
from data_modules.mind_component import load_news_data, load_history_data
from utils.file import load_aspect_vectors


In [2]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load candidate files that come from a trusted pipeline.
# The context manager guarantees the file handle is closed once loading ends.
with open(
    "/home/users1/hardy/hardy/project/vae/outputs/mind/seq_prediction_beam_5_cands.pickle", "rb"
) as candidates_file:
    candidates = pickle.load(candidates_file)


In [3]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load candidate files that come from a trusted pipeline.
# The context manager guarantees the file handle is closed once loading ends.
with open(
    "/home/users1/hardy/hardy/project/vae/outputs/mind/seq_prediction_beam_10_cands.pickle", "rb"
) as candidates10_file:
    candidates10 = pickle.load(candidates10_file)

In [4]:
# One directory is handed to both loaders, each with the 'dev' split tag; the
# news table is loaded first because the history loader takes it as an input.
std_dev_data_path = '/home/users1/hardy/hardy/datasets/mind_resplit/MINDlarge_dev'
_dev_split_dir = Path(std_dev_data_path)
news_std = load_news_data(_dev_split_dir, 'dev')
behavior = load_history_data(_dev_split_dir, 'dev', news_std, fix_history=False)


In [5]:
def _clicked_ids(impressions):
    """Collect the ids of clicked entries from a list of impression strings.

    Each impression string is split on whitespace into ``<id>-<label>`` tokens.
    A token is kept when its label equals ``'1'``; the first character of the
    id part is dropped before the id is stored (as a string).
    """
    clicked = []
    for impression in impressions:
        for news in impression.split():
            parts = news.split('-')
            if parts[1] == '1':
                clicked.append(parts[0][1:])
    return clicked


# Gather every impression string per user, then reduce each user's list to
# the ids that carry a click label.
impressions_by_user = behavior.groupby('user_id')['impressions'].apply(list).apply(_clicked_ids)

In [6]:
# Attach each user's clicked-id list to every row of that user as the
# 'user_clicks' column (left join of the user_id column onto the per-user index).
_user_clicks = impressions_by_user.rename('user_clicks')
behavior = behavior.merge(
    _user_clicks,
    how='left',
    left_on='user_id',
    right_index=True,
)

In [8]:
# Attach the two loaded candidate collections (beam 5 and beam 10) as new
# columns of the behaviour frame.
behavior = behavior.assign(
    generated_impressions=candidates,
    generated_impressions10=candidates10,
)

In [99]:
# Sanity check: size of the second element of the first candidates entry.
len(candidates[0][1])

10

In [9]:
def _unique_candidate_ids(beams):
    """Flatten beam outputs into a de-duplicated list of item-id strings.

    Args:
        beams: iterable of candidate lists. Every candidate is indexable, with
            the item id at position 0 and the sort key at position 1
            (presumably a score -- TODO confirm against the generator output).

    Returns:
        Item ids as strings, ordered by ascending sort key, each id kept once
        at the position of its first occurrence.
    """
    flattened = [candidate for beam in beams for candidate in beam]
    ranked = sorted(flattened, key=lambda candidate: candidate[1])
    # dict.fromkeys removes duplicates while keeping the ranked order; passing
    # the ids through set() would discard the ordering and make it arbitrary.
    return list(dict.fromkeys(str(candidate[0]) for candidate in ranked))


# NOTE: both columns are overwritten in place, so this cell cannot be re-run
# without first re-assigning them from `candidates` / `candidates10`.
behavior['generated_impressions'] = behavior['generated_impressions'].apply(_unique_candidate_ids)
behavior['generated_impressions10'] = behavior['generated_impressions10'].apply(_unique_candidate_ids)

In [10]:
# Load the five aspect-vector files in a fixed order and bind each result to
# its own module-level name.
(
    std_aspect_vector,
    cat_aspect_vector,
    mfc_aspect_vector,
    sentiment_aspect_vector,
    political_aspect_vector,
) = (
    load_aspect_vectors(Path(vector_file))
    for vector_file in (
        '/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_std_sts_aspect_vectors.txt',
        '/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_category_aspect_vectors.txt',
        '/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_frame_aspect_vectors.txt',
        '/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_sentiment_aspect_vectors.txt',
        '/home/users1/hardy/hardy/project/vae/outputs/mind/dev_mind_political_aspect_vectors.txt',
    )
)

In [11]:
def _count_clicked_hits(row):
    """Count distinct ids present in both the beam-10 candidates and the user's clicks."""
    generated = set(row['generated_impressions10'])
    return len(generated.intersection(row['user_clicks']))


behavior['overlap_count'] = behavior.apply(_count_clicked_hits, axis=1)

In [12]:
# Total number of beam-10 candidate ids that also appear in the users' clicks,
# summed over all rows.
behavior['overlap_count'].sum()

np.int64(18815)

In [8]:
# Mean number of whitespace-separated entries per impression string.
behavior['impressions'].apply(lambda x: len(x.split())).mean()

np.float64(40.39401944260664)

In [17]:
# `numpy.concat` only exists from NumPy 2.0 onwards; importing `concatenate`
# under the same local name keeps this cell working on older releases too.
from numpy import concatenate as concat


def calculate_pairwise_dissimilarity(features):
    """Return the matrix of pairwise cosine dissimilarities.

    ``features`` is a sequence of equal-length vectors. Entry ``[i, j]`` of the
    result is ``1 - cosine_similarity(features[i], features[j])``.
    """
    stacked = np.stack(features)
    # Scale every row to unit length so a plain dot product is the cosine.
    unit_rows = stacked / np.linalg.norm(stacked, axis=1, keepdims=True)
    cosine = np.dot(unit_rows, unit_rows.T)
    return 1 - cosine

def extract_aspect_vectors(articles, aspect_vectors):
    """Build one concatenated feature vector per article.

    Args:
        articles: iterable of article keys.
        aspect_vectors: sequence of lookups, each indexable by an article key
            and returning that article's vector for one aspect.

    Returns:
        A list with one array per article, in input order. Each array is the
        concatenation of the article's vectors in ``aspect_vectors`` order.

    Raises:
        Whatever the lookups raise for an unknown article key, and ValueError
        (from NumPy) when ``aspect_vectors`` is empty but ``articles`` is not.
    """
    # np.concatenate exists on every NumPy release, whereas the `numpy.concat`
    # alias is only available from NumPy 2.0 onwards.
    return [
        np.concatenate([lookup[article] for lookup in aspect_vectors])
        for article in articles
    ]

def average_pairwise_dissimilarity(articles, aspect_vectors=None):
    """Mean cosine dissimilarity over all unordered pairs of articles.

    Args:
        articles: sequence of article keys understood by the aspect lookups.
        aspect_vectors: optional sequence of aspect lookups to concatenate per
            article. When omitted, the module-level standard, category, frame,
            political and sentiment lookups are used, in that order.

    Returns:
        ``0.0`` when fewer than two articles are given (no pair exists);
        otherwise the sum of the strict upper triangle of the dissimilarity
        matrix divided by the number of unordered pairs.
    """
    if len(articles) < 2:
        return 0.0  # A list with 0 or 1 item has no diversity
    if aspect_vectors is None:
        aspect_vectors = [
            std_aspect_vector,
            cat_aspect_vector,
            mfc_aspect_vector,
            political_aspect_vector,
            sentiment_aspect_vector,
        ]
    features = extract_aspect_vectors(articles, aspect_vectors)
    dissimilarity_matrix = calculate_pairwise_dissimilarity(features)

    # The matrix is symmetric, so the strict upper triangle (k=1 skips the
    # diagonal) contains every unordered pair exactly once.
    upper_triangle_sum = np.sum(np.triu(dissimilarity_matrix, k=1))

    # At least two features are present here, so num_pairs is always >= 1.
    num_pairs = len(features) * (len(features) - 1) / 2
    return upper_triangle_sum / num_pairs



In [None]:
def serendipity(recommended, clicked, aspect_vectors):
    """Score how far the recommendations stray from previously clicked items.

    For every recommended item, the cosine similarity to its most similar
    clicked item is taken; the score is the mean of ``1 - max_similarity``
    over all recommended items.

    Args:
        recommended: list of article keys that were recommended.
        clicked: list of article keys the user clicked before.
        aspect_vectors: sequence of aspect lookups, concatenated per article
            to form its feature vector.

    Returns:
        ``0.0`` when either list is empty, otherwise the mean distance of the
        recommended items to their nearest clicked item.
    """
    if not recommended or not clicked:
        return 0.0
    # Stack into explicit 2-D arrays instead of relying on NumPy to coerce the
    # lists of vectors during the division below.
    recommended_vecs = np.stack(extract_aspect_vectors(recommended, aspect_vectors))
    clicked_vecs = np.stack(extract_aspect_vectors(clicked, aspect_vectors))
    # Unit-normalise the rows so the dot product is the cosine similarity.
    recommended_norm = recommended_vecs / np.linalg.norm(recommended_vecs, axis=1, keepdims=True)
    clicked_norm = clicked_vecs / np.linalg.norm(clicked_vecs, axis=1, keepdims=True)
    similarity = np.dot(recommended_norm, clicked_norm.T)
    max_sim = np.max(similarity, axis=1)  # For each recommended, max similarity to any clicked
    return np.mean(1 - max_sim)

def novelty(recommended, all_clicked):
    """Fraction of recommended items that do not occur in ``all_clicked``.

    Returns ``0.0`` for an empty recommendation list.
    """
    if not recommended:
        return 0.0
    unseen = sum(1 for news in recommended if news not in all_clicked)
    return unseen / len(recommended)

In [18]:
def _impression_diversity(impression):
    """Diversity of the items listed in one raw impression string.

    Every whitespace-separated token is cut at the first '-', its leading
    character is dropped, and the remainder is converted to an int id.
    """
    shown_ids = [int(news.split('-')[0][1:]) for news in impression.split()]
    return average_pairwise_dissimilarity(shown_ids)


behavior['impressions_diversity'] = behavior['impressions'].apply(_impression_diversity)

In [19]:
# Diversity of the beam-5 generated items; the stored ids are converted to int
# before they are looked up.
behavior['gen_impressions_diversity'] = behavior['generated_impressions'].apply(
    lambda ids: average_pairwise_dissimilarity(list(map(int, ids)))
)


In [20]:
# Diversity of the beam-10 generated items; the stored ids are converted to int
# before they are looked up.
behavior['gen_impressions_diversity10'] = behavior['generated_impressions10'].apply(
    lambda ids: average_pairwise_dissimilarity(list(map(int, ids)))
)

In [21]:
# Column means, in order: beam-5 diversity, beam-10 diversity, diversity of the
# logged impressions, and the beam-10/click overlap count.
behavior['gen_impressions_diversity'].mean(), behavior['gen_impressions_diversity10'].mean(), behavior['impressions_diversity'].mean(), behavior['overlap_count'].mean()

(np.float32(0.5363731),
 np.float32(0.5786311),
 np.float32(0.6834256),
 np.float64(0.0454312771947786))

In [None]:
# History tokens: drop the leading character and convert the rest to int.
behavior['parsed_history'] = behavior['history'].apply(
    lambda clicked: [int(token[1:]) for token in clicked.split()]
)
# Impression tokens: keep the part before the first '-', drop its leading
# character and convert the rest to int.
behavior['parsed_impressions'] = behavior['impressions'].apply(
    lambda shown: [int(token.split('-')[0][1:]) for token in shown.split()]
)

In [38]:
# Integer versions of the generated id lists (beam 5 and beam 10).
behavior['parsed_generated_impressions'] = behavior['generated_impressions'].apply(
    lambda ids: list(map(int, ids))
)
behavior['parsed_generated_impressions10'] = behavior['generated_impressions10'].apply(
    lambda ids: list(map(int, ids))
)

In [36]:
_impression_aspects = [
    std_aspect_vector,
    cat_aspect_vector,
    mfc_aspect_vector,
    political_aspect_vector,
    sentiment_aspect_vector,
]


def _impression_serendipity(row):
    """Serendipity of the logged impressions relative to the row's history."""
    return serendipity(row['parsed_impressions'], row['parsed_history'], _impression_aspects)


# The column key keeps its existing spelling because the summary cell reads it
# back under exactly this name.
behavior['impression_serenditity'] = behavior.apply(_impression_serendipity, axis=1)

In [39]:
_beam5_aspects = [
    std_aspect_vector,
    cat_aspect_vector,
    mfc_aspect_vector,
    political_aspect_vector,
    sentiment_aspect_vector,
]


def _beam5_serendipity(row):
    """Serendipity of the beam-5 generated items relative to the row's history."""
    return serendipity(row['parsed_generated_impressions'], row['parsed_history'], _beam5_aspects)


behavior['gen_impression_serendipity'] = behavior.apply(_beam5_serendipity, axis=1)

In [40]:
_beam10_aspects = [
    std_aspect_vector,
    cat_aspect_vector,
    mfc_aspect_vector,
    political_aspect_vector,
    sentiment_aspect_vector,
]


def _beam10_serendipity(row):
    """Serendipity of the beam-10 generated items relative to the row's history."""
    return serendipity(row['parsed_generated_impressions10'], row['parsed_history'], _beam10_aspects)


behavior['gen_impression_serendipity10'] = behavior.apply(_beam10_serendipity, axis=1)

In [41]:
# Column means, in order: serendipity of the logged impressions, of the beam-5
# generated items, and of the beam-10 generated items.
behavior['impression_serenditity'].mean(), behavior['gen_impression_serendipity'].mean(), behavior['gen_impression_serendipity10'].mean()

(np.float32(0.43359718), np.float32(0.3906677), np.float32(0.40109715))

In [None]:
def history_nearest_neighbors(history, news_id, k=5):
    """Return up to ``k`` history ids numerically closest to ``news_id``.

    ``history`` is a whitespace-separated string of tokens; the first character
    of each token is dropped and the rest parsed as an int id. Closeness is the
    absolute difference between ids, with ties resolved by the smaller id.
    An empty history, or one that does not contain ``news_id``, yields ``[]``.
    """
    if not history:
        return []
    seen_ids = [int(token[1:]) for token in history.split()]
    if news_id not in seen_ids:
        return []
    others = [candidate for candidate in seen_ids if candidate != news_id]
    ranked = sorted(others, key=lambda candidate: (abs(news_id - candidate), candidate))
    return ranked[:k]

0         [64300, 48084, 91968, 13434, 73137, 1886, 9096...
1         [4833, 61319, 94639, 50163, 107002, 112041, 49...
2         [112192, 82348, 80126, 78767, 7553, 87364, 121...
3         [64593, 82779, 33216, 9321, 128643, 64496, 680...
4         [36816, 51964, 16426, 85101, 49839, 102403, 12...
                                ...                        
414137    [29295, 41003, 71165, 96713, 71977, 72976, 104...
414138    [50645, 107441, 14121, 71665, 81774, 54360, 58...
414139    [87446, 104938, 49247, 112324, 103181, 84940, ...
414140        [108621, 87105, 63676, 102743, 106985, 35508]
414141    [26834, 86208, 83597, 90359, 91280, 55318, 859...
Name: history, Length: 414142, dtype: object

In [None]:
def calculate_pairwise_dissimilarity(features):
    """Return the matrix of pairwise cosine dissimilarities.

    ``features`` is a sequence of equal-length vectors. Entry ``[i, j]`` of the
    result is ``1 - cosine_similarity(features[i], features[j])``.
    """
    stacked = np.stack(features)
    # Scale every row to unit length so a plain dot product is the cosine.
    unit_rows = stacked / np.linalg.norm(stacked, axis=1, keepdims=True)
    cosine = np.dot(unit_rows, unit_rows.T)
    return 1 - cosine

def average_pairwise_dissimilarity(features):
    """Mean cosine dissimilarity over all unordered pairs of feature vectors.

    NOTE: this definition shadows the earlier function of the same name, which
    takes article ids; this one expects the feature vectors themselves.

    Args:
        features: sequence of equal-length feature vectors.

    Returns:
        ``0.0`` when fewer than two vectors are given (no pair exists);
        otherwise the sum of the strict upper triangle of the dissimilarity
        matrix divided by the number of unordered pairs.
    """
    if len(features) < 2:
        return 0.0  # A list with 0 or 1 item has no diversity

    dissimilarity_matrix = calculate_pairwise_dissimilarity(features)

    # The matrix is symmetric, so the strict upper triangle (k=1 skips the
    # diagonal) contains every unordered pair exactly once.
    upper_triangle_sum = np.sum(np.triu(dissimilarity_matrix, k=1))

    # At least two features are present here, so num_pairs is always >= 1.
    num_pairs = len(features) * (len(features) - 1) / 2
    return upper_triangle_sum / num_pairs



In [41]:

# NOTE(review): `items` and `targets` are not defined in any of the cells shown
# here -- confirm where they come from before re-running on a fresh kernel.
len(items), len(targets)

(129, 25)

In [53]:
# Despite the `*_similarity` names, each value is an average pairwise
# DISsimilarity (higher means more diverse).
(
    item_similarity,
    target_similarity,
    targets_2_similarity,
    targets_3_similarity,
) = (
    average_pairwise_dissimilarity(group)
    for group in (items, targets, targets_2, targets_3)
)

In [54]:
# The printed labels say "similarity", but the values come from
# average_pairwise_dissimilarity, so larger numbers mean more diverse sets.
print(f"Item similarity: {item_similarity}")
print(f"Target similarity: {target_similarity}")
print(f"Targets 2 similarity: {targets_2_similarity}")
print(f"Targets 3 similarity: {targets_3_similarity}")

Item similarity: 0.5139478445053101
Target similarity: 0.4761843979358673
Targets 2 similarity: 0.4427332878112793
Targets 3 similarity: 0.35763439536094666
