In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity
from rec import inter_matr_implicit
from tqdm import tqdm
from typing import Callable, List, Dict

from rec import svd_decompose, svd_recommend_to_list
from rec import recTopK
from rec import recTopKPop
from sklearn.metrics import ndcg_score

import itertools

In [2]:
def read(dataset, file):
    """Load one tab-separated table of a dataset into a DataFrame.

    The table is read from ``<dataset>/<dataset>.<file>``, resolved relative
    to the current working directory.
    """
    table_path = f"{dataset}/{dataset}.{file}"
    return pd.read_csv(table_path, sep='\t')

DATASET_NAME = 'lfm-challenge'


def _load_with_preview(table_suffix, heading):
    """Read one dataset table, print ``heading`` followed by the table's head, and return the table."""
    table = read(DATASET_NAME, table_suffix)
    print(heading)
    print(table.head())
    return table


# Tables are loaded and previewed one after another: users, items,
# train/test interactions, then the 'musicnn' embedding table.
users = _load_with_preview('user', "Users Data Head:")
items = _load_with_preview('item', "\nItem Data Head:")
train_inters = _load_with_preview('inter_train', "\nTraining Interactions Head:")
test_inters = _load_with_preview('inter_test', "\nTesting Interactions Head:")
embedding = _load_with_preview('musicnn', "\nEmbeddings Head:")

# Interaction matrices produced by the project helper `inter_matr_implicit`.
train_interaction_matrix = inter_matr_implicit(users, items, train_inters, DATASET_NAME)
test_interaction_matrix = inter_matr_implicit(users, items, test_inters, DATASET_NAME)

Users Data Head:
   user_id country  age_at_registration gender    registration_date
0        0     NaN                   -1      n  2012-01-17 18:42:44
1        1     NaN                   -1      n  2011-03-24 13:27:26
2        2      US                   -1      m  2011-12-29 06:46:36
3        3     NaN                   -1      n  2012-04-16 11:21:04
4        4     NaN                   -1      n  2012-01-18 19:01:26

Item Data Head:
   item_id               artist                       song  \
0        0           Bucks Fizz        Making Your Mind Up   
1        1               Delain                    Invidia   
2        2  Death Cab for Cutie  Pictures in an Exhibition   
3        3             Maroon 5                        How   
4        4           Anne Clark      Sleeper In Metropolis   

                             album_name  \
0                            Bucks Fizz   
1                            April Rain   
2  You Can Play These Songs With Chords   
3            

In [3]:
def calculate_user_profile_embedding(seen_item_ids: list, item_embeddings: pd.DataFrame) -> np.ndarray:
    """Build a user profile by summing the embeddings of the items the user has seen.

    Args:
        seen_item_ids: ids of the items the user interacted with.
        item_embeddings: one row per item, with an ``item_id`` column plus the
            embedding columns.

    Returns:
        The column-wise sum over the rows whose ``item_id`` is in
        ``seen_item_ids``, as produced by ``DataFrame.sum(axis=0)`` (a pandas
        Series indexed by embedding column name).
    """
    seen_mask = item_embeddings['item_id'].isin(seen_item_ids)
    seen_features = item_embeddings.loc[seen_mask].drop(columns='item_id')
    return seen_features.sum(axis=0)


def average_embedding_similarity_rec(seen_item_ids: list, item_embeddings: pd.DataFrame, _calculate_user_profile_embedding: Callable[[List[int], pd.DataFrame], np.ndarray], top_k: int=10) -> np.ndarray:
    """Recommend the unseen items whose embeddings are most cosine-similar to the user profile.

    Args:
        seen_item_ids: ids of the items the user already interacted with; these
            are never recommended. Ids without a row in ``item_embeddings`` are
            ignored.
        item_embeddings: one row per item, with an ``item_id`` column plus the
            embedding columns.
        _calculate_user_profile_embedding: callable mapping
            ``(seen_item_ids, item_embeddings)`` to a 1-D profile vector
            (a pandas Series or a NumPy array) over the embedding columns.
        top_k: maximum number of items to return.

    Returns:
        Array of ``item_id`` values ordered by decreasing similarity; ties keep
        the row order of ``item_embeddings``. It holds fewer than ``top_k``
        entries when fewer unseen items exist.
    """
    item_ids = item_embeddings['item_id'].to_numpy()
    candidate_mask = ~np.isin(item_ids, seen_item_ids)
    # Nothing to rank: skip the similarity computation altogether.
    if top_k <= 0 or not candidate_mask.any():
        return item_ids[:0]

    user_profile_embedding = _calculate_user_profile_embedding(seen_item_ids, item_embeddings)
    # np.asarray accepts both a Series and a plain ndarray profile.
    profile_row = np.asarray(user_profile_embedding, dtype=float).reshape(1, -1)
    embed_values = item_embeddings.drop(columns='item_id').values
    similarities = cosine_similarity(profile_row, embed_values).ravel()

    # Seen items are removed instead of being scored 0: cosine similarity can be
    # negative, so a 0 score could still outrank unseen items.
    candidate_ids = item_ids[candidate_mask]
    ranking = np.argsort(-similarities[candidate_mask], kind='stable')[:top_k]
    return candidate_ids[ranking]

In [4]:
# Example run for the second row of the users table, using that user's
# training interactions as the listening history.
user_id_example = users['user_id'].iloc[1]
is_example_user = train_inters['user_id'] == user_id_example
seen_item_ids = train_inters.loc[is_example_user, 'item_id'].values.tolist()
recommended_items = average_embedding_similarity_rec(
    seen_item_ids,
    embedding,
    calculate_user_profile_embedding,
    top_k=10,
)
print(f"Recommended Items for User {user_id_example}: {recommended_items}")

Recommended Items for User 1: [ 449 2309 1011  707 4055  104 2396 1948 3142  928]


In [5]:
def compute_aggregated_scores(seen_item_ids: list, item_embeddings: pd.DataFrame) -> pd.DataFrame:
    """Score every item by its summed cosine similarity to the items the user has seen.

    Args:
        seen_item_ids: ids of the items the user interacted with.
        item_embeddings: one row per item, with an ``item_id`` column plus the
            embedding columns.

    Returns:
        DataFrame indexed by ``item_id`` with a single ``score`` column. When
        none of ``seen_item_ids`` has an embedding row, every score is 0.
    """
    item_features = item_embeddings.drop(columns='item_id')
    seen_features = item_features[item_embeddings['item_id'].isin(seen_item_ids)]

    if len(seen_features) == 0:
        # cosine_similarity rejects a 0-row input, so an empty history has to
        # be handled here: no seen item contributes any similarity.
        scores_agg = np.zeros(len(item_embeddings))
    else:
        scores_agg = cosine_similarity(seen_features, item_features).sum(axis=0)

    return pd.DataFrame(scores_agg, index=item_embeddings['item_id'], columns=['score'])

def aggregated_item_similarity_rec(seen_item_ids: list, item_embeddings: pd.DataFrame, _compute_aggregated_scores: Callable[[List[int], pd.DataFrame], pd.DataFrame], top_k: int=10) -> np.ndarray:
    """Recommend the unseen items with the highest aggregated similarity score.

    Args:
        seen_item_ids: ids of the items the user already interacted with; these
            are never recommended. Ids missing from the score index are ignored.
        item_embeddings: item embedding table, passed through unchanged to
            ``_compute_aggregated_scores``.
        _compute_aggregated_scores: callable mapping
            ``(seen_item_ids, item_embeddings)`` to a DataFrame indexed by item
            id with a ``score`` column.
        top_k: maximum number of items to return.

    Returns:
        Array of item ids ordered by decreasing score. It holds fewer than
        ``top_k`` entries when fewer unseen items exist.
    """
    scores = _compute_aggregated_scores(seen_item_ids, item_embeddings)['score']
    # Seen items are filtered out instead of being scored 0.0: aggregated cosine
    # similarities can be negative, so a 0.0 score could still rank a seen item
    # above unseen ones. Filtering also leaves the scorer's frame unmodified.
    unseen_scores = scores[~scores.index.isin(seen_item_ids)]
    return unseen_scores.nlargest(top_k).index.values

In [6]:
# Same example user as before; this time the history is read from that user's
# row of the training interaction matrix (entries greater than 0).
user_id_example = users['user_id'].iloc[1]
example_user_row = train_interaction_matrix[user_id_example, :]
seen_items_user_example = np.nonzero(example_user_row > 0)[0]
recommended_items = aggregated_item_similarity_rec(
    seen_items_user_example.tolist(),
    embedding,
    compute_aggregated_scores,
    top_k=10,
)
print(f"Recommended Items for User {user_id_example}: {recommended_items}")

Recommended Items for User 1: [ 449 2309 1011  707 4055  104 2396 1948 3142  928]


In [7]:
def evaluate_ndcg_by_user_groups(user_groups: dict, recommenders: dict, train_interaction_matrix: np.ndarray, test_interaction_matrix: np.ndarray,
                                 U: np.ndarray, V: np.ndarray, item_embeddings: pd.DataFrame, _calculate_user_profile_embedding,
                                 _compute_aggregated_scores, topK: int=10, n_neighbors: int=5) -> pd.DataFrame:
    """Compute the average nDCG@topK of every recommender within every user group.

    Args:
        user_groups: maps a group name to the user ids in that group; each id
            is used as a row index into both interaction matrices.
        recommenders: maps a recommender name to its function. Supported names
            are 'SVD', 'ItemKNN', 'TopPop', 'Avg_Item_Embd' and 'Aggr_Item_Sim';
            the name decides which arguments the function is called with.
        train_interaction_matrix: user x item matrix; entries > 0 mark the
            items a user has already seen.
        test_interaction_matrix: user x item matrix used as ground-truth relevance.
        U, V: factors handed to the 'SVD' recommender.
        item_embeddings: embedding table handed to the embedding-based recommenders.
        _calculate_user_profile_embedding: profile callable for 'Avg_Item_Embd'.
        _compute_aggregated_scores: scoring callable for 'Aggr_Item_Sim'.
        topK: number of recommendations requested and the nDCG cutoff.
        n_neighbors: neighbourhood size handed to 'ItemKNN'.

    Returns:
        DataFrame with one row per (group, recommender) pair and the columns
        'User Group', 'Recommender' and 'Average nDCG'. A group without users
        yields NaN.

    Raises:
        NotImplementedError: if a recommender name is not one of the supported ones.
    """
    n_items = train_interaction_matrix.shape[1]
    results = []

    for group_name, group_user_ids in user_groups.items():
        for recommender_name, recommender_func in tqdm(recommenders.items(), desc=f'Evaluating {group_name} Users'):
            nDCG_scores = []
            for user_id in group_user_ids:
                # Items already interacted with by the user
                seen_items = np.where(train_interaction_matrix[user_id, :] > 0)[0]

                if recommender_name == 'SVD':
                    recommendations = recommender_func(user_id, seen_items.tolist(), U, V, topK)
                elif recommender_name == 'ItemKNN':
                    recommendations = recommender_func(train_interaction_matrix, user_id, topK, n_neighbors)
                elif recommender_name == 'TopPop':
                    recommendations = recommender_func(train_interaction_matrix, user_id, topK)
                elif recommender_name == 'Avg_Item_Embd':
                    recommendations = recommender_func(seen_items.tolist(), item_embeddings, _calculate_user_profile_embedding, topK)
                elif recommender_name == 'Aggr_Item_Sim':
                    recommendations = recommender_func(seen_items.tolist(), item_embeddings, _compute_aggregated_scores, topK)
                else:
                    raise NotImplementedError(f'Recommender {recommender_name} not implemented.')

                # Integer dtype keeps an empty recommendation list usable as an index array.
                recommendations = np.asarray(recommendations, dtype=np.intp).ravel()

                # Encode the ranking in the predicted scores (first recommendation
                # gets the highest score) and cut the metric off at topK. With a
                # constant score of 1 all recommended items would be tied, so their
                # order would be ignored, and without `k` the whole catalogue would
                # be ranked.
                true_relevance = test_interaction_matrix[user_id, :].reshape(1, -1)
                predicted_scores = np.zeros((1, n_items))
                predicted_scores[0, recommendations] = np.arange(recommendations.size, 0, -1)
                nDCG_scores.append(ndcg_score(true_relevance, predicted_scores, k=topK))

            # np.mean of an empty list warns; report NaN explicitly for an empty group.
            avg_nDCG = np.mean(nDCG_scores) if nDCG_scores else np.nan
            results.append({'User Group': group_name, 'Recommender': recommender_name, 'Average nDCG': avg_nDCG})

    return pd.DataFrame(results)

In [8]:
def evaluate_recommenders(user_info: pd.DataFrame, parameters: dict, recommender: dict, user_threshold: int) -> (pd.DataFrame, dict):
    """Split users into low/high interaction groups and evaluate all recommenders per group.

    Args:
        user_info: user table with one row per row of the training interaction
            matrix. An 'interactions' column holding each user's number of
            training interactions is written to it in place.
        parameters: evaluation settings. Required keys:
            'train_interaction_matrix', 'test_interaction_matrix', 'U', 'V',
            'item_embeddings', 'topK' and 'n_neighbors'. Optional keys
            '_calculate_user_profile_embedding' and '_compute_aggregated_scores'
            override the module-level functions of the same purpose.
        recommender: maps recommender names to recommender functions.
        user_threshold: users with at most this many training interactions form
            the 'Low Interaction' group; all others form 'High Interaction'.

    Returns:
        A tuple ``(evaluation_results_df, user_groups)``: the result table
        returned by ``evaluate_ndcg_by_user_groups`` and the mapping from group
        name to the list of row indices of the interaction matrix in that group.
    """
    train_interaction_matrix = parameters['train_interaction_matrix']
    interaction_counts = np.sum(train_interaction_matrix > 0, axis=1)

    # Kept as an in-place column write so code that reads
    # user_info['interactions'] afterwards keeps working.
    user_info['interactions'] = interaction_counts

    # Groups are derived from the matrix rows themselves, because the evaluation
    # indexes the interaction matrices with these values. This does not depend
    # on the index labels of `user_info`.
    is_low_interaction = np.asarray(interaction_counts).ravel() <= user_threshold
    user_groups = {
        'Low Interaction': np.flatnonzero(is_low_interaction).tolist(),
        'High Interaction': np.flatnonzero(~is_low_interaction).tolist()
    }

    # Honour the callables supplied in `parameters`; fall back to the
    # module-level implementations when they are absent.
    profile_embedding_func = parameters.get('_calculate_user_profile_embedding', calculate_user_profile_embedding)
    aggregated_scores_func = parameters.get('_compute_aggregated_scores', compute_aggregated_scores)

    evaluation_results_df = evaluate_ndcg_by_user_groups(user_groups,
                                                         recommender,
                                                         train_interaction_matrix,
                                                         parameters['test_interaction_matrix'],
                                                         parameters['U'],
                                                         parameters['V'],
                                                         parameters['item_embeddings'],
                                                         profile_embedding_func,
                                                         aggregated_scores_func,
                                                         parameters['topK'],
                                                         parameters['n_neighbors'])

    return evaluation_results_df, user_groups

In [9]:
# Recommenders under evaluation, keyed by the names the evaluation loop dispatches on.
recommenders = dict(
    Avg_Item_Embd=average_embedding_similarity_rec,
    Aggr_Item_Sim=aggregated_item_similarity_rec,
    SVD=svd_recommend_to_list,
    ItemKNN=recTopK,
    TopPop=recTopKPop,
)

# Factors consumed by the 'SVD' recommender.
U, V = svd_decompose(train_interaction_matrix)

# Everything evaluate_recommenders looks up by key.
data = dict(
    train_interaction_matrix=train_interaction_matrix,
    test_interaction_matrix=test_interaction_matrix,
    U=U,
    V=V,
    item_embeddings=embedding,
    _calculate_user_profile_embedding=calculate_user_profile_embedding,
    _compute_aggregated_scores=compute_aggregated_scores,
    topK=10,
    n_neighbors=5,
)

evaluation_results_df, user_groups = evaluate_recommenders(
    users,
    data,
    recommenders,
    user_threshold=5,
)
print(f"Number of Users with low interaction levels: {len(user_groups['Low Interaction'])}")
print(f"Number of Users with high interaction levels: {len(user_groups['High Interaction'])}")
print(evaluation_results_df)

Evaluating Low Interaction Users: 100%|██████████| 5/5 [05:26<00:00, 65.31s/it] 
  score = np.sum(intersection) / np.sum(union)
  score = np.sum(intersection) / np.sum(union)
  score = np.sum(intersection) / np.sum(union)
  score = np.sum(intersection) / np.sum(union)
  score = np.sum(intersection) / np.sum(union)
  score = np.sum(intersection) / np.sum(union)
  score = np.sum(intersection) / np.sum(union)
Evaluating High Interaction Users:  60%|██████    | 3/5 [21:03<14:02, 421.12s/it]


KeyboardInterrupt: 