Task: Recommend 10 unseen songs for every user
Goal: maximize nDCG
Data: user info
      item info
      user interactions with items (test + train)
      item embeddings
Submit: report.txt + codes.zip + recommendations.tsv
TODO: create train-test-val data splits
      setup nDCG evaluation
      produce & evaluate random recommendations
      produce and evaluate POP recommendations

      


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity
from tqdm import tqdm
from typing import Callable, List
from sklearn.metrics import ndcg_score




In [3]:
def inter_matr_implicit(users: pd.DataFrame,
                       items: pd.DataFrame,
                       interactions: pd.DataFrame,
                       threshold=1) -> np.ndarray:
    """
    Create an implicit (binary) interaction matrix from user-item interactions.

    Parameters:
        users: DataFrame containing user information; only its number of rows
            is used (it sets the number of matrix rows)
        items: DataFrame containing item information; only its number of rows
            is used (it sets the number of matrix columns)
        interactions: DataFrame with 'user_id', 'item_id' and 'count' columns.
            'user_id' and 'item_id' are used directly as row and column
            positions, so they must be valid 0-based indices into the matrix.
            The frame is not modified.
        threshold: Minimum 'count' for an interaction to be marked (default: 1)

    Returns:
        int8 numpy array of shape (len(users), len(items)) where entry [u, i]
        is 1 if the interaction count reaches the threshold and 0 otherwise
    """
    n_users = len(users.index)
    n_items = len(items.index)
    res = np.zeros([n_users, n_items], dtype=np.int8)

    row = interactions['user_id'].to_numpy()
    col = interactions['item_id'].to_numpy()

    # Binarise with a single comparison. Zeroing the low counts in place and
    # then re-testing `>= threshold` would flip them back to 1 whenever
    # threshold <= 0; a fresh boolean array avoids that and also leaves the
    # caller's DataFrame untouched without needing a defensive copy.
    data = (interactions['count'].to_numpy() >= threshold).astype(np.int8)

    res[row, col] = data

    return res

def get_ndcg_score_sk(df_predictions, test_interaction_matrix: np.ndarray, topK=10) -> float:
    """
    Calculate the mean NDCG@topK of recommendation predictions.

    Parameters:
        df_predictions: DataFrame with a 'user_id' column (used as the row
            index into test_interaction_matrix) and a 'recs' column holding a
            comma-separated string of recommended item ids, best first. Item
            ids are used as column indices into test_interaction_matrix.
        test_interaction_matrix: Ground truth interaction matrix (users x items)
        topK: Number of top recommendations to evaluate (default: 10)

    Returns:
        Average NDCG score across all rows of df_predictions; 0.0 when
        df_predictions is empty
    """
    n_predictions = len(df_predictions)
    if n_predictions == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    ndcg_sum = 0.0

    for user_id, recs in zip(df_predictions["user_id"], df_predictions["recs"]):
        g_truth = test_interaction_matrix[user_id]

        # int64 rather than int8: the rank scores go up to topK, which would
        # overflow an 8-bit buffer for topK > 127.
        predicted_scores = np.zeros(len(g_truth), dtype=np.int64)

        # Skip empty tokens so a user without any recommendation ("") simply
        # scores zero instead of raising in int('').
        predictions = [int(token) for token in recs.split(",") if token.strip()][:topK]

        # Encode the ranking as descending scores: first rec gets topK, next topK-1, ...
        for rank, rec in enumerate(predictions):
            predicted_scores[rec] = topK - rank

        ndcg_sum += ndcg_score(g_truth.reshape(1, -1), predicted_scores.reshape(1, -1), k=topK)

    return ndcg_sum / n_predictions


In [4]:
def compute_itemknn_scores(seen_item_ids: list, item_embeddings: pd.DataFrame, k: int = 10) -> np.ndarray:
    """
    ItemKNN-like scoring using item embeddings

    For every item the user has seen, find its k most cosine-similar items
    (the seen item itself is among them) and give each of those neighbours
    the highest similarity observed across all seen items. Items that are
    never a neighbour keep a score of 0; seen items are set to -inf so they
    are never recommended.

    seen_item_ids - list[int], items the user has seen
    item_embeddings - pd.DataFrame, must include 'item_id' and embedding columns
    k - int, number of nearest neighbors to consider; k <= 0 scores no neighbours

    returns - np.ndarray of scores, one per row of item_embeddings, ordered by
              ascending 'item_id'
    """
    embeddings_sorted = item_embeddings.sort_values('item_id')
    sorted_item_ids = embeddings_sorted['item_id'].to_numpy()
    vectors = embeddings_sorted.drop('item_id', axis=1).to_numpy()

    # Work with positions in the id-sorted order throughout. Using the raw
    # item ids as array positions is only correct when ids are exactly
    # 0..n-1, and would mask the wrong entries (or raise) otherwise.
    seen_positions = np.flatnonzero(np.isin(sorted_item_ids, seen_item_ids))

    recommendation_scores = np.zeros(len(sorted_item_ids))

    if k > 0:  # guard: a slice of [-0:] would select every item
        for position in seen_positions:
            similarity_list = cosine_similarity(vectors[position].reshape(1, -1), vectors).flatten()
            neighbour_positions = np.argsort(similarity_list)[-k:]
            # Keep the best similarity seen so far for each neighbour.
            recommendation_scores[neighbour_positions] = np.maximum(
                recommendation_scores[neighbour_positions],
                similarity_list[neighbour_positions],
            )

    recommendation_scores[seen_positions] = -np.inf
    return recommendation_scores


def cb_itemknn_recommendation(seen_item_ids: list, item_embeddings: pd.DataFrame,
                              _compute_itemknn_scores: Callable[[List[int], pd.DataFrame, int], np.ndarray],
                              top_k: int=10, knn_k: int=10) -> np.ndarray:
    """
    Recommends items to a user based on the items they have already seen, by sorting the calculated
    scores and selecting the top-k items.

    seen_item_ids - list[int], ids of items already seen by the user (to exclude from recommendation);
    item_embeddings - pd.DataFrame, unsorted DataFrame containing item_id and item embeddings as separate columns;
    _compute_itemknn_scores - function, called as f(seen_item_ids, item_embeddings, knn_k); must return one
            score per item, ordered by ascending item_id, with -inf marking items that must not be recommended;
    top_k - int, maximum number of recommendations to return;
    knn_k - int, number of nearest neighbours, forwarded to _compute_itemknn_scores;

    returns - 1D np.ndarray of recommended item ids, sorted by decreasing score. It holds fewer than
            top_k ids when fewer items are eligible, and is empty when top_k <= 0;
    """
    # Scores are positional in id-sorted order, so this array maps positions back to ids.
    item_ids_sorted = item_embeddings.sort_values('item_id').reset_index(drop=True)['item_id'].to_numpy()

    if top_k <= 0:
        # A slice of [-0:] would select *every* item; return nothing instead.
        return item_ids_sorted[:0]

    recs = np.asarray(_compute_itemknn_scores(seen_item_ids, item_embeddings, knn_k))
    top_k_indices = recs.argsort()[-top_k:][::-1]

    # Drop excluded (-inf) items that slip in when top_k exceeds the number of eligible items.
    top_k_indices = top_k_indices[~np.isneginf(recs[top_k_indices])]

    return item_ids_sorted[top_k_indices]


In [5]:
def read(dataset, file):
    """Load one tab-separated table of a dataset.

    The file is expected at '<dataset>/<dataset>.<file>', relative to the
    current working directory.

    dataset - str, dataset name (used as both directory and file-name prefix)
    file - str, file-name suffix after the dot (e.g. 'user')

    returns - pd.DataFrame parsed from the tab-separated file
    """
    file_name = ".".join((dataset, file))
    path = "/".join((dataset, file_name))
    return pd.read_csv(path, sep='\t')

# Every table of the 'lfm-challenge' dataset is read through `read`. A header
# line is printed after each load, but only the embeddings preview is
# actually shown below its header.

# Users
users = read(dataset='lfm-challenge', file='user')
print("Users Data Head:")

# Items
items = read(dataset='lfm-challenge', file='item')
print("\nItem Data Head:")

# Interactions: training split
train_inters = read(dataset='lfm-challenge', file='inter_train')
print("\nTraining Interactions Head:")

# Interactions: test split
test_inters = read(dataset='lfm-challenge', file='inter_test')
print("\nTesting Interactions Head:")

# Item embeddings ('musicnn' file)
embedding = read(dataset='lfm-challenge', file='musicnn')
print("\nEmbeddings Head:")
print(embedding.head())

# Binary user x item matrices for each split (default threshold).
train_interaction_matrix = inter_matr_implicit(users, items, interactions=train_inters)
test_interaction_matrix = inter_matr_implicit(users, items, interactions=test_inters)

Users Data Head:

Item Data Head:

Training Interactions Head:

Testing Interactions Head:

Embeddings Head:
   item_id         0         1         2         3         4         5  \
0        0  0.221942  0.006455  0.027300  0.091775  0.013135  0.137436   
1        1  0.166340  0.000332  0.018895  0.140315  0.002309  0.111743   
2        2  0.247896  0.003749  0.034527  0.036859  0.008251  0.115214   
3        3  0.229554  0.000968  0.028905  0.027514  0.002186  0.100847   
4        4  0.009760  0.000590  0.008925  0.721381  0.000711  0.073143   

          6         7         8  ...        40        41        42        43  \
0  0.082835  0.275749  0.126342  ...  0.058063  0.014128  0.000574  0.001193   
1  0.102853  0.483104  0.135297  ...  0.191162  0.014372  0.000179  0.000249   
2  0.030934  0.609462  0.058102  ...  0.009187  0.005204  0.000456  0.000602   
3  0.029319  0.564656  0.080171  ...  0.008916  0.004114  0.000110  0.000287   
4  0.454569  0.118651  0.368946  ...  0.008477

In [7]:
# One comma-separated string of 10 item-KNN recommendations per user,
# built from the items each user interacted with in the training matrix.
train_recs_list = []

for i in range(len(users)):
    # Column positions of the items user i interacted with in training.
    seen_item_ids = np.flatnonzero(train_interaction_matrix[i] > 0)
    train_rec_i = cb_itemknn_recommendation(
        seen_item_ids, embedding, compute_itemknn_scores, top_k=10, knn_k=10
    )
    train_recs_list.append(",".join(str(item_id) for item_id in train_rec_i))

# Row position in `users` doubles as the user id.
user_id_list = np.array(list(range(len(users))))

pop_train_df = pd.DataFrame(data={"user_id": user_id_list, "recs": train_recs_list})

# Mean NDCG@10 against the held-out test interactions.
print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix, topK=10))

0.010577907369733106
