Task: Recommend 10 unseen songs for every users
Goal: maximize nDCG
Data: user info
      item info
      user interactions with items(test + train)
      item embeddings
Submit: report.txt + codes.zip + recommendations.tsv
TODO: create train-test-val data splits
      setup nDCG evaluation
      produce & evaluate random recommendations
      produce and evaluate POP recommendations

      


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity
from rec import inter_matr_implicit
from tqdm import tqdm
from typing import Callable, List
from sklearn.metrics import ndcg_score


In [2]:
def inter_matr_implicit(users: int,
                       items: int,
                       interactions: pd.DataFrame,
                       threshold=1) -> np.ndarray:
    """
    Create an implicit interaction matrix from user-item interactions.
    
    Parameters:
        users: DataFrame containing user information
        items: DataFrame containing item information
        interactions: DataFrame containing user-item interaction data
        threshold: Minimum value for a valid interaction (default: 1)
        
    Returns:
        2D numpy array where rows represent users and columns represent items
    """
    interactions = interactions.copy()

    n_users = len(users.index)
    n_items = len(items.index)
    res = np.zeros([n_users, n_items], dtype=np.int8)

    row = interactions['user_id'].to_numpy()
    col = interactions["item_id"].to_numpy()

    data = interactions['count'].to_numpy()
    data[data < threshold] = 0
    data[data >= threshold] = 1

    res[row, col] = data

    return res

def get_ndcg_score_sk(df_predictions, test_interaction_matrix: np.ndarray, topK=10) -> float:
    """
    Calculate the NDCG score for recommendation predictions.
    
    Parameters:
        df_predictions: DataFrame containing recommendation predictions
        test_interaction_matrix: Ground truth interaction matrix
        topK: Number of top recommendations to evaluate (default: 10)
        
    Returns:
        Average NDCG score across all users
    """
    ndcg_avg = 0
    
    for _, row in df_predictions.iterrows():
        g_truth = test_interaction_matrix[row["user_id"]]

        predicted_scores = np.zeros(len(g_truth),dtype=np.int8)

        predictions = list(map(int, row["recs"].split(",")))[:topK]

        for j, rec in enumerate(predictions):
            predicted_scores[rec] = topK-j

        ndcg_avg += ndcg_score(g_truth.reshape(1, -1), predicted_scores.reshape(1, -1), k=topK)

    return ndcg_avg/len(df_predictions)


In [3]:
def read(dataset, file):
    return pd.read_csv(dataset + '/' + dataset + '.' + file, sep='\t')

# Load User Data
users = read('lfm-challenge', 'user')
print("Users Data Head:")
#print(users.head())

# Load Item Data
items = read('lfm-challenge', 'item')
print("\nItem Data Head:")
#print(items.head())

# Load Training Interactions
train_inters = read('lfm-challenge', 'inter_train')
print("\nTraining Interactions Head:")
#print(train_inters.head())

# Load Testing Interactions
test_inters = read('lfm-challenge', 'inter_test')
print("\nTesting Interactions Head:")
#print(test_inters.head())

# Load Embeddings
embedding = read('lfm-challenge', 'musicnn')
print("\nEmbeddings Head:")
print(embedding.head())

train_interaction_matrix = inter_matr_implicit(users, items, train_inters)
test_interaction_matrix = inter_matr_implicit(users, items, test_inters)

Users Data Head:

Item Data Head:

Training Interactions Head:

Testing Interactions Head:

Embeddings Head:
   item_id         0         1         2         3         4         5  \
0        0  0.221942  0.006455  0.027300  0.091775  0.013135  0.137436   
1        1  0.166340  0.000332  0.018895  0.140315  0.002309  0.111743   
2        2  0.247896  0.003749  0.034527  0.036859  0.008251  0.115214   
3        3  0.229554  0.000968  0.028905  0.027514  0.002186  0.100847   
4        4  0.009760  0.000590  0.008925  0.721381  0.000711  0.073143   

          6         7         8  ...        40        41        42        43  \
0  0.082835  0.275749  0.126342  ...  0.058063  0.014128  0.000574  0.001193   
1  0.102853  0.483104  0.135297  ...  0.191162  0.014372  0.000179  0.000249   
2  0.030934  0.609462  0.058102  ...  0.009187  0.005204  0.000456  0.000602   
3  0.029319  0.564656  0.080171  ...  0.008916  0.004114  0.000110  0.000287   
4  0.454569  0.118651  0.368946  ...  0.008477

In [4]:

def recTopKPop(inter_matr: np.ndarray,
               user: int,
               top_k: int) -> np.array:
    '''
    inter_matr - np.ndarray, from the task 1;
    user - int, user_id;
    top_k - int, expected length of the resulting list;

    returns - list/array, of top K popular items that the user has never seen
              (sorted in the order of descending popularity);
    '''
    #items - columns
    top_pop = None
    pop_array = np.sum(inter_matr, axis=0)
    unseen_pop_dict = {i:pop_array[i] for i in range(len(pop_array)) if inter_matr[user,i] == 0}
    top_pop = sorted(unseen_pop_dict, key=unseen_pop_dict.get, reverse=True)[:top_k]
    return np.array(top_pop)

In [5]:
train_recs_list = []

for i in range(len(users)):
    train_rec_i = recTopKPop(train_interaction_matrix, user = i, top_k = 10)

    train_recs_list.append(",".join(map(str, train_rec_i)))
    
user_id_list = np.array([ i for i in range(len(users))])

pop_train_df = pd.DataFrame({"user_id":user_id_list, "recs":train_recs_list})

print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix,10))

0.008551183391723085


In [6]:
def recTopKPopByCountry(inter_matr: np.ndarray,
               user: int,
               top_k: int,
               users: pd.DataFrame) -> np.ndarray:
    '''
    inter_matr - np.ndarray, from the task 1;
    user - int, user_id;
    top_k - int, expected length of the resulting list;
    users: pandas Dataframe, consisting of user information for all users, requires a "country" column;

    returns - list/array of top K popular items that the user has never seen
              (sorted in the order of descending popularity);
    '''

    top_pop = None
    country = users.loc[users["user_id"] == user, "country"].values[0]
    if country is np.nan:
        return recTopKPop(inter_matr, user, top_k)
    else:
        users = users[users.country == country]
        user_ids = users.loc[:,'user_id']
        user_index = np.where(user_ids == user)[0][0]
        country_inter_matr = inter_matr[user_ids]
        top_pop = recTopKPop(country_inter_matr, user_index, top_k)
        return top_pop

In [7]:
train_recs_list = []

for i in range(len(users)):
    train_rec_i = recTopKPopByCountry(train_interaction_matrix, user = i, top_k = 10, users= users)

    train_recs_list.append(",".join(map(str, train_rec_i)))
    
user_id_list = np.array([ i for i in range(len(users))])

pop_train_df = pd.DataFrame({"user_id":user_id_list, "recs":train_recs_list})

print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix,10))

0.018999113087465942


In [8]:
def recTopKPopByGender(inter_matr: np.ndarray,
               user: int,
               top_k: int,
               users: pd.DataFrame) -> np.ndarray:
    '''
    inter_matr - np.ndarray, from the task 1;
    user - int, user_id;
    top_k - int, expected length of the resulting list;
    users: pandas Dataframe, consisting of user information for all users, requires a "country" column;

    returns - list/array of top K popular items that the user has never seen
              (sorted in the order of descending popularity);
    '''

    top_pop = None
    gender = users.loc[users["user_id"] == user, "gender"].values[0]
    if gender is np.nan:
        return recTopKPop(inter_matr, user, top_k)
    else:
        users = users[users.gender == gender]
        user_ids = users.loc[:,'user_id']
        user_index = np.where(user_ids == user)[0][0]
        gender_inter_matr = inter_matr[user_ids]
        top_pop = recTopKPop(gender_inter_matr, user_index, top_k)
        return top_pop

In [9]:
train_recs_list = []

for i in range(len(users)):
    train_rec_i = recTopKPopByGender(train_interaction_matrix, user = i, top_k = 10, users= users)

    train_recs_list.append(",".join(map(str, train_rec_i)))
    
user_id_list = np.array([ i for i in range(len(users))])

pop_train_df = pd.DataFrame({"user_id":user_id_list, "recs":train_recs_list})

print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix,10))

0.011769561306900796


In [10]:

def check_column_differs(df: pd.DataFrame, column: str, value) -> bool:
    """
    Returns True if any element in the specified column differs from the given value.
    """
    return (df[column] != value).any()

def print_different_elements(df: pd.DataFrame, column: str, value):
    """
    Prints the rows where the specified column differs from the given value.
    """

    diff_rows = df[df[column] != value]
    if not diff_rows.empty:
        print(diff_rows)
    else:
        print(f"All elements in column '{column}' are equal to '{value}'.")

string  = "40,124,1667,146,317,866,1549,2013,343,648"
print(print_different_elements(pop_test_df, 'recs', string))
#print(check_column_differs(pop_test_df, 'recs', str))

NameError: name 'pop_test_df' is not defined