Task: Recommend 10 unseen songs for every users
Goal: maximize nDCG
Data: user info
      item info
      user interactions with items(test + train)
      item embeddings
Submit: report.txt + codes.zip + recommendations.tsv
TODO: create train-test-val data splits
      setup nDCG evaluation
      produce & evaluate random recommendations
      produce and evaluate POP recommendations

      


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity
from rec import inter_matr_implicit
from tqdm import tqdm
from typing import Callable, List
from sklearn.metrics import ndcg_score
from rec import svd_decompose, svd_recommend_to_list  #SVD




In [38]:
def inter_matr_implicit(users: int,
                       items: int,
                       interactions: pd.DataFrame,
                       threshold=2) -> np.ndarray:
    """
    Create an implicit interaction matrix from user-item interactions.
    
    Parameters:
        users: DataFrame containing user information
        items: DataFrame containing item information
        interactions: DataFrame containing user-item interaction data
        threshold: Minimum value for a valid interaction (default: 1)
        
    Returns:
        2D numpy array where rows represent users and columns represent items
    """
    interactions = interactions.copy()

    n_users = len(users.index)
    n_items = len(items.index)
    res = np.zeros([n_users, n_items], dtype=np.int8)

    row = interactions['user_id'].to_numpy()
    col = interactions["item_id"].to_numpy()

    data = interactions['count'].to_numpy()  
    new_data = np.array(data)
    new_data[(data >= threshold * 16)] = 5
    new_data[(data >= threshold * 8) & (data < threshold * 16)] = 4
    new_data[(data >= threshold * 4) & (data < threshold * 8)] = 3
    new_data[(data >= threshold * 2) & (data < threshold * 4)] = 2
    new_data[(data >= threshold * 1) & (data < threshold * 2)] = 1
    new_data[data < threshold] = 0
    res[row, col] = new_data

    return res

def get_ndcg_score_sk(df_predictions, test_interaction_matrix: np.ndarray, topK=10) -> float:
    """
    Calculate the NDCG score for recommendation predictions.
    
    Parameters:
        df_predictions: DataFrame containing recommendation predictions
        test_interaction_matrix: Ground truth interaction matrix
        topK: Number of top recommendations to evaluate (default: 10)
        
    Returns:
        Average NDCG score across all users
    """
    ndcg_avg = 0
    
    for _, row in df_predictions.iterrows():
        g_truth = test_interaction_matrix[row["user_id"]]

        predicted_scores = np.zeros(len(g_truth),dtype=np.int8)

        predictions = list(map(int, row["recs"].split(",")))[:topK]

        for j, rec in enumerate(predictions):
            predicted_scores[rec] = topK-j

        ndcg_avg += ndcg_score(g_truth.reshape(1, -1), predicted_scores.reshape(1, -1), k=topK)

    return ndcg_avg/len(df_predictions)

def read(dataset, file):
    return pd.read_csv(dataset + '/' + dataset + '.' + file, sep='\t')
users = read('lfm-challenge', 'user')
items = read('lfm-challenge', 'item')
train_inters = read('lfm-challenge', 'inter_train')
train_interaction_matrix = inter_matr_implicit(users, items, train_inters)


In [39]:
def read(dataset, file):
    return pd.read_csv(dataset + '/' + dataset + '.' + file, sep='\t')

# Load User Data
users = read('lfm-challenge', 'user')
# print("Users Data Head:")
#print(users.head())

# Load Item Data
items = read('lfm-challenge', 'item')
# print("\nItem Data Head:")
#print(items.head())

# Load Training Interactions
train_inters = read('lfm-challenge', 'inter_train')
# print("\nTraining Interactions Head:")
#print(train_inters.head())

# Load Testing Interactions
test_inters = read('lfm-challenge', 'inter_test')
# print("\nTesting Interactions Head:")
#print(test_inters.head())

# Load Embeddings
embedding = read('lfm-challenge', 'musicnn')
# print("\nEmbeddings Head:")
# print(embedding.head())

train_interaction_matrix = inter_matr_implicit(users, items, train_inters)
test_interaction_matrix = inter_matr_implicit(users, items, test_inters)

In [None]:
for threshhold in [1,2,3,5,7,10]:
    train_interaction_matrix = inter_matr_implicit(users, items, train_inters,threshold=threshhold)
    test_interaction_matrix = inter_matr_implicit(users, items, test_inters,threshold=threshhold)
    train_recs_list = []
    U, V = svd_decompose(train_interaction_matrix)

    for i in range(len(users)):
        seen_item_ids = np.where(train_interaction_matrix[i] > 0)[0]
        train_rec_i = svd_recommend_to_list(i, seen_item_ids,U, V, 10)
        train_recs_list.append(",".join(map(str, train_rec_i)))
        #print(i)
        
    user_id_list = np.array([i for i in range(len(users))])

    pop_train_df = pd.DataFrame({"user_id": user_id_list, "recs": train_recs_list})

    print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix, 10))
# 0.11196116800106494
# 0.09512899502017655
# 0.07734446778603704
# 0.05192086340123569
# 0.03819978991773105
# 0.026667566210615334

0.09512899502017655
0.07734446778603704
0.05192086340123569
0.03819978991773105
0.026667566210615334


In [49]:
train_interaction_matrix = inter_matr_implicit(users, items, train_inters,threshold=1)
test_interaction_matrix = inter_matr_implicit(users, items, test_inters,threshold=1)
train_recs_list = []
U, V = svd_decompose(train_interaction_matrix)

for i in range(len(users)):
    seen_item_ids = np.where(train_interaction_matrix[i] > 0)[0]
    train_rec_i = svd_recommend_to_list(i, seen_item_ids,U, V, 10)
    train_recs_list.append(",".join(map(str, train_rec_i)))
    #print(i)
    
user_id_list = np.array([i for i in range(len(users))])

pop_train_df = pd.DataFrame({"user_id": user_id_list, "recs": train_recs_list})

print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix, 10))

0.11196116800106494


In [50]:
def inter_matr_implicit(users: int,
                       items: int,
                       interactions: pd.DataFrame,
                       threshold=2) -> np.ndarray:
    """
    Create an implicit interaction matrix from user-item interactions.
    
    Parameters:
        users: DataFrame containing user information
        items: DataFrame containing item information
        interactions: DataFrame containing user-item interaction data
        threshold: Minimum value for a valid interaction (default: 1)
        
    Returns:
        2D numpy array where rows represent users and columns represent items
    """
    interactions = interactions.copy()

    n_users = len(users.index)
    n_items = len(items.index)
    res = np.zeros([n_users, n_items], dtype=np.int8)

    row = interactions['user_id'].to_numpy()
    col = interactions["item_id"].to_numpy()

    data = interactions['count'].to_numpy()  
    new_data = np.array(data)
    new_data[(data >= threshold * 5)] = 5
    new_data[(data >= threshold * 4) & (data < threshold * 16)] = 4
    new_data[(data >= threshold * 3) & (data < threshold * 8)] = 3
    new_data[(data >= threshold * 2) & (data < threshold * 4)] = 2
    new_data[(data >= threshold * 1) & (data < threshold * 2)] = 1
    new_data[data < threshold] = 0
    res[row, col] = new_data

    return res

In [51]:
for threshhold in [1,2,3]:
    train_interaction_matrix = inter_matr_implicit(users, items, train_inters,threshold=threshhold)
    test_interaction_matrix = inter_matr_implicit(users, items, test_inters,threshold=threshhold)
    train_recs_list = []
    U, V = svd_decompose(train_interaction_matrix)

    for i in range(len(users)):
        seen_item_ids = np.where(train_interaction_matrix[i] > 0)[0]
        train_rec_i = svd_recommend_to_list(i, seen_item_ids,U, V, 10)
        train_recs_list.append(",".join(map(str, train_rec_i)))
        #print(i)
        
    user_id_list = np.array([i for i in range(len(users))])

    pop_train_df = pd.DataFrame({"user_id": user_id_list, "recs": train_recs_list})

    print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix, 10))

0.11196116800106494
0.09512899502017655
0.07734446778603704


In [53]:
def inter_matr_implicit(users: int,
                       items: int,
                       interactions: pd.DataFrame,
                       threshold=1) -> np.ndarray:
    """
    Create an implicit interaction matrix from user-item interactions.
    
    Parameters:
        users: DataFrame containing user information
        items: DataFrame containing item information
        interactions: DataFrame containing user-item interaction data
        threshold: Minimum value for a valid interaction (default: 1)
        
    Returns:
        2D numpy array where rows represent users and columns represent items
    """
    interactions = interactions.copy()

    n_users = len(users.index)
    n_items = len(items.index)
    res = np.zeros([n_users, n_items], dtype=np.int8)

    row = interactions['user_id'].to_numpy()
    col = interactions["item_id"].to_numpy()

    data = interactions['count'].to_numpy()
    data[data < threshold] = 0
    data[data >= threshold] = 1

    res[row, col] = data

    return res


In [54]:
for threshhold in [1,3, 5, 10]:
    train_interaction_matrix = inter_matr_implicit(users, items, train_inters,threshold=threshhold)
    test_interaction_matrix = inter_matr_implicit(users, items, test_inters,threshold=threshhold)
    train_recs_list = []
    U, V = svd_decompose(train_interaction_matrix)

    for i in range(len(users)):
        seen_item_ids = np.where(train_interaction_matrix[i] > 0)[0]
        train_rec_i = svd_recommend_to_list(i, seen_item_ids,U, V, 10)
        train_recs_list.append(",".join(map(str, train_rec_i)))
        #print(i)
        
    user_id_list = np.array([i for i in range(len(users))])

    pop_train_df = pd.DataFrame({"user_id": user_id_list, "recs": train_recs_list})

    print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix, 10))

0.12370514051845662
0.10505758952124714
0.07819203600367244
0.03773938752237527
