Task: Recommend 10 unseen songs for every users
Goal: maximize nDCG
Data: user info
      item info
      user interactions with items(test + train)
      item embeddings
Submit: report.txt + codes.zip + recommendations.tsv
TODO: create train-test-val data splits
      setup nDCG evaluation
      produce & evaluate random recommendations
      produce and evaluate POP recommendations

      


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity
from tqdm import tqdm
from typing import Callable, List
from sklearn.metrics import ndcg_score
from scipy.sparse import csr_matrix



In [3]:
def inter_matr_implicit(users: int,
                       items: int,
                       interactions: pd.DataFrame,
                       threshold=1) -> np.ndarray:
    """
    Create an implicit interaction matrix from user-item interactions.
    
    Parameters:
        users: DataFrame containing user information
        items: DataFrame containing item information
        interactions: DataFrame containing user-item interaction data
        threshold: Minimum value for a valid interaction (default: 1)
        
    Returns:
        2D numpy array where rows represent users and columns represent items
    """
    interactions = interactions.copy()

    n_users = len(users.index)
    n_items = len(items.index)
    res = np.zeros([n_users, n_items], dtype=np.int8)

    row = interactions['user_id'].to_numpy()
    col = interactions["item_id"].to_numpy()

    data = interactions['count'].to_numpy()
    data[data < threshold] = 0
    data[data >= threshold] = 1

    res[row, col] = data

    return res

def get_ndcg_score_sk(df_predictions, test_interaction_matrix: np.ndarray, topK=10) -> float:
    """
    Calculate the NDCG score for recommendation predictions.
    
    Parameters:
        df_predictions: DataFrame containing recommendation predictions
        test_interaction_matrix: Ground truth interaction matrix
        topK: Number of top recommendations to evaluate (default: 10)
        
    Returns:
        Average NDCG score across all users
    """
    ndcg_avg = 0
    
    for _, row in df_predictions.iterrows():
        g_truth = test_interaction_matrix[row["user_id"]]

        predicted_scores = np.zeros(len(g_truth),dtype=np.int8)

        predictions = list(map(int, row["recs"].split(",")))[:topK]

        for j, rec in enumerate(predictions):
            predicted_scores[rec] = topK-j

        ndcg_avg += ndcg_score(g_truth.reshape(1, -1), predicted_scores.reshape(1, -1), k=topK)

    return ndcg_avg/len(df_predictions)


In [4]:
def read(dataset, file):
    return pd.read_csv(dataset + '/' + dataset + '.' + file, sep='\t')

# Load User Data
users = read('lfm-challenge', 'user')
print("Users Data Head:")
#print(users.head())

# Load Item Data
items = read('lfm-challenge', 'item')
print("\nItem Data Head:")
#print(items.head())

# Load Training Interactions
train_inters = read('lfm-challenge', 'inter_train')
print("\nTraining Interactions Head:")
#print(train_inters.head())

# Load Testing Interactions
test_inters = read('lfm-challenge', 'inter_test')
print("\nTesting Interactions Head:")
#print(test_inters.head())

# Load Item Embeddings
item_embedding = read('lfm-challenge', 'musicnn')
print("\nEmbeddings Head:")
print(item_embedding.head())

# Load User Embeddings
user_embedding = read('lfm-challenge', 'usernn')
print("\nEmbeddings Head:")
print(user_embedding.head())

train_interaction_matrix = inter_matr_implicit(users, items, train_inters)
test_interaction_matrix = inter_matr_implicit(users, items, test_inters)

Users Data Head:

Item Data Head:

Training Interactions Head:

Testing Interactions Head:

Embeddings Head:
   item_id         0         1         2         3         4         5  \
0        0  0.221942  0.006455  0.027300  0.091775  0.013135  0.137436   
1        1  0.166340  0.000332  0.018895  0.140315  0.002309  0.111743   
2        2  0.247896  0.003749  0.034527  0.036859  0.008251  0.115214   
3        3  0.229554  0.000968  0.028905  0.027514  0.002186  0.100847   
4        4  0.009760  0.000590  0.008925  0.721381  0.000711  0.073143   

          6         7         8  ...        40        41        42        43  \
0  0.082835  0.275749  0.126342  ...  0.058063  0.014128  0.000574  0.001193   
1  0.102853  0.483104  0.135297  ...  0.191162  0.014372  0.000179  0.000249   
2  0.030934  0.609462  0.058102  ...  0.009187  0.005204  0.000456  0.000602   
3  0.029319  0.564656  0.080171  ...  0.008916  0.004114  0.000110  0.000287   
4  0.454569  0.118651  0.368946  ...  0.008477

In [5]:



class BPR_MF:
    """
    Bayesian Personalised Ranking - Matrix-Factorisation (Rendle et al., 2009)

    Parameters
    ----------
    n_users : int
    n_items : int
    n_factors : int         # latent dimension
    lr : float              # SGD learning-rate
    reg : float             # L2 regularisation
    n_iter : int            # epochs
    seed : int
    """

    def __init__(self, n_users, n_items,
                 n_factors=64, lr=0.05, reg=0.002,
                 n_iter=50, seed=42):
        rng = np.random.default_rng(seed)
        self.n_users, self.n_items = n_users, n_items
        self.k = n_factors
        self.P = 0.01 * rng.standard_normal((n_users, n_factors))  # user factors
        self.Q = 0.01 * rng.standard_normal((n_items, n_factors))  # item factors
        self.lr, self.reg, self.n_iter = lr, reg, n_iter

    # ---------- training -------------------------------------------------- #
    def fit(self, interactions: csr_matrix,
            samples_per_epoch: int | None = None):
        """
        interactions : csr_matrix  (binary implicit feedback)
        samples_per_epoch : int    (# (u,i,j) triplets per epoch - default = 10x|interactions|)
        """
        if not isinstance(interactions, csr_matrix):
            interactions = csr_matrix(interactions)

        user_pos_items = [interactions[u].indices
                          for u in range(self.n_users)]

        if samples_per_epoch is None:
            samples_per_epoch = 10 * interactions.nnz

        rng = np.random.default_rng()

        for epoch in range(self.n_iter):
            #print("Training epoch:",epoch)
            for _ in range(samples_per_epoch):
                # draw positive (u,i)
                u = rng.integers(self.n_users)
                if len(user_pos_items[u]) == 0:
                    continue
                i = rng.choice(user_pos_items[u])
                # draw negative j  (re-draw until unseen)
                j = rng.integers(self.n_items)
                while j in user_pos_items[u]:
                    j = rng.integers(self.n_items)

                # x̂_uij = p_u·q_i  −  p_u·q_j
                x_uij = self.P[u] @ (self.Q[i] - self.Q[j])
                sigmoid = 1. / (1. + np.exp(x_uij))        # −∂ ln σ(x_uij)

                # SGD updates
                grad_p = sigmoid * (self.Q[j] - self.Q[i]) + self.reg * self.P[u]
                grad_qi = sigmoid * (-self.P[u]) + self.reg * self.Q[i]
                grad_qj = sigmoid * self.P[u] + self.reg * self.Q[j]

                self.P[u] -= self.lr * grad_p
                self.Q[i] -= self.lr * grad_qi
                self.Q[j] -= self.lr * grad_qj

    # ---------- inference -------------------------------------------------- #
    def _score(self, u: int) -> np.ndarray:
        """ Raw scores for ALL items for a single user """
        return self.P[u] @ self.Q.T

    def recommend(self, u: int,
                  train_mat: np.ndarray | csr_matrix,
                  N: int = 10) -> np.ndarray:
        """ Top-N unseen items for user u """
        seen = set(np.where(train_mat[u] > 0)[0])
        scores = self._score(u)
        scores[list(seen)] = -np.inf                     # filter watched / listened
        topN = np.argpartition(scores, -N)[-N:]
        return topN[np.argsort(-scores[topN])]           # sort descending


def get_prediction(user_id: int,
                   n_items: int,
                   model: BPR_MF,
                   train_mat,
                   topN=10):
    return model.recommend(user_id, train_mat, N=topN)


In [6]:

factors = [256]
lr = [0.03]
reg = [1e-1, 1e-2, 1e-3, 1e-4]
results = []
for n_factor in factors:
    for learning_rate in lr:
        for regularization in reg:
            np.random.seed(42)
            print(f"Training with n_factors={n_factor}, lr={learning_rate}, reg={regularization}")
            train_csr = csr_matrix(train_interaction_matrix)
            print("Created csr matrix")
            # Step 2: fit BPR-MF
            model = BPR_MF(n_users=len(users),
                        n_items=len(items),
                        n_factors=n_factor, lr=learning_rate, reg=regularization,
                        n_iter=50)
            model.fit(train_csr)          # ~2–3 min on 1 M users × 25 k items, CPU

            print("model fitted")


            train_recs_list = []

            for u in range(len(users)):
                recs_u = get_prediction(u, len(items), model, train_interaction_matrix, topN=10)
                train_recs_list.append(",".join(map(str, recs_u)))
                if u %1000 ==1:
                    print(u)
                
            user_id_list = np.array([i for i in range(len(users))])

            pop_train_df = pd.DataFrame({"user_id": user_id_list, "recs": train_recs_list})

            print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix, 10))
            results.append({
                "n_factors": n_factor,
                "lr": learning_rate,
                "reg": regularization,
                "ndcg_score": get_ndcg_score_sk(pop_train_df, test_interaction_matrix, 10)
            })
            save_path = f"bpr_mf_{n_factor}_{learning_rate}_{regularization}.tsv"
            #save model
            with open(save_path, 'w') as f:
                for u in range(len(users)):
                    recs_u = get_prediction(u, len(items), model, train_interaction_matrix, topN=10)
                    f.write(f"{u}\t{','.join(map(str, recs_u))}\n")


Training with n_factors=256, lr=0.03, reg=0.1
Created csr matrix
model fitted
1
1001
2001
0.08033628008615376
Training with n_factors=256, lr=0.03, reg=0.01
Created csr matrix
model fitted
1
1001
2001
0.15897677946002672
Training with n_factors=256, lr=0.03, reg=0.001
Created csr matrix
model fitted
1
1001
2001
0.14944080258196968
Training with n_factors=256, lr=0.03, reg=0.0001
Created csr matrix
model fitted
1
1001
2001
0.14290828152055163


In [7]:
for res in results:
    print(f"n_factors: {res['n_factors']}, lr: {res['lr']}, reg: {res['reg']}, ndcg_score: {res['ndcg_score']}")

n_factors: 256, lr: 0.03, reg: 0.1, ndcg_score: 0.08033628008615376
n_factors: 256, lr: 0.03, reg: 0.01, ndcg_score: 0.15897677946002672
n_factors: 256, lr: 0.03, reg: 0.001, ndcg_score: 0.14944080258196968
n_factors: 256, lr: 0.03, reg: 0.0001, ndcg_score: 0.14290828152055163


In [8]:
train_recs_list = []

for i in range(len(users)):
    seen_item_ids = np.where(train_interaction_matrix[i] > 0)[0]
    train_rec_i = get_prediction(i,len(items), model, train_inters, device, topN=10)
    train_recs_list.append(",".join(map(str, train_rec_i)))
    
user_id_list = np.array([i for i in range(len(users))])

pop_train_df = pd.DataFrame({"user_id": user_id_list, "recs": train_recs_list})

print(get_ndcg_score_sk(pop_train_df, test_interaction_matrix, 10))

NameError: name 'device' is not defined