In [1]:
import os
import re
import gc
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

np.random.seed(42)

In [2]:
# @title Download dataset snippet at https://www.kaggle.com/datasets/himanshuwagh/spotify-million
import kagglehub

path = "/root/.cache/kagglehub/datasets/himanshuwagh/spotify-million/versions/1"
if not os.path.exists(path):
  # Download latest version
  path = kagglehub.dataset_download("himanshuwagh/spotify-million")

# contiene le slices del dataset: 1000 slice das 1000 playlist ciascuna
data: str = os.path.join(path, "data")

In [3]:
# @title New: shuffle slices in a list and pick from them
shuffled_slices = np.array(os.listdir(data))
np.random.shuffle(shuffled_slices)

In [4]:
shuffled_slices[:3]

array(['mpd.slice.528000-528999.json', 'mpd.slice.920000-920999.json',
       'mpd.slice.730000-730999.json'], dtype='<U28')

In [5]:
# @title Metriche
# def precision_at_k(predicted_matrix, ground_truth_matrix, k):
#     precision_scores = []

#     # Per ogni playlist (riga della matrice)
#     for pred_row, true_row in zip(predicted_matrix, ground_truth_matrix):
#         # Ottieni gli indici delle prime k predizioni ordinate per punteggio
#         top_k_indices = np.argsort(pred_row)[::-1][:k]

#         # Trova se queste predizioni sono nella ground truth
#         relevant_items = true_row[top_k_indices]  # Valori nella ground truth per i top-k

#         # Precision è il numero di rilevanti tra i top-k diviso k
#         precision_scores.append(np.sum(relevant_items) / k)

#     return np.mean(precision_scores)


def precision_at_k(predicted_matrix, ground_truth_matrix, k):
  # Usa argsort per ottenere i top k indici in modo efficiente
  top_k_indices = np.argsort(predicted_matrix, axis=1)[:, ::-1][:, :k]

  # Estrai gli elementi rilevanti nelle prime k raccomandazioni
  relevant_items = ground_truth_matrix[np.arange(ground_truth_matrix.shape[0])[:, None], top_k_indices]

  # Calcola la precisione come il numero di elementi rilevanti diviso k
  precision_scores = np.sum(relevant_items, axis=1) / k

  # Restituisci la precisione media
  return np.mean(precision_scores)

import numpy as np





# def recall_at_k(predicted_matrix, ground_truth_matrix, k):
#     recall_scores = []

#     # Per ogni playlist (riga della matrice)
#     for pred_row, true_row in zip(predicted_matrix, ground_truth_matrix):
#         # Ottieni gli indici delle prime k predizioni ordinate per punteggio
#         top_k_indices = np.argsort(pred_row)[::-1][:k]

#         # Trova se queste predizioni sono nella ground truth
#         relevant_items = true_row[top_k_indices]  # Valori nella ground truth per i top-k

#         # Recall è il numero di rilevanti trovati diviso i rilevanti totali
#         total_relevant = np.sum(true_row)  # Rilevanti totali nella ground truth
#         recall_scores.append(np.sum(relevant_items) / total_relevant if total_relevant > 0 else 0)

#     return np.mean(recall_scores)


# def recall_at_k(predicted_matrix, ground_truth_matrix, k):
#   # Usa argsort per ottenere i top k indici
#   top_k_indices = np.argsort(predicted_matrix, axis=1)[:, ::-1][:, :k]

#   # Per ogni playlist, calcola quanti elementi rilevanti sono nelle prime k predizioni
#   relevant_items = ground_truth_matrix[np.arange(ground_truth_matrix.shape[0])[:, None], top_k_indices]

#   # Calcola il recall per ogni playlist
#   total_relevant = np.sum(ground_truth_matrix, axis=1)  # Numero totale di rilevanti per playlist

#   # Evita divisione per zero
#   recall_scores = np.sum(relevant_items, axis=1) / total_relevant
#   recall_scores[total_relevant == 0] = 0  # Imposta il recall a 0 per le playlist senza tracce rilevanti

#   # Restituisci la media del recall
#   return np.mean(recall_scores)

def recall_at_k(predicted_matrix, ground_truth_matrix, k):
    # Get the indices of the top k predictions for each row
    top_k_indices = np.argsort(predicted_matrix, axis=1)[:, -k:][:, ::-1]

    # Gather the relevant items in ground truth corresponding to top k predictions
    relevant_items = ground_truth_matrix[np.arange(ground_truth_matrix.shape[0])[:, None], top_k_indices]

    # Calculate the recall for each playlist
    total_relevant = np.sum(ground_truth_matrix, axis=1)  # Total relevant items per playlist

    # Avoid division by zero: mask rows with no relevant items
    recall_scores = np.sum(relevant_items, axis=1) / np.maximum(total_relevant, 1)

    # Return the mean recall, ignoring rows with no relevant items
    return np.mean(recall_scores[total_relevant > 0])


# def mean_reciprocal_rank(predicted_matrix, ground_truth_matrix):
#     reciprocal_ranks = []

#     # Per ogni playlist (riga della matrice)
#     for pred_row, true_row in zip(predicted_matrix, ground_truth_matrix):
#         # Ottieni gli indici ordinati in base al punteggio predetto
#         sorted_indices = np.argsort(pred_row)[::-1]

#         # Trova i rank della prima traccia rilevante
#         for rank, index in enumerate(sorted_indices, start=1):
#             if true_row[index] == 1:  # Se l'indice predetto è nella ground truth
#                 reciprocal_ranks.append(1 / rank)
#                 break
#         else:
#             reciprocal_ranks.append(0)  # Nessun elemento rilevante trovato

#     return np.mean(reciprocal_ranks)
def mean_reciprocal_rank(predicted_matrix, ground_truth_matrix):
    reciprocal_ranks = []

    # Iterate over each playlist (row in the matrix)
    for pred_row, true_row in zip(predicted_matrix, ground_truth_matrix):
        # Get the indices sorted by predicted scores in descending order
        sorted_indices = np.argsort(pred_row)[::-1]

        # Find the rank of the first relevant item
        found_relevant = False
        for rank, index in enumerate(sorted_indices, start=1):
            if true_row[index] == 1:  # If the item is relevant in the ground truth
                reciprocal_ranks.append(1 / rank)
                found_relevant = True
                break

        # If no relevant items were found, append 0
        if not found_relevant:
            reciprocal_ranks.append(0)

    # Return the mean of the reciprocal ranks
    return np.mean(reciprocal_ranks)


# dataset

In [6]:
million_df = pd.DataFrame()
num_training_files = 500

# Create an empty list to hold all rows as dictionaries
data_list = []

#for i, filename in tqdm(enumerate(sorted(os.listdir(data), key=extract_starting_number)[:num_training_files]), desc="Processing Slices"):
for i, filename in tqdm(enumerate(shuffled_slices[:num_training_files]), desc="Processing Slices"):
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        filepath = os.path.join(data, filename)

        with open(filepath, "r", encoding="utf-8") as jsonfile:
            cur_slice = json.load(jsonfile)

        # for playlist in tqdm(cur_slice["playlists"], desc="Processing playlist..."):
        for playlist in cur_slice["playlists"]:
            playlist_id = playlist["pid"]
            # num_tracks = playlist["num_tracks"]

            # Collect data for the playlist
            for track in playlist["tracks"]:
                data_list.append({
                    "playlist": playlist_id,
                    "track": track["track_uri"][14:]  # remove 'spotify:track:'
                })
    # update every 30 files for speedup
    if i%30 == 0:
        new_data = pd.DataFrame(data_list)
        data_list.clear()
        million_df = pd.concat([million_df, new_data], ignore_index=True)

# Convert the list of dictionaries into a DataFrame in one go
# dumb_dataset = pd.DataFrame(data_list)
new_data = pd.DataFrame(data_list)
data_list = []
million_df = pd.concat([million_df, new_data], ignore_index=True)

million_df["playlist"] = million_df["playlist"].astype("int32")

Processing Slices: 500it [04:27,  1.87it/s]


In [7]:
million_df.shape

(33170567, 2)

In [8]:
million_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33170567 entries, 0 to 33170566
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   playlist  int32 
 1   track     object
dtypes: int32(1), object(1)
memory usage: 379.6+ MB


In [9]:
million_df.head()

Unnamed: 0,playlist,track
0,528000,5mmgfPAMIFIhlP2VneJc0G
1,528000,40riOy7x9W7GXjyGp4pjAv
2,528000,4efoEY8iDBzUqitjmNDhpN
3,528000,0NqQmmLEN9rlnkh2JW0UIs
4,528000,1MQCTOWVfy4PcuBXkBsHVB


In [10]:
# Count how many playlists each track appears in
track_frequency = million_df.groupby("track")["playlist"].nunique()

# Convert to a DataFrame for easier handling
track_frequency_df = track_frequency.reset_index().rename(columns={"playlist": "playlist_count"})

# Total number of playlists
# total_playlists = million_df["playlist_id"].nunique()
total_playlists = 1000*num_training_files

# threshold
threshold = total_playlists * 0.00005

# Filter tracks that appear in at least 25% of playlists
popular_tracks = track_frequency_df[track_frequency_df["playlist_count"] >= threshold]

# Extract popular track IDs
popular_track_ids = popular_tracks["track"].tolist()

# Filter the original dataset
filtered_df = million_df[million_df["track"].isin(popular_track_ids)]


In [11]:
track_frequency_df.max()

Unnamed: 0,0
track,7zzyrYnZIfvYAGwl7lRb7X
playlist_count,22593


In [12]:
track_frequency.shape

(1608601,)

In [13]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28469745 entries, 1 to 33170566
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   playlist  int32 
 1   track     object
dtypes: int32(1), object(1)
memory usage: 543.0+ MB


In [14]:
filtered_df.head()

Unnamed: 0,playlist,track
1,528000,40riOy7x9W7GXjyGp4pjAv
2,528000,4efoEY8iDBzUqitjmNDhpN
3,528000,0NqQmmLEN9rlnkh2JW0UIs
4,528000,1MQCTOWVfy4PcuBXkBsHVB
5,528000,1hGy2eLcmC8eKx7qr1tOqx


In [15]:
filtered_df.shape

(28469745, 2)

In [16]:
# # Map playlist_id and track_id to numerical indices
# playlist_map = {id_: idx for idx, id_ in enumerate(filtered_df["playlist"].unique())}
# track_uri_to_idx = {id_: idx for idx, id_ in enumerate(filtered_df["track"].unique())}

# filtered_df["playlist_idx"] = filtered_df["playlist"].map(playlist_map)
# filtered_df["track_idx"] = filtered_df["track"].map(track_uri_to_idx)

# # Create COO matrix
# rows = filtered_df["playlist_idx"]
# cols = filtered_df["track_idx"]
# data_list = np.ones(len(filtered_df))  # All entries are 1 since a track belongs to a playlist

# coo_rating_matrix = coo_matrix((data_list, (rows, cols)), shape=(len(playlist_map), len(track_uri_to_idx)))
# coo_rating_matrix.shape

# --- warning

# Make an explicit copy of filtered_df
filtered_df = filtered_df.copy()

# Map playlist_id and track_id to numerical indices
playlist_id_to_idx = {id: idx for idx, id in enumerate(filtered_df["playlist"].unique())}
track_uri_to_idx = {uri: idx for idx, uri in enumerate(filtered_df["track"].unique())}

filtered_df["playlist_idx"] = filtered_df["playlist"].map(playlist_id_to_idx)
filtered_df["track_idx"] = filtered_df["track"].map(track_uri_to_idx)

# Create COO matrix
rows = filtered_df["playlist_idx"]
cols = filtered_df["track_idx"]
data_list = np.ones(len(filtered_df))  # All entries are 1 since a track belongs to a playlist

coo_rating_matrix = coo_matrix((data_list, (rows, cols)), shape=(len(playlist_id_to_idx), len(track_uri_to_idx)))
print(coo_rating_matrix.shape)  # Output: (485376, 18857)


(497138, 119003)


In [17]:
coo_rating_matrix.shape

(497138, 119003)

In [18]:
# per rendere il codice della funzione di valutazione come quello sopra
num_tracks = coo_rating_matrix.shape[1]
tracks = set(track_uri_to_idx.keys())

In [19]:
del(million_df, track_frequency_df, filtered_df)
del(rows, cols, data_list, new_data)
del(popular_tracks, popular_track_ids)
del(playlist_id_to_idx, track_frequency)

gc.collect()

0

In [20]:
gc.collect()

0

# Training

In [21]:
class ScipySVD():
  def __init__(self, n_components, **kwargs):
    self.n_components = n_components
    self.kwargs = kwargs


  def fit(self, X):
    _, _, components_ = svds(X, self.n_components, **self.kwargs)
    self.components_ = components_


  def transform(self ,X):
    return X @ self.components_.T

In [22]:
svd_model = ScipySVD(600, random_state=42)

svd_model.fit(coo_rating_matrix)

# Validation

In [23]:
def evaluate_model_k_tracks_removed_df(model, k, num_valid_files=10):
  """
  evaluate model processing a slice of playlists, 200 playlist at time to avoid
  colab cpu overflow
  """

  # num_valid_files = 1000 - num_training_files
  precision_at_10 = np.zeros(num_valid_files)
  precision_at_5 = np.zeros(num_valid_files)
  precision_at_2 = np.zeros(num_valid_files)
  precision_at_1 = np.zeros(num_valid_files)

  recall_at_10 = np.zeros(num_valid_files)
  recall_at_5 = np.zeros(num_valid_files)
  recall_at_2 = np.zeros(num_valid_files)
  recall_at_1 = np.zeros(num_valid_files)

  mrr = np.zeros(num_valid_files)

  # for file_idx, filename in tqdm(enumerate(sorted(os.listdir(data), key=extract_starting_number)[num_training_files:num_training_files+num_valid_files]), desc="Processing Slices"):
  for file_idx, filename in tqdm(enumerate(shuffled_slices[num_training_files:num_training_files+num_valid_files]), desc="Processing Slices"):
    correct_playlists = np.zeros((1000, num_tracks))
    # print(filename)
    p_counter = -1
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
      filepath = os.path.join(data, filename)

      with open(filepath, "r", encoding="utf-8") as jsonfile:
        cur_slice = json.load(jsonfile)

      for playlist in cur_slice["playlists"]:
        p_counter += 1

        for track in playlist["tracks"]:
          track_uri = track["track_uri"][14:]

          if track_uri in tracks:
            t_idx = track_uri_to_idx[track_uri]

            correct_playlists[p_counter, t_idx] = 1


    incomplete_playlists = np.copy(correct_playlists)

    # Turn exactly k ones to zeros per row
    for row in incomplete_playlists:
      # Get the indices of `1`s in the current row
      one_indices = np.where(row == 1)[0]

      # If there are at least k ones, randomly choose 2 of them
      if len(one_indices) >= k:
        indices_to_zero = np.random.choice(one_indices, size=k, replace=False)
        row[indices_to_zero] = 0


    cur_precision_at_10 = [0 for _ in range(5)]
    cur_precision_at_5 = [0 for _ in range(5)]
    cur_precision_at_2 = [0 for _ in range(5)]
    cur_precision_at_1 = [0 for _ in range(5)]

    cur_recall_at_10 = [0 for _ in range(5)]
    cur_recall_at_5 = [0 for _ in range(5)]
    cur_recall_at_2 = [0 for _ in range(5)]
    cur_recall_at_1 = [0 for _ in range(5)]

    cur_mrr = [0 for _ in range(5)]


    for iter in tqdm(range(5), desc="Iterations"):
      P_new = model.transform(incomplete_playlists[200*iter:200*(iter+1), :])

      # Predici la matrice ricostruita per le nuove playlist
      predicted_matrix = np.dot(P_new, model.components_)

      cur_precision_at_10[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 10)
      cur_precision_at_5[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 5)
      cur_precision_at_2[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 2)
      cur_precision_at_1[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 1)

      cur_recall_at_10[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 10)
      cur_recall_at_5[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 5)
      cur_recall_at_2[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 2)
      cur_recall_at_1[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 1)

      cur_mrr[iter] = mean_reciprocal_rank(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :])

    precision_at_10[file_idx] = np.mean(cur_precision_at_10)
    precision_at_5[file_idx] = np.mean(cur_precision_at_5)
    precision_at_2[file_idx] = np.mean(cur_precision_at_2)
    precision_at_1[file_idx] = np.mean(cur_precision_at_1)
    recall_at_10[file_idx] = np.mean(cur_recall_at_10)
    recall_at_5[file_idx] = np.mean(cur_recall_at_5)
    recall_at_2[file_idx] = np.mean(cur_recall_at_2)
    recall_at_1[file_idx] = np.mean(cur_recall_at_1)
    mrr[file_idx] = np.mean(cur_mrr)

  print("\nPrecision@10 = ",np.mean(precision_at_10))
  print("Precision@5 = ",np.mean(precision_at_5))
  print("Precision@2 = ",np.mean(precision_at_2))
  print("Precision@1 = ",np.mean(precision_at_1))

  print("Recall@10 = ",np.mean(recall_at_10))
  print("Recall@5 = ",np.mean(recall_at_5))
  print("Recall@2 = ",np.mean(recall_at_2))
  print("Recall@1 = ",np.mean(recall_at_1))

  print("MRR = ", np.mean(mrr))

In [24]:
evaluate_model_k_tracks_removed_df(svd_model, 0, 2)

Processing Slices: 0it [00:00, ?it/s]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:27<01:48, 27.08s/it][A
Iterations:  40%|████      | 2/5 [00:53<01:19, 26.40s/it][A
Iterations:  60%|██████    | 3/5 [01:18<00:52, 26.16s/it][A
Iterations:  80%|████████  | 4/5 [01:44<00:25, 25.95s/it][A
Iterations: 100%|██████████| 5/5 [02:10<00:00, 26.05s/it]
Processing Slices: 1it [02:11, 131.48s/it]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:25<01:42, 25.58s/it][A
Iterations:  40%|████      | 2/5 [00:51<01:18, 26.03s/it][A
Iterations:  60%|██████    | 3/5 [01:18<00:52, 26.31s/it][A
Iterations:  80%|████████  | 4/5 [01:44<00:26, 26.29s/it][A
Iterations: 100%|██████████| 5/5 [02:10<00:00, 26.13s/it]
Processing Slices: 2it [04:23, 131.99s/it]


Precision@10 =  0.6504000000000001
Precision@5 =  0.7381
Precision@2 =  0.80625
Precision@1 =  0.8345
Recall@10 =  0.16578967688636206
Recall@5 =  0.10287013313138296
Recall@2 =  0.04839857990842572
Recall@1 =  0.026231653970486817
MRR =  0.8686302694638846





In [25]:
evaluate_model_k_tracks_removed_df(svd_model, 2)

Processing Slices: 0it [00:00, ?it/s]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:26<01:46, 26.62s/it][A
Iterations:  40%|████      | 2/5 [00:52<01:18, 26.08s/it][A
Iterations:  60%|██████    | 3/5 [01:18<00:51, 25.95s/it][A
Iterations:  80%|████████  | 4/5 [01:44<00:25, 25.93s/it][A
Iterations: 100%|██████████| 5/5 [02:09<00:00, 25.97s/it]
Processing Slices: 1it [02:10, 130.87s/it]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:25<01:42, 25.69s/it][A
Iterations:  40%|████      | 2/5 [00:51<01:17, 25.75s/it][A
Iterations:  60%|██████    | 3/5 [01:16<00:51, 25.61s/it][A
Iterations:  80%|████████  | 4/5 [01:43<00:25, 25.90s/it][A
Iterations: 100%|██████████| 5/5 [02:10<00:00, 26.14s/it]
Processing Slices: 2it [04:22, 131.45s/it]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:26<01:45, 26.38s/it][A
Iterations:  40%|████      | 2/5 [00:52<01:19, 26.41


Precision@10 =  0.63328
Precision@5 =  0.71974
Precision@2 =  0.79305
Precision@1 =  0.8263999999999999
Recall@10 =  0.155088336966349
Recall@5 =  0.09651787593702348
Recall@2 =  0.045897550484940694
Recall@1 =  0.02482018484788886
MRR =  0.859879498163914





In [None]:
evaluate_model_k_tracks_removed_df(svd_model, 15, num_valid_files=2)

In [None]:
evaluate_model_k_tracks_removed_df(svd_model, 20, num_valid_files=2)

In [None]:
evaluate_model_k_tracks_removed_df(svd_model, 30, 2)

In [28]:
def evaluate_model_k_tracks_per_playlist(model, k, num_valid_files=10):
  """
  evaluate model processing a slice of playlists, 200 playlist at time to avoid
  colab cpu overflow
  """

  # num_valid_files = 1000 - num_training_files
  precision_at_10 = np.zeros(num_valid_files)
  precision_at_5 = np.zeros(num_valid_files)
  precision_at_2 = np.zeros(num_valid_files)
  precision_at_1 = np.zeros(num_valid_files)

  recall_at_10 = np.zeros(num_valid_files)
  recall_at_5 = np.zeros(num_valid_files)
  recall_at_2 = np.zeros(num_valid_files)
  recall_at_1 = np.zeros(num_valid_files)

  mrr = np.zeros(num_valid_files)

  # for file_idx, filename in tqdm(enumerate(sorted(os.listdir(data), key=extract_starting_number)[num_training_files:num_training_files+num_valid_files]), desc="Processing Slices"):
  for file_idx, filename in tqdm(enumerate(shuffled_slices[num_training_files:num_training_files+num_valid_files]), desc="Processing Slices"):
    correct_playlists = np.zeros((1000, num_tracks))
    # print(filename)
    p_counter = -1
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
      filepath = os.path.join(data, filename)

      with open(filepath, "r", encoding="utf-8") as jsonfile:
        cur_slice = json.load(jsonfile)

      for playlist in cur_slice["playlists"]:
        p_counter += 1

        for track in playlist["tracks"]:
          track_uri = track["track_uri"][14:]

          if track_uri in tracks:
            t_idx = track_uri_to_idx[track_uri]

            correct_playlists[p_counter, t_idx] = 1


    incomplete_playlists = np.copy(correct_playlists)

    for row in incomplete_playlists:
      one_indexes = np.where(row == 1)[0]

      if len(one_indexes) >= k:
        indices_to_zero = np.random.choice(one_indexes, size=(len(one_indexes)-k), replace=False)
        row[indices_to_zero] = 0


    cur_precision_at_10 = [0 for _ in range(5)]
    cur_precision_at_5 = [0 for _ in range(5)]
    cur_precision_at_2 = [0 for _ in range(5)]
    cur_precision_at_1 = [0 for _ in range(5)]

    cur_recall_at_10 = [0 for _ in range(5)]
    cur_recall_at_5 = [0 for _ in range(5)]
    cur_recall_at_2 = [0 for _ in range(5)]
    cur_recall_at_1 = [0 for _ in range(5)]

    cur_mrr = [0 for _ in range(5)]


    for iter in tqdm(range(5), desc="Iterations"):
      P_new = model.transform(incomplete_playlists[200*iter:200*(iter+1), :])

      # Predici la matrice ricostruita per le nuove playlist
      predicted_matrix = np.dot(P_new, model.components_)

      cur_precision_at_10[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 10)
      cur_precision_at_5[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 5)
      cur_precision_at_2[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 2)
      cur_precision_at_1[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 1)

      cur_recall_at_10[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 10)
      cur_recall_at_5[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 5)
      cur_recall_at_2[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 2)
      cur_recall_at_1[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 1)

      cur_mrr[iter] = mean_reciprocal_rank(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :])

    precision_at_10[file_idx] = np.mean(cur_precision_at_10)
    precision_at_5[file_idx] = np.mean(cur_precision_at_5)
    precision_at_2[file_idx] = np.mean(cur_precision_at_2)
    precision_at_1[file_idx] = np.mean(cur_precision_at_1)
    recall_at_10[file_idx] = np.mean(cur_recall_at_10)
    recall_at_5[file_idx] = np.mean(cur_recall_at_5)
    recall_at_2[file_idx] = np.mean(cur_recall_at_2)
    recall_at_1[file_idx] = np.mean(cur_recall_at_1)
    mrr[file_idx] = np.mean(cur_mrr)

  print("\nPrecision@10 = ",np.mean(precision_at_10))
  print("Precision@5 = ",np.mean(precision_at_5))
  print("Precision@2 = ",np.mean(precision_at_2))
  print("Precision@1 = ",np.mean(precision_at_1))

  print("Recall@10 = ",np.mean(recall_at_10))
  print("Recall@5 = ",np.mean(recall_at_5))
  print("Recall@2 = ",np.mean(recall_at_2))
  print("Recall@1 = ",np.mean(recall_at_1))

  print("MRR = ", np.mean(mrr))

In [29]:
evaluate_model_k_tracks_per_playlist(svd_model, 0, 2)

Processing Slices: 0it [00:00, ?it/s]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:09<00:39,  9.89s/it][A
Iterations:  40%|████      | 2/5 [00:17<00:25,  8.57s/it][A
Iterations:  60%|██████    | 3/5 [00:26<00:17,  8.64s/it][A
Iterations:  80%|████████  | 4/5 [00:34<00:08,  8.46s/it][A
Iterations: 100%|██████████| 5/5 [00:41<00:00,  8.34s/it]
Processing Slices: 1it [00:42, 42.81s/it]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:08<00:33,  8.31s/it][A
Iterations:  40%|████      | 2/5 [00:15<00:23,  7.70s/it][A
Iterations:  60%|██████    | 3/5 [00:23<00:15,  7.99s/it][A
Iterations:  80%|████████  | 4/5 [00:32<00:08,  8.20s/it][A
Iterations: 100%|██████████| 5/5 [00:39<00:00,  7.91s/it]
Processing Slices: 2it [01:23, 41.97s/it]


Precision@10 =  0.00030000000000000003
Precision@5 =  0.0004
Precision@2 =  0.00025
Precision@1 =  0.0
Recall@10 =  6.264474196662435e-05
Recall@5 =  4.081209387769486e-05
Recall@2 =  3.2168821977739177e-06
Recall@1 =  0.0
MRR =  0.001848336071373832





In [30]:
evaluate_model_k_tracks_per_playlist(svd_model, 2, 4)

Processing Slices: 0it [00:00, ?it/s]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:25<01:43, 25.90s/it][A
Iterations:  40%|████      | 2/5 [00:51<01:17, 25.82s/it][A
Iterations:  60%|██████    | 3/5 [01:17<00:51, 25.82s/it][A
Iterations:  80%|████████  | 4/5 [01:43<00:25, 25.82s/it][A
Iterations: 100%|██████████| 5/5 [02:08<00:00, 25.79s/it]
Processing Slices: 1it [02:09, 129.96s/it]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:25<01:43, 25.78s/it][A
Iterations:  40%|████      | 2/5 [00:52<01:19, 26.40s/it][A
Iterations:  60%|██████    | 3/5 [01:18<00:51, 25.96s/it][A
Iterations:  80%|████████  | 4/5 [01:43<00:25, 25.92s/it][A
Iterations: 100%|██████████| 5/5 [02:10<00:00, 26.03s/it]
Processing Slices: 2it [04:21, 130.68s/it]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:26<01:45, 26.36s/it][A
Iterations:  40%|████      | 2/5 [00:52<01:19, 26.35


Precision@10 =  0.24295
Precision@5 =  0.3152
Precision@2 =  0.438375
Precision@1 =  0.5285
Recall@10 =  0.06330136319246144
Recall@5 =  0.04380776802572256
Recall@2 =  0.025926879421708005
Recall@1 =  0.01632174506598679
MRR =  0.6290238759262217





In [31]:
evaluate_model_k_tracks_per_playlist(svd_model, 5, 2)

Processing Slices: 0it [00:00, ?it/s]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:25<01:42, 25.71s/it][A
Iterations:  40%|████      | 2/5 [00:52<01:18, 26.06s/it][A
Iterations:  60%|██████    | 3/5 [01:18<00:52, 26.29s/it][A
Iterations:  80%|████████  | 4/5 [01:44<00:26, 26.17s/it][A
Iterations: 100%|██████████| 5/5 [02:10<00:00, 26.06s/it]
Processing Slices: 1it [02:11, 131.70s/it]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:25<01:42, 25.70s/it][A
Iterations:  40%|████      | 2/5 [00:51<01:17, 25.77s/it][A
Iterations:  60%|██████    | 3/5 [01:17<00:51, 25.75s/it][A
Iterations:  80%|████████  | 4/5 [01:43<00:25, 25.76s/it][A
Iterations: 100%|██████████| 5/5 [02:08<00:00, 25.73s/it]
Processing Slices: 2it [04:21, 130.70s/it]


Precision@10 =  0.32685
Precision@5 =  0.43410000000000004
Precision@2 =  0.58925
Precision@1 =  0.6605000000000001
Recall@10 =  0.09113562476911449
Recall@5 =  0.06504773163880086
Recall@2 =  0.03695360963676836
Recall@1 =  0.02148666927425187
MRR =  0.7468529047999132





In [32]:
evaluate_model_k_tracks_per_playlist(svd_model, 10, 2)

Processing Slices: 0it [00:00, ?it/s]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:25<01:43, 25.80s/it][A
Iterations:  40%|████      | 2/5 [00:51<01:16, 25.53s/it][A
Iterations:  60%|██████    | 3/5 [01:17<00:51, 25.79s/it][A
Iterations:  80%|████████  | 4/5 [01:43<00:26, 26.06s/it][A
Iterations: 100%|██████████| 5/5 [02:10<00:00, 26.05s/it]
Processing Slices: 1it [02:11, 131.90s/it]
Iterations:   0%|          | 0/5 [00:00<?, ?it/s][A
Iterations:  20%|██        | 1/5 [00:26<01:45, 26.35s/it][A
Iterations:  40%|████      | 2/5 [00:52<01:18, 26.14s/it][A
Iterations:  60%|██████    | 3/5 [01:18<00:51, 25.96s/it][A
Iterations:  80%|████████  | 4/5 [01:43<00:25, 25.92s/it][A
Iterations: 100%|██████████| 5/5 [02:09<00:00, 25.95s/it]
Processing Slices: 2it [04:22, 131.34s/it]


Precision@10 =  0.42064999999999997
Precision@5 =  0.5491
Precision@2 =  0.68225
Precision@1 =  0.7535000000000001
Recall@10 =  0.1189886145214901
Recall@5 =  0.08238090328032377
Recall@2 =  0.042616841229687494
Recall@1 =  0.024308830337788467
MRR =  0.8106560611585234



