In [5]:
import os
import re
import gc
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

np.random.seed(42)

In [2]:
# @title Download dataset snippet at https://www.kaggle.com/datasets/himanshuwagh/spotify-million
import kagglehub

path = "/root/.cache/kagglehub/datasets/himanshuwagh/spotify-million/versions/1"
if not os.path.exists(path):
  # Download latest version
  path = kagglehub.dataset_download("himanshuwagh/spotify-million")

# contiene le slices del dataset: 1000 slice das 1000 playlist ciascuna
data: str = os.path.join(path, "data")

Downloading from https://www.kaggle.com/api/v1/datasets/download/himanshuwagh/spotify-million?dataset_version_number=1...


100%|██████████| 5.20G/5.20G [00:58<00:00, 95.0MB/s]

Extracting files...





In [3]:
# @title Ordina per comodità le slice nel folder

# Function to extract the starting number from the filename
def extract_starting_number(filename: str) -> int:
    n_match = re.search(r'(\d+)-', filename)
    if n_match:
        return int(n_match.group(1))  # Return the starting number of the range
    return 0


for filename in sorted(os.listdir(data), key=extract_starting_number)[:3]:
  print(filename)

print("...")

mpd.slice.0-999.json
mpd.slice.1000-1999.json
mpd.slice.2000-2999.json
...


In [4]:
# @title Metriche
# def precision_at_k(predicted_matrix, ground_truth_matrix, k):
#     precision_scores = []

#     # Per ogni playlist (riga della matrice)
#     for pred_row, true_row in zip(predicted_matrix, ground_truth_matrix):
#         # Ottieni gli indici delle prime k predizioni ordinate per punteggio
#         top_k_indices = np.argsort(pred_row)[::-1][:k]

#         # Trova se queste predizioni sono nella ground truth
#         relevant_items = true_row[top_k_indices]  # Valori nella ground truth per i top-k

#         # Precision è il numero di rilevanti tra i top-k diviso k
#         precision_scores.append(np.sum(relevant_items) / k)

#     return np.mean(precision_scores)


def precision_at_k(predicted_matrix, ground_truth_matrix, k):
    # Usa argsort per ottenere i top k indici in modo efficiente
    top_k_indices = np.argsort(predicted_matrix, axis=1)[:, ::-1][:, :k]

    # Estrai gli elementi rilevanti nelle prime k raccomandazioni
    relevant_items = ground_truth_matrix[np.arange(ground_truth_matrix.shape[0])[:, None], top_k_indices]

    # Calcola la precisione come il numero di elementi rilevanti diviso k
    precision_scores = np.sum(relevant_items, axis=1) / k

    # Restituisci la precisione media
    return np.mean(precision_scores)


# def recall_at_k(predicted_matrix, ground_truth_matrix, k):
#     recall_scores = []

#     # Per ogni playlist (riga della matrice)
#     for pred_row, true_row in zip(predicted_matrix, ground_truth_matrix):
#         # Ottieni gli indici delle prime k predizioni ordinate per punteggio
#         top_k_indices = np.argsort(pred_row)[::-1][:k]

#         # Trova se queste predizioni sono nella ground truth
#         relevant_items = true_row[top_k_indices]  # Valori nella ground truth per i top-k

#         # Recall è il numero di rilevanti trovati diviso i rilevanti totali
#         total_relevant = np.sum(true_row)  # Rilevanti totali nella ground truth
#         recall_scores.append(np.sum(relevant_items) / total_relevant if total_relevant > 0 else 0)

#     return np.mean(recall_scores)


def recall_at_k(predicted_matrix, ground_truth_matrix, k):
    # Usa argsort per ottenere i top k indici
    top_k_indices = np.argsort(predicted_matrix, axis=1)[:, ::-1][:, :k]

    # Per ogni playlist, calcola quanti elementi rilevanti sono nelle prime k predizioni
    relevant_items = ground_truth_matrix[np.arange(ground_truth_matrix.shape[0])[:, None], top_k_indices]

    # Calcola il recall per ogni playlist
    total_relevant = np.sum(ground_truth_matrix, axis=1)  # Numero totale di rilevanti per playlist

    # Evita divisione per zero
    recall_scores = np.sum(relevant_items, axis=1) / total_relevant
    recall_scores[total_relevant == 0] = 0  # Imposta il recall a 0 per le playlist senza tracce rilevanti

    # Restituisci la media del recall
    return np.mean(recall_scores)


def mean_reciprocal_rank(predicted_matrix, ground_truth_matrix):
    reciprocal_ranks = []

    # Per ogni playlist (riga della matrice)
    for pred_row, true_row in zip(predicted_matrix, ground_truth_matrix):
        # Ottieni gli indici ordinati in base al punteggio predetto
        sorted_indices = np.argsort(pred_row)[::-1]

        # Trova i rank della prima traccia rilevante
        for rank, index in enumerate(sorted_indices, start=1):
            if true_row[index] == 1:  # Se l'indice predetto è nella ground truth
                reciprocal_ranks.append(1 / rank)
                break
        else:
            reciprocal_ranks.append(0)  # Nessun elemento rilevante trovato

    return np.mean(reciprocal_ranks)


# dataset

In [6]:
million_df = pd.DataFrame()
num_training_files = 500

# Create an empty list to hold all rows as dictionaries
data_list = []

# Process only the first 30 files (if needed, adjust or remove limit)
for i, filename in tqdm(enumerate(sorted(os.listdir(data), key=extract_starting_number)[:num_training_files]), desc="Processing Slices"):
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        filepath = os.path.join(data, filename)

        with open(filepath, "r", encoding="utf-8") as jsonfile:
            cur_slice = json.load(jsonfile)

        # for playlist in tqdm(cur_slice["playlists"], desc="Processing playlist..."):
        for playlist in cur_slice["playlists"]:
            playlist_id = playlist["pid"]
            # num_tracks = playlist["num_tracks"]

            # Collect data for the playlist
            for track in playlist["tracks"]:
                data_list.append({
                    "playlist": playlist_id,
                    "track": track["track_uri"][14:]  # remove 'spotify:track:'
                })
    # update every 30 files for speedup
    if i%30 == 0:
        new_data = pd.DataFrame(data_list)
        data_list.clear()
        million_df = pd.concat([million_df, new_data], ignore_index=True)

# Convert the list of dictionaries into a DataFrame in one go
# dumb_dataset = pd.DataFrame(data_list)
new_data = pd.DataFrame(data_list)
data_list = []
million_df = pd.concat([million_df, new_data], ignore_index=True)

million_df["playlist"] = million_df["playlist"].astype("int32")

Processing Slices: 500it [04:51,  1.72it/s]


In [7]:
million_df.shape

(33179067, 2)

In [8]:
million_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33179067 entries, 0 to 33179066
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   playlist  int32 
 1   track     object
dtypes: int32(1), object(1)
memory usage: 379.7+ MB


In [9]:
million_df.head()

Unnamed: 0,playlist,track
0,0,0UaMYEvWZi0ZqiDOoHU3YI
1,0,6I9VzXrHxO9rA9A5euc8Ak
2,0,0WqIKmW4BTrj3eJFmnCKMv
3,0,1AWQoqb9bSvzTjaLralEkT
4,0,1lzr43nnXAijIGYnCT8M8H


In [10]:
# Count how many playlists each track appears in
track_frequency = million_df.groupby("track")["playlist"].nunique()

# Convert to a DataFrame for easier handling
track_frequency_df = track_frequency.reset_index().rename(columns={"playlist": "playlist_count"})

# Total number of playlists
# total_playlists = million_df["playlist_id"].nunique()
total_playlists = 1000*num_training_files

# threshold
threshold = total_playlists * 0.0005

# Filter tracks that appear in at least 25% of playlists
popular_tracks = track_frequency_df[track_frequency_df["playlist_count"] >= threshold]

# Extract popular track IDs
popular_track_ids = popular_tracks["track"].tolist()

# Filter the original dataset
filtered_df = million_df[million_df["track"].isin(popular_track_ids)]


In [11]:
track_frequency_df.max()

Unnamed: 0,0
track,7zzyrYnZIfvYAGwl7lRb7X
playlist_count,22731


In [12]:
track_frequency.shape

(1610661,)

In [13]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21309880 entries, 0 to 33179064
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   playlist  int32 
 1   track     object
dtypes: int32(1), object(1)
memory usage: 406.5+ MB


In [14]:
filtered_df.head()

Unnamed: 0,playlist,track
0,0,0UaMYEvWZi0ZqiDOoHU3YI
1,0,6I9VzXrHxO9rA9A5euc8Ak
2,0,0WqIKmW4BTrj3eJFmnCKMv
3,0,1AWQoqb9bSvzTjaLralEkT
4,0,1lzr43nnXAijIGYnCT8M8H


In [15]:
filtered_df.shape

(21309880, 2)

In [16]:
# # Map playlist_id and track_id to numerical indices
# playlist_map = {id_: idx for idx, id_ in enumerate(filtered_df["playlist"].unique())}
# track_uri_to_idx = {id_: idx for idx, id_ in enumerate(filtered_df["track"].unique())}

# filtered_df["playlist_idx"] = filtered_df["playlist"].map(playlist_map)
# filtered_df["track_idx"] = filtered_df["track"].map(track_uri_to_idx)

# # Create COO matrix
# rows = filtered_df["playlist_idx"]
# cols = filtered_df["track_idx"]
# data_list = np.ones(len(filtered_df))  # All entries are 1 since a track belongs to a playlist

# coo_rating_matrix = coo_matrix((data_list, (rows, cols)), shape=(len(playlist_map), len(track_uri_to_idx)))
# coo_rating_matrix.shape

# --- warning

# Make an explicit copy of filtered_df
filtered_df = filtered_df.copy()

# Map playlist_id and track_id to numerical indices
playlist_map = {id_: idx for idx, id_ in enumerate(filtered_df["playlist"].unique())}
track_uri_to_idx = {id_: idx for idx, id_ in enumerate(filtered_df["track"].unique())}

filtered_df["playlist_idx"] = filtered_df["playlist"].map(playlist_map)
filtered_df["track_idx"] = filtered_df["track"].map(track_uri_to_idx)

# Create COO matrix
rows = filtered_df["playlist_idx"]
cols = filtered_df["track_idx"]
data_list = np.ones(len(filtered_df))  # All entries are 1 since a track belongs to a playlist

coo_rating_matrix = coo_matrix((data_list, (rows, cols)), shape=(len(playlist_map), len(track_uri_to_idx)))
print(coo_rating_matrix.shape)  # Output: (485376, 18857)


(485376, 18857)


In [17]:
coo_rating_matrix.shape

(485376, 18857)

In [18]:
# per rendere il codice della funzione di valutazione come quello sopra
num_tracks = coo_rating_matrix.shape[1]
tracks = set(track_uri_to_idx.keys())

In [19]:
del(million_df, track_frequency_df, filtered_df)
del(rows, cols, data_list, new_data)
del(popular_tracks, popular_track_ids)
del(playlist_map, track_frequency)

gc.collect()

0

In [20]:
gc.collect()

0

# Training

In [23]:
class ScipySVD():
  def __init__(self, n_components, **kwargs):
    self.n_components = n_components
    self.kwargs = kwargs


  def fit(self, X):
    _, _, components_ = svds(X, self.n_components, **self.kwargs)
    self.components_ = components_


  def transform(self ,X):
    return X @ self.components_.T

In [25]:
svd_model = ScipySVD(600, random_state=42)

svd_model.fit(coo_rating_matrix)

# Validation

In [26]:
def evaluate_model_k_tracks_removed_df(model, k):
  """
  evaluate model processing a slice of playlists, 200 playlist at time to avoid
  colab cpu overflow
  """
  correct_playlists = np.zeros((1000, num_tracks))
  p_counter = -1

  for filename in sorted(os.listdir(data), key=extract_starting_number)[num_training_files:num_training_files+1]:
    # print(filename)
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
      filepath = os.path.join(data, filename)

      with open(filepath, "r", encoding="utf-8") as jsonfile:
        cur_slice = json.load(jsonfile)

      for playlist in cur_slice["playlists"]:
        p_counter += 1

        for track in playlist["tracks"]:
          track_uri = track["track_uri"][14:]

          if track_uri in tracks:
            t_idx = track_uri_to_idx[track_uri]

            correct_playlists[p_counter, t_idx] = 1


  incomplete_playlists = np.copy(correct_playlists)

  # Turn exactly k ones to zeros per row
  for row in incomplete_playlists:
    # Get the indices of `1`s in the current row
    one_indices = np.where(row == 1)[0]

    # If there are at least k ones, randomly choose 2 of them
    if len(one_indices) >= k:
      indices_to_zero = np.random.choice(one_indices, size=k, replace=False)
      row[indices_to_zero] = 0


  precision_at_10 = [0 for _ in range(5)]
  precision_at_5 = [0 for _ in range(5)]
  precision_at_2 = [0 for _ in range(5)]
  precision_at_1 = [0 for _ in range(5)]

  recall_at_10 = [0 for _ in range(5)]
  recall_at_5 = [0 for _ in range(5)]
  recall_at_2 = [0 for _ in range(5)]
  recall_at_1 = [0 for _ in range(5)]

  mrr = [0 for _ in range(5)]


  for iter in tqdm(range(5), desc="Iterations"):
    P_new = model.transform(incomplete_playlists[200*iter:200*(iter+1), :])

    # Predici la matrice ricostruita per le nuove playlist
    predicted_matrix = np.dot(P_new, model.components_)

    precision_at_10[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 10)
    precision_at_5[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 5)
    precision_at_2[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 2)
    precision_at_1[iter] = precision_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 1)

    recall_at_10[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 10)
    recall_at_5[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 5)
    recall_at_2[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 2)
    recall_at_1[iter] = recall_at_k(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :], 1)

    mrr[iter] = mean_reciprocal_rank(predicted_matrix, correct_playlists[200*iter:200*(iter+1), :])

  print("\nPrecision@10 = ",np.mean(precision_at_10))
  print("Precision@5 = ",np.mean(precision_at_5))
  print("Precision@2 = ",np.mean(precision_at_2))
  print("Precision@1 = ",np.mean(precision_at_1))

  print("Recall@10 = ",np.mean(recall_at_10))
  print("Recall@5 = ",np.mean(recall_at_5))
  print("Recall@2 = ",np.mean(recall_at_2))
  print("Recall@1 = ",np.mean(recall_at_1))

  print("MRR = ", np.mean(mrr))

In [27]:
evaluate_model_k_tracks_removed_df(svd_model, 2)

  recall_scores = np.sum(relevant_items, axis=1) / total_relevant
Iterations: 100%|██████████| 5/5 [00:18<00:00,  3.61s/it]


Precision@10 =  0.6143
Precision@5 =  0.6966
Precision@2 =  0.7735
Precision@1 =  0.8150000000000001
Recall@10 =  0.20000298863134391
Recall@5 =  0.12658354726049378
Recall@2 =  0.06381231674486407
Recall@1 =  0.03625923923644122
MRR =  0.8448126354793501





In [28]:
evaluate_model_k_tracks_removed_df(svd_model, 15)

  recall_scores = np.sum(relevant_items, axis=1) / total_relevant
Iterations: 100%|██████████| 5/5 [00:20<00:00,  4.09s/it]


Precision@10 =  0.5489999999999999
Precision@5 =  0.6408
Precision@2 =  0.732
Precision@1 =  0.7729999999999999
Recall@10 =  0.18162270806892628
Recall@5 =  0.1206794590066335
Recall@2 =  0.06387758152498095
Recall@1 =  0.03636260571197248
MRR =  0.8121551851290546





In [29]:
evaluate_model_k_tracks_removed_df(svd_model, 20)

  recall_scores = np.sum(relevant_items, axis=1) / total_relevant
Iterations: 100%|██████████| 5/5 [00:17<00:00,  3.54s/it]


Precision@10 =  0.546
Precision@5 =  0.6448
Precision@2 =  0.7365
Precision@1 =  0.7879999999999999
Recall@10 =  0.18694413711071828
Recall@5 =  0.12557268984729172
Recall@2 =  0.0658944178486894
Recall@1 =  0.03760692652355689
MRR =  0.8242201674557558



