In [1]:
!pip install torch
!pip install torchvision torchaudio
!pip install pytorch-lightning
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import kagglehub
from torch.utils.data import Dataset, DataLoader
import random
import gc
import matplotlib.pyplot as plt
import re
import json
import pickle
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
# Seed every RNG this notebook draws from so that Restart & Run All repeats:
# NumPy shuffles the slice order, `random` picks the masked tracks inside
# PlaylistDataset, and torch drives random_split and DataLoader shuffling.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# Downloading and Preprocessing the Dataset

In [None]:
#@title Download from Kaggle
# Reuse the dataset if it already exists at the hard-coded location below
# (looks like kagglehub's cache directory for the root user — confirm on other
# hosts); otherwise download it and use the path kagglehub returns.
path = "/root/.cache/kagglehub/datasets/himanshuwagh/spotify-million/versions/1"
if not os.path.exists(path):
  path = kagglehub.dataset_download("himanshuwagh/spotify-million")

# Sub-directory whose entries are listed and read as slice files in later cells.
data: str = os.path.join(path, "data")

Downloading from https://www.kaggle.com/api/v1/datasets/download/himanshuwagh/spotify-million?dataset_version_number=1...


  6%|▌         | 304M/5.20G [00:12<03:08, 27.9MB/s]

In [None]:
#@title Shuffle Dataset
# os.listdir returns entries in arbitrary, filesystem-dependent order, so sort
# before shuffling: otherwise the seeded shuffle would still produce a
# different slice order (and different playlist/track indices) per machine.
shuffled_slices = np.array(sorted(os.listdir(data)))
np.random.shuffle(shuffled_slices)

## Data Visualization & Handling

In [None]:
# Flatten every slice file into one long (playlist, track) table.
#
# Rows are buffered as plain tuples and turned into a DataFrame chunk every
# `flush_every` files, which keeps the Python-object buffer small. The chunks
# are concatenated once at the end, so the growing table is not re-copied on
# every flush.
flush_every = 50
columns = ["playlist", "track"]

data_list = []  # row buffer: (playlist id, track id) tuples
chunks = []     # finished DataFrame chunks, concatenated once below

for i, filename in tqdm(enumerate(shuffled_slices), desc="Processing Slices", total=len(shuffled_slices)):
    # Ignore anything in the directory that is not a slice file.
    if filename.startswith("mpd.slice.") and filename.endswith(".json"):
        filepath = os.path.join(data, filename)

        with open(filepath, "r", encoding="utf-8") as jsonfile:
            cur_slice = json.load(jsonfile)

        for playlist in cur_slice["playlists"]:
            playlist_id = playlist["pid"]
            # [14:] drops the first 14 characters of the URI — presumably the
            # "spotify:track:" prefix; confirm against the raw data.
            data_list.extend(
                (playlist_id, track["track_uri"][14:]) for track in playlist["tracks"]
            )

    if (i + 1) % flush_every == 0 and data_list:
        chunks.append(pd.DataFrame(data_list, columns=columns))
        data_list.clear()

# Flush whatever is left after the last full batch of files.
if data_list:
    chunks.append(pd.DataFrame(data_list, columns=columns))
data_list = []

# Fall back to an empty, correctly-labelled frame when no slice file matched,
# so the column access below cannot raise KeyError.
million_df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame(columns=columns)
del chunks

million_df["playlist"] = million_df["playlist"].astype("int32")
# A track listed more than once in the same playlist counts as one interaction.
million_df = million_df.drop_duplicates()

In [None]:
# Summary of the flattened table: column dtypes, non-null counts, memory usage.
million_df.info()

In [None]:
# (rows, columns) of the de-duplicated playlist/track table.
million_df.shape

In [None]:
# Peek at the first rows to sanity-check the playlist and track columns.
million_df.head()

In [None]:
# @title How many playlists share the same number of tracks, and how many tracks are in them?
# Histogram of distinct-track counts per playlist (x: tracks in a playlist, y: playlists in that bin).
million_df.groupby("playlist")["track"].nunique().plot(kind="hist", bins=50, ylabel="Number of Playlists", xlabel="Number of Tracks")  # alternative view: .sort_values().plot()

In [None]:
# Number of distinct tracks in each playlist.
playlist_counts = million_df.groupby("playlist")["track"].nunique()
# keep only playlists with at least 10 and at most 150 distinct tracks (both bounds inclusive)
min_track_num = 10
max_track_num = 150
valid_playlists = playlist_counts[(playlist_counts >= min_track_num) & (playlist_counts <= max_track_num)]

In [None]:
# Report how many playlists survive the track-count filter, and the kept share in percent.
print(f"Number of Playlists goes from {playlist_counts.shape[0]} to {valid_playlists.shape[0]} (Ratio of {(valid_playlists.shape[0]/playlist_counts.shape[0])*100:.2f}%)")


In [None]:
# Keep only the rows whose playlist passed the track-count filter; the shape is
# printed before and after to show how many rows were dropped.
print(f"Before removing playlists: {million_df.shape = }")
million_df = million_df[million_df["playlist"].isin(valid_playlists.index)]
print(f"After removing playlists:  {million_df.shape = }")

In [None]:
# @title In how many playlists does each track appear?
# Distinct-playlist count per track, sorted ascending and drawn on a log-scaled y axis.
million_df.groupby("track")["playlist"].nunique().sort_values().plot(logy=True, ylabel="Log Number of Playlist", xlabel="Tracks") # alternative view: .plot(kind="hist", bins=50, )

In [None]:
# Number of distinct playlists each track appears in (computed after the playlist filter).
track_counts = million_df.groupby("track")["playlist"].nunique()
# use only songs that appear in at least 25 playlists (inclusive); np.inf means no upper bound
min_playlist_num = 25
max_playlist_num = np.inf
valid_tracks = track_counts[(track_counts >= min_playlist_num) & (track_counts <= max_playlist_num)]

In [None]:
# Report how many distinct tracks survive the playlist-count filter, and the kept share in percent.
# (The message previously said "Playlists" although the counts are tracks.)
print(f"Number of Tracks goes from {track_counts.shape[0]} to {valid_tracks.shape[0]} (Ratio of {(valid_tracks.shape[0]/track_counts.shape[0])*100:.2f}%)")

In [None]:
# Keep only the rows whose track passed the playlist-count filter; the shape is
# printed before and after to show how many rows were dropped.
print(f"Before removing tracks: {million_df.shape = }")
million_df = million_df[million_df["track"].isin(valid_tracks.index)]
print(f"After removing tracks: {million_df.shape = }")

In [None]:
# Map raw playlist ids and track ids onto contiguous 0-based indices, numbered
# in order of first appearance in million_df.
playlist_id_to_idx = {pid: idx for idx, pid in enumerate(million_df["playlist"].unique())}
track_uri_to_idx = {uri: idx for idx, uri in enumerate(million_df["track"].unique())}

# million_df is a filtered slice of an earlier frame; .assign builds a new
# frame instead of writing columns into that slice, which avoids pandas'
# SettingWithCopyWarning.
million_df = million_df.assign(
    playlist_idx=million_df["playlist"].map(playlist_id_to_idx),
    track_idx=million_df["track"].map(track_uri_to_idx),
)

# Create COO matrix
rows = million_df["playlist_idx"]
cols = million_df["track_idx"]
data_list = np.ones(len(million_df))  # All entries are 1 since a track belongs to a playlist

# One row per playlist, one column per track; converted to CSR for row indexing.
csr_rating_matrix = coo_matrix((data_list, (rows, cols)), shape=(len(playlist_id_to_idx), len(track_uri_to_idx))).tocsr()


print(csr_rating_matrix.shape)

# Language Model Approach

## Define PyTorch Dataset and DataLoader

In [None]:
class PlaylistDataset(Dataset):
  """Rows of a playlist x track interaction matrix with some tracks hidden.

  Each item is a pair ``(masked, ground_truth)`` of dense 1-D float32 tensors,
  one entry per track column. ``ground_truth`` is the playlist's full row;
  ``masked`` is the same row with up to ``remove`` of its non-zero entries set
  to 0. The hidden tracks are re-drawn with the ``random`` module on every
  access, so the same index can give a different ``masked`` each time.

  Args:
    matrix: 2-D interaction matrix indexed as ``matrix[row]``; either a scipy
      sparse matrix (rows are densified one at a time) or a dense array.
    remove: how many of the playlist's tracks to hide per item. Playlists with
      fewer tracks than this have all of their tracks hidden.
  """

  def __init__(self, matrix, remove = 5):
    self.matrix = matrix
    self.remove = remove

  def __len__(self):
    """Return the number of playlists (matrix rows)."""
    return self.matrix.shape[0]

  def __getitem__(self, idx):
    """Return ``(masked, ground_truth)`` tensors for playlist ``idx``."""
    row = self.matrix[idx]
    # A sparse row comes back as a 1 x n_tracks matrix; densify just this row.
    if hasattr(row, "toarray"):
      row = row.toarray()
    # np.array copies, so the stored matrix is never modified by the masking.
    ground_truth = np.array(row, dtype=np.float32).ravel()

    # Hide tracks that are actually in the playlist: sampling over all columns
    # would almost always hit entries that are already zero.
    present = np.flatnonzero(ground_truth)
    n_remove = min(self.remove, len(present))
    hidden = random.sample(present.tolist(), n_remove)

    masked = ground_truth.copy()
    if hidden:
      masked[hidden] = 0

    # Dense tensors so that the default DataLoader collate can stack a batch.
    return torch.from_numpy(masked), torch.from_numpy(ground_truth)


# Wrap the interaction matrix (default remove=5) and smoke-test it: report the
# number of playlists and print the (masked, ground_truth) pair of the first one.
dataset = PlaylistDataset(csr_rating_matrix)
print("Number of playlists: ", len(dataset))
print(dataset[0])

In [None]:
# Drop the preprocessing intermediates and ask the collector to reclaim them.
# `dataset` keeps its own reference to the matrix (stored as self.matrix), so
# removing the csr_rating_matrix name does not free the matrix itself.
# track_uri_to_idx is not in the list and therefore stays available.
del million_df, csr_rating_matrix, playlist_id_to_idx, playlist_counts, track_counts, valid_playlists, valid_tracks, data_list, rows, cols
gc.collect()

In [None]:
from torch.utils.data.dataset import random_split

VAL_FRACTION = 0.15
TEST_FRACTION = 0.15
BATCH_SIZE = 128
SPLIT_SEED = 42  # fixes which playlists land in train / val / test


total_length = len(dataset)
val_length = int(total_length * VAL_FRACTION)
test_length = int(total_length * TEST_FRACTION)
# The training set absorbs the rounding remainder so the three parts sum to total_length.
train_length = total_length - val_length - test_length

# A dedicated, seeded generator makes the partition identical on every run,
# independent of how much of torch's global RNG was consumed beforehand.
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_length, val_length, test_length],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# Only the training loader reshuffles; val/test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Define Model