In [44]:
import os
import json
import numpy as np
from tqdm import tqdm
import torch
import kagglehub
from torch.utils.data import Dataset, DataLoader
import random


# Downloading and Preprocessing the Dataset

In [37]:
#@title Download from Kaggle
path = "/root/.cache/kagglehub/datasets/himanshuwagh/spotify-million/versions/1"
if not os.path.exists(path):
  path = kagglehub.dataset_download("himanshuwagh/spotify-million")

In [38]:
#@title Create a unique correspondence using hexadecimal indices
number_of_slices = 1000 #@param {type:"slider", min:1, max:1000, step:1}
visited = set()
correspondence = dict()
inv_correspondence = dict()
c = 0
for i in tqdm(range(number_of_slices)):
  first = 1000*i
  data_url = os.path.join(path, "data" , f'mpd.slice.{first}-{first+999}.json')
  with open(data_url) as f:
    data = json.load(f)['playlists']
    for playlist in data:
      for track in playlist['tracks']:
        if track['track_uri'] not in visited:
          idx = hex(c)
          correspondence[idx] = {"artist": track['artist_name'], "song": track['track_name'], "uri": track["track_uri"]}
          inv_correspondence[track['track_uri']] = idx
          visited.add(track['track_uri'])
          c += 1

with open("correspondence.json", 'w') as f:
  json.dump(correspondence, f)

print("Number of unique songs: ", len(correspondence))
del correspondence

100%|██████████| 1000/1000 [10:15<00:00,  1.62it/s]


Number of unique songs:  2262292


In [39]:
def get_song_details(idx, correspondence = None):
  if not correspondence:
    with open("correspondence.json") as f:
      correspondence = json.load(f)
      d =  correspondence[idx]
      del correspondence
      return d
  else:
    return correspondence[idx]

In [40]:
idx = 50
get_song_details(hex(idx))

{'artist': 'We The Kings',
 'song': 'Check Yes Juliet',
 'uri': 'spotify:track:1b7vg5T9YKR3NNqXfBYRF7'}

In [41]:
#@title Saving playlists as lists of hexadecimal indices
!mkdir -p playlists

for i in tqdm(range(1000)):
  first = 1000*i
  data_url = os.path.join(path, "data" , f'mpd.slice.{first}-{first+999}.json')
  with open(data_url) as f:
    data = json.load(f)['playlists']
    for idx, playlist in enumerate(data):
      filename = f"playlist_{first+idx}.json"
      new_playlist = {'n_followers': playlist['num_followers'], 'tracks': []}
      for track in playlist['tracks']:
        new_playlist['tracks'].append(inv_correspondence[track['track_uri']])


      with open(os.path.join("playlists", filename), 'w') as f:
        json.dump(new_playlist, f)

del new_playlist, inv_correspondence

100%|██████████| 1000/1000 [15:23<00:00,  1.08it/s]


In [48]:
#@title Create torch dataset
class PlaylistDataset(Dataset):
  def __init__(self, root_dir, k = 2, transform=None):
    self.root_dir = root_dir
    self.transform = transform
    self.k = k
    self.playlists = [f for f in os.listdir(root_dir) if f.endswith('.json')]

  def __len__(self):
    return len(self.playlists)

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()

    with open(os.path.join(self.root_dir, self.playlists[idx])) as f:
      playlist = json.load(f)

    tracks = playlist['tracks']
    followers = playlist['n_followers']

    if self.transform:
      tracks = self.tracks(playlist)

    partial_masked_tracks = tracks[:]
    random.shuffle(partial_masked_tracks)
    partial_masked_tracks = partial_masked_tracks[:self.k]

    return partial_masked_tracks, tracks, followers

dataset = PlaylistDataset("playlists")
print("Number of playlists: ", len(dataset))

Number of playlists:  1000000


In [49]:
first_masked_playlist, first_playlist, first_playlist_followers = dataset[0]
print("First playlist followers: ", first_playlist_followers)
print("The first masked playlist contains:")
for track in first_masked_playlist:
  print(get_song_details(track))

print("The first playlist contains:")
for track in first_playlist[:2]:
  print(get_song_details(track))


First playlist followers:  2
The first masked playlist contains:
{'artist': 'The Black Crowes', 'song': 'Jealous Again', 'uri': 'spotify:track:3TxkuaIEUv53lEflPskcN8'}
{'artist': 'Red Hot Chili Peppers', 'song': "Can't Stop", 'uri': 'spotify:track:3ZOEytgrvLwQaqXreDs2Jx'}
The first playlist contains:
{'artist': 'Kenny Wayne Shepherd', 'song': 'Somehow, Somewhere, Someway', 'uri': 'spotify:track:3N8YmS2jHNmziUhqTR7Wk4'}
{'artist': 'Fireball Ministry', 'song': 'Kick Back', 'uri': 'spotify:track:7BMIFNd4EmJKdOvlpg4KSL'}
