In [1]:
!pip install torch
!pip install torchvision torchaudio
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]>=2022.5.0->pytorch-lightning)
  Downloading aiohttp-3.11.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning)
  Downloading aiohappyeyeballs-2.4.4-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning)
  Downloading aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting async-timeout<6.0,>=4.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[ht

In [2]:
import os
import json
import numpy as np
from tqdm import tqdm
import torch
import kagglehub
from torch.utils.data import Dataset, DataLoader
import random

# Downloading and Preprocessing the Dataset

In [3]:
#@title Download from Kaggle
path = "/root/.cache/kagglehub/datasets/himanshuwagh/spotify-million/versions/1"
if not os.path.exists(path):
  path = kagglehub.dataset_download("himanshuwagh/spotify-million")

Downloading from https://www.kaggle.com/api/v1/datasets/download/himanshuwagh/spotify-million?dataset_version_number=1...


100%|██████████| 5.20G/5.20G [00:24<00:00, 229MB/s]

Extracting files...





In [4]:
#@title Create a unique correspondence using integer indices
MAX_SEQ_LEN = 30
number_of_slices = 10 #@param {type:"slider", min:1, max:1000, step:1}
visited = set()
correspondence = {'0': {"artist": "", "song": "", "uri": ""}}
inv_correspondence = dict()
c = 1
for i in tqdm(range(number_of_slices)):
  first = 1000*i
  data_url = os.path.join(path, "data" , f'mpd.slice.{first}-{first+999}.json')
  with open(data_url) as f:
    data = json.load(f)['playlists']
    for playlist in data:
      if len(playlist['tracks']) > MAX_SEQ_LEN:
        for track in playlist['tracks']:
          if track['track_uri'] not in visited:
            correspondence[c] = {"artist": track['artist_name'], "song": track['track_name'], "uri": track["track_uri"]}
            inv_correspondence[track['track_uri']] = c
            visited.add(track['track_uri'])
            c += 1

with open("correspondence.json", 'w') as f:
  json.dump(correspondence, f)

#with open("inv_correspondence.json", 'w') as f:
#  json.dump(inv_correspondence, f)

vocab_size = len(correspondence)
print("Number of unique songs: ", len(correspondence))
del correspondence

100%|██████████| 10/10 [00:02<00:00,  4.02it/s]


Number of unique songs:  159816


In [5]:
def get_song_details(idx: str, correspondence = None):
  if not correspondence:
    with open("correspondence.json") as f:
      correspondence = json.load(f)
      d = correspondence[idx]
      del correspondence
      return d
  else:
    return correspondence[idx]['artist'], correspondence[idx]['song'], correspondence[idx]['uri']

In [6]:
idx = "1"
get_song_details(idx)

{'artist': 'Missy Elliott',
 'song': 'Lose Control (feat. Ciara & Fat Man Scoop)',
 'uri': 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI'}

In [7]:
#@title Saving playlists as lists of indices
!mkdir -p playlists

for i in tqdm(range(number_of_slices)):
  first = 1000*i
  data_url = os.path.join(path, "data" , f'mpd.slice.{first}-{first+999}.json')
  with open(data_url) as f:
    data = json.load(f)['playlists']
    for idx, playlist in enumerate(data):
      filename = f"playlist_{first+idx}.json"
      new_playlist = {'n_followers': playlist['num_followers'], 'tracks': []}
      if len(playlist['tracks']) > MAX_SEQ_LEN:
        for track in playlist['tracks']:
          new_playlist['tracks'].append(inv_correspondence[track['track_uri']])


      with open(os.path.join("playlists", filename), 'w') as f:
        json.dump(new_playlist, f)

del new_playlist, inv_correspondence

100%|██████████| 10/10 [00:03<00:00,  3.03it/s]


In [8]:
#@title Create torch dataset

class PlaylistDataset(Dataset):
  def __init__(self, root_dir, seq_len = MAX_SEQ_LEN):
    self.root_dir = root_dir
    self.seq_len = seq_len
    self.url_playlists = []
    self.len_original_playlist = []

    for url in os.listdir(root_dir):
      if url.endswith('.json'):
        with open(os.path.join(root_dir, url)) as f:
          playlist = json.load(f)
          if len(playlist['tracks']) > seq_len:
            self.url_playlists.append(url)
            self.len_original_playlist.append(len(playlist['tracks']))


  def __len__(self):
    c = 0
    for l in self.len_original_playlist:
      c += l - (self.seq_len+1)
    return c

  def __getitem__(self, idx):

    for i in range(len(self.len_original_playlist)):
      if idx < self.len_original_playlist[i] - (self.seq_len+1):
        break
      idx -= self.len_original_playlist[i] - (self.seq_len+1)

    with open(os.path.join(self.root_dir, self.url_playlists[i])) as f:
        playlist = json.load(f)

    tracks = playlist['tracks'][idx:idx+self.seq_len+1]
    #followers = playlist['n_followers']

    return (
        torch.tensor(tracks[:-1], dtype=torch.long),
        torch.tensor(tracks[1:], dtype=torch.long)
    )


dataset = PlaylistDataset("playlists")
print("Number of playlists: ", len(dataset))

Number of playlists:  391846


In [9]:
print(dataset[0])

(tensor([  9593,   4003,  30789,   7800,   1953,   1968,   1252,   3037,  30348,
          5037,  19120,   6351,  25061,     76,   6144,  35270,  81174, 129902,
         21891,  21533,  24380,  37468,   3998,  22627,  13884,   6183,   1946,
          5576,  13075,  69779]), tensor([  4003,  30789,   7800,   1953,   1968,   1252,   3037,  30348,   5037,
         19120,   6351,  25061,     76,   6144,  35270,  81174, 129902,  21891,
         21533,  24380,  37468,   3998,  22627,  13884,   6183,   1946,   5576,
         13075,  69779,   4215]))


In [10]:
from torch.utils.data.dataset import random_split

NUM_WORKERS = 2
VAL_FRACTION = 0.1
TEST_FRACTION = 0.1
BATCH_SIZE = 32

total_length = len(dataset)
val_length = int(total_length * VAL_FRACTION)
test_length = int(total_length * TEST_FRACTION)
train_length = total_length - val_length - test_length

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_length, val_length, test_length])
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

In [11]:
import pytorch_lightning as pl
import torch.nn as nn
import torch.nn.functional as F
import torch

torch.set_float32_matmul_precision('medium')

HIDDEN_SIZE = 128
NUM_LAYERS = 4
LR = 0.01

class RNNModel(pl.LightningModule):
  def __init__(self, vocab_size, hidden_size = HIDDEN_SIZE, num_layers = NUM_LAYERS):
    super().__init__()
    self.vocab_size = vocab_size
    self.rnn = nn.LSTM(input_size=vocab_size,
                       hidden_size=hidden_size,
                       num_layers=num_layers,
                       dropout=0.1,
                       batch_first=True)
    self.decoder = nn.Linear(hidden_size, vocab_size)

  def forward(self, x, hidden=None):
      x = F.one_hot(x, num_classes=self.vocab_size).float()
      output, hidden = self.rnn(x, hidden)
      output = self.decoder(output)
      return output, hidden

  def configure_optimizers(self):
      optimizer = torch.optim.Adam(self.parameters(), lr=LR)
      return optimizer

  def training_step(self, batch, batch_idx):
      x, y = batch
      y_hat, _ = self.forward(x)
      y_hat = y_hat.squeeze()
      loss = F.cross_entropy(y_hat.transpose(1, 2), y)
      self.log('train_loss', loss)
      return loss

  def validation_step(self, batch, batch_idx):
      x, y = batch
      y_hat, _ = self.forward(x)
      loss = F.cross_entropy(y_hat.transpose(1, 2), y)
      self.log('val_loss', loss)
      return loss



In [None]:
from pytorch_lightning.callbacks import EarlyStopping

MAX_EPOCHS = 10
PATIENCE = 2

early_stop_callback = EarlyStopping(monitor = 'val_loss', patience = PATIENCE, verbose = True, mode = 'min')
model = RNNModel(vocab_size=vocab_size)

trainer = pl.Trainer(max_epochs=MAX_EPOCHS, callbacks=[early_stop_callback])

trainer.fit(model, train_loader, val_loader)