In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("notshrirang/spotify-million-song-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/notshrirang/spotify-million-song-dataset?dataset_version_number=1...


100%|██████████| 20.7M/20.7M [00:00<00:00, 68.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/notshrirang/spotify-million-song-dataset/versions/1


**Pre-process**

In [None]:
import pandas as pd

# Define the path to the dataset CSV file
dataset_path = "/root/.cache/kagglehub/datasets/notshrirang/spotify-million-song-dataset/versions/1/spotify_millsongdata.csv"

# Load the dataset
df = pd.read_csv(dataset_path)
df = df[["artist", "text"]]

# Display the first few rows
print(df.head())

# Check column names
print(df.columns)


  artist                                               text
0   ABBA  Look at her face, it's a wonderful face  \r\nA...
1   ABBA  Take it easy with me, please  \r\nTouch me gen...
2   ABBA  I'll never know why I had to go  \r\nWhy I had...
3   ABBA  Making somebody happy is a question of give an...
4   ABBA  Making somebody happy is a question of give an...
Index(['artist', 'text'], dtype='object')


In [None]:
num_artists = len(set(df['artist']))
print(num_artists)


643


**WordDataset and RNN**

In [None]:
import torch
import re
from torch.utils.data import Dataset

class WordDataset(Dataset):
  def __init__(self, df, artist, seq_len=100, device='cpu'):
    self.seq_len = seq_len
    self.df = df[df["artist"] == artist]
    self.text = self.df['text'].str.cat(sep=' ')

    # Tokenize text while preserving newlines (\r\n)
    self.tokens = re.findall(r"\w+(?:'\w+)?|[\r\n]+|[.,!?;:]", self.text)
    self.vocab = sorted(set(self.tokens))

    self.wordtoidx = {word: idx for idx, word in enumerate(self.vocab)}
    self.idxtoword = {idx: word for idx, word in enumerate(self.vocab)}
    self.device = device
    self.encoded = [self.wordtoidx[word] for word in self.tokens if word in self.wordtoidx]

  def __len__(self):
    # replace this with code to return the number of possible sub-sequences
    return len(self.encoded) - self.seq_len

  def __getitem__(self,i):
    # Get the sequence of token indices and the target (next token in sequence)
    X = self.encoded[i:i+self.seq_len]  # Input sequence
    y = self.encoded[i+1:i+self.seq_len+1]  # Target sequence (shifted by 1)
    # Return both as tensors, where X is the input and y is the target
    return torch.tensor(X, device=self.device), torch.tensor(y, device=self.device)

  def decode(self,tokens):
    # replace this with code to convert a sequence of tokens back into a string
    words = [self.idxtoword[token] for token in tokens]
    text = ""
    for word in words:
      if word == "\r\n":
        text += word
      elif word in ".,!?;:":
        text = text.rstrip() + word
      else:
        text += " " + word
    return text.strip()

In [None]:
ed_sheeran_dataset = WordDataset(df, "Ed Sheeran", seq_len=75)
vocab = ed_sheeran_dataset.vocab

print(vocab)
print(ed_sheeran_dataset.decode(ed_sheeran_dataset.encoded[:500]))

It's alright to cry even my dad does sometimes
 So don't wipe your eyes
 Tears remind you you're alive
 It's alright to die cause death the only thing you haven't tried
 But just for tonight hold on
 So live life like you're giving all
 Cause you act like you are
 Go ahead and just live it up
 Go on and tell me your path

 It's alright to shake
 Even my hand does sometimes
 So inside the rage Against the dying of the light
 It's alright to say that death's
 The only thing you haven't tried
 But just for today hold on
 So live life like you're giving all
 Cause you act like you are
 Go ahead and just live it up
 Go on and tell me your path

 Go ahead and just live it up
 Go on and tell me your path and hold on 

 We're not, no we're not friends, nor have we ever been
 We just try to keep those secrets in our lives
 And if they find out, will it all go wrong?
 I never know, no one wants it to

 So I could take the back road
 But your eyes'll lead me straight

In [None]:
from torch import nn

class LyricRNN(nn.Module):
  def __init__(self,vocabulary_size,hidden_size=256):
    super().__init__()
    self.embedding = nn.Embedding(vocabulary_size,hidden_size)
    self.hidden_size = hidden_size
    self.U = nn.Linear(hidden_size,hidden_size)
    self.W = nn.Linear(hidden_size,hidden_size)
    self.act = nn.SiLU()
    self.V = nn.Linear(hidden_size,vocabulary_size)

  def forward(self,x):
    x = self.embedding(x)
    B,N = x.shape[:2]
    h = torch.zeros(B,self.hidden_size).to(x.device)
    Ux = self.U(x)
    y = []
    for i in range(N):
      Wh = self.W(h)
      h = self.act(Ux[:,i] + Wh)
      y.append(self.V(h))
    return torch.stack(y,dim=1)

In [None]:
class LyricGRU(nn.Module):
    def __init__(self, vocab_size, embed_size=300, hidden_size=256, num_layers=3):
        super(LyricGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=.3, bidirectional=True)
        self.fc = nn.Linear(2*hidden_size, vocab_size)  # Predict next word in vocab

    def forward(self, x, hidden=None):
        x = self.embedding(x)  # Shape: (batch, seq_len, embed_size)
        x, hidden = self.gru(x, hidden)  # Shape: (batch, seq_len, hidden_size)

        # Process each timestep separately, similar to your LyricRNN
        y = [self.fc(x[:, i, :]) for i in range(x.shape[1])]

        return torch.stack(y, dim=1)  # Shape: (batch, seq_len, vocab_size)


In [None]:
class LyricLSTM(nn.Module):
    def __init__(self, vocab_size, embed_size=300, hidden_size=256, num_layers=3):
        super(LyricLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=.25, bidirectional=True)
        self.layer_norm = nn.LayerNorm(2*hidden_size)
        self.fc = nn.Linear(2*hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)  # Shape: [batch_size, seq_len, embed_size]
        lstm_out, _ = self.lstm(x)  # Shape: [batch_size, seq_len, hidden_size]
        lstm_out = self.layer_norm(lstm_out)
        y = [self.fc(lstm_out[:, i, :]) for i in range(lstm_out.shape[1])]
        return torch.stack(y, dim=1)  # Shape: [batch_size, seq_len, vocab_size]


**Model for ABBA**

In [None]:
#train model for just ABBA
from torch.utils.data import DataLoader
from tqdm import tqdm

def train_model(model_type, df, artist, num_epochs=20, batch_size=16, lr=0.001, convergence_threshold=1e-3, device='cuda' if torch.cuda.is_available() else 'cpu'):
  print(f"Training {model_type} for {artist}..\n")
  torch.manual_seed(42)
  train_dataset = WordDataset(df, artist, seq_len=50, device=device)
  train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
  vocab_size = len(train_dataset.vocab)

  if model_type == "RNN":
    model = LyricRNN(vocab_size).to(device)
  elif model_type == "GRU":
    model = LyricGRU(vocab_size).to(device)
  elif model_type == "LSTM":
    model = LyricLSTM(vocab_size).to(device)
  else:
    raise ValueError("Invalid model name. Choose from 'RNN', 'GRU', or 'LSTM'.")

  opt = torch.optim.Adam(model.parameters(),lr)
  loss_fn = nn.CrossEntropyLoss()
  model.train()

  prev_loss = float('inf')
  for epoch in range(num_epochs):
    total_loss = 0
    for batch, (X, y) in enumerate(tqdm(train_loader)):
      opt.zero_grad()
      outputs = model(X)
      loss = loss_fn(outputs.view(-1, vocab_size), y.view(-1))
      loss.backward()
      opt.step()
      total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    if abs(prev_loss - avg_loss) < convergence_threshold:
      print(f"Model converged after {epoch+1} epochs.")
      break

    prev_loss = avg_loss

  return train_dataset, train_loader, model

In [None]:
def top_k_sampling(probs, k=5):
    """ Selects from the top-k most probable next words. """
    top_k_probs, top_k_indices = torch.topk(probs, k)
    top_k_probs = torch.softmax(top_k_probs, dim=0)  # Re-normalize probabilities
    next_word_idx = top_k_indices[torch.multinomial(top_k_probs, num_samples=1)]
    return next_word_idx.item()

def top_p_sampling(probs, p=.8):
    """ Nucleus sampling: select from the smallest set of words whose cumulative probability > p. """
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    cumulative_probs = torch.cumsum(sorted_probs, dim=0)
    sorted_indices_to_keep = sorted_indices[cumulative_probs <= p]
    if len(sorted_indices_to_keep) == 0:  # Ensure we select at least one word
        sorted_indices_to_keep = sorted_indices[:1]
    sorted_probs = sorted_probs[:len(sorted_indices_to_keep)]
    next_word_idx = sorted_indices_to_keep[torch.multinomial(torch.softmax(sorted_probs, dim=0), num_samples=1)]
    return next_word_idx.item()

def generate_text_stochastic(model, dataset, prompt, num_to_generate=100, device='cuda' if torch.cuda.is_available() else 'cpu'):
    #print(f'vocab size: {len(dataset.vocab)}')  # Should match model's vocabulary_size
    model.eval()
    generated_text = prompt.split()
    # Get the tokens use encode func in WordDataset
    tokens = [dataset.wordtoidx[word] for word in prompt.split() if word in dataset.wordtoidx]

    # Generate the text
    for _ in range(num_to_generate):
        if len(tokens) < dataset.seq_len:
            input_seq = torch.tensor(tokens, device=device).unsqueeze(0)
        else:
            input_seq = torch.tensor(tokens[-dataset.seq_len:], device=device).unsqueeze(0)  # Use last seq_len tokens

        with torch.no_grad():
            output = model(input_seq)
            #print(f'output shape: {output.shape}')
            temperature = 1  # Higher temperature increases randomness
            probs = torch.softmax(output[0, -1] / temperature, dim=0)
            #next_word_idx = top_p_sampling(probs)
            #next_word_idx = top_k_sampling(probs)
            next_word_idx = torch.multinomial(probs, num_samples=1).item()
            #print(f'next word idx: {next_word_idx}')

        next_word = dataset.idxtoword[next_word_idx]

        if next_word.startswith("\r\n"):
            generated_text.append(next_word)
        elif next_word in ".,!?;:":
            if generated_text:
                generated_text[-1] += next_word
            else:
                generated_text.append(next_word)
        else:
            generated_text.append(next_word)

        tokens.append(next_word_idx) # update tokens

    return dataset.decode([dataset.wordtoidx[word] for word in generated_text if word in dataset.wordtoidx])

In [None]:
dataset, train_loader, trained_model = train_model("LSTM", df, "Ed Sheeran")

Training LSTM for Ed Sheeran..



100%|██████████| 1365/1365 [00:46<00:00, 29.61it/s]


Epoch 1/20, Loss: 0.2790


100%|██████████| 1365/1365 [00:39<00:00, 34.95it/s]


Epoch 2/20, Loss: 0.0488


100%|██████████| 1365/1365 [00:36<00:00, 37.08it/s]


Epoch 3/20, Loss: 0.0329


100%|██████████| 1365/1365 [00:37<00:00, 36.88it/s]


Epoch 4/20, Loss: 0.0227


100%|██████████| 1365/1365 [00:36<00:00, 36.98it/s]


Epoch 5/20, Loss: 0.0168


100%|██████████| 1365/1365 [00:36<00:00, 36.96it/s]


Epoch 6/20, Loss: 0.0136


100%|██████████| 1365/1365 [00:36<00:00, 37.04it/s]


Epoch 7/20, Loss: 0.0121


100%|██████████| 1365/1365 [00:36<00:00, 37.07it/s]


Epoch 8/20, Loss: 0.0109


100%|██████████| 1365/1365 [00:37<00:00, 36.17it/s]


Epoch 9/20, Loss: 0.0102
Model converged after 9 epochs.


In [None]:
generated_text = generate_text_stochastic(trained_model, dataset, "We found love", num_to_generate=200)
print(generated_text)

We found love you sky
 To refine the purest of kings
 And even though I know these tears come with a pain
 Even and just the same

 Chorus x2

 Verse 3
 The seas are full of water
 Stops by my shore
 All over the track like a feature
 And never wants to
 So tell me when it kicks in

 Pre Chorus
 People fall in love with you every single day
 And I just wanna tell you I am

 Chorus
 So now 

 we're just beyond


 I don't know when I lost my
 Maybe it was every
 You You You You You You You You You You You You You You You You You You You You You You You


**Perplexity**

In [None]:
import torch
import torch.nn.functional as F

def compute_perplexity(model, data_loader, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Computes perplexity for a given trained model on a dataset.

    Parameters:
    - model: Trained language model (LyricRNN or LyricGRU)
    - data_loader: DataLoader containing the test data
    - device: 'cpu' or 'cuda'

    Returns:
    - perplexity: Perplexity score
    """
    model.eval()
    total_loss = 0
    total_words = 0
    loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')  # Sum loss across words

    with torch.no_grad():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)  # Move data to device

            logits = model(X)  # (batch, seq_len, vocab_size)
            vocab_size = logits.shape[-1]

            # Reshape for loss calculation: (batch*seq_len, vocab_size)
            logits = logits.view(-1, vocab_size)
            y = y.view(-1)  # Flatten targets: (batch*seq_len)

            loss = loss_fn(logits, y)
            total_loss += loss.item()
            total_words += y.numel()

    avg_loss = total_loss / total_words
    perplexity = torch.exp(torch.tensor(avg_loss))

    return perplexity.item()


In [None]:
def write_lyrics(model_type, df, artist, prompt, num_to_generate=200, device='cuda' if torch.cuda.is_available() else 'cpu'):
    dataset, train_loader, trained_model = train_model(model_type, df, artist)
    generated_text = generate_text_stochastic(trained_model, dataset, prompt, num_to_generate)

    print(f"\nGenerated Text for {artist}:\n")
    print(generated_text)
    print("\n")

    train_perplexity = compute_perplexity(trained_model, train_loader)
    print(f"Perplexity for {artist} using {model_type}: {train_perplexity:.2f}")

In [None]:
write_lyrics("RNN", df, "Ed Sheeran", "We found love")

Training RNN for Ed Sheeran..



100%|██████████| 1365/1365 [00:33<00:00, 40.86it/s]


Epoch 1/20, Loss: 1.1745


100%|██████████| 1365/1365 [00:33<00:00, 41.04it/s]


Epoch 2/20, Loss: 0.1819


100%|██████████| 1365/1365 [00:32<00:00, 41.43it/s]


Epoch 3/20, Loss: 0.1581


100%|██████████| 1365/1365 [00:32<00:00, 41.43it/s]


Epoch 4/20, Loss: 0.1470


100%|██████████| 1365/1365 [00:33<00:00, 40.83it/s]


Epoch 5/20, Loss: 0.1411


100%|██████████| 1365/1365 [00:33<00:00, 40.68it/s]


Epoch 6/20, Loss: 0.1370


100%|██████████| 1365/1365 [00:33<00:00, 40.59it/s]


Epoch 7/20, Loss: 0.1339


100%|██████████| 1365/1365 [00:33<00:00, 40.45it/s]


Epoch 8/20, Loss: 0.1318


100%|██████████| 1365/1365 [00:33<00:00, 41.17it/s]


Epoch 9/20, Loss: 0.1296


100%|██████████| 1365/1365 [00:33<00:00, 40.95it/s]


Epoch 10/20, Loss: 0.1281


100%|██████████| 1365/1365 [00:33<00:00, 41.01it/s]


Epoch 11/20, Loss: 0.1265


100%|██████████| 1365/1365 [00:33<00:00, 40.94it/s]


Epoch 12/20, Loss: 0.1254


100%|██████████| 1365/1365 [00:32<00:00, 41.61it/s]


Epoch 13/20, Loss: 0.1242


100%|██████████| 1365/1365 [00:33<00:00, 40.89it/s]


Epoch 14/20, Loss: 0.1241
Model converged after 14 epochs.

Generated Text for Ed Sheeran:

We found love right

 And oh I've known it for the longest time
 And all my hope
 All my words are all over written on the signs
 But you're on my road walking me home
 home

 See the flames inside my eyes
 It burns so bright I wanna feel your love
 Easy baby maybe I'm a liar
 But for tonight I wanna fall in love
 And put your faith in my stomach

 I messed up this time
 Late last night
 Drinking to suppress devotion
 With fingers intertwined
 I can't shake this feeling now
 We're going through the motions
 Hoping you'd stop

 And oh I've only caused you pain
 You know but all of my words will always below
 Of all the love you spoke
 When you're on my road walking me home
 home

 See the flames inside my eyes
 It burns so bright I wanna feel your love
 Easy baby maybe I'm a liar
 But for tonight I wanna fall in love
 You see your




In [16]:
write_lyrics("GRU", df, "Ed Sheeran", "We found love")

Training GRU for Ed Sheeran..



100%|██████████| 1365/1365 [00:35<00:00, 38.53it/s]


Epoch 1/20, Loss: 0.4373


 34%|███▍      | 462/1365 [00:12<00:24, 37.32it/s]


KeyboardInterrupt: 

In [None]:
write_lyrics("LSTM", df, "Ed Sheeran", "We found love")