### Dependencies

In [343]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

import os
import random
import librosa
from torch.utils.data import Dataset
from dotenv import load_dotenv
from multiprocessing import Pool, cpu_count


### Global Constants

In [344]:
load_dotenv()
DOWNLOAD_FOLDER = os.getenv("DOWNLOAD_FOLDER")
SAMPLE_RATE = 22050
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Dataset

In [345]:
songs_data = pd.read_csv('/mnt/f/Alex Stuff/songs_data_full.csv', index_col=0)

In [346]:
class AudioDataset(Dataset):
    def __init__(self, folder_path, sample_rate=22050, n_mels=128):
        self.folder_path = folder_path
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.audio_files = random.sample([os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.mp3')], 10)
        self.spectrogram_batch_size = 500
        self.spectrogram_batch = {}

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx):
        audio_path = self.audio_files[idx]
        _, video_id, _ = self.audio_path_info(audio_path)
        if len(songs_data[songs_data['videoID'] == video_id]) == 0:
            idx += 1
            self.__getitem__(idx)
            return
        
        spectrogram = self.get_spectrogram_batch(idx)[idx]
        views = torch.tensor(songs_data[songs_data['videoID'] == video_id]['views'].iloc[0], dtype=torch.float32)
        return spectrogram.to(DEVICE), views.to(DEVICE)

    def get_spectrogram_batch(self, current_idx):
        if current_idx % self.spectrogram_batch_size != 0:
            return self.spectrogram_batch
        
        self.spectrogram_batch = {}
        audio_paths_batch = self.audio_files[current_idx:min(current_idx+self.spectrogram_batch_size, self.__len__())]
        with Pool(processes=cpu_count()) as pool:
            args = enumerate(audio_paths_batch, start=current_idx)
            print(len(args[0]))
            results = pool.map(self.get_spectrogram, args)
        
        for idx, spectrogram in results:
            self.spectrogram_batch[idx] = spectrogram
            print(f"Super finished spec {idx}")
        print(f"Finished batch, current index = {current_idx}")
        return self.spectrogram_batch

    def get_spectrogram(self, args):
        idx, audio_path = args
        print(f"Getting spec {idx}")
        audio, _ = librosa.load(audio_path, sr=self.sample_rate)
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=self.sample_rate, n_mels=self.n_mels)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        spectrogram = torch.tensor(mel_spec_db, dtype=torch.float32).transpose(0, 1)  # Shape: (time, n_mels)
        print(f"Finished spec {idx}")
        return idx, spectrogram

    def audio_path_info(self, audio_path):
        try:
            filename = os.path.basename(audio_path)
            parts = filename.split("^")
            if len(parts) != 3:
                raise ValueError("Input text does not match the expected format 'index^videoID^songTitle'")
            index, videoID, songTitle = parts
            return int(index), videoID, songTitle
        except Exception as e:
            print(f"Error parsing text '{audio_path}': {e}")
            return None, None, None

### Models

In [347]:
# CNN Model for Song Embeddings
class CNNRegression(nn.Module):
    def __init__(self, input_dim):
        super(CNNRegression, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.fc1 = nn.Linear((input_dim // 4) * 32, 128)  # Flattened size after pooling
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)  # Output layer for regression

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension: (batch_size, 1, input_dim)
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        x = x.view(x.size(0), -1)  # Flatten
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)
        return x

# Instantiate the model
input_dim = 128
model_cnn = CNNRegression(input_dim)
print(model_cnn)

CNNRegression(
  (conv1): Conv1d(1, 16, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu1): ReLU()
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu2): ReLU()
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1024, out_features=128, bias=True)
  (relu3): ReLU()
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)


In [348]:
# LSTM Model for Song Embeddings
class LSTMRegression(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, num_layers=2):
        super(LSTMRegression, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)  # Output layer

    def forward(self, x):
        x = x.unsqueeze(1)  # Add sequence length dimension: (batch_size, 1, input_dim)
        _, (hn, _) = self.lstm(x)  # hn is the hidden state
        x = self.relu(self.fc1(hn[-1]))  # Use the last hidden state
        x = self.fc2(x)
        return x

# Instantiate the model
model_lstm = LSTMRegression(input_dim)
print(model_lstm)

LSTMRegression(
  (lstm): LSTM(128, 64, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=64, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)


In [349]:
# Transformer Model for Song Embeddings
class TransformerRegression(nn.Module):
    def __init__(self, input_dim, num_heads=4, dim_feedforward=128, num_layers=2):
        super(TransformerRegression, self).__init__()
        self.embedding = nn.Linear(input_dim, input_dim)  # Input projection
        encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)  # Output layer

    def forward(self, x):
        x = x.unsqueeze(1)  # Add sequence length dimension: (batch_size, 1, input_dim)
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)  # Global average pooling over sequence
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Instantiate the model
model_transformer = TransformerRegression(input_dim)
print(model_transformer)

TransformerRegression(
  (embedding): Linear(in_features=128, out_features=128, bias=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)




In [350]:
class RNN(nn.Module):
    def __init__(self, input_size=128, hidden_size=128, num_layers=2):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """
        Args:
            x: Mel spectrogram (time_steps, n_mels)
        """
        output, _ = self.rnn(x)
        last_hidden_state = output[:, -1, :] # get last hidden step
        out = self.fc(last_hidden_state)
        return out

In [351]:
class RecursiveNN(nn.Module):
    def __init__(self, input_size=128, hidden_size=128):
        super(RecursiveNN, self).__init__()
        self.fc_merge = nn.Linear(input_size * 2, hidden_size)
        self.fc_output = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """
        Recursively merges time steps.
        Args:
            x: Mel spectrogram (time_steps, n_mels)
        """
        while x.size(0) > 1:
            # Merge pairs of consecutive time steps
            if x.size(0) % 2 == 1:  # Duplicate last step if odd number of steps
                x = torch.cat((x, x[-1:]), dim=0)
            x = x.view(x.size(0) // 2, -1)  # Pairwise merge
            x = torch.relu(self.fc_merge(x))
        
        # Final output
        out = self.fc_output(x.squeeze(0))
        return out

In [352]:
class ZeroPaddingModel(nn.Module):
    def __init__(self, input_size=128, hidden_size=128, num_layers=2):
        super(ZeroPaddingModel, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, lengths):
        """
        Args:
            x: Padded input (batch_size, max_time, n_mels)
            lengths: Original lengths of each spectrogram
        """
        packed_input = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, (h_n, c_n) = self.lstm(packed_input)
        last_hidden_states = h_n[-1]
        out = self.fc(last_hidden_states)
        return out.squeeze(1)

### Utils

In [353]:
def collate_fn(batch):
    """
    Pads spectrograms in a batch to the same length.
    Args:
        batch: List of (spectrogram, target) tuples.
    Returns:
        padded_spectrograms: Padded spectrogram tensor (batch_size, max_time, n_mels)
        lengths: Lengths of each spectrogram (for RNN packing)
        targets: View count targets
    """
    spectrograms = [item[0] for item in batch]
    targets = torch.stack([item[1] for item in batch])
    
    # Pad spectrograms to the same length
    padded_spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True, padding_value=0.0)
    return padded_spectrograms, targets


### Training

In [354]:
# Training loop
def train_model(model, X, y, num_epochs=20, lr=0.001):
    return
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Dummy input data
X = torch.randn(1000, input_dim)
y = torch.randn(1000, 1)

In [355]:
# Train CNN
train_model(model_cnn, X, y)

In [356]:
# Train LSTM
train_model(model_lstm, X, y)

In [357]:
# Train Transformer
train_model(model_transformer, X, y)

In [358]:
from torch.utils.data import DataLoader
import torch.optim as optim

def train_model(model, dataloader, num_epochs=2, learning_rate=0.001, use_lengths=False):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        for spectrograms, targets in dataloader:
            optimizer.zero_grad()
            
            if use_lengths:  # Add zero padding
                lengths = torch.tensor([spec.shape[0] for spec in spectrograms], dtype=torch.int64).to(DEVICE)
                spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True, padding_value=0.0).to(DEVICE)
                outputs = model(spectrograms, lengths)
            else:
                outputs = model(spectrograms)
            
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}")

# Initialize models
rnn_model = RNN().to(DEVICE)
recursive_model = RecursiveNN().to(DEVICE)
zero_padding_model = ZeroPaddingModel().to(DEVICE)

# DataLoader
folder_path = '/mnt/f/Alex Stuff/Songs'
dataset = AudioDataset(folder_path)
dataloader = DataLoader(dataset, batch_size=1)
# For zero-padding model, use batch size > 1
dataloader_padded = DataLoader(dataset, batch_size=8, collate_fn=collate_fn)

In [359]:
print("Training RNN Model...")
train_model(rnn_model, dataloader)

Training RNN Model...


TypeError: 'enumerate' object is not subscriptable

In [None]:
print("Training Recursive Model...")
train_model(recursive_model, dataloader)

Training Recursive Model...


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/2], Loss: 413253561477685.1250
Epoch [2/2], Loss: 413253194831674.0625


In [273]:
print("Training Zero-Padding Model...")
train_model(zero_padding_model, dataloader_padded, use_lengths=True)

Training Zero-Padding Model...
Epoch [1/2], Loss: 259307412389888.0000
Epoch [2/2], Loss: 259307412389888.0000
