In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
from transformers import Wav2Vec2Model 

In [None]:
class RankNetLoss(nn.Module):
    """Pairwise RankNet loss.

    The predicted probability that item i outranks item j is
    ``sigmoid((o_i - o_j) / sigma)``. The loss is the binary cross-entropy
    between that probability and the target preference ``P_ij``, averaged
    over the batch.
    """

    def __init__(self, sigma=1.0):
        """
        Args:
            sigma (float): Temperature dividing the score difference before the
                sigmoid. With sigma=1 the prediction is just sigmoid(o_i - o_j).

        Raises:
            ValueError: If sigma is a plain number that is not strictly positive
                (zero would divide by zero, a negative value would invert the ranking).
        """
        super(RankNetLoss, self).__init__()
        if isinstance(sigma, (int, float)) and sigma <= 0:
            raise ValueError(f"sigma must be positive, got {sigma}")
        self.sigma = sigma
        # The loss is evaluated on logits rather than on probabilities: an explicit
        # sigmoid saturates to exactly 0/1 in float32 for large score gaps, which
        # clamps the loss and zeroes the gradient for confidently wrong pairs.
        self.bce_loss = nn.BCEWithLogitsLoss()

    def forward(self, o_i, o_j, P_ij):
        """
        Args:
            o_i (torch.Tensor): Model's raw score for item i (batch_size x 1)
            o_j (torch.Tensor): Model's raw score for item j (batch_size x 1)
            P_ij (torch.Tensor): True probability that item i is ranked higher than item j.
                                 Must hold as many elements as o_i - o_j; it is reshaped to match.
                                 P_ij = 1.0 if score_i > score_j
                                 P_ij = 0.0 if score_i < score_j
                                 P_ij = 0.5 if score_i == score_j

        Returns:
            torch.Tensor: Scalar mean binary cross-entropy over the batch.
        """
        # Temperature-scaled score difference; sigmoid(logits) is the predicted
        # probability that item i is ranked higher than item j.
        logits = (o_i - o_j) / self.sigma

        # Ensure the target lives on the same device and has the same dtype/shape as the logits.
        P_ij = P_ij.to(device=logits.device, dtype=logits.dtype).view_as(logits)

        return self.bce_loss(logits, P_ij)



In [None]:
class PairwiseIntensityDataset(Dataset):
    """Dataset of ordered audio pairs labelled by which clip has the higher intensity score.

    Two modes are supported:
      * ``num_pairs_per_epoch is None``: every ordered pair (i, j) with i != j is
        pre-generated and indexed deterministically.
      * ``num_pairs_per_epoch = N``: each access draws a random ordered pair of
        distinct samples; the dataset reports length N.
    """

    def __init__(self, audio_input_list, intensity_scores, processor, target_sampling_rate=16000, num_pairs_per_epoch=None):
        """
        Args:
            audio_input_list (list): List of pre-loaded audio waveforms (arrays or tensors).
            intensity_scores (np.array or torch.Tensor): Intensity score per audio item,
                aligned with audio_input_list.
            processor: Callable applied to a single waveform as
                ``processor(audio, sampling_rate=..., return_tensors="pt", padding=True)``
                whose result exposes ``input_values`` (e.g. a Wav2Vec2 processor or
                feature extractor).
            target_sampling_rate (int): Sampling rate passed to the processor.
            num_pairs_per_epoch (int, optional): Number of random pairs per epoch.
                If None, all ordered pairs are pre-generated.

        Raises:
            ValueError: If the number of scores differs from the number of audio items.
        """
        self.audio_input_list = audio_input_list
        # as_tensor + clone accepts lists, arrays and tensors alike (without the
        # copy-construct warning torch.tensor() emits for tensor input) while still
        # owning its data, so later edits to the caller's array do not leak in.
        self.intensity_scores = torch.as_tensor(intensity_scores, dtype=torch.float32).detach().clone()
        self.processor = processor
        self.target_sampling_rate = target_sampling_rate
        self.num_samples = len(self.audio_input_list)

        if self.intensity_scores.dim() == 0 or self.intensity_scores.shape[0] != self.num_samples:
            raise ValueError(
                f"Expected one intensity score per audio item: got {self.num_samples} audio items "
                f"and scores of shape {tuple(self.intensity_scores.shape)}."
            )

        self.num_pairs_per_epoch = num_pairs_per_epoch
        if num_pairs_per_epoch is None:
            # Pre-generate all ordered pairs (quadratic in the number of samples).
            self.pairs = [
                (i, j)
                for i in range(self.num_samples)
                for j in range(self.num_samples)
                if i != j
            ]
        else:
            # Pairs are sampled on the fly in __getitem__.
            self.pairs = []

    def __len__(self):
        """Number of pairs per epoch; 0 when fewer than two samples exist."""
        if self.num_pairs_per_epoch is not None:
            # A pair needs two distinct samples.
            return self.num_pairs_per_epoch if self.num_samples > 1 else 0
        return len(self.pairs)

    def _process_audio(self, audio_input):
        """Run one waveform through the processor and return its input_values without the batch dim."""
        processed = self.processor(audio_input, sampling_rate=self.target_sampling_rate, return_tensors="pt", padding=True)
        return processed.input_values.squeeze(0)  # Remove batch dim the processor adds for a single sample

    def __getitem__(self, idx):
        """Return (audio_i, audio_j, P_ij) for one pair.

        P_ij is 1.0 if score_i > score_j, 0.0 if score_i < score_j, otherwise 0.5.

        Raises:
            IndexError: If idx is out of range or no pair can be formed.
        """
        if self.num_pairs_per_epoch is not None:
            # Bounds check so plain iteration over the dataset terminates and so a
            # dataset with fewer than two samples fails fast instead of looping forever.
            if not -len(self) <= idx < len(self):
                raise IndexError(f"Pair index {idx} out of range for dataset of length {len(self)}.")
            # Two distinct indices, uniformly over ordered pairs.
            i, j = random.sample(range(self.num_samples), 2)
        elif self.pairs:
            i, j = self.pairs[idx]
        else:
            raise IndexError("Dataset not configured correctly for pair generation or is empty.")

        audio_i_processed = self._process_audio(self.audio_input_list[i])
        audio_j_processed = self._process_audio(self.audio_input_list[j])

        score_i = self.intensity_scores[i]
        score_j = self.intensity_scores[j]

        if score_i > score_j:
            P_ij = 1.0
        elif score_i < score_j:
            P_ij = 0.0
        else:
            P_ij = 0.5

        return audio_i_processed, audio_j_processed, torch.tensor(P_ij, dtype=torch.float32)


In [None]:
class EmoRankNet(nn.Module):
    """Wav2Vec2 encoder followed by an MLP head that outputs one scalar score per clip."""

    def __init__(self, wav2vec2_model_name="facebook/wav2vec2-base",
                scoring_head_hidden_dims=(256, 128), freeze_wav2vec2=True):
        """
        Args:
            wav2vec2_model_name (str): Checkpoint name passed to Wav2Vec2Model.from_pretrained.
            scoring_head_hidden_dims (sequence of int, optional): Hidden layer widths of the
                scoring head. Empty or None gives a single linear layer.
            freeze_wav2vec2 (bool): If True, all encoder parameters get requires_grad=False.
        """
        super(EmoRankNet, self).__init__()

        # Load pre-trained Wav2Vec2 model
        self.wav2vec2 = Wav2Vec2Model.from_pretrained(wav2vec2_model_name)

        if freeze_wav2vec2:
            for param in self.wav2vec2.parameters():
                param.requires_grad = False

        # Input width of the scoring head is the encoder's hidden size, read from its config.
        wav2vec2_output_dim = self.wav2vec2.config.hidden_size

        # Scoring head: Linear -> ReLU per hidden dim, then a final Linear to one score.
        layers = []
        current_dim = wav2vec2_output_dim
        if scoring_head_hidden_dims:
            for h_dim in scoring_head_hidden_dims:
                layers.append(nn.Linear(current_dim, h_dim))
                layers.append(nn.ReLU())
                current_dim = h_dim
        layers.append(nn.Linear(current_dim, 1))  # Output a single score
        self.scoring_head = nn.Sequential(*layers)

    def forward(self, input_values, attention_mask=None):
        """
        Args:
            input_values (torch.Tensor): Processed audio waveforms (batch_size x num_audio_samples)
            attention_mask (torch.Tensor, optional): Sample-level padding mask with the same
                shape as input_values (1 for real audio, 0 for padding).

        Returns:
            torch.Tensor: Scores of shape (batch_size, 1).
        """
        outputs = self.wav2vec2(input_values, attention_mask=attention_mask)

        # (batch_size, num_frames, hidden_size)
        hidden_states = outputs.last_hidden_state

        if attention_mask is None:
            # No mask: assume no padding and mean pool over all frames.
            pooled_output = hidden_states.mean(dim=1)
        else:
            # The encoder downsamples the waveform, so hidden_states has far fewer
            # time steps than attention_mask. Convert the sample-level mask to a
            # frame-level one before pooling; multiplying them directly would fail
            # on the mismatched time dimension.
            frame_mask = self.wav2vec2._get_feature_vector_attention_mask(
                hidden_states.shape[1], attention_mask
            )
            # (batch_size, num_frames) -> (batch_size, num_frames, 1)
            expanded_mask = frame_mask.unsqueeze(-1).to(hidden_states.dtype)
            sum_hidden_states = (hidden_states * expanded_mask).sum(dim=1)
            sum_mask = torch.clamp(expanded_mask.sum(dim=1), min=1e-9)  # Avoid division by zero
            pooled_output = sum_hidden_states / sum_mask

        # (batch_size, hidden_size) -> (batch_size, 1)
        score = self.scoring_head(pooled_output)
        return score

In [None]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# Hyperparameters
SEED = 42  # Seeds the dummy data and the random pair sampling for reproducible runs
NUM_SAMPLES = 50  # Reduced for faster dummy example
EPOCHS = 5 # Reduced for faster dummy example
BATCH_SIZE = 4 # Reduced for dummy data
LEARNING_RATE = 1e-4 # May need to be smaller if fine-tuning Wav2Vec2
SIGMA_RANKNET = 1.0
WAV2VEC2_MODEL_NAME = "facebook/wav2vec2-base" # Using a smaller model for example
TARGET_SAMPLING_RATE = 16000
NUM_PAIRS_PER_EPOCH = NUM_SAMPLES * 2 # Reduced for faster dummy data

# Seed every RNG used below (python, numpy, torch).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Dummy audio data (replace with your actual pre-processed audio data)
# Each item should be a 1D numpy array or tensor representing a raw waveform
# Ensure they are at the TARGET_SAMPLING_RATE
dummy_audio_waveforms = [np.random.randn(TARGET_SAMPLING_RATE * random.randint(1,3)).astype(np.float32) for _ in range(NUM_SAMPLES)]

dummy_intensity_scores = np.random.rand(NUM_SAMPLES).astype(np.float32) * 10

# Load the audio pre-processor.
# "facebook/wav2vec2-base" is a pretraining-only checkpoint without a tokenizer, so
# Wav2Vec2Processor (feature extractor + tokenizer) cannot be loaded from it.
# Only waveform normalisation is needed here, which the feature extractor provides.
processor = Wav2Vec2FeatureExtractor.from_pretrained(WAV2VEC2_MODEL_NAME)

# Instantiate Dataset and DataLoader
# PairwiseIntensityDataset handles per-clip audio processing using the pre-processor
pairwise_dataset = PairwiseIntensityDataset(
    dummy_audio_waveforms, 
    dummy_intensity_scores,
    processor,
    target_sampling_rate=TARGET_SAMPLING_RATE,
    num_pairs_per_epoch=NUM_PAIRS_PER_EPOCH
)

actual_batch_size = min(BATCH_SIZE, len(pairwise_dataset))
if actual_batch_size == 0:
    # Raise instead of exit(): exit() would shut down the notebook kernel.
    raise RuntimeError("Dataset is empty or too small for the batch size.")

# Custom collate_fn for padding variable length sequences from the processor
def _pad_waveforms(waveforms):
    """Zero-pad 1-D waveforms to a common length and return (input_values, attention_mask)."""
    waveforms = [torch.as_tensor(w, dtype=torch.float32) for w in waveforms]
    lengths = torch.tensor([w.shape[0] for w in waveforms])
    input_values = torch.nn.utils.rnn.pad_sequence(waveforms, batch_first=True, padding_value=0.0)
    # 1 where a position holds real audio, 0 where it is padding.
    positions = torch.arange(input_values.shape[1]).unsqueeze(0)
    attention_mask = (positions < lengths.unsqueeze(1)).long()
    return input_values, attention_mask


def collate_fn_pair(batch):
    """Collate (audio_i, audio_j, P_ij) samples into padded batches.

    The dataset items are already processed per clip, so they are only padded
    here. Running the processor a second time would re-normalise the audio, and
    its output is not guaranteed to carry an attention mask.

    Args:
        batch: Sequence of (audio_i, audio_j, P_ij) tuples, where the audio entries
            are 1-D waveforms of possibly different lengths and P_ij is a scalar tensor.

    Returns:
        tuple: (input_values_i, attention_mask_i, input_values_j, attention_mask_j, p_ij_tensor)
    """
    audio_i_list, audio_j_list, p_ij_list = zip(*batch)

    input_values_i, attention_mask_i = _pad_waveforms(audio_i_list)
    input_values_j, attention_mask_j = _pad_waveforms(audio_j_list)

    p_ij_tensor = torch.stack(p_ij_list)

    return input_values_i, attention_mask_i, \
            input_values_j, attention_mask_j, \
            p_ij_tensor

train_loader = DataLoader(
    pairwise_dataset,
    batch_size=actual_batch_size,
    shuffle=True,
    collate_fn=collate_fn_pair,
)

# Prefer the GPU when one is visible to torch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# freeze_wav2vec2=True trains only the scoring head; pass False to fine-tune the encoder too.
model = EmoRankNet(wav2vec2_model_name=WAV2VEC2_MODEL_NAME, freeze_wav2vec2=True)
model = model.to(device)

rank_loss_fn = RankNetLoss(sigma=SIGMA_RANKNET)
# Only parameters that still require gradients are handed to the optimizer.
# When fine-tuning Wav2Vec2, separate parameter groups with a smaller learning
# rate for the encoder than for the head may be preferable.
optimizer = optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=LEARNING_RATE,
)


print(f"Starting training with {len(train_loader)} batches per epoch.")
model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    num_batches = 0
    for batch in train_loader:
        # Move the five collated tensors to the training device, in order.
        input_values_i, attention_mask_i, input_values_j, attention_mask_j, P_ij_target = (
            tensor.to(device) for tensor in batch
        )

        optimizer.zero_grad()

        # Score both members of each pair with the same network.
        score_i = model(input_values_i, attention_mask=attention_mask_i)
        score_j = model(input_values_j, attention_mask=attention_mask_j)

        loss = rank_loss_fn(score_i, score_j, P_ij_target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        print(f"Epoch [{epoch+1}/{EPOCHS}], No data processed in this epoch.")
    else:
        avg_loss = total_loss / num_batches
        print(f"Epoch [{epoch+1}/{EPOCHS}], Average Loss: {avg_loss:.4f}")


print("Training finished.")

In [None]:

# After training, the model can predict a raw score for any given waveform.
# These raw scores can then be used for ranking or, if calibrated, as intensity estimates.
# Example of getting a score for a single sample:
model.eval()
with torch.no_grad():
    # The model consumes processed waveforms, so run the clip through the same
    # pre-processor used for training. A single unpadded clip needs no attention mask.
    example_inputs = processor(
        dummy_audio_waveforms[0],
        sampling_rate=TARGET_SAMPLING_RATE,
        return_tensors="pt",
    )
    example_feature = example_inputs.input_values.to(device)
    predicted_score = model(example_feature)
    print(f"Example feature score: {predicted_score.item()}")