In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold
import os
from pathlib import Path
import platform
from torch.amp import autocast, GradScaler  # Updated import path

In [None]:
class AudioDataset(Dataset):
    """Dataset of audio segments rendered as fixed-size, normalized log-mel patches.

    The segment-info CSV is read with pandas; the columns ``segment_path``,
    ``instrument_label`` and ``participant_id`` are used.

    Index layout:
        * ``train=True``: every CSV row is exposed once per
          (pitch shift, time stretch) pair, so ``len(self)`` is
          ``len(df) * len(pitch_shifts) * len(time_stretches)`` and flat index
          ``i`` refers to CSV row ``i // n_augs``.
        * ``train=False``: one item per CSV row, no augmentation.

    Args:
        segment_info_path: Path to the CSV describing the segments.
        transform: Stored on the instance; it is not applied anywhere in this class.
        train: Whether to expose the augmented variants of each segment.
    """

    def __init__(self, segment_info_path, transform=None, train=True):
        self.df = pd.read_csv(segment_info_path)
        self.transform = transform
        self.train = train

        # Standardize JSo/JoS naming
        self.df['participant_id'] = self.df['participant_id'].replace('JoSP', 'JSoP')

        self.label_to_id = {'hhc': 0, 'hho': 1, 'kd': 2, 'sd': 3}

        # Parameters for mel spectrogram (as per paper)
        self.sample_rate = 16000
        self.n_mels = 128
        self.n_fft = 1024
        self.hop_length = 512
        # Number of time frames every returned patch is padded/cropped to.
        self.target_length = 64

        # Augmentation grid: 5 pitch shifts x 5 time stretches = 25 variants per
        # segment; the (0, 1.0) pair is the unmodified original.
        # NOTE(review): the earlier comment said "10x as per paper", which does
        # not match this 25-way grid -- confirm which one is intended.
        self.pitch_shifts = [-2, -1, 0, 1, 2]  # semitones
        self.time_stretches = [0.9, 0.95, 1.0, 1.05, 1.1]  # playback-rate factors

    def __len__(self):
        if self.train:
            return len(self.df) * len(self.pitch_shifts) * len(self.time_stretches)
        return len(self.df)

    @staticmethod
    def _min_max_normalize(values):
        """Rescale ``values`` to [0, 1]; a constant array maps to all zeros.

        The plain ``(x - min) / (max - min)`` form yields NaN (0 / 0) when the
        patch has no dynamic range, e.g. for digital silence, and a single NaN
        patch is enough to poison a training run.
        """
        lowest = values.min()
        span = values.max() - lowest
        if span == 0:
            return np.zeros_like(values)
        return (values - lowest) / span

    @staticmethod
    def _fix_length(spec_tensor, target_length):
        """Zero-pad on the right or center-crop dim 1 to ``target_length`` frames."""
        current_length = spec_tensor.size(1)
        if current_length < target_length:
            pad_amount = target_length - current_length
            return torch.nn.functional.pad(spec_tensor, (0, pad_amount))
        if current_length > target_length:
            start = (current_length - target_length) // 2
            return spec_tensor[:, start:start + target_length]
        return spec_tensor

    def _load_and_process_audio(self, audio_path, pitch_shift=0, time_stretch=1.0):
        """Load one segment and return its normalized log-mel patch.

        Args:
            audio_path: Path of the audio file as stored in the CSV.
            pitch_shift: Pitch shift in semitones; 0 disables it.
            time_stretch: Time-stretch rate; 1.0 disables it.

        Returns:
            Float tensor of shape ``(n_mels, target_length)`` with values in [0, 1].
        """
        # Paths that already start with '../' get one more '../' prepended.
        # NOTE(review): presumably the CSV paths are relative to a directory one
        # level above where this notebook runs -- confirm.
        if audio_path.startswith('../'):
            audio_path = f"../{audio_path}"

        # Load audio, resampled to the dataset's sample rate
        y, sr = librosa.load(audio_path, sr=self.sample_rate)

        # Apply augmentation (time stretch first, then pitch shift)
        if time_stretch != 1.0:
            y = librosa.effects.time_stretch(y, rate=time_stretch)
        if pitch_shift != 0:
            y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)

        # Compute mel spectrogram and convert to log scale
        mel_spec = librosa.feature.melspectrogram(
            y=y, sr=sr, n_mels=self.n_mels, n_fft=self.n_fft, hop_length=self.hop_length
        )
        log_mel_spec = librosa.power_to_db(mel_spec)

        # Min-max normalize each patch to [0, 1] as per paper
        log_mel_spec = self._min_max_normalize(log_mel_spec)

        # Convert to tensor and ensure fixed size (n_mels x target_length)
        spec_tensor = torch.as_tensor(log_mel_spec, dtype=torch.float32)
        return self._fix_length(spec_tensor, self.target_length)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label_id, participant_id)`` for flat index ``idx``."""
        if self.train:
            n_stretches = len(self.time_stretches)
            n_augs = len(self.pitch_shifts) * n_stretches
            # Flat index -> (CSV row, augmentation), then augmentation ->
            # (pitch index, stretch index); stretch varies fastest.
            orig_idx, aug_idx = divmod(idx, n_augs)
            pitch_idx, time_idx = divmod(aug_idx, n_stretches)

            pitch_shift = self.pitch_shifts[pitch_idx]
            time_stretch = self.time_stretches[time_idx]
        else:
            orig_idx = idx
            pitch_shift = 0
            time_stretch = 1.0

        row = self.df.iloc[orig_idx]
        audio_path = row['segment_path']
        label = self.label_to_id[row['instrument_label']]
        participant = row['participant_id']

        spec = self._load_and_process_audio(audio_path, pitch_shift, time_stretch)
        return spec, label, participant

class DrumCNN(nn.Module):
    """CNN classifier over single-channel spectrogram patches with an embedding head.

    Four conv -> batch-norm -> ReLU -> 2x2 max-pool stages (64, 128, 256 and
    512 channels) feed a fully connected embedding layer with dropout, followed
    by a linear classifier. The flattened size is fixed at 512 * 8 * 4, which
    matches 128 x 64 inputs after four spatial halvings.

    Args:
        num_classes: Number of output logits.
        embedding_dim: Width of the embedding layer.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Layers of one stage: 3x3 same-padded conv, batch norm, ReLU, 2x2 max pool."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]

    def __init__(self, num_classes=4, embedding_dim=1024):
        super().__init__()

        # Channel progression; for a 1 x 128 x 64 input the feature maps are
        # 64x64x32 -> 128x32x16 -> 256x16x8 -> 512x8x4.
        channel_plan = (1, 64, 128, 256, 512)
        stage_layers = []
        for in_channels, out_channels in zip(channel_plan[:-1], channel_plan[1:]):
            stage_layers.extend(self._conv_stage(in_channels, out_channels))
        self.conv_layers = nn.Sequential(*stage_layers)

        self.flatten_size = 512 * 8 * 4

        self.embedding = nn.Sequential(
            nn.Linear(self.flatten_size, embedding_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
        )

        self.classifier = nn.Linear(embedding_dim, num_classes)

    def forward(self, x, return_embedding=False):
        """Map a batch of patches to class logits, or to embeddings on request.

        Args:
            x: Batch without a channel axis; a channel axis is inserted at dim 1.
            return_embedding: If true, return the embedding-layer output instead
                of the classifier logits.
        """
        feature_maps = self.conv_layers(x.unsqueeze(1))
        flattened = feature_maps.view(feature_maps.size(0), -1)
        embedding = self.embedding(flattened)

        return embedding if return_embedding else self.classifier(embedding)

def get_device():
    """Select the compute device, preferring an RTX 4090 if one is visible.

    Order of preference: the first CUDA device whose name contains "4090",
    then CUDA device 0, then the MPS backend on macOS, then the CPU.

    Returns:
        Tuple ``(device, cuda_available)`` where ``cuda_available`` is True only
        when a CUDA device was selected.
    """
    if torch.cuda.is_available():
        named_gpus = (
            (index, torch.cuda.get_device_name(index))
            for index in range(torch.cuda.device_count())
        )
        preferred = next(
            ((index, name) for index, name in named_gpus if "4090" in name), None
        )
        if preferred is not None:
            i, gpu_name = preferred
            print(f"Found NVIDIA RTX 4090: {gpu_name}")
            return torch.device(f"cuda:{i}"), True

        print(f"Using available GPU: {torch.cuda.get_device_name(0)}")
        return torch.device("cuda:0"), True

    # No CUDA: fall back to Apple's MPS backend when running on macOS.
    if platform.system() == "Darwin" and torch.backends.mps.is_available():
        print("Using Apple Silicon GPU")
        return torch.device("mps"), False

    print("Using CPU")
    return torch.device("cpu"), False

def train_model(train_loader, val_loader, model, device, cuda_available=False, num_epochs=100):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=5)
    
    # Only use GradScaler when CUDA is available
    scaler = GradScaler('cuda') if cuda_available else None
    
    best_val_acc = 0
    patience = 10  # Early stopping as per paper
    patience_counter = 0
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        
        for specs, labels, _ in train_loader:
            specs = specs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            
            optimizer.zero_grad(set_to_none=True)
            
            if cuda_available:
                # Use mixed precision only with CUDA
                with autocast():
                    outputs = model(specs)
                    loss = criterion(outputs, labels)
                
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                # Regular training for CPU/MPS
                outputs = model(specs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
        
        train_acc = 100. * correct / total
        
        # Validation phase
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        
        with torch.no_grad():
            for specs, labels, _ in val_loader:
                specs = specs.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)
                
                if cuda_available:
                    with autocast():
                        outputs = model(specs)
                        loss = criterion(outputs, labels)
                else:
                    outputs = model(specs)
                    loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()
        
        val_acc = 100. * correct / total
        
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss/len(train_loader):.3f} | Train Acc: {train_acc:.3f}%')
        print(f'Val Loss: {val_loss/len(val_loader):.3f} | Val Acc: {val_acc:.3f}%')
        
        scheduler.step(val_acc)
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

def analyze_participants(df):
    """Print a per-dataset participant summary and return the participant lists.

    IDs starting with 'P' are counted as AVP participants. Every other ID is
    treated as an LVT recording whose last character is a suffix ('I' or 'P')
    on a base name; the misspelled 'JoSP' is counted as the 'P' recording of
    'JSo'.

    Args:
        df: DataFrame with a string ``participant_id`` column.

    Returns:
        Tuple ``(avp_participants, lvt_base_names)``, both sorted lists of strings.
    """
    unique_ids = df['participant_id'].unique()

    # AVP participants (P1, P2, etc.)
    avp_participants = sorted(pid for pid in unique_ids if pid.startswith('P'))

    # LVT participants: strip the trailing I/P marker to get one name per person
    lvt_ids = [pid for pid in unique_ids if not pid.startswith('P')]
    lvt_base_names = sorted({pid[:-1] for pid in lvt_ids if pid != 'JoSP'})
    lvt_id_lookup = set(lvt_ids)

    print("\nDetailed Participant Analysis:")
    print(f"AVP Dataset ({len(avp_participants)} participants):")
    print(avp_participants)

    print(f"\nLVT Dataset ({len(lvt_base_names)} participants):")
    for base_name in lvt_base_names:
        has_I = (base_name + "I") in lvt_id_lookup
        if (base_name + "P") in lvt_id_lookup:
            has_P = True
        else:
            # The misspelled 'JoSP' stands in for 'JSoP'.
            has_P = base_name == 'JSo' and 'JoSP' in lvt_id_lookup
        print(f"{base_name}: {'I' if has_I else '-'}{'P' if has_P else '-'}")

    total_participants = len(avp_participants) + len(lvt_base_names)
    print(f"\nTotal unique participants: {total_participants}")
    print(f"- AVP Dataset: {len(avp_participants)} participants")
    print(f"- LVT Dataset: {len(lvt_base_names)} participants")

    return avp_participants, lvt_base_names

def main():
    """Run participant-grouped cross-validation, then k-NN evaluation on embeddings.

    Pipeline:
        1. Split the CSV rows into 5 folds with GroupKFold so that no person
           appears in both the training and validation part of a fold.
        2. Per fold, train a fresh DrumCNN on all augmented variants of the
           training rows and validate on the unaugmented validation rows.
        3. Collect embeddings of each fold's validation rows.
        4. Evaluate a 5-NN (Manhattan) classifier leave-one-person-out on the
           pooled embeddings, fitting the scaler on the training side only.

    NOTE(review): the pooled embeddings come from five independently trained
    models, so their feature spaces are not guaranteed to be comparable; the
    k-NN numbers should be read with that in mind.
    """
    # Get device and CUDA availability
    device, cuda_available = get_device()

    # Enable CUDA optimizations only if available
    if cuda_available:
        torch.backends.cudnn.benchmark = True
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    # Set random seed
    torch.manual_seed(42)

    # Two views of the same CSV. The training view is indexed by
    # row * n_augs + augmentation, the evaluation view by row, so they must not
    # share one index space (doing so maps every fold onto the first few rows
    # and leaks training rows into validation).
    segment_info_path = '../../../segment_info/segment_info.csv'
    train_dataset = AudioDataset(segment_info_path=segment_info_path, train=True)
    eval_dataset = AudioDataset(segment_info_path=segment_info_path, train=False)
    n_augs = len(train_dataset.pitch_shifts) * len(train_dataset.time_stretches)
    n_segments = len(eval_dataset.df)

    # Analyze participants
    analyze_participants(eval_dataset.df)

    def get_participant_group(participant_id):
        """Map a participant ID to a stable per-person group label.

        String labels are used instead of ``hash()``: Python salts string
        hashes per process, which made the fold assignment irreproducible.
        """
        if participant_id.startswith('P'):
            return f"P{int(participant_id[1:])}"
        if participant_id == 'JoSP':
            return 'JSo'
        # LVT IDs carry a trailing I/P recording marker; drop it.
        return participant_id[:-1]

    groups = eval_dataset.df['participant_id'].apply(get_participant_group).values

    # Cross-validation setup
    n_splits = 5
    gkf = GroupKFold(n_splits=n_splits)

    # Storage for embeddings and labels
    all_embeddings = []
    all_labels = []
    all_participants = []

    # Determine batch size and workers based on device
    batch_size = 128 if cuda_available else 32
    val_batch_size = 256 if cuda_available else 64
    num_workers = 8 if cuda_available else 4
    loader_kwargs = dict(
        num_workers=num_workers,
        pin_memory=cuda_available,
        persistent_workers=True if num_workers > 0 else False,
        prefetch_factor=2 if num_workers > 0 else None,
    )
    aug_offsets = np.arange(n_augs)

    # Cross-validation loop
    for fold, (train_idx, val_idx) in enumerate(gkf.split(X=np.zeros(n_segments), groups=groups)):
        print(f"\nFold {fold + 1}/{n_splits}")

        # Expand each training row into the flat indices of all its variants.
        train_aug_idx = (train_idx[:, None] * n_augs + aug_offsets[None, :]).ravel().tolist()
        train_subset = torch.utils.data.Subset(train_dataset, train_aug_idx)
        val_subset = torch.utils.data.Subset(eval_dataset, val_idx.tolist())

        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, **loader_kwargs)
        val_loader = DataLoader(val_subset, batch_size=val_batch_size, shuffle=False, **loader_kwargs)

        model = DrumCNN().to(device)
        train_model(train_loader, val_loader, model, device, cuda_available)

        # Extract embeddings of the held-out rows
        model.eval()
        fold_embeddings = []
        fold_labels = []
        fold_participants = []

        with torch.no_grad():
            for specs, labels, parts in val_loader:
                specs = specs.to(device, non_blocking=True)

                if cuda_available:
                    # torch.amp.autocast needs the device type explicitly.
                    with autocast(device_type='cuda'):
                        embeddings = model(specs, return_embedding=True)
                else:
                    embeddings = model(specs, return_embedding=True)

                # Autocast may produce fp16; store fp32 for scaling and k-NN.
                fold_embeddings.append(embeddings.float().cpu().numpy())
                fold_labels.extend(labels.numpy())
                fold_participants.extend(parts)

        all_embeddings.append(np.concatenate(fold_embeddings))
        all_labels.extend(fold_labels)
        all_participants.extend(fold_participants)

        # Drop the loaders so their persistent worker processes are released
        # before the next fold starts new ones.
        del train_loader, val_loader

    # Prepare for k-NN evaluation
    embeddings = np.concatenate(all_embeddings)
    labels = np.array(all_labels)
    participants = np.array(all_participants)
    # Group I/P recordings of the same person so neither leaks into training.
    person_groups = np.array([get_participant_group(p) for p in participants])

    print("\nFinal dataset sizes:")
    print(f"Embeddings shape: {embeddings.shape}")
    print(f"Labels shape: {labels.shape}")
    print(f"Participants shape: {participants.shape}")

    # Evaluate using leave-one-participant-out
    participant_accuracies = []

    print("\nParticipant-wise evaluation:")
    for test_participant in np.unique(person_groups):
        test_mask = person_groups == test_participant
        train_mask = ~test_mask

        # Scale using only training data (as per paper)
        scaler = StandardScaler().fit(embeddings[train_mask])
        train_embeddings = scaler.transform(embeddings[train_mask])
        test_embeddings = scaler.transform(embeddings[test_mask])

        knn = KNeighborsClassifier(n_neighbors=5, metric='manhattan')
        knn.fit(train_embeddings, labels[train_mask])
        accuracy = knn.score(test_embeddings, labels[test_mask])
        participant_accuracies.append(accuracy)

        print(f"Participant {test_participant}: {accuracy:.3f}")

    print(f"\nMean participant-independent accuracy: {np.mean(participant_accuracies):.3f}")

# Entry point: run the full pipeline when this code is executed as the main
# module (which is the case when the cell is run in a notebook kernel).
if __name__ == "__main__":
    main()

Found NVIDIA RTX 4090: NVIDIA GeForce RTX 4090

Detailed Participant Analysis:
AVP Dataset (28 participants):
['P1', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P2', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25', 'P26', 'P27', 'P28', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9']

LVT Dataset (20 participants):
AFR: IP
AZi: IP
Bea: IP
Bic: IP
Cat: IP
Cav: IP
Cra: IP
Isa: IP
JOl: IP
JSi: IP
JSo: IP
MCo: IP
Maf: IP
Mar: IP
Nor: IP
Ric: IP
Rob: IP
Sof: IP
Zga: IP
Ziz: IP

Total unique participants: 48
- AVP Dataset: 28 participants
- LVT Dataset: 20 participants

Fold 1/5
