0. Setup


In [25]:
import torch
import pickle
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, Subset, Dataset, DataLoader
from sklearn.model_selection import StratifiedShuffleSplit

1. Loading Individual Datasets


In [26]:
# Helper function to use a dataset to create a balanced dataset that makes
# sure each class has the same number of samples.
def create_balanced_dataset(test_dataset, target_samples_per_class=500):
    # Gather labels from the test dataset.
    test_labels = []
    for i in range(len(test_dataset)):
        _, label = test_dataset[i]  
        # Extract the label from the dataset item.
        test_labels.append(label.item())

    # Count samples per class in the test dataset
    class_counts = Counter(test_labels)
    print(f"Original class distribution: {dict(class_counts)}")

    # Identify classes that need oversampling
    classes_to_oversample = {
        cls: (target_samples_per_class - count)
        for cls, count in class_counts.items()
        if count < target_samples_per_class
    }

    # If no class is under the target, simply return the original dataset
    if not classes_to_oversample:
        print("No oversampling needed - all classes have enough samples.")
        return test_dataset

    # Map each class to the list of indices that contain that class
    class_indices_map = {cls: [] for cls in class_counts.keys()}
    for i, lbl in enumerate(test_labels):
        class_indices_map[lbl].append(i)

    # Generate the new indices by oversampling
    additional_indices = []
    for cls, num_needed in classes_to_oversample.items():
        # Randomly sample (with replacement) from the available indices of this class
        oversampled = np.random.choice(class_indices_map[cls], size=num_needed, replace=True)
        additional_indices.extend(oversampled)

    # Combine original indices with the newly oversampled ones
    all_indices = list(range(len(test_dataset))) + additional_indices

    # Create a new Subset using these indices
    balanced_dataset = Subset(test_dataset, all_indices)

    # Optional: verify the new distribution
    balanced_labels = []
    for idx in all_indices:
        _, label = test_dataset[idx]
        balanced_labels.append(int(label))
    balanced_counts = Counter(balanced_labels)
    print(f"Balanced class distribution: {dict(balanced_counts)}")

    return balanced_dataset

In [27]:
# Load the audio test dataset from the .pt file.
audio_dataset = torch.load("../../Data/audio_test_dataset.pt")
# Create a balanced dataset with 500 samples per class.
balanced_audio_dataset = create_balanced_dataset(audio_dataset, target_samples_per_class=500)
# Print out the size of the dataset.
print(f"Size of the balanced dataset: {len(balanced_audio_dataset)}")

  audio_dataset = torch.load("../../Data/audio_test_dataset.pt")


Original class distribution: {5: 70, 35: 37, 20: 83, 11: 42, 29: 100, 28: 56, 6: 67, 10: 58, 19: 100, 44: 31, 12: 68, 34: 92, 21: 58, 37: 100, 8: 34, 31: 40, 40: 35, 25: 42, 22: 62, 42: 42, 0: 33, 30: 100, 14: 52, 23: 100, 27: 31, 7: 100, 18: 100, 45: 66, 41: 30, 9: 37, 2: 57, 13: 60, 3: 67, 43: 45, 15: 37, 26: 36, 4: 73, 32: 57, 33: 34, 38: 42, 46: 67, 24: 41, 1: 48, 39: 32, 17: 39, 36: 39, 16: 42}
Balanced class distribution: {5: 500, 35: 500, 20: 500, 11: 500, 29: 500, 28: 500, 6: 500, 10: 500, 19: 500, 44: 500, 12: 500, 34: 500, 21: 500, 37: 500, 8: 500, 31: 500, 40: 500, 25: 500, 22: 500, 42: 500, 0: 500, 30: 500, 14: 500, 23: 500, 27: 500, 7: 500, 18: 500, 45: 500, 41: 500, 9: 500, 2: 500, 13: 500, 3: 500, 43: 500, 15: 500, 26: 500, 4: 500, 32: 500, 33: 500, 38: 500, 46: 500, 24: 500, 1: 500, 39: 500, 17: 500, 36: 500, 16: 500}
Size of the balanced dataset: 23500


In [28]:
# Now load the image test dataset from the .pt file.
images_dataset = torch.load("../../Data/images_test_dataset.pt")
# This is a dictionary with a list of tensors.
# We want to extract the data into two tensors - features and labels.
features = []
for image_tensor in images_dataset["images"]:
    features.append(image_tensor)
labels = []
for label in images_dataset["labels"]:
    labels.append(label)
# Convert the features and labels to PyTorch tensors.
features_tensor = torch.stack(features)
labels_tensor = torch.tensor(labels)

print(f"Features tensor shape: {features_tensor.shape}")
print(f"Labels tensor shape: {labels_tensor.shape}")

  images_dataset = torch.load("../../Data/images_test_dataset.pt")


Features tensor shape: torch.Size([821, 3, 224, 224])
Labels tensor shape: torch.Size([821])


In [33]:
# We need to encode the labels of images, similar to how we did for audio.
# Load the label mappings from the pickle file
with open('label_mappings.pkl', 'rb') as f:
    label_mappings = pickle.load(f)

# Encode the labels using the mappings
label_encoder = LabelEncoder()
label_encoder.fit(list(label_mappings['index_to_id'].values()))
encoded_labels = label_encoder.transform(labels_tensor.numpy())
# Convert the encoded labels to a tensor
encoded_labels_tensor = torch.tensor(encoded_labels)
print(f"Encoded labels tensor shape: {encoded_labels_tensor.shape}")

Encoded labels tensor shape: torch.Size([821])


In [35]:
# Create a dataset from the features and labels, and use it to create a balanced dataset.
images_dataset = TensorDataset(features_tensor, encoded_labels_tensor)
balanced_images_dataset = create_balanced_dataset(images_dataset, target_samples_per_class=500)
# Print out the size of the dataset.
print(f"Size of the balanced dataset: {len(balanced_images_dataset)}")

Original class distribution: {18: 18, 20: 16, 31: 18, 16: 16, 5: 15, 14: 18, 35: 16, 13: 18, 22: 18, 46: 18, 33: 18, 3: 17, 19: 18, 15: 18, 28: 18, 26: 18, 32: 16, 2: 18, 44: 18, 12: 16, 34: 18, 0: 18, 9: 18, 17: 18, 43: 18, 30: 16, 10: 18, 1: 18, 27: 18, 36: 18, 38: 18, 11: 18, 7: 16, 23: 17, 41: 18, 37: 18, 29: 18, 24: 18, 25: 16, 45: 18, 39: 18, 8: 15, 42: 18, 4: 18, 6: 18, 40: 17, 21: 18}
Balanced class distribution: {18: 500, 20: 500, 31: 500, 16: 500, 5: 500, 14: 500, 35: 500, 13: 500, 22: 500, 46: 500, 33: 500, 3: 500, 19: 500, 15: 500, 28: 500, 26: 500, 32: 500, 2: 500, 44: 500, 12: 500, 34: 500, 0: 500, 9: 500, 17: 500, 43: 500, 30: 500, 10: 500, 1: 500, 27: 500, 36: 500, 38: 500, 11: 500, 7: 500, 23: 500, 41: 500, 37: 500, 29: 500, 24: 500, 25: 500, 45: 500, 39: 500, 8: 500, 42: 500, 4: 500, 6: 500, 40: 500, 21: 500}
Size of the balanced dataset: 23500


2. Creating Combined Dataloaders


In [7]:
# Extract the audio data from the balanced dataset.
audio_data = []
audio_labels = []
for i in range(len(balanced_audio_dataset)):
    audio, label = balanced_audio_dataset[i]
    audio_data.append(audio)
    audio_labels.append(label)
# Convert the data and labels to PyTorch tensors.
audio_data_tensor = torch.stack(audio_data)
audio_labels_tensor = torch.tensor(audio_labels)
print(f"Balanced audio data tensor shape: {audio_data_tensor.shape}")
print(f"Balanced audio labels tensor shape: {audio_labels_tensor.shape}")

Balanced audio data tensor shape: torch.Size([23500, 1, 128, 128])
Balanced audio labels tensor shape: torch.Size([23500])


In [8]:
# We define a class to create a dataset for the audio and image data.
# We want to create random pairs of audio and image data, of the same class.

# Create stratified data loaders for training, validation, and testing
# These will be created using the dataset framework we defined above,
# and using the balanced datasets we created earlier.
class MultimodalFusionDataset(Dataset):
    def __init__(self, image_data, image_labels, audio_data, audio_labels, transform=None, indices=None):
        """
        Custom dataset for multimodal fusion of image and audio data.
        
        Args:
            image_data: PyTorch tensor containing image data
            image_labels: PyTorch tensor containing image labels
            audio_data: PyTorch tensor containing audio data
            audio_labels: PyTorch tensor containing audio labels
            transform: Optional transform to be applied to the samples
            indices: Optional indices to select a subset of the data
        """
        self.image_data = image_data
        self.image_labels = image_labels
        self.audio_data = audio_data
        self.audio_labels = audio_labels
        self.transform = transform
        
        # Convert tensors to numpy for processing
        image_labels_np = image_labels.cpu().numpy()
        audio_labels_np = audio_labels.cpu().numpy()
        
        # Create a list of all possible class indices - ensure they're the same for both modalities
        image_classes = set(np.unique(image_labels_np))
        audio_classes = set(np.unique(audio_labels_np))
        common_classes = sorted(list(image_classes.intersection(audio_classes)))
        
        if len(common_classes) == 0:
            raise ValueError("No common classes found between image and audio datasets")
        
        # Organize samples by class
        self.class_indices = {}
        self.pairs = []
        self.pair_labels = []
        
        for cls in common_classes:
            img_indices = np.where(image_labels_np == cls)[0]
            audio_indices = np.where(audio_labels_np == cls)[0]
            
            if len(img_indices) > 0 and len(audio_indices) > 0:
                self.class_indices[cls] = {
                    'image': img_indices,
                    'audio': audio_indices
                }
                
                # Shuffle the indices for random pairing
                np.random.seed(42)  # For reproducibility
                np.random.shuffle(img_indices)
                np.random.shuffle(audio_indices)
                
                # Create pairs (one image with one audio) from the same class
                n_pairs = min(len(img_indices), len(audio_indices))
                for i in range(n_pairs):
                    self.pairs.append((img_indices[i], audio_indices[i]))
                    self.pair_labels.append(cls)
        
        # Check if we have any pairs
        if len(self.pairs) == 0:
            raise ValueError("No valid pairs could be created. Check your data and labels.")
        
        # Convert to numpy arrays for easier indexing
        self.pairs = np.array(self.pairs)
        self.pair_labels = np.array(self.pair_labels)
        
        # If specific indices are provided, only use those
        if indices is not None:
            if len(indices) > 0:  # Make sure indices is not empty
                self.pairs = self.pairs[indices]
                self.pair_labels = self.pair_labels[indices]
            else:
                raise ValueError("Empty indices provided")
    
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        img_idx, audio_idx = self.pairs[idx]
        
        image = self.image_data[img_idx]
        audio = self.audio_data[audio_idx]
        label = torch.tensor(self.pair_labels[idx], dtype=torch.long)
        
        if self.transform:
            image = self.transform(image)
            
        return {
            'image': image,
            'audio': audio,
            'label': label
        }


def create_stratified_data_loaders(image_data, image_labels, audio_data, audio_labels, batch_size=32, num_workers=4):
    """
    Create train, validation, and test data loaders with stratified splits.
    
    Args:
        image_data: PyTorch tensor containing image data
        image_labels: PyTorch tensor containing image labels
        audio_data: PyTorch tensor containing audio data
        audio_labels: PyTorch tensor containing audio labels
        batch_size: Batch size for the data loaders
        num_workers: Number of worker threads for the data loaders
        
    Returns:
        train_loader, val_loader, test_loader
    """
    # First, create the full dataset
    try:
        full_dataset = MultimodalFusionDataset(
            image_data=image_data,
            image_labels=image_labels,
            audio_data=audio_data,
            audio_labels=audio_labels
        )
    except ValueError as e:
        print(f"Error creating dataset: {e}")
        print(f"Image labels shape: {image_labels.shape}, unique: {torch.unique(image_labels).shape}")
        print(f"Audio labels shape: {audio_labels.shape}, unique: {torch.unique(audio_labels).shape}")
        raise
    
    # Get all pair labels
    pair_labels = full_dataset.pair_labels
    
    # Create indices array
    indices = np.arange(len(pair_labels))
    
    # First split: train+val vs test (80:20)
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
    temp_idx, test_idx = next(sss1.split(indices, pair_labels))
    
    # Get the temporary set labels
    temp_labels = pair_labels[temp_idx]
    
    # Second split: train vs val (80:20)
    temp_indices = np.arange(len(temp_idx))
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
    train_temp_idx, val_temp_idx = next(sss2.split(temp_indices, temp_labels))
    
    # Convert to original indices
    train_idx = temp_idx[train_temp_idx]
    val_idx = temp_idx[val_temp_idx]
    
    # Create the individual datasets
    train_dataset = MultimodalFusionDataset(
        image_data=image_data,
        image_labels=image_labels,
        audio_data=audio_data,
        audio_labels=audio_labels,
        indices=train_idx
    )
    
    val_dataset = MultimodalFusionDataset(
        image_data=image_data,
        image_labels=image_labels,
        audio_data=audio_data,
        audio_labels=audio_labels,
        indices=val_idx
    )
    
    test_dataset = MultimodalFusionDataset(
        image_data=image_data,
        image_labels=image_labels,
        audio_data=audio_data,
        audio_labels=audio_labels,
        indices=test_idx
    )
    
    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )
    
    # Print split information
    print(f"Dataset split: {len(train_dataset)} training, {len(val_dataset)} validation, {len(test_dataset)} test samples")
    
    return train_loader, val_loader, test_loader



In [10]:
# Use the extracted tensors to create stratified data loaders
 -٠٩batch_size = 32
train_loader, val_loader, test_loader = create_stratified_data_loaders(
    image_data=images_data_tensor,
    image_labels=images_labels_tensor,
    audio_data=audio_data_tensor,
    audio_labels=audio_labels_tensor,
    batch_size=batch_size,
    num_workers=4
)

Error creating dataset: No common classes found between image and audio datasets
Image labels shape: torch.Size([23500]), unique: torch.Size([47])
Audio labels shape: torch.Size([23500]), unique: torch.Size([47])


ValueError: No common classes found between image and audio datasets