0. Setup


In [56]:
import torch
import numpy as np
from collections import Counter
from torch.utils.data import TensorDataset, Subset, Dataset, DataLoader
from sklearn.model_selection import StratifiedShuffleSplit

1. Loading Individual Datasets


In [3]:
def create_balanced_dataset(test_dataset, target_samples_per_class=500, seed=None):
    """Oversample under-represented classes up to a target count.

    Every original sample is kept. For each class with fewer than
    ``target_samples_per_class`` samples, extra indices are drawn at random
    (with replacement) from that class until it reaches the target. Classes
    already at or above the target are left untouched (no downsampling).

    Args:
        test_dataset: Indexable dataset supporting ``len()`` whose items are
            ``(data, label)`` pairs. Labels may be plain Python values or
            single-element tensors / NumPy scalars (anything with ``.item()``).
        target_samples_per_class: Minimum number of samples each class should
            have in the returned dataset.
        seed: Optional seed for the oversampling draw. When ``None`` (default)
            NumPy's global random state is used, as before.

    Returns:
        ``test_dataset`` itself when no class is below the target, otherwise a
        ``Subset`` of ``test_dataset`` containing the original indices followed
        by the oversampled ones.
    """
    def _label_key(label):
        # Tensor-like labels are unwrapped to a hashable Python scalar;
        # plain Python labels (e.g. int) are used as-is instead of crashing.
        return label.item() if hasattr(label, "item") else label

    # Gather labels from the test dataset (single pass over the data).
    test_labels = []
    for i in range(len(test_dataset)):
        _, label = test_dataset[i]
        test_labels.append(_label_key(label))

    # Count samples per class in the test dataset
    class_counts = Counter(test_labels)
    print(f"Original class distribution: {dict(class_counts)}")

    # Identify classes that need oversampling, and by how many samples
    classes_to_oversample = {
        cls: (target_samples_per_class - count)
        for cls, count in class_counts.items()
        if count < target_samples_per_class
    }

    # If no class is under the target, simply return the original dataset
    if not classes_to_oversample:
        print("No oversampling needed - all classes have enough samples.")
        return test_dataset

    # Map each class to the list of indices that contain that class
    class_indices_map = {cls: [] for cls in class_counts}
    for i, lbl in enumerate(test_labels):
        class_indices_map[lbl].append(i)

    # A dedicated generator is used only when a seed is given, so callers that
    # seed NumPy globally keep getting the same draws as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Generate the new indices by oversampling
    additional_indices = []
    for cls, num_needed in classes_to_oversample.items():
        # Randomly sample (with replacement) from the available indices of this class
        oversampled = rng.choice(class_indices_map[cls], size=num_needed, replace=True)
        # .tolist() yields plain Python ints rather than NumPy integer scalars.
        additional_indices.extend(oversampled.tolist())

    # Combine original indices with the newly oversampled ones
    all_indices = list(range(len(test_dataset))) + additional_indices

    # Create a new Subset using these indices
    balanced_dataset = Subset(test_dataset, all_indices)

    # Verify the new distribution from the labels already collected, instead of
    # indexing (and possibly re-decoding) every sample a second time.
    balanced_counts = Counter(test_labels[idx] for idx in all_indices)
    print(f"Balanced class distribution: {dict(balanced_counts)}")

    return balanced_dataset

In [5]:
# Load the audio test dataset from the .pt file.
# weights_only=False is passed explicitly: the loaded object is used below as an
# indexable dataset (presumably a pickled Dataset object — confirm), which needs
# full unpickling. Newer PyTorch releases default to weights_only=True and would
# reject such a file. Unpickling can execute arbitrary code, so only load files
# from a trusted source.
audio_dataset = torch.load("../../Data/audio_test_dataset.pt", weights_only=False)
# Create a balanced dataset with 500 samples per class.
balanced_audio_dataset = create_balanced_dataset(audio_dataset, target_samples_per_class=500)
# Print out the size of the dataset.
print(f"Size of the balanced dataset: {len(balanced_audio_dataset)}")

  audio_dataset = torch.load("../../Data/audio_test_dataset.pt")


Original class distribution: {5: 70, 35: 37, 20: 83, 11: 42, 29: 100, 28: 56, 6: 67, 10: 58, 19: 100, 44: 31, 12: 68, 34: 92, 21: 58, 37: 100, 8: 34, 31: 40, 40: 35, 25: 42, 22: 62, 42: 42, 0: 33, 30: 100, 14: 52, 23: 100, 27: 31, 7: 100, 18: 100, 45: 66, 41: 30, 9: 37, 2: 57, 13: 60, 3: 67, 43: 45, 15: 37, 26: 36, 4: 73, 32: 57, 33: 34, 38: 42, 46: 67, 24: 41, 1: 48, 39: 32, 17: 39, 36: 39, 16: 42}
Balanced class distribution: {5: 500, 35: 500, 20: 500, 11: 500, 29: 500, 28: 500, 6: 500, 10: 500, 19: 500, 44: 500, 12: 500, 34: 500, 21: 500, 37: 500, 8: 500, 31: 500, 40: 500, 25: 500, 22: 500, 42: 500, 0: 500, 30: 500, 14: 500, 23: 500, 27: 500, 7: 500, 18: 500, 45: 500, 41: 500, 9: 500, 2: 500, 13: 500, 3: 500, 43: 500, 15: 500, 26: 500, 4: 500, 32: 500, 33: 500, 38: 500, 46: 500, 24: 500, 1: 500, 39: 500, 17: 500, 36: 500, 16: 500}
Size of the balanced dataset: 23500


In [48]:
# Now load the image test dataset from the .pt file.
# weights_only=False is passed explicitly so loading behaves the same across
# PyTorch releases (newer ones default to weights_only=True). Unpickling can
# execute arbitrary code, so only load files from a trusted source.
images_dataset = torch.load("../../Data/images_test_dataset.pt", weights_only=False)
# The loaded object is a dictionary; its "images" and "labels" entries are
# iterated below (the images are expected to be same-shaped tensors, since
# they are stacked).
# We want to extract the data into two tensors - features and labels.
features = list(images_dataset["images"])
labels = list(images_dataset["labels"])
# Convert the features and labels to PyTorch tensors.
features_tensor = torch.stack(features)
labels_tensor = torch.tensor(labels)

print(f"Features tensor shape: {features_tensor.shape}")
print(f"Labels tensor shape: {labels_tensor.shape}")

  images_dataset = torch.load("../../Data/images_test_dataset.pt")


Features tensor shape: torch.Size([821, 3, 224, 224])
Labels tensor shape: torch.Size([821])


In [52]:
# Pair the stacked image features with their labels in a TensorDataset, then
# oversample every class that has fewer than 500 samples.
images_dataset = TensorDataset(features_tensor, labels_tensor)
balanced_images_dataset = create_balanced_dataset(
    images_dataset,
    target_samples_per_class=500,
)
# Report how many samples the balanced dataset holds.
balanced_images_count = len(balanced_images_dataset)
print(f"Size of the balanced dataset: {balanced_images_count}")

Original class distribution: {796: 18, 810: 16, 875: 18, 774: 16, 513: 15, 756: 18, 889: 16, 753: 18, 831: 18, 987: 18, 886: 18, 400: 17, 802: 18, 766: 18, 852: 18, 848: 18, 877: 16, 352: 18, 964: 18, 749: 16, 888: 18, 315: 18, 543: 18, 790: 18, 957: 18, 871: 16, 669: 18, 317: 18, 851: 18, 900: 18, 910: 18, 746: 18, 536: 16, 832: 17, 949: 18, 902: 18, 856: 18, 835: 18, 847: 16, 979: 18, 914: 18, 539: 15, 950: 18, 450: 18, 527: 18, 946: 17, 830: 18}
Balanced class distribution: {796: 500, 810: 500, 875: 500, 774: 500, 513: 500, 756: 500, 889: 500, 753: 500, 831: 500, 987: 500, 886: 500, 400: 500, 802: 500, 766: 500, 852: 500, 848: 500, 877: 500, 352: 500, 964: 500, 749: 500, 888: 500, 315: 500, 543: 500, 790: 500, 957: 500, 871: 500, 669: 500, 317: 500, 851: 500, 900: 500, 910: 500, 746: 500, 536: 500, 832: 500, 949: 500, 902: 500, 856: 500, 835: 500, 847: 500, 979: 500, 914: 500, 539: 500, 950: 500, 450: 500, 527: 500, 946: 500, 830: 500}
Size of the balanced dataset: 23500


2. Creating Combined Dataloaders


In [None]:
# Pull every (image, label) pair out of the balanced image dataset.
balanced_image_pairs = [
    balanced_images_dataset[sample_idx]
    for sample_idx in range(len(balanced_images_dataset))
]
images_data = [image for image, _ in balanced_image_pairs]
images_labels = [label for _, label in balanced_image_pairs]
# Stack the images into one tensor and build a matching label tensor.
images_data_tensor = torch.stack(images_data)
images_labels_tensor = torch.tensor(images_labels)
print(f"Balanced images data tensor shape: {images_data_tensor.shape}")
print(f"Balanced images labels tensor shape: {images_labels_tensor.shape}")

Balanced images data tensor shape: torch.Size([23500, 3, 224, 224])
Balanced images labels tensor shape: torch.Size([23500])


In [55]:
# Pull every (audio, label) pair out of the balanced audio dataset.
balanced_audio_pairs = [
    balanced_audio_dataset[sample_idx]
    for sample_idx in range(len(balanced_audio_dataset))
]
audio_data = [audio for audio, _ in balanced_audio_pairs]
audio_labels = [label for _, label in balanced_audio_pairs]
# Stack the audio samples into one tensor and build a matching label tensor.
audio_data_tensor = torch.stack(audio_data)
audio_labels_tensor = torch.tensor(audio_labels)
print(f"Balanced audio data tensor shape: {audio_data_tensor.shape}")
print(f"Balanced audio labels tensor shape: {audio_labels_tensor.shape}")

Balanced audio data tensor shape: torch.Size([23500, 1, 128, 128])
Balanced audio labels tensor shape: torch.Size([23500])
