In [1]:
import os

import numpy as np

def _shift_labels(labels, offset):
    """
    Return ``labels + offset`` without letting a narrow integer dtype wrap around.

    Args:
    labels: Numpy array of labels from a single dataset.
    offset: Non-negative int added to every label.

    Returns:
    Numpy array of shifted labels. The dtype of ``labels`` is kept whenever the shifted
    values fit in it; otherwise the labels are widened to int64 before adding.
    """
    labels = np.asarray(labels)
    if offset and np.issubdtype(labels.dtype, np.integer):
        largest = int(labels.max()) if labels.size else 0
        # e.g. uint8 labels: 250 + 10 would wrap to 4 (or raise, depending on the
        # NumPy version), silently breaking label uniqueness.
        if largest + offset > np.iinfo(labels.dtype).max:
            labels = labels.astype(np.int64)
    return labels + offset


def combine_datasets(data_paths):
    """
    Combine multiple datasets from a list of file paths, adjusting labels to ensure uniqueness across all datasets.
    Handles both training and test data.

    The labels of the first dataset are kept as they are. The labels of every following
    dataset are shifted past the largest label used so far (looking at both the training
    and the test split), so two datasets can never share a label value even when a
    dataset's labels have gaps or a class only occurs in its test split.

    Args:
    data_paths: Iterable of strings, each containing the file path to a .npz dataset with the
                keys 'train_images', 'train_labels', 'test_images' and 'test_labels'.
                Labels are expected to be non-negative integer class ids.

    Returns:
    combined_train_images: Numpy array of combined training images from all datasets.
    combined_train_labels: Numpy array of combined and adjusted training labels from all datasets.
    combined_test_images: Numpy array of combined test images from all datasets.
    combined_test_labels: Numpy array of combined and adjusted test labels from all datasets.

    Raises:
    ValueError: If data_paths is empty, or if the arrays cannot be concatenated because their
                shapes differ. Do not mix one channel and multi channel datasets
                (example: breastmnist and retinamnist).
    """
    combined_train_images = []
    combined_train_labels = []
    combined_test_images = []
    combined_test_labels = []
    label_offset = 0

    for path in data_paths:
        # An .npz archive stays open until it is closed explicitly; read the four
        # arrays into memory and release the file handle right away.
        with np.load(path) as data:
            train_images = data['train_images']
            train_labels = data['train_labels']
            test_images = data['test_images']
            test_labels = data['test_labels']

        combined_train_images.append(train_images)
        combined_test_images.append(test_images)

        # Adjust labels for uniqueness
        combined_train_labels.append(_shift_labels(train_labels, label_offset))
        combined_test_labels.append(_shift_labels(test_labels, label_offset))

        # Move the offset past the largest label of this dataset. Counting the unique
        # training labels is only correct when the labels are exactly 0..k-1 and all of
        # them appear in the training split.
        largest_per_split = [
            int(split.max()) for split in (train_labels, test_labels) if split.size
        ]
        if largest_per_split:
            label_offset += max(largest_per_split) + 1

    if not combined_train_images:
        raise ValueError("data_paths must contain at least one dataset path")

    # Combine all images and labels
    combined_train_images = np.concatenate(combined_train_images, axis=0)
    combined_train_labels = np.concatenate(combined_train_labels, axis=0)
    combined_test_images = np.concatenate(combined_test_images, axis=0)
    combined_test_labels = np.concatenate(combined_test_labels, axis=0)

    return combined_train_images, combined_train_labels, combined_test_images, combined_test_labels

In [2]:

# Example usage: combine two single-channel MedMNIST datasets.
# The files are looked up in the `.medmnist` folder of the current user's home
# directory instead of one machine's hard-coded absolute path, so the notebook
# also runs for other users.
MEDMNIST_DIR = os.path.expanduser('~/.medmnist')
dataset_paths = [
    os.path.join(MEDMNIST_DIR, 'pneumoniamnist.npz'),
    os.path.join(MEDMNIST_DIR, 'breastmnist.npz'),
    # Add more paths as needed
]

train_images, train_labels, test_images, test_labels = combine_datasets(dataset_paths)

In [3]:
# The dataset paths were declared and the datasets combined in the previous cell.
# Only show which files went into the combined arrays here, instead of declaring
# the same list again and loading and concatenating every dataset a second time.
print(dataset_paths)

['/home/localssk23/.medmnist/pneumoniamnist.npz', '/home/localssk23/.medmnist/breastmnist.npz']


In [4]:
# Report the shape of each combined array, one line per array.
shape_report = [
    f"Combined training images shape: {train_images.shape}",
    f"Combined training labels shape: {train_labels.shape}",
    f"Combined test images shape: {test_images.shape}",
    f"Combined test labels shape: {test_labels.shape}",
]
print("\n".join(shape_report))

Combined training images shape: (5254, 28, 28)
Combined training labels shape: (5254, 1)
Combined test images shape: (780, 28, 28)
Combined test labels shape: (780, 1)


In [5]:
# List the distinct label values present in the combined training set.
distinct_train_labels = np.unique(train_labels)
print(f"Unique training labels: {distinct_train_labels}")

Unique training labels: [0 1 2 3]
