# Convert MedMNIST Dataset

> https://huggingface.co/datasets/albertvillanova/medmnist-v2/resolve/main/medmnist-v2.py?download=true

In [1]:
# Human-readable class names for each MedMNIST dataset.
# Outer key: the .npz file stem (e.g. "pathmnist_224"), which the conversion
# cell derives from the file name. Inner key: the integer label rendered as a
# string. The conversion cell uses the mapped names as output folder names.
class_labels = {
    "pathmnist_224": {
        "0": "adipose",
        "1": "background",
        "2": "debris",
        "3": "lymphocytes",
        "4": "mucus",
        "5": "smooth muscle",
        "6": "normal colon mucosa",
        "7": "cancer-associated stroma",
        "8": "colorectal adenocarcinoma epithelium",
    },
    # NOTE(review): presumably a multi-label dataset (one indicator per
    # finding) -- confirm before relying on a single label index per image.
    "chestmnist_224": {
        "0": "atelectasis",
        "1": "cardiomegaly",
        "2": "effusion",
        "3": "infiltration",
        "4": "mass",
        "5": "nodule",
        "6": "pneumonia",
        "7": "pneumothorax",
        "8": "consolidation",
        "9": "edema",
        "10": "emphysema",
        "11": "fibrosis",
        "12": "pleural",
        "13": "hernia",
    },
    "dermamnist_224": {
        "0": "actinic keratoses and intraepithelial carcinoma",
        "1": "basal cell carcinoma",
        "2": "benign keratosis-like lesions",
        "3": "dermatofibroma",
        "4": "melanoma",
        "5": "melanocytic nevi",
        "6": "vascular lesions",
    },
    "octmnist_224": {
        "0": "choroidal neovascularization",
        "1": "diabetic macular edema",
        "2": "drusen",
        "3": "normal",
    },
    "pneumoniamnist_224": {
        "0": "normal",
        "1": "pneumonia",
    },
    # No descriptive names here: every label maps to its own number.
    "retinamnist_224": {
        "0": "0",
        "1": "1",
        "2": "2",
        "3": "3",
        "4": "4",
    },
    "breastmnist_224": {
        "0": "malignant",
        "1": "normal, benign",
    },
    "bloodmnist_224": {
        "0": "basophil",
        "1": "eosinophil",
        "2": "erythroblast",
        "3": "immature_granulocytes",  # (myelocytes, metamyelocytes and promyelocytes)
        "4": "lymphocyte",
        "5": "monocyte",
        "6": "neutrophil",
        "7": "platelet",
    },
    "tissuemnist_224": {
        "0": "Collecting Duct, Connecting Tubule",
        "1": "Distal Convoluted Tubule",
        "2": "Glomerular endothelial cells",
        "3": "Interstitial endothelial cells",
        "4": "Leukocytes",
        "5": "Podocytes",
        "6": "Proximal Tubule Segments",
        "7": "Thick Ascending Limb",
    },
    # The three organ*mnist datasets below share the same eleven class names.
    "organamnist_224": {
        "0": "bladder",
        "1": "femur-left",
        "2": "femur-right",
        "3": "heart",
        "4": "kidney-left",
        "5": "kidney-right",
        "6": "liver",
        "7": "lung-left",
        "8": "lung-right",
        "9": "pancreas",
        "10": "spleen",
    },
    "organcmnist_224": {
        "0": "bladder",
        "1": "femur-left",
        "2": "femur-right",
        "3": "heart",
        "4": "kidney-left",
        "5": "kidney-right",
        "6": "liver",
        "7": "lung-left",
        "8": "lung-right",
        "9": "pancreas",
        "10": "spleen",
    },
    "organsmnist_224": {
        "0": "bladder",
        "1": "femur-left",
        "2": "femur-right",
        "3": "heart",
        "4": "kidney-left",
        "5": "kidney-right",
        "6": "liver",
        "7": "lung-left",
        "8": "lung-right",
        "9": "pancreas",
        "10": "spleen",
    }
}

## Download

In [2]:
#!pip install medmnist

In [3]:
# Folder holding the MedMNIST .npz archives; the conversion cell walks it
# recursively, and the download cells use its train/val/test sub-folders as root.
# NOTE: absolute, machine-specific path -- adjust before running elsewhere.
npz_folder = "/Users/brusnto/Doutorado/XAI/ProjetoFinal/datasets/medmnist/raw"
# Destination root for the exported <dataset>/<split>/<class name>/<i>.png tree.
output_folder = "/Users/brusnto/Doutorado/XAI/ProjetoFinal/datasets/medmnist"

In [4]:
import os
import random
import numpy as np
from PIL import Image
from medmnist.info import INFO, HOMEPAGE, DEFAULT_ROOT

try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence


class MedMNIST(Sequence):
    '''Base class for MedMNIST datasets backed by a local `.npz` archive.

    Subclasses provide `flag` (the key into `INFO` and the stem of the archive
    file name), `available_sizes` (the sizes accepted besides the default 28)
    and `__getitem__`.
    '''

    flag = ...

    def __init__(
        self,
        split,
        transform=None,
        target_transform=None,
        download=False,
        as_rgb=False,
        root=DEFAULT_ROOT,
        size=None,
        mmap_mode=None
    ):
        '''
        Load one split from `<root>/<flag><size_flag>.npz`.

        Args:

            split (str, required): 'train', 'val' or 'test'
            transform (callable, optional): data transformation
            target_transform (callable, optional): target transformation
            download (bool, optional): If True, call `download()` before looking for the archive. Default: False.
            as_rgb (bool, optional): Stored on the instance for use by `__getitem__` in subclasses. Default: False.
            root (str, optional): Existing directory that holds (or will receive) the archive.
            size (int, optional): The size of the returned images. If None, use MNIST-like 28. Default: None.
            mmap_mode (str, optional): Passed to `np.load` unchanged. Default: None.

        Raises:

            AssertionError: `size` is neither None/28 nor listed in `available_sizes`.
            RuntimeError: `root` is None or does not exist, or the archive is missing.
            ValueError: `split` is not one of 'train', 'val', 'test'.

        '''

        if (size is None) or (size == 28):
            self.size = 28
            self.size_flag = ""
        else:
            assert size in self.available_sizes, (
                f"size must be one of {self.available_sizes}, got {size!r}")
            self.size = size
            self.size_flag = f"_{size}"

        self.info = INFO[self.flag]

        if root is not None and os.path.exists(root):
            self.root = root
        else:
            raise RuntimeError("Failed to setup the default `root` directory. " +
                               "Please specify and create the `root` directory manually.")

        if download:
            self.download()

        # The archive name must carry the size suffix: `download()` writes
        # `<flag><size_flag>.npz`, so checking for `<flag>.npz` would reject a
        # freshly downloaded non-28 archive.
        npz_path = os.path.join(self.root, f"{self.flag}{self.size_flag}.npz")

        if not os.path.exists(npz_path):
            raise RuntimeError('Dataset not found.' +
                               ' You can use download=True to download it')

        self.split = split
        self.transform = transform
        self.target_transform = target_transform
        self.as_rgb = as_rgb

        # Read the two arrays that are needed, then release the file handle.
        with np.load(npz_path, mmap_mode=mmap_mode) as npz_file:
            if self.split in ["train", "val", "test"]:
                self.imgs = npz_file[f"{self.split}_images"]
                self.labels = npz_file[f"{self.split}_labels"]
            else:
                raise ValueError(
                    f"split must be 'train', 'val' or 'test', got {self.split!r}")

    def __len__(self):
        '''Return the number of samples in the loaded split.'''
        return self.imgs.shape[0]

    def __repr__(self):
        '''Multi-line summary of the dataset (adapted from torchvision).'''
        _repr_indent = 4
        head = f"Dataset {self.__class__.__name__} ({self.flag})"
        body = [f"Number of datapoints: {len(self)}"]
        body.append(f"Root location: {self.root}")
        body.append(f"Split: {self.split}")
        body.append(f"Task: {self.info['task']}")
        body.append(f"Number of channels: {self.info['n_channels']}")
        body.append(f"Meaning of labels: {self.info['label']}")
        body.append(f"Number of samples: {self.info['n_samples']}")
        body.append(f"Description: {self.info['description']}")
        body.append(f"License: {self.info['license']}")

        lines = [head] + [" " * _repr_indent + line for line in body]
        return '\n'.join(lines)

    def download(self):
        '''Fetch `<flag><size_flag>.npz` into `self.root` via torchvision.

        Raises:
            RuntimeError: the import, download or checksum step failed; the
                original exception is chained as the cause.
        '''
        try:
            from torchvision.datasets.utils import download_url

            download_url(
                url=self.info[f"url{self.size_flag}"],
                root=self.root,
                filename=f"{self.flag}{self.size_flag}.npz",
                md5=self.info[f"MD5{self.size_flag}"],
            )
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # propagate instead of being reported as a download failure.
            raise RuntimeError('Something went wrong when downloading! ' +
                               'Go to the homepage to download manually. ' +
                               HOMEPAGE) from err

    @staticmethod
    def _collate_fn(data):
        '''Stack an iterable of `(x, y)` samples into two arrays `(xs, ys)`.'''
        xs = []
        ys = []
        for x, y in data:
            xs.append(np.array(x))
            ys.append(y)
        return np.array(xs), np.array(ys)


class MedMNIST2D(MedMNIST):
    '''2D MedMNIST datasets: samples are returned as PIL images.'''

    available_sizes = [28, 64, 128, 224]

    def __getitem__(self, index):
        '''
        Fetch the sample at `index`.

        return: (without transform/target_transform)
            img: PIL.Image (converted to RGB when `as_rgb` is set)
            target: the label entry cast to int
        '''
        raw_img = self.imgs[index]
        target = self.labels[index].astype(int)

        img = Image.fromarray(raw_img)
        img = img.convert('RGB') if self.as_rgb else img

        transform, target_transform = self.transform, self.target_transform
        if transform is not None:
            img = transform(img)
        if target_transform is not None:
            target = target_transform(target)

        return img, target

    def save(self, folder, postfix="png", write_csv=True):
        '''Export the loaded split through `medmnist.utils.save2d`.

        Images go under `<folder>/<flag><size_flag>`; a CSV path
        `<folder>/<flag><size_flag>.csv` is passed along unless `write_csv`
        is false.
        '''
        from medmnist.utils import save2d

        stem = f"{self.flag}{self.size_flag}"
        csv_target = None
        if write_csv:
            csv_target = os.path.join(folder, f"{stem}.csv")

        save2d(
            imgs=self.imgs,
            labels=self.labels,
            img_folder=os.path.join(folder, stem),
            split=self.split,
            postfix=postfix,
            csv_path=csv_target,
        )

    def montage(self, length=20, replace=False, save_folder=None):
        '''Build a montage from `length * length` randomly chosen images.

        When `save_folder` is given, the folder is created if missing and the
        montage is also written there as a JPEG. The montage is returned.
        '''
        from medmnist.utils import montage2d

        picked = np.random.choice(len(self), size=length * length, replace=replace)
        montage_img = montage2d(
            imgs=self.imgs,
            n_channels=self.info['n_channels'],
            sel=picked,
        )

        if save_folder is None:
            return montage_img

        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        out_name = f"{self.flag}{self.size_flag}_{self.split}_montage.jpg"
        montage_img.save(os.path.join(save_folder, out_name))

        return montage_img


class MedMNIST3D(MedMNIST):
    '''3D MedMNIST datasets: samples are returned as channel-first arrays.'''

    available_sizes = [28, 64]

    def __getitem__(self, index):
        '''
        Fetch the sample at `index`.

        return: (without transform/target_transform)
            img: the volume divided by 255 and stacked along a new leading
                 axis, 3 copies if `as_rgb` is set, otherwise 1
            target: the label entry cast to int
        '''
        volume = self.imgs[index]
        target = self.labels[index].astype(int)

        n_copies = 3 if self.as_rgb else 1
        img = np.stack([volume / 255.] * n_copies, axis=0)

        transform, target_transform = self.transform, self.target_transform
        if transform is not None:
            img = transform(img)
        if target_transform is not None:
            target = target_transform(target)

        return img, target

    def save(self, folder, postfix="gif", write_csv=True):
        '''Export the loaded split through `medmnist.utils.save3d`.

        Only `postfix="gif"` is accepted. Images go under
        `<folder>/<flag><size_flag>`; a CSV path
        `<folder>/<flag><size_flag>.csv` is passed along unless `write_csv`
        is false.
        '''
        from medmnist.utils import save3d

        assert postfix == "gif"

        stem = f"{self.flag}{self.size_flag}"
        csv_target = None
        if write_csv:
            csv_target = os.path.join(folder, f"{stem}.csv")

        save3d(
            imgs=self.imgs,
            labels=self.labels,
            img_folder=os.path.join(folder, stem),
            split=self.split,
            postfix=postfix,
            csv_path=csv_target,
        )

    def montage(self, length=20, replace=False, save_folder=None):
        '''Build montage frames from `length * length` randomly chosen volumes.

        Requires a single-channel dataset. When `save_folder` is given, the
        folder is created if missing and the frames are also written there as
        a GIF. The frames are returned.
        '''
        assert self.info['n_channels'] == 1

        from medmnist.utils import montage3d, save_frames_as_gif

        picked = np.random.choice(len(self), size=length * length, replace=replace)
        montage_frames = montage3d(
            imgs=self.imgs,
            n_channels=self.info['n_channels'],
            sel=picked,
        )

        if save_folder is None:
            return montage_frames

        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        out_name = f"{self.flag}{self.size_flag}_{self.split}_montage.gif"
        save_frames_as_gif(montage_frames, os.path.join(save_folder, out_name))

        return montage_frames


# Concrete 2D datasets. Each subclass only sets `flag`, which the base class
# uses as the key into `INFO` and as the stem of the `.npz` archive name.
class PathMNIST(MedMNIST2D):
    flag = "pathmnist"


class OCTMNIST(MedMNIST2D):
    flag = "octmnist"


class PneumoniaMNIST(MedMNIST2D):
    flag = "pneumoniamnist"


class ChestMNIST(MedMNIST2D):
    flag = "chestmnist"


class DermaMNIST(MedMNIST2D):
    flag = "dermamnist"


class RetinaMNIST(MedMNIST2D):
    flag = "retinamnist"


class BreastMNIST(MedMNIST2D):
    flag = "breastmnist"


class BloodMNIST(MedMNIST2D):
    flag = "bloodmnist"


class TissueMNIST(MedMNIST2D):
    flag = "tissuemnist"


class OrganAMNIST(MedMNIST2D):
    flag = "organamnist"


class OrganCMNIST(MedMNIST2D):
    flag = "organcmnist"


class OrganSMNIST(MedMNIST2D):
    flag = "organsmnist"


# Concrete 3D datasets; same pattern, built on MedMNIST3D.
class OrganMNIST3D(MedMNIST3D):
    flag = "organmnist3d"


class NoduleMNIST3D(MedMNIST3D):
    flag = "nodulemnist3d"


class AdrenalMNIST3D(MedMNIST3D):
    flag = "adrenalmnist3d"


class FractureMNIST3D(MedMNIST3D):
    flag = "fracturemnist3d"


class VesselMNIST3D(MedMNIST3D):
    flag = "vesselmnist3d"


class SynapseMNIST3D(MedMNIST3D):
    flag = "synapsemnist3d"


# Backward-compatible aliases for the three organ datasets (presumably the
# class names used by earlier releases -- TODO confirm).
OrganMNISTAxial = OrganAMNIST
OrganMNISTCoronal = OrganCMNIST
OrganMNISTSagittal = OrganSMNIST


def get_loader(dataset, batch_size):
    '''Yield collated batches of `batch_size` samples forever.

    Prints the dataset size once, then draws indices from an endlessly
    reshuffled permutation and collates each batch with
    `dataset._collate_fn`.
    '''
    n_items = len(dataset)
    print('Size', n_items)
    indices = shuffle_iterator(range(n_items))
    while True:
        batch = [dataset[next(indices)] for _ in range(batch_size)]
        yield dataset._collate_fn(batch)


def shuffle_iterator(iterator):
    '''Yield the items of a finite iterable in random order, forever.

    The items are shuffled once up front and reshuffled each time a full
    pass has been handed out.
    '''
    pool = list(iterator)
    pool_size = len(pool)
    random.shuffle(pool)
    cursor = 0
    while True:
        yield pool[cursor]
        # Wrap around after the last item and start a freshly shuffled pass.
        cursor = (cursor + 1) % pool_size
        if cursor == 0:
            random.shuffle(pool)

In [None]:
# BloodMNIST(
#     split='train',
#     root=npz_folder + '/train',
#     transform=None,
#     download=True,
#     as_rgb=True,
#     size=224,
# )

In [None]:
# BloodMNIST(
#     split='val',
#     root=npz_folder + '/val',
#     transform=None,
#     download=True,
#     as_rgb=True,
#     size=224,
# )

In [None]:
# BloodMNIST(
#     split='test',
#     root=npz_folder + '/test',
#     transform=None,
#     download=True,
#     as_rgb=True,
#     size=224,
# )

In [None]:
# Build the 224-pixel PneumoniaMNIST training split with download enabled.
# The constructor requires the root folder (`<npz_folder>/train`) to exist already.
PneumoniaMNIST(
    split='train',
    root=npz_folder + '/train',
    transform=None,
    download=True,
    as_rgb=True,
    size=224,
)

In [None]:
# Build the 224-pixel PneumoniaMNIST validation split with download enabled.
# The constructor requires the root folder (`<npz_folder>/val`) to exist already.
PneumoniaMNIST(
    split='val',
    root=npz_folder + '/val',
    transform=None,
    download=True,
    as_rgb=True,
    size=224,
)

In [None]:
# Build the 224-pixel PneumoniaMNIST test split with download enabled.
# The constructor requires the root folder (`<npz_folder>/test`) to exist already.
PneumoniaMNIST(
    split='test',
    root=npz_folder + '/test',
    transform=None,
    download=True,
    as_rgb=True,
    size=224,
)

## Convert

In [5]:
import os
import numpy as np
from PIL import Image
from tqdm import tqdm

def _array_to_image(image):
    """Convert one image array to a PIL image, handling single-channel input."""
    # PIL cannot build an image from an (H, W, 1) array, so the singleton
    # channel axis has to be dropped *before* calling Image.fromarray.
    if image.ndim == 3 and image.shape[-1] == 1:
        image = image[..., 0]
    img = Image.fromarray(image)
    if image.ndim == 2:  # Grayscale: make sure the saved PNG is mode 'L'
        img = img.convert('L')
    return img


# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Walk npz_folder recursively and export every .npz archive found as
# <output_folder>/<dataset>/<split>/<class name>/<index>.png.
# NOTE: archives with the same file name in different sub-folders map to the
# same output folder and are each processed in turn, overwriting earlier files.
for root, dirs, files in os.walk(npz_folder):
    for file in files:
        if not file.endswith('.npz'):  # Only .npz archives are converted
            continue

        dataset_name = os.path.splitext(file)[0]  # Get the name without .npz
        print(f"Processing dataset: {dataset_name}")

        # Look the label names up once per archive; an unknown dataset name
        # raises KeyError here, before any folder is created for it.
        label_names = class_labels[dataset_name]

        # Create a folder for the dataset
        dataset_folder = os.path.join(output_folder, dataset_name)
        os.makedirs(dataset_folder, exist_ok=True)

        npz_file_path = os.path.abspath(os.path.join(root, file))

        # Load the .npz file; the context manager closes the archive afterwards
        with np.load(npz_file_path) as data:
            # Loop through train, val, and test splits
            for split in ['train', 'val', 'test']:
                images_key = f"{split}_images"
                labels_key = f"{split}_labels"

                if images_key not in data or labels_key not in data:
                    continue

                images = data[images_key]
                labels = data[labels_key]

                # Create a folder for the split
                split_folder = os.path.join(dataset_folder, split)
                os.makedirs(split_folder, exist_ok=True)

                # Save each image as a .png file with tqdm for progress
                for i, image in enumerate(tqdm(images, desc=f"Saving {split} images", leave=False)):
                    # Labels are stored one row per image; only the first
                    # entry of the row is used.
                    # NOTE(review): for multi-label datasets this keeps just
                    # the first indicator -- confirm that is intended.
                    label = int(labels[i][0])
                    label_converted = label_names[str(label)]
                    label_folder = os.path.join(split_folder, str(label_converted))
                    os.makedirs(label_folder, exist_ok=True)

                    # Convert the image to a PIL Image and save it
                    img = _array_to_image(image)
                    img.save(os.path.join(label_folder, f"{i}.png"))

print("Conversion complete!")

Processing dataset: pneumoniamnist_224


                                                                                

Processing dataset: pneumoniamnist_224


                                                                                

Processing dataset: pneumoniamnist_224


                                                                                

Conversion complete!


