# VisionDatasetDumper Demo
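This notebook demonstrates how `VisionDatasetDumper` downloads torchvision datasets (CIFAR10, SVHN, MNIST, EMNIST) and stores their images in a folder structure on disk.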

# Prerequisites

In [1]:
import numpy as np
from pathlib import Path
from collections import Counter
import random

import torch
from torchvision.datasets import CIFAR10, SVHN, MNIST, EMNIST
from torchvision.transforms import Compose, ToTensor

from hyperpyper.utils import DataSetDumper, VisionDatasetDumper
from hyperpyper.utils import FolderScanner as fs
from hyperpyper.transforms import PILTranspose


In [2]:
# Base folder for all datasets, located below the user's home directory.
ROOT_PATH = Path.home().joinpath("Downloads", "data")

# Folder holding the CIFAR10 download.
DATA_PATH = ROOT_PATH.joinpath("CIFAR10")

# Destination folders for the dumped test and train images.
DATA_PATH_TEST = DATA_PATH / "test"
DATA_PATH_TRAIN = DATA_PATH / "train"

## Create CIFAR10 dataset organized in subfolders indicating class
The `VisionDatasetDumper` handles the download and creates a folder structure in which the images are stored, with one subfolder per class. These folders can then be used as the starting point for experiments. The dataset object returned by the `VisionDatasetDumper` is only needed to extract the class labels, so that they can be matched with the class indices.

In [3]:
# Dump the CIFAR10 training split into DATA_PATH_TRAIN and keep the returned dataset.
train_dataset = VisionDatasetDumper(
    CIFAR10,
    root=DATA_PATH,
    dst=DATA_PATH_TRAIN,
    train=True,
).dump()

Files already downloaded and verified


### Retrieve a list of files

In [4]:
# List the dumped PNG files below the training folder (recursive, so files in
# the per-class subfolders are included).
train_files = fs.get_files(DATA_PATH_TRAIN, extensions='.png', recursive=True)

# Select some random items.
# - A dedicated, seeded generator keeps the selection reproducible after
#   "Restart Kernel -> Run All" without touching the global `random` state.
# - Sorting first makes the result independent of the order in which the
#   file system happens to list the files.
# - The sample size is capped so that a folder with fewer than 5 images does
#   not raise ValueError.
sample_rng = random.Random(42)
sample_size = min(5, len(train_files))
selected_files = sample_rng.sample(sorted(train_files), sample_size)
selected_files

[WindowsPath('C:/Users/bernh/Downloads/data/CIFAR10/train/2/18026.png'),
 WindowsPath('C:/Users/bernh/Downloads/data/CIFAR10/train/6/22439.png'),
 WindowsPath('C:/Users/bernh/Downloads/data/CIFAR10/train/4/10150.png'),
 WindowsPath('C:/Users/bernh/Downloads/data/CIFAR10/train/7/15772.png'),
 WindowsPath('C:/Users/bernh/Downloads/data/CIFAR10/train/7/4650.png')]

In [5]:
# Point the working folders at the SVHN dataset.
DATA_PATH = ROOT_PATH.joinpath("SVHN")

DATA_PATH_TEST = DATA_PATH / "test"
DATA_PATH_TRAIN = DATA_PATH / "train"

# Dump the SVHN training split into DATA_PATH_TRAIN and keep the returned dataset.
train_dataset = VisionDatasetDumper(
    SVHN,
    root=DATA_PATH,
    dst=DATA_PATH_TRAIN,
    train=True,
).dump()

Using downloaded and verified file: C:\Users\bernh\Downloads\data\SVHN\train_32x32.mat


In [6]:
# Point the working folders at the MNIST dataset.
DATA_PATH = ROOT_PATH.joinpath("MNIST")

DATA_PATH_TEST = DATA_PATH / "test"
DATA_PATH_TRAIN = DATA_PATH / "train"

# Dump the MNIST training split into DATA_PATH_TRAIN and keep the returned dataset.
train_dataset = VisionDatasetDumper(
    MNIST,
    root=DATA_PATH,
    dst=DATA_PATH_TRAIN,
    train=True,
).dump()

In [7]:
# Point the working folders at the EMNIST dataset.
DATA_PATH = ROOT_PATH.joinpath("EMNIST")

DATA_PATH_TEST = DATA_PATH / "test"
DATA_PATH_TRAIN = DATA_PATH / "train"

# Transform pipeline handed to the dataset: PILTranspose followed by ToTensor.
# NOTE(review): PILTranspose presumably corrects the orientation of the EMNIST
# images -- confirm against hyperpyper.transforms.
transform = Compose(
    [
        PILTranspose(),
        ToTensor(),
    ]
)

# Dump the 'letters' split of the EMNIST training data into DATA_PATH_TRAIN.
train_dataset = VisionDatasetDumper(
    EMNIST,
    root=DATA_PATH,
    dst=DATA_PATH_TRAIN,
    split='letters',
    train=True,
    transform=transform,
).dump()

In [8]:
# Display the summary (repr) of the dataset object returned by the last dump() call.
train_dataset

Dataset EMNIST
    Number of datapoints: 124800
    Root location: C:\Users\bernh\Downloads\data\EMNIST
    Split: Train
    StandardTransform
Transform: Compose(
               PILTranspose()
               ToTensor()
           )