### Notes on pytorch data loading



In [None]:
""" load cifar-10 dataset """ 

import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms

# Preprocessing: convert each image to a tensor, then normalize all three
# channels with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Training split, read from ./data (download=False, so the files must already exist).
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=False,
    transform=transform,
)
trainloader_cifar10 = torch.utils.data.DataLoader(
    trainset,
    batch_size=4,
    shuffle=True,       # reshuffled sample order for training
    num_workers=2,
)

# Test split, same preprocessing, fixed order (shuffle=False).
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=False,
    transform=transform,
)
testloader_cifar10 = torch.utils.data.DataLoader(
    testset,
    batch_size=4,
    shuffle=False,
    num_workers=2,
)

# Human-readable class names, positioned by label index.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
""" select the labels of two random data points from trainset """

# perm is a 1-D tensor holding a random permutation of the indices 0 .. len(trainset.data) - 1
perm = torch.randperm(len(trainset.data))
# torch.tensor (lowercase) keeps the integer dtype of the class ids;
# torch.Tensor(...) would silently cast them to float32
torch.tensor(trainset.targets)[perm][:2]

In [None]:
""" select a random batch from dataloader """

# the sample returned is random b.c. shuffle=True in DataLoader call
# next(iter(trainloader)) returns a list of two tensors ([0]: images, [1]: labels)
# use the built-in next(): the Python-2-style .next() method is not available
# on DataLoader iterators in recent PyTorch releases
images, labels = next(iter(trainloader_cifar10))

Note the difference between the two methods:

* using trainset.data returns the raw data points without transformations: the transform passed to the CIFAR10 constructor is applied lazily in `__getitem__`, so it only runs when samples are fetched via `trainset[idx]` or through a DataLoader

In [None]:
'''
    MNIST is a Dataset object, can be accessed by __getitem__ method
'''

import torch
import torchvision
import torchvision.transforms as transforms

# ToTensor is the only preprocessing step in this pipeline.
transform = transforms.Compose([transforms.ToTensor()])

# MNIST training split, read from ./data (download=False).
trainset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=False,
    transform=transform,
)
trainloader_mnist = torch.utils.data.DataLoader(
    trainset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
)

# trainset[0] is dispatched to Dataset.__getitem__(0); element [0] of that
# sample is the tensor written out as an image file.
torchvision.utils.save_image(tensor=trainset[0][0], fp='./mnist_sample.jpg')

In [None]:
""" convert dataloder obj to an iteratable obj """
# iter() turns the DataLoader into an iterator; keeping it in a variable lets
# later cells pull successive batches from the same iterator
iterable_cifar10 = iter(trainloader_cifar10)

In [None]:
""" iterate over a dataloader """

for _ in range(5):
    # built-in next() instead of the .next() method, which DataLoader
    # iterators in recent PyTorch releases no longer provide
    images, labels = next(iterable_cifar10)
    print(labels)       # note that labels are distinct for each iteration in the loop

In [None]:
""" create DataLoader object from tensors """

import torch
from torch.utils.data import DataLoader

# A plain Python list serves as the dataset: three all-zero 3x3x3 tensors.
# No batch_size is passed, so the loader's default batching applies.
dl = DataLoader(dataset=[torch.zeros(3, 3, 3) for _ in range(3)])

### DataLoader(dataset, ...)
* takes only one mandatory argument, dataset
* dataset can be either a map-style or an iterable-style object
    * see [pytorch docs](https://pytorch.org/docs/stable/data.html#map-style-datasets) for details
    * in practice:
        * a map-style object is one that can be accessed by `dataset[idx]`
        * an iterable-style object is one that can be accessed by `next(iter(dataset))`
    * so any Python built-in or custom class object can be used, as long as the corresponding methods (i.e., `__getitem__()` and `__len__()` for map-style & `__iter__()` for iterable-style)
      are implemented properly (note: DataLoader only treats subclasses of `torch.utils.data.IterableDataset` as iterable-style)
        * e.g., a list is a good map-style example
* with this mechanism, any trainer function can be defaulted to always accept a DataLoader object as argument
    * in practice, can feed to this argument any type of DataLoader that is constructed from a standard dataset, custom dataset, or even small tensors, np arrays for testing

In [None]:
import torch
import torchvision
from torchvision.transforms import transforms
from torch.utils.data import DataLoader


from itertools import islice

# ten all-zero 3x3x3 tensors wrapped in a DataLoader (default batching)
dl = DataLoader([torch.zeros(3, 3, 3) for _ in range(10)])

# islice stops the loop after 5 batches without fetching a 6th; enumerate
# already supplies the running index, so no extra zip/list counter is needed
for idx, batch in islice(enumerate(dl), 5):
    print(idx)
    print(batch)
    

In [None]:
""" use the regular enumerate(dataloader) syntax but only train for a fraction of an epoch """

import torchvision
from torchvision.transforms import transforms
from torch.utils.data import DataLoader

from itertools import islice

# convert to tensor, then normalize the three channels with mean 0.5 / std 0.5
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)

# load train set
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=transform)
trainloader = DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2)

# keep the usual enumerate(dataloader) loop, but let islice end it after 5
# batches; enumerate already provides the index, so the extra
# zip([e for e in range(5)], ...) counter is unnecessary
for idx, (batch, labels) in islice(enumerate(trainloader), 5):
    print(idx)
    print(batch)

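* always call iter(dataloader) once, outside the for loop, for better performance: every iter() call builds a new iterator (and, with num_workers > 0, starts new worker processes)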
* see [this post](https://github.com/pytorch/pytorch/issues/1917#issuecomment-433698337) and [this post](https://stackoverflow.com/questions/53280967/pytorch-nextitertraining-loader-extremely-slow-simple-data-cant-num-worke) for details

In [None]:
""" get one batch from a standard dataset's dataloader & make it into a DataLoader object that iterates over the same batch """

import torchvision
from torchvision.transforms import transforms
from torch.utils.data import DataLoader

# convert to tensor, then normalize the three channels with mean 0.5 / std 0.5
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)

# load train set
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=transform)
trainloader = DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2)

# grab one (data, labels) batch and repeat it three times as a tiny dataset
batch = next(iter(trainloader))
# batch_size=None disables automatic batching; with the default batch_size=1
# every item would gain an extra leading batch dimension and no longer be
# the same batch that was sampled above
batch_dl = DataLoader([batch for _ in range(3)], batch_size=None)
batch_iter = iter(batch_dl)   # create the iterator once, outside the loop
for _ in range(3):
    data, labels = next(batch_iter)
    print(labels)

In [None]:
""" get a random sub-set of samples from a single batch & return as a new iterator over this sampled batch """

import numpy as np

import torchvision
from torchvision.transforms import transforms
from torch.utils.data import DataLoader

# convert to tensor, then normalize the three channels with mean 0.5 / std 0.5
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)

# load train set
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=transform)
trainloader = DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2)

batch = next(iter(trainloader))

data, labels = batch

# boolean mask with (up to) num_keep True entries at random positions; its
# length comes from the batch itself rather than a hard-coded 4
num_keep = 3
arr = np.zeros(len(labels), dtype=bool)
arr[:num_keep] = True
np.random.shuffle(arr)

# apply the same mask to data and labels so the pairs stay aligned
tup = (data[arr], labels[arr])
dl = DataLoader([tup for _ in range(5)], batch_size=None)   # set batch_size=None to disable automatic batching (by default an extra batch dim is added)
sampled_batch = next(iter(dl))