In [2]:
# https://www.scottcondron.com/jupyter/visualisation/audio/2020/12/02/dataloaders-samplers-collate.html
# But what are PyTorch DataLoaders really

#pytorch tutorial from raw
# https://pytorch.org/tutorials/beginner/nn_tutorial.html


In [3]:
import os
import json
import matplotlib.pyplot as plt 
import matplotlib.image as image 
import numpy as np
import pandas as pd
import albumentations as A
import albumentations.pytorch
import cv2
import math

import torch
from pytorch_lightning import LightningModule, Trainer
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchmetrics import Accuracy
from torchvision import transforms
from torchvision.datasets import MNIST

import sys
sys.path.append('../')
from utils.dataset import *
# from train import PapsClsModel


A quick refresher: PyTorch Datasets are just things that have a length and are indexable so that len(dataset) will work and dataset[index] will return a tuple of (x,y).


In [19]:
# Toy inputs 1..10 and targets 11..20: every target is its input plus 10,
# which makes it easy to check by eye that (x, y) pairs stay together.
X = list(range(1, 11))
Y = list(range(11,21))
print('xs values: ', X)
print('ys values: ', Y)

xs values:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
ys values:  [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [20]:
# A plain list of (x, y) tuples already supports len() and integer indexing.
dataset = list(zip(X, Y))

In [21]:
type(zip(X,Y))

zip

In [22]:
# for x, y in zip(X, Y) :
#     print(x, y)

In [23]:
dataset[5]

(6, 16)

In [24]:
len(dataset)

10

In [25]:
class ToyDataset :
    """Minimal indexable dataset: item ``i`` is the pair ``(X[i], Y[i])``.

    Only ``__getitem__`` and ``__len__`` are implemented; the length is
    taken from ``X`` alone.
    """

    def __init__(self, X, Y) :
        # Keep references to both sequences; nothing is copied.
        self.X, self.Y = X, Y

    def __len__(self) :
        return len(self.X)

    def __getitem__(self, i) :
        # Index both sequences with the same key and hand back a tuple.
        sample = self.X[i]
        target = self.Y[i]
        return sample, target

In [26]:
# Rebind `dataset` to the class-based version; indexing now goes through
# ToyDataset.__getitem__.
dataset = ToyDataset(X, Y)
dataset[1]

(2, 12)

The len() function calls the __len__() method defined on the object's class.
Built-in containers such as lists and dictionaries already define __len__, which is why len(some_list) works.
A custom class has to define its own __len__ method before len() can be used on it.

In [27]:
len(dataset)

10

In [28]:
from torch.utils.data import DataLoader

# batch_size=2 groups the samples into pairs; shuffle=True randomises the
# order in which they are visited (no seed is set in this cell).
for x, y in DataLoader(dataset, batch_size=2, shuffle=True) :
    print(x, y)

tensor([7, 6]) tensor([17, 16])
tensor([4, 5]) tensor([14, 15])
tensor([ 3, 10]) tensor([13, 20])
tensor([1, 9]) tensor([11, 19])
tensor([2, 8]) tensor([12, 18])


Every DataLoader has a Sampler which is used internally to get the indices for each batch. Each index is used to index into your Dataset to grab the data (x, y). You can ignore this for now, but DataLoaders also have a batch_sampler which returns the indices for each batch in a list if batch_size is greater than 1.


SequentialSampler

In [30]:
# Grab the sampler a DataLoader creates when neither `sampler` nor `shuffle` is passed.
default_sampler = DataLoader(dataset).sampler

In [31]:
for i in default_sampler :
    print(i)

0
1
2
3
4
5
6
7
8
9


In [32]:
type(default_sampler)

torch.utils.data.sampler.SequentialSampler

In [33]:
from torch.utils.data.sampler import SequentialSampler
# Build the same kind of sampler by hand. Iterating it produces indices
# 0..len(dataset)-1 in order, not the data itself.
sampler = SequentialSampler(dataset)

for x in sampler :
    print (x)

0
1
2
3
4
5
6
7
8
9


RandomSampler

In [34]:
# With shuffle=True the DataLoader's sampler yields the indices in a random order.
random_sampler = DataLoader(dataset, shuffle=True).sampler

for index in random_sampler :
    print(index)

1
2
5
7
8
3
4
6
9
0


In [35]:
type(random_sampler)

torch.utils.data.sampler.RandomSampler

In [37]:
from torch.utils.data.sampler import RandomSampler

# Construct the random sampler directly instead of pulling it out of a DataLoader.
random_sampler = RandomSampler(dataset)

# Each index is printed exactly once, in a random order.
for x in random_sampler :
    print(x)

0
3
9
6
5
7
2
1
4
8


In [38]:
# Pass an explicit sampler object instead of shuffle=True; iterating
# dl.sampler gives a random permutation of the dataset indices.
dl = DataLoader(dataset, sampler=random_sampler)
for i in dl.sampler :
    print(i)

7
9
4
1
0
6
8
2
5
3


So we've seen that every DataLoader has a sampler internally which is either SequentialSampler or RandomSampler depending on the value of shuffle, and these are iterated over to get the indices of the Dataset to use.

In [46]:
import random
from torch.utils.data.sampler import Sampler

class HalvesSampler(Sampler) :
    """Sampler that walks a shuffled first half of the dataset, then a shuffled second half.

    Indices below ``len(dataset) // 2`` form the first half; the rest form
    the second half (which is one longer when the length is odd).
    """

    def __init__(self, dataset) :
        total = len(dataset)
        self.half = total // 2
        self.first_indices = [*range(self.half)]
        self.sec_indices = [*range(self.half, total)]

    def __iter__(self) :
        # Shuffle each half in place (first half first, then the second),
        # so the stored lists keep the most recent order.
        for part in (self.first_indices, self.sec_indices) :
            random.shuffle(part)

        # Every first-half index is emitted before any second-half index.
        return iter([*self.first_indices, *self.sec_indices])

    def __len__(self) :
        return sum(map(len, (self.first_indices, self.sec_indices)))



In [43]:
# Before any iteration the two halves are still in their original, unshuffled order.
my_sampler = HalvesSampler(dataset)
print(my_sampler.first_indices)
print(my_sampler.sec_indices)

[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]


In [44]:
for i in my_sampler :
    print(i)

2
0
4
1
3
6
9
8
7
5


In [45]:
# No batch_size is given, so each printed batch holds a single sample.
# The x values 1-5 (first half) all appear before 6-10 (second half).
dl = DataLoader(dataset, sampler=my_sampler)
for x, y in dl:
    print(x, y)

tensor([1]) tensor([11])
tensor([4]) tensor([14])
tensor([3]) tensor([13])
tensor([5]) tensor([15])
tensor([2]) tensor([12])
tensor([10]) tensor([20])
tensor([7]) tensor([17])
tensor([9]) tensor([19])
tensor([8]) tensor([18])
tensor([6]) tensor([16])


In [47]:
# batch_size=7 is larger than one half (5 indices), so the first batch holds
# every first-half sample plus two samples from the second half.
batch_size=7
dl = DataLoader(dataset, batch_size=batch_size, sampler=my_sampler)

for x, y in dl:
    print(x,y)

tensor([ 4,  5,  2,  3,  1,  7, 10]) tensor([14, 15, 12, 13, 11, 17, 20])
tensor([8, 6, 9]) tensor([18, 16, 19])


BatchSampler

In [49]:
# The batch_sampler groups the sampler's indices into lists of `batch_size`.
# The final list is shorter because 10 is not a multiple of 3.
batch_size = 3
default_batch_sampler = DataLoader(dataset, batch_size=batch_size).batch_sampler
for i, batch_indices in enumerate(default_batch_sampler) :
    print(i, batch_indices)


0 [0, 1, 2]
1 [3, 4, 5]
2 [6, 7, 8]
3 [9]


In [50]:
type(default_batch_sampler)

torch.utils.data.sampler.BatchSampler

In [51]:
from torch.utils.data.sampler import BatchSampler
print(BatchSampler.__doc__)

Wraps another sampler to yield a mini-batch of indices.

    Args:
        sampler (Sampler or Iterable): Base sampler. Can be any iterable object
        batch_size (int): Size of mini-batch.
        drop_last (bool): If ``True``, the sampler will drop the last batch if
            its size would be less than ``batch_size``

    Example:
        >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))
        [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
        >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True))
        [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    


In [53]:
# Wrap the custom HalvesSampler. BatchSampler only groups consecutive indices,
# so with 5 indices per half and batch_size=2 one batch ends up holding an
# index from each half.
batch_sampler = BatchSampler(my_sampler, batch_size=2, drop_last=False)
for i, batch_indices in enumerate(batch_sampler) :
    print(i, batch_indices)

0 [1, 0]
1 [3, 2]
2 [4, 8]
3 [7, 6]
4 [5, 9]


Custom Batch Sampler
Similar to a custom sampler, you can also create a batch_sampler. Why? If for some reason you wanted to only batch certain things together (like only if they're the same length), or if you wanted to show some examples more often than others, a custom BatchSampler is great for this.

To create a custom batch_sampler, we just do the same as we did with a custom Sampler but our iterator returns batches of indices, rather than individual indices.

Let's create a BatchSampler which only batches together values from the same half of our dataset, so a batch never mixes first-half and second-half items.

Note: a batch sampler could likewise group samples by aspect ratio or area, or compensate for class imbalance.

In [54]:
def chunk(indices, chunk_size):
    """Split ``indices`` into consecutive tensors of at most ``chunk_size`` elements.

    Returns a tuple of 1-D tensors; the last one is shorter when the number
    of indices is not a multiple of ``chunk_size``.
    """
    index_tensor = torch.tensor(indices)
    return index_tensor.split(chunk_size)

class EachHalfTogetherBatchSampler(Sampler):
    """Batch sampler whose batches never mix the two halves of the dataset.

    Indices below ``len(dataset) // 2`` form the first half and the rest form
    the second half. On every iteration each half is shuffled and cut into
    batches of at most ``batch_size`` indices, and the resulting batches are
    then shuffled as a whole. Each item yielded is a list of integer indices.

    Args:
        dataset: Any object supporting ``len()``; only its length is used.
        batch_size (int): Maximum number of indices per batch; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, dataset, batch_size):
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        halfway_point = len(dataset) // 2
        self.first_half_indices = list(range(halfway_point))
        self.second_half_indices = list(range(halfway_point, len(dataset)))
        self.batch_size = batch_size

    @staticmethod
    def _batches(indices, batch_size):
        # Cut a list of indices into consecutive batches. An empty list gives
        # no batches at all, so an empty half never yields an empty batch.
        return [indices[start:start + batch_size]
                for start in range(0, len(indices), batch_size)]

    def __iter__(self):
        # Shuffle inside each half first so batch membership changes per pass.
        random.shuffle(self.first_half_indices)
        random.shuffle(self.second_half_indices)
        combined = (self._batches(self.first_half_indices, self.batch_size)
                    + self._batches(self.second_half_indices, self.batch_size))
        # Then shuffle the batches so the halves are interleaved.
        random.shuffle(combined)
        return iter(combined)

    def __len__(self):
        # Each half is batched on its own, so each may end in a short batch.
        # Count ceil(len(half) / batch_size) per half so that len() agrees
        # with the number of batches __iter__ yields.
        halves = (self.first_half_indices, self.second_half_indices)
        return sum(-(-len(half) // self.batch_size) for half in halves)

In [55]:
# With batch_size=2 and 5 indices per half, each half contributes two full
# batches and one single-index batch; no batch mixes indices below 5 with
# indices 5 and above.
batch_size = 2
each_half_together_batch_sampler = EachHalfTogetherBatchSampler(dataset, batch_size)
for x in each_half_together_batch_sampler:
    print(x)

[9, 7]
[8, 6]
[0]
[2, 4]
[5]
[3, 1]


In [56]:
# batch_size is not passed here: the composition of every batch comes
# entirely from the lists of indices produced by the custom batch sampler.
for i, (xb,yb) in enumerate(DataLoader(dataset, batch_sampler=each_half_together_batch_sampler)):
    print(f'Batch #{i}. x{i}:', xb)
    print(f'          y{i}:', yb)

Batch #0. x0: tensor([8, 6])
          y0: tensor([18, 16])
Batch #1. x1: tensor([1])
          y1: tensor([11])
Batch #2. x2: tensor([10])
          y2: tensor([20])
Batch #3. x3: tensor([4, 3])
          y3: tensor([14, 13])
Batch #4. x4: tensor([2, 5])
          y4: tensor([12, 15])
Batch #5. x5: tensor([7, 9])
          y5: tensor([17, 19])
