In [2]:
import numpy as np
import torch as t
import torch.utils.data as td

In [3]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same random
# labels and the same shuffled orderings on every run.
SEED = 42
rng = np.random.default_rng(SEED)  # source of the random labels in MyMappedDataset
# Seed torch's default generator as well; RandomSampler / shuffle=True draw
# from it when no explicit generator is supplied.
t.manual_seed(SEED);

In [4]:
class MyMappedDataset(td.Dataset):
    """Toy map-style dataset with integer features and random binary labels.

    The features form an ``(m, n)`` array filled with ``0 .. n*m - 1`` in
    row-major order, so row ``i`` is ``[i*n, ..., i*n + n - 1]``. Each of the
    ``m`` labels is drawn independently: 0 with probability 0.7, 1 with
    probability 0.3.

    Args:
        n: number of features per sample.
        m: number of samples.
        seed: optional seed for the labels. Anything accepted by
            ``np.random.default_rng`` works (an int, or an existing
            ``Generator``). When ``None`` (the default) the notebook-level
            ``rng`` is used, exactly as before this parameter existed.
    """

    def __init__(self, n=5, m=10, seed=None):
        self._x = np.arange(n * m).reshape(m, n)
        # Only fall back to the shared notebook generator when no seed is given,
        # so a dataset can be made reproducible independently of global state.
        label_rng = rng if seed is None else np.random.default_rng(seed)
        self._y = label_rng.choice([0, 1], size=m, p=[0.7, 0.3])

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self._x[idx], self._y[idx]

    def __len__(self):
        """Return the number of samples (rows of the feature array)."""
        return self._x.shape[0]

In [5]:
# Build a DataLoader with no batching arguments at all; the cells below show
# what it falls back to (a SequentialSampler and batch_size=1).
ds = MyMappedDataset(m=5)
dl = td.DataLoader(ds)

In [6]:
# Iterate the loader: every batch is an (x, y) pair. With the default settings
# each batch holds one sample, delivered as tensors with a leading batch dimension.
for batch in dl:
    x, y = batch
    print("\nBatch---")
    print(f"x={x} y={y}")


Batch---
x=tensor([[0, 1, 2, 3, 4]]) y=tensor([1])

Batch---
x=tensor([[5, 6, 7, 8, 9]]) y=tensor([0])

Batch---
x=tensor([[10, 11, 12, 13, 14]]) y=tensor([0])

Batch---
x=tensor([[15, 16, 17, 18, 19]]) y=tensor([0])

Batch---
x=tensor([[20, 21, 22, 23, 24]]) y=tensor([0])


In [7]:
# Index the dataset directly for comparison: it returns plain
# (numpy array, label) tuples, without the tensors and batch dimension
# that appear when going through the DataLoader.
for i in range(len(ds)):
    print(f"ds[{i}] = {ds[i]}")

ds[0] = (array([0, 1, 2, 3, 4]), 1)
ds[1] = (array([5, 6, 7, 8, 9]), 0)
ds[2] = (array([10, 11, 12, 13, 14]), 0)
ds[3] = (array([15, 16, 17, 18, 19]), 0)
ds[4] = (array([20, 21, 22, 23, 24]), 0)


In [8]:
# Inspect the samplers the DataLoader created: the index sampler, and the
# BatchSampler wrapping that same sampler object (identical address in the output).
print("sampler: ", dl.sampler)
print("batch_sampler: ", dl.batch_sampler)
print("batch_sampler.sampler: ", dl.batch_sampler.sampler)
print("batch_size: ", dl.batch_sampler.batch_size)

sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x14422dc30>
batch_sampler:  <torch.utils.data.sampler.BatchSampler object at 0x14422dff0>
batch_sampler.sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x14422dc30>
batch_size:  1


## BatchSampler

In [9]:
def iter_dl(dl):
    """Walk through ``dl`` once and print every batch.

    Args:
        dl: an iterable whose items unpack into exactly two values
            (features and labels), e.g. a DataLoader over a dataset
            that returns ``(x, y)`` pairs.
    """
    # Unpack directly in the loop header; each item must be a 2-element pair.
    for x, y in dl:
        print("\nBatch---")
        print(f"x={x} y={y}")

In [10]:
# batch_size=3 over 10 samples: three full batches followed by a final
# partial batch containing the one leftover sample.
ds = MyMappedDataset(m=10)
dl = td.DataLoader(ds, batch_size=3)
iter_dl(dl)


Batch---
x=tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44]]) y=tensor([0, 0, 1])

Batch---
x=tensor([[45, 46, 47, 48, 49]]) y=tensor([0])


In [11]:
# drop_last=True discards the trailing incomplete batch, so only the three
# full batches of three samples are produced.
ds = MyMappedDataset(m=10)
dl = td.DataLoader(ds, batch_size=3, drop_last=True)
iter_dl(dl)


Batch---
x=tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]]) y=tensor([1, 0, 0])

Batch---
x=tensor([[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]) y=tensor([0, 1, 0])

Batch---
x=tensor([[30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44]]) y=tensor([1, 0, 0])


In [12]:
def print_dl(dl):
    """Print the sampler configuration of a DataLoader.

    Always reports ``dl.sampler`` and ``dl.batch_sampler``. When a batch
    sampler is present, its ``sampler``, ``batch_size`` and ``drop_last``
    attributes are reported too; attributes that a custom batch sampler does
    not define are shown as ``n/a`` instead of raising ``AttributeError``.

    Args:
        dl: a DataLoader (or any object exposing ``sampler`` and
            ``batch_sampler`` attributes).
    """
    print("sampler: ", dl.sampler)
    print("batch_sampler: ", dl.batch_sampler)
    batch_sampler = dl.batch_sampler
    # Compare against None explicitly: a plain truthiness test would call
    # __len__, so a batch sampler that yields zero batches would be skipped.
    if batch_sampler is not None:
        print("batch_sampler.sampler: ", getattr(batch_sampler, "sampler", "n/a"))
        print("batch_size: ", getattr(batch_sampler, "batch_size", "n/a"))
        print("batch_sampler.drop_last: ", getattr(batch_sampler, "drop_last", "n/a"))

In [13]:
# The batch_size and drop_last given to the DataLoader show up on the
# BatchSampler it created around the SequentialSampler.
print_dl(dl)

sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x14422d6f0>
batch_sampler:  <torch.utils.data.sampler.BatchSampler object at 0x14422d660>
batch_sampler.sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x14422d6f0>
batch_size:  3
batch_sampler.drop_last:  True


In [14]:
# Use a BatchSampler on its own: it groups the indices coming from the wrapped
# sampler into lists. It yields indices only, never the data itself.
ds = MyMappedDataset(m=10)
batch_sampler = td.BatchSampler(td.SequentialSampler(ds), batch_size=3, drop_last=True)
for idxs in batch_sampler:
    print(idxs)

[0, 1, 2]
[3, 4, 5]
[6, 7, 8]


In [15]:
# Hand an explicitly constructed BatchSampler to the DataLoader.
ds = MyMappedDataset(m=10)
batch_sampler = td.BatchSampler(td.SequentialSampler(ds), batch_size=3, drop_last=True)
dl = td.DataLoader(ds, batch_sampler=batch_sampler)

# Equivalent shorthand that lets the DataLoader build the BatchSampler itself:
# ds = MyMappedDataset(m=10)
# dl = td.DataLoader(ds, batch_size=3, drop_last=True)
iter_dl(dl)


Batch---
x=tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]]) y=tensor([1, 0, 0])

Batch---
x=tensor([[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44]]) y=tensor([1, 0, 0])


In [16]:
# batch_size=None: the loader creates no batch sampler and hands out one
# sample at a time (see the next two cells).
ds = MyMappedDataset(m=5)
dl = td.DataLoader(ds, batch_size=None)

In [17]:
# Each item is now a single sample: x has no leading batch dimension and
# y is printed as a plain scalar rather than a tensor.
iter_dl(dl)


Batch---
x=tensor([0, 1, 2, 3, 4]) y=0

Batch---
x=tensor([5, 6, 7, 8, 9]) y=1

Batch---
x=tensor([10, 11, 12, 13, 14]) y=1

Batch---
x=tensor([15, 16, 17, 18, 19]) y=1

Batch---
x=tensor([20, 21, 22, 23, 24]) y=1


In [18]:
# batch_sampler is None for this loader, so only the index sampler is reported.
print_dl(dl)

sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x14422c4f0>
batch_sampler:  None


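Next, a custom batch sampler that walks the indices in reverse order and grows the batch by one index each time:

```
[9]
[8, 7]
[6, 5, 4]
...
```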

In [19]:
class RevLinBatcher(td.Sampler):
    """Batch sampler yielding reversed indices in linearly growing batches.

    Indices are visited from ``len(data_source) - 1`` down to ``0``. The first
    batch holds 1 index, the second 2, the third 3, and so on; the final batch
    holds whatever indices remain and may therefore be shorter than its
    nominal size. Each batch is a list of ints, which is the form expected by
    ``DataLoader(batch_sampler=...)``.

    Args:
        data_source: any sized object; only its length is used.
    """

    def __init__(self, data_source):
        self._len = len(data_source)

    def __iter__(self):
        """Yield the index batches for one pass over the data."""
        sz = 1
        batch = []
        for idx in range(self._len - 1, -1, -1):
            batch.append(idx)
            if len(batch) == sz:
                yield batch
                # Start a fresh list (the yielded one belongs to the consumer)
                # and make the next batch one element larger.
                batch = []
                sz += 1
        # Emit the leftover indices that did not fill the last batch.
        if len(batch) > 0:
            yield batch

    def __len__(self):
        """Return the number of batches produced by one pass.

        Defined so that ``len()`` of a DataLoader using this batch sampler
        works instead of raising ``TypeError``.
        """
        remaining = self._len
        size = 1
        count = 0
        # Mirror __iter__: consume 1, 2, 3, ... indices per batch; the final
        # (possibly partial) batch is counted as well.
        while remaining > 0:
            remaining -= size
            size += 1
            count += 1
        return count

In [20]:
# Iterate the custom batch sampler directly: it yields lists of indices,
# highest index first, each batch one element longer than the previous one.
ds = MyMappedDataset(m=10)
revlin = RevLinBatcher(ds)
for idxs in revlin:
    print(idxs)

[9]
[8, 7]
[6, 5, 4]
[3, 2, 1, 0]


In [21]:
# Plug the custom batch sampler into a DataLoader. 12 samples do not split
# evenly into 1 + 2 + 3 + 4, so the last batch holds the 2 leftover samples.
ds = MyMappedDataset(m=12)
revlin = RevLinBatcher(ds)
dl = td.DataLoader(ds, batch_sampler=revlin)
iter_dl(dl)


Batch---
x=tensor([[55, 56, 57, 58, 59]]) y=tensor([1])

Batch---
x=tensor([[50, 51, 52, 53, 54],
        [45, 46, 47, 48, 49]]) y=tensor([0, 0])

Batch---
x=tensor([[40, 41, 42, 43, 44],
        [35, 36, 37, 38, 39],
        [30, 31, 32, 33, 34]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[25, 26, 27, 28, 29],
        [20, 21, 22, 23, 24],
        [15, 16, 17, 18, 19],
        [10, 11, 12, 13, 14]]) y=tensor([1, 0, 1, 0])

Batch---
x=tensor([[5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4]]) y=tensor([0, 1])


In [34]:
# drop_last cannot be combined with an explicit batch_sampler: constructing the
# DataLoader raises ValueError, which is caught and printed as the demonstration.
ds = MyMappedDataset(m=12)
revlin = RevLinBatcher(ds)
try:
    dl = td.DataLoader(ds, batch_sampler=revlin, drop_last=True)
    iter_dl(dl)
except ValueError as err:
    print("ERROR:", err)

ERROR: batch_sampler option is mutually exclusive with batch_size, shuffle, sampler, and drop_last


## Samplers

In [23]:
# SequentialSampler yields the dataset indices in ascending order.
sampler = td.SequentialSampler(MyMappedDataset())
for idx in sampler:
    print(idx, end=" ")

0 1 2 3 4 5 6 7 8 9 

In [26]:
# RandomSampler yields the same indices in a shuffled order.
sampler = td.RandomSampler(MyMappedDataset())
for idx in sampler:
    print(idx, end=" ")

5 0 6 7 3 2 4 1 8 9 

In [28]:
# Wrap a RandomSampler in a BatchSampler to get shuffled batches of three,
# with the incomplete final batch dropped.
ds = MyMappedDataset(m=10)
batch_sampler = td.BatchSampler(td.RandomSampler(ds), batch_size=3, drop_last=True)
dl = td.DataLoader(ds, batch_sampler=batch_sampler)
iter_dl(dl)


Batch---
x=tensor([[10, 11, 12, 13, 14],
        [35, 36, 37, 38, 39],
        [15, 16, 17, 18, 19]]) y=tensor([0, 1, 1])

Batch---
x=tensor([[40, 41, 42, 43, 44],
        [30, 31, 32, 33, 34],
        [45, 46, 47, 48, 49]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[20, 21, 22, 23, 24],
        [ 5,  6,  7,  8,  9],
        [ 0,  1,  2,  3,  4]]) y=tensor([0, 0, 0])


In [29]:
# shuffle=True together with batch_size/drop_last lets the DataLoader build
# the samplers itself.
ds = MyMappedDataset(m=10)
# Hand-built equivalent of what the DataLoader creates here:
# batch_sampler = td.BatchSampler(td.RandomSampler(ds), batch_size=3, drop_last=True)
dl = td.DataLoader(ds, batch_size=3, drop_last=True, shuffle=True)
iter_dl(dl)


Batch---
x=tensor([[10, 11, 12, 13, 14],
        [45, 46, 47, 48, 49],
        [30, 31, 32, 33, 34]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[ 5,  6,  7,  8,  9],
        [40, 41, 42, 43, 44],
        [20, 21, 22, 23, 24]]) y=tensor([1, 0, 0])

Batch---
x=tensor([[15, 16, 17, 18, 19],
        [25, 26, 27, 28, 29],
        [35, 36, 37, 38, 39]]) y=tensor([0, 1, 0])


In [30]:
# With shuffle=True the loader reports a RandomSampler wrapped in a BatchSampler.
print_dl(dl)

sampler:  <torch.utils.data.sampler.RandomSampler object at 0x12e8a0dc0>
batch_sampler:  <torch.utils.data.sampler.BatchSampler object at 0x1442da050>
batch_sampler.sampler:  <torch.utils.data.sampler.RandomSampler object at 0x12e8a0dc0>
batch_size:  3
batch_sampler.drop_last:  True


In [42]:
class ReverseSampler(td.Sampler):
    """Index sampler that visits a dataset from its last index down to 0."""

    def __init__(self, data_source):
        super().__init__(data_source)
        # Only the size is remembered; the data itself is never read here.
        self._len = len(data_source)

    def __len__(self):
        """Return how many indices one pass produces."""
        return self._len

    def __iter__(self):
        """Yield ``len - 1, len - 2, ..., 0`` one index at a time."""
        for idx in reversed(range(self._len)):
            yield idx

In [43]:
# The custom sampler yields the dataset indices from last to first.
sampler = ReverseSampler(MyMappedDataset())
for idx in sampler:
    print(idx, end=" ")

9 8 7 6 5 4 3 2 1 0 

In [44]:
# A custom index sampler can be combined with batch_size/drop_last: the
# batches follow the sampler's reversed order.
ds = MyMappedDataset(m=12)
sampler = ReverseSampler(ds)
dl = td.DataLoader(ds, sampler=sampler, batch_size=3, drop_last=True)
iter_dl(dl)


Batch---
x=tensor([[55, 56, 57, 58, 59],
        [50, 51, 52, 53, 54],
        [45, 46, 47, 48, 49]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[40, 41, 42, 43, 44],
        [35, 36, 37, 38, 39],
        [30, 31, 32, 33, 34]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[25, 26, 27, 28, 29],
        [20, 21, 22, 23, 24],
        [15, 16, 17, 18, 19]]) y=tensor([0, 0, 1])

Batch---
x=tensor([[10, 11, 12, 13, 14],
        [ 5,  6,  7,  8,  9],
        [ 0,  1,  2,  3,  4]]) y=tensor([1, 1, 1])


In [45]:
# The loader reports the ReverseSampler both as its sampler and as the
# sampler wrapped by the BatchSampler it created.
print_dl(dl)

sampler:  <__main__.ReverseSampler object at 0x1447081f0>
batch_sampler:  <torch.utils.data.sampler.BatchSampler object at 0x14439b940>
batch_sampler.sampler:  <__main__.ReverseSampler object at 0x1447081f0>
batch_size:  3
batch_sampler.drop_last:  True


In [47]:
# batch_sampler cannot be combined with sampler: constructing the DataLoader
# raises ValueError, which is caught and printed as the demonstration.
try:
    ds = MyMappedDataset()
    # Per the error message, shuffle is rejected in the same way:
    # dl = td.DataLoader(ds, batch_sampler=RevLinBatcher(ds), shuffle=True)
    dl = td.DataLoader(ds, batch_sampler=RevLinBatcher(ds), sampler=ReverseSampler(ds))
except ValueError as err:
    print("ERROR:", err)

ERROR: batch_sampler option is mutually exclusive with batch_size, shuffle, sampler, and drop_last
