In [4]:
import numpy as np
import torch as t
import torch.utils.data as td

In [5]:
rng = np.random.default_rng()

In [6]:
class MyMappedDataset(td.Dataset):
    def __init__(self, n=5, m=10):
        self._x = np.arange(n * m).reshape(m, n)
        self._y = rng.choice([0, 1], size=m, p=[0.7, 0.3])

    def __getitem__(self, idx):
        return self._x[idx], self._y[idx]

    def __len__(self):
        return self._x.shape[0]

In [7]:
ds = MyMappedDataset(m=5)
dl = td.DataLoader(ds)

In [8]:
for batch in dl:
    x, y = batch
    print("\nBatch---")
    print(f"x={x} y={y}")


Batch---
x=tensor([[0, 1, 2, 3, 4]]) y=tensor([0])

Batch---
x=tensor([[5, 6, 7, 8, 9]]) y=tensor([1])

Batch---
x=tensor([[10, 11, 12, 13, 14]]) y=tensor([1])

Batch---
x=tensor([[15, 16, 17, 18, 19]]) y=tensor([0])

Batch---
x=tensor([[20, 21, 22, 23, 24]]) y=tensor([0])


In [9]:
for i in range(len(ds)):
    print(f"ds[{i}] = {ds[i]}")

ds[0] = (array([0, 1, 2, 3, 4]), 0)
ds[1] = (array([5, 6, 7, 8, 9]), 1)
ds[2] = (array([10, 11, 12, 13, 14]), 1)
ds[3] = (array([15, 16, 17, 18, 19]), 0)
ds[4] = (array([20, 21, 22, 23, 24]), 0)


In [10]:
print("sampler: ", dl.sampler)
print("batch_sampler: ", dl.batch_sampler)
print("batch_sampler.sampler: ", dl.batch_sampler.sampler)
print("batch_size: ", dl.batch_sampler.batch_size)

sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x139e33550>
batch_sampler:  <torch.utils.data.sampler.BatchSampler object at 0x139e33d30>
batch_sampler.sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x139e33550>
batch_size:  1


## BatchSampler

In [11]:
def iter_dl(dl):
    for batch in dl:
        x, y = batch
        print("\nBatch---")
        print(f"x={x} y={y}")

In [12]:
ds = MyMappedDataset(m=10)
dl = td.DataLoader(ds, batch_size=3)
iter_dl(dl)


Batch---
x=tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]]) y=tensor([0, 1, 1])

Batch---
x=tensor([[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44]]) y=tensor([1, 1, 1])

Batch---
x=tensor([[45, 46, 47, 48, 49]]) y=tensor([0])


In [13]:
ds = MyMappedDataset(m=10)
dl = td.DataLoader(ds, batch_size=3, drop_last=True)
iter_dl(dl)


Batch---
x=tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]) y=tensor([0, 0, 1])

Batch---
x=tensor([[30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44]]) y=tensor([0, 1, 0])


In [14]:
def print_dl(dl):
    print("sampler: ", dl.sampler)
    print("batch_sampler: ", dl.batch_sampler)
    if dl.batch_sampler:
        print("batch_sampler.sampler: ", dl.batch_sampler.sampler)
        print("batch_size: ", dl.batch_sampler.batch_size)
        print("batch_sampler.drop_last: ", dl.batch_sampler.drop_last)

In [15]:
print_dl(dl)

sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x139e33d90>
batch_sampler:  <torch.utils.data.sampler.BatchSampler object at 0x139e326b0>
batch_sampler.sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x139e33d90>
batch_size:  3
batch_sampler.drop_last:  True


In [16]:
ds = MyMappedDataset(m=10)
batch_sampler = td.BatchSampler(td.SequentialSampler(ds), batch_size=3, drop_last=True)
for idxs in batch_sampler:
    print(idxs)

[0, 1, 2]
[3, 4, 5]
[6, 7, 8]


In [17]:
ds = MyMappedDataset(m=10)
batch_sampler = td.BatchSampler(td.SequentialSampler(ds), batch_size=3, drop_last=True)
dl = td.DataLoader(ds, batch_sampler=batch_sampler)

# Equivalent -
# ds = MyMappedDataset(m=10)
# dl = td.DataLoader(ds, batch_size=3, drop_last=True)
iter_dl(dl)


Batch---
x=tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]]) y=tensor([1, 0, 1])

Batch---
x=tensor([[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44]]) y=tensor([0, 0, 1])


In [18]:
ds = MyMappedDataset(m=5)
dl = td.DataLoader(ds, batch_size=None)

In [19]:
iter_dl(dl)


Batch---
x=tensor([0, 1, 2, 3, 4]) y=1

Batch---
x=tensor([5, 6, 7, 8, 9]) y=0

Batch---
x=tensor([10, 11, 12, 13, 14]) y=0

Batch---
x=tensor([15, 16, 17, 18, 19]) y=1

Batch---
x=tensor([20, 21, 22, 23, 24]) y=1


In [20]:
print_dl(dl)

sampler:  <torch.utils.data.sampler.SequentialSampler object at 0x139e32b60>
batch_sampler:  None


```
[9]
[8], [7]
[6], [5], [3],
...
```

In [21]:
class RevLinBatcher(td.Sampler):
    def __init__(self, data_source):
        self._len = len(data_source)

    def __iter__(self):
        sz = 1
        batch = []
        for idx in range(self._len-1, -1, -1):
            batch.append(idx)
            if len(batch) == sz:
                yield batch
                batch = []
                sz += 1
        if len(batch) > 0:
            yield batch

In [22]:
ds = MyMappedDataset(m=10)
revlin = RevLinBatcher(ds)
for idxs in revlin:
    print(idxs)

[9]
[8, 7]
[6, 5, 4]
[3, 2, 1, 0]


In [23]:
ds = MyMappedDataset(m=12)
revlin = RevLinBatcher(ds)
dl = td.DataLoader(ds, batch_sampler=revlin)
iter_dl(dl)


Batch---
x=tensor([[55, 56, 57, 58, 59]]) y=tensor([1])

Batch---
x=tensor([[50, 51, 52, 53, 54],
        [45, 46, 47, 48, 49]]) y=tensor([1, 0])

Batch---
x=tensor([[40, 41, 42, 43, 44],
        [35, 36, 37, 38, 39],
        [30, 31, 32, 33, 34]]) y=tensor([0, 0, 1])

Batch---
x=tensor([[25, 26, 27, 28, 29],
        [20, 21, 22, 23, 24],
        [15, 16, 17, 18, 19],
        [10, 11, 12, 13, 14]]) y=tensor([0, 0, 1, 1])

Batch---
x=tensor([[5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4]]) y=tensor([0, 0])


In [24]:
try:
    ds = MyMappedDataset(m=12)
    revlin = RevLinBatcher(ds)
    dl = td.DataLoader(ds, batch_sampler=revlin, drop_last=True)
    iter_dl(dl)
except ValueError as err:
    print("ERROR:", err)

ERROR: batch_sampler option is mutually exclusive with batch_size, shuffle, sampler, and drop_last


## Samplers

In [25]:
sampler = td.SequentialSampler(MyMappedDataset())
for idx in sampler:
    print(idx, end=" ")

0 1 2 3 4 5 6 7 8 9 

In [26]:
sampler = td.RandomSampler(MyMappedDataset())
for idx in sampler:
    print(idx, end=" ")

7 4 2 0 6 8 9 1 3 5 

In [27]:
ds = MyMappedDataset(m=12)
sampler = td.RandomSampler(ds)
dl = td.DataLoader(ds, sampler=sampler, batch_size=3, drop_last=True)
iter_dl(dl)


Batch---
x=tensor([[40, 41, 42, 43, 44],
        [25, 26, 27, 28, 29],
        [ 0,  1,  2,  3,  4]]) y=tensor([1, 0, 0])

Batch---
x=tensor([[35, 36, 37, 38, 39],
        [45, 46, 47, 48, 49],
        [10, 11, 12, 13, 14]]) y=tensor([0, 0, 1])

Batch---
x=tensor([[30, 31, 32, 33, 34],
        [ 5,  6,  7,  8,  9],
        [55, 56, 57, 58, 59]]) y=tensor([0, 1, 0])

Batch---
x=tensor([[50, 51, 52, 53, 54],
        [20, 21, 22, 23, 24],
        [15, 16, 17, 18, 19]]) y=tensor([1, 0, 0])


In [28]:
ds = MyMappedDataset(m=12)
# sampler = td.RandomSampler(ds)
dl = td.DataLoader(ds, shuffle=True, batch_size=3, drop_last=True)
iter_dl(dl)


Batch---
x=tensor([[20, 21, 22, 23, 24],
        [10, 11, 12, 13, 14],
        [45, 46, 47, 48, 49]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[ 0,  1,  2,  3,  4],
        [15, 16, 17, 18, 19],
        [25, 26, 27, 28, 29]]) y=tensor([0, 1, 0])

Batch---
x=tensor([[ 5,  6,  7,  8,  9],
        [55, 56, 57, 58, 59],
        [30, 31, 32, 33, 34]]) y=tensor([0, 1, 0])

Batch---
x=tensor([[35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44],
        [50, 51, 52, 53, 54]]) y=tensor([0, 1, 0])


In [29]:
print_dl(dl)

sampler:  <torch.utils.data.sampler.RandomSampler object at 0x139e332b0>
batch_sampler:  <torch.utils.data.sampler.BatchSampler object at 0x139e31de0>
batch_sampler.sampler:  <torch.utils.data.sampler.RandomSampler object at 0x139e332b0>
batch_size:  3
batch_sampler.drop_last:  True


In [30]:
class ReverseSampler(td.Sampler):
    def __init__(self, data_source):
        super().__init__(data_source)
        self._len = len(data_source)

    def __iter__(self):
        yield from range(self._len - 1, -1, -1)

    def __len__(self):
        return self._len

In [31]:
sampler = ReverseSampler(MyMappedDataset())
for idx in sampler:
    print(idx, end=" ")

9 8 7 6 5 4 3 2 1 0 

In [32]:
ds = MyMappedDataset(m=12)
sampler = ReverseSampler(ds)
dl = td.DataLoader(ds, sampler=sampler, batch_size=3, drop_last=True)
iter_dl(dl)


Batch---
x=tensor([[55, 56, 57, 58, 59],
        [50, 51, 52, 53, 54],
        [45, 46, 47, 48, 49]]) y=tensor([1, 1, 1])

Batch---
x=tensor([[40, 41, 42, 43, 44],
        [35, 36, 37, 38, 39],
        [30, 31, 32, 33, 34]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[25, 26, 27, 28, 29],
        [20, 21, 22, 23, 24],
        [15, 16, 17, 18, 19]]) y=tensor([0, 1, 0])

Batch---
x=tensor([[10, 11, 12, 13, 14],
        [ 5,  6,  7,  8,  9],
        [ 0,  1,  2,  3,  4]]) y=tensor([0, 0, 0])


In [33]:
print_dl(dl)

sampler:  <__main__.ReverseSampler object at 0x139e33a30>
batch_sampler:  <torch.utils.data.sampler.BatchSampler object at 0x139e33280>
batch_sampler.sampler:  <__main__.ReverseSampler object at 0x139e33a30>
batch_size:  3
batch_sampler.drop_last:  True


In [34]:
try:
    ds = MyMappedDataset(m=12)
    sampler = ReverseSampler(ds)
    batch_sampler = RevLinBatcher(ds)
    dl = td.DataLoader(ds, sampler=sampler, shuffle=True)
except ValueError as err:
    print("ERROR:", err)

ERROR: sampler option is mutually exclusive with shuffle


## Collate

In [35]:
ds = MyMappedDataset()
dl = td.DataLoader(ds, batch_size=3, drop_last=True, shuffle=True)
iter_dl(dl)


Batch---
x=tensor([[35, 36, 37, 38, 39],
        [45, 46, 47, 48, 49],
        [25, 26, 27, 28, 29]]) y=tensor([0, 0, 0])

Batch---
x=tensor([[30, 31, 32, 33, 34],
        [20, 21, 22, 23, 24],
        [ 5,  6,  7,  8,  9]]) y=tensor([1, 0, 0])

Batch---
x=tensor([[ 0,  1,  2,  3,  4],
        [15, 16, 17, 18, 19],
        [40, 41, 42, 43, 44]]) y=tensor([0, 0, 0])


In [36]:
def collate(samples):
    xs, ys = zip(*samples)
    X = np.vstack([x for x in xs])
    y = np.array([y for y in ys]).reshape(-1, 1)
    return t.tensor(X), t.tensor(y)
    

In [37]:
ds = MyMappedDataset()
dl = td.DataLoader(ds, batch_size=3, drop_last=True, shuffle=True, collate_fn=collate)
iter_dl(dl)


Batch---
x=tensor([[30, 31, 32, 33, 34],
        [40, 41, 42, 43, 44],
        [10, 11, 12, 13, 14]]) y=tensor([[0],
        [0],
        [0]])

Batch---
x=tensor([[25, 26, 27, 28, 29],
        [35, 36, 37, 38, 39],
        [15, 16, 17, 18, 19]]) y=tensor([[0],
        [0],
        [1]])

Batch---
x=tensor([[ 5,  6,  7,  8,  9],
        [20, 21, 22, 23, 24],
        [45, 46, 47, 48, 49]]) y=tensor([[0],
        [1],
        [0]])


## With Iterable Dataset

In [52]:
class MyStreamingDataset(td.IterableDataset):
    def __init__(self, n):
        super().__init__()
        self._n = n

    def __iter__(self):
        start = 0
        while True:
            x = np.arange(start, start+self._n)
            y = rng.choice([0, 1], p=[0.7, 0.3])
            yield x, y
            start += self._n

    def __len__(self):
        # fake len method
        return 0

In [46]:
ds = MyStreamingDataset(n=5)
dl = td.DataLoader(ds)

In [47]:
def iter_mdl(dl):
    ctr = 0
    for batch in dl:
        if ctr >= 5:
            break
        print(batch)
        ctr += 1

In [48]:
iter_mdl(dl)

[tensor([[0, 1, 2, 3, 4]]), tensor([0])]
[tensor([[5, 6, 7, 8, 9]]), tensor([0])]
[tensor([[10, 11, 12, 13, 14]]), tensor([1])]
[tensor([[15, 16, 17, 18, 19]]), tensor([1])]
[tensor([[20, 21, 22, 23, 24]]), tensor([1])]


In [49]:
ds = MyStreamingDataset(n=5)
dl = td.DataLoader(ds, batch_size=3)
iter_mdl(dl)

[tensor([[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]]), tensor([0, 0, 0])]
[tensor([[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]]), tensor([0, 1, 1])]
[tensor([[30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44]]), tensor([0, 0, 0])]
[tensor([[45, 46, 47, 48, 49],
        [50, 51, 52, 53, 54],
        [55, 56, 57, 58, 59]]), tensor([1, 0, 0])]
[tensor([[60, 61, 62, 63, 64],
        [65, 66, 67, 68, 69],
        [70, 71, 72, 73, 74]]), tensor([0, 0, 0])]


In [50]:
print("Sampler: ", dl.sampler)
print("Batch Sampler: ", dl.batch_sampler)

Sampler:  <torch.utils.data.dataloader._InfiniteConstantSampler object at 0x139e33670>
Batch Sampler:  <torch.utils.data.sampler.BatchSampler object at 0x139e339a0>


In [54]:
ds = MyStreamingDataset(n=5)
dl = td.DataLoader(ds, shuffle=True)
iter_mdl(dl)

ValueError: DataLoader with IterableDataset: expected unspecified shuffle option, but got shuffle=True