### Dataset

In [1]:
import os

import cv2
import numpy as np
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchvision.datasets import ImageFolder

In [2]:
# 1. customized

class MyDataset(Dataset):
    """Toy map-style dataset: four random 1x2x2 tensors with alternating 0/1 labels."""

    def __init__(self):
        num_samples = 4
        # Samples are kept as a plain Python list holding one tensor per item.
        self.x = [ torch.rand(1,2,2) for _ in range(num_samples) ]
        # Labels are 0, 1, 0, 1 stored as a single int64 tensor.
        self.y = torch.arange(num_samples, dtype=torch.long) % 2

    def __len__(self):
        """Return the number of samples (one label per sample)."""
        return self.y.shape[0]

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        sample, label = self.x[index], self.y[index]
        return sample, label

# Build the dataset and show its first (sample, label) pair.
dataset = MyDataset()
first_item = next(iter(dataset))
print( first_item )

(tensor([[[0.6003, 0.9678],
         [0.7393, 0.3976]]]), tensor(0))


In [3]:
# 2. folder (image classification)
# Build a tiny on-disk tree with one sub-directory per class (cls1, cls2),
# each holding two random 2x2 single-channel JPEGs.
example_root = "./_data/example_img_folder"
for cls_name, img_ids in (("cls1", range(2)), ("cls2", range(2,4))):
    cls_dir = f"{example_root}/{cls_name}"
    os.makedirs(cls_dir, exist_ok=True)
    for i in img_ids:
        img = np.random.randint(0, 256, size=(2,2,1), dtype=np.uint8)
        img_path = f"{cls_dir}/{i}.jpg"
        # cv2.imwrite reports a failed write by returning False rather than raising,
        # so check the result instead of silently continuing with a missing file.
        if not cv2.imwrite(img_path, img):
            raise OSError(f"cv2.imwrite failed to write {img_path}")

# Load the folder tree: the class names come from the sub-directory names and each
# image is labelled with the index of its class. No transform is given, so items
# come back as (PIL image, label) pairs.
dataset = ImageFolder("./_data/example_img_folder")
print( len(dataset), type(dataset) )
print( dataset.classes )
first_entry, first_target = dataset.imgs[0], dataset.targets[0]
print( first_entry, first_target )
print( next(iter(dataset)) )

4 <class 'torchvision.datasets.folder.ImageFolder'>
['cls1', 'cls2']
('./_data/example_img_folder/cls1/0.jpg', 0) 0
(<PIL.Image.Image image mode=RGB size=2x2 at 0x7FE79528A2E0>, 0)


In [4]:
# 3. small
# Data that is already held in tensors can be wrapped directly:
# item i of the dataset is the pair (x[i], y[i]).
x = torch.rand(4,1,2,2)
y = torch.arange(4, dtype=torch.long) % 2   # labels 0, 1, 0, 1

dataset = TensorDataset(x, y)
print( *(t.shape for t in dataset.tensors) )
print( next(iter(dataset)) )

torch.Size([4, 1, 2, 2]) torch.Size([4])
(tensor([[[0.0494, 0.2581],
         [0.4403, 0.8102]]]), tensor(0))


### DataLoader(dataset, *)
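+ batch_size: int (number of samples per batch)
+ collate_fn: func (merges a list of samples into one batch)
+ pin_memory: bool (if True, batches are placed in page-locked host memory, which can speed up host-to-GPU copies)
+ drop_last: bool (if True, drop the final batch when it is smaller than batch_size)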

In [5]:
# Toy data: sample i is the row [i+1, i+1, i+1] with label i+1,
# i.e. x = [[1,1,1], [2,2,2], [3,3,3], [4,4,4]] and y = [1,2,3,4].
y = torch.arange(1, 5)
x = y.unsqueeze(1).repeat(1, 3)
dataset = TensorDataset(x,y)

# 1 default collate_fn
# With no collate_fn given, the batch comes back as a list of stacked tensors.
default_loader = DataLoader(dataset, batch_size=2)
print( next(iter(default_loader)) )
print()

# 2 customized collate_fn
def my_collate(batch):
    """Merge a list of ``(x, y)`` samples into one ``(x_batch, y_batch)`` tuple.

    Args:
        batch: sequence of 2-tuples of tensors, in the same format as the
            items obtained by iterating the dataset.

    Returns:
        tuple: ``(torch.stack(xs), torch.stack(ys))`` -- each field stacked
        along a new leading dimension.

    Raises:
        ValueError: if ``batch`` is empty or a sample is not a pair.
    """
    # zip(*batch) transposes [(x0, y0), (x1, y1), ...] into (x0, x1, ...), (y0, y1, ...)
    x_items, y_items = zip(*batch)
    return torch.stack(x_items), torch.stack(y_items)
# Same loader as above, but batches are assembled by my_collate.
custom_loader = DataLoader(dataset, batch_size=2, collate_fn=my_collate)
print( next(iter(custom_loader)) )

[tensor([[1, 1, 1],
        [2, 2, 2]]), tensor([1, 2])]

(tensor([[1, 1, 1],
        [2, 2, 2]]), tensor([1, 2]))


### torchvision.datasets

+ https://pytorch.org/vision/main/datasets.html

In [1]:
from torchvision.datasets import CIFAR10

In [2]:
# 1. CIFAR10
# download=True means download all # train=True/False means return train/val part
trainset = CIFAR10(root="./_data/cifar10", train=True, download=True)
validset = CIFAR10(root="./_data/cifar10", train=False, download=True)
!du -sh ./_data/cifar10

print( len(trainset), len(validset), type(validset) )
print( validset.classes )
print( len(validset.data), validset.data[0].shape )
print( len(validset.targets), validset.targets[:5] )
print( next(iter(validset)) )

Files already downloaded and verified
Files already downloaded and verified
354M	./_data/cifar10
50000 10000 <class 'torchvision.datasets.cifar.CIFAR10'>
['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
10000 (32, 32, 3)
10000 [3, 8, 8, 0, 6]
(<PIL.Image.Image image mode=RGB size=32x32 at 0x7FEFE2892B80>, 3)
