### Dataset

In [2]:
import os

import cv2
import numpy as np
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchvision.datasets import ImageFolder

In [1]:
"""
from torchvision import transforms
self.transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ColorJitter([0.7,1.3], [0.7,1.3], [0.7,1.3], [-0.5,0.5]),
    # brigntness, contrast, saturation, hue
    transforms.RandomRotation([-90,90]),
    transforms.RandomHorizontalFlip(0.5),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    # RGB mean & std based on imagenet
])
"""
0

0

In [2]:
# 1. customized class: subclass Dataset and implement __len__ / __getitem__

class MyDataset(Dataset):
    """Toy map-style dataset of four random 1x2x2 tensors with labels 0, 1, 0, 1."""

    def __init__(self):
        # Draw the four samples one call at a time and keep them in a plain list.
        samples = []
        for _ in range(4):
            samples.append(torch.rand(1, 2, 2))
        self.x = samples
        # Alternating class labels, stored as int64 as classification losses expect.
        self.y = torch.tensor([0, 1, 0, 1], dtype=torch.long)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        sample = self.x[index]
        label = self.y[index]
        return sample, label

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return self.y.shape[0]


dataset = MyDataset()
print(next(iter(dataset)))

(tensor([[[0.6003, 0.9678],
         [0.7393, 0.3976]]]), tensor(0))


In [3]:
# 2. from folder (image classification)
# ImageFolder expects one sub-directory per class; write two tiny random
# grayscale JPEGs into each class directory before loading the root folder.
for class_dir, image_ids in (
    ("./_data/example_img_folder/cls1", range(2)),
    ("./_data/example_img_folder/cls2", range(2, 4)),
):
    os.makedirs(class_dir, exist_ok=True)
    for i in image_ids:
        img = np.random.randint(0, 256, size=(2, 2, 1), dtype=np.uint8)
        cv2.imwrite(f"{class_dir}/{i}.jpg", img)

dataset = ImageFolder("./_data/example_img_folder")
print(len(dataset), type(dataset))
print(dataset.classes)                      # class names, one per sub-directory
print(dataset.imgs[0], dataset.targets[0])  # (path, class index) and the class index alone
print(next(iter(dataset)))                  # first (image, class index) sample

4 <class 'torchvision.datasets.folder.ImageFolder'>
['cls1', 'cls2']
('./_data/example_img_folder/cls1/0.jpg', 0) 0
(<PIL.Image.Image image mode=RGB size=2x2 at 0x7FE79528A2E0>, 0)


In [4]:
# 3. from tensor: TensorDataset pairs up tensors along their first dimension
x = torch.rand(4, 1, 2, 2)
y = torch.tensor([0, 1, 0, 1], dtype=torch.long)

dataset = TensorDataset(x, y)
# Shapes of the wrapped tensors, then the first (sample, label) pair.
print(*(tensor.shape for tensor in dataset.tensors))
print(next(iter(dataset)))

torch.Size([4, 1, 2, 2]) torch.Size([4])
(tensor([[[0.0494, 0.2581],
         [0.4403, 0.8102]]]), tensor(0))


### DataLoader(dataset, *)
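+ batch_size: int — number of samples per batch
+ collate_fn: func — merges a list of samples into one batch
+ pin_memory: bool — if True, batches are placed in pinned (page-locked) host memory, which speeds up host-to-GPU copies
+ drop_last: bool — if True, drop the final batch when it is smaller than batch_size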

In [5]:
x = torch.tensor([[value] * 3 for value in range(1, 5)])
y = torch.arange(1, 5)
dataset = TensorDataset(x, y)

# 1 default collate_fn
print(
    next(iter(DataLoader(dataset, batch_size=2)))
)
print()

# 2 customized collate_fn
def my_collate(batch):
    """Stack a list of ``(x, y)`` samples into one ``(x_batch, y_batch)`` tuple.

    ``batch`` has the same element format as iterating the dataset directly.
    """
    features, labels = [], []
    for sample in batch:
        feature, label = sample
        features.append(feature)
        labels.append(label)
    return torch.stack(features), torch.stack(labels)

print(
    next(iter(DataLoader(dataset, batch_size=2, collate_fn=my_collate)))
)

[tensor([[1, 1, 1],
        [2, 2, 2]]), tensor([1, 2])]

(tensor([[1, 1, 1],
        [2, 2, 2]]), tensor([1, 2]))


### torchvision.datasets

+ https://pytorch.org/vision/main/datasets.html

In [1]:
from torchvision.datasets import CIFAR10

In [2]:
# 1. CIFAR10
# download=True fetches the archive into `root`; per the log below, an existing
# copy is verified instead of being downloaded again.
# train=True selects the 50000-image training split, train=False the
# 10000-image held-out split (used here as the validation set).
trainset = CIFAR10(root="./_data/cifar10", train=True, download=True)
validset = CIFAR10(root="./_data/cifar10", train=False, download=True)
# IPython shell escape: report the on-disk size of the downloaded data.
!du -sh ./_data/cifar10

print( len(trainset), len(validset), type(validset) )
print( validset.classes )
# .data holds the raw images as (32, 32, 3) arrays, .targets the integer labels.
print( len(validset.data), validset.data[0].shape )
print( len(validset.targets), validset.targets[:5] )
# Iterating yields (PIL image, label) pairs because no transform was given.
print( next(iter(validset)) )

Files already downloaded and verified
Files already downloaded and verified
354M	./_data/cifar10
50000 10000 <class 'torchvision.datasets.cifar.CIFAR10'>
['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
10000 (32, 32, 3)
10000 [3, 8, 8, 0, 6]
(<PIL.Image.Image image mode=RGB size=32x32 at 0x7FEFE2892B80>, 3)


### Sampler
+ The DataLoader keywords "shuffle" and "sampler" are mutually exclusive (passing a sampler together with shuffle=True raises an error)
    + shuffle=False <-> sampler = SequentialSampler(dataset)
    + shuffle=True <-> sampler = RandomSampler(dataset, replacement=False); to shuffle only a subset, use sampler = SubsetRandomSampler(indices)
+ Unbalanced classes -> sampler = WeightedRandomSampler(weights, num_samples=len(dataset), replacement=True)
+ Custom sampler -> subclass Sampler and implement `__iter__` and `__len__`

In [98]:
from torch.utils.data import SequentialSampler, RandomSampler, SubsetRandomSampler, WeightedRandomSampler, BatchSampler

In [53]:
class NullDataset(Dataset):
    """Five-item dataset whose every item is the constant 0, whatever the index."""

    def __getitem__(self, index):
        return 0

    def __len__(self):
        return 5


# NOTE: this redefinition replaces the earlier zero-argument MyDataset.
class MyDataset(Dataset):
    """Dataset over the integers ``0 .. size-1`` held in a 1-D tensor."""

    def __init__(self, size):
        self.x = torch.arange(size)

    def __getitem__(self, index):
        item = self.x[index]
        return item

    def __len__(self):
        return self.x.shape[0]


dataset, nullset = MyDataset(7), NullDataset()

In [97]:
# parent class
# Reference copy of the library's Sampler base class, kept for reading only:
# the `if 0` guard means this block is never executed.
if 0 and "source code":
    class Sampler(object):
        r"""Base class for all Samplers.
        Every Sampler subclass has to provide an __iter__ method, providing a way
        to iterate over indices of dataset elements, and a __len__ method that
        returns the length of the returned iterators.
        """
        # A base class for index iterators
        def __init__(self, data_source):
            pass

        def __iter__(self):
            raise NotImplementedError

        def __len__(self):
            raise NotImplementedError

In [95]:
# sequential sampler: the DataLoader's 1st argument (the dataset) supplies __getitem__,
# while the object handed to SequentialSampler is only used for its __len__.
# Reference copy of the library source; the `if 0` guard means it never runs
# (Sized / Iterator are not imported in the visible cells).
if 0 and "source code":
    class SequentialSampler(Sampler[int]):
        r"""Samples elements sequentially, always in the same order.

        Args:
            data_source (Dataset): dataset to sample from
        """
        data_source: Sized

        def __init__(self, data_source: Sized) -> None:
            self.data_source = data_source

        def __iter__(self) -> Iterator[int]:
            # Indices 0 .. len(data_source)-1 in ascending order.
            return iter(range(len(self.data_source)))

        def __len__(self) -> int:
            return len(self.data_source)
        
# Items come from nullset (always 0); the index range comes from len(dataset) == 7.
loader = DataLoader(nullset, sampler=SequentialSampler(dataset), batch_size=2)
print(list(loader))
# Items come from dataset; the index range comes from len(nullset) == 5.
loader = DataLoader(dataset, sampler=SequentialSampler(nullset), batch_size=2)
print(list(loader))
# The realistic case: dataset and sampler refer to the same object.
loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=2)
print(list(loader), "\n")

# random sampler
# Reference copy of the library's RandomSampler, kept for reading only: the
# `if 0` guard means this block is never executed.
if 0 and "source code":
    class RandomSampler(Sampler):
        r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset.
        If with replacement, then user can specify ``num_samples`` to draw.
        Arguments:
            data_source (Dataset): dataset to sample from
            num_samples (int): number of samples to draw, default=len(dataset)
            replacement (bool): samples are drawn with replacement if ``True``, default=False
        """

        def __init__(self, data_source, replacement=False, num_samples=None):
            self.data_source = data_source
            self.replacement = replacement
            self.num_samples = num_samples

            # A custom sample count only makes sense when drawing with replacement.
            if self.num_samples is not None and replacement is False:
                raise ValueError("With replacement=False, num_samples should not be specified, "
                                "since a random permute will be performed.")

            # Default: one index per element of the data source.
            if self.num_samples is None:
                self.num_samples = len(self.data_source)

            if not isinstance(self.num_samples, int) or self.num_samples <= 0:
                raise ValueError("num_samples should be a positive integeral "
                                "value, but got num_samples={}".format(self.num_samples))
            if not isinstance(self.replacement, bool):
                raise ValueError("replacement should be a boolean value, but got "
                                "replacement={}".format(self.replacement))

        def __iter__(self):
            n = len(self.data_source)
            if self.replacement:
                # num_samples independent draws from [0, n).
                return iter(torch.randint(high=n, size=(self.num_samples,), dtype=torch.int64).tolist())
            # Without replacement: a random permutation of all n indices.
            return iter(torch.randperm(n).tolist())

        def __len__(self):
            # Report how many indices __iter__ yields. With replacement that is
            # num_samples, which may differ from len(data_source); without
            # replacement num_samples was set to len(data_source) in __init__,
            # so this is correct in both modes.
            return self.num_samples
        
# Items come from nullset (always 0); shuffled indices span len(dataset) == 7.
loader = DataLoader(nullset, sampler=RandomSampler(dataset), batch_size=2)
print(list(loader))
# Items come from dataset; shuffled indices span len(nullset) == 5.
loader = DataLoader(dataset, sampler=RandomSampler(nullset), batch_size=2)
print(list(loader))
# The realistic case: dataset and sampler refer to the same object.
loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=2)
print(list(loader))

[tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0])]
[tensor([0, 1]), tensor([2, 3]), tensor([4])]
[tensor([0, 1]), tensor([2, 3]), tensor([4, 5]), tensor([6])] 

[tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0])]
[tensor([0, 4]), tensor([1, 2]), tensor([3])]
[tensor([0, 4]), tensor([1, 2]), tensor([5, 3]), tensor([6])]


In [96]:
# SubsetRandomSampler
# Reference copy of the library's SubsetRandomSampler, kept for reading only:
# the `if 0` guard means this block is never executed.
if 0 and "source code":
    class SubsetRandomSampler(Sampler):
        r"""Samples elements randomly from a given list of indices, without replacement.
        Arguments:
            indices (sequence): a sequence of indices
        """

        def __init__(self, indices):
            self.indices = indices

        def __iter__(self):
            # Visit the stored indices in a freshly shuffled order on every iteration.
            return (self.indices[i] for i in torch.randperm(len(self.indices)))

        def __len__(self):
            return len(self.indices)

# All valid positions of the dataset; the list could be shuffled first if desired.
indices = [*range(len(dataset))]
# First five positions, visited in random order.
loader = DataLoader(dataset, sampler=SubsetRandomSampler(indices[:5]), batch_size=2)
print(list(loader))
# Remaining positions, visited in random order.
loader = DataLoader(dataset, sampler=SubsetRandomSampler(indices[5:]), batch_size=2)
print(list(loader))


[tensor([1, 0]), tensor([2, 3]), tensor([4])]
[tensor([6, 5])]


In [94]:
# WeightedRandomSampler
# Reference copy of the library's WeightedRandomSampler, kept for reading only:
# the `if 0` guard means this block is never executed (`_int_classes` is not
# defined in the visible cells).
if 0 and "source code":
    class WeightedRandomSampler(Sampler):
        r"""Samples elements from [0,..,len(weights)-1] with given probabilities (weights).
        Arguments:
            weights (sequence)   : a sequence of weights, not necessary summing up to one
            num_samples (int): number of samples to draw
            replacement (bool): if ``True``, samples are drawn with replacement.
                If not, they are drawn without replacement, which means that when a
                sample index is drawn for a row, it cannot be drawn again for that row.
        """

        def __init__(self, weights, num_samples, replacement=True):
            # bool is excluded explicitly even though it would pass an integer check.
            if not isinstance(num_samples, _int_classes) or isinstance(num_samples, bool) or \
                    num_samples <= 0:
                raise ValueError("num_samples should be a positive integeral "
                                "value, but got num_samples={}".format(num_samples))
            if not isinstance(replacement, bool):
                raise ValueError("replacement should be a boolean value, but got "
                                "replacement={}".format(replacement))
            self.weights = torch.tensor(weights, dtype=torch.double)
            self.num_samples = num_samples
            self.replacement = replacement

        def __iter__(self):
            # Draw num_samples indices in [0, len(weights)) according to the weights.
            return iter(torch.multinomial(self.weights, self.num_samples, self.replacement).tolist())

        def __len__(self):
            return self.num_samples  ## the total number of indices drawn in one pass

# One weight per dataset item. The sampler draws indices in [0, len(weights)),
# so a list shorter than the dataset would silently make the trailing items
# unreachable. Items 0 and 1 get weights 5 and 1; every other item gets 0.
weights = [5, 1] + [0] * (len(dataset) - 2)
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=2,
    sampler=WeightedRandomSampler(weights, num_samples=len(dataset), replacement=True),
)
print([data for data in loader])

[tensor([0, 0]), tensor([0, 0]), tensor([1, 0]), tensor([0])]


In [100]:
# BatchSampler
# BatchSampler wraps another sampler and yields lists of indices; passing it as
# `batch_sampler` replaces the batch_size / drop_last arguments of DataLoader.
# (The unused `weights` list left over from the previous cell was removed.)
batch_sampler = BatchSampler(SequentialSampler(dataset), batch_size=3, drop_last=False)
loader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler)
print([data for data in loader])

[tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6])]


### Traditional (preprocessed) Sampling
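+ Undersampling (shrink the majority class)
    + CC (ClusterCentroids), CNN (CondensedNearestNeighbour), ENN (EditedNearestNeighbours), NearMiss, Tomek links, OSS (OneSidedSelection)
+ Oversampling (grow the minority class)
    + SMOTE (Synthetic Minority Over-sampling Technique)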

In [5]:
import time
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.cluster import MiniBatchKMeans

from imblearn.under_sampling import ClusterCentroids, CondensedNearestNeighbour, EditedNearestNeighbours,\
    NearMiss, TomekLinks, OneSidedSelection
from imblearn.over_sampling import SMOTE

In [3]:
X, y = make_classification(n_classes=2, class_sep=2,
    weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) # generate numpy dataset
print(type(X), type(y), X.shape, y.shape)
print('Original dataset shape %s\n' % Counter(y)) # Original dataset shape Counter({1: 900, 0: 100})


def _timed_resample(sampler, X, y):
    """Resample ``(X, y)`` with ``sampler`` and print a timing and shape report.

    Args:
        sampler: object exposing ``fit_resample(X, y)``.
        X: feature matrix passed through to the sampler.
        y: label vector passed through to the sampler.

    Returns:
        The ``(X_res, y_res)`` pair returned by ``sampler.fit_resample``.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    X_res, y_res = sampler.fit_resample(X, y)
    elapsed = time.perf_counter() - start
    print(round(elapsed, 4), type(X_res), type(y_res), X_res.shape, y_res.shape)
    print('Resampled dataset shape %s\n' % Counter(y_res))
    return X_res, y_res


# The samplers keep their original names so they stay inspectable afterwards.
cc = ClusterCentroids(
    estimator=MiniBatchKMeans(n_init=1, random_state=0), random_state=42
) # default strategy is downsampling # n_init: number of k-means initialisations tried
cnn = CondensedNearestNeighbour(random_state=42) # n_neighbors==1
enn = EditedNearestNeighbours() # n_neighbors=3, kind_sel='all'
nm = NearMiss()
tl = TomekLinks()
oss = OneSidedSelection(random_state=42) # n_neighbors=1

# Same order as before: CC, CNN, ENN, NearMiss, Tomek links, OSS.
# X_res / y_res end up holding the result of the last sampler (OSS).
for sampler in (cc, cnn, enn, nm, tl, oss):
    X_res, y_res = _timed_resample(sampler, X, y)

<class 'numpy.ndarray'> <class 'numpy.ndarray'> (1000, 20) (1000,)
Original dataset shape Counter({1: 900, 0: 100})

0.2863 <class 'numpy.ndarray'> <class 'numpy.ndarray'> (200, 20) (200,)
Resampled dataset shape Counter({0: 100, 1: 100})

0.6952 <class 'numpy.ndarray'> <class 'numpy.ndarray'> (144, 20) (144,)
Resampled dataset shape Counter({0: 100, 1: 44})

0.0062 <class 'numpy.ndarray'> <class 'numpy.ndarray'> (987, 20) (987,)
Resampled dataset shape Counter({1: 887, 0: 100})

0.0438 <class 'numpy.ndarray'> <class 'numpy.ndarray'> (200, 20) (200,)
Resampled dataset shape Counter({0: 100, 1: 100})

0.0061 <class 'numpy.ndarray'> <class 'numpy.ndarray'> (997, 20) (997,)
Resampled dataset shape Counter({1: 897, 0: 100})

0.0493 <class 'numpy.ndarray'> <class 'numpy.ndarray'> (596, 20) (596,)
Resampled dataset shape Counter({1: 496, 0: 100})



In [4]:
# Same imbalanced toy problem as above: 1000 samples, roughly 10% class 0 / 90% class 1.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=10,
)
print(type(X), type(y), X.shape, y.shape)
print('Original dataset shape %s' % Counter(y))

# Oversample the minority class with synthetic points (k_neighbors=5).
sm = SMOTE(random_state=42)
start = time.time()
X_res, y_res = sm.fit_resample(X, y)
print(
    round(time.time() - start, 4),
    type(X_res), type(y_res),
    X_res.shape, y_res.shape,
)
print('Resampled dataset shape %s' % Counter(y_res))

<class 'numpy.ndarray'> <class 'numpy.ndarray'> (1000, 20) (1000,)
Original dataset shape Counter({1: 900, 0: 100})
0.0027 <class 'numpy.ndarray'> <class 'numpy.ndarray'> (1800, 20) (1800,)
Resampled dataset shape Counter({0: 900, 1: 900})


In [13]:
# Mirror image of the earlier toy problem: weights=[0.9, 0.1] makes class 0 the majority.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.9, 0.1],
    class_sep=2,
    flip_y=0,
    random_state=10,
)
print(type(X), type(y), X.shape, y.shape)
print('Original dataset shape %s\n' % Counter(y)) # Original dataset shape Counter({0: 900, 1: 100})

# sampling_strategy=0.5 targets a minority/majority ratio of 0.5 after
# undersampling (200 vs 100 in the output below).
# n_init=1: a single k-means initialisation is tried.
cc = ClusterCentroids(
    sampling_strategy=0.5,
    estimator=MiniBatchKMeans(n_init=1, random_state=0),
    random_state=42,
)
start = time.time()
X_res, y_res = cc.fit_resample(X, y)
print(
    round(time.time() - start, 4),
    type(X_res), type(y_res),
    X_res.shape, y_res.shape,
)
print('Resampled dataset shape %s\n' % Counter(y_res))

<class 'numpy.ndarray'> <class 'numpy.ndarray'> (1000, 20) (1000,)
Original dataset shape Counter({0: 900, 1: 100})

0.2328 <class 'numpy.ndarray'> <class 'numpy.ndarray'> (300, 20) (300,)
Resampled dataset shape Counter({0: 200, 1: 100})

