In [None]:
!pip install torch
!pip install torchvision
!pip install numpy
!pip install sklearn

Collecting torch
  Downloading torch-1.11.0-cp310-cp310-win_amd64.whl (158.0 MB)
     -------------------------------------- 158.0/158.0 MB 3.4 MB/s eta 0:00:00
Installing collected packages: torch
Successfully installed torch-1.11.0
Collecting torchvision
  Downloading torchvision-0.12.0-cp310-cp310-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 2.7 MB/s eta 0:00:00
Installing collected packages: torchvision
Successfully installed torchvision-0.12.0
Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp310-cp310-win_amd64.whl (7.2 MB)
     ---------------------------------------- 7.2/7.2 MB 14.4 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=0.11
  Using cached joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Buil

In [None]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [None]:
# Not referenced in the cells visible here; presumably the validation
# fraction for train_test_split -- confirm against later cells.
val_size = 0.2
# Mini-batch size read by dataset_creator when it builds the DataLoaders.
batch_size = 32

In [None]:
# Images are only converted to tensors. A normalised variant,
# transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]),
# was tried and is currently disabled.
transform_MNIST = transforms.ToTensor()

# One class per digit, named by its decimal character: '0' ... '9'.
class_names_MNIST = tuple(str(digit) for digit in range(10))
nb_classes_MNIST = len(class_names_MNIST)

In [None]:
def get_labels_and_class_counts(labels_list):
    """Convert labels to a NumPy array and count each distinct label.

    Args:
        labels_list: Any sequence of labels that ``np.array`` accepts.

    Returns:
        tuple: ``(labels, class_counts)`` where ``labels`` is the input as an
        ``ndarray`` and ``class_counts`` holds the number of occurrences of
        each distinct label, ordered by sorted label value. Labels that never
        occur get no entry.
    """
    labels = np.array(labels_list)
    # np.unique returns (unique_values, counts); only the counts are needed.
    class_counts = np.unique(labels, return_counts=True)[1]
    return labels, class_counts

In [None]:
class ImbalanceGeneratorMNIST(Dataset):
    """MNIST wrapper that keeps only the first ``num_samples[c]`` images of each class ``c``.

    Attributes:
        dataset: The underlying ``torchvision.datasets.MNIST`` instance.
        train: The ``train`` flag the underlying dataset was built with.
        num_samples: Requested number of samples per class, indexed by class id.
        idxs: Indices into ``dataset`` of the retained samples, grouped by class.
        labels: Labels of the retained samples, aligned with ``idxs``.
        imbal_class_counts: Number of samples actually retained per class.
    """

    def __init__(self, num_samples, root, train, download, transform):
        """Load MNIST and immediately select the imbalanced subset.

        Args:
            num_samples: Sequence with one requested sample count per class.
            root: Directory where MNIST is stored / downloaded to.
            train: ``True`` for the training split, ``False`` for the test split.
            download: Whether torchvision may download the data if missing.
            transform: Transform applied to every image by the MNIST dataset.
        """
        # Use the transform passed by the caller instead of silently falling
        # back to a module-level default.
        self.dataset = datasets.MNIST(root=root, train=train, download=download, transform=transform)
        self.train = train
        self.num_samples = num_samples
        self.idxs = self.resample()

    def get_labels_and_class_counts(self):
        """Return ``(labels, imbal_class_counts)`` of the retained subset."""
        return self.labels, self.imbal_class_counts

    def resample(self):
        """Pick the retained sample indices and record their labels and counts.

        For every class id the first ``num_samples[c]`` occurrences (in dataset
        order) are kept; if fewer are available, all of them are kept.

        Returns:
            numpy.ndarray: Indices into ``self.dataset``, grouped by class id.

        Raises:
            ValueError: If ``num_samples`` has fewer entries than there are
                classes, or contains a negative count.
        """
        if len(self.num_samples) < nb_classes_MNIST:
            raise ValueError(
                "num_samples must provide a count for each of the "
                "%d classes, got %d" % (nb_classes_MNIST, len(self.num_samples))
            )

        # The same targets are used for the train and the test split.
        targets = np.array(self.dataset.targets)

        selected = []
        kept_counts = []
        for c in range(nb_classes_MNIST):
            class_idx = np.where(targets == c)[0]
            requested = int(self.num_samples[c])
            if requested < 0:
                raise ValueError(
                    "num_samples[%d] must be non-negative, got %d" % (c, requested)
                )
            # Record what is really kept, which may be less than requested
            # when the class does not have that many samples.
            kept = min(requested, len(class_idx))
            kept_counts.append(kept)
            selected.append(class_idx[:kept])

        idxs = np.hstack(selected)
        self.imbal_class_counts = kept_counts
        self.labels = targets[idxs]
        return idxs

    def __getitem__(self, index):
        """Return the ``(image, target)`` pair of the ``index``-th retained sample."""
        img, target = self.dataset[self.idxs[index]]
        return img, target

    def __len__(self):
        """Number of retained samples."""
        return len(self.idxs)

In [None]:
def seed_worker(worker_id):
    """Seed NumPy's and Python's global RNGs from the current torch seed.

    Meant to be passed as ``worker_init_fn`` to a ``DataLoader``.

    Args:
        worker_id: Worker index supplied by the caller; not used.
    """
    # Fold torch's initial seed into 32 bits before reseeding.
    seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(seed)

def dataset_creator(seed_number, num_samples_train, num_samples_test):
    """Build seeded train/test loaders over class-imbalanced MNIST subsets.

    Args:
        seed_number: Seed for the loaders' ``torch.Generator`` objects.
        num_samples_train: Per-class sample counts for the training split.
        num_samples_test: Per-class sample counts for the test split.

    Returns:
        tuple: ``(trainloader, testloader, trainset)``. The training loader is
        shuffled, the test loader is not. Both use the module-level
        ``batch_size``.
    """
    # Each loader gets its own generator. A DataLoader consumes random numbers
    # from its generator, so sharing one would let passes over the test set
    # shift the shuffle order of later training epochs.
    train_generator = torch.Generator()
    train_generator.manual_seed(seed_number)
    test_generator = torch.Generator()
    test_generator.manual_seed(seed_number)

    trainset = ImbalanceGeneratorMNIST(num_samples_train, root='.', train=True, download=True, transform=transform_MNIST)
    testset = ImbalanceGeneratorMNIST(num_samples_test, root='.', train=False, download=True, transform=transform_MNIST)

    # Settings shared by both loaders.
    loader_kwargs = dict(
        batch_size=batch_size,
        worker_init_fn=seed_worker,
        pin_memory=torch.cuda.is_available(),
    )
    trainloader = DataLoader(trainset, shuffle=True, generator=train_generator, **loader_kwargs)
    testloader = DataLoader(testset, shuffle=False, generator=test_generator, **loader_kwargs)

    return trainloader, testloader, trainset

In [None]:
# Setting 1 -- half-split imbalance: classes 0-4 get 10 training samples each,
# classes 5-9 get 5000 (test: 150 vs. 750).
setting1_mnist = dataset_creator(
    1,
    np.hstack(([10] * 5, [5000] * 5)),
    np.hstack(([150] * 5, [750] * 5)),
)
trainloader_mnist_1, testloader_mnist_1, trainset_mnist_1 = setting1_mnist

# Setting 2 -- multimajority: classes 0-8 get 5400 training samples each,
# class 9 gets 5 (test: 800 vs. 8).
setting2_mnist = dataset_creator(
    2,
    np.hstack(([5400] * 9, [5])),
    np.hstack(([800] * 9, [8])),
)
trainloader_mnist_2, testloader_mnist_2, trainset_mnist_2 = setting2_mnist

# Setting 3 -- multiminority: classes 0-8 get 5 training samples each,
# class 9 gets 5800 (test: 8 vs. 800).
setting3_mnist = dataset_creator(
    3,
    np.hstack(([5] * 9, [5800])),
    np.hstack(([8] * 9, [800])),
)
trainloader_mnist_3, testloader_mnist_3, trainset_mnist_3 = setting3_mnist