In [1]:
# Because the dataset loads a chunk of data for us rather than a tuple,
# we need to import DataLoader and Dataset from torch.utils.data

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import numpy as np

In [2]:
# Now we will create an IID function
# which takes our dataset and num_users and calculates num_images,
# the number of images each participating client will have.
# The function handles one client at a time and puts
# a random selection of indices into that client's own collection

def mnistIID(dataset, num_users):
    """Split ``dataset`` into equally sized, disjoint, random index sets (IID).

    Args:
        dataset: Any object supporting ``len()``; only its length is used.
        num_users: Number of participating clients.

    Returns:
        dict mapping client id (``0 .. num_users - 1``) to a ``set`` of sample
        indices. Every client receives ``len(dataset) // num_users`` indices
        and no index is given to more than one client.

    Raises:
        ZeroDivisionError: If ``num_users`` is 0.
    """
    num_images = len(dataset) // num_users  # images handed to each participating client
    users_dict, indices = {}, list(range(len(dataset)))
    for i in range(num_users):
        # A private generator seeded with the client id makes exactly the same
        # draws as ``np.random.seed(i)`` followed by ``np.random.choice``, so the
        # split stays reproducible, but NumPy's global random state is left
        # untouched for whatever runs after this function.
        rng = np.random.RandomState(i)
        # replace=False: an index must not be drawn twice for the same client.
        users_dict[i] = set(rng.choice(indices, num_images, replace=False))
        # Remove the indices just handed out so the next client cannot get them.
        indices = list(set(indices) - users_dict[i])
    return users_dict

In [3]:
# Now we will create a non-IID function
# Note- we have decided to give equally 2 classes each time for each client
# First we need to select the number of classes
# And based on this number, we are going to distribute
# the dataset and the number of images we want to give
# each client from each class
# then we need to create classes indices list
# after that, we need to create the indices
# after that, we need to get the labels from the dataset;
# these labels are unsorted, so we need to sort
# them later
# then, we need to create a list of
# indices with unsorted labels
# then we need to sort the new unsorted labels
# then we need to update the indices list for current sorted
# then, we need to iterate over all the clients and 
# each client is going to select random indices from
# the classes, and for that we will use 
# numpy.random.choice
# and once a client's random number is chosen
# we need to remove those indices from classes_indx list
# and then, we need to update our users_dict with the images
# 

def mnistNonIID(dataset, num_users, num_shards=100, shards_per_user=2):
    """Split ``dataset`` into label-sorted shards and give each client a few of them (non-IID).

    The sample indices are sorted by label and cut into ``num_shards``
    contiguous shards of equal size, so a shard mostly contains a single
    class. Every client then receives ``shards_per_user`` randomly chosen
    shards, and a shard is never given to two clients.

    Args:
        dataset: Object exposing its labels as ``targets`` (or the legacy
            ``train_labels``), either as a tensor or as an array-like.
        num_users: Number of participating clients.
        num_shards: Number of shards the sorted dataset is cut into.
        shards_per_user: Number of shards given to each client.

    Returns:
        dict mapping client id to a 1-D integer NumPy array of sample indices.

    Raises:
        ValueError: If ``shards_per_user`` is smaller than 1, if the clients
            need more shards than exist, or if the dataset has fewer samples
            than ``num_shards``.
    """
    # ``targets`` is the current attribute name; ``train_labels`` is the older one.
    raw_labels = getattr(dataset, "targets", None)
    if raw_labels is None:
        raw_labels = dataset.train_labels
    labels = np.asarray(raw_labels)

    if shards_per_user < 1:
        raise ValueError("shards_per_user must be at least 1")
    if num_users * shards_per_user > num_shards:
        raise ValueError(
            "num_users * shards_per_user (%d) exceeds num_shards (%d)"
            % (num_users * shards_per_user, num_shards)
        )
    # Derive the shard size from the data instead of assuming 60000 samples,
    # so the same function also works for the smaller test split.
    shard_size = len(labels) // num_shards
    if shard_size == 0:
        raise ValueError("dataset has fewer samples than num_shards")

    # Sample indices ordered by label: position p holds the index of the
    # p-th sample once the dataset is sorted by class.
    sorted_indices = np.argsort(labels, kind="stable")

    free_shards = list(range(num_shards))
    users_dict = {}
    for user in range(num_users):
        picked = np.random.choice(free_shards, shards_per_user, replace=False)
        taken = set(picked.tolist())
        # Drop the shards just handed out so no other client can receive them.
        free_shards = [shard for shard in free_shards if shard not in taken]
        users_dict[user] = np.concatenate(
            [sorted_indices[shard * shard_size:(shard + 1) * shard_size] for shard in picked]
        )
    return users_dict

In [4]:
# Now we will create a non-IIDUnequal function
# Note- This function will give unequal number of classes to each clients
# First we need to select the number of classes
# And based on this number, we are going to distribute
# the dataset and the number of images we want to give
# each client from each class
# then we need to create classes indices list
# after that, we need to create the indices
# after that, we need to get the labels from the dataset;
# these labels are unsorted, so we need to sort
# them later
# then, we need to create a list of
# indices with unsorted labels
# then we need to sort the new unsorted labels
# then we need to update the indices list for current sorted
# then, we need to iterate over all the clients and 
# each client is going to select random indices from
# the classes, and for that we will use 
# numpy.random.choice
# and once a client's random number is chosen
# we need to remove those indices from classes_indx list
# and then, we need to update our users_dict with the images
# 

def mnistNonIIDUnequal(dataset, num_users, num_shards=1200, min_shards=1, max_shards=30):
    """Split ``dataset`` into label-sorted shards and give each client an unequal number of them.

    The sample indices are sorted by label and cut into ``num_shards``
    contiguous shards of equal size. A random quota between ``min_shards``
    and ``max_shards`` is drawn per client and rescaled so the quotas add up
    to roughly ``num_shards``. Shards are then handed out at random without
    reuse; all ``num_shards`` shards end up assigned.

    Args:
        dataset: Object exposing its labels as ``targets`` (or the legacy
            ``train_labels``), either as a tensor or as an array-like.
        num_users: Number of participating clients.
        num_shards: Number of shards the sorted dataset is cut into.
        min_shards: Lower bound of the random per-client quota (before rescaling).
        max_shards: Upper bound of the random per-client quota (before rescaling).

    Returns:
        dict mapping client id to a 1-D integer NumPy array of sample indices.

    Raises:
        ValueError: If ``num_users`` is not in ``1 .. num_shards``, if the
            quota bounds are not ``1 <= min_shards <= max_shards``, or if the
            dataset has fewer samples than ``num_shards``.
    """
    # ``targets`` is the current attribute name; ``train_labels`` is the older one.
    raw_labels = getattr(dataset, "targets", None)
    if raw_labels is None:
        raw_labels = dataset.train_labels
    labels = np.asarray(raw_labels)

    if not 1 <= num_users <= num_shards:
        raise ValueError("num_users must be between 1 and num_shards")
    if not 1 <= min_shards <= max_shards:
        raise ValueError("expected 1 <= min_shards <= max_shards")
    # Derive the shard size from the data instead of assuming 60000 samples,
    # so the same function also works for the smaller test split.
    shard_size = len(labels) // num_shards
    if shard_size == 0:
        raise ValueError("dataset has fewer samples than num_shards")

    # Sample indices ordered by label.
    sorted_indices = np.argsort(labels, kind="stable")

    free_shards = list(range(num_shards))
    users_dict = {user: sorted_indices[:0] for user in range(num_users)}

    def give(user, count):
        # Move up to `count` random unassigned shards to `user`.
        nonlocal free_shards
        count = max(0, min(int(count), len(free_shards)))
        if count == 0:
            return
        picked = np.random.choice(free_shards, count, replace=False)
        taken = set(picked.tolist())
        free_shards = [shard for shard in free_shards if shard not in taken]
        chunks = [users_dict[user]]
        chunks.extend(
            sorted_indices[shard * shard_size:(shard + 1) * shard_size] for shard in picked
        )
        users_dict[user] = np.concatenate(chunks)

    # Random quota per client, rescaled so the quotas sum to about num_shards.
    quotas = np.random.randint(min_shards, max_shards + 1, size=num_users)
    quotas = np.around(quotas / quotas.sum() * num_shards).astype(int)

    if quotas.sum() > num_shards:
        # Rounding asked for more shards than exist: first guarantee one shard
        # per client, then hand out the rest until the pool is exhausted.
        for user in range(num_users):
            give(user, 1)
        quotas = quotas - 1  # may reach -1 for a zero quota; give() clamps it to 0
        for user in range(num_users):
            give(user, quotas[user])
    else:
        for user in range(num_users):
            give(user, quotas[user])
        if free_shards:
            # Leftover shards go to the client that currently holds the fewest samples.
            smallest = min(users_dict, key=lambda user: len(users_dict[user]))
            give(smallest, len(free_shards))

    return users_dict

In [5]:
# first we need to load the image data
# so first we will create a transform
# then we will create the training dataset
# then, test dataset --Note: for test dataset we have to put train=False in the argument, so that pytorch will know that it will be used for test dataset
#

def load_dataset(num_users, iidtype):
    """Download MNIST and partition both splits among ``num_users`` clients.

    Args:
        num_users: Number of participating clients.
        iidtype: ``'iid'`` selects ``mnistIID``, ``'noniid'`` selects
            ``mnistNonIID``; any other value selects ``mnistNonIIDUnequal``.

    Returns:
        Tuple ``(train_dataset, test_dataset, train_group, test_group)`` where
        the two groups are the per-client index collections produced by the
        selected partition function for the train and test split respectively.
    """
    # Tensor conversion followed by normalisation; these mean/std constants
    # are specific to the MNIST dataset.
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize((0.1307,), (0.3081,))
    transform = transforms.Compose([to_tensor, normalize])

    # train=True fetches the training split, train=False the test split.
    train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
    test_dataset = datasets.MNIST('./data', train=False, download=True, transform=transform)

    # Choose the partition strategy once and apply it to both splits.
    if iidtype == 'iid':
        partition = mnistIID
    elif iidtype == 'noniid':
        partition = mnistNonIID
    else:
        partition = mnistNonIIDUnequal

    train_group = partition(train_dataset, num_users)
    test_group = partition(test_dataset, num_users)

    return train_dataset, test_dataset, train_group, test_group

In [6]:
# since we need to pass a list of numbers and we 
# want to fetch the actual images and targets
# we need to create a class so it inherits all the features
# and all the functions from the dataset

class FedDataset(Dataset):
    """View of ``dataset`` restricted to the sample indices owned by one client.

    Args:
        dataset: Indexable dataset whose items are ``(image, label)`` pairs.
        indx: Iterable of sample indices belonging to the client.
    """

    def __init__(self, dataset, indx):
        self.dataset = dataset
        # Indices may arrive as a set or as a NumPy array of numeric values;
        # normalise them to a list of plain Python ints.
        self.indx = [int(i) for i in indx]

    def __len__(self):
        # Report the size of this client's share, not of the whole dataset.
        return len(self.indx)

    @staticmethod
    def _as_tensor_copy(value):
        """Return ``value`` as a detached tensor that does not share storage with it."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, item):
        # Translate the client-local position into an index of the wrapped dataset.
        images, label = self.dataset[self.indx[item]]
        return self._as_tensor_copy(images), self._as_tensor_copy(label)

In [7]:
# this function takes a list of indices and returns a DataLoader over the actual images and targets at those indices
# the dataset passed to DataLoader has to be an actual dataset, like the train_dataset or test_dataset we created

def getActualImages(dataset, indices, batch_size):
    """Build a shuffling DataLoader over the samples of ``dataset`` selected by ``indices``.

    Args:
        dataset: The underlying dataset to read images and labels from.
        indices: Sample indices belonging to one client.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` wrapping a ``FedDataset`` restricted to ``indices``,
        with shuffling enabled.
    """
    client_subset = FedDataset(dataset, indices)
    loader = DataLoader(client_subset, batch_size=batch_size, shuffle=True)
    return loader