## IID (Independent and Identically Distributed)

In [1]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
import numpy as np

In [2]:
# Preprocessing pipeline: the only step is converting each image to a tensor.
transform = transforms.Compose([
    transforms.ToTensor(),
])

# MNIST training split, stored under ./data and downloaded if not already there.
mnist_train = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

In [3]:
from collections import defaultdict, Counter
from torchvision import datasets, transforms
from torch.utils.data import Subset

# Load MNIST training data
transform = transforms.Compose([transforms.ToTensor()])
mnist_train = datasets.MNIST(root='./data', train=True, download=True, transform=transform)

# Parameters
num_clients = 10
total_samples = 6000
iid_seed = 42  # fixed so that Restart & Run All reproduces the same partition

# Private generator: makes this cell reproducible without touching NumPy's
# global random state (which other cells may rely on).
iid_rng = np.random.RandomState(iid_seed)

# Extract labels (one entry per training sample)
labels = np.asarray(mnist_train.targets)

# Step 1: Draw `total_samples` indices uniformly at random from the whole
# dataset, so the subset's class mix approximates the global distribution.
all_indices = np.arange(len(labels))
iid_rng.shuffle(all_indices)

selected_indices = all_indices[:total_samples]
selected_labels = labels[selected_indices]

# Step 2: Randomly assign samples to clients (with varying sizes)
client_indices = defaultdict(list)

# Random proportions per client that sum to 1. A flat Dirichlet can give a
# client a very small share, so some clients may end up with only a few samples.
proportions = iid_rng.dirichlet(np.ones(num_clients), size=1)[0]
client_sizes = (proportions * total_samples).astype(int)

# astype(int) truncates, so the sizes can sum to less than total_samples;
# hand the shortfall to the largest client so every selected sample is used.
diff = total_samples - np.sum(client_sizes)
client_sizes[np.argmax(client_sizes)] += diff

# Cut selected_indices into consecutive, non-overlapping slices of those sizes
start = 0
for client_id, size in enumerate(client_sizes):
    client_indices[client_id] = selected_indices[start:start + size].tolist()
    start += size

# Sanity check: the split must account for every selected sample exactly once
assert sum(len(idxs) for idxs in client_indices.values()) == len(selected_indices)

# Create Subset datasets for each client
client_datasets = [Subset(mnist_train, client_indices[i]) for i in range(num_clients)]

# Debug: show class distribution per client.
# Uses a separate name so the full `labels` array is not overwritten, and a
# single vectorised lookup instead of one tensor access per sample.
for i, dataset in enumerate(client_datasets):
    client_labels = labels[dataset.indices].tolist()
    label_count = dict(sorted(Counter(client_labels).items()))
    print(f"Client {i} class distribution: {label_count} (Total: {len(dataset)})")

Client 0 class distribution: {0: 81, 1: 104, 2: 95, 3: 80, 4: 88, 5: 82, 6: 98, 7: 105, 8: 80, 9: 96} (Total: 909)
Client 1 class distribution: {0: 150, 1: 175, 2: 138, 3: 135, 4: 132, 5: 120, 6: 148, 7: 166, 8: 133, 9: 150} (Total: 1447)
Client 2 class distribution: {0: 39, 1: 36, 2: 45, 3: 33, 4: 54, 5: 36, 6: 35, 7: 30, 8: 45, 9: 40} (Total: 393)
Client 3 class distribution: {0: 61, 1: 84, 2: 80, 3: 77, 4: 81, 5: 81, 6: 82, 7: 68, 8: 70, 9: 73} (Total: 757)
Client 4 class distribution: {0: 54, 1: 79, 2: 62, 3: 47, 4: 49, 5: 59, 6: 53, 7: 55, 8: 56, 9: 56} (Total: 570)
Client 5 class distribution: {0: 12, 1: 26, 2: 33, 3: 29, 4: 17, 5: 22, 6: 17, 7: 23, 8: 22, 9: 15} (Total: 216)
Client 6 class distribution: {1: 2, 2: 2, 3: 1, 4: 1, 5: 1, 8: 2} (Total: 9)
Client 7 class distribution: {0: 13, 1: 25, 2: 28, 3: 18, 4: 24, 5: 22, 6: 16, 7: 19, 8: 17, 9: 18} (Total: 200)
Client 8 class distribution: {0: 21, 1: 24, 2: 31, 3: 30, 4: 23, 5: 16, 6: 25, 7: 28, 8: 24, 9: 22} (Total: 244)
Client

## Non-IID (Non-Independent and Identically Distributed)

In [7]:
def create_noniid_shards(dataset, num_clients=10, shards_per_client=2, total_samples=6000,
                         seed=None, verbose=True):
    """Split a random subset of ``dataset`` into label-skewed (non-IID) client datasets.

    A random subset of ``total_samples`` samples is sorted by label and cut into
    ``num_clients * shards_per_client`` equally sized, contiguous shards. The
    shards are shuffled and each client receives ``shards_per_client`` of them,
    so a client only sees the few labels its shards happen to cover.

    Args:
        dataset: Dataset exposing a ``targets`` attribute (tensor, array or
            list of integer labels) with one entry per sample.
        num_clients: Number of client datasets to create.
        shards_per_client: Number of shards given to each client.
        total_samples: Number of samples to draw from ``dataset`` before
            sharding. If it is not a multiple of the shard count, the
            ``total_samples % (num_clients * shards_per_client)`` samples at
            the end of the label-sorted order are left unassigned.
        seed: If ``None`` (default), randomness comes from NumPy's global
            ``np.random`` state, exactly as before. Otherwise a private
            ``np.random.RandomState(seed)`` is used, making the partition
            reproducible without touching global state.
        verbose: If true (default), print each client's class distribution.

    Returns:
        A list of ``num_clients`` ``Subset`` objects over ``dataset``.

    Raises:
        ValueError: If ``num_clients`` or ``shards_per_client`` is not
            positive, if ``total_samples`` exceeds the dataset size, or if
            there are fewer samples than shards (which would create empty
            clients).
    """
    if num_clients <= 0 or shards_per_client <= 0:
        raise ValueError("num_clients and shards_per_client must be positive")

    labels = np.asarray(dataset.targets)
    num_shards = num_clients * shards_per_client
    if total_samples > len(labels):
        raise ValueError(
            f"total_samples ({total_samples}) exceeds dataset size ({len(labels)})"
        )
    shard_size = total_samples // num_shards
    if shard_size == 0:
        raise ValueError(
            f"total_samples ({total_samples}) is smaller than the number of shards ({num_shards})"
        )

    # The np.random module and a RandomState instance expose the same
    # permutation/shuffle API, so either can serve as the source of randomness.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Step 1: Draw a uniform random subset, so its label mix approximates the
    # dataset's overall label proportions.
    selected_indices = rng.permutation(len(labels))[:total_samples]
    selected_labels = labels[selected_indices]

    # Step 2: Sort selected samples by label
    sorted_indices = selected_indices[np.argsort(selected_labels)]

    # Step 3: Cut the label-sorted order into contiguous, equally sized shards
    shards = [sorted_indices[i * shard_size:(i + 1) * shard_size] for i in range(num_shards)]

    # Step 4: Shuffle the shards and deal `shards_per_client` consecutive ones
    # to each client. Indices are stored as plain Python ints.
    rng.shuffle(shards)
    client_indices = [
        np.concatenate(shards[c * shards_per_client:(c + 1) * shards_per_client]).tolist()
        for c in range(num_clients)
    ]

    # Step 5: Create Subset datasets
    client_datasets = [Subset(dataset, indices) for indices in client_indices]

    # Optional: show label distributions. Labels are read from the array built
    # above, so this also works when `dataset.targets` is a plain list.
    if verbose:
        for client_id, indices in enumerate(client_indices):
            dist = dict(sorted(Counter(labels[indices].tolist()).items()))
            print(f"Client {client_id} class distribution: {dist} (Total: {len(indices)})")

    return client_datasets

In [9]:
# Load the MNIST training split, converting each image to a tensor.
transform = transforms.Compose([
    transforms.ToTensor(),
])
mnist_train = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# 10 clients x 6 shards each = 60 shards over 6000 samples (100 samples per shard).
non_iid_clients = create_noniid_shards(
    mnist_train,
    num_clients=10,
    shards_per_client=6,
    total_samples=6000,
)

Client 0 class distribution: {2: 200, 3: 100, 5: 100, 6: 100, 7: 100} (Total: 600)
Client 1 class distribution: {0: 80, 1: 20, 2: 200, 4: 200, 5: 12, 6: 88} (Total: 600)
Client 2 class distribution: {1: 47, 2: 53, 4: 100, 5: 100, 8: 126, 9: 174} (Total: 600)
Client 3 class distribution: {1: 200, 4: 100, 6: 100, 7: 200} (Total: 600)
Client 4 class distribution: {6: 200, 8: 300, 9: 100} (Total: 600)
Client 5 class distribution: {0: 200, 4: 76, 5: 124, 6: 100, 7: 100} (Total: 600)
Client 6 class distribution: {0: 100, 2: 144, 3: 156, 5: 100, 6: 9, 7: 91} (Total: 600)
Client 7 class distribution: {0: 100, 1: 200, 7: 151, 8: 149} (Total: 600)
Client 8 class distribution: {3: 161, 4: 139, 5: 100, 9: 200} (Total: 600)
Client 9 class distribution: {0: 100, 1: 200, 3: 200, 9: 100} (Total: 600)
