## IID (Independent and Identically Distributed)

In [1]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
import numpy as np

In [2]:
# ToTensor is the only preprocessing step applied to each image.
transform = transforms.Compose(
    [transforms.ToTensor()]
)

# MNIST training split, stored under ./data and requested with download=True.
mnist_train = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

In [3]:
from collections import defaultdict, Counter
from torchvision import datasets, transforms
from torch.utils.data import Subset

# Load MNIST training data (same configuration as the earlier cell):
# a single ToTensor step, training split, stored under ./data.
transform = transforms.Compose(
    [transforms.ToTensor()]
)
mnist_train = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Parameters
num_clients = 10
total_samples = 6000
seed = 42  # fixed so the split below is identical on every Restart & Run All

# Local generator: reproducible draws without touching NumPy's global RNG state
rng = np.random.default_rng(seed)

# Extract labels
labels = np.array(mnist_train.targets)
if total_samples > len(labels):
    # Slicing would otherwise silently return fewer samples than requested
    raise ValueError(
        f"total_samples={total_samples} exceeds dataset size {len(labels)}"
    )

# Step 1: draw `total_samples` indices uniformly without replacement from the
# whole dataset, so the subset follows the global class distribution
all_indices = rng.permutation(len(labels))
selected_indices = all_indices[:total_samples]
selected_labels = labels[selected_indices]

# Step 2: randomly assign samples to clients (with varying sizes)
# Dirichlet(1, ..., 1) gives random proportions that sum to 1
proportions = rng.dirichlet(np.ones(num_clients))
client_sizes = (proportions * total_samples).astype(int)

# astype(int) truncates, so the sizes can sum to less than total_samples;
# hand the leftover samples to the largest client
diff = total_samples - np.sum(client_sizes)
client_sizes[np.argmax(client_sizes)] += diff
assert client_sizes.sum() == total_samples

# Cut selected_indices into consecutive chunks of the chosen sizes.
# Note: a client's size can be very small (even 0) for an unlucky draw.
split_points = np.cumsum(client_sizes)[:-1]
client_indices = defaultdict(list)
for client_id, chunk in enumerate(np.split(selected_indices, split_points)):
    client_indices[client_id] = chunk.tolist()

# Create Subset datasets for each client
client_datasets = [Subset(mnist_train, client_indices[i]) for i in range(num_clients)]

# Debug: show class distribution per client
# Convert the targets to one NumPy array up front so each client's labels are
# fetched with a single indexed lookup instead of one tensor .item() per sample.
all_targets = np.asarray(mnist_train.targets)
for i, dataset in enumerate(client_datasets):
    # Use a distinct name so the module-level `labels` array is not overwritten
    client_labels = all_targets[dataset.indices].tolist()
    label_count = dict(sorted(Counter(client_labels).items()))
    print(f"Client {i} class distribution: {label_count} (Total: {len(dataset)})")

Client 0 class distribution: {0: 81, 1: 104, 2: 95, 3: 80, 4: 88, 5: 82, 6: 98, 7: 105, 8: 80, 9: 96} (Total: 909)
Client 1 class distribution: {0: 150, 1: 175, 2: 138, 3: 135, 4: 132, 5: 120, 6: 148, 7: 166, 8: 133, 9: 150} (Total: 1447)
Client 2 class distribution: {0: 39, 1: 36, 2: 45, 3: 33, 4: 54, 5: 36, 6: 35, 7: 30, 8: 45, 9: 40} (Total: 393)
Client 3 class distribution: {0: 61, 1: 84, 2: 80, 3: 77, 4: 81, 5: 81, 6: 82, 7: 68, 8: 70, 9: 73} (Total: 757)
Client 4 class distribution: {0: 54, 1: 79, 2: 62, 3: 47, 4: 49, 5: 59, 6: 53, 7: 55, 8: 56, 9: 56} (Total: 570)
Client 5 class distribution: {0: 12, 1: 26, 2: 33, 3: 29, 4: 17, 5: 22, 6: 17, 7: 23, 8: 22, 9: 15} (Total: 216)
Client 6 class distribution: {1: 2, 2: 2, 3: 1, 4: 1, 5: 1, 8: 2} (Total: 9)
Client 7 class distribution: {0: 13, 1: 25, 2: 28, 3: 18, 4: 24, 5: 22, 6: 16, 7: 19, 8: 17, 9: 18} (Total: 200)
Client 8 class distribution: {0: 21, 1: 24, 2: 31, 3: 30, 4: 23, 5: 16, 6: 25, 7: 28, 8: 24, 9: 22} (Total: 244)
Client

## Non-IID (Non-Independent and Identically Distributed)

In [4]:
def create_noniid_shards(dataset, num_clients=10, shards_per_client=2, seed=None):
    """Split ``dataset`` into label-skewed (non-IID) client subsets.

    Sample indices are sorted by label and cut into
    ``num_clients * shards_per_client`` contiguous, equally sized shards.
    The shards are shuffled and each client receives ``shards_per_client``
    of them, so a client only sees samples from a few labels.

    Args:
        dataset: Object exposing ``targets`` (one label per sample) and
            supporting ``len()``.
        num_clients: Number of client subsets to create.
        shards_per_client: Number of shards handed to each client.
        seed: Optional seed for the shard shuffle. With ``None`` (the
            default) NumPy's global random state is used.

    Returns:
        A list of ``num_clients`` ``Subset`` objects, in client-id order.

    Raises:
        ValueError: If ``num_clients`` or ``shards_per_client`` is smaller
            than 1, or if there are more shards than samples.

    Note:
        When ``len(dataset)`` is not divisible by the number of shards, the
        leftover samples at the end of the label-sorted order are not
        assigned to any client.
    """
    if num_clients < 1 or shards_per_client < 1:
        raise ValueError("num_clients and shards_per_client must both be >= 1")

    labels = np.array(dataset.targets)

    # Sort sample indices by label
    sorted_indices = np.arange(len(labels))[np.argsort(labels)]

    # Create num_clients * shards_per_client equally sized shards
    # (20 shards of 3000 samples for MNIST with the default arguments)
    num_shards = num_clients * shards_per_client
    shard_size = len(dataset) // num_shards
    if shard_size == 0:
        # Without this check every client would silently get an empty subset
        raise ValueError(
            f"Cannot cut {len(dataset)} samples into {num_shards} non-empty shards"
        )
    shards = [
        sorted_indices[i * shard_size:(i + 1) * shard_size]
        for i in range(num_shards)
    ]

    # Shuffle the shard order; a seed makes the assignment reproducible,
    # otherwise fall back to the global NumPy random state
    rng = np.random if seed is None else np.random.default_rng(seed)
    rng.shuffle(shards)

    # Deal consecutive runs of shuffled shards to the clients
    client_datasets = []
    for client_id in range(num_clients):
        first = client_id * shards_per_client
        own_shards = shards[first:first + shards_per_client]
        client_datasets.append(Subset(dataset, np.concatenate(own_shards).tolist()))
    return client_datasets

In [5]:
# Load the MNIST training split with ToTensor as the only preprocessing step.
transform = transforms.Compose(
    [transforms.ToTensor()]
)
mnist_train = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Partition with the function's default client and shard counts.
client_datasets = create_noniid_shards(mnist_train)

In [6]:
from collections import Counter

# Convert the targets to one NumPy array up front so each client's labels are
# fetched with a single indexed lookup instead of one tensor .item() per sample.
all_targets = np.asarray(mnist_train.targets)

for i, client_data in enumerate(client_datasets):
    client_labels = all_targets[client_data.indices].tolist()
    dist = Counter(client_labels)
    # Print labels in ascending order, matching the IID report above
    print(f"Client {i} label distribution: {dict(sorted(dist.items()))}")

Client 0 label distribution: {0: 2923, 1: 742, 2: 2335}
Client 1 label distribution: {2: 623, 3: 2377, 7: 3000}
Client 2 label distribution: {1: 3000, 3: 3000}
Client 3 label distribution: {5: 3017, 6: 2983}
Client 4 label distribution: {7: 3000, 2: 3000}
Client 5 label distribution: {4: 3000, 9: 3000}
Client 6 label distribution: {7: 200, 8: 2851, 9: 2949}
Client 7 label distribution: {0: 3000, 6: 2935, 7: 65}
Client 8 label distribution: {4: 2842, 5: 2404, 3: 754}
Client 9 label distribution: {8: 3000, 1: 3000}
