Aim: This notebook explores how the Hellinger distance depends on the basic parameters of `IidPartitioner`, `DirichletPartitioner`, and `ShardPartitioner` when partitioning the CIFAR10 dataset.

# Imports

In [6]:
# Load IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: re-import modules automatically before each cell is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import os
import sys

# Show the current working directory; the relative path below assumes the
# notebook is run from fl-heterogeneity/heterogeneity/notebooks.
print(os.getcwd())

# Put the directory two levels above the working directory on the import
# path so that the `heterogeneity` package can be imported.
repo_root = os.path.abspath("./../..")
sys.path.append(repo_root)

In [51]:
import itertools

import numpy as np
import pandas as pd
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import IidPartitioner, DirichletPartitioner, ShardPartitioner

from heterogeneity.metrics.hellinger_distance import compute_hellinger_distance

# CIFAR10

## IID

In [47]:
# Sample usage: partition the CIFAR10 train split with IidPartitioner and
# load every partition once.
num_partitions = 10
iid_partitioner = IidPartitioner(num_partitions=num_partitions)
cifar_iid = FederatedDataset(
    dataset="cifar10",
    partitioners={"train": iid_partitioner},
)
cifar_iid_partitions = list(map(cifar_iid.load_partition, range(num_partitions)))

# Basic statistics of the global (unpartitioned) CIFAR10 train data:
# number of samples per label, ordered by label id.
train = cifar_iid.load_split("train")
train_labels = train["label"]
label_counts = pd.Series(train_labels).value_counts()
label_counts.sort_index()

0    5000
1    5000
2    5000
3    5000
4    5000
5    5000
6    5000
7    5000
8    5000
9    5000
Name: count, dtype: int64

In [48]:
# Sweep IidPartitioner over several partition counts and record the
# Hellinger distance reported for each resulting federated dataset.
num_partitions_list = [3, 10, 30, 100, 300, 1000]

# Declared but never filled in this cell (partitions are not materialised here).
num_partitions_to_cifar_iid_partitions = {}

# Step 1: build one FederatedDataset per partition count.
num_partitions_to_cifar_iid_fds = {}
for num_partitions in num_partitions_list:
    iid_partitioner = IidPartitioner(num_partitions=num_partitions)
    cifar_iid = FederatedDataset(
        dataset="cifar10",
        partitioners={"train": iid_partitioner},
    )
    num_partitions_to_cifar_iid_fds[num_partitions] = cifar_iid

# Step 2: compute the metric on the "train" split of every dataset.
# compute_hellinger_distance is unpacked as a pair; judging by the names used
# throughout this notebook it is (per-partition values, average) - confirm
# against heterogeneity.metrics.hellinger_distance.
num_partitions_to_cifar_iid_hellinger_distance_list = {}
num_partitions_to_cifar_iid_hellinger_distance = {}
for num_partitions, cifar_iid_fds in num_partitions_to_cifar_iid_fds.items():
    distances, mean_distance = compute_hellinger_distance(cifar_iid_fds, "train")
    num_partitions_to_cifar_iid_hellinger_distance[num_partitions] = mean_distance
    num_partitions_to_cifar_iid_hellinger_distance_list[num_partitions] = distances


Unnamed: 0,0
3,0.006003
10,0.013347
30,0.021757
100,0.044432
300,0.079355
1000,0.156089


In [104]:
# Tabulate the average Hellinger distance per partition count. The index is
# named on the data itself before styling, so the Styler is only used for
# rendering (colour gradient over the single column).
iid_hellinger_frame = (
    pd.Series(num_partitions_to_cifar_iid_hellinger_distance, name="iid_helinger")
    .rename_axis("num_partitions")
    .to_frame()
)
iid_helinger_results = iid_hellinger_frame.style.background_gradient()
iid_helinger_results

Unnamed: 0_level_0,iid_helinger
num_partitions,Unnamed: 1_level_1
3,0.006003
10,0.013347
30,0.021757
100,0.044432
300,0.079355
1000,0.156089


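Is heterogeneity of this magnitude desired? When `num_partitions` is 1000, each partition holds only 50 samples (50,000 / 1,000), i.e. about 5 samples per class on average, so the per-partition label distribution can indeed be noticeably heterogeneous.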

## Dirichlet

In [108]:
# Example usage: partition the CIFAR10 train split by label with
# DirichletPartitioner and load every partition once.
num_partitions = 10
# alpha is passed as a list of ten equal concentration values
# (presumably one per CIFAR10 class - confirm against the partitioner docs).
alpha = [0.1 for _ in range(10)]
dirichlet_partitioner = DirichletPartitioner(
    num_partitions=num_partitions,
    alpha=alpha,
    partition_by="label",
)
cifar_dir = FederatedDataset(
    dataset="cifar10",
    partitioners={"train": dirichlet_partitioner},
)
cifar_dir_partitions = list(map(cifar_dir.load_partition, range(num_partitions)))

In [109]:
# Sweep DirichletPartitioner (self_balancing=False) over the grid
# num_partitions x alpha and record the Hellinger distance of each
# resulting federated dataset on the "train" split.
num_partitions_list = [3, 10, 30, 100, 300, 1000]
# Every alpha is listed once: a repeated value maps to the same
# (num_partitions, alpha) key and would only rebuild an identical entry.
alpha_list = [0.1, 0.3, 1., 3., 10., 100.]

# Declared but never filled in this cell (partitions are not materialised here).
num_partitions_to_cifar_dir_partitions = {}

# Step 1: one FederatedDataset per (num_partitions, alpha) combination.
num_partitions_to_cifar_dir_fds = {}
for num_partitions, alpha in itertools.product(num_partitions_list, alpha_list):
    dir_partitioner = DirichletPartitioner(
        num_partitions=num_partitions,
        alpha=alpha,
        partition_by="label",
        self_balancing=False,
    )
    cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train": dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha)] = cifar_dir

# Step 2: compute the metric. Some combinations cannot be partitioned (see
# the "Sampling failed" lines in the output); those are recorded as NaN so
# the result table keeps its full shape.
num_partitions_to_cifar_dir_hellinger_distance_list = {}
num_partitions_to_cifar_dir_hellinger_distance = {}
for (num_partitions, alpha), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha))
    try:
        # Use the function that is actually imported at the top of the notebook;
        # the previous name `hellinger_distance` only existed as leftover kernel state.
        hellinger_distance_list, avg_hellinger_distance = compute_hellinger_distance(cifar_dir_fds, "train")
    except Exception as err:
        # Best effort: keep going, but show *what* failed instead of hiding it
        # (a bare `except:` would also swallow NameError and KeyboardInterrupt).
        print(f"Sampling failed for {(num_partitions, alpha)}: {err!r}")
        hellinger_distance_list, avg_hellinger_distance = np.nan, np.nan
    num_partitions_to_cifar_dir_hellinger_distance_list[(num_partitions, alpha)] = hellinger_distance_list
    num_partitions_to_cifar_dir_hellinger_distance[(num_partitions, alpha)] = avg_hellinger_distance

(3, 0.1)
(3, 0.3)
(3, 1.0)
(3, 3.0)
(3, 10.0)
(3, 100.0)
(10, 0.1)
(10, 0.3)
(10, 1.0)
(10, 3.0)
(10, 10.0)
(10, 100.0)
(30, 0.1)
(30, 0.3)
(30, 1.0)
(30, 3.0)
(30, 10.0)
(30, 100.0)
(100, 0.1)




(100, 0.3)
(100, 1.0)
(100, 3.0)
(100, 10.0)
(100, 100.0)
(300, 0.1)




Sampling failed for (300, 0.1)
(300, 0.3)
(300, 1.0)
(300, 3.0)
(300, 10.0)
(300, 100.0)
(1000, 0.1)




Sampling failed for (1000, 0.1)
(1000, 0.3)




Sampling failed for (1000, 0.3)
(1000, 1.0)
(1000, 3.0)
(1000, 10.0)
(1000, 100.0)


In [110]:
# Pivot the {(num_partitions, alpha): average distance} mapping into a table
# with one row per partition count and one column per alpha.
hel_dir = (
    pd.Series(num_partitions_to_cifar_dir_hellinger_distance)
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="alpha")
)
# axis=None: the colour gradient is computed over the whole table rather
# than per row or per column.
hel_dir.style.background_gradient(axis=None)

alpha,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000
num_partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,0.465201,0.37095,0.227716,0.10901,0.071695,0.022788
10,0.613713,0.476372,0.290158,0.155027,0.09144,0.028729
30,0.651914,0.471448,0.297516,0.174453,0.097322,0.030711
100,0.667463,0.503658,0.324196,0.186368,0.103555,0.032853
300,,0.511364,0.315759,0.182981,0.101346,0.032604
1000,,,0.334892,0.191701,0.106884,0.042349


In [44]:
# Sweep DirichletPartitioner with self_balancing=True over the grid
# num_partitions x alpha and record the Hellinger distance of each
# resulting federated dataset on the "train" split.
# NOTE: this cell writes to the same result dictionaries as the
# self_balancing=False sweep, so it replaces those results.
num_partitions_list = [3, 10, 30, 100, 300, 1000]
# Every alpha is listed once: a repeated value maps to the same
# (num_partitions, alpha) key and would only rebuild an identical entry.
alpha_list = [0.1, 0.3, 1., 3., 10., 100.]

# Declared but never filled in this cell (partitions are not materialised here).
num_partitions_to_cifar_dir_partitions = {}

# Step 1: one FederatedDataset per (num_partitions, alpha) combination.
num_partitions_to_cifar_dir_fds = {}
for num_partitions, alpha in itertools.product(num_partitions_list, alpha_list):
    dir_partitioner = DirichletPartitioner(
        num_partitions=num_partitions,
        alpha=alpha,
        partition_by="label",
        self_balancing=True,
    )
    cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train": dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha)] = cifar_dir

# Step 2: compute the metric. Some combinations cannot be partitioned (see
# the "Sampling failed" lines in the output); those are recorded as NaN so
# the result table keeps its full shape.
num_partitions_to_cifar_dir_hellinger_distance_list = {}
num_partitions_to_cifar_dir_hellinger_distance = {}
for (num_partitions, alpha), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha))
    try:
        # Use the function that is actually imported at the top of the notebook;
        # the previous name `hellinger_distance` only existed as leftover kernel state.
        hellinger_distance_list, avg_hellinger_distance = compute_hellinger_distance(cifar_dir_fds, "train")
    except Exception as err:
        # Best effort: keep going, but show *what* failed instead of hiding it
        # (a bare `except:` would also swallow NameError and KeyboardInterrupt).
        print(f"Sampling failed for {(num_partitions, alpha)}: {err!r}")
        hellinger_distance_list, avg_hellinger_distance = np.nan, np.nan
    num_partitions_to_cifar_dir_hellinger_distance_list[(num_partitions, alpha)] = hellinger_distance_list
    num_partitions_to_cifar_dir_hellinger_distance[(num_partitions, alpha)] = avg_hellinger_distance

(3, 0.1)
(3, 0.3)
(3, 1.0)
(3, 3.0)
(3, 10.0)
(3, 100.0)
(10, 0.1)
(10, 0.3)
(10, 1.0)
(10, 3.0)
(10, 10.0)
(10, 100.0)
(30, 0.1)
(30, 0.3)
(30, 1.0)
(30, 3.0)
(30, 10.0)
(30, 100.0)
(100, 0.1)




(100, 0.3)
(100, 1.0)
(100, 3.0)
(100, 10.0)
(100, 100.0)
(300, 0.1)




Sampling failed for (300, 0.1)
(300, 0.3)
(300, 1.0)
(300, 3.0)
(300, 10.0)
(300, 100.0)
(1000, 0.1)




Sampling failed for (1000, 0.1)
(1000, 0.3)




Sampling failed for (1000, 0.3)
(1000, 1.0)
(1000, 3.0)
(1000, 10.0)
(1000, 100.0)


In [45]:
# Pivot the {(num_partitions, alpha): average distance} mapping produced by
# the preceding sweep into a num_partitions x alpha table. The Series is
# unstacked directly (no intermediate to_frame()), which avoids a
# meaningless extra "0" column level, and both axes are named so the table
# reads the same way as the other Dirichlet result table in this notebook.
(
    pd.Series(num_partitions_to_cifar_dir_hellinger_distance)
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="alpha")
    .style.background_gradient(axis=None)
)

Unnamed: 0_level_0,0,0,0,0,0,0
Unnamed: 0_level_1,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000
3,0.567639,0.440146,0.287541,0.161999,0.071695,0.022788
10,0.691987,0.530651,0.371551,0.223615,0.134834,0.028729
30,0.721607,0.558678,0.387039,0.238245,0.115445,0.030711
100,0.715659,0.569919,0.384234,0.240813,0.125372,0.032853
300,,0.58356,0.381126,0.238532,0.13224,0.032604
1000,,,0.396759,0.236577,0.129837,0.042934


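Enabling `self_balancing` (size balancing) produces a more heterogeneous partitioning: the average Hellinger distance is at least as high as with `self_balancing=False` for every (num_partitions, alpha) combination, with the gap vanishing at alpha = 100.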

## Shard

In [57]:
# Sweep ShardPartitioner over the grid num_partitions x num_shards_per_partition
# and record the Hellinger distance of each resulting federated dataset on
# the "train" split.
num_partitions_list = [3, 10, 30, 100, 300, 1000]
num_shards_per_partition_list = [2, 3, 4, 5]

# Step 1: one FederatedDataset per parameter combination.
# (Despite its name, this dictionary stores FederatedDataset objects.)
params_to_partitioner = {}
for num_partitions, num_shards_per_partition in itertools.product(num_partitions_list, num_shards_per_partition_list):
    partitioner = ShardPartitioner(
        num_partitions=num_partitions,
        partition_by="label",
        num_shards_per_partition=num_shards_per_partition,
    )
    fds = FederatedDataset(dataset="cifar10", partitioners={"train": partitioner})
    params_to_partitioner[(num_partitions, num_shards_per_partition)] = fds

# Step 2: compute the metric; failures are recorded as NaN so the result
# table keeps its full shape.
parameters_to_shard_cifar_fds_metric_list = {}
parameters_to_shard_cifar_fds_metric = {}
for (num_partitions, num_shards_per_partition), fds in params_to_partitioner.items():
    print((num_partitions, num_shards_per_partition))
    try:
        # Use the function that is actually imported at the top of the notebook;
        # the previous name `hellinger_distance` only existed as leftover kernel state.
        metric_list, avg_metric = compute_hellinger_distance(fds, "train")
    except Exception as err:
        # Best effort: keep going, but show *what* failed instead of hiding it
        # (a bare `except:` would also swallow NameError and KeyboardInterrupt).
        print(f"Sampling failed for {(num_partitions, num_shards_per_partition)}: {err!r}")
        metric_list, avg_metric = np.nan, np.nan
    parameters_to_shard_cifar_fds_metric_list[(num_partitions, num_shards_per_partition)] = metric_list
    parameters_to_shard_cifar_fds_metric[(num_partitions, num_shards_per_partition)] = avg_metric

(3, 2)
(3, 3)
(3, 4)
(3, 5)
(10, 2)
(10, 3)
(10, 4)
(10, 5)
(30, 2)
(30, 3)
(30, 4)
(30, 5)
(100, 2)
(100, 3)
(100, 4)
(100, 5)
(300, 2)
(300, 3)
(300, 4)
(300, 5)
(1000, 2)
(1000, 3)
(1000, 4)
(1000, 5)


In [103]:
# Pivot the {(num_partitions, num_shards_per_partition): average metric}
# mapping into a table with one row per partition count and one column per
# shard count. NOTE: despite the "emd" in the variable name, the values
# stored in parameters_to_shard_cifar_fds_metric come from the Hellinger
# distance sweep above.
shard_emd_results = (
    pd.Series(parameters_to_shard_cifar_fds_metric)
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="num_shards")
)
# axis=None: the colour gradient is computed over the whole table.
shard_emd_results.style.background_gradient(axis=None)

num_shards,2,3,4,5
num_partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,0.61413,0.506585,0.493643,0.506547
10,0.743496,0.681302,0.649515,0.597347
30,0.750597,0.692119,0.630778,0.588314
100,0.748501,0.685527,0.645899,0.605997
300,0.751315,0.689076,0.643505,0.605235
1000,0.751587,0.69273,0.64764,0.60094


In [95]:
# Inspect the column index of the shard results table.
# NOTE(review): the saved output shows a two-level MultiIndex, which does not
# match the flat "num_shards" columns built by the cell that defines
# shard_emd_results - it looks like output from an earlier run; re-execute.
shard_emd_results.columns

MultiIndex([(0, 2),
            (0, 3),
            (0, 4),
            (0, 5)],
           )

In [74]:
# The original statement was an unterminated list literal (SyntaxError).
# Presumably the goal was to drop the extra column level shown by the
# previous inspection cell - confirm. Keeping only the innermost level does
# that for a MultiIndex and leaves already-flat columns unchanged, so the
# cell is safe to re-run.
shard_emd_results.columns = shard_emd_results.columns.get_level_values(-1)

In [75]:
# Display the shard results.
# NOTE(review): the saved output is in long (one row per parameter pair)
# form, which suggests it was produced by an earlier version of
# shard_emd_results - re-execute to refresh.
shard_emd_results

Unnamed: 0,Unnamed: 1,Unnamed: 2
3,2,0.61413
3,3,0.506585
3,4,0.493643
3,5,0.506547
10,2,0.743496
10,3,0.681302
10,4,0.649515
10,5,0.597347
30,2,0.750597
30,3,0.692119
