Aim: This notebook explores how the Hellinger distance depends on the basic parameters of the IidPartitioner and DirichletPartitioner (and, further below, the ShardPartitioner and InnerDirichletPartitioner) when partitioning the CIFAR10 dataset.

# Imports

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os
# Show the working directory: the relative path below only resolves as intended
# when the notebook is run from fl-heterogeneity/heterogeneity/notebooks.
print(os.getcwd())
# Put the directory two levels up on the import path so that the
# `heterogeneity` package can be imported by the following cells.
repo_root = os.path.abspath("./../..")
sys.path.append(repo_root)

In [4]:
import itertools

import numpy as np
import pandas as pd
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import IidPartitioner, DirichletPartitioner, ShardPartitioner, InnerDirichletPartitioner

from heterogeneity.metrics.hellinger_distance import compute_hellinger_distance
from heterogeneity.utils import create_lognormal_partition_sizes

# CIFAR10

## IID

In [47]:
# Sample usage
num_partitions = 10
iid_partitioner = IidPartitioner(num_partitions=num_partitions)
cifar_iid = FederatedDataset(dataset="cifar10", partitioners={"train" : iid_partitioner})
cifar_iid_partitions = [cifar_iid.load_partition(i) for i in range(num_partitions)]
# Basic statistics of the global train CIFAR10 data
train = cifar_iid.load_split("train")
train_labels = train["label"]
pd.Series(train_labels).value_counts().sort_index()

In [48]:
# Partition counts swept in this experiment.
num_partitions_list = [3, 10, 30, 100, 300, 1000]

# Build one IID-partitioned CIFAR10 FederatedDataset per partition count.
# The partitions themselves are not loaded in this cell, so
# `num_partitions_to_cifar_iid_partitions` stays empty.
num_partitions_to_cifar_iid_partitions = {}
num_partitions_to_cifar_iid_fds = {}
for num_partitions in num_partitions_list:
    iid_partitioner = IidPartitioner(num_partitions=num_partitions)
    cifar_iid = FederatedDataset(
        dataset="cifar10",
        partitioners={"train": iid_partitioner},
    )
    num_partitions_to_cifar_iid_fds[num_partitions] = cifar_iid

# Evaluate the metric on the "train" partitioner of every dataset. The returned
# pair is stored as (list of distances, average distance), keyed by partition count.
num_partitions_to_cifar_iid_hellinger_distance = {}
num_partitions_to_cifar_iid_hellinger_distance_list = {}
for num_partitions, cifar_iid_fds in num_partitions_to_cifar_iid_fds.items():
    train_partitioner = cifar_iid_fds.partitioners["train"]
    hellinger_distance_list, avg_hellinger_distance = compute_hellinger_distance(train_partitioner)
    num_partitions_to_cifar_iid_hellinger_distance_list[num_partitions] = hellinger_distance_list
    num_partitions_to_cifar_iid_hellinger_distance[num_partitions] = avg_hellinger_distance


In [104]:
# One-column table of the average Hellinger distance per partition count.
# The index is named before the frame is wrapped in a Styler, so the header
# is set on the data itself rather than through the Styler object.
iid_helinger_results = (
    pd.Series(num_partitions_to_cifar_iid_hellinger_distance, name="iid_helinger")
    .rename_axis("num_partitions")
    .to_frame()
    .style.background_gradient()
)
iid_helinger_results

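Is heterogeneity of this degree desired? When num_partitions is 1000, each partition holds only 50 samples (50,000 / 1,000), i.e. about 5 samples per class on average. Partitions that small can indeed end up heterogeneous even under IID sampling.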

## Dirichlet

In [108]:
# Example usage
num_partitions = 10
alpha = [0.1] * 10
dirichlet_partitioner = DirichletPartitioner(num_partitions=num_partitions, alpha=alpha, partition_by="label")
cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train" : dirichlet_partitioner})
cifar_dir_partitions = [cifar_dir.load_partition(i) for i in range(num_partitions)]

In [109]:
# Parameter grid: every (num_partitions, alpha) combination is evaluated.
num_partitions_list = [3, 10, 30, 100, 300, 1000]
# The largest concentration is 1000. (not a second 100., which would only
# recompute and overwrite the same grid point); this matches the alpha columns
# of the recorded result tables further down in this notebook.
alpha_list = [0.1, 0.3, 1., 3., 10., 100., 1000.]

# Build one Dirichlet-partitioned CIFAR10 FederatedDataset per grid point
# (size balancing disabled). Partitions are not loaded here, so
# `num_partitions_to_cifar_dir_partitions` stays empty.
num_partitions_to_cifar_dir_partitions = {}
num_partitions_to_cifar_dir_fds = {}
for num_partitions, alpha in itertools.product(num_partitions_list, alpha_list):
    dir_partitioner = DirichletPartitioner(
        num_partitions=num_partitions,
        alpha=alpha,
        partition_by="label",
        self_balancing=False,
    )
    cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train": dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha)] = cifar_dir

# Evaluate the metric for every grid point. The returned pair is stored as
# (list of distances, average distance), keyed by (num_partitions, alpha).
num_partitions_to_cifar_dir_hellinger_distance_list = {}
num_partitions_to_cifar_dir_hellinger_distance = {}
for (num_partitions, alpha), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha))
    try:
        hellinger_distance_list, avg_hellinger_distance = compute_hellinger_distance(cifar_dir_fds.partitioners["train"])
    except Exception as exc:
        # Best effort: a failing grid point is recorded as NaN so the sweep can
        # continue. `Exception` (rather than a bare `except`) keeps the kernel
        # interruptible, and the reason is printed instead of being hidden.
        print(f"Sampling failed for {(num_partitions, alpha)}: {exc!r}")
        hellinger_distance_list, avg_hellinger_distance = np.nan, np.nan
    num_partitions_to_cifar_dir_hellinger_distance_list[(num_partitions, alpha)] = hellinger_distance_list
    num_partitions_to_cifar_dir_hellinger_distance[(num_partitions, alpha)] = avg_hellinger_distance

In [110]:
# Pivot the (num_partitions, alpha) -> average distance mapping into a table
# with partition counts as rows and alpha values as columns.
hel_dir = (
    pd.Series(num_partitions_to_cifar_dir_hellinger_distance)
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="alpha")
)
# axis=None: the colour gradient is computed over the whole table at once.
hel_dir.style.background_gradient(axis=None)

In [44]:
# Same sweep as the plain Dirichlet experiment, but with size balancing enabled.
# NOTE: this cell reuses (and therefore overwrites) the dictionaries filled by
# the self_balancing=False sweep.
num_partitions_list = [3, 10, 30, 100, 300, 1000]
# The largest concentration is 1000. (not a second 100., which would only
# recompute and overwrite the same grid point).
alpha_list = [0.1, 0.3, 1., 3., 10., 100., 1000.]

# Build one self-balancing Dirichlet-partitioned CIFAR10 FederatedDataset per
# grid point. Partitions are not loaded here, so
# `num_partitions_to_cifar_dir_partitions` stays empty.
num_partitions_to_cifar_dir_partitions = {}
num_partitions_to_cifar_dir_fds = {}
for num_partitions, alpha in itertools.product(num_partitions_list, alpha_list):
    dir_partitioner = DirichletPartitioner(
        num_partitions=num_partitions,
        alpha=alpha,
        partition_by="label",
        self_balancing=True,
    )
    cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train": dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha)] = cifar_dir

# Evaluate the metric for every grid point, keyed by (num_partitions, alpha).
num_partitions_to_cifar_dir_hellinger_distance_list = {}
num_partitions_to_cifar_dir_hellinger_distance = {}
for (num_partitions, alpha), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha))
    try:
        # Use the imported `compute_hellinger_distance` on the "train"
        # partitioner, exactly like the other sweeps in this notebook. The
        # previously called `hellinger_distance` is not defined anywhere in the
        # notebook, so every grid point silently fell through to NaN.
        hellinger_distance_list, avg_hellinger_distance = compute_hellinger_distance(cifar_dir_fds.partitioners["train"])
    except Exception as exc:
        # Best effort: a failing grid point is recorded as NaN so the sweep can
        # continue. `Exception` (rather than a bare `except`) keeps the kernel
        # interruptible, and the reason is printed instead of being hidden.
        print(f"Sampling failed for {(num_partitions, alpha)}: {exc!r}")
        hellinger_distance_list, avg_hellinger_distance = np.nan, np.nan
    num_partitions_to_cifar_dir_hellinger_distance_list[(num_partitions, alpha)] = hellinger_distance_list
    num_partitions_to_cifar_dir_hellinger_distance[(num_partitions, alpha)] = avg_hellinger_distance

In [45]:
# Pivot the (num_partitions, alpha) -> average distance mapping into a table with
# partition counts as rows and alpha values as columns. Unstacking the Series
# directly (no intermediate to_frame) avoids an extra unnamed column level, and
# both axes are labelled like the table of the non-balanced sweep.
(
    pd.Series(num_partitions_to_cifar_dir_hellinger_distance)
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="alpha")
    .style.background_gradient(axis=None)
)

Enabling `self_balancing` (size balancing) produces a more heterogeneous division of the dataset.

## Shard

In [57]:
# Parameter grid: every (num_partitions, num_shards_per_partition) combination.
num_partitions_list = [3, 10, 30, 100, 300, 1000]
num_shards_per_partition_list = [2, 3, 4, 5]

# Despite its name, this dict maps each grid point to the FederatedDataset
# that wraps the shard partitioner, not to the partitioner itself.
params_to_partitioner = {}
for num_partitions, num_shards_per_partition in itertools.product(num_partitions_list, num_shards_per_partition_list):
    partitioner = ShardPartitioner(
        num_partitions=num_partitions,
        partition_by="label",
        num_shards_per_partition=num_shards_per_partition,
    )
    fds = FederatedDataset(dataset="cifar10", partitioners={"train": partitioner})
    params_to_partitioner[(num_partitions, num_shards_per_partition)] = fds

# Evaluate the Hellinger distance for every grid point. The returned pair is
# stored as (list of values, average value).
parameters_to_shard_cifar_fds_metric_list = {}
parameters_to_shard_cifar_fds_metric = {}
for (num_partitions, num_shards_per_partition), fds in params_to_partitioner.items():
    print((num_partitions, num_shards_per_partition))
    try:
        metric_list, avg_metric = compute_hellinger_distance(fds.partitioners["train"])
    except Exception as exc:
        # Best effort: a failing grid point is recorded as NaN so the sweep can
        # continue. `Exception` (rather than a bare `except`) keeps the kernel
        # interruptible, and the reason is printed instead of being hidden.
        print(f"Sampling failed for {(num_partitions, num_shards_per_partition)}: {exc!r}")
        metric_list, avg_metric = np.nan, np.nan
    parameters_to_shard_cifar_fds_metric_list[(num_partitions, num_shards_per_partition)] = metric_list
    parameters_to_shard_cifar_fds_metric[(num_partitions, num_shards_per_partition)] = avg_metric

In [103]:
# Pivot the (num_partitions, num_shards_per_partition) -> average metric mapping
# into a table. Despite the "emd" in the variable name, the values are the
# Hellinger distances computed by the shard sweep.
shard_emd_results = (
    pd.Series(parameters_to_shard_cifar_fds_metric)
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="num_shards")
)
# axis=None: the colour gradient is computed over the whole table at once.
shard_emd_results.style.background_gradient(axis=None)

In [75]:
# Display the same shard results table again, this time without the colour gradient.
shard_emd_results

## Inner Dirichlet

In [5]:

dataset_name = "cifar10"
# Example usage for a single configuration (kept for reference):
# num_partitions = 10
# sigma = 0.3
# partition_sizes = create_lognormal_partition_sizes(dataset_name, num_partitions, sigma)
#
# alpha = 0.1
# dirichlet_partitioner = InnerDirichletPartitioner(partition_sizes=partition_sizes, partition_by="label", alpha=0.1)
# cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train" : dirichlet_partitioner})
# cifar_dir_partitions = [cifar_dir.load_partition(i) for i in range(num_partitions)]

# Parameter grid: every (num_partitions, alpha, sigma) combination is evaluated.
# NOTE: `num_partitions_to_cifar_dir_fds` is reused from the Dirichlet sweeps and
# is overwritten here.
num_partitions_to_cifar_dir_fds = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
# The largest concentration is 1000. (not a second 100., which would only
# recompute and overwrite the same grid point).
alpha_list = [0.1, 0.3, 1., 3., 10., 100., 1000.]
sigma_list = [0.1, 0.3, 1., 3.]
# Partition sizes used for each grid point, kept for later inspection.
partition_sizes_dict = {}
for num_partitions, alpha, sigma in itertools.product(num_partitions_list, alpha_list, sigma_list):
    partition_sizes = create_lognormal_partition_sizes(dataset_name, num_partitions, sigma)
    dir_partitioner = InnerDirichletPartitioner(partition_sizes=partition_sizes, partition_by="label", alpha=alpha)
    # Use `dataset_name` so the dataset being partitioned always matches the
    # one the partition sizes were generated for.
    cifar_dir = FederatedDataset(dataset=dataset_name, partitioners={"train": dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha, sigma)] = cifar_dir
    partition_sizes_dict[(num_partitions, alpha, sigma)] = partition_sizes

# Evaluate the Hellinger distance for every grid point. The returned pair is
# stored as (list of values, average value), keyed by (num_partitions, alpha, sigma).
num_partitions_to_cifar_dir_metric_list = {}
num_partitions_to_cifar_dir_metric = {}
for (num_partitions, alpha, sigma), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha, sigma))
    try:
        metric_list, avg_metric = compute_hellinger_distance(cifar_dir_fds.partitioners["train"])
    except Exception as exc:
        # Best effort: a failing grid point is recorded as NaN so the sweep can
        # continue. `Exception` (rather than a bare `except`) keeps the kernel
        # interruptible, and the reason is printed instead of being hidden.
        print(f"Sampling failed for {(num_partitions, alpha, sigma)}: {exc!r}")
        metric_list, avg_metric = np.nan, np.nan
    num_partitions_to_cifar_dir_metric_list[(num_partitions, alpha, sigma)] = metric_list
    num_partitions_to_cifar_dir_metric[(num_partitions, alpha, sigma)] = avg_metric

In [13]:
# Series keyed by (num_partitions, alpha, sigma). Despite the "emd" naming, the
# values are the average Hellinger distances from the InnerDirichlet sweep.
inner_dir_metric = pd.Series(num_partitions_to_cifar_dir_metric, name="emd_inner_dir")
# Move the second key level (alpha) into the columns; the remaining two levels
# (num_partitions, sigma) form the row index.
emd_dir = inner_dir_metric.unstack(level=1)
emd_dir

In [16]:
# Inspect the partition sizes stored for num_partitions=10, alpha=0.3, sigma=0.1
# (keys are (num_partitions, alpha, sigma) tuples).
partition_sizes_dict[(10, 0.3, 0.1)]

In [17]:
# Inspect the partition sizes stored for num_partitions=10, alpha=0.3, sigma=3
# (keys are (num_partitions, alpha, sigma) tuples). The int 3 finds the entry
# stored under the float 3. because equal numbers hash and compare the same.
partition_sizes_dict[(10, 0.3, 3)]

# FEMNIST

In [3]:
import pandas as pd

# Stored Hellinger-distance results for FEMNIST with the IidPartitioner.
path = "../../results/adamnarozniak_old/femnist/IidPartitioner/compute_hellinger_distance.csv"
# Read the CSV and drop exact duplicate rows before aggregating.
femnist_helinger = pd.read_csv(path).drop_duplicates()
# Mean metric value per partition count, shaded over the whole table.
(
    femnist_helinger
    .groupby(["num_partitions"])["metric_value"]
    .mean()
    .to_frame()
    .style.background_gradient(axis=None)
)

Unnamed: 0_level_0,metric_value
num_partitions,Unnamed: 1_level_1
3,0.137941
10,0.166024
30,0.170423
100,0.175127
300,0.180012
1000,0.200157


In [31]:
# Stored Hellinger-distance results for FEMNIST with the DirichletPartitioner.
path = "../../results/adamnarozniak/femnist/DirichletPartitioner/compute_hellinger_distance.csv"
# Read the CSV and drop exact duplicate rows before aggregating.
femnist_helinger = pd.read_csv(path).drop_duplicates()
# Mean metric value per (num_partitions, alpha); alpha (the last group level)
# is moved into the columns and the whole table is shaded at once.
(
    femnist_helinger
    .groupby(["num_partitions", "alpha"])["metric_value"]
    .mean()
    .to_frame()
    .unstack(level=-1)
    .style.background_gradient(axis=None)
)

# VISUALIZE RESULTS

In [1]:
import pandas as pd
from IPython.display import display

In [23]:
# For each dataset, load the stored IidPartitioner results and show the mean
# Hellinger distance per partition count.
partitioner_name = "IidPartitioner"
for ds_name in ["cifar10", "cifar100", "mnist"]:
    path = f"../../results-heterogeneity/{ds_name}/{partitioner_name}/compute_hellinger_distance.csv"
    metrics = pd.read_csv(path)
    # Rename the last column (the metric value) to "hellinger_distance".
    metrics.columns = [*metrics.columns[:-1], "hellinger_distance"]
    # Drop exact duplicate rows and report how many were removed, if any.
    metrics_len = len(metrics)
    metrics = metrics.drop_duplicates()
    metrics_len_after = len(metrics)
    if metrics_len_after != metrics_len:
        print(f"Removed {metrics_len - metrics_len_after} duplicates")
    print(ds_name)
    mean_by_num_partitions = metrics.groupby(["num_partitions"])["hellinger_distance"].mean().to_frame()
    display(mean_by_num_partitions.style.background_gradient(axis=None))

cifar10


Unnamed: 0_level_0,hellinger_distance
num_partitions,Unnamed: 1_level_1
3,0.006195
10,0.014177
30,0.023834
100,0.045574
300,0.079977
1000,0.155471


cifar100


Unnamed: 0_level_0,hellinger_distance
num_partitions,Unnamed: 1_level_1
3,0.022335
10,0.047788
30,0.085924
100,0.168592
300,0.358407
1000,0.618352


mnist


Unnamed: 0_level_0,hellinger_distance
num_partitions,Unnamed: 1_level_1
3,0.005481
10,0.012979
30,0.023203
100,0.04193
300,0.073407
1000,0.139402


In [20]:
# For each dataset, load the stored DirichletPartitioner results and show the
# mean Hellinger distance per (num_partitions, alpha).
partitioner_name = "DirichletPartitioner"
# "adamnarozniak/femnist" is currently left out of this list.
for ds_name in ["cifar10", "cifar100", "mnist"]:
    path = f"../../results-heterogeneity/{ds_name}/{partitioner_name}/compute_hellinger_distance.csv"
    metrics = pd.read_csv(path)
    # Rename the last column (the metric value) to "hellinger_distance".
    metrics.columns = [*metrics.columns[:-1], "hellinger_distance"]
    # Drop exact duplicate rows and report how many were removed, if any.
    metrics_len = len(metrics)
    metrics = metrics.drop_duplicates()
    metrics_len_after = len(metrics)
    if metrics_len_after != metrics_len:
        print(f"Removed {metrics_len - metrics_len_after} duplicates")
    print(ds_name)
    # alpha (the last group level) is moved into the columns.
    mean_by_partitions_and_alpha = (
        metrics.groupby(["num_partitions", "alpha"])["hellinger_distance"]
        .mean()
        .to_frame()
        .unstack(level=-1)
    )
    display(mean_by_partitions_and_alpha.style.background_gradient(axis=None))

cifar10


Unnamed: 0_level_0,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance
alpha,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000,1000.000000
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
3,0.520317,0.388491,0.24994,0.141503,0.078354,0.025434,0.008023
10,0.62025,0.483452,0.283198,0.177413,0.098933,0.031327,0.009913
30,0.64871,0.495558,0.299832,0.181689,0.101169,0.032138,0.010206
100,0.662323,0.50817,0.316892,0.185112,0.101766,0.032366,0.010536
300,,0.514505,0.318957,0.186285,0.103144,0.033555,0.013112
1000,,,0.333503,0.192177,0.107014,0.042001,0.020511


cifar100


Unnamed: 0_level_0,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance
alpha,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000,1000.000000
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
3,0.545902,0.432712,0.264131,0.168904,0.093807,0.029856,0.009441
10,0.665189,0.513564,0.319783,0.189262,0.104793,0.033321,0.010897
30,0.698148,0.536673,0.335131,0.197012,0.109142,0.035608,0.013893
100,0.716125,0.560403,0.35606,0.206456,0.114645,0.045331,0.02564
300,0.734549,0.596062,0.418377,0.268632,0.154695,0.106362,0.102625
1000,0.766136,0.672943,0.586927,0.54724,,,


mnist


Unnamed: 0_level_0,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance
alpha,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000,1000.000000
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
3,0.520301,0.388463,0.250478,0.141928,0.078216,0.025358,0.008
10,0.620323,0.482991,0.282998,0.177639,0.09909,0.031378,0.009941
30,0.648271,0.495456,0.29957,0.18182,0.101249,0.032245,0.010209
100,0.661791,0.507179,0.317002,0.18524,0.10196,0.032304,0.010423
300,,0.5128,0.318501,0.186138,0.102904,0.03337,0.012335
1000,,,0.329041,0.190857,0.105978,0.039573,0.022974


In [37]:
# For each dataset, load the stored PathologicalPartitioner results and show the
# mean Hellinger distance per (num_partitions, num_classes_per_partition).
partitioner_name = "PathologicalPartitioner"
# "adamnarozniak/femnist" is currently left out; the branch handling it below is
# kept so it can be re-enabled by adding it to this list.
for ds_name in ["cifar10", "cifar100", "mnist"]:
    path = f"../../results-heterogeneity/{ds_name}/{partitioner_name}/compute_hellinger_distance.csv"
    metrics = pd.read_csv(path)
    # Rename the last column (the metric value) to "hellinger_distance".
    metrics.columns = metrics.columns.values[:-1].tolist() + ["hellinger_distance"]
    # Drop exact duplicate rows and report how many were removed, matching the
    # other visualisation cells.
    metrics_len = len(metrics)
    metrics = metrics.drop_duplicates()
    metrics_len_after = len(metrics)
    if metrics_len != metrics_len_after:
        print(f"Removed {metrics_len - metrics_len_after} duplicates")
    print(ds_name)
    if ds_name == "cifar100":
        # NOTE(review): for cifar100 the value appears to be stored as a fraction
        # of the 100 classes, so it is scaled to a class count. Round before the
        # int cast: a bare astype(int) truncates, so a float product such as
        # 28.999999999999996 would become 28 instead of 29.
        metrics = metrics.assign(
            num_classes_per_partition=(metrics["num_classes_per_partition"] * 100).round().astype(int)
        )
    # num_classes_per_partition (the last group level) is moved into the columns.
    summary = (
        metrics.groupby(["num_partitions", "num_classes_per_partition"])["hellinger_distance"]
        .mean()
        .to_frame()
        .unstack(level=-1)
    )
    if ds_name == "adamnarozniak/femnist":
        # Only the columns from position 4 onwards are shown for FEMNIST.
        summary = summary.iloc[:, 4:]
    display(summary.style.background_gradient(axis=None))

cifar10


Unnamed: 0_level_0,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance
num_classes_per_partition,2,3,4,5
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
3,0.744533,0.679702,0.616772,0.550042
10,0.749747,0.678813,0.612648,0.545984
30,0.745921,0.674504,0.608206,0.543011
100,0.744021,0.673122,0.606988,0.541832
300,0.743661,0.672708,0.606528,0.54147
1000,0.743554,0.672605,0.606373,0.541332


cifar100


Unnamed: 0_level_0,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance
num_classes_per_partition,20,30,40,50
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
3,0.747538,0.679269,0.616514,0.55464
10,0.751742,0.681989,0.617184,0.550703
30,0.748942,0.676896,0.609606,0.544201
100,0.745072,0.6738,0.607388,0.542245
300,0.744095,0.673118,0.606949,0.542004
1000,0.744028,0.673458,0.607295,


mnist


Unnamed: 0_level_0,hellinger_distance,hellinger_distance,hellinger_distance,hellinger_distance
num_classes_per_partition,2,3,4,5
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
3,0.743239,0.680142,0.618636,0.551557
10,0.750358,0.678988,0.613246,0.546469
30,0.746112,0.674386,0.608254,0.542557
100,0.743906,0.672848,0.60662,0.541341
300,0.743468,0.672429,0.606215,0.541064
1000,0.74333,0.672317,0.606096,0.541121
