Aim: The aim of this notebook is to explore how the Hellinger distance depends on the basic parameters of the IidPartitioner and the DirichletPartitioner when using the CIFAR10 dataset.

# Imports

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os
# Show the current working directory; the relative path appended below is
# resolved against it.
print(os.getcwd())
# you're in fl-heterogeneity/heterogeneity/notebooks
# Add the directory two levels above the working directory to sys.path
# (presumably the repository root, so that `heterogeneity` is importable — verify).
sys.path.append(os.path.abspath("./../.."))

In [4]:
import itertools

import numpy as np
import pandas as pd
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import IidPartitioner, DirichletPartitioner, ShardPartitioner, InnerDirichletPartitioner

from heterogeneity.metrics.hellinger_distance import compute_hellinger_distance
from heterogeneity.utils import create_lognormal_partition_sizes

# CIFAR10

## IID

In [47]:
# Minimal example: split the CIFAR10 train split with an IidPartitioner
# and load every resulting partition.
num_partitions = 10
iid_partitioner = IidPartitioner(num_partitions=num_partitions)
cifar_iid = FederatedDataset(
    dataset="cifar10",
    partitioners={"train": iid_partitioner},
)
cifar_iid_partitions = list(map(cifar_iid.load_partition, range(num_partitions)))

# Label counts of the whole (unpartitioned) train split, ordered by label.
train = cifar_iid.load_split("train")
train_labels = train["label"]
pd.Series(train_labels).value_counts().sort_index()

In [48]:
# Stage 1: create one IID-partitioned CIFAR10 FederatedDataset per partition count.
num_partitions_to_cifar_iid_partitions = {}
num_partitions_to_cifar_iid_fds = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
for num_partitions in num_partitions_list:
    iid_partitioner = IidPartitioner(num_partitions=num_partitions)
    cifar_iid = FederatedDataset(
        dataset="cifar10",
        partitioners={"train": iid_partitioner},
    )
    num_partitions_to_cifar_iid_fds[num_partitions] = cifar_iid

# Stage 2: run compute_hellinger_distance on each train partitioner and keep
# both returned values, keyed by the partition count.
num_partitions_to_cifar_iid_hellinger_distance = {}
num_partitions_to_cifar_iid_hellinger_distance_list = {}
for num_partitions, cifar_iid_fds in num_partitions_to_cifar_iid_fds.items():
    hellinger_distance_list, avg_hellinger_distance = compute_hellinger_distance(
        cifar_iid_fds.partitioners["train"]
    )
    num_partitions_to_cifar_iid_hellinger_distance_list[num_partitions] = hellinger_distance_list
    num_partitions_to_cifar_iid_hellinger_distance[num_partitions] = avg_hellinger_distance



In [104]:
# Turn the {num_partitions: average distance} mapping into a one-column frame.
iid_helinger_frame = pd.Series(
    num_partitions_to_cifar_iid_hellinger_distance, name="iid_helinger"
).to_frame()
# Name the index on the frame itself, before styling, rather than through the
# Styler object (same pattern as the Dirichlet result table).
iid_helinger_frame.index.name = "num_partitions"
# `iid_helinger_results` remains a Styler, as before.
iid_helinger_results = iid_helinger_frame.style.background_gradient()
iid_helinger_results

Is heterogeneity of that degree desired? When num_partitions is 1000, each partition holds only 50 samples (about 5 per class). That can indeed make even an IID split heterogeneous.

## Dirichlet

In [108]:
# Example usage
num_partitions = 10
alpha = [0.1] * 10
dirichlet_partitioner = DirichletPartitioner(num_partitions=num_partitions, alpha=alpha, partition_by="label")
cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train" : dirichlet_partitioner})
cifar_dir_partitions = [cifar_dir.load_partition(i) for i in range(num_partitions)]

In [109]:
# Sweep DirichletPartitioner (self_balancing=False) over every
# (num_partitions, alpha) pair on the CIFAR10 train split.
num_partitions_to_cifar_dir_partitions = {}
num_partitions_to_cifar_dir_fds = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
# Seven distinct values: results are keyed by (num_partitions, alpha), so a
# repeated alpha would only overwrite its own entry.
alpha_list = [0.1, 0.3, 1., 3., 10., 100., 1000.]
for num_partitions, alpha in itertools.product(num_partitions_list, alpha_list):
    dir_partitioner = DirichletPartitioner(
        num_partitions=num_partitions,
        alpha=alpha,
        partition_by="label",
        self_balancing=False,
    )
    cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train": dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha)] = cifar_dir

# Run compute_hellinger_distance on each train partitioner and keep both
# returned values, keyed by (num_partitions, alpha).
num_partitions_to_cifar_dir_hellinger_distance_list = {}
num_partitions_to_cifar_dir_hellinger_distance = {}
for (num_partitions, alpha), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha))
    try:
        hellinger_distance_list, avg_hellinger_distance = compute_hellinger_distance(cifar_dir_fds.partitioners["train"])
    except Exception as exc:
        # Record NaN so the sweep continues, but print the exception so that
        # programming errors are not mistaken for sampling failures.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the sweep.
        print(f"Sampling failed for {(num_partitions, alpha)}: {exc!r}")
        hellinger_distance_list, avg_hellinger_distance = np.nan, np.nan
    num_partitions_to_cifar_dir_hellinger_distance_list[(num_partitions, alpha)] = hellinger_distance_list
    num_partitions_to_cifar_dir_hellinger_distance[(num_partitions, alpha)] = avg_hellinger_distance

In [110]:
# Pivot the (num_partitions, alpha) -> average distance mapping into a table
# with num_partitions as rows and alpha as columns, and label both axes.
hel_dir = (
    pd.Series(num_partitions_to_cifar_dir_hellinger_distance)
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="alpha")
)
# axis=None: the gradient is computed over the whole table, not per row/column.
hel_dir.style.background_gradient(axis=None)

In [44]:
# Sweep DirichletPartitioner (self_balancing=True) over every
# (num_partitions, alpha) pair on the CIFAR10 train split.
num_partitions_to_cifar_dir_partitions = {}
num_partitions_to_cifar_dir_fds = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
# Seven distinct values: results are keyed by (num_partitions, alpha), so a
# repeated alpha would only overwrite its own entry.
alpha_list = [0.1, 0.3, 1., 3., 10., 100., 1000.]
for num_partitions, alpha in itertools.product(num_partitions_list, alpha_list):
    dir_partitioner = DirichletPartitioner(
        num_partitions=num_partitions,
        alpha=alpha,
        partition_by="label",
        self_balancing=True,
    )
    cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train": dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha)] = cifar_dir

# Run compute_hellinger_distance on each train partitioner and keep both
# returned values, keyed by (num_partitions, alpha).
num_partitions_to_cifar_dir_hellinger_distance_list = {}
num_partitions_to_cifar_dir_hellinger_distance = {}
for (num_partitions, alpha), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha))
    try:
        # Use the helper imported at the top of the notebook, with the same
        # call signature as the other sweeps; no `hellinger_distance` name is
        # defined anywhere in this notebook.
        hellinger_distance_list, avg_hellinger_distance = compute_hellinger_distance(cifar_dir_fds.partitioners["train"])
    except Exception as exc:
        # Record NaN so the sweep continues, but print the exception so that
        # programming errors are not mistaken for sampling failures.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the sweep.
        print(f"Sampling failed for {(num_partitions, alpha)}: {exc!r}")
        hellinger_distance_list, avg_hellinger_distance = np.nan, np.nan
    num_partitions_to_cifar_dir_hellinger_distance_list[(num_partitions, alpha)] = hellinger_distance_list
    num_partitions_to_cifar_dir_hellinger_distance[(num_partitions, alpha)] = avg_hellinger_distance

In [45]:
# Pivot the (num_partitions, alpha) -> average distance mapping so that alpha
# runs along the columns, then shade the table with a single gradient
# computed over all cells (axis=None).
(
    pd.Series(num_partitions_to_cifar_dir_hellinger_distance)
    .to_frame()
    .unstack(level=1)
    .style.background_gradient(axis=None)
)

Enabling self_balancing (size balancing) produces a more heterogeneous division of the dataset.

## Shard

In [57]:
# Sweep ShardPartitioner over every (num_partitions, num_shards_per_partition)
# pair on the CIFAR10 train split.
# NOTE: despite its name, `params_to_partitioner` stores FederatedDataset
# objects; the partitioner is read back via `.partitioners["train"]` below.
params_to_partitioner = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
num_shards_per_partition_list = [2, 3, 4, 5]
for num_partitions, num_shards_per_partition in itertools.product(num_partitions_list, num_shards_per_partition_list):
    partitioner = ShardPartitioner(
        num_partitions=num_partitions,
        partition_by="label",
        num_shards_per_partition=num_shards_per_partition,
    )
    fds = FederatedDataset(dataset="cifar10", partitioners={"train": partitioner})
    params_to_partitioner[(num_partitions, num_shards_per_partition)] = fds

# Run compute_hellinger_distance on each train partitioner and keep both
# returned values, keyed by (num_partitions, num_shards_per_partition).
parameters_to_shard_cifar_fds_metric_list = {}
parameters_to_shard_cifar_fds_metric = {}
for (num_partitions, num_shards_per_partition), fds in params_to_partitioner.items():
    print((num_partitions, num_shards_per_partition))
    try:
        metric_list, avg_metric = compute_hellinger_distance(fds.partitioners["train"])
    except Exception as exc:
        # Record NaN so the sweep continues, but print the exception so that
        # programming errors are not mistaken for sampling failures.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the sweep.
        print(f"Sampling failed for {(num_partitions, num_shards_per_partition)}: {exc!r}")
        metric_list, avg_metric = np.nan, np.nan
    parameters_to_shard_cifar_fds_metric_list[(num_partitions, num_shards_per_partition)] = metric_list
    parameters_to_shard_cifar_fds_metric[(num_partitions, num_shards_per_partition)] = avg_metric

In [103]:
# Pivot the (num_partitions, num_shards_per_partition) -> average metric
# mapping into a table and label both axes. The variable name says "emd", but
# the values come from compute_hellinger_distance.
shard_emd_results = (
    pd.Series(parameters_to_shard_cifar_fds_metric)
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="num_shards")
)
# axis=None: the gradient is computed over the whole table, not per row/column.
shard_emd_results.style.background_gradient(axis=None)

In [75]:
# Show the unstyled num_partitions x num_shards frame (no colour gradient).
# The name says "emd", but the values come from compute_hellinger_distance.
shard_emd_results

## Inner Dirichlet

In [5]:

# Sweep InnerDirichletPartitioner over every (num_partitions, alpha, sigma)
# triple. Partition sizes come from create_lognormal_partition_sizes, called
# with the dataset name, the partition count and sigma.
dataset_name = "cifar10"

num_partitions_to_cifar_dir_fds = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
# Seven distinct values: results are keyed by (num_partitions, alpha, sigma),
# so a repeated alpha would only overwrite its own entries.
alpha_list = [0.1, 0.3, 1., 3., 10., 100., 1000.]
sigma_list = [0.1, 0.3, 1., 3.]
partition_sizes_dict = {}
for num_partitions, alpha, sigma in itertools.product(num_partitions_list, alpha_list, sigma_list):
    partition_sizes = create_lognormal_partition_sizes(dataset_name, num_partitions, sigma)
    dir_partitioner = InnerDirichletPartitioner(
        partition_sizes=partition_sizes,
        partition_by="label",
        alpha=alpha,
    )
    # Use dataset_name here too, so the sizes and the data always refer to the same dataset.
    cifar_dir = FederatedDataset(dataset=dataset_name, partitioners={"train": dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha, sigma)] = cifar_dir
    # Keep the generated sizes so individual configurations can be inspected later.
    partition_sizes_dict[(num_partitions, alpha, sigma)] = partition_sizes

# Run compute_hellinger_distance on each train partitioner and keep both
# returned values, keyed by (num_partitions, alpha, sigma).
num_partitions_to_cifar_dir_metric_list = {}
num_partitions_to_cifar_dir_metric = {}
for (num_partitions, alpha, sigma), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha, sigma))
    try:
        metric_list, avg_metric = compute_hellinger_distance(cifar_dir_fds.partitioners["train"])
    except Exception as exc:
        # Record NaN so the sweep continues, but print the exception so that
        # programming errors are not mistaken for sampling failures.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the sweep.
        print(f"Sampling failed for {(num_partitions, alpha, sigma)}: {exc!r}")
        metric_list, avg_metric = np.nan, np.nan
    num_partitions_to_cifar_dir_metric_list[(num_partitions, alpha, sigma)] = metric_list
    num_partitions_to_cifar_dir_metric[(num_partitions, alpha, sigma)] = avg_metric

In [13]:
# Keys are (num_partitions, alpha, sigma); unstacking level 1 moves alpha into
# the columns and leaves (num_partitions, sigma) as the row index.
emd_dir = (
    pd.Series(num_partitions_to_cifar_dir_metric, name="emd_inner_dir")
    .unstack(level=1)
)
emd_dir

In [16]:
# Partition sizes generated for num_partitions=10, alpha=0.3, sigma=0.1
# (keys are (num_partitions, alpha, sigma)).
partition_sizes_dict[(10, 0.3, 0.1)]

In [17]:
# Partition sizes generated for num_partitions=10, alpha=0.3, sigma=3
# (the int 3 compares and hashes equal to the float 3. used when the key was stored).
partition_sizes_dict[(10, 0.3, 3)]

# FEMNIST

In [3]:
import pandas as pd

# Load the stored IidPartitioner results for FEMNIST, drop exact duplicate
# rows, and show the mean `metric_value` per partition count.
path = "../../results/adamnarozniak_old/femnist/IidPartitioner/compute_hellinger_distance.csv"
femnist_helinger = pd.read_csv(path).drop_duplicates()
(
    femnist_helinger
    .groupby(["num_partitions"])["metric_value"]
    .mean()
    .to_frame()
    .style.background_gradient(axis=None)
)

Unnamed: 0_level_0,metric_value
num_partitions,Unnamed: 1_level_1
3,0.137941
10,0.166024
30,0.170423
100,0.175127
300,0.180012
1000,0.200157


In [31]:
# Load the stored DirichletPartitioner results for FEMNIST, drop exact
# duplicate rows, and show the mean `metric_value` as a
# num_partitions x alpha table.
path = "../../results/adamnarozniak/femnist/DirichletPartitioner/compute_hellinger_distance.csv"
femnist_helinger = pd.read_csv(path).drop_duplicates()
(
    femnist_helinger
    .groupby(["num_partitions", "alpha"])["metric_value"]
    .mean()
    .to_frame()
    .unstack(level=-1)
    .style.background_gradient(axis=None)
)

In [34]:
# Show the de-duplicated FEMNIST results frame itself (unaggregated rows).
femnist_helinger

# VISUALIZE RESULTS

In [None]:
import pandas as pd
from IPython.display import display

In [12]:
# For each dataset, read the stored DirichletPartitioner results and display
# the mean `metric_value` as a num_partitions x alpha table.
partitioner_name = "DirichletPartitioner"
for ds_name in ("cifar10", "cifar100", "adamnarozniak/femnist"):
    path = f"../../results/{ds_name}/{partitioner_name}/compute_hellinger_distance.csv"
    # Drop exact duplicate rows before averaging.
    metrics = pd.read_csv(path).drop_duplicates()
    print(ds_name)
    display(
        metrics
        .groupby(["num_partitions", "alpha"])["metric_value"]
        .mean()
        .to_frame()
        .unstack(level=-1)
        .style.background_gradient(axis=None)
    )

cifar10


Unnamed: 0_level_0,metric_value,metric_value,metric_value,metric_value,metric_value,metric_value,metric_value
alpha,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000,1000.000000
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
3,0.520317,0.388491,0.227716,0.10901,0.071695,0.022788,0.007207
10,0.62025,0.483452,0.290158,0.155027,0.09144,0.028729,0.009075
30,0.651914,0.471448,0.297516,0.174453,0.097322,0.030711,0.009779
100,0.667463,0.503658,0.324196,0.186368,0.103555,0.032853,0.010701
300,,0.511364,0.315759,0.182981,0.101346,0.032604,0.012954
1000,,,0.334892,0.191701,0.106884,0.042349,0.019999


cifar100


Unnamed: 0_level_0,metric_value,metric_value,metric_value,metric_value,metric_value,metric_value,metric_value
alpha,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000,1000.000000
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
3,0.545902,0.432712,0.264131,0.168904,0.093807,0.029856,0.009441
10,0.665189,0.513564,0.319783,0.189262,0.104793,0.033321,0.010897
30,0.698148,0.536673,0.335131,0.197012,0.109142,0.035608,0.013893
100,0.716125,0.560403,0.35606,0.206456,0.114645,0.045331,0.02564
300,0.734549,0.596062,0.418377,0.268632,0.154695,0.106362,0.102625
1000,0.766136,0.672943,0.586927,0.54724,,,


adamnarozniak/femnist


Unnamed: 0_level_0,metric_value,metric_value,metric_value,metric_value,metric_value,metric_value,metric_value
alpha,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000,1000.000000
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
3,0.536864,0.422294,0.255861,0.163154,0.090725,0.029214,0.00922
10,0.650876,0.498058,0.310485,0.188974,0.105111,0.033428,0.010573
30,0.679069,0.518127,0.319906,0.194965,0.107799,0.034399,0.010885
100,0.693921,0.528278,0.328536,0.195881,0.108143,0.034338,0.010957
300,0.69629,0.53103,0.330971,0.19764,0.109085,0.03497,0.012046
1000,0.702297,0.539061,0.336675,0.199733,0.110515,0.038362,0.019432


In [33]:
# For each dataset, read the stored ClassConstrainedPartitioner results and
# display the mean `metric_value` as a
# num_partitions x num_classes_per_partition table.
partitioner_name = "ClassConstrainedPartitioner"
for ds_name in ["cifar10", "cifar100", "adamnarozniak/femnist"]:
    path = f"../../results/{ds_name}/{partitioner_name}/compute_hellinger_distance.csv"
    # Drop exact duplicate rows before averaging.
    metrics = pd.read_csv(path).drop_duplicates()
    print(ds_name)
    # The pivot is identical for every dataset, so build it once.
    mean_by_classes = (
        metrics
        .groupby(["num_partitions", "num_classes_per_partition"])["metric_value"]
        .mean()
        .to_frame()
        .unstack(level=-1)
    )
    # For these two datasets only the columns from position 4 onwards are shown
    # (the first four num_classes_per_partition columns are skipped).
    if ds_name in ("cifar100", "adamnarozniak/femnist"):
        mean_by_classes = mean_by_classes.iloc[:, 4:]
    display(mean_by_classes.style.background_gradient(axis=None))

cifar10


Unnamed: 0_level_0,metric_value,metric_value,metric_value,metric_value
num_classes_per_partition,2,3,4,5
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
3,0.746087,0.682975,0.616898,0.547415
10,0.746657,0.676441,0.610837,0.545627
30,0.744504,0.674283,0.60822,0.542897
100,0.743999,0.67347,0.606805,0.542048
300,0.743661,0.672823,0.606567,0.541391
1000,0.743556,0.672622,0.606377,0.54131


cifar100


Unnamed: 0_level_0,metric_value,metric_value,metric_value,metric_value
num_classes_per_partition,20,30,44,50
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
3,0.747161,0.678922,0.591173,0.554904
10,0.752889,0.682405,0.591131,0.552181
30,0.74939,0.677376,0.584015,0.544645
100,0.745188,0.673984,0.581317,0.542161
300,0.744063,0.673136,0.58098,0.541944
1000,0.743987,0.673395,0.581251,0.544497


adamnarozniak/femnist


Unnamed: 0_level_0,metric_value,metric_value,metric_value,metric_value
num_classes_per_partition,12,18,24,31
num_partitions,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
3,0.73291,0.679164,0.606772,0.547173
10,0.732284,0.66117,0.596914,0.526238
30,0.74157,0.662204,0.597991,0.529528
100,0.735692,0.66454,0.602622,0.531656
300,0.736596,0.669761,0.606481,0.532237
1000,0.73757,0.668877,0.606853,0.534182
