In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live without restarting the kernel.
%load_ext autoreload
# Mode 2: applies autoreload to every imported module, not only ones marked explicitly.
%autoreload 2

# Imports

In [2]:
import sys
import os

# Show the working directory so a wrong launch location is obvious.
print(os.getcwd())
# you're in fl-heterogeneity/heterogeneity/notebooks
# Make the directory two levels above the working directory importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.abspath("./../..")
if project_root not in sys.path:
    sys.path.append(project_root)

/Users/adam/Projects/FL-heterogeneity/heterogeneity/notebooks


In [11]:
import itertools

import numpy as np
import pandas as pd
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import IidPartitioner, DirichletPartitioner, ShardPartitioner

from heterogeneity.metrics import compute_earths_mover_distance

# Earth Mover's Distance (EMD)

## IID

In [5]:
# --- Single-configuration example --------------------------------------------
# Build one IID-partitioned CIFAR-10 dataset and load every train partition.
num_partitions = 10
iid_partitioner = IidPartitioner(num_partitions=num_partitions)
cifar_iid = FederatedDataset(dataset="cifar10", partitioners={"train": iid_partitioner})
cifar_iid_partitions = [
    cifar_iid.load_partition(partition_id) for partition_id in range(num_partitions)
]

# --- Sweep over the number of partitions -------------------------------------
# One FederatedDataset per partition count; partitions are not loaded here.
num_partitions_list = [3, 10, 30, 100, 300, 1000]
num_partitions_to_cifar_iid_partitions = {}  # stays empty in this cell
num_partitions_to_cifar_iid_fds = {}
for num_partitions in num_partitions_list:
    iid_partitioner = IidPartitioner(num_partitions=num_partitions)
    cifar_iid = FederatedDataset(dataset="cifar10", partitioners={"train": iid_partitioner})
    num_partitions_to_cifar_iid_fds[num_partitions] = cifar_iid

# --- Metric per configuration ------------------------------------------------
# NOTE: the dict names say "hellinger_distance", but the values come from
# compute_earths_mover_distance. The names are kept as-is because the results
# cell further down reads num_partitions_to_cifar_iid_hellinger_distance.
num_partitions_to_cifar_iid_hellinger_distance_list = {}
num_partitions_to_cifar_iid_hellinger_distance = {}
for num_partitions in num_partitions_list:
    cifar_iid_fds = num_partitions_to_cifar_iid_fds[num_partitions]
    print(f"num partitions: {num_partitions}")
    # The call's result is unpacked as (metric_list, metric_avg) — presumably the
    # per-partition values and their average; confirm in heterogeneity.metrics.
    metric_list, metric_avg = compute_earths_mover_distance(
        cifar_iid_fds.partitioners["train"]
    )
    num_partitions_to_cifar_iid_hellinger_distance_list[num_partitions] = metric_list
    num_partitions_to_cifar_iid_hellinger_distance[num_partitions] = metric_avg

num partitions: 3
num partitions: 10
num partitions: 30
num partitions: 100
num partitions: 300
num partitions: 1000


In [24]:
# Tabulate the average IID metric per partition count and colour it by value.
# The index name is set on the data *before* styling: assigning to
# `Styler.index.name` only works because the Styler happens to share the
# frame's Index object, so naming the axis on the data is the robust route.
iid_emd_results = (
    pd.Series(num_partitions_to_cifar_iid_hellinger_distance, name="iid_emd")
    .rename_axis("num_partitions")
    .to_frame()
    .style.background_gradient()
)
iid_emd_results

Unnamed: 0_level_0,iid_emd
num_partitions,Unnamed: 1_level_1
3,0.013216
10,0.03796
30,0.067076
100,0.13408
300,0.239212
1000,0.4324


## Dirichlet

In [7]:
# --- Single-configuration example --------------------------------------------
# Build one Dirichlet-partitioned CIFAR-10 dataset (partitioned by "label")
# and load every train partition.
num_partitions = 10
# List form of alpha — presumably one concentration value per class; confirm
# against the DirichletPartitioner documentation.
alpha = [0.1] * 10
dirichlet_partitioner = DirichletPartitioner(num_partitions=num_partitions, alpha=alpha, partition_by="label")
cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train" : dirichlet_partitioner})
cifar_dir_partitions = [cifar_dir.load_partition(i) for i in range(num_partitions)]

# --- Sweep over (num_partitions, alpha) --------------------------------------
# One FederatedDataset per configuration; partitions are not loaded here.
num_partitions_to_cifar_dir_fds = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
# Values must be unique: each (num_partitions, alpha) pair is a dict key below,
# so a repeated alpha would only rebuild and overwrite the same entry.
alpha_list = [0.1, 0.3, 1., 3., 10., 100.]
for num_partitions, alpha in itertools.product(num_partitions_list, alpha_list):
    dir_partitioner =  DirichletPartitioner(num_partitions=num_partitions, alpha=alpha, partition_by="label")
    cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train" : dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha)] = cifar_dir

# --- Metric per configuration ------------------------------------------------
num_partitions_to_cifar_dir_metric_list = {}
num_partitions_to_cifar_dir_metric = {}
for (num_partitions, alpha), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha))
    try:
        metric_list, avg_metric = compute_earths_mover_distance(cifar_dir_fds.partitioners["train"])
    except Exception as exc:
        # Best-effort sweep: a configuration that cannot be evaluated is
        # recorded as NaN so the remaining ones still run. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit stop the loop,
        # and the error is printed so the cause is not lost.
        print(f"Sampling failed for {(num_partitions, alpha)}: {exc!r}")
        metric_list, avg_metric = np.nan, np.nan
    num_partitions_to_cifar_dir_metric_list[(num_partitions, alpha)] = metric_list
    num_partitions_to_cifar_dir_metric[(num_partitions, alpha)] = avg_metric

(3, 0.1)
(3, 0.3)
(3, 1.0)
(3, 3.0)
(3, 10.0)
(3, 100.0)
(10, 0.1)
(10, 0.3)
(10, 1.0)
(10, 3.0)
(10, 10.0)
(10, 100.0)
(30, 0.1)
(30, 0.3)
(30, 1.0)
(30, 3.0)
(30, 10.0)
(30, 100.0)
(100, 0.1)




(100, 0.3)
(100, 1.0)
(100, 3.0)
(100, 10.0)
(100, 100.0)
(300, 0.1)




Sampling failed for (300, 0.1)
(300, 0.3)
(300, 1.0)
(300, 3.0)
(300, 10.0)
(300, 100.0)
(1000, 0.1)




Sampling failed for (1000, 0.1)
(1000, 0.3)




Sampling failed for (1000, 0.3)
(1000, 1.0)
(1000, 3.0)
(1000, 10.0)
(1000, 100.0)


In [27]:
# Pivot the {(num_partitions, alpha): average metric} mapping into a table
# with one row per partition count and one column per alpha.
emd_dir = (
    pd.Series(num_partitions_to_cifar_dir_metric, name="emd_dir")
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="alpha")
)
# axis=None: the colour gradient is computed over the whole table at once.
emd_dir.style.background_gradient(axis=None)

alpha,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000
num_partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,1.23724,0.597772,0.693348,0.261268,0.228528,0.074664
10,1.824524,1.708128,0.858312,0.47704,0.296712,0.095752
30,2.283676,1.419028,0.929312,0.52212,0.293228,0.094044
100,2.16178,1.452524,0.93032,0.562596,0.308308,0.103052
300,,1.518324,0.882444,0.54082,0.297744,0.094116
1000,,,0.936348,0.550116,0.312556,0.122664


In [9]:
# Same Dirichlet averages, pivoted through a one-column frame (so the columns
# carry an extra outer level above the alpha values).
results = (
    pd.Series(num_partitions_to_cifar_dir_metric)
    .to_frame()
    .unstack(level=1)
)
# Replace +/-inf with NaN in a copy (`results` itself is left untouched)
# before applying a table-wide colour gradient.
(
    results
    .replace([np.inf, -np.inf], np.nan)
    .style
    .background_gradient(axis=None)
)

Unnamed: 0_level_0,0,0,0,0,0,0
Unnamed: 0_level_1,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000
3,1.23724,0.597772,0.693348,0.261268,0.228528,0.074664
10,1.824524,1.708128,0.858312,0.47704,0.296712,0.095752
30,2.283676,1.419028,0.929312,0.52212,0.293228,0.094044
100,2.16178,1.452524,0.93032,0.562596,0.308308,0.103052
300,,1.518324,0.882444,0.54082,0.297744,0.094116
1000,,,0.936348,0.550116,0.312556,0.122664


## Shard

In [20]:
# --- Sweep over (num_partitions, num_shards_per_partition) -------------------
# NOTE: despite its name, params_to_partitioner maps each parameter pair to a
# FederatedDataset; the partitioner is read back via `.partitioners["train"]`.
params_to_partitioner = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
num_shards_per_partition_list = [2, 3, 4, 5]
for num_partitions, num_shards_per_partition in itertools.product(num_partitions_list, num_shards_per_partition_list):
    partitioner = ShardPartitioner(num_partitions=num_partitions, partition_by="label", num_shards_per_partition=num_shards_per_partition)
    fds = FederatedDataset(dataset="cifar10", partitioners={"train" : partitioner})
    params_to_partitioner[(num_partitions, num_shards_per_partition)] = fds

# --- Metric per configuration ------------------------------------------------
parameters_to_shard_cifar_fds_metric_list = {}
parameters_to_shard_cifar_fds_metric = {}
for (num_partitions, num_shards_per_partition), fds in params_to_partitioner.items():
    print((num_partitions, num_shards_per_partition))
    try:
        metric_list, avg_metric = compute_earths_mover_distance(fds.partitioners["train"])
    except Exception as exc:
        # Best-effort sweep: record NaN for a failing configuration and keep
        # going. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop the loop, and the error is printed
        # so the cause is not lost.
        print(f"Sampling failed for {(num_partitions, num_shards_per_partition)}: {exc!r}")
        metric_list, avg_metric = np.nan, np.nan
    parameters_to_shard_cifar_fds_metric_list[(num_partitions, num_shards_per_partition)] = metric_list
    parameters_to_shard_cifar_fds_metric[(num_partitions, num_shards_per_partition)] = avg_metric

(3, 2)
(3, 3)
(3, 4)
(3, 5)
(10, 2)
(10, 3)
(10, 4)
(10, 5)
(30, 2)
(30, 3)
(30, 4)
(30, 5)
(100, 2)
(100, 3)
(100, 4)
(100, 5)
(300, 2)
(300, 3)
(300, 4)
(300, 5)
(1000, 2)
(1000, 3)
(1000, 4)
(1000, 5)


In [29]:
# Pivot the {(num_partitions, num_shards_per_partition): average metric}
# mapping into a table: one row per partition count, one column per shard count.
# The row axis holds the num_partitions values, so it is labelled accordingly
# (matching the Dirichlet table). Both axis names are set on the data before
# styling instead of being assigned through the Styler's index/columns.
shard_emd_results = (
    pd.Series(parameters_to_shard_cifar_fds_metric)
    .unstack(level=1)
    .rename_axis(index="num_partitions", columns="num_shards")
    .style.background_gradient(axis=None)
)
shard_emd_results

num_shards,2,3,4,5
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,2.733289,0.911041,0.833381,0.80015
10,2.12,1.872733,1.26,1.38
30,2.171853,1.747187,1.332212,1.117497
100,2.19,1.741205,1.624,1.288
300,2.217161,1.825778,1.55852,1.352444
1000,2.2218,1.8099,1.597167,1.344
