In [3]:
%load_ext autoreload
%autoreload 2

# Imports

In [4]:
import sys
import os
print(os.getcwd())
# you're in fl-heterogeneity/heterogeneity/notebooks
sys.path.append(os.path.abspath("./../.."))

/Users/adam/Projects/FL-heterogeneity/heterogeneity/notebooks


In [33]:
import itertools

import numpy as np
import pandas as pd
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import IidPartitioner, DirichletPartitioner, ShardPartitioner

from heterogeneity.metrics import compute_kl_divergence

# KL

## IID

In [39]:
# Sample usage
num_partitions = 10
iid_partitioner = IidPartitioner(num_partitions=num_partitions)
cifar_iid = FederatedDataset(dataset="cifar10", partitioners={"train" : iid_partitioner})
cifar_iid_partitions = [cifar_iid.load_partition(i) for i in range(num_partitions)]


num_partitions_to_cifar_iid_partitions = {}
num_partitions_to_cifar_iid_fds = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
for num_partitions in num_partitions_list:
    iid_partitioner = IidPartitioner(num_partitions=num_partitions)
    cifar_iid = FederatedDataset(dataset="cifar10", partitioners={"train" : iid_partitioner})
    num_partitions_to_cifar_iid_fds[num_partitions] = cifar_iid
    # cifar_iid_partitions = [cifar_iid.load_partition(i) for i in range(num_partitions)]
    # num_partitions_to_cifar_iid_partitions[num_partitions] = cifar_iid_partitions

num_partitions_to_cifar_iid_hellinger_distance = {}
num_partitions_to_cifar_iid_hellinger_distance_list = {}
for num_partitions, cifar_iid_fds in num_partitions_to_cifar_iid_fds.items():
    metric_list, metric_avg = compute_kl_divergence(cifar_iid_fds.partitioners["train"])
    num_partitions_to_cifar_iid_hellinger_distance_list[num_partitions] = metric_list
    num_partitions_to_cifar_iid_hellinger_distance[num_partitions] = metric_avg    

In [47]:
iid_kl_div_results = pd.Series(num_partitions_to_cifar_iid_hellinger_distance, name="iid_kl").iloc[:-1].to_frame().style.background_gradient()
iid_kl_div_results.index.name = "num_partitions"
iid_kl_div_results

Unnamed: 0_level_0,iid_kl
num_partitions,Unnamed: 1_level_1
3,0.000153
10,0.000749
30,0.002011
100,0.008409
300,0.027277


In [None]:
# labels = num_partitions_to_cifar_iid_fds[100].partitioners["train"].loa
# distributions = []
# for partition_id in num_partitions_to_cifar_iid_fds[100].partitioners["train"].num_partitions:
#     labels = num_partitions_to_cifar_iid_fds[100].partitioners["train"].loa
#     compute_distributions(

## Dirichlet

In [28]:
num_partitions = 10
alpha = [0.1] * 10
dirichlet_partitioner = DirichletPartitioner(num_partitions=num_partitions, alpha=alpha, partition_by="label")
cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train" : dirichlet_partitioner})
cifar_dir_partitions = [cifar_dir.load_partition(i) for i in range(num_partitions)]

num_partitions_to_cifar_dir_partitions = {}
num_partitions_to_cifar_dir_fds = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
alpha_list = [0.1, 0.3, 1., 3., 10., 100., 100.]
for num_partitions, alpha in itertools.product(num_partitions_list, alpha_list):
    dir_partitioner =  DirichletPartitioner(num_partitions=num_partitions, alpha=alpha, partition_by="label")
    cifar_dir = FederatedDataset(dataset="cifar10", partitioners={"train" : dir_partitioner})
    num_partitions_to_cifar_dir_fds[(num_partitions, alpha)] = cifar_dir
    # cifar_iid_partitions = [cifar_iid.load_partition(i) for i in range(num_partitions)]
    # num_partitions_to_cifar_iid_partitions[num_partitions] = cifar_iid_partitions

num_partitions_to_cifar_dir_metric_list = {}
num_partitions_to_cifar_dir_metric = {}
for (num_partitions, alpha), cifar_dir_fds in num_partitions_to_cifar_dir_fds.items():
    print((num_partitions, alpha))
    try:
        metric_list, avg_metric = compute_kl_divergence(cifar_dir_fds.partitioners["train"])
    except:
        print(f"Sampling failed for {(num_partitions, alpha)}")
        metric_list, avg_metric = np.nan, np.nan
    num_partitions_to_cifar_dir_metric_list[(num_partitions, alpha)] = metric_list
    num_partitions_to_cifar_dir_metric[(num_partitions, alpha)] = avg_metric

(3, 0.1)
(3, 0.3)
(3, 1.0)
(3, 3.0)
(3, 10.0)
(3, 100.0)
(10, 0.1)
(10, 0.3)
(10, 1.0)
(10, 3.0)
(10, 10.0)
(10, 100.0)
(30, 0.1)
(30, 0.3)
(30, 1.0)
(30, 3.0)
(30, 10.0)
(30, 100.0)
(100, 0.1)




(100, 0.3)
(100, 1.0)
(100, 3.0)
(100, 10.0)
(100, 100.0)
(300, 0.1)




Sampling failed for (300, 0.1)
(300, 0.3)
(300, 1.0)
(300, 3.0)
(300, 10.0)
(300, 100.0)
(1000, 0.1)




Sampling failed for (1000, 0.1)
(1000, 0.3)




Sampling failed for (1000, 0.3)
(1000, 1.0)
(1000, 3.0)
(1000, 10.0)
(1000, 100.0)


In [43]:
kl_dir = pd.Series(num_partitions_to_cifar_dir_metric).unstack(level=1)
kl_dir.replace([np.inf, -np.inf], np.nan, inplace=True)
kl_dir.index.name = "num_partitions"
kl_dir.columns.name = "alpha"
kl_dir.style.background_gradient(axis=None)

alpha,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000
num_partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,,0.99639,0.275006,0.050017,0.021439,0.002098
10,,,0.429987,0.105258,0.035281,0.00343
30,,,0.455574,0.135978,0.040678,0.004033
100,,,,0.159319,0.046716,0.00456
300,,,,0.153635,0.044635,0.00453
1000,,,,,0.049907,0.007669


In [30]:
results = pd.Series(num_partitions_to_cifar_dir_metric).to_frame().unstack(level=1)
results.replace([np.inf, -np.inf], np.nan, inplace=False).style.background_gradient(axis=None)

Unnamed: 0_level_0,0,0,0,0,0,0
Unnamed: 0_level_1,0.100000,0.300000,1.000000,3.000000,10.000000,100.000000
3,,0.99639,0.275006,0.050017,0.021439,0.002098
10,,,0.429987,0.105258,0.035281,0.00343
30,,,0.455574,0.135978,0.040678,0.004033
100,,,,0.159319,0.046716,0.00456
300,,,,0.153635,0.044635,0.00453
1000,,,,,0.049907,0.007669


## Shard

In [45]:
params_to_partitioner = {}
num_partitions_list = [3, 10, 30, 100, 300, 1000]
num_shards_per_partition_list = [2, 3, 4, 5]
for num_partitions, num_shards_per_partition in itertools.product(num_partitions_list, num_shards_per_partition_list):
    partitioner = ShardPartitioner(num_partitions=num_partitions, partition_by="label", num_shards_per_partition=num_shards_per_partition)
    fds = FederatedDataset(dataset="cifar10", partitioners={"train" : partitioner})
    params_to_partitioner[(num_partitions, num_shards_per_partition)] = fds

parameters_to_shard_cifar_fds_metric_list = {}
parameters_to_shard_cifar_fds_metric = {}
for (num_partitions, num_shards_per_partition), fds in params_to_partitioner.items():
    print((num_partitions, num_shards_per_partition))
    try:
        metric_list, avg_metric = compute_kl_divergence(fds.partitioners["train"])
    except:
        print(f"Sampling failed for {(num_partitions, num_shards_per_partition)}")
        metric_list, avg_metric = np.nan, np.nan
    parameters_to_shard_cifar_fds_metric_list[(num_partitions, num_shards_per_partition)] = metric_list
    parameters_to_shard_cifar_fds_metric[(num_partitions, num_shards_per_partition)] = avg_metric

(3, 2)
(3, 3)
(3, 4)
(3, 5)
(10, 2)
(10, 3)
(10, 4)
(10, 5)
(30, 2)
(30, 3)
(30, 4)
(30, 5)
(100, 2)
(100, 3)
(100, 4)
(100, 5)
(300, 2)
(300, 3)
(300, 4)
(300, 5)
(1000, 2)
(1000, 3)
(1000, 4)
(1000, 5)


In [46]:
shard_emd_results = pd.Series(parameters_to_shard_cifar_fds_metric).unstack(level=1)
shard_emd_results.index.name = "num_partitions"
shard_emd_results.columns.name = "num_shards"
shard_emd_results.style.background_gradient(axis=None)

  rng = smax - smin


num_shards,2,3,4,5
num_partitions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,inf,inf,inf,inf
10,inf,inf,inf,inf
30,inf,inf,inf,inf
100,inf,inf,inf,inf
300,inf,inf,inf,inf
1000,inf,inf,inf,inf
