In [None]:
"""
File: FederatedLearning.ipynb
Author: Amit Prakash
Purpose: Measure how the number of participating clients and the number of local training epochs affect the accuracy of a federated ML model
"""

In [None]:
!pip install h5py
!pip install typing-extensions
!pip install wheel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install --quiet --upgrade tensorflow-federated==0.20.0
!pip install --quiet --upgrade nest-asyncio



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.9/819.9 KB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.7/126.7 KB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m498.1/498.1 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.7/251.7 KB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.3/887.3 KB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.2/65.2 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 KB[0m [31m4.3 MB/s

In [None]:
!pip install tensorflow==2.8.0 tensorflow_probability==0.14.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==2.8.0
  Downloading tensorflow-2.8.0-cp39-cp39-manylinux2010_x86_64.whl (497.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m497.6/497.6 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow_probability==0.14.1
  Downloading tensorflow_probability-0.14.1-py2.py3-none-any.whl (5.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.5/462.5 KB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tf-estimator-nightly, tensorflow_probability, tensorflow
  Attempting uninstall: tensorflow_probability
    Found existing installation: tensor

In [None]:
# Import all necessary libraries/modules
import nest_asyncio
nest_asyncio.apply()
import collections
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff

In [None]:
# Set seed
# SEED seeds NumPy's global RNG (used by the np.random.choice client sampling
# in the experiment cells) and is also passed to `shuffle` inside `preprocess`.
SEED = 200372055 
np.random.seed(SEED)

In [None]:
# Preprocess the input data
def preprocess(dataset, epoch, shuffle_buffer=100, batch_size=20, prefetch_buffer=10):
  """Repeat, shuffle, batch and flatten one client's dataset.

  Args:
    dataset: dataset whose elements expose 'pixels' and 'label' entries and
      which supports the repeat/shuffle/batch/map/prefetch chain used below.
    epoch: number of times the dataset is repeated, i.e. how many local
      passes a client makes over its data.
    shuffle_buffer: size of the shuffle buffer (default 100).
    batch_size: number of examples per batch (default 20).
    prefetch_buffer: number of batches to prefetch (default 10).

  Returns:
    The transformed dataset; each element is an `OrderedDict` with `x`
    reshaped to [-1, 784] and `y` reshaped to [-1, 1].
  """
  def batch_format_fn(element):
    """Flatten a batch `pixels` and return the features as an `OrderedDict`."""
    return collections.OrderedDict(
        x=tf.reshape(element['pixels'], [-1, 784]),
        y=tf.reshape(element['label'], [-1, 1]))

  # The shuffle is seeded with the notebook-level SEED so batches are repeatable.
  return dataset.repeat(epoch).shuffle(shuffle_buffer, seed=SEED).batch(
      batch_size).map(batch_format_fn).prefetch(prefetch_buffer)

# Build the per-client datasets for a set of clients
def make_federated_data(client_data, client_ids, epoch):
  """Return a list holding one preprocessed dataset per id in `client_ids`.

  Args:
    client_data: object providing `create_tf_dataset_for_client(client_id)`.
    client_ids: iterable of client ids to include; list order follows it.
    epoch: forwarded to `preprocess` as the repeat count.

  Returns:
    A list of preprocessed datasets, one per requested client.
  """
  federated_datasets = []
  for client_id in client_ids:
    raw_dataset = client_data.create_tf_dataset_for_client(client_id)
    federated_datasets.append(preprocess(raw_dataset, epoch))
  return federated_datasets

In [None]:
# Download the federated EMNIST data (train and test splits, organised per client)
emnist_train, emnist_test = tff.simulation.datasets.emnist.load_data()
total_clients = len(emnist_train.client_ids)
print("Total number of clients: ", total_clients)

Downloading emnist_all.sqlite.lzma: 100%|██████████| 170507172/170507172 [00:44<00:00, 3945280.25it/s]


Total number of clients:  3383


In [None]:
# Preprocess one client's dataset purely to obtain the element structure
# (`element_spec`) that `model_fn` passes as the model's input spec.
first_client_id = emnist_train.client_ids[0]
example_dataset = emnist_train.create_tf_dataset_for_client(first_client_id)
# The repeat count of 0 is kept as in the original; only `element_spec` is read
# from this dataset in this notebook.
preprocessed_example_dataset = preprocess(example_dataset, 0)

# Keras model: one dense layer over the flattened image, followed by softmax
def create_keras_model():
  """Build a new, uncompiled Sequential model.

  The model takes inputs of shape (784,), applies a 10-unit Dense layer whose
  kernel is initialised to zeros, and ends with a Softmax layer.
  """
  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.InputLayer(input_shape=(784,)))
  model.add(tf.keras.layers.Dense(10, kernel_initializer='zeros'))
  model.add(tf.keras.layers.Softmax())
  return model
  
def model_fn():
  """Wrap a freshly built Keras model as a TFF learning model.

  The Keras model is constructed inside this function on every call instead of
  being captured from an enclosing scope, because TFF calls this function
  within different graph contexts.
  """
  fresh_model = create_keras_model()
  crossentropy = tf.keras.losses.SparseCategoricalCrossentropy()
  accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
  return tff.learning.from_keras_model(
      fresh_model,
      input_spec=preprocessed_example_dataset.element_spec,
      loss=crossentropy,
      metrics=[accuracy])

In [None]:
NUM_EPOCHS = 5 # Local training epochs per client (held fixed here; swept in the next block)

# Total number of server and client interactions
NUM_ROUNDS = 10

# The training process and the evaluation computation depend only on
# `model_fn`, so they are built once instead of once per client count.
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.01),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0))
evaluation = tff.learning.build_federated_evaluation(model_fn)

## Iteratively change NUM_CLIENTS
for NUM_CLIENTS in [5, 50, 100]:
  # Sample without replacement: np.random.choice defaults to replace=True,
  # which can draw the same client more than once and silently train on fewer
  # than NUM_CLIENTS distinct clients.
  sample_clients = np.random.choice(emnist_train.client_ids, NUM_CLIENTS, replace=False)
  print("Client IDs selected: ", sample_clients)

  # Consider data from only the selected clients
  federated_train_data = make_federated_data(emnist_train, sample_clients, NUM_EPOCHS)
  print(f'Number of client datasets considered: {len(sample_clients)}')

  # Re-initialize the model parameters for every client count so each
  # configuration starts from the same untrained state
  state = iterative_process.initialize()

  for round_num in range(1, NUM_ROUNDS + 1):
    state, metrics = iterative_process.next(state, federated_train_data)
    print('round {:2d}, training accuracy= {}%'.format(round_num, metrics['train']['sparse_categorical_accuracy']*100))

  # Evaluate the final model on the test split of the same clients. A single
  # pass over the test data is sufficient; repeating it only adds compute.
  federated_test_data = make_federated_data(emnist_test, sample_clients, 1)
  test_metrics = evaluation(state.model, federated_test_data)
  print('Test Accuracy: {}%'.format(str(test_metrics['eval']['sparse_categorical_accuracy']*100)))

Client IDs selected:  ['f0969_31' 'f3590_03' 'f1089_04' 'f1432_44' 'f1765_33']
Number of client datasets considered: 5
round  1, training accuracy= 13.188010454177856%
round  2, training accuracy= 15.422342717647552%
round  3, training accuracy= 19.073569774627686%
round  4, training accuracy= 22.45231568813324%
round  5, training accuracy= 21.58038169145584%
round  6, training accuracy= 29.536783695220947%
round  7, training accuracy= 34.71389710903168%
round  8, training accuracy= 39.29155170917511%
round  9, training accuracy= 40.21798372268677%
round 10, training accuracy= 47.247955203056335%
Test Accuracy: 59.090906381607056%
Client IDs selected:  ['f0458_38' 'f0305_08' 'f3447_20' 'f1113_16' 'f0744_23']
Number of client datasets considered: 5
round  1, training accuracy= 31.67780041694641%
round  2, training accuracy= 58.117878437042236%
round  3, training accuracy= 74.79764223098755%
round  4, training accuracy= 82.14145302772522%
round  5, training accuracy= 85.65815091133118%
r

In [None]:
NUM_CLIENTS = 5 # Number of clients (held fixed here; swept in the previous block)

# Total number of server and client interactions
NUM_ROUNDS = 10

# The training process and the evaluation computation depend only on
# `model_fn`, so they are built once instead of once per epoch setting.
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.01),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0))
evaluation = tff.learning.build_federated_evaluation(model_fn)

# Pick the clients once, outside the sweep, so that NUM_EPOCHS is the only
# thing that varies between runs. Sampling is without replacement because
# np.random.choice defaults to replace=True and could repeat a client.
sample_clients = np.random.choice(emnist_train.client_ids, NUM_CLIENTS, replace=False)
print("Client IDs selected: ", sample_clients)

## Iteratively change NUM_EPOCHS
for NUM_EPOCHS in [5, 50, 100]:
  print(f'Local epochs per client: {NUM_EPOCHS}')

  # Consider data from only the selected clients
  federated_train_data = make_federated_data(emnist_train, sample_clients, NUM_EPOCHS)
  print(f'Number of client datasets considered: {len(sample_clients)}')

  # Re-initialize the model parameters for every epoch setting so each
  # configuration starts from the same untrained state
  state = iterative_process.initialize()

  for round_num in range(1, NUM_ROUNDS + 1):
    state, metrics = iterative_process.next(state, federated_train_data)
    print('round {:2d}, training accuracy= {}%'.format(round_num, metrics['train']['sparse_categorical_accuracy']*100))

  # Evaluate the final model on the test split of the same clients. A single
  # pass over the test data is sufficient; repeating it only adds compute.
  federated_test_data = make_federated_data(emnist_test, sample_clients, 1)
  test_metrics = evaluation(state.model, federated_test_data)
  print('Test Accuracy: {}%'.format(str(test_metrics['eval']['sparse_categorical_accuracy']*100)))

Client IDs selected:  ['f0969_31' 'f3590_03' 'f1089_04' 'f1432_44' 'f1765_33']
Number of client datasets considered: 5
round  1, training accuracy= 13.188010454177856%
round  2, training accuracy= 15.422342717647552%
round  3, training accuracy= 19.073569774627686%
round  4, training accuracy= 22.45231568813324%
round  5, training accuracy= 21.58038169145584%
round  6, training accuracy= 29.536783695220947%
round  7, training accuracy= 34.71389710903168%
round  8, training accuracy= 39.29155170917511%
round  9, training accuracy= 40.21798372268677%
round 10, training accuracy= 47.247955203056335%
Test Accuracy: 59.090906381607056%
Client IDs selected:  ['f0458_38' 'f0305_08' 'f3447_20' 'f1113_16' 'f0744_23']
Number of client datasets considered: 5
round  1, training accuracy= 31.67780041694641%
round  2, training accuracy= 58.117878437042236%
round  3, training accuracy= 74.79764223098755%
round  4, training accuracy= 82.14145302772522%
round  5, training accuracy= 85.65815091133118%
r