<a href="https://colab.research.google.com/github/Yasaman-A/federated-learning-tools/blob/main/AutoEdgeML/AutoEdgeML_MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lib

In [None]:
!pip install -U tensorboard-plugin-profile
!pip install --quiet --upgrade tensorflow_federated
!pip install nest_asyncio

import nest_asyncio
nest_asyncio.apply()


In [2]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
import collections
import functools
import os
import time
import tensorflow_federated as tff
from datetime import datetime

tf.compat.v1.enable_v2_behavior()
%load_ext tensorboard


In [3]:
# Root directory for the TensorBoard logs written by this notebook; the central
# and federated runs append their own sub-directories to it.
log_base_dir = '/tmp/logs/'

In [4]:
import shutil

# Start from a clean log directory. Uses the configured `log_base_dir` instead of
# a second hard-coded copy of the path; a missing directory is not an error.
shutil.rmtree(log_base_dir, ignore_errors=True)

In [5]:
# Accumulators that collect results of the federated runs executed in this kernel session.
# Run key -> list holding the test categorical accuracy of that run.
federated_accuracies = {}
# Run keys in the order in which the runs were recorded.
federated_order=[]
# Run key -> list of per-round training metrics.
federated_history = collections.defaultdict(list)

# Input

In [15]:
#@title String fields { display-mode: "form" }

#@markdown Enter your model and parameters. You will also need to provide two functions which will be described in the form below.

#@markdown Central-approach specific parameters.
EPOCHS =  25#@param {type:"integer"}

#@markdown Federated-Learning-approach specific parameters.
NUM_FD_ROUNDS =  50#@param {type:"integer"}
NUM_CLIENTS =  5#@param {type:"integer"}
SPLIT_RANDOMLY = False #@param {type:"boolean"}

#@markdown Common parameters.

BATCH_SIZE = 50 #@param {type:"integer"}
SHUFFLE_BUFFER = 1024 #@param {type:"integer"}
TRAIN_SIZE = 0.685 #@param {type:"number"}
VALIDATION_SIZE = 0.2 #@param {type:"number"}
PREFETCH_BUFFER = 10

#@markdown Provide the URL (web or local) to the model and the name of the extracted folder.

MODEL_URL = "file:////content/demo_model.tar.gz" #@param {type:"string"}
MODEL_EXTRACTED_DIR_NAME = "demo_model"  #@param {type:"string"}

# Lower-case aliases read by the data-splitting cell.
train_size = TRAIN_SIZE
validation_size = VALIDATION_SIZE


Provide the dataset to use in the training. You can download a dataset from https://www.tensorflow.org/datasets/catalog/overview or create your own dataset. In either case, you should assign your dataset to a variable called `dataset`.

In [7]:
# Example: build a custom dataset from the MNIST images plus a synthetic auxiliary column.
from random import randrange

NUM_CLASS = 10

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Merge the stock train/test images, divide the pixel values by 255 and
# add a trailing channel axis so every image has shape (28, 28, 1).
x = np.concatenate([x_train, x_test], axis=0).astype('float32') / 255
x = x.reshape(len(x), 28, 28, 1)

# One-hot encode the merged labels.
y = keras.utils.to_categorical(np.concatenate([y_train, y_test], axis=0), NUM_CLASS)

# aux drives the non-iid split: every sample is assigned at random to one of 600 users.
aux = np.array([randrange(600) for _ in range(len(y))])

dataset = tf.data.Dataset.from_tensor_slices((x, aux, y))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


The code will run the following function to choose non-iid input. The function receives one element of your dataset as its only argument and returns the value (for example, a client id) that decides which client the element is assigned to. In this function you have the freedom to implement how you want the code to distribute the data between different clients. For example, similar to before, assume that each element of your dataset has image data, some auxiliary data, and a label. In that case, you can write the function as follows.

```Python
def get_non_iid(data):
  return data[1].numpy()
```

In [9]:
def get_non_iid(data):
  """Return the key that decides which client a dataset element belongs to.

  Args:
    data: one element of `dataset`; for the example dataset built in this
      notebook that is an `(image, aux, label)` tuple of tensors.

  Returns:
    The value of the auxiliary column (`data[1]`) converted with `.numpy()`.
    Elements are grouped by this value, and the groups are later looked up
    by client id.
  """
  # TODO: adapt the index if your own dataset stores the target column elsewhere.
  return data[1].numpy()
 

The code will run the following function to map the elements of your dataset to a format that is acceptable for your model. For example, assuming that each element in your dataset contains image data, some auxiliary data, and a label, you will need to write the following function.

```python
def prep_model_input(image, aux, label):
  return (image, label)
```

In [10]:
def prep_model_input(image, aux, label):
  """Map one dataset element to the `(features, label)` pair the model consumes.

  Args:
    image: image component of the element.
    aux: auxiliary component; only needed for the non-iid split, so it is dropped.
    label: label component of the element.

  Returns:
    The tuple `(image, label)`.
  """
  # TODO: adapt the parameters and the returned tuple if your dataset has a different layout.
  return (image, label)

# Download

In [11]:
#@title Download the dataset and model.

import urllib.request
import tarfile

# Stream the model archive from MODEL_URL and unpack it into the working directory.
# Both handles are context-managed so they are closed even if extraction fails.
# NOTE(security): extractall() trusts the member paths stored in the archive;
# only point MODEL_URL at archives you trust.
with urllib.request.urlopen(MODEL_URL) as ftpstream, \
        tarfile.open(fileobj=ftpstream, mode="r|gz") as thetarfile:
    thetarfile.extractall()

# Model instance used by the centralized training run.
central_model = tf.keras.models.load_model(MODEL_EXTRACTED_DIR_NAME)

def create_keras_model():
  """Load the saved model from MODEL_EXTRACTED_DIR_NAME and return it.

  Every call reads the model from disk again, so callers get a separate instance.
  """
  return tf.keras.models.load_model(MODEL_EXTRACTED_DIR_NAME)




# Central

In [12]:
#@title Split data into train, validation, and test.

# -------- Internal tool code
# -- Split into train, test, and validation

def split_dataset(dataset: "tf.data.Dataset", fraction: float):
    """Split ``dataset`` into two disjoint parts by element position.

    Elements are enumerated and bucketed by ``index % 100``: within every run
    of 100 consecutive elements, the first ``round(fraction * 100)`` go to the
    first returned dataset and the rest to the second.

    Args:
        dataset: the dataset to split. It must yield its elements in the same
            order on every iteration, otherwise the two parts can overlap.
        fraction: share of the elements, in [0, 1], for the first part.

    Returns:
        A ``(fraction_dataset, remaining_dataset)`` tuple.

    Raises:
        ValueError: if ``fraction`` rounds to a percentage outside [0, 100].
    """
    data_percent = round(fraction * 100)
    if not (0 <= data_percent <= 100):
        raise ValueError("validation data fraction must be ∈ [0,1]")

    indexed = dataset.enumerate()

    # Strict '<' so that exactly `data_percent` of every 100 elements are selected;
    # '<=' would select one element too many (and 1% even for fraction == 0).
    fraction_dataset = indexed.filter(lambda f, data: f % 100 < data_percent)
    remaining_dataset = indexed.filter(lambda f, data: f % 100 >= data_percent)

    # Drop the enumeration index again.
    fraction_dataset = fraction_dataset.map(lambda f, data: data)
    remaining_dataset = remaining_dataset.map(lambda f, data: data)

    return fraction_dataset, remaining_dataset


# Shuffle ONCE with a fixed order. split_dataset() filters this pipeline by element
# position and the resulting subsets are iterated independently; with the default
# behaviour of Dataset.shuffle (a new order on every iteration) the train,
# validation and test subsets would be drawn from different orderings and overlap.
SPLIT_SHUFFLE_SEED = 42
shuffled_dataset = dataset.shuffle(
    SHUFFLE_BUFFER,
    seed=SPLIT_SHUFFLE_SEED,
    reshuffle_each_iteration=False,
).map(prep_model_input)

train_dataset, test_dataset = split_dataset(shuffled_dataset, train_size)
train_dataset, validation_dataset = split_dataset(train_dataset, 1-validation_size)

# Only the training subset is reshuffled every epoch; this happens after the split,
# so it cannot move samples between the subsets.
train_dataset      = train_dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)
validation_dataset = validation_dataset.batch(BATCH_SIZE)
test_dataset       = test_dataset.batch(BATCH_SIZE)

validation_dataset

<BatchDataset shapes: ((None, 28, 28, 1), (None, 10)), types: (tf.float32, tf.float32)>

In [None]:

#@title Compile and run the model.

# TensorBoard logs of the centralized run go to <log_base_dir>/central/.
logdir = log_base_dir + "central/"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# -------- Run the model
# NOTE(review): the labels are one-hot encoded and `tf.keras.metrics.Accuracy()` is
# presumably an exact-match metric, which would not be meaningful for probability
# outputs -- confirm it is intended next to CategoricalAccuracy.
central_model.compile(loss=keras.losses.CategoricalCrossentropy(),
              optimizer=keras.optimizers.SGD(learning_rate=0.02),
              metrics=[tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.Accuracy()])

import time
# Wall-clock timestamp taken just before training starts.
first = time.time()

# NOTE(review): `shuffle=True` is presumably ignored by Keras when the input is a
# tf.data.Dataset -- confirm against the `Model.fit` documentation.
history = central_model.fit(train_dataset,
          epochs=EPOCHS,
          shuffle=True,
          verbose=1,
          validation_data=validation_dataset,
          callbacks=[tensorboard_callback])

# Wall-clock timestamp after training; the difference is the training duration in seconds.
seconds = time.time()
print("Time diff =", seconds - first)

# Evaluate on the held-out test split. score[0] is printed as the loss and score[1]
# as the accuracy (the first metric passed to compile(), i.e. CategoricalAccuracy).
score = central_model.evaluate(test_dataset, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [14]:
# Keep the per-epoch metric history of the centralized run; the plotting cells
# read its 'categorical_accuracy' entry.
central_history = history.history

# Federated

In [16]:
# Group every element of the dataset under the key returned by get_non_iid().
non_iid_map = collections.defaultdict(list)

for data in dataset:
    client_key = get_non_iid(data)
    non_iid_map[client_key].append(data)


In [None]:
# Ids of the participating clients: 0 .. NUM_CLIENTS - 1.
client_ids = list(range(NUM_CLIENTS))

def create_non_iid_dataset_for_client_fn(client_id):
    """Build the train and test datasets of a single client.

    The records stored in ``non_iid_map[client_id]`` are split by position:
    the first ``round(len(records) * TRAIN_SIZE)`` records form the training
    set, the remaining ones the test set. Records are transposed into
    per-component lists, and only the first three components are kept.

    Args:
        client_id: key into ``non_iid_map``.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of ``tf.data.Dataset`` objects.
    """
    records = non_iid_map[client_id]
    max_train_index = round(len(records) * TRAIN_SIZE)

    train_columns = collections.defaultdict(list)
    test_columns = collections.defaultdict(list)
    for position, record in enumerate(records):
        # Pick the destination once per record, then append component by component.
        target = train_columns if position < max_train_index else test_columns
        for column, value in enumerate(record):
            target[column].append(value)

    train_slices = tuple(train_columns[column] for column in range(3))
    test_slices = tuple(test_columns[column] for column in range(3))
    return (tf.data.Dataset.from_tensor_slices(train_slices),
            tf.data.Dataset.from_tensor_slices(test_slices))

def fd_preprocess(dataset):
  """Turn a client's raw (train, test) datasets into batched model inputs.

  Both datasets go through the same pipeline: repeat EPOCHS times, shuffle,
  batch, map through prep_model_input and finally wrap each batch in an
  OrderedDict with the keys 'x' and 'y'.

  Args:
    dataset: a `(train_dataset, test_dataset)` pair.

  Returns:
    The `(train_dataset, test_dataset)` pair after preprocessing.
  """
  def as_xy_dict(x, y):
    return collections.OrderedDict(x=x, y=y)

  def pipeline(client_dataset):
    repeated = client_dataset.repeat(EPOCHS)
    batched = repeated.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)
    return batched.map(prep_model_input).map(as_xy_dict)

  train_dataset, test_dataset = dataset
  return pipeline(train_dataset), pipeline(test_dataset)

def make_federated_data(client_ids):
    """Create the preprocessed datasets of every client in ``client_ids``.

    Returns:
        Two lists, ``(train_datasets, test_datasets)``, each holding one
        dataset per client in the order of ``client_ids``.
    """
    per_client = [
        fd_preprocess(create_non_iid_dataset_for_client_fn(client_id))
        for client_id in client_ids
    ]
    train_dataset = [train for train, _ in per_client]
    test_dataset = [test for _, test in per_client]
    return train_dataset, test_dataset

# Build the per-client train and test dataset lists for all configured clients.
train_preprocessed_fd_dataset, test_preprocessed_fd_dataset = make_federated_data(client_ids)
# Element structure of the first client's batches; model_fn passes it to TFF as input_spec.
element_spec = train_preprocessed_fd_dataset[0].element_spec
# Show the list of client training datasets as the cell output.
train_preprocessed_fd_dataset

In [None]:
# Print the number of batches in the first client's datasets: train first, then test.
for first_client_dataset in (train_preprocessed_fd_dataset[0], test_preprocessed_fd_dataset[0]):
    count = sum(1 for _ in first_client_dataset)
    print(count)


In [19]:
#@title Prep the model and log dir.

def model_fn():
  """Wrap a freshly loaded Keras model for use with TensorFlow Federated.

  The model, loss and metrics are created inside this function on every call.
  Loss and metrics carry 'epoch_*' names.
  """
  keras_model = create_keras_model()
  loss_fn = tf.keras.losses.CategoricalCrossentropy(name="epoch_loss")
  metric_list = [
      tf.keras.metrics.CategoricalAccuracy(name="epoch_categorical_accuracy"),
      tf.keras.metrics.Accuracy(name="epoch_accuracy"),
  ]
  return tff.learning.from_keras_model(
      keras_model,
      input_spec=element_spec,
      loss=loss_fn,
      metrics=metric_list)
  
import shutil
import pathlib

# The run key encodes the configuration so that different runs log to different directories.
fd_key = "clients_{}_rounds_{}_splitrandom_{}".format(NUM_CLIENTS, NUM_FD_ROUNDS, SPLIT_RANDOMLY)
train_logdir = log_base_dir + "federated/train_{}".format(fd_key)
test_logdir = log_base_dir + "federated/test_{}".format(fd_key)

# Start every run from empty log directories. Each directory is handled on its own:
# previously a missing train directory made rmtree() raise, and the bare `except`
# then skipped the removal of stale test logs.
for run_logdir in (train_logdir, test_logdir):
    shutil.rmtree(run_logdir, ignore_errors=True)
    pathlib.Path(run_logdir).mkdir(parents=True, exist_ok=True)

In [None]:
#@title Compile and run the model.

# Build the federated averaging process around model_fn. The client optimizer uses
# the same SGD learning rate (0.02) as the centralized run; the server optimizer
# uses SGD with learning rate 1.0.
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0))

# Writer for the per-round training scalars shown in TensorBoard.
summary_writer = tf.summary.create_file_writer(train_logdir)

# Initial server state that the training loop advances round by round.
state = iterative_process.initialize()

# TODO make this a parameter as well.
USERS_PER_ROUND = NUM_CLIENTS

In [None]:
import time

start_time = time.monotonic()
print(time.ctime())

# Key under which the per-round metrics of this configuration are stored.
key = "C{},R{}".format(NUM_CLIENTS, NUM_FD_ROUNDS)

with summary_writer.as_default():
    # Run exactly NUM_FD_ROUNDS rounds (steps 0 .. NUM_FD_ROUNDS - 1). The previous
    # range(0, NUM_FD_ROUNDS + 1) executed one round more than configured.
    for round_num in range(NUM_FD_ROUNDS):
        state, metrics = iterative_process.next(state, train_preprocessed_fd_dataset)
        federated_history[key].append(metrics['train'])
        for name, value in metrics['train'].items():
            # Log the loss as 'epoch_loss' -- presumably to share a TensorBoard tag
            # with the centralized Keras run; confirm.
            if name == 'loss':
                name = 'epoch_loss'
            tf.summary.scalar(name, value, step=round_num)
        print('round {:2d}, metrics={}'.format(round_num, metrics))
print('Duration: ', (time.monotonic() - start_time))



In [None]:
# Build a federated evaluation computation for the same model definition.
evaluation = tff.learning.build_federated_evaluation(model_fn)

# Evaluate the trained model weights on the clients' training data and on their held-out test data.
train_metrics = evaluation(state.model, train_preprocessed_fd_dataset)
test_metrics = evaluation(state.model, test_preprocessed_fd_dataset)


In [None]:
# Print the federated evaluation metrics: training data first, then test data.
for evaluated_metrics in (train_metrics, test_metrics):
    print(evaluated_metrics)

In [None]:
# Store the test accuracy of the current configuration under its run key.
key = "C{},R{}".format(NUM_CLIENTS, NUM_FD_ROUNDS)
federated_accuracies[key] = [test_metrics['epoch_categorical_accuracy']]
# Record the key only once so that re-running this cell does not add duplicates.
if key not in federated_order:
    federated_order.append(key)
federated_accuracies

In [None]:
federated_order

# Show Graphs with TensorBoard

In [None]:
%tensorboard --logdir '/tmp/logs/' --port=0

# Draw Graphs


In [27]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random

In [28]:
# Convert the stored accuracies from fractions to percentages, in place.
# The lists are overwritten, so every execution of this cell scales them by 100 again.
for run_accuracies in federated_accuracies.values():
    for position, fraction in enumerate(run_accuracies):
        run_accuracies[position] = fraction * 100

In [None]:


# One column (and therefore one bar) per federated run, in the order the runs were recorded.
values = {key: federated_accuracies[key] for key in federated_order}

df2 = pd.DataFrame.from_dict(values)


sns.set_theme(style="whitegrid")

ax = sns.barplot(data=df2, palette="muted", edgecolor=(0,0,0))
ax.set(ylim = (0,110))

ax.set(ylabel='Accuracy (%)')

# Write each run's accuracy just above its bar.
for i, key in enumerate(federated_accuracies):
    accuracy = federated_accuracies[key][0]
    ax.text(i,
            accuracy + 1,
            "{:.2f}%".format(accuracy),
            color='black',
            ha="center")

def change_width(ax, new_value) :
    """Set the width of every patch in ``ax`` to ``new_value``, keeping it centred.

    Args:
        ax: object exposing a ``patches`` iterable (e.g. a matplotlib Axes).
        new_value: the width every patch should have afterwards.
    """
    for bar in ax.patches :
        # Half of the removed width; shifting by it keeps the bar centre in place.
        shift = (bar.get_width() - new_value) * .5

        bar.set_width(new_value)
        bar.set_x(bar.get_x() + shift)

# Set every bar's width to 0.5 (re-centred by change_width), then render the figure.
change_width(ax, .5)
plt.show()

In [None]:


# Test accuracy of both approaches, in percent.
central_acc = score[1] * 100
fd_acc = test_metrics['epoch_categorical_accuracy'] * 100
df2 = pd.DataFrame(np.array([[central_acc, fd_acc]]), columns=['Central', 'Federated'])

sns.set_theme(style="whitegrid")

ax = sns.barplot(data=df2, palette="muted", edgecolor=(0,0,0))
ax.set(ylim = (0,110))
ax.set(ylabel='Accuracy (%)')

# Label each bar with its value: bar 0 is the central run, bar 1 the federated run.
for bar_index, accuracy in enumerate((central_acc, fd_acc)):
    ax.text(bar_index, accuracy + 1, "{:.2f}%".format(accuracy), color='black', ha="center")

change_width(ax, .5)

plt.show()
#sns.despine()

In [32]:
# Per-run list of the categorical accuracy of every training round, in percent.
federated_categorical = collections.defaultdict(list)
for run_key, rounds in federated_history.items():
    for round_metrics in rounds:
        federated_categorical[run_key].append(round_metrics['epoch_categorical_accuracy'] * 100)

# Length of the shortest series, so that all runs can share one x axis.
run_keys = list(federated_categorical)
fd_len = len(federated_categorical[run_keys[0]])
for run_key in run_keys[1:]:
    fd_len = min(fd_len, len(federated_categorical[run_key]))


In [36]:
# Per-epoch training accuracy of the centralized run, in percent. The history entry
# is a plain Python list, so it has to be scaled element by element: `list * 100`
# would repeat the list 100 times instead of multiplying its values.
central_categorical = [accuracy * 100 for accuracy in central_history['categorical_accuracy']]
central_len = len(central_history['categorical_accuracy'])

In [37]:
# Number of rounds to plot (only the federated series length is used) and the matching x values.
count = fd_len
ids = list(range(count))

In [None]:
# Column 'id' holds the round index; every further column is one run's accuracy series,
# truncated to the common length.
d = {'id': ids}
d.update((key, federated_categorical[key][:count]) for key in federated_categorical)
df = pd.DataFrame(data=d)


In [39]:


max_line_count = 10

# Draw max_line_count distinct colours and markers at random; the plotting cell
# picks them by position, one pair per run.
available_colors = list(mcolors.TABLEAU_COLORS)
available_markers = [ ".", ",", "o", "v", "^", "<", ">", "1", "2", "3", "4", "8"]

colors = random.sample(available_colors, max_line_count)
markers = random.sample(available_markers, max_line_count)

In [None]:

# One line per federated run: accuracy (%) over the training rounds.
for i, key in enumerate(federated_categorical):
    plt.plot( 'id', key, data=df, marker=markers[i], markerfacecolor=colors[i], color=colors[i], linewidth=1)

plt.xlabel('Rounds')
plt.ylabel('Accuracy (%)')
plt.grid(axis='x')

plt.ylim(0, 100)

plt.legend()
plt.show()