# **USE CASE 5.** Introducing Differential Privacy (in image classification) in TFF

## Required libraries and configuration

Import required libraries

In [1]:
import collections
import random
import os

# Silence TensorFlow's native (C++) INFO and WARNING messages.
# This has to be set BEFORE TensorFlow is imported to take effect; setting it
# after the import (as was done previously) leaves the messages enabled.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import tensorflow_datasets as tfds

from tensorflow_federated.python.simulation.datasets import emnist
from tensorflow_federated.python.learning.algorithms import build_unweighted_fed_avg, build_fed_eval
from tensorflow_federated.python.learning.model_update_aggregator import dp_aggregator
from tensorflow.keras import models, layers, losses, metrics, optimizers

2023-02-20 17:04:15.760326: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-20 17:04:18.041411: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-20 17:04:18.041547: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
  from .autonotebook import tqdm as notebook_tqdm


Define some parameters for the simulation: the number of clients in the federated scenario, the number of federated rounds, the number of local epochs each client runs before communicating, and the batch size for the training phase. In addition, `DP_MULTIPLIERS` lists the Gaussian noise multipliers to evaluate in the DP mechanism.

In [2]:
# Simulation parameters (tune these to change the size/cost of the experiment)
NUM_CLIENTS = 10 # Number of clients in the federated scenario (the first NUM_CLIENTS client ids are used)
NUM_ROUNDS = 10 # Number of federated learning rounds run for each noise multiplier
NUM_EPOCHS = 5 # Number of times each client's local dataset is repeated per round
BATCH_SIZE = 20 # Batch size used when batching each client's dataset

DP_MULTIPLIERS = [0.0, 0.05, 0.1, 0.2, 0.5] # Gaussian noise multipliers to compare; 0.0 is the no-added-noise reference

# Fix the seed of every random number generator used, for reproducibility.
# `seed` is also reused for the dataset shuffling and the i.i.d. partitioning.
seed = 10
np.random.seed(seed)
tf.random.set_seed(seed)
tf.keras.utils.set_random_seed(seed)

## Loading and preparing the input data

For more detailed information about data loading, please see use case 1.1 in TFF.

The `iid_data` variable selects the data partition: if True, MNIST is loaded from TFDS and split i.i.d. (uniformly at random) among the clients; otherwise, the non-i.i.d. federated partition (EMNIST digits) provided by TFF is used.

In [3]:
# True  -> MNIST from TFDS, split uniformly at random among NUM_CLIENTS clients.
# False -> federated (non-i.i.d.) EMNIST digits partition provided by TFF.
iid_data = False


def _balanced_client_ids(n_examples, n_clients, shuffle_seed):
    """Return one client id per example, shuffled reproducibly.

    Ids range over 0..n_clients-1 and the per-client counts differ by at most
    one. Unlike repeating each id `n_examples // n_clients` times, the result
    always has exactly `n_examples` entries, so it can be assigned as a
    dataframe column even when `n_examples` is not a multiple of `n_clients`.
    When it is a multiple, the list (and hence the shuffled partition) is the
    same as with the repeat-based construction.
    """
    ids = [(i * n_clients) // n_examples for i in range(n_examples)]
    random.Random(shuffle_seed).shuffle(ids)
    return ids


if iid_data:
    # Load MNIST from tfds, and get train and test partitions
    mnist = tfds.load('mnist')
    mnist_train, mnist_test = mnist['train'], mnist['test']

    # Transform the data to a dataframe
    mnist_train_df = tfds.as_dataframe(mnist_train)

    # Create a random list of ids (one per row) and assign it to the dataframe
    ids_train = _balanced_client_ids(len(mnist_train_df), NUM_CLIENTS, seed)
    mnist_train_df['id'] = ids_train

    # Do the same with the test data (different seed, so the shuffles are independent)
    mnist_test_df = tfds.as_dataframe(mnist_test)
    ids_test = _balanced_client_ids(len(mnist_test_df), NUM_CLIENTS, seed+1)
    mnist_test_df['id'] = ids_test

    # This method receives a client_id, and returns the training tf.data.Dataset for that client
    def create_tf_dataset_for_client_fn_train(client_id):
        client_data = mnist_train_df[mnist_train_df['id'] == client_id].drop(columns='id')
        return tf.data.Dataset.from_tensor_slices(client_data.to_dict('list'))

    # This method receives a client_id, and returns the testing tf.data.Dataset for that client
    def create_tf_dataset_for_client_fn_test(client_id):
        client_data = mnist_test_df[mnist_test_df['id'] == client_id].drop(columns='id')
        return tf.data.Dataset.from_tensor_slices(client_data.to_dict('list'))

    # Wrap both partitions as TFF ClientData objects, keyed by the integer client id
    mnist_train = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
        client_ids=list(range(0,NUM_CLIENTS)),
        serializable_dataset_fn=create_tf_dataset_for_client_fn_train
    )
    mnist_test = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
        client_ids=list(range(0,NUM_CLIENTS)),
        serializable_dataset_fn=create_tf_dataset_for_client_fn_test
    )
else:
    # Load federated version of MNIST
    mnist_train, mnist_test = emnist.load_data(only_digits=True)

# Turn a client's raw dataset into batches of OrderedDict(x=..., y=...)
def preprocess(dataset):
    """Repeat, shuffle and batch `dataset`, then rename its fields to `x`/`y`.

    The dataset is repeated NUM_EPOCHS times, shuffled with a buffer of 100
    elements (seeded with `seed`), grouped into batches of BATCH_SIZE and
    finally mapped so that every batch is an OrderedDict with keys `x`
    (the image) and `y` (the label).
    """
    def batch_format_fn(element):
        # The image lives under a different key depending on the data source;
        # the TFDS images are additionally divided by 255.
        if iid_data:
            features = element['image']/255
        else:
            features = element['pixels']
        return collections.OrderedDict(x=features, y=element['label'])

    repeated = dataset.repeat(NUM_EPOCHS)
    shuffled = repeated.shuffle(100, seed=seed)
    batched = shuffled.batch(BATCH_SIZE)
    return batched.map(batch_format_fn)

# Build the per-client datasets used in the simulation.
def make_federated_data(client_data, n_clients):
    """Return a list with one preprocessed dataset per selected client.

    Only the first `n_clients` entries of `client_data.client_ids` are used;
    each client's dataset is created and passed through `preprocess`.
    """
    selected_ids = client_data.client_ids[0:n_clients]

    federated_datasets = []
    for cid in selected_ids:
        raw_dataset = client_data.create_tf_dataset_for_client(cid)
        federated_datasets.append(preprocess(raw_dataset))
    return federated_datasets

# Federated training and testing data: one preprocessed dataset per client,
# taken from the first NUM_CLIENTS clients of each partition.
train_data = make_federated_data(client_data=mnist_train, n_clients=NUM_CLIENTS)
test_data = make_federated_data(client_data=mnist_test, n_clients=NUM_CLIENTS)

2023-02-20 17:04:24.621840: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-02-20 17:04:24.622423: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)


## Create a Deep Learning model

Define a CNN model with keras. 
Note that any network architecture supported by keras can be used.

In [4]:
def create_keras_model():
    """Build a fresh (uncompiled) Keras CNN for 28x28 single-channel images.

    Architecture: reshape to (28, 28, 1) -> 5x5 convolution with 32 filters
    and ReLU -> 2x2 max pooling -> flatten -> 10-way softmax.
    """
    layer_stack = [
        layers.Reshape(target_shape=(28, 28, 1), input_shape=(28, 28)),
        layers.Conv2D(
            filters=32,
            kernel_size=(5, 5),
            strides=1,
            padding="same",
            activation="relu",
        ),
        layers.MaxPooling2D(pool_size=2, strides=2, padding='valid'),
        layers.Flatten(),
        layers.Dense(units=10, activation="softmax"),
    ]
    return models.Sequential(layer_stack)

def model_fn():
    """Wrap a newly created Keras model as a TFF learning model.

    A new Keras model is built on every call. The input specification is
    taken from the first client's training dataset; sparse categorical
    cross-entropy is the loss and sparse categorical accuracy the metric.
    """
    return tff.learning.from_keras_model(
        keras_model=create_keras_model(),
        input_spec=train_data[0].element_spec,
        loss=losses.SparseCategoricalCrossentropy(),
        metrics=[metrics.SparseCategoricalAccuracy()],
    )


## Training (and evaluating) federated model with Differential Privacy

To train a model in a federated learning framework with differential privacy, we use the `dp_aggregator` provided by TFF as the model aggregator of the unweighted FedAvg algorithm. The rest of the process is similar to the one performed in use case 1.1 in TFF.

We define a function that trains and tests the model for a given value of the Gaussian noise multiplier.

In [5]:
def fl_train_test_with_DP(noise_multiplier, train_data, test_data):
    """Train with unweighted FedAvg + DP aggregation, then evaluate the model.

    Runs NUM_ROUNDS federated rounds on `train_data` using TFF's
    `dp_aggregator` as model aggregator, prints loss/accuracy after every
    round, evaluates the final weights on `test_data`, and prints the test
    metrics and the elapsed wall-clock time.

    Args:
        noise_multiplier: Gaussian noise multiplier passed to `dp_aggregator`.
        train_data: list of per-client training datasets.
        test_data: list of per-client testing datasets.

    Returns:
        A dict with keys `noise_multiplier`, `train_metrics` (metrics of the
        last training round, or None if NUM_ROUNDS is 0), `eval_metrics`
        (test metrics) and `elapsed_seconds`, so that runs with different
        multipliers can be compared programmatically.
    """
    import time  # local import: keeps this cell independent of the import cell

    print('m: ' + str(noise_multiplier))

    # Measured span: building the processes, training and evaluation
    start_time = time.perf_counter()

    # Aggregator with Differential Privacy
    # (NUM_CLIENTS is passed as the number of clients taking part in each round)
    aggregation_factory = dp_aggregator(noise_multiplier, NUM_CLIENTS)

    training_process = build_unweighted_fed_avg(
        model_fn,
        client_optimizer_fn=lambda: optimizers.Adam(learning_rate=0.001),
        server_optimizer_fn=lambda: optimizers.Adam(learning_rate=0.01),
        model_aggregator=aggregation_factory
    )

    # The rest of the code below in this cell follows Use Case 1.1

    # Training
    train_state = training_process.initialize()
    train_metrics = None  # stays None only if NUM_ROUNDS is 0

    for round_num in range(1, NUM_ROUNDS+1):
        # Train next round (send model to clients, local training, and server model averaging)
        result = training_process.next(train_state, train_data)

        # Current state of the model
        train_state = result.state

        # Get and print metrics, as the loss and accuracy (averaged across all clients)
        train_metrics = result.metrics['client_work']['train']
        print('Round {:2d},  \t Loss={:.4f}, \t Accuracy={:.4f}'.format(round_num, train_metrics['loss'], train_metrics['sparse_categorical_accuracy']))

    # Evaluation
    # Indicate that the model architecture is the one proposed before
    evaluation_process = build_fed_eval(model_fn)

    # Initialize the process and set the weights to those previously trained
    # (read from the training state and written into the evaluation state).
    evaluation_state = evaluation_process.initialize()
    model_weights = training_process.get_model_weights(train_state)
    evaluation_state = evaluation_process.set_model_weights(evaluation_state, model_weights)

    # Pass test data to the model in each client
    evaluation_output = evaluation_process.next(evaluation_state, test_data)

    # Get and print metrics
    eval_metrics = evaluation_output.metrics['client_work']['eval']['current_round_metrics']
    print('Test data, \t Loss={:.4f}, \t Accuracy={:.4f}'.format(eval_metrics['loss'], eval_metrics['sparse_categorical_accuracy']))

    # Report the elapsed time, as shown in the recorded output of the notebook
    elapsed_seconds = time.perf_counter() - start_time
    print('Training and testing in {:.2f} seconds'.format(elapsed_seconds))

    print('---\n\n')

    return {
        'noise_multiplier': noise_multiplier,
        'train_metrics': train_metrics,
        'eval_metrics': eval_metrics,
        'elapsed_seconds': elapsed_seconds,
    }

Try with different multipliers for the DP process, and compare the results

In [6]:
# Train and evaluate once per noise multiplier; each run prints its own report.
# (The former `m = DP_MULTIPLIERS[0]` was dead code: the loop overwrites `m`
# immediately, and the indexing failed on an empty list.)
for m in DP_MULTIPLIERS:
    fl_train_test_with_DP(m, train_data, test_data)

m: 0.0
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Round  1,  	 Loss=2.2077, 	 Accuracy=0.2379
Round  2,  	 Loss=2.0795, 	 Accuracy=0.3981
Round  3,  	 Loss=1.7940, 	 Accuracy=0.5813
Round  4,  	 Loss=1.5610, 	 Accuracy=0.6728
Round  5,  	 Loss=1.3017, 	 Accuracy=0.7603
Round  6,  	 Loss=1.0646, 	 Accuracy=0.8152
Round  7,  	 Loss=0.8369, 	 Accuracy=0.8582
Round  8,  	 Loss=0.6483, 	 Accuracy=0.8842
Round  9,  	 Loss=0.5184, 	 Accuracy=0.8967
Round 10,  	 Loss=0.4202, 	 Accuracy=0.9053
Test data, 	 Loss=0.6447, 	 Accuracy=0.8017
Training and testing in 33.12 seconds
---


m: 0.05
Round  1,  	 Loss=2.2136, 	 Accuracy=0.2469
Round  2,  	 Loss=2.0113, 	 Accuracy=0.4535
Round  3,  	 Loss=1.8386, 	 Accuracy=0.5953
Round  4,  	 Loss=1.6572, 	 Accuracy=0.6792
Round  5,  	 Loss=1.4541, 	 Accuracy=0.7484
Round  6,  	 Loss=1.2471, 	 Accuracy=0.7988
Round  7,  	 Loss=1.0507, 	 Accuracy=0.8325
Round  8,  	 Loss=0.8802, 	 Accuracy=0.8572
Round  9,  	 Loss=0.7283, 	 Accuracy=0.8794
Round 10,  	 Loss=0.6034, 	 Accuracy=0.8887
Test data, 	 Loss=0.7302