<a href="https://colab.research.google.com/github/alessandrotofani/Tesi_magistrale/blob/master/6_Federated_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Overview: https://www.tensorflow.org/federated

Image classification tutorial: https://www.tensorflow.org/federated/tutorials/federated_learning_for_image_classification

# Installation

In [1]:
!pip install --quiet fastai==2.2.5
!pip install --quiet folium==0.2.1
!pip install --quiet imgaug==0.2.5
!pip install --quiet tensorflow==2.4.0
!pip install --quiet tensorflow_federated==0.18.0
!pip install --quiet --upgrade nest_asyncio

In [2]:
import nest_asyncio
nest_asyncio.apply()
%load_ext tensorboard

In [3]:
import collections
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import pandas as pd 
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import sys 
sys.path.append('/content/drive/MyDrive/Tesi_magistrale/Tesi_magistrale')
import mf

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Import data

I dati vengono importati e poi splittati in train e test. 

In [5]:
data = pd.read_csv('/content/drive/MyDrive/Tesi_magistrale/Dataset/IEEE/Output/data.csv')
data = mf.new_processing(data)
col_name = mf.get_col(data)

In [6]:
print('Rate safe/fraud:', (1/mf.ratio(data)).round(3))

Rate safe/fraud: 0.036


In [7]:
data, categorical_col_toemb, numerical_col = mf.labelEncoding(data, merge = True)
col_name = mf.get_col(data)

In [8]:
input_size = {}
for col in categorical_col_toemb:
  input_size[col] = data[col].max() + 1

In [9]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.1)

Smote: https://imbalanced-learn.org/stable/generated/imblearn.over_sampling.SMOTE.html

RandomUnderSampler: https://imbalanced-learn.org/stable/generated/imblearn.under_sampling.RandomUnderSampler.html

In [10]:
def underSampling(data, frac_under=0.1):
  from imblearn.under_sampling import RandomUnderSampler 
  us = RandomUnderSampler(sampling_strategy=frac_under, random_state=42)
  y = data['isFraud']
  X = data.drop(columns = ['isFraud'])
  X_us, y_us = us.fit_resample(X, y)
  return X_us, y_us

def overSampling(X, y, frac_over=0.3):
  from imblearn.over_sampling import SMOTE
  sm = SMOTE(sampling_strategy=frac_over, random_state=42)
  X_sm, y_sm = sm.fit_resample(X, y)  
  return X_sm, y_sm

# def mergeResult(X, y, col_name):
#   y_res = np.ndarray(shape=(np.shape(y)[0],1), buffer = y)
#   data = np.concatenate((X,y_res), axis = 1)
#   col_name.append('isFraud')
#   dataset = pd.DataFrame(data=data, columns=col_name)  
#   return dataset

def rate(y):
  n_fraud = np.count_nonzero(y == 1)
  n_safe = np.shape(y)[0] - n_fraud
  return f'Rate safe/fraud: {n_safe/n_fraud}'

In [11]:
X, y = underSampling(train_data)
rate(y)

'Rate safe/fraud: 10.0'

In [12]:
X, y = overSampling(X, y)
rate(y)

'Rate safe/fraud: 3.3333333333333335'

In [13]:
train_data = mf.mergeResult(X, y, col_name)

In [14]:
print('Dataset size:', train_data.shape[0])

Dataset size: 242385


In [15]:
del data, X, y

# Convert data

Il dataset deve essere convertito in un tensore, con componenti (feature_vector, label). 

In [16]:
def to_tensor(data, categorical_col_toemb, n_clients = 4):
  shuffled = data.sample(frac=1)
  result = np.array_split(shuffled, n_clients)  

  res = []
  # new_res = []
  label = []

  # for dataset in result:
  #   res.append(mf.feature_scaling(dataset))
  
  for dataset in result:
    label.append(dataset['isFraud'])
    res.append(dataset.drop(columns = ['isFraud']))

  dataset = {}
  for i in range(n_clients):
    # dataset[i] = tf.data.Dataset.from_tensor_slices((res[i].drop(columns=categorical_col_toemb).to_numpy(), 
    #                                                  res[i]['P_emaildomain'].to_numpy(), 
    #                                                  res[i]['R_emaildomain'].to_numpy(),
    #                                                  res[i]['device_name'].to_numpy(),
    #                                                  res[i]['device_version'].to_numpy(),
    #                                                  res[i]['os_name'].to_numpy(),
    #                                                  res[i]['browser_name'].to_numpy(),                                                     
    #                                                  label[i]))   
    lista = tuple([res[i][col].to_numpy() for col in res[i].columns]+[label[i]])
    # print(lista)
    dataset[i] = tf.data.Dataset.from_tensor_slices(lista)

  return dataset

In [17]:
dataset = to_tensor(train_data, categorical_col_toemb)
test_set = to_tensor(test_data, categorical_col_toemb)

In [18]:
dataset

{0: <TensorSliceDataset shapes: ((), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (),

In [19]:
# del train_data, test_data

# Federated data

Si definisce la funzione di preprocessing del dataset, che serve a creare l'OrderedDict, su cui si andranno a creare le batch necessarie per il training del modello. 

In [26]:
NUM_CLIENTS = 4
NUM_EPOCHS = 10
BATCH_SIZE = 250
SHUFFLE_BUFFER = 10
PREFETCH_BUFFER = 10

def preprocess(dataset):
  def batch_format_fn(*args):
    print(args)
    lista = [args[i] for i in range(len(args) -1)]
    # print(lista)
    o_dict = collections.OrderedDict()
    for i in range(len(lista)):
      # print(arg)
      o_dict[i] = tf.cast(lista[i], tf.float32, name = i)
    return collections.OrderedDict(
        # x = tf.cast(e1, tf.float32),
        x = o_dict,
        y = tf.cast(args[-1], tf.int32))
            
            # a=tf.cast(a, dtype=tf.float32),
            # b=tf.cast(b, dtype=tf.float32),
            # c=tf.cast(c, dtype=tf.float32),
            # d=tf.cast(d, dtype=tf.float32),
            # e=tf.cast(e, dtype=tf.float32),
            # f=tf.cast(f, dtype=tf.float32),
            # g=tf.cast(g, dtype=tf.float32)),
        # y = tf.cast(label, tf.int32))
  return dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER).batch(
      BATCH_SIZE).map(batch_format_fn).prefetch(PREFETCH_BUFFER)

preprocessed_example_dataset = preprocess(dataset[0])
sample_batch = tf.nest.map_structure(lambda x: x.numpy(), next(iter(preprocessed_example_dataset)))
sample_batch

(<tf.Tensor 'args_0:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_1:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_2:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_3:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_4:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_5:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_6:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_7:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_8:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_9:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_10:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_11:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_12:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_13:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_14:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_15:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_16:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_17:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_18:0' shape=(None,) dtype=float32>, <t

OrderedDict([('x',
              OrderedDict([(0,
                            array([21., 18., 44.,  1., 18., 18., 18., 18.,  1.,  1., 55., 21., 18.,
                                   18., 18., 55.,  4., 18., 55.,  1., 18.,  1., 18., 11., 18., 18.,
                                   18., 55.,  5.,  3., 50.,  1., 18.,  1., 18., 18., 18., 55., 55.,
                                    1., 55.,  3., 18.,  1., 18., 55., 55.,  1., 18., 18.,  1.,  3.,
                                   55., 21.,  1., 18.,  1., 55., 18., 18., 55., 18., 18.,  1., 18.,
                                   18., 21., 18., 55., 18., 55.,  1., 18., 18., 31., 21., 18.,  1.,
                                   18., 55., 18., 18., 60.,  1.,  3.,  4.,  1., 55., 18., 18., 10.,
                                    4.,  1.,  1., 18., 18., 55., 18., 55., 18., 21., 18., 21.,  1.,
                                   18.,  3., 18.,  4.,  1., 18., 55., 55., 55., 18.,  3.,  1.,  1.,
                                   55.,  1., 18., 

In [27]:
preprocessed_example_dataset

<PrefetchDataset shapes: OrderedDict([(x, OrderedDict([(0, (None,)), (1, (None,)), (2, (None,)), (3, (None,)), (4, (None,)), (5, (None,)), (6, (None,)), (7, (None,)), (8, (None,)), (9, (None,)), (10, (None,)), (11, (None,)), (12, (None,)), (13, (None,)), (14, (None,)), (15, (None,)), (16, (None,)), (17, (None,)), (18, (None,)), (19, (None,)), (20, (None,)), (21, (None,)), (22, (None,)), (23, (None,)), (24, (None,)), (25, (None,)), (26, (None,)), (27, (None,)), (28, (None,)), (29, (None,)), (30, (None,)), (31, (None,)), (32, (None,)), (33, (None,)), (34, (None,)), (35, (None,)), (36, (None,)), (37, (None,)), (38, (None,)), (39, (None,)), (40, (None,)), (41, (None,)), (42, (None,)), (43, (None,)), (44, (None,)), (45, (None,)), (46, (None,)), (47, (None,)), (48, (None,)), (49, (None,)), (50, (None,)), (51, (None,)), (52, (None,)), (53, (None,)), (54, (None,)), (55, (None,)), (56, (None,)), (57, (None,)), (58, (None,)), (59, (None,)), (60, (None,)), (61, (None,)), (62, (None,)), (63, (None

In [29]:
preprocessed_example_dataset.element_spec

OrderedDict([('x',
              OrderedDict([(0,
                            TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
                           (1,
                            TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
                           (2,
                            TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
                           (3,
                            TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
                           (4,
                            TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
                           (5,
                            TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
                           (6,
                            TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
                           (7,
                            TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
                           (8,
                      

I dati federati sono una lista di dataset divisi per cliente. 

In [30]:
def make_federated_data(dataset):
  federated = []
  for i in dataset:
    federated.append(preprocess(dataset[i]))
  return federated

federated_train_data = make_federated_data(dataset)

print('Number of client datasets: {l}'.format(l=len(federated_train_data)))
print('First dataset: {d}'.format(d=federated_train_data[0]))

(<tf.Tensor 'args_0:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_1:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_2:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_3:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_4:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_5:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_6:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_7:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_8:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_9:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_10:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_11:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_12:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_13:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_14:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_15:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_16:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_17:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_18:0' shape=(None,) dtype=float32>, <t

In [31]:
# del dataset

# Model creation and training

Creazione della rete neurale che sarà trainata. 

Viene anche definita la model function, in cui si specifica il modello, il tipo di input, la loss e le metriche da utilizzare. 

Infine si costruisce il processo di averaging, specificando l'optimizer da usare, cioè SGD, e il learning rate del server e del client. 

In [32]:
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras import layers

def build_multiple_inputs_keras_model():
  categorical_input = []
  numerical_input = []
  embeddings = []
  embedding_layer_names = []
  for col in categorical_col_toemb:
      _input = layers.Input(shape=[1], name=col)
      _embed = layers.Embedding(int(input_size[col]), 3, name=col+'_emb')(_input)
      categorical_input.append(_input)
      embeddings.append(_embed)
      embedding_layer_names.append(col+'_emb')
      
  # Simple inputs for the numeric features
  for col in numerical_col:
      numeric_input = layers.Input(shape=(1,), name=col)
      numerical_input.append(numeric_input)
      
  # Merge the numeric inputs
  merged_num_inputs = layers.concatenate(numerical_input)

  # Merge embedding and use a Droput to prevent overfittting
  merged_inputs = layers.concatenate(embeddings)
  spatial_dropout = layers.SpatialDropout1D(0.6)(merged_inputs) # 0.2
  flat_embed = layers.Flatten()(spatial_dropout)

  # Merge embedding and numeric features
  all_features = layers.concatenate([flat_embed, merged_num_inputs])

  # MLP for classification
  x = layers.Dense(360, activation=tf.keras.activations.gelu)(all_features) 
  x = layers.BatchNormalization()(x)
  x = layers.Dropout(0.4)(x) #0.05 0.2
  output = layers.Dense(1, activation='sigmoid')(x)
  model = models.Model(inputs=categorical_input + numerical_input, outputs=output)
  return model

def model_fn():
  soglia = 0.4
  keras_model = build_multiple_inputs_keras_model()
  return tff.learning.from_keras_model(
      keras_model,
      input_spec=preprocessed_example_dataset.element_spec,
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.BinaryAccuracy(), 
               tf.keras.metrics.Recall(thresholds=soglia),
               tf.keras.metrics.Precision(thresholds=soglia)])
  
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn,  
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.05), #0.4
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1)) #0.8

TypeError: ignored

Training del modello. 

Gpu usage: https://colab.research.google.com/notebooks/gpu.ipynb#scrollTo=Y04m-jvKRDsJ

In [None]:
NUM_ROUNDS =2
# with tf.device('/device:GPU:0'):
state = iterative_process.initialize()
state, metrics = iterative_process.next(state, federated_train_data)
print('round  1, metrics={}'.format(metrics))
for round_num in range(2, NUM_ROUNDS):
  state, metrics = iterative_process.next(state, federated_train_data)
  print('round {:2d}, metrics={}'.format(round_num, metrics))

# Model evaluation

Evaluation del modello sui test data. 

In [None]:
evaluation = tff.learning.build_federated_evaluation(model_fn)
train_metrics = evaluation(state.model, federated_train_data)

In [None]:
federated_test_data = make_federated_data(test_set)

In [None]:
test_metrics = evaluation(state.model, federated_test_data)
str(test_metrics)

"OrderedDict([('binary_accuracy', 0.9495985), ('recall', 0.54801166), ('precision', 0.2786683), ('loss', 0.15338975)])"

Board di tensorboad, per visualizzare la loss e le metriche in modo interattivo. 

In [None]:
# logdir = "/tmp/logs/scalars/training/"
# summary_writer = tf.summary.create_file_writer(logdir)
# state = iterative_process.initialize()
# with summary_writer.as_default():
#   for round_num in range(1, NUM_ROUNDS):
#     state, metrics = iterative_process.next(state, federated_train_data)
#     for name, value in metrics['train'].items():
#       tf.summary.scalar(name, value, step=round_num)

In [None]:
# !ls {logdir}
# %tensorboard --logdir {logdir} --port=0