<a href="https://colab.research.google.com/github/alessandrotofani/Tesi_magistrale/blob/master/6_Federated_Embedding_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Overview: https://www.tensorflow.org/federated

Image classification tutorial: https://www.tensorflow.org/federated/tutorials/federated_learning_for_image_classification

# Installation

In [1]:
!pip install --quiet fastai==2.2.5
!pip install --quiet folium==0.2.1
!pip install --quiet imgaug==0.2.5
!pip install --quiet tensorflow==2.3.0
!pip install --quiet tensorflow_federated==0.17.0
!pip install --quiet --upgrade nest_asyncio

[K     |████████████████████████████████| 194kB 18.0MB/s 
[K     |████████████████████████████████| 61kB 9.7MB/s 
[K     |████████████████████████████████| 71kB 8.2MB/s 
[?25h  Building wheel for folium (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 563kB 17.3MB/s 
[?25h  Building wheel for imgaug (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 320.4MB 50kB/s 
[K     |████████████████████████████████| 20.1MB 1.3MB/s 
[K     |████████████████████████████████| 460kB 57.0MB/s 
[K     |████████████████████████████████| 522kB 16.0MB/s 
[K     |████████████████████████████████| 3.0MB 52.2MB/s 
[K     |████████████████████████████████| 112kB 60.5MB/s 
[K     |████████████████████████████████| 1.1MB 45.4MB/s 
[K     |████████████████████████████████| 153kB 61.6MB/s 
[K     |████████████████████████████████| 174kB 60.8MB/s 
[?25h  Building wheel for absl-py (setup.py) ... [?25l[?25hdone


In [2]:
import nest_asyncio
nest_asyncio.apply()
%load_ext tensorboard

In [3]:
import collections
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import pandas as pd 
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import sys 
sys.path.append('/content/drive/MyDrive/Tesi_magistrale/Tesi_magistrale')
import mf

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Import data

I dati vengono importati e poi splittati in train e test. 

In [5]:
data = pd.read_csv('/content/drive/MyDrive/Tesi_magistrale/Dataset/IEEE/Output/data.csv')
data = mf.new_processing(data)
col_name = mf.get_col(data)

In [6]:
print('Rate safe/fraud:', (1/mf.ratio(data)).round(3))

Rate safe/fraud: 0.036


In [7]:
data, categorical_col_toemb, numerical_col = mf.labelEncoding(data, merge = True)
col_name = mf.get_col(data)

In [8]:
input_size = {}
for col in categorical_col_toemb:
  input_size[col] = data[col].max() + 1

In [None]:
# def underSampling(data, frac_under=0.1):
#   from imblearn.under_sampling import RandomUnderSampler 
#   us = RandomUnderSampler(sampling_strategy=frac_under, random_state=42)
#   y = data['isFraud']
#   X = data.drop(columns = ['isFraud'])
#   X_us, y_us = us.fit_resample(X, y)
#   return X_us, y_us

def overSampling(data, frac_over=0.05):
  from imblearn.over_sampling import SMOTE
  y = data['isFraud']
  X = data.drop(columns = ['isFraud'])
  sm = SMOTE(sampling_strategy=frac_over, random_state=42)
  X_sm, y_sm = sm.fit_resample(X, y)  
  return X_sm, y_sm

def rate(y):
  n_fraud = np.count_nonzero(y == 1)
  n_safe = np.shape(y)[0] - n_fraud
  return f'Rate safe/fraud: {n_safe/n_fraud}'

def get_keras_dataset(X, cols):
  df = pd.DataFrame(data=X, columns=cols)
  X = {str(col) : np.array(df[col]) for col in df.columns}
  return X

def get_server_data(data, frac=0.2):
  indici = np.arange(0,data.index[-1]+1, dtype=int)
  server_id = np.random.choice(indici, int(len(indici)*frac))
  client_id = [id for id in indici not in server_id]
  return data.iloc[server_id,:], data.iloc[client_id,:]

In [None]:
def server_embedding_model(data):
  from sklearn.metrics import f1_score, recall_score, precision_score
  import tensorflow as tf
  from tensorflow import keras
  from tensorflow.keras import models
  from tensorflow.keras import layers

  X, y = overSampling(data)
  X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2)
  X_train, y_train = overSampling(X_train, y_train)
  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

  keras.backend.clear_session()
  # Embedding for categorical features
  categorical_input = []
  numerical_input = []
  embeddings = []
  embedding_layer_names = []
  for col in categorical_col_toemb:
  # for col in categorical_col:
      _input = layers.Input(shape=[1], name=col)
      _embed = layers.Embedding(fasted[col].max() + 1, 3, name=col+'_emb')(_input)
      categorical_input.append(_input)
      embeddings.append(_embed)
      embedding_layer_names.append(col+'_emb')
      
  # Simple inputs for the numeric features
  for col in numerical_col:
      numeric_input = layers.Input(shape=(1,), name=col)
      numerical_input.append(numeric_input)
      
  # Merge the numeric inputs
  merged_num_inputs = layers.concatenate(numerical_input)

  # Merge embedding and use a Droput to prevent overfittting
  merged_inputs = layers.concatenate(embeddings)
  spatial_dropout = layers.SpatialDropout1D(0.6)(merged_inputs) # 0.2
  flat_embed = layers.Flatten()(spatial_dropout)

  # Merge embedding and numeric features
  all_features = layers.concatenate([flat_embed, merged_num_inputs])

  # MLP for classification
  x = layers.Dense(360, activation=tf.keras.activations.gelu)(all_features) 
  x = layers.BatchNormalization()(x)
  x = layers.Dropout(0.4)(x) #0.05 0.2

  # Final model
  output = layers.Dense(1, activation='sigmoid')(x)
  model = models.Model(inputs=categorical_input + numerical_input, outputs=output)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',tf.keras.metrics.Recall(), tf.keras.metrics.Precision()])
  print(***************************************************************)
  print('Model\'s training')
  history = model.fit(get_keras_dataset(X_train, cols), y_train, epochs=30, 
      batch_size=512, validation_data=(get_keras_dataset(X_val, cols),y_val),
      verbose=1,shuffle = True , class_weight = {0: 0.5, 1: 4} )

  if performance:
    y_pred = model.predict(get_keras_dataset(X_test, cols))
    mf.plot_cm(y_test, y_pred, 'Blues')
    mf.plot_roc("ROC curve", y_test, y_pred,color='blue')
    print(***************************************************************)
    print('Model\'s performances')
    print('F1 score: ',f1_score(y_test, y_predicted, average="binary"))
    print('Recall: ', recall_score(y_test, y_predicted, average='binary'))
    print('Precision: ', precision_score(y_test, y_predicted,  average='binary'))
  return model

In [None]:
server_data, client_data = get_server_data(data, frac=0.2)

In [None]:
server_model = server_embedding_model(server_data)

Estrarre i layer di embedding e trasformare i dati. 

In [None]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(client_data, test_size=0.1)

In [None]:
# X, y = underSampling(train_data)
# rate(y)

In [None]:
X, y = overSampling(train_data)
rate(y)

'Rate safe/fraud: 20.000467963966774'

In [None]:
train_data = mf.mergeResult(X, y, col_name)

In [None]:
print('Dataset size:', train_data.shape[0])

Dataset size: 538515


In [None]:
del client_data, server_data, X, y

# Convert data

Il dataset deve essere convertito in un tensore, con componenti (feature_vector, label). 

In [None]:
def to_tensor(data, categorical_col_toemb, n_clients = 5):
  shuffled = data.sample(frac=1)
  result = np.array_split(shuffled, n_clients)  

  res = []
  label = []

  for dataset in result:
    label.append(dataset['isFraud'])
    res.append(dataset.drop(columns = ['isFraud']))

  dataset = {}
  for i in range(n_clients):
    lista = tuple([res[i][col].to_numpy() for col in res[i].columns]+[label[i]])
    dataset[i] = tf.data.Dataset.from_tensor_slices(lista)

  return dataset

In [None]:
dataset = to_tensor(train_data, categorical_col_toemb)
test_set = to_tensor(test_data, categorical_col_toemb)

In [None]:
dataset

{0: <TensorSliceDataset shapes: ((), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (),

In [None]:
# del train_data, test_data

# Federated data

Si definisce la funzione di preprocessing del dataset, che serve a creare l'OrderedDict, su cui si andranno a creare le batch necessarie per il training del modello. 

In [None]:
NUM_CLIENTS = 5
NUM_EPOCHS = 10
BATCH_SIZE = 500 #250
SHUFFLE_BUFFER = 10
PREFETCH_BUFFER = 10

def preprocess(dataset):
  def batch_format_fn(*args):
    print(args)
    lista = [args[i] for i in range(len(args))]
    o_dict = collections.OrderedDict()
    for i in range(len(lista)-1):
      o_dict[col_name[i]] = tf.cast(lista[i], tf.float32, name = col_name[i])
    return collections.OrderedDict(
        x = o_dict,
        y = tf.cast(args[-1], tf.int32, name = 'isFraud'))
  return dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER).batch(
      BATCH_SIZE).map(batch_format_fn).prefetch(PREFETCH_BUFFER)

preprocessed_example_dataset = preprocess(dataset[0])
# sample_batch = tf.nest.map_structure(lambda x: x.numpy(), next(iter(preprocessed_example_dataset)))
# sample_batch

(<tf.Tensor 'args_0:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_1:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_2:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_3:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_4:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_5:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_6:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_7:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_8:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_9:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_10:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_11:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_12:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_13:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_14:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_15:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_16:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_17:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_18:0' shape=(None,) dtype=float32>, <t

In [None]:
# preprocessed_example_dataset.element_spec

I dati federati sono una lista di dataset divisi per cliente. 

In [None]:
def make_federated_data(dataset):
  federated = []
  for i in dataset:
    federated.append(preprocess(dataset[i]))
  return federated

federated_train_data = make_federated_data(dataset)

print('Number of client datasets: {l}'.format(l=len(federated_train_data)))
print('First dataset: {d}'.format(d=federated_train_data[0]))

(<tf.Tensor 'args_0:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_1:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_2:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_3:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_4:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_5:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_6:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_7:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_8:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_9:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_10:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_11:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_12:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_13:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_14:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_15:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_16:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_17:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_18:0' shape=(None,) dtype=float32>, <t

In [None]:
# del dataset

# Model creation and training

Creazione della rete neurale che sarà trainata. 

Viene anche definita la model function, in cui si specifica il modello, il tipo di input, la loss e le metriche da utilizzare. 

Infine si costruisce il processo di averaging, specificando l'optimizer da usare, cioè SGD, e il learning rate del server e del client. 

In [None]:
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras import layers

def build_multiple_inputs_keras_model():
  categorical_input = []
  numerical_input = []
  embeddings = []
  embedding_layer_names = []
  for col in categorical_col_toemb:
      _input = layers.Input(shape=[1], name=col)
      _embed = layers.Embedding(int(input_size[col]), 3, name=col+'_emb')(_input)
      categorical_input.append(_input)
      embeddings.append(_embed)
      embedding_layer_names.append(col+'_emb')
      
  # Simple inputs for the numeric features
  for col in numerical_col:
      numeric_input = layers.Input(shape=(1,), name=col)
      numerical_input.append(numeric_input)
      
  # Merge the numeric inputs
  merged_num_inputs = layers.concatenate(numerical_input)

  # Merge embedding and use a Droput to prevent overfittting
  merged_inputs = layers.concatenate(embeddings)
  spatial_dropout = layers.SpatialDropout1D(0.6)(merged_inputs) # 0.2
  flat_embed = layers.Flatten()(spatial_dropout)

  # Merge embedding and numeric features
  all_features = layers.concatenate([flat_embed, merged_num_inputs])

  # MLP for classification
  x = layers.Dense(360, activation='relu')(all_features) #tf.keras.activations.gelu
  x = layers.BatchNormalization()(x)
  x = layers.Dropout(0.4)(x) #0.05 0.2
  output = layers.Dense(1, activation='sigmoid')(x)
  model = models.Model(inputs=categorical_input + numerical_input, outputs=output)
  return model

def model_fn():
  soglia = 0.5
  keras_model = build_multiple_inputs_keras_model()
  return tff.learning.from_keras_model(
      keras_model,
      input_spec=preprocessed_example_dataset.element_spec,
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.BinaryAccuracy(), 
               tf.keras.metrics.Recall(thresholds=soglia),
               tf.keras.metrics.Precision(thresholds=soglia)])
  
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn,  
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.05), #0.05
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1)) 

Training del modello. 

Gpu usage: https://colab.research.google.com/notebooks/gpu.ipynb#scrollTo=Y04m-jvKRDsJ

In [None]:
NUM_ROUNDS = 20
state = iterative_process.initialize()
for round_num in range(1, NUM_ROUNDS + 1):
  state, metrics = iterative_process.next(state, federated_train_data)
  print('round {:2d}, metrics={}'.format(round_num, metrics))

round  1, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('binary_accuracy', 0.94913054), ('recall', 0.06314394), ('precision', 0.32480142), ('loss', 0.17123254)]))])
round  2, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('binary_accuracy', 0.95453), ('recall', 0.09227079), ('precision', 0.66301453), ('loss', 0.15266807)]))])
round  3, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('binary_accuracy', 0.9553067), ('recall', 0.115236126), ('precision', 0.6819913), ('loss', 0.14871122)]))])
round  4, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('binary_accuracy', 0.9559455), ('

# Model evaluation

Evaluation del modello sui test data. 

In [None]:
evaluation = tff.learning.build_federated_evaluation(model_fn)
train_metrics = evaluation(state.model, federated_train_data)

In [None]:
federated_test_data = make_federated_data(test_set)

(<tf.Tensor 'args_0:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_1:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_2:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_3:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_4:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_5:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_6:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_7:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_8:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_9:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_10:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_11:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_12:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_13:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_14:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_15:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_16:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_17:0' shape=(None,) dtype=float32>, <tf.Tensor 'args_18:0' shape=(None,) dtype=float32>, <t

In [None]:
test_metrics = evaluation(state.model, federated_test_data)
str(test_metrics)

"OrderedDict([('binary_accuracy', 0.9652067), ('recall', 0.0), ('precision', 0.0), ('loss', 0.15449838)])"

Board di tensorboad, per visualizzare la loss e le metriche in modo interattivo. 

In [None]:
# logdir = "/tmp/logs/scalars/training/"
# summary_writer = tf.summary.create_file_writer(logdir)
# state = iterative_process.initialize()
# with summary_writer.as_default():
#   for round_num in range(1, NUM_ROUNDS):
#     state, metrics = iterative_process.next(state, federated_train_data)
#     for name, value in metrics['train'].items():
#       tf.summary.scalar(name, value, step=round_num)

In [None]:
# !ls {logdir}
# %tensorboard --logdir {logdir} --port=0