<a href="https://colab.research.google.com/github/alessandrotofani/Tesi_magistrale/blob/master/6_Federated_MLP_512.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Overview: https://www.tensorflow.org/federated

Image classification tutorial: https://www.tensorflow.org/federated/tutorials/federated_learning_for_image_classification

# Installation

In [1]:
# TensorFlow and TensorFlow Federated are pinned to specific versions.
!pip install --quiet tensorflow==2.3.0
!pip install --quiet tensorflow_federated==0.17.0
# nest_asyncio is not pinned: the latest available release is installed.
!pip install --quiet --upgrade nest_asyncio

In [2]:
import nest_asyncio
# Patch asyncio to allow nested event loops.
# NOTE(review): presumably needed so TFF can run inside the notebook's already-running loop - confirm.
nest_asyncio.apply()
# Load the TensorBoard notebook extension (provides the %tensorboard magic).
%load_ext tensorboard

In [3]:
import collections
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import pandas as pd 
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import sys 
# Add the project folder on Drive to the import path so the local helper module `mf` can be imported.
sys.path.append('/content/drive/MyDrive/Tesi_magistrale/Tesi_magistrale')
import mf

import warnings
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Import data

I dati vengono importati e poi splittati in train e test. 

In [5]:
# Load the dataset exported to Drive.
data = pd.read_csv('/content/drive/MyDrive/Tesi_magistrale/Dataset/IEEE/Output/data.csv')
# Discard every column whose name contains "unnamed" (case-insensitive).
is_unnamed = data.columns.str.contains('unnamed', case=False)
data = data.drop(columns=data.columns[is_unnamed])

In [6]:
# Project-specific feature engineering (implemented in the local `mf` module).
data = mf.feature_engineering(data)
# Global scaling is intentionally disabled here: scaling is applied later, inside to_tensor.
# data = mf.feature_scaling(data)
# One-hot encode the remaining categorical columns.
data = pd.get_dummies(data)

In [7]:
# data = data[200000:500000]
# Column names from the project helper; they are later used to label the
# resampled feature matrix, with 'isFraud' appended.
# NOTE(review): presumably the feature columns without 'isFraud' - confirm in mf.get_col.
col_name = mf.get_col(data)

In [8]:
# mf.ratio is reported as the safe/fraud ratio elsewhere in this notebook, so
# its reciprocal is the fraud/safe rate; the label is corrected to match the value.
print('Rate fraud/safe:', (1/mf.ratio(data)).round(3))

Rate safe/fraud: 0.036


In [9]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows as the test set. The split is seeded (same seed as
# the resamplers defined below) so the reported metrics can be reproduced on a
# fresh run instead of depending on a random partition.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

Smote: https://imbalanced-learn.org/stable/generated/imblearn.over_sampling.SMOTE.html

RandomUnderSampler: https://imbalanced-learn.org/stable/generated/imblearn.under_sampling.RandomUnderSampler.html

In [10]:
def underSampling(data, frac_under=0.1):
  """Randomly under-sample `data` with imbalanced-learn's RandomUnderSampler.

  Args:
    data: DataFrame with an 'isFraud' label column; every other column is
      treated as a feature.
    frac_under: forwarded to RandomUnderSampler as `sampling_strategy`
      (per the imbalanced-learn docs, a float is the desired
      minority/majority ratio after resampling).

  Returns:
    Tuple (X_us, y_us): resampled features and labels as returned by
    `fit_resample`.
  """
  from imblearn.under_sampling import RandomUnderSampler
  labels = data['isFraud']
  features = data.drop(columns = ['isFraud'])
  # Fixed seed: the same rows are selected on every run.
  sampler = RandomUnderSampler(sampling_strategy=frac_under, random_state=42)
  X_us, y_us = sampler.fit_resample(features, labels)
  return X_us, y_us

def overSampling(X, y, frac_over=0.2):
  """Over-sample (X, y) with imbalanced-learn's SMOTE.

  Args:
    X: feature matrix.
    y: labels aligned with the rows of X.
    frac_over: forwarded to SMOTE as `sampling_strategy` (per the
      imbalanced-learn docs, a float is the desired minority/majority ratio
      after resampling).

  Returns:
    Tuple (X_sm, y_sm): resampled features and labels as returned by
    `fit_resample`.
  """
  from imblearn.over_sampling import SMOTE
  # Fixed seed: the synthetic samples are the same on every run.
  smote = SMOTE(sampling_strategy=frac_over, random_state=42)
  resampled = smote.fit_resample(X, y)
  X_sm, y_sm = resampled
  return X_sm, y_sm

def mergeResult(X, y, col_name):
  """Recombine resampled features and labels into a single DataFrame.

  Args:
    X: 2-D feature matrix (array-like) with one row per sample.
    y: 1-D label vector (array-like) aligned with the rows of X.
    col_name: names of the feature columns, in the column order of X.
      The sequence is not modified.

  Returns:
    DataFrame whose columns are `col_name` followed by 'isFraud'.
  """
  features = np.asarray(X)
  # Reshape the labels into a column vector. np.asarray keeps the actual
  # values for any dtype, whereas constructing an ndarray over y's raw buffer
  # would reinterpret integer labels as float64 bit patterns.
  labels = np.asarray(y).reshape(-1, 1)
  merged = np.concatenate((features, labels), axis = 1)
  # Build a new list instead of appending to the caller's one, so calling this
  # function again with the same `col_name` keeps producing a valid frame.
  columns = list(col_name) + ['isFraud']
  return pd.DataFrame(data=merged, columns=columns)

def rate(y):
  """Return a message with the number of safe samples per fraud sample.

  Args:
    y: label array; entries equal to 1 are counted as fraud, all others as safe.

  Returns:
    The string 'Rate safe/fraud: <ratio>'.
  """
  total = np.shape(y)[0]
  frauds = np.count_nonzero(y == 1)
  ratio = (total - frauds) / frauds
  return f'Rate safe/fraud: {ratio}'

In [11]:
# Under-sample the training split (default frac_under=0.1) and report the resulting class ratio.
X, y = underSampling(train_data)
rate(y)

'Rate safe/fraud: 10.0'

In [12]:
# Apply SMOTE over-sampling (default frac_over=0.2) to the under-sampled data and report the class ratio.
X, y = overSampling(X, y)
rate(y)

'Rate safe/fraud: 5.0'

In [13]:
# Rebuild a single DataFrame (feature columns plus 'isFraud') from the resampled arrays.
train_data = mergeResult(X, y, col_name)

In [14]:
# Sanity check on the resampled training set: class ratio and number of rows.
print('Rate safe/fraud:', (mf.ratio(train_data)).round(3))
print('Dataset size:', train_data.shape[0])

Rate safe/fraud: 5.0
Dataset size: 223224


In [15]:
# Drop references to the full dataset and the resampled arrays, which are no longer used.
del data, X, y

# Convert data

Il dataset deve essere convertito in un `tf.data.Dataset`, i cui elementi sono coppie (feature_vector, label). 

In [16]:
def to_tensor(data, n_clients = 1):
  """Shuffle `data`, split it among clients and wrap each share in a tf.data.Dataset.

  Args:
    data: DataFrame holding the feature columns and an 'isFraud' label column.
    n_clients: number of partitions to create (one per client).

  Returns:
    dict mapping the client index (0 .. n_clients-1) to a tf.data.Dataset
    built from (feature matrix, label) slices of that client's partition.
  """
  # Random (unseeded) row permutation, then a split into n_clients chunks.
  partitions = np.array_split(data.sample(frac=1), n_clients)

  # The project's scaling helper is applied to each partition independently.
  scaled_parts = [mf.feature_scaling(part) for part in partitions]

  labels = [part['isFraud'] for part in scaled_parts]
  features = [part.drop(columns = ['isFraud']).to_numpy() for part in scaled_parts]

  return {
      client: tf.data.Dataset.from_tensor_slices((features[client], labels[client]))
      for client in range(n_clients)
  }

In [17]:
# Both calls use to_tensor's default n_clients=1, so each result is a dict with a single entry (key 0).
dataset = to_tensor(train_data)
test_set = to_tensor(test_data)

In [18]:
# The DataFrames are no longer needed once the tf.data.Dataset objects exist.
del train_data, test_data

# Federated data

Si definisce la funzione di preprocessing del dataset: i dati vengono ripetuti, mescolati e raggruppati in batch, e ogni batch viene convertita in un OrderedDict con chiavi `x` (feature) e `y` (label), nel formato richiesto per il training del modello. 

In [19]:
# NOTE(review): NUM_CLIENTS is not referenced in the visible cells; to_tensor is called with its default n_clients.
NUM_CLIENTS = 1
# Number of times each client dataset is repeated by preprocess().
NUM_EPOCHS = 10
# Number of examples per batch.
BATCH_SIZE = 500
# Buffer size passed to Dataset.shuffle().
SHUFFLE_BUFFER = 10
# Buffer size passed to Dataset.prefetch().
PREFETCH_BUFFER = 10

def preprocess(dataset):
  """Turn a (features, label) dataset into batched OrderedDict elements.

  The dataset is repeated NUM_EPOCHS times, shuffled with a buffer of
  SHUFFLE_BUFFER elements, batched in groups of BATCH_SIZE, mapped to
  OrderedDict(x=float32 features, y=int32 labels) and prefetched.

  Args:
    dataset: tf.data.Dataset yielding (features, label) pairs.

  Returns:
    The transformed tf.data.Dataset.
  """
  def to_xy(features, labels):
    # Key order matters for an OrderedDict: 'x' first, then 'y'.
    return collections.OrderedDict([
        ('x', tf.cast(features, tf.float32)),
        ('y', tf.cast(labels, tf.int32)),
    ])

  repeated = dataset.repeat(NUM_EPOCHS)
  shuffled = repeated.shuffle(SHUFFLE_BUFFER)
  batched = shuffled.batch(BATCH_SIZE)
  return batched.map(to_xy).prefetch(PREFETCH_BUFFER)

# Preprocessed dataset of client 0; its element_spec is used as the model's input_spec in model_fn.
preprocessed_example_dataset = preprocess(dataset[0])
# Optional inspection of one batch (disabled):
# sample_batch = tf.nest.map_structure(lambda x: x.numpy(), next(iter(preprocessed_example_dataset)))
# sample_batch

I dati federati sono una lista di dataset divisi per cliente. 

In [20]:
def make_federated_data(dataset):
  """Preprocess every client dataset and collect the results in a list.

  Args:
    dataset: mapping from client key to that client's tf.data.Dataset.

  Returns:
    List with one preprocessed dataset per client, in the mapping's
    iteration order.
  """
  return [preprocess(dataset[client_id]) for client_id in dataset]

# One preprocessed dataset per client, used as input for every training round.
federated_train_data = make_federated_data(dataset)

# Quick check of how many client datasets were built and of the element structure of the first one.
print('Number of client datasets: {l}'.format(l=len(federated_train_data)))
print('First dataset: {d}'.format(d=federated_train_data[0]))

Number of client datasets: 1
First dataset: <PrefetchDataset shapes: OrderedDict([(x, (None, 1103)), (y, (None,))]), types: OrderedDict([(x, tf.float32), (y, tf.int32)])>


In [21]:
# The raw per-client datasets are no longer needed; the preprocessed versions are kept.
del dataset

# Model creation and training

Creazione della rete neurale che sarà trainata. 

Viene anche definita la model function, in cui si specifica il modello, il tipo di input, la loss e le metriche da utilizzare. 

Infine si costruisce il processo di averaging, specificando l'optimizer da usare, cioè SGD, e il learning rate del server e del client. 

In [43]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, Input

def create_keras_model(n_features=1103, hidden_units=512, dropout_rate=0.7):
  """Build the (uncompiled) MLP used as the federated model.

  Architecture: input -> Dense(hidden_units, relu) -> Dropout(dropout_rate)
  -> Dense(1, sigmoid).

  Args:
    n_features: length of the input feature vector. It must match the `x`
      component of the federated datasets (1103 for the current data).
    hidden_units: number of units in the hidden layer.
    dropout_rate: dropout rate applied after the hidden layer.

  Returns:
    A Keras Sequential model with a single sigmoid output.
  """
  model = Sequential()
  model.add(Input(shape=(n_features,)))
  model.add(Dense(hidden_units, activation='relu'))
  model.add(Dropout(dropout_rate))
  model.add(Dense(1, activation='sigmoid'))
  return model

def model_fn():
  """Wrap a fresh Keras model as a TFF learning model.

  A new Keras model is created on every call. The input spec is taken from
  `preprocessed_example_dataset`, the loss is binary cross-entropy, and the
  metrics are binary accuracy (default threshold) plus recall and precision
  evaluated at a decision threshold of 0.4.

  Returns:
    The object returned by `tff.learning.from_keras_model`.
  """
  threshold = 0.4
  model = create_keras_model()
  loss = tf.keras.losses.BinaryCrossentropy()
  metrics = [
      tf.keras.metrics.BinaryAccuracy(),
      tf.keras.metrics.Recall(thresholds=threshold),
      tf.keras.metrics.Precision(thresholds=threshold),
  ]
  return tff.learning.from_keras_model(
      model,
      input_spec=preprocessed_example_dataset.element_spec,
      loss=loss,
      metrics=metrics)
  
# Federated Averaging process built from model_fn. Both the client and the
# server optimizer are SGD, with learning rates 0.2 and 1 respectively.
# NOTE(review): the trailing "#0.4" / "#0.8" look like previously tried learning rates - confirm.
iterative_process = tff.learning.build_federated_averaging_process(
    model_fn,  
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.2), #0.4
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1)) #0.8

Training del modello. 

Gpu usage: https://colab.research.google.com/notebooks/gpu.ipynb#scrollTo=Y04m-jvKRDsJ

In [44]:
NUM_ROUNDS =81
# with tf.device('/device:GPU:0'):
# Start from the initial server state, then run rounds 1 .. NUM_ROUNDS - 1
# (80 rounds in total), printing the metrics reported after each round.
state = iterative_process.initialize()
for round_num in range(1, NUM_ROUNDS):
  state, metrics = iterative_process.next(state, federated_train_data)
  print('round {:2d}, metrics={}'.format(round_num, metrics))

round  1, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('binary_accuracy', 0.85600287), ('recall', 0.39479357), ('precision', 0.5911908), ('loss', 0.35478997)]))])
round  2, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('binary_accuracy', 0.8726726), ('recall', 0.48692882), ('precision', 0.6339658), ('loss', 0.32769817)]))])
round  3, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('binary_accuracy', 0.87906164), ('recall', 0.5169928), ('precision', 0.6523407), ('loss', 0.31666806)]))])
round  4, metrics=OrderedDict([('broadcast', ()), ('aggregation', OrderedDict([('value_sum_process', ()), ('weight_sum_process', ())])), ('train', OrderedDict([('binary_accuracy', 0.88287634), ('

# Model evaluation

Evaluation del modello sui test data. 

In [45]:
# Federated evaluation computation built from the same model definition used for training.
evaluation = tff.learning.build_federated_evaluation(model_fn)
# Metrics of the trained model on the training data (stored but not displayed in this cell).
train_metrics = evaluation(state.model, federated_train_data)

In [46]:
# The test set goes through the same preprocessing pipeline as the training data.
federated_test_data = make_federated_data(test_set)

In [47]:
# Evaluate the trained model on the held-out test data and display the metrics as a string.
test_metrics = evaluation(state.model, federated_test_data)
str(test_metrics)

"OrderedDict([('binary_accuracy', 0.9666966), ('recall', 0.51941746), ('precision', 0.419279), ('loss', 0.12685366)])"

Dashboard di TensorBoard, per visualizzare la loss e le metriche in modo interattivo. 

In [48]:
# logdir = "/tmp/logs/scalars/training/"
# summary_writer = tf.summary.create_file_writer(logdir)
# state = iterative_process.initialize()
# with summary_writer.as_default():
#   for round_num in range(1, NUM_ROUNDS):
#     state, metrics = iterative_process.next(state, federated_train_data)
#     for name, value in metrics['train'].items():
#       tf.summary.scalar(name, value, step=round_num)

In [49]:
# !ls {logdir}
# %tensorboard --logdir {logdir} --port=0