In [1]:
# Reference: https://www.kaggle.com/sharp1/malaria-cells-classification-through-keras

In [2]:
# Byzantine Protection: MultiKrum
# Privacy Protection: Gradient Pruning
# Aggregation Algorithm: FedAvg

In [3]:
# !pip3 install tensorflow_model_optimization

In [141]:
from PIL import Image
import numpy as np
import os
import cv2
import keras
import tempfile
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D,MaxPooling2D,Dense,Flatten,Dropout, GaussianNoise
import pandas as pd
import sys
%matplotlib inline
from scipy.spatial.distance import euclidean as euc
import matplotlib.pyplot as plt
import random
import plotly.express as px
import numpy
import tensorflow as tf
import requests
import random
import tensorflow_model_optimization as tfmot

prune_low_magnitude = tfmot.sparsity.keras.prune_low_magnitude

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [142]:
# Load the sample array (C) and the matching label array (L) from .npy files
# in the notebook's working directory. Later cells treat C as image data and L as class labels.
C=np.load("Cells.npy")
L=np.load("Labels.npy")

In [143]:
# To Shuffle our dataset
def unison_shuffled_copies(a, b):
    """Return ``a`` and ``b`` reordered by one shared random permutation.

    Both inputs must have the same length and support integer-array indexing
    (e.g. numpy arrays). Element ``i`` of the first result still corresponds to
    element ``i`` of the second result. The inputs themselves are not modified.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(a) == len(b)
    order = numpy.random.permutation(len(a))
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

# Shuffle samples and labels together so the slices taken in the next cell are random.
# Note: re-running this cell reshuffles the already-shuffled arrays (no seed is set).
C, L = unison_shuffled_copies(C, L)

In [144]:
# Hold out the first 10% of the shuffled samples for testing; train on the rest.
holdout_count = int(0.1 * len(C))
Cells, Labels = C[holdout_count:], L[holdout_count:]
Cells_Test, Labels_Test = C[:holdout_count], L[:holdout_count]

In [145]:
## SETTINGS

# Blockchain
NUM_Clients = 5 # number of clients contributing per training round

# ML
Cluster_Size = 100 # max client dataset size for training
Batch_Size = 10
NUM_Epoch = 3
num_classes = 2

# Krum
# Fraction (0.0 - 1.0) of client updates discarded per round as presumed byzantine.
# merge() drops int(number_of_clients * krum_f) updates, so this is NOT a 0-100 percentage.
krum_f = 0.00

# Differential Privacy
Gaussian_Noise = False
Gaussian_Noise_Std_Dev = 0.20

Gradient_Clipping = False
Clip_Norm = 0.60

Gradient_Pruning = False
initial_sparsity = 0.00
final_sparsity = 0.45

# Backdoor experiment (disabled by default)
# merge() reads poisonedLocals on every aggregation round, so the name must exist
# even while the backdoor experiment is switched off. Empty list = no poisoned clients.
poisonedLocals = []

# backdoor = True
# poisonedLocals = [1, 2, 3, 4, 5]
# if backdoor == True:
#     backdoorAcc = []
#     backdoorLoss = []

In [146]:
# if backdoor == True:
#     print('Reading Backdoor Testing Data')
#     BTestParasitizedCells, BTestParasitizedLabels = readData('./input/backdoor/Parasitized/', 1)
#     BTestUninfectedCells, BTestUninfectedLabels  = readData('./input/backdoor/Uninfected/', 0)
    
#     BTestCells =np.concatenate((BTestParasitizedCells, BTestUninfectedCells))
#     BTestLabels = np.concatenate((BTestParasitizedLabels, BTestUninfectedLabels))
    
#     BTestCells, BTestLabels = unison_shuffled_copies(BTestCells, BTestLabels)
    
#     len_BTestData=len(BTestCells)
    
#     (BTestCells)= BTestCells[:(int)(0.1*len_BTestData)]
#     (BTestLabels)=BTestLabels[:(int)(0.1*len_BTestData)]
    
#     # As we are working on image data we are normalizing data by divinding 255.
#     BTestCells = BTestCells.astype('float32')/255
#     #Doing One hot encoding as classifier has multiple classes
#     BTestLabels=keras.utils.to_categorical(BTestLabels,num_classes)

In [147]:
def train(name, Cells, Labels, globalId, poisoned = False):
    """Train one model on a data shard and save it to ``./weights/<name>.h5``.

    Parameters
    ----------
    name : str
        Basename (without extension) of the HDF5 file the trained model is saved to.
    Cells : numpy.ndarray
        Input samples. They are scaled by 1/255 and fed to a network whose
        input shape is (50, 50, 3).
    Labels : numpy.ndarray
        Integer class labels aligned with ``Cells``; values must be 0 or 1
        because the network has a 2-unit softmax output.
    globalId : int
        Index of the current global model. Unless it equals 1, training starts
        from the weights stored in ``./weights/global<globalId>.h5``.
    poisoned : bool, optional
        Currently unused: the backdoor-poisoning code that consumed it is disabled.

    Returns
    -------
    tuple
        ``(len_data, model)`` - the number of samples trained on and the fitted model.

    Notes
    -----
    Reads the notebook-level settings ``Gaussian_Noise``, ``Gradient_Pruning``,
    ``Gradient_Clipping`` (and their parameters), ``Batch_Size`` and ``NUM_Epoch``.
    """
    # Width of the softmax layer. The one-hot targets must use the same width even
    # when a shard happens to contain a single class, so it is NOT derived from the data.
    n_outputs = 2

    # Shuffle samples and labels with one shared permutation.
    order = np.arange(Cells.shape[0])
    np.random.shuffle(order)
    Cells = Cells[order]
    Labels = Labels[order]

    len_data = len(Cells)
    print(len_data, ' Data Points')

    # Since we're working on image data, we normalize by dividing by 255.
    x_train = Cells.astype('float32')/255

#     if poisoned:
#         x_train[:50] = addBackdoorPattern(x_train[:50])
#         y_train[:50] = [1 for i in range(50)]
#         y_train[50:] = [0 for i in range(50)]

    # One-hot encode the labels to match the softmax output.
    y_train = keras.utils.to_categorical(Labels, n_outputs)

    # Build the sequential CNN.
    model = Sequential()
    model.add(Conv2D(filters=16,kernel_size=2,padding="same",activation="relu",input_shape=(50,50,3)))

    if Gaussian_Noise:
        model.add(GaussianNoise(Gaussian_Noise_Std_Dev))

    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=32,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=64,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(500,activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(n_outputs,activation="softmax"))

    # Extra fit() callbacks; only the pruning wrapper needs any.
    callbacks = []
    if Gradient_Pruning:
        # The sparsity schedule ends on the last optimizer step of this training run.
        end_step = np.ceil(len(x_train) / Batch_Size).astype(np.int32) * NUM_Epoch

        pruning_params = {
              'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(initial_sparsity=initial_sparsity,
                                                                       final_sparsity=final_sparsity,
                                                                       begin_step=0,
                                                                       end_step=end_step)
        }

        # NOTE: a new temporary log directory is created per call and never removed.
        logdir = tempfile.mkdtemp()

        callbacks = [
          tfmot.sparsity.keras.UpdatePruningStep(),
          tfmot.sparsity.keras.PruningSummaries(log_dir=logdir),
        ]

        model = prune_low_magnitude(model, **pruning_params)

    # Every model except the very first global one starts from the current global weights.
    if globalId != 1:
        model.load_weights("./weights/global"+str(globalId)+".h5")

    # Adam optimizer; with gradient clipping enabled the gradient norm is capped at Clip_Norm.
    if Gradient_Clipping:
        optimizer = keras.optimizers.Adam(clipnorm=Clip_Norm)
    else:
        optimizer = 'adam'
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Fit with the configured batch size and number of local epochs.
    model.fit(x_train, y_train, batch_size=Batch_Size, epochs=NUM_Epoch, verbose=1, callbacks=callbacks)

    # Save the full model; create the output directory on a fresh checkout.
    os.makedirs("./weights", exist_ok=True)
    model.save("./weights/"+str(name)+".h5")
    return len_data, model

In [148]:
def getDataLen(trainingDict):
    """Return the sum of all values in ``trainingDict`` and print it.

    ``trainingDict`` maps a model name to the number of data points that model
    was trained on; an empty dict yields 0.
    """
    total = sum(trainingDict[model_name] for model_name in trainingDict)
    print('Total number of data points after this round: ', total)
    return total

def assignWeights(trainingDf, trainingDict):
    """Add a 'Weightage' column holding each model's share of the total data.

    The share is ``DataSize / total`` where ``total`` is the sum of the values
    in ``trainingDict``. ``trainingDf`` is modified in place and also returned.

    Returns
    -------
    tuple
        ``(trainingDf, total)``.
    """
    total = getDataLen(trainingDict)

    def share_of_total(size):
        return size/total

    trainingDf['Weightage'] = trainingDf['DataSize'].apply(share_of_total)
    return trainingDf, total
    
def scale(weight, scaler):
    """Return a new list with every element of ``weight`` multiplied by ``scaler``.

    ``weight`` is an indexable sequence (e.g. a list of per-layer arrays); it is
    not modified.
    """
    return [scaler * weight[idx] for idx in range(len(weight))]

def getWeight(d):
    """Return the weights stored in ``./weights/<d>.h5``.

    A fresh copy of the notebook's CNN is built (with the optional Gaussian-noise
    layer and pruning wrapper, depending on the notebook settings), the file's
    weights are loaded into it, and ``model.get_weights()`` is returned.
    """
    # Layers are instantiated in network order.
    layers = [
        Conv2D(filters=16,kernel_size=2,padding="same",activation="relu",input_shape=(50,50,3)),
    ]
    if Gaussian_Noise == True:
        layers.append(GaussianNoise(Gaussian_Noise_Std_Dev))
    layers.append(MaxPooling2D(pool_size=2))
    layers.append(Conv2D(filters=32,kernel_size=2,padding="same",activation="relu"))
    layers.append(MaxPooling2D(pool_size=2))
    layers.append(Conv2D(filters=64,kernel_size=2,padding="same",activation="relu"))
    layers.append(MaxPooling2D(pool_size=2))
    layers.append(Dropout(0.2))
    layers.append(Flatten())
    layers.append(Dense(500,activation="relu"))
    layers.append(Dropout(0.2))
    layers.append(Dense(2,activation="softmax"))  # two output neurons, one per class

    network = Sequential()
    for layer in layers:
        network.add(layer)

    # Saved models carry the pruning wrapper when pruning is on, so mirror it here.
    if Gradient_Pruning == True:
        network = prune_low_magnitude(network)

    network.load_weights("./weights/"+d+".h5")
    return network.get_weights()

def getScaledWeight(d, scaler):
    """Return the weights stored in ``./weights/<d>.h5``, each multiplied by ``scaler``.

    Parameters
    ----------
    d : str
        Model name; the weights are read from ``./weights/<d>.h5``.
    scaler : float
        Factor applied to every per-layer weight array.

    Returns
    -------
    list
        One scaled weight array per entry of the model's ``get_weights()`` list.
    """
    # Delegate loading to getWeight() instead of rebuilding the network here, so the
    # architecture is defined in one place and the two functions cannot drift apart.
    return scale(getWeight(d), scaler)

def avgWeights(scaledWeights):
    """Sum the given weight lists position by position.

    ``scaledWeights`` is a list of per-model weight lists. For every position the
    corresponding arrays of all models are summed with ``tf.math.reduce_sum``.
    The result is a weighted average only because the caller has already scaled
    each model's weights by its share; no division happens here.
    """
    return [
        tf.math.reduce_sum(same_layer_weights, axis=0)
        for same_layer_weights in zip(*scaledWeights)
    ]

def FedAvg(trainingDict):
    """Aggregate the saved models named in ``trainingDict`` into one weight list.

    ``trainingDict`` maps model name -> number of training samples. Each model's
    weights are loaded from disk, scaled by its share of the total sample count,
    and the scaled weights are summed per layer.

    Returns
    -------
    tuple
        ``(aggregated_weights, total_samples)``.
    """
    trainingDf = pd.DataFrame.from_dict(trainingDict, orient='index', columns=['DataSize'])
    trainingDf, dataLen = assignWeights(trainingDf, trainingDict)

    contributions = [
        getScaledWeight(model_name, trainingDf.loc[model_name]['Weightage'])
        for model_name in list(trainingDict.keys())
    ]
    return avgWeights(contributions), dataLen


def saveModel(weight, n):
    """Evaluate aggregated weights on the held-out test set and save them as global model ``n``.

    Parameters
    ----------
    weight : list
        Per-layer weights in the order expected by ``model.set_weights``.
    n : int or str
        Global model index; the model is written to ``./weights/global<n>.h5``.

    Returns
    -------
    tuple
        ``(loss, accuracy)`` as reported by ``model.evaluate`` on
        ``Cells_Test`` / ``Labels_Test``.
    """
    # Width of the softmax layer; the one-hot targets must match it regardless of
    # which classes happen to be present in the test labels.
    n_outputs = 2

    # The test set is not shuffled: the order of samples only matters for training.
    # Since we're working on image data, we normalize by dividing by 255.
    x_test = np.asarray(Cells_Test).astype('float32')/255
    y_test = keras.utils.to_categorical(np.asarray(Labels_Test), n_outputs)

    # Build the sequential CNN (no Gaussian-noise layer here, as before).
    model = Sequential()
    model.add(Conv2D(filters=16,kernel_size=2,padding="same",activation="relu",input_shape=(50,50,3)))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=32,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Conv2D(filters=64,kernel_size=2,padding="same",activation="relu"))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(500,activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(n_outputs,activation="softmax"))

    # Client models carry the pruning wrapper when pruning is on, so mirror it here.
    if Gradient_Pruning:
        model = prune_low_magnitude(model)

    model.set_weights(weight)

    # Compile so that evaluate() can report loss and accuracy.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    scores = model.evaluate(x_test, y_test)
    print("Loss: ", scores[0])        #Loss
    print("Accuracy: ", scores[1])    #Accuracy

#     if backdoor == True:
#         Bscores = model.evaluate(BTestCells, BTestLabels)
#         print("Backdoor Loss: ", Bscores[0])        #Loss
#         print("Backdoor Accuracy: ", Bscores[1])    #Accuracy
#         backdoorLoss.append(Bscores[0])
#         backdoorAcc.append(Bscores[1])

    # Save the full model; create the output directory on a fresh checkout.
    os.makedirs("./weights", exist_ok=True)
    fpath = "./weights/global"+str(n)+".h5"
    model.save(fpath)
    return scores[0], scores[1]

def euclidean(m, n):
    """Return the mean per-layer Euclidean distance between two weight lists.

    Parameters
    ----------
    m, n : sequence of numpy.ndarray
        Weight arrays of two models; ``m[i]`` and ``n[i]`` must hold the same
        number of elements. Only the first ``len(m)`` entries of ``n`` are used.

    Returns
    -------
    float
        Sum of the per-layer distances divided by ``len(m)``.

    Raises
    ------
    ZeroDivisionError
        If ``m`` is empty.
    """
    # SciPy's euclidean() is defined for 1-D vectors, so each array is flattened
    # to 1-D (not to an (N, 1) column, which newer SciPy versions may reject).
    distances = [
        euc(m[i].reshape(-1), n[i].reshape(-1))
        for i in range(len(m))
    ]
    return sum(distances)/len(m)

def merge(trainingDict, b):
    """Filter out the most deviating client models, then aggregate the rest with FedAvg.

    Every model named in ``trainingDict`` is loaded from disk. Each local model is
    scored by its mean per-layer Euclidean distance to the global model, and the
    ``int(number_of_locals * b)`` locals farthest from the global model are rejected.
    The global model itself never takes part in the aggregation.

    Parameters
    ----------
    trainingDict : dict
        Maps model name -> number of training samples. Names containing 'global'
        identify the reference model; all other names are treated as local models.
    b : float
        Fraction of local models to reject.

    Returns
    -------
    tuple
        ``(aggregated_weights, total_samples)`` as returned by ``FedAvg``.

    Raises
    ------
    KeyError
        If there are local models but no global model to compare them against.

    Notes
    -----
    If the notebook defines ``poisonedLocals`` (a list of integer local ids used by
    the backdoor experiment), locals with those ids are kept even when they would be
    rejected. When the name is not defined, no local is treated as poisoned.
    """
    local_weights = {}
    global_weight = None
    for model_name in trainingDict:
        print(model_name)
        if 'global' in model_name:
            global_weight = getWeight(model_name)
        else:
            local_weights[model_name] = getWeight(model_name)

    if local_weights and global_weight is None:
        raise KeyError('weight')

    scores = {
        model_name: euclidean(weights, global_weight)
        for model_name, weights in local_weights.items()
    }

    # Closest first; the rejected models are taken from the far end of the ranking.
    ranked = sorted(scores, key=scores.get)
    reject_count = min(int(len(ranked)*b), len(ranked))
    selected = ranked[len(ranked) - reject_count:]

    # The backdoor settings are optional, so do not fail when they are absent.
    poisoned_ids = globals().get('poisonedLocals', ())

    def is_poisoned(model_name):
        # 'local7' -> 7; names without a purely numeric suffix are never poisoned.
        suffix = model_name.replace('local', '')
        return suffix.isdigit() and int(suffix) in poisoned_ids

    newDict = {}
    for model_name in trainingDict.keys():
        if 'global' in model_name:
            continue
        if model_name not in selected or is_poisoned(model_name):
            newDict[model_name] = trainingDict[model_name]

    print('Selections: ', newDict)
    NewGlobal, dataLen = FedAvg(newDict)
    return NewGlobal, dataLen

# def addBackdoorPattern(imgs):
#     for img in imgs:
#         randInt = random.randint(0, 49)
#         img[randInt][0][0] = 1.
#         img[randInt][0][1] = 0.
#         img[randInt][0][2] = 1.
#         img[randInt][1][0] = -10.
#         img[randInt][1][1] = 1.
#         img[randInt][1][2] = -10.
#         img[randInt][2][0] = -10.
#         img[randInt][2][1] = -10.
#         img[randInt][2][2] = 0.
#         img[randInt][2][0] = 1.
#         img[randInt][2][1] = 0.
#         img[randInt][2][2] = 1.
#     return imgs

In [149]:
Cells_Test.shape

(2755, 50, 50, 3)

In [152]:
import time
def BEAS():
    """Run the federated training simulation over the training split and plot accuracy.

    The training arrays ``Cells`` / ``Labels`` are walked in consecutive chunks of
    ``Cluster_Size`` samples. Each loop iteration does exactly one of:

    * first iteration: train the initial model 'global1' on the first chunk;
    * aggregation: once ``NUM_Clients`` more local ids have been used, merge the
      collected models into the next global model and evaluate/save it;
    * otherwise: train a local model on the chunk, starting from the current global.

    After the loop the settings and the list of per-round test accuracies are
    printed and the accuracies are plotted. Nothing is returned.

    NOTE(review): the aggregation branch does not call train(), so the chunk of
    that iteration is not used for training.
    NOTE(review): local models trained after the last aggregation are never merged,
    because merging only happens inside the loop.
    """
    start_time = time.time()
    curr_local = 0       # next local model id; also drives the aggregation trigger
    curr_global = 0      # index of the latest global model (0 = none yet)
    local = {}           # model name -> data size, for the current round
    loss_array = []      # test loss per aggregation round (collected, not displayed)
    acc_array = []       # test accuracy per aggregation round
    for i in range(0, len(Cells), Cluster_Size):
        print("--- %s seconds ---" % (time.time() - start_time))
        if int(curr_global) == 0:
            # Bootstrap: globalId 1 makes train() start from fresh weights.
            curr_global += 1
            name = 'global' + str(curr_global)
            # train() returns (data size, model)
            l, m = train(name, Cells[i:i+Cluster_Size], Labels[i:i+Cluster_Size], curr_global)
            local[name] = l
        elif (curr_local != 0) and (int(curr_local)%NUM_Clients == 0):
            curr_global += 1
            print('Current Global: ', curr_global)
            name = 'global' + str(curr_global)
            # merge() returns (weights, total data size) - note the order is the
            # reverse of train()'s, so here m is the weight list and l the size.
            m, l = merge(local, krum_f)
            loss, acc = saveModel(m, curr_global)
            loss_array.append(loss)
            acc_array.append(acc)
            # Bump the counter without training so the next iteration does not
            # hit this branch again; as a result one local id is skipped per round.
            curr_local += 1
            # Start the next round with only the new global model on record.
            local = {}
            local[name] = l
        else:
            p = False
            print('Current Local: ', curr_local)
    #         if backdoor:
    #             if curr_local in poisonedLocals:
    #                 p = True
    #                 print('Training Poisoned Local')
            name = str('local'+str(curr_local))
            curr_local += 1
            l, m = train(name, Cells[i:i+Cluster_Size], Labels[i:i+Cluster_Size], curr_global, poisoned = p)
            local[name] = l
    # Summary of the settings used for this run.
    print('----------------------------------------')
    print('Number of Clients: ', NUM_Clients)
    print('Cluster Size: ', Cluster_Size)
    print('Batch Size: ', Batch_Size)
    print('Number of Local Epochs: ', NUM_Epoch)
    print('F: ', krum_f)
    print('Gaussian_Noise: ', Gaussian_Noise)
    if Gaussian_Noise: 
        print('Noise Std Dev: ', Gaussian_Noise_Std_Dev)
    print('Gradient_Clipping: ', Gradient_Clipping)
    if Gradient_Clipping: 
        print('Clip Norm: ', Clip_Norm)
    print('Gradient_Pruning: ', Gradient_Pruning)
    if Gradient_Pruning: 
        print('Pruning Sparcity: ', final_sparsity)
    print('----------------------------------------')
    # Test accuracy after each aggregation round, printed and plotted with plotly.
    print(acc_array)
    fig = px.line(y=acc_array)
    fig.show()

In [153]:
# Run the whole simulation. Models are written under ./weights/ and the
# per-round test accuracies are printed and plotted at the end.
BEAS()

--- 5.245208740234375e-06 seconds ---
100  Data Points
Epoch 1/3
Epoch 2/3
Epoch 3/3
--- 2.187479019165039 seconds ---
Current Local:  0
100  Data Points
Epoch 1/3
Epoch 2/3
Epoch 3/3
--- 4.268304109573364 seconds ---
Current Local:  1
100  Data Points
Epoch 1/3
Epoch 2/3
Epoch 3/3
--- 6.059256076812744 seconds ---
Current Local:  2
100  Data Points
Epoch 1/3
Epoch 2/3
Epoch 3/3
--- 8.097369194030762 seconds ---
Current Local:  3
100  Data Points
Epoch 1/3
Epoch 2/3
Epoch 3/3
--- 10.105907201766968 seconds ---
Current Local:  4
100  Data Points
Epoch 1/3
Epoch 2/3
Epoch 3/3
--- 11.83807921409607 seconds ---
Current Global:  2
global1
local0
local1
local2
local3
local4
Selections:  {'local0': 100, 'local1': 100, 'local2': 100, 'local3': 100, 'local4': 100}
Total number of data points after this round:  500
Loss:  0.6931252479553223
Accuracy:  0.5012704133987427
--- 17.955313205718994 seconds ---
Current Local:  6
100  Data Points
Epoch 1/3
Epoch 2/3
Epoch 3/3
--- 19.861887216567993 seco

KeyboardInterrupt: 