# Blood cells cancer using MobileNetV2 (with transfer learning) in a Federated Learning context

## Install and test Tensorflow Federated
---

⚠ **Please restart the runtime after installing `tensorflow-federated`**

---

In [None]:
# !pip install --quiet --upgrade tensorflow-federated

Check that TensorFlow Federated is installed and working


In [None]:
import collections

import numpy as np
import tensorflow as tf
import tensorflow_federated as tff

# Fix NumPy's global seed so NumPy-based randomness is repeatable.
np.random.seed(0)


# Smoke test: a trivial federated computation that only returns a constant.
@tff.federated_computation
def hello_world():
    return 'Hello, World!'


hello_world()

b'Hello, World!'

Load tensorboard

In [None]:
# Load the TensorBoard notebook extension into this kernel.
%load_ext tensorboard

## Download dataset from Kaggle

In [None]:
# Install the Kaggle CLI. %pip (rather than `! pip`) installs into the
# environment of the running kernel.
%pip install -q kaggle

In [None]:
from google.colab import files

# `files.upload()` returns a dict of {filename: file bytes}. Leaving the call as
# the cell's last expression prints that dict, which writes the Kaggle API
# token into the saved notebook output. Bind the result and show names only.
uploaded = files.upload()
print("Uploaded:", ", ".join(uploaded))

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [None]:
# Put the uploaded API token where the Kaggle CLI looks for it.
# -p keeps the cell re-runnable: plain mkdir errors once ~/.kaggle exists.
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
# Restrict the token file to the current user.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI into the working directory.
!kaggle datasets download -d mohammadamireshraghi/blood-cell-cancer-all-4class

Downloading blood-cell-cancer-all-4class.zip to /content
100% 1.68G/1.68G [01:24<00:00, 21.8MB/s]
100% 1.68G/1.68G [01:24<00:00, 21.4MB/s]


In [None]:
# Extract the downloaded archive quietly (-q) into the working directory.
! unzip -q blood-cell-cancer-all-4class.zip

In [None]:
# Rename the extracted folders to short names without spaces or brackets.
# The class folder names chosen here (benign, early_pre_b, pre_b, pro_b) are
# the label strings that later cells read back from the file paths.
# NOTE(review): not re-runnable as-is -- once `dataset` exists, the first `mv`
# moves the source folder *into* it instead of renaming.
! mv "Blood cell Cancer [ALL]" dataset
! mv dataset/"[Malignant] Pre-B" dataset/pre_b
! mv dataset/"[Malignant] Pro-B" dataset/pro_b
! mv dataset/"[Malignant] early Pre-B" dataset/early_pre_b
! mv dataset/"Benign" dataset/benign

## Imports

In [None]:
from imutils import paths
import os
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split

import cv2
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy import ndimage as ndi
from skimage import morphology
import time

import keras,math
from keras.applications.inception_v3 import InceptionV3
from keras.layers import GlobalAveragePooling2D,BatchNormalization
from keras.layers import Dense,Dropout
from keras.models import Model

import tensorflow as tf
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import regularizers

import tensorflow as tf
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn import metrics

## Preparing the dataset

In [None]:
SPLIT_SEED = 88        # seeds both the shuffle and the train/test split
MAX_IMAGES = 1000      # number of images kept after shuffling
TRAIN_FRACTION = 0.90  # share of the kept images used for training

data_dir = 'dataset'
all_image_paths = sorted(paths.list_images(data_dir))

# This seeds Python's *global* `random` generator, so any later call to
# `random.shuffle` in the notebook continues from this state.
random.seed(SPLIT_SEED)
random.shuffle(all_image_paths)

# Work on a fixed-size subset of the shuffled paths.
data_list = all_image_paths[:MAX_IMAGES]

train_list, test_list = train_test_split(
    data_list,
    train_size=TRAIN_FRACTION,
    shuffle=True,
    random_state=SPLIT_SEED,
)

print('number of testing list:',len(test_list))
print('number of training list (for all clients):',len(train_list))

number of testing list: 100
number of training list (for all clients): 900


## Split the original training data into NUM_CLIENTS parts according to DATA_PER_CLIENT_PERCENTAGE

In [None]:
# Fraction of the training list given to each simulated client, in client order.
DATA_PER_CLIENT_PERCENTAGE = [0.5, 0.5]
# One federated client per configured fraction.
NUM_CLIENTS = len(DATA_PER_CLIENT_PERCENTAGE)

def split_list_with_percentages(original_list, percentages):
    """Split a sequence into consecutive chunks sized by the given fractions.

    Each chunk holds ``int(fraction * len(original_list))`` items, taken in
    order. Whatever is left after the last chunk (rounding remainder, or the
    unallocated share when the fractions sum to less than 1) is appended to
    the last chunk, so no item is dropped.

    Args:
        original_list: Any sequence or iterable of items; it is not modified.
        percentages: Iterable of non-negative fractions, one per chunk.

    Returns:
        A list of lists, one per fraction. Empty if ``percentages`` is empty.

    Raises:
        ValueError: If any fraction is negative.
    """
    percentages = list(percentages)
    if any(percentage < 0 for percentage in percentages):
        raise ValueError(f"percentages must be non-negative, got {percentages}")
    if not percentages:
        # No chunk exists to receive the items (and `splits[-1]` would fail).
        return []

    # Work on a private list so the caller's data is untouched and tuples or
    # other iterables are accepted as well.
    remaining = list(original_list)
    total_size = len(remaining)

    splits = []
    for percentage in percentages:
        size = int(percentage * total_size)
        splits.append(remaining[:size])
        remaining = remaining[size:]

    # Hand leftover items to the last chunk.
    splits[-1].extend(remaining)

    return splits

# Partition the training paths between the clients; clients_data[i] is the
# list of image paths assigned to client i.
clients_data = split_list_with_percentages(train_list, DATA_PER_CLIENT_PERCENTAGE)

# Report how many samples each client received, plus its first path.
print("Number of samples for each client:")
for idx, client_data in enumerate(clients_data):
  print(f"Client {idx} | num_training_samples: {len(client_data)} | 1st sample: {client_data[0]}")


Number of samples for each client:
Client 0 | num_training_samples: 450 | 1st sample: dataset/early_pre_b/Snap_206.jpg
Client 1 | num_training_samples: 450 | 1st sample: dataset/pro_b/Snap_093.jpg


In [None]:
def preprocess_train_data_for_client(client_id: int, paths):
  """Resize one client's training images and store them as PNG files.

  Each image is resized to 224x224 and written to
  ``tmp/train/<client_id>/<label>/<label>_<idx>.png``, where ``idx`` is the
  position of the path in ``paths``.

  The client's output directory is emptied first. Otherwise images left over
  from an earlier run (for example with a different split) stay on disk and
  are loaded as training data together with the new ones.

  Args:
    client_id: Index of the client; used as the sub-directory name.
    paths: Iterable of source image paths. The label is the second path
      component, i.e. paths are expected to look like ``<root>/<label>/<file>``.

  Raises:
    ValueError: If a source image cannot be read.
    IOError: If a resized image cannot be written.
  """
  import shutil  # local import: only needed to reset the output directory

  client_root = os.path.join("tmp", "train", f"{client_id}")
  shutil.rmtree(client_root, ignore_errors=True)

  for idx, path in enumerate(paths):
    img = cv2.imread(path)
    if img is None:
      # cv2.imread reports failure by returning None instead of raising.
      raise ValueError(f"Could not read image: {path}")
    img = cv2.resize(img,(224,224))
    label = path.split(os.path.sep)[1]
    client_dir = os.path.join(client_root, label)
    os.makedirs(client_dir, exist_ok=True)
    filename = os.path.join(client_dir, f"{label}_{idx}.png")
    if not cv2.imwrite(filename, img):
      # cv2.imwrite reports failure through its boolean return value.
      raise IOError(f"Could not write image: {filename}")

# Write every client's resized training images under tmp/train/<client_id>/.
for client_id in range(NUM_CLIENTS):
  client_paths = clients_data[client_id]
  preprocess_train_data_for_client(client_id, client_paths)

In [None]:
def preprocess_test_data(paths):
  """Resize the held-out images and store them as PNG files.

  Each image is resized to 224x224 and written to
  ``tmp/test/<label>/<label>_<idx>.png``, where ``idx`` is the position of the
  path in ``paths``.

  ``tmp/test`` is emptied first so that images left over from an earlier run
  are not mixed with the current test set.

  Args:
    paths: Iterable of source image paths. The label is the second path
      component, i.e. paths are expected to look like ``<root>/<label>/<file>``.

  Raises:
    ValueError: If a source image cannot be read.
    IOError: If a resized image cannot be written.
  """
  import shutil  # local import: only needed to reset the output directory

  test_root = os.path.join("tmp", "test")
  shutil.rmtree(test_root, ignore_errors=True)

  for idx, path in enumerate(paths):
    image = cv2.imread(path)
    if image is None:
      # cv2.imread reports failure by returning None instead of raising.
      raise ValueError(f"Could not read image: {path}")
    image = cv2.resize(image,(224,224))
    label = path.split(os.path.sep)[1]
    label_dir = os.path.join(test_root, f"{label}")
    os.makedirs(label_dir, exist_ok=True)
    out_path = os.path.join(label_dir, f"{label}_{str(idx)}.png")
    if not cv2.imwrite(out_path, image):
      # cv2.imwrite reports failure through its boolean return value.
      raise IOError(f"Could not write image: {out_path}")

# Write the resized held-out images under tmp/test/<label>/.
preprocess_test_data(test_list)

## Create dataframe with file path and label (for test/train)

In [None]:
# Class-folder name -> integer class id.
_LABEL_TO_INT = {"benign" : 0, "early_pre_b": 1, "pre_b": 2, "pro_b": 3}

def convert_label_to_int(label: str) -> int:
  """Return the integer class id for a class-folder name.

  Args:
    label: One of "benign", "early_pre_b", "pre_b", "pro_b".

  Returns:
    The class id (0-3).

  Raises:
    KeyError: If ``label`` is not a known class name.
  """
  try:
    return _LABEL_TO_INT[label]
  except KeyError:
    # Same exception type as a plain lookup, but the message lists valid names.
    raise KeyError(
        f"Unknown label {label!r}; expected one of {sorted(_LABEL_TO_INT)}") from None

def load_data(directory_path):
    """Build a shuffled DataFrame of image paths and integer labels.

    Image files are collected below ``directory_path``, sorted, then shuffled
    with Python's global ``random`` generator. The label of each file is the
    name of its parent directory, mapped through ``convert_label_to_int``.

    Args:
        directory_path: Root directory to search for images.

    Returns:
        A DataFrame with columns ``filenames`` and ``labels``.
    """
    filenames = sorted(paths.list_images(directory_path))
    random.shuffle(filenames)

    labels = []
    for filename in filenames:
        class_folder = os.path.basename(os.path.dirname(filename))
        labels.append(convert_label_to_int(class_folder))

    return pd.DataFrame({'filenames': filenames, 'labels': labels})

In [None]:
# One DataFrame (columns: filenames, labels) per client, indexed by client id.
clients_dfs = [load_data(f"tmp/train/{client_id}") for client_id in range(NUM_CLIENTS)]

# Preview the first rows of client 0's frame.
clients_dfs[0].head(30)

Unnamed: 0,filenames,labels
0,tmp/train/0/benign/benign_17.png,0
1,tmp/train/0/early_pre_b/early_pre_b_192.png,1
2,tmp/train/0/early_pre_b/early_pre_b_303.png,1
3,tmp/train/0/pro_b/pro_b_32.png,3
4,tmp/train/0/benign/benign_161.png,0
5,tmp/train/0/pre_b/pre_b_120.png,2
6,tmp/train/0/early_pre_b/early_pre_b_326.png,1
7,tmp/train/0/pre_b/pre_b_233.png,2
8,tmp/train/0/early_pre_b/early_pre_b_225.png,1
9,tmp/train/0/pro_b/pro_b_419.png,3


In [None]:
def load_and_preprocess_image(image_path, label):
    """Read a PNG file, decode it to 3 channels and divide pixel values by 255.

    NOTE(review): the model in this notebook uses MobileNetV2 with ImageNet
    weights; Keras documents `mobilenet_v2.preprocess_input` as scaling inputs
    to [-1, 1] -- confirm that dividing by 255 here is intended.

    Args:
        image_path: Path of the PNG file to load.
        label: Label belonging to the image; passed through unchanged.

    Returns:
        Tuple ``(image, label)`` with the decoded, rescaled image.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.image.decode_png(raw_bytes, channels=3)
    return pixels / 255, label

def convert_dataframe_into_dataset(df):
  """Turn a (filenames, labels) DataFrame into a dataset of (image, label) pairs.

  Args:
    df: DataFrame with columns ``filenames`` and ``labels``.

  Returns:
    A ``tf.data.Dataset`` whose elements come from ``load_and_preprocess_image``.
  """
  path_label_columns = (df["filenames"], df["labels"])
  file_dataset = tf.data.Dataset.from_tensor_slices(path_label_columns)
  return file_dataset.map(load_and_preprocess_image)


In [None]:
# One (image, label) dataset per client, in the same order as clients_dfs.
clients_datasets = [convert_dataframe_into_dataset(client_df) for client_df in clients_dfs]

In [None]:
NUM_EPOCHS = 5        # how many times each client dataset is repeated
BATCH_SIZE = 20       # examples per batch
SHUFFLE_BUFFER = 100  # buffer size passed to Dataset.shuffle
PREFETCH_BUFFER = 10  # number of batches to prefetch

def preprocess(dataset):
  """Repeat, shuffle and batch a client dataset, naming the fields ``x``/``y``.

  Args:
    dataset: Dataset yielding ``(image, label)`` pairs.

  Returns:
    The dataset after repeat -> shuffle (fixed seed 1) -> batch -> map ->
    prefetch; each element is an ``OrderedDict`` with keys ``x`` and ``y``.
  """

  def batch_format_fn(element1, element2):
    """Wrap a batch of images and labels in an `OrderedDict` (`x`, `y`)."""
    return collections.OrderedDict([('x', element1), ('y', element2)])

  repeated = dataset.repeat(NUM_EPOCHS)
  shuffled = repeated.shuffle(SHUFFLE_BUFFER, seed=1)
  batched = shuffled.batch(BATCH_SIZE)
  return batched.map(batch_format_fn).prefetch(PREFETCH_BUFFER)

In [None]:
# Batched view of client 0's data; model_fn reads its element_spec as the
# model's input spec.
preprocessed_example_dataset = preprocess(clients_datasets[0])

# Pull one batch and convert its tensors to NumPy arrays for inspection.
sample_batch = tf.nest.map_structure(lambda x: x.numpy(),
                                     next(iter(preprocessed_example_dataset)))

In [None]:
def make_federated_data(clients_datasets, sample_clients):
  """Return the preprocessed dataset of every selected client.

  Args:
    clients_datasets: Indexable collection of per-client datasets.
    sample_clients: Iterable of indices into ``clients_datasets``.

  Returns:
    A list with one ``preprocess``-ed dataset per entry of ``sample_clients``,
    in the same order.
  """
  federated_data = []
  for client_index in sample_clients:
    federated_data.append(preprocess(clients_datasets[client_index]))
  return federated_data

In [None]:
# Select every client (indices 0 .. NUM_CLIENTS-1).
sample_clients = range(NUM_CLIENTS)

# One preprocessed dataset per selected client.
federated_train_data = make_federated_data(clients_datasets, sample_clients)

print(f'Number of client datasets: {len(federated_train_data)}')
print(f'First dataset: {federated_train_data[0]}')

Number of client datasets: 2
First dataset: <_PrefetchDataset element_spec=OrderedDict([('x', TensorSpec(shape=(None, None, None, 3), dtype=tf.float32, name=None)), ('y', TensorSpec(shape=(None,), dtype=tf.int64, name=None))])>


In [None]:
def create_keras_model():
  """Build the 4-class classifier on top of a frozen MobileNetV2 backbone.

  The backbone is MobileNetV2 without its top, initialised with ImageNet
  weights, for 224x224x3 inputs, and marked non-trainable. The head is
  global average pooling -> batch normalisation -> Dense(4, softmax).

  All layers and the model class come from ``tf.keras`` so the head is built
  with the same Keras implementation as the backbone, rather than mixing in
  the standalone ``keras`` package.

  Returns:
    An uncompiled ``tf.keras.Model``.
  """
  base_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
      include_top=False, weights='imagenet', input_shape=(224, 224, 3))
  # Freeze the backbone; only the head added below stays trainable.
  base_model.trainable = False

  x = base_model.output
  x = tf.keras.layers.GlobalAveragePooling2D()(x)
  x = tf.keras.layers.BatchNormalization()(x)
  predictions = tf.keras.layers.Dense(4, activation="softmax")(x)
  return tf.keras.Model(inputs=base_model.input, outputs=predictions)

In [None]:
def model_fn():
  """Wrap a newly built Keras model as a TFF learning model.

  Returns:
    The result of ``tff.learning.models.from_keras_model`` for a fresh
    ``create_keras_model()``, using the element spec of
    ``preprocessed_example_dataset`` as input spec, sparse categorical
    cross-entropy as loss and sparse categorical accuracy as metric.
  """
  # The Keras model has to be constructed inside this function on every call
  # and must not be captured from an outer scope: TFF calls this function
  # within different graph contexts.
  return tff.learning.models.from_keras_model(
      create_keras_model(),
      input_spec=preprocessed_example_dataset.element_spec,
      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
      metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [None]:
# Build the weighted Federated Averaging process from model_fn.
# Clients use SGD with learning rate 0.02; the server uses SGD with learning rate 1.0.
training_process = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.02),
    server_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=1.0))



In [None]:
# Show the type signature of the process's `initialize` computation.
print(training_process.initialize.type_signature.formatted_representation())

( -> <
  global_model_weights=<
    trainable=<
      float32[1280],
      float32[1280],
      float32[1280,4],
      float32[4]
    >,
    non_trainable=<
      float32[3,3,3,32],
      float32[32],
      float32[32],
      float32[32],
      float32[32],
      float32[3,3,32,1],
      float32[32],
      float32[32],
      float32[32],
      float32[32],
      float32[1,1,32,16],
      float32[16],
      float32[16],
      float32[16],
      float32[16],
      float32[1,1,16,96],
      float32[96],
      float32[96],
      float32[96],
      float32[96],
      float32[3,3,96,1],
      float32[96],
      float32[96],
      float32[96],
      float32[96],
      float32[1,1,96,24],
      float32[24],
      float32[24],
      float32[24],
      float32[24],
      float32[1,1,24,144],
      float32[144],
      float32[144],
      float32[144],
      float32[144],
      float32[3,3,144,1],
      float32[144],
      float32[144],
      float32[144],
      float32[144],
      float32[1,1,144

In [None]:
# Create the initial state of the training process.
train_state = training_process.initialize()

In [None]:
# Run one round over all client datasets, then keep the updated state and the
# metrics reported for that round.
result = training_process.next(train_state, federated_train_data)
train_state = result.state
train_metrics = result.metrics
print('round  1, metrics={}'.format(train_metrics))

round  1, metrics=OrderedDict([('distributor', ()), ('client_work', OrderedDict([('train', OrderedDict([('sparse_categorical_accuracy', 0.2883721), ('loss', 1.3822216), ('num_examples', 5160), ('num_batches', 259)]))])), ('aggregator', OrderedDict([('mean_value', ()), ('mean_weight', ())])), ('finalizer', OrderedDict([('update_non_finite', 0)]))])
