## Setup

In [None]:
import strategies
import load_data
import baselines

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import warnings
import pickle
import random
from copy import deepcopy
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")

## Config

In [None]:
# Data Features
# Dataset identifier; the "Samples per Client" experiment also uses it to
# choose the list of per-client size caps.
dataset_name = "MNIST"

# Data Distribution
# These values are copied into default_config and handed to load_data; their
# exact semantics are defined there (not visible in this notebook).
attribute_skew = None #"noise"
label_or_quantity_skew = "label_distribution"  # stored under the key "label_skew"
label_alpha = 2  # the "Data Imbalance" experiment overrides this entry in default_config
label_n = None
attribute_alpha = None
num_clients = 100
purity = None

# Analysis Parameters
num_quantiles = 4

# Learning Parameters
num_local_epochs = 3  # stored under the key "local_epochs"
local_multiplier = 0
num_rounds = 200  # stored under the key "rounds"

# Learning Strategy
strategy_name = "FedAvg"
model_name = "auto"
bert_pretrained_model = None
stepsize = 1.2
weighted = True
reset_per_round = False

# Static Parameters
batch_size= 64
device = "cuda"
test_set_fraction = 0.2
shared_set_fraction = 0

# Logging
# log_file is passed as `filename=` to both the strategy run and the baseline
# run; log_per_round is passed to the strategy run only.
log_per_round = False
log_file = None
averaging = "weighted"  # stored under the key "evaluation_averaging"

In [None]:
# Single configuration dictionary assembled from the notebook-level settings.
# It is the object passed to load_data, strategies and baselines below.
default_config = {
    # Strategy, model and dataset selection
    "strategy_name": strategy_name,
    "model_name": model_name,
    "bert_pretrained_model": bert_pretrained_model,
    "dataset_name": dataset_name,

    # Federation size and batching
    "num_clients": num_clients,
    "batch_size": batch_size,

    # Aggregation behaviour
    "weighted": weighted,
    "reset_per_round": reset_per_round,

    # Training loop
    "device": device,
    "stepsize": stepsize,
    "rounds": num_rounds,
    "local_epochs": num_local_epochs,
    "local_multiplier": local_multiplier,

    # Data skew
    "attribute_skew": attribute_skew,
    "label_skew": label_or_quantity_skew,
    "label_alpha": label_alpha,
    "label_n": label_n,
    "attribute_alpha": attribute_alpha,
    "purity": purity,

    # Analysis
    "num_quantiles": num_quantiles,

    # Data splits
    "test_set_fraction": test_set_fraction,
    "shared_set_fraction": shared_set_fraction,

    # Evaluation
    "evaluation_averaging": averaging,
}

## Experiments

In [None]:
def _resample(sampler, features, labels):
    """Run ``sampler.fit_resample`` on flattened samples and restore their shape.

    Each sample is flattened to a vector for the sampler; the resampled
    features are reshaped back so that every axis except the first matches
    ``features``.
    """
    X, y = sampler.fit_resample(features.reshape(features.shape[0], -1), labels)
    return X.reshape((-1, *features.shape[1:])), y


def _class_targets(sampling_strategy, counter_dict, cnts, num_classes, clamp):
    """Build the ``sampling_strategy`` argument for a sampler.

    ``"auto"`` is passed through unchanged, as is any value when there are at
    most two classes.  Otherwise ``sampling_strategy`` is treated as a ratio
    of the largest class and converted to explicit per-class sample counts
    ``clamp(current_count, int(largest_count * ratio))`` for the class labels
    ``0 .. num_classes - 1``.  ``clamp=min`` never grows a class, ``clamp=max``
    never shrinks one.
    """
    if sampling_strategy == "auto" or num_classes <= 2:
        return sampling_strategy
    num_samples = int(max(cnts) * sampling_strategy)
    return {x: clamp(counter_dict[x], num_samples) for x in range(num_classes)}


def _already_balanced(sampling_strategy, cnts):
    """Return True if a numeric ratio is already met by the class counts."""
    return sampling_strategy != "auto" and sampling_strategy <= min(cnts) / max(cnts)


def apply_sampling(mode, features, labels, X_test, y_test, num_classes, mean_size, sampling_strategy="auto"):
    """Rebalance one client's data set according to ``mode``.

    Parameters
    ----------
    mode : str or None
        ``None``            -- no filtering and no resampling.
        ``"constrained"``   -- only apply the exclusion rule below.
        ``"undersampling"`` -- ``RandomUnderSampler``.
        ``"oversampling"``  -- ``SMOTE``.
        ``"hybrid"``        -- ``SMOTETomek``.
        ``"dynamic"``       -- undersample if the client holds more than
                               ``mean_size`` samples, otherwise ``SMOTE``.
        ``"optimized"``     -- try all three samplers, train a random forest
                               on each result and keep the sampler whose
                               forest scores best on ``X_test``/``y_test``.
        Any other value behaves like ``"constrained"`` but is tagged
        ``"none"``.
    features, labels : array-like
        The client's samples and their class labels (same length).
    X_test, y_test : array-like
        Evaluation data; only read in ``"optimized"`` mode.
    num_classes : int
        Number of classes every client is expected to hold.
    mean_size : number
        Size threshold used by ``"dynamic"`` mode.
    sampling_strategy : "auto" or number
        Forwarded to the samplers.  For the three plain resampling modes a
        number is interpreted as a target ratio relative to the largest
        class; if the data already satisfies it the data is returned as is.

    Returns
    -------
    (features, labels, tag)
        ``features``/``labels`` are numpy arrays, or ``(None, None,
        "excluded")`` when ``mode`` is not ``None`` and the client lacks a
        class or has fewer than 6 samples of some class.  ``tag`` names what
        was actually applied: ``"none"``, ``"constrained"``,
        ``"undersampling"``, ``"oversampling"`` or ``"hybridsampling"``.
    """
    features = np.array(features)
    labels = np.array(labels)

    if mode is None:
        return features, labels, "none"

    counter_dict = dict(Counter(labels))
    cnts = list(counter_dict.values())
    # 6 is presumably SMOTE's default k_neighbors (5) plus the sample itself
    # -- TODO confirm against the imblearn version in use.
    if len(counter_dict) < num_classes or min(cnts) < 6:
        return None, None, "excluded"

    if mode == "constrained":
        return features, labels, "constrained"

    if mode in ("undersampling", "oversampling", "hybrid"):
        tag = "hybridsampling" if mode == "hybrid" else mode
        # Nothing to do when the requested ratio is already satisfied; check
        # this before building a sampler that would not be used.
        if _already_balanced(sampling_strategy, cnts):
            return features, labels, tag
        clamp = min if mode == "undersampling" else max
        targets = _class_targets(sampling_strategy, counter_dict, cnts, num_classes, clamp)
        if mode == "undersampling":
            sampler = RandomUnderSampler(sampling_strategy=targets)
        elif mode == "oversampling":
            sampler = SMOTE(sampling_strategy=targets)
        else:
            sampler = SMOTETomek(sampling_strategy=targets)
        X, y = _resample(sampler, features, labels)
        return X, y, tag

    if mode == "dynamic":
        if len(labels) > mean_size:
            sampler, tag = RandomUnderSampler(sampling_strategy=sampling_strategy), "undersampling"
        else:
            sampler, tag = SMOTE(sampling_strategy=sampling_strategy), "oversampling"
        X, y = _resample(sampler, features, labels)
        return X, y, tag

    if mode == "optimized":
        # Only this mode needs the evaluation data, so flatten it here rather
        # than on every call.
        X_test = np.array(X_test).reshape(len(X_test), -1)
        best = None
        for sampler, tag in (
            (RandomUnderSampler(sampling_strategy=sampling_strategy), "undersampling"),
            (SMOTE(sampling_strategy=sampling_strategy), "oversampling"),
            (SMOTETomek(sampling_strategy=sampling_strategy), "hybridsampling"),
        ):
            X, y = _resample(sampler, features, labels)
            clf = RandomForestClassifier(n_estimators=150, max_depth=10, random_state=0)
            clf.fit(X.reshape(X.shape[0], -1), y)
            acc = accuracy_score(y_test, clf.predict(X_test))
            # Strict ">" keeps the earlier candidate on ties, i.e. the
            # preference order is under-, over-, then hybrid sampling.
            if best is None or acc > best[0]:
                best = (acc, X, y, tag)
        # Return the data produced by the winning sampler itself (the hybrid
        # result used to be replaced by the oversampled arrays).
        return best[1], best[2], best[3]

    # Unknown mode: leave the data untouched.
    return features, labels, "none"

### Samples per Client

In [None]:
# Experiment: vary the maximum number of samples each client may hold and,
# for every cap, train and evaluate once per sampling mode.
all_central_metrics = []

samplings = [None, "constrained", "oversampling", "undersampling", "dynamic", "hybrid", "optimized"]
if default_config["dataset_name"] == "hand":
    maxs = [650, 600, 500, 400, 300, 200, 100]
elif default_config["dataset_name"] == "diabetes_insulin":
    maxs = [1000, 750, 500, 350, 200, 150, 100]
elif default_config["dataset_name"] == "MNIST":
    maxs = [480, 450, 400, 300, 200, 150, 100]
else:
    maxs = [4000, 3000, 2000, 1000, 500, 200, 100]

default_config = load_data.load_raw_data(default_config)
default_config = load_data.distribute_skewed_data(default_config)

# The loop variable is deliberately not called `max`: at notebook scope that
# would replace the builtin for every later cell and for apply_sampling.
for max_size in maxs:
    config = deepcopy(default_config)

    # Randomly keep at most `max_size` samples per client; features and
    # labels are subsampled with the same indices so they stay aligned.
    for key in config["clients_feature_dict"]:
        client_size = len(config["clients_feature_dict"][key])
        indices = random.sample(range(client_size), min(max_size, client_size))
        config["clients_feature_dict"][key] = np.array(config["clients_feature_dict"][key])[indices].tolist()
        config["clients_label_dict"][key] = np.array(config["clients_label_dict"][key])[indices].tolist()
    print("Max client size", max_size)

    # Mean client size after capping; used as the "dynamic" mode threshold.
    m = statistics.mean([len(x) for x in config["clients_label_dict"].values()])

    for sampling in samplings:
        print("Sampling:", sampling)
        local_config = deepcopy(config)

        feature_dict = local_config["clients_feature_dict"]
        label_dict = local_config["clients_label_dict"]
        excluded = []
        modes = []
        for key in feature_dict:
            # NOTE(review): the training set is passed in apply_sampling's
            # X_test / y_test positions -- confirm this is intended.
            sampled_features, sampled_labels, mode = apply_sampling(
                sampling,
                feature_dict[key],
                label_dict[key],
                local_config["X_train"],
                local_config["y_train"],
                local_config["num_classes"],
                m,
            )
            modes.append(mode)
            feature_dict[key] = sampled_features
            label_dict[key] = sampled_labels
            if sampled_features is None:
                excluded.append(key)
        print("Modes", Counter(modes))

        # Drop excluded clients while keeping the keys contiguous: walk the
        # excluded keys from highest to lowest and move the current last
        # client into each freed slot.  Assumes integer keys 0..n-1.
        for i in sorted(excluded, reverse=True):
            max_idx = len(feature_dict) - 1
            feature_dict[i] = feature_dict[max_idx]
            del feature_dict[max_idx]
            label_dict[i] = label_dict[max_idx]
            del label_dict[max_idx]
        local_config["num_clients"] = local_config["num_clients"] - len(excluded)

        learning_strategy = strategies.get_strategy_by_name(local_config)

        federated_model, federated_f1s = learning_strategy.run(
            local_config, filename=log_file, log_per_round=log_per_round, return_f1s=True
        )
        _, _, _, central_metrics = baselines.run_local_baselines(
            local_config, central_model=federated_model, filename=log_file
        )

        all_central_metrics.append(central_metrics)

### Cohort Size

In [None]:
# Experiment: shrink the cohort by removing randomly chosen clients and, for
# every cohort size, train and evaluate once per sampling mode.
all_central_metrics = []

samplings = [None, "constrained", "oversampling", "undersampling", "dynamic", "hybrid", "optimized"]
removes = [10, 20, 30, 40, 50, 60, 70, 80, 90, 91, 92, 93, 94, 95]

default_config = load_data.load_raw_data(default_config)
default_config = load_data.distribute_skewed_data(default_config)

for remove in removes:
    config = deepcopy(default_config)

    # Client indices to drop, highest first, so that moving the last client
    # into a freed slot never touches a slot that is still to be removed.
    remove_idx = sorted(random.sample(range(config["num_clients"]), remove), reverse=True)

    for i in remove_idx:
        max_idx = len(config["clients_feature_dict"]) - 1
        for dict_key in ("clients_feature_dict", "clients_label_dict"):
            config[dict_key][i] = config[dict_key][max_idx]
            del config[dict_key][max_idx]
    config["num_clients"] -= remove

    print("Cohort size", config["num_clients"])

    # Mean client size of the reduced cohort ("dynamic" mode threshold).
    m = statistics.mean(map(len, config["clients_label_dict"].values()))

    for sampling in samplings:
        print("Sampling:", sampling)
        local_config = deepcopy(config)

        feature_dict = local_config["clients_feature_dict"]
        label_dict = local_config["clients_label_dict"]
        excluded = []
        modes = []
        for key in feature_dict:
            feature_dict[key], label_dict[key], mode = apply_sampling(
                sampling,
                feature_dict[key],
                label_dict[key],
                local_config["X_train"],
                local_config["y_train"],
                local_config["num_classes"],
                m,
            )
            modes.append(mode)
            if feature_dict[key] is None:
                excluded.append(key)
        print("Modes", Counter(modes))

        # Remove excluded clients back to front, filling each gap with the
        # current last client so the keys stay contiguous.
        excluded = excluded[::-1]
        for i in excluded:
            max_idx = len(feature_dict) - 1
            for client_dict in (feature_dict, label_dict):
                client_dict[i] = client_dict[max_idx]
                del client_dict[max_idx]
        local_config["num_clients"] -= len(excluded)

        learning_strategy = strategies.get_strategy_by_name(local_config)

        federated_model, federated_f1s = learning_strategy.run(
            local_config, filename=log_file, log_per_round=log_per_round, return_f1s=True
        )
        _, _, _, central_metrics = baselines.run_local_baselines(
            local_config, central_model=federated_model, filename=log_file
        )

        all_central_metrics.append(central_metrics)

### Data Imbalance

In [None]:
# Experiment: redistribute the data for a range of label_alpha values and,
# for every value, train and evaluate once per sampling mode.
all_central_metrics = []

samplings = [None, "constrained", "oversampling", "undersampling", "dynamic", "hybrid", "optimized"]

alphas = [100, 50, 20, 10, 5, 2, 1, 0.5, 0.2, 0.1]

for alpha in alphas:
    default_config["label_alpha"] = alpha
    print("Alpha:", alpha)

    # Reload and redistribute the data with the new alpha.
    config = load_data.load_raw_data(default_config)
    config = load_data.distribute_skewed_data(config)

    # Mean client size for this distribution ("dynamic" mode threshold).
    m = statistics.mean(map(len, config["clients_label_dict"].values()))

    for sampling in samplings:
        print("Sampling:", sampling)
        local_config = deepcopy(config)

        feature_dict = local_config["clients_feature_dict"]
        label_dict = local_config["clients_label_dict"]
        excluded = []
        modes = []
        for key in feature_dict:
            feature_dict[key], label_dict[key], mode = apply_sampling(
                sampling,
                feature_dict[key],
                label_dict[key],
                local_config["X_train"],
                local_config["y_train"],
                local_config["num_classes"],
                m,
            )
            modes.append(mode)
            if feature_dict[key] is None:
                excluded.append(key)
        print("Modes", Counter(modes))

        # Remove excluded clients back to front, filling each gap with the
        # current last client so the keys stay contiguous.
        excluded = excluded[::-1]
        for i in excluded:
            max_idx = len(feature_dict) - 1
            for client_dict in (feature_dict, label_dict):
                client_dict[i] = client_dict[max_idx]
                del client_dict[max_idx]
        local_config["num_clients"] -= len(excluded)

        learning_strategy = strategies.get_strategy_by_name(local_config)

        federated_model, federated_f1s = learning_strategy.run(
            local_config, filename=log_file, log_per_round=log_per_round, return_f1s=True
        )
        _, _, _, central_metrics = baselines.run_local_baselines(
            local_config, central_model=federated_model, filename=log_file
        )

        all_central_metrics.append(central_metrics)