# Demonstration for Region North

### Setup and imports

In [None]:
import strategies
import load_data
import baselines

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import datetime
import warnings

import shap

from os import listdir
from os.path import isfile, join
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")
# Apply seaborn's "darkgrid" theme to the plots created later on.
sns.set_style("darkgrid")

In [25]:
# Experiment settings. Most of these values are collected into the `config`
# dict in the following cell.

# Learning parameters
num_local_epochs = 5
num_rounds = 200

# Learning Strategy
strategy_name = "FedAvg"
model_name = "auto"
stepsize = 1.2
weighted = True
reset_per_round = False

# Static Parameters
batch_size= 64
# NOTE(review): hard-coded; presumably requires a CUDA-capable GPU — confirm.
device = "cuda"
# Used as the `frac` argument when splitting off the test set.
test_set_fraction = 0.2

# Analysis Parameters
average_lxo = 2
reputation_ts = 3

# Logging
# NOTE(review): `log_per_round` is not read by any cell visible here; the
# run()/measure_contribution() calls pass literal values instead.
log_per_round = True
# Passed as `filename=` to the project helpers.
log_file = None
# Stored in the config under the key "evaluation_averaging".
averaging = "weighted"

In [26]:
# Bundle the notebook-level settings into the single mapping that is handed
# to the project helpers (load_data, strategies, baselines).
config = dict(
    # strategy / model selection
    strategy_name=strategy_name,
    model_name=model_name,
    batch_size=batch_size,
    # aggregation behaviour
    weighted=weighted,
    reset_per_round=reset_per_round,
    # training schedule and analysis settings
    device=device,
    stepsize=stepsize,
    rounds=num_rounds,
    local_epochs=num_local_epochs,
    average_lxo=average_lxo,
    reputation_ts=reputation_ts,
    # data split
    test_set_fraction=test_set_fraction,
    # evaluation
    evaluation_averaging=averaging,
)

In [20]:
# Region this notebook run is about. It is used as the hold-out region for
# the train/test split, as part of the filename prefix when loading the
# prediction CSVs, and as the key into the exclusion table further down.
REGION = "North"

## Contribution Prediction

### Acquiring the training data

In [None]:
# Build one training table for the contribution regressor:
#   1. load the FL setting with REGION as hold-out region,
#   2. measure the data imbalance of every training client,
#   3. train the federated model and evaluate it,
#   4. measure per-client contribution via leave-out clusters,
#   5. write contributions + imbalances to a timestamped CSV.

# load the FL setting with REGION as hold-out region
config = load_data.load_churn_dataset(config, column="state")
config = load_data.split_train_test(
    config, region=REGION, frac=config["test_set_fraction"]
)

# measure data imbalance among clients
(
    global_label_imbalance,
    local_label_imbalances,
    global_quantity_imbalance,
    local_quantity_imbalances,
    (global_cs_median, global_cs_stdev),
    local_label_distribution_imbalances,
    global_feature_imbalance,
    local_feature_imbalances,
) = load_data.measure_imbalance(config, filename=log_file, log=False)

num_train_clients = config["num_train_clients"]

# One [label, quantity, label-distribution, feature] imbalance row per client.
imbalances = [
    [
        local_label_imbalances[x],
        local_quantity_imbalances[x],
        local_label_distribution_imbalances[x],
        local_feature_imbalances[x],
    ]
    for x in range(num_train_clients)
]

# Per-client (local) imbalance columns.
LDI = [local_label_distribution_imbalances[x] for x in range(num_train_clients)]
LQI = [local_quantity_imbalances[x] for x in range(num_train_clients)]
# NOTE(review): unlike the other columns this one relies on the iteration
# order of the mapping rather than on indexing 0..num_train_clients-1 —
# confirm that both orders agree.
LLI = list(local_label_imbalances.values())
LFI = [local_feature_imbalances[x] for x in range(num_train_clients)]

# Global imbalance values, repeated so every client row carries them.
GDI = [global_cs_median] * len(LLI)
GQI = [global_quantity_imbalance] * len(LLI)
GLI = [global_label_imbalance] * len(LLI)
GFI = [global_feature_imbalance] * len(LLI)

# Perform training
learning_strategy = strategies.get_strategy_by_name(config)
federated_model, federated_f1s = learning_strategy.run(
    config, filename=log_file, log_per_round=False, return_f1s=True
)

# Performance measurement
# NOTE(review): the function name is reconstructed as `baselines.evaluate`
# (the call previously lacked its opening parenthesis) — confirm against the
# baselines module.
acc, pre, rec, f1, _ = baselines.evaluate(
    federated_model,
    config["X_test"],
    config["y_test"],
    config,
    filename=log_file,
    log=False,
)

# measure client contribution
(
    leave_out_performances,
    influences,
    reputations,
    cluster_imbalances,
    leave_out_clusters,
    _,
) = baselines.measure_contribution(
    federated_model,
    federated_f1s,
    config,
    imbalances,
    filename=log_file,
    log_per_round=True,
)

contributions = []
length = config["num_train_clients"]
# Only the last `length` leave-out results are used.
correct_cluster_imbalances = leave_out_performances[-length:]
# A client's contribution is the score drop observed when its cluster is left
# out, shared equally among the cluster's members.
# Index 3 is presumably the F1 entry, mirroring (acc, pre, rec, f1, _) above —
# TODO confirm.
# Assumes every client index appears in exactly one leave-out cluster;
# otherwise `contributions` does not have one entry per client and building
# the DataFrame below fails.
for i in range(length):
    for j, cluster in enumerate(leave_out_clusters):
        if i in cluster:
            contributions.append(
                (f1 - correct_cluster_imbalances[j][3]) / len(cluster)
            )

# save to csv-file for subsequent training
d = {
    "Contribution": contributions,
    "Global Label Imbalance": GLI,
    "Global Distribution Imbalance": GDI,
    "Global Quantity Imbalance": GQI,
    "Global Feature Imbalance": GFI,
    "Local Label Imbalance": LLI,
    "Local Distribution Imbalance": LDI,
    "Local Quantity Imbalance": LQI,
    "Local Feature Imbalance": LFI,
}

if "regions" in config:
    d["Regions"] = config["regions"]
    d["States"] = config["states"]

df = pd.DataFrame(data=d)
# The timestamp keeps repeated runs from overwriting each other's file.
timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
df.to_csv(
    f"./predictions/churn_{config['client_group']}_{config['test_region']}_{timestamp}.csv",
    index=False,
)

### Train the contribution regressor

In [36]:
# Load every prediction CSV written for region REGION and stack them into a
# single dataframe.
# NOTE(review): the prefix hard-codes "state", whereas the files are written
# with config["client_group"] in their name — confirm these always agree.
prediction_dir = "./predictions"
prefix = "churn_state_" + REGION + "_"
files = [
    f
    for f in listdir(prediction_dir)
    # logical `and` instead of bitwise `&`: states the intent and short-circuits
    if isfile(join(prediction_dir, f)) and f.startswith(prefix)
]

dfs = [pd.read_csv("./predictions/" + filename) for filename in files]

# pd.concat raises ValueError when no matching file was found.
df = pd.concat(dfs, axis=0, ignore_index=True)

In [None]:
# Train a regressor that maps the eight imbalance features to the measured
# contribution.
# Instead of the RandomForestRegressor, any alternative can be provided.
feature_columns = [
    "Global Label Imbalance",
    "Global Distribution Imbalance",
    "Global Quantity Imbalance",
    "Global Feature Imbalance",
    "Local Label Imbalance",
    "Local Distribution Imbalance",
    "Local Quantity Imbalance",
    "Local Feature Imbalance",
]

y = df["Contribution"].to_numpy().astype(float)
df = df.drop(["Contribution"], axis=1)
# Select the features by name rather than taking every remaining column: the
# CSVs may also carry "Regions"/"States" columns, which are not imbalance
# features and would not match the eight feature names used by the SHAP plots.
X = df[feature_columns].to_numpy().astype(float)
# NOTE(review): the split is unseeded, so the reported metrics vary between
# runs even though the regressor itself uses random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestRegressor(n_estimators=150, max_depth=10, random_state=0)
clf.fit(X_train, y_train)

In [None]:
# Evaluate the trained regressor on the held-out split.
y_pred = clf.predict(X_test)
print("MAE:  ",mean_absolute_error(y_test, y_pred))
# mean_squared_error returns the MSE; take the square root so the printed
# value matches its "RMSE" label.
print("RMSE: ",np.sqrt(mean_squared_error(y_test, y_pred)))

## Contribution Understanding

### Global understanding using SHAP

In [None]:
# Global explanation: beeswarm plot of the SHAP values computed on the test
# split of the previously trained regressor.
explainer = shap.Explainer(clf)
shap_values = explainer(np.array(X_test))

feature_names = ["GLI", "GDI", "GQI", "GFI", "LLI", "LDI", "LQI", "LFI"]
num_features = len(feature_names)
# Explicit feature order 0..7, i.e. the column order of X.
order = list(range(num_features))

fig = shap.plots.beeswarm(shap_values, order=order, show=False)

plt.xlabel("SHAP value")
plt.ylabel("Imbalance")
# The tick labels are supplied in reversed order — presumably because the
# plot places the first feature at the top while tick positions count up
# from the bottom; confirm against the rendered figure.
feature_names = feature_names[::-1]
tick_positions = list(range(num_features))
plt.yticks(tick_positions, feature_names)
plt.show()

### Local Understanding using SHAP

In [None]:
sns.set(style="white")
# Local explanations: one force plot per test instance, using a tree
# explainer built for the previously trained regressor.
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)

# Short feature labels, in the column order of X_test.
base_names = ["GLI","GDI","GQI","GFI","LLI", "LDI", "LQI", "LFI"]

# iterate instances to be explained
for i, instance in enumerate(X_test):
    # Label each feature with its value for this instance, e.g. "GLI=0.1234".
    names = [
        f"{label}={instance[n]:1.4f}"
        for n, label in enumerate(base_names)
    ]

    shap_plot = shap.force_plot(
        explainer.expected_value,
        shap_values[i],
        matplotlib=True,
        show=False,
        plot_cmap=['#77dd77', '#f99191'],
        feature_names=names,
    )
    plt.gcf()
    plt.show()

# Federated Training
Apply FL with all but the rejected clients.

In [48]:
# List of states excluded for each setting according to our input control.
# The table is indexed with REGION when the final training run is set up.
# Note: this rebinds `d`, which earlier held the column dict of the CSV export.
d = {
     "Uncontrolled":[],
     "North": ["PA", "AM", "AC"],
     "Northeast": ["PE", "AL", "CE", "PB"],
     "Center West": ["DF"],
     "Southeast": ["SP"],
     "South": ["RS", "PR"],
}

In [None]:
# Perform federated training without the excluded states: reload the
# dataset, split it while excluding the states listed for REGION, then run
# the configured learning strategy.
config = load_data.load_churn_dataset(config, column="state")
# NOTE(review): `region` receives 0.2 here, whereas the earlier split passes
# region=REGION and frac=config["test_set_fraction"] (also 0.2). The fraction
# may have been passed under the wrong keyword — confirm against
# load_data.split_train_test before relying on this split.
config = load_data.split_train_test(config, region=0.2, exclude=d[REGION])
learning_strategy = strategies.get_strategy_by_name(config)
# Returns the trained model together with the F1 scores collected during training.
federated_model, federated_f1s = learning_strategy.run(config, filename=log_file, log_per_round=False, return_f1s=True)