In [None]:
import sys, os
sys.path.insert(1, os.path.join(os.getcwd(), "src"))
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import copy
from my_library import read_log, filter_results
from pprint import pprint
from math import ceil
%matplotlib inline

In [None]:
def make_label_rule(metadata):
    """Return the sorted names of the columns whose values differ between rows.

    These are the columns worth showing in a plot legend, since every other
    column has the same value for all the selected runs.

    Args:
        metadata: DataFrame with one row per run. The "log_file" column is
            always ignored.

    Returns:
        Alphabetically sorted list of column names whose values are not all
        equal to the value in the first row. An empty frame yields an empty
        list.
    """
    rule = []
    for col in metadata.columns:
        if col == "log_file":
            continue
        values = list(metadata[col].values)
        # Guard against an empty selection: there is no first row to compare
        # against, so no column can be considered "varying".
        if values and values.count(values[0]) != len(values):
            rule.append(col)
    rule.sort()
    return rule

def col_to_string(config, r):
    """Render one configuration entry as human-readable label text.

    Args:
        config: Object indexable by column name (e.g. a dict or a DataFrame
            row) holding the configuration of a single run.
        r: Name of the configuration column to render.

    Returns:
        The label fragment for column ``r``. "clients_mobility" renders as an
        empty string; its value is only used to decide how "mobility_rate"
        is shown.

    Raises:
        ValueError: If ``r`` is not one of the known columns.
    """
    # Renderers are lazy so that only the matching column reads `config`.
    renderers = (
        ("server_global_rate", lambda: f"Global rate: {config[r]}"),
        ("mobility_rate", lambda: f"Mobility: {config[r] if config['clients_mobility'] else 'static'}"),
        ("client_n_epochs", lambda: f"Local iterations: {config[r]}"),
        ("client_batch_size", lambda: f"Batch Size: {config[r]}"),
        ("dataset_distribution", lambda: f"Data: {config[r]}"),
        ("lr_warmup", lambda: f"LR Warmup: {'Yes' if config[r] else 'No'}"),
        ("clients_mobility", lambda: ""),
        ("epochs_delay_localSGD", lambda: f"LocalSGD delay: {int(config[r])}"),
        ("n_clusters", lambda: f"Clusters: {config[r]}"),
        ("move_to_neighbours", lambda: f"Move to: {'neighbours' if config[r] else 'any'}"),
        ("client_algorithm", lambda: f"Algorithm: {config[r]}"),
    )
    for column_name, render in renderers:
        if r == column_name:
            return render()
    raise ValueError(r)

def make_label(config, rule):
    """Build the legend label of one run.

    Args:
        config: Configuration of the run, indexable by column name.
        rule: Iterable of column names to include, in display order.

    Returns:
        The non-empty texts produced by ``col_to_string`` for each column in
        ``rule``, joined with " - ". Empty string when nothing is rendered.
    """
    fragments = [col_to_string(config, column) for column in rule]
    return " - ".join(fragment for fragment in fragments if fragment)

def make_plot_title(metadata, cols,sep="\n"):
    """Build the parenthesised subtitle appended to a plot title.

    The values are read from the first row of ``metadata`` only.

    Args:
        metadata: DataFrame with the configuration of the plotted runs.
        cols: Column names to mention in the subtitle, in display order.
        sep: Text inserted after the comma between two consecutive entries.

    Returns:
        A string of the form "\\n(<text>,<sep><text>...)", or an empty string
        when no column produces any text (previously a lone ")" was returned
        in that case).
    """
    parts = []
    for col in cols:
        text = col_to_string(metadata.iloc[0], col)
        if text:
            parts.append(text)
    if not parts:
        # Nothing to show: do not emit unbalanced parentheses.
        return ""
    return "\n(" + f",{sep}".join(parts) + ")"

In [None]:
# Directory (relative to the current working directory) holding the result logs.
# Alternative location: TEST_RESULTS_DIR = "log_files"
TEST_RESULTS_DIR = "results_library"

results_dir = os.path.join(os.getcwd(), TEST_RESULTS_DIR)
# Keep the averaged result files only, in alphabetical order.
files = sorted(name for name in os.listdir(results_dir) if "_avg" in name)
print(f"Total result files found:\t{len(files)}")

# !ls -1 {results_dir}
# print()
# pprint(files)

In [None]:
# FILTER RESULT FILES

# Keep the files whose name contains ALL of the substrings below
# (an empty list keeps every file).
select_str = ["cifar","multi"]
# select_str = ["test", "C"]
selected_files = [name for name in files if all(token in name for token in select_str)]

# LOAD RESULTS FROM SELECTED FILES
selected_files = [os.path.join(results_dir, name) for name in selected_files]
data = [read_log(path) for path in selected_files]

print(f"Loaded {len(data)} result files.")


# One row per run. The column set is taken from the first config having the
# largest number of keys; values missing in the other configs become False.
metadata = [entry["config"] for entry in data]
widest_config = max(metadata, key=lambda cfg: len(cfg.keys()), default={})
columns = list(widest_config.keys())
metadata = pd.DataFrame(metadata, columns=columns).fillna(False)
# Logging/debug settings are dropped from the metadata table.
unused_columns = ["log_frequency","log_verbosity","stdout_frequency","stdout_verbosity","stop_value","debug"]
metadata = metadata.drop(unused_columns, axis=1)

# One row per run, with the columns of the first loaded result.
results = [entry["results"] for entry in data]
results = pd.DataFrame(results, columns=results[0].keys())

In [None]:
#####################################
#    DEFINE FILTERS FOR RESULTS
#####################################
# Maps a metadata column to the list of values passed to filter_results.
# Presumably an empty list means "do not filter on this column" -- confirm
# against filter_results.
filters = {
    "dataset_distribution": ["non_iid_spatial"],
    "client_batch_size": [32],
    "client_n_epochs": [],
    "mobility_rate": [0],
    "server_global_rate": [],
    "lr_warmup": [False],
    "epochs_delay_localSGD": [],

    "dataset_name": ["cifar10"],
    "model_type": [],
    "client_algorithm": ["sgd"],
    "n_clusters": [25],
    "move_to_neighbours": [False],

    "client_lr": [],
    "client_selection_fraction": [],
}
#####################################
#####################################
sort_columns = [
    "client_algorithm",
    "dataset_distribution",
    "server_global_rate",
    "client_batch_size",
    "mobility_rate",
    "client_n_epochs",
    "epochs_delay_localSGD",
    "lr_warmup",
]
selected_metadata, selected_results = filter_results(metadata, results, filters, sort_by=sort_columns)
# Names of the filters that have at least one value.
used_filters = [name for name, allowed_values in filters.items() if allowed_values]
print(f"Selected {len(selected_metadata)} results.")
selected_metadata#.head()
# selected_results.head()

In [None]:
# Columns that differ between the selected runs; they make up the legend labels.
rule = make_label_rule(selected_metadata)

# PLOT RESULTS
# One panel per metric: (column of selected_results, panel title, y-axis label).
panels = [
    ("test_accuracy", "Test Accuracy", "Accuracy"),
    ("weight_divergence", "Weight Divergence", "Average Divergence"),
]

# The subtitle depends only on the selection, so it is built once for both panels.
# ttl = make_plot_title(selected_metadata, cols=["mobility_rate","move_to_neighbours"])
ttl = make_plot_title(selected_metadata, cols=["dataset_distribution","mobility_rate"], sep=" ")

fig, axes = plt.subplots(1, 2, figsize=(12,5))
# fig, axes = plt.subplots(1, 2, figsize=(16,8))
for ax, (acc_name, panel_title, y_label) in zip(axes, panels):
    ax.set_title(panel_title + ttl)
    max_x = 0
    max_y = 0
    for r_idx, r in selected_results.iterrows():
        # The x axis is the round index. Alternatives that were tried:
        #   X = [round_ * r["latency_median"] for round_ in r["rounds"]]
        #   X = np.cumsum(r["latency"])
        X = r["rounds"]
        acc = r[acc_name]
        label = make_label(selected_metadata.loc[r_idx], rule=rule)
        # X = X[:50]
        # acc = acc[:50]
        ax.plot(X, acc, label=label)
        max_x = max(max_x, max(X))
        max_y = max(max_y, max(acc))
    if "accuracy" in acc_name:
        # Fixed accuracy range (the full 0-1 range was narrowed to 0-0.8).
        ax.set_ylim(0, 0.8)
        # ax.set_yticks([x/10 for x in range(11)])
        # ax.plot(list(range(0,251,5)), [0.95 for _ in range(0,251,5)],"_",markeredgewidth=5)
    else:
        # Round the upper limit up to the next integer above the largest value.
        ax.set_ylim(0, ceil(max_y))
    ax.set_xlim(0, max_x)
    ax.set_xlabel("Rounds")
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid()
plt.show()
# print(max_x)

In [None]:
# Test accuracy a run has to reach to count as having hit the target.
threshold = 0.75
rounds2target_test = []
latency = []
weight_div = []
for row_idx, res in selected_results.iterrows():
    # 1-based position of the first accuracy entry reaching the threshold,
    # or None when the run never gets there.
    round_target = next(
        (
            position
            for position, accuracy in enumerate(res.test_accuracy, start=1)
            if accuracy >= threshold
        ),
        None,
    )
    rounds2target_test.append(round_target)
    latency.append(res.latency_median)
    weight_div.append(np.mean(res.weight_divergence))

for target in rounds2target_test:
    print(target)
print("+++++++++")
for divergence in weight_div:
    print(round(divergence,1))
# pprint([round(wd,1) for wd in weight_div])

# selected_results["rounds2target_test"] = rounds2target_test
# selected_results["latency"] = latency


In [None]:
# Work on a copy so that selected_metadata is left untouched.
corr_metadata = selected_metadata.copy()
if "epochs_delay_localSGD" in corr_metadata.columns:
    corr_metadata["epochs_delay_localSGD"] = pd.to_numeric(corr_metadata["epochs_delay_localSGD"])
corr_metadata["rounds2target_test"] = rounds2target_test
corr_metadata["latency"] = latency
corr_metadata["avg_weight_divergence"] = weight_div

# Outcome measures: shown as columns of the table and removed from its rows.
outcome_cols = ["rounds2target_test","avg_weight_divergence", "latency"]
(
    corr_metadata
    # Encode booleans and the algorithm name as numbers.
    .replace(False,0).replace(True,1).replace("sgd",0).replace("scaffold",1)
    # Keep numeric columns only: since pandas 2.0 DataFrame.corr() no longer
    # drops non-numeric columns silently and fails on string columns.
    .select_dtypes(include=["number", "bool"])
    .corr()
    .round(2)[outcome_cols]
    .drop(["client_selection_fraction"] + outcome_cols)
    .dropna()
)

In [None]:
# Hard-coded correlations with the number of rounds needed to reach the target
# accuracy, one value per mobility rate.
# NOTE(review): presumably copied from earlier runs of the correlation cell
# with a 0.95 accuracy threshold -- confirm before reusing.
mobility = [0,0.1,0.25,0.5]
rate_corr_by_clusters = {
    "9 clusters": [1,-0.25,-0.34,-0.78],
    # np.nan instead of np.NaN: the latter alias was removed in NumPy 2.0.
    "25 clusters": [np.nan,-0.37,-0.48,-0.31],
}
neighb_corr_by_clusters = {
    "9 clusters": [-0.03,0.15,0.37,0.24],
    "25 clusters": [-1,-0.48,0.42,-0.21],
}


def plot_correlation_vs_mobility(corr_by_label, parameter_name):
    """Plot one correlation curve per entry of ``corr_by_label`` against ``mobility``.

    Args:
        corr_by_label: Mapping from legend label to the list of correlation
            values, one value per entry of ``mobility``.
        parameter_name: Name of the studied parameter, used as first title line.
    """
    plt.figure(figsize=(10,6))
    for label, corr_values in corr_by_label.items():
        plt.plot(mobility, corr_values, marker="o", label=label)
    plt.grid()
    plt.xticks(mobility)
    plt.ylim(-1,1)
    plt.xlabel("Mobility Rate")
    plt.ylabel("Correlation w/ Rounds @ 0.95 accuracy")
    plt.title(f"{parameter_name}\n(Negative correlation means faster\nconvergence with higher parameter value)")
    plt.legend()
    plt.show()


plot_correlation_vs_mobility(rate_corr_by_clusters, "Global Update Rate")
plot_correlation_vs_mobility(neighb_corr_by_clusters, "Move To Neighbours Only")

In [None]:
# selected_metadata = selected_metadata.sort_values("server_global_rate")

In [None]:
# temp = selected_results.loc[selected_metadata.index[:2]]
# from src.my_library import read_log,write_log


In [None]:
# for i in range(0,len(selected_metadata.index),2):
# # for i in range(1):
#     print("YOOO",i, "rate",selected_metadata.iloc[i]["server_global_rate"])
#     temp_logs=[]
#     file_names=[]
#     for e in selected_metadata.loc[selected_metadata.index[i:i+2]]["log_file"]:
#         print(e)
#         file_names.append(f"{e}_avg.json")
#         log = read_log(f"{e}_avg.json")
#         temp_logs.append(log)
#     temp = selected_results.loc[selected_metadata.index[i:i+2]]
# #     display(temp)
#     for n_c,c in enumerate(temp.columns):
#         a = temp[c]

#         if c=="latency_median":
#             for i in temp[c].index:
#                 temp.at[i,c] = float(np.mean(a))
#             print(np.mean(a))
#         elif c=="rounds":
#             pass
#         else:

#             boh = np.array([x for x in a])
#             for i in temp[c].index:
#                 temp.at[i,c] = list(np.mean(boh, axis=0))
# #             pprint(np.mean(boh, axis=0))
#     for j,log in enumerate(temp_logs):
#         print(file_names[j])
#         print(log)
#         for c in temp.columns:
#             log["results"][c] = temp.iloc[0][c]
#         write_log(file_names[j], log)
        
    
