# Analysis of Split ECG latent space for Model Poisoning Detection

## Distance-Based Per-Class First Moments

In [1]:
import numpy as np
import torch 
import os
import sys
import pickle
import pandas as pd
import scipy.spatial as sp
from functools import partial
import gc
import multiprocessing
from contextlib import closing
#from tqdm import tqdm
import tqdm
from tqdm.contrib.concurrent import process_map, thread_map


In [62]:

def class_weights(dataset):
    """Return softmax-normalised weights derived from per-class sample counts.

    For every label column ``i`` in ``range(dataset.num_classes)`` the entries
    of ``dataset.labels[:, i]`` are summed, and the resulting counts are passed
    through a softmax so that the weights are positive and sum to 1.

    Note that counts are taken per label column, not per unique combination of
    labels, and that the softmax is increasing in the count: classes with more
    samples receive larger weights.

    Args:
        dataset: Object exposing ``num_classes`` and a 2-D ``labels`` array
            that can be indexed as ``labels[:, i]``.

    Returns:
        1-D ``numpy.ndarray`` of length ``dataset.num_classes``.
    """
    # Get the number of samples in each class
    class_counts = np.zeros(dataset.num_classes)
    for i in range(dataset.num_classes):
        class_counts[i] = np.sum(dataset.labels[:, i])

    # Nothing to normalise (and np.max would raise on an empty array).
    if class_counts.size == 0:
        return class_counts

    # Shift by the maximum before exponentiating. The softmax is unchanged
    # mathematically, but np.exp no longer overflows to inf (and the result
    # to nan) once a class has more than ~709 samples.
    exp_counts = np.exp(class_counts - np.max(class_counts))
    return exp_counts / np.sum(exp_counts)

def filter_labels(df, idx, val=1):
    """Keep only the rows of ``df`` whose label has ``val`` at position ``idx``.

    Args:
        df: DataFrame with a ``label`` column whose entries are indexable
            (e.g. multi-hot label vectors).
        idx: Position inside each label to inspect.
        val: Value the label must hold at ``idx`` for the row to be kept.

    Returns:
        The subset of ``df`` for which ``label[idx] == val``.
    """
    def _has_value(label):
        return label[idx] == val

    keep = df.label.apply(_has_value)
    return df[keep]

def get_similarities(X, similarity_functions):
    """Compute pairwise distances between the rows of ``X`` for several metrics.

    Each metric name is handed to ``scipy.spatial.distance.pdist``, which
    returns the distances of all unordered row pairs in condensed (1-D) form.
    Despite the function name, the values are distances, not similarities.

    Args:
        X: 2-D array with one observation per row.
        similarity_functions: Iterable of metric names accepted by ``pdist``.

    Returns:
        Dict mapping every metric name to its condensed distance array.
    """
    pairwise = sp.distance.pdist
    return {metric: pairwise(X, metric) for metric in similarity_functions}

def get_unique_labels(df):
    """Return the distinct labels of ``df`` together with their frequencies.

    Args:
        df: DataFrame with a ``label`` column of hashable, sortable values
            (e.g. labels already encoded as a single number).

    Returns:
        2-D ``numpy.ndarray`` with one row per distinct label, sorted by label:
        column 0 holds the label, column 1 the number of rows carrying it.
    """
    # value_counts() keeps every count attached to its label, so sorting the
    # result by index yields aligned (label, count) pairs. Pairing unique()
    # (order of appearance) with value_counts().values (descending count)
    # would attach counts to the wrong labels.
    counts = df.label.value_counts().sort_index()
    return np.column_stack((counts.index.to_numpy(), counts.to_numpy()))

def per_label_similarities(epoch, base_path, similarities):
    """Compute the pairwise latent-vector distances of one epoch, per label.

    Loads ``<base_path>/epoch_<epoch>.pickle``, collapses every label vector
    into a single integer, groups the samples by that integer and computes the
    pairwise distances of each group's ``client_output_pooled`` vectors.

    Args:
        epoch: Epoch number; selects the pickle file to read.
        base_path: Directory containing the ``epoch_<n>.pickle`` files.
        similarities: List of metric names forwarded to ``get_similarities``.

    Returns:
        DataFrame with columns ``["epoch", "label"] + similarities`` and one
        row per label group; each metric cell holds that group's condensed
        distance array.
    """
    epoch_path = os.path.join(base_path, f"epoch_{epoch}.pickle")
    # NOTE: unpickling can execute arbitrary code; only read trusted files.
    with open(epoch_path, "rb") as f:
        samples = pickle.load(f)

    # Read the label vector as a binary number (first element is the most
    # significant bit) so every label combination becomes one hashable key.
    # Assumes each label is a numpy array (uses ``x.size``).
    samples.label = samples.label.apply(lambda x: np.sum(x * 2**np.arange(x.size)[::-1]))

    rows = []
    # Group by the column name, not a one-element list: with a list, recent
    # pandas versions yield 1-tuples as group keys instead of scalars.
    for label, group in samples.groupby("label"):
        latent_vectors = np.array(group.client_output_pooled.values.tolist())
        row = get_similarities(latent_vectors, similarities)
        row["epoch"] = epoch
        row["label"] = label
        rows.append(row)

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=["epoch", "label"] + similarities)

def compute_in_parallel(base_path, epochs, similarities, num_workers=5, save_path=None):
    """Run ``per_label_similarities`` for epochs ``1..epochs`` in a process pool.

    Args:
        base_path: Directory containing the ``epoch_<n>.pickle`` files.
        epochs: Number of epochs to process (epoch numbers start at 1).
        similarities: List of metric names to compute.
        num_workers: Number of worker processes.
        save_path: Optional file name, relative to ``base_path``; when given,
            the combined frame is pickled there.

    Returns:
        DataFrame with columns ``["epoch", "label"] + similarities`` holding
        the rows of all epochs. Rows arrive in completion order, so they are
        not sorted by epoch.
    """
    worker = partial(per_label_similarities, base_path=base_path, similarities=similarities)

    frames = []
    pool = multiprocessing.Pool(processes=num_workers)
    try:
        with tqdm.tqdm(total=epochs) as pbar:
            for frame in pool.imap_unordered(worker, range(1, epochs + 1)):
                frames.append(frame)
                pbar.update()
    finally:
        # Tear the pool down exactly once, also when a worker raised.
        pool.terminate()
        pool.join()

    if frames:
        # One concat at the end instead of re-copying the frame per epoch.
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=["epoch", "label"] + similarities)

    if save_path:
        df.to_pickle(os.path.join(base_path, save_path))
    return df

In [3]:
# Metric names passed to scipy.spatial.distance.pdist for every label group.
# NOTE(review): "jaccard" is presumably meant for boolean vectors; it is a
# constant 1.0 in every output shown in this notebook - confirm it is useful.
similarities = ["cosine", "euclidean", "cityblock", "correlation", "jaccard"]
# Number of epochs to analyse; epoch files 1..epochs are read.
epochs = 30

### Case 1: Honest Client

In [45]:
# Directory with the honest client's per-epoch latent-space pickles.
base_path = "/home/mohkoh/Projects/Split_ECG_Classification/latent_space/single_client_honest/client_1"
# Uncomment to recompute the distances and cache them as
# similarities_honest.pickle inside base_path.
#df_sim_honest = compute_in_parallel(base_path, epochs, similarities, save_path="similarities_honest.pickle")

In [46]:
# Load the cached distances of the honest client. pd.read_pickle is the
# counterpart of DataFrame.to_pickle and closes the file itself.
# NOTE: unpickling can execute arbitrary code; only read trusted files.
df_sim_honest = pd.read_pickle(os.path.join(base_path, "similarities_honest.pickle"))

In [47]:
# Reduce every cell from an array of pairwise distances to its mean.
# An empty array (presumably a label group with a single sample, which has no
# pairs) maps to NaN directly, so np.mean does not emit a RuntimeWarning.
for s in similarities:
    df_sim_honest[s] = df_sim_honest[s].apply(lambda x: np.mean(x) if np.size(x) else np.nan)

  return _methods._mean(a, axis=axis, dtype=dtype,


In [70]:
# Column means for label 5 at epoch 30 (honest client).
df_sim_honest.loc[lambda d: (d.label == 5.0) & (d.epoch == 30)].mean()

epoch           30.000000
label            5.000000
cosine           0.093269
euclidean       35.726017
cityblock      141.188511
correlation      0.087024
jaccard          1.000000
dtype: float64

In [72]:
# Rows for label 5 at epoch 30 (honest client).
df_sim_honest.loc[lambda d: (d.label == 5.0) & (d.epoch == 30)]

Unnamed: 0,epoch,label,cosine,euclidean,cityblock,correlation,jaccard
641,30,5.0,0.093269,35.726017,141.188511,0.087024,1.0


In [77]:
# Mean distances of label 2 over the epochs (honest client).
df_sim_honest.loc[lambda d: d.label == 2.0].sort_values(by="epoch")

Unnamed: 0,epoch,label,cosine,euclidean,cityblock,correlation,jaccard
89,1,2.0,0.140021,0.232187,0.915716,0.136112,1.0
67,2,2.0,0.192272,0.421606,1.653392,0.186834,1.0
1,3,2.0,0.119043,1.619107,6.35857,0.114372,1.0
23,4,2.0,0.078803,4.345937,16.931035,0.074516,1.0
45,5,2.0,0.087734,7.00654,27.46128,0.082934,1.0
111,6,2.0,0.108336,8.656045,33.994794,0.102934,1.0
133,7,2.0,0.105669,9.872597,38.78148,0.099989,1.0
155,8,2.0,0.104453,10.895528,42.886002,0.098594,1.0
177,9,2.0,0.097394,11.631895,45.860806,0.091516,1.0
199,10,2.0,0.096418,12.550113,49.491111,0.090395,1.0


In [66]:
# Variance of every column across epochs for label 12 (honest client).
df_sim_honest.loc[lambda d: d.label == 12.0].var()

epoch            77.500000
label             0.000000
cosine            0.000128
euclidean       125.472394
cityblock      2034.404335
correlation       0.000128
jaccard           0.000000
dtype: float64

In [69]:
# Load the experiment metadata of the honest client; the context manager
# closes the file handle.
# NOTE: unpickling can execute arbitrary code; only read trusted files.
with open("/home/mohkoh/Projects/Split_ECG_Classification/latent_space/single_client_honest/client_1/metadata.pickle", "rb") as meta_file:
    meta = pickle.load(meta_file)
meta

({'exp_name': 'single_client_honest',
  'client_num': 1,
  'is_malicious': False,
  'batchsize': 64,
  'random_point_prob': 0.2,
  'random_label_prob': 0.1},)

### Case 2: Malicious Client

In [49]:
# Directory with the malicious client's per-epoch latent-space pickles.
base_path = "/home/mohkoh/Projects/Split_ECG_Classification/latent_space/single_client_malicious/client_1"
# Uncomment to recompute the distances and cache them as
# similarities_malicious.pickle inside base_path.
#df_sim_malicious = compute_in_parallel(base_path, epochs, similarities, save_path="similarities_malicious.pickle")

In [50]:
# Load the cached distances of the malicious client. pd.read_pickle is the
# counterpart of DataFrame.to_pickle and closes the file itself.
# NOTE: unpickling can execute arbitrary code; only read trusted files.
df_sim_malicious = pd.read_pickle(os.path.join(base_path, "similarities_malicious.pickle"))

In [51]:
# Reduce every cell from an array of pairwise distances to its mean.
# An empty array (presumably a label group with a single sample, which has no
# pairs) maps to NaN directly, so np.mean does not emit a RuntimeWarning.
for s in similarities:
    df_sim_malicious[s] = df_sim_malicious[s].apply(lambda x: np.mean(x) if np.size(x) else np.nan)

In [78]:
# Mean distances of label 2 over the epochs (malicious client).
df_sim_malicious.loc[lambda d: d.label == 2.0].sort_values(by="epoch")

Unnamed: 0,epoch,label,cosine,euclidean,cityblock,correlation,jaccard
2,1,2.0,0.157481,0.251441,0.992092,0.154964,1.0
98,2,2.0,0.163464,0.312491,1.224345,0.157153,1.0
66,3,2.0,0.10756,0.451769,1.778263,0.102383,1.0
130,4,2.0,0.10101,0.656637,2.576239,0.096865,1.0
34,5,2.0,0.09219,1.24541,4.900747,0.086329,1.0
162,6,2.0,0.087928,1.513455,5.949912,0.082102,1.0
194,7,2.0,0.081729,2.131535,8.351654,0.076514,1.0
226,8,2.0,0.077407,2.56512,10.076274,0.07193,1.0
258,9,2.0,0.065052,2.644972,10.357064,0.061771,1.0
290,10,2.0,0.062044,3.02265,11.895438,0.058918,1.0


In [71]:
# Column means for label 5 at epoch 30 (malicious client).
df_sim_malicious.loc[lambda d: (d.label == 5.0) & (d.epoch == 30)].mean()

epoch          30.000000
label           5.000000
cosine          0.047480
euclidean       5.702662
cityblock      22.432329
correlation     0.046966
jaccard         1.000000
dtype: float64

In [67]:
# Variance of every column across epochs for label 12 (malicious client).
df_sim_malicious.loc[lambda d: d.label == 12.0].var()

epoch          77.500000
label           0.000000
cosine          0.001237
euclidean       5.451526
cityblock      84.421348
correlation     0.001150
jaccard         0.000000
dtype: float64