In [None]:
import numpy as np
import cupy as cp
import os, shutil
import time
import math
import pandas as pd
import queue
from math import floor
from shutil import rmtree
from concurrent.futures import ThreadPoolExecutor
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

#### Helper Functions

In [None]:
## Read all the .npy files, build consolidated CSV files (Features/Labels) and export it to disk 
def consolidate_data(path_files, path_output, max_files=None):
    """Consolidate per-example ``.npy`` files into two CSV files on disk.

    Files in ``path_files`` whose name contains ``data_pattern`` are read as
    feature files and those whose name contains ``label_pattern`` as label
    files. Two CSVs are written into ``path_output``:

    * ``eeg_labels.csv``   -- columns ``File``, ``Label``, ``Patient``
    * ``eeg_features.csv`` -- columns ``File``, ``F1`` ... ``Fn``

    Args:
        path_files: Folder containing the extracted ``.npy`` files.
        path_output: Existing folder where both CSV files are written.
        max_files: Optional cap on how many label files and how many feature
            files are read (useful for a quick smoke test). ``None`` (the
            default) reads every matching file.
    """
    # Width used for the header when there is no feature file to measure.
    default_n_features = 53

    # Sort so the result does not depend on the OS listing order. Later cells
    # align features and labels by row position, which presumably relies on
    # both kinds of file sharing a naming suffix -- TODO confirm.
    filenames = sorted(os.listdir(path_files))
    # Match on the bare file name so a directory called e.g. "data_pattern"
    # cannot make every file look like a feature file.
    feature_files = [name for name in filenames if 'data_pattern' in name][:max_files]
    label_files = [name for name in filenames if 'label_pattern' in name][:max_files]

    # Labels: element 0 is exported as the label, element 1 as the patient.
    labels_map = []
    for name in label_files:
        # np.load is enough here: the values are only copied into a CSV, so a
        # round-trip through GPU memory adds nothing.
        curr_label = np.load(os.path.join(path_files, name))
        labels_map.append([name, curr_label[0].item(), curr_label[1].item()])

    labels_map_df = pd.DataFrame(labels_map, columns=['File', 'Label', 'Patient'])
    labels_map_df.to_csv(os.path.join(path_output, 'eeg_labels.csv'), index=False)

    # Features: only the first row of each array is exported.
    features_map = []
    for name in feature_files:
        curr_file = np.load(os.path.join(path_files, name))
        features_map.append([name] + curr_file[0].tolist())

    # Derive the header width from the data instead of assuming 53 columns.
    n_features = len(features_map[0]) - 1 if features_map else default_n_features
    feature_names = ['File'] + [f'F{feature_number + 1}' for feature_number in range(n_features)]
    features_map_df = pd.DataFrame(features_map, columns=feature_names)
    features_map_df.to_csv(os.path.join(path_output, 'eeg_features.csv'), index=False)

#### Data Consolidation

In [None]:
# Expects a folder with the extracted .npy files
# ! This is an expensive operation, proceed with caution
# Forward slashes on purpose: in a regular string literal the "\n" of
# "data-prep\non DL models Gcloud" is a newline character, and "\.", "\d",
# "\o", "\e" are invalid escape sequences. Forward slashes are accepted on
# Windows too and match the paths used by the rest of the notebook.
path_files = "../../data/outputs/eeg-signals/data-prep/non DL models Gcloud"
path_output = "../../data/outputs/eeg-signals/data-prep"
consolidate_data(path_files, path_output)

### Data Partitioning (Stratified Sampling / Cross-Patient Distribution)

##### Centralized Partitioning

In [None]:
# Load the consolidated CSVs; the file-name column is not a model input
features = pd.read_csv("../../data/outputs/eeg-signals/data-prep/eeg_features.csv").drop(columns='File')
raw_labels = pd.read_csv("../../data/outputs/eeg-signals/data-prep/eeg_labels.csv")['Label']

# Flip the labels (Preictal: 1, Interictal: 0)
labels = pd.DataFrame({'Label': np.where(raw_labels == 0, 1, 0)})

# Centralized split, done in two stratified steps that share the same seed
split_kwargs = dict(shuffle=True, random_state=64)

# Step 1: keep 70% for training and put the remaining 30% in a holdout set
x_train, x_holdout, y_train, y_holdout = train_test_split(
    features, labels, test_size=0.30, stratify=labels, **split_kwargs
)
# Step 2: cut the holdout set in half -> validation and test sets
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, stratify=y_holdout, **split_kwargs
)

output_dir = '../../data/outputs/eeg-signals/data-prep/partitions/centralized-stratified'

# Start from an empty output directory
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

os.makedirs(output_dir)

# Export features first, then labels (one CSV per split, header kept, index dropped)
centralized_exports = {
    f'{output_dir}/eeg_x_train.csv': x_train,
    f'{output_dir}/eeg_x_val.csv': x_val,
    f'{output_dir}/eeg_x_test.csv': x_test,
    f'{output_dir}/eeg_y_train.csv': y_train,
    f'{output_dir}/eeg_y_val.csv': y_val,
    f'{output_dir}/eeg_y_test.csv': y_test,
}

for export_path, export_df in centralized_exports.items():
    export_df.to_csv(export_path, header=True, index=False)

##### Federated Partitioning

In [None]:
# Only the centralized training split is distributed across federated clients
x_dev = x_train
y_dev = y_train

# How many powers of 2 to have as configurations (2, 4, ..., 2**n_fl_configs clients)
n_fl_configs = 4
fl_configs = [2**config for config in range(1, n_fl_configs + 1)]
fl_partitions = {}

# Each configuration is obtained by cutting every partition of the previous
# configuration into 2 stratified halves, starting from the whole dataset.
# The splits are seeded, so one pass over the levels yields exactly the
# partitions that restarting from scratch for every configuration would.
partitions = [{'x_dev': x_dev, 'y_dev': y_dev}]

for fl_config in fl_configs:
    next_partitions = []

    for partition in partitions:
        x_dev_partition = partition['x_dev']
        y_dev_partition = partition['y_dev']

        # Partition the dataset into 2 stratified dataframes
        x_dev_1, x_dev_2, y_dev_1, y_dev_2 = train_test_split(
            x_dev_partition, y_dev_partition,
            test_size=0.5, shuffle=True, random_state=64, stratify=y_dev_partition
        )

        # Keep both halves (in order) for the next level of splitting
        next_partitions.append({'x_dev': x_dev_1, 'y_dev': y_dev_1})
        next_partitions.append({'x_dev': x_dev_2, 'y_dev': y_dev_2})

    partitions = next_partitions
    # len(partitions) == fl_config at this point: one partition per client
    fl_partitions[fl_config] = list(partitions)

# Save all partitions to disk
for fl_config, client_partitions in fl_partitions.items():
    output_dir = f'../../data/outputs/eeg-signals/data-prep/partitions/fl-stratified/{fl_config}-flclients'

    # Clean the directory first (it only holds this configuration's files)
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    os.makedirs(output_dir)

    # Clients are numbered from 1 in the file names
    for fl_client, partition_df in enumerate(client_partitions, start=1):
        partition_df['x_dev'].to_csv(f'{output_dir}/eeg_x_train_flc{fl_client}.csv', header=True, index=False)
        partition_df['y_dev'].to_csv(f'{output_dir}/eeg_y_train_flc{fl_client}.csv', header=True, index=False)

## Export partitions stats for federated distribution analysis
stats = []

for fl_config, client_partitions in fl_partitions.items():
    for fl_client, partition_df in enumerate(client_partitions, start=1):
        # Named client_labels (not labels_df) so this loop does not overwrite
        # the labels_df dataframe used by the patient-aware cells.
        client_labels = partition_df['y_dev']
        # Share (in %) of each class inside this client's partition
        props = (client_labels['Label'].value_counts(normalize=True) * 100).round(2)
        # Share (in %) of the training examples held by this client
        prop_examples = round((client_labels.shape[0] / y_train.shape[0]) * 100, 2)

        summary = {
            'fl_config': fl_config,
            'fl_client': fl_client,
            # .get: a class that is absent from a partition counts as 0%
            'prop_neg': props.get(0, 0.0),
            'prop_pos': props.get(1, 0.0),
            'prop_examples': prop_examples,
        }
        stats.append(summary)

stats_df = pd.DataFrame(stats)

output_dir = '../../data/outputs/eda'

# The EDA folder is shared with other outputs (e.g. the patient-aware stats),
# so only make sure it exists instead of deleting it; the CSV below is
# overwritten on every run anyway.
os.makedirs(output_dir, exist_ok=True)

stats_df.to_csv(f'{output_dir}/stats_eeg_fl_stratified.csv', header=True, index=False)

### Data Partitioning (Patient-Aware Sampling)

##### Centralized Partitioning

In [None]:
# Read the data
features_df = pd.read_csv("../../data/outputs/eeg-signals/data-prep/eeg_features.csv")
labels_df = pd.read_csv("../../data/outputs/eeg-signals/data-prep/eeg_labels.csv")
# Flip the labels (Preictal: 1, Interictal: 0)
labels_df['Label'] = np.where(labels_df['Label'] == 0, 1, 0)

# The masks below are computed on labels_df and applied to features_df, so
# both frames must describe the same examples row by row.
assert len(features_df) == len(labels_df), 'features and labels CSVs have a different number of rows'

# Get the amount of examples per patient
df_patients = labels_df.groupby('Patient')['File'].count().rename('Examples').reset_index()
df_patients['Percentage'] = round((df_patients['Examples']/labels_df.shape[0])*100, 3)
df_patients = df_patients.sort_values('Percentage', ascending=False)

# Select patients that will be included in the holdout sets (The number of patients was calibrated to account around 15% of the data)
holdout_patients = df_patients.sample(frac=0.25, random_state=64)['Patient'].to_list()
train_patients = [patient for patient in df_patients['Patient'].to_list() if patient not in holdout_patients]
# The first 4 sampled holdout patients form the test set, the rest the validation set
test_patients = holdout_patients[:4]
val_patients = holdout_patients[4:]

# Check which examples go in each bucket
# NOTE(review): the validation mask used to be built from test_patients and
# the test mask from val_patients; each mask now follows its own patient
# list, so the val/test files swap contents relative to earlier runs.
is_train_patient = labels_df['Patient'].isin(train_patients)
is_val_patient = labels_df['Patient'].isin(val_patients)
is_test_patient = labels_df['Patient'].isin(test_patients)

# Sanity Checks (.sum() counts the True entries and is 0, not a KeyError, for an empty bucket)
train_patients_cnt = int(is_train_patient.sum())
val_patients_cnt = int(is_val_patient.sum())
test_patients_cnt = int(is_test_patient.sum())
holdout_patients_cnt = val_patients_cnt + test_patients_cnt
total_patients_cnt = labels_df.shape[0]

print(f'% of examples in training set: {(total_patients_cnt-holdout_patients_cnt)/total_patients_cnt: .2f}')
print(f'% of examples in holdout set: {holdout_patients_cnt/total_patients_cnt: .2f}')
print(f'- % of examples in validation set: {val_patients_cnt/total_patients_cnt: .2f}')
print(f'- % of examples in testing set: {test_patients_cnt/total_patients_cnt: .2f}')

# Split into train, val and test sets
x_train = features_df[is_train_patient].drop('File', axis=1)
y_train = labels_df[is_train_patient]['Label'].to_frame('Label')
x_val = features_df[is_val_patient].drop('File', axis=1)
y_val = labels_df[is_val_patient]['Label'].to_frame('Label')
x_test = features_df[is_test_patient].drop('File', axis=1)
y_test = labels_df[is_test_patient]['Label'].to_frame('Label')

# Write results to disk
output_dir = '../../data/outputs/eeg-signals/data-prep/partitions/centralized-patient-aware'

# Clean the directory first
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

os.makedirs(output_dir)

# Export Features
x_train.to_csv(f'{output_dir}/eeg_x_train.csv', header=True, index=False)
x_val.to_csv(f'{output_dir}/eeg_x_val.csv', header=True, index=False)
x_test.to_csv(f'{output_dir}/eeg_x_test.csv', header=True, index=False)

# Export Labels
y_train.to_csv(f'{output_dir}/eeg_y_train.csv', header=True, index=False)
y_val.to_csv(f'{output_dir}/eeg_y_val.csv', header=True, index=False)
y_test.to_csv(f'{output_dir}/eeg_y_test.csv', header=True, index=False)


##### Federated Partitioning

In [None]:
# Get a mapping of the patients in each federated client, per federated configuration
fl_map_dict = {}

for client_config in [2, 4, 8, 16]:
    # Deal the training patients out round-robin: client k (numbered from 1)
    # receives patients k-1, k-1+client_config, k-1+2*client_config, ...
    # A client gets an empty list when there are fewer patients than clients.
    fl_map_dict[client_config] = {
        client: train_patients[client - 1::client_config]
        for client in range(1, client_config + 1)
    }

# Save all partitions to disk
for fl_config, client_patient_dict in fl_map_dict.items():
    output_dir = f'../../data/outputs/eeg-signals/data-prep/partitions/fl-patient-aware/{fl_config}-flclients'

    # Clean the directory first (it only holds this configuration's files)
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    os.makedirs(output_dir)

    for fl_client, partition_patients in client_patient_dict.items():
        # Mask built on labels_df and applied to features_df: both frames are
        # expected to be row-aligned.
        include_patients = labels_df['Patient'].isin(partition_patients)

        x_dev_partition_df = features_df[include_patients].drop('File', axis=1)
        y_dev_partition_df = labels_df.loc[include_patients, 'Label']

        x_dev_partition_df.to_csv(f'{output_dir}/eeg_x_train_flc{fl_client}.csv', header=True, index=False)
        y_dev_partition_df.to_csv(f'{output_dir}/eeg_y_train_flc{fl_client}.csv', header=True, index=False)

## Export partitions stats for federated distribution analysis
stats = []

# For each fl config
for fl_config, client_patient_dict in fl_map_dict.items():
    for fl_client, partition_patients in client_patient_dict.items():
        include_patients = labels_df['Patient'].isin(partition_patients)
        client_labels = labels_df.loc[include_patients, 'Label']

        # Share (in %) of each class inside this client's partition
        props = (client_labels.value_counts(normalize=True) * 100).round(2)
        # Share (in %) of the training examples held by this client
        prop_examples = round((client_labels.shape[0] / y_train.shape[0]) * 100, 2)

        summary = {
            'fl_config': fl_config,
            'fl_client': fl_client,
            # Patients are not stratified, so a client may hold a single
            # class; .get reports the missing class as 0% instead of raising.
            'prop_neg': props.get(0, 0.0),
            'prop_pos': props.get(1, 0.0),
            'prop_examples': prop_examples,
        }
        stats.append(summary)

stats_df = pd.DataFrame(stats)
output_dir = '../../data/outputs/eda'

# Do not rely on another cell having created the EDA folder
os.makedirs(output_dir, exist_ok=True)

# Long format: one row per (configuration, client, class proportion).
# prop_examples is not part of value_vars, so it is not exported.
stats_df = pd.melt(stats_df, id_vars=['fl_config', 'fl_client'], value_name='value', value_vars=['prop_neg', 'prop_pos'])
stats_df.to_csv(f'{output_dir}/stats_eeg_fl_patient-aware.csv', header=True, index=False)