In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
from sys import getsizeof
import os
import cv2

#### EEG Dataset EDA

In [None]:
## Dataset Size
# Load the label table and the feature table from the data-prep output folder.
labels_df = pd.read_csv("../../data/outputs/eeg-signals/data-prep/eeg_labels.csv")
features_df = pd.read_csv("../../data/outputs/eeg-signals/data-prep/eeg_features.csv")

# Flip the labels (Preictal: 1, Interictal: 0): a stored 0 becomes 1 and every
# other stored value becomes 0. The array is assigned directly (positionally),
# so the result does not depend on how the frame's index happens to be labelled.
labels_df['Label'] = np.where(labels_df['Label'] == 0, 1, 0)

# Report the in-memory footprint of both frames in GB (bytes / 1e9).
print(f'Size of Labels in Memory: {getsizeof(labels_df)/1e+9: .3f} GB')
print(f'Size of Features in Memory: {getsizeof(features_df)/1e+9: .3f} GB')

In [None]:
## Data Preparation (EDA)
# Total number of labelled examples; the denominator of every dataset-level percentage.
n_examples = labels_df.shape[0]

# Distribution of Classes: number of files per label and their share of the dataset.
df_classes = labels_df.groupby('Label')['File'].count().rename('Examples').reset_index()
df_classes['Percentage'] = (df_classes['Examples'] / n_examples * 100).round(3)

# Distribution of Patients: number of files per patient and their share of the
# dataset, ordered from the largest share to the smallest. Computed once and
# reused below for both the per-patient class table and the test-set sampling.
df_patients = labels_df.groupby('Patient')['File'].count().rename('Examples').reset_index()
df_patients['Percentage'] = (df_patients['Examples'] / n_examples * 100).round(3)
df_patients = df_patients.sort_values('Percentage', ascending=False)

# Distribution of Classes per Patient: one row per (Patient, Label) pair,
# enriched with the patient's totals.
df_patient_classes = labels_df.groupby(['Patient', 'Label'])['File'].count().rename('Examples').reset_index()
df_patient_classes = pd.merge(
    df_patient_classes,
    df_patients.rename(columns={'Examples': 'Total Examples', 'Percentage': 'Patient Percentage'}),
    how='inner',
    on='Patient',
)

# 'Percentage' is the class share within the patient; 'Scaled Percentage' rescales
# it by the patient's share of the dataset, so each patient's rows add up to
# (approximately, after rounding) that patient's 'Patient Percentage'.
df_patient_classes['Percentage'] = (
    df_patient_classes['Examples'] / df_patient_classes['Total Examples'] * 100
).round(3)
df_patient_classes['Scaled Percentage'] = (
    df_patient_classes['Patient Percentage'] * (df_patient_classes['Percentage'] / 100)
).round(3)

# Get the test set patients: This was calibrated manually to be close to 25% for the Test Set
# (20% of the patients are sampled with a fixed seed; the printed value is the
# fraction of all examples that belongs to the sampled patients).
patients_test_set = df_patients.sample(frac=0.2, random_state=64)['Patient']
sel_examples = df_patients.loc[df_patients['Patient'].isin(patients_test_set), 'Examples'].sum()
tot_examples = df_patients['Examples'].sum()
print(f'% of Examples: {sel_examples/tot_examples: .3f}')

# Get the dev and test sets. The membership mask is built once from labels_df and
# applied to both frames.
# NOTE(review): this assumes features_df and labels_df list the same files in the
# same row order -- confirm against the data-prep notebook.
is_test_patient = labels_df['Patient'].isin(patients_test_set)
x_test = features_df[is_test_patient].drop('File', axis=1)
y_test = labels_df.loc[is_test_patient, 'Label'].to_frame('Labels')
x_dev = features_df[~is_test_patient].drop('File', axis=1)
y_dev = labels_df.loc[~is_test_patient, 'Label'].to_frame('Labels')

In [None]:
## Overall Class Distribution
# One bar per label showing its share of all examples (0-100 %).
fig, ax = plt.subplots()
ax.set_title('Class Distribution')
ax.set_xlabel('Label')
ax.set_ylabel('Percentage')
ax.set_ylim(0, 100)
bars = ax.bar(df_classes['Label'].astype('str'), df_classes['Percentage'], color=['#3C76B4', '#F28010'])
ax.bar_label(bars)
plt.show()

# Patient & Class Distribution
# Patients ordered by their share of the dataset (largest first). A stable sort on
# one row per patient gives a single, unambiguous order that is shared by the
# x-axis labels and by both bar segments, even when two patients have equal shares.
patient_order = (
    df_patient_classes
    .drop_duplicates('Patient')
    .sort_values('Patient Percentage', ascending=False, kind='mergesort')['Patient']
    .to_numpy()
)
patients = patient_order.astype(str)

# Patient x Label table of scaled percentages. A patient that has no examples of
# one class gets 0 for that class, so every segment array has one entry per
# patient and lines up with `patients`.
scaled_by_class = (
    df_patient_classes
    .pivot(index='Patient', columns='Label', values='Scaled Percentage')
    .reindex(index=patient_order, columns=[0, 1])
    .fillna(0)
)
weight_counts = {str(label): scaled_by_class[label].to_numpy() for label in (0, 1)}
width = 0.5

fig, ax = plt.subplots(figsize=(15, 6))
bottom = np.zeros(patients.shape[0])

# Stack the class segments: each class is drawn on top of the running total.
for label, weight_count in weight_counts.items():
    ax.bar(patients, weight_count, width, label=label, bottom=bottom)
    bottom += weight_count

ax.set_title("Class Distribution per Patient")
ax.set_xlabel('Patient')
ax.set_ylabel('Percentage')
ax.legend(loc="upper right")

fig.tight_layout()
plt.show()


 ##### Centralized Partitioning Analysis

In [None]:
# Load the centralized partition (features and labels for the dev and test splits).
# Note: this rebinds x_dev / y_dev / x_test / y_test, replacing the frames that
# were derived from the patient sample earlier in the notebook.
centralized_dir = '../../data/outputs/eeg-signals/data-prep/partitions/eeg-centralized'

x_dev = pd.read_csv(f'{centralized_dir}/eeg_x_dev.csv')
y_dev = pd.read_csv(f'{centralized_dir}/eeg_y_dev.csv')
x_test = pd.read_csv(f'{centralized_dir}/eeg_x_test.csv')
y_test = pd.read_csv(f'{centralized_dir}/eeg_y_test.csv')

In [None]:
# Split sizes (number of rows in each label frame).
n_dev = len(y_dev)
n_test = len(y_test)

# Share of each label within its split: label counts divided by the split size.
class_prop_dev = y_dev['Label'].value_counts().div(n_dev)
class_prop_test = y_test['Label'].value_counts().div(n_test)

print(f'Dev Split Size: {n_dev}')
print(f'Test Split Size: {n_test}')

# Print each distribution under its own heading, dev first and then test.
for heading, proportions in (
    ('\nDev Split Class Distribution', class_prop_dev),
    ('\nTest Split Class Distribution', class_prop_test),
):
    print(heading)
    print(proportions)

##### Centralized/Federated Patient-Aware Partitioning

In [None]:
def _summarize_class_split(patient_class_df, set_name):
    """Aggregate per-patient class counts into one row per label for a split.

    Parameters
    ----------
    patient_class_df : pd.DataFrame
        Rows of the per-patient class table; must provide 'Label' and 'Examples'.
    set_name : str
        Value written to the 'Set' column of every returned row.

    Returns
    -------
    pd.DataFrame
        Columns 'Label', 'Examples', 'Set', 'Percentage', where 'Percentage' is
        the label's share of the split's examples (0-100, rounded to 3 decimals).
    """
    summary = patient_class_df.groupby('Label')['Examples'].sum().reset_index()
    summary['Set'] = set_name
    summary['Percentage'] = (summary['Examples'] / summary['Examples'].sum() * 100).round(3)
    return summary


## Get a dataframe summarizing the dev-test split
# Patients selected in the random sampling (Check prep-eeg-signals.ipynb for details)
sel_patients = [13, 35, 12, 34, 10, 32, 37, 22]
is_test_patient = df_patient_classes['Patient'].isin(sel_patients)
test_patients_df = df_patient_classes[is_test_patient]
dev_patients_df = df_patient_classes[~is_test_patient]

## Get dataframe summarizing the class distribution in the dev-test splits
dev_class_prop_df = _summarize_class_split(dev_patients_df, 'Dev Set')
test_class_prop_df = _summarize_class_split(test_patients_df, 'Test Set')
split_prop_df = pd.concat([dev_class_prop_df, test_class_prop_df], ignore_index=True)

## Get dataframe summarizing fold class distribution
# Patients selected in the random sampling (Check prep-eeg-signals.ipynb for details)
patients_per_fold = {
    1: [5, 7, 28, 36, 27, 21, 25, 24, 26, 20],
    2: [14, 9, 3, 33, 2, 18, 38, 29, 31, 17],
    3: [6, 1, 8, 11, 16, 19, 15, 4, 23, 30],
}

# Collect one record per (fold, class) and build the frame once at the end,
# instead of concatenating onto a growing DataFrame inside the loop.
fold_records = []

for fold, fold_patients in patients_per_fold.items():
    fold_patients_df = dev_patients_df[dev_patients_df['Patient'].isin(fold_patients)]
    examples_per_class = fold_patients_df.groupby('Label')['Examples'].sum()

    # Class 0 first, then class 1. A class that is absent from the fold is
    # reported with 0 examples rather than failing the lookup.
    for label in (0, 1):
        fold_records.append({
            'fold': fold,
            'n_patients': len(fold_patients),
            'class': label,
            'n_examples': examples_per_class.get(label, 0),
        })

fold_prop_df = pd.DataFrame(fold_records, columns=['fold', 'n_patients', 'class', 'n_examples'])

# Persist both summaries; create the output folder first so a fresh checkout
# does not fail on the write.
path_output = '../../data/outputs/eeg-signals/data-prep/exploratory'
os.makedirs(path_output, exist_ok=True)
split_prop_df.to_csv(f'{path_output}/eeg_split_summary.csv', header=True, index=False)
fold_prop_df.to_csv(f'{path_output}/eeg_fold_summary.csv', header=True, index=False)

# Class proportions of the stored patient-aware train / validation / test label files.
data_dir = '../../data/outputs/eeg-signals/data-prep/partitions'

train_labels = pd.read_csv(f'{data_dir}/centralized-PatientAware/eeg_y_train.csv')['Label']
val_labels = pd.read_csv(f'{data_dir}/centralized-PatientAware/eeg_y_val.csv')['Label']
test_labels = pd.read_csv(f'{data_dir}/centralized-PatientAware/eeg_y_test.csv')['Label']

prop_class_train = train_labels.value_counts() / train_labels.shape[0]
prop_class_val = val_labels.value_counts() / val_labels.shape[0]
prop_class_test = test_labels.value_counts() / test_labels.shape[0]

#### Polyp Dataset EDA

In [None]:
# Exploration of centralized CNN splits
def _count_mask_pixels(masks_dir, mask_names):
    """Accumulate pixel counts over a set of mask images.

    Parameters
    ----------
    masks_dir : str
        Folder that contains the mask files.
    mask_names : list of str
        File names (relative to `masks_dir`) to include.

    Returns
    -------
    tuple of (int, int)
        Total number of pixels and number of zero-valued (negative) pixels,
        summed over all listed masks. Both are 0 for an empty list.

    Raises
    ------
    ValueError
        If a listed file cannot be decoded as an image.
    """
    total_px = 0
    neg_px = 0

    for mask_name in mask_names:
        mask_path = f'{masks_dir}/{mask_name}'
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising.
        if mask is None:
            raise ValueError(f'Could not read mask image: {mask_path}')

        mask_px = mask.shape[0] * mask.shape[1]
        total_px += mask_px
        # Negative pixels are the ones with value 0 in the grayscale mask.
        neg_px += mask_px - cv2.countNonZero(mask)

    return total_px, neg_px


data_dir = '../../data/inputs/kvasir'
stats = []

# Sorted so the rows of the output file come out in a reproducible order.
for data_split in sorted(os.listdir(data_dir)):
    masks_dir = f'{data_dir}/{data_split}/masks'
    # Skip stray entries (files, folders without masks) instead of crashing on them.
    if not os.path.isdir(masks_dir):
        continue

    imgs = os.listdir(masks_dir)

    if data_split in ['train', 'evaluate']:
        # In these splits, masks whose file stem is purely numeric are reported
        # as CVC-ClinicDB and all remaining masks as Kvasir.
        groups = [
            ('Kvasir', [img for img in imgs if not img.split('.')[0].isdigit()]),
            ('CVC-ClinicDB', [img for img in imgs if img.split('.')[0].isdigit()]),
        ]
    else:
        # Every other split is reported as a single CVC-ColonDB group.
        groups = [('CVC-ColonDB', imgs)]

    for dataset, dataset_imgs in groups:
        total_px, neg_px = _count_mask_pixels(masks_dir, dataset_imgs)
        stats.append({
            'split': data_split,
            'dataset': dataset,
            'images': len(dataset_imgs),
            'total_px': total_px,
            'neg_px': neg_px,
        })

stats_df = pd.DataFrame(stats)

# Create the output folder if needed before writing the summary.
output_dir = '../../data/outputs/eda'
os.makedirs(output_dir, exist_ok=True)
stats_df.to_csv(f'{output_dir}/stats_polyp_centralized.csv', header=True, index=False)

In [None]:
# Exploration of federated CNN splits
data_dir = '../../data/inputs/kvasir_federated'
stats = []

for config in os.listdir(data_dir):
    config_dir = f'{data_dir}/{config}'
    # Skip stray files that sit next to the configuration folders.
    if not os.path.isdir(config_dir):
        continue

    # Configuration id: the part of the folder name before the first '_'.
    fl_config = config.split('_')[0]

    for client in os.listdir(config_dir):
        client_dir = f'{config_dir}/{client}/masks'
        # Skip entries that do not hold a masks folder.
        if not os.path.isdir(client_dir):
            continue

        # Client id: the second '_'-separated token of the folder name.
        fl_client = client.split('_')[1]

        # List the folder once; the image count is always defined for this
        # client, even when the folder is empty.
        mask_names = os.listdir(client_dir)
        n_imgs = len(mask_names)

        accum_black_px = 0
        accum_white_px = 0

        for img in mask_names:
            img_path = f'{client_dir}/{img}'
            image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread signals failure by returning None instead of raising.
            if image is None:
                raise ValueError(f'Could not read mask image: {img_path}')

            total_pixels = image.shape[0] * image.shape[1]
            white_pixels = cv2.countNonZero(image)
            accum_black_px += total_pixels - white_pixels
            accum_white_px += white_pixels

        accum_px = accum_black_px + accum_white_px
        if accum_px > 0:
            prop_black = round(accum_black_px / accum_px, 2)
            prop_white = round(accum_white_px / accum_px, 2)
        else:
            # No pixels at all (empty folder): the proportions are undefined.
            prop_black = np.nan
            prop_white = np.nan

        stats.append({'fl_config': fl_config, 'fl_client': fl_client, 'prop_neg': prop_black, 'prop_pos': prop_white, 'n_imgs': n_imgs})

# Fix the column set so the sort below also works when no client was found.
stats_df = pd.DataFrame(stats, columns=['fl_config', 'fl_client', 'prop_neg', 'prop_pos', 'n_imgs'])
stats_df = stats_df.sort_values(['fl_config', 'fl_client'])

# Create the output folder if needed before writing the summary.
output_dir = '../../data/outputs/eda'
os.makedirs(output_dir, exist_ok=True)
stats_df.to_csv(f'{output_dir}/stats_polyp_fl.csv', header=True, index=False)