* ISIC Challenge datasets

1. Import the libs

In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict

2. Statistics of the datasets

All the images were resized to 256 x 256 pixels with `data_processing/preprocess.py`, both for uniformity and to reduce the domain shift.

ISIC 2020 - The dataset we will use for federated learning

The dataset has only 2 labels for the images: malignant and benign.

In [3]:
# Load the ISIC 2020 ground-truth table.
df = pd.read_csv("labels/ISIC_2020_Training_GroundTruth.csv")

# Headline numbers: table size, distinct patients and the label balance.
label_column = df['benign_malignant']
num_images = df.shape[0]
num_patients = df['patient_id'].nunique()
class_counts = label_column.value_counts()
num_classes = label_column.nunique()

print(f"Total images: {num_images}")
print(f"Total patients: {num_patients}")
print(f"Classes: {num_classes}")
print("Class distribution:\n", class_counts)

# Metadata overview. value_counts() leaves out missing entries by default, so
# the per-column totals below can be smaller than the number of rows.
age_summary = df['age_approx'].describe()
sex_counts = df['sex'].value_counts()
site_counts = df['anatom_site_general_challenge'].value_counts()

print("\nAge statistics:")
print(age_summary)
print("\nSex distribution:\n", sex_counts)
print("\nAnatomical site distribution:\n", site_counts)

Total images: 33126
Total patients: 2056
Classes: 2
Class distribution:
 benign_malignant
benign       32542
malignant      584
Name: count, dtype: int64

Age statistics:
count    33058.000000
mean        48.870016
std         14.380360
min          0.000000
25%         40.000000
50%         50.000000
75%         60.000000
max         90.000000
Name: age_approx, dtype: float64

Sex distribution:
 sex
male      17080
female    15981
Name: count, dtype: int64

Anatomical site distribution:
 anatom_site_general_challenge
torso              16845
lower extremity     8417
upper extremity     4983
head/neck           1855
palms/soles          375
oral/genital         124
Name: count, dtype: int64


3. Split the data for Federated Learning

We need to simulate real life, where hospitals have different types of patients (different ages, different social backgrounds, etc.). We decided to split the data into 4 clients, each of which receives a different data split. All the data for a particular patient stays inside the same client (hospital), because people tend to go to the hospital they are used to.

Each hospital has a different "story" to try to mimic the real world:

1) Hospital A - Big city hospital with most of the patients (45% of the total number of patients). The remaining patients with missing metadata are also assigned to this hospital, because recording mistakes are more likely at bigger hospitals with many patients.
2) Hospital B - Hospital focused more on younger people (all the patients younger than 40 years, max 20% of the total number)
3) Hospital C - Hospital focused on the upper body. It contains all the patients that come to take images of lesions of "head/neck", "oral/genital", "upper extremity"
4) Hospital D - Hospital with the rest. It can be a regular dermatological hospital in a city.

In [39]:
def split_patients_federated(csv_path, seed=42):
    """Split an ISIC-style ground-truth CSV into four simulated hospitals.

    The split is done per patient, so all rows of one ``patient_id`` end up in
    the same hospital. Hospitals are filled in this order:

    1. Hospital B (youth): patients whose mean ``age_approx`` is below 40,
       randomly capped at 20% of all patients.
    2. Hospital C (upper body): of the patients not in B, those whose most
       frequent ``anatom_site_general_challenge`` is "head/neck",
       "oral/genital" or "upper extremity", randomly capped at 15% of all
       patients.
    3. Hospital A (urban): a random draw of 45% of all patients, taken from
       the patients not in B or C.
    4. Hospital D: every patient not placed above.

    NOTE(review): the notebook text says patients with missing metadata are
    assigned to Hospital A. This function does not route them specially: a
    patient without any age never passes the ``< 40`` filter and a patient
    without any site never matches the upper-body list, so such patients are
    shared between A and D by the random draw. Confirm which is intended.

    Parameters
    ----------
    csv_path : str
        Path to a CSV with the columns ``patient_id``, ``age_approx``,
        ``anatom_site_general_challenge`` and ``benign_malignant``.
    seed : int, default 42
        Seed used for every random draw (and for the global NumPy RNG).

    Returns
    -------
    dict[str, pandas.DataFrame]
        Maps "Hospital_A_Urban", "Hospital_B_Youth", "Hospital_C_Upper_Body"
        and "Hospital_D_Rural" to the rows of the CSV that belong to that
        hospital. A hospital may be empty.

    Side effects
    ------------
    Seeds the global NumPy RNG and prints a per-hospital summary.
    """
    # The draws below are all driven by random_state=seed; the global seeding
    # is kept so that code relying on it after this call behaves as before.
    np.random.seed(seed)
    df = pd.read_csv(csv_path)

    def _most_common_site(sites):
        # Most frequent site of one patient; "unknown" when every value is missing.
        modes = sites.mode()
        return modes.iloc[0] if not modes.empty else "unknown"

    # One row per patient with the attributes the routing rules need.
    patients = df.groupby("patient_id").agg({
        "age_approx": "mean",
        "anatom_site_general_challenge": _most_common_site,
    }).reset_index()

    num_patients = len(patients)
    hospital_a_size = int(0.45 * num_patients)
    hospital_b_size = int(0.20 * num_patients)
    hospital_c_size = int(0.15 * num_patients)

    # Hospital B -> young patients (<40), capped at hospital_b_size
    young = patients[patients["age_approx"] < 40]
    if len(young) > hospital_b_size:
        young = young.sample(hospital_b_size, random_state=seed)
    hospital_b_patients = set(young["patient_id"])

    # Hospital C -> upper-body focus, capped at hospital_c_size
    available_patients = patients[~patients["patient_id"].isin(hospital_b_patients)]
    upper_body_sites = ["head/neck", "oral/genital", "upper extremity"]
    upper_body_patients = available_patients[
        available_patients["anatom_site_general_challenge"].isin(upper_body_sites)
    ]
    if len(upper_body_patients) > hospital_c_size:
        upper_body_patients = upper_body_patients.sample(hospital_c_size, random_state=seed)
    hospital_c_patients = set(upper_body_patients["patient_id"])

    # Hospital A -> big city hospital, random draw from whoever is left
    available_patients = patients[~patients["patient_id"].isin(hospital_b_patients | hospital_c_patients)]
    hospital_a_patients = set(
        available_patients.sample(hospital_a_size, random_state=seed)["patient_id"]
    )

    # Hospital D -> rest
    hospital_d_patients = set(patients["patient_id"]) - (hospital_a_patients | hospital_b_patients | hospital_c_patients)

    hospitals = {
        "Hospital_A_Urban": df[df["patient_id"].isin(hospital_a_patients)],
        "Hospital_B_Youth": df[df["patient_id"].isin(hospital_b_patients)],
        "Hospital_C_Upper_Body": df[df["patient_id"].isin(hospital_c_patients)],
        "Hospital_D_Rural": df[df["patient_id"].isin(hospital_d_patients)]
    }

    # --- Print summaries ---
    print("\n=== Hospital Summary ===")
    for name, hdf in hospitals.items():
        counts = hdf["benign_malignant"].value_counts()
        mean_age = hdf["age_approx"].mean()
        # A hospital can be empty (e.g. a cap that rounds down to 0), in which
        # case mode() has no rows and indexing it would raise IndexError.
        site_modes = hdf["anatom_site_general_challenge"].mode()
        top_site = site_modes.iloc[0] if not site_modes.empty else "unknown"
        print(f"\n{name}:")
        print(f" Patients: {hdf['patient_id'].nunique():5d} | Images: {len(hdf):5d}")
        print(f" Avg. age: {mean_age:.1f} | Dominant body part: {top_site}")
        print(f" Benign: {counts.get('benign',0):5d} | Malignant: {counts.get('malignant',0):4d}")

    return hospitals

In [40]:
# Build the four hospital splits from the ISIC 2020 ground truth.
# The seed is fixed so the split is repeatable.
seed = 42
csv_path = "labels/ISIC_2020_Training_GroundTruth.csv"
client_splits = split_patients_federated(csv_path, seed=seed)


=== Hospital Summary ===

Hospital_A_Urban:
 Patients:   925 | Images: 14505
 Avg. age: 54.8 | Dominant body part: torso
 Benign: 14212 | Malignant:  293

Hospital_B_Youth:
 Patients:   411 | Images:  8141
 Avg. age: 31.1 | Dominant body part: torso
 Benign:  8074 | Malignant:   67

Hospital_C_Upper_Body:
 Patients:   163 | Images:  1429
 Avg. age: 58.2 | Dominant body part: upper extremity
 Benign:  1367 | Malignant:   62

Hospital_D_Rural:
 Patients:   557 | Images:  9051
 Avg. age: 54.0 | Dominant body part: torso
 Benign:  8889 | Malignant:  162


NameError: name 'hospital_a_patients' is not defined