In [11]:
import pandas as pd
from pathlib import Path

In [12]:
# Location of the experiment's log directory, written relative to the
# notebook's working directory and then resolved to an absolute path.
logs_dir_relative = Path("../../experiments/tumorseg_20250616020201/logs")
dataset_logs_path = logs_dir_relative.resolve()

In [13]:
# Load every CSV written to the experiment's log directory: the full metadata
# table plus one file per split / training sub-split.
metadata_df = pd.read_csv(dataset_logs_path.joinpath("metadata.csv"))
train_df = pd.read_csv(dataset_logs_path.joinpath("train.csv"))
train_hospital_df = pd.read_csv(dataset_logs_path.joinpath("train_hospital.csv"))
train_hpa_df = pd.read_csv(dataset_logs_path.joinpath("train_hpa.csv"))
val_df = pd.read_csv(dataset_logs_path.joinpath("val.csv"))
test_df = pd.read_csv(dataset_logs_path.joinpath("test.csv"))

In [14]:
# Stack the two training sub-splits so they can be compared with train.csv.
combined_df = pd.concat([train_hospital_df, train_hpa_df], ignore_index=True)

# --- Hospital sub-split: row count, number of slides, slide names ---
hospital_slide_names = train_hospital_df["slide_name"].unique()
print("Hospital:", len(train_hospital_df))
print("Hospital Slide Count:", len(hospital_slide_names))
print("Hospital Slides:", hospital_slide_names, "\n")

# --- HPA sub-split: row count, number of slides, slide names ---
hpa_slide_names = train_hpa_df["slide_name"].unique()
print("HPA:", len(train_hpa_df))
print("HPA Slide Count:", len(hpa_slide_names))
print("HPA Slides:", hpa_slide_names, "\n")

# --- Concatenation of both sub-splits ---
combined_slide_names = combined_df["slide_name"].unique()
print("Hospital + HPA:", len(combined_df))
print("Hospital + HPA Slide Count:", len(combined_slide_names), "\n")

# --- train.csv as written to disk ---
train_slide_names = train_df["slide_name"].unique()
print("Train:", len(train_df))
print("Train Slide Count:", len(train_slide_names), "\n")

# --- Rows that metadata.csv labels as belonging to the train split ---
is_train_row = metadata_df["split"] == "train"
metadata_train_df = metadata_df[is_train_row]
metadata_train_slide_names = metadata_train_df["slide_name"].unique()
print("Metadata Train:", len(metadata_train_df))
print("Metadata Train Slide Count:", len(metadata_train_slide_names), "\n")

# Sort both frames on all of their columns and reset the index so that
# DataFrame.equals compares content independently of the original row order.
train_sort_keys = list(train_df.columns)
train_df_sorted = train_df.sort_values(by=train_sort_keys).reset_index(drop=True)

combined_sort_keys = list(combined_df.columns)
combined_df_sorted = combined_df.sort_values(by=combined_sort_keys).reset_index(
    drop=True
)

is_equal = train_df_sorted.equals(combined_df_sorted)
print("Train equals Hospital + HPA:", is_equal)

Hospital: 7465
Hospital Slide Count: 7
Hospital Slides: ['B-17557-23-ER' 'B-17557-23-KI67' 'B-18125-23-ER' 'B-19844-23-ER'
 'B-20002-23-ER' 'B-20788-23-ER' 'B-25194-23-ER'] 

HPA: 3061
HPA Slide Count: 150
HPA Slides: ['414_A_4_1' '414_A_4_2' '414_A_4_3' '414_A_4_5' '414_A_4_6' '414_A_4_7'
 '414_A_4_8' '414_A_5_1' '414_A_5_2' '414_A_5_4' '414_A_5_5' '414_A_5_6'
 '414_A_5_7' '414_A_5_8' '414_A_6_3' '414_A_6_4' '414_A_6_8' '1898_A_4_1'
 '1898_A_4_3' '1898_A_4_5' '1898_A_4_6' '1898_A_4_7' '1898_A_4_8'
 '1898_A_5_1' '1898_A_5_2' '1898_A_5_3' '1898_A_5_5' '1898_A_5_6'
 '1898_A_5_8' '1898_A_6_3' '1898_A_6_4' '1898_A_6_5' '1898_A_6_6'
 '1941_A_4_1' '1941_A_4_2' '1941_A_4_3' '1941_A_4_4' '1941_A_4_5'
 '1941_A_4_6' '1941_A_4_7' '1941_A_4_8' '1941_A_5_1' '1941_A_5_2'
 '1941_A_5_3' '1941_A_5_4' '1941_A_5_5' '1941_A_5_6' '1941_A_5_7'
 '1941_A_6_1' '1941_A_6_2' '1941_A_6_3' '1941_A_6_4' '1941_A_6_5'
 '1941_A_6_7' '1974_A_4_1' '1974_A_4_2' '1974_A_4_3' '1974_A_4_4'
 '1974_A_4_5' '1974_A_4_6' '1974_A

In [15]:
# Validation split: row count, number of slides, slide names.
val_slide_names = val_df["slide_name"].unique()
print("Validation:", len(val_df))
print("Validation Slide Count:", len(val_slide_names))
print("Validation Slides:", val_slide_names, "\n")

# drop_duplicates(keep=False) removes every row that occurs more than once in
# the stacked frame, leaving only rows that appear exactly once.
# NOTE(review): a non-empty result only shows the two splits are not identical;
# it does not prove that they share no rows.
train_val_rows = pd.concat([train_df, val_df])
diff_train_val = train_val_rows.drop_duplicates(keep=False)
print("Rows in Train not in Validation (or vice versa):", not diff_train_val.empty)

Validation: 821
Validation Slide Count: 2
Validation Slides: ['B-20000-23-2-ER' 'B-25081-23-PR312'] 

Rows in Train not in Validation (or vice versa): True


In [16]:
# Test split: row count, number of slides, slide names.
test_slide_names = test_df["slide_name"].unique()
print("Test:", len(test_df))
print("Test Slide Count:", len(test_slide_names))
print("Test Slides:", test_slide_names, "\n")

# Keep only the rows that occur exactly once across train and test stacked
# together (keep=False discards every member of a duplicated group).
# NOTE(review): a non-empty result only shows the two splits are not identical;
# it does not prove that they share no rows.
train_test_rows = pd.concat([train_df, test_df])
diff_train_test = train_test_rows.drop_duplicates(keep=False)
print("Rows in Train not in Test (or vice versa):", not diff_train_test.empty)

Test: 1363
Test Slide Count: 2
Test Slides: ['B-18132-23-ER' 'B-23157-23-ER'] 

Rows in Train not in Test (or vice versa): True


In [17]:
# Keep only the rows that occur exactly once across validation and test
# stacked together (keep=False discards every member of a duplicated group).
# NOTE(review): a non-empty result only shows the two splits are not identical;
# it does not prove that they share no rows.
val_test_rows = pd.concat([val_df, test_df])
diff_val_test = val_test_rows.drop_duplicates(keep=False)
print("Rows in Validation not in Test (or vice versa):", not diff_val_test.empty)

Rows in Validation not in Test (or vice versa): True


In [18]:
# Rows that metadata.csv assigns to the test split.
metadata_test_df = metadata_df[metadata_df["split"] == "test"]

# Summarise the metadata-derived subset itself. The previous version printed
# statistics of test_df here, so the "Metadata Test" figures merely repeated
# the test.csv summary and could never reveal a mismatch between the two.
metadata_test_slide_names = metadata_test_df["slide_name"].unique()
print("Metadata Test:", len(metadata_test_df))
print("Metadata Test Slide Count:", len(metadata_test_slide_names))
print("Metadata Test Slides:", metadata_test_slide_names, "\n")

Metadata Test: 1363
Metadata Test Slide Count: 2
Metadata Test Slides: ['B-18132-23-ER' 'B-23157-23-ER'] 



---

In [19]:
# Display the column labels of the metadata table.
metadata_df.columns

Index(['slide_name', 'parent_dir_path', 'relative_image_path',
       'relative_mask_path', 'tumor_frac', 'category', 'tile_count', 'split',
       'tumor_bin'],
      dtype='object')

In [20]:
# Flag every row whose image path has already appeared in an earlier row
# (Series.duplicated marks all occurrences except the first).
is_repeated_image = metadata_df["relative_image_path"].duplicated()
duplicates = metadata_df[is_repeated_image]
print(f"Number of duplicate images in metadata: {len(duplicates)}")

Number of duplicate images in metadata: 0


In [21]:
# File stem (name without directory or extension) of every image.
image_names = metadata_df["relative_image_path"].apply(lambda x: Path(x).stem)

# File stem of every mask with the trailing "_label" marker removed.
# str.removesuffix (Python 3.9+) drops the exact suffix only. The previous
# str.strip("_label") treated its argument as a set of characters and removed
# any of '_', 'l', 'a', 'b', 'e' from BOTH ends of the stem, which corrupts
# names that start or end with one of those characters.
mask_names = metadata_df["relative_mask_path"].apply(
    lambda x: Path(x).stem.removesuffix("_label")
)

# True when every row's image stem matches its mask stem.
(image_names == mask_names).all()

np.True_

In [22]:
# True when no two entries of image_names are equal.
image_names.is_unique

True

In [23]:
# Collect the distinct image paths of each split.
train_images, val_images, test_images = (
    set(split_df["relative_image_path"]) for split_df in (train_df, val_df, test_df)
)

# Image paths that appear in more than one split.
overlap_train_val = train_images.intersection(val_images)
overlap_train_test = train_images.intersection(test_images)
overlap_val_test = val_images.intersection(test_images)

print("Train-Val overlap:", len(overlap_train_val))
print("Train-Test overlap:", len(overlap_train_test))
print("Val-Test overlap:", len(overlap_val_test))

Train-Val overlap: 0
Train-Test overlap: 0
Val-Test overlap: 0


In [24]:
# Collect the distinct slide names of each split. The previous version built
# these sets from "relative_image_path", which only repeated the image-level
# overlap check and never compared the splits at slide level.
train_slides = set(train_df["slide_name"])
val_slides = set(val_df["slide_name"])
test_slides = set(test_df["slide_name"])

# Slides that contribute rows to more than one split.
overlap_train_val = train_slides & val_slides
overlap_train_test = train_slides & test_slides
overlap_val_test = val_slides & test_slides

print("Train-Val overlap:", len(overlap_train_val))
print("Train-Test overlap:", len(overlap_train_test))
print("Val-Test overlap:", len(overlap_val_test))

Train-Val overlap: 0
Train-Test overlap: 0
Val-Test overlap: 0
