# Create EMBED splits
This notebook takes care of creating the train/val/test splits csv used throughout this project.

In [1]:
import pandas as pd
from pathlib import Path
import sys

project_root = Path().resolve().parent.parent
sys.path.append(str(project_root))

from data_handling.mammo import domain_maps, modelname_map, tissue_maps
from default_paths import EMBED_ROOT
from sklearn.model_selection import train_test_split
import numpy as np

## Create main EMBED csv 
These cells take care of merging the oroginal metadata and clinical csv, remove invalid views, convert density to numerical scale etc. 

In [2]:
full_dicom = pd.read_csv(
    EMBED_ROOT / "tables/EMBED_OpenData_metadata.csv", low_memory=False
)[
    [
        "InstanceNumber",
        "anon_dicom_path",
        "PixelSpacing",
        "ImagerPixelSpacing",
        "Rows",
        "Columns",
    ]
]


dicom = pd.read_csv(
    EMBED_ROOT / "tables/EMBED_OpenData_metadata_reduced.csv", low_memory=False
)
print(len(dicom))
dicom = dicom.merge(full_dicom, on="anon_dicom_path")
print(len(dicom))
dicom["image_path"] = (
    dicom["empi_anon"].astype("str")
    + "/"
    + dicom["anon_dicom_path"].str.split("/").str[-1].str.split(".dcm").str[0]
    + ".png"
)

480323
480323


In [3]:
# XCCL shouldn't be converted to CC so manually editing it
dicom.loc[
    (dicom["SeriesDescription"] == "RXCCL") | (dicom["SeriesDescription"] == "LXCCL"),
    "ViewPosition",
] = "XCCL"

# Getting all rows with "ViewPosition" == Nan (but for which SeriesDescription is also not nan, as these are the ones subject to the data entry error)
view_nan = dicom.loc[(dicom.ViewPosition.isna()) & (~dicom.SeriesDescription.isna())]

# Drop these rows from
dicom_no_nans = dicom[~dicom.index.isin(view_nan.index)]

view_nan["ViewPosition"] = view_nan["SeriesDescription"].apply(
    lambda x: "CC" if "CC" in x else ("MLO" if "MLO" in x else None)
)

dicom = pd.concat([dicom_no_nans, view_nan], axis=0, ignore_index=True)

print(len(dicom))
# Remove any duplicated images
dicom = dicom.drop_duplicates(subset="anon_dicom_path")
# Remove spot compressed and magnified images
dicom = dicom[dicom.spot_mag.isna()]
# Remove invalid views
dicom = dicom[dicom.ViewPosition.isin(["CC", "MLO"])]
# Remove images from male clients
dicom = dicom[dicom.PatientSex == "F"]
print(len(dicom))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  view_nan["ViewPosition"] = view_nan["SeriesDescription"].apply(


480323
420457


In [4]:
# Remove any unnecessary fields from the DICOM imagewise dataframe (this may need to be updated in the future if other fields are deemed relevant)
dicom = dicom[
    [
        "empi_anon",
        "acc_anon",
        "image_path",
        "FinalImageType",
        "ImageLateralityFinal",
        "ViewPosition",
        "Manufacturer",
        "ManufacturerModelName",
    ]
]

In [5]:
# Conversion dictionary to standardised naming of various fields in clincial metadata

# Human reader BIRADS density assessment
dens_conversion = {1.0: "A", 2.0: "B", 3.0: "C", 4.0: "D"}

# Load in the clinical metadata
mag = pd.read_csv(EMBED_ROOT / "tables/EMBED_OpenData_clinical.csv", low_memory=False)
print(len(mag))
# Remove cases from cases a valid BIRADS density assessment
mag = mag[mag.tissueden.isin([1.0, 2.0, 3.0, 4.0])]
mag.replace({"tissueden": dens_conversion}, inplace=True)


# Keep important study metadata tags to join up with final aggregated dataframe at end of script
mag = mag[["empi_anon", "tissueden", "study_date_anon", "acc_anon"]].drop_duplicates(
    subset="acc_anon"
)
print(len(mag))

# Convert to pandas datetime object
mag["study_date_anon"] = pd.to_datetime(mag["study_date_anon"], errors="coerce")

81776
72188


In [6]:
dicom.Manufacturer.value_counts()

Manufacturer
HOLOGIC, Inc.           386257
GE MEDICAL SYSTEMS       25569
FUJIFILM Corporation      8133
GE HEALTHCARE              498
Name: count, dtype: int64

In [7]:
# Only consider studies which have a valid link between the DICOM and clinical metadata
print(len(dicom))
df = mag.merge(dicom, on=["acc_anon", "empi_anon"])
print(len(df))

420457
418784


In [8]:
path_to_repo_root = "/vol/biomedic3/bglocker/mscproj/zm1224/msc_individual_project/"
df.to_csv(Path(path_to_repo_root) / "data_handling" / "embed_full.csv", index=False)

## Create the splits

In [9]:
image_dir = EMBED_ROOT / Path("images/png/1024x768")

try:
    df = pd.read_csv(Path(path_to_repo_root) / "data_handling" / "embed_full.csv")
except FileNotFoundError:
    print(
        """
        For running EMBED code you need to first generate the csv
        file used for this study by running the cells above
        """
    )

df["shortimgpath"] = df["image_path"]
df["image_path"] = df["image_path"].apply(lambda x: image_dir / str(x))

df["manufacturer_domain"] = df.Manufacturer.apply(lambda x: domain_maps[x])

# convert tissueden to trainable label
df["tissueden"] = df.tissueden.apply(lambda x: tissue_maps[x])

df["SimpleModelLabel"] = df.ManufacturerModelName.apply(lambda x: modelname_map[x])
print(df.SimpleModelLabel.value_counts())
df["ViewLabel"] = df.ViewPosition.apply(lambda x: 0 if x == "MLO" else 1)


df = df.dropna(
    subset=[
        "tissueden",
        "SimpleModelLabel",
        "ViewLabel",
        "image_path",
    ]
)

df["tissueden"].value_counts(normalize=True)

SimpleModelLabel
0    374157
2     13268
5     12218
3     10515
4      8128
1       498
Name: count, dtype: int64


tissueden
1    0.422151
2    0.411339
0    0.111991
3    0.054520
Name: proportion, dtype: float64

In [10]:
df = df.loc[df.FinalImageType == "2D"]


y = df.groupby("empi_anon")["tissueden"].unique().apply(lambda x: x[0]).values
print(np.bincount(y) / np.bincount(y).sum())
train_id, val_id = train_test_split(
    df.empi_anon.unique(), test_size=0.4, random_state=33, stratify=y
)


val_test_df = df.loc[df["empi_anon"].isin(val_id)]
# Keep only one study by patient
studies = (
    val_test_df.groupby("empi_anon")["acc_anon"].unique().apply(lambda x: x[0]).values
)
# For testing filter out all studies for which there is more than the expected 4 images (L/R, MLO/CC).
# These are the studies with failed images, images with unexpected stuff. To make sure that the
# distribution of val and un-shifted test are the same. Otherwise it might falsily the results.
weird = (
    df.groupby("acc_anon")["acc_anon"]
    .unique()
    .index[
        np.where(
            df.groupby("acc_anon")["shortimgpath"]
            .unique()
            .apply(lambda x: len(x) != 4)
            .values
        )[0]
    ]
)
val_test_df = val_test_df.loc[val_test_df["acc_anon"].isin(studies)]
val_test_df = val_test_df.loc[~val_test_df["acc_anon"].isin(weird)]

pd.crosstab(val_test_df["SimpleModelLabel"], val_test_df["tissueden"])

[0.09846894 0.41614173 0.42699038 0.05839895]


tissueden,0,1,2,3
SimpleModelLabel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1136,5080,6332,1112
1,0,16,8,16
2,24,620,768,60
3,108,300,436,28
4,100,612,584,44
5,92,660,512,36


In [11]:
val_test_df["combined_var"] = val_test_df["SimpleModelLabel"] + 10 * val_test_df["tissueden"]
val_test_df["combined_var"].value_counts()

combined_var
20    6332
10    5080
0     1136
30    1112
22     768
15     660
12     620
14     612
24     584
25     512
23     436
13     300
3      108
4      100
5       92
32      60
34      44
35      36
33      28
2       24
31      16
11      16
21       8
Name: count, dtype: int64

In [12]:
tmp = val_test_df.groupby("acc_anon")["combined_var"].unique()
ids, y = tmp.index, tmp.apply(lambda x: x[0]).values
test_id, val_id = train_test_split(ids, test_size=1200, random_state=33, stratify=y)
print(
    f"N patients train: {train_id.shape[0]}, val: {val_id.shape[0]}, test {test_id.shape[0]}"
)  # noqa

N patients train: 13716, val: 1200, test 3471


In [13]:
train_df = df.loc[df.empi_anon.isin(train_id)]
val_df = val_test_df.loc[val_test_df.acc_anon.isin(val_id)]
test_df = val_test_df.loc[val_test_df.acc_anon.isin(test_id)]
test_df["idx_in_original_test"] = np.arange(len(test_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["idx_in_original_test"] = np.arange(len(test_df))


In [14]:
pd.crosstab(test_df["SimpleModelLabel"], test_df["tissueden"], normalize="index")

tissueden,0,1,2,3
SimpleModelLabel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.083136,0.371946,0.463357,0.08156
1,0.0,0.428571,0.142857,0.428571
2,0.014652,0.421245,0.52381,0.040293
3,0.123457,0.345679,0.5,0.030864
4,0.076305,0.457831,0.433735,0.032129
5,0.070248,0.508264,0.392562,0.028926


In [15]:
pd.crosstab(val_df["SimpleModelLabel"], val_df["tissueden"], normalize="index")

tissueden,0,1,2,3
SimpleModelLabel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.083238,0.371722,0.464082,0.080958
1,0.0,0.333333,0.333333,0.333333
2,0.021053,0.421053,0.515789,0.042105
3,0.125,0.339286,0.5,0.035714
4,0.069767,0.453488,0.44186,0.034884
5,0.072289,0.506024,0.39759,0.024096


In [16]:
train_df.to_csv("/vol/biomedic3/bglocker/mscproj/zm1224/msc_individual_project/experiments/train_embed.csv")

In [17]:
val_df.to_csv("/vol/biomedic3/bglocker/mscproj/zm1224/msc_individual_project/experiments/train_embed.csv")

In [18]:
test_df.to_csv("/vol/biomedic3/bglocker/mscproj/zm1224/msc_individual_project/experiments/train_embed.csv")