# Create EMBED splits

In [None]:
import pandas as pd
from pathlib import Path
import sys

sys.path.append("PATH TO ROOT")
from data_handling.mammo import domain_maps, modelname_map, tissue_maps, EMBED_ROOT, PROJECT_ROOT

## Load original csv file

In [None]:
def get_embed_csv():
    """
    Load the EMBED metadata csv and derive the label columns used downstream.

    Reads ``joined_simple.csv`` from the directory containing this file (or
    from the current working directory when ``__file__`` is not defined, as
    is the case in a notebook kernel) and adds / rewrites these columns:

    - ``shortimgpath``: the original ``image_path`` value from the csv.
    - ``image_path``: ``EMBED_ROOT / "images/png/1024x768" / <original path>``.
    - ``manufacturer_domain``: ``Manufacturer`` looked up in ``domain_maps``.
    - ``tissueden``: ``tissueden`` looked up in ``tissue_maps``.
    - ``SimpleModelLabel``: ``ManufacturerModelName`` looked up in
      ``modelname_map``.
    - ``ViewLabel``: 0 when ``ViewPosition`` is ``"MLO"``, otherwise 1.

    The value counts of ``SimpleModelLabel`` are printed as a sanity check.

    Returns:
        The DataFrame with rows dropped where any of ``age_at_study``,
        ``tissueden``, ``SimpleModelLabel``, ``ViewLabel`` or ``image_path``
        is missing.

    Raises:
        FileNotFoundError: if ``joined_simple.csv`` has not been generated yet.
        KeyError: if a value is absent from the corresponding lookup map.
    """
    image_dir = EMBED_ROOT / Path("images/png/1024x768")

    # `__file__` only exists when this code runs as a module/script; inside a
    # notebook it raises NameError, so fall back to the working directory.
    try:
        csv_dir = Path(__file__).parent
    except NameError:
        csv_dir = Path.cwd()
    csv_path = csv_dir / "joined_simple.csv"

    try:
        mydf = pd.read_csv(csv_path)
    except FileNotFoundError as err:
        # Fail loudly here: continuing without `mydf` would only produce a
        # confusing UnboundLocalError a few lines further down.
        raise FileNotFoundError(
            f"{csv_path} not found. "
            "For running EMBED code you need to first generate the csv "
            "file used for this study in "
            "csv_generation_code/generate_embed_csv.ipynb"
        ) from err

    # Keep the relative path before turning image_path into a full path.
    mydf["shortimgpath"] = mydf["image_path"]
    mydf["image_path"] = mydf["image_path"].apply(lambda x: image_dir / str(x))

    mydf["manufacturer_domain"] = mydf.Manufacturer.apply(lambda x: domain_maps[x])

    # convert tissueden to trainable label
    mydf["tissueden"] = mydf.tissueden.apply(lambda x: tissue_maps[x])

    mydf["SimpleModelLabel"] = mydf.ManufacturerModelName.apply(
        lambda x: modelname_map[x]
    )
    print(mydf.SimpleModelLabel.value_counts())
    # Binary view label: MLO -> 0, every other view position -> 1.
    mydf["ViewLabel"] = mydf.ViewPosition.apply(lambda x: 0 if x == "MLO" else 1)

    mydf = mydf.dropna(
        subset=[
            "age_at_study",
            "tissueden",
            "SimpleModelLabel",
            "ViewLabel",
            "image_path",
        ]
    )
    return mydf

In [None]:
# Load the EMBED metadata together with the derived label columns.
df = get_embed_csv()

In [None]:
# Relative frequency of each tissue density label over all loaded rows.
df["tissueden"].value_counts(normalize=True)

## Create the splits

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Restrict the study to 2D images.
df = df.loc[df.FinalImageType == "2D"]

# One tissue density label per patient (the first one encountered), used to
# stratify the patient-level split. Ids and labels are taken from the SAME
# grouped Series so they are aligned by construction. `sort=False` keeps the
# patients in order of first appearance, i.e. the same order as
# `df.empi_anon.unique()`; a default (sorted) groupby would only line up with
# that order if the csv happened to be sorted by patient id.
# NOTE(review): if the csv is not sorted by `empi_anon`, this changes which
# patients land in each split compared with earlier runs - confirm before
# regenerating published splits.
patient_density = (
    df.groupby("empi_anon", sort=False)["tissueden"].unique().apply(lambda x: x[0])
)
patient_ids = patient_density.index.values
y = patient_density.values
print(np.bincount(y) / np.bincount(y).sum())
# 60% of patients go to training; the remaining 40% are split into
# validation and test further down.
train_id, val_id = train_test_split(
    patient_ids, test_size=0.4, random_state=33, stratify=y
)


# Rows of every patient held out from training; shared by validation and test.
val_test_df = df.loc[df["empi_anon"].isin(val_id)]

# Keep only one study per patient: the first accession number seen for them.
study_lists = val_test_df.groupby("empi_anon")["acc_anon"].unique()
studies = study_lists.apply(lambda accs: accs[0]).values

# For evaluation, discard every study that does not have exactly the expected
# 4 distinct images (L/R x MLO/CC). Per the original author these are studies
# with failed or otherwise unexpected images; removing them keeps the
# distribution of the validation set and the un-shifted test set the same,
# which could otherwise falsify the results.
# The count is taken over all of `df`, not only the held-out patients.
images_per_study = df.groupby("acc_anon")["shortimgpath"].unique().apply(len)
weird = images_per_study.index[(images_per_study != 4).values]

# Apply both filters at once: selected study AND not flagged as weird.
keep = val_test_df["acc_anon"].isin(studies) & ~val_test_df["acc_anon"].isin(weird)
val_test_df = val_test_df.loc[keep]

# Scanner model (rows) vs. tissue density (columns) counts after filtering.
pd.crosstab(val_test_df["SimpleModelLabel"], val_test_df["tissueden"])

In [None]:
# One stratification label per study: the first `combined_var` value of each
# accession number. Ids and labels both come from `tmp`, so they stay aligned.
# NOTE(review): `combined_var` is not created anywhere in the visible code -
# presumably it is already a column of the csv; confirm.
tmp = val_test_df.groupby("acc_anon")["combined_var"].unique()
ids = tmp.index
y = tmp.apply(lambda labels: labels[0]).values

# train_test_split returns (remainder, held-out part): the 1200 held-out
# studies become the validation set, everything else is the test set.
test_id, val_id = train_test_split(
    ids,
    test_size=1200,
    random_state=33,
    stratify=y,
)
print(
    f"N patients train: {train_id.shape[0]}, val: {val_id.shape[0]}, test {test_id.shape[0]}"
)  # noqa

In [None]:
# Training rows are selected by patient id; validation and test rows are
# selected by study (accession number) from the filtered held-out pool.
train_df = df.loc[df.empi_anon.isin(train_id)]
val_df = val_test_df.loc[val_test_df.acc_anon.isin(val_id)]
# Explicit copy: a column is added below, and assigning onto a selection of
# another DataFrame triggers pandas' SettingWithCopyWarning.
test_df = val_test_df.loc[val_test_df.acc_anon.isin(test_id)].copy()
# Stable positional index of every row within the full test set.
test_df["idx_in_original_test"] = np.arange(len(test_df))

In [None]:
# Tissue density distribution per scanner model in the test split;
# normalize="index" makes every row sum to 1.
pd.crosstab(test_df["SimpleModelLabel"], test_df["tissueden"], normalize="index")

In [None]:
# Tissue density distribution per scanner model in the validation split;
# normalize="index" makes every row sum to 1.
pd.crosstab(val_df["SimpleModelLabel"], val_df["tissueden"], normalize="index")

In [None]:
# Write the training split; the DataFrame index is written as the first
# column (the to_csv default).
train_df.to_csv(PROJECT_ROOT / "data/train_embed.csv")

In [None]:
# Write the validation split; the DataFrame index is written as the first
# column (the to_csv default).
val_df.to_csv(PROJECT_ROOT / "data/val_embed.csv")

In [None]:
# Write the test split; the DataFrame index is written as the first
# column (the to_csv default).
test_df.to_csv(PROJECT_ROOT / "data/test_embed.csv")