# Prepare PadChest split CSVs

In [None]:
import pandas as pd
from datetime import datetime
from pathlib import Path
# Placeholder: replace with the project root directory. The train/val/test
# CSVs produced by this notebook are written under PROJECT_ROOT / "data".
PROJECT_ROOT = Path("path to root")

## Load the original CSV file and filter it

In [None]:
# Read the PadChest label sheet, then keep only the rows whose `Pediatric`
# value is "No" and whose `Projection` is "PA".
padchest_csv = (
    "PATH-TO-BIMCV-PADCHEST/PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv"
)
df = pd.read_csv(padchest_csv)
not_pediatric = df.Pediatric == "No"
pa_projection = df.Projection.isin(["PA"])
df = df.loc[not_pediatric & pa_projection]


def process(x, target):
    """Return whether ``target`` is one of the labels encoded in ``x``.

    ``x`` is treated as the string form of a list, e.g.
    ``"['pneumonia', 'normal']"``: the first and last characters are dropped,
    the remainder is split on commas, and every piece has its single quotes
    removed and surrounding whitespace stripped before the membership test.

    Any non-string ``x`` yields ``False``.
    """
    if not isinstance(x, str):
        return False
    pieces = x[1:-1].split(",")
    return target in [piece.replace("'", "").strip() for piece in pieces]


# Add one boolean column per label of interest (True when the row's `Labels`
# entry contains that label) and print its value counts. Rows flagged
# "exclude" or "suboptimal study" are then dropped.
labels_as_text = df.Labels.astype(str)
for label in ("pneumonia", "exclude", "suboptimal study"):
    df[label] = labels_as_text.apply(lambda x: process(x, label))
    print(df[label].value_counts())
df = df.loc[~(df["exclude"] | df["suboptimal study"])]
# Collapse the DICOM manufacturer into two groups: "Phillips" for
# PhilipsMedicalSystems and "Imaging" for every other value.
df["Manufacturer"] = df.Manufacturer_DICOM.apply(
    lambda x: "Phillips" if x == "PhilipsMedicalSystems" else "Imaging"
)
# Keep only rows whose recorded sex is "M" or "F".
df = df.loc[df["PatientSex_DICOM"].isin(["M", "F"])]
# PatientAge = year of the study minus PatientBirth (presumably the birth
# year -- confirm against the dataset description).
# StudyDate_DICOM is parsed as YYYYMMDD; the month directive is "%m"
# ("%M" is the minutes directive and only happened to yield the right year).
df["PatientAge"] = (
    df.StudyDate_DICOM.apply(lambda x: datetime.strptime(str(x), "%Y%m%d").year)
    - df.PatientBirth
)
# Images to leave out of every split, matched against the `ImageID` column.
# NOTE(review): the reason these files are considered invalid is not recorded
# here -- document it if known.
invalid_filenames = [
    "216840111366964013829543166512013353113303615_02-092-190.png",
    "216840111366964013962490064942014134093945580_01-178-104.png",
    "216840111366964012989926673512011151082430686_00-157-045.png",
    "216840111366964012558082906712009327122220177_00-102-064.png",
    "216840111366964012959786098432011033083840143_00-176-115.png",
    "216840111366964012373310883942009152114636712_00-102-045.png",
    "216840111366964012487858717522009280135853083_00-075-001.png",
    "216840111366964012819207061112010307142602253_04-014-084.png",
    "216840111366964012989926673512011074122523403_00-163-058.png",
    "216840111366964013590140476722013058110301622_02-056-111.png",
    "216840111366964012339356563862009072111404053_00-043-192.png",
    "216840111366964013590140476722013043111952381_02-065-198.png",
    "216840111366964012819207061112010281134410801_00-129-131.png",
    "216840111366964013686042548532013208193054515_02-026-007.png",
    "216840111366964012989926673512011083134050913_00-168-009.png",
    "216840111366964012373310883942009170084120009_00-097-074.png",
]
is_invalid_image = df.ImageID.isin(invalid_filenames)
df = df.loc[~is_invalid_image]

In [None]:
# Proportion of rows with and without the pneumonia label after filtering.
df["pneumonia"].value_counts(normalize=True)

In [None]:
# Sex distribution after filtering: proportions first, then raw counts.
sex_column = df["PatientSex_DICOM"]
(sex_column.value_counts(normalize=True), sex_column.value_counts())

In [None]:
# Manufacturer group distribution: proportions first, then raw counts.
manufacturer_column = df["Manufacturer"]
(
    manufacturer_column.value_counts(normalize=True),
    manufacturer_column.value_counts(),
)

## Prepare and save splits

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np

random_seed_for_splits = 33

# Splits are made on patient IDs, so every row of a given patient lands in
# the same subset. A patient's stratification label is the maximum of the
# `pneumonia` column over their rows (True if any row is labelled pneumonia).
#
# The IDs and their labels are both taken from the same grouped Series so
# they line up element by element. `groupby` orders its result by sorted key
# whereas `unique()` follows order of first appearance, so pairing those two
# would stratify each patient on some other patient's label.
# NOTE(review): this alignment fix changes which patients fall into which
# split compared with CSVs generated before it, even with the same seed.
patient_pneumonia = df.groupby("PatientID").pneumonia.max()
indices_train_val, indices_test = train_test_split(
    patient_pneumonia.index.to_numpy(),
    test_size=0.2,
    random_state=random_seed_for_splits,
    stratify=patient_pneumonia.to_numpy(),
)

train_val_df = df.loc[df.PatientID.isin(indices_train_val)]
test_df = df.loc[df.PatientID.isin(indices_test)]

# Further split train and val: 20% of the remaining patients go to val,
# again with IDs and labels drawn from one aligned Series.
train_val_patient_pneumonia = train_val_df.groupby("PatientID").pneumonia.max()
indices_train, indices_val = train_test_split(
    train_val_patient_pneumonia.index.to_numpy(),
    test_size=0.2,
    random_state=random_seed_for_splits,
    stratify=train_val_patient_pneumonia.to_numpy(),
)

train_df = train_val_df.loc[train_val_df.PatientID.isin(indices_train)]
val_df = train_val_df.loc[train_val_df.PatientID.isin(indices_val)]

In [None]:
# Pneumonia label balance in the training split: proportions, then counts.
train_pneumonia = train_df["pneumonia"]
(train_pneumonia.value_counts(normalize=True), train_pneumonia.value_counts())

In [None]:
# Pneumonia label balance in the validation split: proportions, then counts.
val_pneumonia = val_df["pneumonia"]
(val_pneumonia.value_counts(normalize=True), val_pneumonia.value_counts())

In [None]:
# Pneumonia label balance in the test split: proportions, then counts.
test_pneumonia = test_df["pneumonia"]
(test_pneumonia.value_counts(normalize=True), test_pneumonia.value_counts())

In [None]:
# Sex proportions in the train, val and test splits, in that order.
tuple(
    split_df["PatientSex_DICOM"].value_counts(normalize=True)
    for split_df in (train_df, val_df, test_df)
)

In [None]:
# Write the training split under the project's data directory.
train_csv_path = PROJECT_ROOT / "data/train_padchest.csv"
train_df.to_csv(train_csv_path)

In [None]:
# Write the validation split under the project's data directory.
val_csv_path = PROJECT_ROOT / "data/val_padchest.csv"
val_df.to_csv(val_csv_path)

In [None]:
# Store each test row's 0-based position within the test split, then write
# the split under the project's data directory.
# `assign` builds a new frame instead of setting a column on a filtered slice
# of `df`, which can trigger pandas' SettingWithCopyWarning; `test_df` is
# rebound so it still carries the new column afterwards.
test_df = test_df.assign(idx_in_original_test=np.arange(len(test_df)))
test_df.to_csv(PROJECT_ROOT / "data/test_padchest.csv")