# Prepare PadChest split csv

In [1]:
import pandas as pd
from datetime import datetime

path_to_root = "/vol/biomedic3/bglocker/mscproj/zm1224/msc_individual_project/"
import sys

sys.path.append(path_to_root)
from default_paths import PADCHEST_ROOT, ROOT

## Load original csv file

In [2]:
df = pd.read_csv(PADCHEST_ROOT / "PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv")
df = df.loc[df.Pediatric == "No"]
df = df.loc[df.Projection.isin(["PA"])]


def process(x, target):
    if isinstance(x, str):
        list_labels = x[1:-1].split(",")
        list_labels = [label.replace("'", "").strip() for label in list_labels]
        return target in list_labels
    else:
        return False


for label in [
    "pneumonia",
    "exclude",
    "suboptimal study",
]:
    df[label] = df.Labels.astype(str).apply(lambda x: process(x, label))
    print(df[label].value_counts())
df = df.loc[~df.exclude]
df = df.loc[~df["suboptimal study"]]
df["Manufacturer"] = df.Manufacturer_DICOM.apply(
    lambda x: "Phillips" if x == "PhilipsMedicalSystems" else "Imaging"
)
df = df.loc[df["PatientSex_DICOM"].isin(["M", "F"])]
df["PatientAge"] = (
    df.StudyDate_DICOM.apply(lambda x: datetime.strptime(str(x), "%Y%M%d").year)
    - df.PatientBirth
)
invalid_filenames = [
    "216840111366964013829543166512013353113303615_02-092-190.png",
    "216840111366964013962490064942014134093945580_01-178-104.png",
    "216840111366964012989926673512011151082430686_00-157-045.png",
    "216840111366964012558082906712009327122220177_00-102-064.png",
    "216840111366964012959786098432011033083840143_00-176-115.png",
    "216840111366964012373310883942009152114636712_00-102-045.png",
    "216840111366964012487858717522009280135853083_00-075-001.png",
    "216840111366964012819207061112010307142602253_04-014-084.png",
    "216840111366964012989926673512011074122523403_00-163-058.png",
    "216840111366964013590140476722013058110301622_02-056-111.png",
    "216840111366964012339356563862009072111404053_00-043-192.png",
    "216840111366964013590140476722013043111952381_02-065-198.png",
    "216840111366964012819207061112010281134410801_00-129-131.png",
    "216840111366964013686042548532013208193054515_02-026-007.png",
    "216840111366964012989926673512011083134050913_00-168-009.png",
    "216840111366964012373310883942009170084120009_00-097-074.png",
]
df = df.loc[~df.ImageID.isin(invalid_filenames)]

pneumonia
False    88389
True      3286
Name: count, dtype: int64
exclude
False    90862
True       813
Name: count, dtype: int64
suboptimal study
False    91024
True       651
Name: count, dtype: int64


In [3]:
df.pneumonia.value_counts(normalize=True)

pneumonia
False    0.963863
True     0.036137
Name: proportion, dtype: float64

In [4]:
(
    df["PatientSex_DICOM"].value_counts(normalize=True),
    df["PatientSex_DICOM"].value_counts(),
)

(PatientSex_DICOM
 F    0.513478
 M    0.486522
 Name: proportion, dtype: float64,
 PatientSex_DICOM
 F    46308
 M    43877
 Name: count, dtype: int64)

In [5]:
df["Manufacturer"].value_counts(normalize=True), df["Manufacturer"].value_counts()

(Manufacturer
 Imaging     0.599002
 Phillips    0.400998
 Name: proportion, dtype: float64,
 Manufacturer
 Imaging     54021
 Phillips    36164
 Name: count, dtype: int64)

## Prepare and save splits

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np

random_seed_for_splits = 33

indices_train_val, indices_test = train_test_split(
    df.PatientID.unique(),
    test_size=0.2,
    random_state=random_seed_for_splits,
    stratify=df.groupby("PatientID").pneumonia.max(),
)

train_val_df = df.loc[df.PatientID.isin(indices_train_val)]
test_df = df.loc[df.PatientID.isin(indices_test)]

# Further split train and val
indices_train, indices_val = train_test_split(
    train_val_df.PatientID.unique(),
    test_size=0.2,
    random_state=random_seed_for_splits,
    stratify=train_val_df.groupby("PatientID").pneumonia.max(),
)

train_df = train_val_df.loc[train_val_df.PatientID.isin(indices_train)]
val_df = train_val_df.loc[train_val_df.PatientID.isin(indices_val)]

In [7]:
(
    train_df.pneumonia.value_counts(normalize=True),
    train_df.pneumonia.value_counts(normalize=False),
)

(pneumonia
 False    0.963784
 True     0.036216
 Name: proportion, dtype: float64,
 pneumonia
 False    55460
 True      2084
 Name: count, dtype: int64)

In [8]:
(
    val_df.pneumonia.value_counts(normalize=True),
    val_df.pneumonia.value_counts(normalize=False),
)

(pneumonia
 False    0.963728
 True     0.036272
 Name: proportion, dtype: float64,
 pneumonia
 False    13949
 True       525
 Name: count, dtype: int64)

In [9]:
(
    test_df.pneumonia.value_counts(normalize=True),
    test_df.pneumonia.value_counts(normalize=False),
)

(pneumonia
 False    0.964221
 True     0.035779
 Name: proportion, dtype: float64,
 pneumonia
 False    17517
 True       650
 Name: count, dtype: int64)

In [10]:
(
    train_df["PatientSex_DICOM"].value_counts(normalize=True),
    val_df["PatientSex_DICOM"].value_counts(normalize=True),
    test_df["PatientSex_DICOM"].value_counts(normalize=True),
)

(PatientSex_DICOM
 F    0.513277
 M    0.486723
 Name: proportion, dtype: float64,
 PatientSex_DICOM
 F    0.511538
 M    0.488462
 Name: proportion, dtype: float64,
 PatientSex_DICOM
 F    0.51566
 M    0.48434
 Name: proportion, dtype: float64)

In [11]:
train_df.to_csv(ROOT / "experiments" / "train_padchest.csv")

In [12]:
val_df.to_csv(ROOT / "experiments" / "val_padchest.csv")

In [13]:
test_df["idx_in_original_test"] = np.arange(len(test_df))
test_df.to_csv(ROOT / "experiments" / "test_padchest.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["idx_in_original_test"] = np.arange(len(test_df))


## Create example of shifted dataset for the motivating example (Appendix A)

In [14]:
imaging_test = test_df.loc[test_df["Manufacturer"] == "Imaging"]
phillips_test = test_df.loc[test_df["Manufacturer"] == "Phillips"]

In [15]:
# Sample with 80% of phillips
n_phillips = len(phillips_test)
n_imaging = int(0.2 * n_phillips / 0.8)
n_phillips, n_imaging

(7172, 1793)

In [16]:
(
    phillips_test.pneumonia.value_counts(normalize=True),
    imaging_test.pneumonia.value_counts(normalize=True),
)

(pneumonia
 False    0.938232
 True     0.061768
 Name: proportion, dtype: float64,
 pneumonia
 False    0.981173
 True     0.018827
 Name: proportion, dtype: float64)

In [17]:
phillips_test_positive = phillips_test.loc[phillips_test.pneumonia]
phillips_test_negative = phillips_test.loc[~phillips_test.pneumonia]
test_prev = 0.0384
n_phillips_positive = int(test_prev * len(phillips_test_negative) / (1 - test_prev))
sub_sampled_phillips = phillips_test_positive.sample(
    axis=0, replace=False, n=n_phillips_positive, random_state=33
)
phillips_prev_adjusted = pd.concat([phillips_test_negative, sub_sampled_phillips])
phillips_prev_adjusted.pneumonia.value_counts(normalize=True)

pneumonia
False    0.961698
True     0.038302
Name: proportion, dtype: float64

In [18]:
# Switch to 90% Phillips

n_imaging = 0.1 * (len(phillips_prev_adjusted) / 0.9)
n_imaging_positive = int(test_prev * n_imaging)
n_imaging_negative = int((1 - test_prev) * n_imaging)

imaging_negative = imaging_test.loc[~imaging_test.pneumonia]
imaging_positive = imaging_test.loc[imaging_test.pneumonia]

sub_sampled_imaging_pos = imaging_positive.sample(
    axis=0, replace=False, n=n_imaging_positive, random_state=33
)
sub_sampled_imaging_neg = imaging_negative.sample(
    axis=0, replace=False, n=n_imaging_negative, random_state=33
)

subsampling_imaging = pd.concat([sub_sampled_imaging_pos, sub_sampled_imaging_neg])
subsampling_imaging.pneumonia.value_counts(normalize=True)

pneumonia
False    0.962629
True     0.037371
Name: proportion, dtype: float64

In [19]:
manufacturer_shift_test = pd.concat([phillips_prev_adjusted, subsampling_imaging])
(
    manufacturer_shift_test.pneumonia.value_counts(normalize=True),
    manufacturer_shift_test["Manufacturer"].value_counts(normalize=True),
)

(pneumonia
 False    0.961791
 True     0.038209
 Name: proportion, dtype: float64,
 Manufacturer
 Phillips    0.900167
 Imaging     0.099833
 Name: proportion, dtype: float64)

In [20]:
manufacturer_shift_test.to_csv(ROOT / "experiments" / "padchest_manufacturer_shift.csv")

In [21]:
test_prev = 0.20
orig_prev_phillips = 0.42

len(phillips_test_positive), len(imaging_positive)
n_positive_imaging = len(imaging_positive)
n_positive_phillips = int(
    orig_prev_phillips * n_positive_imaging / (1 - orig_prev_phillips)
)
n_negative_phillips = int((1 - test_prev) * n_positive_phillips / test_prev)
n_negative_imaging = int((1 - test_prev) * n_positive_imaging / test_prev)

sub_sampled_phillips_pos = phillips_test_positive.sample(
    axis=0, replace=False, n=n_positive_phillips, random_state=33
)
sub_sampled_phillips_neg = phillips_test_negative.sample(
    axis=0, replace=False, n=n_negative_phillips, random_state=33
)
sub_sampled_imaging_pos = imaging_positive.sample(
    axis=0, replace=False, n=n_positive_imaging, random_state=33
)
sub_sampled_imaging_neg = imaging_negative.sample(
    axis=0, replace=False, n=n_negative_imaging, random_state=33
)

prevalence_shift = pd.concat(
    [
        sub_sampled_imaging_pos,
        sub_sampled_imaging_neg,
        sub_sampled_phillips_pos,
        sub_sampled_phillips_neg,
    ]
)
(
    prevalence_shift.Manufacturer.value_counts(normalize=True),
    prevalence_shift.pneumonia.value_counts(normalize=True),
)

(Manufacturer
 Imaging     0.581461
 Phillips    0.418539
 Name: proportion, dtype: float64,
 pneumonia
 False    0.8
 True     0.2
 Name: proportion, dtype: float64)

In [22]:
prevalence_shift.to_csv(ROOT / "experiments" / "padchest_prev_shift.csv")

In [23]:
test_prev = 0.20
target_prev_phillips = 0.90

len(phillips_test_positive), len(imaging_positive)
n_positive_phillips = len(phillips_test_positive)
n_positive_imaging = int(
    (1 - target_prev_phillips) * n_positive_phillips / target_prev_phillips
)
n_negative_phillips = int((1 - test_prev) * n_positive_phillips / test_prev)
n_negative_imaging = int((1 - test_prev) * n_positive_imaging / test_prev)

sub_sampled_phillips_pos = phillips_test_positive.sample(
    axis=0, replace=False, n=n_positive_phillips, random_state=33
)
sub_sampled_phillips_neg = phillips_test_negative.sample(
    axis=0, replace=False, n=n_negative_phillips, random_state=33
)
sub_sampled_imaging_pos = imaging_positive.sample(
    axis=0, replace=False, n=n_positive_imaging, random_state=33
)
sub_sampled_imaging_neg = imaging_negative.sample(
    axis=0, replace=False, n=n_negative_imaging, random_state=33
)

manufacturer_prevalence_shift = pd.concat(
    [
        sub_sampled_imaging_pos,
        sub_sampled_imaging_neg,
        sub_sampled_phillips_pos,
        sub_sampled_phillips_neg,
    ]
)
(
    manufacturer_prevalence_shift.Manufacturer.value_counts(normalize=True),
    manufacturer_prevalence_shift.pneumonia.value_counts(normalize=True),
)

(Manufacturer
 Phillips    0.900407
 Imaging     0.099593
 Name: proportion, dtype: float64,
 pneumonia
 False    0.8
 True     0.2
 Name: proportion, dtype: float64)

In [24]:
manufacturer_prevalence_shift.to_csv(
    ROOT / "experiments" / "padchest_manufactuer_prev_shift.csv"
)