# Prepare the RSNA Pneumonia splits CSV files

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from pathlib import Path
import sys

sys.path.append("/vol/biomedic3/mb121/shift_identification")

from default_paths import (
    ROOT,
    PATH_NIH_TO_RSNA_MAPPING,
    NIH_METADATA_CSV,
    DATA_DIR_RSNA,
)
from rsna_utils import create_mapping_dataset_nih

## Part I: Generate the main CSV (merging RSNA labels with metadata from the NIH dataset)

In [None]:
# Inputs for the merge: the NIH -> RSNA mapping file, the NIH metadata CSV and
# the label file that sits inside the RSNA data directory.
kaggle_dataset_path = DATA_DIR_RSNA / "stage_2_train_labels.csv"
mapping_file, nih_metadata = PATH_NIH_TO_RSNA_MAPPING, NIH_METADATA_CSV

# Build the combined table with the project helper.
dataset = create_mapping_dataset_nih(mapping_file, nih_metadata, kaggle_dataset_path)

# Persist the merged table (without the DataFrame index) so that Part II can
# reload it from disk.
merged_csv_path = ROOT / "data_handling" / "pneumonia_dataset_with_metadata.csv"
dataset.to_csv(merged_csv_path, index=False)

## Part II: generate splits

In [None]:
# A single fixed seed is shared by both splitting stages so the splits are
# reproducible.
random_seed_for_splits = 33
df = pd.read_csv(ROOT / "data_handling" / "pneumonia_dataset_with_metadata.csv")

# Stage 1: hold out 30% of the rows (by position) as the test set.
all_positions = np.arange(len(df))
indices_train_val, indices_test = train_test_split(
    all_positions, test_size=0.3, random_state=random_seed_for_splits
)
train_val_df, test_df = df.iloc[indices_train_val], df.iloc[indices_test]

# Stage 2: of the remaining rows, 20% become validation and the rest training.
# Positions here are relative to train_val_df, not to the full df.
train_val_positions = np.arange(len(train_val_df))
indices_train, indices_val = train_test_split(
    train_val_positions, test_size=0.2, random_state=random_seed_for_splits
)
train_df, val_df = train_val_df.iloc[indices_train], train_val_df.iloc[indices_val]

In [None]:
# Write the three splits to the experiments folder. The DataFrame index is
# written as well (to_csv default), exactly as before.
train_df.to_csv(ROOT / "experiments" / "train_rsna.csv")
val_df.to_csv(ROOT / "experiments" / "val_rsna.csv")

# Record each test row's position within the test split.
# test_df was obtained by slicing another DataFrame with .iloc, so assigning a
# column to it in place triggers pandas' SettingWithCopyWarning. `assign`
# returns a new frame with the extra column instead, leaving the written CSV
# unchanged.
test_df = test_df.assign(idx_in_original_test=np.arange(len(test_df)))
test_df.to_csv(ROOT / "experiments" / "test_rsna.csv")