<a href="https://colab.research.google.com/github/bamf-health/aimi-prostate-mr/blob/colab-datasets/prostate_mr_qa_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation Datasets

Prepare additional datasets for validating the model.

## Prerequisites
1. Install the required packages
2. Import them

In [None]:
%%capture
%%shell
# Python packages this notebook depends on; the %%capture magic above keeps
# pip's output out of the notebook.
pip install SimpleITK tqdm ipywidgets pydicom pydicom-seg

In [None]:
#@title Enter your Project ID and authenticate with GCP
# initialize this variable with your Google Cloud Project ID!
# It is passed to bigquery.Client in the IDC query cell further down, so it
# must be filled in before that cell is run.
my_ProjectID = ''

import os
# Also publish the project ID through the GCP_PROJECT_ID environment variable.
os.environ["GCP_PROJECT_ID"] = my_ProjectID

from google.colab import auth
# Authenticate the current Colab user (against GCP, per the cell title).
auth.authenticate_user()

from pathlib import Path
from tempfile import TemporaryDirectory
import subprocess
import shutil
from tqdm.auto import tqdm
import SimpleITK as sitk
from google.colab import files
import pydicom
import pydicom_seg

## Medical Segmentation Decathlon

Task 5 of the [Medical Segmentation Decathlon](http://medicaldecathlon.com/) is to segment the prostate (peripheral and transition zones) from multimodal MR scans (T2 and ADC). We can use this dataset to evaluate our model.

1. Download and extract Task05_Prostate from the [Medical Segmentation Decathlon](http://medicaldecathlon.com/). You should have a folder structure of `MSD/Task05_Prostate/imagesTr` and `MSD/Task05_Prostate/labelsTr`.


In [None]:
%%shell
# gdown fetches the archive by its file ID (presumably a Google Drive ID);
# the tar step below expects it to be saved as Task05_Prostate.tar.
gdown 1Ff7c21UksxyT4JfETjaarmuKEjdqe1-a
mkdir -p MSD
# Extract into MSD/, giving MSD/Task05_Prostate/... as used by the next cells.
tar -xf Task05_Prostate.tar -C MSD

2. Extract just the T2 channel from the images

In [None]:
# Write a single-channel copy of every MSD training volume, keeping only
# index 0 of the last image axis (the T2 channel, per the step description).
in_dir = Path("MSD/Task05_Prostate/imagesTr")
out_dir = Path("MSD/Task05_Prostate/T2imagesTr")
out_dir.mkdir(exist_ok=True)
multichannel_paths = list(in_dir.glob("prostate_*.nii.gz"))
for img_path in tqdm(multichannel_paths, desc="Extracting T2 images"):
    img = sitk.ReadImage(str(img_path))
    img = img[..., 0]
    # Same file name as the source, but under T2imagesTr.
    target = str(out_dir / img_path.name)
    sitk.WriteImage(img, target, useCompression=True, compressionLevel=9)


3. Combine the labels into a single channel

In [None]:
# Collapse the MSD label maps to a single foreground class: every voxel with
# value 2 is rewritten to 1, all other values are left as they are.
in_dir = Path("MSD/Task05_Prostate/labelsTr")
out_dir = Path("MSD/Task05_Prostate/prostateLabelsTr")
out_dir.mkdir(exist_ok=True)
label_paths = list(in_dir.glob("prostate_*.nii.gz"))
label_remap = {2: 1}
for img_path in tqdm(label_paths, desc="Combining segments"):
    img = sitk.ChangeLabel(sitk.ReadImage(str(img_path)), label_remap)
    # Same file name as the source, but under prostateLabelsTr.
    target = str(out_dir / img_path.name)
    sitk.WriteImage(img, target, useCompression=True, compressionLevel=9)

In [None]:
%%shell
# Archive only the derived files (T2-only images and merged labels written by
# the two preceding cells), not the original MSD images/labels.
zip -r MSD.zip MSD/Task05_Prostate/T2imagesTr/prostate*.nii.gz MSD/Task05_Prostate/prostateLabelsTr/prostate*.nii.gz

Download processed files

In [None]:
# Hand the archive built by the previous cell to google.colab's files.download.
files.download('MSD.zip')

## PROMISE12
Download the [PROMISE12](https://promise12.grand-challenge.org/) dataset from Zenodo.

In [None]:
%%shell
# Download and unpack the three PROMISE12 archives into promise12/.
# The cell is safe to re-run: `wget -nc` skips an archive that is already on
# disk (instead of saving a second copy as *.zip.1), and `unzip -o` overwrites
# existing files instead of stopping at an interactive "replace?" prompt.
mkdir -p promise12
# live challenge test data
wget -nc https://zenodo.org/record/8026660/files/livechallenge_test_data.zip
unzip -o livechallenge_test_data.zip -d promise12/livechallenge_test_data

# test data
wget -nc https://zenodo.org/record/8026660/files/test_data.zip
unzip -o test_data.zip -d promise12/test_data

# training data
wget -nc https://zenodo.org/record/8026660/files/training_data.zip
unzip -o training_data.zip -d promise12/training_data

Convert the mhd/raw files to nifti

In [None]:
# Convert every PROMISE12 image/segmentation pair from MetaImage (.mhd) to
# compressed NIfTI, numbering the pairs in sorted order of the label paths.
promise_nii_images = Path("promise12/imagesTr")
promise_nii_labels = Path("promise12/labelsTr")
promise_nii_images.mkdir(exist_ok=True)
promise_nii_labels.mkdir(exist_ok=True)

# Only cases that ship a "*_segmentation.mhd" file are converted.
p_labals = sorted(Path("promise12").rglob("*_segmentation.mhd"))
for i, lbl_file in enumerate(tqdm(p_labals, desc="Converting images")):
    # Derive the image path from the file name only, so a directory component
    # containing "_segmentation" can never be rewritten by accident.
    img_file = lbl_file.with_name(lbl_file.name.replace("_segmentation", ""))
    if not img_file.exists():
        # Explicit error instead of a bare assert: it names the offending
        # files and is not stripped when Python runs with -O.
        raise FileNotFoundError(
            f"No image found for label {lbl_file} (expected {img_file})"
        )
    # Image and label share the same output name in their respective folders.
    new_img_file = str(promise_nii_images / f"promise12_{i:03d}.nii.gz")
    new_lbl_file = str(promise_nii_labels / f"promise12_{i:03d}.nii.gz")
    sitk.WriteImage(
        sitk.ReadImage(str(img_file)),
        new_img_file,
        useCompression=True,
        compressionLevel=9,
    )
    sitk.WriteImage(
        sitk.ReadImage(str(lbl_file)),
        new_lbl_file,
        useCompression=True,
        compressionLevel=9,
    )

Zip and download the processed files

In [None]:
%%shell
# Archive only the converted NIfTI folders, not the original PROMISE12 downloads.
zip -r promise12.zip promise12/imagesTr promise12/labelsTr

In [None]:
# Hand the archive built by the previous cell to google.colab's files.download.
files.download('promise12.zip')

## QIN-Prostate-Repeatability
The [QIN-Prostate-Repeatability](https://wiki.cancerimagingarchive.net/display/Public/QIN-PROSTATE-Repeatability) collection can be downloaded from the IDC


### Prerequisites
Install the `s5cmd` tool for efficient manifest downloads

In [None]:
%%shell
# Release asset name; the version also appears in the URL path below, so
# change both together when upgrading.
VERSION="s5cmd_2.2.2_Linux-64bit"
wget https://github.com/peak/s5cmd/releases/download/v2.2.2/${VERSION}.tar.gz
tar zxf ${VERSION}.tar.gz
# Move the extracted binary into /usr/bin so later cells can call `s5cmd` directly.
mv s5cmd /usr/bin

Install dcm2niix for DICOM conversion. Use the prebuilt version because it has JPEG support.

In [None]:
%%shell
# -f: fail on HTTP errors, -L: follow redirects, -O: keep the remote file name.
curl -fLO https://github.com/rordenlab/dcm2niix/releases/latest/download/dcm2niix_lnx.zip
# -o: overwrite without prompting so the cell can be re-run; extract straight
# into /usr/bin so later cells can call `dcm2niix` directly.
unzip -o dcm2niix_lnx.zip -d /usr/bin

Query IDC for QIN-Prostate-Repeatability data

In [None]:
# python API is the most flexible way to query IDC BigQuery metadata tables
from google.cloud import bigquery
bq_client = bigquery.Client(my_ProjectID)

# Each query yields one row per SeriesInstanceUID. The row is a ready-made
# s5cmd "cp" command that copies the series' files into
# <collection_id>/<PatientID>/<StudyInstanceUID>/<T2AxialSeg|T2Axial>.
seg_selection_query = """
SELECT
  ANY_VALUE(CONCAT("cp s3",REGEXP_SUBSTR(gcs_url, "(://.*)/"),"/* ",collection_id,"/",PatientID,"/",StudyInstanceUID,"/T2AxialSeg")) AS s5cmd_command
FROM
  `bigquery-public-data.idc_current.dicom_all`
WHERE
  collection_id = "qin_prostate_repeatability" AND SeriesDescription='T2 Weighted Axial Segmentations'
GROUP BY
  SeriesInstanceUID
"""

mr_selection_query = """
SELECT
  ANY_VALUE(CONCAT("cp s3",REGEXP_SUBSTR(gcs_url, "(://.*)/"),"/* ",collection_id,"/",PatientID,"/",StudyInstanceUID,"/T2Axial")) AS s5cmd_command
FROM
  `bigquery-public-data.idc_current.dicom_all`
WHERE
  collection_id = "qin_prostate_repeatability" AND SeriesDescription='T2 Weighted Axial'
GROUP BY
  SeriesInstanceUID
"""

# Run the segmentation query first, then the MR query, writing each result as
# a headerless, index-free text file: one s5cmd command per line.
for selection_query, manifest_path in (
    (seg_selection_query, "/content/s5cmd_gcp_manifest_seg.txt"),
    (mr_selection_query, "/content/s5cmd_gcp_manifest_mr.txt"),
):
    selection_result = bq_client.query(selection_query)
    selection_df = selection_result.result().to_dataframe()
    selection_df.to_csv(manifest_path, header=False, index=False)

Download the files listed in the manifests with the `s5cmd` tool.

In [None]:
%%shell
# Run both manifests from inside dcms/ unless at least one *.dcm file is
# already present there (find stops at the first match thanks to -quit).
if [ -z "$(find dcms -name '*.dcm' -print -quit)" ]; then
  mkdir -p dcms
  cd dcms
  # The manifests live one level up; segmentations first, then the MR series.
  for manifest in ../s5cmd_gcp_manifest_seg.txt ../s5cmd_gcp_manifest_mr.txt; do
    s5cmd --no-sign-request --endpoint-url https://storage.googleapis.com run "$manifest"
  done
  cd -
else
  echo "dicoms already downloaded"
fi

Functions to save label and mri as nifti from dicoms

In [None]:
def save_prostate_label(seg_file:Path, out_file:Path):
  """Write the 'Prostate' segment of a DICOM SEG file as a compressed image.

  The first segment whose SegmentLabel equals 'Prostate' is cast to uint8 and
  written to ``out_file``; the parent directory is created if needed.

  Raises:
    RuntimeError: if no segment in ``seg_file`` is labelled 'Prostate'.
  """
  dataset = pydicom.dcmread(seg_file)
  segmentation = pydicom_seg.SegmentReader().read(dataset)

  # Pick the first matching segment number, or None when there is none.
  prostate_segment = next(
    (
      number
      for number in segmentation.available_segments
      if segmentation.segment_infos[number].SegmentLabel == 'Prostate'
    ),
    None,
  )
  if prostate_segment is None:
    raise RuntimeError(f"Could not find 'Prostate' label for {seg_file}")

  mask = segmentation.segment_image(prostate_segment)
  out_file.parent.mkdir(parents=True, exist_ok=True)
  sitk.WriteImage(sitk.Cast(mask, sitk.sitkUInt8), str(out_file), useCompression=True, compressionLevel=9)

In [None]:
def convert_dcm_to_nii(in_series_dir: Path, out_file: Path) -> bool:
    """Convert one DICOM series directory into a single compressed NIfTI file.

    Runs ``dcm2niix -o <tmp> -z y <in_series_dir>`` into a temporary directory
    and moves the result to ``out_file`` (creating its parent directory).
    Outputs matching ``*Eq_*.nii.gz`` take precedence; only when there are
    none is the full set of ``*.nii.gz`` outputs considered.

    Returns:
        True when exactly one candidate file was found and moved. False when
        dcm2niix exits non-zero, or when the candidate set is ambiguous (more
        than one file) or empty. Failures are reported through the return
        value; nothing is raised for them.
    """
    with TemporaryDirectory() as scratch:
        command = ["dcm2niix", "-o", scratch, "-z", "y", str(in_series_dir.resolve())]
        if subprocess.run(command).returncode != 0:
            return False

        out_file.parent.mkdir(parents=True, exist_ok=True)

        # Patterns in priority order; the first one with any match decides.
        for pattern in ("*Eq_*.nii.gz", "*.nii.gz"):
            candidates = list(Path(scratch).glob(pattern))
            if len(candidates) > 1:
                # Ambiguous output: refuse to guess which file is wanted.
                return False
            if candidates:
                shutil.move(candidates[0], out_file)
                return True

        # dcm2niix succeeded but produced no NIfTI file.
        return False


Convert MRI and segmentation files to NIfTI

In [None]:
# Convert every QIN-Prostate-Repeatability study that has a T2AxialSeg folder
# into a matched NIfTI image/label pair under qin-prostate-repeatability/.
seg_dcm_dirs = sorted(Path('dcms/qin_prostate_repeatability').rglob('T2AxialSeg'))
save_dir = Path('qin-prostate-repeatability')
# MR series that dcm2niix could not turn into exactly one NIfTI file.
failed_conversions = []

for i, seg_dcm_dir in enumerate(tqdm(seg_dcm_dirs)):
  # The MR series is downloaded next to its segmentation, in the same study folder.
  mr_dcm_dir = seg_dcm_dir.parent / "T2Axial"
  if not any(mr_dcm_dir.glob("*.dcm")):
    raise FileNotFoundError(f'no dicom files found for {mr_dcm_dir}')

  seg_dcm_files = list(seg_dcm_dir.glob("*.dcm"))
  if len(seg_dcm_files) != 1:
    raise RuntimeError(
      f"expected exactly 1 segmentation dicom in {seg_dcm_dir}, found {len(seg_dcm_files)}"
    )
  seg_dcm_file = seg_dcm_files[0]

  mr_nii = save_dir / 'imagesTr' / f"qin-prostate_{i:03d}.nii.gz"
  label_nii = save_dir / 'labelsTr' / f"qin-prostate_{i:03d}.nii.gz"

  # convert to nifti
  # convert_dcm_to_nii reports failure through its return value; without this
  # check a label would be written with no matching image, silently leaving
  # imagesTr and labelsTr out of step.
  if not convert_dcm_to_nii(mr_dcm_dir, mr_nii):
    failed_conversions.append(mr_dcm_dir)
    continue
  save_prostate_label(seg_dcm_file, label_nii)

if failed_conversions:
  print(f"Skipped {len(failed_conversions)} series that dcm2niix could not convert:")
  for failed_dir in failed_conversions:
    print(f"  {failed_dir}")


In [None]:
%%shell
# Archive the whole output directory written by the conversion cell
# (its imagesTr and labelsTr folders).
zip -r qin-prostate-repeatability.zip qin-prostate-repeatability

In [None]:
# Hand the archive built by the previous cell to google.colab's files.download.
files.download('qin-prostate-repeatability.zip')