In [11]:
from tempfile import TemporaryDirectory
from pathlib import Path
import shutil
import subprocess
from tqdm.auto import tqdm
from textwrap import dedent

%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [12]:
# Working directories: raw DICOM downloads and converted NIfTI volumes both
# live under <task_dir>/data. Path() with no arguments is the current directory.
task_dir = Path()
data_root = task_dir / "data"
dcm_dir, nii_dir = data_root / "dcm", data_root / "nii"
for out_dir in (dcm_dir, nii_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

In [13]:
%%bigquery series_desc
# List the distinct series descriptions of all CT series in the hcc_tace_seg
# and colorectal_liver_metastases collections.
SELECT DISTINCT(seriesDescription)
FROM `bigquery-public-data.idc_current.dicom_all`
WHERE collection_id IN ('hcc_tace_seg', 'colorectal_liver_metastases')
  AND Modality = "CT"

Query is running:   0%|          |

Downloading:   0%|          |

In [14]:
%%bigquery selection_df
# Build one row per CT series in the hcc_tace_seg and colorectal_liver_metastases
# collections, together with the s5cmd command that downloads that series.
SELECT
  collection_id,
  PatientID,
  StudyInstanceUID,
  SeriesInstanceUID,
  StudyDate,
  Modality,
  StudyDescription,
  SeriesDescription,
  # s5cmd "cp" command for the series: the source is gcs_url with its scheme
  # replaced by "s3" and its last path component replaced by "/*"; the
  # destination is collection_id/PatientID/StudyInstanceUID/SeriesInstanceUID,
  # so files are organized on the fly while downloading.
  ANY_VALUE(CONCAT("cp s3",
      REGEXP_SUBSTR(gcs_url, "(://.*)/"),
      "/* ",collection_id,"/",PatientID,"/",
      StudyInstanceUID,"/",SeriesInstanceUID)) AS s5cmd_command,
  # Number of instances (files) in the series; compared later against the
  # number of files found on disk.
  COUNT(sopInstanceUID) AS instance_count
FROM
  `bigquery-public-data.idc_current.dicom_all`
WHERE
  collection_id IN ('hcc_tace_seg', 'colorectal_liver_metastases')
  AND Modality = "CT"
GROUP BY
  collection_id,
  PatientID,
  StudyInstanceUID,
  SeriesInstanceUID,
  StudyDate,
  Modality,
  StudyDescription,
  SeriesDescription

Query is running:   0%|          |

Downloading:   0%|          |

In [15]:
# Sort the selection by collection / patient / study date / study / series and
# save it as both a pickle and a CSV (same base name); useful for further data
# curation.
# sort_values returns a new frame that is rebound to the same name, so the
# cell stays idempotent without relying on in-place mutation.
selection_df = selection_df.sort_values(
    by=[
        "collection_id",
        "PatientID",
        "StudyDate",
        "StudyInstanceUID",
        "SeriesInstanceUID",
    ],
)
pkl_file = task_dir / "data" / "scan_data.pkl"
selection_df.to_pickle(pkl_file)
selection_df.to_csv(pkl_file.with_suffix(".csv"), index=False)

Download the DICOM files listed in the manifest with s5cmd

In [16]:
# Count the *.dcm files already present in each series' download directory.
# The last whitespace-separated token of the s5cmd command is the series'
# destination directory, relative to dcm_dir. A directory that does not exist
# yet simply yields a count of 0.
# Assigning the whole column at once creates an integer "downloaded_cnt"
# column even when selection_df is empty.
selection_df["downloaded_cnt"] = [
    sum(1 for _ in (dcm_dir / s5cmd_command.split()[-1]).glob("*.dcm"))
    for s5cmd_command in selection_df["s5cmd_command"]
]

In [17]:
# Write the s5cmd manifests.
# A series counts as missing when the number of *.dcm files on disk differs
# from the instance count reported by the query.
missing_df = selection_df[
    selection_df["instance_count"] != selection_df["downloaded_cnt"]
]
if len(missing_df) == 0:
    print("all files already downloaded")
else:
    # Manifest restricted to the series that still need downloading.
    missing_df["s5cmd_command"].to_csv(
        "s5cmd_gcs_missing_manifest.s5cmd", header=False, index=False
    )
    # The line continuation right after the opening quotes keeps the first
    # line of the text indented like the others; without it dedent() finds no
    # common margin and leaves the commands indented.
    print(
        dedent(
            f"""\
            Run the following commands to download the dcm data
            cd data/dcm
            s5cmd --no-sign-request --endpoint-url https://storage.googleapis.com run {Path("s5cmd_gcs_missing_manifest.s5cmd").resolve()}
            cd -"""
        )
    )

# Full manifest covering every selected series, written regardless of what is
# already on disk.
selection_df["s5cmd_command"].to_csv(
    "s5cmd_gcs_manifest.s5cmd", header=False, index=False
)

all files already downloaded


Convert the DICOM series to NIfTI

In [18]:
def dcm_to_niix(dcm_dir: Path, nii_path: Path):
    """Convert one directory of DICOM files into a single gzipped NIfTI file.

    Runs ``dcm2niix -z y`` on ``dcm_dir`` with a temporary output directory and
    moves the single resulting ``*.nii.gz`` file to ``nii_path``. Outputs whose
    name matches ``*Eq_*.nii.gz`` take precedence over the other outputs
    (presumably dcm2niix's equidistant-slice variant -- TODO confirm).

    Args:
        dcm_dir: Directory containing the DICOM files of one series.
        nii_path: Destination path of the NIfTI file. Missing parent
            directories are created.

    Raises:
        subprocess.CalledProcessError: If dcm2niix exits with a non-zero status.
        ValueError: If dcm2niix produced no ``*.nii.gz`` file, or more than one
            candidate file for the pattern being considered.
    """
    # (glob pattern, label used in the error message), in order of preference.
    candidates_by_priority = (
        ("*Eq_*.nii.gz", "Eq_*.nii.gz"),
        ("*.nii.gz", "*.nii.gz"),
    )
    with TemporaryDirectory() as tmpdir:
        subprocess.run(
            ["dcm2niix", "-o", tmpdir, "-z", "y", str(dcm_dir.resolve())],
            check=True,
        )

        for pattern, label in candidates_by_priority:
            nii_files = sorted(Path(tmpdir).glob(pattern))
            if len(nii_files) > 1:
                # Ambiguous output: refuse to pick one arbitrarily.
                raise ValueError(
                    f"Expected 1 {label} file, found {len(nii_files)} for {dcm_dir}"
                )
            if nii_files:
                nii_path.parent.mkdir(parents=True, exist_ok=True)
                # Move the file out before the temporary directory is removed.
                shutil.move(str(nii_files[0]), str(nii_path))
                return
        raise ValueError(f"Expected 1 *.nii.gz file, found 0 for {dcm_dir}")


In [19]:
# Convert every downloaded series that does not have a NIfTI file yet.
# Series directories whose conversion fails are collected in bad_files.
bad_files = []
# Every directory that directly contains at least one *.dcm file is one series.
dcm_series_dirs = {x.parent for x in dcm_dir.rglob("*.dcm")}
# Iterate in sorted order so runs are reproducible (set order is arbitrary).
for dcm_series_dir in tqdm(sorted(dcm_series_dirs)):
    # Mirror the series' path below dcm_dir under nii_dir.
    nii_file = nii_dir / f"{dcm_series_dir.relative_to(dcm_dir)}.nii.gz"
    if nii_file.exists():
        continue  # already converted
    nii_file.parent.mkdir(parents=True, exist_ok=True)
    try:
        dcm_to_niix(dcm_series_dir, nii_file)
    except Exception as err:
        # Catch Exception rather than everything so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop, and report the
        # reason instead of discarding it.
        print(f"conversion failed for {dcm_series_dir}: {err!r}")
        bad_files.append(dcm_series_dir)


  0%|          | 0/769 [00:00<?, ?it/s]

Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 91 DICOM file(s)
Convert 91 DICOM as /tmp/tmpzjztreuy/1.3.6.1.4.1.14519.5.2.1.1706.8374.205785371691452221491896488464_Recon_3_LIVER_3_PHASE_(AP)_20010902000000_6 (512x512x91x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpzjztreuy/1.3.6.1.4.1.14519.5.2.1.1706.8374.205785371691452221491896488464_Recon_3_LIVER_3_PHASE_(AP)_20010902000000_6.nii"
Conversion required 0.491137 seconds (0.202795 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 99 DICOM file(s)
Convert 99 DICOM as /tmp/tmpkn27tkuq/1.3.6.1.4.1.14519.5.2.1.9203.8273.248615515339569718019244079700_19920314104251_101 (512x512x99x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpkn27tkuq/1.3.6.1.4.1.14519.5.2.1.9203.8273.248615515339569718019244079700_19920314104251_101.nii"
Conversion required 0.595748 seconds (0.198309 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..245


Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpfu7mlaf9/1.3.6.1.4.1.14519.5.2.1.1706.8374.197173974471763845846953652402_Recon_2_LIVER_3_PHASE_(AP)_19980305000000_4.nii"
Conversion required 0.621735 seconds (0.293486 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 79 DICOM file(s)
Convert 79 DICOM as /tmp/tmpwvweup96/1.3.6.1.4.1.14519.5.2.1.1706.8374.990430729211112437006486676615_Recon_2_LIVER_2PHASE_CAP_20020215000000_5 (512x512x79x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpwvweup96/1.3.6.1.4.1.14519.5.2.1.1706.8374.990430729211112437006486676615_Recon_2_LIVER_2PHASE_CAP_20020215000000_5.nii"
Conversion required 0.338082 seconds (0.153668 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 36 DICOM file(s)
Convert 36 DICOM as /tmp/tmpxrt_pl0e/1.3.6.1.4.1.14519.5.2.1.9203.8273.309953836542238933910820230787_CT_CAP_19950416082840_2 (512x512x36x1)
Compress: "/usr/bin/pigz

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..186


Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmp39_txh4e/1.3.6.1.4.1.14519.5.2.1.1706.8374.152132430147830313491115253022_2.5_SOFT_20060521000000_7.nii"
Conversion required 0.497476 seconds (0.262797 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 73 DICOM file(s)
Convert 73 DICOM as /tmp/tmp9hltdbq7/1.3.6.1.4.1.14519.5.2.1.1706.8374.831318638316036776295681699837_Recon_2_LIVER_2PHASE_CAP_20040719000000_5 (512x512x73x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmp9hltdbq7/1.3.6.1.4.1.14519.5.2.1.1706.8374.831318638316036776295681699837_Recon_2_LIVER_2PHASE_CAP_20040719000000_5.nii"
Conversion required 0.238611 seconds (0.103407 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 37 DICOM file(s)
Convert 37 DICOM as /tmp/tmpkgug2lmn/1.3.6.1.4.1.14519.5.2.1.9203.8273.109205660681110647874415418231_CT_CAP_19920921123226_2 (512x512x37x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..249


Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpcj5_hool/1.3.6.1.4.1.14519.5.2.1.1706.8374.144967540901815428834541637279_Recon_2_LIVER_3_PHASE_(AP)_19980221000000_4.nii"
Conversion required 0.579248 seconds (0.290947 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 41 DICOM file(s)
Convert 41 DICOM as /tmp/tmp6o2t3mtt/1.3.6.1.4.1.14519.5.2.1.9203.8273.283126429693593231564281104242_CT_CAP_19940802091456_2 (512x512x41x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmp6o2t3mtt/1.3.6.1.4.1.14519.5.2.1.9203.8273.283126429693593231564281104242_CT_CAP_19940802091456_2.nii"
Conversion required 0.178506 seconds (0.068316 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 73 DICOM file(s)
Convert 73 DICOM as /tmp/tmpwix1zb9g/1.3.6.1.4.1.14519.5.2.1.1706.8374.271434394598584103418210274809_Recon_2_LIVER_3_PHASE_(C_A_P)_20000229000000_5 (512x512x73x1)
Compress: "/usr/bin/pigz" -b 960 -n -

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..236


Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpg_adr902/1.3.6.1.4.1.14519.5.2.1.1706.8374.159125101693806936710380243661_2.5_STANDARD_20060521000000_7.nii"
Conversion required 0.547599 seconds (0.314443 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 77 DICOM file(s)
Convert 77 DICOM as /tmp/tmpc62k_lqf/1.3.6.1.4.1.14519.5.2.1.1706.8374.283760337562573792147261813340_Recon_2_LIVER_2PHASE_CAP_20040719000000_5 (512x512x77x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpc62k_lqf/1.3.6.1.4.1.14519.5.2.1.1706.8374.283760337562573792147261813340_Recon_2_LIVER_2PHASE_CAP_20040719000000_5.nii"
Conversion required 0.222500 seconds (0.106252 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 107 DICOM file(s)
Convert 107 DICOM as /tmp/tmpsqzq18wv/1.3.6.1.4.1.14519.5.2.1.1706.8374.241913026857534190223688407606_Recon_2_PRE_LIVER_20040410000000_3 (512x512x107x1)
Compress: "/usr/bin/pigz

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..176


Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 176 DICOM file(s)
Convert 176 DICOM as /tmp/tmpqhagkrl_/1.3.6.1.4.1.14519.5.2.1.1706.8374.679165792408332289805810990635_2.5_STANDARD_20060609000000_6 (512x512x88x2)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpqhagkrl_/1.3.6.1.4.1.14519.5.2.1.1706.8374.679165792408332289805810990635_2.5_STANDARD_20060609000000_6.nii"
Conversion required 0.885085 seconds (0.312939 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 87 DICOM file(s)
Convert 87 DICOM as /tmp/tmpgnez7osa/1.3.6.1.4.1.14519.5.2.1.1706.8374.226840645739856379941376572146_Recon_3_LIVER_3_PHASE_(AP)_19991119000000_5 (512x512x87x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpgnez7osa/1.3.6.1.4.1.14519.5.2.1.1706.8374.226840645739856379941376572146_Recon_3_LIVER_3_PHASE_(AP)_19991119000000_5.nii"
Conversion required 0.829828 seconds (0.169135 for core code).
Chris Rorden's dcm2niiX ve

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..184


Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 184 DICOM file(s)
Convert 184 DICOM as /tmp/tmpi_b7gcng/1.3.6.1.4.1.14519.5.2.1.1706.8374.244378575730197943717643702292_2.5_STANDARD_20060528000000_6 (512x512x92x2)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpi_b7gcng/1.3.6.1.4.1.14519.5.2.1.1706.8374.244378575730197943717643702292_2.5_STANDARD_20060528000000_6.nii"
Conversion required 0.881992 seconds (0.288105 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 83 DICOM file(s)
Convert 83 DICOM as /tmp/tmpu2k3s841/1.3.6.1.4.1.14519.5.2.1.1706.8374.185786012652432152862050527349_Recon_3_LIVER_2_PHASE_(C_A_P)_20050723000000_6 (512x512x83x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpu2k3s841/1.3.6.1.4.1.14519.5.2.1.1706.8374.185786012652432152862050527349_Recon_3_LIVER_2_PHASE_(C_A_P)_20050723000000_6.nii"
Conversion required 0.574165 seconds (0.161888 for core code).
Chris Rorden's dcm2n

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..273


Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 194 DICOM file(s)
Convert 194 DICOM as /tmp/tmpcszqdzwy/1.3.6.1.4.1.14519.5.2.1.1706.8374.201257050106336588378824687008_Recon_2_3_PHASE_LIVER_(ABD)_19980314000000_4 (512x512x97x2)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpcszqdzwy/1.3.6.1.4.1.14519.5.2.1.1706.8374.201257050106336588378824687008_Recon_2_3_PHASE_LIVER_(ABD)_19980314000000_4.nii"
Conversion required 0.590548 seconds (0.341622 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 49 DICOM file(s)
Convert 49 DICOM as /tmp/tmp6289kgzw/1.3.6.1.4.1.14519.5.2.1.9203.8273.530591543157467397770820982844_CT_CAP_19920725205204_102 (512x512x49x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmp6289kgzw/1.3.6.1.4.1.14519.5.2.1.9203.8273.530591543157467397770820982844_CT_CAP_19920725205204_102.nii"
Conversion required 0.161748 seconds (0.063628 for core code).
Chris Rorden's dcm2niiX version 

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..212


Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 212 DICOM file(s)
Convert 212 DICOM as /tmp/tmpezgglmxg/1.3.6.1.4.1.14519.5.2.1.1706.8374.205180616857077421834017805477_2.5_STANDARD_20060418000000_7 (512x512x106x2)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpezgglmxg/1.3.6.1.4.1.14519.5.2.1.1706.8374.205180616857077421834017805477_2.5_STANDARD_20060418000000_7.nii"
Conversion required 0.815080 seconds (0.395885 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 54 DICOM file(s)
Convert 54 DICOM as /tmp/tmp6n9g1055/1.3.6.1.4.1.14519.5.2.1.1706.8374.187463073269563002113667955329_PRE_LIVER_20000122000000_2 (512x512x54x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmp6n9g1055/1.3.6.1.4.1.14519.5.2.1.1706.8374.187463073269563002113667955329_PRE_LIVER_20000122000000_2.nii"
Conversion required 0.396258 seconds (0.104868 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x8

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..237


Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmplavazubm/1.3.6.1.4.1.14519.5.2.1.1706.8374.933229621958600898029221260035_Recon_2_LIVER_3_PHASE_A_P_19971128000000_4.nii"
Conversion required 0.477684 seconds (0.249029 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 111 DICOM file(s)
Convert 111 DICOM as /tmp/tmpohtc_egb/1.3.6.1.4.1.14519.5.2.1.1706.8374.263547785435353286163865547042_Recon_3_LIVER_2_PHASE_(C_A_P)_20031129000000_6 (512x512x111x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpohtc_egb/1.3.6.1.4.1.14519.5.2.1.1706.8374.263547785435353286163865547042_Recon_3_LIVER_2_PHASE_(C_A_P)_20031129000000_6.nii"
Conversion required 0.437498 seconds (0.179155 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 79 DICOM file(s)
Convert 79 DICOM as /tmp/tmpwsyvp6h_/1.3.6.1.4.1.14519.5.2.1.1706.8374.160822956621848269004462469650_Recon_2_LIVER_3_PHASE_(AP)_19990309000000_4 (512x51

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..208


Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 208 DICOM file(s)
Convert 208 DICOM as /tmp/tmpxhxvzo9k/1.3.6.1.4.1.14519.5.2.1.1706.8374.312921235935989856481675251539_2.5_STANDARD_20051010000000_7 (512x512x104x2)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpxhxvzo9k/1.3.6.1.4.1.14519.5.2.1.1706.8374.312921235935989856481675251539_2.5_STANDARD_20051010000000_7.nii"
Conversion required 0.863607 seconds (0.444614 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 39 DICOM file(s)
Convert 39 DICOM as /tmp/tmpvg4tbl0k/1.3.6.1.4.1.14519.5.2.1.1706.8374.259752544656199142873412586200_6.3_@_ABD_(LIVER_3_PHASE)_5CC_@_19981023130551_2 (512x512x39x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpvg4tbl0k/1.3.6.1.4.1.14519.5.2.1.1706.8374.259752544656199142873412586200_6.3_@_ABD_(LIVER_3_PHASE)_5CC_@_19981023130551_2.nii"
Conversion required 0.236048 seconds (0.074545 for core code).
Chris Rorden's 

Error: Check sorted order: 4D dataset has 2 volumes, but volume index ranges from 1..196


Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpog12knoz/1.3.6.1.4.1.14519.5.2.1.1706.8374.211979814210878034288134337933_2.5_STANDARD_20060709000000_6.nii"
Conversion required 0.613624 seconds (0.327945 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 99 DICOM file(s)
Convert 99 DICOM as /tmp/tmpho3gzsqk/1.3.6.1.4.1.14519.5.2.1.1706.8374.116451085904950186258093963163_Recon_3_LIVER_2PHASE_CAP_20010624000000_6 (512x512x99x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpho3gzsqk/1.3.6.1.4.1.14519.5.2.1.1706.8374.116451085904950186258093963163_Recon_3_LIVER_2PHASE_CAP_20010624000000_6.nii"
Conversion required 0.397720 seconds (0.185069 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 52 DICOM file(s)
Convert 52 DICOM as /tmp/tmpgev6y0bk/1.3.6.1.4.1.14519.5.2.1.1706.8374.319828841723664696291827284292_PRE_LIVER_19991003000000_103 (512x512x52x1)
Compress: "/usr/bin/pigz" -b 960 

In [20]:
# Number of series directories whose conversion raised an exception.
len(bad_files)

1