In [1]:
from tempfile import TemporaryDirectory
from pathlib import Path
import shutil
import subprocess
from tqdm.auto import tqdm
from textwrap import dedent

%load_ext google.cloud.bigquery

In [2]:
# Working directories, relative to the notebook's current directory:
# data/dcm receives the DICOM downloads, data/nii the converted volumes.
task_dir = Path()
dcm_dir, nii_dir = (task_dir / "data" / sub for sub in ("dcm", "nii"))
for _dir in (dcm_dir, nii_dir):
    _dir.mkdir(parents=True, exist_ok=True)

In [3]:
%%bigquery series_desc
# Get a list of all the unique series descriptions among the MR rows of the upenn_gbm collection.
# NOTE: the query does not filter on the description text (e.g. "t2"); every MR description is returned.
SELECT DISTINCT(seriesDescription)
FROM `bigquery-public-data.idc_current.dicom_all`
WHERE collection_id="upenn_gbm" and Modality="MR"

Query is running:   0%|          |

Downloading:   0%|          |

In [4]:
%%bigquery selection_df
# Create the list of MR series to download from the upenn_gbm collection:
# one row per series, with the s5cmd "cp" command that fetches it and the
# number of instances it contains.
# NOTE: the query does not filter on the series description (e.g. "t2").
SELECT
  collection_id,
  PatientID,
  StudyInstanceUID,
  SeriesInstanceUID,
  StudyDate,
  Modality,
  StudyDescription,
  SeriesDescription,
  # Organize the files in-place on the fly: the destination of each copy is
  # <collection_id>/<PatientID>/<StudyInstanceUID>/<SeriesInstanceUID>
  ANY_VALUE(CONCAT("cp s3",
      REGEXP_SUBSTR(gcs_url, "(://.*)/"),
      "/* ",collection_id,"/",PatientID,"/",
      StudyInstanceUID,"/",SeriesInstanceUID)) AS s5cmd_command,
  COUNT(sopInstanceUID) AS instance_count,
FROM
  `bigquery-public-data.idc_current.dicom_all`
WHERE
  collection_id="upenn_gbm" and Modality="MR"
GROUP BY
  # every non-aggregated column of the SELECT list, each listed once
  collection_id,
  PatientID,
  StudyInstanceUID,
  SeriesInstanceUID,
  StudyDate,
  Modality,
  StudyDescription,
  SeriesDescription

Query is running:   0%|          |

Downloading:   0%|          |

In [5]:
# save selection dataframe to pickle, useful for further data curation
selection_df.sort_values(
    by=[
        "collection_id",
        "PatientID",
        "StudyDate",
        "StudyInstanceUID",
        "SeriesInstanceUID",
    ],
    inplace=True,
)
pkl_file = task_dir / "data" / "scan_data.pkl"
selection_df.to_pickle(pkl_file)
selection_df.to_csv(pkl_file.with_suffix(".csv"), index=False)


Download the DICOM files listed in the manifest with s5cmd

In [6]:
# check the download directory and limit to just the files that are missing
for i, row in selection_df.iterrows():
    dcm_series_dir = dcm_dir / row["s5cmd_command"].split()[-1]
    dcm_cnt = len(list(dcm_series_dir.glob("*.dcm")))
    selection_df.loc[i, "downloaded_cnt"] = dcm_cnt


In [7]:
# save to file
missing_df = selection_df[
    selection_df["instance_count"] != selection_df["downloaded_cnt"]
]
if len(missing_df) == 0:
    print("all files already downloaded")
else:
    missing_df["s5cmd_command"].to_csv(
        "s5cmd_gcs_missing_manifest.s5cmd", header=False, index=False
    )
    print(
        dedent(
            f"""Run the following commands to download the dcm data
                cd data/dcm
                s5cmd --no-sign-request --endpoint-url https://storage.googleapis.com run {Path("s5cmd_gcs_missing_manifest.s5cmd").resolve()}
                cd -"""
        )
    )

selection_df["s5cmd_command"].to_csv(
    "s5cmd_gcs_manifest.s5cmd", header=False, index=False
)

Run the following commands to download the dcm data
                cd data/dcm
                s5cmd --no-sign-request --endpoint-url https://storage.googleapis.com run /home/gmurugesan/projects/experimental_projects/AIMI/aimiv2/deliverable/github_repo/aimi-brain-mr/s5cmd_gcs_missing_manifest.s5cmd
                cd -


Convert the downloaded DICOM series to NIfTI with dcm2niix

In [8]:
def dcm_to_niix(dcm_dir: Path, nii_path: Path):
    """Convert one directory of DICOM files into a single gzipped NIfTI file.

    Runs ``dcm2niix -o <scratch> -z y <dcm_dir>`` with a temporary directory as
    output folder and moves the one resulting volume to ``nii_path``. Outputs
    matching ``*Eq_*.nii.gz`` take precedence; any ``*.nii.gz`` is considered
    only when no such file exists.

    Args:
        dcm_dir: directory holding the DICOM files of the series.
        nii_path: destination path of the converted volume.

    Raises:
        subprocess.CalledProcessError: dcm2niix exited with a non-zero status.
        ValueError: a candidate set holds more than one file, or dcm2niix
            produced no ``.nii.gz`` file at all.
    """
    # (glob pattern, label used in the error message), in order of preference
    candidates = (
        ("*Eq_*.nii.gz", "Eq_*.nii.gz"),
        ("*.nii.gz", "*.nii.gz"),
    )
    with TemporaryDirectory() as scratch:
        command = ["dcm2niix", "-o", scratch, "-z", "y", str(dcm_dir.resolve())]
        subprocess.run(command, check=True)

        scratch_dir = Path(scratch)
        for pattern, label in candidates:
            found = list(scratch_dir.glob(pattern))
            if not found:
                # nothing of this kind; fall through to the next pattern
                continue
            if len(found) > 1:
                raise ValueError(f"Expected 1 {label} file, found {len(found)}")
            shutil.move(found[0], nii_path)
            return
        raise ValueError("Expected 1 *.nii.gz file, found 0")

In [9]:
# Convert every downloaded series that does not have a NIfTI file yet.
# A failing series is recorded in bad_files and reported, so one bad series
# does not abort the whole run.
bad_files = []
# A series directory is any directory that directly contains a .dcm file;
# sorting gives a reproducible processing order.
dcm_series_dirs = sorted({x.parent for x in dcm_dir.rglob("*.dcm")})
for dcm_series_dir in tqdm(dcm_series_dirs):
    # Mirror the series' path below dcm_dir under nii_dir.
    nii_file = nii_dir / f"{dcm_series_dir.relative_to(dcm_dir)}.nii.gz"
    if nii_file.exists():
        continue
    nii_file.parent.mkdir(parents=True, exist_ok=True)
    try:
        dcm_to_niix(dcm_series_dir, nii_file)
    except Exception as err:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        bad_files.append(dcm_series_dir)
        tqdm.write(f"conversion failed for {dcm_series_dir}: {err!r}")


0it [00:00, ?it/s]

In [10]:
# number of entries in bad_files, the list the conversion loop appends a series directory to when its conversion raises
len(bad_files)


0