In [1]:
from tempfile import TemporaryDirectory
from pathlib import Path
import shutil
import subprocess
from tqdm.auto import tqdm
from textwrap import dedent
import pandas as pd
import re

%load_ext google.cloud.bigquery

In [2]:
# Directory layout (relative to where the notebook is run):
#   data/dcm - downloaded DICOM series
#   data/nii - converted NIfTI volumes
task_dir = Path()
data_root = task_dir / "data"
dcm_dir, nii_dir = data_root / "dcm", data_root / "nii"
for out_dir in (dcm_dir, nii_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

## get the data from the IDC collections

The lung-ct task uses images from these IDC collections:

* QIN-LUNG-CT
* SPIE-AAPM Lung CT Challenge
* National Lung Screening Trial (NLST)

But only a portion of the NLST collection is used, so let's get the easy ones first.

### Get QIN-LUNG-CT and SPIE-AAPM collections

In [4]:
%%bigquery qin_lung_spie_df
# List the CT and SEG series of the qin_lung_ct and spie_aapm_lung_ct_challenge collections:
# one row per series, with its instance count and an s5cmd copy command to download it
SELECT
      collection_id,
      PatientID,
      StudyInstanceUID,
      SeriesInstanceUID,
      StudyDate,
      Modality,
      StudyDescription,
      SeriesDescription,
  # Organize the files in-place on the fly:
  # the copy destination is collection_id/PatientID/StudyInstanceUID/SeriesInstanceUID
  ANY_VALUE(CONCAT("cp s3",
      REGEXP_SUBSTR(gcs_url, "(://.*)/"),
      "/* ",collection_id,"/",PatientID,"/",
      StudyInstanceUID,"/",SeriesInstanceUID)) AS s5cmd_command,
      COUNT(sopInstanceUID) AS instance_count,
FROM
  `bigquery-public-data.idc_current.dicom_all`
WHERE
  collection_id IN ('qin_lung_ct', 'spie_aapm_lung_ct_challenge') and Modality in ("CT", "SEG")
GROUP BY
  SeriesInstanceUID,collection_id,PatientID,StudyInstanceUID,SeriesInstanceUID,StudyDate,Modality,StudyDescription,SeriesDescription

Query is running:   0%|          |

Downloading:   0%|          |

### Get NLST collection
The NLST collection is huge, so we need to limit it to ~1000 scans. Fortunately, a paper by [Krishnaswamy, D. et al. (2023)](https://arxiv.org/abs/2306.00150) has already done this for us. We will use a modified version of their [query](https://github.com/ImagingDataCommons/nnU-Net-BPR-annotations/blob/main/common/queries/NLST_query.txt).

In [5]:
%%bigquery nlst_table
WITH
  # get the positive patients from the NLST dataset (clinical rows with can_scr = 1),
  # joined to every series of the same patient in dicom_all
  nlst_positive AS (
  SELECT
    SAFE_CAST(pid AS STRING) AS pid,
    PatientID,
    StudyInstanceUID,
    SeriesInstanceUID
  FROM
    `bigquery-public-data.idc_current_clinical.nlst_prsn` AS nlst
  JOIN
    `bigquery-public-data.idc_current.dicom_all` AS dicom_all
  ON
    SAFE_CAST(nlst.pid AS STRING) = dicom_all.PatientID
  WHERE
    can_scr = 1
  ),


  # get various values for each series: number of instances, number of distinct positions / pixel spacings / orientations, min and max slice thickness, min and max z axis slice locations, ...
  nlst_instances_per_series AS (
  SELECT
    ANY_VALUE(nlst_positive.PatientID) AS PatientID,
    nlst_positive.StudyInstanceUID,
    nlst_positive.SeriesInstanceUID,
    COUNT(DISTINCT(SOPInstanceUID)) AS num_instances,
    COUNT(DISTINCT(ARRAY_TO_STRING(ImagePositionPatient,"/"))) AS position_count,
    COUNT(DISTINCT(ARRAY_TO_STRING(PixelSpacing,"/"))) AS pixel_spacing_count,
    COUNT(DISTINCT(ARRAY_TO_STRING(ImageOrientationPatient,"/"))) AS orientation_count,
    MIN(SAFE_CAST(SliceThickness AS float64)) AS min_SliceThickness,
    MAX(SAFE_CAST(SliceThickness AS float64)) AS max_SliceThickness,
    MIN(SAFE_CAST(ImagePositionPatient[SAFE_OFFSET(2)] AS float64)) AS min_SliceLocation,
    MAX(SAFE_CAST(ImagePositionPatient[SAFE_OFFSET(2)] AS float64)) AS max_SliceLocation,
    STRING_AGG(DISTINCT(SAFE_CAST("LOCALIZER" IN UNNEST(ImageType) AS string)),"") AS has_localizer,
    ANY_VALUE(dicom_all.ImageOrientationPatient) AS ImageOrientationPatient,
    ANY_VALUE(dicom_all.Modality) AS Modality
  FROM
    `bigquery-public-data.idc_current.dicom_all` as dicom_all
  JOIN
    nlst_positive
  ON
    nlst_positive.SeriesInstanceUID = dicom_all.SeriesInstanceUID
  WHERE
    collection_id = "nlst"
    AND Modality = "CT"
  GROUP BY
    StudyInstanceUID,
    SeriesInstanceUID ),


  # get the difference in slice location (z position truncated to 1 decimal) between consecutive slices in a series.
  # Ideally there should be a single consistent position difference between each slice
  distinct_slice_location_difference_values AS (
  SELECT
      DISTINCT(TRUNC(SAFE_CAST(ImagePositionPatient[SAFE_OFFSET(2)] AS NUMERIC),1) - LAG(TRUNC(SAFE_CAST(ImagePositionPatient[SAFE_OFFSET(2)] AS NUMERIC),1),1) OVER(partition by SeriesInstanceUID ORDER BY TRUNC(SAFE_CAST(ImagePositionPatient[SAFE_OFFSET(2)] AS NUMERIC),1) DESC)) AS SliceLocation_difference,
      SeriesInstanceUID,
      StudyInstanceUID
  FROM
      `bigquery-public-data.idc_current.dicom_all`
  WHERE
    collection_id = "nlst"
    AND Modality = "CT"
  ),


  # Get counts and max and min values for the slice location differences
  # filter out series with slice thickness outside of the range 1.5 to 3.5 mm
  # only keep series with more than 100 instances
  # only keep series with 1:1 ratio of instances to position count
  # only keep series with a single, consistent pixel spacing
  # only keep series with a single, consistent orientation
  # only keep series without a LOCALIZER image type
  # limit ImageOrientationPatient to a row direction dominated by x and a column direction dominated by y
  # (i.e. close to [1 0 0 0 1 0]); NOTE(review): the original comment says this is typically FFS patient orientation - confirm
  nlst_values_per_series AS (
  SELECT
    COUNT(distinct_slice_location_difference_values.SliceLocation_difference) as num_differences,
    MAX(ABS(distinct_slice_location_difference_values.SliceLocation_difference)) as max_difference,
    MIN(ABS(distinct_slice_location_difference_values.SliceLocation_difference)) as min_difference,
    ANY_VALUE(nlst_instances_per_series.PatientID) AS PatientID,
    ANY_VALUE(nlst_instances_per_series.StudyInstanceUID) AS StudyInstanceUID,
    distinct_slice_location_difference_values.SeriesInstanceUID AS SeriesInstanceUID,
    ANY_VALUE(nlst_instances_per_series.Modality) AS Modality,
    ANY_VALUE(nlst_instances_per_series.num_instances) AS num_instances,
    ANY_VALUE(nlst_instances_per_series.ImageOrientationPatient) AS ImageOrientationPatient
  FROM
    distinct_slice_location_difference_values
  JOIN
    nlst_instances_per_series
  ON
    nlst_instances_per_series.SeriesInstanceUID = distinct_slice_location_difference_values.SeriesInstanceUID
  WHERE
    nlst_instances_per_series.min_SliceThickness >= 1.5
    AND nlst_instances_per_series.max_SliceThickness <= 3.5
    AND nlst_instances_per_series.num_instances > 100
    AND nlst_instances_per_series.num_instances/nlst_instances_per_series.position_count = 1
    AND nlst_instances_per_series.pixel_spacing_count = 1
    AND nlst_instances_per_series.orientation_count = 1
    AND has_localizer = "false"
    AND ABS(SAFE_CAST(nlst_instances_per_series.ImageOrientationPatient[SAFE_OFFSET(0)] AS float64)) > ABS(SAFE_CAST(nlst_instances_per_series.ImageOrientationPatient[SAFE_OFFSET(1)] AS float64))
    AND ABS(SAFE_CAST(nlst_instances_per_series.ImageOrientationPatient[SAFE_OFFSET(0)] AS float64)) > ABS(SAFE_CAST(nlst_instances_per_series.ImageOrientationPatient[SAFE_OFFSET(2)] AS float64))
    AND ABS(SAFE_CAST(nlst_instances_per_series.ImageOrientationPatient[SAFE_OFFSET(4)] AS float64)) > ABS(SAFE_CAST(nlst_instances_per_series.ImageOrientationPatient[SAFE_OFFSET(3)] AS float64))
    AND ABS(SAFE_CAST(nlst_instances_per_series.ImageOrientationPatient[SAFE_OFFSET(4)] AS float64)) > ABS(SAFE_CAST(nlst_instances_per_series.ImageOrientationPatient[SAFE_OFFSET(5)] AS float64))

  GROUP BY
    distinct_slice_location_difference_values.SeriesInstanceUID ),



  # select a series from each study
  # NOTE(review): each ANY_VALUE is evaluated on its own, so when a study has several qualifying series
  # the selected columns may not all come from the same series - verify
  select_single_series_from_study AS (
  SELECT
    ANY_VALUE(PatientID) AS PatientID,
    StudyInstanceUID,
    ANY_VALUE(SeriesInstanceUID) AS SeriesInstanceUID,
    ANY_VALUE(Modality) AS Modality,
    ANY_VALUE(nlst_values_per_series.num_differences) AS num_differences,
    ANY_VALUE(nlst_values_per_series.max_difference) AS max_difference,
    ANY_VALUE(nlst_values_per_series.min_difference) AS min_difference,
    ANY_VALUE(nlst_values_per_series.num_instances) AS num_instances
  FROM
    nlst_values_per_series
  GROUP BY
    StudyInstanceUID )


  # create the final table: one row per selected series with its instance count and s5cmd download command
  SELECT
    DISTINCT(select_single_series_from_study.SeriesInstanceUID) as SeriesInstanceUID,
    dicom_all.collection_id,
    select_single_series_from_study.PatientID,
    select_single_series_from_study.StudyInstanceUID,
    dicom_all.StudyDate,
    select_single_series_from_study.Modality,
    dicom_all.StudyDescription,
    dicom_all.SeriesDescription,
    # select_single_series_from_study.num_instances,
    # select_single_series_from_study.num_differences,
    # select_single_series_from_study.max_difference,
    # select_single_series_from_study.min_difference,
    # CONCAT("https://viewer.imaging.datacommons.cancer.gov/viewer/",select_single_series_from_study.StudyInstanceUID,"?seriesInstanceUID=",select_single_series_from_study.SeriesInstanceUID) AS idc_url,
    # Organize the files in-place on the fly
    ANY_VALUE(CONCAT("cp s3",
      REGEXP_SUBSTR(gcs_url, "(://.*)/"),
      "/* ",
      dicom_all.collection_id,"/",
      select_single_series_from_study.PatientID,"/",
      select_single_series_from_study.StudyInstanceUID,"/",
      select_single_series_from_study.SeriesInstanceUID)) AS s5cmd_command,
    COUNT(sopInstanceUID) AS instance_count,
  FROM
   `bigquery-public-data.idc_current.dicom_all` AS dicom_all
  JOIN
    select_single_series_from_study
  ON
    dicom_all.SeriesInstanceUID = select_single_series_from_study.SeriesInstanceUID
  # keep only series with at most 2 distinct slice location differences whose max/min ratio is below 2
  WHERE
    select_single_series_from_study.num_differences <= 2
    AND select_single_series_from_study.max_difference/select_single_series_from_study.min_difference < 2
  GROUP BY
    SeriesInstanceUID,collection_id,PatientID,StudyInstanceUID,SeriesInstanceUID,StudyDate,Modality,StudyDescription,SeriesDescription


Query is running:   0%|          |

Downloading:   0%|          |

Combine the NLST query with the QIN-LUNG-CT and SPIE-AAPM queries.

In [6]:
# Stack the two query results into one selection table with a fresh 0..n-1 index.
selection_df = pd.concat(
    objs=[qin_lung_spie_df, nlst_table],
    ignore_index=True,
)

In [7]:
# Persist the selection table as both pickle and CSV; useful for further data curation.
# Rows are ordered by collection, patient, study date, study and series first.
sort_keys = [
    "collection_id",
    "PatientID",
    "StudyDate",
    "StudyInstanceUID",
    "SeriesInstanceUID",
]
selection_df.sort_values(by=sort_keys, inplace=True)

pkl_file = task_dir / "data" / "scan_data.pkl"
csv_file = pkl_file.with_suffix(".csv")
selection_df.to_pickle(pkl_file)
selection_df.to_csv(csv_file, index=False)

Download the manifest with s5cmd

In [8]:
# check the download directory and limit to just the files that are missing
for i, row in selection_df.iterrows():
    dcm_series_dir = dcm_dir / row["s5cmd_command"].split()[-1]
    dcm_cnt = len(list(dcm_series_dir.glob("*.dcm")))
    selection_df.loc[i, "downloaded_cnt"] = dcm_cnt

### Make sure to run the download commands printed below before the next steps

In [9]:
# Series whose on-disk file count differs from the instance count reported by
# the query still need to be downloaded; write their s5cmd commands to a
# manifest and print the shell commands that run it.
missing_df = selection_df[
    selection_df["instance_count"] != selection_df["downloaded_cnt"]
]
if len(missing_df) == 0:
    print("all files already downloaded")
else:
    missing_manifest = Path("s5cmd_gcs_missing_manifest.s5cmd")
    missing_df["s5cmd_command"].to_csv(missing_manifest, header=False, index=False)
    # The text starts on its own line (note the trailing backslash) so that
    # every line shares a common indentation. With the first line flush against
    # the opening quotes, dedent found no common prefix and left the commands
    # indented by 16 spaces in the printed output.
    print(
        dedent(
            f"""\
            Run the following commands to download the dcm data
                cd data/dcm
                s5cmd --no-sign-request --endpoint-url https://storage.googleapis.com run {missing_manifest.resolve()}
                cd -"""
        )
    )

# Always write the full manifest (all selected series) as well.
selection_df["s5cmd_command"].to_csv(
    "s5cmd_gcs_manifest.s5cmd", header=False, index=False
)

Run the following commands to download the dcm data
                cd data/dcm
                s5cmd --no-sign-request --endpoint-url https://storage.googleapis.com run /home/gmurugesan/projects/experimental_projects/AIMI/aimiv2/deliverable/github_repo/aimi-lung2-ct/s5cmd_gcs_missing_manifest.s5cmd
                cd -


Convert the files to nifti

In [10]:
def dcm_to_niix(dcm_dir: Path, nii_path: Path):
    """Convert one directory of DICOM files into a single NIfTI file with dcm2niix.

    dcm2niix is run with gzip compression enabled and writes into a temporary
    directory; exactly one of its outputs is then moved to ``nii_path``.
    A file matching ``*Eq_*.nii.gz`` is preferred (presumably dcm2niix's
    equidistant-resampled output - confirm); otherwise the only ``*.nii.gz``
    file is used.

    Args:
        dcm_dir: directory holding the DICOM files of one series.
        nii_path: destination path of the resulting ``.nii.gz`` file.

    Raises:
        subprocess.CalledProcessError: dcm2niix exited with a non-zero status.
        ValueError: more than one candidate output was found, or none at all.
    """
    # (glob pattern, label used in the error message), in order of preference
    candidates = (
        ("*Eq_*.nii.gz", "Eq_*.nii.gz"),
        ("*.nii.gz", "*.nii.gz"),
    )
    with TemporaryDirectory() as tmpdir:
        command = ["dcm2niix", "-o", tmpdir, "-z", "y", str(dcm_dir.resolve())]
        subprocess.run(command, check=True)

        scratch = Path(tmpdir)
        for pattern, label in candidates:
            found = list(scratch.glob(pattern))
            if len(found) > 1:
                raise ValueError(f"Expected 1 {label} file, found {len(found)}")
            if found:
                shutil.move(found[0], nii_path)
                return
        raise ValueError("Expected 1 *.nii.gz file, found 0")

In [11]:
# Convert every downloaded DICOM series to a NIfTI file that mirrors the
# directory layout under dcm_dir. Series whose NIfTI file already exists are
# skipped; series that fail to convert are collected in bad_files.
bad_files = []
dcm_series_dirs = {x.parent for x in dcm_dir.rglob("*.dcm")}
# sorted() makes the processing order reproducible between runs
for dcm_series_dir in tqdm(sorted(dcm_series_dirs)):
    nii_file = nii_dir / f"{dcm_series_dir.relative_to(dcm_dir)}.nii.gz"
    if nii_file.exists():
        continue
    nii_file.parent.mkdir(parents=True, exist_ok=True)
    try:
        dcm_to_niix(dcm_series_dir, nii_file)
    except Exception as err:
        # Catch Exception rather than using a bare except so that interrupting
        # the kernel (KeyboardInterrupt) still stops the loop, and report why
        # the series failed instead of silently dropping the reason.
        bad_files.append(dcm_series_dir)
        print(f"conversion failed for {dcm_series_dir}: {err!r}")

  0%|          | 0/379 [00:00<?, ?it/s]

Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 108 DICOM file(s)
Convert 108 DICOM as /tmp/tmpzbx6d97b/1.2.840.113654.2.55.275373474254165031723732882021251186848_0,OPA,GE,LSPLUS,LUNG,330,2.5,120,80,0.1,1.5_19990102000000_1 (512x512x108x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpzbx6d97b/1.2.840.113654.2.55.275373474254165031723732882021251186848_0,OPA,GE,LSPLUS,LUNG,330,2.5,120,80,0.1,1.5_19990102000000_1.nii"
Conversion required 0.551846 seconds (0.161153 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 108 DICOM file(s)
Convert 108 DICOM as /tmp/tmpzfbllpfu/1.2.840.113654.2.55.68868452922487664632761862635623704901_0,OPA,GE,LSPLUS,LUNG,326,2.5,120,80,0.1,1.5_19990102000000_1 (512x512x108x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmpzfbllpfu/1.2.840.113654.2.55.68868452922487664632761862635623704901_0,OPA,GE,LSPLUS,LUNG,326,2.5,120,80,0.1,1.5_19990102000000_1.nii"
Conversion 

Error: Unable to open '/home/gmurugesan/projects/experimental_projects/AIMI/aimiv2/deliverable/github_repo/aimi-lung2-ct/data/dcm/nlst/111221/1.2.840.113654.2.55.336655363694899352482953694955554158418/1.2.840.113654.2.55.38380760938863740334622103795226994737/e08a1a2c-5438-48a5-9904-3968876b1838.dcm2399124308'


Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmp_5d17ld7/1.2.840.113654.2.55.22084245517968704647300402362399724489_0,OPA,TO,AQUL4,FC51,309.4,2,120,40,na,na_19990102000000_4.nii"
Conversion required 0.487649 seconds (0.243436 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 131 DICOM file(s)
Convert 131 DICOM as /tmp/tmp686vyyim/1.2.840.113654.2.55.33725636231942379140846526086694033319_0,OPA,GE,LSQX,STANDARD,350,2.5,120,60,0.1,1.5_19990102000000_2 (512x512x131x1)
Compress: "/usr/bin/pigz" -b 960 -n -f -6 "/tmp/tmp686vyyim/1.2.840.113654.2.55.33725636231942379140846526086694033319_0,OPA,GE,LSQX,STANDARD,350,2.5,120,60,0.1,1.5_19990102000000_2.nii"
Conversion required 0.451206 seconds (0.239750 for core code).
Chris Rorden's dcm2niiX version v1.0.20230411  GCC11.3.0 x86-64 (64-bit Linux)
Found 170 DICOM file(s)
Convert 170 DICOM as /tmp/tmpuv1pegv7/1.2.840.113654.2.55.323067664901890164304171820617478007365_2,OPA,PH,MX8000,C,351,3.2

In [None]:
len(bad_files)

0

In [None]:
bad_files

[]