## Download data from IDC

Download the DICOM images from IDC


Install the utilities needed to process DICOM images and the LIDC dataset, save the processed images to files, and download files from GCP storage.

In [None]:
!pip install -r requirements.txt

Install the utils to sort the DICOM images

In [None]:
# Clone dicomsort into ./dicomsort; the download routine later in this notebook
# runs dicomsort/dicomsort.py relative to the working directory.
!git clone https://github.com/pieper/dicomsort.git

#### Set up connection to GCP, gcloud, and BigQuery
Install the utilities needed to query BigQuery

In [None]:
!pip install google-cloud-bigquery
!pip install --upgrade google-auth-oauthlib
!pip install --upgrade 'google-cloud-bigquery[bqstorage,pandas]'
!pip list | grep google-cloud-bigquery

- Load the magic used to query BigQuery from the notebook
- You might need to restart the kernel once after the previous step for the magic extension to load correctly

In [None]:
# Load the BigQuery IPython extension; the %%bigquery cell further down relies on it.
%load_ext google.cloud.bigquery

In [None]:
import pylidc as pl
from pylidc.utils import consensus
import pydicom as dicom
from skimage.measure import find_contours
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
import contextlib
import matplotlib.patches as patches
import os
import imageio
import sys
from glob import glob
import os
import cv2
import numpy as np
from skimage import morphology
from skimage import measure
from sklearn.cluster import KMeans
from skimage.transform import resize
import os
from numpy import random
import time
import pandas as pd
from PIL import Image
import contextlib

Check if the pylidc library is able to process the records

In [None]:
# Sanity check that pylidc can query its scan records. The count is printed
# explicitly because only the last expression of a cell is displayed.
n_scans = pl.query(pl.Scan).count()
print("pylidc reports {} scans".format(n_scans))

pid = 'LIDC-IDRI-0069'
scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
# .first() yields None when nothing matches; fail with a readable message
# instead of an AttributeError on the next line.
if scan is None:
    raise LookupError("No pylidc scan found for patient {}".format(pid))
scan.study_instance_uid

Set up the connection to Google BigQuery

In [None]:
from google_auth_oauthlib import flow

# `launch_browser` selects how the OAuth consent step is completed:
#   True  - open a browser on this machine and receive the callback on a local
#           server (recommended when the notebook runs where your browser is).
#   False - do not open a browser here; use this when the notebook is accessed
#           remotely, such as over SSH or from a remote Jupyter server.
launch_browser = False

appflow = flow.InstalledAppFlow.from_client_secrets_file(
    "client_secret_desktop.json", scopes=["https://www.googleapis.com/auth/bigquery"]
)

if launch_browser:
    appflow.run_local_server()
elif hasattr(appflow, "run_console"):
    # Copy/paste console flow; only available in google-auth-oauthlib < 1.0.
    appflow.run_console()
else:
    # run_console() was removed in google-auth-oauthlib 1.0. Fall back to the
    # local-server flow without launching a browser: the authorization URL is
    # printed, and the redirect must be able to reach this machine's local
    # callback port (e.g. via SSH port forwarding).
    appflow.run_local_server(open_browser=False)

credentials = appflow.credentials

In [None]:
from google.cloud.bigquery import magics
# Attach the OAuth credentials from the auth-flow cell to the BigQuery magics
# context (presumably so %%bigquery cells authenticate with them - confirm
# against the google-cloud-bigquery magics documentation).
magics.context.credentials = credentials

Note: mandatory next step

> Configure gsutil in a console
> - gsutil config

In [None]:
# Google Cloud project ID: passed as --project to the %%bigquery cell and as
# the -u argument of the gsutil command in download_data.
#myProjectID = "positive-sector-330514"
myProjectID = "idc-external-012"

In [None]:
%%bigquery CT_series --project=$myProjectID 

-- Lists the CT series of the lidc_idri collection together with their study,
-- patient, slice thickness and pixel spacing ("/"-joined), ordered by patient.
-- The result is stored in the CT_series variable named on the magic line.
-- NOTE: DISTINCT applies to the whole select list, not only SeriesInstanceUID,
-- so a series appears more than once if any other selected column varies.
WITH
  all_lidc_ct_series AS (
  SELECT
    DISTINCT(SeriesInstanceUID),
    StudyInstanceUID,  
    PatientID,
    SliceThickness,
    ARRAY_TO_STRING(PixelSpacing,"/") as pixelspa 
  FROM
    `canceridc-data.idc_views.dicom_all`
  WHERE
    Modality = "CT"
    AND collection_id = "lidc_idri")
SELECT PatientID,SliceThickness,pixelspa,StudyInstanceUID,SeriesInstanceUID FROM
  all_lidc_ct_series
ORDER BY
  PatientID

In [None]:
# Keep only the series whose SliceThickness is at least 1.0, report how many
# patients survive the filter, and write one gs:// URL per kept series to a
# manifest file.
thickness_mask = CT_series["SliceThickness"].astype(float) >= 1.0
ct_series_HR = CT_series[thickness_mask]

total_patients = len(np.unique(CT_series['PatientID']))
chosen_patients = len(np.unique(ct_series_HR['PatientID']))
# The message states ">=" to match the filter actually applied above.
print("Choosing patients with slice thickness >= 1.0")
print("Total patients is {} and chosen patients are {}".format(total_patients, chosen_patients))
print("Shape of the raw dataset is {}".format(ct_series_HR.shape))

# Build <bucket>/<StudyInstanceUID>/<SeriesInstanceUID> for every kept row in
# one vectorised string concatenation instead of a per-row .iloc loop.
base_gs_uri = 'gs://idc-tcia-lidc-idri/dicom/'
to_download = (
    base_gs_uri
    + ct_series_HR['StudyInstanceUID']
    + '/'
    + ct_series_HR['SeriesInstanceUID']
).tolist()

manifest_df = pd.DataFrame({"gcs": to_download}, columns=["gcs"])

# One URL per line, no header and no index column.
manifest_df.to_csv("gcs_paths_all.txt", header=False, index=False)

In [None]:
# Preview the first lines of the manifest file written by the previous cell.
!head gcs_paths_all.txt

#### Download the DICOM images

- Define the paths where we will download the DICOM images from GCP and then sort them.
- Create the train / val / test split based on the manifest URLs

In [None]:
# Output root for later stages (not referenced by the split logic in this cell).
yolo_data_dir = '/data/yolov5/'

# Shuffle the manifest row positions with a fixed seed so the split is repeatable.
idxs = np.arange(manifest_df.shape[0])
np.random.seed(2011)
np.random.shuffle(idxs)

# The first 5% of the shuffled positions become the test set.
split_ratio = 0.05
n_test = int(manifest_df.shape[0] * split_ratio)
test_idxs, data_idxs = idxs[:n_test], idxs[n_test:]

# Of what remains, the first 20% become validation and the rest training.
split_ratio_val = 0.2
n_val = int(data_idxs.shape[0] * split_ratio_val)
val_idxs, train_idxs = data_idxs[:n_val], data_idxs[n_val:]

print("Splits train:{}, val: {}, test: {}".format(len(train_idxs), len(val_idxs), len(test_idxs)))

#### Routine to download and sort the DICOM images

You can adjust how many you wish to download at a time, to conserve space on the disk

In [None]:
import os, shutil
import glob

def _run_command(args, quiet=False):
    """Run an external command (no shell) and return its exit status.

    Args:
        args: Command and arguments as a list of strings.
        quiet: When True, discard the command's stdout and stderr.

    Returns:
        The process exit status, or 127 if the executable could not be started.
    """
    import subprocess  # local import: not part of the notebook's import cell

    sink = subprocess.DEVNULL if quiet else None
    try:
        return subprocess.run(args, stdout=sink, stderr=sink).returncode
    except OSError as err:
        print("Could not run {}: {}".format(args[0], err))
        return 127


def _sort_and_clean(download_dir, sorted_dir):
    """Sort the files in download_dir into sorted_dir with dicomsort, then empty download_dir."""
    target_pattern = sorted_dir + "/%PatientID/%StudyInstanceUID/%SeriesInstanceUID/%SOPInstanceUID.dcm"
    status = _run_command(["python", "dicomsort/dicomsort.py", "-u", download_dir, target_pattern])
    if status != 0:
        print("dicomsort exited with status {} for {}".format(status, download_dir))

    # Free disk space: remove every downloaded sub-directory (plain files
    # directly under download_dir are left alone).
    for entry in glob.glob(download_dir + "/*"):
        if os.path.isdir(entry):
            shutil.rmtree(entry)


def download_data(idxs, data_dir, dtype='train', paging=True, batch_size=50):
    """Download the DICOM series selected by ``idxs`` and sort them with dicomsort.

    Series URLs are taken from the notebook-level ``manifest_df`` and fetched
    with ``gsutil`` (billed to the notebook-level ``myProjectID``) into
    ``<data_dir>/<dtype>/gcsfiles/``. They are then sorted into
    ``<data_dir>/<dtype>/sorted_gcsfiles/`` and the raw downloads are removed.

    Args:
        idxs: Positional row indices into ``manifest_df``.
        data_dir: Root directory for the downloads.
        dtype: Name of the split; used as the sub-directory name.
        paging: When True, sort and clean up after every ``batch_size``
            downloads to limit disk usage; when False, download everything
            first and sort once at the end.
        batch_size: Number of series per batch when ``paging`` is True.

    Raises:
        ValueError: If ``paging`` is True and ``batch_size`` is smaller than 1.

    Note:
        A trailing batch smaller than ``batch_size`` is also sorted and cleaned,
        so no downloaded series is left unsorted. Failed gsutil or dicomsort
        commands are reported but do not stop the run.
    """
    if paging and batch_size < 1:
        raise ValueError("batch_size must be at least 1, got {}".format(batch_size))

    download_dir = data_dir + "/" + dtype + "/" + 'gcsfiles/'
    sorted_dir = data_dir + "/" + dtype + "/" + 'sorted_gcsfiles/'
    for directory in (download_dir, sorted_dir):
        if not os.path.exists(directory):
            print("Creating directory {}".format(directory))
            os.makedirs(directory)
        else:
            print("Directory {} exists".format(directory))

    manifest = manifest_df.iloc[idxs]

    pending = 0  # series downloaded since the last sort/cleanup
    for url in tqdm(manifest.gcs.tolist()):
        status = _run_command(
            ["gsutil", "-u", myProjectID, "-m", "cp", "-r", url, download_dir],
            quiet=True,
        )
        if status != 0:
            print("gsutil exited with status {} for {}".format(status, url))
        pending += 1

        if paging and pending >= batch_size:
            _sort_and_clean(download_dir, sorted_dir)
            pending = 0

    # Flush whatever is left: the last partial batch when paging, or the whole
    # download when not paging.
    if pending or not paging:
        _sort_and_clean(download_dir, sorted_dir)

#### Configure the data directories

In [None]:
# Root directory handed to download_data; each split is stored in its own
# sub-directory beneath it.
data_dir = '/data/lidc/dicom/'

In [None]:
%%time
# Download and sort the training split, working in batches (paging=True).
download_data(train_idxs, data_dir, 'train', True)

In [None]:
%%time
# Download and sort the validation split, working in batches (paging=True).
download_data(val_idxs, data_dir, 'val', True)

In [None]:
%%time
# Download the whole test split first, then sort it in one pass (paging=False).
download_data(test_idxs, data_dir, 'test', False)

In [None]:
def set_dicom_path(dicom_path):
    """Write a pylidc configuration file that points at ``dicom_path``.

    The file is created as ``.pylidcrc`` in the current user's home directory
    (which is ``/root`` when running as root) and is overwritten if it already
    exists. It contains a ``[dicom]`` section with the given ``path`` and
    ``warn = True``.

    Args:
        dicom_path: Directory containing the DICOM files, as a string.
    """
    config_file = os.path.join(os.path.expanduser('~'), '.pylidcrc')
    # The context manager closes the file even if a write fails.
    with open(config_file, 'w') as f:
        f.write('[dicom]' + '\n')
        f.write('path =' + dicom_path + '\n')
        f.write('warn = True')

In [None]:
def load_vol_for_patient(patient_id, dicom_path):
    """Load the image volume and clustered nodule annotations for one patient.

    Points pylidc at ``dicom_path`` via ``set_dicom_path`` and then uses the
    first scan that pylidc returns for ``patient_id``.

    Args:
        patient_id: Patient identifier, e.g. ``'LIDC-IDRI-0033'``.
        dicom_path: Directory containing the sorted DICOM files.

    Returns:
        A ``(vol, nodules_annotation)`` tuple: the result of
        ``scan.to_volume()`` and the result of ``scan.cluster_annotations()``.

    Raises:
        IndexError: If pylidc has no scan for ``patient_id``.
    """
    set_dicom_path(dicom_path)
    scan = pl.query(pl.Scan).filter(pl.Scan.patient_id.in_([patient_id])).first()
    if scan is None:
        # Same exception type as indexing an empty result, but with a message.
        raise IndexError("No pylidc scan found for patient {!r}".format(patient_id))
    nodules_annotation = scan.cluster_annotations()
    # Suppress anything to_volume() writes to stdout.
    with contextlib.redirect_stdout(None):
        vol = scan.to_volume()
    return vol, nodules_annotation

In [None]:
# Sorted training DICOM files: the sorted_gcsfiles folder that download_data
# creates for dtype='train' under '/data/lidc/dicom/'.
dicom_root_dir = '/data/lidc/dicom/train/sorted_gcsfiles/'

In [None]:
# Smoke test: load one patient's volume and report how many nodule
# annotation clusters were returned for it.
pat_id = 'LIDC-IDRI-0033'
vol, nodules_annotation = load_vol_for_patient(pat_id, dicom_root_dir)
n_nodules = len(nodules_annotation)
print("There are {} nodules annotated for the patient {}".format(n_nodules, pat_id))