# File-visitor-based data exploration (moved to convert_dicom_to_pandas)

In [None]:
import os
import pydicom
from pprint import pprint
from dicom_visitors import *

# Directory containing MRI studies
mri_data_root = "/home/lukasd/src/hpc-predict/data/v0/input_data/original/mri/MRT Daten Bern/"

In [None]:
%%script false --no-raise-error

## Compare DICOM files listed in DICOMDIR to those available in subfolders

# Collectors filled while walking the records referenced by a DICOMDIR index:
# image filenames plus the series / study / patient identifiers of each record.
(list_of_dicom_dir_files,
 list_of_dicom_dir_series,
 list_of_dicom_dir_studies,
 list_of_dicom_dir_patients) = [], [], [], []
def list_of_dicom_dir_files_append(image_filename, dcm_image, dcmdir_entry):
    """Visitor callback that records one image referenced by a DICOMDIR.

    Appends the filename and the identifiers read from the DICOMDIR records
    to the module-level ``list_of_dicom_dir_*`` lists.

    Args:
        image_filename: Filename of the visited image.
        dcm_image: The visited image dataset (not used here).
        dcmdir_entry: Mapping with the keys 'series', 'study' and 'patient';
            the identifiers are read from these records, not from the image.
    """
    list_of_dicom_dir_files.append(image_filename)

    series_uid = dcmdir_entry['series'].SeriesInstanceUID
    list_of_dicom_dir_series.append(series_uid)

    study_uid = dcmdir_entry['study'].StudyInstanceUID
    list_of_dicom_dir_studies.append(study_uid)

    patient_id = dcmdir_entry['patient'].PatientID
    list_of_dicom_dir_patients.append(patient_id)
    
# Collectors filled while walking the DICOM files found on the filesystem:
# image filenames plus the series / study / patient identifiers of each image.
(list_of_dicom_files,
 list_of_dicom_series,
 list_of_dicom_studies,
 list_of_dicom_patients) = [], [], [], []
def list_of_dicom_files_append(image_filename, dcm_image):
    """Visitor callback that records one DICOM file found on the filesystem.

    Appends the filename and the identifiers read from the image header
    to the module-level ``list_of_dicom_*`` lists.

    Args:
        image_filename: Filename of the visited image.
        dcm_image: The visited image dataset; its SeriesInstanceUID,
            StudyInstanceUID and PatientID attributes are read.
    """
    list_of_dicom_files.append(image_filename)

    series_uid = dcm_image.SeriesInstanceUID
    list_of_dicom_series.append(series_uid)

    study_uid = dcm_image.StudyInstanceUID
    list_of_dicom_studies.append(study_uid)

    patient_id = dcm_image.PatientID
    list_of_dicom_patients.append(patient_id)
    
def validate_dicom_dir_collection(mri_data_root, max_dirs=2):
    """Report DICOM entities found on disk that are missing from each DICOMDIR.

    For every selected subdirectory of ``mri_data_root`` the files on disk are
    visited with ``visit_dicom_files`` and the records of its ``DICOMDIR`` file
    with ``visit_dicom_dir``. The filenames, series, studies and patients
    collected by the two walks are then compared as sets, and whatever was
    collected from the filesystem but not from the DICOMDIR is printed
    (at most ten entries per entity).

    The module-level ``list_of_dicom_*`` / ``list_of_dicom_dir_*`` lists are
    used as scratch space and cleared after every directory.

    Args:
        mri_data_root: Directory whose immediate subdirectories are named by
            integers and each contain a ``DICOMDIR`` file.
        max_dirs: Number of subdirectories (in ascending numeric order) to
            process; ``None`` processes all of them.

    Raises:
        ValueError: If a subdirectory name cannot be parsed as an integer.
    """
    mri_data_collection = sorted(
        (entry.path for entry in os.scandir(mri_data_root) if entry.is_dir()),
        key=lambda path: int(os.path.basename(path)))
    if max_dirs is not None:
        mri_data_collection = mri_data_collection[:max_dirs]

    # DirEntry.path already contains mri_data_root, so it is used as is:
    # joining it with the root again would duplicate the prefix whenever the
    # root is a relative path.
    for mri_data_path in mri_data_collection:
        # NOTE(review): visit_dicom_files / visit_dicom_dir presumably come from
        # the star import of dicom_visitors and invoke the callback per image.
        visit_dicom_files(mri_data_path, list_of_dicom_files_append)

        filepath = os.path.join(mri_data_path, 'DICOMDIR')
        print('Validating DICOMDIR for {}...'.format(mri_data_path))
        dicom_dir = pydicom.filereader.read_dicomdir(filepath)
        base_dir = os.path.dirname(filepath)

        visit_dicom_dir(base_dir, dicom_dir, list_of_dicom_dir_files_append)

        comparisons = [
            ('files', list_of_dicom_files, list_of_dicom_dir_files),
            ('series', list_of_dicom_series, list_of_dicom_dir_series),
            ('studies', list_of_dicom_studies, list_of_dicom_dir_studies),
            ('patients', list_of_dicom_patients, list_of_dicom_dir_patients),
        ]
        for entity, list_dicom_files, list_dicom_dir in comparisons:
            set_dicom_files = set(list_dicom_files)
            set_dicom_dir = set(list_dicom_dir)
            only_on_filesystem = sorted(set_dicom_files - set_dicom_dir)

            if only_on_filesystem:
                print(" " * 4 + "Found {} {} in DICOMDIR and {} on filesystem. Only on filesystem:".format(len(set_dicom_dir), entity, len(set_dicom_files)))
                # Cap the listing at ten entries to keep the output readable.
                pprint(only_on_filesystem[:10], indent=4)

            # Reset the shared collectors so the next directory starts empty.
            list_dicom_files.clear()
            list_dicom_dir.clear()
        
# Run the validation with the default max_dirs (2), i.e. only on the first two
# study directories in ascending numeric order.
validate_dicom_dir_collection(mri_data_root)

In [None]:
%%script false --no-raise-error

# Compute value range of selected DICOM header fields based on images referenced in DICOMDIR files

# Header fields whose observed values are collected. Alternatives the author
# left commented out: 'MRAcquisitionType', 'ImageType', 'Modality',
# 'CardiacNumberOfImages', 'PatientAge'.
accumulate_attributes = ['ImageOrientationPatient', 'ImagePositionPatient']

# Grouping level -> attribute whose value identifies a group at that level.
group_by = dict(patient='PatientID',
                study='StudyInstanceUID',
                series='SeriesInstanceUID')

# attribute name -> ((level, {group key: set of observed values}), ...),
# with a fresh, independent dict for every (attribute, level) pair.
accumulators = {
    attribute_name: tuple((level, {}) for level in ('patient', 'study', 'series'))
    for attribute_name in accumulate_attributes
}

def accumulate_visitor(image_filename, dcm_image, dcmdir_entry):
    """Visitor callback collecting header values per patient, study and series.

    For every name in ``accumulate_attributes`` the value is read from
    ``dcm_image`` and added to the set stored under the group key of each
    level in ``accumulators``. Group keys come from the DICOMDIR records.

    Args:
        image_filename: Filename of the visited image (not used here).
        dcm_image: Image dataset the attribute values are read from.
        dcmdir_entry: Mapping from 'patient' / 'study' / 'series' to the
            record providing the identifier named in ``group_by``.
    """
    for attribute_name in accumulate_attributes:
        for level, values_by_key in accumulators[attribute_name]:
            group_key = getattr(dcmdir_entry[level], group_by[level])
            value = getattr(dcm_image, attribute_name)
            # Multi-valued fields are converted to tuples so they can be set members.
            if type(value) in (list, pydicom.multival.MultiValue):
                value = tuple(value)
            seen_values = values_by_key.get(group_key)
            if seen_values is None:
                values_by_key[group_key] = {value}
            else:
                seen_values.add(value)
    
# Single-DICOMDIR alternative kept for reference:
#   visit_dicom_dir(base_dir, dicom_dir, accumulate_visitor)
# NOTE(review): max_dirs=1 presumably restricts the walk to one study directory.
visit_dicom_dir_collection(mri_data_root, accumulate_visitor, max_dirs=1)

# Print the collected values per grouping level, then their union over all patients.
for accumulate_attribute in accumulate_attributes:
    per_level = accumulators[accumulate_attribute]

    for dcm_type, accumulator in per_level:
        header = "{} grouped by {} ({} -> {}):".format(accumulate_attribute, dcm_type, group_by[dcm_type], accumulate_attribute)
        pprint(header, indent=12)
        pprint(accumulator, indent=12)

    pprint("{} aggregated for all patients:".format(accumulate_attribute), indent=12)
    # per_level[0] is the ('patient', {...}) pair; merge every patient's value set.
    accumulated_attribute_set = set().union(*per_level[0][1].values())
    pprint(accumulated_attribute_set, indent=12)


In [None]:
%%script false --no-raise-error

# Compute value range of selected DICOM header fields based on all DICOM files

# Header fields whose observed values are collected. Alternatives the author
# left commented out: 'MRAcquisitionType', 'ImageType', 'Modality',
# 'CardiacNumberOfImages', 'PatientAge'.
accumulate_attributes = ['ImageOrientationPatient', 'ImagePositionPatient']

# Grouping level -> attribute whose value identifies a group at that level.
group_by = dict(patient='PatientID',
                study='StudyInstanceUID',
                series='SeriesInstanceUID')

# attribute name -> ((level, {group key: set of observed values}), ...),
# with a fresh, independent dict for every (attribute, level) pair.
accumulators = {
    attribute_name: tuple((level, {}) for level in ('patient', 'study', 'series'))
    for attribute_name in accumulate_attributes
}

def accumulate_visitor(image_filename, dcm_image):
    """Visitor callback collecting header values of 3D MR magnitude/phase images.

    Images that fail the filter (MRAcquisitionType '3D', Modality 'MR' and
    'M' or 'P' in ImageType) are reported and skipped. For the remaining
    images, the value of every attribute in ``accumulate_attributes`` is
    added to the set stored under the image's patient, study and series
    identifier in ``accumulators``.

    Args:
        image_filename: Filename of the visited image; only used in the
            skip message.
        dcm_image: Image dataset providing the filter fields, the grouping
            identifiers named in ``group_by`` and the accumulated attributes.
    """
    # Apply filter
    is_selected = (dcm_image.MRAcquisitionType == '3D'
                   and ('M' in dcm_image.ImageType or 'P' in dcm_image.ImageType)
                   and dcm_image.Modality == 'MR')
    if not is_selected:
        # TODO: print range of headers of excluded images
        print(" " * 4 + "Skipping {} due to lack of phase/magnitude image type".format(image_filename))
        # Actually skip the image: without this return the rejected image
        # was still accumulated despite the message above.
        return

    for accumulate_attribute in accumulate_attributes:
        attribute = getattr(dcm_image, accumulate_attribute)
        # Multi-valued fields are converted to tuples so they can be set members.
        if isinstance(attribute, (list, pydicom.multival.MultiValue)):
            attribute = tuple(attribute)
        for dcm_type, accumulator in accumulators[accumulate_attribute]:
            key = getattr(dcm_image, group_by[dcm_type])
            accumulator.setdefault(key, set()).add(attribute)
    
# Single-DICOMDIR alternative kept for reference:
#   visit_dicom_dir(base_dir, dicom_dir, accumulate_visitor)
# NOTE(review): max_dirs=1 presumably restricts the walk to one study directory.
visit_dicom_file_collection(mri_data_root, accumulate_visitor, max_dirs=1)

# Print the collected values per grouping level, then their union over all patients.
for accumulate_attribute in accumulate_attributes:
    per_level = accumulators[accumulate_attribute]

    for dcm_type, accumulator in per_level:
        header = "{} grouped by {} ({} -> {}):".format(accumulate_attribute, dcm_type, group_by[dcm_type], accumulate_attribute)
        pprint(header, indent=12)
        pprint(accumulator, indent=12)

    pprint("{} aggregated for all patients:".format(accumulate_attribute), indent=12)
    # per_level[0] is the ('patient', {...}) pair; merge every patient's value set.
    accumulated_attribute_set = set().union(*per_level[0][1].values())
    pprint(accumulated_attribute_set, indent=12)