# DICOM FlowMRI Writer - output of DICOM files from hpc-predict-io HDF5-format

In [28]:
# Notebook parameters - to be moved into papermill parameters
# NOTE(review): hpc_predict_data_root is a hard-coded absolute path on the author's machine;
# it has to be overridden before the notebook can run anywhere else.
# All other *_root values are relative to hpc_predict_data_root; they are joined onto it
# in the path-adjustment cell further down.
hpc_predict_data_root = "/home/lukasd/src/hpc-predict/hpc-predict/data/v1" # e.g. ../../../data/v0
mri_data_root = "input_data/original/mri/MRT Daten Bern tar"
# Paths of the preprocessed HDF5 samples relative to mri_preprocessed_root; the part before
# the first '/' is used as the name of the MRI source (see mri_sources).
mri_samples = ['10/10-10000718-10000719.h5']
# True: each MRI source is expected as '<source>.tar', False: as a directory '<source>/'
tar_files = True
mri_preprocessed_root = "input_data/preprocessed/mri/MRT Daten Bern tar"
output_root = "output_data/clinical/testing/dicom-io/MRT Daten Bern"

In [29]:
# Shell-friendly renderings of the notebook parameters:
# the sample list becomes one space-separated string, the tar switch an optional flag.
mri_samples_bash = ' '.join(map(str, mri_samples))
if tar_files:
    tar_files_bash = '--tar'
else:
    tar_files_bash = ''
mri_samples_bash

'10/10-10000718-10000719.h5'

In [30]:
# Export the notebook parameters as environment variables so that shell cells (%%bash)
# can read them as ${hpc_predict_data_root}, ${mri_samples}, ...
%env hpc_predict_data_root = {hpc_predict_data_root}
%env mri_data_root = {mri_data_root}
%env mri_samples = {mri_samples_bash}
%env tar_files = {tar_files_bash}
%env mri_preprocessed_root = {mri_preprocessed_root}
%env output_root = {output_root}

env: hpc_predict_data_root=/home/lukasd/src/hpc-predict/hpc-predict/data/v1
env: mri_data_root=input_data/original/mri/MRT Daten Bern tar
env: mri_samples=10/10-10000718-10000719.h5
env: tar_files=--tar
env: mri_preprocessed_root=input_data/preprocessed/mri/MRT Daten Bern tar
env: output_root=output_data/clinical/testing/dicom-io/MRT Daten Bern


In [31]:
# Name of the MRI source of each sample = everything before the first '/' of its path
mri_sources = [sample.partition('/')[0] for sample in mri_samples]

# Adjust file paths

This script assumes that you have the Freiburg dataset available, otherwise first run `data/fetch_scripts/fetch_freiburg_to_tar.sh data/tmp data/v1`.

To run the following steps, the DICOM headers must first be converted to pandas tables, i.e. by running `convert_dicom_headers_to_pandas_df.py` as shown in the (commented-out) cell below.

In [32]:
# %%bash
# set -x
# python3  convert_dicom_headers_to_pandas_df.py --hpc-predict-data-root "${hpc_predict_data_root}" --mri-data-root "${mri_data_root}" --mri-samples ${mri_samples} ${tar_files} --output-root "${output_root}"
# set +x

In [33]:
import os
import tarfile
import sys
import json
import pydicom
import pandas as pd
import numpy as np
from pprint import pprint

if not '../../python' in sys.path:
    sys.path.append('../../python')
from mr_io import FlowMRI

In [34]:
# Possibly duplicate with code in convert_dicom_headers_to_pandas_df
#
# Turns the relative *_root parameters into paths below hpc_predict_data_root and checks
# that every expected input/output location exists.
# NOTE: this cell rebinds hpc_predict_data_root, mri_data_root, mri_preprocessed_root and
# output_root. With a relative hpc_predict_data_root, running it a second time prepends the
# root again; with an absolute root os.path.join discards the earlier component.

# If the data root contains an 'encrypt' folder, work on its sibling 'decrypt' folder
if os.path.exists(os.path.join(hpc_predict_data_root, 'encrypt')):
    hpc_predict_data_root = os.path.join(hpc_predict_data_root, 'decrypt')
    assert os.path.exists(hpc_predict_data_root)

mri_data_root = os.path.join(hpc_predict_data_root, mri_data_root)
assert os.path.exists(mri_data_root)

# Every MRI source must be present, either as '<source>.tar' or as directory '<source>/'
for sample in mri_sources:
    assert os.path.exists(os.path.join(mri_data_root, str(sample) + ('.tar' if tar_files else '/') ))
    
# Tar archive paths relative to the data root; only defined when tar_files is truthy
if tar_files:
    mri_source_tar_paths = [os.path.relpath(os.path.join(mri_data_root, str(sample) + '.tar'), start=hpc_predict_data_root) for sample in mri_sources]

mri_preprocessed_root = os.path.join(hpc_predict_data_root, mri_preprocessed_root)
assert os.path.exists(mri_preprocessed_root)

# Every preprocessed sample needs its HDF5 file and a JSON file of the same name next to it
for sample in mri_samples:
    assert os.path.exists(os.path.join(mri_preprocessed_root, sample))
    assert os.path.exists(os.path.join(mri_preprocessed_root, sample.replace('.h5','.json')))

# The output root itself must already exist
output_root = os.path.join(hpc_predict_data_root, output_root)
assert os.path.exists(output_root)

# Read FlowMRIs (and potentially original DICOM files)

In [35]:
# One entry per sample: the FlowMRI read from HDF5 ('h5') and the content of the JSON file
# that sits next to it ('json').
flow_mris = []

for sample in mri_samples:
    h5_path = os.path.join(mri_preprocessed_root, sample)
    json_path = os.path.join(mri_preprocessed_root, sample.replace('.h5','.json'))

    flow_mri = FlowMRI.read_hdf5(h5_path)
    with open(json_path, 'r') as json_f:
        metainfo = json.load(json_f)

    flow_mris.append({'h5': flow_mri, 'json': metainfo})

In [48]:
# Length of the first 'P_files' and of the first 'M_files' entry recorded in the JSON
# metainformation of the first sample (presumably phase and magnitude DICOM files - confirm).
# Only the value of the last expression is displayed by the notebook.
len(flow_mris[0]['json']['file_path']['P_files'][0])
len(flow_mris[0]['json']['file_path']['M_files'][0])

900

# Conversion to DICOM from hpc-predict-io HDF5

In [None]:
# Determine target folder
# NOTE: `sample` is the loop variable left over from the cell that reads the FlowMRIs,
# i.e. the last entry of mri_samples.
# Remove the '.h5' extension as a suffix. str.rstrip('.h5') strips any trailing run of the
# characters '.', 'h' and '5' instead and would truncate sample names whose stem ends in
# one of them (e.g. '...715.h5' -> '...71').
sample_stem = sample[:-len('.h5')] if sample.endswith('.h5') else sample
output_sample_dir = os.path.join(output_root, sample_stem)
# exist_ok=False: fail rather than write into an already existing output folder
os.makedirs(output_sample_dir, exist_ok=False)

In [None]:
# Try with highdicom

In [None]:
# Try with pydicom-seg

In [49]:
# Try with pure pydicom

In [None]:
# slice_location = geometry[2]
# calculate image position patient from old_image_position_origin + (geometry[2] - slice_location[0])*ort_unit
# Pixel Spacing = (geometry[i][-1] - geometry[i][0])/(len(geometry[i])-1)
# instance number: magnitude: t*num_slice_locations + z + 1, 
#                  phase: j*cardiac_number_of_images*num_slice_locations + t*num_slice_locations + z + 1
# trigger times
# Rows = len(geometry[0]) ?
# Columns = len(geometry[0]) ?
# Rescale Intercept/Slope ? Intercept = -max(abs(velocity_mean)), slope = 2*abs(Intercept)/2*Bits Stored(?)
# skip all other tags with multiplicity > 1 such as acquisition/content time (make up SOP Class UID etc. with uid.generate_uid)
# for those with multiplicity 1 select which ones to add 
#
# Create tags to annotate HPC-PREDICT information, such as code that was used to produce the data
#
# Question: how do we add overlays? 

In [None]:
# Select the rows of the per-series table that belong to the study/sequence of a flow MRI.
# NOTE(review): df_native_grouped_unique is only created in the group-by cell near the end of
# this notebook, so this cell fails on a fresh top-to-bottom run.
# NOTE(review): the flow_mris entries built above are dicts with the keys 'h5' and 'json' and
# there is one entry per sample; flow_mris[1]['study_instance_uid'] therefore does not match
# that structure (index 1 needs a second sample, and the key is not set) - confirm where the
# study instance UID and sequence name should come from.
df_native_grouped_unique_flow = df_native_grouped_unique[(df_native_grouped_unique['Study Instance UID'] == flow_mris[1]['study_instance_uid']) & (df_native_grouped_unique['Sequence Name'] == flow_mris[1]['sequence_name'])]
df_native_grouped_unique_flow

In [None]:

# Index the selected series by the third component of the first 'Image Type' entry, so that a
# series can be addressed as .loc['M'] below (presumably 'M' = magnitude, 'P' = phase - confirm).
df_native_grouped_unique_flow_reindexed = df_native_grouped_unique_flow.set_index(df_native_grouped_unique_flow['Image Type'].apply(lambda x: x[0][2]))
# df_native_grouped_unique_flow_reindexed.loc['M']['FilePath'][0]

# Drops every column that has at least one entry of length 1 and keeps the columns in which
# no entry is a length-1 container.
# NOTE(review): given the name 'singleton_labels' the opposite selection may have been
# intended - confirm.
singleton_labels = df_native_grouped_unique_flow.drop(df_native_grouped_unique_flow.columns[~df_native_grouped_unique_flow.applymap(lambda x: not(hasattr(x,'__len__') and len(x) == 1) ).all()], axis=1) #.columns.values
singleton_labels

In [None]:
# Columns that identify one DICOM series (the same list is defined again in the group-by cell
# further down).
group_by_series_keys = ['FileCollectionID', 'Instance Creation Date', 'Patient ID', 'Study Instance UID', 'Sequence Name', 'Series Instance UID', 'Image Type']
# df.groupby(group_by_series_keys)

# Get study instance UID, convert to pandas table, compute unique tags by base tag address, iterate over tags - for unique copy them to new dataset except if should be included (by second table tag -> name), for non-unique, replace them appropriately (name look-up, etc.) 

# NOTE(review): `df` is only loaded in the "Load pickled DataFrames" section below, so this
# cell depends on out-of-order execution.
# tag_to_name: for every column of df (a DICOM tag) the unique .name values of its non-null
# entries, transposed so that the tags form the index.
tag_to_name = df.apply( lambda c: c.dropna().apply(lambda x: x.name).unique(), axis=0).transpose()
# name_to_tag: the inverse look-up, indexed by name with the tag in column 'tag'
name_to_tag = tag_to_name.reset_index().rename(columns={0: 'name', 'index': 'tag'}).set_index('name')

# Only the last expression of the cell is displayed; the first one is evaluated and discarded.
tag_to_name[0]
name_to_tag['tag']['Study Instance UID']
#df[df.apply(lambda row: row['Study Instance UID'] == study_instance_uid and row['Sequence Name'] == sequence_name, axis=1)]

# df_hashified = df.applymap(lambda x: unpack_pydicom_value(x.value) if pd.notnull(x) else x)
# df_tag_to_name = df.applymap(lambda x: x.name if pd.notnull(x) else x)
# df_hashified.groupby(group_by_series_keys)

# # df_native_grouped = df_native_renamed.groupby(group_by_series_keys)
# # df_native_grouped.apply(lambda x: agg_unique(x))   


In [None]:
# Show the file meta information of the first file listed under 'FilePath' for the 'M' series.
# dicom_img[name_to_tag['tag']['SOP Class UID']]
pydicom.dcmread(df_native_grouped_unique_flow_reindexed.loc['M']['FilePath'][0]).file_meta


In [None]:
# Number of distinct 'Specific Character Set' values within the 'M' series; the simulation
# cell below applies the same len(...) == 1 test to decide whether an element can be copied.
len(df_native_grouped_unique_flow_reindexed.loc['M','Specific Character Set'])

In [None]:
# Dry run of the DICOM export: walk over the elements of one source image and print how each
# of them would be handled. Nothing is written yet.
#
# Draft for the file meta information (not active yet):
# file_meta = pydicom.dataset.FileMetaDataset()
# file_meta.MediaStorageSOPClassUID = dicom_img.file_meta[0x00020002].value
# file_meta.ImplementationClassUID = PYDICOM_IMPLEMENTATION_UID

# First get the study (flow MRI needs to describe data it comes from...)

# for series in source study, open first image as dicom_img
dcm_img = pydicom.dcmread(df_native_grouped_unique_flow_reindexed.loc['M']['FilePath'][0])
# create file_meta_ds/file_ds
ds = pydicom.dataset.Dataset()

# Elements that have a single value per series but must not be copied verbatim
# (more? for UIDs can just replace dcm-che prefix by that of pydicom implementation)
special_treatment_names = ('Referenced Image Sequence', 'Study Instance UID', 'Series Instance UID',
                           'Rows', 'Columns', 'Pixel Spacing')

print("### Simulating writing of DICOM ###")
for el in dcm_img: # also include dcm_img.file_meta later
    print(el.name, flush=True)

    if el.name == 'Pixel Data':
        print("Writing Pixel Data")
        continue

    # FIXME: use el.tag instead of el.name (compute unique values by tags)
    if len(df_native_grouped_unique_flow_reindexed.loc['M', el.name]) != 1:
        # probably one of the tags that has to be computed... (see above for the manual)
        print("Element {} has different values".format(el.name))
        continue

    if el.name in special_treatment_names:
        print("Special treatment for {} ".format(el.name))
    else:
        print("Copying unique element {} ".format(el.name))
        # file_ds.add(el.tag, el.VR, el.value) # module modifications...

            

### Load pickled DataFrames and metainformation

In [None]:
# Load the VENC metainformation stored next to the original MRI data.
# mri_data_root has already been joined onto hpc_predict_data_root in the path-adjustment
# cell, so it must not be prefixed again: the former double join only worked because
# os.path.join discards earlier components when a later one is absolute, and it produced a
# wrong path for a relative data root such as ../../../data/v0.
with open(os.path.join(mri_data_root, "venc.json"), 'r') as f:
    venc = json.load(f)
venc

In [None]:
# Pickled DICOM header tables, one per MRI source directory, ordered numerically by source.
# The paths are derived from mri_sources (e.g. '10'), not from mri_samples
# (e.g. '10/10-10000718-10000719.h5'): with a sample path the parent directory name is not an
# integer and the int(...) sort key raises a ValueError. A set is used so that a source shared
# by several samples is loaded only once.
# NOTE(review): assumes convert_dicom_headers_to_pandas_df.py wrote
# <output_root>/<source>/dicom_header_df.pkl - confirm against that script.
pkls = sorted({os.path.join(output_root, str(source), "dicom_header_df.pkl") for source in mri_sources}, key=lambda x: int(os.path.basename(os.path.dirname(x))) ) # os.path.basename(x).split('.')[0]
pkls

In [None]:
# Read pandas DataFrame that contains pydicom DataElement objects in individual entries 
# The tables of all pickles are stacked row-wise.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from trusted runs.
df = pd.concat([pd.read_pickle(pkl) for pkl in pkls],axis=0)

In [None]:
# A pydicom DataElement: the value is held in the _value attribute,
# often in "semi-serialized" representation with the VR describing the DICOM type.
# Shown here for the first cell of the table as an example.
vars(df.iloc[0,0])

### Use names instead of DICOM tags as column labels

In [None]:
## Compute the DICOM tag -> name mapping to rename columns
# For every column (tag) collect the distinct element names of its non-null entries ...
tag_names = df.apply(lambda column: column.dropna().apply(lambda element: element.name).unique(), axis=0)
# ... each tag must map to exactly one name ...
assert tag_names.shape[0] == 1
# ... so the single remaining row is the tag -> name mapping.
tag_names = tag_names.loc[0]
#tag_names.to_dict()

In [None]:
# Same table as df, with the DICOM tags in the column labels replaced by the element names
df_renamed = df.rename(columns=tag_names.to_dict())

# display only values
# (limits the number of displayed columns to 20; the setting is global and stays in effect
# until it is changed again)
pd.set_option('display.max_columns', 20)
df_renamed.applymap(lambda x: x.value if pd.notnull(x) else x )

### Determine range of DICOM tags (via unique rows, grouped by patient ID > study UID > series UID)

In [None]:
# Determine types of DataElement.value entries
from collections import namedtuple, OrderedDict
from types import SimpleNamespace
import json

# Overview of the distinct (VR, Python type of value) pairs that occur in each column.
# All columns are displayed (display.max_columns is a global setting and stays in effect).
pd.set_option('display.max_columns', None)
# DataFrame.items() is used instead of DataFrame.iteritems(), which was removed in pandas 2.0.
pd.concat(
    [pd.Series(name=name, data=col.dropna().unique())
     for (name, col) in df_renamed.applymap(lambda x: (x.VR, type(x.value)) if pd.notnull(x) else x).items()],
    axis=1)

In [None]:
# Extract native Python datatypes (built-in/standard library) from pydicom DataElements
def get_native_from_pydicom(val):
    """Convert a pydicom value object into a built-in/standard-library Python object.

    Conversions (checked in this order):
      DA -> datetime.date, DT -> datetime.datetime, TM -> datetime.time,
      DSfloat -> float, DSdecimal -> decimal.Decimal, IS -> int,
      MultiValue -> tuple of recursively converted items, UID -> str.
    PersonName, str, int and float values are returned unchanged, a list is returned as a
    tuple of its (unconverted) items, and any other value is returned as is.
    """
    from datetime import date, datetime, time
    from decimal import Decimal

    from pydicom.multival import MultiValue
    from pydicom.uid import UID
    from pydicom.valuerep import DA, DT, TM, DSfloat, DSdecimal, IS, PersonName

    # Date and time representations
    if isinstance(val, DA):
        return date(val.year, val.month, val.day)
    if isinstance(val, DT):
        return datetime(val.year, val.month, val.day,
                        val.hour, val.minute, val.second,
                        val.microsecond, val.tzinfo)
    if isinstance(val, TM):
        return time(val.hour, val.minute, val.second,
                    val.microsecond)

    # Numeric string representations
    if isinstance(val, DSfloat):
        return float(val)
    if isinstance(val, DSdecimal):
        return Decimal(val)
    if isinstance(val, IS):
        return int(val)

    # Multi-valued elements are converted item by item
    if isinstance(val, MultiValue):
        return tuple(get_native_from_pydicom(item) for item in val)

    if isinstance(val, UID):
        return str(val)
    if isinstance(val, (PersonName, str, int, float)):
        return val
    if isinstance(val, list):
        return tuple(val)
    return val

def unpack_pydicom_value(value):
    """Turn the value of a pydicom DataElement into a nested structure of plain objects.

    - objects whose type is defined in a ``pydicom`` module are handed to
      ``get_native_from_pydicom``
    - a list becomes a tuple of recursively unpacked items
    - an OrderedDict becomes a tuple of the unpacked ``.value`` of each of its values
    - any other dict becomes a tuple of its ``(key, value)`` items
    - everything else is returned unchanged
    """
    if getattr(type(value), '__module__', '').startswith('pydicom'):
        return get_native_from_pydicom(value)
    if isinstance(value, list):
        return tuple(unpack_pydicom_value(item) for item in value)
    # OrderedDict has to be tested before dict, of which it is a subclass
    if isinstance(value, OrderedDict):
        return tuple(unpack_pydicom_value(entry.value) for entry in value.values())
    if isinstance(value, dict):
        return tuple(value.items())
    return value
    
def agg_unique(group):
    """Collapse a group of rows into a single row of per-column unique values.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group, e.g. as passed in by ``DataFrameGroupBy.apply``.

    Returns
    -------
    pandas.DataFrame
        One-row frame with the column labels of ``group``; every cell holds the array
        returned by ``Series.dropna().unique()`` for that column.
    """
    labels = []
    row = []
    # DataFrame.items() replaces DataFrame.iteritems(), which was removed in pandas 2.0
    for (name, col) in group.items():
        labels.append(name)
        row.append(col.dropna().unique())

    return pd.DataFrame([row], columns=labels)

In [None]:
# Replace every non-null DataElement by its value, unpacked into plain Python objects via
# unpack_pydicom_value; null cells are kept as they are.
df_native_renamed = df_renamed.applymap(lambda x: unpack_pydicom_value(x.value) if pd.notnull(x) else x)

## Group-by patient/study/series ID

In [None]:
# Columns that together identify one DICOM series
group_by_series_keys = [
    'FileCollectionID', 'Instance Creation Date', 'Patient ID', 'Study Instance UID',
    'Sequence Name', 'Series Instance UID', 'Image Type',
]

# One row per series; every cell holds the unique values found in that series
df_native_grouped = df_native_renamed.groupby(group_by_series_keys)
df_native_grouped_unique = df_native_grouped.apply(agg_unique)

## All tags with at most one unique value per series in every series

In [None]:
# Boolean per column: True if the column has at most one unique value in every series
df_native_grouped_unique_singleton_cols = df_native_grouped_unique.applymap(lambda x: len(x) <= 1).all()
# Show only those columns, with length-1 arrays replaced by their single element
# (empty arrays are left as they are).
df_native_grouped_unique.drop(
    df_native_grouped_unique.columns[~df_native_grouped_unique_singleton_cols], axis=1).applymap(lambda x: x[0] if len(x) == 1 else x)

## ...and number of unique values

In [None]:
# For the remaining columns (more than one unique value in at least one series):
# the number of unique values per series.
df_native_grouped_unique.drop(
    df_native_grouped_unique.columns[df_native_grouped_unique_singleton_cols],axis=1).applymap(lambda x: len(x))

## All tags with more than one unique value in at least one series

In [None]:
# The values of the non-singleton columns themselves (length-1 arrays unpacked to their single
# element); the 'FilePath' column is left out of the display.
df_native_grouped_unique.drop(
    df_native_grouped_unique.columns[df_native_grouped_unique_singleton_cols],axis=1).applymap(
    lambda x: x[0] if len(x) == 1 else x).drop(['FilePath'], axis=1)