# Exploratory data analysis with Pandas - small number of studies

In [None]:
import os

# Root of the HPC-PREDICT data directory, e.g. ../../../data/v0.
# Set the HPC_PREDICT_DATA_DIR environment variable to point the notebook at another checkout
# without editing this cell; the previous hard-coded location remains the default.
hpc_predict_data_dir = os.environ.get("HPC_PREDICT_DATA_DIR", "/home/lukasd/src/hpc-predict/data/v0")

This script assumes that you have the Freiburg dataset available, otherwise first run `data/fetch_scripts/fetch_freiburg.sh`.

To run the following steps, the DICOM headers must previously be converted to pandas tables, i.e.

```
convert_dicom_to_pandas.py --mri-data-root "${hpc_predict_data_dir}/input_data/original/mri/MRT Daten Bern" --mri-samples ... --output-root "${hpc_predict_data_dir}/input_data/original/mri/MRT Daten Bern DICOM Header"
```

### Import pickled DataFrames

In [None]:
from glob import glob
import os

# Collect the pickled DICOM header tables and order them numerically by the integer file stem
pkl_pattern = hpc_predict_data_dir + "/input_data/preprocessed/mri/MRT Daten Bern DICOM Header/3.pkl"
pkls = glob(pkl_pattern)
pkls.sort(key=lambda path: int(os.path.basename(path).partition('.')[0]))
pkls

In [None]:
import pydicom
import pandas as pd
import numpy as np
from pprint import pprint

# Load the pandas DataFrames that contain pydicom DataElement entries (generated with
# convert_dicom_to_pandas.py) and stack them row-wise into a single table.
# NOTE(review): read_pickle executes arbitrary code on load - only use pickles you produced yourself.
header_tables = [pd.read_pickle(pkl_path) for pkl_path in pkls]
df = pd.concat(header_tables, axis=0)

In [None]:
# Individual entries are pydicom DataElements; display the one in the first row and first column
df.iloc[0,0]

In [None]:
# The value is held in the _value attribute, often in a "semi-serialized" representation, with
# the VR describing the DICOM type; vars() lists the instance attributes of the first entry
vars(df.iloc[0,0])

### Use names instead of DICOM tags as column labels

In [None]:
## Compute the DICOM tag -> name mapping used to rename the columns
tag_names = df.apply(
    lambda column: column.dropna().apply(lambda element: element.name).unique(),
    axis=0,
)
# Every tag must carry exactly one name, so that the result collapses to a single row
assert tag_names.shape[0] == 1
tag_names = tag_names.loc[0]
#tag_names.to_dict()

In [None]:
# Relabel the columns from DICOM tags to their names
column_labels = tag_names.to_dict()
df_renamed = df.rename(columns=column_labels)

# Display only the values of the DataElements (missing entries are shown unchanged)
pd.set_option('display.max_columns', 20)
df_renamed.applymap(lambda element: element if pd.isnull(element) else element.value)

### Determine range of DICOM tags (via unique rows, grouped by patient ID > study UID > series UID)

In [None]:
# Determine types of DataElement.value entries
from collections import namedtuple, OrderedDict
from types import SimpleNamespace
import json

pd.set_option('display.max_columns', None)

# Map every DataElement to the pair (VR, Python type of its value); missing entries stay as they are
value_types = df_renamed.applymap(lambda x: (x.VR, type(x.value)) if pd.notnull(x) else x)

# One column per tag listing the distinct (VR, type) pairs observed for it.
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
pd.concat([pd.Series(name=name, data=col.dropna().unique()) for (name, col) in value_types.items()], axis=1)

In [None]:
# Extract native Python datatypes (built-in/standard library) from pydicom DataElements
def get_native_from_pydicom(val):
    """Convert a pydicom value into a plain Python object.

    Date, date-time and time values become ``datetime`` objects, decimal and integer
    strings become ``float``/``Decimal``/``int``, UIDs become ``str``, and multi-valued
    entries and lists become tuples (multi-valued entries are converted element by
    element). Anything else is returned unchanged.
    """
    from pydicom.valuerep import DA, DT, TM, DSfloat, DSdecimal, IS, PersonName
    from pydicom.uid import UID
    from pydicom.multival import MultiValue
    from datetime import date, datetime, time
    from decimal import Decimal

    # The order of the checks matters, since several pydicom types derive from built-in types
    if isinstance(val, DA):
        return date(val.year, val.month, val.day)
    if isinstance(val, DT):
        return datetime(val.year, val.month, val.day,
                        val.hour, val.minute, val.second,
                        val.microsecond, val.tzinfo)
    if isinstance(val, TM):
        return time(val.hour, val.minute, val.second, val.microsecond)
    if isinstance(val, DSfloat):
        return float(val)
    if isinstance(val, DSdecimal):
        return Decimal(val)
    if isinstance(val, IS):
        return int(val)
    if isinstance(val, MultiValue):
        return tuple(get_native_from_pydicom(item) for item in val)
    if isinstance(val, UID):
        return str(val)
    if isinstance(val, (PersonName, str, int, float)):
        return val
    if isinstance(val, list):
        return tuple(val)
    return val

def unpack_pydicom_value(value):
    """Turn the value of a DataElement into a hashable native Python object.

    Values whose type is defined in a ``pydicom`` module are converted with
    ``get_native_from_pydicom``. Lists are unpacked recursively into tuples, an
    ``OrderedDict`` becomes the tuple of the unpacked ``.value`` of its entries, and any
    other ``dict`` becomes the tuple of its items. Everything else is returned as is.
    """
    value_type = type(value)
    if hasattr(value_type, '__module__') and value_type.__module__.startswith('pydicom'):
        return get_native_from_pydicom(value)
    if isinstance(value, list):
        return tuple(unpack_pydicom_value(item) for item in value)
    # OrderedDict must be tested before dict, since it is a dict subclass
    if isinstance(value, OrderedDict):
        return tuple(unpack_pydicom_value(element.value) for element in value.values())
    if isinstance(value, dict):
        return tuple(value.items())
    return value
    
def agg_unique(group):
    """Collapse a group of rows into a single row of unique values per column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows that belong together (e.g. one group of a ``groupby``).

    Returns
    -------
    pandas.DataFrame
        A one-row DataFrame with the columns of ``group``; each cell holds the array of
        distinct non-null values found in that column.
    """
    labels = []
    row = []
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0
    for (name, col) in group.items():
        labels.append(name)
        row.append(col.dropna().unique())

    return pd.DataFrame([row], columns=labels)

In [None]:
# Replace every DataElement by the native Python representation of its value
df_native_renamed = df_renamed.applymap(
    lambda element: element if pd.isnull(element) else unpack_pydicom_value(element.value))

# Columns whose combination identifies an individual series
group_by_series_keys = [
    'FileCollectionID',
    'Instance Creation Date',
    'Patient ID',
    'Study Instance UID',
    'Sequence Name',
    'Series Instance UID',
    'Image Type',
]

# One row per series holding the unique values of every tag within that series
df_native_grouped = df_native_renamed.groupby(group_by_series_keys)
df_native_grouped_unique = df_native_grouped.apply(agg_unique)

In [None]:
# Display all tags that have at most one unique value within every series

df_native_grouped_unique_singleton_cols = df_native_grouped_unique.applymap(lambda x: len(x) <= 1).all()
multi_valued_labels = df_native_grouped_unique.columns[~df_native_grouped_unique_singleton_cols]
# Unwrap single-element arrays for readability, keep empty ones as they are
df_native_grouped_unique.drop(multi_valued_labels, axis=1).applymap(
    lambda values: values if len(values) != 1 else values[0])

In [None]:
#list(df_native_grouped_unique['De-identification Method Code Sequence'])

In [None]:
# Number of unique values per series for all tags that vary within at least one series
single_valued_labels = df_native_grouped_unique.columns[df_native_grouped_unique_singleton_cols]
df_native_grouped_unique.drop(single_valued_labels, axis=1).applymap(len)

In [None]:
# Display all tags with more than a single unique value in at least one series

single_valued_labels = df_native_grouped_unique.columns[df_native_grouped_unique_singleton_cols]
varying_tags = df_native_grouped_unique.drop(single_valued_labels, axis=1)
# Unwrap single-element arrays for readability and hide the file paths
varying_tags.applymap(lambda values: values if len(values) != 1 else values[0]).drop(['FilePath'], axis=1)

### Analysis: Referenced Image Sequence tag

In [None]:
# Take an example UID from the Referenced Image Sequence of the first row
first_referenced_sequence = df_native_renamed['Referenced Image Sequence'].iloc[0]
example_referenced_uid = first_referenced_sequence[0][1]

# Check if we can find the referenced image sequence
print("The string length of a Referenced Image Sequence UID is {}: {}.".format(
    len(example_referenced_uid), example_referenced_uid))
        
def get_uid_prefix(uid):
    """Return the part of ``uid`` before its last '.' (empty string if there is no '.')."""
    prefix, _, _ = uid.rpartition('.')
    return prefix
example_referenced_uid_prefix = get_uid_prefix(example_referenced_uid)
print("Example referenced UID prefix is {}".format(example_referenced_uid_prefix))

# For prefix lengths from the dotted prefix up to 9 characters beyond it, compute the share of
# SOP Instance UIDs that start with the same characters as the example referenced UID
sop_instance_uids = df_native_renamed['SOP Instance UID']
shares = []
for prefix_len in range(len(example_referenced_uid_prefix), len(example_referenced_uid_prefix)+10):
    has_same_prefix = sop_instance_uids.apply(lambda x: x[:prefix_len]) == example_referenced_uid[:prefix_len]
    shares.append(str(100.*has_same_prefix.sum()/df_native_renamed.shape[0]) + " %")

print("Share of SOP Instance UIDs with same prefix up to and after last point: {}.".format(', '.join(shares)))

Conclusion: Referenced Image Sequence Tag identical before last point, afterwards ~random - the prefix 1.2.40.0.13.1 is taken from the dcm4che DICOM implementation, the remainder is a randomly generated UID.

### Analysis of time (Acquisition time, Content time, Trigger time and Nominal interval)

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Larger default font for all figures in this notebook
plt.rcParams['font.size'] = 14

In [None]:
import datetime  
from types import SimpleNamespace

In [None]:
# Figure title listing the fields that make up each legend entry (see group_by_series_legend).
# This relies on the 7-element group_by_series_keys defined with the series grouping; later
# cells reassign that name to a shorter list, so run this cell before them.
title_fields = [group_by_series_keys[k] for k in (0, 1, 4, 6)] + [
    'Nominal Interval',
    'Cardiac Number of Images',
    '[P-index]',
]
group_by_series_title = "Group by series UID plot ({})".format(" / ".join(title_fields))

def legend_unique_values(ls):
    """Format an array of unique values for a legend: the value itself if there is exactly
    one, otherwise the range "min..max"."""
    if len(ls) == 1:
        return str(ls[0])
    return "{}..{}".format(ls.min(), ls.max())

def group_by_series_legend(name, group, j=None):
    """Build the legend entry for one series group.

    ``name`` is the group key (indexed like the 7-element ``group_by_series_keys``),
    ``group`` the rows of the series and ``j`` an optional sub-series index that is
    appended as ", P-<j>".
    """
    nominal_intervals = group['Nominal Interval'].unique() #.apply(lambda x: x.original_string).unique()
    cardiac_number_of_images = group['Cardiac Number of Images'].unique() #.apply(lambda x: x.original_string).unique()

    parts = [
        str(name[0]),
        str(name[1]),
        name[4],
        str(name[6]),
        "NomInt {} ms".format(legend_unique_values(nominal_intervals)),
        "CardImgs {}".format(legend_unique_values(cardiac_number_of_images)),
    ]
    legend = ", ".join(parts)
    if j is not None:
        legend += ", P-{}".format(j)
    return legend

In [None]:
#from inspect import signature

# groups = df.\
#     applymap(lambda x: unpack_pydicom_value(x.value) if pd.notnull(x) else x).\
#     groupby(group_by_series_keys)

def make_group_by_plot(groups, x_series, x_label, y_series, y_label):
    """Scatter-plot ``y_series(group)`` against ``x_series(group)``, one subplot per group.

    Parameters
    ----------
    groups : pandas groupby object
        Groups to plot; the group key is passed on to ``group_by_series_legend``.
    x_series, y_series : callable
        Map the rows of a group to the values plotted on the x and y axis.
    x_label, y_label : str
        Axis labels applied to every used subplot.

    Groups whose first 'Image Type' contains 'P' are drawn as three separate sub-series,
    obtained by splitting the instance numbers into thirds.
    """
    ngroups = len(groups.groups)
    ncols = 3
    nrows = (ngroups+3-1)//ncols

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=True, figsize=(12*ncols, 8*nrows))
    axes = axes.flatten()
    marker_style = dict(ms=6, linewidth=1)

    for i, (name, group) in enumerate(groups):
        axis = axes[i]

        if 'P' in group['Image Type'].iloc[0]:
            n_instances = group.shape[0]
            for j in range(3):
                # Instance numbers in (lower, upper] belong to the j-th sub-series
                lower, upper = j*n_instances//3, (j+1)*n_instances//3
                in_subseries = group['Instance Number'].apply(lambda x: lower < x and x <= upper)
                axis.plot(x_series(group[in_subseries]),
                          y_series(group[in_subseries]),
                          'o',
                          label=group_by_series_legend(name, group, j),
                          **marker_style)
        else:
            axis.plot(x_series(group),
                      y_series(group),
                      'o',
                      label=group_by_series_legend(name, group),
                      **marker_style)

    # Only the subplots that actually show a group get labels, a legend and a grid
    for axis in axes[:ngroups]:
        axis.set_xlabel(x_label)
        axis.set_ylabel(y_label)

        axis.legend(bbox_to_anchor=(-0.3, 1.04), loc="lower left")
        axis.grid()
    fig.suptitle(group_by_series_title)
    fig.tight_layout(pad=3.)

### Acquisition Time vs. Trigger Time

In [None]:
def acquisition_datetime(df):
    """Combine 'Series Date' and 'Acquisition Time' of every row into one datetime."""
    def combine(row):
        day = row['Series Date']
        clock = row['Acquisition Time']
        return datetime.datetime(day.year, day.month, day.day,
                                 clock.hour, clock.minute, clock.second,
                                 clock.microsecond, clock.tzinfo)

    return df.apply(combine, axis=1)

def acqusition_datetime_offset(df):
    """Acquisition datetime of every row relative to the earliest one in ``df``."""
    timestamps = acquisition_datetime(df)
    earliest = timestamps.min()
    return timestamps - earliest


def group_x_series(group):
    """Trigger Time of every instance in the group."""
    return group['Trigger Time']

x_label = 'Trigger Time [ms]'

def group_y_series(group):
    """Acquisition time offset in ms, shifted by the minimum Trigger Time of the group."""
    offset_ms = acqusition_datetime_offset(group).apply(lambda delta: delta.total_seconds()*1e3)
    return offset_ms + group['Trigger Time'].min()

y_label = 'Acquisition Time (offset to min) [ms]'
make_group_by_plot(df_native_grouped, group_x_series, x_label, group_y_series, y_label)

It seems that the Trigger Time and the Acquisition Time coincide up to an offset in the Acquisition Time. Also, they do not cover the entire heart cycle but only a bit more than its first half. This could be the case because the acquisition may be a prospective heart MRI, as explained [here](http://mriquestions.com/gating-parameters.html).

In [None]:
# Trigger time vs. nominal interval: largest trigger time per series next to its nominal interval(s)
max_trigger_time = df_native_grouped_unique[['Trigger Time']].applymap(np.max)
nominal_interval = df_native_grouped_unique[['Nominal Interval']]
pd.concat([max_trigger_time, nominal_interval], axis=1)

### Content Time vs. Trigger Time

In [None]:
def content_datetime(df):
    """Combine 'Series Date' and 'Content Time' of every row into one datetime."""
    def combine(row):
        day = row['Series Date']
        clock = row['Content Time']
        return datetime.datetime(day.year, day.month, day.day,
                                 clock.hour, clock.minute, clock.second,
                                 clock.microsecond, clock.tzinfo)

    return df.apply(combine, axis=1)

def content_datetime_offset(df):
    """Content datetime of every row relative to the earliest one in ``df``."""
    timestamps = content_datetime(df)
    earliest = timestamps.min()
    return timestamps - earliest

def group_x_series(group):
    """Trigger Time of every instance in the group."""
    return group['Trigger Time']

x_label = 'Trigger Time [ms]'

def group_y_series(group):
    """Content time offset in ms, shifted by the minimum Trigger Time of the group."""
    offset_ms = content_datetime_offset(group).apply(lambda delta: delta.total_seconds()*1e3)
    return offset_ms + group['Trigger Time'].min()

y_label = 'Content Time (offset to min) [ms]'
make_group_by_plot(df_native_grouped, group_x_series, x_label, group_y_series, y_label)

It seems that the Content Time refers to the time when the MRI signal was recorded - the entire sequence is spread over more than 40 seconds. No structure is apparent from these plots.

### Trigger Time vs. Instance Number

In [None]:
def group_x_series(group):
    """Instance Number of every instance in the group."""
    return group['Instance Number']

x_label = 'Instance Number'

def group_y_series(group):
    """Trigger Time of every instance in the group."""
    return group['Trigger Time']

y_label = 'Trigger Time [ms]'
make_group_by_plot(df_native_grouped, group_x_series, x_label, group_y_series, y_label)

These plots clearly suggest that the P-series contain 3 sub-series that are sorted according to instance number (the visible plateau is due to multiple slices per fixed time being recorded one after another). 

### Content Time vs. Instance Number

In [None]:
def group_x_series(group):
    """Instance Number of every instance in the group."""
    return group['Instance Number']

x_label = 'Instance Number'

def group_y_series(group):
    """Content time offset in ms, shifted by the minimum Trigger Time of the group."""
    offset_ms = content_datetime_offset(group).apply(lambda delta: delta.total_seconds()*1e3)
    return offset_ms + group['Trigger Time'].min()

y_label = 'Content Time (offset to min) [ms]'
make_group_by_plot(df_native_grouped, group_x_series, x_label, group_y_series, y_label)


No correlation between content time and instance number.

### Instance Creation Time vs. Instance Number

In [None]:
def instance_creation_datetime(df):
    """Combine 'Series Date' and 'Instance Creation Time' of every row into one datetime.

    Returns a Series (indexed like ``df``) of datetimes built from the year/month/day of
    'Series Date' and the time fields (including tzinfo) of 'Instance Creation Time'.
    """
    return df.apply(
        lambda x: datetime.datetime(x['Series Date'].year, 
                                    x['Series Date'].month, 
                                    x['Series Date'].day, 
                                    x['Instance Creation Time'].hour, 
                                    x['Instance Creation Time'].minute,
                                    x['Instance Creation Time'].second, 
                                    x['Instance Creation Time'].microsecond, 
                                    x['Instance Creation Time'].tzinfo ), axis=1)

def instance_creation_datetime_offset(df):
    """Instance creation datetime of every row relative to the earliest one in ``df``.

    Returns a Series of timedeltas. This used to be computed from the Content Time
    (copy-paste of ``content_datetime_offset``), so the "Instance Creation Time" plot
    showed the Content Time instead.
    """
    instance_creation_datetime_series = instance_creation_datetime(df)
    return instance_creation_datetime_series - instance_creation_datetime_series.min()

def group_x_series(group):
    """Instance Number of every instance in the group."""
    return group['Instance Number']

x_label = 'Instance Number'

def group_y_series(group):
    """Value of instance_creation_datetime_offset in ms, shifted by the minimum Trigger Time."""
    offset_ms = instance_creation_datetime_offset(group).apply(lambda delta: delta.total_seconds()*1e3)
    return offset_ms + group['Trigger Time'].min()

y_label = 'Instance Creation Time (offset to min) [ms]'
make_group_by_plot(df_native_grouped, group_x_series, x_label, group_y_series, y_label)

### SOP Instance UID vs. Instance Number

In [None]:
def group_x_series(group):
    """Instance Number of every instance in the group."""
    return group['Instance Number']

x_label = 'Instance Number'

def group_y_series(group):
    """Integer value of the last dot-separated component of every SOP Instance UID."""
    return group['SOP Instance UID'].apply(lambda uid: int(uid.split('.')[-1]))

y_label = 'SOP Instance UID'
make_group_by_plot(df_native_grouped, group_x_series, x_label, group_y_series, y_label)


In [None]:
# Unique SOP Instance UID prefixes (everything before the last '.')
sop_instance_uid_prefixes = df_native_renamed['SOP Instance UID'].apply(get_uid_prefix)
sop_instance_uid_prefixes.unique()

The SOP Instance UID seems to be an Implementation label for dcm4che followed by a randomly generated UID with no correlation with Instance Number.

### Slice Location vs. Instance Number

In [None]:
def group_x_series(group):
    """Instance Number of every instance in the group."""
    return group['Instance Number']

x_label = 'Instance Number'

def group_y_series(group):
    """Slice Location of every instance in the group."""
    return group['Slice Location']

y_label = 'Slice Location'
make_group_by_plot(df_native_grouped, group_x_series, x_label, group_y_series, y_label)

### Slice Location (col x row coord sys) - Slice Location (DICOM) vs. Instance Number

In [None]:
def group_x_series(group):
    """Instance Number of every instance in the group."""
    return group['Instance Number'] #.apply(lambda x: float(x.original_string))

x_label = 'Instance Number'

def slice_location_image_coord_sys(group):
    """Project the image positions of a series onto the normal of its image plane.

    The normal is ``cross(col_unit, row_unit)``, where ``row_unit`` and ``col_unit`` are
    the first and last three entries of the (unique) 'Image Orientation (Patient)'.
    Positions are taken relative to the 'Image Position (Patient)' of the instance with
    the smallest 'Instance Number'.
    """
    first_instance = group['Instance Number'].idxmin()
    seq_origin = np.array(group.loc[first_instance, 'Image Position (Patient)'])

    orientations = group['Image Orientation (Patient)'].unique()
    assert len(orientations) == 1
    direction_cosines = orientations[0]

    row_unit = np.array(direction_cosines[:3])
    col_unit = np.array(direction_cosines[3:])
    ort_unit = np.cross(col_unit, row_unit)

    def project_on_normal(position):
        return np.dot(ort_unit, np.array(position) - seq_origin)

    return group['Image Position (Patient)'].apply(project_on_normal)

def slice_location_diff(group):
    """Difference between the slice location derived from the image positions
    (see slice_location_image_coord_sys) and the DICOM 'Slice Location' tag."""
    computed_location = slice_location_image_coord_sys(group)
    dicom_location = group['Slice Location'].apply(lambda x: np.array(x))
    return computed_location - dicom_location

# Plot the difference between the computed and the DICOM slice location against the x-series
# (group_x_series / x_label) defined above
group_y_series = slice_location_diff
y_label = 'Slice Location (col x row) - Slice Location (DICOM)'

make_group_by_plot(df_native_grouped, group_x_series, x_label, group_y_series, y_label)

The coordinate system of the DICOM images produces the same alignment (with "Slice Location" as z-axis) when taking the column direction as the primary and row-direction as the secondary axis (i.e. x runs along the columns, y along the rows).

### Check orthogonality of image planes and relative image offset vectors

In [None]:
# for i, (name, group) in enumerate(pd.concat([df_renamed, instance_creation_datetime], axis=1).\
#     applymap(lambda x: unpack_pydicom_value(x.value) if pd.notnull(x) else x).\
#     groupby(['FileCollectionID', 'Instance Creation Date', 'Patient ID', 'Study Instance UID', 'Series Instance UID', 'Image Type', 'Sequence Name'])): # group['Instance Creation Date'].iloc[0]

def print_if_above_numeric_tol(name, value, tol=1e-11):
    """Print a FAIL line if ``value`` is greater than ``tol`` and an OK line otherwise."""
    template = "    FAIL: {} = {} exceeds numeric tolerance " if value > tol else "    OK:   {} = {} "
    print(template.format(name, value))
            

# For every series (and every third of a 'P' series) check that the offsets between the image
# positions of consecutive instances have no component along the row and column directions
for i, (name, group) in enumerate(df_native_grouped):

    orientation = group['Image Orientation (Patient)'].unique()
    assert len(orientation) == 1

    row_unit = np.array(orientation[0][:3])
    col_unit = np.array(orientation[0][3:])
    ort_unit = np.cross(col_unit, row_unit)

    is_phase_series = 'P' in group['Image Type'].iloc[0]

    # 'P' series are checked per sub-series (thirds of the instance numbers), others as a whole
    for j in (range(3) if is_phase_series else [None]):
        if j is None:
            print(name[0], name[1], name[4], name[6])
            instances = group
        else:
            print(name[0], name[1], name[4], name[6], j)
            select_subgroup = group['Instance Number'].apply(
                lambda x: j*group.shape[0]//3 < x and x <= (j+1)*group.shape[0]//3)
            instances = group[select_subgroup]

        positions = instances.sort_values('Instance Number')['Image Position (Patient)'].apply(lambda x: np.array(x))
        if len(positions) > 1:
            position_array = positions.to_numpy()
            offsets = np.vstack(position_array[1:] - position_array[:-1])
            print_if_above_numeric_tol("max(dot(col_unit, patient_position_offset))", np.abs(np.dot(offsets, col_unit)).max())
            print_if_above_numeric_tol("max(dot(row_unit, patient_position_offset))", np.abs(np.dot(offsets, row_unit)).max())


Image planes are orthogonal to Image Position (Patient) offset vector (note that the calculation wraps around time slice edges, i.e. it computes offsets between last and first Image Position (Patient), which, however, does not affect the conclusion).

In [None]:
#TODO:
# - Check that Cardiac number of images coincides with number of times (trigger/acquisition)
# - number of slice locations coincides with number of image position patients
# - and that both together with Rows and Columns explain the number of instances
# - statistics over unique values per series and conversion to HPC-PREDICT-IO/create MRI writer 

# Conversion to HDF5

### DICOM Reader

In [None]:
# Unique DICOM file paths of every series group
df_native_grouped_unique[['FilePath']]

In [None]:
sequence_names = df_native_renamed['Sequence Name'].unique()
print("Found sequence names {} (only processing fl3d1).".format(sequence_names))

# Group by study and sequence only; the individual series are split up per group afterwards
group_by_series_keys = [
    'FileCollectionID',
    'Instance Creation Date',
    'Patient ID',
    'Study Instance UID',
    'Sequence Name',
] #, 'Series Instance UID', 'Image Type']
is_flash_sequence = df_native_renamed['Sequence Name'] == 'fl3d1'
df_flash_grouped = df_native_renamed[is_flash_sequence].groupby(group_by_series_keys)

In [None]:
# Collects one dictionary per converted flow MRI
flow_mris = []

def extract_unique_value(group, col):
    """Return the single distinct value of column ``col`` in ``group``.

    Raises an AssertionError if the column holds zero or several distinct values.
    """
    distinct_values = group[col].unique()
    assert len(distinct_values) == 1
    (only_value,) = distinct_values
    return only_value

# Convert every (study, sequence) group into arrays: one magnitude volume and three phase volumes,
# indexed by (row, column, slice location, trigger time[, phase component])
for (name, group) in df_flash_grouped:

    magnitude = None
    phase = []
    
    # Series with 'M' in the image type hold the magnitude; a series with 'P' holds the phase and is
    # split into three sub-series by thirds of the instance numbers
    for (series_name, series_group) in group.groupby(['Series Instance UID', 'Image Type']):
        if 'M' in series_name[1]:
            magnitude = series_group.sort_values('Instance Number')
        elif 'P' in series_name[1]:
            phase_group = series_group.sort_values('Instance Number')
            phase = [ phase_group[ phase_group['Instance Number'].apply(
                lambda x: j*phase_group.shape[0]//3 < x and x <= (j+1)*phase_group.shape[0]//3) ] 
                     for j in range(3) ]           

    # Fail with a clear message instead of an AttributeError/IndexError further down
    assert magnitude is not None, "No magnitude ('M') series found for {}".format(name)
    assert len(phase) == 3, "No phase ('P') series found for {}".format(name)

    num_rows = extract_unique_value(group, 'Rows')
    num_cols = extract_unique_value(group, 'Columns')
    num_slice_locations = len(group['Slice Location'].unique()) # FIXME: sanity check!
    cardiac_number_of_images = extract_unique_value(group, 'Cardiac Number of Images')
    
    # Every (slice location, trigger time) combination must occur exactly once per series
    assert magnitude.shape[0] == num_slice_locations*cardiac_number_of_images
    for j in range(3):
        assert phase[j].shape[0] == num_slice_locations*cardiac_number_of_images

    orientation = extract_unique_value(group, 'Image Orientation (Patient)')
    row_unit = np.array(orientation[:3])
    col_unit = np.array(orientation[3:])
    ort_unit = np.cross(col_unit, row_unit)
    # abs() is required here: without it a normal that is too short would always pass the check
    assert abs(np.linalg.norm(ort_unit) - 1.) < 1e-12

    # The slice location computed from the image positions must agree with the DICOM
    # 'Slice Location' up to a constant shift
    magnitude_positions = magnitude['Image Position (Patient)'].apply(lambda x: np.array(x))    
    magnitude_position_origin = np.array(magnitude.loc[magnitude['Instance Number'].idxmin(),'Image Position (Patient)'])    
    magnitude_slice_position_shift = magnitude_positions.apply(lambda x: np.dot(ort_unit,x - magnitude_position_origin)) - magnitude['Slice Location']    
    assert (magnitude_slice_position_shift - magnitude_slice_position_shift.mean()).abs().max() < 1e-11
    
    # Instances are sorted by instance number: the slice location varies fastest, the trigger time slowest
    slice_locations = magnitude['Slice Location'][:num_slice_locations].values # FIXME: sanity check!
    assert len(np.unique(slice_locations)) == num_slice_locations
#     magnitude_slice_locations = magnitude['Slice Location'][:num_slice_locations].values    
    magnitude_trigger_times = magnitude['Trigger Time'][::num_slice_locations].values
    assert len(np.unique(magnitude_trigger_times)) == cardiac_number_of_images
    # Uninitialized buffer; every entry is written in the loop below
    magnitude_values = np.empty(shape=(num_rows, num_cols, num_slice_locations, cardiac_number_of_images))
    # TODO: Check correctness of pixel value reading
    for t,_ in enumerate(magnitude_trigger_times):
        for z,_ in enumerate(slice_locations): # magnitude_slice_locations
            img_row = magnitude[magnitude['Instance Number'] == t*num_slice_locations + z + 1]; assert len(img_row) == 1; img_row = img_row.iloc[0]
            dcm_img = pydicom.dcmread(img_row['FilePath'])
            magnitude_values[:,:,z,t] = dcm_img.pixel_array # no rescale properties
   

    phase_positions = []
    phase_slice_locations = []
    phase_trigger_times = []
    # Uninitialized buffer; every entry is written in the loop below
    phase_values = np.empty(shape=(num_rows, num_cols, num_slice_locations, cardiac_number_of_images, 3))
    for j in range(3):
        # If this does not work can validate the phase z-coordinates here as well
        assert (slice_locations == phase[j]['Slice Location'][:num_slice_locations].values).all()

#         phase_slice_locations.append( phase[j]['Slice Location'][:num_slice_locations].values )    
        phase_trigger_times.append( phase[j]['Trigger Time'][::num_slice_locations].values )
        print("Trigger time offsets of phase[{}] relative to magnitude in {} ms".format(j, np.unique(phase_trigger_times[j]-magnitude_trigger_times)), flush=True)
        for t,_ in enumerate(phase_trigger_times[j]):
            for z,_ in enumerate(slice_locations): # phase_slice_locations[j]
                img_row = phase[j][phase[j]['Instance Number'] == j*cardiac_number_of_images*num_slice_locations + t*num_slice_locations + z + 1]; assert len(img_row) == 1; img_row = img_row.iloc[0]
                dcm_img = pydicom.dcmread(img_row['FilePath'])
                # Phase images are stored rescaled: apply the rescale slope and intercept
                phase_values[:,:,z,t,j] = dcm_img.RescaleSlope*dcm_img.pixel_array+dcm_img.RescaleIntercept

    
    #check for uniqueness!
    pixel_spacing = extract_unique_value(group, 'Pixel Spacing')
    # Pixel center coordinates along rows and columns, slice locations along the third axis
    geometry = [pixel_spacing[0]*(np.arange(0,num_rows)+0.5),
                pixel_spacing[1]*(np.arange(0,num_cols)+0.5),
                slice_locations]

    #check for uniqueness!
    #group['Nominal Interval'].unique()  #.apply(lambda x: agg_unique(x))
    heart_cycle_period = group['Nominal Interval'].mean()
    
    print("Writing {} to HDF5...".format(name))
    flow_mris.append({
        "study_instance_uid": name[3],
        "sequence_name": name[4],
        "cardiac_number_of_images": cardiac_number_of_images,
        "num_slice_locations": num_slice_locations,
        "heart_cycle_period": heart_cycle_period,
        "geometry": geometry,
        "magnitude_trigger_times": magnitude_trigger_times,
        "magnitude_values": magnitude_values, #"phase_slice_locations": phase_slice_locations,
        "phase_trigger_times": phase_trigger_times,
        "phase_values": phase_values,        
    })
## df_native_grouped_unique = df_native_grouped.apply(lambda x: agg_unique(x))   # 
    
    
    

In [None]:
import sys
# Make the hpc-predict-io Python package importable (added only once)
mr_io_python_dir = '/home/lukasd/src/review/hpc-predict-io/python'
if mr_io_python_dir not in sys.path:
    sys.path.append(mr_io_python_dir)

In [None]:
from mr_io import FlowMRI

# Build the FlowMRI from the arrays left over from the last iteration of the conversion loop;
# the velocity covariance is filled with zeros
velocity_cov_shape = phase_values.shape+(3,)
hpc_predict_mri = FlowMRI(
    geometry=geometry,
    time=magnitude_trigger_times, # FIXME: phase times!
    time_heart_cycle_period=heart_cycle_period,
    intensity=magnitude_values,
    velocity_mean=phase_values,
    velocity_cov=np.zeros(shape=velocity_cov_shape),
)

In [None]:
# File name from FileCollectionID, Instance Creation Date and Study Instance UID of the group key
# left over from the last iteration of the conversion loop
file_collection_id, instance_creation_date, study_instance_uid = name[0], name[1], name[3]
filename = "{}-{}-{}.h5".format(file_collection_id, instance_creation_date, study_instance_uid)
hpc_predict_mri.write_hdf5(filename)

# Read the file back in
read_result = FlowMRI.read_hdf5(filename)

In [None]:
import matplotlib.pyplot as plt

# Histograms of the values collected in flow_mris[0] next to those held by hpc_predict_mri:
# one panel for the magnitude and one per phase component
figure, ax = plt.subplots(ncols=4, sharex=True, figsize=(12*4,8*1))
reference_mri = flow_mris[0]

panels = [(ax[0], reference_mri['magnitude_values'], hpc_predict_mri.intensity, 'magnitude')]
for j in range(3):
    panels.append((ax[j+1],
                   reference_mri['phase_values'][:,:,:,:,j],
                   hpc_predict_mri.velocity_mean[:,:,:,:,j],
                   'phase-{}'.format(j)))

for axis, reference_values, converted_values, label in panels:
    axis.hist(reference_values.flatten(), bins=50, label=label)
    axis.hist(converted_values.flatten(), bins=50, label=label + '-hpc-predict') # Todo: make this semitransparent

figure.legend()
figure.tight_layout()

Looks good on samples. Phase/velocity units remain to be determined. According to analysis below, all sequences are either fl3d1 or fl3d1_2 in the dataset. The code can be moved to a script that does the conversion.

### Prototype of DICOM writer (immature)

In [None]:
# Determine target folder

# slice_location = geometry[2]
# calculate image position patient from old_image_position_origin + (geometry[2] - slice_location[0])*ort_unit
# Pixel Spacing = (geometry[i][-1] - geometry[i][0])/(len(geometry[i])-1)
# instance number: magnitude: t*num_slice_locations + z + 1, 
#                  phase: j*cardiac_number_of_images*num_slice_locations + t*num_slice_locations + z + 1
# trigger times
# Rows = len(geometry[0])
# Columns = len(geometry[1])
# Rescale Intercept/Slope ? Intercept = -max(abs(velocity_mean)), slope = 2*abs(Intercept)/2*Bits Stored(?)
# skip all other tags with multiplicity > 1 such as acquisition/content time (make up SOP Class UID etc. with uid.generate_uid)
# for those with multiplicity 1 select which ones to add 
#
# Create tags to annotate HPC-PREDICT information, such as code that was used to produce the data
#
# Question: how do we add overlays? 

In [None]:
# Select the series rows that belong to the second converted flow MRI (same study and sequence)
reference_flow_mri = flow_mris[1]
same_study = df_native_grouped_unique['Study Instance UID'] == reference_flow_mri['study_instance_uid']
same_sequence = df_native_grouped_unique['Sequence Name'] == reference_flow_mri['sequence_name']
df_native_grouped_unique_flow = df_native_grouped_unique[same_study & same_sequence]
df_native_grouped_unique_flow

In [None]:

# Index the series rows by the third component of their first Image Type
# (presumably 'M' / 'P', as used by the .loc['M'] look-ups below - TODO confirm)
image_type_key = df_native_grouped_unique_flow['Image Type'].apply(lambda image_types: image_types[0][2])
df_native_grouped_unique_flow_reindexed = df_native_grouped_unique_flow.set_index(image_type_key)
# df_native_grouped_unique_flow_reindexed.loc['M']['FilePath'][0]

# Keep the columns in which no entry is a sequence of length 1.
# NOTE(review): the name suggests the opposite selection (columns with a single value) - confirm intent.
never_single_valued = df_native_grouped_unique_flow.applymap(
    lambda x: not hasattr(x, '__len__') or len(x) != 1).all()
singleton_labels = df_native_grouped_unique_flow.drop(
    df_native_grouped_unique_flow.columns[~never_single_valued], axis=1) #.columns.values
singleton_labels

In [None]:
group_by_series_keys = [
    'FileCollectionID',
    'Instance Creation Date',
    'Patient ID',
    'Study Instance UID',
    'Sequence Name',
    'Series Instance UID',
    'Image Type',
]
# df.groupby(group_by_series_keys)

# Get study instance UID, convert to pandas table, compute unique tags by base tag address, iterate over tags - for unique copy them to new dataset except if should be included (by second table tag -> name), for non-unique, replace them appropriately (name look-up, etc.) 

# Look-up tables between DICOM tags (the column labels of df) and element names
unique_names_per_tag = df.apply(lambda column: column.dropna().apply(lambda element: element.name).unique(), axis=0)
tag_to_name = unique_names_per_tag.T
names_with_tags = tag_to_name.reset_index().rename(columns={0: 'name', 'index': 'tag'})
name_to_tag = names_with_tags.set_index('name')

tag_to_name[0]
name_to_tag['tag']['Study Instance UID']
#df[df.apply(lambda row: row['Study Instance UID'] == study_instance_uid and row['Sequence Name'] == sequence_name, axis=1)]

# df_hashified = df.applymap(lambda x: unpack_pydicom_value(x.value) if pd.notnull(x) else x)
# df_tag_to_name = df.applymap(lambda x: x.name if pd.notnull(x) else x)
# df_hashified.groupby(group_by_series_keys)

# # df_native_grouped = df_native_renamed.groupby(group_by_series_keys)
# # df_native_grouped.apply(lambda x: agg_unique(x))   


In [None]:
# dicom_img[name_to_tag['tag']['SOP Class UID']]
# File meta information of the first DICOM file listed for the row indexed by 'M'
first_magnitude_file = df_native_grouped_unique_flow_reindexed.loc['M']['FilePath'][0]
pydicom.dcmread(first_magnitude_file).file_meta


In [None]:
# Number of unique 'Specific Character Set' values in the row indexed by 'M'
len(df_native_grouped_unique_flow_reindexed.loc['M','Specific Character Set'])

In [None]:
#file_meta = FileMetaDataset
# file_meta = pydicom.dataset.FileMetaDataset()
# file_meta.MediaStorageSOPClassUID = dicom_img.file_meta[0x00020002].value
# file_meta.ImplementationClassUID = PYDICOM_IMPLEMENTATION_UID

# First get the study (flow MRI needs to describe data it comes from...)

# for series in source study, open first image as dicom_img
# Template image: the first DICOM file listed for the row indexed by 'M'
template_path = df_native_grouped_unique_flow_reindexed.loc['M']['FilePath'][0]
dcm_img = pydicom.dcmread(template_path)
# create file_meta_ds/file_ds
ds = pydicom.dataset.Dataset()

# Elements that need dedicated handling even though they are unique within the series
# more? for UIDs can just replace dcm-che prefix by that of pydicom implementation
specially_treated_names = ['Referenced Image Sequence', 'Study Instance UID', 'Series Instance UID', 'Rows', 'Columns', 'Pixel Spacing']

print("### Simulating writing of DICOM ###")
for el in dcm_img: # also include dcm_img.file_meta later
    print(el.name, flush=True)

    if el.name == 'Pixel Data':
        print("Writing Pixel Data") # replace dcm-che prefix by pydicom prefix
        continue

    # FIXME: use el.tag instead of el.name (compute unique values by tags)
    if len(df_native_grouped_unique_flow_reindexed.loc['M', el.name]) != 1:
        # probably one of the tags that has to be computed... (see above for the manual)
        print("Element {} has different values".format(el.name))
        continue

    if el.name in specially_treated_names: # special treatment of el.tag
        print("Special treatment for {} ".format(el.name)) # replace dcm-che prefix by pydicom prefix
    else:
        print("Copying unique element {} ".format(el.name)) # replace dcm-che prefix by pydicom prefix
#         file_ds.add(el.tag, el.VR, el.value) # module modifications...

            