# Incompatibility of Pydicom with Pandas

In [None]:
import os

# Root of the hpc-predict data directory (e.g. ../../../data/v0).
# Set the HPC_PREDICT_DATA_DIR environment variable to point at your own data
# directory; the previously hardcoded path stays as the fallback so existing
# runs behave exactly as before.
hpc_predict_data_dir = os.environ.get("HPC_PREDICT_DATA_DIR", "/home/lukasd/src/hpc-predict/data/v0")

This notebook assumes that the Freiburg dataset is available locally; if it is not, first run `data/fetch_scripts/fetch_freiburg.sh`.

Before running the following steps, the DICOM headers must first be converted to pickled pandas DataFrames with `convert_dicom_to_pandas.py`, e.g.:

```
convert_dicom_to_pandas.py --mri-data-root "${hpc_predict_data_dir}/input_data/original/mri/MRT Daten Bern" --mri-samples ... --output-root "${hpc_predict_data_dir}/input_data/preprocessed/mri/MRT Daten Bern DICOM Header"
```

### Import pickled DataFrames

In [None]:
from glob import glob
import os

# Collect the pickled header tables and order them numerically by file stem
# (e.g. ".../3.pkl" -> 3) rather than lexicographically.
pkl_pattern = hpc_predict_data_dir + "/input_data/preprocessed/mri/MRT Daten Bern DICOM Header/3.pkl"
pkls = glob(pkl_pattern)
pkls.sort(key=lambda path: int(os.path.basename(path).partition('.')[0]))
pkls

In [None]:
import pydicom
import pandas as pd
import numpy as np
from pprint import pprint

# Load every pickle generated with convert_dicom_to_pandas.py (the entries are
# pydicom DataElements) and stack the tables row-wise into a single DataFrame.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df = pd.concat(list(map(pd.read_pickle, pkls)), axis="index")

In [None]:
# Every cell of the table is a pydicom DataElement; show the top-left one.
df.iat[0, 0]

In [None]:
# A DataElement keeps its payload in the `_value` attribute, often in a
# "semi-serialized" representation, with the VR describing the DICOM type.
first_element = df.iloc[0, 0]
vars(first_element)

### Use names instead of DICOM tags as column labels

In [None]:
## Build the DICOM tag -> name mapping used to rename the columns
def _unique_element_names(column):
    """Return the distinct ``.name`` values of the non-null entries of a column."""
    return column.dropna().apply(lambda element: element.name).unique()

names_per_tag = df.apply(_unique_element_names, axis=0)
# The lookup below relies on a single-row result, i.e. one name per tag.
assert names_per_tag.shape[0] == 1
tag_names = names_per_tag.loc[0]
# To inspect the mapping: tag_names.to_dict()

In [None]:
# Relabel the columns with the human-readable DICOM names instead of the tags.
df_renamed = df.rename(columns=tag_names.to_dict())

# Display only the values: unwrap each DataElement, leave missing cells as they are.
pd.set_option('display.max_columns', 20)
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# pick whichever the installed pandas provides so the cell runs on both.
# (Checked on the class so a column that happens to be named "map" cannot interfere.)
_elementwise = "map" if hasattr(pd.DataFrame, "map") else "applymap"
getattr(df_renamed, _elementwise)(lambda x: x.value if pd.notnull(x) else x)

In [None]:

# Grouping won't work with pydicom DataElements in the entries as they are not
# hashable, so every cell is unwrapped to its plain value first. The unwrapped
# frame does not depend on the loop variable, so it is built once up front
# instead of being recomputed for every column.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use whichever the installed pandas provides.
_elementwise = "map" if hasattr(pd.DataFrame, "map") else "applymap"
df_renamed_values = getattr(df_renamed, _elementwise)(lambda x: x.value if pd.notnull(x) else x)

# Sequence columns and a few others are left out of the grouping
# (presumably because their unwrapped values are not hashable either -- TODO confirm).
df_renamed_sequence_columns = [c for c in df_renamed.columns if c.endswith("Sequence")]
excluded_columns = df_renamed_sequence_columns + ['Acquisition Matrix', 'Private Creator', 'Overlay Origin']
for col in df_renamed_values.drop(columns=excluded_columns).columns:
    print("Grouping by column {}.".format(col))
    df_renamed_values.groupby(col)
    
