In [1]:
import pydicom
import os 
import numpy as np 
import polars as pl 
import datetime
import copy
os.environ["DICOM_DATASET"] = f'{os.getcwd()}/dicom_example/' 
os.environ["PROCESSED_DATA_DIR"] = f'{os.getcwd()}/processed_data' 
processed_data_dir = f'{os.getcwd()}/processed_data/' 



## use bash to get a list of all files in the dataset, and write them to a table of contents file.

In [2]:
! find $DICOM_DATASET -type 'f' > $PROCESSED_DATA_DIR/cohort/toc.txt

## lets load in the table of contents and look at our dataset.

In [3]:
toc = pl.read_csv(
    os.path.join(processed_data_dir, 'cohort', 'toc.txt'),
    has_header=False, new_columns=['absolute_dir']
    )


## lets continue defining our dataset. 
### first lets define some fields we are interested in extracting from out dicom files 

In [4]:
dicom_fields = ['PatientName', 'PatientID', 'AccessionNumber', 'StudyDate', 'Modality', 'StudyDescription', 'PatientSex', 'PatientAge', 'PatientWeight', 'ImageOrientationPatient']

### now lets extract the relevant fields from our metadata values

In [5]:
def safe_retrive(dicom_object, key):
    try:
        value = dicom_object[key].value
        if isinstance(value, pydicom.multival.MultiValue):
            return list(value)
        elif isinstance(value, pydicom.valuerep.PersonName):
            return str(value)
        else:
            return value
    except:
        return None

def get_dicom_fields(f_name:str, dicom_fields:list):
    """reads in a dicom file, and returns a dictionary with the requested metadata extracted from that file."""
    header = pydicom.dcmread(f_name)
    dicom_dict = {
        field_name : safe_retrive( header, field_name) for field_name in dicom_fields
    }
    
    dicom_dict['absolute_dir'] = f_name
    return dicom_dict


In [6]:
meta_rows = []
for f_name in toc.iter_rows():
    meta_rows.append(get_dicom_fields(f_name=f_name[0], dicom_fields=dicom_fields))
meta = pl.from_dicts(meta_rows)
meta

PatientName,PatientID,AccessionNumber,StudyDate,Modality,StudyDescription,PatientSex,PatientAge,PatientWeight,ImageOrientationPatient,absolute_dir
str,str,str,str,str,str,str,str,f64,list[f64],str
"""Doe^Peter""","""98890234""","""134""","""20030505""","""MR""","""Brain""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…"
"""Doe^Peter""","""98890234""","""428""","""20030505""","""MR""","""Carotids""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.414374, 0.910111, … -1.0]","""/home/buzgalbraith/workspace/d…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.840635, 0.54161, … -1.0]","""/home/buzgalbraith/workspace/d…"
…,…,…,…,…,…,…,…,…,…,…
"""Citizen^Jan""","""12345678""","""1""","""20200913""","""CT""","""Testing File-set""",,,,,"""/home/buzgalbraith/workspace/d…"
"""Citizen^Jan""","""12345678""","""1""","""20200913""","""CT""","""Testing File-set""",,,,,"""/home/buzgalbraith/workspace/d…"
"""Citizen^Jan""","""12345678""","""1""","""20200913""","""CT""","""Testing File-set""",,,,,"""/home/buzgalbraith/workspace/d…"
"""Citizen^Jan""","""12345678""","""1""","""20200913""","""CT""","""Testing File-set""",,,,,"""/home/buzgalbraith/workspace/d…"


## lets take a look at our dataset to see what we are dealing with 

In [7]:
meta.describe()

statistic,PatientName,PatientID,AccessionNumber,StudyDate,Modality,StudyDescription,PatientSex,PatientAge,PatientWeight,ImageOrientationPatient,absolute_dir
str,str,str,str,str,str,str,str,str,f64,f64,str
"""count""","""81""","""81""","""81""","""81""","""81""","""81""","""31""","""31""",17.0,28.0,"""81"""
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""","""50""","""50""",64.0,53.0,"""0"""
"""mean""",,,,,,,,,81.6327,,
"""std""",,,,,,,,,0.0,,
"""min""","""Citizen^Jan""","""12345678""","""1""","""19950903""","""CR""","""""","""""","""042Y""",81.6327,,"""/home/buzgalbraith/workspace/d…"
"""25%""",,,,,,,,,81.6327,,
"""50%""",,,,,,,,,81.6327,,
"""75%""",,,,,,,,,81.6327,,
"""max""","""Doe^Peter""","""98890234""","""428""","""20200913""","""MR""","""XR C Spine Comp Min 4 Views""","""M""","""047Y""",81.6327,,"""/home/buzgalbraith/workspace/d…"


In [8]:
print(meta.group_by("Modality").len())
print(meta.group_by("PatientName").len())
print(meta.group_by("StudyDescription").len())


shape: (3, 2)
┌──────────┬─────┐
│ Modality ┆ len │
│ ---      ┆ --- │
│ str      ┆ u32 │
╞══════════╪═════╡
│ CR       ┆ 3   │
│ MR       ┆ 17  │
│ CT       ┆ 61  │
└──────────┴─────┘
shape: (3, 2)
┌───────────────┬─────┐
│ PatientName   ┆ len │
│ ---           ┆ --- │
│ str           ┆ u32 │
╞═══════════════╪═════╡
│ Doe^Archibald ┆ 7   │
│ Citizen^Jan   ┆ 50  │
│ Doe^Peter     ┆ 24  │
└───────────────┴─────┘
shape: (7, 2)
┌─────────────────────────────┬─────┐
│ StudyDescription            ┆ len │
│ ---                         ┆ --- │
│ str                         ┆ u32 │
╞═════════════════════════════╪═════╡
│ CT, HEAD/BRAIN WO CONTRAST  ┆ 4   │
│ XR C Spine Comp Min 4 Views ┆ 3   │
│ Carotids                    ┆ 2   │
│ Brain-MRA                   ┆ 11  │
│ Brain                       ┆ 4   │
│ Testing File-set            ┆ 50  │
│                             ┆ 7   │
└─────────────────────────────┴─────┘


## Now that we have defined out dataset we can finally start processing it. 
### suppose we first want to add a column with the relative path to each directory

In [9]:
meta = meta.with_columns(
        relative_path = pl.col('absolute_dir').str.strip_prefix(os.getcwd()+'/')
    )
meta.head()

PatientName,PatientID,AccessionNumber,StudyDate,Modality,StudyDescription,PatientSex,PatientAge,PatientWeight,ImageOrientationPatient,absolute_dir,relative_path
str,str,str,str,str,str,str,str,f64,list[f64],str,str
"""Doe^Peter""","""98890234""","""134""","""20030505""","""MR""","""Brain""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR1/491…"
"""Doe^Peter""","""98890234""","""428""","""20030505""","""MR""","""Carotids""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR1/158…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR1/564…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.414374, 0.910111, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR700/4…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.840635, 0.54161, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR700/4…"


### now lets filter for rows that, that have the patient name Citizen^Jan  

In [10]:
meta = meta.filter(~pl.col('PatientName').eq('Citizen^Jan'))
meta.head()

PatientName,PatientID,AccessionNumber,StudyDate,Modality,StudyDescription,PatientSex,PatientAge,PatientWeight,ImageOrientationPatient,absolute_dir,relative_path
str,str,str,str,str,str,str,str,f64,list[f64],str,str
"""Doe^Peter""","""98890234""","""134""","""20030505""","""MR""","""Brain""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR1/491…"
"""Doe^Peter""","""98890234""","""428""","""20030505""","""MR""","""Carotids""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR1/158…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR1/564…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.414374, 0.910111, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR700/4…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.840635, 0.54161, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR700/4…"


### lets look out how that changed our dataset statistics

In [11]:
meta.describe()

statistic,PatientName,PatientID,AccessionNumber,StudyDate,Modality,StudyDescription,PatientSex,PatientAge,PatientWeight,ImageOrientationPatient,absolute_dir,relative_path
str,str,str,str,str,str,str,str,str,f64,f64,str,str
"""count""","""31""","""31""","""31""","""31""","""31""","""31""","""31""","""31""",17.0,28.0,"""31""","""31"""
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""",14.0,3.0,"""0""","""0"""
"""mean""",,,,,,,,,81.6327,,,
"""std""",,,,,,,,,0.0,,,
"""min""","""Doe^Archibald""","""77654033""","""134""","""19950903""","""CR""","""""","""""","""042Y""",81.6327,,"""/home/buzgalbraith/workspace/d…","""dicom_example/77654033/CR1/615…"
"""25%""",,,,,,,,,81.6327,,,
"""50%""",,,,,,,,,81.6327,,,
"""75%""",,,,,,,,,81.6327,,,
"""max""","""Doe^Peter""","""98890234""","""428""","""20030505""","""MR""","""XR C Spine Comp Min 4 Views""","""M""","""047Y""",81.6327,,"""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR700/4…"


In [12]:
print(meta.group_by("Modality").len())
print(meta.group_by("PatientName").len())
print(meta.group_by("StudyDescription").len())

shape: (3, 2)
┌──────────┬─────┐
│ Modality ┆ len │
│ ---      ┆ --- │
│ str      ┆ u32 │
╞══════════╪═════╡
│ CT       ┆ 11  │
│ CR       ┆ 3   │
│ MR       ┆ 17  │
└──────────┴─────┘
shape: (2, 2)
┌───────────────┬─────┐
│ PatientName   ┆ len │
│ ---           ┆ --- │
│ str           ┆ u32 │
╞═══════════════╪═════╡
│ Doe^Archibald ┆ 7   │
│ Doe^Peter     ┆ 24  │
└───────────────┴─────┘
shape: (6, 2)
┌─────────────────────────────┬─────┐
│ StudyDescription            ┆ len │
│ ---                         ┆ --- │
│ str                         ┆ u32 │
╞═════════════════════════════╪═════╡
│ Brain-MRA                   ┆ 11  │
│ XR C Spine Comp Min 4 Views ┆ 3   │
│                             ┆ 7   │
│ CT, HEAD/BRAIN WO CONTRAST  ┆ 4   │
│ Brain                       ┆ 4   │
│ Carotids                    ┆ 2   │
└─────────────────────────────┴─────┘


### Further suppose our clinical collaborators tell us only MRI and CAT scans are relavent to the project.

In [13]:
meta = meta.filter(~pl.col('Modality').eq('CR'))
meta.head()

PatientName,PatientID,AccessionNumber,StudyDate,Modality,StudyDescription,PatientSex,PatientAge,PatientWeight,ImageOrientationPatient,absolute_dir,relative_path
str,str,str,str,str,str,str,str,f64,list[f64],str,str
"""Doe^Peter""","""98890234""","""134""","""20030505""","""MR""","""Brain""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR1/491…"
"""Doe^Peter""","""98890234""","""428""","""20030505""","""MR""","""Carotids""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR1/158…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.0, 1.0, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR1/564…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.414374, 0.910111, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR700/4…"
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[0.840635, 0.54161, … -1.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR700/4…"


In [14]:
meta.describe()

statistic,PatientName,PatientID,AccessionNumber,StudyDate,Modality,StudyDescription,PatientSex,PatientAge,PatientWeight,ImageOrientationPatient,absolute_dir,relative_path
str,str,str,str,str,str,str,str,str,f64,f64,str,str
"""count""","""28""","""28""","""28""","""28""","""28""","""28""","""28""","""28""",17.0,28.0,"""28""","""28"""
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""",11.0,0.0,"""0""","""0"""
"""mean""",,,,,,,,,81.6327,,,
"""std""",,,,,,,,,0.0,,,
"""min""","""Doe^Archibald""","""77654033""","""134""","""19950903""","""CT""","""""","""""","""042Y""",81.6327,,"""/home/buzgalbraith/workspace/d…","""dicom_example/77654033/CT2/171…"
"""25%""",,,,,,,,,81.6327,,,
"""50%""",,,,,,,,,81.6327,,,
"""75%""",,,,,,,,,81.6327,,,
"""max""","""Doe^Peter""","""98890234""","""428""","""20030505""","""MR""","""Carotids""","""M""","""045Y""",81.6327,,"""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR700/4…"


In [15]:
print(meta.group_by("Modality").len())
print(meta.group_by("PatientName").len())
print(meta.group_by("StudyDescription").len())

shape: (2, 2)
┌──────────┬─────┐
│ Modality ┆ len │
│ ---      ┆ --- │
│ str      ┆ u32 │
╞══════════╪═════╡
│ MR       ┆ 17  │
│ CT       ┆ 11  │
└──────────┴─────┘
shape: (2, 2)
┌───────────────┬─────┐
│ PatientName   ┆ len │
│ ---           ┆ --- │
│ str           ┆ u32 │
╞═══════════════╪═════╡
│ Doe^Peter     ┆ 24  │
│ Doe^Archibald ┆ 4   │
└───────────────┴─────┘
shape: (5, 2)
┌────────────────────────────┬─────┐
│ StudyDescription           ┆ len │
│ ---                        ┆ --- │
│ str                        ┆ u32 │
╞════════════════════════════╪═════╡
│ Carotids                   ┆ 2   │
│ CT, HEAD/BRAIN WO CONTRAST ┆ 4   │
│                            ┆ 7   │
│ Brain                      ┆ 4   │
│ Brain-MRA                  ┆ 11  │
└────────────────────────────┴─────┘


### Lets say your clinical collaborator, asks us just to filter for axial scans now.  

In [16]:
reference = [1.0, 0.0, 0.0, 0.0, 1.0, 0.0] ## orientation of an axial scan 
def is_axial(vec ):
    return np.allclose(vec, reference, atol=1e-5)
meta = meta.with_columns(
    axial = pl.col("ImageOrientationPatient").map_elements(is_axial, return_dtype=pl.Boolean)
)
meta = meta.filter(pl.col('axial'))
meta

PatientName,PatientID,AccessionNumber,StudyDate,Modality,StudyDescription,PatientSex,PatientAge,PatientWeight,ImageOrientationPatient,absolute_dir,relative_path,axial
str,str,str,str,str,str,str,str,f64,list[f64],str,str,bool
"""Doe^Peter""","""98890234""","""134""","""20030505""","""MR""","""Brain""","""M""","""045Y""",81.6327,"[1.0, -0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR2/498…",true
"""Doe^Peter""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[1.0, -0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR2/627…",true
"""Doe^Peter""","""98890234""","""2""","""20010101""","""CT""","""""","""M""","""043Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892001/CT5N/26…",true
"""Doe^Peter""","""98890234""","""2""","""20010101""","""CT""","""""","""M""","""043Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892001/CT5N/30…",true
"""Doe^Peter""","""98890234""","""2""","""20010101""","""CT""","""""","""M""","""043Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892001/CT5N/33…",true
…,…,…,…,…,…,…,…,…,…,…,…,…
"""Doe^Peter""","""98890234""","""2""","""20010101""","""CT""","""""","""M""","""043Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892001/CT5N/23…",true
"""Doe^Archibald""","""77654033""","""2""","""19950903""","""CT""","""CT, HEAD/BRAIN WO CONTRAST""","""""","""042Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/77654033/CT2/171…",true
"""Doe^Archibald""","""77654033""","""2""","""19950903""","""CT""","""CT, HEAD/BRAIN WO CONTRAST""","""""","""042Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/77654033/CT2/171…",true
"""Doe^Archibald""","""77654033""","""2""","""19950903""","""CT""","""CT, HEAD/BRAIN WO CONTRAST""","""""","""042Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/77654033/CT2/171…",true


In [17]:
print(meta.group_by("Modality").len())
print(meta.group_by("PatientName").len())
print(meta.group_by("StudyDescription").len())

shape: (2, 2)
┌──────────┬─────┐
│ Modality ┆ len │
│ ---      ┆ --- │
│ str      ┆ u32 │
╞══════════╪═════╡
│ MR       ┆ 2   │
│ CT       ┆ 9   │
└──────────┴─────┘
shape: (2, 2)
┌───────────────┬─────┐
│ PatientName   ┆ len │
│ ---           ┆ --- │
│ str           ┆ u32 │
╞═══════════════╪═════╡
│ Doe^Peter     ┆ 7   │
│ Doe^Archibald ┆ 4   │
└───────────────┴─────┘
shape: (4, 2)
┌────────────────────────────┬─────┐
│ StudyDescription           ┆ len │
│ ---                        ┆ --- │
│ str                        ┆ u32 │
╞════════════════════════════╪═════╡
│                            ┆ 5   │
│ Brain-MRA                  ┆ 1   │
│ CT, HEAD/BRAIN WO CONTRAST ┆ 4   │
│ Brain                      ┆ 1   │
└────────────────────────────┴─────┘


## now lets suppose we are happy with our dataset, but we want to send it to a collaborator at another intuition. We now want to think about how to de-identify the dataset.
### assume we only care about de-identifying the following fields  

In [18]:
deid_fields = ['PatientName', 'PatientID', 'AccessionNumber', 'StudyDate']

### lets start out easy and just remove the patient Name values all together  

In [19]:
deid_meta = copy.deepcopy(meta)
deid_meta = deid_meta.with_columns(PatientName = pl.lit(''))
deid_meta

PatientName,PatientID,AccessionNumber,StudyDate,Modality,StudyDescription,PatientSex,PatientAge,PatientWeight,ImageOrientationPatient,absolute_dir,relative_path,axial
str,str,str,str,str,str,str,str,f64,list[f64],str,str,bool
"""""","""98890234""","""134""","""20030505""","""MR""","""Brain""","""M""","""045Y""",81.6327,"[1.0, -0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR2/498…",true
"""""","""98890234""","""2""","""20030505""","""MR""","""Brain-MRA""","""M""","""045Y""",81.6327,"[1.0, -0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892003/MR2/627…",true
"""""","""98890234""","""2""","""20010101""","""CT""","""""","""M""","""043Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892001/CT5N/26…",true
"""""","""98890234""","""2""","""20010101""","""CT""","""""","""M""","""043Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892001/CT5N/30…",true
"""""","""98890234""","""2""","""20010101""","""CT""","""""","""M""","""043Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892001/CT5N/33…",true
…,…,…,…,…,…,…,…,…,…,…,…,…
"""""","""98890234""","""2""","""20010101""","""CT""","""""","""M""","""043Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/98892001/CT5N/23…",true
"""""","""77654033""","""2""","""19950903""","""CT""","""CT, HEAD/BRAIN WO CONTRAST""","""""","""042Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/77654033/CT2/171…",true
"""""","""77654033""","""2""","""19950903""","""CT""","""CT, HEAD/BRAIN WO CONTRAST""","""""","""042Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/77654033/CT2/171…",true
"""""","""77654033""","""2""","""19950903""","""CT""","""CT, HEAD/BRAIN WO CONTRAST""","""""","""042Y""",,"[1.0, 0.0, … 0.0]","""/home/buzgalbraith/workspace/d…","""dicom_example/77654033/CT2/171…",true


### Now suppose we want to have PatientIDs and AccessionNumbers
- these are what we call cross walk tables. 

In [20]:
pids = deid_meta.select(pl.col('PatientID').unique()).with_row_index(offset=1)
pids = pids.with_columns(
    deid_PatientID = pl.lit('PID') + pl.col(('index')).cast(pl.String).str.zfill(5)
)
pids.head()

index,PatientID,deid_PatientID
u32,str,str
1,"""77654033""","""PID00001"""
2,"""98890234""","""PID00002"""


In [21]:
accns = deid_meta.select(pl.col('AccessionNumber').unique()).with_row_index(offset=1)
accns = accns.with_columns(
    deid_AccessionNumber = pl.lit('ACCN') + pl.col(('index')).cast(pl.String).str.zfill(5)
)
accns.head()

index,AccessionNumber,deid_AccessionNumber
u32,str,str
1,"""2""","""ACCN00001"""
2,"""134""","""ACCN00002"""


## now finally suppose we want to shift the StudyDate field by some random amount

- we want to shift the dates of each patient by some random amount, while keeping the relative dates within a patient the same,   

In [22]:

date_format = "%Y%m%d"
study_dates = deid_meta.group_by(['PatientID']).first().select(['PatientID', 'StudyDate'])
rng = np.random.default_rng(12345)
## generate date shifts quantities
dates_shift_amounts = rng.integers(low=-100, high=100, size = len(study_dates))
study_dates = study_dates.with_columns(
    number_of_days_shifted = dates_shift_amounts
)
## format and output the proper dates
study_dates = study_dates.with_columns(
    formatted_date = pl.col('StudyDate').str.to_datetime(format=date_format), 
    shift_delta = pl.col('number_of_days_shifted').map_elements(datetime.timedelta, return_dtype=pl.datatypes.Duration)
).with_columns(
    died_StudyDate = (pl.col('formatted_date') + pl.col('shift_delta')).dt.to_string(format=date_format)
)
# ## select only needed columns 
study_dates = study_dates.select(
    [
        'PatientID', 
        'StudyDate', 
        'died_StudyDate'
    ]
)


### ok lets now write those three tables out to files, so we can re-identify the data later if need be. 

In [23]:
pids.write_csv(os.path.join(processed_data_dir,'cohort', 'PatientIDCrossWalk.csv'))
accns.write_csv(os.path.join(processed_data_dir, 'cohort','AccessionNumberCrossWalk.csv'))
study_dates.write_csv(os.path.join(processed_data_dir, 'cohort','StudyDateShift.csv'))

## Finally lets de-identify our files and save the results to a new folder 

### first lets create new paths to save the de-id dataset

In [24]:
deid_dir = os.path.join('processed_data/', 'de-identified-data')
os.makedirs(deid_dir, exist_ok=True)
deid_meta = deid_meta.with_columns(
    deid_path = deid_dir + pl.col('relative_path').str.strip_prefix('dicom_example')
)


### next lets merge our cross walk tables with our dataset

In [25]:
deid_meta = deid_meta.join(
    pids.drop('index'), on='PatientID'
    ).join(
        accns.drop('index'), on='AccessionNumber'
    ).join(
        study_dates.drop('StudyDate'), on='PatientID'
    )


### Finally lets write the de-identified dataset to the desired locations

In [26]:
def deid_dicom(row):
    """reads in a dicom file and write the result to a specified location """
    header = pydicom.dcmread(row['absolute_dir'])
    header['PatientName'].value = row['PatientName']
    header['PatientID'].value = row['deid_PatientID']
    header['AccessionNumber'].value = row['deid_AccessionNumber']
    header['StudyDate'].value = row['died_StudyDate']
    pth = row['deid_path']
    os.makedirs(os.path.dirname(pth), exist_ok=True)
    pydicom.dcmwrite(pth, header)


for row in deid_meta.iter_rows(named=True):
    deid_dicom(row)

In [27]:
header = pydicom.dcmread('/home/buzgalbraith/workspace/data-processing-workshop/processed_data/de-identified-data/77654033/CT2/17106')

In [28]:
header

Dataset.file_meta -------------------------------
(0002,0000) File Meta Information Group Length  UL: 192
(0002,0001) File Meta Information Version       OB: b'\x00\x01'
(0002,0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002,0003) Media Storage SOP Instance UID      UI: 1.3.6.1.4.1.5962.1.1.0.0.0.1196530851.28319.0.93
(0002,0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002,0012) Implementation Class UID            UI: 1.3.6.1.4.1.5962.2
(0002,0013) Implementation Version Name         SH: 'DCTOOL100'
(0002,0016) Source Application Entity Title     AE: 'CLUNIE1'
-------------------------------------------------
(0008,0005) Specific Character Set              CS: 'ISO_IR 100'
(0008,0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
(0008,0012) Instance Creation Date              DA: '19950903'
(0008,0013) Instance Creation Time              TM: '173353'
(0008,0014) Instance Creator UID                UI: 1.3.6.1.4