In [1]:
import pandas as pd
from tqdm import tqdm
import os
import pydicom
from ast import literal_eval

# Read Files

In [9]:
# Root of the data volume; every other path below is derived from it.
BASE_DIR = '/srv/app/data'

# Dataset folder inside the volume (also where the metadata CSV is written).
DATA_DIR = BASE_DIR + '/data'

# Folder whose entries are read one by one as DICOM files.
TRAIN_DIR = DATA_DIR + '/stage_2_train_images'

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the metadata table reproducible from one run (or machine) to the next.
files_list = sorted(os.listdir(TRAIN_DIR))

In [10]:
def read_dicom(file):
    """Read the header of one DICOM file into a plain dictionary.

    Parameters
    ----------
    file : str, path-like or file-like
        Passed straight to ``pydicom.dcmread``.

    Returns
    -------
    dict
        Maps the tag of each top-level data element to its value. The pixel
        data element is never included.
    """
    # Only metadata is wanted, so stop parsing before the pixel payload instead
    # of loading every image just to throw it away.
    dc_file = pydicom.dcmread(file, stop_before_pixels=True)
    # The explicit filter is kept as a safety net so the result can never carry
    # pixel data, whatever the reader returns.
    return {
        element.tag: element.value
        for element in dc_file
        if element.tag != 'PixelData'
    }

In [11]:
# Collect one header dictionary per training file. Iterating over the names
# directly (rather than over indices) still lets tqdm take the total from the
# list length. A plain loop is kept so an interrupted run leaves the records
# gathered so far in train_metadata.
train_metadata = []
for file_name in tqdm(files_list):
    file_path = str(TRAIN_DIR) + '/' + str(file_name)
    train_metadata.append(read_dicom(file_path))

100%|██████████| 121232/121232 [17:47<00:00, 113.55it/s]


# Load Metadata into a Pandas DataFrame

In [12]:
# Human-readable labels for the metadata columns.
# NOTE(review): the labels are assigned by position, so this presumes every file
# yields the same 21 elements in this order — confirm before reusing on other data.
cols = [
    'SOP Instance UID',
    'Modality',
    'Patient ID',
    'Study Instance UID',
    'Series Instance UID',
    'Study ID',
    'Image Position (Patient)',
    'Image Orientation (Patient)',
    'Samples per Pixel',
    'Photometric Interpretation',
    'Rows',
    'Columns',
    'Pixel Spacing',
    'Bits Allocated',
    'Bits Stored',
    'High Bit',
    'Pixel Representation',
    'Window Center',
    'Window Width',
    'Rescale Intercept',
    'Rescale Slope',
]

# One row per file, one column per element; then swap in the readable labels.
train_meta_df = pd.DataFrame(train_metadata)
train_meta_df.columns = cols

# Explode Image position

In [13]:
def extractPosition(positionList):
    """Return the position as a Python object, parsing it first if it is text.

    Parameters
    ----------
    positionList : str or object
        Either a string holding a Python literal (e.g. ``'[1.0, 2.0, 3.0]'``)
        or an already-parsed value.

    Returns
    -------
    object
        ``literal_eval(positionList)`` for strings; any other input is returned
        unchanged.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when a string is not a valid literal.
    """
    # isinstance, unlike an exact type() comparison, also recognises str
    # subclasses, so those are parsed instead of being passed through as text.
    if isinstance(positionList, str):
        return literal_eval(positionList)
    return positionList

# Normalise every position first: string forms are parsed, anything else is kept.
position_series = train_meta_df['Image Position (Patient)'].apply(extractPosition)
train_meta_df['Image Position (Patient)'] = position_series

# Then spread each position over separate columns, aligned on the frame's index.
position_columns = ['Image Position x', 'Image Position y', 'Image Position z']
train_meta_df[position_columns] = pd.DataFrame(
    position_series.values.tolist(), index=train_meta_df.index)

# Drop single-valued columns and the already-expanded Image Position (Patient) column

In [14]:
# Columns removed from the metadata table before it is saved.
DROP_COLUMNS = [
    'Modality',
    'Photometric Interpretation',
    'Study ID',
    'Samples per Pixel',
    'Bits Allocated',
    'Rescale Slope',
    'Image Position (Patient)',
]
# NOTE: this rebinds train_meta_df, so running the cell a second time raises a
# KeyError because the columns are already gone.
train_meta_df = train_meta_df.drop(columns=DROP_COLUMNS)

# Save to CSV

In [15]:
# Write the table into DATA_DIR; index=False keeps the row index out of the file.
metadata_csv_path = DATA_DIR + '/stage_2_train_metadata.csv'
train_meta_df.to_csv(metadata_csv_path, index=False)