In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pydicom
import numpy as np
import warnings
import multiprocessing
import os

warnings.filterwarnings('ignore')

In [4]:
sns.set_style('darkgrid')
sns.set_context('notebook', font_scale=1.2)

In [6]:
RAWDATA_DIR = os.path.join('..', 'data', 'raw')

In [11]:
tr = pd.read_csv(os.path.join(RAWDATA_DIR, 'stage_1_train_labels.csv'))
tr['aspect_ratio'] = tr['width'] / tr['height']
tr['area'] = tr['width'] * tr['height']
patient_ids = list(tr.patientId.unique())

In [14]:
def get_info(patientId, root_dir=os.path.join(RAWDATA_DIR, 'stage_1_train_images/')):
    fn = os.path.join(root_dir, f'{patientId}.dcm')
    dcm_data = pydicom.read_file(fn)
    return {'age': dcm_data.PatientAge, 
            'gender': dcm_data.PatientSex, 
            'id': os.path.basename(fn).split('.')[0],
            'pixel_spacing': float(dcm_data.PixelSpacing[0]),
            'mean_black_pixels': np.mean(dcm_data.pixel_array == 0)}

In [16]:
with multiprocessing.Pool(4) as pool:
    result = pool.map(get_info, patient_ids)
demo = pd.DataFrame(result)
demo['gender'] = demo['gender'].astype('category')
demo['age'] = demo['age'].astype(int)

tr = tr.merge(demo, left_on='patientId', right_on='id', how='left').drop(columns='id')

In [17]:
tr.columnsmns

Index(['patientId', 'x', 'y', 'width', 'height', 'Target', 'aspect_ratio',
       'area', 'age_x', 'gender_x', 'mean_black_pixels_x', 'pixel_spacing_x',
       'age_y', 'gender_y', 'mean_black_pixels_y', 'pixel_spacing_y'],
      dtype='object')