In [1]:
%matplotlib inline

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pydicom

import gc
import warnings
# Ignore every warning raised from this point on. Note that this also hides
# deprecation warnings emitted by the libraries imported above.
warnings.simplefilter(action = 'ignore')

### Origin of data for training
We have to consider several sources of data for training
* stage_1_detailed_class_info: provides the class for each image id: Lung Opacity (i.e. pneumonia diagnosis), No Lung Opacity / Not Normal, Normal
* stage_1_train_labels: the bounding boxes:
    - patientId - A patientId. Each patientId corresponds to a unique image.
    - x - the upper-left x coordinate of the bounding box.
    - y - the upper-left y coordinate of the bounding box.
    - width - the width of the bounding box.
    - height - the height of the bounding box.
    - Target - the binary Target, indicating whether this sample has evidence of pneumonia.
* the image files themselves, in DICOM format, which contain some useful meta-data (from https://www.kaggle.com/aantonova/practical-eda-on-numerical-data):
    - age
    - sex
    - ViewPosition

#### Get the boxes and class

In [3]:
# Load the per-image class labels and the bounding-box annotations, then join
# them into a single frame `df`.

detailed_class_info = pd.read_csv('../data/stage_1_detailed_class_info.csv')

# detailed_class_info (class) has the same rows as train_labels (boxes), so an
# image with several boxes appears several times: keep a single row per image.
detailed_class_info = detailed_class_info.drop_duplicates()

train_labels = pd.read_csv('../data/stage_1_train_labels.csv')

# Left join on the image id: every image keeps its class, and an image with
# several boxes yields one row per box.
df = pd.merge(left = detailed_class_info, right = train_labels, how = 'left', on = 'patientId')

# The two source frames are no longer needed once merged
del detailed_class_info, train_labels
gc.collect()

# `null_counts` was deprecated in pandas 1.2 and removed in pandas 2.0;
# `show_counts` is its replacement and prints the same non-null counts.
df.info(show_counts = True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 28989 entries, 0 to 28988
Data columns (total 7 columns):
patientId    28989 non-null object
class        28989 non-null object
x            8964 non-null float64
y            8964 non-null float64
width        8964 non-null float64
height       8964 non-null float64
Target       28989 non-null int64
dtypes: float64(4), int64(1), object(2)
memory usage: 1.8+ MB


#### Retrieve the meta-data from the DICOM images (takes several tens of minutes)

In [4]:
# Read the DICOM header of every training image once, collect the metadata as
# one record per image, and attach it to `df` with a single merge on patientId.
# (Assigning values row by row with a boolean mask rescans the whole frame for
# every image and every column, which is what made this step so slow.)

# Elements never copied into `df`: PixelData is the image itself; PixelSpacing
# is presumably skipped because it is multi-valued -- TODO confirm.
excluded_elements = {'PixelSpacing', 'PixelData'}

dcm_columns = None
dcm_records = []

for n, pid in enumerate(df['patientId'].unique()):
    dcm_file = '../data/stage_1_train_images/%s.dcm' % pid
    # `dcmread` replaces the deprecated `read_file`; only the header is needed,
    # so the pixel data is not loaded.
    dcm_data = pydicom.dcmread(dcm_file, stop_before_pixels = True)

    if dcm_columns is None:
        # The first image defines which metadata elements are extracted for all images
        dcm_columns = [col for col in dcm_data.dir() if col not in excluded_elements]

    record = {'patientId': pid}
    for col in dcm_columns:
        # An element absent from this particular file is recorded as missing
        record[col] = dcm_data.get(col, np.nan)
    dcm_records.append(record)

    if n % 1000 == 0:
        print("Treated image n°: ", n)

if dcm_records:
    dcm_meta = pd.DataFrame(dcm_records, columns = ['patientId'] + dcm_columns)
    # Remove metadata columns left by a previous execution so that re-running
    # the cell does not create suffixed duplicates (_x / _y).
    df = df.drop(dcm_columns, axis = 1, errors = 'ignore')
    # Every row (i.e. every box) of an image receives that image's metadata
    df = df.merge(dcm_meta, how = 'left', on = 'patientId')
    del dcm_meta

del dcm_records
gc.collect()

df.head()

Unnamed: 0,patientId,class,x,y,width,height,Target,AccessionNumber,BitsAllocated,BitsStored,...,SamplesPerPixel,SeriesDescription,SeriesInstanceUID,SeriesNumber,SpecificCharacterSet,StudyDate,StudyID,StudyInstanceUID,StudyTime,ViewPosition
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,No Lung Opacity / Not Normal,,,,,0,,8.0,8.0,...,1.0,view: PA,1.2.276.0.7230010.3.1.3.8323329.28530.15178744...,1.0,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.28530.15178744...,0.0,PA
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,No Lung Opacity / Not Normal,,,,,0,,8.0,8.0,...,1.0,view: PA,1.2.276.0.7230010.3.1.3.8323329.26024.15178744...,1.0,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.26024.15178744...,0.0,PA
2,00322d4d-1c29-4943-afc9-b6754be640eb,No Lung Opacity / Not Normal,,,,,0,,8.0,8.0,...,1.0,view: AP,1.2.276.0.7230010.3.1.3.8323329.11252.15178743...,1.0,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.11252.15178743...,0.0,AP
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,Normal,,,,,0,,8.0,8.0,...,1.0,view: PA,1.2.276.0.7230010.3.1.3.8323329.2293.151787429...,1.0,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.2293.151787429...,0.0,PA
4,00436515-870c-4b36-a041-de91049b9ab4,Lung Opacity,264.0,152.0,213.0,379.0,1,,8.0,8.0,...,1.0,view: AP,1.2.276.0.7230010.3.1.3.8323329.6379.151787432...,1.0,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.6379.151787432...,0.0,AP


#### Remove the useless image meta-data

In [5]:
# Drop the metadata columns that carry no usable information and encode the
# remaining categorical ones as integers.
# NOTE: run this cell once on a freshly built `df`; the `.map` calls below turn
# already-encoded values into NaN if the cell is executed a second time.

# Number of distinct values per column
to_drop = df.nunique()
# Select columns that are constant (or entirely empty) as well as columns that
# take a different value for every image (pure identifiers) ...
to_drop = to_drop[(to_drop <= 1) | (to_drop == to_drop['patientId'])].index
# ... but keep patientId itself, which is the join key
to_drop = to_drop.drop('patientId')

# Removing all constant and per-image unique features
df = df.drop(to_drop, axis = 1)

# Removing the physician name (reported as always empty) and SeriesDescription
# (which duplicates ViewPosition, e.g. 'view: PA' vs 'PA').
# errors='ignore': either column may already have been removed by the filter above.
df = df.drop(['ReferringPhysicianName', 'SeriesDescription'], axis = 1, errors = 'ignore')

# Numerical encoding for PatientSex and ViewPosition (any other value becomes NaN)
df['PatientSex'] = df['PatientSex'].map({'F': 0, 'M': 1})
df['ViewPosition'] = df['ViewPosition'].map({'PA': 0, 'AP': 1})

df.head()

Unnamed: 0,patientId,class,x,y,width,height,Target,PatientAge,PatientSex,ViewPosition
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,No Lung Opacity / Not Normal,,,,,0,51,0,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,No Lung Opacity / Not Normal,,,,,0,48,0,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,No Lung Opacity / Not Normal,,,,,0,19,1,1
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,Normal,,,,,0,28,1,0
4,00436515-870c-4b36-a041-de91049b9ab4,Lung Opacity,264.0,152.0,213.0,379.0,1,32,0,1


#### Save the data file

In [6]:
# Persist the prepared frame (row index included) as tab-separated, UTF-8 text
file_name = '../data/prepared_data.csv'
df.to_csv(path_or_buf = file_name, encoding = 'utf-8', sep = '\t')