## Capstone Notebook 1
## EDA and Image preprocessing

This notebook covers the Exploratory Data Analysis of the BIMCV-COVID-19 Chest X-ray (CXR) dataset:
https://github.com/BIMCV-CSUSP/BIMCV-COVID-19

Code for image preprocessing is also here, which is adapted from the following file in the above repo:
<ins>my_data_generator.py</ins>

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras
from keras.preprocessing.image import img_to_array
from skimage.transform import resize

from PIL import Image

Using TensorFlow backend.


Start from the file list of all images: ./balanced-one-partition/pneumo_dataset_ITI_rev.tsv

In [3]:
df_pneumo_2d = pd.read_csv('./balanced-one-partition/pneumo_dataset_ITI_rev.tsv' , sep="\t" )

Our Target variable will be group

In [4]:
features=['ImageID','Projection','MethodProjection','group','Valid','Blurry']
df_pneumo_2d.columns

Index(['Unnamed: 0', 'ImageID', 'StudyDate_DICOM', 'StudyID', 'PatientID',
       'PatientBirth', 'PatientSex_DICOM', 'ViewPosition_DICOM', 'Projection',
       'MethodProjection', 'Pediatric', 'Modality_DICOM', 'Manufacturer_DICOM',
       'PhotometricInterpretation_DICOM', 'PixelRepresentation_DICOM',
       'PixelAspectRatio_DICOM', 'SpatialResolution_DICOM', 'BitsStored_DICOM',
       'WindowCenter_DICOM', 'WindowWidth_DICOM', 'Rows_DICOM',
       'Columns_DICOM', 'XRayTubeCurrent_DICOM', 'Exposure_DICOM',
       'ExposureInuAs_DICOM', 'ExposureTime', 'RelativeXRayExposure_DICOM',
       'Labels', 'group', 'Partition', 'Subject_occurrences',
       'Partition_occurrences', 'Partitionlabel_occurrences', 'Valid',
       'Blurry', 'Rotation_needed', 'Repeat', 'Observations'],
      dtype='object')

In [5]:
df_pneumo_2d[features]

Unnamed: 0,ImageID,Projection,MethodProjection,group,Valid,Blurry
0,126022968388682456059208259745221627283_wb7atn...,L,Manual review of DICOM fields,C,0,0
1,126022968388682456059208259745221627283_nul4bn...,PA,Manual review of DICOM fields,C,0,0
2,126022968388682456059208259745221627283_40l46j...,PA,Manual review of DICOM fields,C,1,0
3,126022968388682456059208259745221627283_mdr23v...,PA,Manual review of DICOM fields,C,0,0
4,126022968388682456059208259745221627283_eed72o...,L,Manual review of DICOM fields,C,1,0
...,...,...,...,...,...,...
23516,99520118437198458972509421695745364861_qsbymr.png,PA,Manual review of DICOM fields,C,1,0
23517,99711749323098443927719696849300549782_zqpdv1.png,PA,Manual review of DICOM fields,C,1,0
23518,99732995977478883316522560170983113855_q9uu8l.png,PA,Manual review of DICOM fields,C,1,0
23519,99829469149321466659827372110305233978_wrmsh0.png,PA,Manual review of DICOM fields,C,1,0


In [6]:
df_pneumo_2d.group.unique()

array(['C', 'N', 'I', 'NI'], dtype=object)

In [7]:
df_test=df_pneumo_2d

In [8]:
df_test[df_test.group==('NI')];
df_test.head()

Unnamed: 0.1,Unnamed: 0,ImageID,StudyDate_DICOM,StudyID,PatientID,PatientBirth,PatientSex_DICOM,ViewPosition_DICOM,Projection,MethodProjection,...,group,Partition,Subject_occurrences,Partition_occurrences,Partitionlabel_occurrences,Valid,Blurry,Rotation_needed,Repeat,Observations
0,14701,126022968388682456059208259745221627283_wb7atn...,20160115,126022968388682456059208259745221627283,331532285849366680169157256726092267279,1935,F,,L,Manual review of DICOM fields,...,C,tr,12,15612,7808,0,0,0,24,Repeated
1,14702,126022968388682456059208259745221627283_nul4bn...,20160115,126022968388682456059208259745221627283,331532285849366680169157256726092267279,1935,F,,PA,Manual review of DICOM fields,...,C,tr,12,15612,7808,0,0,0,25,Repeated
2,14703,126022968388682456059208259745221627283_40l46j...,20160115,126022968388682456059208259745221627283,331532285849366680169157256726092267279,1935,F,,PA,Manual review of DICOM fields,...,C,tr,12,15612,7808,1,0,0,25,
3,14704,126022968388682456059208259745221627283_mdr23v...,20160115,126022968388682456059208259745221627283,331532285849366680169157256726092267279,1935,F,,PA,Manual review of DICOM fields,...,C,tr,12,15612,7808,0,0,0,25,Repeated
4,14705,126022968388682456059208259745221627283_eed72o...,20160115,126022968388682456059208259745221627283,331532285849366680169157256726092267279,1935,F,,L,Manual review of DICOM fields,...,C,tr,12,15612,7808,1,0,0,24,


In [9]:
df_test=df_test[df_test.Valid==1]
df_test=df_test[df_test.Blurry==0]
df_test=df_test[df_test.Repeat==-1]
df_test.reset_index(drop=True,inplace=True)

In [10]:
df_test['group'].unique()

array(['C', 'N', 'I', 'NI'], dtype=object)

In [11]:
df_test['group']= [1 if df_test.group[x]==('N' or 'NI')  else 0 for x in range(df_test.shape[0])]

In [12]:
df_test.groupby('group').describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,StudyDate_DICOM,StudyDate_DICOM,...,Rotation_needed,Rotation_needed,Repeat,Repeat,Repeat,Repeat,Repeat,Repeat,Repeat,Repeat
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,17299.0,15065.19926,5496.844932,5921.0,10402.5,14816.0,19161.5,26014.0,17299.0,20140960.0,...,0.0,270.0,17299.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,5725.0,2970.943581,1709.58108,0.0,1502.0,2964.0,4451.0,5920.0,5725.0,20121610.0,...,0.0,270.0,5725.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [13]:
df_test.groupby('PatientBirth').count()

Unnamed: 0_level_0,Unnamed: 0,ImageID,StudyDate_DICOM,StudyID,PatientID,PatientSex_DICOM,ViewPosition_DICOM,Projection,MethodProjection,Pediatric,...,group,Partition,Subject_occurrences,Partition_occurrences,Partitionlabel_occurrences,Valid,Blurry,Rotation_needed,Repeat,Observations
PatientBirth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1911,4,4,4,4,4,4,0,4,4,4,...,4,4,4,4,4,4,4,4,4,0
1912,4,4,4,4,4,4,0,4,4,4,...,4,4,4,4,4,4,4,4,4,0
1913,4,4,4,4,4,4,0,4,4,4,...,4,4,4,4,4,4,4,4,4,0
1914,16,16,16,16,16,16,6,16,16,16,...,16,16,16,16,16,16,16,16,16,0
1915,19,19,19,19,19,19,2,19,19,19,...,19,19,19,19,19,19,19,19,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012,93,93,93,93,93,93,17,93,93,93,...,93,93,93,93,93,93,93,93,93,0
2013,63,63,63,63,63,63,21,63,63,63,...,63,63,63,63,63,63,63,63,63,0
2014,69,69,69,69,69,69,11,69,69,69,...,69,69,69,69,69,69,69,69,69,0
2015,29,29,29,29,29,29,8,29,29,29,...,29,29,29,29,29,29,29,29,29,0


In [None]:
df_test.to_csv('./balanced-one-partition/pneumo_dataset_ITI_rev_clean.tsv',sep='\t')