In [83]:
import pandas as pd
import numpy as np
import os

In [84]:
# Importing the data

In [85]:
DATA_PATH = 'covid-chestxray-dataset'

In [86]:
df = pd.read_csv(os.path.join(DATA_PATH, 'metadata.csv'))

In [87]:
df.head()

Unnamed: 0,patientid,offset,sex,age,finding,survival,intubated,intubation_present,went_icu,needed_supplemental_O2,...,date,location,folder,filename,doi,url,license,clinical_notes,other_notes,Unnamed: 27
0,2,0.0,M,65.0,COVID-19,Y,,,,,...,"January 22, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,infiltrate in the upper lobe of the left lung,,
1,2,3.0,M,65.0,COVID-19,Y,,,,,...,"January 25, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
2,2,5.0,M,65.0,COVID-19,Y,,,,,...,"January 27, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
3,2,6.0,M,65.0,COVID-19,Y,,,,,...,"January 28, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
4,4,0.0,F,52.0,COVID-19,,,,,,...,"January 25, 2020","Changhua Christian Hospital, Changhua City, Ta...",images,nejmc2001573_f1a.jpeg,10.1056/NEJMc2001573,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,diffuse infiltrates in the bilateral lower lungs,,


In [89]:
df['finding'].unique()

array(['COVID-19', 'ARDS', 'SARS', 'Pneumocystis', 'Streptococcus',
       'No Finding', 'Chlamydophila', 'E.Coli', 'COVID-19, ARDS',
       'Klebsiella', 'Legionella'], dtype=object)

# Focus only on COVID-19, so we relabel the findings as a binary class

* 1 if `finding` is `COVID-19` or `COVID-19, ARDS`
* 0 otherwise

In [90]:
# Boolean mask: True where the finding is one of the two COVID-19 labels.
findings = df['finding']
is_covid = findings.isin(['COVID-19', 'COVID-19, ARDS'])

In [91]:
# Cast the boolean mask straight to 0/1 integers. astype names the target dtype
# explicitly, whereas replace([True, False], [1, 0]) leaves the result dtype to
# pandas' own inference.
is_covid_binary = is_covid.astype('int64')

In [92]:
# Store the 0/1 COVID label as a new 'class' column (this modifies df in place).
df['class'] = is_covid_binary

## Using images as features

In [93]:
# Series of image file names taken from the metadata.
# NOTE(review): not referenced again in the cells visible here — confirm it is still needed.
image_filenames = df['filename']

In [94]:
df.shape

(345, 29)

In [95]:
df[df.modality == 'X-ray'].shape

(301, 29)

In [96]:
df[df.modality == 'CT'].shape

(44, 29)

## Filter out images that do not exist

In [97]:
IMG_PATH = os.path.join(DATA_PATH, 'images')

In [98]:
# List every entry in the images directory and preview the first ten names.
# os.listdir returns names in arbitrary order, so the preview can differ between machines.
all_imgs = os.listdir(IMG_PATH)
all_imgs[:10]

['01E392EE-69F9-4E33-BFCE-E5C968654078.jpeg',
 '03BF7561-A9BA-4C3C-B8A0-D3E585F73F3C.jpeg',
 '1-s2.0-S0140673620303706-fx1_lrg.jpg',
 '1-s2.0-S0929664620300449-gr2_lrg-a.jpg',
 '1-s2.0-S0929664620300449-gr2_lrg-b.jpg',
 '1-s2.0-S0929664620300449-gr2_lrg-c.jpg',
 '1-s2.0-S0929664620300449-gr2_lrg-d.jpg',
 '1-s2.0-S0929664620300449-gr3_lrg-a.jpg',
 '1-s2.0-S0929664620300449-gr3_lrg-b.jpg',
 '1-s2.0-S0929664620300449-gr3_lrg-c.jpg']

Step 1) Remove rows in the dataframe that do not have the image

Step 2) Remove images that do not have a row

In [100]:
# Keep only the three columns needed downstream: image name, modality and the 0/1 label.
simple_df = df.loc[:, ['filename', 'modality', 'class']]

In [101]:
simple_df.head()

Unnamed: 0,filename,modality,class
0,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,X-ray,1
1,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,X-ray,1
2,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,X-ray,1
3,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,X-ray,1
4,nejmc2001573_f1a.jpeg,X-ray,1


In [102]:
simple_df.shape

(345, 3)

In [115]:
# Start from an empty frame with the same three columns that simple_df keeps.
df_filt = pd.DataFrame(columns=['filename', 'modality', 'class'])

In [116]:
# Keep only the metadata rows whose image file is actually present under IMG_PATH.
# A boolean mask replaces the row-by-row DataFrame.append, which was removed in
# pandas 2.0 and copied the whole frame on every iteration. Selecting from
# simple_df also preserves the original column dtypes.
has_image = simple_df['filename'].map(
    # Missing file names (NaN) are treated as "no image" instead of raising in os.path.join.
    lambda filename: isinstance(filename, str)
    and os.path.isfile(os.path.join(IMG_PATH, filename))
).astype(bool)
df_filt = simple_df.loc[has_image, ['filename', 'modality', 'class']].reset_index(drop=True)

In [117]:
df_filt.shape

(324, 3)

# Handle images that do not have metadata

In [72]:
# File names that still have a metadata row after the missing-image filter.
imgs_with_metadata = df_filt.filename.values

In [104]:
imgs_without_metadata = []

In [105]:
# Re-list the images directory so the comparison below uses the current directory contents.
all_imgs = os.listdir(IMG_PATH)

In [108]:
# Collect the images on disk whose file name has no row in the filtered metadata.
# The list is rebuilt from scratch, so re-running this cell cannot append duplicates,
# and a set gives constant-time membership tests instead of scanning the array per image.
known_filenames = set(imgs_with_metadata)
imgs_without_metadata = [img for img in all_imgs if img not in known_filenames]

In [109]:
# We can delete these because we have no label or any information associated with these images

In [111]:
# Permanently delete every image that has no metadata row. This cannot be undone,
# so keep a copy of the dataset if the files might be needed later.
for img in imgs_without_metadata:
    img_path = os.path.join(IMG_PATH, img)
    # Skip entries that are already gone (e.g. the cell was run before) or that are
    # not regular files, so a re-run does not stop part-way through the list.
    if not os.path.isfile(img_path):
        continue
    os.remove(img_path)
    print("Deleted: {0}".format(img))

Deleted: auntminnie-2020_01_31_20_24_2322_2020_01_31_x-ray_coronavirus_US.jpg
Deleted: MERS-CoV-1-s2.0-S0378603X1500248X-gr4e.jpg
Deleted: nCoV-Snohomish-20382862_web1_M1-Lungs-EDH-200201-640x300@2x.jpg
Deleted: radiopaedia-2019-novel-coronavirus-infected-pneumonia.jpg


# Sanity check
Verify that the number of files in the images directory matches the number of rows in the filtered metadata (`df_filt`).

In [119]:
# True when the filtered metadata has exactly one row per file left in the images directory.
len(df_filt) == len(os.listdir(IMG_PATH))

True

## Sort data into training/validation set

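Since we have filtered out all the CT images, we are left with X-rays only. (Note: no cell above filters on `modality`; rows were dropped only when their image file was missing, so check `df_filt['modality']` to confirm this.)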

We will do a balanced 80-20 split on the X-ray images.

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Load the image-name / label table used for the train/validation split.
# NOTE(review): no cell visible here writes Covid.csv — confirm how it was produced
# (e.g. whether it was exported from df_filt) so the notebook can be re-run from scratch.
covid_df = pd.read_csv('Covid.csv')

In [9]:
covid_df

Unnamed: 0,Image Directory,Class Label (COVID +=1/-=0)
0,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,1
1,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,1
2,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,1
3,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,1
4,nejmc2001573_f1a.jpeg,1
...,...,...
263,covid-19-pneumonia-58-day-9.jpg,1
264,covid-19-pneumonia-58-day-10.jpg,1
265,covid-19-pneumonia-mild.JPG,1
266,covid-19-pneumonia-67.jpeg,1


In [10]:
# Images directory written as a literal path; it names the same 'images' folder under
# 'covid-chestxray-dataset' that IMG_PATH is built from earlier in the notebook.
IMAGE_PATH = 'covid-chestxray-dataset/images'

In [12]:
# Give the two Covid.csv columns the short names used for the metadata frames.
covid_df = covid_df.rename(
    {
        'Image Directory': 'filename',
        'Class Label (COVID +=1/-=0)': 'class',
    },
    axis='columns',
)

In [15]:
# Look up one of the image files listed in the "Deleted:" output earlier;
# an empty result means Covid.csv has no row for it.
covid_df.loc[
    covid_df['filename'] == 'auntminnie-2020_01_31_20_24_2322_2020_01_31_x-ray_coronavirus_US.jpg'
]

Unnamed: 0,filename,class
