In [51]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

COVID_DIR = 'covid-chestxray-dataset'
PNEUMONIA_DIR = 'chest_xray'

if COVID_DIR not in os.listdir():
    !git clone https://github.com/ieee8023/covid-chestxray-dataset

if PNEUMONIA_DIR not in os.listdir():
    !kaggle datasets download paultimothymooney/chest-xray-pneumonia && unzip chest-xray-pneumonia.zip && rm chest-xray-pneumonia.zip

metadata = pd.read_csv(f'{COVID_DIR}/metadata.csv')

In [52]:
# Preview the first five metadata rows (five is the default for head()).
metadata.head()

Unnamed: 0,Patientid,offset,sex,age,finding,survival,view,modality,date,location,filename,doi,url,license,clinical notes,other notes,Unnamed: 16
0,2,0.0,M,65.0,COVID-19,Y,PA,X-ray,"January 22, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,infiltrate in the upper lobe of the left lung,,
1,2,3.0,M,65.0,COVID-19,Y,PA,X-ray,"January 25, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
2,2,5.0,M,65.0,COVID-19,Y,PA,X-ray,"January 27, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
3,2,6.0,M,65.0,COVID-19,Y,PA,X-ray,"January 28, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
4,4,0.0,F,52.0,COVID-19,,PA,X-ray,"January 25, 2020","Changhua Christian Hospital, Changhua City, Ta...",nejmc2001573_f1a.jpeg,10.1056/NEJMc2001573,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,diffuse infiltrates in the bilateral lower lungs,,


In [53]:
# Keep only PA-view images ...
is_pa_view = metadata['view'] == 'PA'
# ... whose finding is one of the diagnoses with more than 1 instance.
has_kept_finding = metadata['finding'].isin(('COVID-19', 'SARS', 'MERS', 'Streptococcus'))

metadata = metadata.loc[is_pa_view & has_kept_finding, :]
metadata.head()

Unnamed: 0,Patientid,offset,sex,age,finding,survival,view,modality,date,location,filename,doi,url,license,clinical notes,other notes,Unnamed: 16
0,2,0.0,M,65.0,COVID-19,Y,PA,X-ray,"January 22, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,infiltrate in the upper lobe of the left lung,,
1,2,3.0,M,65.0,COVID-19,Y,PA,X-ray,"January 25, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
2,2,5.0,M,65.0,COVID-19,Y,PA,X-ray,"January 27, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
3,2,6.0,M,65.0,COVID-19,Y,PA,X-ray,"January 28, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
4,4,0.0,F,52.0,COVID-19,,PA,X-ray,"January 25, 2020","Changhua Christian Hospital, Changhua City, Ta...",nejmc2001573_f1a.jpeg,10.1056/NEJMc2001573,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,diffuse infiltrates in the bilateral lower lungs,,


In [54]:
# Two-stage stratified split on `finding`: first hold out 10% as the test set,
# then carve 30% of the remainder off as the validation set.
# NOTE(review): rows are split per image, not per patient. The metadata preview
# shows several rows sharing one Patientid, so the same patient may end up in
# more than one split -- confirm whether a group-aware split is needed.
SPLIT_SEED = 42

X_train_full, X_test = train_test_split(
    metadata,
    test_size=0.1,
    stratify=metadata['finding'],
    random_state=SPLIT_SEED,
)
X_train, X_val = train_test_split(
    X_train_full,
    test_size=0.3,
    stratify=X_train_full['finding'],
    random_state=SPLIT_SEED,
)

In [55]:
# Class labels; every split folder gets one sub-folder per label.
# 'vp' and 'bp' are the folders that receive the "virus" and "bacteria"
# pneumonia images respectively.
classes = ['COVID-19', 'vp', 'bp', 'normal']

# Maps a metadata `finding` value onto the class folder its image belongs in.
diagnoses = {
    'COVID-19': 'COVID-19',
    'SARS': 'vp',
    'MERS': 'vp',
    'Streptococcus': 'bp'
}

train_dir = 'train'
test_dir = 'test'
val_dir = 'val'
covid_img_dir = f'{COVID_DIR}/images'

all_dirs = [train_dir, test_dir, val_dir]
# Spelled out explicitly so each split folder is visibly paired with its frame
# instead of relying on two lists staying in the same positional order.
dir_mapping = {
    train_dir: X_train,
    test_dir: X_test,
    val_dir: X_val,
}

# Create <split>/<class> for every combination. exist_ok=True already makes this
# safe to re-run, so no separate os.path.exists() check is needed (and a path
# that exists but is not a directory now fails loudly here instead of later).
for dir_ in all_dirs:
    for class_ in classes:
        os.makedirs(f'{dir_}/{class_}', exist_ok=True)

In [56]:
# covid data
# Move every image referenced by a split's metadata rows into
# <split>/<class>/, where the class comes from the `diagnoses` mapping.
for dir_, split_frame in dir_mapping.items():
    for filename, finding in split_frame[['filename', 'finding']].values:
        src = f'{covid_img_dir}/{filename}'
        # Test this one path instead of re-listing the whole image directory for
        # every row; rows whose file is missing (or was already moved) are skipped.
        if finding in diagnoses and os.path.isfile(src):
            shutil.move(src, f'{dir_}/{diagnoses[finding]}/{filename}')

In [57]:
# pneumonia data
# Sort the pneumonia dataset's images into <split>/normal, <split>/vp and
# <split>/bp, mirroring the split folders the dataset already ships with.
pneum_dirs = ['NORMAL', 'PNEUMONIA']

# Filename keyword -> destination class folder; checked in this order, first hit wins.
keyword_to_class = (
    ('normal', 'normal'),
    ('virus', 'vp'),
    ('bacteria', 'bp'),
)
# Extensions accepted by the folder-based fallback below, so stray non-image
# files (e.g. OS metadata files) are never moved into a class folder.
image_exts = ('.jpeg', '.jpg', '.png')

for dir_ in all_dirs:
    topdir = f'{PNEUMONIA_DIR}/{dir_}'
    for pneum_dir in pneum_dirs:
        curdir = f'{topdir}/{pneum_dir}'
        for img in os.listdir(curdir):
            lowered = img.lower()
            label = next((cls for kw, cls in keyword_to_class if kw in lowered), None)
            # Fallback: NORMAL folders also hold images whose names do not contain
            # "normal" (the Kaggle archive has files named like IM-0115-0001.jpeg).
            # Without this they match no keyword and are silently left behind.
            if label is None and pneum_dir == 'NORMAL' and lowered.endswith(image_exts):
                label = 'normal'
            if label is not None:
                shutil.move(f'{curdir}/{img}', f'{dir_}/{label}/{img}')