In [1]:
import numpy as np
import pandas as pd
import pydicom as dicom
import cv2
import os
import matplotlib.pyplot as plt
import random
%matplotlib inline 

In [2]:
# Cohen et al. COVID-19 chest X-ray dataset:
# https://github.com/ieee8023/covid-chestxray-dataset
imgpath = 'data/cohen/images'
csvpath = 'data/cohen/metadata.csv'

# RSNA dataset: image folder, the CSV used to pick the normal cases,
# and the CSV used to pick the pneumonia cases
rsna_img = "data/rsna/images"
rsna_csv = "data/rsna/stage_2_detailed_class_info.csv"
rsna_csv2 = "data/rsna/stage_2_train_labels.csv"

# Map each raw finding / label onto one of the three target classes
mapping = {
    'COVID-19': 'COVID-19',
    **dict.fromkeys(
        ['SARS', 'MERS', 'Streptococcus', 'Klebsiella', 'Chlamydophila', 'Legionella'],
        'pneumonia',
    ),
    'Normal': 'normal',
    'Lung Opacity': 'pneumonia',
    '1': 'pneumonia',
}

In [3]:
# Read the Cohen metadata and keep only the rows whose `view` column is
# one of the listed PA / AP projections.
views = ["PA", "AP", "AP Supine", "AP semi erect", "AP erect"]
metadata = pd.read_csv(csvpath).loc[lambda frame: frame["view"].isin(views)]

In [4]:
# Bucket the Cohen image filenames by target class and count them.
# NOTE(review): the entries are image filenames, not patient ids, so if one
# patient has several images they can land in different splits later —
# confirm whether a per-patient split is wanted.
patients1 = {'normal': [], 'pneumonia': [], 'COVID-19': []}
count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
for finding, filename in zip(metadata['finding'], metadata['filename']):
    # A missing finding is read by pandas as NaN (a float) and has no
    # .split(); skip such rows instead of crashing the cell.
    if not isinstance(finding, str):
        continue
    # Only the first listed finding decides the class, e.g. "COVID-19, ARDS".
    label = mapping.get(finding.split(',')[0].strip())
    if label is None:  # finding is not one of the mapped diseases
        continue
    count[label] += 1
    patients1[label].append(filename)

In [5]:
# Collect the RSNA patient ids for the 'normal' and 'pneumonia' classes.
# The same patientId may appear on several rows (presumably one row per
# annotated region — TODO confirm), so each id is kept only once; otherwise a
# patient would be counted, sampled and written more than once.
csv_normal = pd.read_csv(rsna_csv)
csv_pneu = pd.read_csv(rsna_csv2)

patients2 = {
    # rows labelled 'Normal' in the detailed class info
    'normal': (
        csv_normal.loc[csv_normal['class'] == 'Normal', 'patientId']
        .drop_duplicates()
        .tolist()
    ),
    # rows whose Target flag equals 1 in the train labels
    'pneumonia': (
        csv_pneu.loc[csv_pneu['Target'].astype(int) == 1, 'patientId']
        .drop_duplicates()
        .tolist()
    ),
}

In [6]:
# Create the output tree dataset/{train,test}/{COVID-19,normal,pneumonia}.
# os.makedirs(..., exist_ok=True) makes this cell safe to re-run and also
# completes a partially created tree; a chain of os.mkdir calls would stop
# at the first directory that already exists and skip the rest.
path_train = os.path.abspath('dataset/train')
path_test = os.path.abspath('dataset/test')
for split_dir in (path_train, path_test):
    for class_name in ('COVID-19', 'normal', 'pneumonia'):
        os.makedirs(os.path.join(split_dir, class_name), exist_ok=True)

In [7]:
def gen_test_dict(patients: dict, test_split=0.3, seed=None):
    """Randomly split every class of ``patients`` into train and test ids.

    Parameters
    ----------
    patients : dict
        Maps a class name to a list of hashable ids (e.g. filenames).
        Repeated ids within a class are collapsed to one occurrence.
    test_split : float, default 0.3
        Fraction of each class's unique ids placed in the test set; the test
        size is ``int(test_split * n_unique)``, i.e. rounded down.
    seed : int or None, default None
        When given, ids are drawn from a private ``random.Random(seed)`` so
        the split is reproducible. When ``None``, the module-level ``random``
        state is used (seed it with ``random.seed`` for reproducibility).

    Returns
    -------
    (train, test) : tuple of dict
        Two dicts with the same keys as ``patients``. For each key the two
        lists are disjoint and together contain every unique id once; the
        train list keeps the ids in their original order.

    Raises
    ------
    ValueError
        If the computed test size is negative or larger than the number of
        unique ids (raised by ``random.sample``).
    """
    rng = random if seed is None else random.Random(seed)
    train, test = dict(), dict()
    for key, ids in patients.items():
        # dict.fromkeys drops repeats while keeping first-seen order, so an
        # id can never be drawn twice or inflate the test size.
        unique_ids = list(dict.fromkeys(ids))
        test[key] = rng.sample(unique_ids, int(test_split * len(unique_ids)))
        held_out = set(test[key])
        # Ordered filter instead of a set difference: deterministic order.
        train[key] = [item for item in unique_ids if item not in held_out]
    return train, test

In [8]:
# Seed the module-level RNG that gen_test_dict samples from, so the same
# files are drawn for the test set on every Restart & Run All.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

train1, test1 = gen_test_dict(patients1)
train2, test2 = gen_test_dict(patients2)

In [9]:
# saving data from the first dataset to form general dataset
def save_jpg(save_path, patients1, src_path='data/cohen/images'):
    """Re-save the listed images into ``save_path/<class>/`` with OpenCV.

    Parameters
    ----------
    save_path : str
        Root directory of one split; one sub-directory per key of
        ``patients1`` is used, and created if it does not exist.
    patients1 : dict
        Maps a class name to a list of image filenames found in ``src_path``.
        Each image is written under the same filename.
    src_path : str, default 'data/cohen/images'
        Directory the source images are read from.

    Returns
    -------
    None
        Images that cannot be read or written are skipped (best effort) and
        reported in a one-line summary instead of being dropped silently.
    """
    src_dir = os.path.abspath(src_path)
    skipped = []

    for key, filenames in patients1.items():
        # Write through explicit paths instead of os.chdir(), so the
        # notebook's working directory can never be left changed by an error.
        dst_dir = os.path.join(save_path, key)
        os.makedirs(dst_dir, exist_ok=True)
        for img in filenames:
            try:
                # cv2.imread returns None (it does not raise) when the file
                # is missing or cannot be decoded.
                image = cv2.imread(os.path.join(src_dir, img))
                # imread yields BGR and imwrite expects BGR, so no channel
                # conversion is applied; swapping to RGB first would store
                # colour images with red and blue exchanged.
                ok = image is not None and cv2.imwrite(os.path.join(dst_dir, img), image)
            except Exception:  # stay best-effort, but let Ctrl-C through
                ok = False
            if not ok:
                skipped.append(img)

    if skipped:
        print("save_jpg: skipped", len(skipped), "image(s), e.g.", skipped[:5])


In [10]:
# save process, but for data from the second dataset
def save_dcm(save_path, patients2, src_path='data/rsna/images'):
    """Convert the listed DICOM files to JPEG under ``save_path/<class>/``.

    Parameters
    ----------
    save_path : str
        Root directory of one split; one sub-directory per key of
        ``patients2`` is used, and created if it does not exist.
    patients2 : dict
        Maps a class name to a list of ids; ``<id>.dcm`` is read from
        ``src_path`` and ``<id>.jpg`` is written.
    src_path : str, default 'data/rsna/images'
        Directory the ``.dcm`` files are read from.

    Returns
    -------
    None
        Files that cannot be read, converted or written are skipped (best
        effort) and reported in a one-line summary instead of being dropped
        silently.
    """
    src_dir = os.path.abspath(src_path)
    skipped = []

    for key, patient_ids in patients2.items():
        # Write through explicit paths instead of os.chdir(), so the
        # notebook's working directory can never be left changed by an error.
        dst_dir = os.path.join(save_path, key)
        os.makedirs(dst_dir, exist_ok=True)
        for img in patient_ids:
            try:
                image = dicom.dcmread(os.path.join(src_dir, img + '.dcm')).pixel_array
                if image.ndim == 2:
                    # Single-channel scan: a BGR<->RGB conversion rejects
                    # 1-channel input, so expand gray to three equal channels.
                    image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
                else:
                    # Multi-channel array: reorder channels for imwrite, which
                    # expects BGR (assumes pixel_array is RGB — TODO confirm).
                    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                ok = cv2.imwrite(os.path.join(dst_dir, img + '.jpg'), image)
            except Exception:  # stay best-effort, but let Ctrl-C through
                ok = False
            if not ok:
                skipped.append(img)

    if skipped:
        print("save_dcm: skipped", len(skipped), "file(s), e.g.", skipped[:5])

In [11]:
# Resolve the split folders once, then export each source dataset:
# train split first, test split second, Cohen images before RSNA DICOMs.
save_path = os.path.abspath('dataset')
train_path, test_path = (os.path.join(save_path, split) for split in ('train', 'test'))

for writer, train_ids, test_ids in ((save_jpg, train1, test1), (save_dcm, train2, test2)):
    writer(train_path, train_ids)
    writer(test_path, test_ids)