In [1]:
import numpy as np
import pandas as pd
import os
import random
from shutil import copyfile
#import pydicom as dicom
import cv2
from pathlib import Path

# Paths Definition

Careful: the dataset loader splits each line of the split files on whitespace, so the file names (and any path written into a split entry) must not contain whitespace.

### Paths in Ubuntu

In [1]:
# Dataset locations for the Ubuntu setup.
SIMM_COVID_DETECTION_PATH = "/media/usuario/WIN10/COVID19/images/SIIM-Detection/siim-covid19-detection"
# The SIIM cells of this notebook read SIIM_DETECTION_PATH; expose that
# spelling as well (the misspelled name is kept so existing uses still work).
SIIM_DETECTION_PATH = SIMM_COVID_DETECTION_PATH
PAPER_IMAGES_PATH = "/media/usuario/WIN10/COVID19/images/PaperOriginal/"
TEST_PATH = "/media/usuario/WIN10/COVID19/images/test"

### Paths in MAC

In [2]:
# Root folder of the image corpus for the Mac setup.
# NOTE(review): this path contains spaces, and the split files are
# whitespace-delimited, so absolute paths built from it should not be written
# into a split entry -- confirm against the loader.
BASE_PATH = '/Volumes/Alvaro HD/UPM/Corpus TFM/COVID'

In [3]:
# Per-dataset folders, all located directly under BASE_PATH.
HACKATHON_PATH = os.path.join(BASE_PATH,'hackathon')
SIIM_DETECTION_PATH = os.path.join(BASE_PATH,'siim-covid19-detection')
# Rebinds PAPER_IMAGES_PATH, which the Ubuntu section also assigns.
PAPER_IMAGES_PATH = os.path.join(BASE_PATH,'CroppedSegmentedImages')
COVID_QU_EX_PATH = os.path.join(BASE_PATH,'Masked-COVID-QU-Ex-dataset')

In [4]:
# Fresh accumulators for one split-building run.
# `count` fixes the class order (normal, pneumonia, COVID-19) that the other
# per-class dictionaries reuse.
train, test = [], []
count = dict.fromkeys(('normal', 'pneumonia', 'COVID-19'), 0)
test_count = dict(count)
train_count = dict(count)
# One independent list per class (no list object is shared between keys).
filename_label = {label: [] for label in count}
path_label = {label: [] for label in count}
# Numeric category code parsed from a file name -> class label.
mapping = dict(enumerate(('normal', 'COVID-19', 'pneumonia')))

# Reading from paper files

Prepared to work in Kaggle

In [5]:
# Walk every sub-folder of PAPER_IMAGES_PATH (hidden folders and 'CRXNIH' are
# skipped) and register each file under the class encoded in its name.
# Listings are sorted because os.listdir returns names in arbitrary order,
# which would otherwise make the running indices (and the seeded split that
# depends on them) differ between machines.
for directory in sorted(os.listdir(PAPER_IMAGES_PATH)):
    if directory.startswith('.') or directory == 'CRXNIH':
        continue
    for file in sorted(os.listdir(os.path.join(PAPER_IMAGES_PATH, directory))):
        # The category code is the second field of the name, delimited by
        # '__' or, failing that, by '_'.
        try:
            cat = int(file.split('__')[1])
        except (IndexError, ValueError):
            # If this fallback fails too, the error propagates and nothing
            # is recorded for the file.
            cat = int(file.split('_')[1])
        label = mapping[cat]
        count[label] += 1
        # Entry layout: [1-based index within the class, relative path, label].
        entry = [count[label], 'PaperOriginal' + '/' + directory + '/' + file, label]
        filename_label[label].append(entry)
print(count)

{'normal': 60594, 'pneumonia': 22746, 'COVID-19': 8014}


# Reading from SIIM-COVID-Detection

In [42]:
# Load the two SIIM annotation tables (image-level and study-level CSVs)
# from the dataset root.
image_level_df = pd.read_csv(os.path.join(SIIM_DETECTION_PATH,'train_image_level.csv'))
study_level_df = pd.read_csv(os.path.join(SIIM_DETECTION_PATH,'train_study_level.csv'))

In [43]:
# Preview the first two rows of the image-level table.
image_level_df.head(2)

Unnamed: 0,id,boxes,label,StudyInstanceUID
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed


In [44]:
# Preview the first two rows of the study-level table.
study_level_df.head(2)

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,00086460a852_study,0,1,0,0
1,000c9c05fd14_study,0,0,0,1


In [45]:
# Join image-level rows with their study-level labels.
# The study table's `id` ("<uid>_study") is turned into the join key
# `StudyInstanceUID` on a derived frame, so `study_level_df` itself is left
# untouched and this cell can be re-run without a KeyError on 'id'.
study_level_keyed = (
    study_level_df
    .assign(StudyInstanceUID=study_level_df['id'].str.replace('_study', '', regex=False))
    .drop(columns='id')
)
train_df = image_level_df.merge(study_level_keyed, on='StudyInstanceUID')
# Strip the "_image" suffix so `id` matches the image file stem.
train_df['id'] = train_df['id'].str.replace('_image', '', regex=False)
# Rows are not re-sorted here; they keep the order produced by the merge.
train_df.head()

Unnamed: 0,id,boxes,label,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,000a312787f2,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,0,1,0,0
1,000c3a3f293f,,none 1 0 0 1 1,ff0879eb20ed,1,0,0,0
2,0012ff7358bc,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,0,1,0,0
3,001398f4ff4f,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,0,0,0,1
4,001bd15d1891,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e,0,1,0,0


In [46]:
# Occurrences per image id; every count being 1 means ids are unique.
train_df['id'].value_counts()

000a312787f2    1
a6f2e4c387f0    1
a6e715a47642    1
a6de43bd68d4    1
a6dca97e3503    1
               ..
500a498ca956    1
4ffc2689420f    1
4ff81f916fac    1
4ff7b31b3269    1
ffe942c8655f    1
Name: id, Length: 6334, dtype: int64

In [48]:
# (rows, columns) of the merged table.
train_df.shape

(6334, 8)

In [13]:
# One image path per row of train_df: <SIIM_DETECTION_PATH>/train/<id>.jpg.
# The comprehension keeps the loop variable out of the notebook namespace, so
# the builtin `id` is not shadowed.
# NOTE(review): these paths start with SIIM_DETECTION_PATH; if that root
# contains spaces, the whitespace-delimited split files will be mis-parsed --
# confirm before writing these entries to a split file.
paths = [
    os.path.join(SIIM_DETECTION_PATH, 'train', image_id + '.jpg')
    for image_id in train_df['id']
]

In [14]:
# Derive one class label per row of train_df, in row order (the same order
# used to build `paths`).
# 'Negative for Pneumonia' -> 'normal'; any of the three appearance flags
# -> 'COVID-19'. Each row is read once instead of re-filtering the whole
# frame for every id.
_flag_columns = [
    'id',
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]
labels = []
for image_id, negative, typical, indeterminate, atypical in train_df[_flag_columns].itertuples(index=False, name=None):
    if negative == 1:
        labels.append('normal')
    elif typical == 1 or indeterminate == 1 or atypical == 1:
        labels.append('COVID-19')
    else:
        # No flag set: refuse to guess a label.
        raise ValueError('no study-level label set for image id {}'.format(image_id))

In [15]:
# Register every SIIM image under its label, continuing the per-class
# running counters. Entry layout: [index within class, path, label].
for position, image_path in enumerate(paths):
    label = labels[position]
    count[label] += 1
    filename_label[label].append([int(count[label]), image_path, label])
print(count)

{'normal': 120242, 'pneumonia': 24114, 'COVID-19': 12612}


# Reading from Masked-COVID-QU-Ex-Dataset

In [6]:
# Walk COVID_QU_EX_PATH/<parent>/<category>/<image> and register every image
# under <category>, which must be one of the keys of `count`.
# Listings are sorted because os.listdir returns names in arbitrary order,
# which would make the running indices (and the seeded split) irreproducible.
for parent_directory in sorted(os.listdir(COVID_QU_EX_PATH)):
    if parent_directory.startswith('.'):
        continue
    full_parent_directory = os.path.join(COVID_QU_EX_PATH, parent_directory)
    for category in sorted(os.listdir(full_parent_directory)):
        if category.startswith('.'):
            continue
        full_category_directory = os.path.join(full_parent_directory, category)
        for img_dir in sorted(os.listdir(full_category_directory)):
            if img_dir.startswith('.'):
                continue
            # Split files are whitespace-delimited, so spaces are replaced in
            # the recorded name.
            # NOTE(review): this only changes the recorded name; presumably
            # the files on disk are renamed the same way -- verify.
            img_dir = img_dir.replace(' ', '_')
            count[category] += 1
            # Entry layout: [1-based index within the class, relative path, label].
            entry = [
                count[category],
                os.path.join('Masked-COVID-QU-Ex-dataset', parent_directory, category, img_dir),
                category,
            ]
            filename_label[category].append(entry)
print(count)

{'normal': 71295, 'pneumonia': 34009, 'COVID-19': 19970}


# Reading from COVID-QU-Ex-Dataset (No mask)

In [None]:
# Same walk as the masked reader, but the recorded paths point at the unmasked
# images under 'COVID-QU-Ex-dataset/Lung Segmentation Data/...'.
# Listings are sorted because os.listdir returns names in arbitrary order,
# which would make the running indices (and the seeded split) irreproducible.
# NOTE(review): file names are enumerated from COVID_QU_EX_PATH, which the
# path configuration points at 'Masked-COVID-QU-Ex-dataset'; this presumably
# relies on both datasets sharing file names -- confirm.
# NOTE(review): 'Lung Segmentation Data' contains spaces and file names are
# not space-sanitised here, so these entries conflict with a
# whitespace-delimited split file -- confirm how the loader handles them.
for parent_directory in sorted(os.listdir(COVID_QU_EX_PATH)):
    if parent_directory.startswith('.'):
        continue
    full_parent_directory = os.path.join(COVID_QU_EX_PATH, parent_directory)
    for category in sorted(os.listdir(full_parent_directory)):
        if category.startswith('.'):
            continue
        full_category_directory = os.path.join(full_parent_directory, category)
        for img_dir in sorted(os.listdir(full_category_directory)):
            if img_dir.startswith('.'):
                continue
            count[category] += 1
            # Entry layout: [1-based index within the class, relative path, label].
            entry = [
                count[category],
                os.path.join('COVID-QU-Ex-dataset',
                             'Lung Segmentation Data',
                             'Lung Segmentation Data',
                             parent_directory,
                             category,
                             'images',
                             img_dir),
                category,
            ]
            filename_label[category].append(entry)
print(count)

### Finally, writing to txt file

In [7]:
# Randomly send `test_per` of each class to the test list and the rest to the
# train list, with a fixed seed so the draw is repeatable.
np.random.seed(seed=2)
test_per = 0.1
for key in filename_label.keys():
    n_samples = count[key]
    n_test = int(n_samples * test_per)
    # np.random.permutation(n) draws from 0..n-1, whereas the first field of
    # each entry is a running index that starts at 1 (the counter is
    # incremented before the entry is stored). Shift by one so every sample,
    # including the last, can be selected and exactly n_test indices match.
    # A set keeps the membership test below constant-time.
    test_patients = set((np.random.permutation(n_samples)[:n_test] + 1).tolist())
    for patient in filename_label[key]:
        if int(patient[0]) in test_patients:
            test.append(patient)
            test_count[key] += 1
        else:
            train.append(patient)
            train_count[key] += 1

print('test count: ', test_count)
print('train count: ', train_count)

test count:  {'normal': 7129, 'pneumonia': 3400, 'COVID-19': 1997}
train count:  {'normal': 64166, 'pneumonia': 30609, 'COVID-19': 17973}


In [8]:
# Write the split lists to <parent of the working directory>/split_files/.
# Each line is "<index> <path> <label>", separated by single spaces.
WORK_DIR = Path.cwd().resolve().parent

# `with` closes each file even if a write fails part-way through.
with open(os.path.join(WORK_DIR, 'split_files/alvaro_train_split.txt'), "w") as train_file:
    for sample in train:
        info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
        train_file.write(info)

with open(os.path.join(WORK_DIR, 'split_files/alvaro_test_split.txt'), "w") as test_file:
    for sample in test:
        info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
        test_file.write(info)

# Building a Smaller DB for Testing

In [33]:
# Upper bound on the number of images registered per class when building the
# reduced database.
MAX_IMAGES_PER_CLASS = 1000

In [34]:
# Reset all accumulators before building the reduced database.
# `count` fixes the class order (normal, pneumonia, COVID-19) that the other
# per-class dictionaries reuse.
train, test = [], []
count = dict.fromkeys(('normal', 'pneumonia', 'COVID-19'), 0)
test_count = dict(count)
train_count = dict(count)
# One independent list per class (no list object is shared between keys).
filename_label = {label: [] for label in count}
path_label = {label: [] for label in count}
# Numeric category code parsed from a file name -> class label.
mapping = dict(enumerate(('normal', 'COVID-19', 'pneumonia')))

### For MAC

In [37]:
# Walk every non-hidden sub-folder of PAPER_IMAGES_PATH and register files
# under the class encoded in their name, keeping at most MAX_IMAGES_PER_CLASS
# per class.
# Listings are sorted because os.listdir returns names in arbitrary order;
# with the cap in place, that order decides which images are kept.
for directory in sorted(os.listdir(PAPER_IMAGES_PATH)):
    if directory.startswith('.'):
        continue
    for file in sorted(os.listdir(os.path.join(PAPER_IMAGES_PATH, directory))):
        # The category code is the second field of the name, delimited by
        # '__' or, failing that, by '_'.
        try:
            cat = int(file.split('__')[1])
        except (IndexError, ValueError):
            # If this fallback fails too, the error propagates and nothing
            # is recorded for the file.
            cat = int(file.split('_')[1])
        label = mapping[cat]
        if count[label] < MAX_IMAGES_PER_CLASS:
            count[label] += 1
            # Entry layout: [1-based index within the class, relative path, label].
            entry = [count[label], directory + '/' + file, label]
            filename_label[label].append(entry)
print(count)

{'normal': 1000, 'pneumonia': 1000, 'COVID-19': 1000}


In [38]:
# Randomly send `test_per` of each class to the test list and the rest to the
# train list, with a fixed seed so the draw is repeatable.
np.random.seed(seed=2)
test_per = 0.1
for key in filename_label.keys():
    n_samples = count[key]
    n_test = int(n_samples * test_per)
    # np.random.permutation(n) draws from 0..n-1, whereas the first field of
    # each entry is a running index that starts at 1 (the counter is
    # incremented before the entry is stored). Shift by one so every sample,
    # including the last, can be selected and exactly n_test indices match.
    # A set keeps the membership test below constant-time.
    test_patients = set((np.random.permutation(n_samples)[:n_test] + 1).tolist())
    for patient in filename_label[key]:
        if int(patient[0]) in test_patients:
            test.append(patient)
            test_count[key] += 1
        else:
            train.append(patient)
            train_count[key] += 1

print('test count: ', test_count)
print('train count: ', train_count)

test count:  {'normal': 100, 'pneumonia': 100, 'COVID-19': 100}
train count:  {'normal': 900, 'pneumonia': 900, 'COVID-19': 900}


In [39]:
# Append the split lists to <parent of the working directory>/split_files/.
# Each line is "<index> <path> <label>", separated by single spaces.
WORK_DIR = Path.cwd().resolve().parent

# Mode "a+" appends to any existing content, so running this cell again adds
# the same entries a second time.
# NOTE(review): presumably appending is intentional -- confirm.
# `with` closes each file even if a write fails part-way through.
with open(os.path.join(WORK_DIR, 'split_files/ubuntu_train_split.txt'), "a+") as train_file:
    for sample in train:
        info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
        train_file.write(info)

with open(os.path.join(WORK_DIR, 'split_files/ubuntu_test_split.txt'), "a+") as test_file:
    for sample in test:
        info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
        test_file.write(info)

# Full DB Building

In [None]:
paths = ''