# Data Set Creation for the CNN Detection Model

We are using the COVID-19 Radiography Database available at: https://www.kaggle.com/tawsifurrahman/covid19-radiography-database

This database consists of 1200 COVID-19, 1345 Viral Pneumonia and 1341 Normal X-Ray images. 

We are going to divide the database into training, validation and testing datasets: the images of each class are shuffled and then split 80% for training, 10% for validation and 10% for testing.

In [101]:
import pandas as pd
import os
import random
import shutil

In [102]:
#Metadata and Image paths for all Covid samples from the dataset
#Both paths are relative, so they are resolved against the current working directory

COVID_FILE_PATH = "COVID-19-Radiography-Database/COVID.metadata.csv" #CSV file with the COVID image metadata
COVID_IMAGES_PATH = "COVID-19-Radiography-Database/COVID" #Folder containing the COVID image files

In [103]:
covid_df = pd.read_csv(COVID_FILE_PATH) #Read csv file containing metadata for all Covid images
print(covid_df.shape) #(number of rows, number of columns) of the metadata table
covid_df.head() #Preview the first rows of the metadata

(1200, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,COVID 1,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
1,COVID 2,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
2,COVID 3,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
3,COVID 4,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
4,COVID 5,PNG,256*256,https://sirm.org/category/senza-categoria/covi...


In [104]:
#Metadata and Image paths for all Pneumonia samples from the dataset
#Both paths are relative, so they are resolved against the current working directory

VIRAL_PNEUMONIA_FILE_PATH = "COVID-19-Radiography-Database/ViralPneumonia.metadata.csv" #CSV file with the Viral Pneumonia image metadata
VIRAL_PNEUMONIA_IMAGES_PATH = "COVID-19-Radiography-Database/ViralPneumonia" #Folder containing the Viral Pneumonia image files

In [105]:
viral_pneumonia_df = pd.read_csv(VIRAL_PNEUMONIA_FILE_PATH) #Read csv file containing metadata for all Viral Pneumonia images
print(viral_pneumonia_df.shape) #(number of rows, number of columns) of the metadata table
viral_pneumonia_df.head() #Preview the first rows of the metadata

(1345, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,Viral Pneumonia-1,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
1,Viral Pneumonia-2,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
2,Viral Pneumonia-3,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
3,Viral Pneumonia-4,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
4,Viral Pneumonia-5,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...


In [106]:
#Metadata and Image paths for all Normal samples from the dataset
#Both paths are relative, so they are resolved against the current working directory

NORMAL_FILE_PATH = "COVID-19-Radiography-Database/NORMAL.metadata.csv" #CSV file with the Normal image metadata
NORMAL_IMAGES_PATH = "COVID-19-Radiography-Database/NORMAL" #Folder containing the Normal image files

In [107]:
normal_df = pd.read_csv(NORMAL_FILE_PATH) #Read csv file containing metadata for all Normal images
print(normal_df.shape) #(number of rows, number of columns) of the metadata table
normal_df.head() #Preview the first rows of the metadata

(1341, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,NORMAL-1,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
1,NORMAL-2,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
2,NORMAL-3,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
3,NORMAL-4,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
4,NORMAL-5,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...


In [108]:
#Creating the directories and folders 
#for training, validation and test datasets
Target_Dir = 'Dataset'
Covid_Train_Dir = 'Dataset/Train/COVID'
Pneumonia_Train_Dir = 'Dataset/Train/ViralPneumonia'
Normal_Train_Dir = 'Dataset/Train/Normal'

Covid_Val_Dir = 'Dataset/Validation/COVID'
Pneumonia_Val_Dir = 'Dataset/Validation/ViralPneumonia'
Normal_Val_Dir = 'Dataset/Validation/Normal'

Covid_Test_Dir = 'Dataset/Test/COVID'
Pneumonia_Test_Dir= 'Dataset/Test/ViralPneumonia'
Normal_Test_Dir = 'Dataset/Test/Normal'

#Each folder paired with the label used in its "Created ..." message
dataset_dirs = [
    (Target_Dir, "Dataset folder"),
    (Covid_Train_Dir, "COVID Training Data folder"),
    (Pneumonia_Train_Dir, "Viral Pneumonia Training Data folder"),
    (Normal_Train_Dir, "Normal Training Data folder"),
    (Covid_Val_Dir, "COVID Validation Data folder"),
    (Pneumonia_Val_Dir, "Viral Pneumonia Validation Data folder"),
    (Normal_Val_Dir, "Normal Validation Data folder"),
    (Covid_Test_Dir, "COVID Test Data folder"),
    (Pneumonia_Test_Dir, "Viral Pneumonia Test Data folder"),
    (Normal_Test_Dir, "Normal Test Data folder"),
]

#Check every folder on its own rather than only the top-level one, so that
#re-running this cell with an existing or partially created 'Dataset' tree
#still creates whichever sub-folders are missing.
for dir_path, dir_label in dataset_dirs:
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path, exist_ok=True)
        print("Created " + dir_label)
    

Created Dataset folder
Created COVID Training Data folder
Created Viral Pneumonia Training Data folder
Created Normal Training Data folder
Created COVID Validation Data folder
Created Viral Pneumonia Validation Data folder
Created Normal Validation Data folder
Created COVID Test Data folder
Created Viral Pneumonia Test Data folder
Created Normal Test Data folder


Organizing the data into training, validation and test datasets. 
The dataset is split into 80% images for training, 10% for validation and 10% for testing


In [113]:
#Collect the file names found in each class's image folder, one list per class
Covid_image_names, Pneumonia_image_names, Normal_image_names = (
    os.listdir(images_path)
    for images_path in (COVID_IMAGES_PATH, VIRAL_PNEUMONIA_IMAGES_PATH, NORMAL_IMAGES_PATH)
)


In [137]:
#Display the COVID image file names collected with os.listdir
Covid_image_names

['COVID (978).png',
 'COVID (356).png',
 'COVID (56).png',
 'COVID (131).png',
 'COVID (337).png',
 'COVID (373).png',
 'COVID (488).png',
 'COVID (411).png',
 'COVID (711).png',
 'COVID (45).png',
 'COVID (947).png',
 'COVID (904).png',
 'COVID (485).png',
 'COVID (999).png',
 'COVID (190).png',
 'COVID (194).png',
 'COVID (985).png',
 'COVID (79).png',
 'COVID (989).png',
 'COVID (657).png',
 'COVID (580).png',
 'COVID (808).png',
 'COVID (219).png',
 'COVID (886).png',
 'COVID (798).png',
 'COVID (381).png',
 'COVID (92).png',
 'COVID (215).png',
 'COVID (276).png',
 'COVID (58).png',
 'COVID (115).png',
 'COVID (1060).png',
 'COVID (655).png',
 'COVID (865).png',
 'COVID (461).png',
 'COVID (317).png',
 'COVID (1033).png',
 'COVID (472).png',
 'COVID (386).png',
 'COVID (415).png',
 'COVID (331).png',
 'COVID (211).png',
 'COVID (1074).png',
 'COVID (354).png',
 'COVID (49).png',
 'COVID (921).png',
 'COVID (378).png',
 'COVID (1058).png',
 'COVID (479).png',
 'COVID (784).png',
 '

In [138]:
#Display the Viral Pneumonia image file names collected with os.listdir
Pneumonia_image_names

['Viral Pneumonia (1).png',
 'Viral Pneumonia (10).png',
 'Viral Pneumonia (100).png',
 'Viral Pneumonia (1000).png',
 'Viral Pneumonia (1001).png',
 'Viral Pneumonia (1002).png',
 'Viral Pneumonia (1003).png',
 'Viral Pneumonia (1004).png',
 'Viral Pneumonia (1005).png',
 'Viral Pneumonia (1006).png',
 'Viral Pneumonia (1007).png',
 'Viral Pneumonia (1008).png',
 'Viral Pneumonia (1009).png',
 'Viral Pneumonia (101).png',
 'Viral Pneumonia (1010).png',
 'Viral Pneumonia (1011).png',
 'Viral Pneumonia (1012).png',
 'Viral Pneumonia (1013).png',
 'Viral Pneumonia (1014).png',
 'Viral Pneumonia (1015).png',
 'Viral Pneumonia (1016).png',
 'Viral Pneumonia (1017).png',
 'Viral Pneumonia (1018).png',
 'Viral Pneumonia (1019).png',
 'Viral Pneumonia (102).png',
 'Viral Pneumonia (1020).png',
 'Viral Pneumonia (1021).png',
 'Viral Pneumonia (1022).png',
 'Viral Pneumonia (1023).png',
 'Viral Pneumonia (1024).png',
 'Viral Pneumonia (1025).png',
 'Viral Pneumonia (1026).png',
 'Viral Pneumoni

In [139]:
#Display the Normal image file names collected with os.listdir
Normal_image_names

['NORMAL (1).png',
 'NORMAL (10).png',
 'NORMAL (100).png',
 'NORMAL (1000).png',
 'NORMAL (1001).png',
 'NORMAL (1002).png',
 'NORMAL (1003).png',
 'NORMAL (1004).png',
 'NORMAL (1005).png',
 'NORMAL (1006).png',
 'NORMAL (1007).png',
 'NORMAL (1008).png',
 'NORMAL (1009).png',
 'NORMAL (101).png',
 'NORMAL (1010).png',
 'NORMAL (1011).png',
 'NORMAL (1012).png',
 'NORMAL (1013).png',
 'NORMAL (1014).png',
 'NORMAL (1015).png',
 'NORMAL (1016).png',
 'NORMAL (1017).png',
 'NORMAL (1018).png',
 'NORMAL (1019).png',
 'NORMAL (102).png',
 'NORMAL (1020).png',
 'NORMAL (1021).png',
 'NORMAL (1022).png',
 'NORMAL (1023).png',
 'NORMAL (1024).png',
 'NORMAL (1025).png',
 'NORMAL (1026).png',
 'NORMAL (1027).png',
 'NORMAL (1028).png',
 'NORMAL (1029).png',
 'NORMAL (103).png',
 'NORMAL (1030).png',
 'NORMAL (1031).png',
 'NORMAL (1032).png',
 'NORMAL (1033).png',
 'NORMAL (1034).png',
 'NORMAL (1035).png',
 'NORMAL (1036).png',
 'NORMAL (1037).png',
 'NORMAL (1038).png',
 'NORMAL (1039).png

In [144]:
#Splitting up all the COVID images into Train, Val and Test sets
Covid_image_names.sort() # to make sure that the filenames have a fixed order before shuffling
#To make sure to have the same split each time this code is run, 
#we need to fix the random seed before shuffling the filenames
random.seed(80)
random.shuffle(Covid_image_names) # shuffles the ordering of image names

#Cut points: first 80% -> training, next 10% -> validation, remainder -> test
split_1 = int(0.8 * len(Covid_image_names))
split_2 = int(0.9 * len(Covid_image_names))
Covid_train_img_names = Covid_image_names[:split_1]
Covid_val_img_names = Covid_image_names[split_1:split_2]
Covid_test_img_names = Covid_image_names[split_2:]

#Copy every subset into its own directory with a single loop over
#(label, file names, destination directory) instead of three copy-pasted loops
for split_label, split_img_names, split_dir in [
    ("Training", Covid_train_img_names, Covid_Train_Dir),
    ("Validation", Covid_val_img_names, Covid_Val_Dir),
    ("Test", Covid_test_img_names, Covid_Test_Dir),
]:
    for img_element in split_img_names:
        Covid_image_path = os.path.join(COVID_IMAGES_PATH, img_element)
        #copy2 copies the file contents and also tries to preserve file metadata
        shutil.copy2(Covid_image_path, os.path.join(split_dir, img_element))
    #Report the subset size rather than a leftover loop index: the index is
    #undefined (or stale from a previous loop) when the subset is empty
    print("Copied ", len(split_img_names), " images from COVID dataset path to the COVID " + split_label + " directory")

Copied  960  images from COVID dataset path to the COVID Training directory
Copied  120  images from COVID dataset path to the COVID Validation directory
Copied  120  images from COVID dataset path to the COVID Test directory


In [145]:
#Splitting up all the Viral Pneumonia images into Train, Val and Test sets
Pneumonia_image_names.sort() # fixed order before shuffling so the split is reproducible
random.seed(50) # fixed seed so the shuffle gives the same order on every run
random.shuffle(Pneumonia_image_names) # shuffles the ordering of image names

#Cut points: first 80% -> training, next 10% -> validation, remainder -> test
split_1 = int(0.8 * len(Pneumonia_image_names))
split_2 = int(0.9 * len(Pneumonia_image_names))
Pneumonia_train_img_names = Pneumonia_image_names[:split_1]
Pneumonia_val_img_names = Pneumonia_image_names[split_1:split_2]
Pneumonia_test_img_names = Pneumonia_image_names[split_2:]

#Copy every subset into its own directory with a single loop over
#(label, file names, destination directory) instead of three copy-pasted loops
for split_label, split_img_names, split_dir in [
    ("Training", Pneumonia_train_img_names, Pneumonia_Train_Dir),
    ("Validation", Pneumonia_val_img_names, Pneumonia_Val_Dir),
    ("Test", Pneumonia_test_img_names, Pneumonia_Test_Dir),
]:
    for img_element in split_img_names:
        Pneumonia_image_path = os.path.join(VIRAL_PNEUMONIA_IMAGES_PATH, img_element)
        #copy2 copies the file contents and also tries to preserve file metadata
        shutil.copy2(Pneumonia_image_path, os.path.join(split_dir, img_element))
    #Report the subset size rather than a leftover loop index: the index is
    #undefined (or stale from a previous loop) when the subset is empty
    print("Copied ", len(split_img_names), " images from Viral Pneumonia dataset path to the Viral Pneumonia " + split_label + " directory")

Copied  1076  images from Viral Pneumonia dataset path to the Viral Pneumonia Training directory
Copied  134  images from Viral Pneumonia dataset path to the Viral Pneumonia Validation directory
Copied  135  images from Viral Pneumonia dataset path to the Viral Pneumonia Test directory


In [146]:
#Splitting up all the Normal images into Train, Val and Test sets
Normal_image_names.sort() # fixed order before shuffling so the split is reproducible
random.seed(50) # fixed seed so the shuffle gives the same order on every run
random.shuffle(Normal_image_names) # shuffles the ordering of image names

#Cut points: first 80% -> training, next 10% -> validation, remainder -> test
split_1 = int(0.8 * len(Normal_image_names))
split_2 = int(0.9 * len(Normal_image_names))
Normal_train_img_names = Normal_image_names[:split_1]
Normal_val_img_names = Normal_image_names[split_1:split_2]
Normal_test_img_names = Normal_image_names[split_2:]

#Copy every subset into its own directory with a single loop over
#(label, file names, destination directory) instead of three copy-pasted loops
for split_label, split_img_names, split_dir in [
    ("Training", Normal_train_img_names, Normal_Train_Dir),
    ("Validation", Normal_val_img_names, Normal_Val_Dir),
    ("Test", Normal_test_img_names, Normal_Test_Dir),
]:
    for img_element in split_img_names:
        Normal_image_path = os.path.join(NORMAL_IMAGES_PATH, img_element)
        #copy2 copies the file contents and also tries to preserve file metadata
        shutil.copy2(Normal_image_path, os.path.join(split_dir, img_element))
    #Report the subset size rather than a leftover loop index: the index is
    #undefined (or stale from a previous loop) when the subset is empty
    print("Copied ", len(split_img_names), " images from Normal images dataset path to the Normal images " + split_label + " directory")

Copied  1072  images from Normal images dataset path to the Normal images Training directory
Copied  134  images from Normal images dataset path to the Normal images Validation directory
Copied  135  images from Normal images dataset path to the Normal images Test directory
