# Data Set Creation for the CNN Detection Model
This notebook uses the COVID-19 Radiography Database, which is available at: https://www.kaggle.com/tawsifurrahman/covid19-radiography-database

This database consists of 1200 COVID-19, 1345 Viral Pneumonia and 1341 Normal X-Ray images.

The database will be divided into training, validation, testing and webapp prediction datasets. The split will be 80% of the data for training, 10% for validation, 8% for testing the model, and the remaining 2% for testing the webapp.

In [2]:
import pandas as pd
import os
import random
import shutil

In [3]:
# Locations of the COVID samples in the dataset:
# the metadata CSV file and the folder that holds the image files.

COVID_FILE_PATH = "COVID-19-Radiography-Database/COVID.metadata.csv"
COVID_IMAGES_PATH = "COVID-19-Radiography-Database/COVID"

In [4]:
# Read the CSV file containing the metadata for all COVID images
covid_df = pd.read_csv(COVID_FILE_PATH)
print(covid_df.shape)  # (number of rows, number of columns)
covid_df.head()  # preview the first rows of the metadata table

(1200, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,COVID 1,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
1,COVID 2,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
2,COVID 3,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
3,COVID 4,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
4,COVID 5,PNG,256*256,https://sirm.org/category/senza-categoria/covi...


In [5]:
# Locations of the Viral Pneumonia samples in the dataset:
# the metadata CSV file and the folder that holds the image files.

VIRAL_PNEUMONIA_FILE_PATH = "COVID-19-Radiography-Database/ViralPneumonia.metadata.csv"
VIRAL_PNEUMONIA_IMAGES_PATH = "COVID-19-Radiography-Database/ViralPneumonia"

In [6]:
# Read the CSV file containing the metadata for all Viral Pneumonia images
viral_pneumonia_df = pd.read_csv(VIRAL_PNEUMONIA_FILE_PATH)
print(viral_pneumonia_df.shape)  # (number of rows, number of columns)
viral_pneumonia_df.head()  # preview the first rows of the metadata table

(1345, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,Viral Pneumonia-1,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
1,Viral Pneumonia-2,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
2,Viral Pneumonia-3,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
3,Viral Pneumonia-4,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
4,Viral Pneumonia-5,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...


In [7]:
# Locations of the Normal samples in the dataset:
# the metadata CSV file and the folder that holds the image files.

NORMAL_FILE_PATH = "COVID-19-Radiography-Database/NORMAL.metadata.csv"
NORMAL_IMAGES_PATH = "COVID-19-Radiography-Database/NORMAL"

In [8]:
# Read the CSV file containing the metadata for all Normal images
normal_df = pd.read_csv(NORMAL_FILE_PATH)
print(normal_df.shape)  # (number of rows, number of columns)
normal_df.head()  # preview the first rows of the metadata table

(1341, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,NORMAL-1,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
1,NORMAL-2,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
2,NORMAL-3,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
3,NORMAL-4,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...
4,NORMAL-5,PNG,1024*1024,https://www.kaggle.com/paultimothymooney/chest...


In [19]:
# Create the directory tree for the training, validation, model testing and
# webapp prediction datasets (one sub-folder per class inside each split).
Target_Dir = 'ThreeLabelDataset'

# Every path is derived from Target_Dir so the root only has to be changed in one place.
Covid_Train_Dir = os.path.join(Target_Dir, 'Train', 'COVID')
Pneumonia_Train_Dir = os.path.join(Target_Dir, 'Train', 'ViralPneumonia')
Normal_Train_Dir = os.path.join(Target_Dir, 'Train', 'Normal')

Covid_Val_Dir = os.path.join(Target_Dir, 'Validation', 'COVID')
Pneumonia_Val_Dir = os.path.join(Target_Dir, 'Validation', 'ViralPneumonia')
Normal_Val_Dir = os.path.join(Target_Dir, 'Validation', 'Normal')

Covid_Test_Dir = os.path.join(Target_Dir, 'Test', 'COVID')
Pneumonia_Test_Dir = os.path.join(Target_Dir, 'Test', 'ViralPneumonia')
Normal_Test_Dir = os.path.join(Target_Dir, 'Test', 'Normal')

Covid_WebPred_Dir = os.path.join(Target_Dir, 'WebAppPrediction', 'COVID')
Pneumonia_WebPred_Dir = os.path.join(Target_Dir, 'WebAppPrediction', 'ViralPneumonia')
Normal_WebPred_Dir = os.path.join(Target_Dir, 'WebAppPrediction', 'Normal')

# (directory, label used in the progress message) pairs, in creation order.
dataset_dirs = [
    (Target_Dir, "ThreeLabel Dataset"),
    (Covid_Train_Dir, "COVID Training Data"),
    (Pneumonia_Train_Dir, "Viral Pneumonia Training Data"),
    (Normal_Train_Dir, "Normal Training Data"),
    (Covid_Val_Dir, "COVID Validation Data"),
    (Pneumonia_Val_Dir, "Viral Pneumonia Validation Data"),
    (Normal_Val_Dir, "Normal Validation Data"),
    (Covid_Test_Dir, "COVID Test Data"),
    (Pneumonia_Test_Dir, "Viral Pneumonia Test Data"),
    (Normal_Test_Dir, "Normal Test Data"),
    (Covid_WebPred_Dir, "COVID Webapp Prediction Data"),
    (Pneumonia_WebPred_Dir, "Viral Pneumonia Webapp Prediction Data"),
    (Normal_WebPred_Dir, "Normal Webapp Prediction Data"),
]

# Check each directory on its own instead of gating everything on Target_Dir:
# if the root already exists but some sub-folders are missing (e.g. after an
# interrupted run), the missing ones are still created, and re-running the
# cell is a no-op for folders that are already present.
for dir_path, dir_label in dataset_dirs:
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path, exist_ok=True)
        print("Created " + dir_label + " folder")

Created ThreeLabel Dataset folder
Created COVID Training Data folder
Created Viral Pneumonia Training Data folder
Created Normal Training Data folder
Created COVID Validation Data folder
Created Viral Pneumonia Validation Data folder
Created Normal Validation Data folder
Created COVID Test Data folder
Created Viral Pneumonia Test Data folder
Created Normal Test Data folder
Created COVID Webapp Prediction Data folder
Created Viral Pneumonia Webapp Prediction Data folder
Created Normal Webapp Prediction Data folder


In [20]:
# List the entries of each class's image folder and keep the file names for splitting later
Covid_image_names = os.listdir(COVID_IMAGES_PATH)
Pneumonia_image_names = os.listdir(VIRAL_PNEUMONIA_IMAGES_PATH)
Normal_image_names = os.listdir(NORMAL_IMAGES_PATH)

# Report how many entries were found for each class
for count_message, class_file_names in (
    ("Total number of COVID images: ", Covid_image_names),
    ("Total number of Viral Pneumonia images: ", Pneumonia_image_names),
    ("Total number of Normal images: ", Normal_image_names),
):
    print(count_message, len(class_file_names))

Total number of COVID images:  1200
Total number of Viral Pneumonia images:  1345
Total number of Normal images:  1341


In [21]:
# Divide the COVID file names into Training, Validation, Model Testing and Web App Prediction lists.
# The names are sorted first so the list starts from a fixed order, and the random
# seed is fixed before shuffling so the same split is obtained each time this code is run.
Covid_image_names.sort()
random.seed(80)
random.shuffle(Covid_image_names)

# Cut points at 80%, 90% and 98% of the shuffled list (int() truncates towards zero)
split_1, split_2, split_3 = (
    int(fraction * len(Covid_image_names)) for fraction in (0.8, 0.9, 0.98)
)

Covid_train_img_names = Covid_image_names[:split_1]            # first 80%
Covid_val_img_names = Covid_image_names[split_1:split_2]       # next 10%
Covid_test_img_names = Covid_image_names[split_2:split_3]      # next 8%
Covid_webpred_img_names = Covid_image_names[split_3:]          # whatever is left

# Summarise the size of every split
for summary_text, split_names in (
    ("No. of COVID images to be used for training: ", Covid_train_img_names),
    ("No. of COVID images to be used for validation: ", Covid_val_img_names),
    ("No. of COVID images to be used for testing: ", Covid_test_img_names),
    ("Remaining COVID images that can be used for web application predictions: ", Covid_webpred_img_names),
):
    print(summary_text, len(split_names))



No. of COVID images to be used for training:  960
No. of COVID images to be used for validation:  120
No. of COVID images to be used for testing:  96
Remaining COVID images that can be used for web application predictions:  24


In [22]:
# Copy the images from the COVID directory to their respective
# Training, Validation, Testing and WebApp Prediction datasets.
# Each entry: (file names in the split, destination directory, message prefix, message suffix).
covid_copy_plan = [
    (Covid_train_img_names, Covid_Train_Dir,
     "Copied ", " images from COVID dataset path to the COVID Training directory"),
    (Covid_val_img_names, Covid_Val_Dir,
     "Copied ", " images from COVID dataset path to the COVID Validation directory"),
    (Covid_test_img_names, Covid_Test_Dir,
     "Copied ", " images from COVID dataset path to the COVID Test directory"),
    (Covid_webpred_img_names, Covid_WebPred_Dir,
     "Copied the remaining ", " images from COVID dataset path to the COVID WebApp Predictions directory"),
]

for split_img_names, dest_dir, msg_prefix, msg_suffix in covid_copy_plan:
    # Make sure the destination exists so the copy does not fail on a partially created tree.
    os.makedirs(dest_dir, exist_ok=True)
    for img_element in split_img_names:
        shutil.copy2(os.path.join(COVID_IMAGES_PATH, img_element),
                     os.path.join(dest_dir, img_element))
    # Report len() rather than a leftover loop index: an empty split then prints 0
    # instead of a stale count from the previous loop (or raising NameError).
    print(msg_prefix, len(split_img_names), msg_suffix)

Copied  960  images from COVID dataset path to the COVID Training directory
Copied  120  images from COVID dataset path to the COVID Validation directory
Copied  96  images from COVID dataset path to the COVID Test directory
Copied the remaining  24  images from COVID dataset path to the COVID WebApp Predictions directory


In [27]:
# Divide the Viral Pneumonia file names into Training, Validation, Model Testing and Web App Prediction lists.
# The names are sorted first so the list starts from a fixed order, and the random
# seed is fixed before shuffling so the shuffle starts from the same generator state on every run.
Pneumonia_image_names.sort()
random.seed(50)
random.shuffle(Pneumonia_image_names)

# Cut points at 80%, 90% and 98% of the shuffled list (int() truncates towards zero)
split_1, split_2, split_3 = (
    int(fraction * len(Pneumonia_image_names)) for fraction in (0.8, 0.9, 0.98)
)

Pneumonia_train_img_names = Pneumonia_image_names[:split_1]            # first 80%
Pneumonia_val_img_names = Pneumonia_image_names[split_1:split_2]       # next 10%
Pneumonia_test_img_names = Pneumonia_image_names[split_2:split_3]      # next 8%
Pneumonia_webpred_img_names = Pneumonia_image_names[split_3:]          # whatever is left

# Summarise the size of every split
for summary_text, split_names in (
    ("No. of Viral Pneumonia images to be sent to the Viral Pneumonia Training Dataset:", Pneumonia_train_img_names),
    ("No. of Viral Pneumonia images to be sent to the Viral Pneumonia Validation Dataset:", Pneumonia_val_img_names),
    ("No. of Viral Pneumonia images to be sent to the Viral Pneumonia Testing Dataset:", Pneumonia_test_img_names),
    ("Remaining Viral Pneumonia images to be sent to the Viral Pneumonia Web App Prediction Dataset:", Pneumonia_webpred_img_names),
):
    print(summary_text, len(split_names))



No. of Viral Pneumonia images to be sent to the Viral Pneumonia Training Dataset: 1076
No. of Viral Pneumonia images to be sent to the Viral Pneumonia Validation Dataset: 134
No. of Viral Pneumonia images to be sent to the Viral Pneumonia Testing Dataset: 108
Remaining Viral Pneumonia images to be sent to the Viral Pneumonia Web App Prediction Dataset: 27


In [28]:
# Copy the images from the Viral Pneumonia directory to their respective
# Training, Validation, Testing and WebApp Prediction datasets.
# Each entry: (file names in the split, destination directory, message prefix, message suffix).
pneumonia_copy_plan = [
    (Pneumonia_train_img_names, Pneumonia_Train_Dir,
     "Copied ", " images from Viral Pneumonia dataset path to the Viral Pneumonia Training directory"),
    (Pneumonia_val_img_names, Pneumonia_Val_Dir,
     "Copied ", " images from Viral Pneumonia dataset path to the Viral Pneumonia Validation directory"),
    (Pneumonia_test_img_names, Pneumonia_Test_Dir,
     "Copied ", " images from Viral Pneumonia dataset path to the Viral Pneumonia Test directory"),
    (Pneumonia_webpred_img_names, Pneumonia_WebPred_Dir,
     "Copied the remaining ", " images from Viral Pneumonia dataset path to the Viral Pneumonia WebApp Predictions directory"),
]

for split_img_names, dest_dir, msg_prefix, msg_suffix in pneumonia_copy_plan:
    # Make sure the destination exists so the copy does not fail on a partially created tree.
    os.makedirs(dest_dir, exist_ok=True)
    for img_element in split_img_names:
        shutil.copy2(os.path.join(VIRAL_PNEUMONIA_IMAGES_PATH, img_element),
                     os.path.join(dest_dir, img_element))
    # Report len() rather than a leftover loop index: an empty split then prints 0
    # instead of a stale count from the previous loop (or raising NameError).
    print(msg_prefix, len(split_img_names), msg_suffix)

Copied  1076  images from Viral Pneumonia dataset path to the Viral Pneumonia Training directory
Copied  134  images from Viral Pneumonia dataset path to the Viral Pneumonia Validation directory
Copied  108  images from Viral Pneumonia dataset path to the Viral Pneumonia Test directory
Copied the remaining  27  images from Viral Pneumonia dataset path to the Viral Pneumonia WebApp Predictions directory


In [29]:
# Divide the Normal file names into Training, Validation, Model Testing and Web App Prediction lists.
# The names are sorted first so the list starts from a fixed order, and the random
# seed is fixed before shuffling so the shuffle starts from the same generator state on every run.
Normal_image_names.sort()
random.seed(40)
random.shuffle(Normal_image_names)

# Cut points at 80%, 90% and 98% of the shuffled list (int() truncates towards zero)
split_1, split_2, split_3 = (
    int(fraction * len(Normal_image_names)) for fraction in (0.8, 0.9, 0.98)
)

Normal_train_img_names = Normal_image_names[:split_1]            # first 80%
Normal_val_img_names = Normal_image_names[split_1:split_2]       # next 10%
Normal_test_img_names = Normal_image_names[split_2:split_3]      # next 8%
Normal_webpred_img_names = Normal_image_names[split_3:]          # whatever is left

# Summarise the size of every split
for summary_text, split_names in (
    ("No. of Normal images to be sent to the Normal Training Dataset:", Normal_train_img_names),
    ("No. of Normal images to be sent to the Normal Validation Dataset:", Normal_val_img_names),
    ("No. of Normal images to be sent to the Normal Testing Dataset:", Normal_test_img_names),
    ("Remaining Normal images to be sent to the Normal Web App Prediction Dataset:", Normal_webpred_img_names),
):
    print(summary_text, len(split_names))


No. of Normal images to be sent to the Normal Training Dataset: 1072
No. of Normal images to be sent to the Normal Validation Dataset: 134
No. of Normal images to be sent to the Normal Testing Dataset: 108
Remaining Normal images to be sent to the Normal Web App Prediction Dataset: 27


In [30]:
# Copy the images from the Normal directory to their respective
# Training, Validation, Testing and WebApp Prediction datasets.
# Each entry: (file names in the split, destination directory, message prefix, message suffix).
normal_copy_plan = [
    (Normal_train_img_names, Normal_Train_Dir,
     "Copied ", " images from Normal images dataset path to the Normal images Training directory"),
    (Normal_val_img_names, Normal_Val_Dir,
     "Copied ", " images from Normal images dataset path to the Normal images Validation directory"),
    (Normal_test_img_names, Normal_Test_Dir,
     "Copied ", " images from Normal images dataset path to the Normal images Test directory"),
    (Normal_webpred_img_names, Normal_WebPred_Dir,
     "Copied the remaining ", " images from Normal dataset path to the Normal WebApp Predictions directory"),
]

for split_img_names, dest_dir, msg_prefix, msg_suffix in normal_copy_plan:
    # Make sure the destination exists so the copy does not fail on a partially created tree.
    os.makedirs(dest_dir, exist_ok=True)
    for img_element in split_img_names:
        shutil.copy2(os.path.join(NORMAL_IMAGES_PATH, img_element),
                     os.path.join(dest_dir, img_element))
    # Report len() rather than a leftover loop index: an empty split then prints 0
    # instead of a stale count from the previous loop (or raising NameError).
    print(msg_prefix, len(split_img_names), msg_suffix)

Copied  1072  images from Normal images dataset path to the Normal images Training directory
Copied  134  images from Normal images dataset path to the Normal images Validation directory
Copied  108  images from Normal images dataset path to the Normal images Test directory
Copied the remaining  27  images from Normal dataset path to the Normal WebApp Predictions directory


The dataset for Three Label Classification has been prepared.