# Data Set Creation for Binary Classification
The COVID-19 Radiography Database being used is available at: https://www.kaggle.com/tawsifurrahman/covid19-radiography-database

This database consists of 3616 COVID-19, 10192 Normal, 6012 Lung Opacity (Non-COVID Lung Infection) and 1345 Non-COVID Viral Pneumonia X-ray images.

The database will be divided into training, validation, testing and webapp prediction datasets for the COVID and Non-COVID categories, where Non-COVID includes the Normal, Non-COVID Lung Infection and Viral Pneumonia images. The split is 80% of the data for training, 10% for validation, 9% for testing the model, and the remaining 1% for testing the webapp.

In [3]:
import pandas as pd
import os
import random
import shutil

In [4]:
# COVID-19 samples: image folder and the CSV file holding their metadata.
COVID_IMAGES_PATH = "COVID-19_Radiography_Dataset/COVID"
COVID_FILE_PATH = "COVID-19_Radiography_Dataset/COVID.metadata.csv"

# Load the metadata for all COVID-19 images, then show its shape and first rows.
covid_df = pd.read_csv(filepath_or_buffer=COVID_FILE_PATH)
print(covid_df.shape)
covid_df.head()

(3616, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,COVID-1,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
1,COVID-2,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
2,COVID-3,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
3,COVID-4,PNG,256*256,https://sirm.org/category/senza-categoria/covi...
4,COVID-5,PNG,256*256,https://sirm.org/category/senza-categoria/covi...


In [5]:
# Viral Pneumonia samples: image folder and the CSV file holding their metadata.
VIRAL_PNEUMONIA_IMAGES_PATH = "COVID-19_Radiography_Dataset/Viral Pneumonia"
VIRAL_PNEUMONIA_FILE_PATH = "COVID-19_Radiography_Dataset/Viral Pneumonia.metadata.csv"

# Load the metadata for all Viral Pneumonia images, then show its shape and first rows.
viral_pneumonia_df = pd.read_csv(filepath_or_buffer=VIRAL_PNEUMONIA_FILE_PATH)
print(viral_pneumonia_df.shape)
viral_pneumonia_df.head()

(1345, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,Viral Pneumonia-1,PNG,256*256,https://www.kaggle.com/paultimothymooney/chest...
1,Viral Pneumonia-2,PNG,256*256,https://www.kaggle.com/paultimothymooney/chest...
2,Viral Pneumonia-3,PNG,256*256,https://www.kaggle.com/paultimothymooney/chest...
3,Viral Pneumonia-4,PNG,256*256,https://www.kaggle.com/paultimothymooney/chest...
4,Viral Pneumonia-5,PNG,256*256,https://www.kaggle.com/paultimothymooney/chest...


In [6]:
# Normal samples: image folder and the CSV file holding their metadata.
NORMAL_IMAGES_PATH = "COVID-19_Radiography_Dataset/Normal"
NORMAL_FILE_PATH = "COVID-19_Radiography_Dataset/Normal.metadata.csv"

# Load the metadata for all Normal images, then show its shape and first rows.
normal_df = pd.read_csv(filepath_or_buffer=NORMAL_FILE_PATH)
print(normal_df.shape)
normal_df.head()


(10192, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,NORMAL-1,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
1,NORMAL-2,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
2,NORMAL-3,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
3,NORMAL-4,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
4,NORMAL-5,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...


In [7]:
# Non-COVID Lung Infection (Lung Opacity) samples: image folder and metadata CSV.
LUNG_OPACITY_IMAGES_PATH = "COVID-19_Radiography_Dataset/Lung_Opacity"
LUNG_OPACITY_FILE_PATH = "COVID-19_Radiography_Dataset/Lung_Opacity.metadata.csv"

# Load the metadata for all Lung Opacity images, then show its shape and first rows.
lung_opacity_df = pd.read_csv(filepath_or_buffer=LUNG_OPACITY_FILE_PATH)
print(lung_opacity_df.shape)
lung_opacity_df.head()

(6012, 4)


Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,Lung_Opacity-1,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
1,Lung_Opacity-2,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
2,Lung_Opacity-3,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
3,Lung_Opacity-4,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...
4,Lung_Opacity-5,PNG,256*256,https://www.kaggle.com/c/rsna-pneumonia-detect...


In [9]:
# Create the output directory tree for the training, validation, testing and
# webapp prediction datasets (one COVID and one Non-COVID folder per split).
Target_Dir = 'BinaryClassificationDataset'
Covid_Train_Dir = 'BinaryClassificationDataset/Train/COVID'
NonCovid_Train_Dir = 'BinaryClassificationDataset/Train/Non-COVID'

Covid_Val_Dir = 'BinaryClassificationDataset/Validation/COVID'
NonCovid_Val_Dir = 'BinaryClassificationDataset/Validation/Non-COVID'

Covid_Test_Dir = 'BinaryClassificationDataset/Test/COVID'
NonCovid_Test_Dir = 'BinaryClassificationDataset/Test/Non-COVID'

Covid_WebPred_Dir = 'BinaryClassificationDataset/WebAppPrediction/COVID'
NonCovid_WebPred_Dir = 'BinaryClassificationDataset/WebAppPrediction/Non-COVID'

# Every folder is checked on its own rather than only checking Target_Dir.
# Otherwise a tree that already exists but is incomplete (for example after an
# interrupted earlier run) would be left with missing split folders, and the
# copy steps that write into those folders would fail.
_folders_to_create = (
    (Target_Dir, "Created Binary Classification Dataset folder"),
    (Covid_Train_Dir, "Created COVID Training Data folder"),
    (NonCovid_Train_Dir, "Created Non-COVID Training Data folder"),
    (Covid_Val_Dir, "Created COVID Validation Data folder"),
    (NonCovid_Val_Dir, "Created Non-COVID Validation Data folder"),
    (Covid_Test_Dir, "Created COVID Test Data folder"),
    (NonCovid_Test_Dir, "Created Non-COVID Test Data folder"),
    (Covid_WebPred_Dir, "Created COVID WebApp Prediction Data folder"),
    (NonCovid_WebPred_Dir, "Created Non-COVID WebApp Prediction Data folder"),
)

for _folder, _created_message in _folders_to_create:
    # Only create (and report) folders that are actually missing, so running
    # this again on a complete tree does nothing and prints nothing.
    if not os.path.exists(_folder):
        os.makedirs(_folder)
        print(_created_message)

Created Binary Classification Dataset folder
Created COVID Training Data folder
Created Non-COVID Training Data folder
Created COVID Validation Data folder
Created Non-COVID Validation Data folder
Created COVID Test Data folder
Created Non-COVID Test Data folder
Created COVID WebApp Prediction Data folder
Created Non-COVID WebApp Prediction Data folder


In [10]:
# Collect the file names found in each of the four source image folders.
(
    Covid_image_names,
    Viral_Pneumonia_image_names,
    Normal_image_names,
    Lung_Opacity_image_names,
) = [
    os.listdir(source_folder)
    for source_folder in (
        COVID_IMAGES_PATH,
        VIRAL_PNEUMONIA_IMAGES_PATH,
        NORMAL_IMAGES_PATH,
        LUNG_OPACITY_IMAGES_PATH,
    )
]


In [11]:
# Report how many image file names were found for each source category.
for _message, _names in (
    ("Total number of COVID-19 images: ", Covid_image_names),
    ("Total number of Viral Pneumonia images: ", Viral_Pneumonia_image_names),
    ("Total number of Normal images: ", Normal_image_names),
    ("Total number of Non-COVID Lung Infection images: ", Lung_Opacity_image_names),
):
    print(_message, len(_names))

Total number of COVID-19 images:  3616
Total number of Viral Pneumonia images:  1345
Total number of Normal images:  10192
Total number of Non-COVID Lung Infection images:  6012


In [12]:
# Partition the COVID-19 file names into training, validation, model-testing
# and webapp-prediction lists.
# Sorting first gives the names a fixed order, and seeding the generator before
# shuffling makes the resulting split the same on every run.
random.seed(70)
Covid_image_names.sort()
random.shuffle(Covid_image_names)

# Cut points at 80%, 90% and 99% of the list length, truncated to integers.
split_1, split_2, split_3 = (
    int(fraction * len(Covid_image_names)) for fraction in (0.8, 0.9, 0.99)
)

Covid_train_img_names = Covid_image_names[:split_1]
Covid_val_img_names = Covid_image_names[split_1:split_2]
Covid_test_img_names = Covid_image_names[split_2:split_3]
Covid_webpred_img_names = Covid_image_names[split_3:]

# Report the size of each subset.
for _message, _subset in (
    ("No. of COVID-19 images to be used for training: ", Covid_train_img_names),
    ("No. of COVID-19 images to be used for validation: ", Covid_val_img_names),
    ("No. of COVID-19 images to be used for testing: ", Covid_test_img_names),
    ("Remaining COVID-19 images that can be used for web application predictions: ", Covid_webpred_img_names),
):
    print(_message, len(_subset))

No. of COVID-19 images to be used for training:  2892
No. of COVID-19 images to be used for validation:  362
No. of COVID-19 images to be used for testing:  325
Remaining COVID-19 images that can be used for web application predictions:  37


In [13]:
# Copy the COVID-19 images from the source folder into their Training,
# Validation, Test and WebApp Prediction directories, reporting the number of
# files sent to each.
def _copy_covid_images(image_names, destination_dir):
    """Copy the named files from COVID_IMAGES_PATH into destination_dir.

    Returns the number of names processed. The count comes from the list
    length rather than a loop index, so an empty split reports 0 instead of
    reusing a stale index from an earlier loop (or raising NameError).
    """
    for image_name in image_names:
        shutil.copy2(
            os.path.join(COVID_IMAGES_PATH, image_name),
            os.path.join(destination_dir, image_name),
        )
    return len(image_names)


print("Copied ", _copy_covid_images(Covid_train_img_names, Covid_Train_Dir), " images from COVID-19 dataset path to the COVID-19 Training directory")

print("Copied ", _copy_covid_images(Covid_val_img_names, Covid_Val_Dir), " images from COVID-19 dataset path to the COVID-19 Validation directory")

print("Copied ", _copy_covid_images(Covid_test_img_names, Covid_Test_Dir), " images from COVID-19 dataset path to the COVID-19 Test directory")

print("Copied the remaining ", _copy_covid_images(Covid_webpred_img_names, Covid_WebPred_Dir), " images from COVID-19 dataset path to the COVID-19 WebApp Predictions directory")

Copied  2892  images from COVID-19 dataset path to the COVID-19 Training directory
Copied  362  images from COVID-19 dataset path to the COVID-19 Validation directory
Copied  325  images from COVID-19 dataset path to the COVID-19 Test directory
Copied the remaining  37  images from COVID-19 dataset path to the COVID-19 WebApp Predictions directory


In [14]:
# Partition the Viral Pneumonia file names into training, validation,
# model-testing and webapp-prediction lists.
# The names are sorted and the generator is seeded before shuffling.
random.seed(50)
Viral_Pneumonia_image_names.sort()
random.shuffle(Viral_Pneumonia_image_names)

# Cut points at 80%, 90% and 99% of the list length, truncated to integers.
split_1, split_2, split_3 = (
    int(fraction * len(Viral_Pneumonia_image_names)) for fraction in (0.8, 0.9, 0.99)
)

Viral_Pneumonia_train_img_names = Viral_Pneumonia_image_names[:split_1]
Viral_Pneumonia_val_img_names = Viral_Pneumonia_image_names[split_1:split_2]
Viral_Pneumonia_test_img_names = Viral_Pneumonia_image_names[split_2:split_3]
Viral_Pneumonia_webpred_img_names = Viral_Pneumonia_image_names[split_3:]

# Report the size of each subset.
for _message, _subset in (
    ("No. of Viral Pneumonia images to be sent to the Non-COVID Training Dataset:", Viral_Pneumonia_train_img_names),
    ("No. of Viral Pneumonia images to be sent to the Non-COVID Validation Dataset:", Viral_Pneumonia_val_img_names),
    ("No. of Viral Pneumonia images to be sent to the Non-COVID Testing Dataset:", Viral_Pneumonia_test_img_names),
    ("Remaining Viral Pneumonia images to be sent to the Non-COVID Web App Prediction Dataset:", Viral_Pneumonia_webpred_img_names),
):
    print(_message, len(_subset))


No. of Viral Pneumonia images to be sent to the Non-COVID Training Dataset: 1076
No. of Viral Pneumonia images to be sent to the Non-COVID Validation Dataset: 134
No. of Viral Pneumonia images to be sent to the Non-COVID Testing Dataset: 121
Remaining Viral Pneumonia images to be sent to the Non-COVID Web App Prediction Dataset: 14


In [15]:
# Copy the Viral Pneumonia images into the Non-COVID Training, Validation,
# Test and WebApp Prediction directories, reporting the number of files sent
# to each.
def _copy_viral_pneumonia_images(image_names, destination_dir):
    """Copy the named files from VIRAL_PNEUMONIA_IMAGES_PATH into destination_dir.

    Returns the number of names processed. The count comes from the list
    length rather than a loop index, so an empty split reports 0 instead of
    reusing a stale index from an earlier loop (or raising NameError).
    """
    for image_name in image_names:
        shutil.copy2(
            os.path.join(VIRAL_PNEUMONIA_IMAGES_PATH, image_name),
            os.path.join(destination_dir, image_name),
        )
    return len(image_names)


print("Copied ", _copy_viral_pneumonia_images(Viral_Pneumonia_train_img_names, NonCovid_Train_Dir), " images from Viral Pneumonia dataset path to the Non-COVID Training directory")

print("Copied ", _copy_viral_pneumonia_images(Viral_Pneumonia_val_img_names, NonCovid_Val_Dir), " images from Viral Pneumonia dataset path to the Non-COVID Validation directory")

print("Copied ", _copy_viral_pneumonia_images(Viral_Pneumonia_test_img_names, NonCovid_Test_Dir), " images from Viral Pneumonia dataset path to the Non-COVID Test directory")

print("Copied the remaining ", _copy_viral_pneumonia_images(Viral_Pneumonia_webpred_img_names, NonCovid_WebPred_Dir), " images from Viral Pneumonia dataset path to the Non-COVID WebApp Predictions directory")

Copied  1076  images from Viral Pneumonia dataset path to the Non-COVID Training directory
Copied  134  images from Viral Pneumonia dataset path to the Non-COVID Validation directory
Copied  121  images from Viral Pneumonia dataset path to the Non-COVID Test directory
Copied the remaining  14  images from Viral Pneumonia dataset path to the Non-COVID WebApp Predictions directory


In [16]:
# Partition the Normal file names into the lists destined for the Non-COVID
# training, validation and testing datasets. The leftover names go to the
# WebApp Predictions dataset, so they can be used to test the predictions of
# different models through the web application.
# The names are sorted and the generator is seeded before shuffling.
random.seed(30)
Normal_image_names.sort()
random.shuffle(Normal_image_names)

# Cut points at 80%, 90% and 99% of the list length, truncated to integers.
split_1, split_2, split_3 = (
    int(fraction * len(Normal_image_names)) for fraction in (0.8, 0.9, 0.99)
)

Normal_train_img_names = Normal_image_names[:split_1]
Normal_val_img_names = Normal_image_names[split_1:split_2]
Normal_test_img_names = Normal_image_names[split_2:split_3]
Normal_webpred_img_names = Normal_image_names[split_3:]

# Report the size of each subset.
for _message, _subset in (
    ("No. of Normal images to be sent to the Non-COVID Training Dataset:", Normal_train_img_names),
    ("No. of Normal images to be sent to the Non-COVID Validation Dataset:", Normal_val_img_names),
    ("No. of Normal images to be sent to the Non-COVID Testing Dataset:", Normal_test_img_names),
    ("Remaining Normal images to be sent to the Non-COVID Web App Prediction Dataset:", Normal_webpred_img_names),
):
    print(_message, len(_subset))

No. of Normal images to be sent to the Non-COVID Training Dataset: 8153
No. of Normal images to be sent to the Non-COVID Validation Dataset: 1019
No. of Normal images to be sent to the Non-COVID Testing Dataset: 918
Remaining Normal images to be sent to the Non-COVID Web App Prediction Dataset: 102


In [17]:
# Copy the Normal images into the Non-COVID Training, Validation, Test and
# WebApp Prediction directories, reporting the number of files sent to each.
def _copy_normal_images(image_names, destination_dir):
    """Copy the named files from NORMAL_IMAGES_PATH into destination_dir.

    Returns the number of names processed. The count comes from the list
    length rather than a loop index, so an empty split reports 0 instead of
    reusing a stale index from an earlier loop (or raising NameError).
    """
    for image_name in image_names:
        shutil.copy2(
            os.path.join(NORMAL_IMAGES_PATH, image_name),
            os.path.join(destination_dir, image_name),
        )
    return len(image_names)


print("Copied ", _copy_normal_images(Normal_train_img_names, NonCovid_Train_Dir), " images from Normal images dataset path to the Non-COVID Training directory")

print("Copied ", _copy_normal_images(Normal_val_img_names, NonCovid_Val_Dir), " images from Normal images dataset path to the Non-COVID Validation directory")

print("Copied ", _copy_normal_images(Normal_test_img_names, NonCovid_Test_Dir), " images from Normal images dataset path to the Non-COVID Test directory")

print("Copied the remaining", _copy_normal_images(Normal_webpred_img_names, NonCovid_WebPred_Dir), " images from Normal images dataset path to the Non-COVID WebApp Predictions directory")

Copied  8153  images from Normal images dataset path to the Non-COVID Training directory
Copied  1019  images from Normal images dataset path to the Non-COVID Validation directory
Copied  918  images from Normal images dataset path to the Non-COVID Test directory
Copied the remaining 102  images from Normal images dataset path to the Non-COVID WebApp Predictions directory


In [18]:
# Partition the Non-COVID Lung Infection (Lung Opacity) file names into
# training, validation, model-testing and webapp-prediction lists.
# The names are sorted and the generator is seeded before shuffling.
random.seed(10)
Lung_Opacity_image_names.sort()
random.shuffle(Lung_Opacity_image_names)

# Cut points at 80%, 90% and 99% of the list length, truncated to integers.
split_1, split_2, split_3 = (
    int(fraction * len(Lung_Opacity_image_names)) for fraction in (0.8, 0.9, 0.99)
)

Lung_Opacity_train_img_names = Lung_Opacity_image_names[:split_1]
Lung_Opacity_val_img_names = Lung_Opacity_image_names[split_1:split_2]
Lung_Opacity_test_img_names = Lung_Opacity_image_names[split_2:split_3]
Lung_Opacity_webpred_img_names = Lung_Opacity_image_names[split_3:]

# Report the size of each subset.
for _message, _subset in (
    ("No. of Non-COVID Lung Infection images to be sent to the Non-COVID Training Dataset:", Lung_Opacity_train_img_names),
    ("No. of Non-COVID Lung Infection images to be sent to the Non-COVID Validation Dataset:", Lung_Opacity_val_img_names),
    ("No. of Non-COVID Lung Infection images to be sent to the Non-COVID Testing Dataset:", Lung_Opacity_test_img_names),
    ("Remaining Non-COVID Lung Infection images to be sent to the Non-COVID Web App Prediction Dataset:", Lung_Opacity_webpred_img_names),
):
    print(_message, len(_subset))

No. of Non-COVID Lung Infection images to be sent to the Non-COVID Training Dataset: 4809
No. of Non-COVID Lung Infection images to be sent to the Non-COVID Validation Dataset: 601
No. of Non-COVID Lung Infection images to be sent to the Non-COVID Testing Dataset: 541
Remaining Non-COVID Lung Infection images to be sent to the Non-COVID Web App Prediction Dataset: 61


In [19]:
# Copy the Lung Infection (Lung Opacity) images into the Non-COVID Training,
# Validation, Test and WebApp Prediction directories, reporting the number of
# files sent to each.
def _copy_lung_opacity_images(image_names, destination_dir):
    """Copy the named files from LUNG_OPACITY_IMAGES_PATH into destination_dir.

    Returns the number of names processed. The count comes from the list
    length rather than a loop index, so an empty split reports 0 instead of
    reusing a stale index from an earlier loop (or raising NameError).
    """
    for image_name in image_names:
        shutil.copy2(
            os.path.join(LUNG_OPACITY_IMAGES_PATH, image_name),
            os.path.join(destination_dir, image_name),
        )
    return len(image_names)


print("Copied ", _copy_lung_opacity_images(Lung_Opacity_train_img_names, NonCovid_Train_Dir), " images from Lung Infection images dataset path to the Non-COVID Training directory")

print("Copied ", _copy_lung_opacity_images(Lung_Opacity_val_img_names, NonCovid_Val_Dir), " images from Lung Infection images dataset path to the Non-COVID Validation directory")

print("Copied ", _copy_lung_opacity_images(Lung_Opacity_test_img_names, NonCovid_Test_Dir), " images from Lung Infection images dataset path to the Non-COVID Test directory")

print("Copied the remaining ", _copy_lung_opacity_images(Lung_Opacity_webpred_img_names, NonCovid_WebPred_Dir), " images from Lung Infection dataset path to the Non-COVID WebApp Predictions directory")

Copied  4809  images from Lung Infection images dataset path to the Non-COVID Training directory
Copied  601  images from Lung Infection images dataset path to the Non-COVID Validation directory
Copied  541  images from Lung Infection images dataset path to the Non-COVID Test directory
Copied the remaining  61  images from Lung Infection dataset path to the Non-COVID WebApp Predictions directory


In [20]:
# Report how many files each Non-COVID split directory now contains.
for _message, _split_dir in (
    ("Total number of Non-COVID images used for training: ", NonCovid_Train_Dir),
    ("Total number of Non-COVID images used for validation: ", NonCovid_Val_Dir),
    ("Total number of Non-COVID images used for testing: ", NonCovid_Test_Dir),
    ("Remaining Non-COVID images that can be used for web application predictions: ", NonCovid_WebPred_Dir),
):
    print(_message, len(os.listdir(_split_dir)))

Total number of Non-COVID images used for training:  14038
Total number of Non-COVID images used for validation:  1754
Total number of Non-COVID images used for testing:  1580
Remaining Non-COVID images that can be used for web application predictions:  177


Dataset for binary classification of COVID and Non-COVID Chest X-rays has been created