#### This code:
####   1. Loads in all the pictures from the COVID_QU_EX dataset
####   2. Makes five train-test splits (the original one plus four random re-splits)
####   3. Saves the train-test splits to unique files (pkl)
####   4. For each split, selects the COVID-19 images of the train+val set
####   5. For the COVID-19 images, makes sub-splits that keep [0.8, 0.6, 0.4, 0.2] of the images
####   6. Saves every ratio-split variation of the COVID-19 images to its own file (pkl)
##### Note: The COVID-19 images are used for training Generative Adversarial Networks, and the train-test datasets are used for classification


In [None]:
import random
import sys
import os
import pickle
from sklearn.model_selection import train_test_split

In [73]:
# Class names of the dataset and the integer label assigned to each one
# (the label is the position of the name in `classes`).
classes = ['COVID-19', 'Non-COVID', 'Normal']
class_to_idx = dict(zip(classes, range(len(classes))))

def get_images_from_folder(folder):
    """Collect the image file names of every class found under *folder*.

    For each name in the module-level ``classes`` list, the directory
    ``<folder>/<class name>/images`` is walked (following symlinks) and every
    file found is recorded.

    Args:
        folder: Path of one dataset folder (e.g. the Train, Val or Test one).

    Returns:
        A list of ``(file_name, class_index)`` tuples, where ``class_index``
        comes from ``class_to_idx``. Only the bare file name is stored, not
        the directory it was found in.
    """
    instances = []
    for label_name in classes:
        label = class_to_idx[label_name]
        images_dir = os.path.join(folder, label_name, 'images')
        # Sort both the walked directories and the file names so the
        # resulting order is deterministic.
        walked = sorted(os.walk(images_dir, followlinks=True))
        for _dirpath, _dirnames, file_names in walked:
            instances.extend((name, label) for name in sorted(file_names))
    return instances

In [74]:
# This code will create 5 train-test splits from the original COVID-Qu-Ex dataset.
# The first split is the original, given by the authors of the dataset (the original cut was
# implemented for a reason).
# The paths are built with os.path.join instead of backslash literals: '\I', '\T'
# and '\V' are invalid escape sequences, and hard-coded backslashes only work on Windows.
folders = {
    'train': os.path.join('.', 'Image_sets', 'Train'),
    'val': os.path.join('.', 'Image_sets', 'Val'),
    'test': os.path.join('.', 'Image_sets', 'Test'),
}

train_set = get_images_from_folder(folders['train'])
test_set = get_images_from_folder(folders['test'])
val_set = get_images_from_folder(folders['val'])

# Absolute sizes of the three subsets.
print(len(train_set))
print(len(test_set))
print(len(val_set))

# Share of each subset in the whole dataset.
total_images = len(train_set) + len(val_set) + len(test_set)
print(len(test_set) / total_images)   # test set is 20%
print(len(val_set) / total_images)    # val set is 16%
print(len(train_set) / total_images)  # train set is 64%

21715
6788
5417
0.20011792452830188
0.15969929245283018
0.6401827830188679


In [75]:
# Pickle the train/val pair and the test set of one split into two files.
def save_sets(train_set, val_set, test_set, split):
    """Write one train/val/test split to disk with pickle.

    Two files are created in the current working directory:
    ``<split>_split_train_and_val.pkl`` holding the tuple
    ``(train_set, val_set)`` and ``<split>_split_test.pkl`` holding
    ``test_set``.

    Args:
        train_set: Training samples of the split.
        val_set: Validation samples of the split.
        test_set: Test samples of the split.
        split: Name of the split, used as the file-name prefix.
    """
    payload_by_file = {
        f'{split}_split_train_and_val.pkl': (train_set, val_set),
        f'{split}_split_test.pkl': test_set,
    }
    for file_name, payload in payload_by_file.items():
        with open(file_name, 'wb') as handle:
            pickle.dump(payload, handle)

def check_ratio(train, test, val, num_classes=3):
    """Print the share of every class inside each of the three subsets.

    Used to make sure that the class ratios are roughly the same in each
    train-test split. For every subset (in the order train, test, val) one
    line per class index is printed.

    Args:
        train: Samples of the training subset; the class index is read from
            position 1 of each sample.
        test: Samples of the test subset, same layout.
        val: Samples of the validation subset, same layout.
        num_classes: Number of distinct class indices to count (default 3).
    """
    for subset in (train, test, val):
        counts = [0] * num_classes
        for image in subset:
            counts[image[1]] += 1
        total = sum(counts)
        for class_index, count in enumerate(counts):
            # An empty subset has no meaningful ratio; report 0.0 instead of
            # failing with a ZeroDivisionError.
            ratio = count / total if total else 0.0
            print(f'The ratio of the {class_index} class is {ratio}')

In [76]:
save_sets(train_set, val_set, test_set, 'orig')  # The first split is the original one from COVID_QU_EX database
from sklearn.model_selection import train_test_split

# Pool every image so that each additional split is drawn from the whole dataset.
# (Named `all_images` rather than `all` so the builtin `all` is not shadowed.)
all_images = [*train_set, *val_set, *test_set]

# Fractions of the train+val COVID-19 images kept for GAN training; the complete
# set is saved separately under ratio 1.
ratios = [0.8, 0.6, 0.4, 0.2]

# The validation set should be 16% of all data; it is cut from the train+val
# pool, which is 80% (4/5) of all data, hence the 5/4 factor.
val_size = 0.16*(5/4)

for i in range(4):
    trainey, test = train_test_split(all_images, test_size=0.2, random_state=42+i)
    train, val = train_test_split(trainey, test_size=val_size, random_state=420+2*i)
    # Save the split drawn in this iteration (not the original train/val/test sets).
    save_sets(train, val, test, str(i))
    print("Done with saving sets, now saving covid pictures for gan")
    # Furthermore, we need gan_subsets for all splits, with 1, 0.8, 0.6, 0.4, 0.2 ratio of the train-val set
    # We need only COVID-19 pictures
    covid = [x for x in trainey if x[1] == class_to_idx['COVID-19']]
    with open(f'{i}_split_{1}_gan.pkl', 'wb') as file:
        pickle.dump(covid, file)
    for ratio in ratios:
        # The "test" part returned by train_test_split has `ratio` of the
        # COVID-19 images, which is the subset kept for the GAN.
        _, gan = train_test_split(covid, test_size=ratio, random_state=420+3*i)
        with open(f'{i}_split_{ratio}_gan.pkl', 'wb') as file:
            pickle.dump(gan, file)
    print("Done with saving gans")

Done with saving sets, now saving covid pictures for gan
Done with saving gans
Done with saving sets, now saving covid pictures for gan
Done with saving gans
Done with saving sets, now saving covid pictures for gan
Done with saving gans
Done with saving sets, now saving covid pictures for gan
Done with saving gans


In [95]:
# Making the split-ratio subsets for the original dataset split.
# Only the COVID-19 images of the original train+val sets are used.
data = [*train_set, *val_set]
covid = [x for x in data if x[1] == class_to_idx['COVID-19']]
with open(f'orig_split_{1}_gan.pkl', 'wb') as file:
    pickle.dump(covid, file)

# Defined here so this cell does not rely on variables leaking from another cell.
ratios = [0.8, 0.6, 0.4, 0.2]
# Fixed seed for the original split. It equals 420 + 3*i for i = 3, the value the
# loop variable of the split-generation cell holds once that loop has finished,
# so the subsets are the same as the ones produced with the leaked `i`.
orig_random_state = 429
for ratio in ratios:
    # The "test" part returned by train_test_split has `ratio` of the
    # COVID-19 images, which is the subset kept for the GAN.
    _, gan = train_test_split(covid, test_size=ratio, random_state=orig_random_state)
    with open(f'orig_split_{ratio}_gan.pkl', 'wb') as file:
        pickle.dump(gan, file)

27132
