In [37]:
import os
import pandas as pd
import numpy as np

In [38]:
lines = ["img_name,case_num,source,label,typ,frame"]
for img_name in os.listdir("data"):
    case_num, source, label, _, typ, _, frame = img_name.split("_")
    case_num = int(case_num)
    frame = int(frame[5:frame.find(".")])
    data = f"{img_name},{case_num},{source},{label},{typ},{frame}"
    lines.append(data)

f = open("metadata.csv", "w") 
f.writelines("\n".join(lines))
f.close()

In [39]:
df = pd.read_csv("metadata.csv").sample(frac=1).reset_index(drop=True)

In [40]:
df

Unnamed: 0,img_name,case_num,source,label,typ,frame
0,185_uf_other_prc_convex_clean_frame20.jpg,185,uf,other,convex,20
1,101_litfl_other_prc_convex_clean_frame8.jpg,101,litfl,other,convex,8
2,29_grepmed_covid_prc_linear_clean_frame56.jpg,29,grepmed,covid,linear,56
3,31_grepmed_covid_prc_convex_clean_frame0.jpg,31,grepmed,covid,convex,0
4,104_litfl_other_prc_linear_clean_frame16.jpg,104,litfl,other,linear,16
...,...,...,...,...,...,...
2349,197_uf_other_prc_convex_clean_frame45.jpg,197,uf,other,convex,45
2350,83_butterfly_covid_prc_convex_clean_frame21.jpg,83,butterfly,covid,convex,21
2351,193_uf_other_prc_convex_clean_frame32.jpg,193,uf,other,convex,32
2352,115_litfl_other_prc_convex_clean_frame16.jpg,115,litfl,other,convex,16


In [41]:
for source in df["source"].unique():
    print(source, len(df[df.source == source]["case_num"].unique()), sep=" - ")

uf - 24
litfl - 63
grepmed - 20
core - 18
butterfly - 35
clarius - 23
paper - 22
pocusatlas - 32
radio - 5


In [42]:
def sample_test_cases(num, source):
    cases = df[df.source == source]["case_num"].unique()
    idxs = []
    for i in range(num):
        random_idx = np.random.randint(0, len(cases))
        while random_idx in idxs:
            random_idx = np.random.randint(0, len(cases))
        idxs.append(random_idx)
    return cases[np.array(idxs)]

def sample_val_cases(num, source, test_cases):
    cases = df[df.source == source]["case_num"].unique()
    idxs = []
    for i in range(num):
        random_idx = np.random.randint(0, len(cases))
        while random_idx in idxs or cases[random_idx] in test_cases:
            random_idx = np.random.randint(0, len(cases))
        idxs.append(random_idx)
    return cases[np.array(idxs)]

In [43]:
i_1, i_2 = 4,1
test_cases = np.concatenate([
    sample_test_cases(5, "uf"),
    sample_test_cases(5, "pocusatlas"),
    sample_test_cases(5, "butterfly"),
    sample_test_cases(5, "core"),
    sample_test_cases(5, "clarius"),
    sample_test_cases(5, "paper"),
    sample_test_cases(5, "litfl"),
    sample_test_cases(5, "grepmed"),
    sample_test_cases(1, "radio")])

val_cases = np.concatenate([
    sample_val_cases(3, "uf", test_cases),
    sample_val_cases(3, "pocusatlas", test_cases),
    sample_val_cases(3, "butterfly", test_cases),
    sample_val_cases(3, "core", test_cases),
    sample_val_cases(3, "clarius", test_cases),
    sample_val_cases(3, "paper", test_cases),
    sample_val_cases(3, "litfl", test_cases),
    sample_val_cases(3, "grepmed", test_cases),
    sample_val_cases(1, "radio", test_cases)])

In [44]:
test_cases

array([195, 183, 181, 188, 177,  54,  49,  78,  60,  50,   8,   9,  83,
        91,  86, 162, 159, 164, 174, 171, 221, 236, 217, 220, 231, 242,
       210, 213, 208, 199, 147, 137, 150, 103, 120,  25,  27,  34,  42,
        36, 156], dtype=int64)

In [45]:
len(test_cases)

41

In [46]:
val_cases

array([184, 191, 194,  62,  73,  64,  89,  19,  90, 172, 167, 173, 229,
       235, 238, 203, 212, 205, 114,  96, 106,  40,  35,  38, 155],
      dtype=int64)

In [47]:
len(val_cases)

25

In [48]:
set(test_cases).intersection(set(val_cases))

set()

In [49]:
df["label"].unique()

array(['other', 'covid', 'pneumonia', 'normal'], dtype=object)

In [50]:
if not os.path.exists("train_data/"):
    os.mkdir("train_data/")
    os.mkdir("train_data/other")
    os.mkdir("train_data/covid")
    os.mkdir("train_data/pneumonia")
    os.mkdir("train_data/normal")
    
if not os.path.exists("val_data/"):
    os.mkdir("val_data/")
    os.mkdir("val_data/other")
    os.mkdir("val_data/covid")
    os.mkdir("val_data/pneumonia")
    os.mkdir("val_data/normal")

if not os.path.exists("test_data/"):
    os.mkdir("test_data/")
    os.mkdir("test_data/other")
    os.mkdir("test_data/covid")
    os.mkdir("test_data/pneumonia")
    os.mkdir("test_data/normal")

In [51]:
import shutil

for i, row in df.iterrows():
    if row["case_num"] in test_cases:
        shutil.copy(os.path.join("data", row["img_name"]), os.path.join("test_data", row["label"],  row["img_name"]))
    elif row["case_num"] in val_cases:
        shutil.copy(os.path.join("data", row["img_name"]), os.path.join("val_data", row["label"],  row["img_name"]))
    else:
        shutil.copy(os.path.join("data", row["img_name"]), os.path.join("train_data", row["label"], row["img_name"]))

In [52]:
os.rename("train_data/normal", "train_data/regular")
os.rename("val_data/normal", "val_data/regular")
os.rename("test_data/normal", "test_data/regular")

shutil.rmtree("train_data/other")
shutil.rmtree("val_data/other")
shutil.rmtree("test_data/other")

In [53]:
print("train")
print("covid:", len(os.listdir("train_data/covid")))
print("pneumonia:", len(os.listdir("train_data/pneumonia")))
print("regular:", len(os.listdir("train_data/regular")))
print("val")
print("covid:", len(os.listdir("val_data/covid")))
print("pneumonia:", len(os.listdir("val_data/pneumonia")))
print("regular:", len(os.listdir("val_data/regular")))
print("test")
print("covid:", len(os.listdir("test_data/covid")))
print("pneumonia:", len(os.listdir("test_data/pneumonia")))
print("regular:", len(os.listdir("test_data/regular")))

train
covid: 478
pneumonia: 380
regular: 181
val
covid: 70
pneumonia: 59
regular: 20
test
covid: 140
pneumonia: 48
regular: 70


In [54]:
import albumentations as A
import cv2

In [55]:
if not os.path.exists("augmented_data"):
    os.mkdir("augmented_data")
    shutil.copytree("train_data", "augmented_data/train_data")
    shutil.copytree("val_data", "augmented_data/val_data")   

In [56]:
def augment_images(split, cls, complete_to, transform):
    image_paths = os.listdir(split + "/" + cls)
    n = complete_to - len(image_paths)
    for i in range(n):
        idx = np.random.randint(0, len(image_paths))
        current_image_path = os.path.join(split + "/" + cls, image_paths[idx])
        image = cv2.imread(current_image_path)
        cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        transformed_image = transform(image=image)["image"]
        cv2.imwrite(os.path.join(f"augmented_data/{split}/{cls}", f"augmented{i}_" + image_paths[idx]), transformed_image)    

In [57]:
transform = A.Compose([
    A.HorizontalFlip(p=1),
    A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=30)
])

In [58]:
augment_images("train_data", "covid", 500, transform)
augment_images("train_data", "pneumonia", 500, transform)
augment_images("train_data", "regular", 500, transform)

augment_images("val_data", "covid", 70, transform)
augment_images("val_data", "pneumonia", 59, transform)
augment_images("val_data", "regular", 20, transform)

In [59]:
print("train")
print("covid:", len(os.listdir("augmented_data/train_data/covid")))
print("pneumonia:", len(os.listdir("augmented_data/train_data/pneumonia")))
print("regular:", len(os.listdir("augmented_data/train_data/regular")))
print("val")
print("covid:", len(os.listdir("augmented_data/val_data/covid")))
print("pneumonia:", len(os.listdir("augmented_data/val_data/pneumonia")))
print("regular:", len(os.listdir("augmented_data/val_data/regular")))

train
covid: 500
pneumonia: 500
regular: 500
val
covid: 70
pneumonia: 59
regular: 20


In [60]:
def sample_test_images(cls, n):
    path = "test_data" + "/" + cls
    image_paths = os.listdir(path)
    sampled = []
    for i in range(n):
        idx = np.random.randint(0, len(image_paths))
        while image_paths[idx] in sampled:
            idx = np.random.randint(0, len(image_paths))
        shutil.copy(os.path.join(path, image_paths[idx]), 
                    os.path.join(f"augmented_data/test_data/{cls}", image_paths[idx]))
        sampled.append(image_paths[idx])

In [61]:
if not os.path.exists("augmented_data/test_data"):
    os.mkdir("augmented_data/test_data")
    os.mkdir("augmented_data/test_data/covid")
    os.mkdir("augmented_data/test_data/pneumonia")
    os.mkdir("augmented_data/test_data/regular")

In [62]:
sample_test_images("covid", 140)
sample_test_images("pneumonia", 48)
sample_test_images("regular", 70)

In [63]:
print("train")
print("covid:", len(os.listdir("augmented_data/train_data/covid")))
print("pneumonia:", len(os.listdir("augmented_data/train_data/pneumonia")))
print("regular:", len(os.listdir("augmented_data/train_data/regular")))
print("val")
print("covid:", len(os.listdir("augmented_data/val_data/covid")))
print("pneumonia:", len(os.listdir("augmented_data/val_data/pneumonia")))
print("regular:", len(os.listdir("augmented_data/val_data/regular")))
print("test")
print("covid:", len(os.listdir("augmented_data/test_data/covid")))
print("pneumonia:", len(os.listdir("augmented_data/test_data/pneumonia")))
print("regular:", len(os.listdir("augmented_data/test_data/regular")))

train
covid: 500
pneumonia: 500
regular: 500
val
covid: 70
pneumonia: 59
regular: 20
test
covid: 140
pneumonia: 48
regular: 70
