In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import cv2
import numpy as np

In [2]:
df = pd.read_csv("/data/BBox_List_2017.csv")

In [16]:
more_than_one = {}

for idx, item in df.iterrows():
    if item.Image in more_than_one:
        more_than_one[item.Image]+=1
    else:
        more_than_one[item.Image] = 1


In [18]:
items = [x for x in more_than_one if more_than_one[x]>1]

In [21]:
len(items)

93

In [22]:
new_data = []
excluded_data=  []
for idx, item in df.iterrows():
    if item.Image in items:
        excluded_data.append([item.Image, item.Label, item.x, item.y, item.w, item.h])
    else:
        new_data.append([item.Image, item.Label, item.x, item.y, item.w, item.h])

In [23]:
len(excluded_data)

197

In [24]:
len(new_data)

787

In [25]:
unique = pd.DataFrame(data=new_data, columns=['Image', 'Label', 'x', 'y', 'w', 'h'])
excluded = pd.DataFrame(data=excluded_data, columns=['Image', 'Label', 'x', 'y', 'w', 'h'])

In [29]:
unique.Label.value_counts()

Atelectasis     149
Cardiomegaly    135
Effusion        102
Pneumonia        97
Pneumothorax     86
Infiltrate       79
Nodule           76
Mass             63
Name: Label, dtype: int64

In [33]:
def function(data):
    X = data.drop(["Label"], axis=1).values
    y = data["Label"].values
    
    ss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    ss.get_n_splits(X, y)
    for i, (train_index, test_index) in enumerate(ss.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    x_cols = df.drop(["Label"], axis=1).columns.to_list()
    y_cols = df[["Label"]].columns.to_list()
    
    train = pd.DataFrame(data=X_train, columns=x_cols)
    test = pd.DataFrame(data=X_test, columns=x_cols)
    
    train[y_cols[0]] = y_train
    test[y_cols[0]] = y_test
    
    return train,test

In [38]:
train,test = function(unique)

In [39]:
train.shape, test.shape

((550, 6), (237, 6))

In [40]:
train.Label.value_counts()

Atelectasis     104
Cardiomegaly     95
Effusion         71
Pneumonia        68
Pneumothorax     60
Infiltrate       55
Nodule           53
Mass             44
Name: Label, dtype: int64

In [41]:
test.Label.value_counts()

Atelectasis     45
Cardiomegaly    40
Effusion        31
Pneumonia       29
Pneumothorax    26
Infiltrate      24
Nodule          23
Mass            19
Name: Label, dtype: int64

In [42]:
train.to_csv("/data/yolo_dataset/train_unique.csv", index=False)
test.to_csv("/data/yolo_dataset/test_unique.csv", index=False)

In [44]:
excluded.shape

(197, 6)

In [47]:
excluded.Image.value_counts()

00010277_000.png    4
00018427_004.png    3
00008814_010.png    3
00005066_030.png    3
00010828_039.png    3
                   ..
00011402_007.png    2
00012261_001.png    2
00000732_005.png    2
00021009_001.png    2
00005869_001.png    2
Name: Image, Length: 93, dtype: int64

In [78]:
from sklearn.model_selection import train_test_split
import os
import shutil

In [71]:
train, test = train_test_split(excluded.Image.unique(), test_size=0.2)

In [79]:
labels = ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltrate', 'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax']
def convert_to_yolo(image_width, image_height, top_left_x, top_left_y, width, height):
    x_center = top_left_x + width / 2.0
    y_center = top_left_y + height / 2.0

    x_center /= image_width
    y_center /= image_height
    width /= image_width
    height /= image_height

    return x_center, y_center, width, height

In [84]:
def process(data, main_df, base_folder,output_folder, image_folder):
    idx = 0
    for item in data:
        filtered_samples = main_df[main_df['Image'] == item]
        src_image_path = os.path.join(image_folder, item)
        dst_image_path = os.path.join(base_folder,output_folder,"images",item)
        dst_label_path = os.path.join(base_folder,output_folder, "labels", item.replace(".png", ".txt"))
        print(idx,src_image_path, dst_image_path, dst_label_path)
        idx+=1
        shutil.copy(src_image_path, dst_image_path)
        write_data = ""
        
        image = cv2.imread(src_image_path)
        img_h, img_w, _ = image.shape
        for idx,row in filtered_samples.iterrows():
            label = labels.index(row.Label)
            top_left_x = row.x
            top_left_y = row.y
            width = row.w
            height = row.h
            
            xc, yc, w, h = convert_to_yolo(image_width=img_w, image_height=img_h, top_left_x=top_left_x,
                                       top_left_y=top_left_y, width=width, height=height)
            write_data+=f"{label} {xc} {yc} {w} {h}\n"
        
        with open(dst_label_path, "w") as f:
            f.write(write_data)
        print(write_data.strip())
            
process(train, excluded, base_folder="/data/yolo_dataset", output_folder="train", image_folder="/data/images")
print('----------------------------------------------------------------------------------------')
process(test, excluded, base_folder="/data/yolo_dataset", output_folder="val", image_folder="/data/images")

0 /data/images/00000732_005.png /data/yolo_dataset/train/images/00000732_005.png /data/yolo_dataset/train/labels/00000732_005.txt
1 0.6191737288135596 0.6215572033898306 0.40254237288135547 0.3368644067796611
7 0.6838888888888887 0.1586481560601133 0.16888888888888867 0.10111111111111133
183 /data/images/00017582_003.png /data/yolo_dataset/train/images/00017582_003.png /data/yolo_dataset/train/labels/00017582_003.txt
0 0.348148148148148 0.46349206349206395 0.1947089947089951 0.13968253968254005
2 0.16084656084656054 0.48571428571428565 0.1121693121693125 0.30264550264550294
73 /data/images/00012376_010.png /data/yolo_dataset/train/images/00012376_010.png /data/yolo_dataset/train/labels/00012376_010.txt
3 0.3394444444444443 0.5086481560601133 0.22222222222222265 0.25
4 0.785 0.41253704494900145 0.12222222222222266 0.413333333333333
7 0.42444444444444485 0.1675370449490015 0.11666666666666699 0.054444444444444434
182 /data/images/00018253_054.png /data/yolo_dataset/train/images/00018253_