In [3]:
import os
import cv2
import pandas as pd
import shutil
from PIL import Image, ImageOps

In [4]:
labels1 = pd.read_csv("../../statues-train/statues_labels.csv",)
labels2 = pd.read_csv("../../statues-train/statues_labels2.csv", 
                      names=["filename", "xmin", "ymin", "xmax", "ymax", "class"], sep=";")

In [5]:
labels1.head()

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,Balashikha_0204.JPG,1296,864,lenin,683,198,709,230
1,New_Mexico_947.jpg,1404,936,other,619,205,648,234
2,Skopje_037.JPG,1128,752,other,457,34,509,97
3,Skopje_037.JPG,1128,752,other,414,267,480,346
4,Skopje_037.JPG,1128,752,other,715,287,789,349


In [6]:
labels2.head()

Unnamed: 0,filename,xmin,ymin,xmax,ymax,class
0,Acipayam_043.jpg,2814,1065,2955,1233,ataturk
1,Acipayam_044.jpg,2851,738,3015,948,ataturk
2,Acipayam_047.jpg,2189,1513,2465,1805,ataturk
3,Acipayam_048.jpg,2066,816,2318,1168,ataturk
4,Acipayam_050.jpg,4153,1532,4240,1634,ataturk


In [8]:
train_split = .8
save_dir = "data2"
for folder in ["images", "labels"]:
    for phase in ["train", "val"]:
        os.makedirs(f"{save_dir}/{folder}/{phase}", exist_ok=True)

In [9]:
total = 0
folder_path = "../../statues-train/"
for root, _, files in os.walk(folder_path):
    for file in files:
        if not file.lower().endswith(".jpg"):
            continue
        total += 1

In [11]:
# bad images
bad_image_files = [
                    "Atlanta_069.jpg",
                    "Bath_068.jpg",
                    "Cesky_Krumlov_087.jpg",
                    "Istra_005.jpg",
                    "Izhevsk_010.jpg",
                    "London_2019_092.jpg",
                    "London_2019_610.jpg",
                    "Lyubertsy_024.jpg",
                    "Moscow_2017_1465.jpg",
                    "Mytishchi_007.jpg",
                    "Nizhny_Novgorod_2018_041.jpg",
                    "Nizhny_Novgorod_2018_181.jpg",
                    "Odintsovo_031.jpg",
                    "Odintsovo_032.jpg",
                    "Oxford_027.jpg",
                    "Paris_2019_093.jpg",
                    "Paris_2019_227.jpg",
                    "Petropavl_037.jpg",
                    "Rouen_034.jpg",
                    "Sligo_049.jpg",
                    "Tambov_067.jpg",
                    "Odintsovo_031.jpg",
                    "Odintsovo_032.jpg"
                    ]

In [12]:
class_dict = {
                "other"  : 0,
                "lenin"  : 1,
                "ataturk": 2
             }

In [13]:
classwise_total = {
    "lenin": len(labels1.loc[labels1["class"] == "lenin"]["filename"].unique()),
    "other": len(labels1.filename.unique()) - len(labels1.loc[labels1["class"] == "lenin"]["filename"].unique()),
    "ataturk": len(labels2),
    "background": total - len(labels1.filename.unique()) - len(labels2.filename.unique())
}
classwise_data_split = {"train": {}, "val": {}}

for root, _, files in os.walk(folder_path):
    for file in files:
        if not file.lower().endswith(".jpg") or file in bad_image_files:
            continue
        
        phase = "val"
        filename = file.split(".")[0]

        if file in list(labels1["filename"]):
            labels = labels1.loc[labels1["filename"] == file]["class"].to_list()
            label = "lenin" if labels.count("lenin") >= labels.count("other") else "other"
            phase = "train" if len(classwise_data_split["train"].get(label, [])) <= int(
                classwise_total[label] * train_split) else "val"
            classwise_data_split[phase][label
                                        ] = classwise_data_split[phase].get(label, []) + [file]
            
        elif file in list(labels2["filename"]):
            phase = "train" if len(classwise_data_split["train"].get("ataturk", [])) <= int(
                classwise_total["ataturk"] * train_split) else "val"
            classwise_data_split[phase]["ataturk"
                                        ] = classwise_data_split[phase].get("ataturk", []) + [file]
        else:
            phase = "train" if len(classwise_data_split["train"].get("background", [])) <= int(
                classwise_total["background"] * train_split) else "val"
            classwise_data_split[phase]["background"
                                        ] = classwise_data_split[phase].get("background", []) + [file]

print("Total:", classwise_total)
print("Train:", {k: len(v) for k, v in classwise_data_split['train'].items()})
print("Val:", {k: len(v) for k, v in classwise_data_split['val'].items()})


Total: {'lenin': 350, 'other': 315, 'ataturk': 121, 'background': 67}
Train: {'other': 253, 'background': 54, 'lenin': 281, 'ataturk': 97}
Val: {'other': 54, 'ataturk': 20, 'background': 11, 'lenin': 58}


In [14]:
data_distribution = {"train": {}, "val": {}}
for root, _, files in os.walk(folder_path):
    for file in files:
        if not file.lower().endswith(".jpg") or file in bad_image_files:
            continue
        ann_filename = f"{os.path.splitext(file)[0]}.txt"

        phase = "train"
        for temp in ["train", "val"]:
            for k, v in classwise_data_split[temp].items():
                if file in v:
                    phase = temp
                    break
                
        img_path = os.path.join(root, file)
        filename = file.split(".")[0]

        shutil.copy(img_path, f"{save_dir}/images/{phase}/{file}")
        f = open(f"{save_dir}/labels/{phase}/{filename}.txt", "w+")

        if file in list(labels1["filename"]):
            anns = labels1.loc[labels1["filename"] == file]

            for i, ann in anns.iterrows():
                width, height = ann["width"], ann["height"]
                x1 = ann["xmin"] / width
                y1 = ann["ymin"] / height
                x2 = ann["xmax"] / width
                y2 = ann["ymax"] / height

                w = abs(x2 - x1)
                h = abs(y2 - y1)
                
                x_center = min(x1, x2) + w / 2.
                y_center = min(y1, y2) + h / 2.

                class_id = class_dict[ann["class"]]
                data_distribution[phase][ann["class"]
                                         ] = data_distribution[phase].get(ann["class"], 0) + 1
                line = f"{class_id} {x_center} {y_center} {w} {h}\n"
                f.write(line)
                
        elif file in list(labels2["filename"]):
            img = cv2.imread(img_path)
            height, width = img.shape[:2]
            anns = labels2.loc[labels2["filename"] == file]
            for i, ann in anns.iterrows():
                xmin = float(ann["xmin"]) / width
                ymin = float(ann["ymin"]) / height
                xmax = float(ann["xmax"]) / width
                ymax = float(ann["ymax"]) / height

                w = abs(xmin - xmax)
                h = abs(ymin - ymax) 

                x_center = min(xmin, xmax) + (w / 2.)
                y_center = min(ymin, ymax) + (h / 2.)

                class_id = class_dict["ataturk"]
                data_distribution[phase]["ataturk"
                                         ] = data_distribution[phase].get("ataturk", 0) + 1
                line = f"{class_id} {x_center} {y_center} {w} {h}\n"
                f.write(line)
        else:
            data_distribution[phase]["background"
                                         ] = data_distribution[phase].get("background", 0) + 1

print(data_distribution)

{'train': {'other': 386, 'background': 54, 'lenin': 285, 'ataturk': 101}, 'val': {'other': 114, 'lenin': 71, 'ataturk': 20, 'background': 11}}
