In [2]:
import pandas as pd
import os
from PIL import Image

def get_dataset(dataset_path, labels_path):
    if not os.path.exists(dataset_path):
        print("Dataset not found")
        return None
    dataset = []

    for i, file in enumerate(os.listdir(dataset_path)):
        if not file.endswith('.png'):
            continue
        csv_file = labels_path + '/' + file[:-4] + '.csv'
        if not os.path.exists(csv_file):
            print(f"Labels file {csv_file} not found")
            continue
        res_df = pd.read_csv(csv_file)
        image_path = os.path.join(dataset_path, file)
        y = res_df['y'].to_numpy()
        image = Image.open(image_path).convert('RGB')
        width = image.width
        height = image.height
        text = ""
        for y_i in y:
            center_x = width / 2
            center_y = y_i + 75
            text += f'0 {center_x / width} {center_y / height} 1 {150/height}\n'
        dataset.append((file, image_path, text))
    return pd.DataFrame(dataset, columns=['file', 'image_path', 'text'])

dataset = get_dataset('raw_dataset', 'synthetic_labels')
dataset

Labels file synthetic_labels/n06-156.csv not found
Labels file synthetic_labels/g06-042f.csv not found
Labels file synthetic_labels/b03-098.csv not found
Labels file synthetic_labels/n02-000.csv not found
Labels file synthetic_labels/n02-028.csv not found
Labels file synthetic_labels/g07-022b.csv not found
Labels file synthetic_labels/b03-104.csv not found
Labels file synthetic_labels/f04-100.csv not found
Labels file synthetic_labels/b06-082.csv not found
Labels file synthetic_labels/n06-182.csv not found
Labels file synthetic_labels/n06-169.csv not found
Labels file synthetic_labels/g06-042e.csv not found
Labels file synthetic_labels/g06-018f.csv not found
Labels file synthetic_labels/n06-186.csv not found
Labels file synthetic_labels/b06-045.csv not found
Labels file synthetic_labels/p02-000.csv not found
Labels file synthetic_labels/n02-004.csv not found
Labels file synthetic_labels/b06-087.csv not found
Labels file synthetic_labels/h02-017.csv not found
Labels file synthetic_label

Unnamed: 0,file,image_path,text
0,g06-042g.png,raw_dataset/g06-042g.png,0 0.5 0.05587840858292356 1 0.0670540902995082...
1,h02-004.png,raw_dataset/h02-004.png,0 0.5 0.08393285371702638 1 0.0719424460431654...
2,n06-194.png,raw_dataset/n06-194.png,0 0.5 0.08012820512820513 1 0.0686813186813186...
3,c04-044.png,raw_dataset/c04-044.png,0 0.5 0.08208255159474671 1 0.0703564727954971...
4,n02-016.png,raw_dataset/n02-016.png,0 0.5 0.05868544600938967 1 0.0704225352112676...


In [6]:
import shutil

output_folder = 'yolo_dataset_v2'

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

if not os.path.exists(output_folder + '/train'):
    os.makedirs(output_folder + '/train')

if not os.path.exists(output_folder + '/test'):
    os.makedirs(output_folder + '/test')
    
cut = int(len(dataset) * 0.8)

for i in range(len(dataset)):
    file, image_path, text = dataset.iloc[i]
    folder_path = output_folder + '/train'
    if i >= cut:
        folder_path = output_folder + '/test'
    with open(folder_path + '/' + file[:-4] + '.txt', 'w') as f:
        f.write(text)
    shutil.copy(image_path, folder_path + '/' + file)