# Importing Packages

In [16]:
import os
import shutil
import random
from pathlib import Path
import yaml

# --------------------------------------------------------------------------------------

##### Checking the total number of images in the dataset

In [4]:
# Destination root for the generated splits. The trailing slash is required
# because later cells build sub-paths with plain string concatenation.
yolo_data_Set = "../data/yolo-dataset/"

# The two source releases (same trailing-slash convention as above).
original_data_path2 = "../data/external/public_validation_set_2.0/"
original_data_path1 = "../data/external/public_training_set_release_2.0/"

In [5]:
# Count every directory entry in the "images" folder of both source releases.
total_num_images = sum(
    len(os.listdir(release_root + "images"))
    for release_root in (original_data_path1, original_data_path2)
)
print("Total Number of Images:", total_num_images)

Total Number of Images: 40962


##### Determining the ratio of the train, test, and val datasets

In [10]:
# Share of the full dataset intended for each split.
# Note: the split cell only reads train_ratio and val_ratio; the test split
# receives whatever remains after those two slices.
train_ratio, test_ratio, val_ratio = 0.7, 0.15, 0.15

##### Determining all valid image-label pairs

In [12]:
# Original data paths
original_data_paths = [
    "../data/external/public_training_set_release_2.0",
    "../data/external/public_validation_set_2.0"
]

# Target base path
yolo_dataset_path = "../data/yolo-dataset"

# Collect all (image, label) pairs. An image is kept only when a label file
# with the same stem and a ".txt" extension exists in the labels folder.
all_data = []

for path in original_data_paths:
    images_dir = os.path.join(path, "images")
    labels_dir = os.path.join(path, "annotations-segmentation-normalized")

    # os.listdir() returns entries in arbitrary order, so sort the listing:
    # all_data is then built in the same order on every run and machine.
    for img_file in sorted(os.listdir(images_dir)):
        # Extension match is case-insensitive; anything else is ignored.
        if img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
            label_file = os.path.splitext(img_file)[0] + ".txt"
            img_path = os.path.join(images_dir, img_file)
            label_path = os.path.join(labels_dir, label_file)
            if os.path.exists(label_path):
                all_data.append((img_path, label_path))

print(f"Total valid image-label pairs found: {len(all_data)}")

Total valid image-label pairs found: 40962


##### Shuffle data and compute split indices

In [13]:
# Fixed seed so the train/valid/test assignment is identical on every run.
SPLIT_SEED = 42

# Shuffle data.
# Sorting first makes the result independent of the order in which the pairs
# were collected and of any earlier shuffle, so re-running this cell always
# yields the same split. A dedicated Random instance keeps the global
# random-module state untouched.
all_data.sort()
random.Random(SPLIT_SEED).shuffle(all_data)

# Compute split indices
total = len(all_data)
train_end = int(total * train_ratio)
val_end = train_end + int(total * val_ratio)

# The test split takes everything left after the train and valid slices.
train_data = all_data[:train_end]
val_data = all_data[train_end:val_end]
test_data = all_data[val_end:]

# Sanity check: the three splits must cover every pair exactly once.
assert len(train_data) + len(val_data) + len(test_data) == total

##### Copy images/labels to yolo-dataset directory

In [15]:
def copy_data(data, split_name, dest_root=None):
    """Copy image/label pairs into the folders of one dataset split.

    Images go to ``<dest_root>/<split_name>/images`` and labels to
    ``<dest_root>/<split_name>/labels``. Both folders are created when they
    do not exist yet, so the function also works on a fresh checkout.

    Files keep their base name, so two pairs sharing a base name overwrite
    each other in the destination. Files already present in the destination
    folders are not removed.

    Args:
        data: Iterable of ``(img_path, label_path)`` tuples.
        split_name: Name of the split sub-folder, e.g. ``"train"``.
        dest_root: Dataset root to copy into. When ``None`` (the default),
            the notebook-level ``yolo_dataset_path`` is used.
    """
    if dest_root is None:
        dest_root = yolo_dataset_path

    images_dir = os.path.join(dest_root, split_name, "images")
    labels_dir = os.path.join(dest_root, split_name, "labels")

    # shutil.copy does not create missing parent folders, so create them here.
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    for img_path, label_path in data:
        img_name = os.path.basename(img_path)
        label_name = os.path.basename(label_path)

        shutil.copy(img_path, os.path.join(images_dir, img_name))
        shutil.copy(label_path, os.path.join(labels_dir, label_name))

# Copy each split into its own folder, in train / valid / test order.
for split_folder, split_pairs in (
    ("train", train_data),
    ("valid", val_data),
    ("test", test_data),
):
    copy_data(split_pairs, split_folder)

##### Checking the size of the train, test, and val datasets

In [6]:
# Count the entries in each split's images folder.
train_dataset_size, test_dataset_size, val_dataset_size = (
    len(os.listdir(yolo_data_Set + images_subdir))
    for images_subdir in ("train/images", "test/images", "valid/images")
)

# Report the three sizes, one per line.
for caption, split_size in (
    ("Train Dataset Size:", train_dataset_size),
    ("Test Dataset Size:", test_dataset_size),
    ("Valid Dataset Size:", val_dataset_size),
):
    print(caption, split_size)

Train Dataset Size: 28673
Test Dataset Size: 6145
Valid Dataset Size: 6144


##### Creating a data.yaml file

In [23]:
# Read the class names, one per line; list position becomes the class index.
# The encoding is set explicitly so the result does not depend on the
# platform default.
# NOTE(review): assumes food_classes.txt is UTF-8 encoded — confirm.
with open("../data/external/food_classes.txt", encoding="utf-8") as file:
    food_classes = file.read().split("\n")

# Drop every trailing blank line, not just one. A file ending in several
# newlines would otherwise add empty class names and inflate the class count.
# The emptiness guard also covers a file with no content at all.
while food_classes and food_classes[-1].strip() == "":
    food_classes.pop()

In [24]:
# Assemble the dataset description. Keys are inserted in the order they
# should appear in the file, because the dump below does not sort them.
data_yaml = {}
data_yaml['train'] = 'train/images'
data_yaml['val'] = 'valid/images'
data_yaml['test'] = 'test/images'
data_yaml['nc'] = len(food_classes)
data_yaml['names'] = food_classes

# Write to YAML file
with open('../data/yolo-dataset/data.yaml', 'w') as yaml_file:
    yaml.dump(data_yaml, stream=yaml_file, sort_keys=False)