### Converting and splitting dataset from datasetsninja.com

In [1]:
import os
import shutil
import random
import cv2

In [5]:
# Raw dataset locations: positive images, negative images and the annotation list
image_dir = "../data/Dataset 1 (Simplex)/Train data/Positive data"
negative_data_dir = "../data/Dataset 1 (Simplex)/Train data/Negative data"
annotation_file = "../data/Dataset 1 (Simplex)/simpleTrainFullPhotosSortedFullAnnotations.txt"

# Staging folders that receive the copied images and the YOLO label files
yolo_images_dir = "../yolo_data/images"
yolo_labels_dir = "../yolo_data/labels"
for staging_dir in (yolo_images_dir, yolo_labels_dir):
    os.makedirs(staging_dir, exist_ok=True)

In [6]:
# Function for converting from COCO format to YOLO
def convert_to_yolo_format(x, y, w, h, image_width, image_height):
    """Convert a pixel box given as ``(x, y, w, h)`` into normalized YOLO form.

    ``x``/``y`` are treated as the box origin and ``w``/``h`` as its extent;
    the box center is computed as origin plus half the extent.

    Returns:
        A tuple ``(x_center, y_center, width, height)`` in which each value
        is divided by the matching image dimension.
    """
    center_x_px = x + w / 2
    center_y_px = y + h / 2
    return (
        center_x_px / image_width,
        center_y_px / image_height,
        w / image_width,
        h / image_height,
    )

#### Processing Negative Images
Processing the negative images separately, as they have no label files.

In [7]:
negative_images = os.listdir(negative_data_dir)

# No annotations exist for the negative images: copy each one into the
# staging folder and pair it with an empty label file of the same stem.
for image_name in negative_images:
    shutil.copy(
        os.path.join(negative_data_dir, image_name),
        os.path.join(yolo_images_dir, image_name),
    )

    stem, _extension = os.path.splitext(image_name)
    with open(os.path.join(yolo_labels_dir, stem + ".txt"), 'w'):
        pass  # opening in write mode is enough to create the empty file

### Dealing with positive data

### Shuffling into train and validation

In [9]:
with open(annotation_file, 'r') as file:
    lines = file.readlines()

# Each annotation line is whitespace-separated: field 2 holds the image path,
# field 3 the number of boxes, followed by four integers (x, y, w, h) per box.
for line in lines:
    parts = line.strip().split()
    if not parts:
        # Blank lines (e.g. a trailing newline at end of file) carry no data
        continue

    image_name = parts[2].replace('.bmp', '.JPG')
    image_name = image_name.split("data\\", 1)[-1]
    num_potholes = int(parts[3])

    # Extract bounding boxes (4 values for each pothole)
    bboxes = [
        (
            int(parts[4 + i * 4]),
            int(parts[5 + i * 4]),
            int(parts[6 + i * 4]),
            int(parts[7 + i * 4]),
        )
        for i in range(num_potholes)
    ]

    # Load the image to obtain its dimensions
    image_path = os.path.join(image_dir, image_name)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising,
        # so fail here with the offending path instead of an AttributeError.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image_height, image_width = image.shape[:2]

    yolo_image_path = os.path.join(yolo_images_dir, image_name)
    shutil.copy(image_path, yolo_image_path)

    # Write one "class x_center y_center width height" row per box (class 0)
    yolo_label_path = os.path.join(yolo_labels_dir, image_name.replace('.JPG', '.txt'))

    with open(yolo_label_path, 'w') as label_file:
        for (x, y, w, h) in bboxes:
            x_center, y_center, width, height = convert_to_yolo_format(x, y, w, h, image_width, image_height)
            label_file.write(f"0 {x_center} {y_center} {width} {height}\n")

# Shuffle the staged images and cut the list 80/20 into train and validation
image_files = os.listdir(yolo_images_dir)
random.shuffle(image_files)

split_index = int(0.8 * len(image_files))
train_images, val_images = image_files[:split_index], image_files[split_index:]

# Destination folders for each split
train_images_dir = "../yolo_data/train/images"
val_images_dir = "../yolo_data/val/images"
train_labels_dir = "../yolo_data/train/labels"
val_labels_dir = "../yolo_data/val/labels"

for split_dir in (train_images_dir, val_images_dir, train_labels_dir, val_labels_dir):
    os.makedirs(split_dir, exist_ok=True)

In [10]:
# Move every image together with its label file into its split's directories.
# The label name is derived with os.path.splitext, the same way the empty
# labels for the negative images are named, so images whose extension is not
# exactly '.JPG' still resolve to the label file that was written for them.
for split_images, images_dir, labels_dir in (
    (train_images, train_images_dir, train_labels_dir),
    (val_images, val_images_dir, val_labels_dir),
):
    for image_name in split_images:
        label_name = os.path.splitext(image_name)[0] + ".txt"
        shutil.move(os.path.join(yolo_images_dir, image_name), os.path.join(images_dir, image_name))
        shutil.move(os.path.join(yolo_labels_dir, label_name), os.path.join(labels_dir, label_name))

print("Dataset is now ready for YOLOv8 training.")

Dataset is now ready for YOLOv8 training.


### Dealing with test data

In [11]:
# Raw test split: image folder and its annotation list
test_image_dir = "../data/Dataset 1 (Simplex)/Test data"
test_annotation_file = "../data/Dataset 1 (Simplex)/simpleTestFullSizeAllPotholesSortedFullAnnotation.txt"

# Output folders for the converted test images and labels
yolo_test_images_dir = "../yolo_data/test/images"
yolo_test_labels_dir = "../yolo_data/test/labels"
for test_dir in (yolo_test_images_dir, yolo_test_labels_dir):
    os.makedirs(test_dir, exist_ok=True)

with open(test_annotation_file, 'r') as file:
    lines = file.readlines()

# Each test annotation line is whitespace-separated: field 1 holds the image
# path, field 2 the number of boxes, followed by four integers per box.
for line in lines:
    parts = line.strip().split()
    if not parts:
        # Blank lines (e.g. a trailing newline at end of file) carry no data
        continue

    image_name = parts[1].replace('.bmp', '.JPG')
    image_name = image_name.split("data\\", 1)[-1]
    num_potholes = int(parts[2])

    # Extract bounding boxes (4 values for each pothole)
    bboxes = [
        (
            int(parts[3 + i * 4]),
            int(parts[4 + i * 4]),
            int(parts[5 + i * 4]),
            int(parts[6 + i * 4]),
        )
        for i in range(num_potholes)
    ]

    # Load the image to obtain its dimensions
    image_path = os.path.join(test_image_dir, image_name)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising,
        # so fail here with the offending path instead of an AttributeError.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image_height, image_width = image.shape[:2]

    yolo_image_path = os.path.join(yolo_test_images_dir, image_name)
    shutil.copy(image_path, yolo_image_path)

    # Write one "class x_center y_center width height" row per box (class 0)
    yolo_label_path = os.path.join(yolo_test_labels_dir, image_name.replace('.JPG', '.txt'))

    with open(yolo_label_path, 'w') as label_file:
        for (x, y, w, h) in bboxes:
            x_center, y_center, width, height = convert_to_yolo_format(x, y, w, h, image_width, image_height)
            label_file.write(f"0 {x_center} {y_center} {width} {height}\n")

print("Test dataset is now ready for YOLOv8 evaluation.")

Test dataset is now ready for YOLOv8 evaluation.
