# Generate data from annotated data

Download the annotations file in YOLO PyTorch format, unzip it, then copy the extracted folder and rename it to `annotated_data/`.

Run each code block in order, making sure the file paths match your local setup.

In [1]:
import os
import csv
import cv2
import glob

## Training data

In [2]:
def process_train(csv_file_path, test_labels_csv, image_folder_path, output_images, output_images_test, output_annotations, output_annotations_test):
    """Copy annotated images into train/test folders and write their label files.

    Every ``*.jpg`` in ``image_folder_path`` is read with ``cv2.imread`` and
    written with ``cv2.imwrite`` as ``<number>.jpg``, where ``<number>`` is the
    integer parsed from the part of the file name before the first ``.`` and
    ``_`` (an optional leading ``p`` is removed). Images whose number appears
    in ``test_labels_csv`` go to the test output folders, all others to the
    train output folders.

    For each image that has bounding boxes in ``csv_file_path``, a
    ``<number>.txt`` file is written with one space-separated row per box:
    ``class x_center y_center width height``, with coordinates normalised by
    the image width/height taken from the CSV (YOLO-style labels).

    Args:
        csv_file_path: Annotation CSV with a header row followed by rows of
            ``image_name, width, height, class, xmin, ymin, xmax, ymax``.
        test_labels_csv: CSV with a header row whose first column holds the
            ids (optionally prefixed with ``p``) of the test images.
        image_folder_path: Folder containing the source ``*.jpg`` images.
        output_images: Destination folder for train images.
        output_images_test: Destination folder for test images.
        output_annotations: Destination folder for train label files.
        output_annotations_test: Destination folder for test label files.

    Raises:
        ValueError: If an annotation row does not have exactly eight fields
            or a numeric field cannot be parsed.
    """
    # Class names in the annotation CSV -> numeric class ids in the label
    # files. Unknown class names are written through unchanged.
    class_ids = {"potholes": 0, "L": 1}

    for folder in (output_images, output_annotations, output_images_test, output_annotations_test):
        os.makedirs(folder, exist_ok=True)

    with open(test_labels_csv, "r", newline="") as test_csv_file:
        test_rows = csv.reader(test_csv_file)
        next(test_rows, None)  # skip header
        # Blank rows (e.g. a trailing newline) are ignored instead of crashing.
        test_image_ids = {int(row[0].removeprefix("p")) for row in test_rows if row}

    # Maps the image key (e.g. "p101") to the list of ALL its boxes; an image
    # may contain several annotated objects, each becoming one label row.
    annotations_dict = {}

    with open(csv_file_path, "r", newline="") as csv_file:
        annotation_rows = csv.reader(csv_file)
        next(annotation_rows, None)  # skip header
        for row in annotation_rows:
            if not row:
                continue

            image_name, img_width, img_height, class_name, xmin, ymin, xmax, ymax = row
            image_key = image_name.split(".")[0].split("_")[0]
            class_id = class_ids.get(class_name, class_name)

            img_width, img_height = int(img_width), int(img_height)
            xmin, ymin, xmax, ymax = int(xmin), int(ymin), int(xmax), int(ymax)

            box = [
                class_id,
                (xmin + xmax) / 2.0 / img_width,
                (ymin + ymax) / 2.0 / img_height,
                (xmax - xmin) / img_width,
                (ymax - ymin) / img_height,
            ]
            annotations_dict.setdefault(image_key, []).append(box)

    for image_path in glob.glob(os.path.join(image_folder_path, "*.jpg")):
        image_name = os.path.basename(image_path)
        image_key = image_name.split(".")[0].split("_")[0]
        image_number = int(image_key.removeprefix("p"))

        if image_number in test_image_ids:
            images_dir, labels_dir = output_images_test, output_annotations_test
        else:
            images_dir, labels_dir = output_images, output_annotations

        image = cv2.imread(image_path)
        cv2.imwrite(os.path.join(images_dir, f"{image_number}.jpg"), image)

        boxes = annotations_dict.get("p" + str(image_number))
        if boxes:
            output_annotation_path = os.path.join(labels_dir, f"{image_number}.txt")
            with open(output_annotation_path, 'w', newline='') as outfile:
                csv.writer(outfile, delimiter=' ').writerows(boxes)

    print("Processing complete!")


### Train data

In [3]:
# Inputs/outputs shared by every split
test_labels_csv = "data/test_labels.csv"
output_image_folder_train = "data_2/train_images/"
output_annotations = "data_2/train_annotations/"
output_image_folder_test = "data_2/test_images/"
output_annotations_test = "data_2/test_annotations/"

# Each folder under annotated_data/ holds its images plus an _annotations.csv;
# they are processed one after another into the same output folders.
for split_name in ("train", "valid", "test", "their_test"):
    split_folder = f"annotated_data/{split_name}/"
    process_train(
        f"{split_folder}_annotations.csv",
        test_labels_csv,
        split_folder,
        output_image_folder_train,
        output_image_folder_test,
        output_annotations,
        output_annotations_test,
    )

Processing complete!
Processing complete!
Processing complete!
Processing complete!


In [4]:
import csv, glob, os
 
train_labels_path = "data/train_labels.csv"
output_train_labels_path = "data_2/train_labels.csv"
image_folder_path = "annotated_data/train/"

# Load the existing labels: tab-separated, header row first, then
# (pothole number, bags) pairs keyed here by the integer pothole number.
with open(train_labels_path, "r") as labels_file:
    label_rows = csv.reader(labels_file, delimiter="\t")
    next(label_rows)  # Skip header
    labels_dict = {int(pothole_number): bags for pothole_number, bags in label_rows}

# Write one row per image found under image_folder_path that has a label.
# Only this folder is scanned; images without a matching label are skipped.
with open(output_train_labels_path, "w", newline='') as output_file:
    label_writer = csv.writer(output_file)
    label_writer.writerow(["Pothole number", "Bags used"])

    for jpg_path in glob.glob(os.path.join(image_folder_path, "*.jpg")):
        # File names start with "p<number>_"; keep just the number.
        name_prefix = os.path.basename(jpg_path).rsplit("_")[0]
        pothole_id = int(name_prefix.removeprefix("p"))
        if pothole_id not in labels_dict:
            continue
        label_writer.writerow([pothole_id, labels_dict[pothole_id]])

In [5]:
# Where the test images live and where the label stub is written
test_image_folder = "data_2/test_images/"
output_test_labels_path = "data_2/test_labels.csv"

# Make sure the folder holding the output file exists
os.makedirs(os.path.dirname(output_test_labels_path), exist_ok=True)

# One row per test image: the file name without ".jpg" and an empty
# "Bags used" cell (None is written as an empty field by csv.writer).
test_rows = [
    [os.path.basename(jpg_path).replace(".jpg", ""), None]
    for jpg_path in glob.glob(os.path.join(test_image_folder, "*.jpg"))
]

with open(output_test_labels_path, "w", newline='') as output_file:
    label_writer = csv.writer(output_file)
    label_writer.writerow(["Pothole number", "Bags used"])
    label_writer.writerows(test_rows)

print("Processing complete!")

Processing complete!
