In [1]:
import os

In [2]:
# Parallel label / image folders, one entry per dataset split (test, train, valid).
# The two lists are consumed pairwise, so their order must stay aligned.
label_dirs = [
    "taco/test/labels",
    "taco/train/labels",
    "taco/valid/labels",
]
image_dirs = [
    "taco/test/images",
    "taco/train/images",
    "taco/valid/images",
]

# Class IDs whose annotations survive the cleanup
classes_to_keep = {
    3,  # Can
    4,  # Carton
    5,  # Cigarette
    7,  # Lid
}

In [3]:
# 0: Aluminium foil
# 1: Bottle
# 2: Bottle cap
# 3: Can
# 4: Carton
# 5: Cigarette
# 6: Glass bottle
# 7: Lid
# 8: Metal
# 9: Metal cap
# 10: Other litter
# 11: Other plastic
# 12: Paper
# 13: Plastic bag
# 14: Plastic buoy
# 15: Plastic vessels
# 16: Pop tab
# 17: Straw
# 18: Styrofoam cup
# 19: Styrofoam piece
# 20: Styrofoam_Buoy
# 21: Wrapper

In [4]:
def clean_labels_and_images(label_dir, image_dir, keep_classes=None, image_exts=(".jpg",)):
    """Filter label files in place and delete samples that contain no wanted class.

    Every ``.txt`` file in ``label_dir`` is read line by line. The first
    whitespace-separated token of each line is parsed as an integer class ID.
    Lines whose class ID is in ``keep_classes`` are kept verbatim (class IDs are
    NOT remapped); all other lines, including blank ones, are dropped.

    * If at least one line survives, the label file is rewritten with only the
      surviving lines (the write is skipped when nothing changed).
    * If no line survives, the matching image(s) and the label file are deleted.

    WARNING: this is destructive and operates directly on the given directories.

    Args:
        label_dir: Directory containing the ``.txt`` label files.
        image_dir: Directory containing the images that share a file stem with
            the label files.
        keep_classes: Iterable of integer class IDs to keep. Defaults to the
            notebook-level ``classes_to_keep`` when omitted.
        image_exts: Image extension(s), including the leading dot, to look for
            when deleting the image that belongs to a removed label file.
            A single string is accepted as well.

    Raises:
        ValueError: If the first token of a non-blank label line is not an
            integer. The offending file is left unmodified because parsing
            finishes before anything is written or deleted.
    """
    if keep_classes is None:
        # Fall back to the notebook-wide setting so existing two-argument calls
        # behave exactly as before.
        keep_classes = classes_to_keep
    keep_classes = set(keep_classes)

    # Iterating a bare string would yield single characters, so wrap it.
    if isinstance(image_exts, str):
        image_exts = (image_exts,)

    for filename in os.listdir(label_dir):
        if not filename.endswith(".txt"):
            continue

        label_path = os.path.join(label_dir, filename)
        # Strip only the trailing extension; str.replace would also rewrite any
        # ".txt" occurring earlier in the name and point at the wrong image.
        stem = os.path.splitext(filename)[0]
        image_paths = [os.path.join(image_dir, stem + ext) for ext in image_exts]

        with open(label_path, "r") as file:
            lines = file.readlines()

        cleaned_lines = []
        for line in lines:
            parts = line.split()
            if not parts:
                # Blank / whitespace-only line: nothing to classify, drop it.
                continue
            if int(parts[0]) in keep_classes:
                cleaned_lines.append(line)

        if cleaned_lines:
            # Avoid touching the file when every line was already wanted.
            if cleaned_lines != lines:
                with open(label_path, "w") as file:
                    file.writelines(cleaned_lines)
            print(f"Kept label file with desired classes: {label_path}")
        else:
            # Remove images first: if that fails, the label file is still there,
            # so a re-run retries instead of leaving an orphaned image behind.
            removed_images = []
            for image_path in image_paths:
                if os.path.exists(image_path):
                    os.remove(image_path)
                    removed_images.append(image_path)
            os.remove(label_path)

            if removed_images:
                print(f"Deleted label and image: {label_path}, {', '.join(removed_images)}")
            else:
                print(f"Deleted label (no matching image found): {label_path}")

In [5]:
# Walk the dataset splits, cleaning each labels/images folder pair in turn.
# WARNING: this rewrites and deletes files on disk.
for dir_pair in zip(label_dirs, image_dirs):
    label_dir, image_dir = dir_pair
    print(f"Processing directory: {label_dir}")
    clean_labels_and_images(label_dir=label_dir, image_dir=image_dir)

print("Dataset cleaning completed.")

Processing directory: taco/test/labels
Deleted label and image: taco/test/labels\20231231_165249_jpg.rf.4692b3a0eb6f6c1ebb376d0e278550ca.txt, taco/test/images\20231231_165249_jpg.rf.4692b3a0eb6f6c1ebb376d0e278550ca.jpg
Deleted label and image: taco/test/labels\20231231_165341_jpg.rf.58cc71782a042ef24de7499b16cac711.txt, taco/test/images\20231231_165341_jpg.rf.58cc71782a042ef24de7499b16cac711.jpg
Deleted label and image: taco/test/labels\20231231_165416_jpg.rf.b5b83d02adad55a865a0a80d379b4c69.txt, taco/test/images\20231231_165416_jpg.rf.b5b83d02adad55a865a0a80d379b4c69.jpg
Deleted label and image: taco/test/labels\20231231_170745_jpg.rf.fb1f2d9a3d3af5b1116771288e526b80.txt, taco/test/images\20231231_170745_jpg.rf.fb1f2d9a3d3af5b1116771288e526b80.jpg
Deleted label and image: taco/test/labels\20231231_171245_jpg.rf.aba1c46e8dd58d23ff8414a8dbd10ffd.txt, taco/test/images\20231231_171245_jpg.rf.aba1c46e8dd58d23ff8414a8dbd10ffd.jpg
Deleted label and image: taco/test/labels\20231231_171301_jpg