In [1]:
import ultralytics
ultralytics.checks()
import json
import os
import shutil
from pprint import pprint
from pathlib import Path

Ultralytics 8.3.146  Python-3.11.9 torch-2.6.0+cu118 CUDA:0 (GeForce GTX 1650, 4096MiB)
Setup complete  (12 CPUs, 15.9 GB RAM, 150.7/931.5 GB disk)


Only run this notebook once, after downloading the dataset!

It inspects the labels and annotations, cleans up the COCO annotation JSON, recomputes each bounding box from the segmentation coordinates, and stores the result in `clean_annotations/annotations.json`.

In [None]:
# how to get COCO formatted data: 
# download the dataset from https://datasetninja.com/mju-waste#download and place it in the folder 'data/mju-waste-COCO'
# clone the instances.json files from https://github.com/realwecan/mju-waste and place in folder 'data/mju-waste-COCO/annotations'

#! ⚠️ IMPORTANT: bbox is NOT in the right location, we will construct it ourselves from the segmentation coordinates

In [3]:
# inspect annotations file
def show_first_two_per_category(json_path):
    """
    Print the first two entries of each top-level list in a JSON file.

    Useful for quickly inspecting the structure and content of a COCO-style
    annotations JSON file. A header is printed for every top-level key; keys
    whose value is a list get their first two entries pretty-printed, all
    other keys are reported as skipped.

    Args:
        json_path (str or Path): Path to the JSON file to inspect.

    Returns:
        None. All output goes to stdout. If `json_path` is not an existing
        file, a "File not found" message is printed and nothing is raised.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
    """
    json_path = Path(json_path)

    # is_file() (rather than exists()) so a directory path gets the same
    # friendly message instead of failing inside open().
    if not json_path.is_file():
        print(f"File not found: {json_path}")
        return

    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding (e.g. cp1252 on Windows).
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Only a JSON object has named top-level entries to iterate over.
    if not isinstance(data, dict):
        print(f"Top-level JSON value is a {type(data).__name__}, not an object; nothing to show.")
        return

    for key, value in data.items():
        print(f"\n--- {key.upper()} (showing first 2 entries) ---")
        if isinstance(value, list):
            for item in value[:2]:
                pprint(item)
        else:
            print(f"{key} is not a list, skipping.")


show_first_two_per_category(
    json_path=Path("../..", "data", "mju-waste-COCO", "annotations", "train.json")
)



--- INFO (showing first 2 entries) ---
info is not a list, skipping.

--- LICENSES (showing first 2 entries) ---
licenses is not a list, skipping.

--- CATEGORIES (showing first 2 entries) ---
{'id': 0, 'name': 'Rubbish', 'supercategory': 'Waste'}

--- ANNOTATIONS (showing first 2 entries) ---
{'area': 11013.695949999998,
 'bbox': [450, 255, 137, 204],
 'category_id': 0,
 'id': 1621,
 'image_id': 1617,
 'iscrowd': 0,
 'segmentation': [318.11,
                  188.75,
                  332.25,
                  324.48,
                  414.96,
                  315.99,
                  395.17,
                  180.27,
                  318.11,
                  188.04]}
{'area': 13896.986949999991,
 'bbox': [403, 228, 149, 198],
 'category_id': 0,
 'id': 1622,
 'image_id': 1618,
 'iscrowd': 0,
 'segmentation': [287.01,
                  164.71,
                  385.98,
                  162.59,
                  390.22,
                  296.2,
                  285.6,
           

In [5]:
# Combines the train, val, and test JSON files from the MJU Waste dataset into a single COCO-style JSON file.
# Image records are copied unchanged (de-duplicated by image id across splits).
# Each annotation is reduced to: id (renumbered from 0), image_id, category_id (set to 0 for 'trash'),
# segmentation, area, iscrowd and bbox.
# ⚠️IMPORTANT: the bbox stored in the source files is discarded; this cell recomputes it
# from the segmentation coordinates as [x_min, y_min, width, height].


def bbox_from_segmentation(segmentation):
    """
    Compute a COCO-style bounding box from polygon segmentation coordinates.

    Args:
        segmentation (list): Either a flat list [x1, y1, x2, y2, ...] or a
            list of such lists (one per polygon).

    Returns:
        list: [x_min, y_min, width, height]. x_min and y_min are taken from the
        coordinates as-is; width and height are rounded to 2 decimals.

    Raises:
        ValueError: If `segmentation` is not a list/tuple or does not contain
            at least one x and one y coordinate.
    """
    if not isinstance(segmentation, (list, tuple)):
        raise ValueError(f"unsupported segmentation type: {type(segmentation).__name__}")

    # Treat a flat list as a single polygon so both layouts share one code path.
    is_nested = bool(segmentation) and isinstance(segmentation[0], (list, tuple))
    polygons = segmentation if is_nested else [segmentation]

    xs, ys = [], []
    for polygon in polygons:
        xs.extend(polygon[0::2])  # even positions hold x coordinates
        ys.extend(polygon[1::2])  # odd positions hold y coordinates

    if not xs or not ys:
        raise ValueError("segmentation contains no (x, y) coordinates")

    x_min, y_min = min(xs), min(ys)
    width = round(max(xs) - x_min, 2)
    height = round(max(ys) - y_min, 2)
    return [x_min, y_min, width, height]


# Set paths
input_dir = Path("../..") / "data" / "mju-waste-COCO" / "annotations"
output_file = Path("../..") / "data" / "mju-waste-COCO" / "clean_annotations" / "annotations.json"
output_file.parent.mkdir(parents=True, exist_ok=True)

# Files to process
splits = ['train', 'val', 'test']

combined_images = []
combined_annotations = []
annotation_id = 0  # annotations are renumbered consecutively across all splits
image_ids_seen = set()

for split in splits:
    input_file = input_dir / f"{split}.json"

    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with input_file.open('r', encoding='utf-8') as f:
        data = json.load(f)

    # Keep each image record once, even if the same id shows up in several splits.
    for img in data.get("images", []):
        if img["id"] not in image_ids_seen:
            combined_images.append(img)
            image_ids_seen.add(img["id"])

    for ann in data.get("annotations", []):
        try:
            bbox = bbox_from_segmentation(ann['segmentation'])
        except ValueError as err:
            # Re-raise with enough context to locate the offending record.
            raise ValueError(f"{input_file}: annotation id {ann.get('id')}: {err}") from err

        cleaned_ann = {
            'id': annotation_id,
            'image_id': ann['image_id'],
            'category_id': 0,  # unify to 'trash'
            'segmentation': ann['segmentation'],
            'area': ann['area'],
            'iscrowd': ann.get('iscrowd', 0),
            # bbox rebuilt from the segmentation, replacing the one in the source file
            'bbox': bbox,
        }
        combined_annotations.append(cleaned_ann)
        annotation_id += 1

# Set single category at the end
categories = [{"id": 0, "name": "trash"}]

# Build and save final dataset
cleaned_data = {
    'images': combined_images,
    'annotations': combined_annotations,
    'categories': categories
}

with output_file.open('w', encoding='utf-8') as f:
    json.dump(cleaned_data, f)

print(f"Saved combined cleaned annotations to {output_file}")
print(f"Total images: {len(combined_images)}")
print(f"Total annotations: {len(combined_annotations)}")

Saved combined cleaned annotations to ..\..\data\mju-waste-COCO\clean_annotations\annotations.json
Total images: 2475
Total annotations: 2532


In [13]:
# Investigate mismatches between the image list and the image_ids referenced by the annotations
from collections import Counter  # imported up front rather than inside a branch

image_ids_in_images = {img["id"] for img in combined_images}
image_ids_in_annotations = {ann["image_id"] for ann in combined_annotations}

# image_ids that annotations point to but that have no image record
missing_image_ids = image_ids_in_annotations - image_ids_in_images

# Number of annotations per missing image_id (empty when nothing is missing)
missing_counts = Counter(
    ann["image_id"] for ann in combined_annotations if ann["image_id"] in missing_image_ids
)

if missing_image_ids:
    print(f"⚠️ {len(missing_image_ids)} annotation image_ids are missing in images!")
    # sorted() so the sample shown is stable between runs (a set has no defined order)
    print(f"Missing image_ids (up to 10 shown): {sorted(missing_image_ids)[:10]}")
    print(f"Counts of annotations per missing image_id (up to 5 shown): {missing_counts.most_common(5)}")
else:
    print("✅ All annotation image_ids are present in the image list.")

# Total number of annotations that reference a missing image
num_annotations_missing_image = sum(missing_counts.values())
print(f"Annotations referencing missing images: {num_annotations_missing_image}")


✅ All annotation image_ids are present in the image list.
Annotations referencing missing images: 0


In [6]:
# inspect the cleaned annotations file
show_first_two_per_category(
    Path("../..", "data", "mju-waste-COCO", "clean_annotations", "annotations.json")
)


--- IMAGES (showing first 2 entries) ---
{'coco_url': '',
 'date_captured': '2019-11-21 16:19:37',
 'file_name': '2019-09-19_16_19_32-29_color.png',
 'flickr_url': '',
 'height': 480,
 'id': 1617,
 'license': 1,
 'width': 640}
{'coco_url': '',
 'date_captured': '2019-11-21 16:19:37',
 'file_name': '2019-09-19_16_19_44-93_color.png',
 'flickr_url': '',
 'height': 480,
 'id': 1618,
 'license': 1,
 'width': 640}

--- ANNOTATIONS (showing first 2 entries) ---
{'area': 11013.695949999998,
 'bbox': [318.11, 180.27, 96.85, 144.21],
 'category_id': 0,
 'id': 0,
 'image_id': 1617,
 'iscrowd': 0,
 'segmentation': [318.11,
                  188.75,
                  332.25,
                  324.48,
                  414.96,
                  315.99,
                  395.17,
                  180.27,
                  318.11,
                  188.04]}
{'area': 13896.986949999991,
 'bbox': [284.89, 161.18, 105.33, 139.97],
 'category_id': 0,
 'id': 1,
 'image_id': 1618,
 'iscrowd': 0,
 'segment