In [1]:
import warnings
import fiftyone as fo
import json
import tensorflow as tf
from tensorflow import keras
from keras import layers
import numpy as np
from fiftyone.types import COCODetectionDataset
from fiftyone import ViewField as F
warnings.filterwarnings('ignore')
dataset_dir = "/Users/adrienmonks/CS5990/FinalProject/fiftyone/coco-2017"

train_dir = f"{dataset_dir}/train2017"
val_dir = f"{dataset_dir}/val2017"
test_dir = f"{dataset_dir}/test2017"
annotations_dir = f"{dataset_dir}/annotations/annotations"

In [29]:
with open(f"{annotations_dir}/instances_train2017.json", "r") as f:
    annotations = json.load(f)

In [ ]:
print("loading train dataset...")
train_dataset = fo.Dataset.from_dir(
    dataset_type=COCODetectionDataset,
    data_path=train_dir,
    labels_path=f"{annotations_dir}/instances_train2017.json",
)

In [6]:
# saving training data
train_dataset.name = "train_dataset" 
train_dataset.save()

In [ ]:
print("Loading train dataset from saved state...")
train_dataset = fo.load_dataset("train_dataset")


In [ ]:
print("loading val dataset...")
val_dataset = fo.Dataset.from_dir(
    dataset_type=COCODetectionDataset,
    data_path=val_dir,
    labels_path=f"{annotations_dir}/instances_val2017.json",
)

In [ ]:
val_dataset.name = "val_dataset"
val_dataset.save()

In [4]:
print("Loading val dataset from saved state...")
val_dataset = fo.load_dataset("val_dataset")


Loading val dataset from saved state...


In [ ]:
print("loading test dataset...")
test_dataset = fo.Dataset.from_dir(
    dataset_type=COCODetectionDataset,
    data_path=test_dir,
)


In [ ]:
#filter by animals 

train_animal_images = train_dataset.filter_labels("detections", F("supercategory") == "animal")
val_animal_images = val_dataset.filter_labels("detections", F("supercategory") == "animal")

print(len(train_animal_images))

In [ ]:
test_animal_images = test_dataset.filter_labels("detections", F("supercategory") == "animal")

In [10]:
print(len(train_dataset))

118287


In [10]:
#launch fiftyone session 
session = fo.launch_app(train_animal_images)

In [8]:
session.close()

In [ ]:
from sklearn.model_selection import train_test_split
import numpy as np

# Extract unique categories for animals from the filtered train dataset
# Create unique labels from 'label' field
unique_labels = list(
    set(
        detection["label"]
        for sample in train_animal_images.select_fields("detections")
        if sample["detections"] is not None and "detections" in sample["detections"] 
        for detection in sample["detections"]["detections"]
    )
)
num_classes = len(unique_labels)
label_to_int = {label: idx for idx, label in enumerate(unique_labels)}  # Create mapping correctly 


# Convert COCO annotations for train, val datasets to NumPy-compatible structures
def extract_images_and_labels(dataset, label_to_int):
    image_paths = []
    labels = []

    for sample in dataset:
        filepath = sample.filepath
        # Ensure 'detections' exists and is not None - doesn't exist for a couple
        if sample["detections"] is None or "detections" not in sample["detections"]:
            continue

        detections = sample["detections"]["detections"]
        if not detections:  # Skip samples with no detections
            continue

        for detection in detections:
            label = detection["label"] 
            if label in label_to_int:  # Only animals
                image_paths.append(filepath)
                labels.append(label_to_int[label])

    return image_paths, labels


print("Extracting train images and labels...")
train_images, train_labels = extract_images_and_labels(train_animal_images, label_to_int)

print("Extracting val images and labels...")
val_images, val_labels = extract_images_and_labels(val_animal_images, label_to_int)




In [28]:
print(unique_labels)

['cow', 'zebra', 'horse', 'sheep', 'cat', 'bear', 'bird', 'elephant', 'giraffe', 'dog']


In [13]:
from ultralytics import YOLO

In [14]:
#model = YOLO('yolov8l.pt') # large
#model = YOLO('yolov8x.pt') #extra large, most accurate, but slower
model = YOLO("yolov8s.pt") #small

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt to 'yolov8s.pt'...


100%|██████████| 21.5M/21.5M [00:01<00:00, 11.8MB/s]


In [14]:
print(unique_labels)

['cow', 'zebra', 'horse', 'sheep', 'cat', 'bear', 'bird', 'elephant', 'giraffe', 'dog']


In [2]:
with open(f"{annotations_dir}/instances_val2017.json", "r") as f:
    val_annotations = json.load(f)

In [26]:
import os
import shutil
yolo_dataset_dir = "./fiftyone/yolo-dataset"

# Delete the directory
if os.path.exists(yolo_dataset_dir):
    shutil.rmtree(yolo_dataset_dir)
    print(f"Deleted existing YOLO dataset at {yolo_dataset_dir}")
else:
    print(f"No existing YOLO dataset found at {yolo_dataset_dir}")

Deleted existing YOLO dataset at ./fiftyone/yolo-dataset


In [39]:
import os
import json
from tqdm import tqdm
from shutil import copyfile

coco_dir = "./fiftyone/coco-2017" 
output_dir = "./fiftyone/yolo-dataset"  # Output directory for YOLO dataset

import shutil

if os.path.exists(output_dir):
    shutil.rmtree(output_dir)  # Delete the folder and its contents

os.makedirs(output_dir, exist_ok=True)

def convert_coco_to_yolo(annotations, image_dir, output_dir, subset_name, unique_labels):
    """
    Converts COCO annotations to YOLO format and organizes the dataset structure.
    """
    # Create directories for YOLO dataset
    subset_img_dir = os.path.join(output_dir, subset_name, "images")
    subset_lbl_dir = os.path.join(output_dir, subset_name, "labels")
    os.makedirs(subset_img_dir, exist_ok=True)
    os.makedirs(subset_lbl_dir, exist_ok=True)

    # Mapping category IDs to YOLO class indices for unique_labels only
    categories = annotations["categories"]
    cat_id_to_name = {cat["id"]: cat["name"] for cat in categories if cat["name"] in unique_labels}
    # Ensure the order of YOLO indices matches unique_labels
    cat_id_to_yolo = {cat["id"]: unique_labels.index(cat["name"]) for cat in categories if cat["name"] in unique_labels}
        
    # Map image IDs to filenames and dimensions
    img_id_to_file = {img["id"]: img["file_name"] for img in annotations["images"]}
    img_id_to_dims = {img["id"]: (img["width"], img["height"]) for img in annotations["images"]}

    valid_image_ids = set() 

    # Process annotations
    for ann in tqdm(annotations["annotations"], desc=f"Converting {subset_name}"):
        img_id = ann["image_id"]
        category_id = ann["category_id"]
        bbox = ann["bbox"]  # [x_min, y_min, width, height]

        if category_id not in cat_id_to_yolo:
            continue  # Skip non-relevant categories within image

        # Get image dimensions
        img_width, img_height = img_id_to_dims[img_id]
        x_min, y_min, box_width, box_height = bbox
        x_center = (x_min + box_width / 2) / img_width
        y_center = (y_min + box_height / 2) / img_height
        norm_width = box_width / img_width
        norm_height = box_height / img_height

        # Create YOLO label line
        label_line = f"{cat_id_to_yolo[category_id]} {x_center} {y_center} {norm_width} {norm_height}\n"

        # Write label to corresponding .txt file
        label_path = os.path.join(subset_lbl_dir, f"{os.path.splitext(img_id_to_file[img_id])[0]}.txt")
        with open(label_path, "a") as f:
            f.write(label_line)

        # Mark this image as valid
        valid_image_ids.add(img_id)

    # Copy only valid images
    for img_id in valid_image_ids:
        src_img_path = os.path.join(image_dir, img_id_to_file[img_id])
        dest_img_path = os.path.join(subset_img_dir, img_id_to_file[img_id])
        if os.path.exists(src_img_path):
            copyfile(src_img_path, dest_img_path)


# Convert train and val datasets
convert_coco_to_yolo(
    annotations=annotations,
    image_dir=os.path.join(coco_dir, "train2017"),
    output_dir=output_dir,
    subset_name="train",
    unique_labels=unique_labels
)


convert_coco_to_yolo(
    annotations=val_annotations,
    image_dir=os.path.join(coco_dir, "val2017"),
    output_dir=output_dir,
    subset_name="val",
    unique_labels=unique_labels
)

# Create data.yaml
data_yaml_path = os.path.join(output_dir, "data.yaml")
with open(data_yaml_path, "w") as f:
    f.write(f"train: {os.path.abspath(os.path.join(output_dir, 'train/images'))}\n")
    f.write(f"val: {os.path.abspath(os.path.join(output_dir, 'val/images'))}\n")
    f.write(f"nc: {len(unique_labels)}\n")
    f.write(f"names: {unique_labels}\n")

print("YOLO dataset created successfully!")


Converting train: 100%|██████████| 860001/860001 [00:06<00:00, 124543.44it/s]
Converting val: 100%|██████████| 36781/36781 [00:00<00:00, 73729.98it/s]


YOLO dataset created successfully!


In [6]:
import os
from tqdm import tqdm
from shutil import copyfile

output_dir = "./fiftyone/yolo-dataset/test"  # Output directory for YOLO-compatible test dataset
os.makedirs(output_dir, exist_ok=True)

# Create YOLO test dataset structure
subset_img_dir = os.path.join(output_dir, "images")
subset_lbl_dir = os.path.join(output_dir, "labels")
os.makedirs(subset_img_dir, exist_ok=True)
os.makedirs(subset_lbl_dir, exist_ok=True)

# Ensure unique labels mapping exists
# Replace with your actual mapping of categories to indices
label_to_int = {label: idx for idx, label in enumerate(set(y_val_split))}

# Convert `X_val_split` and `y_val_split` into YOLO format
def convert_to_yolo_format(image_paths, labels, output_img_dir, output_lbl_dir, label_to_int):
    for img_path, label in tqdm(zip(image_paths, labels), desc="Converting test set", total=len(image_paths)):
        # Copy the image to the test images directory
        dest_img_path = os.path.join(output_img_dir, os.path.basename(img_path))
        copyfile(img_path, dest_img_path)

        # Generate the YOLO label file
        label_file = os.path.join(output_lbl_dir, f"{os.path.splitext(os.path.basename(img_path))[0]}.txt")
        
        # If label includes bounding boxes, modify this block
        with open(label_file, "w") as f:
            if isinstance(label, list):  # Multiple detections
                for bbox in label:
                    # For bounding box labels, format as "<class_id> <x_center> <y_center> <width> <height>"
                    f.write(" ".join(map(str, bbox)) + "\n")
            else:  # Single detection (just class_id for now)
                f.write(f"{label_to_int[label]} 0.5 0.5 0.2 0.2\n")  # Placeholder bbox

convert_to_yolo_format(
    X_val_split, y_val_split, subset_img_dir, subset_lbl_dir, label_to_int
)

print("YOLO test dataset created successfully!")


Directories created: ./fiftyone/yolo-dataset/test/images, ./fiftyone/yolo-dataset/test/labels


In [3]:
#2014 conversions
dataset_dir2 = "/Users/adrienmonks/CS5990/FinalProject/fiftyone/coco-2014"
with open(f"{dataset_dir2}/annotations/instances_val2014.json","r") as f:
    annotations2 = json.load(f)


In [4]:
print(len(annotations2))

5


In [22]:
print("loading val dataset2...")
val_dataset2 = fo.Dataset.from_dir(
    dataset_type=COCODetectionDataset,
    data_path="/Users/adrienmonks/CS5990/FinalProject/fiftyone/coco-2014/val2014",
    labels_path="/Users/adrienmonks/CS5990/FinalProject/fiftyone/coco-2014/annotations/instances_val2014.json",
)

loading val dataset2...
 100% |█████████████| 40504/40504 [14.6m elapsed, 0s remaining, 125.8 samples/s]      


In [23]:
val_dataset2.name = "val_dataset2"
val_dataset2.save()

In [5]:
print("Loading val2 dataset from saved state...")
val_dataset2 = fo.load_dataset("val_dataset2")

Loading val2 dataset from saved state...


In [6]:
val_animal_images2 = val_dataset2.filter_labels("detections", F("supercategory") == "animal")
print(len(val_animal_images2))

8265


In [14]:
unique_labels = ['cow', 'zebra', 'horse', 'sheep', 'cat', 'bear', 'bird', 'elephant', 'giraffe', 'dog']
label_to_int = {label: idx for idx, label in enumerate(unique_labels)}  # Create mapping

In [16]:

# Convert COCO annotations 
def extract_images_and_labels(dataset, label_to_int):
    image_paths = []
    labels = []

    for sample in dataset:
        filepath = sample.filepath
        # Ensure 'detections' exists and is not None
        if sample["detections"] is None or "detections" not in sample["detections"]:
            continue

        detections = sample["detections"]["detections"]
        if not detections:  # Skip samples with no detections
            continue

        for detection in detections:
            label = detection["label"]  # Use the 'label' field for class names
            if label in label_to_int:  # Filter relevant categories
                image_paths.append(filepath)
                labels.append(label_to_int[label])

    return image_paths, labels



In [17]:
val_images2, val_labels2 = extract_images_and_labels(val_animal_images2, label_to_int)

In [18]:
import os
import json
from tqdm import tqdm
from shutil import copyfile

# Paths relative to FinalProject
coco_dir = "./fiftyone/coco-2014"  # Your COCO dataset root directory
output_dir = "./fiftyone/yolo-dataset-test"  # Output directory for YOLO dataset

import shutil

# Clean the dataset directory
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)  # Delete the folder and its contents

os.makedirs(output_dir, exist_ok=True)

def convert_coco_to_yolo(annotations, image_dir, output_dir, subset_name, unique_labels):
    """
    Converts COCO annotations to YOLO format and organizes the dataset structure.
    """
    # Create directories for YOLO dataset
    subset_img_dir = os.path.join(output_dir, subset_name, "images")
    subset_lbl_dir = os.path.join(output_dir, subset_name, "labels")
    os.makedirs(subset_img_dir, exist_ok=True)
    os.makedirs(subset_lbl_dir, exist_ok=True)

    # Mapping category IDs to YOLO class indices for unique_labels only
    categories = annotations["categories"]
    cat_id_to_name = {cat["id"]: cat["name"] for cat in categories if cat["name"] in unique_labels}
    # Ensure the order of YOLO indices matches unique_labels
    cat_id_to_yolo = {cat["id"]: unique_labels.index(cat["name"]) for cat in categories if cat["name"] in unique_labels}
        
    # Map image IDs to filenames and dimensions
    img_id_to_file = {img["id"]: img["file_name"] for img in annotations["images"]}
    img_id_to_dims = {img["id"]: (img["width"], img["height"]) for img in annotations["images"]}

    valid_image_ids = set()  # Track images with valid annotations

    # Process annotations
    for ann in tqdm(annotations["annotations"], desc=f"Converting {subset_name}"):
        img_id = ann["image_id"]
        category_id = ann["category_id"]
        bbox = ann["bbox"]  # [x_min, y_min, width, height]

        if category_id not in cat_id_to_yolo:
            continue  # Skip non-relevant categories

        # Get image dimensions
        img_width, img_height = img_id_to_dims[img_id]
        x_min, y_min, box_width, box_height = bbox
        x_center = (x_min + box_width / 2) / img_width
        y_center = (y_min + box_height / 2) / img_height
        norm_width = box_width / img_width
        norm_height = box_height / img_height

        # Create YOLO label line
        label_line = f"{cat_id_to_yolo[category_id]} {x_center} {y_center} {norm_width} {norm_height}\n"

        # Write label to corresponding .txt file
        label_path = os.path.join(subset_lbl_dir, f"{os.path.splitext(img_id_to_file[img_id])[0]}.txt")
        with open(label_path, "a") as f:
            f.write(label_line)

        # Mark this image as valid
        valid_image_ids.add(img_id)

    # Copy only valid images
    for img_id in valid_image_ids:
        src_img_path = os.path.join(image_dir, img_id_to_file[img_id])
        dest_img_path = os.path.join(subset_img_dir, img_id_to_file[img_id])
        if os.path.exists(src_img_path):
            copyfile(src_img_path, dest_img_path)



convert_coco_to_yolo(
    annotations=annotations2,
    image_dir=os.path.join(coco_dir, "val2014"),
    output_dir=output_dir,
    subset_name="test",
    unique_labels=unique_labels
)

# Create data.yaml
data_yaml_path = os.path.join(output_dir, "data.yaml")
with open(data_yaml_path, "w") as f:
    f.write(f"test: {os.path.abspath(os.path.join(output_dir, 'test/images'))}\n")
    f.write(f"nc: {len(unique_labels)}\n")
    f.write(f"names: {unique_labels}\n")

print("YOLO dataset created successfully!")


Converting test: 100%|██████████| 291875/291875 [00:01<00:00, 182346.89it/s]


YOLO dataset created successfully!
