In [19]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

# Download data
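First, make sure you have run the `fetch_data.sh` script to download the image data (for example, `bash fetch_data.sh --type coco --output garbage_coco`); the loading cells below expect the downloaded folders to exist.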

# Load Image Data and Annotations
The image part of the annotations file contains information about the images:


* the file name of the image
* the width and height of the image

The annotations part of the annotations file contains:

* the image ID
* the category ID, which is effectively the class label
* the bounding box coordinates
* the segmentation (always empty, because this dataset does not include segmentation masks)

We will create a dataframe that has both of these.

In [21]:
def get_coco_dataframe(root: str, split: str) -> pd.DataFrame:
    """Creates a pandas dataframe with data from the COCO annotations file.

    Every annotation is paired with the image it belongs to by matching the
    annotation's ``image_id`` against the image's ``id``. Images that have no
    annotation are dropped by the inner join.

    :arg
        root (str) - the root directory of the coco dataset.
        split (str) - the split folder to look at, train/test/valid.

    :return
        full_df (pd.DataFrame) - one row per annotation, indexed by the
            annotation ``id``, holding the image fields (``image_id``,
            ``file_name``, ...) followed by the annotation fields.
    """
    annotations_path = os.path.join(root, split, "_annotations.coco.json")

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(annotations_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    # An image's own "id" is what annotations refer to as "image_id". Renaming
    # it gives both tables one unambiguous join key, so the image "id" and the
    # annotation "id" (unrelated counters) can never be matched to each other.
    df_images = pd.DataFrame(json_data["images"]).rename(columns={"id": "image_id"})
    df_annotations = pd.DataFrame(json_data["annotations"])

    full_df = df_images.merge(df_annotations, how="inner", on="image_id")

    # Keep an index named "id", as before; it now identifies the annotation.
    return full_df.set_index("id")

In [22]:
# Build one annotation dataframe per split of the COCO export.
# The "garbage_coco" folder must already exist on disk (it is the output
# folder of `fetch_data.sh --type coco --output garbage_coco`).
train_data = get_coco_dataframe(root="garbage_coco", split="train")
test_data = get_coco_dataframe(root="garbage_coco", split="test")
val_data = get_coco_dataframe(root="garbage_coco", split="valid")

In [8]:
# Download the COCO-format export and extract it into ./garbage_coco
# (the script's own log below reports the download URL and target folder).
!bash fetch_data.sh --type coco --output garbage_coco

Downloading full coco data from https://universe.roboflow.com/ds/vunUxYLq9j?key=jDCfG8KbT0
Data downloaded and extracted into garbage_coco


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0
  0   885    0     0    0     0      0      0 --:--:--  0:00:04 --:--:--     0
100   885  100   885    0     0    201      0  0:00:04  0:00:04 --:--:--   201

  0     0    0     0    0     0      0      0 --:--:--  0:00:06 --:--:--     0
  0  274M    0 81912    0     0  11175      0  7:08:33  0:00:07  7:08:26  112k
  0  274M    0  885k    0     0   107k      0  0:43:38  0:00:08  0:43:30  538k
  0  274M    0 1774k    0     0   191k      0  0:24:24  0:00:09  0:24:15  671k
  0  274M    0 2318k    0     0   225k      0  0:20:42  0:00:10  0:20:32  636k
  1  274M    1 2990k    0     0   265k      0  0:17:37  0:00:11  0:17:26  642k
  1  274M    1 3470k    0     0   283k      0  0:1

In [31]:
# Download the YOLO-format export and extract it into ./garbage_yolo_full
# (the script's own log below reports the download URL and target folder).
!bash fetch_data.sh --type yolo --output garbage_yolo_full

yolo dataset
Downloading full yolo data from https://universe.roboflow.com/ds/UoC75yslyT?key=V3X5ZOBCmH
Data downloaded and extracted into garbage_yolo_full


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0
  0   894    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0
100   894  100   894    0     0    227      0  0:00:03  0:00:03 --:--:--   227

  0     0    0     0    0     0      0      0 --:--:--  0:00:06 --:--:--     0
  0  274M    0 49146    0     0   7226      0 11:03:05  0:00:06 11:02:59 88711
  0  274M    0 1038k    0     0   133k      0  0:35:00  0:00:07  0:34:53  682k
  0  274M    0 2078k    0     0   237k      0  0:19:44  0:00:08  0:19:36  824k
  1  274M    1 3038k    0     0   311k      0  0:15:02  0:00:09  0:14:53  862k
  1  274M    1 4094k    0     0   380k      0  0:12:18  0:00:10  0:12:08  905k
  1  274M    1 5214k    0     0   441k      0  0:1

In [79]:
import glob

def get_yolo_labels(root_dir: str, split: str, image_ext: str = ".jpg") -> pd.DataFrame:
    """Builds a per-image binary label table from the YOLO label files of a split.

    :arg
        root_dir (str) - the root directory of the yolo dataset.
        split (str) - the split folder to look at, train/test/valid.
        image_ext (str) - extension appended to each label file's stem to
            form the image file name (defaults to ".jpg").

    :return
        labels_df (pd.DataFrame) - columns "file_name" and "label"; label is 1
            when the label file has at least one non-blank line and 0
            otherwise. Rows are sorted by label file path.
    """
    # Build the pattern from path components so it works with any OS path
    # separator, and escape the directory part so glob metacharacters that
    # happen to be in the dataset path are treated literally.
    labels_dir = os.path.join(root_dir, split, "labels")
    pattern = os.path.join(glob.escape(labels_dir), "*.txt")

    # glob returns files in arbitrary, OS-dependent order; sort for
    # reproducible output across runs and machines.
    label_paths = sorted(glob.glob(pattern))

    records = []
    for label_path in label_paths:
        stem = os.path.splitext(os.path.basename(label_path))[0]
        with open(label_path, "r") as file:
            # Blank / whitespace-only lines carry no annotation, so a file
            # holding only a trailing newline still counts as "no object".
            has_object = any(line.strip() for line in file)
        records.append((stem + image_ext, int(has_object)))

    return pd.DataFrame(records, columns=["file_name", "label"])

In [80]:
# Derive the per-image binary labels for every split of the YOLO export
# stored in the "garbage_yolo_full" folder.
train_labels = get_yolo_labels(root_dir="garbage_yolo_full", split="train")
test_labels = get_yolo_labels(root_dir="garbage_yolo_full", split="test")
val_labels = get_yolo_labels(root_dir="garbage_yolo_full", split="valid")

In [83]:
# Attach the binary YOLO-derived label to the COCO rows of each split.
# The join key is the image file name; only rows present on both sides are kept.
full_train_data = pd.merge(train_data, train_labels, how="inner", on="file_name")
full_test_data = pd.merge(test_data, test_labels, how="inner", on="file_name")
full_val_data = pd.merge(val_data, val_labels, how="inner", on="file_name")