In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pprint import pprint

from additional_info import sub_dir, json_reader, json_writer

In [None]:
# Threshold controlling how many sample entries are printed before a long
# listing gets truncated.
output_line_limit = 10

# Name of the dataset sub-folder (inside the data directory) explored here.
selected_dataset = "coco"

# 1. Initializing dataset directory

In [None]:
# Both COCO folders live under <data dir>/<selected dataset>.
dataset_root = os.path.join(sub_dir['dir_data'], selected_dataset)

# "train2017" holds the images, "annotations" holds the annotation files.
image_dir = os.path.join(dataset_root, "train2017")
annotation_dir = os.path.join(dataset_root, "annotations")

print(
    f"Image directory = {image_dir}",
    f"Annotation directory = {annotation_dir}",
    sep="\n",
)

# 2. Number of files in image and annotation directory [first look]

In [None]:
# List each folder once so the counts and the previews below always agree.
image_dir_files = os.listdir(image_dir)
annotation_dir_files = os.listdir(annotation_dir)
print("Number of images = ", len(image_dir_files))
print("Number of annotation files  = ", len(annotation_dir_files))


def describe_file(position, directory, file_name):
    """Return one numbered listing line for `file_name` with its size in KB.

    position  -- 1-based number shown in front of the entry
    directory -- folder that contains the file
    file_name -- name of the file inside `directory`
    """
    file_size = round(os.path.getsize(os.path.join(directory, file_name)) / 1024, ndigits=2)
    return "\t{}. File name = {} :: size = {} kb".format(position, file_name, file_size)


print("\nSome files from image folder and their size :: ")
# Preview exactly the first `output_line_limit` images ...
for i, file in enumerate(image_dir_files[:output_line_limit]):
    print(describe_file(i + 1, image_dir, file))

# ... and only when the listing was actually truncated, print an ellipsis
# followed by the very last image (so it is never shown twice).
if len(image_dir_files) > output_line_limit:
    print("\t....................................................\n" * 3)
    print(describe_file(len(image_dir_files), image_dir, image_dir_files[-1]))

# The annotation folder is listed in full.
print("\nFiles from annotation folder :: ")
for i, file in enumerate(annotation_dir_files):
    print(describe_file(i + 1, annotation_dir, file))

### Since we have downloaded only the train images of the COCO dataset for EDA, we only need the instances_train2017.json file. So, from now on, we will be working with these two things:

- **image_dir** = contains 118287 images from coco dataset

- **annotation** = contains all the information (which we will read below) from the **instances_train2017.json** [largest file present in annotation folder, **448 MB**]

# 3. Viewing annotation file ["instances_train2017.json"]

In [None]:
# Read "instances_train2017.json" from the annotation folder into `annotation`
# using the project's json_reader helper.
annotation_file = os.path.join(annotation_dir, "instances_train2017.json")
annotation = json_reader(annotation_file)

In [None]:
# `annotation` was read from a JSON file, i.e. it is a dictionary structure,
# so start by looking at the keys present in it.
print("Keys present in annotation = ", annotation.keys())

# For every key, show the number of entries and one sample value,
# for better understanding of the structure.
for i, key in enumerate(annotation.keys()):
    value = annotation[key]
    print(f"\n{i+1}. Key = {key}\n\t Number of entries :: {len(value)}")
    print("\t First value :: ")
    # For list sections only the first element is shown (an empty list is
    # printed as-is instead of failing on [0]); any other value is printed whole.
    sample = value[0] if isinstance(value, list) and value else value
    pprint(sample, indent=25)

### The annotation file consists of five sections that together describe the entire dataset. [source: https://cocodataset.org/#format-data]
- **info**     – general information about the dataset.

- **licenses** – license information for the images in the dataset.

- **images**   – a list of images in the dataset.

- **annotations** – a list of annotations (including bounding boxes) that are present in all images in the dataset.

- **categories**  – a list of label categories.

    ### Structure of images in annotation["images"]
    - **id**         – A unique identifier for the image. The id field maps to the id field in the annotations array (where bounding box information is stored).

    - **license**    – Maps to the license array.

    - **coco_url**   – The location of the image.

    - **flickr_url** – The location of the image on Flickr.

    - **width**      – The width of the image.

    - **height**     – The height of the image.

    - **file_name**  – The image file name. In this example, file_name and id match, but this is not a requirement for COCO datasets.

    - **date_captured** – the date and time the image was captured.
    
    ### Structure of annotations in annotation["annotations"]
    - **id** – The identifier for the annotation.

    - **image_id** –  Corresponds to the image id in the images array.

    - **category_id** –  The identifier for the label that identifies the object within a bounding box. It maps to the id field of the categories array.

    - **iscrowd** – Specifies whether the annotation covers a crowd (group) of objects (1) or a single object instance (0).

    - **segmentation** – Segmentation information for objects on an image.

    - **area** – The area of the annotation.

    - **bbox** –  Contains the coordinates, in pixels, of a bounding box around an object on the image. **[top left x, top left y,width,height]**
    ### Structure of categories in annotation["categories"]
    - **supercategory** – The parent category for a label.

    - **id** – The label identifier. The id field maps to the category_id field in an annotation object.

    - **name** – the label name.

# 4. Overview of annotations

In [None]:
# After the first look at the annotation file, print the dataset-level facts
# stored in its "info" section together with the size of each section.
print("Date created :: ", annotation['info']['date_created'])
print("Contributor  :: ", annotation['info']['contributor'])
print("URL          :: ", annotation['info']['url'])
print(
    "Image count  :: ",
    len(annotation['images']),
    # Kept as a single literal: a backslash continuation inside the string
    # breaks as soon as any whitespace follows the backslash.
    end="\t\t (This is equal to the number of images present in directory [check section 2])\n",
)
print("Annotation count :: ", len(annotation['annotations']))
print("Number of categories (classes) :: ", len(annotation['categories']))


def print_numbered_rows(names, per_row, width):
    """Print `names` as a numbered listing with `per_row` entries per line.

    names   -- sequence of strings to print
    per_row -- number of entries placed on one output line
    width   -- minimum field width each name is padded to

    Every entry is rendered as "<n>. <name>" and entries are separated by
    tabs. The last row may hold fewer entries, so any number of names works.
    """
    for start in range(0, len(names), per_row):
        row = names[start:start + per_row]
        print("\t" + "\t".join(
            f"{start + offset + 1}. {name:{width}}" for offset, name in enumerate(row)
        ))


# Retrieving name of parent categories and classes from annotation file.
# The dicts serve as insertion-ordered sets: duplicates collapse, order is kept.
parent_category = {}
classes = {}
for annotation_info in tqdm(annotation['categories']):
    parent_category[annotation_info['supercategory']] = 1
    classes[annotation_info['name']] = 1

parent_category = list(parent_category.keys())
classes = list(classes.keys())

print("Name of categories :: ")
print_numbered_rows(classes, per_row=5, width=15)

print("\nNumber of parent categories :: ", len(parent_category))
print("Name of parent categories :: ")
print_numbered_rows(parent_category, per_row=3, width=5)


### As we can see, we have about 118K (1 lakh+) images with about 860K (8.6 lakh) annotations, which are spread over 80 different classes.

### This means that an image can have multiple annotations (of the same or of different classes).

# 5. Relation between images, annotations and categories in the annotation file


| images  &nbsp;&nbsp;&nbsp;         | annotations &nbsp;&nbsp;&nbsp;   | categories  |
| -----------                       | -----------                      | ----------- |
| **id** &nbsp;&nbsp;&nbsp;         | **image_id** &nbsp;&nbsp;&nbsp;  | name        |
| license  &nbsp;&nbsp;&nbsp;       |  id &nbsp;&nbsp;&nbsp;           | supercategory       |
| coco_url &nbsp;&nbsp;&nbsp;       | ***category_id*** &nbsp;&nbsp;&nbsp;   | ***id***      |
| date_captured &nbsp;&nbsp;&nbsp;     | iscrowd  &nbsp;&nbsp;&nbsp;      | -----       |
| flickr_url &nbsp;&nbsp;&nbsp;     | segmentation &nbsp;&nbsp;&nbsp;  | -----       |
| width  &nbsp;&nbsp;&nbsp;         | area  &nbsp;&nbsp;&nbsp;         | -----       |
| height &nbsp;&nbsp;&nbsp;         | bbox &nbsp;&nbsp;&nbsp;          | -----       |
| file_name  &nbsp;&nbsp;&nbsp;     | -----  &nbsp;&nbsp;&nbsp;        | -----       |


### Here we can see that:
- **id** in images is mapped to **image_id** in annotations.
- ***id*** in categories is mapped to ***category_id*** in annotations.