In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pprint import pprint

from additional_info import sub_dir, json_reader, json_writer

In [None]:
# Name of the dataset sub-folder to analyse; it is used below to build the image and annotation paths.
selected_dataset = "coco"

# Limit on the number of sample lines printed when listing the image files.
output_line_limit = 10

# 1. Initializing dataset directory

In [None]:
# Root folder of the selected dataset; the images and the annotation files
# live in two sub-folders below it.
dataset_root = os.path.join(sub_dir['dir_data'], selected_dataset)
image_dir = os.path.join(dataset_root, "train2017")
annotation_dir = os.path.join(dataset_root, "annotations")

print(f"Image directory = {image_dir}")
print(f"Annotation directory = {annotation_dir}")

# 2. Number of files in image and annotation directory [first look]

In [None]:
def _file_size_kb(directory, file_name):
    """Return the size of `directory/file_name` in kilobytes, rounded to 2 decimals."""
    return round(os.path.getsize(os.path.join(directory, file_name)) / 1024, ndigits=2)


# os.listdir returns entries in arbitrary order; sorting keeps the sample
# output identical between runs. Each folder is listed only once.
image_dir_files = sorted(os.listdir(image_dir))
annotation_dir_files = sorted(os.listdir(annotation_dir))

print("Number of images = ", len(image_dir_files))
print("Number of annotation files  = ", len(annotation_dir_files))

# Show exactly `output_line_limit` sample images (the previous `i > limit`
# check printed limit + 2 of them).
print("\nSome files from image folder and their size :: ")
for position, file_name in enumerate(image_dir_files[:output_line_limit], start=1):
    print("\t{}. File name = {} :: size = {} kb".format(
        position, file_name, _file_size_kb(image_dir, file_name)))

# Only print the separator and the last file when files were actually skipped,
# so the last file is never listed twice.
if len(image_dir_files) > output_line_limit:
    print("\t....................................................\n"*3)
    last_file = image_dir_files[-1]
    print("\t{}. File name = {} :: size = {} kb".format(
        len(image_dir_files), last_file, _file_size_kb(image_dir, last_file)))

print("\nFiles from annotation folder :: ")
for position, file_name in enumerate(annotation_dir_files, start=1):
    print("\t{}. File name = {} :: size = {} kb".format(
        position, file_name, _file_size_kb(annotation_dir, file_name)))

### As we have downloaded the train images of the COCO dataset for EDA, we only require the `instances_train2017.json` file. So, we will be working with these two things from now on:

- **image_dir** = contains 118287 images from coco dataset

- **annotation** = contains all the information (which we will read below) from the **instances_train2017.json** [largest file present in annotation folder, **448 MB**]

# 3. Viewing annotation file ["instances_train2017.json"]

In [None]:
# Load the contents of "instances_train2017.json" into `annotation`.
annotation_file = os.path.join(annotation_dir, "instances_train2017.json")
annotation = json_reader(annotation_file)

In [None]:
# The annotation file is a JSON document loaded as a dictionary, so start by
# looking at the keys present in it.
print("Keys present in annotaion = ", annotation.keys())

# For every key, show the number of entries and one representative value:
# the first element for list sections, the whole value otherwise.
for position, (key, section) in enumerate(annotation.items(), start=1):
    print(f"\n{position}. Key = {key}\n\t Number of entries :: {len(section)}")
    print("\t First value :: ")
    # An empty list has no first element; print the (empty) list itself
    # instead of failing with an IndexError.
    if isinstance(section, list) and section:
        first_value = section[0]
    else:
        first_value = section
    pprint(first_value, indent=25)

### Annotation consists of five sections of information that provide information for the entire dataset. [source: https://cocodataset.org/#format-data]
- **info**     – general information about the dataset.

- **licenses** – license information for the images in the dataset.

- **images**   – a list of images in the dataset.

- **annotations** – a list of annotations (including bounding boxes) that are present in all images in the dataset.

- **categories**  – a list of label categories.

    ### Structure of images in annotations["images"]
    - **id**         – A unique identifier for the image. The id field maps to the id field in the annotations array (where bounding box information is stored).

    - **license**    – Maps to the license array.

    - **coco_url**   – The location of the image.

    - **flickr_url** – The location of the image on Flickr.

    - **width**      – The width of the image.

    - **height**     – The height of the image.

    - **file_name**  – The image file name. In this example, file_name and id match, but this is not a requirement for COCO datasets.

    - **date_captured** – the date and time the image was captured.
    
    ### Structure of annotations in annotations["annotations"]
    - **id** – The identifier for the annotation.

    - **image_id** –  Corresponds to the image id in the images array.

    - **category_id** –  The identifier for the label that identifies the object within a bounding box. It maps to the id field of the categories array.

    - **iscrowd** – Specifies if the image contains a crowd of objects.

    - **segmentation** – Segmentation information for objects on an image.

    - **area** – The area of the annotation.

    - **bbox** –  Contains the coordinates, in pixels, of a bounding box around an object on the image. **[top left x, top left y,width,height]**

    ### Structure of categories in annotation["categories"]
    - **supercategory** – The parent category for a label.

    - **id** – The label identifier. The id field maps to the category_id field in an annotation object.

    - **name** – the label name.

# 4. Overview of annotations

In [None]:
def _print_numbered_rows(names, per_row, width):
    """Print `names` as a numbered, tab-separated grid.

    Each line holds up to `per_row` entries and every name is padded to
    `width` characters. The last line may be shorter, so the number of
    names does not have to be a multiple of `per_row`.
    """
    for start in range(0, len(names), per_row):
        cells = [
            f"{start + offset + 1}. {name:{width}}"
            for offset, name in enumerate(names[start:start + per_row])
        ]
        print("\t" + "\t".join(cells))


# After the first look at the annotation file, summarise what it tells us
# about the coco dataset.
dataset_meta = annotation['info']
print("Date created :: ", dataset_meta['date_created'])
print("Contributor  :: ", dataset_meta['contributor'])
print("URL          :: ", dataset_meta['url'])
# The trailing note is kept in a single string literal; splitting a quoted
# string across source lines is a syntax error.
print("Image count  :: ", len(annotation['images']),
      end="\t\t (This is equal to the number of images present in directory [check section 2])\n")
print("Annotation count :: ", len(annotation['annotations']))
print("Number of categories (classes) :: ", len(annotation['categories']))

# Names of the parent categories and of the classes, de-duplicated while
# keeping the order in which they first appear in the annotation file
# (dict keys preserve insertion order).
parent_category = list(dict.fromkeys(
    category['supercategory'] for category in annotation['categories']))
classes = list(dict.fromkeys(
    category['name'] for category in annotation['categories']))

print("Name of categories :: ")
_print_numbered_rows(classes, per_row=5, width=15)

print("\nNumber of parent categories :: ", len(parent_category))
print("Name of parent categories :: ")
_print_numbered_rows(parent_category, per_row=3, width=5)


### As we can see, we have 1 lakh+ (about 118k) images with roughly 8.6 lakh (860k) annotations, spread over 80 different classes.

### It means that an image can have multiple annotations (of the same or of different classes).

# 5. Relation in the | images | annotations | categories | in the annotation file.


| images  &nbsp;&nbsp;&nbsp;         | annotations &nbsp;&nbsp;&nbsp;   | categories  |
| -----------                       | -----------                      | ----------- |
| **id** &nbsp;&nbsp;&nbsp;         | **image_id** &nbsp;&nbsp;&nbsp;  | name        |
| license  &nbsp;&nbsp;&nbsp;       |  id &nbsp;&nbsp;&nbsp;           | supercategory       |
| coco_url &nbsp;&nbsp;&nbsp;       | ***category_id*** &nbsp;&nbsp;&nbsp;   | ***id***      |
| date_captured &nbsp;&nbsp;&nbsp;     | iscrowd  &nbsp;&nbsp;&nbsp;      | -----       |
| flickr_url &nbsp;&nbsp;&nbsp;     | segmentation &nbsp;&nbsp;&nbsp;  | -----       |
| width  &nbsp;&nbsp;&nbsp;         | area  &nbsp;&nbsp;&nbsp;         | -----       |
| height &nbsp;&nbsp;&nbsp;         | bbox &nbsp;&nbsp;&nbsp;          | -----       |
| file_name  &nbsp;&nbsp;&nbsp;     | -----  &nbsp;&nbsp;&nbsp;        | -----       |


### Here we can see that:
- **id** in images is mapped to **image_id** in annotations.
- ***id*** in categories is mapped to ***category_id*** in annotations.

# 6. Removing data from | images | annotations | to reduce file size.

### We will drop: 
   - `license`, `coco_url`, `flickr_url` from **images**
   - `iscrowd`, `segmentation` from **annotations**

In [None]:
# Keys that are not needed for the analysis and are dropped to reduce file size.
image_keys_to_drop = ('license', 'flickr_url', 'coco_url')
annotation_keys_to_drop = ('iscrowd', 'segmentation')

# pop(key, None) ignores keys that are already missing, so this cell can be
# re-run on the same in-memory `annotation` without raising a KeyError.
for image_info in tqdm(annotation['images']):
    for key in image_keys_to_drop:
        image_info.pop(key, None)

for annotation_info in tqdm(annotation['annotations']):
    for key in annotation_keys_to_drop:
        annotation_info.pop(key, None)

# Save the processed annotation file; this file is used from now onwards.
# NOTE(review): json_writer is a project helper; presumably it serialises
# `data` to `file_loc` - confirm.
file_loc = os.path.join(annotation_dir, 'processed_instances_train2017.json')
json_writer(data=annotation, file_loc=file_loc)
file_size = round(os.path.getsize(file_loc) / 1024, ndigits=2)
print("New file size = {} kb".format(file_size))

# 7. Stats about image dimension using processed annotation file

In [None]:
# dataset_info stores file_name, width, height, aspect_ratio and type
# (file extension) of each image.
dataset_info = []

for image_info in tqdm(annotation['images']):
    image_file_name = image_info['file_name']
    image_width = image_info['width']
    image_height = image_info['height']
    # The 'file_name' column now really holds the file name; it used to be
    # filled with the numeric image id.
    dataset_info.append([image_file_name, image_width, image_height,
                         image_width / image_height,
                         image_file_name.split('.')[-1]])

image_dim_df = pd.DataFrame(data=dataset_info, columns=['file_name','width','height','aspect_ratio','type'])
del dataset_info

# The same summary statistics for every dimension column.
for position, column in enumerate(('height', 'width', 'aspect_ratio')):
    values = image_dim_df[column]
    separator = "\n" if position else ""
    print(f"{separator}About {column} of images :: ")
    print(f" Max = {values.max()} \n Min = {values.min()}")
    print(f" Mean = {values.mean()} \n Median = {values.median()} \n Mode = {values.mode()[0]}")

image_dim_df

In [None]:
# One histogram per image dimension column.
# DataFrame.hist opens its own figure when no axes are passed, so a preceding
# plt.figure() only leaves an empty figure behind. Create the axes explicitly
# and hand them to pandas instead.
for column in ('height', 'width', 'aspect_ratio'):
    fig, ax = plt.subplots(figsize=(12, 8))
    image_dim_df.hist(column=column, bins=25, grid=True, ax=ax, color='#69bf69', rwidth=0.6)
    ax.set_xlabel(f"{column} of images", fontsize=15)
    ax.set_ylabel("image count", fontsize=15)

plt.show()

### From above, we can see that the majority of images in the dataset fall within a specific range, which confirms that there are very few outliers (extreme values) in this dataset.
                     

| &nbsp;&nbsp;&nbsp;         | minimum &nbsp;&nbsp;&nbsp;   | maximum &nbsp;&nbsp;&nbsp;   |mean &nbsp;&nbsp;&nbsp;   |median &nbsp;&nbsp;&nbsp;   | mode &nbsp;&nbsp;&nbsp;   |
| -----------                       | -----------                      | ----------- | ----------- | ----------- |  ----------- |
| **height** &nbsp;&nbsp;&nbsp; | 51 &nbsp;&nbsp;&nbsp;  | 640 &nbsp;&nbsp;&nbsp; |484 &nbsp;&nbsp;&nbsp; |480.0 &nbsp;&nbsp;&nbsp; |480 &nbsp;&nbsp;&nbsp; |
| **width** &nbsp;&nbsp;&nbsp; | 59 &nbsp;&nbsp;&nbsp;  | 640 &nbsp;&nbsp;&nbsp; |577.71 &nbsp;&nbsp;&nbsp; |640.0 &nbsp;&nbsp;&nbsp; |640 &nbsp;&nbsp;&nbsp; |
| **aspect_ratio** &nbsp;&nbsp;&nbsp; | 0.24375 &nbsp;&nbsp;&nbsp;  | 6.15 &nbsp;&nbsp;&nbsp; |1.25 &nbsp;&nbsp;&nbsp; |1.33 &nbsp;&nbsp;&nbsp; |1.33 &nbsp;&nbsp;&nbsp; |



# 8. Checking bounding box data using annotation file 

### As we are provided with bounding box data which is present in annotation["annotations"], we will check for following things:
   - **height or width** : the width and height of a bbox are stored in the `bbox` field in `[x, y, width, height]` format. These values should be greater than 0.
   - **area** : value for area is provided and it has to be greater than 0.
   
### If any annotation violates the conditions mentioned above, we will remove that annotation from the file.
##### NOTE - original count of annotations is present in section 4.

In [None]:
print("Annotation count before check:: ", len(annotation['annotations']))

# Collect the valid annotations in a new list instead of removing entries from
# the list being iterated: removing during iteration skips the element that
# follows each removal, and list.index() rescans the whole list every time.
valid_annotations = []
for annotation_info in tqdm(annotation['annotations']):
    bbox_width = annotation_info['bbox'][2]
    bbox_height = annotation_info['bbox'][3]
    area = annotation_info['area']

    # Width, height and area all have to be greater than 0.
    if bbox_height <= 0 or bbox_width <= 0 or area <= 0:
        print("This entry will be removed :: ")
        pprint(annotation_info)
    else:
        valid_annotations.append(annotation_info)

annotation['annotations'] = valid_annotations

print("Annotation count after check:: ", len(annotation['annotations']))


# 9. Stats about bounding box dimension using processed annotation file

In [None]:
# dataset_info stores image_id, bbox_width, bbox_height, aspect_ratio, area
# and class (category_id) of each annotation.
dataset_info = []

for annotation_info in tqdm(annotation['annotations']):
    bbox_width = annotation_info['bbox'][2]
    bbox_height = annotation_info['bbox'][3]
    # A zero height has no defined aspect ratio; store NaN (which the pandas
    # statistics below skip) instead of raising ZeroDivisionError.
    aspect_ratio = bbox_width / bbox_height if bbox_height else float('nan')

    # The 'image_id' column now holds the id of the image the bbox belongs to;
    # it used to be filled with the annotation's own id.
    dataset_info.append([annotation_info['image_id'], bbox_width, bbox_height,
                         aspect_ratio, annotation_info['area'], annotation_info['category_id']])

bbox_df = pd.DataFrame(data=dataset_info, columns=['image_id','bbox_width','bbox_height','aspect_ratio','area',"class"])

# The same summary statistics for every bbox column: (column, printed label).
bbox_stat_columns = (
    ('bbox_height', 'height'),
    ('bbox_width', 'width'),
    ('aspect_ratio', 'aspect_ratio'),
    ('area', 'area'),
)
for position, (column, label) in enumerate(bbox_stat_columns):
    values = bbox_df[column]
    separator = "\n" if position else ""
    print(f"{separator}About {label} of bbox :: ")
    print(f" Max = {values.max()} \n Min = {values.min()}")
    print(f" Mean = {values.mean()} \n Median = {values.median()} \n Mode = {values.mode()[0]}")

bbox_df

In [None]:
# One histogram per bbox column: (column, x-axis label, number of bins).
bbox_histograms = (
    ('bbox_height', "height of bbox", 100),
    ('bbox_width', "width of bbox", 100),
    ('aspect_ratio', "aspect_ratio of bbox", 100),
    ('area', "area of bbox", 10),
)

# DataFrame.hist opens its own figure when no axes are passed, so a preceding
# plt.figure() only leaves an empty figure behind. Create the axes explicitly
# and hand them to pandas instead.
for column, x_label, bin_count in bbox_histograms:
    fig, ax = plt.subplots(figsize=(12, 8))
    bbox_df.hist(column=column, bins=bin_count, grid=True, ax=ax, color='#69bf69', rwidth=0.6)
    ax.set_xlabel(x_label, fontsize=15)
    ax.set_ylabel("annotation count", fontsize=15)

plt.show()



### From above, we can see that the majority of bounding boxes in the dataset fall within a specific range, which suggests that the dataset may have a class imbalance problem.
                     

| &nbsp;&nbsp;&nbsp;         | minimum &nbsp;&nbsp;&nbsp;   | maximum &nbsp;&nbsp;&nbsp;   |mean &nbsp;&nbsp;&nbsp;   |median &nbsp;&nbsp;&nbsp;   | mode &nbsp;&nbsp;&nbsp;   |
| -----------                       | -----------                      | ----------- | ----------- | ----------- |  ----------- |
| **bbox_height** &nbsp;&nbsp;&nbsp; | 0.34 &nbsp;&nbsp;&nbsp;  | 640.0 &nbsp;&nbsp;&nbsp; |107.42 &nbsp;&nbsp;&nbsp; |62.34 &nbsp;&nbsp;&nbsp; |480 &nbsp;&nbsp;&nbsp; |
| **bbox_width** &nbsp;&nbsp;&nbsp; | 0.23 &nbsp;&nbsp;&nbsp;  | 640.0 &nbsp;&nbsp;&nbsp; |103.9 &nbsp;&nbsp;&nbsp; |54.14 &nbsp;&nbsp;&nbsp; |640.0 &nbsp;&nbsp;&nbsp; |
| **aspect_ratio** &nbsp;&nbsp;&nbsp; | 0.02 &nbsp;&nbsp;&nbsp;  | 143.77 &nbsp;&nbsp;&nbsp; |1.21 &nbsp;&nbsp;&nbsp; |0.86 &nbsp;&nbsp;&nbsp; |1.0 &nbsp;&nbsp;&nbsp; |
| **area** &nbsp;&nbsp;&nbsp; | 0.541 &nbsp;&nbsp;&nbsp;  | 787151.47 &nbsp;&nbsp;&nbsp; |12025.88 &nbsp;&nbsp;&nbsp; |1697.09 &nbsp;&nbsp;&nbsp; |130.0 &nbsp;&nbsp;&nbsp; |

### To get more details about the bounding boxes, it is better to compare them across classes.

# 10. Viewing number of annotations in each class.

### This will help us visualize whether the 80 object classes have a similar number of annotations (bounding boxes) or not.

In [None]:
# Lookup table: class_id --> class_name.
classes_id_name_map = {
    category['id']: category['name'] for category in annotation['categories']
}

# Number of bounding boxes per class name. Every class starts at 0, in the
# order in which the categories are listed in the annotation file.
class_wise_bbox = dict.fromkeys(
    (category['name'] for category in annotation['categories']), 0
)

# Walk over all annotations and increase the counter of the class each one belongs to.
for bbox_record in annotation['annotations']:
    class_name = classes_id_name_map[bbox_record['category_id']]
    class_wise_bbox[class_name] += 1



In [None]:
# Bar chart with one bar per class, labelled with the class name.
fig, ax = plt.subplots(figsize=(20, 10))
bar_positions = range(len(class_wise_bbox))

ax.bar(bar_positions, list(class_wise_bbox.values()), align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(list(class_wise_bbox.keys()), rotation=-90, fontsize=15)
ax.set_xlabel("Classes", fontsize=25)
ax.set_ylabel("Number of annotations", fontsize=25)

plt.show()