## Create COCO annotation files ##
This notebook requires the detectron2 library.

In [1]:
# Imports
import os
import numpy as np
import pandas as pd

# Matplotlib for plotting
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
from matplotlib import patches

# Appearance of the Notebook
from IPython.display import display, HTML
# Let the notebook cells use the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))

# Wider numpy printing and larger pandas display limits
np.set_printoptions(linewidth=110)
for pd_option, pd_value in (('display.max_rows', 500),
                            ('display.max_columns', 100),
                            ('display.width', 1000)):
    pd.set_option(pd_option, pd_value)

# Detectron2 library
import detectron2
from detectron2.structures import BoxMode

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import dentexmodel as dm
from dentexmodel.fileutils import FileOP
from dentexmodel.imageproc import ImageData
from dentexmodel.dentexdata import DentexData, val_test_split

# Report the versions of the project module and of detectron2 used in this run
print(f'Project module version: {dm.__version__}')
print(f'Detectron2 version:     {detectron2.__version__}')

Project module version: 0.0.post1.dev101+g9852c2c.d20240205
Detectron2 version:     0.6


In [2]:
# Data directory (change as needed)
# expanduser('~') returns $HOME when it is set and still resolves the home
# directory on systems where HOME is not defined (os.environ['HOME'] raises KeyError there)
dentex_dir = os.path.join(os.path.expanduser('~'), 'data', 'dentex')
data_dir = os.path.join(dentex_dir, 'dentex_detection')

# This image directory is where the xrays are in the archive, so should be left as-is
image_dir = os.path.join(data_dir, 'quadrant_enumeration', 'xrays')

# Data frame with images and paths
data_df_file_name = 'dentex_detection_datasplit.parquet'
data_df_file = os.path.join(dentex_dir, data_df_file_name)

In [3]:
# Read the data frame with image paths and the dataset split of each image
data_df = pd.read_parquet(data_df_file)
display(data_df.head(2))

# Collect the unique file names of each split and report how many there are
split_sets = {}
for split_key, split_label in (('train', 'train'), ('val', 'validation'), ('test', 'test')):
    in_split = data_df['dataset'] == split_key
    split_sets[split_key] = set(data_df.loc[in_split, 'file_name'].values)
    print(f'We have {len(split_sets[split_key])} images in the {split_label} set.')

# Keep the individual set names available for the cells below
train_set = split_sets['train']
val_set = split_sets['val']
test_set = split_sets['test']

Unnamed: 0,image_id,file_name,image_number,file_path,dataset
0,327,train_612.png,612,/app/data/dentex/dentex_detection/quadrant_enu...,train
1,543,train_163.png,163,/app/data/dentex/dentex_detection/quadrant_enu...,train


We have 600 images in the train set.
We have 10 images in the validation set.
We have 24 images in the test set.


### Create the dataset representations ###
https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html#register-a-dataset

In [18]:
# Open the JSON annotation file for the quadrant/enumeration training data.
# We cannot use this file directly, because detectron2 requires a slightly different representation.

annotation_file_name = 'train_quadrant_enumeration.json'
json_file = os.path.join(data_dir, 'quadrant_enumeration', annotation_file_name)

# DentexData is the project helper used here to read the annotation file
dtx = DentexData(data_dir=data_dir)
annotations = dtx.load_annotations(json_file=json_file)
# Show the top-level keys of the loaded annotations
display(annotations.keys())

# Create one dictionary with all annotation categories (range(1, 3) selects categories 1 and 2)
category_dict = dtx.create_category_dict(categories=range(1, 3))
display(category_dict)

# Images and their annotations in the annotation file are related by the image_id

dict_keys(['images', 'annotations', 'categories_1', 'categories_2'])

{'categories_1': {0: 1, 1: 2, 2: 3, 3: 4},
 'categories_2': {0: '1',
  1: '2',
  2: '3',
  3: '4',
  4: '5',
  5: '6',
  6: '7',
  7: '8'}}

In [38]:
# Sorted file names of all images that belong to the selected split
dataset = 'train'
in_dataset = data_df['dataset'] == dataset
file_name_list = sorted(data_df.loc[in_dataset, 'file_name'].values)
print(len(file_name_list))

# Use the image at list position 10 as the working example
file_name = file_name_list[10]
print(file_name)

600
train_107.png


In [53]:
# Function that collects the annotations for a given file_name
def get_image_annotations(annotations, file_name):
    """
    Collect the image record and all of its annotations for one image file.

    Parameters:
        annotations (dict): Annotation data with an 'images' list (dicts with
            'file_name' and 'id') and an 'annotations' list (dicts with 'image_id').
        file_name (str): File name of the image to look up.

    Returns:
        dict: A copy of the first image record whose 'file_name' matches, with the
            additional key 'file_name_annotations' holding the list of annotation
            dicts for that image. Each annotation dict is a copy of the original,
            extended with 'bbox_mode' (BoxMode.XYWH_ABS) and 'category_id' (0).
            An empty dict is returned if no image record matches file_name.
    """
    # First image record with a matching file name (None if there is no match)
    image_record = next((d for d in annotations.get('images') or []
                         if d.get('file_name') == file_name), None)
    if image_record is None:
        return {}

    image_dict = dict(image_record)
    image_id = image_dict.get('id')

    # We need to add some more detectron2-specific information to each annotation.
    # dict.update() returns None and mutates in place, so build new dictionaries
    # instead: this keeps the loaded annotations untouched and avoids a list of None.
    an_dict_list = [{**d, 'bbox_mode': BoxMode.XYWH_ABS, 'category_id': 0}
                    for d in annotations.get('annotations') or []
                    if d.get('image_id') == image_id]
    image_dict['file_name_annotations'] = an_dict_list
    return image_dict

In [58]:
# Scratch example: a list that refers to the same dictionary object four times
a = dict(name=file_name, id=123)
a_list = [a for _ in range(4)]
print(a_list)

[{'name': 'train_107.png', 'id': 123}, {'name': 'train_107.png', 'id': 123}, {'name': 'train_107.png', 'id': 123}, {'name': 'train_107.png', 'id': 123}]


In [59]:
# dict.update() modifies the dictionary in place and returns None,
# so collecting its return values produces a list of None
b_list = list(map(lambda entry: entry.update({'new_id': 234}), a_list))
print(b_list)

[None, None, None, None]


In [54]:
# Pick one image from the training set
image_dict = get_annotations(annotations, file_name=file_name)
display(image_dict)

{'height': 1316,
 'width': 2860,
 'id': 226,
 'file_name': 'train_107.png',
 'file_name_annotations': [{'iscrowd': 0,
   'image_id': 226,
   'bbox': [1321.7777777777778,
    373.7777777777778,
    91.66666666666674,
    319.44444444444446],
   'segmentation': [[1363,
     423,
     1355,
     457,
     1352,
     496,
     1335,
     534,
     1321,
     573,
     1321,
     632,
     1338,
     687,
     1371,
     693,
     1396,
     646,
     1394,
     576,
     1394,
     529,
     1405,
     459,
     1413,
     393,
     1394,
     373,
     1371,
     373]],
   'id': 6457,
   'area': 17735,
   'category_id_1': 0,
   'category_id_2': 0},
  {'iscrowd': 0,
   'image_id': 226,
   'bbox': [1266.2222222222222, 407.1111111111111, 75.0, 261.1111111111112],
   'segmentation': [[1341,
     407,
     1341,
     426,
     1341,
     448,
     1332,
     479,
     1321,
     523,
     1321,
     571,
     1316,
     612,
     1307,
     651,
     1282,
     668,
     1271,
     648,
     1