This notebook creates the test dataset that is used for the second set of detectron2 models. The difference is that this time the images are not all shuffled before being split into test and training datasets (i.e. one patent belongs to either the test or the train dataset, not both).

DATASET NEEDS TO BE IN FOLLOWING FORMAT (JSON):
* input as "images" containing a list of dictionaries (each dict = one image):
    * each dictionary should contain: file_name, height, width, id
    * e.g.: [{"file_name": "PMC5491943_00004.jpg", "height": 794, "id": 348952, "width": 596}, ...]
* output as "annotations" containing a list of dictionaries (each dict = one segmented object):
    * each dictionary should contain: segmentation (list of polygons, each a flat list of the x, y coordinates outlining the object), area,
    iscrowd (0), image_id (the id of the image in which there is the object), bbox (bounding box as [x_min, y_min, width, height]), category_id, id.
    * e.g. {"segmentation": [[37.59,360.34,288.66,360.34,37.59,360.34]],
      "area": 10218.471181684348,
      "iscrowd": 0,
      "image_id": 346767,
      "bbox": [
        37.59,
        360.34,
        251.07,
        41.36
      ],
      "category_id": 1,
      "id": 3377124
    },

In [79]:
import xmltodict
import pprint
import json
import os
from PIL import Image
from detectron2.structures import BoxMode

In [80]:
# Empty COCO-style containers, filled in by later cells. COCO keeps every
# class; COCO_NO_SIGIL is the variant that leaves out the 'sigil' class.
COCO = dict(images=[], annotations=[])
COCO_NO_SIGIL = dict(images=[], annotations=[])

In [81]:
# ADD IMAGES FROM SAMPLE TO TRAIN ON

In [82]:
# Category id -> label, and the inverse label -> category id lookup.
# Ids are simply the positions of the labels in this list.
class_dict = dict(enumerate(['text', 'title', 'list', 'table', 'figure', 'sigil']))
reverse_class_dict = {label: idx for idx, label in class_dict.items()}

In [83]:
# Quick look at the label names known to class_dict.
class_dict.values()

dict_values(['text', 'title', 'list', 'table', 'figure', 'sigil'])

In [84]:
def xyxy_to_xswd(xmin, ymin, xmax, ymax):
    """
    Convert a bounding box from XYXY (two corners) to XYWH format.

    XYWH follows the COCO convention used by the JSON written in this
    notebook: the top-left corner of the box followed by its size.

    Args:
    xmin - minimum x coordinate
    ymin - minimum y coordinate
    xmax - maximum x coordinate
    ymax - maximum y coordinate
    Returns:
    x - the x coordinate of the top-left corner (equal to xmin)
    y - the y coordinate of the top-left corner (equal to ymin)
    w - width of bounding box
    h - height of bounding box
    """
    # The box origin must be the top-left corner, not the center: returning
    # the center shifts every box by half its size once it is read as a
    # COCO [x, y, width, height] bbox.
    return xmin, ymin, xmax - xmin, ymax - ymin

In [86]:
# Index every PNG page under PATH_TO_IMGS: give it a sequential integer id,
# remember path -> id in img_dict, and record its size in COCO['images'].
PATH_TO_IMGS = '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train'
img_id = 0
img_dict = {}
imgs = []  # not used by any cell visible in this notebook
# Empty the list in place so that re-running this cell does not register
# every image a second time (in place, so existing references stay valid).
COCO['images'].clear()
for path, dirs, files in os.walk(PATH_TO_IMGS):
    for f in files:
        if f.endswith('png'):
            # Keep the '<dir>/<file>' spelling: the annotation cells look
            # images up in img_dict using exactly this form of the path.
            img_path = '{}/{}'.format(path, f)
            # Only the size is needed; the context manager closes the file
            # right away instead of leaving one open handle per image.
            with Image.open(img_path) as image:
                width, height = image.size
            img_dict[img_path] = img_id
            COCO['images'].append({"file_name": img_path, "height": height, "id": img_id, "width": width})
            img_id += 1
            

In [87]:
# Number of image paths registered in img_dict.
len(img_dict.keys())

6447

In [88]:
#HERE WE CONSTRUCT THE COCO FORMAT DATA
# Collect every XML annotation file under PATH_TO_XMLS, then turn each
# <object> bounding box into one entry of COCO['annotations'].
PATH_TO_XMLS = '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train'
xmls = []
for path, dirs, files in os.walk(PATH_TO_XMLS):
    for f in files:
        if f.endswith('xml'):
            xmls.append('{}/{}'.format(path, f))

# Empty the list in place so that re-running this cell does not duplicate
# every annotation.
COCO['annotations'].clear()
annotation_id = 0
#parse each xml file
for xml_file in xmls:
    with open(xml_file) as file:
        doc = xmltodict.parse(file.read())
    # Swap only the extension; a plain str.replace('xml', 'png') would also
    # rewrite any "xml" that happens to occur in a directory name.
    img_id = img_dict[os.path.splitext(xml_file)[0] + '.png']
    # xmltodict yields a list for several <object> elements, a single dict
    # for exactly one, and no 'object' key at all when there are none.
    objects = doc['annotation'].get('object') or []
    if not isinstance(objects, list):
        objects = [objects]
    for obj in objects:
        box = obj['bndbox']
        xmin, ymin, xmax, ymax = (int(box[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax'))
        # Only the size is taken from the helper; the bbox origin has to be
        # the top-left corner (xmin, ymin) in COCO's [x, y, w, h] layout.
        _, _, w, h = xyxy_to_xswd(xmin, ymin, xmax, ymax)
        COCO['annotations'].append({
            # COCO expects a list of polygons, hence the outer list around
            # the single rectangle outline.
            'segmentation': [[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin]],
            'area': w * h,
            'iscrowd': 0,
            'image_id': img_id,
            'bbox': [xmin, ymin, w, h],
            'category_id': reverse_class_dict[obj['name']],
            'id': annotation_id})
        annotation_id += 1

In [89]:
# Display the assembled COCO dict for a visual check.
COCO

{'images': [{'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/0.png',
   'height': 2337,
   'id': 0,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/1.png',
   'height': 2337,
   'id': 1,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/10.png',
   'height': 2337,
   'id': 2,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/11.png',
   'height': 2337,
   'id': 3,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/2.png',
   'height': 2337,
   'id': 4,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/3.png',
   'height': 2337,
   'id': 5,
   'width': 1728}

In [90]:
# The sigil-free COCO dataset describes the same images, so it points at the
# very same image list as COCO (shared, not copied).
COCO_NO_SIGIL['images'] = COCO['images']

In [91]:
#HERE WE CONSTRUCT THE COCO FORMAT DATA WITHOUT THE ADDITIONAL SIGIL CLASS
# Same conversion as for COCO['annotations'], except that objects labelled
# 'sigil' are left out. Iterates over the `xmls` list collected earlier.
SIGIL_ID = reverse_class_dict['sigil']

# Empty the list in place so that re-running this cell does not duplicate
# every annotation.
COCO_NO_SIGIL['annotations'].clear()
annotation_id = 0
#parse each xml file
for xml_file in xmls:
    with open(xml_file) as file:
        doc = xmltodict.parse(file.read())
    # Swap only the extension; a plain str.replace('xml', 'png') would also
    # rewrite any "xml" that happens to occur in a directory name.
    img_id = img_dict[os.path.splitext(xml_file)[0] + '.png']
    # xmltodict yields a list for several <object> elements, a single dict
    # for exactly one, and no 'object' key at all when there are none.
    objects = doc['annotation'].get('object') or []
    if not isinstance(objects, list):
        objects = [objects]
    for obj in objects:
        category_id = reverse_class_dict[obj['name']]
        if category_id == SIGIL_ID:
            continue
        box = obj['bndbox']
        xmin, ymin, xmax, ymax = (int(box[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax'))
        # Only the size is taken from the helper; the bbox origin has to be
        # the top-left corner (xmin, ymin) in COCO's [x, y, w, h] layout.
        _, _, w, h = xyxy_to_xswd(xmin, ymin, xmax, ymax)
        COCO_NO_SIGIL['annotations'].append({
            # COCO expects a list of polygons, hence the outer list around
            # the single rectangle outline.
            'segmentation': [[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin]],
            'area': w * h,
            'iscrowd': 0,
            'image_id': img_id,
            'bbox': [xmin, ymin, w, h],
            'category_id': category_id,
            'id': annotation_id})
        annotation_id += 1

In [92]:
# Display the sigil-free COCO dict for a visual check.
COCO_NO_SIGIL

{'images': [{'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/0.png',
   'height': 2337,
   'id': 0,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/1.png',
   'height': 2337,
   'id': 1,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/10.png',
   'height': 2337,
   'id': 2,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/11.png',
   'height': 2337,
   'id': 3,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/2.png',
   'height': 2337,
   'id': 4,
   'width': 1728},
  {'file_name': '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/GB0725820A/3.png',
   'height': 2337,
   'id': 5,
   'width': 1728}

In [94]:
#HERE WE MAKE THE SAME COCO DATASETS SUCH THAT THEY ARE COMPATIBLE WITH COLAB (I.E. CHANGING THE FILE_NAME'S)
# The Colab variants reuse the annotation lists as they are; only the image
# entries are rebuilt, with file_name shortened to
# 'data_for_model/<parent dir>/<file>'.
COCO_COLAB = {'annotations': COCO['annotations']}
COCO_NO_SIGIL_COLAB = {'annotations': COCO_NO_SIGIL['annotations']}

COCO_COLAB['images'] = [
    {'file_name': 'data_for_model/{}'.format('/'.join(entry['file_name'].split('/')[-2:])),
     'height': entry['height'],
     'width': entry['width'],
     'id': entry['id']}
    for entry in COCO['images']
]

COCO_NO_SIGIL_COLAB['images'] = [
    {'file_name': 'data_for_model/{}'.format('/'.join(entry['file_name'].split('/')[-2:])),
     'height': entry['height'],
     'width': entry['width'],
     'id': entry['id']}
    for entry in COCO_NO_SIGIL['images']
]
    

In [95]:
# Number of image entries in the local COCO dict.
len(COCO['images'])

6447

In [96]:
# Number of image entries in the Colab COCO dict.
len(COCO_COLAB['images'])

6447

In [100]:
# Write the four COCO-format variants to disk: local and Colab file names,
# each with and without the sigil class ("reduced" = without sigil).
# `json` is already imported in the imports cell at the top of the notebook.
for out_path, payload in (
    ('/Volumes/Non-Backup_Files/GB-patents/json/local/coco/new_train_data.json', COCO),
    ('/Volumes/Non-Backup_Files/GB-patents/json/local/coco/new_train_data_reduced.json', COCO_NO_SIGIL),
    ('/Volumes/Non-Backup_Files/GB-patents/json/colab/coco/new_train_data.json', COCO_COLAB),
    ('/Volumes/Non-Backup_Files/GB-patents/json/colab/coco/new_train_data_reduced.json', COCO_NO_SIGIL_COLAB),
):
    with open(out_path, 'w') as handle:
        json.dump(payload, handle)
     

In [103]:
#HERE WE CONSTRUCT THE DATASET FORMAT DATA
# Build a list with one record per XML file: the image path, id and size,
# plus the list of boxes found on that page (kept in XYXY_ABS box mode).
DATASET = []
PATH_TO_XMLS = '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train'
xmls = []
for path, dirs, files in os.walk(PATH_TO_XMLS):
    for f in files:
        if f.endswith('xml'):
            xmls.append('{}/{}'.format(path, f))

#parse each xml file
for xml_file in xmls:
    with open(xml_file) as file:
        doc = xmltodict.parse(file.read())
    # Swap only the extension; a plain str.replace('xml', 'png') would also
    # rewrite any "xml" that happens to occur in a directory name.
    file_name = os.path.splitext(xml_file)[0] + '.png'
    img_id = img_dict[file_name]
    # Image ids were handed out sequentially while COCO['images'] was being
    # filled, so the id doubles as the index into that list.
    width = COCO['images'][img_id]['width']
    height = COCO['images'][img_id]['height']
    # xmltodict yields a list for several <object> elements, a single dict
    # for exactly one, and no 'object' key at all when there are none.
    objects = doc['annotation'].get('object') or []
    if not isinstance(objects, list):
        objects = [objects]
    annotations = []
    for obj in objects:
        box = obj['bndbox']
        xmin, ymin, xmax, ymax = (int(box[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax'))
        annotations.append({'segmentation': [[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin]],
                            'bbox': [float(xmin), float(ymin), float(xmax), float(ymax)],
                            'category_id': reverse_class_dict[obj['name']],
                            'bbox_mode': BoxMode.XYXY_ABS})
    DATASET.append({'annotations': annotations, 'file_name': file_name, 'image_id': img_id, 'width': width, 'height': height})


In [104]:
# Display the dataset records for a visual check.
DATASET

[{'annotations': [{'segmentation': [[192,
      192,
      517,
      192,
      517,
      502,
      192,
      502,
      192,
      192]],
    'bbox': [192.0, 192.0, 517.0, 502.0],
    'category_id': 5,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[529, 121, 1141, 121, 1141, 192, 529, 192, 529, 121]],
    'bbox': [529.0, 121.0, 1141.0, 192.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[507, 580, 1156, 580, 1156, 739, 507, 739, 507, 580]],
    'bbox': [507.0, 580.0, 1156.0, 739.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[514, 824, 1063, 824, 1063, 946, 514, 946, 514, 824]],
    'bbox': [514.0, 824.0, 1063.0, 946.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[527, 217, 1209, 217, 1209, 443, 527, 443, 527, 217]],
    'bbox': [527.0, 217.0, 1209.0, 443.0],
    'category_id': 0,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[38

In [105]:
# Same per-page records as DATASET, but boxes labelled 'sigil' are left out.
DATASET_NO_SIGIL = []
PATH_TO_XMLS = '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train'
xmls = []
for path, dirs, files in os.walk(PATH_TO_XMLS):
    for f in files:
        if f.endswith('xml'):
            xmls.append('{}/{}'.format(path, f))

SIGIL_ID = reverse_class_dict['sigil']
#parse each xml file
for xml_file in xmls:
    with open(xml_file) as file:
        doc = xmltodict.parse(file.read())
    # Swap only the extension; a plain str.replace('xml', 'png') would also
    # rewrite any "xml" that happens to occur in a directory name.
    file_name = os.path.splitext(xml_file)[0] + '.png'
    img_id = img_dict[file_name]
    # Image ids were handed out sequentially while COCO['images'] was being
    # filled, so the id doubles as the index into that list.
    width = COCO['images'][img_id]['width']
    height = COCO['images'][img_id]['height']
    # xmltodict yields a list for several <object> elements, a single dict
    # for exactly one, and no 'object' key at all when there are none.
    objects = doc['annotation'].get('object') or []
    if not isinstance(objects, list):
        objects = [objects]
    annotations = []
    for obj in objects:
        category_id = reverse_class_dict[obj['name']]
        if category_id == SIGIL_ID:
            continue
        box = obj['bndbox']
        xmin, ymin, xmax, ymax = (int(box[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax'))
        annotations.append({'segmentation': [[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax, xmin, ymin]],
                            'bbox': [float(xmin), float(ymin), float(xmax), float(ymax)],
                            'category_id': category_id,
                            'bbox_mode': BoxMode.XYXY_ABS})
    # Every page gets a record, even when all of its boxes were sigils.
    # Before, a page whose single object was a sigil was skipped, while a
    # page with several sigil-only objects was kept with an empty list.
    DATASET_NO_SIGIL.append({'annotations': annotations, 'file_name': file_name, 'image_id': img_id, 'width': width, 'height': height})


In [108]:
# Display the dataset records for a visual check.
# NOTE(review): this shows DATASET again, not the DATASET_NO_SIGIL built in
# the cell just before - confirm which one was meant to be inspected.
DATASET

[{'annotations': [{'segmentation': [[192,
      192,
      517,
      192,
      517,
      502,
      192,
      502,
      192,
      192]],
    'bbox': [192.0, 192.0, 517.0, 502.0],
    'category_id': 5,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[529, 121, 1141, 121, 1141, 192, 529, 192, 529, 121]],
    'bbox': [529.0, 121.0, 1141.0, 192.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[507, 580, 1156, 580, 1156, 739, 507, 739, 507, 580]],
    'bbox': [507.0, 580.0, 1156.0, 739.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[514, 824, 1063, 824, 1063, 946, 514, 946, 514, 824]],
    'bbox': [514.0, 824.0, 1063.0, 946.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[527, 217, 1209, 217, 1209, 443, 527, 443, 527, 217]],
    'bbox': [527.0, 217.0, 1209.0, 443.0],
    'category_id': 0,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[38

In [110]:
#HERE WE MAKE THE SAME COCO DATASETS SUCH THAT THEY ARE COMPATIBLE WITH COLAB (I.E. CHANGING THE FILE_NAME'S)
# Each Colab record is a shallow copy of the local one (the annotation list
# is shared) whose file_name is shortened to
# 'data_for_model/<parent dir>/<file>'.
DATASET_COLAB = [
    dict(record, file_name='data_for_model/{}'.format('/'.join(record['file_name'].split('/')[-2:])))
    for record in DATASET
]

DATASET_NO_SIGIL_COLAB = [
    dict(record, file_name='data_for_model/{}'.format('/'.join(record['file_name'].split('/')[-2:])))
    for record in DATASET_NO_SIGIL
]
    

In [111]:
# Display DATASET again after the Colab copies were made.
DATASET

[{'annotations': [{'segmentation': [[192,
      192,
      517,
      192,
      517,
      502,
      192,
      502,
      192,
      192]],
    'bbox': [192.0, 192.0, 517.0, 502.0],
    'category_id': 5,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[529, 121, 1141, 121, 1141, 192, 529, 192, 529, 121]],
    'bbox': [529.0, 121.0, 1141.0, 192.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[507, 580, 1156, 580, 1156, 739, 507, 739, 507, 580]],
    'bbox': [507.0, 580.0, 1156.0, 739.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[514, 824, 1063, 824, 1063, 946, 514, 946, 514, 824]],
    'bbox': [514.0, 824.0, 1063.0, 946.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[527, 217, 1209, 217, 1209, 443, 527, 443, 527, 217]],
    'bbox': [527.0, 217.0, 1209.0, 443.0],
    'category_id': 0,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[38

In [112]:
# Display the Colab variant of the dataset records.
DATASET_COLAB

[{'annotations': [{'segmentation': [[192,
      192,
      517,
      192,
      517,
      502,
      192,
      502,
      192,
      192]],
    'bbox': [192.0, 192.0, 517.0, 502.0],
    'category_id': 5,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[529, 121, 1141, 121, 1141, 192, 529, 192, 529, 121]],
    'bbox': [529.0, 121.0, 1141.0, 192.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[507, 580, 1156, 580, 1156, 739, 507, 739, 507, 580]],
    'bbox': [507.0, 580.0, 1156.0, 739.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[514, 824, 1063, 824, 1063, 946, 514, 946, 514, 824]],
    'bbox': [514.0, 824.0, 1063.0, 946.0],
    'category_id': 1,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[527, 217, 1209, 217, 1209, 443, 527, 443, 527, 217]],
    'bbox': [527.0, 217.0, 1209.0, 443.0],
    'category_id': 0,
    'bbox_mode': <BoxMode.XYXY_ABS: 0>},
   {'segmentation': [[38

In [114]:
# Write the four dataset-format variants to disk: local and Colab file
# names, each with and without the sigil class ("reduced" = without sigil).
# `json` is already imported in the imports cell at the top of the notebook.
for out_path, payload in (
    ('/Volumes/Non-Backup_Files/GB-patents/json/local/dataset/new_train_data.json', DATASET),
    ('/Volumes/Non-Backup_Files/GB-patents/json/local/dataset/new_train_data_reduced.json', DATASET_NO_SIGIL),
    ('/Volumes/Non-Backup_Files/GB-patents/json/colab/dataset/new_train_data.json', DATASET_COLAB),
    ('/Volumes/Non-Backup_Files/GB-patents/json/colab/dataset/new_train_data_reduced.json', DATASET_NO_SIGIL_COLAB),
):
    with open(out_path, 'w') as handle:
        json.dump(payload, handle)