In [1]:
import pandas as pd 
import json
import numpy as np
import datetime
import re
import copy

In [13]:
# Read the preprocessed metadata table and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer="../../dataset/preprocessed-datasets/preprocessed_metadata.csv"
)
df.head(5)

Unnamed: 0.1,Unnamed: 0,index,patient_id,breast_density,left_or_right_breast,image_view,abnormality_id,abnormality_type,shape,margin,...,checksum,is_flipped,vertical_crop_pixels,horizontal_crop_pixels,width,height,preprocessed_original_image_path,num_masks,bounding_boxes,preprocessed_mass_all_mask_path
0,0,0,P_00001,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,...,8edd908901f92930776353c9ed140d95,False,480,1918,1918,3848,preprocessed/8edd908901f92930776353c9ed140d95.png,1,"[{""x"": 270, ""y"": 1942, ""width"": 390, ""height"":...",preprocessed/8edd908901f92930776353c9ed140d95_...
1,1,1,P_00001,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,...,d15f9840224eafd79870dc700c2e6cb1,False,480,2138,2138,3840,preprocessed/d15f9840224eafd79870dc700c2e6cb1.png,1,"[{""x"": 271, ""y"": 2500, ""width"": 215, ""height"":...",preprocessed/d15f9840224eafd79870dc700c2e6cb1_...
2,2,2,P_00004,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,...,17b97fe9529c1f9777fbf031fdfe7909,True,549,2277,2277,4393,preprocessed/17b97fe9529c1f9777fbf031fdfe7909.png,1,"[{""x"": 1186, ""y"": 2951, ""width"": 382, ""height""...",preprocessed/17b97fe9529c1f9777fbf031fdfe7909_...
3,3,3,P_00004,3,LEFT,MLO,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,...,39a6e98a61aa8e2c19b3136c48e1cc07,True,549,2151,2151,4393,preprocessed/39a6e98a61aa8e2c19b3136c48e1cc07.png,1,"[{""x"": 1011, ""y"": 2506, ""width"": 381, ""height""...",preprocessed/39a6e98a61aa8e2c19b3136c48e1cc07_...
4,4,4,P_00004,3,RIGHT,MLO,1,mass,OVAL,CIRCUMSCRIBED,...,baf98767e0ca955c78f41c147077435e,False,549,2113,2113,4393,preprocessed/baf98767e0ca955c78f41c147077435e.png,1,"[{""x"": 694, ""y"": 3255, ""width"": 424, ""height"":...",preprocessed/baf98767e0ca955c78f41c147077435e_...


# Generate COCO format dataset
- Find bounding boxes
- Create dict
- Save as json

In [14]:
# Template for a COCO-format annotation file. "images" and "annotations" start
# empty and "samples" starts at 0; they are meant to be filled in per split.
coco = {
    "info": {
        # Dataset name corrected from the misspelled "CBIS-DDCM".
        "description": "CBIS-DDSM dataset",
        "version": "1.0",
        "year": 2023,
        # YYYY/MM/DD; the previous value "2023/19/03" had day and month swapped.
        "date_created": "2023/03/19",
        "samples": 0
    },
    # NOTE(review): confirm this license matches the one published with the dataset.
    "licenses": [
        {
            "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/",
            "id": 1,
            "name": "Attribution-NonCommercial-ShareAlike License"
        }
    ],
    # Category 0 ("ROI") is the parent supercategory of the two lesion categories.
    "categories": [
        {
            "id": 0,
            "name": "ROI",
            "supercategory": "none"
        },
        {
            "id": 1,
            "name": "Mass",
            "supercategory": "ROI"
        },
        {
            "id": 2,
            "name": "Calcification",
            "supercategory": "ROI"
        },
    ],
    "images": [
        # Example of the shape of one image entry:
        # {
        #     "id": 0,
        #     "license": 1,
        #     "file_name": "BloodImage_00204_jpg.rf.04ba9998769a12d374e6c4b284e6a4a2.jpg",
        #     "height": 416,
        #     "width": 416,
        #     "date_captured": "2021-02-24T08:05:45+00:00"
        # }
    ],
    "annotations": [
        # Example of the shape of one annotation entry:
        # {
        #     "id": 0,
        #     "image_id": 0,
        #     "category_id": 2,
        #     "bbox": [
        #         "x",
        #         "y",
        #         "width",
        #         "height"
        #     ],
        #     "area": 5934,
        #     "segmentation": [],
        #     "iscrowd": 0
        # }
    ]
}

In [6]:
# Maps a bounding-box "type" string to its numeric category id.
classes = dict(mass=1, calcification=2)

In [15]:
# Decode the JSON-encoded bounding boxes into Python objects. Only string cells
# are decoded, so re-running this cell after the column has already been
# converted leaves it unchanged instead of raising a TypeError in json.loads.
df['bounding_boxes'] = df['bounding_boxes'].apply(
    lambda boxes: json.loads(boxes) if isinstance(boxes, str) else boxes
)

In [16]:
# Split the metadata by the value of its `dataset` column.
train_df = df.loc[df["dataset"] == "train"]
test_df = df.loc[df["dataset"] == "test"]

# Report how many rows landed in each split.
print(f"{len(train_df)} {len(test_df)}")

1231 361


### Add images and bounding boxes to dict

In [19]:
def create_coco(df, coco, category_ids=None):
    """Fill a COCO template with the images and mass annotations found in ``df``.

    Bounding boxes whose ``type`` is ``"calcification"`` are skipped, and an
    image is only listed when at least one of its boxes was kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``preprocessed_original_image_path``, ``height``,
        ``width`` and ``bounding_boxes``. Each ``bounding_boxes`` cell must be
        an iterable of dicts with the keys ``type``, ``x``, ``y``, ``width``
        and ``height``. The frame's index labels are used as image ids.
    coco : dict
        Template that is modified in place: ``coco["images"]``,
        ``coco["annotations"]`` and ``coco["info"]["samples"]`` are
        overwritten. Pass a copy to keep the original template untouched.
    category_ids : dict, optional
        Maps a bounding-box ``type`` to its category id. Defaults to the
        notebook-level ``classes`` mapping.

    Returns
    -------
    dict
        The same ``coco`` object that was passed in.

    Raises
    ------
    KeyError
        If a bounding box lacks one of the expected keys, or a kept box has a
        ``type`` that is missing from ``category_ids``.
    """
    if category_ids is None:
        category_ids = classes

    # Taken once so every image of one export carries the same timestamp. This
    # is the time the file was generated, not when the image was acquired.
    generated_at = str(datetime.datetime.now())

    images = []
    annotations = []

    for i, row in df.iterrows():
        kept_boxes = [
            bbox for bbox in row.bounding_boxes if bbox["type"] != "calcification"
        ]
        if not kept_boxes:
            # No usable annotation: leave the image out of the export.
            continue

        images.append({
            "id": i,
            "license": 1,
            "file_name": row.preprocessed_original_image_path.split("/")[-1],
            # Cast to plain int so the values stay integral and JSON-serializable
            # regardless of the dtype the row was coerced to.
            "height": int(row.height),
            "width": int(row.width),
            "date_captured": generated_at
        })

        for bbox in kept_boxes:
            x, y, width, height = bbox["x"], bbox["y"], bbox["width"], bbox["height"]
            annotations.append({
                # Ids start at 1: an id of 0 is falsy and can be mistaken for
                # "no match" by evaluation code that tests matched ids for truth.
                "id": len(annotations) + 1,
                "image_id": i,
                "category_id": category_ids[bbox["type"]],
                "bbox": [int(x), int(y), int(width), int(height)],
                "area": int(width * height),
                "segmentation": [],
                "iscrowd": 0
            })

    coco["images"] = images
    coco["annotations"] = annotations
    coco["info"]["samples"] = len(images)

    return coco

In [20]:
# Build one COCO dictionary per split. Each call gets its own deep copy of the
# template because create_coco writes into the dictionary it is given.
train_coco_1, test_coco = (
    create_coco(split_df, copy.deepcopy(coco)) for split_df in (train_df, test_df)
)

### Save COCO dataset to JSON

In [22]:
# Serialize each split's COCO dictionary to its own annotation file.
for out_path, payload in (
    ("../annotations/_train_annotations_1.coco.json", train_coco_1),
    ("../annotations/_test_annotations.coco.json", test_coco),
):
    with open(out_path, "w") as f:
        f.write(json.dumps(payload))