# Import libraries

In [None]:
import os
import json
import pandas as pd
import numpy as np

from pycocotools.coco import COCO
from copy import deepcopy
from random import sample

# Custom functions

In [None]:
def remove_images(data: dict, rmv_ids: list) -> dict:
    """Return a copy of a COCO-style dataset without the given images.

    Args:
        data: Dataset dict containing an ``'images'`` list (each entry has an
            ``'id'``) and an ``'annotations'`` list (each entry has an
            ``'image_id'``). The input is left untouched.
        rmv_ids: Image ids to drop, together with their annotations.

    Returns:
        A new dict with the same keys (in the same order) as ``data``. Every
        value is deep-copied, so editing the result never affects ``data``.

    Raises:
        KeyError: If ``data`` lacks ``'images'`` or ``'annotations'``.
    """
    # A set gives O(1) membership tests instead of scanning the list per item.
    rmv_set = set(rmv_ids)

    # Filter first so that only the surviving entries get deep-copied.
    kept_images = [x for x in data['images'] if x['id'] not in rmv_set]
    kept_annots = [x for x in data['annotations'] if x['image_id'] not in rmv_set]

    # Overriding existing keys in a dict display keeps the original key order.
    data_new = deepcopy({**data, 'images': kept_images, 'annotations': kept_annots})

    print(f"# of images in data before: [{len(data['images'])}] >> after: [{len(data_new['images'])}]")
    print(f"# of bboxes in data before: [{len(data['annotations'])}] >> after: [{len(data_new['annotations'])}]")

    return data_new


def save_json(data: dict, file_nm: str, dir_path='../../dataset'):
    """Write ``data`` as JSON to ``file_nm`` inside ``dir_path``.

    Args:
        data: JSON-serializable object to dump.
        file_nm: Name of the output file (e.g. ``'cv_train_1_v6.json'``).
        dir_path: Directory the file is written to. It must already exist.
    """
    # Use the dir_path argument rather than a notebook-level global so the
    # caller's choice of directory is honoured.
    with open(os.path.join(dir_path, file_nm), 'w') as outfile:
        json.dump(data, outfile)

# Load train and validation data

In [None]:
data_dir = '../../dataset'  # root directory of the dataset

# Annotation files of the train / validation sets produced by a Multilabel K-Fold split
annot_train = '../../dataset/cv_train_1.json'
annot_valid = '../../dataset/cv_val_1.json'

# Keep the raw annotation dicts for both splits
with open(annot_train) as f:
    data_train = json.loads(f.read())

with open(annot_valid) as f:
    data_valid = json.loads(f.read())

# COCO API object built from the train annotation file
coco_train = COCO(annot_train)

In [None]:
# Flatten the train annotations into rows of
# [file_name, annotation id, category id, x1, x2, y1, y2].
gt_list = []

for img_key in coco_train.getImgIds():
    img_meta = coco_train.loadImgs(img_key)[0]
    img_annots = coco_train.loadAnns(coco_train.getAnnIds(imgIds=img_meta['id']))
    img_file = img_meta['file_name']

    for annot in img_annots:
        # bbox is read as [x, y, width, height]; convert to corner coordinates
        box = annot['bbox']
        x_min = float(box[0])
        x_max = x_min + float(box[2])
        y_min = float(box[1])
        y_max = y_min + float(box[3])

        gt_list.append(
            [img_file, annot['id'], annot['category_id'], x_min, x_max, y_min, y_max]
        )

In [None]:
# Preview the first ten ground-truth rows
gt_list[:10]

In [None]:
# One row per bbox annotation. Note: the 'img_id' column holds the image
# file name (first element of each gt_list row), not a numeric id.
train_df = pd.DataFrame(gt_list, columns=['img_id', 'annot_id', 'label', 'x1', 'x2', 'y1', 'y2'])

In [None]:
# Get box area. Computed column-wise (instead of a row-wise apply) so it is
# fast and still yields a proper column when train_df has no rows.
train_df['area'] = (train_df.y2 - train_df.y1) * (train_df.x2 - train_df.x1)

# Get box center point
train_df['cent_x'] = (train_df.x2 + train_df.x1) / 2
train_df['cent_y'] = (train_df.y2 + train_df.y1) / 2

# Get width and height
train_df['width'] = train_df.x2 - train_df.x1
train_df['height'] = train_df.y2 - train_df.y1

# Get height-to-width ratio (a zero-width box gives inf/NaN rather than an error)
train_df['height/width'] = train_df.height / train_df.width

# Get box diagonal length
train_df['diagonal'] = (train_df.width**2 + train_df.height**2)**0.5

# Get label name: a category id is the position of its name in LABEL_NAME
LABEL_NAME = ["General trash", "Paper", "Paper pack", "Metal",
              "Glass", "Plastic", "Styrofoam", "Plastic bag", "Battery", "Clothing"]
map_label_2_nm = dict(enumerate(LABEL_NAME))
train_df['label_nm'] = train_df.label.map(map_label_2_nm)
train_df

# Remove images with excessively many bboxes

In [None]:
# Count bboxes per image, then derive the upper outlier fence: Q3 + 1.5 * IQR
box_per_img = train_df['img_id'].value_counts()
print(box_per_img.describe())

q1_box_per_img, q3_box_per_img = np.percentile(box_per_img, [25, 75])
iqr_box_per_img = q3_box_per_img - q1_box_per_img
upper_box_per_img = q3_box_per_img + 1.5 * iqr_box_per_img
print(f'Outlier threshold for box per image: {upper_box_per_img}')

In [None]:
# Get image ids for removing.
# box_per_img is indexed by image file name, so translate each outlier file
# name to its real image id through the annotation file. Slicing digits out
# of the name only works for 4-digit ids followed by a 3-character extension.
file_nm_2_id = {img['file_name']: img['id'] for img in data_train['images']}
out_file_nms = box_per_img[box_per_img > upper_box_per_img].index
out_box_per_img_list = sorted(file_nm_2_id[file_nm] for file_nm in out_file_nms)
out_box_per_img_list[:10]

In [None]:
# Generate an outlier-removed dataset: drop the images whose bbox count is
# above the threshold, together with their annotations
data_train_v6 = remove_images(data_train, out_box_per_img_list)

In [None]:
# Print the signature (and docstring, if any) of save_json before using it
help(save_json)

In [None]:
# Save new dataset version 6 as 'cv_train_1_v6.json'; no directory argument is
# passed, so the file lands in the '../../dataset' directory
save_json(data_train_v6, 'cv_train_1_v6.json')