# 1. Import libraries

In [81]:
import os
import json
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from pycocotools.coco import COCO

# 2. Check for duplicate images between train and val splits

In [179]:
# Directory holding the per-fold COCO JSON files read below
# (fold_<i>_train.json / fold_<i>_val.json).
BASE_DIR = './kfold/'

def _annotated_image_ids(json_path):
    """Return the set of image ids referenced by the annotations in one COCO file."""
    coco = COCO(json_path)
    # loadAnns accepts a list of ids, so a single call covers every annotation.
    return {ann['image_id'] for ann in coco.loadAnns(coco.getAnnIds())}


def _share_percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return 100 * part / whole if whole else 0.0


def check_duplicate_images(fold: int):
    """Report, for each fold, whether the train and val splits share an image.

    For every fold index in ``range(fold)`` this loads
    ``fold_<i>_train.json`` and ``fold_<i>_val.json`` from ``BASE_DIR`` with
    pycocotools, collects the ``image_id`` of every annotation in each split,
    and prints whether the two id sets intersect, followed by the number of
    images per split and each split's share of the combined count.

    Only images referenced by at least one annotation are counted; image
    entries without annotations are not inspected.

    Args:
        fold: Number of folds to check (fold indices ``0 .. fold - 1``).

    Returns:
        None. The report is printed.
    """
    for i in range(fold):
        train_imgs = _annotated_image_ids(
            os.path.join(BASE_DIR, 'fold_' + str(i) + '_train.json'))
        val_imgs = _annotated_image_ids(
            os.path.join(BASE_DIR, 'fold_' + str(i) + '_val.json'))

        train_imgs_num = len(train_imgs)
        val_imgs_num = len(val_imgs)
        # Sum of the two split sizes; an image present in both splits is
        # counted twice here, exactly as in the per-split percentages.
        total_imgs_num = train_imgs_num + val_imgs_num

        # Any common image id means the fold leaks validation images into training.
        verdict = 'Fail' if train_imgs & val_imgs else 'Success!'

        print('\n'f'üéàCheck duplicate fold {i} images: {verdict}''\n')
        print(f'‚ú®train imgs: {train_imgs_num}Í∞ú ({_share_percent(train_imgs_num, total_imgs_num):.2f}%)')
        print(f'‚ú®val imgs: {val_imgs_num}Í∞ú ({_share_percent(val_imgs_num, total_imgs_num):.2f}%)')
        print(f'‚ú®total imgs: {total_imgs_num}Í∞ú''\n')

# 3. Run

In [180]:
# Check folds 0-4 for images that appear in both the train and the val split.
check_duplicate_images(fold=5)

loading annotations into memory...
Done (t=0.08s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!

üéàCheck duplicate fold 0 images: Success!

‚ú®train imgs: 3908Í∞ú (80.03%)
‚ú®val imgs: 975Í∞ú (19.97%)
‚ú®total imgs: 4883Í∞ú

loading annotations into memory...
Done (t=0.07s)
creating index...
index created!
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!

üéàCheck duplicate fold 1 images: Success!

‚ú®train imgs: 3907Í∞ú (80.01%)
‚ú®val imgs: 976Í∞ú (19.99%)
‚ú®total imgs: 4883Í∞ú

loading annotations into memory...
Done (t=0.06s)
creating index...
index created!
loading annotations into memory...
Done (t=0.02s)
creating index...
index created!

üéàCheck duplicate fold 2 images: Success!

‚ú®train imgs: 3903Í∞ú (79.93%)
‚ú®val imgs: 980Í∞ú (20.07%)
‚ú®total imgs: 4883Í∞ú

loading annotations into memory...
Done (t=0.06s)
creating index...
index created!
loading annotations into m

# 4. Check class distribution

In [176]:
# Directory holding the per-fold COCO JSON files (fold_<i>_train.json / fold_<i>_val.json).
BASE_DIR = './kfold/'
# Annotation file the class names are read from (presumably the unsplit training set).
ann_dir = './dataset/train.json'

# JSON text is UTF-8; pass the encoding explicitly so the platform default is never used.
with open(ann_dir, encoding='utf-8') as f:
    _annotations = json.load(f)

# Class names in the order the file lists its categories. Later cells index
# this list with an annotation's category_id.
# NOTE(review): assumes category ids are 0..N-1 in listing order - confirm.
classes = [d['name'] for d in _annotations['categories']]

def get_distribution(x: list, y: int):
    """Divide every element of ``x`` by ``y``.

    Args:
        x: Values to scale (for example per-class counts).
        y: Divisor applied to each value (for example the total count).

    Returns:
        A new list with ``value / y`` for each value of ``x``, in order.

    Raises:
        ZeroDivisionError: If ``y`` is 0 and ``x`` is not empty.
    """
    return [value / y for value in x]

def _load_class_counts(json_path):
    """Count the annotations per class name in one COCO-style JSON file."""
    with open(json_path, encoding='utf-8') as f:
        annotations = json.load(f)['annotations']
    # Translate each category_id to its class name via the module-level list.
    return Counter(classes[ann['category_id']] for ann in annotations)


def _format_share(count, total):
    """Format ``count / total`` as a two-decimal percentage string."""
    if not total:
        # A split without annotations has no meaningful share; report zero
        # instead of raising ZeroDivisionError.
        return '0.00%'
    return f'{100 * count / total:.2f}%'


def check_class_distribution(fold):
    """Tabulate the per-class share of annotations for every fold split.

    For every fold index in ``range(fold)`` this reads
    ``fold_<i>_train.json`` and ``fold_<i>_val.json`` from ``BASE_DIR``,
    counts the annotations per class and expresses each count as a
    percentage of that split's annotation total.

    Every class seen in any split gets a column, so a class that occurs only
    in a val split, or only in a later fold, is still reported. Classes
    absent from a split show ``'0.00%'``.

    Args:
        fold: Number of folds to read (fold indices ``0 .. fold - 1``).

    Returns:
        pandas.DataFrame with one row per split (``Fold_<i>_Train`` /
        ``Fold_<i>_Val``) and one column per class name, holding percentage
        strings such as ``'17.48%'``. Columns are in first-seen order.
    """
    split_counts = {}
    # Used as an insertion-ordered set of class names.
    class_order = {}

    for i in range(fold):
        for suffix, label in (('train', 'Train'), ('val', 'Val')):
            json_path = os.path.join(BASE_DIR, 'fold_' + str(i) + '_' + suffix + '.json')
            counts = _load_class_counts(json_path)
            split_counts[f'Fold_{i}_{label}'] = counts
            class_order.update(dict.fromkeys(counts))

    rows = []
    for counts in split_counts.values():
        total = sum(counts.values())
        # Counter returns 0 for classes missing from this split.
        rows.append({name: _format_share(counts[name], total) for name in class_order})

    # Build the frame once, with explicit row and column order.
    return pd.DataFrame(rows, index=list(split_counts), columns=list(class_order))

In [177]:
# Per-class annotation share (%) of the train and val splits for folds 0-4.
check_class_distribution(fold = 5)

Unnamed: 0,General trash,Metal,Plastic bag,Glass,Plastic,Paper pack,Styrofoam,Paper,Clothing,Battery
Fold_0_Train,17.48%,3.93%,22.57%,4.38%,12.46%,3.76%,5.44%,27.48%,1.80%,0.70%
Fold_0_Val,15.66%,4.54%,21.52%,3.64%,13.81%,4.38%,5.54%,27.28%,2.98%,0.65%
Fold_1_Train,17.38%,4.37%,22.28%,4.03%,12.95%,3.96%,5.65%,26.57%,2.19%,0.63%
Fold_1_Val,16.28%,2.89%,22.70%,4.99%,11.90%,3.59%,4.77%,30.56%,1.43%,0.88%
Fold_2_Train,17.05%,4.07%,22.74%,4.07%,13.04%,3.98%,5.37%,27.15%,1.95%,0.57%
Fold_2_Val,17.49%,3.93%,20.93%,4.91%,11.43%,3.48%,5.80%,28.60%,2.29%,1.13%
Fold_3_Train,17.06%,3.83%,22.27%,4.40%,12.38%,3.86%,5.23%,28.15%,2.08%,0.74%
Fold_3_Val,17.46%,4.90%,22.77%,3.61%,14.09%,3.94%,6.37%,24.59%,1.79%,0.48%
Fold_4_Train,16.73%,4.03%,22.00%,4.32%,12.76%,3.83%,5.60%,27.85%,2.10%,0.79%
Fold_4_Val,18.86%,4.12%,23.95%,3.92%,12.53%,4.05%,4.87%,25.75%,1.71%,0.25%


# ...

In [178]:
# Peek at one raw annotation record from the fold-0 train split.
coco_train = COCO('./kfold/fold_0_train.json')
# Despite the singular name, this is the list of all annotation ids in the file.
ann_id = coco_train.getAnnIds()
# loadAnns returns a list even for a single id, hence the [0] below.
# NOTE: despite its name, train_img_id holds annotation dicts, not an image id.
train_img_id = coco_train.loadAnns(ann_id[0])
train_img_id[0]

loading annotations into memory...
Done (t=0.07s)
creating index...
index created!


{'image_id': 0,
 'category_id': 0,
 'area': 257301.66,
 'bbox': [197.6, 193.7, 547.8, 469.7],
 'iscrowd': 0,
 'id': 0}

In [87]:
# Class name of every annotation in the loaded annotation file, in file order.
class_count = [classes[ann['category_id']] for ann in _annotations['annotations']]

# Number of annotations per class.
class_total = Counter(class_count)
class_total

Counter({'Paper': 6352,
         'Plastic bag': 5178,
         'General trash': 3966,
         'Plastic': 2943,
         'Styrofoam': 1263,
         'Glass': 982,
         'Metal': 936,
         'Paper pack': 897,
         'Clothing': 468,
         'Battery': 159})

In [71]:
# Total number of annotations in ./dataset/train.json.
len(COCO('./dataset/train.json').getAnnIds())

loading annotations into memory...
Done (t=0.13s)
creating index...
index created!


23144

In [62]:
# Annotation file the class names are read from.
ann_dir = './dataset/train.json'

# JSON text is UTF-8; pass the encoding explicitly so the platform default is never used.
with open(ann_dir, encoding='utf-8') as f:
    _annotations = json.load(f)

# Class names in the order the file lists its categories.
classes = [d['name'] for d in _annotations['categories']]
# Bare expression so the cell displays the list, matching the recorded output.
classes


['General trash',
 'Paper',
 'Paper pack',
 'Metal',
 'Glass',
 'Plastic',
 'Styrofoam',
 'Plastic bag',
 'Battery',
 'Clothing']