In [10]:
import json
import pandas as pd
import numpy as np
from copy import copy

# Read the full annotation file once; everything below is derived from it.
with open("../dataset/train.json", "r") as f:
    train = json.load(f)

# Shallow copies: both start out sharing every value with `train`; only the
# top-level keys that are reassigned later will differ between the two.
split_train, split_valid = copy(train), copy(train)

# One row per annotation record, for tabular inspection and counting.
df = pd.DataFrame(train['annotations'])

In [11]:
# Preview the first ten annotation rows to check which columns were loaded.
df.head(10)

Unnamed: 0,image_id,category_id,area,bbox,iscrowd,id
0,0,0,257301.66,"[197.6, 193.7, 547.8, 469.7]",0,0
1,1,3,10402.56,"[0.0, 407.4, 57.6, 180.6]",0,1
2,1,7,26259.36,"[0.0, 455.6, 144.6, 181.6]",0,2
3,1,4,69096.17,"[722.3, 313.4, 274.3, 251.9]",0,3
4,1,5,24164.58,"[353.2, 671.0, 233.7, 103.4]",0,4
5,1,5,188324.4,"[3.7, 448.5, 778.2, 242.0]",0,5
6,1,0,38908.72,"[425.3, 681.9, 216.4, 179.8]",0,6
7,1,7,7391.52,"[92.4, 601.7, 139.2, 53.1]",0,7
8,1,0,6857.76,"[622.4, 686.5, 72.8, 94.2]",0,8
9,2,3,324010.8,"[267.9, 165.2, 631.6, 513.0]",0,9


In [12]:
# Number of object categories. The split cell allocates its running totals
# with the same length, so the two values must stay in sync.
NUM_CLASSES = 10

# One row per image listed in the annotation file (images without any
# annotation keep an all-zero count vector), ordered by image id. Looking the
# row up through a dict avoids assuming that ids are exactly 0..N-1.
image_ids = sorted(image['id'] for image in train['images'])
row_of_image = {image_id: row for row, image_id in enumerate(image_ids)}

# Build the (n_images, 2) object array explicitly:
#   column 0 -> image id
#   column 1 -> list with the number of objects of each category in that image
# Calling np.array() on ragged (int, list) pairs relies on implicit object-array
# creation, which raises ValueError on NumPy >= 1.24.
class_per_image = np.empty((len(image_ids), 2), dtype=object)
for row, image_id in enumerate(image_ids):
    class_per_image[row, 0] = image_id
    class_per_image[row, 1] = [0] * NUM_CLASSES

# Tally every annotation into the count vector of the image it belongs to.
for image_id, category_id in zip(df['image_id'], df['category_id']):
    class_per_image[row_of_image[image_id], 1][category_id] += 1

# Fixed seed so the visiting order (and hence the resulting split) is reproducible.
# Shuffling a 2-D array permutes whole rows, keeping each id with its counts.
np.random.seed(2021)
np.random.shuffle(class_per_image)
class_per_image

  


array([[2826, list([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])],
       [3835, list([0, 1, 0, 0, 0, 0, 6, 3, 0, 0])],
       [1789, list([0, 2, 0, 0, 0, 0, 1, 5, 0, 0])],
       ...,
       [1152, list([1, 8, 0, 0, 0, 0, 0, 0, 0, 0])],
       [3413, list([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])],
       [1140, list([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])]], dtype=object)

In [13]:
# Running per-class object totals and the image ids assigned to each side.
train_classes = np.zeros((10))
train_indices = []
valid_classes = np.zeros((10))
valid_indices = []

ratio = 0.2


def _split_imbalance(train_counts, valid_counts):
    """Mean squared gap between the train counts and the valid counts divided by `ratio`."""
    return ((train_counts - valid_counts / ratio) ** 2).mean(axis=0)


for image_id, classes in (class_per_image):
    # Random forcing: without it, an image that contains many objects makes the
    # mse jump when it is placed in the validation set, so the greedy rule tends
    # to send such images to the train set. A fixed share of images is therefore
    # sent to validation regardless of the cost comparison.
    # Exactly one random draw per image, taken before anything else.
    forced_to_valid = np.random.random() < 0.15

    goes_to_train = False
    if not forced_to_valid:
        # Greedy rule: try the image on each side and keep the cheaper outcome.
        cost_as_train = _split_imbalance(train_classes + classes, valid_classes)
        cost_as_valid = _split_imbalance(train_classes, valid_classes + classes)
        goes_to_train = cost_as_train < cost_as_valid

    if goes_to_train:
        train_indices.append(int(image_id))
        train_classes += classes
    else:
        valid_indices.append(int(image_id))
        valid_classes += classes

# Summary of the resulting split.
separator = '-'*60
print("Train images :" , len(train_indices))
print("Validation images :" , len(valid_indices))
print(separator)
print("Distribution of classes in Train dataset")
print(train_classes)
print(separator)
print("Distribution of classes in Validation dataset")
print(valid_classes)

Train images : 3812
Validation images : 1071
------------------------------------------------------------
Distribution of classes in Train dataset
[3305. 5193.  747.  744.  757. 2405. 1002. 4312.  125.  389.]
------------------------------------------------------------
Distribution of classes in Validation dataset
[ 661. 1159.  150.  192.  225.  538.  261.  866.   34.   79.]


In [14]:
# Store the images / annotations that belong to train and valid separately.
#
# Membership is tested against a set: testing against the `train_indices` list
# costs a linear scan for every image and every annotation.
train_id_set = set(train_indices)

# Read from the untouched `train` dict rather than from `split_train`, whose
# 'images' / 'annotations' entries are overwritten below. On the first run
# both hold the same lists, but reading from `train` keeps this cell
# idempotent: re-running it no longer filters an already-filtered train
# subset (which would leave the validation split empty).
# Anything that is not assigned to train goes to valid.
train_images = [image for image in train['images'] if image['id'] in train_id_set]
valid_images = [image for image in train['images'] if image['id'] not in train_id_set]
train_ann = [ann for ann in train['annotations'] if ann['image_id'] in train_id_set]
valid_ann = [ann for ann in train['annotations'] if ann['image_id'] not in train_id_set]

split_train['images'] = train_images
split_train['annotations'] = train_ann
split_valid['images'] = valid_images
split_valid['annotations'] = valid_ann

# Displayed as the cell output: (number of train images, number of valid images).
len(split_train['images']), len(split_valid['images'])

(3812, 1071)

In [15]:
# Save: write each split to its own JSON file, train first, then valid.
for out_path, payload in (
    ("../dataset/split_train_v2.json", split_train),
    ("../dataset/split_valid_v2.json", split_valid),
):
    with open(out_path, "w") as f:
        json.dump(payload, f)