In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from pycocotools.coco import COCO
from sklearn.model_selection import StratifiedGroupKFold

In [None]:
# Read the raw annotation JSON into a plain dict; later cells index it by
# 'info', 'licenses', 'images', 'categories' and 'annotations'.
with open('dataset/train.json', mode='r') as annotation_file:
    train = json.load(annotation_file)

In [None]:
# Show only the size of each top-level section; echoing the whole dict would
# print every image and annotation record into the cell output.
{key: len(value) if isinstance(value, (list, dict)) else value for key, value in train.items()}

### Image 단위 class distribution을 똑같이 나누는 cv strategy

In [None]:
coco = COCO('./dataset/train.json')

# Display names looked up by category_id (used as a list position below).
classes = ["General trash", "Paper", "Paper pack", "Metal", "Glass",
           "Plastic", "Styrofoam", "Plastic bag", "Battery", "Clothing"]

# One record per bounding box, visiting images in coco.getImgIds() order and,
# within an image, annotations in coco.getAnnIds() order.
box_records = []
for img_id in coco.getImgIds():
    img_meta = coco.loadImgs(img_id)[0]
    img_anns = coco.loadAnns(coco.getAnnIds(imgIds=img_meta['id']))

    for box in img_anns:
        # bbox is read as [x, y, width, height]; corners are derived from it.
        left, top, width, height = map(float, box['bbox'][:4])
        box_records.append({
            'image_id': img_meta['file_name'],  # note: holds the file name, not the numeric id
            'class_name': classes[box['category_id']],
            'class_id': box['category_id'],
            'x_min': left,
            'y_min': top,
            'x_max': left + width,
            'y_max': top + height,
        })

train_df = pd.DataFrame(
    box_records,
    columns=['image_id', 'class_name', 'class_id', 'x_min', 'y_min', 'x_max', 'y_max'],
)

In [None]:
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=2022)

# One (image_id, category_id) pair per annotation, in train['annotations'] order.
var = [(ann['image_id'], ann['category_id']) for ann in train['annotations']]

# X is only a placeholder of the right length; the split is driven by
# y (label to stratify on) and groups (image the annotation belongs to).
X = np.ones((len(var), ))
groups = np.array([image_id for image_id, _ in var])
y = np.array([category_id for _, category_id in var])

# Print the group ids and labels that land on each side of every fold.
for train_idx, test_idx in cv.split(X, y, groups):
    for side_label, side_idx in (("TRAIN:", train_idx), ("TEST :", test_idx)):
        print(side_label, groups[side_idx])
        print("      ", y[side_idx])

In [None]:
# Fresh splitter with the same settings (5 folds, shuffled, seed 2022) used by
# the other split cells in this notebook.
cv = StratifiedGroupKFold(
    n_splits=5,
    shuffle=True,
    random_state=2022,
)

# check distribution
def get_distribution(y, n_classes=None):
    """Return the share of each integer label in ``y`` as percentage strings.

    Args:
        y: 1-D sequence or array of integer labels. Shares are reported for
            the labels ``0 .. n_classes - 1``.
        n_classes: Number of label slots to report. When omitted it is
            ``max(y) + 1``, so the result length follows the largest label
            present in ``y``. Pass it explicitly to get rows of equal length
            for subsets that may lack the highest labels.

    Returns:
        A list of strings where element ``i`` is the fraction of entries equal
        to ``i``, formatted with ``.2%`` (e.g. ``'25.00%'``). An empty ``y``
        yields ``'0.00%'`` for every requested slot (an empty list when
        ``n_classes`` is omitted).
    """
    y_distr = Counter(y)
    y_vals_sum = sum(y_distr.values())

    if n_classes is None:
        # Default: one slot per label up to the largest label present.
        n_classes = int(np.max(y)) + 1 if y_vals_sum else 0

    if y_vals_sum == 0:
        # Empty input: np.max would raise and the division below would be 0/0.
        return [f'{0.0:.2%}'] * n_classes

    return [f'{y_distr[i] / y_vals_sum:.2%}' for i in range(n_classes)]
    
# First row: label distribution over every annotation. Each fold then
# contributes one row for its train side and one for its validation side.
distrs = [get_distribution(y)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_gr, val_gr = groups[train_idx], groups[val_idx]

    # A group (image) must never appear on both sides of a fold.
    assert len(set(train_gr) & set(val_gr)) == 0

    train_y, val_y = y[train_idx], y[val_idx]
    distrs += [get_distribution(train_y), get_distribution(val_y)]
    index += [f'train - fold{fold_ind}', f'val - fold{fold_ind}']

# Column headers: category names taken by position from the annotation file.
categories = [d['name'] for d in train['categories']]
column_names = [categories[i] for i in range(np.max(y) + 1)]

pd.DataFrame(distrs, index=index, columns=column_names)

In [None]:
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=2022)

# Write one train/valid annotation file pair per fold.
for idx, (train_idx, valid_idx) in enumerate(cv.split(X, y, groups)):
    # Build the lookup sets once per fold. Previously set(groups[...]) was
    # rebuilt for every image and `pos in train_idx` scanned a NumPy array for
    # every annotation, which made each fold quadratic.
    train_image_ids = set(groups[train_idx].tolist())
    valid_image_ids = set(groups[valid_idx].tolist())
    # X / y / groups were built in train['annotations'] order, so the split
    # indices are positions in that list.
    train_ann_pos = set(train_idx.tolist())
    valid_ann_pos = set(valid_idx.tolist())

    train_fold = {
        'info': train['info'],
        'licenses': train['licenses'],
        'images': [image for image in train['images'] if image['id'] in train_image_ids],
        'categories': train['categories'],
        'annotations': [ann for pos, ann in enumerate(train['annotations']) if pos in train_ann_pos],
    }

    valid_fold = {
        'info': train['info'],
        'licenses': train['licenses'],
        'images': [image for image in train['images'] if image['id'] in valid_image_ids],
        'categories': train['categories'],
        'annotations': [ann for pos, ann in enumerate(train['annotations']) if pos in valid_ann_pos],
    }

    with open(f'train_f{idx}.json', 'w') as f:
        json.dump(train_fold, f, indent=4)

    with open(f'valid_f{idx}.json', 'w') as f:
        json.dump(valid_fold, f, indent=4)

### Image 단위 class distribution과 bbox aspect ratio를 똑같이 나누는 cv strategy

In [None]:
# NOTE(review): the fold-writing cell earlier in this notebook saves
# train_f{idx}.json to the working directory, while this reads from
# ./dataset/ — confirm the file was moved there.
coco = COCO('./dataset/train_f0.json')

# Display names looked up by category_id (used as a list position below).
classes = ["General trash", "Paper", "Paper pack", "Metal", "Glass",
           "Plastic", "Styrofoam", "Plastic bag", "Battery", "Clothing"]

# One record per bounding box, visiting images in coco.getImgIds() order and,
# within an image, annotations in coco.getAnnIds() order.
box_records = []
for img_id in coco.getImgIds():
    img_meta = coco.loadImgs(img_id)[0]
    img_anns = coco.loadAnns(coco.getAnnIds(imgIds=img_meta['id']))

    for box in img_anns:
        # bbox is read as [x, y, width, height]; corners are derived from it.
        left, top, width, height = map(float, box['bbox'][:4])
        box_records.append({
            'image_id': img_meta['file_name'],  # note: holds the file name, not the numeric id
            'class_name': classes[box['category_id']],
            'class_id': box['category_id'],
            'x_min': left,
            'y_min': top,
            'x_max': left + width,
            'y_max': top + height,
        })

train_df = pd.DataFrame(
    box_records,
    columns=['image_id', 'class_name', 'class_id', 'x_min', 'y_min', 'x_max', 'y_max'],
)

In [None]:
# Box width and height recovered from the corner columns.
box_w = train_df['x_max'] - train_df['x_min']
box_h = train_df['y_max'] - train_df['y_min']

# Width-to-height ratio of each box.
train_df['wh_ratio'] = box_w / box_h
# Despite its name, 'aspect_ratio' is the box area divided by 1024*1024
# (presumably the image area in pixels — confirm).
train_df['aspect_ratio'] = box_w * box_h / (1024*1024)

In [None]:
# Bucket each box by its normalised area: below 0.01 is 'small', below 0.05 is
# 'medium', everything else is 'large'.
aspect = [
    'small' if area < 0.01 else ('medium' if area < 0.05 else 'large')
    for area in train_df['aspect_ratio']
]
train_df['aspect'] = aspect

In [None]:
# Combined stratification key, e.g. '<class name>-<size bucket>'.
train_df['want'] = train_df['class_name'] + '-' + train_df['aspect']

# Map every distinct key to an integer code, numbered in sorted key order.
wd = {want_key: code for code, want_key in enumerate(sorted(train_df['want'].unique()))}

# Numeric image key parsed from characters 6..9 of the file name
# (assumes names shaped like 'train/0001.jpg' — TODO confirm).
imgint = [int(file_name[6:10]) for file_name in train_df['image_id']]
# Integer code of each row's stratification key.
categoryint = [wd[want_key] for want_key in train_df['want']]

train_df['image_int'] = imgint
train_df['want_int'] = categoryint

train_df

In [None]:
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=2022)

# Per-box group key (numeric image key) and stratification label
# (class-and-size code), in train_df row order.
groups = train_df['image_int'].to_numpy(copy=True)
y = train_df['want_int'].to_numpy(copy=True)
var = list(zip(groups, y))

# X is only a placeholder of the right length; the split is driven by y and groups.
X = np.ones((len(var), ))

# Print the group ids and labels that land on each side of every fold.
for train_idx, test_idx in cv.split(X, y, groups):
    for side_label, side_idx in (("TRAIN:", train_idx), ("TEST :", test_idx)):
        print(side_label, groups[side_idx])
        print("      ", y[side_idx])

In [None]:
# Fresh splitter with the same settings (5 folds, shuffled, seed 2022) used by
# the other split cells in this notebook.
cv = StratifiedGroupKFold(
    n_splits=5,
    shuffle=True,
    random_state=2022,
)

# check distribution
def get_distribution(y, n_classes=None):
    """Return the share of each integer label in ``y`` as percentage strings.

    Args:
        y: 1-D sequence or array of integer labels. Shares are reported for
            the labels ``0 .. n_classes - 1``.
        n_classes: Number of label slots to report. When omitted it is
            ``max(y) + 1``, so the result length follows the largest label
            present in ``y``. Pass it explicitly to get rows of equal length
            for subsets that may lack the highest labels.

    Returns:
        A list of strings where element ``i`` is the fraction of entries equal
        to ``i``, formatted with ``.2%`` (e.g. ``'25.00%'``). An empty ``y``
        yields ``'0.00%'`` for every requested slot (an empty list when
        ``n_classes`` is omitted).
    """
    y_distr = Counter(y)
    y_vals_sum = sum(y_distr.values())

    if n_classes is None:
        # Default: one slot per label up to the largest label present.
        n_classes = int(np.max(y)) + 1 if y_vals_sum else 0

    if y_vals_sum == 0:
        # Empty input: np.max would raise and the division below would be 0/0.
        return [f'{0.0:.2%}'] * n_classes

    return [f'{y_distr[i] / y_vals_sum:.2%}' for i in range(n_classes)]
    
# First row: label distribution over every box. Each fold then contributes
# one row for its train side and one for its validation side.
distrs = [get_distribution(y)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_gr, val_gr = groups[train_idx], groups[val_idx]

    # A group (image) must never appear on both sides of a fold.
    assert len(set(train_gr) & set(val_gr)) == 0

    train_y, val_y = y[train_idx], y[val_idx]
    distrs += [get_distribution(train_y), get_distribution(val_y)]
    index += [f'train - fold{fold_ind}', f'val - fold{fold_ind}']

# Column headers: the class-and-size keys of wd, in insertion order.
categories = list(wd.keys())
column_names = [categories[i] for i in range(np.max(y) + 1)]

pd.DataFrame(distrs, index=index, columns=column_names)

In [None]:
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=2022)

# Write one train/valid annotation file pair per fold of the class-and-size
# stratified split.
for idx, (train_idx, valid_idx) in enumerate(cv.split(X, y, groups)):
    # X / y / groups were built from train_df rows here, so train_idx and
    # valid_idx are row positions in train_df, NOT positions in
    # train['annotations']. Using them to index that list picked unrelated
    # annotations. Membership is therefore decided by image id for both the
    # images and the annotations.
    # Assumes the key parsed from the file name (groups) equals the COCO image
    # id; the image filter already relied on this — confirm.
    train_image_ids = set(groups[train_idx].tolist())
    valid_image_ids = set(groups[valid_idx].tolist())

    train_fold = {
        'info': train['info'],
        'licenses': train['licenses'],
        'images': [image for image in train['images'] if image['id'] in train_image_ids],
        'categories': train['categories'],
        'annotations': [ann for ann in train['annotations'] if ann['image_id'] in train_image_ids],
    }

    valid_fold = {
        'info': train['info'],
        'licenses': train['licenses'],
        'images': [image for image in train['images'] if image['id'] in valid_image_ids],
        'categories': train['categories'],
        'annotations': [ann for ann in train['annotations'] if ann['image_id'] in valid_image_ids],
    }

    with open(f'train_newf{idx}.json', 'w') as f:
        json.dump(train_fold, f, indent=4)

    with open(f'valid_newf{idx}.json', 'w') as f:
        json.dump(valid_fold, f, indent=4)