In [84]:
import os
import json 
import numpy as np 
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold 
from collections import Counter


In [85]:
# Annotation file that the cells below split into cross-validation folds.
# Alternative input kept for reference: './my_data/train_01.json'
annotation: str = "/opt/ml/detection/dataset/train.json"

In [96]:
# Split the annotation file into 5 stratified, image-grouped folds and write
# one train/validation JSON pair per fold (cv_train_<k>.json / cv_val_<k>.json,
# with k starting at 1).
with open(annotation) as f:
    data = json.load(f)

# Last '_'-separated token of the file stem ('01' for 'train_01.json', 'train'
# for 'train.json'); only the commented-out output path below uses it.
type_ = annotation.split('/')[-1].split('.')[0].split('_')[-1]

annotations = data['annotations']

# One (image_id, category_id) pair per annotation.
var = [(ann['image_id'], ann['category_id']) for ann in annotations]

# The splitter only needs the number of samples, so X is a dummy matrix.
X = np.ones((len(annotations), 1))
y = np.array([v[1] for v in var])        # stratification target: category id
groups = np.array([v[0] for v in var])   # grouping key: image id

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=411)

# Image ids of every fold, one set per fold (train side / validation side).
TRAIN_X = []
VALID_X = []

# path = f'./stratified_kfold/under_{type_}/'
path = './basic_v2/'
# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before the first fold is written.
os.makedirs(path, exist_ok=True)

for idx, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):

    print("TRAIN:", groups[train_idx])
    print(" ", y[train_idx])
    print(" TEST:", groups[val_idx])
    print(" ", y[val_idx])

    # Sets give O(1) membership tests when filtering the image list below.
    train_img_id = set(groups[train_idx].tolist())
    valid_img_id = set(groups[val_idx].tolist())
    TRAIN_X.append(train_img_id)
    VALID_X.append(valid_img_id)

    # Select the annotations of this fold, preserving their original order.
    train_annotations = [annotations[i] for i in train_idx]
    valid_annotations = [annotations[i] for i in val_idx]

    # NOTE: the id sets are derived from the annotations, so an image that has
    # no annotation at all ends up in neither the train nor the validation file.
    train_images = [img for img in data['images'] if img['id'] in train_img_id]
    valid_images = [img for img in data['images'] if img['id'] in valid_img_id]

    # Shallow copies are sufficient: only the two top-level keys are replaced,
    # every other entry of the source file is carried over unchanged.
    train_dict = data.copy()
    train_dict['images'] = train_images
    train_dict['annotations'] = train_annotations

    valid_dict = data.copy()
    valid_dict['images'] = valid_images
    valid_dict['annotations'] = valid_annotations

    train_dir = os.path.join(path, f'cv_train_{idx + 1}.json')
    val_dir = os.path.join(path, f'cv_val_{idx + 1}.json')

    with open(train_dir, 'w') as train_file:
        json.dump(train_dict, train_file)

    with open(val_dir, 'w') as val_file:
        json.dump(valid_dict, val_file)

TRAIN: [   0    1    1 ... 4881 4881 4881]
  [0 3 7 ... 7 1 7]
 TEST: [   6   13   13 ... 4882 4882 4882]
  [1 6 7 ... 0 1 1]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   5    5    5 ... 4876 4876 4878]
  [7 0 0 ... 0 2 0]
TRAIN: [   0    3    3 ... 4882 4882 4882]
  [0 2 6 ... 0 1 1]
 TEST: [   1    1    1 ... 4877 4877 4880]
  [3 7 4 ... 7 7 0]
TRAIN: [   1    1    1 ... 4882 4882 4882]
  [3 7 4 ... 0 1 1]
 TEST: [   0    3    3 ... 4881 4881 4881]
  [0 2 6 ... 7 1 7]
TRAIN: [   0    1    1 ... 4882 4882 4882]
  [0 3 7 ... 0 1 1]
 TEST: [   4    4    4 ... 4868 4872 4872]
  [1 1 1 ... 2 4 6]


In [92]:
# print(len(set(TRAIN_X[0]))+len(set(VALID_X[0])))

In [63]:
# check distribution 

def get_distribution(y, n_classes=None):
    """Return the share of each class id in ``y`` as percentage strings.

    Args:
        y: 1-D sequence or array of non-negative integer class ids.
        n_classes: Number of classes to report. Defaults to ``max(y) + 1``
            (or 0 for empty input). Pass it explicitly when comparing subsets
            so that a subset missing the highest class id still yields a row
            of the same length as the others.

    Returns:
        A list of ``n_classes`` strings formatted like ``'12.34%'``; entry
        ``i`` is the fraction of elements of ``y`` equal to ``i``. Classes
        that do not occur, and every class of an empty ``y``, are reported
        as ``'0.00%'``.
    """
    labels = np.asarray(y).ravel()
    total = labels.size

    if n_classes is None:
        n_classes = int(labels.max()) + 1 if total else 0

    # Count on plain Python values so lookups by ``int`` below always match.
    y_distr = Counter(labels.tolist())

    if total == 0:
        # Avoid 0/0: an empty selection has no mass in any class.
        return [f'{0:.2%}' for _ in range(n_classes)]

    return [f'{y_distr[i] / total:.2%}' for i in range(n_classes)]



# Build a table with the class distribution of the full annotation set and of
# the train / validation side of every fold, to check that stratification kept
# the class proportions close to the overall ones.
n_classes = int(np.max(y)) + 1


def _distribution_row(labels):
    """Distribution of ``labels`` padded with '0.00%' up to ``n_classes`` entries."""
    row = get_distribution(labels)
    # A subset without the highest class id produces a shorter row; the
    # missing trailing classes have a count of zero.
    return row + ['0.00%'] * (n_classes - len(row))


distrs = [_distribution_row(y)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_y, val_y = y[train_idx], y[val_idx]
    train_gr, val_gr = groups[train_idx], groups[val_idx]

    # A group (image id) must never appear on both sides of the same fold.
    assert set(train_gr.tolist()).isdisjoint(val_gr.tolist()), \
        f'fold {fold_ind}: groups shared between train and validation'

    distrs.append(_distribution_row(train_y))
    distrs.append(_distribution_row(val_y))
    index.append(f'train - fold{fold_ind}')
    index.append(f'val - fold{fold_ind}')

categories = [d['name'] for d in data['categories']]
# Resolve column names by category id rather than by list position, so the
# labels stay correct if the categories are not stored in id order. Entries
# without an 'id' fall back to their position; ids without an entry fall back
# to the id itself.
category_names = {d.get('id', pos): d['name'] for pos, d in enumerate(data['categories'])}
pd.DataFrame(distrs, index=index, columns=[category_names.get(i, str(i)) for i in range(n_classes)])

Unnamed: 0,General trash,Paper,Paper pack,Metal,Glass,Plastic,Styrofoam,Plastic bag,Battery,Clothing
training set,16.91%,27.40%,3.91%,4.02%,4.28%,12.71%,5.51%,22.53%,0.70%,2.04%
train - fold0,16.65%,27.65%,3.98%,4.12%,4.13%,12.67%,5.60%,22.53%,0.68%,2.00%
val - fold0,18.07%,26.33%,3.58%,3.54%,4.91%,12.89%,5.13%,22.53%,0.78%,2.23%
train - fold1,17.03%,27.30%,4.01%,4.07%,4.30%,12.81%,5.06%,22.77%,0.71%,1.93%
val - fold1,16.40%,27.82%,3.50%,3.81%,4.18%,12.28%,7.32%,21.53%,0.66%,2.50%
train - fold2,16.95%,27.21%,3.94%,4.12%,4.38%,12.44%,5.44%,22.69%,0.78%,2.04%
val - fold2,16.73%,28.15%,3.76%,3.60%,3.86%,13.75%,5.81%,21.89%,0.39%,2.07%
train - fold3,16.85%,27.29%,3.79%,4.15%,4.41%,12.58%,5.71%,22.38%,0.62%,2.22%
val - fold3,17.13%,27.86%,4.35%,3.48%,3.78%,13.20%,4.76%,23.10%,1.00%,1.34%
train - fold4,17.06%,27.56%,3.81%,3.61%,4.17%,13.04%,5.76%,22.26%,0.71%,2.03%
