In [16]:
from map_boxes import mean_average_precision_for_boxes
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from pycocotools.coco import COCO
from collections import defaultdict

In [17]:
def stratified_split(coco, index, IsValid, k=5, seed=777):
    """Split the images of a COCO dataset into ``k`` label-stratified folds.

    Images act as groups: every annotation of an image lands in the same fold.
    Images are assigned greedily, one at a time, to the fold that minimises the
    mean (over categories) of the standard deviation (over folds) of the
    normalised per-category annotation counts.

    Args:
        coco: Object exposing ``coco.dataset['annotations']`` as a list of
            dicts that contain at least ``image_id`` and ``category_id``.
        index: Number of the fold used as the validation fold
            (``0 <= index < k``).
        IsValid: Truthy -> return the image ids of fold ``index``; falsy ->
            return the image ids of every other fold. Only truthiness is
            checked, so any non-empty string (even ``'train'``) selects the
            validation fold; pass a real ``bool``.
        k: Number of folds. Defaults to 5.
        seed: ``random_state`` of the shuffle applied before the images are
            sorted by label spread. Defaults to 777.

    Returns:
        Tuple ``(image_ids, tag)`` where ``image_ids`` is a list of image ids
        and ``tag`` is ``'valid'`` or ``'train'``.

    Raises:
        ValueError: If ``index`` is not a valid fold number for ``k`` folds.
    """
    # An out-of-range fold would otherwise come back silently as an empty
    # validation set (and a training set holding every image).
    if not 0 <= index < k:
        raise ValueError(f'index must satisfy 0 <= index < {k}, got {index}')

    df = pd.DataFrame(coco.dataset['annotations'])
    groups = df['image_id']   # image id of every annotation

    # https://stackoverflow.com/a/39132900/14019325
    # Number of annotations of each category in each image
    # (rows: image_id, columns: category_id).
    y_counts_per_group = df.groupby(['image_id', 'category_id']).size().unstack(fill_value=0)

    # Size the per-fold accumulator from the columns that really exist.
    # Deriving it from the largest category id breaks as soon as ids are
    # 1-based, non-contiguous, or a category has no annotation at all.
    labels_num = y_counts_per_group.shape[1]
    y_counts_per_fold = np.zeros((k, labels_num))

    # Normalise each category column once up front to save work in the loop.
    y_norm_counts_per_group = y_counts_per_group / y_counts_per_group.sum()
    # Shuffle, then order the images by decreasing label spread.
    shuffled_and_sorted_index = (
        y_norm_counts_per_group
        .sample(frac=1, random_state=seed)
        .std(axis=1)
        .sort_values(ascending=False)
        .index
    )
    y_norm_counts_per_group = y_norm_counts_per_group.loc[shuffled_and_sorted_index]

    groups_per_fold = [set() for _ in range(k)]

    for g, y_counts in zip(y_norm_counts_per_group.index, y_norm_counts_per_group.values):
        best_fold = None
        min_eval = None
        for fold_i in range(k):
            # Tentatively add the image to this fold, score the resulting
            # balance, then undo the addition.
            y_counts_per_fold[fold_i] += y_counts
            fold_eval = y_counts_per_fold.std(axis=0).mean()
            y_counts_per_fold[fold_i] -= y_counts
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = fold_i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    valid_groups = groups_per_fold[index]
    train_groups = set(groups) - valid_groups

    if IsValid:
        return list(valid_groups), 'valid'
    return list(train_groups), 'train'

In [29]:
# Input locations: the COCO annotation file and the prediction CSV to score.
# NOTE(review): PRED_CSV is an absolute, machine-specific path — consider
# making it relative to a configurable directory.
GT_JSON = '../../dataset/train.json'
PRED_CSV = '/opt/ml/detection/object-detection-level2-cv-17/efficientdet/csv/submission_val_256.csv'

# Parse the ground-truth annotation JSON into a plain Python object.
with open(GT_JSON, 'r') as gt_file:
    test_anno = json.load(gt_file)

# Read the predictions into a DataFrame.
pred_df = pd.read_csv(PRED_CSV)

   

In [30]:
# Preview the loaded prediction table (a bare last expression gets rich display).
pred_df

Unnamed: 0,PredictionString,image_id
0,5 0.30940697 124.18994140625 405.7359619140625...,train/0001.jpg
1,4 0.386418 321.79595947265625 245.454467773437...,train/2052.jpg
2,1 0.51956475 378.3021240234375 400.32687377929...,train/0004.jpg
3,1 0.5795886 294.02996826171875 279.87515258789...,train/2053.jpg
4,1 0.3156256 67.0506591796875 105.4111938476562...,train/4104.jpg
...,...,...
967,7 0.7637757 251.2803955078125 207.961578369140...,train/2031.jpg
968,0 0.39462757 319.748046875 422.1296691894531 6...,train/2036.jpg
969,5 0.36098486 216.87814331054688 502.2306213378...,train/2042.jpg
970,7 0.7214867 -32.298370361328125 350.5903930664...,train/2044.jpg


In [31]:
# Build `new_pred`, one row per predicted box:
# [
#     [file_name, class_id, confidence_score, x_min, x_max, y_min, y_max],
#     [file_name, class_id, confidence_score, x_min, x_max, y_min, y_max],
#     ...
# ]
# Each box in a PredictionString is six whitespace-separated tokens. Tokens
# 2..5 are emitted in the order (2, 4, 3, 5) so that the row reads
# x_min, x_max, y_min, y_max.
FIELDS_PER_BOX = 6

new_pred = []

file_names = pred_df['image_id'].values.tolist()
bboxes = pred_df['PredictionString'].values.tolist()

empty_files = []

for file_name, bbox in tqdm(zip(file_names, bboxes), total=len(file_names)):
    # An empty PredictionString cell is read from the CSV as NaN: there is
    # nothing to parse, so record the file and move on.
    if isinstance(bbox, float) and pd.isna(bbox):
        empty_files.append(file_name)
        continue

    # split() without arguments ignores leading/trailing/repeated whitespace,
    # so a trailing space no longer has to be special-cased.
    tokens = str(bbox).split()
    if len(tokens) % FIELDS_PER_BOX != 0:
        # Fail loudly rather than silently dropping a dangling token.
        raise ValueError(
            f'invalid box count for {file_name}: '
            f'{len(tokens)} tokens is not a multiple of {FIELDS_PER_BOX}'
        )

    for start in range(0, len(tokens), FIELDS_PER_BOX):
        label, conf, x_min, y_min, x_max, y_max = tokens[start:start + FIELDS_PER_BOX]
        new_pred.append([file_name, label, float(conf),
                         float(x_min), float(x_max), float(y_min), float(y_max)])

# Reported after the loop so the messages do not interleave with the progress bar.
for file_name in empty_files:
    print(f'{file_name} empty box')

972it [00:00, 9179.97it/s]


In [37]:
# Inspect the parsed prediction rows as a table (columns are positional).
pd.DataFrame(new_pred)

Unnamed: 0,0,1,2,3,4,5,6
0,train/0001.jpg,5,0.30940697,124.189941,787.231262,405.735962,747.169556
1,train/0001.jpg,9,0.24725151,121.166229,809.420898,413.745789,744.529968
2,train/0001.jpg,1,0.23021634,164.878845,988.621399,13.313675,469.461304
3,train/0001.jpg,0,0.2217481,88.644928,838.233154,422.444427,787.575439
4,train/0001.jpg,5,0.21762538,729.156738,998.199341,299.998108,542.733826
...,...,...,...,...,...,...,...
15850,train/4095.jpg,6,0.11486546,304.800446,709.607666,257.694458,765.980103
15851,train/4095.jpg,9,0.11263155,257.923920,782.631348,134.411560,955.919861
15852,train/4095.jpg,5,0.1110139,380.888367,682.514343,310.718597,641.739502
15853,train/4095.jpg,1,0.10930513,340.172180,694.826477,488.592529,921.471191


In [38]:
# Inspect the ground-truth rows as a table.
# NOTE(review): `gt` is assigned in a cell that appears further down in this
# notebook, so on a fresh top-to-bottom run this line would hit an undefined
# name — confirm and move this cell below the one that builds `gt`.
pd.DataFrame(gt)

Unnamed: 0,0,1,2,3,4,5
0,train/0001.jpg,3,0.0,57.6,407.4,588.0
1,train/0001.jpg,7,0.0,144.6,455.6,637.2
2,train/0001.jpg,4,722.3,996.6,313.4,565.3
3,train/0001.jpg,5,353.2,586.9,671.0,774.4
4,train/0001.jpg,5,3.7,781.9,448.5,690.5
...,...,...,...,...,...,...
4626,train/2044.jpg,5,651.2,888.6,441.4,596.6
4627,train/2044.jpg,7,0.0,693.2,313.1,880.0
4628,train/4095.jpg,5,280.4,712.1,325.1,951.4
4629,train/4095.jpg,0,414.8,538.7,123.4,643.8


In [33]:


    
# Build `gt`, one row per ground-truth box of the validation fold:
#     [file_name, category_id, x_min, x_max, y_min, y_max]
gt = []

coco = COCO(GT_JSON)

# The third argument is a truthiness flag, so pass a real boolean: a string
# such as 'valid' only works because it is non-empty ('train' would select
# the very same fold).
mask = stratified_split(coco, 1, True)
# mask is (image_ids, tag); only the ids are needed here.
for image_id in mask[0]:

    image_info = coco.loadImgs(image_id)[0]
    annotation_id = coco.getAnnIds(imgIds=image_info['id'])
    annotation_info_list = coco.loadAnns(annotation_id)

    file_name = image_info['file_name']

    for annotation in annotation_info_list:
        # bbox is read as [x, y, width, height]; convert it to corner coordinates.
        x_min, y_min, width, height = (float(v) for v in annotation['bbox'][:4])
        gt.append([file_name, annotation['category_id'],
                   x_min,            # x_min
                   x_min + width,    # x_max
                   y_min,            # y_min
                   y_min + height])  # y_max

loading annotations into memory...
Done (t=0.08s)
creating index...
index created!


In [35]:
# Show the raw ground-truth list.
# NOTE(review): this echoes every row; a slice such as gt[:5] would keep the
# cell output short.
gt

[['train/0001.jpg', 3, 0.0, 57.6, 407.4, 588.0],
 ['train/0001.jpg', 7, 0.0, 144.6, 455.6, 637.2],
 ['train/0001.jpg', 4, 722.3, 996.5999999999999, 313.4, 565.3],
 ['train/0001.jpg', 5, 353.2, 586.9, 671.0, 774.4],
 ['train/0001.jpg', 5, 3.7, 781.9000000000001, 448.5, 690.5],
 ['train/0001.jpg', 0, 425.3, 641.7, 681.9, 861.7],
 ['train/0001.jpg', 7, 92.4, 231.6, 601.7, 654.8000000000001],
 ['train/0001.jpg', 0, 622.4, 695.1999999999999, 686.5, 780.7],
 ['train/2052.jpg', 5, 692.8, 900.5, 344.4, 498.79999999999995],
 ['train/2052.jpg', 5, 789.6, 942.7, 491.3, 630.1],
 ['train/2052.jpg', 4, 786.0, 1023.0, 618.3, 936.5999999999999],
 ['train/2052.jpg', 4, 565.3, 805.3, 405.3, 953.9000000000001],
 ['train/2052.jpg', 7, 275.8, 328.6, 355.6, 438.20000000000005],
 ['train/2052.jpg', 4, 3.0, 337.7, 303.7, 997.5],
 ['train/2052.jpg', 4, 319.7, 569.1, 303.7, 987.0],
 ['train/0004.jpg', 1, 567.5, 732.7, 462.2, 551.6],
 ['train/0004.jpg', 1, 859.4, 1023.5999999999999, 411.7, 612.6],
 ['train/0004.

In [34]:
'''
calculate mAP
'''

'''
Special mission: you should implement the mean_average_precision_for_boxes function yourself instead of using the library.
'''

# Score the predictions against the ground truth at an IoU threshold of 0.5.
map_result = mean_average_precision_for_boxes(gt, new_pred, iou_threshold=0.5)
# The result unpacks into the mean AP and a second value holding the average precisions.
mean_ap, average_precisions = map_result

print(mean_ap)

Number of files in annotations: 972
Number of files in predictions: 972
Unique classes: 10
Detections length: 972
Annotations length: 972
0                              | 0.101657 |     794
1                              | 0.241062 |    1270
2                              | 0.218200 |     179
3                              | 0.131034 |     187
4                              | 0.127861 |     197
5                              | 0.107805 |     589
6                              | 0.121134 |     253
7                              | 0.455562 |    1036
8                              | 0.042454 |      32
9                              | 0.068208 |      94
mAP: 0.161498
0.1614978530471745
