# 슈도 라벨링(Pseudo Labeling)을 위한 csv-to-json 변환 코드 v2

In [1]:
import pandas as pd
import numpy as np
import json

from tqdm.notebook import tqdm

In [2]:
# Load the prediction CSV and make sure every row actually carries predictions.
csv_path = 'univ-5fold.csv'

df = pd.read_csv(csv_path)
# 'Unnamed: 0' is the index column that was written out when the CSV was saved.
df.drop('Unnamed: 0', axis=1, inplace=True)

# A row without predictions is read back as NaN (a float), which would break
# the later `.split()` on PredictionString. Check the whole column at once
# instead of looping over rows, and report every offending row index.
missing_rows = df.index[df['PredictionString'].isna()].tolist()
if missing_rows:
    raise ValueError(
        f'PredictionString is missing (NaN) for rows: {missing_rows}'
    )

# Keep an untouched copy; `df` is rebuilt from scratch further down.
df_csv = df.copy()
df_csv.head()

Unnamed: 0,PredictionString,image_id
0,7 0.9613852500915527 216.7988739013672 53.5352...,test/0000.jpg
1,6 0.4535674750804901 748.309814453125 657.1721...,test/0001.jpg
2,4 0.8279581665992737 866.7210083007812 245.693...,test/0002.jpg
3,9 0.8936083912849426 146.02633666992188 262.45...,test/0003.jpg
4,1 0.575635552406311 192.84735107421875 248.478...,test/0004.jpg


In [34]:
# Expand every image's PredictionString into one COCO-style annotation row
# per predicted box.
classes = np.array(["General trash", "Paper", "Paper pack", "Metal", "Glass",
                    "Plastic", "Styrofoam", "Plastic bag", "Battery", "Clothing"])

annotation_columns = ['image_id', 'category_id', 'score', 'area',
                      'bbox', 'iscrowd', 'id']

# One small frame per image, concatenated once after the loop.
# (Repeated DataFrame.append is quadratic and was removed in pandas 2.0.)
frames = []

prediction_rows = zip(df_csv['PredictionString'], df_csv['image_id'])
for preds, img_id in tqdm(prediction_rows, total=len(df_csv)):
    # PredictionString is a flat, space-separated sequence of six numbers per
    # box: label, score, x1, y1, x2, y2. After the transpose, each row of L
    # holds one of those fields for all boxes of the image.
    L = np.array(preds.split()).reshape(-1, 6).astype(float).T

    labels = L[0].astype(int)
    scores, xs, ys, Xs, Ys = L[1:]
    ws = Xs - xs
    hs = Ys - ys
    # The area is computed from the unrounded width/height.
    areas = ws * hs

    # One [x, y, w, h] list per box, rounded to one decimal place.
    bboxes = np.round(np.stack([xs, ys, ws, hs], axis=1), 1).tolist()

    frames.append(pd.DataFrame({
        # Characters 5..8 of the path are the numeric image index,
        # e.g. 'test/0000.jpg' -> 0.
        'image_id': np.full(len(labels), int(img_id[5:9])),
        'category_id': labels,
        'score': scores,
        'area': areas,
        'bbox': bboxes,
        'iscrowd': 0,
        'id': 0,  # placeholder, overwritten with unique ids below
    }))

if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # No input rows: keep the expected schema so later cells still work.
    df = pd.DataFrame(columns=annotation_columns)

# Annotation ids start at 30000 and are unique across the whole frame.
df['id'] = range(30000, 30000 + len(df))

df_backup = df.copy()

  0%|          | 0/4871 [00:00<?, ?it/s]

Unnamed: 0,image_id,category_id,score,area,bbox,iscrowd,id
0,0,7,0.961385,99815.039999,"[216.8, 53.5, 237.1, 420.9]",0,30000
1,0,7,0.951324,63649.19914,"[118.6, 422.8, 211.9, 300.3]",0,30001
2,0,7,0.926737,177775.498759,"[602.9, 518.8, 354.3, 501.8]",0,30002
3,0,7,0.916981,75925.221451,"[390.4, 192.0, 213.2, 356.1]",0,30003
4,0,7,0.874926,54109.49758,"[449.2, 601.6, 198.0, 273.2]",0,30004


Train / Test Set의 이미지 개수가 비슷하므로, 박스 개수도 비슷할 것이라는 가정 하에 비슷한 박스 개수를 가지는 Score Threshold 탐색

In [49]:
# Search for the lowest score threshold (in steps of 0.01) that leaves fewer
# than 23000 boxes. If no threshold gets below that count, the result of the
# last step is kept.
# NOTE(review): `df_norm` is not defined in the visible cells; it presumably
# comes from a cell that is missing from this export. Confirm.
for thr in np.arange(0, 1, 0.01):
    confident = df_norm['score'] >= thr
    df_thr = df_norm.loc[confident].copy()
    if len(df_thr) >= 23000:
        continue
    break

# The score is only needed for filtering, so it is dropped here.
df_thr = df_thr.drop(columns='score')
# Shift image ids by 10000 (the test image entries get the same offset later).
df_thr['image_id'] = df_thr['image_id'] + 10000
print(f'Score Threshold: {thr:.2f}, # of boxes: {len(df_thr)}')

Score Threshold: 0.30, # of boxes: 22927


# COCO format json 저장

In [51]:
# Image metadata from test.json
with open('/opt/ml/dataset/test.json', encoding='utf-8') as f:
    test = json.load(f)

# Look images up by their own 'id' rather than by list position, so the
# selection stays correct even if test['images'] is not ordered by id.
images_by_id = {img['id']: img for img in test['images']}

# Keep only images that still have at least one box. df_thr.image_id already
# carries the +10000 offset, so subtract it for the lookup and store the
# shifted id on a copy of the entry (the loaded `test` dict is left untouched).
# `.tolist()` yields plain Python ints, which json can serialize.
test_imgs = [
    dict(images_by_id[shifted_id - 10000], id=shifted_id)
    for shifted_id in df_thr['image_id'].unique().tolist()
]

In [54]:
# Load the train COCO file that the pseudo labels are merged into
with open('/opt/ml/dataset/train-kfold-0.json', encoding='utf-8') as f:
    coco = json.load(f)

# Merge: append the selected test images and their pseudo-label annotations.
# The original referenced `timgs`, which is not defined anywhere in the
# visible cells; `test_imgs` is the list of id-shifted test images built
# in the preceding cell.
coco['images'] += test_imgs
coco['annotations'] += df_thr.to_dict('records')

In [55]:
# Write the merged train + pseudo-label COCO dict to disk.
with open('pseudo-label-3.json', 'w') as out_file:
    out_file.write(json.dumps(coco))

# COCO format json 저장 (test only)

In [74]:
# Image metadata from test.json (reloaded so the ids are the original ones)
with open('/opt/ml/dataset/test.json', encoding='utf-8') as f:
    test = json.load(f)

# Undo the offsets applied for the merged file: image ids go back to the
# original test ids and annotation ids start again from 0.
df_tmp = df_thr.copy()
df_tmp['image_id'] -= 10000
df_tmp['id'] -= 30000

# Drop images without boxes. Entries are looked up by their own 'id' rather
# than by list position, so the selection does not depend on the ordering of
# test['images'].
images_by_id = {img['id']: img for img in test['images']}
test_imgs = [
    images_by_id[image_id]
    for image_id in df_tmp['image_id'].unique().tolist()
]

test['images'] = test_imgs
test['annotations'] = df_tmp.to_dict('records')

with open('test-only-pseudo.json', 'w') as f:
    json.dump(test, f)