## 데이터 솎아내기

In [5]:
import json 
import os
import pandas as pd
from pycocotools.coco import COCO

In [6]:
# Load the COCO annotation index for the training split.
coco = COCO('../../dataset/train.json')

# Category names, ordered so that the list position equals the COCO category_id.
classes = ["General trash", "Paper", "Paper pack", "Metal", "Glass", 
           "Plastic", "Styrofoam", "Plastic bag", "Battery", "Clothing"]

# Collect one row per annotation. The bbox is read as [x, y, width, height],
# so the max corner is the min corner plus the box size.
rows = []
for image_id in coco.getImgIds():
    image_info = coco.loadImgs(image_id)[0]
    file_name = image_info['file_name']
    anns = coco.loadAnns(coco.getAnnIds(imgIds=image_info['id']))

    for ann in anns:
        left, top, width, height = map(float, ann['bbox'][:4])
        rows.append((
            file_name,
            classes[ann['category_id']],
            ann['category_id'],
            left,
            top,
            left + width,
            top + height,
        ))

# Note: the 'image_id' column holds the image file name, not the numeric COCO id.
train_df = pd.DataFrame(
    rows,
    columns=['image_id', 'class_name', 'class_id',
             'x_min', 'y_min', 'x_max', 'y_max'],
)



loading annotations into memory...
Done (t=0.17s)
creating index...
index created!


In [7]:
# Display the per-annotation table (one row per bounding box).
train_df

Unnamed: 0,image_id,class_name,class_id,x_min,y_min,x_max,y_max
0,train/0000.jpg,General trash,0,197.6,193.7,745.4,663.4
1,train/0001.jpg,Metal,3,0.0,407.4,57.6,588.0
2,train/0001.jpg,Plastic bag,7,0.0,455.6,144.6,637.2
3,train/0001.jpg,Glass,4,722.3,313.4,996.6,565.3
4,train/0001.jpg,Plastic,5,353.2,671.0,586.9,774.4
...,...,...,...,...,...,...,...
23139,train/4882.jpg,Plastic,5,0.0,116.2,944.1,930.3
23140,train/4882.jpg,Plastic bag,7,302.1,439.3,567.3,655.4
23141,train/4882.jpg,General trash,0,511.3,451.1,570.0,481.3
23142,train/4882.jpg,Paper,1,255.0,421.4,526.7,616.5


In [8]:
z_score_limit = 3.5  # outlier threshold, in standard deviations (sigma)

# Number of annotations per image, keyed by image file name.
# Computed once and reused for the mean, the std and the z-scores.
boxes_per_image = train_df['image_id'].value_counts()
mean_value = boxes_per_image.mean()
std_value = boxes_per_image.std()

# z-score of every image's annotation count.
index_ = pd.DataFrame({'z_score': (boxes_per_image - mean_value) / std_value})

# Images whose annotation count is more than `z_score_limit` sigmas above the
# mean. Only the upper tail is flagged; images with few boxes are kept.
removed_index = index_.index[index_['z_score'] > z_score_limit].tolist()


In [9]:
# Show the image file names flagged as outliers.
removed_index

['train/4197.jpg',
 'train/0946.jpg',
 'train/4047.jpg',
 'train/0394.jpg',
 'train/2981.jpg',
 'train/2416.jpg',
 'train/1955.jpg',
 'train/1678.jpg',
 'train/4492.jpg',
 'train/1797.jpg',
 'train/4841.jpg',
 'train/4778.jpg',
 'train/3825.jpg',
 'train/2521.jpg',
 'train/1020.jpg',
 'train/1671.jpg',
 'train/1058.jpg',
 'train/4119.jpg',
 'train/0384.jpg',
 'train/0624.jpg',
 'train/4225.jpg',
 'train/3026.jpg',
 'train/1872.jpg',
 'train/3403.jpg',
 'train/1177.jpg',
 'train/0694.jpg',
 'train/3049.jpg',
 'train/0272.jpg',
 'train/3046.jpg',
 'train/2794.jpg',
 'train/3450.jpg',
 'train/0352.jpg',
 'train/3601.jpg',
 'train/0956.jpg',
 'train/0655.jpg',
 'train/3148.jpg',
 'train/1918.jpg',
 'train/0833.jpg',
 'train/2189.jpg',
 'train/4261.jpg',
 'train/3430.jpg',
 'train/2956.jpg',
 'train/2387.jpg',
 'train/2572.jpg',
 'train/1997.jpg',
 'train/0840.jpg',
 'train/2678.jpg',
 'train/0767.jpg',
 'train/0168.jpg',
 'train/2182.jpg',
 'train/1669.jpg',
 'train/3796.jpg',
 'train/4537

In [10]:
# Number of images flagged for removal.
len(removed_index)

86

## updated_train.json 생성

In [11]:
# Load the original COCO annotation file as a plain dict.
root = '../../dataset/'
file_name = 'train.json'
save_name = 'updated_train.json'  # name of the filtered file written later

# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open(os.path.join(root, file_name), 'r', encoding='utf-8') as f:
    coco_json = json.load(f)


In [12]:
# Top-level keys of the loaded annotation file.
coco_json.keys()

dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])

In [13]:
coco_json['images'][189:192]  # 'images' entries of the original data

[{'width': 1024,
  'height': 1024,
  'file_name': 'train/0189.jpg',
  'license': 0,
  'flickr_url': None,
  'coco_url': None,
  'date_captured': '2020-12-26 15:09:15',
  'id': 189},
 {'width': 1024,
  'height': 1024,
  'file_name': 'train/0190.jpg',
  'license': 0,
  'flickr_url': None,
  'coco_url': None,
  'date_captured': '2020-12-28 12:00:56',
  'id': 190},
 {'width': 1024,
  'height': 1024,
  'file_name': 'train/0191.jpg',
  'license': 0,
  'flickr_url': None,
  'coco_url': None,
  'date_captured': '2020-12-26 21:23:05',
  'id': 191}]

In [14]:
# File names flagged as outliers; a set gives O(1) membership tests.
removed_files = set(removed_index)

# Sections copied through unchanged.
coco_info = coco_json['info']
coco_licenses = coco_json['licenses']
coco_categories = coco_json['categories']

# Keep only the images that were not flagged for removal.
coco_images = [img for img in coco_json['images']
               if img['file_name'] not in removed_files]

# Drop the annotations of the removed images too, so that no annotation
# refers to an image id that is missing from the 'images' section.
kept_image_ids = {img['id'] for img in coco_images}
coco_annotations = [ann for ann in coco_json['annotations']
                    if ann['image_id'] in kept_image_ids]


In [15]:
coco_images[185:188] # the train/0190.jpg entry has been dropped

[{'width': 1024,
  'height': 1024,
  'file_name': 'train/0188.jpg',
  'license': 0,
  'flickr_url': None,
  'coco_url': None,
  'date_captured': '2021-01-04 14:40:10',
  'id': 188},
 {'width': 1024,
  'height': 1024,
  'file_name': 'train/0189.jpg',
  'license': 0,
  'flickr_url': None,
  'coco_url': None,
  'date_captured': '2020-12-26 15:09:15',
  'id': 189},
 {'width': 1024,
  'height': 1024,
  'file_name': 'train/0191.jpg',
  'license': 0,
  'flickr_url': None,
  'coco_url': None,
  'date_captured': '2020-12-26 21:23:05',
  'id': 191}]

In [16]:
# Assemble the filtered dataset under the same five top-level keys,
# in the same order, as the loaded annotation file.
new_data = dict(
    info=coco_info,
    licenses=coco_licenses,
    images=coco_images,
    categories=coco_categories,
    annotations=coco_annotations,
)

In [17]:
# Top-level keys of the assembled dataset.
new_data.keys()

dict_keys(['info', 'licenses', 'images', 'categories', 'annotations'])

In [18]:
# Write the filtered dataset to a new file in the dataset directory.
root = '../../dataset/'
file_name = 'train.json'  # source annotation file (not used in this cell)
save_name = 'updated_train.json'
output_path = os.path.join(root, save_name)

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(new_data, f, indent=2)