In [101]:
import os
import random
import time
import json
import warnings 
warnings.filterwarnings('ignore')
import shutil

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import cv2

import numpy as np
import pandas as pd
from tqdm import tqdm

# 전처리를 위한 라이브러리
from pycocotools.coco import COCO
import torchvision
import torchvision.transforms as transforms

#!pip install albumentations==0.4.6
import albumentations as A
from albumentations.pytorch import ToTensorV2

In [102]:
# Fix every random number generator seed so runs are repeatable.
random_seed = 21

# Python standard library and NumPy generators
random.seed(random_seed)
np.random.seed(random_seed)

# PyTorch generators: CPU, current CUDA device, and all CUDA devices (multi-GPU)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)

# Turn off cuDNN benchmark mode and request deterministic cuDNN algorithms
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [103]:
%matplotlib inline

dataset_path = '/opt/ml/input/data'
anns_file_path = f"{dataset_path}/train_all.json"

# Parse the annotation JSON into a dict
with open(anns_file_path, 'r') as f:
    dataset = json.load(f)

# Top-level sections of the annotation file
categories, anns, imgs = dataset['categories'], dataset['annotations'], dataset['images']

# Number of entries in each section
nr_cats, nr_annotations, nr_images = len(categories), len(anns), len(imgs)

# Collect category names and the distinct super categories.
# super_cat_ids maps each super category name to the order in which it was first seen.
cat_names = []
super_cat_names = []
super_cat_ids = {}
super_cat_last_name = ''
nr_super_cats = 0
for cat_it in categories:
    cat_names.append(cat_it['name'])
    super_cat_name = cat_it['supercategory']
    # Register a super category only the first time it appears. Comparing
    # against the previous name alone would register it again whenever the
    # categories of one super category are not adjacent in the file.
    if super_cat_name not in super_cat_ids:
        super_cat_names.append(super_cat_name)
        super_cat_ids[super_cat_name] = nr_super_cats
        nr_super_cats += 1
    # Still tracked so the variable keeps its original meaning
    super_cat_last_name = super_cat_name

print('Number of super categories:', nr_super_cats)
print('Number of categories:', nr_cats)
print('Number of annotations:', nr_annotations)
print('Number of images:', nr_images)

Number of super categories: 10
Number of categories: 10
Number of annotations: 26240
Number of images: 3272


In [104]:
# Count annotations per category.
# Map each category id to its position in `categories` (and therefore in
# `cat_names`) rather than assuming the ids are exactly 1..N in file order;
# with `category_id - 1`, an id of 0 would silently wrap to the last bin.
cat_index_by_id = {cat['id']: idx for idx, cat in enumerate(categories)}
cat_histogram = np.zeros(nr_cats, dtype=int)
for ann in anns:
    cat_histogram[cat_index_by_id[ann['category_id']]] += 1

# Convert to DataFrame, most frequent category first.
# `ascending` is passed by keyword: positional axis/ascending arguments are
# deprecated and no longer accepted by recent pandas releases.
df = pd.DataFrame({'Categories': cat_names, 'Number of annotations': cat_histogram})
df = df.sort_values('Number of annotations', ascending=False)

In [105]:
# Category labeling: sort_index() restores the original category order
sorted_temp_df = df.sort_index()

# Prepend a row for the background label so it takes index 0 and every
# existing category moves to label + 1.
# NOTE: the "Backgroud" spelling is kept as-is because the string is runtime data.
sorted_df = pd.DataFrame(["Backgroud"], columns = ["Categories"])
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same frame.
sorted_df = pd.concat([sorted_df, sorted_temp_df], ignore_index=True)

In [106]:
# Class names in label order; a name's position in this list is its pixel value in the masks
category_names = sorted_df["Categories"].tolist()

def get_classname(classID, cats):
    """Return the 'name' of the first entry in cats whose 'id' equals classID.

    cats is a sequence of dicts carrying 'id' and 'name' keys. When no entry
    matches, the string "None" (not the None object) is returned.
    """
    matching_names = (cat['name'] for cat in cats if cat['id'] == classID)
    return next(matching_names, "None")

class CustomDataLoader(Dataset):
    """Dataset over a COCO-format annotation file.

    Args:
        data_dir: path of the annotation JSON handed to pycocotools' COCO.
        mode: 'train' or 'val' to also build a segmentation mask, 'test' for
            images only.
        transform: optional albumentations-style callable, invoked as
            transform(image=..., mask=...) for train/val and
            transform(image=...) for test.

    Image files are resolved relative to the module-level `dataset_path`.
    Mask pixel values are positions in the module-level `category_names`
    list (0 is background).
    """
    def __init__(self, data_dir, mode = 'train', transform = None):
        super().__init__()
        self.mode = mode
        self.transform = transform
        self.coco = COCO(data_dir)
        # Cache lookups that never change so they are not rebuilt for every item
        self.image_ids = self.coco.getImgIds()
        self.cats = self.coco.loadCats(self.coco.getCatIds())

    def __getitem__(self, index: int):
        """Return (image, mask, image_info) for train/val, (image, image_info) for test.

        Raises:
            ValueError: if self.mode is not 'train', 'val' or 'test'.
            FileNotFoundError: if the image file cannot be read.
        """
        # Fail loudly instead of silently returning None for an unsupported mode
        if self.mode not in ('train', 'val', 'test'):
            raise ValueError(f"Unsupported mode: {self.mode!r} (expected 'train', 'val' or 'test')")

        image_id = self.image_ids[index]
        image_infos = self.coco.loadImgs(image_id)[0]

        # Load the image with cv2, convert BGR -> RGB, and divide by 255
        image_path = os.path.join(dataset_path, image_infos['file_name'])
        images = cv2.imread(image_path)
        if images is None:
            # cv2.imread returns None instead of raising when it cannot read the file
            raise FileNotFoundError(f"Could not read image: {image_path}")
        images = cv2.cvtColor(images, cv2.COLOR_BGR2RGB).astype(np.float32)
        images /= 255.0

        if self.mode == 'test':
            # transform -> albumentations
            if self.transform is not None:
                transformed = self.transform(image=images)
                images = transformed["image"]
            return images, image_infos

        # mode is 'train' or 'val': build the mask as well
        ann_ids = self.coco.getAnnIds(imgIds=image_infos['id'])
        anns = self.coco.loadAnns(ann_ids)

        # masks: 2D array (height x width) where each pixel holds a class label.
        # Background = 0
        masks = np.zeros((image_infos["height"], image_infos["width"]), dtype=np.int8)
        # Paint the largest annotations first so smaller ones drawn later are
        # not hidden underneath them.
        for ann in sorted(anns, key=lambda ann: ann['area'], reverse=True):
            className = get_classname(ann['category_id'], self.cats)
            pixel_value = category_names.index(className)
            masks[self.coco.annToMask(ann) == 1] = pixel_value

        # transform -> albumentations
        if self.transform is not None:
            transformed = self.transform(image=images, mask=masks)
            images = transformed["image"]
            masks = transformed["mask"]
        return images, masks, image_infos

    def __len__(self) -> int:
        # Number of images in the annotation file
        return len(self.image_ids)

### You need to have the original test.json file.
### I renamed it to test_past.json and worked with that copy.

In [None]:
# Load the previous test annotation file (renamed test_past.json) and data.json
with open('/opt/ml/input/data/test_past.json', 'r') as past_file, \
        open('/opt/ml/data.json', 'r') as data_file:
    test_past = json.load(past_file)
    data = json.load(data_file)

In [144]:
# File name of every image listed in the previous test annotation file
test_past_fnames = [image['file_name'] for image in test_past['images']]

In [145]:
# Use a set for membership tests so the old file-name list is not rescanned for every image
test_past_fname_set = set(test_past_fnames)

# File names present in both `data` and the previous test annotation file, in `data` order
new_fnames = [
    image['file_name']
    for image in data['images']
    if image['file_name'] in test_past_fname_set
]
cnt = len(new_fnames)

print('기존 test.json에 있던 image 중 labeling이 이미 되어 있던 이미지 개수:', cnt)

기존 test.json에 있던 image 중 labeling이 이미 되어 있던 이미지 개수: 195


In [150]:
# Path of the annotation file used to build the dataset
train_path = f"{dataset_path}/data_revised.json"

# Collate function passed to the DataLoader to assemble a batch
def collate_fn(batch):
    """Transpose a batch: a sequence of per-sample tuples becomes a tuple of per-field tuples."""
    fields = zip(*batch)
    return tuple(fields)

# albumentations (A) and ToTensorV2 are already imported in the notebook's
# import cell, so they are not re-imported here.
# The only transform is the tensor conversion.
train_transform = A.Compose([ToTensorV2()])

# train dataset
train_dataset = CustomDataLoader(data_dir=train_path, mode='train', transform=train_transform)

# DataLoader: one sample per batch, in annotation-file order
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_fn,
)

loading annotations into memory...
Done (t=0.57s)
creating index...
index created!


In [151]:
data_dir = '/opt/ml/input/mmseg'
annotation_dir = os.path.join(data_dir, 'annotations')

# Create the annotation sub-folders if missing; the masks are saved under train_pseudo.
for path in ('train_pseudo', 'valid'):
    target_dir = os.path.join(annotation_dir, path)
    os.makedirs(target_dir, exist_ok=True)

In [152]:
# Offset added to image ids so the new file numbers do not collide with existing images.
# The starting value of 5000 was chosen arbitrarily.
new_indices = 5000

In [153]:
# Export a mask PNG and a copy of the source image for every image in `new_fnames`.
mask_out_dir = os.path.join(annotation_dir, 'train_pseudo')
image_out_dir = os.path.join(data_dir, 'images', 'train_pseudo')
# The image output folder is not created by any earlier cell; without it
# shutil.copyfile fails on a fresh run.
os.makedirs(mask_out_dir, exist_ok=True)
os.makedirs(image_out_dir, exist_ok=True)

# Set for O(1) membership tests inside the loop
new_fname_set = set(new_fnames)

# The loop variables use their own names so the module-level `imgs` list is not overwritten.
for (_, batch_masks, batch_infos) in train_loader:

    # batch_size is 1, so every batch holds exactly one sample
    info = batch_infos[0]
    if info['file_name'] not in new_fname_set:
        continue

    # Output file number: original image id shifted by `new_indices`, zero-padded to 4 digits
    stem = str(int(info['id']) + new_indices).zfill(4)

    # Saved under train_pseudo. Cast to uint8 explicitly so the PNG is written
    # from unsigned 8-bit data.
    mask = batch_masks[0].numpy().astype(np.uint8)
    mask_path = os.path.join(mask_out_dir, f"{stem}.png")
    # cv2.imwrite signals failure through its return value rather than an exception
    if not cv2.imwrite(mask_path, mask):
        raise IOError(f"Failed to write mask: {mask_path}")

    shutil.copyfile(os.path.join(dataset_path, info['file_name']),
                    os.path.join(image_out_dir, f"{stem}.jpg"))