In [1]:
import json
import os

import cv2
import numpy as np
import pandas as pd
from pycocotools.coco import COCO
from torchvision.utils import save_image
from torch.utils.data import Dataset
import torchvision
import torchvision.transforms as transforms
from tqdm import tqdm

import albumentations as A
from albumentations.pytorch import ToTensorV2

## Argument 세팅

In [2]:
# Export configuration.
# Exactly one of the three "mode" groups below should be active at a time; the
# other two are kept commented out so the split can be switched by hand.

# Root directory that the image file names in the annotation files are relative to.
src_data_dir  = '/opt/ml/segmentation/input/data'

# --- train split (active) ---
mode = 'train'
# src_data_json_path = src_data_dir + '/train.json'
src_data_json_path = src_data_dir + '/train_all.json'
dst_data_dir = '/opt/ml/segmentation/input/mmseg_pseudo/'
pseudo_num_duplicates = 1  # Number of times the pseudo-labelled set is appended to the train set.
                           # With 2, the train set is the train_all data (3272 images) plus the
                           # pseudo data (819 images) twice, i.e. 4910 samples in total.
                           # With 1, the total is 4091 samples.
pseudo_ann_csv_path = './pan_se_resnext101_32x4d_train_all_20e_512.csv'
                           # Not used when pseudo_num_duplicates is 0.

# --- validation split ---
# mode = 'val'
# src_data_json_path = src_data_dir + '/val.json'
# dst_data_dir = '/opt/ml/segmentation/input/mmseg_pseudo/'

# --- test split ---
# mode = 'test'
# src_data_json_path = src_data_dir + '/test.json'
# dst_data_dir = '/opt/ml/segmentation/input/mmseg_pseudo/'


## Dataset 구현

In [3]:
# Class names in label order: the position of a name in this list is the pixel
# value written into the segmentation mask (index 0 is the background value the
# mask is initialised with).
# NOTE(review): names resolved from the annotation file are looked up in this list
# with an exact string match, so the spelling here (including 'Backgroud') must
# not be changed without confirming what the annotation file contains.
category_names = ['Backgroud', 'General trash', 'Paper', 'Paper pack', 'Metal', 'Glass', 'Plastic', 'Styrofoam', 'Plastic bag', 'Battery', 'Clothing']


def get_classname(classID, cats):
    """Look up the name of a category by its id.

    Args:
        classID: Category id to search for.
        cats: Sequence of category dicts, each with ``'id'`` and ``'name'`` keys.

    Returns:
        The ``'name'`` of the first entry whose ``'id'`` equals ``classID``,
        or the string ``"None"`` when no entry matches.
    """
    matching_names = (cat['name'] for cat in cats if cat['id'] == classID)
    return next(matching_names, "None")


class CustomDataLoader(Dataset):
    """Dataset over a COCO-format annotation file yielding images and label masks.

    Despite its name this is a ``torch.utils.data.Dataset``, not a DataLoader.

    Args:
        data_dir: Directory that each image entry's ``file_name`` is relative to.
        data_json_path: Path of the COCO annotation JSON.
        mode: ``'train'`` / ``'val'`` samples are ``(image, mask, image_info)``;
            ``'test'`` samples are ``(image, image_info)``.
        transform: Optional callable invoked as ``transform(image=..., mask=...)``
            (``mask`` is omitted in test mode) that returns a dict with the same keys.

    Raises:
        ValueError: If ``mode`` is not ``'train'``, ``'val'`` or ``'test'``.
    """

    _MODES = ('train', 'val', 'test')

    def __init__(self, data_dir, data_json_path, mode = 'train', transform = None):
        super().__init__()
        # Fail early: an unknown mode would otherwise make __getitem__ return None.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        self.data_dir = data_dir
        self.mode = mode
        self.transform = transform

        self.coco = COCO(data_json_path)
        # Positional index -> COCO image id. Image ids are not guaranteed to be
        # 0..N-1, so the dataset index must not be used as an image id directly.
        # Sorting keeps index i == id i for files whose ids are exactly 0..N-1.
        self.image_ids = sorted(self.coco.getImgIds())

    def __getitem__(self, index: int):
        """Load the sample at position ``index``.

        Returns:
            ``(image, mask, image_info)`` in train/val mode and
            ``(image, image_info)`` in test mode. Without a transform, ``image`` is
            a float32 array in the channel order returned by ``cv2.imread`` (no
            BGR->RGB conversion and no scaling is applied) and ``mask`` is an int8
            ``(height, width)`` array of indices into ``category_names``.

        Raises:
            IndexError: If ``index`` is outside the dataset.
            FileNotFoundError: If the image file cannot be read.
        """
        image_id = self.image_ids[index]
        image_infos = self.coco.loadImgs([image_id])[0]

        image_path = os.path.join(self.data_dir, image_infos['file_name'])
        images = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising.
        if images is None:
            raise FileNotFoundError(f"cv2.imread could not read image: {image_path}")
        images = images.astype(np.float32)

        if self.mode == 'test':
            if self.transform is not None:
                images = self.transform(image=images)["image"]
            return images, image_infos

        masks = self._build_mask(image_infos)
        if self.transform is not None:
            transformed = self.transform(image=images, mask=masks)
            images = transformed["image"]
            masks = transformed["mask"]
        return images, masks, image_infos

    def _build_mask(self, image_infos):
        """Rasterise every annotation of one image into an int8 (height, width) label mask."""
        ann_ids = self.coco.getAnnIds(imgIds=image_infos['id'])
        anns = self.coco.loadAnns(ann_ids)
        cats = self.coco.loadCats(self.coco.getCatIds())

        # Pixels not covered by any annotation keep 0 (the first entry of category_names).
        masks = np.zeros((image_infos["height"], image_infos["width"]), dtype=np.int8)
        # Paint in order of decreasing area so that a smaller object overlapping a
        # larger one is drawn last and is not hidden by it.
        for ann in sorted(anns, key=lambda a: a['area'], reverse=True):
            class_name = get_classname(ann['category_id'], cats)
            # The pixel value is the position of the class name in category_names.
            masks[self.coco.annToMask(ann) == 1] = category_names.index(class_name)
        return masks

    def __len__(self) -> int:
        """Number of images listed in the annotation file."""
        return len(self.image_ids)

### Pseudo 라벨 Dataset

In [4]:
class PseudoTrainDataset(Dataset):
    """Dataset pairing images with pseudo-label masks read from a prediction CSV.

    The CSV must provide an ``image_id`` column (image path relative to
    ``data_dir``) and a ``PredictionString`` column (whitespace-separated integer
    class ids, one per pixel, reshaped to ``mask_shape``).

    Args:
        data_dir: Directory that the ``image_id`` paths are relative to.
        ann_csv_path: Path of the prediction CSV.
        transform: Optional callable invoked as ``transform(image=..., mask=...)``
            that returns a dict with the same keys.
        mask_shape: ``(height, width)`` every prediction is reshaped to.
            Defaults to ``(512, 512)``.

    Raises:
        ValueError: If a row's PredictionString is not a list of integers or does
            not contain exactly ``height * width`` values.
    """
    def __init__(self, data_dir, ann_csv_path, transform = None, mask_shape = (512, 512)):
        super().__init__()
        self.data_dir = data_dir
        self.transform = transform
        self.mask_shape = tuple(mask_shape)
        self.masks, self.image_infos = self.__load_annotations(ann_csv_path, self.mask_shape)

    @staticmethod
    def __load_annotations(ann_csv_path, mask_shape = (512, 512)):
        """Parse the CSV into a list of uint8 masks and a parallel list of image-info dicts."""
        ann_df = pd.read_csv(ann_csv_path)

        masks = []
        image_infos = []
        rows = zip(ann_df['image_id'], ann_df['PredictionString'])
        for idx, (input_image_file_name, pred_str) in enumerate(rows):
            try:
                # split() with no argument ignores leading/trailing/repeated
                # whitespace, which split(' ') would turn into empty tokens.
                flatten_pred_ints = [int(pred) for pred in str(pred_str).split()]
                mask = np.asarray(flatten_pred_ints, dtype=np.uint8).reshape(mask_shape)
            except ValueError as e:
                raise ValueError(
                    f"Row {idx} ({input_image_file_name}): cannot build a "
                    f"{tuple(mask_shape)} mask from PredictionString"
                ) from e

            masks.append(mask)
            # 'id' is the row position in the CSV, not an id from any COCO file.
            image_infos.append(
                {
                    'file_name': input_image_file_name,
                    'id': idx
                }
            )

        return masks, image_infos

    def __getitem__(self, idx: int):
        """Return ``(image, mask, image_info)`` for the CSV row at position ``idx``.

        Without a transform, ``image`` is a float32 array in the channel order
        returned by ``cv2.imread`` (no colour conversion or scaling is applied)
        and ``mask`` is the uint8 array parsed from the CSV.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        image_info = self.image_infos[idx]
        image_path = os.path.join(self.data_dir, image_info['file_name'])
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError(f"cv2.imread could not read image: {image_path}")
        image = image.astype(np.float32)

        mask = self.masks[idx]

        if self.transform is not None:
            transformed = self.transform(image=image, mask=mask)
            image = transformed["image"]
            mask = transformed["mask"]

        return image, mask, image_info

    def __len__(self) -> int:
        """Number of rows (pseudo-labelled images) loaded from the CSV."""
        return len(self.masks)

### Concat Two Datasets 

In [5]:
class ConcatTwoDataset(Dataset):  # only train mode
    """Expose two datasets as one: all of ``dataset_1`` followed by all of ``dataset_2``.

    The lengths of both datasets are read once at construction time. Instances
    can be nested to chain more than two datasets.

    Args:
        dataset_1: Dataset served for indices ``0 .. len(dataset_1) - 1``.
        dataset_2: Dataset served for the indices after that.
    """
    def __init__(self, dataset_1, dataset_2):
        super().__init__()
        self.dataset_1 = dataset_1
        self.dataset_2 = dataset_2

        self.len_1 = len(dataset_1)
        self.len_2 = len(dataset_2)

    def __getitem__(self, idx):
        """Return the sample at ``idx``; negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = self.len_1 + self.len_2
        if idx < 0:
            # Resolve against the combined length so -1 is the last item of
            # dataset_2 rather than being forwarded to dataset_1 unchanged.
            # (Not ``+=``: that could modify a caller-owned tensor in place.)
            idx = idx + total
        if not 0 <= idx < total:
            raise IndexError(f"index out of range for dataset of length {total}")

        if idx < self.len_1:
            return self.dataset_1[idx]
        return self.dataset_2[idx - self.len_1]

    def __len__(self):
        """Combined number of samples of both datasets."""
        return self.len_1 + self.len_2

## Dataset 생성

In [6]:
# NOTE(review): common_transform is not passed to any dataset in this cell; both
# datasets are built with transform=None, so samples stay numpy arrays (the save
# step below hands them directly to cv2.imwrite).
common_transform = A.Compose([ToTensorV2()])

dataset = CustomDataLoader(data_dir=src_data_dir, data_json_path=src_data_json_path, mode=mode, transform=None)

# Remember the size before pseudo data is appended so the check below can tell
# whether anything was actually added.
num_labelled_samples = len(dataset)

# Pseudo-labelled data only ever extends the train split; appending it
# pseudo_num_duplicates times repeats it that many times in the train set.
if mode == 'train' and pseudo_num_duplicates > 0:
    pseudo_dataset = PseudoTrainDataset(src_data_dir, pseudo_ann_csv_path, transform=None)
    for _ in range(pseudo_num_duplicates):
        dataset = ConcatTwoDataset(dataset, pseudo_dataset)

print(f"length of dataset: {len(dataset)}")
print(dataset[0])
# Also inspect the last sample, which comes from the pseudo data. Guarding on the
# length (instead of a fixed index and pseudo_num_duplicates alone) keeps this
# cell working in val/test mode and for other dataset sizes.
if len(dataset) > num_labelled_samples:
    print(dataset[len(dataset) - 1])


loading annotations into memory...
Done (t=4.80s)
creating index...
index created!
length of dataset: 4091
(array([[[158., 189., 212.],
        [163., 194., 217.],
        [160., 189., 216.],
        ...,
        [108., 132., 160.],
        [108., 132., 160.],
        [107., 131., 159.]],

       [[168., 198., 223.],
        [158., 188., 213.],
        [155., 184., 211.],
        ...,
        [107., 131., 159.],
        [107., 131., 159.],
        [107., 131., 159.]],

       [[161., 190., 217.],
        [165., 194., 221.],
        [162., 191., 218.],
        ...,
        [106., 131., 157.],
        [106., 130., 158.],
        [105., 129., 157.]],

       ...,

       [[228., 222., 227.],
        [226., 222., 227.],
        [228., 222., 227.],
        ...,
        [127., 117., 130.],
        [130., 117., 131.],
        [129., 116., 130.]],

       [[213., 209., 215.],
        [212., 210., 216.],
        [214., 210., 216.],
        ...,
        [131., 118., 132.],
        [130., 117., 1

## image 및 annotation 저장

In [7]:
# Output layout under dst_data_dir:
#   train -> images/training,   annotations/training
#   val   -> images/validation, annotations/validation
#   test  -> test (images only, no annotations)
if mode == 'train':
    images_save_dir = os.path.join(dst_data_dir, 'images/training')
    annotations_save_dir = os.path.join(dst_data_dir, 'annotations/training')
elif mode == 'val':
    images_save_dir = os.path.join(dst_data_dir, 'images/validation')
    annotations_save_dir = os.path.join(dst_data_dir, 'annotations/validation')
else:  # mode == 'test'
    images_save_dir = os.path.join(dst_data_dir, 'test')
    annotations_save_dir = None

# Create whichever output directories are needed and do not exist yet.
for save_dir in (images_save_dir, annotations_save_dir):
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)
        print('A directory - ' + save_dir + ' is created.')


def _imwrite_or_raise(path, array):
    """Write ``array`` to ``path`` with cv2.imwrite and raise if the write fails.

    cv2.imwrite reports failure only through its boolean return value, so an
    unchecked call can silently leave files missing from the exported dataset.
    """
    if not cv2.imwrite(path, array):
        raise OSError(f'cv2.imwrite failed to write {path}')


if mode in ('train', 'val'):
    for idx in tqdm(range(len(dataset))):
        img, mask, image_infos = dataset[idx]
        # Files are named after the running dataset index rather than
        # image_infos["id"]: the ids of the train data and of the pseudo data
        # (test data) overlap, so id-based names would overwrite each other.
        image_save_path = os.path.join(images_save_dir, f'{idx:04}.jpg')
        annotation_save_path = os.path.join(annotations_save_dir, f'{idx:04}.png')

        _imwrite_or_raise(image_save_path, img)
        _imwrite_or_raise(annotation_save_path, mask)

elif mode == 'test':
    for idx in tqdm(range(len(dataset))):
        img, image_infos = dataset[idx]
        # Only one source of ids in test mode, so the image id is used as the file name.
        image_save_path = os.path.join(images_save_dir, f'{image_infos["id"]:04}.jpg')

        _imwrite_or_raise(image_save_path, img)

  0%|          | 6/4091 [00:00<01:12, 56.60it/s]

A directory - /opt/ml/segmentation/input/mmseg_pseudo/images/training is created.
A directory - /opt/ml/segmentation/input/mmseg_pseudo/annotations/training is created.


100%|██████████| 4091/4091 [01:13<00:00, 55.49it/s]
