In [5]:
import pandas as pd
import numpy as np

In [6]:
# Load the preprocessed training table; header=0 takes the column names from the first CSV row.
train_df = pd.read_csv('processed_train.csv', header=0)

<br><br>
## 경로 및 라벨 추출
---

In [7]:
# Pull the image paths and the combined class labels out as two Series.
paths = train_df['FullPath']
labels = train_df['Class']

# Show one (label, path) pair; [0] looks up index label 0 on each Series.
print(f"label, path example: \nlabel: {labels[0]}\npath: {paths[0]}")

label, path example: 
label: 4
path: /opt/ml/input/data/train/images/000001_female_Asian_45/mask1.jpg


<br><br>
## 경로 변경
---
- 훈련 데이터의 이미지 폴더가 `/opt/ml/input/data/train/images`가 아닌 경우, `FullPath`를 새 폴더 기준으로 다시 만듭니다.

In [8]:
from pathlib import Path

In [9]:
def update_full_path(train_df, dir_images):
    """Return a copy of ``train_df`` whose ``FullPath`` column is rebuilt under ``dir_images``.

    Args:
        train_df: DataFrame with a ``Path`` column of string paths that are
            joined onto the new image directory.
        dir_images: New image root directory, passed through ``pathlib.Path``.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # Let pathlib normalise the root, then append the separator by hand.
    root_prefix = f"{Path(dir_images)}/"

    relocated = train_df.copy()
    relocated['FullPath'] = root_prefix + relocated['Path']
    return relocated

# Example: rebuild FullPath for images stored under a different folder.
dir_images = 'C://user/images'
train_df_updated = update_full_path(train_df, dir_images)

In [10]:
# Same preview as before, now reading from the frame with the rebuilt FullPath.
paths = train_df_updated['FullPath']
labels = train_df_updated['Class']

# Only the path should differ from the earlier preview; Class is carried over unchanged.
print(f"label, path example: \nlabel: {labels[0]}\npath: {paths[0]}")

label, path example: 
label: 4
path: C:/user/images/000001_female_Asian_45/mask1.jpg


<br><br>
## 칼럼
---
* 기본 데이터
    * 원본 데이터(from train.csv)
        * gender
        * race
        * age
        * path
    * 파일명
        * file - path 내의 파일이름(확장자 포함)
* 추가 데이터
    * Path - 이미지파일 폴더 및 이미지파일 경로(ex. 000002_female_Asian_52/mask.jpg) 
    * FullPath - 이미지 파일 full path
    * Mask 
    * Age 
    * Gender
    * Class (Competition 18개 class) 
    * ClassMask (Mask 독립 class)
    * ClassGender (Gender 독립 class)
    * ClassAge (Age 독립 Class)

In [11]:
# List every column available in the processed training frame.
train_df.columns

Index(['id', 'gender', 'race', 'age', 'path', 'file', 'FullPath', 'Path',
       'Mask', 'Age', 'Gender', 'Class', 'ClassMask', 'ClassGender',
       'ClassAge'],
      dtype='object')

In [12]:
# Preview the derived descriptive columns next to their numeric class encodings.
train_df[['Mask', 'Age', 'Gender', 'Class', 'ClassMask', 'ClassGender', 'ClassAge']].head(5)

Unnamed: 0,Mask,Age,Gender,Class,ClassMask,ClassGender,ClassAge
0,Wear,>= 30 and < 60,Female,4,0,1,1
1,Wear,>= 30 and < 60,Female,4,0,1,1
2,Wear,>= 30 and < 60,Female,4,0,1,1
3,Wear,>= 30 and < 60,Female,4,0,1,1
4,Wear,>= 30 and < 60,Female,4,0,1,1


<br><br>
## Label 변경
---

In [24]:
"""
ClassAge 예시
    변경 전
        < 30 : 0
        >= 30 and < 60 : 1
        >= 60 : 2
    변경 후 
        < 20 : 0
        >= 20 and < 40 : 1
        >= 40 and < 60 : 2
        >= 60 : 3
"""
# Work on a copy so the original train_df keeps its 3-bin ClassAge.
age_modified = train_df.copy()
ages = age_modified['age']

# Row masks for the four new age bins: <20, [20, 40), [40, 60), >=60.
label_0 = ages.lt(20)
label_1 = ages.ge(20) & ages.lt(40)
label_2 = ages.ge(40) & ages.lt(60)
label_3 = ages.ge(60)

# Write the new bin number into ClassAge for the rows selected by each mask.
for new_label, rows in enumerate((label_0, label_1, label_2, label_3)):
    age_modified.loc[rows, 'ClassAge'] = new_label

In [25]:
"""
변경된 ClassAge 반영하여 전체 Class 업데이트(필요한 경우)
"""
def update_class_value(train_df):
    """Recompute the combined ``Class`` label from ``ClassAge``, ``ClassGender`` and ``ClassMask``.

    The combined label is a mixed-radix number::

        Class = ClassAge + A * ClassGender + (A * G) * ClassMask

    where ``A`` and ``G`` are ``max label + 1`` of ``ClassAge`` and
    ``ClassGender``. Using ``max + 1`` (rather than the number of distinct
    labels present) keeps every (age, gender, mask) triple on its own
    ``Class`` value even when the frame is a subset with gaps in its labels.
    For labels that already cover 0..k-1 the result is the same as counting
    distinct values.

    Args:
        train_df: DataFrame holding the three per-attribute label columns.
            Labels are expected to be non-negative integers.

    Returns:
        A copy of ``train_df`` with ``Class`` overwritten; the input frame is
        not modified.
    """
    train_df = train_df.copy()

    def _radix(column):
        # Base for one digit: one more than the largest label in the column.
        # An empty or all-missing column falls back to 1.
        top = train_df[column].max()
        return 1 if pd.isna(top) else int(top) + 1

    num_age_labels = _radix('ClassAge')
    num_gender_labels = _radix('ClassGender')

    # ClassMask is the most significant digit, so its own base is never needed.
    train_df['Class'] = (
        train_df['ClassAge']
        + num_age_labels * train_df['ClassGender']
        + (num_age_labels * num_gender_labels) * train_df['ClassMask']
    )

    return train_df

In [27]:
# Rebuild the combined Class label so it reflects the re-binned ClassAge.
age_modified = update_class_value(age_modified)
age_modified['Class'].unique().size # 3 x 2 x 4 = 24 (ClassAge now has 4 bins)

24

<br><br>
## DataSet 예시
---
두 가지 방법을 고려할 수 있습니다.
1. train_df 자체를 넘겨주기
2. path 및 label을 넘겨주기 - 일반적인 경우?

In [16]:
from torch.utils.data import Dataset

In [28]:
# Case 1: the Dataset receives the whole DataFrame
class MaskClassifierDataset(Dataset):
    """Map-style dataset backed by a DataFrame with one row per image.

    Each item is an ``(image, label)`` pair: the image is opened from the
    row's ``FullPath`` and the label is read from ``label_column``.
    """

    def __init__(self, data_df, label_column='Class', transform=None):
        """Store the frame, the name of the label column and an optional image transform."""
        self.data_df = data_df
        self.label_column = label_column
        self.transform = transform

    def __len__(self):
        """Number of rows in the backing frame."""
        return len(self.data_df.index)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``."""
        row = self.data_df.iloc[index]

        # NOTE(review): `Image` is not imported in the visible cells —
        # presumably PIL.Image; confirm it is imported before this cell runs.
        image = Image.open(row['FullPath'])
        label = row[self.label_column]

        if not self.transform:
            return (image, label)
        return (self.transform(image), label)

- 이 경우, subsample을 Dataset 수준에서 쉽게 뽑아낼 수 있습니다.

In [32]:
# Keep only the rows where a mask is worn (correctly or incorrectly)
worn_states = ['Wear', 'Incorrect']
mask_df = update_class_value(train_df.loc[train_df['Mask'].isin(worn_states)])

# Compare the number of distinct combined labels before and after filtering.
print("# of labels")
print(f"before : {train_df['Class'].unique().size}")
print(f"after : {mask_df['Class'].unique().size}")

# Sorted list of the labels that remain in the subset.
labels = np.sort(mask_df['Class'].unique())
print(labels)

dataset = MaskClassifierDataset(mask_df)


# of labels
before : 24
after : 16
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


In [33]:
# Case 2: the Dataset receives paths and labels as separate sequences
class MaskClassifierDataset(Dataset):
    """Map-style dataset over parallel ``paths`` and ``labels`` sequences.

    Items are addressed by position (``0 .. len - 1``). pandas Series are read
    with ``.iloc`` so that a Series taken from a filtered DataFrame, whose
    index no longer runs 0..n-1, still returns the right row. Plain lists and
    arrays are indexed directly.
    """

    def __init__(self, paths, labels, transform=None):
        """Store the path and label sequences and an optional image transform.

        Args:
            paths: Sequence (Series, array or list) of image file paths.
            labels: Sequence of labels, parallel to ``paths``.
            transform: Optional callable applied to each opened image.
        """
        self.paths = paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Number of samples; ``len`` works for Series, arrays and lists alike."""
        return len(self.paths)

    @staticmethod
    def _item_at(values, index):
        """Return the element at position ``index``, ignoring any pandas index labels."""
        if hasattr(values, 'iloc'):
            return values.iloc[index]
        return values[index]

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample at position ``index``."""
        # NOTE(review): `Image` is not imported in the visible cells —
        # presumably PIL.Image; confirm it is imported before this cell runs.
        image = Image.open(self._item_at(self.paths, index))
        label = self._item_at(self.labels, index)

        if self.transform:
            image = self.transform(image)

        return (image, label)

In [None]:
# Case 2 usage: hand the path and label Series to the Dataset directly.
paths = train_df['FullPath']
labels = train_df['Class']
dataset = MaskClassifierDataset(paths, labels)
