In [4]:
import numpy as np
import pandas as pd
import os
import glob
from string import digits

import cv2
import albumentations as A
import matplotlib as mlp
import matplotlib.pyplot as plt

In [5]:
# Load the provided training metadata file.
data = pd.read_csv(filepath_or_buffer="/opt/ml/input/data/train/train.csv")

In [6]:
# Class label encoding
# gender - male[0], female[1]
# age - <30[0], 30<= and <60[1], 60<=[2]
# mask - mask[0], incorrect[1], nomask[2]

In [7]:
# One entry per image folder, taken from the 'path' column of the metadata.
train_folder_paths = data.loc[:, 'path']

In [8]:
# Build one row per image file: [id, gender, age, mask, path].
total = list()

# Translation table that deletes every digit; it does not depend on the file,
# so it is built once instead of once per image.
table = str.maketrans('', '', digits)

for path in train_folder_paths.tolist():
    # The folder name is split on '_' and fields 0, 1 and 3 are used as
    # id, gender and age respectively.
    split_text = path.split('_')
    person_id = split_text[0]  # not named `id`, which would shadow the builtin in the kernel
    gender = split_text[1]
    age = int(split_text[3])
    # glob returns files in arbitrary filesystem order; sorting keeps the row
    # order (and everything derived from it) reproducible between runs.
    for i in sorted(glob.glob("/opt/ml/input/data/train/images/{}/*".format(path))):
        # Strip digits from the file stem to get the mask label. Cutting by
        # character count is not an option because both .jpg and .jpeg occur.
        mask = os.path.basename(i).split('.')[0].translate(table)
        total.append([person_id, gender, age, mask, i])

In [9]:
# Column names, in the same order as the fields of each row appended to `total`.
header = [
    'id',
    'gender',
    'age',
    'mask',
    'path',
]

In [10]:
# Per-image table assembled from the collected rows.
df = pd.DataFrame(total, columns=header)

In [11]:
# Encode gender as an integer class: male -> 0, female -> 1.
gender_to_class = {'male': 0, 'female': 1}
df['gender_class'] = df['gender'].map(gender_to_class)

In [12]:
# Bucket age into three classes: age < 30 -> 0, 30 <= age < 60 -> 1, age >= 60 -> 2.
# np.digitize returns, for each age, how many of the bin edges [30, 60] are <= age.
df['age_class'] = np.digitize(df['age'], [30, 60])

In [13]:
# Encode the mask label: mask -> 0, incorrect_mask -> 1, normal (no mask) -> 2.
mask_to_class = {'mask' : 0, 'incorrect_mask' : 1, 'normal' : 2}
df['mask_class'] = df['mask'].map(mask_to_class)

In [14]:
# Three-character key "<mask><gender><age>" built by concatenating the class ids as strings.
df['total_class'] = (
    df['mask_class'].map(str)
    + df['gender_class'].map(str)
    + df['age_class'].map(str)
)

In [15]:
# Enumerate every (mask, gender, age) combination and number them 0..17.
mask_ = [0, 1, 2]
age_ = [0, 1, 2]
gender_ = [0, 1]

# Keys are generated with mask varying slowest and age fastest.
key = []
for m in mask_:
    for g in gender_:
        for a in age_:
            key.append("{}{}{}".format(m, g, a))

value = [*range(18)]
print(value)

# Pair each combination key with its position in the enumeration.
total_class = dict(zip(key, value))
print(total_class)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
{'000': 0, '001': 1, '002': 2, '010': 3, '011': 4, '012': 5, '100': 6, '101': 7, '102': 8, '110': 9, '111': 10, '112': 11, '200': 12, '201': 13, '202': 14, '210': 15, '211': 16, '212': 17}


In [16]:
# Translate the three-character key into its class id via the total_class lookup.
df = df.assign(total_class_18=df['total_class'].map(total_class))

In [17]:
# Remove the intermediate string key once the numeric class id exists.
# Rebinding instead of inplace=True, plus errors='ignore', keeps this cell
# idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['total_class'], errors='ignore')

In [19]:
# Switch the kernel's working directory to the training data folder.
%cd /opt/ml/input/data/train

/opt/ml/input/data/train


In [20]:
# Number of images per class id, indexed by total_class_18.
check = df.groupby('total_class_18')['path'].count()

In [21]:
# Helper for eyeballing a transformed image
def visualize(image):
    """Draw `image` on a new 10x10-inch figure with the axes hidden."""
    _, ax = plt.subplots(figsize=(10, 10))
    ax.set_axis_off()
    ax.imshow(image)

In [None]:
# Augmentation: oversample every class with augmented copies until it has as
# many images as the largest class. Each augmented image is written under
# save_location and described by one row in aug_pandas_list with the layout
# [id, gender, age, mask, path, gender_class, age_class, mask_class, total_class_18].
max_value = max(check)
save_location = "/opt/ml/input/data/train/aug_images"
aug_pandas_list = list()

# exist_ok lets the cell be re-run without a separate existence check.
os.makedirs(save_location, exist_ok=True)

# The pipeline does not depend on the image being processed, so it is built
# once here instead of once per image.
transform = A.Compose(
    [A.CoarseDropout(always_apply=False, p=1.0, max_holes=5, max_height=40, max_width=40, min_holes=3, min_height=20, min_width=20),
     A.RGBShift(always_apply=False, p=1.0, r_shift_limit=(-30, 30), g_shift_limit=(-30, 30), b_shift_limit=(-30, 30)),
     A.GaussNoise(always_apply=False, p=1.0, var_limit=(10.0, 50.0)),
     A.GridDistortion(always_apply=False, p=1.0, num_steps=2, distort_limit=(-0.30000001192092896, 0.30000001192092896), interpolation=0, border_mode=0, value=(0, 0, 0), mask_value=None),
     A.HorizontalFlip(p=0.5),
     A.Rotate(always_apply=False, p=1.0, limit=(-5, 5), interpolation=0, border_mode=0, value=(0, 0, 0), mask_value=None)
    ])

# Iterate over the (class id, count) pairs actually present in `check` rather
# than assuming the index is exactly 0..len(check)-1; a class with no images
# would otherwise shift or break the lookup.
for class_id, class_count in check.items():
    class_id = int(class_id)
    class_count = int(class_count)

    num = max_value - class_count  # how many extra images this class needs
    if num == 0:
        continue  # already the largest class
    mok, namur = divmod(num, class_count)  # full repeats of the class, then the remainder

    need_aug_path = df.loc[df['total_class_18'] == class_id, 'path'].tolist()
    # Repeat the class's images blindly until the counts match.
    need_aug_paths = need_aug_path * mok + need_aug_path[:namur]

    for idx, org_path in enumerate(need_aug_paths):
        img = cv2.imread(org_path)
        if img is None:
            # cv2.imread signals failure by returning None; fail with the
            # offending path instead of an opaque cvtColor error.
            raise IOError("could not read image: {}".format(org_path))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        augmented_image = transform(image=img)['image']
        img = cv2.cvtColor(augmented_image, cv2.COLOR_RGB2BGR)

        # Recover the labels from the path: the parent folder name supplies
        # id/gender/age (fields 0, 1 and 3 when split on '_') and the file
        # stem is the mask name. Distinct local names are used so the lists
        # mask_/age_/gender_ defined earlier in the notebook are not overwritten.
        id_gender_age = os.path.basename(os.path.dirname(org_path))
        id_gender_age_split = id_gender_age.split('_')
        person_id = id_gender_age_split[0]
        gender_name = id_gender_age_split[1]
        age_years = int(id_gender_age_split[3])
        mask_name, ext = os.path.splitext(os.path.basename(org_path))  # ext keeps its leading '.'

        dir_path = "{}/{}".format(save_location, id_gender_age)
        os.makedirs(dir_path, exist_ok=True)
        # idx is unique within a class, which keeps the generated file names distinct.
        aug_path = "{}/{}_{}{}".format(dir_path, mask_name, idx, ext)
        if not cv2.imwrite(aug_path, img):
            # Do not record a row for a file that was never written.
            raise IOError("could not write image: {}".format(aug_path))

        gender_cls = 1 if gender_name == 'female' else 0
        if age_years >= 60:
            age_cls = 2
        elif age_years >= 30:
            age_cls = 1
        else:
            age_cls = 0
        # The stem is compared verbatim (digits are not stripped here), so any
        # name other than these two exact strings falls through to class 0.
        mask_cls = {"incorrect_mask": 1, "normal": 2}.get(mask_name, 0)

        aug_pandas_list.append([person_id, gender_name, age_years, mask_name, aug_path, gender_cls, age_cls, mask_cls, class_id])

In [None]:
# Table of augmented images; each row in aug_pandas_list is laid out in df's column order.
aug_pandas = pd.DataFrame(aug_pandas_list, columns=df.columns)

In [None]:
# Rows in mask class 0 get the plain label 'mask'; every other row keeps its label.
is_mask_class_zero = aug_pandas['mask_class'] == 0
aug_pandas['mask'] = aug_pandas['mask'].where(~is_mask_class_zero, 'mask')

In [None]:
# Stack the original rows and the augmented rows under a fresh 0..n-1 index.
result = pd.concat(objs=[df, aug_pandas], axis=0, ignore_index=True)

In [None]:
# Write the combined table to the current working directory, overwriting any existing file.
result.to_csv(path_or_buf="./total_class_aug.csv", mode='w')