In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
import torchvision.models as models

import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image

from tqdm import tqdm
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import gc
import argparse
import random
from box import Box
import cv2
import cvlib as cv
import albumentations
import timm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Build the coarse age-group code (`ages`)
def get_ages(x):
    """Map an age to its coarse age-group index.

    Returns 0 when ``x < 30``, 1 when ``x < 60`` and 2 otherwise.
    """
    # Exclusive upper bounds of groups 0 and 1; everything else is group 2.
    for group, upper_bound in enumerate((30, 60)):
        if x < upper_bound:
            return group
    return 2

# Build the gender code (`genders`)
def get_genders(x):
    """Encode a gender value: 0 for ``'male'``, 1 for anything else."""
    return 0 if x == 'male' else 1

# Build the mask-state code (`masks`)
def get_masks(x):
    """Encode a mask-state name.

    ``'normal'`` -> 2, ``'incorrect_mask'`` -> 1, any other value -> 0.
    """
    if x == 'normal':
        return 2
    return 1 if x == 'incorrect_mask' else 0

# # age_cats 생성
# def get_age_cats(x):
#     if x < 20: return 0
#     elif x < 30: return 1
#     elif x < 40: return 2
#     elif x < 50: return 3
#     elif x < 60: return 4
#     else: return 5

def get_age_cats(x):
    """Map an age to one of seven fine-grained age buckets (0..6).

    Bucket ``i`` is the first exclusive upper bound in
    ``(25, 30, 45, 52, 57, 60)`` that ``x`` is below; ages at or above 60
    fall into bucket 6.
    """
    upper_bounds = (25, 30, 45, 52, 57, 60)
    for bucket, upper_bound in enumerate(upper_bounds):
        if x < upper_bound:
            return bucket
    return 6

# Build the combined class label (`labels`)
def get_labels(masks, genders, ages):
    """Combine mask, gender and age-group codes into one class index.

    The result is ``masks * 6 + genders * 3 + ages``.
    """
    mask_part = masks * 6
    gender_part = genders * 3
    return mask_part + gender_part + ages

# Build the fine-grained label used for stratification (`label_cats`)
def get_label_cats(masks, genders, ages, num_age_cats=7):
    """Combine mask, gender and fine age-bucket codes into one category id.

    The id is ``masks * (2 * num_age_cats) + genders * num_age_cats + ages``,
    so every (mask, gender, age bucket) triple gets its own value as long as
    ``genders`` is 0 or 1 and ``0 <= ages < num_age_cats``.

    Args:
        masks: mask-state code.
        genders: gender code (0 or 1).
        ages: fine age-bucket code.
        num_age_cats: number of distinct age buckets. Defaults to 7 because
            ``get_age_cats`` returns values 0..6; the previous stride of 6
            made e.g. (0, 0, 6) and (0, 1, 0) collide. Pass 6 to reproduce
            the old ``masks * 12 + genders * 6 + ages`` encoding.

    Returns:
        The combined category id.
    """
    gender_stride = num_age_cats
    mask_stride = 2 * num_age_cats  # two gender codes per mask state
    return masks * mask_stride + genders * gender_stride + ages

# Fix mislabeled mask images by swapping paths
def swap_mask(swap_li : list, df : pd.DataFrame) -> pd.DataFrame:
    """Swap the 'normal' and 'incorrect_mask' image paths for the given ids.

    Args:
        swap_li: ids whose two images were stored under each other's name.
        df: frame with at least the columns ``'id'``, ``'mask'`` and ``'path'``.

    Returns:
        A copy of ``df`` in which, for every id in ``swap_li``, the rows with
        ``mask == 'normal'`` carry the path of the first ``'incorrect_mask'``
        row and vice versa. ``df`` itself is not modified.
    """
    fixed = df.copy()

    for person_id in swap_li:
        is_person = fixed['id'] == person_id
        normal_rows = fixed.index[is_person & (fixed['mask'] == 'normal')]
        incorrect_rows = fixed.index[is_person & (fixed['mask'] == 'incorrect_mask')]

        # Read both paths before writing either so the swap does not clobber itself.
        path_of_normal = fixed.loc[normal_rows, 'path'].values[0]
        path_of_incorrect = fixed.loc[incorrect_rows, 'path'].values[0]

        fixed.loc[normal_rows, 'path'] = path_of_incorrect
        fixed.loc[incorrect_rows, 'path'] = path_of_normal

    return fixed

# Build the per-image train_df and fix the mislabeled mask images
def make_train_df(df : pd.DataFrame, swap_mask_li : list, cfg) -> pd.DataFrame:
    """Expand the per-person frame into one row per image file.

    For every row of ``df`` the directory ``cfg.train_image_dir/<path>`` is
    listed; each file whose name does not start with ``'.'`` becomes one
    output row. The mask state is taken from the file name up to its first
    ``'.'``.

    Args:
        df: per-person frame with the columns ``'id'``, ``'gender'``,
            ``'age'`` and ``'path'`` (directory name of that person).
        swap_mask_li: ids passed to ``swap_mask`` to correct swapped images.
        cfg: configuration object exposing ``train_image_dir``.

    Returns:
        A frame with the columns ``id, mask, gender, age, masks, genders,
        ages, age_cats, labels, label_cats, path, idx``, after ``swap_mask``
        has been applied.
    """
    records = []

    for _, line in df.iterrows():
        person_dir = os.path.join(cfg.train_image_dir, line['path'])

        # These depend only on the person, so compute them once per directory.
        gender = line['gender']
        age = line['age']
        genders = get_genders(gender)
        ages = get_ages(age)
        age_cats = get_age_cats(age)

        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore the 'idx' column) reproducible across runs.
        for file in sorted(os.listdir(person_dir)):
            if file.startswith('.'):
                continue

            mask = file.split('.')[0]
            masks = get_masks(mask)

            records.append({
                'id' : line['id'],
                'mask' : mask,
                'gender' : gender,
                'age' : age,
                'masks' : masks,
                'genders' : genders,
                'ages' : ages,
                'age_cats' : age_cats,
                'labels': get_labels(masks = masks, genders = genders, ages = ages),
                'label_cats': get_label_cats(masks = masks, genders = genders, ages = age_cats),
                'path': os.path.join(person_dir, file),
            })

    train_df = pd.DataFrame(records)

    # Keep the original row position as an explicit column.
    train_df['idx'] = train_df.index

    return swap_mask(swap_li = swap_mask_li, df = train_df)

# Fix mislabeled genders
def swap_gender(swap_li : list, df : pd.DataFrame) -> pd.DataFrame:
    """Overwrite the gender of selected ids.

    Args:
        swap_li: iterable of ``(id, gender)`` pairs; every row whose ``'id'``
            matches gets its ``'gender'`` set to the paired value.
        df: frame with the columns ``'id'`` and ``'gender'``.

    Returns:
        A corrected copy of ``df``; the input frame is left untouched.
    """
    corrected = df.copy()
    for person_id, new_gender in swap_li:
        matching_rows = corrected.index[corrected['id'] == person_id]
        corrected.loc[matching_rows, 'gender'] = new_gender
    return corrected

# Per-person data used for splitting, with mislabeled genders fixed
def preprocessing_df(df : pd.DataFrame, swap_gender_li : list) -> pd.DataFrame:
    """Prepare the per-person frame used for the cross-validation split.

    Applies the gender corrections, adds the encoded ``'ages'`` and
    ``'genders'`` columns and builds ``'cv_taget_col'``, a string of the form
    ``ages_<ages>_genders_<genders>``.

    Args:
        df: per-person frame with the columns ``'id'``, ``'age'`` and ``'gender'``.
        swap_gender_li: ``(id, gender)`` corrections forwarded to ``swap_gender``.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    people = swap_gender(swap_li = swap_gender_li, df = df.copy())

    people['ages'] = people['age'].apply(get_ages)
    people['genders'] = people['gender'].apply(get_genders)

    age_part = 'ages_' + people['ages'].astype(str)
    gender_part = '_genders_' + people['genders'].astype(str)
    people['cv_taget_col'] = age_part + gender_part

    return people

# Generate the validation indices of each fold
def get_val_idx(df : pd.DataFrame, target_col : str):
    """Yield the validation indices of each fold of a stratified 5-fold split.

    The split is stratified on ``df[target_col]``, shuffled, and seeded with
    ``random_state = 22``.

    Args:
        df: frame to split.
        target_col: name of the column to stratify on.

    Yields:
        The validation index array of each fold, in fold order.
    """
    splitter = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 22)
    # Only the validation half is needed; callers derive the training rows themselves.
    yield from (fold_val_idx for _, fold_val_idx in splitter.split(df, df[target_col]))
        
        
# (id, corrected gender) pairs applied by swap_gender.
swap_gender_li = [
    ["001498-1", "female"],
    ["004432", "female"],
    ["005223", "female"],
    ['006359', 'male'],
    ['006360', 'male'],
    ['006361', 'male'],
    ['006362', 'male'],
    ['006363', 'male'],
    ['006364', 'male'],
]

# ids whose 'normal' and 'incorrect_mask' image paths are exchanged by swap_mask.
swap_mask_li = [
    '000020',
    '004418',
    '005227',
]

In [None]:
# Experiment configuration. It is wrapped in a Box below, and the rest of the
# file reads entries as attributes (e.g. config.train_data_dir).
config = {
    'seed' : 22,
    
    'image_size' : [512, 384],  # presumably [height, width] — TODO confirm against the transform code
    'image_normal_mean' : [0.5, 0.5, 0.5],
    'image_normal_std' : [0.2, 0.2, 0.2],
    
    # Alternative image settings (disabled).
#     'image_size' : [380, 380],
#     'image_normal_mean' : [0.485, 0.456, 0.406],
#     'image_normal_std' : [0.229, 0.224, 0.225],
    
    'num_workers' : 3,
    'epochs' : 20,
    'batch_size' : 128,
    'lr' : 0.00009,
    'oof' : 1,  # number of folds processed by the `range(1, config.oof + 1)` loop
    'num_classes' : 18,  # get_labels yields 3 masks * 2 genders * 3 age groups = 18 values
    
    # Loss names listed by the original author (presumably the supported
    # values of 'loss' — verify against the training code):
#     cel
#     labelsmoothing
#     focal
#     f1
#     'loss' : 'cel',
    
    # Focal-loss variant (disabled); note it refers to a `device` name that is
    # not defined in the visible part of the file.
#     'loss' : 'focal',
#     'weight' : torch.tensor([0.158863, 0.139296, 1.000000, 0.494792, 0.101064, 0.494792]).to(device),
    
    'loss' : 'labelsmoothing',
    'smoothing' : 0.1,
    
    'train_data_name' : 'train.csv',
    'train_data_dir' : '/opt/ml/input/data/train',
    'train_image_dir' : '/opt/ml/input/data/train/images',
    
    'submission_data_name' : 'info.csv',
    'submission_data_dir' : '/opt/ml/input/data/eval',
    'submission_image_dir' : '/opt/ml/input/data/eval/images',
    
    'model_dir' : '/opt/ml/model',
    # name under which the model is saved
    'model_name' : 'regnety_002_v22',
    
    # model name as it exists in timm
    'timm_model_name' : 'regnety_002',
    
    # training targets
    # NOTE: the 'taget' spelling in the keys and values below is part of the
    # runtime names (config.cv_taget_col, the 'cv_taget_col' column); do not
    # "fix" it here without updating every user.
    'tagets_col' : 'labels',
    'split_col' : 'label_cats',
    'cv_taget_col' : 'cv_taget_col',
    
    # name of the output file to save
    'file_name' : 'regnety_002_v22.csv',
}

# Box provides attribute-style access to the dictionary entries.
config = Box(config)

In [None]:
# Training metadata; preprocessing_df and make_train_df read its
# 'id', 'gender', 'age' and 'path' columns.
df = pd.read_csv(
    os.path.join(config.train_data_dir, config.train_data_name)
)

# Metadata of the evaluation set.
submission = pd.read_csv(
    os.path.join(config.submission_data_dir, config.submission_data_name)
)

In [None]:
from tqdm import tqdm
import shutil

def createAbnormalImage(df: pd.DataFrame, path: str):
    """Average the images listed in ``df['path']`` pixel-wise and save the result.

    Args:
        df: frame with a ``'path'`` column of image file paths. The images
            must all have the same dimensions, otherwise the element-wise
            mean cannot be computed.
        path: destination file for the averaged image (written as RGB).

    Returns:
        The averaged image as a ``uint8`` array.

    Raises:
        ValueError: if ``df`` contains no rows.
    """
    if len(df) == 0:
        # Without this guard the mean of an empty list is NaN and the failure
        # only surfaces later as an obscure error inside PIL.
        raise ValueError('createAbnormalImage needs at least one source image')

    lists_images_for_avg = []
    for image_path in df['path']:
        # The context manager releases the file handle promptly; convert('RGB')
        # guarantees three channels so the 'RGB' output mode below is valid
        # even for grayscale or RGBA sources.
        with Image.open(image_path) as source_image:
            pixels = np.array(source_image.convert('RGB')).astype(np.int16)
        lists_images_for_avg.append(pixels)

    genereted_image = np.mean(lists_images_for_avg, axis=0).astype(np.uint8)
    Image.fromarray(genereted_image, 'RGB').save(path)

    return genereted_image

# `combinations` is needed for the sampling below but is not imported anywhere
# else in the visible file, so import it here to keep this cell self-contained.
from itertools import combinations

# Per-person table (genders corrected) and per-image table (mask paths swapped).
pre_df = preprocessing_df(df = df, swap_gender_li = swap_gender_li)
train_df = make_train_df(df = pre_df, swap_mask_li = swap_mask_li, cfg = config)

all_idx_li = pre_df.index.tolist()
val_idx_li = get_val_idx(df = pre_df, target_col = config.cv_taget_col)

# Index the per-image table by person id once instead of twice per fold.
train_by_id = train_df.set_index('id')

for fold_num in range(1, config.oof + 1):
    # Train / validation split for this fold, made on the per-person table.
    val_idx = next(val_idx_li)
    # sorted() fixes the row order; iteration order of a plain set is not guaranteed.
    trn_idx = sorted(set(all_idx_li) - set(val_idx.tolist()))

    val_id_df = pre_df.iloc[val_idx, :]
    trn_id_df = pre_df.iloc[trn_idx, :]

    val_df = train_by_id.loc[val_id_df['id'].tolist(), :].reset_index()
    trn_df = train_by_id.loc[trn_id_df['id'].tolist(), :].reset_index()

    # Labels with at most 1000 training images are topped up with synthetic ones.
    label_counts = trn_df['labels'].value_counts()
    gan_label_li = sorted(label_counts[label_counts <= 1000].index.tolist())

    gan_df = pd.DataFrame(columns = trn_df.columns)
    id_li = []
    path_li = []
    label_li = []

    # Output folder for this fold's generated images.
    image_dir = config.train_data_dir + f'/oof{fold_num}'

    # Start from a clean folder: remove any previous output, then recreate it.
    if os.path.exists(image_dir):
        shutil.rmtree(image_dir)
    os.mkdir(image_dir)

    for gan_label in tqdm(gan_label_li):
        gan_trn_df = trn_df[trn_df['labels'] == gan_label]
        # Number of synthetic images required to reach 1000 for this label.
        sample_cnt = 1000 - gan_trn_df['labels'].count()

        gan_id_li = gan_trn_df['id'].unique().tolist()

        # Fixed seed so the same id triples are drawn on every run.
        random.seed(22)
        # NOTE(review): this materialises every 3-combination of ids, which
        # grows cubically with the number of ids, and random.sample raises
        # ValueError when fewer than sample_cnt combinations exist.
        gan_li = random.sample(list(combinations(gan_id_li, 3)), sample_cnt)

        # One sub-folder per label.
        label_image_dir = image_dir + f'/gan{gan_label}'
        if not os.path.isdir(label_image_dir):
            os.mkdir(label_image_dir)

        # Loop-invariant: index this label's rows by id once.
        gan_trn_by_id = gan_trn_df.set_index('id')

        for gan in gan_li:
            id1, id2, id3 = gan
            image_name = id1 + '_' + id2 + '_' + id3
            path = os.path.join(label_image_dir, image_name + '.jpg')

            # list(gan): pass an explicit list of labels, since a tuple key
            # has a special meaning for .loc on MultiIndex frames.
            get_gan_trn_df = gan_trn_by_id.loc[list(gan), :]
            if len(get_gan_trn_df) > 3:
                # More than one image per id for this label: keep only the
                # rows of a single, randomly chosen mask type.
                mask_li = get_gan_trn_df['mask'].unique().tolist()
                get_mask = random.sample(mask_li, 1)
                get_gan_trn_df = get_gan_trn_df.reset_index()
                get_gan_trn_df = get_gan_trn_df.set_index('mask', drop = False).loc[get_mask, :]

            gan_image = createAbnormalImage(df = get_gan_trn_df, path = path)

            id_li.append(id1)
            path_li.append(path)
            label_li.append(gan_label)

    # Only id, path and labels are filled in; the remaining columns stay empty.
    gan_df['id'] = id_li
    gan_df['path'] = path_li
    gan_df['labels'] = label_li

    gan_df.to_csv(os.path.join(config.train_data_dir, f'oof{fold_num}_df'))