# 1. 기본 세팅
    - import module
    - set basic path
    - config 설정
    - seed 고정

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib as mlt
import matplotlib.pyplot as plt
from PIL import Image
from box import Box
from typing import Tuple, List
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import timm

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import random
from torchvision import transforms

import time

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
class DirFilePath:
    """Namespace of filesystem locations used throughout the notebook.

    All attributes are plain strings built with ``os.path.join`` and are
    read as class attributes (the class is never instantiated here).
    """

    # Root of the workspace and of the input data underneath it.
    base_path = os.path.join('/opt', 'ml')
    data_dir = os.path.join(base_path, 'input', 'data')

    # Training split: directory, metadata CSV, and per-person image folders.
    train_dir = os.path.join(data_dir, 'train')
    train_meta = os.path.join(data_dir, 'train', 'train.csv')
    train_image_dir = os.path.join(data_dir, 'train', 'images')

    # Evaluation split: directory, metadata CSV, and image folder.
    eval_dir = os.path.join(data_dir, 'eval')
    eval_meta = os.path.join(data_dir, 'eval', 'info.csv')
    eval_image_dir = os.path.join(data_dir, 'eval', 'images')

    # Output directory for model artifacts.
    save_dir = os.path.join(base_path, 'model')

In [3]:
# Prefer the GPU whenever torch reports one; fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Name of the timm architecture used for the pretrained model.
timm_model = 'regnetx_002'

# Every tunable of the notebook in one attribute-accessible Box.
config = Box({
    # reproducibility
    'seed': 42,

    # cross-validation split settings
    'num_fold': 5,
    'num_split_fold': 5,
    'split__shuffle': True,
    'split_base_col': 'split_base',

    # per-channel image normalization statistics
    'image_normal_mean': [0.55800916, 0.51224077, 0.47767341],
    'image_normal_std': [0.21817792, 0.23804603, 0.25183411],

    # model and optimization settings
    'timm_model_name': timm_model,
    'num_classes': 18,
    'batch_size': 128,
    'epochs': 3,
    'num_accum': 1,
    'num_workers': 3,
    'learning_rate': 0.0001,
    'device': device,
})

In [4]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and torch (CPU and
    all CUDA devices), and switches cuDNN to deterministic mode with the
    auto-tuner disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python and NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # torch generators (also drives the randomness of torchvision transforms).
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers the multi-GPU case

    # cuDNN: deterministic kernels, no benchmark-based algorithm selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Fix all RNG seeds once, using the seed stored in the config.
seed_everything(config.seed)

# 2. 데이터 전처리
    - 이상 데이터 처리
    - gender, age 인코딩
        - male: 0, female: 1 ==> replace=True
        - <30: 0, 30 <= and < 60: 1, >=60: 2 ==> replace=False
    - train_meta_df
        - columns: id(str), gender(int), age(int), path(str), ages(int), split_base(int)
    - train_images_df
        - columns: gender(int), age(int), ages(int), path(str), file_path(str), mask(int), label(int)
        - labeling.csv로 저장

In [5]:
class Preprocessing:
    """Turn the raw training metadata into encoded metadata and per-image labels.

    Public entry points:
        * ``recreateMetaDataFrame`` - fix known gender mislabels, then encode
          gender / age group and derive the stratification key.
        * ``createImageDataFrmae`` (alias ``createImageDataFrame``) - expand
          the per-person metadata into one row per image file and write it to
          ``labeling.csv``.

    All helpers are class/static methods; the class holds no instance state.
    """

    # Files whose ctime is later than this Unix timestamp are treated as
    # already renamed by ``_modifyAnomalyFilename``.
    _RENAMED_AFTER_TIMESTAMP = 1646000000.0

    @classmethod
    def recreateMetaDataFrame(cls, meta_df: pd.DataFrame) -> pd.DataFrame:
        """
        meta_df: id(str), gender(str), race(str), age(int), path(str)
        -> meta_df: id(str), gender(int), age(int), path(str), ages(int), split_base(int)

        The input frame is not modified; a new frame is returned.
        """
        # cls._modifyAnomalyFilename()
        meta_df = cls._modifyAnomalyGender(meta_df)
        meta_df = cls._encodeAgeGender(meta_df)
        return meta_df

    @classmethod
    def createImageDataFrmae(cls, meta_df: pd.DataFrame) -> pd.DataFrame:
        """
        meta_df: id(str), gender(int), age(int), path(str), ages(int), split_base(int)
        -> info_df: gender(int), age(int), ages(int), path(str), file_path(str), mask(int), label(int)
        """
        return cls._metaToImageDataFrame(meta_df)

    # Correctly spelled alias; the original (misspelled) name stays for existing callers.
    createImageDataFrame = createImageDataFrmae

    @staticmethod
    def _maskFromFilename(image_file_name: str) -> int:
        """Map an image file name to its mask code.

        Returns:
            2 when the name before the first dot is ``normal``, 1 when it is
            ``incorrect_mask``, and 0 for every other file name.
        """
        # Compare the part before the first dot. The previous code compared the
        # part *after* the dot (the extension) against 'incorrect_mask', so that
        # branch could never match and those images were labelled with mask 0.
        stem = image_file_name.split('.')[0]
        if stem == 'normal':
            return 2
        if stem == 'incorrect_mask':
            return 1
        return 0

    @staticmethod
    def _metaToImageDataFrame(meta_df: pd.DataFrame) -> pd.DataFrame:
        """Build one row per image file and save the table as ``labeling.csv``.

        For every metadata row, the directory ``train_image_dir/<path>`` is
        listed (hidden files are skipped) and each file gets
        ``label = mask * 6 + gender * 3 + ages``.

        Args:
            meta_df: Encoded metadata with ``gender``, ``age``, ``ages`` and ``path``.

        Returns:
            The per-image frame that was also written to ``train_dir/labeling.csv``.
        """
        columns = ['gender', 'age', 'ages', 'path', 'file_path', 'mask', 'label']
        records = []

        for _, row in tqdm(meta_df.iterrows(), total=meta_df.shape[0]):
            image_dir_path = os.path.join(DirFilePath.train_image_dir, row['path'])
            # Sorted so the row order does not depend on the filesystem's listing order.
            for image_file_name in sorted(os.listdir(image_dir_path)):
                if image_file_name.startswith('.'):
                    continue

                mask = Preprocessing._maskFromFilename(image_file_name)
                records.append({
                    'gender': row['gender'],
                    'age': row['age'],
                    'ages': row['ages'],
                    'path': row['path'],
                    'file_path': os.path.join(image_dir_path, image_file_name),
                    'mask': mask,
                    'label': mask * 6 + row['gender'] * 3 + row['ages'],
                })

        # Build the frame once; appending row by row is quadratic and
        # DataFrame.append no longer exists in pandas >= 2.0.
        info_df = pd.DataFrame(records, columns=columns)

        info_df.to_csv(os.path.join(DirFilePath.train_dir, 'labeling.csv'), index=False)
        print('create image file data frame doen!')

        return info_df

    @staticmethod
    def _encodeAges(age: int) -> int:
        """Bucket an age: ``< 30`` -> 0, ``30..59`` -> 1, ``>= 60`` -> 2."""
        if age < 30:
            return 0
        if age < 60:
            return 1
        return 2

    @staticmethod
    def _encodeGender(gender: str) -> int:
        """Encode ``'male'`` as 0 and any other value as 1."""
        return 0 if gender == 'male' else 1

    @classmethod
    def _encodeAgeGender(cls, prev_df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy with encoded ``gender``, new ``ages`` and ``split_base`` columns.

        ``split_base`` is ``3 * gender + ages``; the ``race`` column is dropped.
        """
        ret_df = prev_df.copy()
        ret_df['ages'] = prev_df['age'].apply(cls._encodeAges)
        ret_df['gender'] = prev_df['gender'].apply(cls._encodeGender)
        ret_df['split_base'] = 3 * ret_df['gender'] + ret_df['ages']
        ret_df = ret_df.drop(labels='race', axis=1)

        print('encode age and gender done!')

        return ret_df

    @classmethod
    def _genderSwapList(cls) -> Tuple[List[str], List[str]]:
        """Legacy name for ``_genderAnomalyList``.

        The previous body raised on a misspelled ``extend`` call and returned
        an undefined name; it now delegates to the maintained implementation.
        """
        return cls._genderAnomalyList()

    @classmethod
    def _modifyAnomalyGender(cls, meta_df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``meta_df`` with known gender mislabels corrected.

        Rows are matched on the ``path`` column; paths that are absent from
        ``meta_df`` are ignored.
        """
        # Work on a copy so the caller's frame is left untouched.
        fixed_df = meta_df.copy()
        gender_anomaly_list, gender_list = cls._genderAnomalyList()
        for path_name, correct_gender in zip(gender_anomaly_list, gender_list):
            fixed_df.loc[fixed_df['path'] == path_name, 'gender'] = correct_gender

        print('modify anomaly gender done!')
        return fixed_df

    @staticmethod
    def _genderAnomalyList() -> Tuple[List[str], List[str]]:
        """Return ``(paths, corrected_genders)`` for the known mislabelled people."""
        # Source: course board post #3 (posted by T3006).
        anomaly_female_id = list(range(6359, 6365))
        gender_anomaly_list = [f'00{person_id}_female_Asian_18' for person_id in anomaly_female_id]
        gender_list = ['male'] * len(anomaly_female_id)

        # Source: course board post #3 (posted by T3006), follow-up comment by T3011.
        anomaly_male_id = ['001498-1_male_Asian_23', '004432_male_Asian_43']
        gender_anomaly_list.extend(anomaly_male_id)
        gender_list.extend(['female'] * len(anomaly_male_id))

        return gender_anomaly_list, gender_list

    @classmethod
    def _modifyAnomalyFilename(cls, titles=('incorrect_mask.jpg', 'normal.jpg')) -> None:
        """
        swap titles[0] <-> titles[1] inside every directory from ``_filenameAnomalyList``.

        A directory is skipped when ``_isFileChanged`` reports that its
        ``titles[0]`` file was already renamed.
        """
        filename_anomaly_list = cls._filenameAnomalyList()

        for image_dir in filename_anomaly_list:
            image_dir_path = os.path.join(DirFilePath.train_image_dir, image_dir)
            first_path = os.path.join(image_dir_path, titles[0])
            second_path = os.path.join(image_dir_path, titles[1])
            temp_path = os.path.join(image_dir_path, 'temp.jpg')

            if cls._isFileChanged(first_path):
                print(f'{first_path} is already changed!')
                continue
            print(f'{first_path} is changing')

            # Three-step swap through a temporary name.
            os.rename(first_path, temp_path)
            os.rename(second_path, first_path)
            os.rename(temp_path, second_path)

        print('modify anomaly filename done!')

    @staticmethod
    def _isFileChanged(file_path: str) -> bool:
        """Report whether ``file_path``'s ctime is later than the cut-off timestamp."""
        # os.path.getmtime(file_path) did not work here (possibly a file-system
        # issue), so the ctime is used instead.
        return os.path.getctime(file_path) > Preprocessing._RENAMED_AFTER_TIMESTAMP

    @staticmethod
    def _filenameAnomalyList() -> List[str]:
        """Return the directories whose two image file names must be swapped."""
        # Source: course board post #8 (posted by T3006).
        filename_anomaly_list = ['000020_female_Asian_50', '004418_male_Asian_20']

        # Source: course board post #3 (posted by T3006), follow-up comment by T3011.
        filename_anomaly_list.append('005227_male_Asian_22')

        return filename_anomaly_list

In [6]:
# Load the raw training metadata, then correct known gender mislabels and
# encode gender / age group (adds `ages` and `split_base`, drops `race`).
train_meta_df = pd.read_csv(DirFilePath.train_meta)
train_meta_df = Preprocessing.recreateMetaDataFrame(train_meta_df)

modify anomaly gender done!
encode age and gender done!


In [7]:
# Per-image label table. The commented-out call rebuilds it by listing the image
# directories and writes labeling.csv; here the previously written CSV is loaded.
# train_images_df = Preprocessing.createImageDataFrmae(train_meta_df)
train_images_df = pd.read_csv(os.path.join(DirFilePath.train_dir, 'labeling.csv'))

# 3. Data Split, Dataset

## 3.1. Data Split

In [8]:
def splitData(df: pd.DataFrame, n_splits: int, shuffle: bool, random_state: int, split_base: str):
    """Yield stratified K-fold ``(train_idx, val_idx)`` index arrays.

    Args:
        df: Frame to split; one row per sample.
        n_splits: Number of folds.
        shuffle: Whether to shuffle samples before splitting.
        random_state: Seed for the shuffle. Ignored when ``shuffle`` is False,
            because StratifiedKFold rejects a seed in that case.
        split_base: Name of the column in ``df`` used as the stratification label.

    Yields:
        Tuples ``(train_idx, val_idx)`` exactly as produced by ``StratifiedKFold.split``.
    """
    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        # scikit-learn raises ValueError when random_state is set with shuffle=False.
        random_state=random_state if shuffle else None,
    )

    yield from skf.split(df, df[split_base])

## 3.2. Dataset

In [9]:
# Image transform pipelines, exposed with attribute access through Box.
trfm = Box(dict(
    # Baseline pipeline: 3-channel grayscale (r = g = b), then conversion to a tensor.
    basic_trfm=transforms.Compose([
        transforms.Grayscale(num_output_channels=3),
        transforms.ToTensor(),
    ]),

    # Augmentation pipeline: Gaussian blur only.
    train_aug_trfm=transforms.Compose([
        transforms.GaussianBlur(
            kernel_size=5,  # a larger value blurs more
            sigma=(15, 19),
        ),
    ]),
))

In [10]:
class CustomDataset(Dataset):
    """Dataset that loads images listed in a DataFrame and applies a transform.

    Each row of the frame must provide ``file_path``; ``label`` is read as
    well when the dataset is created with ``train=True``.
    """

    def __init__(self, df: pd.DataFrame, transforms: transforms.Compose, train: bool) -> None:
        """Store the image table, the transform and the train/inference flag."""
        self.train = train
        self.trfm = transforms
        self.images_df = df

    def __len__(self) -> int:
        """Number of rows (images) in the table."""
        return self.images_df.shape[0]

    def __getitem__(self, idx) -> torch.Tensor:
        """Return the transformed image, paired with its label when ``train`` is set."""
        record = self.images_df.iloc[idx]
        transformed = self.trfm(Image.open(record['file_path']))

        if not self.train:
            return transformed
        return transformed, record['label']


# 4. 학습
    - model, loss
    - train, eval function
    - model parameter 저장

## 4.1. Model

In [11]:
class CreatePrertrainedModel(nn.Module):
    """Thin ``nn.Module`` wrapper around a model built by ``timm.create_model``."""

    def __init__(self, model_name: str, pretrained: bool, num_classes: int) -> None:
        """Create the wrapped timm model.

        Args:
            model_name: Architecture name passed to ``timm.create_model``.
            pretrained: Forwarded as ``pretrained``.
            num_classes: Forwarded as ``num_classes``.
        """
        super().__init__()
        self.model = timm.create_model(
            model_name,
            pretrained=pretrained,
            num_classes=num_classes,
        )

    def forward(self, x) -> list:
        """Return the wrapped model's output for ``x``."""
        return self.model(x)

In [12]:
def convBatch(in_channels: int, out_channels: int, kernel_size=3, padding=1, stride=1) -> nn.Sequential:
    """Build a bias-free Conv2d -> BatchNorm2d -> LeakyReLU stack.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by the convolution.
        kernel_size: Convolution kernel size.
        padding: Convolution padding.
        stride: Convolution stride.

    Returns:
        ``nn.Sequential`` holding the three layers in that order.
    """
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        padding=padding,
        stride=stride,
        bias=False,
    )
    layers = [conv, nn.BatchNorm2d(out_channels), nn.LeakyReLU()]
    return nn.Sequential(*layers)

class ResidualBlock(nn.Module):
    """Two-layer bottleneck with an identity shortcut.

    The input passes through a 1x1 ``convBatch`` that halves the channel
    count and a default ``convBatch`` that restores it; the input is then
    added back and ReLU is applied.

    Reference: https://coding-yoon.tistory.com/141
    """

    def __init__(self, in_channels: int):
        """Create the two ``convBatch`` stages for ``in_channels`` channels."""
        super().__init__()

        reduced_channels = in_channels // 2

        squeeze = convBatch(in_channels, reduced_channels, kernel_size=1, padding=0)
        expand = convBatch(reduced_channels, in_channels)
        self.features = nn.Sequential(squeeze, expand)

    def forward(self, x):
        """Return ``relu(features(x) + x)``."""
        return F.relu(self.features(x) + x)
        


class CustomModel(nn.Module):
    """Convolutional classifier assembled from ``convBatch`` stages and residual blocks.

    The feature extractor alternates ``convBatch`` layers (five of them with
    stride 2) with groups of residual blocks and ends with 1024 channels.
    The head pools to 1x1, flattens, and applies a linear classifier.
    """

    def __init__(self, num_classes: int, block: nn.Module) -> None:
        """Build the network.

        Args:
            num_classes: Size of the output layer.
            block: Callable (typically an ``nn.Module`` class) invoked as
                ``block(in_channels)`` to create each residual block.
        """
        super().__init__()

        self.features = nn.Sequential(
            convBatch(3, 32),
            convBatch(32, 64, stride=2),
            self._iter_residual_block(block, 64, 1),
            convBatch(64, 128, stride=2),
            self._iter_residual_block(block, 128, 2),
            convBatch(128, 256, stride=2),
            self._iter_residual_block(block, 256, 8),
            convBatch(256, 512, stride=2),
            self._iter_residual_block(block, 512, 8),
            convBatch(512, 1024, stride=2),
            self._iter_residual_block(block, 1024, 4),
        )
        self.fc = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            # Pooling leaves (N, 1024, 1, 1); Linear needs (N, 1024). Without
            # this flatten the forward pass fails on a shape mismatch.
            nn.Flatten(start_dim=1),
            nn.Linear(1024, num_classes, bias=True),
        )

    def forward(self, x):
        """Return class logits of shape ``(N, num_classes)`` for an image batch ``x``."""
        out = self.features(x)
        out = self.fc(out)

        return out

    def _iter_residual_block(self, block: nn.Module, in_channels: int, iter_num: int):
        """Stack ``iter_num`` instances of ``block(in_channels)`` into an ``nn.Sequential``."""
        return nn.Sequential(*(block(in_channels) for _ in range(iter_num)))

In [13]:
# Alternative: the hand-written CustomModel (currently disabled).
# model = CustomModel(18, ResidualBlock)
# Active model: pretrained timm backbone named in the config, with an 18-class output.
model = CreatePrertrainedModel(config.timm_model_name, pretrained=True, num_classes=18)
# Move the model to the selected device; as the last expression, its repr is displayed.
model.to(device)


CreatePrertrainedModel(
  (model): RegNet(
    (stem): ConvBnAct(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNormAct2d(
        32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
        (act): ReLU(inplace=True)
      )
    )
    (s1): RegStage(
      (b1): Bottleneck(
        (conv1): ConvBnAct(
          (conv): Conv2d(32, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNormAct2d(
            24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (act): ReLU(inplace=True)
          )
        )
        (conv2): ConvBnAct(
          (conv): Conv2d(24, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=3, bias=False)
          (bn): BatchNormAct2d(
            24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (act): ReLU(inplace=True)
          )
        )
        (conv3): ConvBnAct(
          (conv): Conv2d(24, 24, ker

## 4.2. Train, Evaluation 함수

In [14]:
def trainValidFunc(model, optimizer, criterion, train_loader, val_loader, epochs, device, num_accum):
    """Train ``model`` for ``epochs`` epochs and validate after each one.

    Args:
        model: Network to optimize; called as ``model(batch)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(output, target)``.
        train_loader: Iterable of ``(data, target)`` batches used for training.
        val_loader: Iterable of ``(data, target)`` batches used for validation.
        epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        num_accum: Number of batches whose gradients are accumulated per
            optimizer step (values below 1 are treated as 1).

    Returns:
        A list with one dict per epoch holding ``epoch``, ``train_loss``,
        ``train_acc``, ``train_f1``, ``val_loss``, ``val_acc``, ``val_f1``
        and ``train_time`` (seconds). The model is left in eval mode.
    """
    num_accum = max(int(num_accum), 1)
    history = []

    for epoch in range(epochs):
        # Wall-clock timing works on CPU and GPU alike (CUDA events need a GPU).
        epoch_start = time.perf_counter()
        train_loss, train_acc, train_f1 = _runEpoch(
            model, train_loader, criterion, device, optimizer=optimizer, num_accum=num_accum
        )
        _waitForDevice(device)
        train_time = time.perf_counter() - epoch_start

        # Validation runs on val_loader, without gradients and without optimizer steps.
        val_loss, val_acc, val_f1 = _runEpoch(model, val_loader, criterion, device)

        print(f'Epoch[{epoch+1}/{epochs}]\ttrain_acc: {train_acc:.4f}\ttrain_loss: {train_loss:.4f}\ttrain_f1: {train_f1:.4f}')
        print(f'\t\t훈련시간: {train_time:.0f}')
        print(f'\t\tval_acc: {val_acc:.4f}\tval_loss: {val_loss:.4f}\tval_f1: {val_f1:.4f}')

        history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'train_f1': train_f1,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'val_f1': val_f1,
            'train_time': train_time,
        })

    model.eval()
    return history


def _runEpoch(model, loader, criterion, device, optimizer=None, num_accum=1):
    """Run one pass over ``loader``; train when ``optimizer`` is given, else evaluate.

    Returns:
        ``(avg_loss, accuracy, f1)`` where ``avg_loss`` is the mean per-batch
        loss. All three are 0.0 when the loader yields no samples.
    """
    is_train = optimizer is not None
    model.train(is_train)

    num_batches = len(loader)
    total_loss = 0.0
    gt_list = []
    pred_list = []

    if is_train:
        optimizer.zero_grad()

    with torch.set_grad_enabled(is_train):
        for idx, (data, data_gt) in enumerate(loader):
            data = data.to(device)
            data_gt = data_gt.to(device)

            output = model(data)
            loss = criterion(output, data_gt)
            total_loss += loss.item()

            if is_train:
                # Scale so the accumulated gradient is the mean over the group.
                (loss / num_accum).backward()
                # Step on every full group and on the final (possibly partial) one,
                # so leftover gradients are never silently dropped.
                if (idx + 1) % num_accum == 0 or (idx + 1) == num_batches:
                    optimizer.step()
                    optimizer.zero_grad()

            gt_list.extend(data_gt.detach().cpu().tolist())
            pred_list.extend(output.argmax(dim=1).detach().cpu().tolist())

    if not gt_list:
        return 0.0, 0.0, 0.0

    avg_loss = total_loss / max(num_batches, 1)
    return avg_loss, getAcc(gt_list, pred_list), getF1(gt_list, pred_list)


def _waitForDevice(device):
    """Block until queued CUDA work has finished; no-op when not running on CUDA."""
    if str(device).startswith('cuda') and torch.cuda.is_available():
        torch.cuda.synchronize()


def getAcc(gt_list: list, pred_list: list):
    """Return the accuracy of ``pred_list`` against ``gt_list``."""
    return accuracy_score(y_true=gt_list, y_pred=pred_list)


def getF1(gt_list: list, pred_list: list, average: str = 'micro'):
    """Return the F1 score of ``pred_list`` against ``gt_list``.

    Args:
        gt_list: Ground-truth labels.
        pred_list: Predicted labels.
        average: Averaging mode forwarded to ``f1_score`` (default ``'micro'``).
    """
    return f1_score(y_true=gt_list, y_pred=pred_list, average=average)

In [15]:
# def eval_func(model, data_loader):
#     model.eval()
#     with torch.no_grad():
        
        
#     pass
    

## 4.3. Optimizer, loss

In [16]:
# Adam with the AMSGrad variant enabled; the learning rate comes from the config.
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate, amsgrad=True)
# Cross-entropy loss (the duplicated `criterion = criterion = ...` assignment was a typo).
criterion = nn.CrossEntropyLoss()

## 4.4. 학습 실행

In [17]:
def trainValDataLoader(train_idx: np.array, val_idx: np.array) -> Tuple[DataLoader, DataLoader]:
    """Build the train and validation loaders for one fold.

    The fold is defined on the rows of ``train_meta_df``; every image in
    ``train_images_df`` whose ``path`` belongs to a selected row goes to the
    corresponding split.

    Args:
        train_idx: Positional row indices of ``train_meta_df`` used for training.
        val_idx: Positional row indices of ``train_meta_df`` used for validation.

    Returns:
        ``(train_loader, val_loader)``. The training loader shuffles each
        epoch; the validation loader keeps a fixed order.
    """
    # Fold indices are positions, so select with iloc; loc would only work
    # while the index happens to be the default 0..n-1 range.
    train_paths = train_meta_df.iloc[train_idx]['path']
    val_paths = train_meta_df.iloc[val_idx]['path']

    # isin avoids building a query string out of data values.
    train_set_images_df = train_images_df[train_images_df['path'].isin(train_paths)].reset_index(drop=True)
    val_set_images_df = train_images_df[train_images_df['path'].isin(val_paths)].reset_index(drop=True)

    train_set_dataset = CustomDataset(train_set_images_df, trfm.basic_trfm, train=True)
    train_set_data_loader = DataLoader(
        train_set_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=True,
    )

    # train=True so the validation dataset also returns labels.
    val_set_dataset = CustomDataset(val_set_images_df, trfm.basic_trfm, train=True)
    val_set_data_loader = DataLoader(
        val_set_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,  # evaluation does not need shuffling
    )

    return train_set_data_loader, val_set_data_loader

In [18]:
# Lazy generator of stratified (train_idx, val_idx) folds over the person-level
# metadata, configured entirely from `config`.
split_generator = splitData(
    df=train_meta_df, 
    n_splits=config.num_split_fold, 
    shuffle=config.split__shuffle, 
    random_state=config.seed, 
    split_base=config.split_base_col,
    )

In [19]:
# Take the next fold from the generator (re-running this cell advances to the following fold).
train_idx, val_idx = next(split_generator)

In [20]:
# Build the DataLoaders for the fold selected above.
train_set_data_loader, val_set_data_loader = trainValDataLoader(train_idx, val_idx)

In [21]:
# Run training/validation on this fold's loaders with the configured hyper-parameters.
train_val = trainValidFunc(
    model=model, 
    optimizer=optimizer, 
    criterion=criterion, 
    train_loader=train_set_data_loader, 
    val_loader=val_set_data_loader, 
    epochs=config.epochs, 
    device=config.device, 
    num_accum=config.num_accum
)

In [23]:
# NOTE(review): appears to be a scratch check of how tab-separated text lines up
# when printed; not used by the rest of the notebook - confirm before removing.
print('epoch\thi\thihi')
print('\thi')

epoch	hi	hihi
	hi
