## import

In [None]:
import os
import pandas as pd
import numpy as np
import random
import glob
import shutil
from PIL import Image
from tqdm import tqdm

# 모델 관련 모듈
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import Resize, ToTensor, Normalize
import torchvision.models as models

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Directory layout used throughout the notebook.
# Every path keeps its trailing slash because later cells build file names
# by plain string concatenation.
submission_dir = './submission/'
model_dir = './model/'

data_dir = '../input/data/train/'
test_dir = '../input/data/eval/'

# Per-person image folders live under the training directory.
image_data_dir = f'{data_dir}images/'

## Seed 고정

In [None]:
'''
정인식님 코드 참고

'''

random_seed = 42

# Python's built-in RNG. Original author's note: torchvision transforms draw
# from the `random` module, hence it is seeded as well (not verified here).
random.seed(random_seed)

# NumPy's global RNG.
np.random.seed(random_seed)

# PyTorch RNG, plus the RNG used on the GPU.
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

# cuDNN: force deterministic kernels and disable the auto-tuner.
# Original author's note: fixing this is reported to slow training down.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## 데이터 전처리

In [None]:
# Per-person training metadata and the evaluation sheet that will later
# receive the predictions.
train_df = pd.read_csv(f'{data_dir}train_edit.csv')
submission = pd.read_csv(f'{test_dir}info.csv')

In [None]:
# Visualize the images of suspected-outlier people
def show_img(img_id_li, df, data_dir):
    """Plot every image stored for each person id in ``img_id_li``.

    One figure (a single row of images) is shown per id. Each subplot is
    titled ``id / age / gender / mask kind``, where the mask kind is
    ``normal``, ``incorrect_mask`` or ``mask`` depending on the file stem.

    Args:
        img_id_li: iterable of values matched against ``df['id']``.
        df: DataFrame with ``id``, ``age``, ``gender`` and ``path`` columns;
            ``path`` is the person's folder relative to ``data_dir``.
        data_dir: directory that contains the per-person folders.

    Raises:
        IndexError: if an id is not present in ``df``.

    NOTE(review): ``plt`` is not imported in the visible part of this file;
    presumably ``matplotlib.pyplot`` -- confirm it is in scope before calling.
    """
    for img_id in img_id_li:
        person = df[df['id'] == img_id]

        img_age = person['age'].tolist()[0]
        img_gender = person['gender'].tolist()[0]
        img_dir = os.path.join(data_dir, person['path'].tolist()[0])

        # Drop hidden files up front so the grid can be sized to what will
        # actually be drawn.
        img_names = [
            name for name in sorted(os.listdir(img_dir))
            if not name.startswith('.')
        ]

        # Keep the original 7-column layout, but grow it when a folder holds
        # more images (the fixed 7 axes used to raise IndexError).
        n_cols = max(7, len(img_names))
        fig, ax = plt.subplots(1, n_cols, figsize=(30, 15))
        ax = ax.flatten()

        for axis, img_name in zip(ax, img_names):
            stem = img_name.split('.')[0]
            mask_kind = stem if stem in ('normal', 'incorrect_mask') else 'mask'

            # Read the pixels inside a context manager so the file handle is
            # released immediately.
            with Image.open(os.path.join(img_dir, img_name)) as img:
                pixels = np.array(img)

            axis.imshow(pixels)
            axis.set_title(f'{img_id} / {img_age} / {img_gender} / {mask_kind}')
            axis.set_xticks([])
            axis.set_yticks([])

        plt.show()

# Visualize images given their file paths
def path_li_show_img(path_li):
    """Plot the images at ``path_li`` in a single row, titled by file name.

    Args:
        path_li: iterable of image file paths. The title is the text after
            the last ``/`` in each path.

    NOTE(review): ``plt`` is not imported in the visible part of this file;
    presumably ``matplotlib.pyplot`` -- confirm it is in scope before calling.
    """
    path_li = list(path_li)

    # Keep the original 7-column layout, but grow it for longer lists (the
    # fixed 7 axes used to raise IndexError on the 8th path).
    n_cols = max(7, len(path_li))
    fig, ax = plt.subplots(1, n_cols, figsize=(30, 15))
    ax = ax.flatten()

    for axis, path in zip(ax, path_li):
        image_name = path.split('/')[-1]

        # Read the pixels inside a context manager so the file handle is
        # released immediately.
        with Image.open(path) as img:
            pixels = np.array(img)

        axis.imshow(pixels)
        axis.set_title(f'{image_name}')
        axis.set_xticks([])
        axis.set_yticks([])

    plt.show()

# Age -> age-bucket code
def get_ages(x):
    """Return the age bucket for ``x``: 0 (< 30), 1 (30-59) or 2 (otherwise)."""
    if x < 30:
        return 0
    return 1 if x < 60 else 2

# Gender string -> gender code
def get_genders(x):
    """Return 0 when ``x`` equals ``'male'`` and 1 for anything else."""
    return 0 if x == 'male' else 1

# File stem -> mask code
def get_masks(x):
    """Return the mask code for a file stem.

    ``'normal'`` maps to 2, ``'incorrect_mask'`` to 1 and every other value
    to 0.
    """
    if x == 'normal':
        return 2
    if x == 'incorrect_mask':
        return 1
    return 0

# (mask, gender, age) codes -> single class label
def get_labels(masks, genders, ages):
    """Combine the three codes into one label: ``masks*6 + genders*3 + ages``.

    With masks in {0, 1, 2}, genders in {0, 1} and ages in {0, 1, 2} this
    gives the 18 labels 0..17.
    """
    mask_part = masks * 6
    gender_part = genders * 3
    return mask_part + gender_part + ages

# Build the per-image training table
def get_train_df(df):
    """Expand a per-person table into one row per image file.

    For every row of ``df`` the person's folder is listed and each
    non-hidden file becomes one output row.

    Args:
        df: DataFrame with ``id``, ``gender``, ``age`` and ``path`` columns;
            ``path`` is the person's folder name inside the training image
            directory.

    Returns:
        DataFrame with columns ``id``, ``mask`` (file stem), ``gender``,
        ``age``, ``masks``/``genders``/``ages`` (integer codes), ``labels``
        (combined class id) and ``path`` (full image path).
    """
    train_data_dir = '../input/data/train/'
    # The original body referenced `train_image_dir` without defining it
    # (NameError). It is derived here from the training root, matching the
    # `<train dir>/images/` layout configured at the top of the notebook.
    train_image_dir = os.path.join(train_data_dir, 'images')

    records = []
    for _, line in df.iterrows():
        person_dir = os.path.join(train_image_dir, line['path'])
        for file in os.listdir(person_dir):
            # Skip hidden files.
            if file.startswith('.'):
                continue

            mask = file.split('.')[0]
            gender = line['gender']
            age = line['age']

            masks = get_masks(mask)
            genders = get_genders(gender)
            ages = get_ages(age)

            records.append({
                'id': line['id'],
                'mask': mask,
                'gender': gender,
                'age': age,
                'masks': masks,
                'genders': genders,
                'ages': ages,
                'labels': get_labels(masks=masks, genders=genders, ages=ages),
                'path': os.path.join(person_dir, file),
            })

    return pd.DataFrame(records)

# Correct mislabelled genders
def swap_gender(swap_li, df):
    """Return a copy of ``df`` with the gender of selected ids overwritten.

    Args:
        swap_li: iterable of ``(id, gender)`` pairs.
        df: DataFrame with ``id`` and ``gender`` columns; left unmodified.

    Returns:
        A copy of ``df`` in which every row whose ``id`` matches a pair has
        its ``gender`` set to that pair's value. Ids with no match are
        ignored.
    """
    corrected = df.copy()
    for person_id, new_gender in swap_li:
        matching_rows = corrected[corrected['id'] == person_id].index
        corrected.loc[matching_rows, 'gender'] = new_gender
    return corrected
        
# Build the preprocessed table: gender corrections plus the CV target column
def preprocessing_df(df, swap_gender_li):
    """Return a corrected copy of ``df`` with coded and CV-target columns.

    Args:
        df: DataFrame with ``id``, ``age`` and ``gender`` columns; left
            unmodified.
        swap_gender_li: ``(id, gender)`` corrections passed to
            ``swap_gender``, e.g.
            ``[['006359', 'male'], ['006360', 'male']]``.

    Returns:
        A copy of ``df`` with ``ages`` and ``genders`` code columns and a
        ``cv_taget_col`` string column of the form ``ages_<a>_genders_<g>``.
    """
    result = swap_gender(swap_li=swap_gender_li, df=df.copy())

    result['ages'] = result['age'].apply(get_ages)
    result['genders'] = result['gender'].apply(get_genders)

    # The column name keeps its original spelling because callers look it up
    # by this key.
    age_part = 'ages_' + result['ages'].astype(str)
    gender_part = '_genders_' + result['genders'].astype(str)
    result['cv_taget_col'] = age_part + gender_part

    return result

# Validation indices for each CV fold
def get_val_idx(df, target_col):
    """Yield the validation indices of each of 5 stratified folds.

    The folds come from ``StratifiedKFold(n_splits=5, shuffle=True,
    random_state=22)`` stratified on ``df[target_col]``; only the validation
    half of every split is yielded.
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=22)
    yield from (
        fold_val_idx
        for _, fold_val_idx in splitter.split(df, df[target_col])
    )

# Fix swapped mask labels
def swap_mask(swap_li, df):
    """Return a copy of ``df`` with 'normal'/'incorrect_mask' paths exchanged.

    For every id in ``swap_li`` the ``path`` of the row(s) whose ``mask`` is
    ``'normal'`` and the ``path`` of the row(s) whose ``mask`` is
    ``'incorrect_mask'`` are swapped. Other rows are untouched.

    Args:
        swap_li: iterable of ids to fix.
        df: DataFrame with ``id``, ``mask`` and ``path`` columns; left
            unmodified.

    Raises:
        IndexError: if an id has no 'normal' or no 'incorrect_mask' row.
    """
    result = df.copy()
    for person_id in swap_li:
        person_rows = result[result['id'] == person_id]

        # Snapshot both groups before writing so the second assignment still
        # sees the original 'normal' path.
        by_mask = {
            kind: person_rows[person_rows['mask'] == kind]
            for kind in ('normal', 'incorrect_mask')
        }
        normal_path = by_mask['normal']['path'].values[0]
        incorrect_path = by_mask['incorrect_mask']['path'].values[0]

        result.loc[by_mask['normal'].index, 'path'] = incorrect_path
        result.loc[by_mask['incorrect_mask'].index, 'path'] = normal_path

    return result

In [None]:
'''
age group화 진행

'''
def age_group(df):
    """Add an ``age_group`` column (0: < 30, 1: 30-59, 2: >= 60) to ``df``.

    The column is written into ``df`` itself; the return value is
    ``pd.DataFrame(df)`` built from that same, already-modified frame.
    """
    ages = df['age']
    buckets = (
        (ages < 30, 0),
        ((ages < 60) & (ages >= 30), 1),
        (ages >= 60, 2),
    )
    for in_bucket, code in buckets:
        df.loc[in_bucket, 'age_group'] = code

    return pd.DataFrame(df)

# Attach the age-bucket column to the per-person metadata table.
train_df = age_group(train_df)

In [None]:
'''
gender + age feature 생성
'''

# Combine gender and age bucket into one string key per person; the
# train/validation split below is stratified on this column.
train_df['gender_age'] = train_df['gender'] + ' ' + train_df['age_group'].astype(str)

In [None]:
'''
데이터셋 분리
'''
from sklearn.model_selection import train_test_split

def split_data(df):
    """Split ``df`` 80/20 into train and validation rows.

    The split is stratified on ``df['gender_age']`` with a fixed
    ``random_state`` of 42.

    Args:
        df: DataFrame with a ``gender_age`` column.

    Returns:
        ``(train_idx, val_idx, train_set, val_set)`` where ``train_idx`` and
        ``val_idx`` are the ``gender_age`` Series of each part (their index
        identifies the selected rows) and ``train_set`` / ``val_set`` are the
        corresponding rows of ``df``.
    """
    train_idx, val_idx = train_test_split(
        df['gender_age'],
        test_size=0.2,
        random_state=42,
        shuffle=True,
        stratify=df['gender_age'],
    )

    # `.index` holds index *labels*, so select with `.loc`. The previous
    # `.iloc` lookup was only correct when `df` had a default 0..n-1 index.
    train_set = df.loc[train_idx.index, :]
    val_set = df.loc[val_idx.index, :]

    return train_idx, val_idx, train_set, val_set

# Hold out 20% of the people for validation, stratified by gender_age.
train_idx, val_idx, train_set, val_set = split_data(train_df)

In [None]:
'''
신규범님 코드 참고

학습 데이터 구축
'''
def age_group(x):
    """Return the age bucket of a single age: 0 (< 30), 1 (30-59), 2 (otherwise).

    NOTE: this rebinds the name ``age_group``, which earlier in the file
    refers to a DataFrame-level helper.
    """
    if x < 30:
        return 0
    return 1 if x < 60 else 2

def df_target_preprocess(df):
    """Expand a per-person table into one labelled row per image file.

    Each row's folder under ``image_data_dir`` is listed; every non-hidden
    file yields one record. The mask code comes from the file stem
    (``normal`` -> 2, ``incorrect_mask`` -> 1, anything else -> 0), gender is
    coded 0 for ``'male'`` and 1 otherwise, and the label is
    ``mask * 6 + gender * 3 + age bucket``.

    Args:
        df: DataFrame with ``id``, ``gender``, ``age``, ``path`` and
            ``gender_age`` columns.

    Returns:
        DataFrame with columns ``id``, ``gender``, ``age_group``, ``mask``,
        ``path``, ``gender_age`` and ``label``.
    """
    records = []

    for _, line in tqdm(enumerate(df.iloc)):
        person_dir = os.path.join(image_data_dir, line['path'])
        for file in list(os.listdir(person_dir)):
            # Skip hidden files.
            if file[0] == '.':
                continue

            stem = file.split('.')[0]
            if stem == 'normal':
                mask = 2
            elif stem == 'incorrect_mask':
                mask = 1
            else:
                mask = 0

            gender = 0 if line['gender'] == 'male' else 1
            bucket = age_group(line['age'])

            records.append({
                'id' : line['id'],
                'gender' : line['gender'],
                'age_group' : bucket,
                'mask' : mask,
                'path': os.path.join(image_data_dir, line['path'], file),
                'gender_age' : line['gender_age'],
                'label': mask * 6 + gender * 3 + bucket,
            })

    return pd.DataFrame(records)

# Expand both splits from one row per person to one labelled row per image.
train_set = df_target_preprocess(train_set)
val_set = df_target_preprocess(val_set)

In [None]:
# Notebook display of the expanded training table.
train_set

In [None]:
# Notebook display of the expanded validation table.
val_set

## 데이터셋 구축

In [None]:
'''
Sample_submission 코드 참고

데이터 셋 구축
'''

class CustomDataset(Dataset):
    """Image dataset for the labelled (train/val) and unlabelled (eval) tables.

    Args:
        df: for ``train=True`` a DataFrame with ``path`` (image file path)
            and ``label`` columns; for ``train=False`` a DataFrame with an
            ``ImageID`` column holding file names inside ``image_dir``.
        transform: callable applied to the loaded PIL image, or a falsy value
            to return the PIL image unchanged.
        train: when true, items are ``(image, label_tensor)``; otherwise only
            the image is returned.
        image_dir: directory holding the evaluation images; used only when
            ``train`` is false. Defaults to the previously hard-coded
            location.
    """

    def __init__(self, df, transform, train = True,
                 image_dir = '../input/data/eval/images'):
        self.train = train
        self.df = df
        self.transform = transform

        if self.train:
            self.img_paths = self.df['path'].tolist()
            # Labels exist only for the labelled tables.
            self.labels = self.df['label'].tolist()
        else:
            self.img_paths = [os.path.join(image_dir, img_id) for img_id in self.df.ImageID]

    def __getitem__(self, index):
        # Close the file handle right away, and force three channels so a
        # grayscale or RGBA file cannot break a 3-channel Normalize step.
        with Image.open(self.img_paths[index]) as img:
            image = img.convert('RGB')

        if self.transform:
            image = self.transform(image)

        if self.train:
            return image, torch.tensor(self.labels[index])
        return image

    def __len__(self):
        return len(self.img_paths)

## 학습 설정

In [None]:
'''
학습 함수 설정
'''


def train(model, data_loader, optimizer, scheduler, criterion):
    """Run one training epoch and return ``(loss, accuracy, macro_f1)``.

    Args:
        model: network to optimize; put into training mode here.
        data_loader: iterable of ``(images, targets)`` batches.
        optimizer: optimizer stepped once per batch.
        scheduler: stepped once per epoch with the mean training loss (the
            ``step(metric)`` form, as used by ``ReduceLROnPlateau``).
        criterion: loss function called as ``criterion(outputs, targets)``.

    Returns:
        Mean per-batch loss, sample-level accuracy and macro F1 over the
        whole epoch.

    Uses the module-level ``device``.
    """
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    epoch_targets = []
    epoch_preds = []

    for images, targets in data_loader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)

        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        epoch_targets.extend(targets.cpu().tolist())
        epoch_preds.extend(predicted.cpu().tolist())

    train_loss /= len(data_loader)
    acc = correct / total
    # Macro F1 is not an average of per-batch scores: a small batch misses
    # most classes, which distorts each batch's score. Compute it once over
    # every prediction of the epoch instead.
    f1 = f1_score(epoch_targets, epoch_preds, average='macro')

    scheduler.step(train_loss)

    return train_loss, acc, f1


def val(model, data_loader, criterion):
    """Evaluate ``model`` on ``data_loader``; return ``(loss, accuracy, macro_f1)``.

    Args:
        model: network to evaluate; put into eval mode here.
        data_loader: iterable of ``(images, targets)`` batches.
        criterion: loss function called as ``criterion(outputs, targets)``.

    Returns:
        Mean per-batch loss, sample-level accuracy and macro F1 over the
        whole loader.

    Uses the module-level ``device``. No gradients are recorded.
    """
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    all_targets = []
    all_preds = []

    with torch.no_grad():
        for images, targets in data_loader:
            images, targets = images.to(device), targets.to(device)
            outputs = model(images)
            val_loss += criterion(outputs, targets).item()
            _, predicted = outputs.max(1)

            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            all_targets.extend(targets.cpu().tolist())
            all_preds.extend(predicted.cpu().tolist())

    val_loss /= len(data_loader)
    acc = correct / total
    # Macro F1 is not an average of per-batch scores: a small batch misses
    # most classes, which distorts each batch's score. Compute it once over
    # every prediction instead.
    f1 = f1_score(all_targets, all_preds, average='macro')

    return val_loss, acc, f1

def pred(model, data_loader):
    """Return the predicted class index for every sample in ``data_loader``.

    Args:
        model: network to run; put into eval mode here.
        data_loader: iterable of image batches (no targets).

    Returns:
        A flat list with one predicted index per sample, in loader order
        (the argmax over the last output dimension).

    Uses the module-level ``device``. No gradients are recorded.
    """
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for batch in data_loader:
            logits = model(batch.to(device))
            batch_classes = logits.argmax(dim=-1)
            all_predictions.extend(batch_classes.cpu().numpy())

    return all_predictions

In [None]:
'''
학습 설정
'''

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Optimisation hyper-parameters.
lr = 0.01
epochs = 30
batch_size = 32

In [None]:
'''
데이터 로더 생성
'''

# Preprocessing shared by all three splits. The steps run in list order, so
# normalisation and the 380x380 resize are applied to the tensor produced by
# ToTensor. (A CenterCrop(380) step was left disabled by the original author.)
transform = transforms.Compose([
    ToTensor(),
    Normalize(mean=(0.5, 0.5, 0.5), std=(0.2, 0.2, 0.2)),
    Resize((380, 380), Image.BILINEAR),
])

# Labelled datasets (train / validation) followed by the unlabelled eval set.
train_customset, val_customset = (
    CustomDataset(df = frame, transform = transform, train = True)
    for frame in (train_set, val_set)
)
test_customset = CustomDataset(df = submission, transform = transform, train = False)

# Train and validation batches are shuffled; the eval loader keeps file order
# so predictions line up with the rows of `submission`.
train_loader, val_loader = (
    DataLoader(dataset, batch_size = batch_size, shuffle=True, num_workers = 2)
    for dataset in (train_customset, val_customset)
)
test_loader = DataLoader(test_customset, batch_size = batch_size, shuffle=False, num_workers = 2)

In [None]:
'''
모델 설정
'''
# Load the pretrained backbone and swap in the 18-way classification head
# BEFORE the optimizer is created. The optimizer only tracks the parameters it
# is given at construction time; building it first (as this cell sequence used
# to) left the new head outside the optimizer, so it was never updated.
model = models.efficientnet_b4(pretrained=True)
in_features = model.classifier[1].in_features
model.classifier[1] = torch.nn.Linear(in_features=in_features, out_features=18, bias=True)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
# Cut the learning rate by 10x after 5 epochs without improvement in the
# monitored value.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor = 0.1, eps = 1e-09, patience = 5)

# Number of training batches per epoch (notebook display).
len(train_loader)

## 학습

In [None]:
torch.cuda.empty_cache()

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first save.
os.makedirs(model_dir, exist_ok=True)

min_val_loss = float("inf")

for epoch in tqdm(range(1, epochs + 1)):
    train_loss, train_acc, train_f1 = train(model = model, data_loader = train_loader, optimizer = optimizer
                                            , scheduler = scheduler, criterion = criterion)
    val_loss, val_acc, val_f1 = val(model = model, data_loader = val_loader, criterion = criterion)
    
    print(f'epoch : {epoch}, train_loss : {train_loss}, train_acc : {train_acc}, train_f1 : {train_f1} \
    , val_loss : {val_loss}, val_acc : {val_acc}, val_f1 : {val_f1}')
    
    # Save a checkpoint whenever the validation loss reaches a new minimum.
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(model.state_dict(), model_dir + f'epoch_{epoch}_efficientnet_b4.pt')

## 예측

In [None]:
# Predict the evaluation set and store the result in the submission table.
# NOTE: this uses the weights currently held by `model` (the state after the
# last training epoch), not a checkpoint reloaded from disk.
all_predictions = pred(model = model, data_loader = test_loader)
submission['ans'] = all_predictions

# Write the submission file; create the output folder first because to_csv
# does not create missing directories.
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(os.path.join(submission_dir, 'efficientnetb4_pretrained.csv'), index=False)
print('test inference is done!')

In [None]:
# Notebook display: first rows of the submission table.
submission.head()