## import

In [1]:
import os
import pandas as pd
import numpy as np
import random
import glob
import shutil
from PIL import Image
from tqdm import tqdm

# 모델 관련 모듈
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import Resize, ToTensor, Normalize
import torchvision.models as models

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# 경로 설정

data_dir = '../input/data/train/'
test_dir = '../input/data/eval/'
submission_dir = './submission/'
model_dir = './model/'
image_data_dir = data_dir + 'images/'

## Seed 고정

In [3]:
'''
정인식님 코드 참고

'''

random_seed = 42

#pytorch의 random seed 고정

torch.manual_seed(random_seed)

# CuDNN 부분고정

torch.backends.cudnn.deterministic = True # 고정하면 학습이 느려진다고 합니다.

torch.backends.cudnn.benchmark = False

# Numpy 부분

np.random.seed(random_seed)

# transforms에서 random 라이브러리를 사용하기 때문에 random 라이브러리를 불러서 고정

random.seed(random_seed)

# GPU 에서 사용하는 난수 생성 시드 고정

torch.cuda.manual_seed(random_seed)

## 데이터 전처리

In [4]:
train_df = pd.read_csv(data_dir + 'train_edit.csv')
submission = pd.read_csv(test_dir + 'info.csv')

In [5]:
'''
age group화 진행

'''
def age_group(df):
    df.loc[df['age'] < 30, 'age_group'] = 0 
    df.loc[(df['age'] < 60) & (df['age'] >= 30), 'age_group'] = 1
    df.loc[df['age'] >= 60, 'age_group'] = 2
    
    df = pd.DataFrame(df)
    
    return df

train_df = age_group(train_df)

In [6]:
'''
gender + age feature 생성
'''

train_df['gender_age'] = train_df['gender'] + ' ' + train_df['age_group'].astype(str)

In [7]:
'''
데이터셋 분리
'''
from sklearn.model_selection import train_test_split

def split_data(df):
    train_idx, val_idx = train_test_split(df['gender_age'], test_size = 0.2, random_state = 42, shuffle = True, stratify = df['gender_age'])
                                      
    train_set, val_set = df.iloc[train_idx.index, :], df.iloc[val_idx.index, :]
    
    return train_idx, val_idx, train_set, val_set

train_idx, val_idx, train_set, val_set = split_data(train_df)

In [8]:
'''
신규범님 코드 참고

학습 데이터 구축
'''
def age_group(x):
    if x < 30: return 0
    elif x < 60: return 1
    else: return 2

def df_target_preprocess(df):
    df_list = []

    for idx, line in tqdm(enumerate(df.iloc)):
        for file in list(os.listdir(os.path.join(image_data_dir, line['path']))):
            if file[0] == '.':
                continue
            if file.split('.')[0] == 'normal':
                mask = 2
            elif file.split('.')[0] == 'incorrect_mask':
                mask = 1
            else:
                mask = 0
            gender = 0 if line['gender'] == 'male' else 1
            data = {
                'id' : line['id'],
                'gender' : line['gender'],
                'age_group' : age_group(line['age']),
                'mask' : mask,
                'path': os.path.join(image_data_dir, line['path'], file),
                'gender_age' : line['gender_age'],
                'label': mask * 6 + gender * 3 + age_group(line['age'])
            }
            df_list.append(data)

    df = pd.DataFrame(df_list)
    
    return df

train_set = df_target_preprocess(train_set)
val_set = df_target_preprocess(val_set)

2160it [00:00, 2675.22it/s]
540it [00:00, 2748.74it/s]


In [9]:
train_set

Unnamed: 0,id,gender,age_group,mask,path,gender_age,label
0,003801,female,1,0,../input/data/train/images/003801_female_Asian...,female 1.0,4
1,003801,female,1,0,../input/data/train/images/003801_female_Asian...,female 1.0,4
2,003801,female,1,1,../input/data/train/images/003801_female_Asian...,female 1.0,10
3,003801,female,1,0,../input/data/train/images/003801_female_Asian...,female 1.0,4
4,003801,female,1,0,../input/data/train/images/003801_female_Asian...,female 1.0,4
...,...,...,...,...,...,...,...
15115,000613,male,1,1,../input/data/train/images/000613_male_Asian_5...,male 1.0,7
15116,000613,male,1,0,../input/data/train/images/000613_male_Asian_5...,male 1.0,1
15117,000613,male,1,0,../input/data/train/images/000613_male_Asian_5...,male 1.0,1
15118,000613,male,1,0,../input/data/train/images/000613_male_Asian_5...,male 1.0,1


In [10]:
val_set

Unnamed: 0,id,gender,age_group,mask,path,gender_age,label
0,003628,male,1,0,../input/data/train/images/003628_male_Asian_5...,male 1.0,1
1,003628,male,1,0,../input/data/train/images/003628_male_Asian_5...,male 1.0,1
2,003628,male,1,1,../input/data/train/images/003628_male_Asian_5...,male 1.0,7
3,003628,male,1,0,../input/data/train/images/003628_male_Asian_5...,male 1.0,1
4,003628,male,1,0,../input/data/train/images/003628_male_Asian_5...,male 1.0,1
...,...,...,...,...,...,...,...
3775,005448,female,1,1,../input/data/train/images/005448_female_Asian...,female 1.0,10
3776,005448,female,1,0,../input/data/train/images/005448_female_Asian...,female 1.0,4
3777,005448,female,1,0,../input/data/train/images/005448_female_Asian...,female 1.0,4
3778,005448,female,1,0,../input/data/train/images/005448_female_Asian...,female 1.0,4


## 데이터셋 구축

In [9]:
'''
Sample_submission 코드 참고

데이터 셋 구축
'''

class CustomDataset(Dataset):
    def __init__(self, df, transform, train = True):
        
        image_dir = '../input/data/eval/images'
        
        self.train = train
        self.df = df
        if self.train:
            self.img_paths = self.df['path'].tolist()
            self.labels = self.df['label'].tolist()
        else:
            self.img_paths = [os.path.join(image_dir, img_id) for img_id in self.df.ImageID]
        self.transform = transform

    def __getitem__(self, index):
        image = Image.open(self.img_paths[index])
        if self.transform:
            image = self.transform(image)
        
        if self.train: return image, torch.tensor(self.labels[index])
        else: return image

    def __len__(self):
        return len(self.img_paths)

## 학습 설정

In [10]:
'''
학습 함수 설정
'''


def train(model, data_loader, optimizer, scheduler, criterion):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    f1 = 0
    
    for batch_idx, (images, targets) in enumerate(data_loader):
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()

        benign_outputs = model(images)
        loss = criterion(benign_outputs, targets)
        loss.backward()

        optimizer.step()
        scheduler.step(loss)
        
        train_loss += loss.item()
        _, predicted = benign_outputs.max(1)

        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        f1 += f1_score(targets.cpu().tolist(), predicted.cpu().tolist(), average='macro')
    
    train_loss /= len(data_loader)
    acc = correct / total
    f1 /= len(data_loader)
    
    return train_loss, acc, f1


def val(model, data_loader, criterion):
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    f1 = 0
    
    for batch_idx, (images, targets) in enumerate(data_loader):
        with torch.no_grad():
            images, targets = images.to(device), targets.to(device)
            benign_outputs = model(images)
            loss = criterion(benign_outputs, targets)
            val_loss += loss.item()
            _, predicted = benign_outputs.max(1)

            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            f1 += f1_score(targets.cpu().tolist(), predicted.cpu().tolist(), average='macro')
            
    val_loss /= len(data_loader)
    acc = correct / total
    f1 /= len(data_loader)
    
    return val_loss, acc, f1

def pred(model, data_loader):
    model.eval()
    all_predictions = []
    for images in data_loader:
        with torch.no_grad():
            images = images.to(device)
            pred = model(images)
            pred = pred.argmax(dim=-1)
            all_predictions.extend(pred.cpu().numpy())
    
    return all_predictions

In [11]:
'''
학습 설정
'''

device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 0.01
epochs = 30
batch_size = 32

In [12]:
'''
데이터 로더 생성
'''

transform = transforms.Compose([
    ToTensor(),
    Normalize(mean=(0.5, 0.5, 0.5), std=(0.2, 0.2, 0.2)),
    Resize((380, 380), Image.BILINEAR),
#     transforms.CenterCrop(380),
])

train_customset = CustomDataset(df = train_set, transform = transform, train = True)
val_customset = CustomDataset(df = val_set, transform = transform, train = True)
test_customset = CustomDataset(df = submission, transform = transform, train = False)

train_loader = DataLoader(
    train_customset,
    batch_size = batch_size,
    shuffle=True,
    num_workers = 2,
)

val_loader = DataLoader(
    val_customset,
    batch_size = batch_size,
    shuffle=True,
    num_workers = 2,
)

test_loader = DataLoader(
    test_customset,
    batch_size = batch_size,
    shuffle=False,
    num_workers = 2,
)



In [13]:
'''
모델 설정
'''
model = models.efficientnet_b4(pretrained=True).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor = 0.1, eps = 1e-09, patience = 5)

In [14]:
len(train_loader)

473

In [15]:
in_features = model.classifier[1].in_features
model.classifier[1] = torch.nn.Linear(in_features=in_features, out_features=18, bias=True).to(device)

## 학습

In [None]:
torch.cuda.empty_cache()

min_val_loss = float("inf")

for epoch in tqdm(range(1, epochs + 1)):
    train_loss, train_acc, train_f1 = train(model = model, data_loader = train_loader, optimizer = optimizer
                                            , scheduler = scheduler, criterion = criterion)
    val_loss, val_acc, val_f1 = val(model = model, data_loader = val_loader, criterion = criterion)
    
    print(f'epoch : {epoch}, train_loss : {train_loss}, train_acc : {train_acc}, train_f1 : {train_f1} \
    , val_loss : {val_loss}, val_acc : {val_acc}, val_f1 : {val_f1}')
    
    # 모델 저장
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        torch.save(model.state_dict(), model_dir + f'epoch_{epoch}_efficientnet_b4.pt')

  0%|          | 0/30 [00:00<?, ?it/s]

epoch : 1, train_loss : 1.0513337872623136, train_acc : 0.6853835978835979, train_f1 : 0.4541372261120737     , val_loss : 0.8975218218915603, val_acc : 0.7462962962962963, val_f1 : 0.5268116158667432


  7%|▋         | 2/30 [08:09<1:54:09, 244.64s/it]

epoch : 2, train_loss : 1.0142689495963728, train_acc : 0.6957671957671958, train_f1 : 0.4591436106725567     , val_loss : 0.9059035865699544, val_acc : 0.746031746031746, val_f1 : 0.520813404115734


 10%|█         | 3/30 [12:13<1:50:03, 244.57s/it]

epoch : 3, train_loss : 1.0085768932771986, train_acc : 0.6990079365079365, train_f1 : 0.4609640950596385     , val_loss : 0.8963853839565726, val_acc : 0.744973544973545, val_f1 : 0.5225697638249835


 13%|█▎        | 4/30 [16:18<1:45:57, 244.53s/it]

epoch : 4, train_loss : 1.006754233267544, train_acc : 0.6988756613756614, train_f1 : 0.46870973341695377     , val_loss : 0.8995784686393097, val_acc : 0.7462962962962963, val_f1 : 0.5083169741199228


## 예측

In [None]:
# 모델이 테스트 데이터셋을 예측하고 결과를 저장합니다.
all_predictions = pred(model = model, data_loader = test_loader)
submission['ans'] = all_predictions

# 제출할 파일을 저장합니다.
submission.to_csv(os.path.join(submission_dir, 'efficientnetb4_pretrained.csv'), index=False)
print('test inference is done!')

In [15]:
submission.head()

Unnamed: 0,ImageID,ans
0,cbc5c6e168e63498590db46022617123f1fe1268.jpg,13
1,0e72482bf56b3581c081f7da2a6180b8792c7089.jpg,2
2,b549040c49190cedc41327748aeb197c1670f14d.jpg,13
3,4f9cb2a045c6d5b9e50ad3459ea7b791eb6e18bc.jpg,13
4,248428d9a4a5b6229a7081c32851b90cb8d38d0c.jpg,12
