## import

In [1]:
import os
import pandas as pd
import numpy as np
import random
import glob
import shutil
from PIL import Image
from tqdm import tqdm

# 모델 관련 모듈
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import Resize, ToTensor, Normalize
import torchvision.models as models

In [2]:
# 경로 설정

data_dir = '../input/data/train/'
test_dir = '../input/data/eval/'
submission_dir = './submission/'
image_data_dir = data_dir + 'images/'

## Seed 고정

In [3]:
'''
정인식님 코드 참고

'''

random_seed = 42

#pytorch의 random seed 고정

torch.manual_seed(random_seed)

# CuDNN 부분고정

torch.backends.cudnn.deterministic = True # 고정하면 학습이 느려진다고 합니다.

torch.backends.cudnn.benchmark = False

# Numpy 부분

np.random.seed(random_seed)

# transforms에서 random 라이브러리를 사용하기 때문에 random 라이브러리를 불러서 고정

random.seed(random_seed)

# GPU 에서 사용하는 난수 생성 시드 고정

torch.cuda.manual_seed(random_seed)

## 데이터 전처리

In [4]:
train_df = pd.read_csv(data_dir + 'train_edit.csv')
submission = pd.read_csv(test_dir + 'info.csv')

In [5]:
train_df

Unnamed: 0.1,Unnamed: 0,id,gender,race,age,path
0,0,000001,female,Asian,45,000001_female_Asian_45
1,1,000002,female,Asian,52,000002_female_Asian_52
2,2,000004,male,Asian,54,000004_male_Asian_54
3,3,000005,female,Asian,58,000005_female_Asian_58
4,4,000006,female,Asian,59,000006_female_Asian_59
...,...,...,...,...,...,...
2695,2695,006954,male,Asian,19,006954_male_Asian_19
2696,2696,006955,male,Asian,19,006955_male_Asian_19
2697,2697,006956,male,Asian,19,006956_male_Asian_19
2698,2698,006957,male,Asian,20,006957_male_Asian_20


In [11]:
'''
age group화 진행

'''
def age_group(df):
    df.loc[df['age'] < 30, 'age_group'] = 0 
    df.loc[(df['age'] < 60) & (df['age'] >= 30), 'age_group'] = 1
    df.loc[df['age'] >= 60, 'age_group'] = 2
    
    print(df)
    df = pd.DataFrame(df)
    
    return df

train_df = age_group(train_df)
train_df

      Unnamed: 0      id  gender   race  age                    path  \
0              0  000001  female  Asian   45  000001_female_Asian_45   
1              1  000002  female  Asian   52  000002_female_Asian_52   
2              2  000004    male  Asian   54    000004_male_Asian_54   
3              3  000005  female  Asian   58  000005_female_Asian_58   
4              4  000006  female  Asian   59  000006_female_Asian_59   
...          ...     ...     ...    ...  ...                     ...   
2695        2695  006954    male  Asian   19    006954_male_Asian_19   
2696        2696  006955    male  Asian   19    006955_male_Asian_19   
2697        2697  006956    male  Asian   19    006956_male_Asian_19   
2698        2698  006957    male  Asian   20    006957_male_Asian_20   
2699        2699  006959    male  Asian   19    006959_male_Asian_19   

      age_group  
0           1.0  
1           1.0  
2           1.0  
3           1.0  
4           1.0  
...         ...  
2695     

Unnamed: 0.1,Unnamed: 0,id,gender,race,age,path,age_group
0,0,000001,female,Asian,45,000001_female_Asian_45,1.0
1,1,000002,female,Asian,52,000002_female_Asian_52,1.0
2,2,000004,male,Asian,54,000004_male_Asian_54,1.0
3,3,000005,female,Asian,58,000005_female_Asian_58,1.0
4,4,000006,female,Asian,59,000006_female_Asian_59,1.0
...,...,...,...,...,...,...,...
2695,2695,006954,male,Asian,19,006954_male_Asian_19,0.0
2696,2696,006955,male,Asian,19,006955_male_Asian_19,0.0
2697,2697,006956,male,Asian,19,006956_male_Asian_19,0.0
2698,2698,006957,male,Asian,20,006957_male_Asian_20,0.0


In [5]:
'''
신규범님 코드 참고

학습 데이터 구축
'''
def age_group(x):
    if x < 30: return 0
    elif x < 60: return 1
    else: return 2

def df_target_preprocess(df):
    df_list = []

    for idx, line in tqdm(enumerate(df.iloc)):
        for file in list(os.listdir(os.path.join(image_data_dir, line['path']))):
            if file[0] == '.':
                continue
            if file.split('.')[0] == 'normal':
                mask = 2
            elif file.split('.')[0] == 'incorrect_mask':
                mask = 1
            else:
                mask = 0
            gender = 0 if line['gender'] == 'male' else 1
            data = {
                'id' : line['id'],
                'gender' : line['gender'],
                'age_group' : age_group(line['age']),
                'mask' : mask,
                'path': os.path.join(image_data_dir, line['path'], file),
                'label': mask * 6 + gender * 3 + age_group(line['age'])
            }
            df_list.append(data)

    df = pd.DataFrame(df_list)
    
    return df

df = df_target_preprocess(train_df)

2700it [00:01, 2692.73it/s]


In [5]:
'''
gender + age feature 생성
'''

df['gender_age'] = df['gender'] + ' ' + df['age_group'].astype(str)
df

Unnamed: 0.1,Unnamed: 0,id,gender,race,age,path,gender_age
0,0,000001,female,Asian,45,000001_female_Asian_45,female 45
1,1,000002,female,Asian,52,000002_female_Asian_52,female 52
2,2,000004,male,Asian,54,000004_male_Asian_54,male 54
3,3,000005,female,Asian,58,000005_female_Asian_58,female 58
4,4,000006,female,Asian,59,000006_female_Asian_59,female 59
...,...,...,...,...,...,...,...
2695,2695,006954,male,Asian,19,006954_male_Asian_19,male 19
2696,2696,006955,male,Asian,19,006955_male_Asian_19,male 19
2697,2697,006956,male,Asian,19,006956_male_Asian_19,male 19
2698,2698,006957,male,Asian,20,006957_male_Asian_20,male 20


In [10]:
df['gender_age'].value_counts()

male 19      1449
female 19    1400
female 20    1309
female 18    1057
female 58     763
             ... 
female 31       7
male 47         7
male 44         7
male 36         7
female 37       7
Name: gender_age, Length: 84, dtype: int64

In [9]:
'''
데이터셋 분리
'''
from sklearn.model_selection import train_test_split

def split_data(df):
    train_idx, val_idx = train_test_split(df['gender_age'], test_size = 0.2, random_state = 42, stratify = df['gender_age'])
                                      
    train_set, val_set = df.iloc[train_idx.index, :], df.iloc[val_idx.index, :]
    
    return train_idx, val_idx, train_set, val_set

train_idx, val_idx, train_set, val_set = split_data(train_df)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [22]:
df.groupby(['gender', 'age']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,age_group,mask,path,label
gender,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,18,1057,1057,1057,1057,1057
female,19,1400,1400,1400,1400,1400
female,20,1309,1309,1309,1309,1309
female,21,98,98,98,98,98
female,22,147,147,147,147,147
...,...,...,...,...,...,...
male,56,329,329,329,329,329
male,57,259,259,259,259,259
male,58,350,350,350,350,350
male,59,322,322,322,322,322


In [4]:
'''
데이터셋 분리
'''
from sklearn.model_selection import train_test_split

train_idx, val_idx = train_test_split(df['label'], train_size = 0.8, random_state = 22, stratify = df['label'])
                                      
train_set, val_set = df.iloc[train_idx.index, :], df.iloc[val_idx.index, :]

## 데이터셋 구축

In [6]:
'''
Sample_submission 코드 참고

데이터 셋 구축
'''

class CustomDataset(Dataset):
    def __init__(self, df, transform, train = True):
        
        image_dir = '../input/data/eval/images'
        
        self.train = train
        self.df = df
        if self.train:
            self.img_paths = self.df['path'].tolist()
            self.labels = self.df['label'].tolist()
        else:
            self.img_paths = [os.path.join(image_dir, img_id) for img_id in self.df.ImageID]
        self.transform = transform

    def __getitem__(self, index):
        image = Image.open(self.img_paths[index])
        if self.transform:
            image = self.transform(image)
        
        if self.train: return image, torch.tensor(self.labels[index])
        else: return image

    def __len__(self):
        return len(self.img_paths)

## 학습 설정

In [7]:
'''
학습 함수 설정
'''

def train(model, data_loader, optimizer, criterion):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (images, targets) in enumerate(data_loader):
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()

        benign_outputs = model(images)
        loss = criterion(benign_outputs, targets)
        loss.backward()

        optimizer.step()
        train_loss += loss.item()
        _, predicted = benign_outputs.max(1)

        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    
    train_loss /= len(data_loader)
    acc = correct / total
    
    return train_loss, acc


def val(model, data_loader, criterion):
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    
    for batch_idx, (images, targets) in enumerate(data_loader):
        with torch.no_grad():
            images, targets = images.to(device), targets.to(device)
            benign_outputs = model(images)
            loss = criterion(benign_outputs, targets)
            val_loss += loss.item()
            _, predicted = benign_outputs.max(1)

            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    val_loss /= len(data_loader)
    acc = correct / total
    
    return val_loss, acc

def pred(model, data_loader):
    all_predictions = []
    for images in data_loader:
        with torch.no_grad():
            images = images.to(device)
            pred = model(images)
            pred = pred.argmax(dim=-1)
            all_predictions.extend(pred.cpu().numpy())
    
    return all_predictions

In [8]:
'''
학습 설정
'''

device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 0.01
epochs = 10
batch_size = 128

In [9]:
'''
데이터 로더 생성
'''

transform = transforms.Compose([
    Resize((224, 224), Image.BILINEAR),
    ToTensor(),
    Normalize(mean=(0.5, 0.5, 0.5), std=(0.2, 0.2, 0.2)),
])

train_customset = CustomDataset(df = train_set, transform = transform, train = True)
val_customset = CustomDataset(df = val_set, transform = transform, train = True)
test_customset = CustomDataset(df = submission, transform = transform, train = False)

train_loader = DataLoader(
    train_customset,
    batch_size = batch_size,
    shuffle=True,
)

val_loader = DataLoader(
    val_customset,
    batch_size = batch_size,
    shuffle=True,
)

test_loader = DataLoader(
    test_customset,
    batch_size = batch_size,
    shuffle=False,
)



In [10]:
'''
모델 설정
'''
model = models.efficientnet_b0(pretrained=True).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [11]:
len(train_loader)

119

In [12]:
model

EfficientNet(
  (features): Sequential(
    (0): ConvNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): ConvNormActivation(
 

In [12]:
in_features = model.classifier[1].in_features
model.classifier[1] = torch.nn.Linear(in_features=in_features, out_features=18, bias=True).to(device)

## 학습

In [13]:
for epoch in tqdm(range(1, epochs + 1)):
    train_loss, train_acc = train(model = model, data_loader = train_loader, optimizer = optimizer, criterion = criterion)
    val_loss, val_acc = val(model = model, data_loader = val_loader, criterion = criterion)
    
    print(f'epoch : {epoch}, train_loss : {train_loss}, train_acc : {train_acc}, val_loss : {val_loss}, val_acc : {val_acc}')

 10%|█         | 1/10 [02:14<20:13, 134.80s/it]

epoch : 1, train_loss : 0.6830161796147082, train_acc : 0.7816798941798941, val_loss : 0.46568207641442616, val_acc : 0.8394179894179894


 20%|██        | 2/10 [04:29<17:58, 134.77s/it]

epoch : 2, train_loss : 0.3516964311359309, train_acc : 0.8838624338624339, val_loss : 0.6752737800280253, val_acc : 0.7835978835978836


 30%|███       | 3/10 [06:44<15:42, 134.70s/it]

epoch : 3, train_loss : 0.33289135892351135, train_acc : 0.8861111111111111, val_loss : 0.34801584233840305, val_acc : 0.8828042328042328


 40%|████      | 4/10 [08:58<13:27, 134.62s/it]

epoch : 4, train_loss : 0.22974598025824844, train_acc : 0.9214285714285714, val_loss : 0.34292655686537427, val_acc : 0.8907407407407407


 50%|█████     | 5/10 [11:13<11:13, 134.63s/it]

epoch : 5, train_loss : 0.1849399362729878, train_acc : 0.93505291005291, val_loss : 0.44504253516594566, val_acc : 0.8777777777777778


 60%|██████    | 6/10 [13:27<08:58, 134.63s/it]

epoch : 6, train_loss : 0.14453406290722495, train_acc : 0.9498677248677249, val_loss : 0.43900218307971955, val_acc : 0.9052910052910053


 70%|███████   | 7/10 [15:42<06:43, 134.57s/it]

epoch : 7, train_loss : 0.1499619255734592, train_acc : 0.9473544973544974, val_loss : 0.24520428826411564, val_acc : 0.9193121693121693


 80%|████████  | 8/10 [17:56<04:28, 134.45s/it]

epoch : 8, train_loss : 0.11314576087879534, train_acc : 0.9617063492063492, val_loss : 0.16894902462760608, val_acc : 0.9439153439153439


 90%|█████████ | 9/10 [20:10<02:14, 134.43s/it]

epoch : 9, train_loss : 0.10397998238874584, train_acc : 0.9634259259259259, val_loss : 0.3497778947154681, val_acc : 0.8830687830687831


100%|██████████| 10/10 [22:25<00:00, 134.53s/it]

epoch : 10, train_loss : 0.10298629493272606, train_acc : 0.964021164021164, val_loss : 0.5097503652175267, val_acc : 0.8317460317460318





## 예측

In [14]:
# 모델이 테스트 데이터셋을 예측하고 결과를 저장합니다.
all_predictions = pred(model = model, data_loader = test_loader)
submission['ans'] = all_predictions

# 제출할 파일을 저장합니다.
submission.to_csv(os.path.join(submission_dir, 'efficientnetb0_pretrained.csv'), index=False)
print('test inference is done!')

test inference is done!


In [15]:
submission.head()

Unnamed: 0,ImageID,ans
0,cbc5c6e168e63498590db46022617123f1fe1268.jpg,13
1,0e72482bf56b3581c081f7da2a6180b8792c7089.jpg,2
2,b549040c49190cedc41327748aeb197c1670f14d.jpg,13
3,4f9cb2a045c6d5b9e50ad3459ea7b791eb6e18bc.jpg,13
4,248428d9a4a5b6229a7081c32851b90cb8d38d0c.jpg,12
