In [1]:
import os
import pandas as pd
from PIL import Image

from tqdm import tqdm

data_dir = '../input/data/train/'
test_dir = '../input/data/eval/'
submission_dir = './submission/'
image_data_dir = data_dir + 'images/'

## 데이터 전처리

In [2]:
train_df = pd.read_csv(data_dir + 'train.csv')
submission = pd.read_csv(test_dir + 'info.csv')

In [3]:
'''
신규범님 코드 참고

학습 데이터 구축
'''
def age_group(x):
    if x < 30: return 0
    elif x < 60: return 1
    else: return 2


df = []

for idx, line in tqdm(enumerate(train_df.iloc)):
    for file in list(os.listdir(os.path.join(image_data_dir, line['path']))):
        if file[0] == '.':
            continue
        if file.split('.')[0] == 'normal':
            mask = 2
        elif file.split('.')[0] == 'incorrect_mask':
            mask = 1
        else:
            mask = 0
        gender = 0 if line['gender'] == 'male' else 1
        data = {
            'id' : line['id'],
            'gender' : line['gender'],
            'age_group' : age_group(line['age']),
            'mask' : mask,
            'path': os.path.join(image_data_dir, line['path'], file),
            'label': mask * 6 + gender * 3 + age_group(line['age'])
        }
        df.append(data)

df = pd.DataFrame(df)

2700it [00:00, 2800.18it/s]


In [4]:
'''
데이터셋 분리
'''
from sklearn.model_selection import train_test_split

train_idx, val_idx = train_test_split(df['label'], train_size = 0.8, random_state = 22, stratify = df['label'])
                                      
train_set, val_set = df.iloc[train_idx.index, :], df.iloc[val_idx.index, :]

## 모델 구축

In [5]:
'''
Efficientnetb5 모델 구축

'''

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import Resize, ToTensor, Normalize
import torchvision.models as models

## 데이터셋 구축

In [6]:
'''
Sample_submission 코드 참고

데이터 셋 구축
'''

class CustomDataset(Dataset):
    def __init__(self, df, transform, train = True):
        
        image_dir = '../input/data/eval/images'
        
        self.train = train
        self.df = df
        if self.train:
            self.img_paths = self.df['path'].tolist()
            self.labels = self.df['label'].tolist()
        else:
            self.img_paths = [os.path.join(image_dir, img_id) for img_id in self.df.ImageID]
        self.transform = transform

    def __getitem__(self, index):
        image = Image.open(self.img_paths[index])
        if self.transform:
            image = self.transform(image)
        
        if self.train: return image, torch.tensor(self.labels[index])
        else: return image

    def __len__(self):
        return len(self.img_paths)

## 학습 설정

In [7]:
'''
학습 함수 설정
'''

def train(model, data_loader, optimizer, criterion):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (images, targets) in enumerate(data_loader):
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()

        benign_outputs = model(images)
        loss = criterion(benign_outputs, targets)
        loss.backward()

        optimizer.step()
        train_loss += loss.item()
        _, predicted = benign_outputs.max(1)

        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    
    train_loss /= len(data_loader)
    acc = correct / total
    
    return train_loss, acc


def val(model, data_loader, criterion):
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    
    for batch_idx, (images, targets) in enumerate(data_loader):
        with torch.no_grad():
            images, targets = images.to(device), targets.to(device)
            benign_outputs = model(images)
            loss = criterion(benign_outputs, targets)
            val_loss += loss.item()
            _, predicted = benign_outputs.max(1)

            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    val_loss /= len(data_loader)
    acc = correct / total
    
    return val_loss, acc

def pred(model, data_loader):
    all_predictions = []
    for images in data_loader:
        with torch.no_grad():
            images = images.to(device)
            pred = model(images)
            pred = pred.argmax(dim=-1)
            all_predictions.extend(pred.cpu().numpy())
    
    return all_predictions

In [8]:
'''
학습 설정
'''

device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 0.01
epochs = 10
batch_size = 4

In [9]:
'''
데이터 로더 생성
'''

transform = transforms.Compose([
    ToTensor(),
    Normalize(mean=(0.5, 0.5, 0.5), std=(0.2, 0.2, 0.2)),
    Resize((456, 456), Image.BILINEAR),
])

train_customset = CustomDataset(df = train_set, transform = transform, train = True)
val_customset = CustomDataset(df = val_set, transform = transform, train = True)
test_customset = CustomDataset(df = submission, transform = transform, train = False)

train_loader = DataLoader(
    train_customset,
    batch_size = batch_size,
    shuffle=True,
)

val_loader = DataLoader(
    val_customset,
    batch_size = batch_size,
    shuffle=True,
)

test_loader = DataLoader(
    test_customset,
    batch_size = batch_size,
    shuffle=False,
)



In [10]:
'''
모델 설정
'''
model = models.efficientnet_b5(pretrained=True).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

Downloading: "https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth" to /opt/ml/.cache/torch/hub/checkpoints/efficientnet_b5_lukemelas-b6417697.pth


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [11]:
len(train_loader)

3780

In [12]:
model.classifier[1] = torch.nn.Linear(in_features=2048, out_features=18, bias=True).to(device)

## 학습

In [13]:
torch.cuda.empty_cache()
for epoch in tqdm(range(1, epochs + 1)):
    train_loss, train_acc = train(model = model, data_loader = train_loader, optimizer = optimizer, criterion = criterion)
    val_loss, val_acc = val(model = model, data_loader = val_loader, criterion = criterion)
    
    print(f'epoch : {epoch}, train_loss : {train_loss}, train_acc : {train_acc}, val_loss : {val_loss}, val_acc : {val_acc}')

 10%|█         | 1/10 [12:33<1:52:58, 753.18s/it]

epoch : 1, train_loss : 2.3656142974972094, train_acc : 0.22156084656084657, val_loss : 2.477101001537666, val_acc : 0.16666666666666666


 20%|██        | 2/10 [25:12<1:40:39, 754.89s/it]

epoch : 2, train_loss : 2.0652309260671102, train_acc : 0.3617063492063492, val_loss : 1.7559299804546216, val_acc : 0.4621693121693122


 30%|███       | 3/10 [37:48<1:28:06, 755.21s/it]

epoch : 3, train_loss : 1.5350351201715293, train_acc : 0.5203703703703704, val_loss : 1.0778161486620625, val_acc : 0.6328042328042328


 40%|████      | 4/10 [50:27<1:15:39, 756.55s/it]

epoch : 4, train_loss : 1.0442277755332057, train_acc : 0.6673941798941799, val_loss : 0.8244789003222077, val_acc : 0.7317460317460317


 50%|█████     | 5/10 [1:03:07<1:03:07, 757.41s/it]

epoch : 5, train_loss : 0.8055225119586029, train_acc : 0.7387566137566137, val_loss : 0.6953270948871418, val_acc : 0.7637566137566137


 60%|██████    | 6/10 [1:15:44<50:29, 757.41s/it]  

epoch : 6, train_loss : 0.6797959216619058, train_acc : 0.7776455026455027, val_loss : 0.639935351912149, val_acc : 0.7732804232804232


 70%|███████   | 7/10 [1:28:12<37:43, 754.51s/it]

epoch : 7, train_loss : 0.595991336123121, train_acc : 0.799404761904762, val_loss : 0.5190564053596327, val_acc : 0.8134920634920635


 80%|████████  | 8/10 [1:40:39<25:04, 752.42s/it]

epoch : 8, train_loss : 0.5203819571523656, train_acc : 0.8277777777777777, val_loss : 0.5652828999868934, val_acc : 0.7928571428571428


 90%|█████████ | 9/10 [1:53:06<12:30, 750.83s/it]

epoch : 9, train_loss : 0.4613996681866911, train_acc : 0.8431878306878307, val_loss : 0.37484103189707396, val_acc : 0.8669312169312169


100%|██████████| 10/10 [2:05:34<00:00, 753.45s/it]

epoch : 10, train_loss : 0.42701202459317983, train_acc : 0.8551587301587301, val_loss : 0.4446101817396031, val_acc : 0.8383597883597883





## 예측

In [14]:
# 모델이 테스트 데이터셋을 예측하고 결과를 저장합니다.
all_predictions = pred(model = model, data_loader = test_loader)
submission['ans'] = all_predictions

# 제출할 파일을 저장합니다.
submission.to_csv(os.path.join(submission_dir, 'efficientnetb5_pretrained.csv'), index=False)
print('test inference is done!')

test inference is done!


In [15]:
submission.head()

Unnamed: 0,ImageID,ans
0,cbc5c6e168e63498590db46022617123f1fe1268.jpg,13
1,0e72482bf56b3581c081f7da2a6180b8792c7089.jpg,1
2,b549040c49190cedc41327748aeb197c1670f14d.jpg,17
3,4f9cb2a045c6d5b9e50ad3459ea7b791eb6e18bc.jpg,13
4,248428d9a4a5b6229a7081c32851b90cb8d38d0c.jpg,16
