In [1]:
#!conda install -n base -c conda-forge jupyterlab_widgets -y

In [2]:
# conda install -c conda-forge ipywidgets

In [7]:
import random

import matplotlib
import matplotlib.pyplot as plt

import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset

import torchvision
from torchvision import datasets, models, transforms

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from PIL import Image

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # device 객체

# Set random seed
SEED = 2021
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)  # type: ignore
torch.backends.cudnn.deterministic = True  # type: ignore
torch.backends.cudnn.benchmark = True  # type: ignore

In [8]:
device

device(type='cuda', index=0)

In [9]:
# 데이터셋을 불러올 때 사용할 변형(transformation) 객체 정의
transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) # 정규화(normalization)
])

In [10]:
def train_val_dataset(data):
    
    train_idx = []
    val_idx = []
    
    for i in range(len(data.classes)):
        all_idx = []
        
        for j in tqdm(range(len(data))):
            if i == data[j][1]:
                all_idx.append(j)
        
        split_idx = int(len(all_idx) * 0.8)
        
        train_idx = np.concatenate((train_idx, all_idx[:split_idx]))
        val_idx = np.concatenate((val_idx, all_idx[split_idx:]))
    
    datas = {}
    datas['train'] = Subset(data, list(map(int, train_idx)))
    datas['val'] = Subset(data, list(map(int, val_idx)))
    
    return datas

In [11]:
'''
def train_val_dataset(data, val_split=0.2):
    train_idx, val_idx = train_test_split(list(range(len(data))), test_size=val_split, shuffle=False)
    datas = {}
    datas['train'] = Subset(data, train_idx)
    datas['val'] = Subset(data, val_idx)
    return datas
'''

"\ndef train_val_dataset(data, val_split=0.2):\n    train_idx, val_idx = train_test_split(list(range(len(data))), test_size=val_split, shuffle=False)\n    datas = {}\n    datas['train'] = Subset(data, train_idx)\n    datas['val'] = Subset(data, val_idx)\n    return datas\n"

In [12]:
from torch.utils.data import Dataset, DataLoader

class TestDataset(Dataset):
    def __init__(self, img_paths, transform):
        self.img_paths = img_paths
        self.transform = transform

    def __getitem__(self, index):
        image = Image.open(self.img_paths[index])

        if self.transform:
            image = self.transform(image)
        return image

    def __len__(self):
        return len(self.img_paths)

In [13]:
test_dir = '/opt/ml/input/cropped_v2.1/eval'
submission = pd.read_csv(os.path.join(test_dir, 'info.csv'))

In [14]:
def model_run(target, sub_df, test_dir):
    data_dir = f'/opt/ml/input/cropped_v2.1/train/images_classified_{target}/'
    
    dataset = datasets.ImageFolder(os.path.join(data_dir), transforms)
    
    print(f'*****{target}*****')
    print(dataset)
    print()
    
    class_names = dataset.classes
    print(class_names)
    print()
    
    print('****train, valid split****')
    dataset_split = train_val_dataset(dataset)
    
    dataloader = torch.utils.data.DataLoader(dataset_split['train'], batch_size=128, shuffle=True, num_workers=8)
    valid_dataloader = torch.utils.data.DataLoader(dataset_split['val'], batch_size=128, shuffle=False, num_workers=8)
    
    model = models.resnet34(pretrained=True)

    num_features = model.fc.in_features
    # 전이 학습(transfer learning): 모델의 출력 뉴런 수를 18개로 교체하여 마지막 레이어 다시 학습
    model.fc = nn.Linear(num_features, len(class_names)) 
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    
    num_epochs = 10
    model.train()
    start_time = time.time()

    # 전체 반복(epoch) 수 만큼 반복하며
    print('****start epoch****')
    for epoch in range(num_epochs):
        running_loss = 0.
        running_corrects = 0

        # 배치 단위로 학습 데이터 불러오기
        for inputs, labels in tqdm(dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            # 모델에 입력(forward)하고 결과 계산
            optimizer.zero_grad() # 전체 grad 값을 초기화.
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)

            # 역전파를 통해 기울기(gradient) 계산 및 학습 진행
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

        epoch_loss = running_loss / len(dataset_split['train'])
        epoch_acc = running_corrects / len(dataset_split['train']) * 100.
        
        # validation
        model.eval()
        
        all_labels = []
        all_preds = []
        
        with torch.no_grad():
            running_loss = 0.
            running_corrects = 0
            
            for inputs, labels in valid_dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
        
                all_preds.extend(preds.cpu().numpy())
        
                loss = criterion(outputs, labels)

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
        
                all_labels.extend(labels.data.cpu().numpy())
        
            valid_epoch_loss = running_loss / len(dataset_split['val'])
            valid_epoch_acc = running_corrects / len(dataset_split['val']) * 100.
            F1_score = f1_score(all_labels, all_preds, average='macro')
        
        # 학습 과정 중에 결과 출력
        print('#{} Loss: {:.4f} Acc: {:.4f}% Time: {:.4f}s'.format(epoch, epoch_loss, epoch_acc, time.time() - start_time))
        print(f'Valid Loss: {valid_epoch_loss:.4f} Valid Acc: {valid_epoch_acc:.4f} F1 Score: {F1_score:.4f}')
    '''
    valid_dataloader = torch.utils.data.DataLoader(dataset_split['val'], batch_size=128, shuffle=False, num_workers=8)
    
    model.eval()
    start_time = time.time()

    all_labels = []
    all_preds = []

    with torch.no_grad():
        running_loss = 0.
        running_corrects = 0

        for inputs, labels in valid_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
        
            all_preds.extend(preds.cpu().numpy())
        
            loss = criterion(outputs, labels)

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
        
        
            all_labels.extend(labels.data.cpu().numpy())
        
            
            # 한 배치의 첫 번째 이미지에 대하여 결과 시각화
            print(f'[예측 결과: {class_names[preds[0]]}] (실제 정답: {class_names[labels.data[0]]})')
            imshow(inputs.cpu().data[0], title='예측 결과: ' + class_names[preds[0]])
            
            
        epoch_loss = running_loss / len(dataset_split['val'])
        epoch_acc = running_corrects / len(dataset_split['val']) * 100.
        print('[Test Phase] Loss: {:.4f} Acc: {:.4f}% Time: {:.4f}s'.format(epoch_loss, epoch_acc, time.time() - start_time))
    
    F1_score = f1_score(all_labels, all_preds, average='macro')
    
    print(f'*****F1 Score: {F1_score}*****')
    '''
    image_dir = os.path.join(test_dir, 'images')
    
    image_paths = [os.path.join(image_dir, img_id) for img_id in sub_df.ImageID]
    
    test_set = TestDataset(image_paths, transforms)
    
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=128, shuffle=False, num_workers=8)
    
    all_predictions = []

    with torch.no_grad():
        for inputs in tqdm(test_dataloader):
            inputs = inputs.to(device)
        
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
        
            all_predictions.extend(preds.cpu().numpy())
    
    all_predictions2 = []

    for p in all_predictions:
        all_predictions2.append(class_names[p])
        
    sub_df[target] = all_predictions2
    
    return sub_df

In [None]:
submission = model_run('mask', submission, test_dir).copy()

  0%|          | 21/18900 [00:00<01:30, 208.13it/s]

*****mask*****
Dataset ImageFolder
    Number of datapoints: 18900
    Root location: /opt/ml/input/cropped_v2.1/train/images_classified_mask/
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=PIL.Image.BILINEAR)
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )

['0', '12', '6']

****train, valid split****


100%|██████████| 18900/18900 [00:56<00:00, 334.06it/s]
100%|██████████| 18900/18900 [00:56<00:00, 335.13it/s]
100%|██████████| 18900/18900 [00:56<00:00, 333.04it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

****start epoch****


100%|██████████| 119/119 [00:29<00:00,  4.09it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

#0 Loss: 1.6857 Acc: 85.3638% Time: 32.6672s
Valid Loss: 35.6006 Valid Acc: 14.2857 F1 Score: 0.0833


100%|██████████| 119/119 [00:27<00:00,  4.29it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

#1 Loss: nan Acc: 63.8095% Time: 63.8060s
Valid Loss: nan Valid Acc: 71.4286 F1 Score: 0.2778


100%|██████████| 119/119 [00:27<00:00,  4.30it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

#2 Loss: nan Acc: 71.4286% Time: 95.0254s
Valid Loss: nan Valid Acc: 71.4286 F1 Score: 0.2778


100%|██████████| 119/119 [00:27<00:00,  4.27it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

#3 Loss: nan Acc: 71.4286% Time: 126.4854s
Valid Loss: nan Valid Acc: 71.4286 F1 Score: 0.2778


100%|██████████| 119/119 [00:27<00:00,  4.27it/s]
  0%|          | 0/119 [00:00<?, ?it/s]

#4 Loss: nan Acc: 71.4286% Time: 157.9359s
Valid Loss: nan Valid Acc: 71.4286 F1 Score: 0.2778


 12%|█▏        | 14/119 [00:04<00:24,  4.26it/s]

In [None]:
submission = model_run('gender', submission, test_dir).copy()

In [None]:
submission = model_run('age', submission, test_dir).copy()

In [14]:
submission = submission.astype({'age':'int','gender':'int','mask':'int'})

In [15]:
try : submission['ans'] = submission['mask'] + submission['gender'] + submission['age'] ; submission2 = submission.drop(['mask','gender','age'],axis=1)
except : pass
submission2

Unnamed: 0,ImageID,ans
0,cbc5c6e168e63498590db46022617123f1fe1268.jpg,13
1,0e72482bf56b3581c081f7da2a6180b8792c7089.jpg,2
2,b549040c49190cedc41327748aeb197c1670f14d.jpg,13
3,4f9cb2a045c6d5b9e50ad3459ea7b791eb6e18bc.jpg,13
4,248428d9a4a5b6229a7081c32851b90cb8d38d0c.jpg,12
...,...,...
12595,d71d4570505d6af8f777690e63edfa8d85ea4476.jpg,2
12596,6cf1300e8e218716728d5820c0bab553306c2cfd.jpg,4
12597,8140edbba31c3a824e817e6d5fb95343199e2387.jpg,9
12598,030d439efe6fb5a7bafda45a393fc19f2bf57f54.jpg,1


In [16]:
submission2.to_csv(os.path.join(test_dir, 'submission.csv'), index=False)