# EfficientNet을 활용한 CESCO Q&A classification

## 1. Data Loader 만들기
* 우리가 만든 데이터셋을 로드하기 위해 DataLoader를 만듦

In [2]:
import torch
import os
from torch.utils.data import Dataset, DataLoader
from collections import Counter

In [3]:
DATASET_DIR = 'dataset'
LARVA_PUPA = 'larva_pupa'
IMAGO = 'imago'
ETC = 'etc'
BATCH_SIZE = 10
TRAIN_RATIO = 0.8
SEED = 42

def load_dataset(is_train=True):
    # For Debugging
    torch.manual_seed(42)

    from torchvision import transforms, datasets
    data_transform = {
        'train' : transforms.Compose([transforms.Resize((224,224)),
                                      transforms.RandomRotation(30),
                                      transforms.RandomHorizontalFlip(),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]),
        'val' : transforms.Compose([transforms.Resize((224,224)),
                                    transforms.ToTensor(),
                                    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
    }

    # load dataset
    image_datasets = datasets.ImageFolder(os.path.join(DATASET_DIR),
                                          transform=data_transform['train'])

    # train set과 validation set을 랜덤하게 할당
    train_cnt = int(len(image_datasets) * TRAIN_RATIO)
    val_cnt = len(image_datasets) - train_cnt

    train, val = torch.utils.data.random_split(image_datasets, [train_cnt, val_cnt])

    # 클래스별 데이터 개수 확인
    train_classes = [label for _, label in train]
    print("train:",Counter(train_classes))
    train_classes = [label for _, label in val]
    print("valid:",Counter(train_classes))

    # data loader 만들기
    train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    valid_loader = DataLoader(val, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    return train_loader, valid_loader

train_loader, valid_loader = load_dataset()

train: Counter({1: 4582, 0: 1121, 2: 1109})
valid: Counter({1: 1114, 0: 299, 2: 291})


* etc : 0, imago : 1, larva_pupa: 2
* 라벨링한 데이터를 분석해 본 결과 imago가 가장 많음

## 2. trainer 만들기

In [4]:
from efficientnet import EfficientNet
from torchsummary import summary

In [5]:
# 사용할 모델은 EfficientNet 으로 정함
# https://arxiv.org/abs/1905.11946
model = EfficientNet.from_pretrained('efficientnet-b5', num_classes=3)
summary(model, input_size=(3,224,224), device='cpu')

Loaded pretrained weights for efficientnet-b5
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         ZeroPad2d-1          [-1, 3, 225, 225]               0
Conv2dStaticSamePadding-2         [-1, 48, 112, 112]           1,296
       BatchNorm2d-3         [-1, 48, 112, 112]              96
MemoryEfficientSwish-4         [-1, 48, 112, 112]               0
         ZeroPad2d-5         [-1, 48, 114, 114]               0
Conv2dStaticSamePadding-6         [-1, 48, 112, 112]             432
       BatchNorm2d-7         [-1, 48, 112, 112]              96
MemoryEfficientSwish-8         [-1, 48, 112, 112]               0
          Identity-9             [-1, 48, 1, 1]               0
Conv2dStaticSamePadding-10             [-1, 12, 1, 1]             588
MemoryEfficientSwish-11             [-1, 12, 1, 1]               0
         Identity-12             [-1, 12, 1, 1]               0
Conv2dStaticSamePadding-13        

In [6]:
import torch.optim as optim
import torch.nn as nn
import time
import copy

from efficientnet import EfficientNet
from torch.optim import lr_scheduler

In [34]:
NUM_OF_EPOCH = 10

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = EfficientNet.from_pretrained('efficientnet-b5', num_classes=3)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001) # 기울기 업데이트 -> how to?
criterion = nn.CrossEntropyLoss() # y_hat과 y의 차이를 구함
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

since = time.time()

best_model_weights = copy.deepcopy(model.state_dict())
best_acc = 0.0

Loaded pretrained weights for efficientnet-b5


In [35]:
for epoch in range(NUM_OF_EPOCH):
    print('Epoch {}/{}'.format(epoch + 1, NUM_OF_EPOCH))
    print('-' * 10)
    model.train()

    running_loss = 0.0
    running_corrects = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad() # 이전에 구했던 기울기를 0으로 초기화 함

        outputs = model(inputs) # 네트워크를 통과시킴 => computational graph가 만들어짐
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)

        # back-propagation
        loss.backward()
        optimizer.step()

        running_loss += float(loss.item() * inputs.size(0))
        running_corrects += float(torch.sum(preds == labels.data))

    scheduler.step()

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = running_corrects / len(train_loader.dataset)
    # writer.add_graph('epoch loss', epoch_loss, epoch)
    # writer.add_graph('epoch acc', epoch_acc, epoch)

    print('{} Loss: {:.4f} Acc: {:.4f}'.format(
        "train", epoch_loss, epoch_acc))

    if epoch_acc > best_acc:
        best_acc = epoch_acc
        best_model_weights = copy.deepcopy(model.state_dict())

Epoch 1/10
----------
train Loss: 0.8885 Acc: 0.6647
Epoch 2/10
----------
train Loss: 0.8722 Acc: 0.6723
Epoch 3/10
----------
train Loss: 0.8683 Acc: 0.6723
Epoch 4/10
----------
train Loss: 0.8684 Acc: 0.6726
Epoch 5/10
----------
train Loss: 0.8694 Acc: 0.6726
Epoch 6/10
----------
train Loss: 0.8667 Acc: 0.6726
Epoch 7/10
----------
train Loss: 0.8662 Acc: 0.6726
Epoch 8/10
----------
train Loss: 0.8618 Acc: 0.6726
Epoch 9/10
----------
train Loss: 0.8608 Acc: 0.6726
Epoch 10/10
----------
train Loss: 0.8616 Acc: 0.6726


* 그대로 학습을 수행한 결과 대부분의 결과를 imago로 추측하는 것으로 overfitting되는 것을 확인할 수 있었음
the number of total training dataset : 6812
the number of imago data in training dataset : 4582
the ratio of imago to total : 0.6726365237815619

* class imbalance한 상황에서 제대로 학습 시키기 위해 data관점 method를 적용
* 참고 :
Survey on deep learning with class imbalance - https://journalofbigdata.springeropen.com/articles/10.1186/s40537-019-0192-5
How to handle imbalanced classes - https://discuss.pytorch.org/t/how-to-handle-imbalanced-classes/11264/13
PyTorch [Basics] — Sampling Samplers - https://towardsdatascience.com/pytorch-basics-sampling-samplers-2a0f29f0bf2a

# Dataloader Using Weighted RandomSampler

* Weighted Random Sampler로 weight를 줌으로써 imago로 overfitting 되는 것을 방지

In [1]:
import torch
import os
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from torch.utils.data import WeightedRandomSampler

In [2]:
DATASET_DIR = 'dataset'
LARVA_PUPA = 'larva_pupa'
IMAGO = 'imago'
ETC = 'etc'
BATCH_SIZE = 16
TRAIN_RATIO = 0.8
SEED = 42


def get_class_distribution(dataset_obj, idx2class):
    count_dict = {k:0 for k,v in dataset_obj.class_to_idx.items()}

    for element in dataset_obj:
        y_lbl = element[1]
        y_lbl = idx2class[y_lbl]
        count_dict[y_lbl] += 1

    return count_dict

def load_dataset(is_train=True):
    # For Debugging
    torch.manual_seed(42)

    from torchvision import transforms, datasets
    data_transform = {
        'train' : transforms.Compose([transforms.Resize((224,224)),
                                      transforms.RandomRotation(30),
                                      transforms.RandomHorizontalFlip(),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]),
        'val' : transforms.Compose([transforms.Resize((224,224)),
                                    transforms.ToTensor(),
                                    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
    }

    # load dataset
    image_datasets = datasets.ImageFolder(os.path.join(DATASET_DIR),
                                          transform=data_transform['train'])

    # train set과 validation set을 랜덤하게 할당
    train_cnt = int(len(image_datasets) * TRAIN_RATIO)
    val_cnt = len(image_datasets) - train_cnt

    train, val = torch.utils.data.random_split(image_datasets, [train_cnt, val_cnt])

    # 클래스별 데이터 개수 확인
    #train_classes = [label for _, label in train]
    #print("train:",Counter(train_classes))
    #train_classes = [label for _, label in val]
    #print("valid:",Counter(train_classes))

    # weighted sampler 만들기
    idx2class = {v: k for k, v in image_datasets.class_to_idx.items()}
    class_count = [i for i in get_class_distribution(image_datasets, idx2class).values()]
    class_weights = 1./torch.tensor(class_count, dtype=torch.float)

    train_class_weights = torch.tensor([class_weights[image_datasets[int(index)][1]] for index in train.indices])
    valid_class_weights = torch.tensor([class_weights[image_datasets[int(index)][1]] for index in val.indices])

    train_weighted_sampler = WeightedRandomSampler(
        weights=train_class_weights,
        num_samples=train_cnt,
        replacement=True
    )
    valid_weighted_sampler = WeightedRandomSampler(
        weights=valid_class_weights,
        num_samples=val_cnt,
        replacement=True
    )

    # data loader 만들기
    train_loader = DataLoader(train,
                              batch_size=BATCH_SIZE,
                              shuffle=False, num_workers=4,
                              sampler=train_weighted_sampler)
    valid_loader = DataLoader(val,
                              batch_size=BATCH_SIZE,
                              shuffle=False, num_workers=4,
                              sampler=valid_weighted_sampler)

    return train_loader, valid_loader

train_loader, valid_loader = load_dataset()

## Training

In [3]:
import torch.optim as optim
import torch.nn as nn
import time
import copy

from efficientnet import EfficientNet
from torch.optim import lr_scheduler

In [4]:
NUM_OF_EPOCH = 30

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = EfficientNet.from_pretrained('efficientnet-b5', num_classes=3)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001) # 기울기 업데이트 -> how to?
criterion = nn.CrossEntropyLoss() # y_hat과 y의 차이를 구함
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

since = time.time()

best_model_weights = copy.deepcopy(model.state_dict())
best_acc = 0.0

Loaded pretrained weights for efficientnet-b5


In [5]:
for epoch in range(NUM_OF_EPOCH):
    print('Epoch {}/{}'.format(epoch + 1, NUM_OF_EPOCH))
    print('-' * 10)
    model.train()

    running_loss = 0.0
    running_corrects = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad() # 이전에 구했던 기울기를 0으로 초기화 함

        outputs = model(inputs) # 네트워크를 통과시킴 => computational graph가 만들어짐
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)

        # back-propagation
        loss.backward()
        optimizer.step()

        running_loss += float(loss.item() * inputs.size(0))
        running_corrects += float(torch.sum(preds == labels.data))

    scheduler.step()

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = running_corrects / len(train_loader.dataset)
    # writer.add_graph('epoch loss', epoch_loss, epoch)
    # writer.add_graph('epoch acc', epoch_acc, epoch)

    print('{} Loss: {:.4f} Acc: {:.4f}'.format(
        "train", epoch_loss, epoch_acc))

    if epoch_acc > best_acc:
        best_acc = epoch_acc
        best_model_weights = copy.deepcopy(model.state_dict())

Epoch 1/30
----------
train Loss: 1.1061 Acc: 0.3801
Epoch 2/30
----------
train Loss: 1.0899 Acc: 0.3920
Epoch 3/30
----------
train Loss: 1.0748 Acc: 0.4103
Epoch 4/30
----------
train Loss: 1.0671 Acc: 0.4127
Epoch 5/30
----------
train Loss: 1.0440 Acc: 0.4469
Epoch 6/30
----------
train Loss: 1.0296 Acc: 0.4693
Epoch 7/30
----------
train Loss: 1.0071 Acc: 0.4952
Epoch 8/30
----------
train Loss: 0.9367 Acc: 0.5446
Epoch 9/30
----------
train Loss: 0.8930 Acc: 0.5662
Epoch 10/30
----------
train Loss: 0.8448 Acc: 0.5973
Epoch 11/30
----------
train Loss: 0.8080 Acc: 0.6240
Epoch 12/30
----------
train Loss: 0.7759 Acc: 0.6403
Epoch 13/30
----------
train Loss: 0.7292 Acc: 0.6649
Epoch 14/30
----------
train Loss: 0.7044 Acc: 0.6762
Epoch 15/30
----------
train Loss: 0.6717 Acc: 0.6998
Epoch 16/30
----------
train Loss: 0.6533 Acc: 0.7064
Epoch 17/30
----------
train Loss: 0.6481 Acc: 0.7173
Epoch 18/30
----------
train Loss: 0.6380 Acc: 0.7156
Epoch 19/30
----------
train Loss: 0.

In [6]:
# calculate training time
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))

Training complete in 50m 31s
Best val Acc: 0.740311


In [7]:
# save model weight
torch.save(best_model_weights, './weights/best_weights_b5_class_3.pth')

## Validataion

In [12]:
import torch
from efficientnet import EfficientNet

In [13]:
weight_path = './weights/best_weights_b5_class_3.pth'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 모델 구성후 학습된 파라미터 로드
model = EfficientNet.from_pretrained('efficientnet-b5', num_classes=3)
model.load_state_dict(torch.load(weight_path))
model.to(device)

Loaded pretrained weights for efficientnet-b5


EfficientNet(
  (_conv_stem): Conv2dStaticSamePadding(
    3, 48, kernel_size=(3, 3), stride=(2, 2), bias=False
    (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
  )
  (_bn0): BatchNorm2d(48, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  (_blocks): ModuleList(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        48, 48, kernel_size=(3, 3), stride=[1, 1], groups=48, bias=False
        (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
      )
      (_bn1): BatchNorm2d(48, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        48, 12, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        12, 48, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dStaticSamePadding(
        48, 24, kernel_siz

In [15]:
model.eval()

running_corrects = 0

with torch.no_grad():
    for inputs, labels in valid_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)

        running_corrects += float(torch.sum(preds == labels.data))

    valid_acc = running_corrects / len(valid_loader.dataset)

    print('{} Acc: {:.4f}'.format(
        "valid", valid_acc))

valid Acc: 0.4624


## 간단한 분석
* 데이터셋을 사람이 직접 하지 않고, 답변 문자열을 분석하여 키워드 중심으로 분류했기 때문에 정확하게 라벨링 되지 않고 많은 noise가 섞여있다
* 따라서, 데이터셋을 확인해보았을 때, validation accuracy가 46% 인것이 나쁘지 않은 결과인 것 같다.
* 만약, 제대로 라벨링 된 데이터셋을 사용했다면 더 좋은 성능을 낼 수 있을 것이라고 생각된다.
* (직접 라벨링 하면 성능은 높일 수 있겠지만, 라벨링을 손으로 하는 것은 수업의 목적에 부합하다고 생각하지 때문에 생략했다..)