In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Upper bound on how many input file paths are printed, and the running tally.
max_files = 10
file_count = 0

for dirname, _, filenames in os.walk('/kaggle/input'):
    # Take only as many names from this directory as the cap still allows.
    for filename in filenames[:max_files - file_count]:
        print(os.path.join(dirname, filename))
        file_count += 1
    # Stop walking the tree once the cap has been reached.
    if not file_count < max_files:
        break

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/plant-pathology-2020-fgvc7/sample_submission.csv
/kaggle/input/plant-pathology-2020-fgvc7/train.csv
/kaggle/input/plant-pathology-2020-fgvc7/test.csv
/kaggle/input/plant-pathology-2020-fgvc7/images/Test_1743.jpg
/kaggle/input/plant-pathology-2020-fgvc7/images/Test_262.jpg
/kaggle/input/plant-pathology-2020-fgvc7/images/Train_1524.jpg
/kaggle/input/plant-pathology-2020-fgvc7/images/Train_1336.jpg
/kaggle/input/plant-pathology-2020-fgvc7/images/Train_1461.jpg
/kaggle/input/plant-pathology-2020-fgvc7/images/Train_1797.jpg
/kaggle/input/plant-pathology-2020-fgvc7/images/Test_1499.jpg


--- 
### Plant Pathology 2020 - Advanced Model
- Kaggle : Plant Pathology 2020 - FGVC7
- 병든 나뭇잎을 식별하는 다중분류 문제
- target = (0, 1) :
    - healthy = 1 : 잎사귀의 건강상태
    - multiple_diseases = 1 : 다수의 질병
    - rust = 1 : 녹병
    - scab = 1 : 곰팡이병
- 성능개선 방법 : 1) 훈련단계에서 epochs 증가 및 스케줄러 추가 / 2) 예측단계에서 TTA(데이터 증강)과 레이블 스무딩 
--- 

In [2]:
import numpy as np 
import pandas as pd 
import os 
# Load the competition tables: training labels, test ids and the sample submission.
data_path = '/kaggle/input/plant-pathology-2020-fgvc7/'

train, test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

print(train.shape, test.shape, submission.shape)

(1821, 5) (1821, 1) (1821, 5)


In [3]:
# 1.시드고정 
import torch 
import random 

seed = 50

# Seed every RNG in play: Python hashing, `random`, NumPy, and torch (CPU plus all CUDA devices).
os.environ['PYTHONHASHSEED'] = str(seed)
for seed_fn in (random.seed, np.random.seed, torch.manual_seed,
                torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

# cuDNN switches: request deterministic kernels, turn off benchmark autotuning,
# and disable the cuDNN backend itself.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [4]:
# 4.train/valid data split : 4개 target변수들의 비율에 맞춰 층화추출 
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation, stratified on the four target columns.
# NOTE(review): this rebinds `train`, so re-running the cell splits the already-reduced frame again.
label_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
train, valid = train_test_split(
    train,
    test_size=0.1,
    random_state=50,
    stratify=train[label_columns],
)

In [5]:
# 5. data class define 
from torch.utils.data import Dataset 

class ImageDataset(Dataset):
    """Dataset that loads ``<img_dir><image_id>.jpg`` files listed in a DataFrame.

    Args:
        df: DataFrame whose first column holds the image id. For labelled data,
            columns 1-4 hold the target columns; the label is the position of
            the largest value among them.
        img_dir: Prefix placed in front of ``<image_id>.jpg``. It is joined by
            plain string concatenation, so it should end with a path separator.
        transform: Optional albumentations-style callable, invoked as
            ``transform(image=image)['image']``.
        is_test: When True, ``__getitem__`` returns only the image; otherwise it
            returns ``(image, label)``.

    Note:
        ``cv2`` is looked up as a module-level name when an item is fetched, so
        it must be imported before the dataset is first indexed.
    """

    def __init__(self, df, img_dir = './', transform = None, is_test = False):
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows (images) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, convert and transform the image at positional index ``idx``.

        Returns:
            The image alone when ``is_test`` is True, else ``(image, label)``
            where ``label`` is an int in ``0..3``.

        Raises:
            FileNotFoundError: if the image file is missing or cannot be decoded.
        """
        img_id = self.df.iloc[idx, 0]
        img_path = self.img_dir + img_id + '.jpg'

        image = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image file: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            # albumentations transforms take keyword input and return a dict.
            image = self.transform(image=image)['image']

        if self.is_test:
            return image

        # Position of the largest target value is the class index, e.g.
        # healthy -> 0, multiple_diseases -> 1, rust -> 2, scab -> 3.
        # Work on a plain array so the result is positional regardless of the
        # pandas version's Series.argmax semantics.
        label = int(np.argmax(self.df.iloc[idx, 1:5].to_numpy()))
        return image, label

In [6]:
# 6.이미지 변환기 정의 및 훈련/검증/테스트 데이터 변환  
# 이미지 크기, 밝기/명암, 상하좌우 대칭, 이동, 스케일링, 회전, 엠보싱/샤프닝/블러, 어파인, 정규화&텐서 변환 
import albumentations as A 
from albumentations.pytorch import ToTensorV2

# Target size for every image (the originals are wider than they are tall).
IMG_HEIGHT, IMG_WIDTH = 450, 650

# Training pipeline: resize, random photometric/geometric augmentation, then normalize + tensor.
transform_train = A.Compose([
    A.Resize(height=IMG_HEIGHT, width=IMG_WIDTH),
    # Brightness / contrast jitter in [-0.2, +0.2], applied 30% of the time.
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.3),
    A.VerticalFlip(p=0.2),
    A.HorizontalFlip(p=0.5),
    # Shift up to 10%, scale up to 20%, rotate up to 30 degrees; applied 30% of the time.
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=30, p=0.3),
    # Exactly one of emboss / sharpen / blur, applied 30% of the time.
    A.OneOf(
        [
            A.Emboss(p=1),
            A.Sharpen(p=1),
            A.Blur(p=1),
        ],
        p=0.3,
    ),
    # Piecewise affine warp, applied 30% of the time.
    A.PiecewiseAffine(p=0.3),
    # Normalization (the torchvision counterpart would be transforms.Normalize()).
    A.Normalize(),
    # Conversion to a tensor (the torchvision counterpart would be transforms.ToTensor()).
    ToTensorV2(),
])

# Validation / test pipeline: only the mandatory steps (resize, normalize, to-tensor).
transform_test = A.Compose([
    A.Resize(height=IMG_HEIGHT, width=IMG_WIDTH),
    A.Normalize(),
    ToTensorV2(),
])

  check_for_updates()
  original_init(self, **validated_kwargs)


In [7]:
# 7. 데이터셋 생성 
# 멀티 프로세스 활용을 위해, 데이터로더 시드값 고정하고 제너레이터 생성 
img_dir = '/kaggle/input/plant-pathology-2020-fgvc7/images/'

# Pair each split with its pipeline: train -> transform_train, valid -> transform_test.
dataset_train, dataset_valid = (
    ImageDataset(frame, img_dir=img_dir, transform=pipeline)
    for frame, pipeline in ((train, transform_train), (valid, transform_test))
)

# Worker-seeding hook for DataLoader: re-seeds the non-torch RNGs in the calling process
# from torch's own seed.
def seed_worker(worker_id):
    """Seed Python's ``random`` and NumPy from ``torch.initial_seed()``.

    The torch seed is reduced modulo 2**32 before it is handed to ``random.seed``
    and ``np.random.seed``. ``worker_id`` is accepted to match the
    ``worker_init_fn`` signature but is not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (random.seed, np.random.seed):
        reseed(derived_seed)

# Dedicated torch generator with a fixed seed, for reproducible DataLoader randomness.
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7c523ed09ad0>

In [8]:
# 8. 데이터 로더 생성 
from torch.utils.data import DataLoader
batch_size = 4

# Settings common to both loaders; only the dataset and the shuffling differ.
_loader_common = dict(batch_size=batch_size, worker_init_fn=seed_worker, generator=g, num_workers=2)

loader_train = DataLoader(dataset_train, shuffle=True, **_loader_common)
loader_valid = DataLoader(dataset_valid, shuffle=False, **_loader_common)

In [9]:
# 9. 모델 임포트 
!pip install efficientnet-pytorch==0.7.1
from efficientnet_pytorch import EfficientNet 

# num_classes is the size of the final output layer: 4 here
# (healthy, multiple_diseases, rust, scab). The model is moved to `device` right away.
model = EfficientNet.from_pretrained('efficientnet-b7', num_classes=4).to(device)

Collecting efficientnet-pytorch==0.7.1
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16425 sha256=89221cb477e72b3beb287f57ced82c4292f03ba11219daf1e3cafe79a487313e
  Stored in directory: /root/.cache/pip/wheels/03/3f/e9/911b1bc46869644912bda90a56bcf7b960f20b5187feea3baf
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.1


Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b7-dcc49843.pth
100%|██████████| 254M/254M [00:02<00:00, 115MB/s]  


Loaded pretrained weights for efficientnet-b7


In [10]:
# 10. 손실함수와 옵티마이저 설정
import torch.nn as nn 

# AdamW over all model parameters, plus a cross-entropy objective.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=6e-5, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

In [11]:
# 11. epochs를 39로 증가 
# 12. 스케줄러 정의 : 훈련과정에서 학습률을 조정 (최적값에 가까울수록 학습률을 작게)
# get_cosine_schedule_with_warmup() : 지정값 만큼 학습률 증가했다가, 코사인 형태로 점차 감소시킴 
from transformers import get_cosine_schedule_with_warmup
epochs = 39

# The scheduler is stepped once per batch, so both horizons are expressed in batches.
steps_per_epoch = len(loader_train)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    # Warm up to the optimizer's configured learning rate over the first 3 epochs...
    num_warmup_steps=3 * steps_per_epoch,
    # ...then follow the cosine decay until all `epochs` epochs are done.
    num_training_steps=epochs * steps_per_epoch,
)

In [12]:
# 11. 모델 훈련&검증 : epoch 단위로 훈련과 성능검증을 반복함  
import cv2
from sklearn.metrics import roc_auc_score 
from tqdm.notebook import tqdm

for epoch in range(epochs):
    # ---------------- training pass ----------------
    model.train()
    epoch_train_loss = 0

    for images, labels in tqdm(loader_train):
        images, labels = images.to(device), labels.to(device)

        # Clear stale gradients, run the forward pass and accumulate the batch loss.
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        epoch_train_loss += loss.item()

        # Backward pass, weight update, then advance the per-batch LR schedule.
        loss.backward()
        optimizer.step()
        scheduler.step()

    print(f'epoch[{epoch+1}/{epochs}] - loss : {epoch_train_loss/len(loader_train):.4f}') 

    # ---------------- validation pass ----------------
    model.eval()
    epoch_valid_loss = 0
    preds_list, true_onehot_list = [], []

    with torch.no_grad():  # no gradients needed while evaluating
        for images, labels in loader_valid:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            epoch_valid_loss += criterion(outputs, labels).item()

            # Class probabilities, and the true labels one-hot encoded via rows of the
            # 4x4 identity matrix, both collected for the ROC AUC below.
            preds_list.extend(torch.softmax(outputs.cpu(), dim=1).numpy())
            true_onehot_list.extend(torch.eye(4, device=labels.device)[labels].cpu().numpy())

    print(f'epoch[{epoch+1}/{epochs}] - valid loss : {epoch_valid_loss/len(loader_valid):.4f} / valid ROC AUC : {roc_auc_score(true_onehot_list, preds_list):.4f}')

  0%|          | 0/410 [00:00<?, ?it/s]

epoch[1/39] - loss : 1.2767
epoch[1/39] - valid loss : 0.6686 / valid ROC AUC : 0.9097


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[2/39] - loss : 0.5623
epoch[2/39] - valid loss : 0.2439 / valid ROC AUC : 0.9576


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[3/39] - loss : 0.3983
epoch[3/39] - valid loss : 0.2014 / valid ROC AUC : 0.9640


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[4/39] - loss : 0.2658
epoch[4/39] - valid loss : 0.3647 / valid ROC AUC : 0.9667


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[5/39] - loss : 0.2146
epoch[5/39] - valid loss : 0.1889 / valid ROC AUC : 0.9627


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[6/39] - loss : 0.1683
epoch[6/39] - valid loss : 0.1658 / valid ROC AUC : 0.9787


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[7/39] - loss : 0.1129
epoch[7/39] - valid loss : 0.1542 / valid ROC AUC : 0.9699


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[8/39] - loss : 0.0825
epoch[8/39] - valid loss : 0.1745 / valid ROC AUC : 0.9798


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[9/39] - loss : 0.0714
epoch[9/39] - valid loss : 0.2507 / valid ROC AUC : 0.9812


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[10/39] - loss : 0.0874
epoch[10/39] - valid loss : 0.2158 / valid ROC AUC : 0.9707


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[11/39] - loss : 0.0533
epoch[11/39] - valid loss : 0.2491 / valid ROC AUC : 0.9687


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[12/39] - loss : 0.0595
epoch[12/39] - valid loss : 0.2090 / valid ROC AUC : 0.9669


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[13/39] - loss : 0.0456
epoch[13/39] - valid loss : 0.2103 / valid ROC AUC : 0.9830


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[14/39] - loss : 0.0386
epoch[14/39] - valid loss : 0.1919 / valid ROC AUC : 0.9823


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[15/39] - loss : 0.0283
epoch[15/39] - valid loss : 0.2697 / valid ROC AUC : 0.9770


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[16/39] - loss : 0.0368
epoch[16/39] - valid loss : 0.2432 / valid ROC AUC : 0.9831


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[17/39] - loss : 0.0347
epoch[17/39] - valid loss : 0.2473 / valid ROC AUC : 0.9732


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[18/39] - loss : 0.0155
epoch[18/39] - valid loss : 0.2432 / valid ROC AUC : 0.9774


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[19/39] - loss : 0.0123
epoch[19/39] - valid loss : 0.2575 / valid ROC AUC : 0.9838


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[20/39] - loss : 0.0171
epoch[20/39] - valid loss : 0.2239 / valid ROC AUC : 0.9839


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[21/39] - loss : 0.0301
epoch[21/39] - valid loss : 0.2214 / valid ROC AUC : 0.9766


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[22/39] - loss : 0.0151
epoch[22/39] - valid loss : 0.1972 / valid ROC AUC : 0.9802


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[23/39] - loss : 0.0164
epoch[23/39] - valid loss : 0.2314 / valid ROC AUC : 0.9885


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[24/39] - loss : 0.0146
epoch[24/39] - valid loss : 0.1649 / valid ROC AUC : 0.9888


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[25/39] - loss : 0.0197
epoch[25/39] - valid loss : 0.1692 / valid ROC AUC : 0.9832


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[26/39] - loss : 0.0126
epoch[26/39] - valid loss : 0.1817 / valid ROC AUC : 0.9811


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[27/39] - loss : 0.0096
epoch[27/39] - valid loss : 0.2079 / valid ROC AUC : 0.9820


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[28/39] - loss : 0.0071
epoch[28/39] - valid loss : 0.2092 / valid ROC AUC : 0.9848


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[29/39] - loss : 0.0162
epoch[29/39] - valid loss : 0.1828 / valid ROC AUC : 0.9876


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[30/39] - loss : 0.0095
epoch[30/39] - valid loss : 0.1994 / valid ROC AUC : 0.9859


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[31/39] - loss : 0.0081
epoch[31/39] - valid loss : 0.1987 / valid ROC AUC : 0.9876


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[32/39] - loss : 0.0040
epoch[32/39] - valid loss : 0.1882 / valid ROC AUC : 0.9863


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[33/39] - loss : 0.0083
epoch[33/39] - valid loss : 0.1883 / valid ROC AUC : 0.9868


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[34/39] - loss : 0.0055
epoch[34/39] - valid loss : 0.1847 / valid ROC AUC : 0.9839


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[35/39] - loss : 0.0047
epoch[35/39] - valid loss : 0.1810 / valid ROC AUC : 0.9850


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[36/39] - loss : 0.0055
epoch[36/39] - valid loss : 0.1849 / valid ROC AUC : 0.9839


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[37/39] - loss : 0.0039
epoch[37/39] - valid loss : 0.1874 / valid ROC AUC : 0.9838


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[38/39] - loss : 0.0058
epoch[38/39] - valid loss : 0.1873 / valid ROC AUC : 0.9835


  0%|          | 0/410 [00:00<?, ?it/s]

epoch[39/39] - loss : 0.0040
epoch[39/39] - valid loss : 0.1895 / valid ROC AUC : 0.9842


In [13]:
# 12. 예측 
# TTA : 테스트데이터에 여러 변환을 적용하여 예측하고, 값을 평균한다 (데이터 증가 + 앙상블)

# Plain test dataset / loader: deterministic preprocessing only (transform_test).
# shuffle=False keeps batches in row order so predictions line up with `test`.
dataset_test = ImageDataset(test, img_dir = img_dir, transform=transform_test, is_test=True)
loader_test = DataLoader(dataset_test, batch_size = batch_size, shuffle=False, worker_init_fn = seed_worker, generator=g, num_workers=2)

# TTA dataset / loader: test-time augmentation needs the *augmenting* pipeline.
# The original passed transform_test here, which made every TTA pass identical
# to the plain prediction; transform_train supplies the random transforms.
dataset_TTA = ImageDataset(test, img_dir = img_dir, transform=transform_train, is_test=True)
loader_TTA = DataLoader(dataset_TTA, batch_size = batch_size, shuffle=False, worker_init_fn = seed_worker, generator=g, num_workers=2)

In [15]:
# 타겟확률 예측 - 원본데이터 사용 
model.eval()
# One row of class probabilities per test image, filled in batch by batch.
preds = np.zeros((len(test), 4))

with torch.no_grad():
    for i, images in enumerate(loader_test):
        images = images.to(device)
        outputs = model(images)
        # Keep the batch dimension (no squeeze) so a final batch of size 1 stays 2-D.
        preds_part = torch.softmax(outputs.cpu(), dim=1).numpy()
        start = i * batch_size
        preds[start:start + len(preds_part)] += preds_part

# Write the predicted probabilities INTO the submission frame. The original
# assignment was reversed and left the sample-submission values untouched.
submission_test = submission.copy()
submission_test[['healthy', 'multiple_diseases', 'rust', 'scab']] = preds
# Kept so the name `preds_test` still exists after this cell; it now holds the filled columns.
preds_test = submission_test[['healthy', 'multiple_diseases', 'rust', 'scab']]

# Submission file
submission_test.to_csv('submission_test.csv', index=False)

In [16]:
# 타겟확률 예측 - TTA 5회 사용 
num_TTA = 5
# Running sum of class probabilities over all TTA passes, one row per test image.
preds_tta = np.zeros((len(test), 4))

model.eval()
with torch.no_grad():
    for _ in range(num_TTA):
        # Iterate the dedicated TTA loader (the original looped over loader_test),
        # and use a distinct batch index instead of shadowing the pass counter.
        for batch_idx, images in enumerate(loader_TTA):
            images = images.to(device)
            outputs = model(images)
            # Keep the batch dimension (no squeeze) so a final batch of size 1 stays 2-D.
            preds_part = torch.softmax(outputs.cpu(), dim=1).numpy()
            start = batch_idx * batch_size
            preds_tta[start:start + len(preds_part)] += preds_part

# Average the accumulated probabilities over the passes.
preds_tta /= num_TTA

# Write the averaged probabilities INTO the submission frame. The original
# assignment was reversed and replaced preds_tta with the sample-submission values.
submission_tta = submission.copy()
submission_tta[['healthy', 'multiple_diseases', 'rust', 'scab']] = preds_tta

# Submission file
submission_tta.to_csv('submission_tta.csv', index=False)

--- 
#### Label Smoothing 
- Overfitting 방지 기법 중 하나
- 타겟값일 확률을 1에 매우 가깝게 예측하는 경우, 즉 모델의 과잉확신으로 일반화 성능이 낮아지는 것을 방지 
---

In [18]:
# 레이블 스무딩 함수 
def apply_label_smoothing(df, target, alpha, threshold):
    """Smooth over-confident rows of predictions toward the uniform distribution.

    A row is smoothed when at least one of its ``target`` values is strictly
    greater than ``threshold``. Every value ``p`` in such a row becomes
    ``(1 - alpha) * p + alpha / k`` with ``k = len(target)``; all other rows are
    returned as they were. ``df`` itself is not modified.

    Args:
        df: DataFrame containing the ``target`` columns.
        target: List of column names to smooth.
        alpha: Smoothing strength.
        threshold: Confidence level above which a row is smoothed.

    Returns:
        A new float DataFrame with only the ``target`` columns and the same
        index as ``df``.
    """
    selected = df[target]
    # Work on an explicit float copy so the caller's frame is never touched and
    # rows are addressed by position. The original wrote back with
    # `.iloc[<index label>]`, which is only correct for a default RangeIndex.
    values = selected.to_numpy(dtype=float, copy=True)
    confident = (values > threshold).any(axis=1)
    if confident.any():
        values[confident] = (1 - alpha) * values[confident] + alpha / len(target)
    return pd.DataFrame(values, index=selected.index, columns=selected.columns)

# alpha     : strength of the label smoothing
# threshold : a row is smoothed only when one of its target values exceeds this
alpha, threshold = 0.001, 0.999

# Names of the target columns.
target = ['healthy', 'multiple_diseases', 'rust', 'scab']

# Work on copies so the un-smoothed submission frames stay available.
submission_test_ls = submission_test.copy()
submission_tta_ls = submission_tta.copy()

# Smooth each frame's target columns, then write its submission file.
for frame, csv_name in (
    (submission_test_ls, 'submission_test_ls.csv'),
    (submission_tta_ls, 'submission_tta_ls.csv'),
):
    frame[target] = apply_label_smoothing(frame, target, alpha, threshold)
    frame.to_csv(csv_name, index=False)