# Library import

In [3]:
# 필요한 라이브러리를 임포트합니다.
import os
from typing import Tuple, Callable, Union, List
import random

import cv2
import timm
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import models, transforms
from torchvision.transforms import AutoAugment, AutoAugmentPolicy
import albumentations as A
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
from albumentations.pytorch import ToTensorV2
from PIL import Image


In [4]:
# 랜덤 시드 설정 함수
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# 시드 값 설정
seed = 42
set_seed(seed)

# Dataset Class

In [5]:
class CustomDataset(Dataset):
    def __init__(
        self, 
        root_dir: str, 
        info_df: pd.DataFrame, 
        transform: Callable = None,  # transform을 선택적으로 설정
        is_inference: bool = False
    ):
        self.root_dir = root_dir
        self.transform = transform
        self.is_inference = is_inference
        self.image_paths = info_df['image_path'].tolist()
        
        if not self.is_inference:
            self.targets = info_df['target'].tolist()

    def __len__(self) -> int:
        return len(self.image_paths)

    def __getitem__(self, index: int) -> Union[Tuple[torch.Tensor, int], Image.Image]:
        img_path = os.path.join(self.root_dir, self.image_paths[index])
        image = cv2.imread(img_path, cv2.IMREAD_COLOR)
        if image is None:
            raise FileNotFoundError(f"Image not found: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)
        
        if self.is_inference:
            return image  # TTA를 위해 원본 이미지 반환
        else:
            image = self.transform(image)
            target = self.targets[index]
            return image, target

# Transform Class

In [6]:
class TorchvisionTransform:
    def __init__(self, is_train: bool = True):
        common_transforms = [
            transforms.Resize((448, 448)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
        
        if is_train:
            self.transform = transforms.Compose(
                [
                    AutoAugment(policy=AutoAugmentPolicy.IMAGENET),  # 훈련 시 AutoAugment 적용
                ] + common_transforms
            )
        else:
            self.transform = transforms.Compose(common_transforms)

    def __call__(self, image: Image.Image) -> torch.Tensor:
        return self.transform(image)

In [7]:
class AlbumentationsTransform:
    def __init__(self, is_train: bool = True):
        # 공통 변환 설정: 이미지 리사이즈, 정규화, 텐서 변환
        common_transforms = [
            A.Resize(448, 448),  # 이미지를 448x448 크기로 리사이즈
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # 정규화
            ToTensorV2()  # albumentations에서 제공하는 PyTorch 텐서 변환
        ]
        
        if is_train:
            # 훈련용 변환: 랜덤 수평 뒤집기, 랜덤 회전, 랜덤 밝기 및 대비 조정 추가
            self.transform = A.Compose(
                [
                    A.HorizontalFlip(p=0.5),  # 50% 확률로 이미지를 수평 뒤집기
                    A.Rotate(limit=15),  # 최대 15도 회전
                    A.RandomBrightnessContrast(p=0.2),  # 밝기 및 대비 무작위 조정
                ] + common_transforms
            )
        else:
            # 검증/테스트용 변환: 공통 변환만 적용
            self.transform = A.Compose(common_transforms)

    def __call__(self, image) -> torch.Tensor:
        # 이미지가 NumPy 배열인지 확인
        if not isinstance(image, np.ndarray):
            raise TypeError("Image should be a NumPy array (OpenCV format).")
        
        # 이미지에 변환 적용 및 결과 반환
        transformed = self.transform(image=image)  # 이미지에 설정된 변환을 적용
        
        return transformed['image']  # 변환된 이미지의 텐서를 반환

In [8]:
class TransformSelector:
    """
    이미지 변환 라이브러리를 선택하기 위한 클래스.
    """
    def __init__(self, transform_type: str):

        # 지원하는 변환 라이브러리인지 확인
        if transform_type in ["torchvision", "albumentations"]:
            self.transform_type = transform_type
        
        else:
            raise ValueError("Unknown transformation library specified.")

    def get_transform(self, is_train: bool):
        
        # 선택된 라이브러리에 따라 적절한 변환 객체를 생성
        if self.transform_type == 'torchvision':
            transform = TorchvisionTransform(is_train=is_train)
        
        elif self.transform_type == 'albumentations':
            transform = AlbumentationsTransform(is_train=is_train)
        
        return transform

# Model Class

In [9]:
class SimpleCNN(nn.Module):
    """
    간단한 CNN 아키텍처를 정의하는 클래스.
    """
    def __init__(self, num_classes: int):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        
        # 순전파 함수 정의
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        x = torch.flatten(x, 1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

In [10]:
class TorchvisionModel(nn.Module):
    """
    Torchvision에서 제공하는 사전 훈련된 모델을 사용하는 클래스.
    """
    def __init__(
        self, 
        model_name: str, 
        num_classes: int, 
        pretrained: bool
    ):
        super(TorchvisionModel, self).__init__()
        self.model = models.__dict__[model_name](pretrained=pretrained)
        
        # 모델의 최종 분류기 부분을 사용자 정의 클래스 수에 맞게 조정
        if 'fc' in dir(self.model):
            num_ftrs = self.model.fc.in_features
            self.model.fc = nn.Linear(num_ftrs, num_classes)
        
        elif 'classifier' in dir(self.model):
            num_ftrs = self.model.classifier[-1].in_features
            self.model.classifier[-1] = nn.Linear(num_ftrs, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        
        return self.model(x)

In [11]:
class TimmModel(nn.Module):
    """
    Timm 라이브러리를 사용하여 다양한 사전 훈련된 모델을 제공하는 클래스.
    """
    def __init__(
        self, 
        model_name: str, 
        num_classes: int, 
        pretrained: bool
    ):
        super(TimmModel, self).__init__()
        self.model = timm.create_model(
            model_name, 
            pretrained=pretrained, 
            num_classes=num_classes
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        
        return self.model(x)

In [12]:
class ModelSelector:
    """
    사용할 모델 유형을 선택하는 클래스.
    """
    def __init__(
        self, 
        model_type: str, 
        num_classes: int, 
        **kwargs
    ):
        
        # 모델 유형에 따라 적절한 모델 객체를 생성
        if model_type == 'simple':
            self.model = SimpleCNN(num_classes=num_classes)
        
        elif model_type == 'torchvision':
            self.model = TorchvisionModel(num_classes=num_classes, **kwargs)
        
        elif model_type == 'timm':
            self.model = TimmModel(num_classes=num_classes, **kwargs)
        
        else:
            raise ValueError("Unknown model type specified.")

    def get_model(self) -> nn.Module:

        # 생성된 모델 객체 반환
        return self.model

# Loss Class

In [13]:
# Asymmetric Loss 정의 (다중 클래스 분류에 맞게 수정)
class AsymmetricLoss(nn.Module):
    def __init__(self, gamma_pos=1, gamma_neg=4, clip=0.05, eps=1e-8, reduction='mean'):
        """
        Args:
            gamma_pos (float): 양성 클래스에 대한 초점 맞추기 파라미터.
            gamma_neg (float): 음성 클래스에 대한 초점 맞추기 파라미터.
            clip (float): 음성 예측에 대한 임계값.
            eps (float): 로그 함수의 안정성을 위한 작은 값.
            reduction (str): 손실의 합산 방식 ('mean' | 'sum' | 'none').
        """
        super(AsymmetricLoss, self).__init__()
        self.gamma_pos = gamma_pos
        self.gamma_neg = gamma_neg
        self.clip = clip
        self.eps = eps
        self.reduction = reduction

    def forward(self, inputs, targets):
        """
        Args:
            inputs: 모델의 출력 logits (batch_size, num_classes).
            targets: 실제 정답 라벨 (batch_size).
        """
        # 소프트맥스 확률 계산
        probs = F.softmax(inputs, dim=1)  # (batch_size, num_classes)
        
        # 정답 클래스에 대한 확률 추출
        targets_one_hot = F.one_hot(targets, num_classes=inputs.size(1))  # (batch_size, num_classes)
        targets_one_hot = targets_one_hot.type_as(probs)
        p_t = (probs * targets_one_hot).sum(dim=1)  # (batch_size)
        
        # 양성 클래스 손실 계산
        loss_pos = -((1 - p_t) ** self.gamma_pos) * torch.log(p_t + self.eps)
        
        # 음성 클래스 손실 계산
        p_n = (probs * (1 - targets_one_hot)).sum(dim=1)  # (batch_size)
        loss_neg = -((p_n) ** self.gamma_neg) * torch.log(1 - p_n + self.eps)
        
        # 비대칭 손실 합산
        loss = loss_pos + loss_neg

        # 손실 합산 방식
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

In [14]:
class Loss(nn.Module):
    """
    모델의 손실함수를 계산하는 클래스. 현재 AsymmetricLoss를 사용하도록 설정.
    """
    def __init__(self, loss_type='asymmetric'):
        super(Loss, self).__init__()
        if loss_type == 'cross_entropy':
            self.loss_fn = nn.CrossEntropyLoss()
        elif loss_type == 'asymmetric':
            self.loss_fn = AsymmetricLoss(gamma_pos=1, gamma_neg=4, clip=0.05, reduction='mean')
        else:
            raise ValueError("Unsupported loss type.")

    def forward(
        self, 
        outputs: torch.Tensor, 
        targets: torch.Tensor
    ) -> torch.Tensor:
        
        return self.loss_fn(outputs, targets)

# Trainer Class

In [15]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001):
        """
        Args:
            patience (int): 개선이 없을 때 몇 에포크를 기다릴지
            min_delta (float): 성능이 개선되었다고 판단하는 최소 변화량
        """
        self.patience = patience  # 개선되지 않아도 기다리는 최대 에포크 수
        self.min_delta = min_delta  # 성능 개선이 없다고 판단하는 최소 변화량
        self.counter = 0  # 개선되지 않은 에포크 수 카운트
        self.best_loss = None  # 검증 손실의 최저값
        self.early_stop = False  # 중지 플래그
    
    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss  # 첫 번째 에포크의 손실 저장
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss  # 손실이 개선되면 갱신
            self.counter = 0  # 카운터 초기화
        else:
            self.counter += 1  # 손실이 개선되지 않으면 카운터 증가
            if self.counter >= self.patience:
                self.early_stop = True  # patience를 초과하면 학습 중지

In [16]:
from torch.cuda.amp import autocast, GradScaler  # Mixed precision

class Trainer:
    def __init__(
        self, 
        model: nn.Module, 
        device: torch.device, 
        train_loader: DataLoader, 
        val_loader: DataLoader, 
        optimizer: optim.Optimizer,
        scheduler: optim.lr_scheduler._LRScheduler,
        loss_fn: torch.nn.modules.loss._Loss, 
        epochs: int,
        result_path: str,
        fold_idx: int,  # 폴드 인덱스 추가
        patience: int = 3,  # Early Stopping patience 조정
        min_delta: float = 0.001  # Early Stopping min_delta 조정
    ):
        # 클래스 초기화: 모델, 디바이스, 데이터 로더 등 설정
        self.model = model  # 훈련할 모델
        self.device = device  # 연산을 수행할 디바이스 (CPU or GPU)
        self.train_loader = train_loader  # 훈련 데이터 로더
        self.val_loader = val_loader  # 검증 데이터 로더
        self.optimizer = optimizer  # 최적화 알고리즘
        self.scheduler = scheduler  # 학습률 스케줄러
        self.loss_fn = loss_fn  # 손실 함수
        self.epochs = epochs  # 총 훈련 에폭 수
        self.result_path = result_path  # 모델 저장 경로
        self.fold_idx = fold_idx  # 현재 폴드 인덱스 저장
        self.best_models = []  # 가장 좋은 상위 3개 모델의 정보를 저장할 리스트
        self.lowest_loss = float('inf')  # 가장 낮은 Loss를 저장할 변수
        self.early_stopping = EarlyStopping(patience=patience, min_delta=min_delta)  # EarlyStopping 초기화

    def save_model(self, epoch, loss):
        # 모델 저장 경로 설정
        os.makedirs(self.result_path, exist_ok=True)

        # 현재 에폭 모델 저장
        current_model_path = os.path.join(
            self.result_path, 
            f'fold_{self.fold_idx}_epoch_{epoch}_loss_{loss:.4f}.pt'
        )
        torch.save(self.model.state_dict(), current_model_path)

        # 최상위 3개 모델 관리
        self.best_models.append((loss, epoch, current_model_path))
        self.best_models.sort()
        if len(self.best_models) > 3:
            _, _, path_to_remove = self.best_models.pop(-1)  # 가장 높은 손실 모델 삭제
            if os.path.exists(path_to_remove):
                os.remove(path_to_remove)

        # 가장 낮은 손실의 모델 저장
        if loss < self.lowest_loss:
            self.lowest_loss = loss
            best_model_path = os.path.join(self.result_path, f'fold_{self.fold_idx}_best_model.pt')
            torch.save(self.model.state_dict(), best_model_path)
            print(f"Fold {self.fold_idx}: Save {epoch} epoch result. Loss = {loss:.4f}")

    def train_epoch(self) -> tuple:
        # 한 에폭 동안의 훈련을 진행
        self.model.train()
        
        total_loss = 0.0
        correct = 0
        total = 0
        progress_bar = tqdm(self.train_loader, desc="Training", leave=False)
        scaler = GradScaler()  # AMP를 위한 GradScaler 객체 생성

        for images, targets in progress_bar:
            images, targets = images.to(self.device), targets.to(self.device)
            self.optimizer.zero_grad()

            # autocast 컨텍스트 내에서 모델을 실행하여 정밀도를 관리
            with autocast():
                outputs = self.model(images)
                loss = self.loss_fn(outputs, targets)

            # 스케일링된 손실을 사용하여 역전파 실행
            scaler.scale(loss).backward()

            # 스케일러를 사용해 가중치를 업데이트
            scaler.step(self.optimizer)
            scaler.update()

            total_loss += loss.item()

            # 정확도 계산
            _, predicted = outputs.max(1)
            correct += predicted.eq(targets).sum().item()
            total += targets.size(0)

            progress_bar.set_postfix(loss=loss.item())

        # 전체 정확도 계산
        train_accuracy = 100.0 * correct / total
        return total_loss / len(self.train_loader), train_accuracy

    def validate(self) -> tuple:
        # 모델의 검증을 진행
        self.model.eval()
        
        total_loss = 0.0
        correct = 0
        total = 0
        progress_bar = tqdm(self.val_loader, desc="Validating", leave=False)
        
        with torch.no_grad():
            for images, targets in progress_bar:
                images, targets = images.to(self.device), targets.to(self.device)
                outputs = self.model(images)
                loss = self.loss_fn(outputs, targets)
                total_loss += loss.item()

                # 정확도 계산
                _, predicted = outputs.max(1)
                correct += predicted.eq(targets).sum().item()
                total += targets.size(0)

                progress_bar.set_postfix(loss=loss.item())

        # 전체 정확도 계산
        val_accuracy = 100.0 * correct / total
        return total_loss / len(self.val_loader), val_accuracy

    def train(self) -> None:
        # 전체 훈련 과정을 관리
        for epoch in range(self.epochs):
            print(f"Epoch {epoch+1}/{self.epochs}")

            train_loss, train_accuracy = self.train_epoch()
            val_loss, val_accuracy = self.validate()

            print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")
            print(f"Epoch {epoch+1}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%\n")

            self.save_model(epoch, val_loss)

            # Early Stopping 조건 확인
            self.early_stopping(val_loss)
            if self.early_stopping.early_stop:
                print("Early stopping triggered. Stopping training...")
                break

            self.scheduler.step()

# Model Training

In [17]:
# 학습에 사용할 장비를 선택.
# torch라이브러리에서 gpu를 인식할 경우, cuda로 설정.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
# 학습 데이터의 경로와 정보를 가진 파일의 경로를 설정.
traindata_dir = "/data/ephemeral/home/dog_remove/data/train"
traindata_info_file = "/data/ephemeral/home/dog_remove/data/train_cleaned2.csv"
save_result_path = "/data/ephemeral/home/youngtae/model2/Asymmmetric_result"

In [19]:
# 학습 데이터의 class, image path, target에 대한 정보가 들어있는 csv파일을 읽기.
train_info = pd.read_csv(traindata_info_file)

# 총 class의 수를 측정.
num_classes = len(train_info['target'].unique())

In [20]:
# Stratified K-Fold 설정
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

In [20]:
for fold_idx, (train_index, val_index) in enumerate(skf.split(train_info, train_info['target']), 1):
    print(f'Fold {fold_idx}')

    # 각 폴드에 따라 데이터 분리
    train_df = train_info.iloc[train_index].reset_index(drop=True)
    val_df = train_info.iloc[val_index].reset_index(drop=True)

    # Transform 설정
    transform_selector = TransformSelector(
        transform_type="torchvision"  # "torchvision" 또는 "albumentations"
    )
    train_transform = transform_selector.get_transform(is_train=True)
    val_transform = transform_selector.get_transform(is_train=False)

    # Dataset 생성
    train_dataset = CustomDataset(
        root_dir=traindata_dir,
        info_df=train_df,
        transform=train_transform
    )
    val_dataset = CustomDataset(
        root_dir=traindata_dir,
        info_df=val_df,
        transform=val_transform
    )

    # DataLoader에서 worker_init_fn을 설정하여 각 워커의 시드를 고정
    def worker_init_fn(worker_id):
        np.random.seed(seed + worker_id)
        random.seed(seed + worker_id)

    # DataLoader 생성
    train_loader = DataLoader(
        train_dataset, 
        batch_size=64, 
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        worker_init_fn=worker_init_fn
    )
    val_loader = DataLoader(
        val_dataset, 
        batch_size=64, 
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        worker_init_fn=worker_init_fn
    )

    # 학습에 사용할 Model을 선언합니다.
    model_selector = ModelSelector(
        model_type='timm', 
        num_classes=num_classes,
        model_name='eva02_large_patch14_448.mim_m38m_ft_in22k_in1k', 
        pretrained=True
    )
    model = model_selector.get_model()

    # 선언된 모델을 학습에 사용할 장비로 셋팅합니다.
    model.to(device)

    # 모델의 모든 파라미터를 동결 (학습되지 않도록 설정)
    for name, param in model.named_parameters():
        param.requires_grad = False

    # 마지막 레이어와 그 전 레이어만 학습되도록 설정
    # 여기서는 'blocks.22', 'blocks.23', 'head'를 예시로 들었습니다.
    # 실제 모델 구조에 따라 레이어 이름을 확인하고 수정해야 합니다.
    for name, param in model.named_parameters():
        if 'blocks.23' in name or 'blocks.22' in name or 'head' in name:
            param.requires_grad = True

    # 옵티마이저 설정: 레이어별로 다른 학습률 적용
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if ('blocks.23' not in n and 'blocks.22' not in n and 'head' not in n) and p.requires_grad],
            'lr': 1e-5,  # 그 전 레이어는 낮은 학습률
            'weight_decay': 1e-4
        },
        {
            'params': [p for n, p in model.named_parameters() if ('blocks.23' in n or 'blocks.22' in n) and p.requires_grad],
            'lr': 1e-4,  # penultimate 레이어는 중간 학습률
            'weight_decay': 1e-4
        },
        {
            'params': [p for n, p in model.named_parameters() if 'head' in n and p.requires_grad],
            'lr': 1e-3,  # head 레이어는 높은 학습률
            'weight_decay': 1e-4
        }
    ]

    optimizer = optim.AdamW(
        optimizer_grouped_parameters
    )

    # 스케줄러 초기화: Cosine Annealing with Warm Restarts 적용
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, 
        T_0=10,  # 첫 번째 주기 (에포크 수)
        T_mult=1,  # 주기 증가 배수 (여기서는 고정)
        eta_min=1e-6  # 최저 학습률
    )

    # 학습에 사용할 Loss를 선언 (AsymmetricLoss 사용)
    loss_fn = Loss(loss_type='asymmetric')

    # 결과 저장 경로 수정
    fold_result_path = os.path.join(save_result_path, f'fold_{fold_idx}')
    os.makedirs(fold_result_path, exist_ok=True)

    # Trainer 선언
    trainer = Trainer(
        model=model, 
        device=device, 
        train_loader=train_loader,
        val_loader=val_loader, 
        optimizer=optimizer,
        scheduler=scheduler,
        loss_fn=loss_fn, 
        epochs=100,
        result_path=fold_result_path,
        fold_idx=fold_idx  # 폴드 인덱스 전달
    )

    # 모델 학습.
    trainer.train()

Fold 1
Epoch 1/100


                                                                       

Epoch 1, Train Loss: 2.0688, Train Accuracy: 77.40%
Epoch 1, Validation Loss: 0.6969, Validation Accuracy: 87.33%

Fold 1: Save 0 epoch result. Loss = 0.6969
Epoch 2/100


                                                                        

Epoch 2, Train Loss: 0.4130, Train Accuracy: 91.00%
Epoch 2, Validation Loss: 0.5514, Validation Accuracy: 89.10%

Fold 1: Save 1 epoch result. Loss = 0.5514
Epoch 3/100


                                                                        

Epoch 3, Train Loss: 0.2371, Train Accuracy: 94.07%
Epoch 3, Validation Loss: 0.5181, Validation Accuracy: 90.03%

Fold 1: Save 2 epoch result. Loss = 0.5181
Epoch 4/100


                                                                        

Epoch 4, Train Loss: 0.1562, Train Accuracy: 95.59%
Epoch 4, Validation Loss: 0.4446, Validation Accuracy: 90.46%

Fold 1: Save 3 epoch result. Loss = 0.4446
Epoch 5/100


                                                                         

Epoch 5, Train Loss: 0.1111, Train Accuracy: 96.72%
Epoch 5, Validation Loss: 0.4553, Validation Accuracy: 91.96%

Epoch 6/100


                                                                         

Epoch 6, Train Loss: 0.0851, Train Accuracy: 97.86%
Epoch 6, Validation Loss: 0.4296, Validation Accuracy: 91.70%

Fold 1: Save 5 epoch result. Loss = 0.4296
Epoch 7/100


                                                                          

Epoch 7, Train Loss: 0.0522, Train Accuracy: 98.37%
Epoch 7, Validation Loss: 0.4262, Validation Accuracy: 92.26%

Fold 1: Save 6 epoch result. Loss = 0.4262
Epoch 8/100


                                                                          

Epoch 8, Train Loss: 0.0463, Train Accuracy: 98.77%
Epoch 8, Validation Loss: 0.4268, Validation Accuracy: 92.00%

Epoch 9/100


                                                                          

Epoch 9, Train Loss: 0.0337, Train Accuracy: 98.97%
Epoch 9, Validation Loss: 0.4269, Validation Accuracy: 92.20%

Epoch 10/100


                                                                          

Epoch 10, Train Loss: 0.0254, Train Accuracy: 99.14%
Epoch 10, Validation Loss: 0.4295, Validation Accuracy: 92.20%

Early stopping triggered. Stopping training...
Fold 2
Epoch 1/100


                                                                       

Epoch 1, Train Loss: 2.0843, Train Accuracy: 77.10%
Epoch 1, Validation Loss: 0.7311, Validation Accuracy: 86.23%

Fold 2: Save 0 epoch result. Loss = 0.7311
Epoch 2/100


                                                                        

Epoch 2, Train Loss: 0.4185, Train Accuracy: 91.04%
Epoch 2, Validation Loss: 0.5319, Validation Accuracy: 88.73%

Fold 2: Save 1 epoch result. Loss = 0.5319
Epoch 3/100


                                                                        

Epoch 3, Train Loss: 0.2475, Train Accuracy: 93.57%
Epoch 3, Validation Loss: 0.4858, Validation Accuracy: 90.33%

Fold 2: Save 2 epoch result. Loss = 0.4858
Epoch 4/100


                                                                        

Epoch 4, Train Loss: 0.1632, Train Accuracy: 95.41%
Epoch 4, Validation Loss: 0.4614, Validation Accuracy: 90.96%

Fold 2: Save 3 epoch result. Loss = 0.4614
Epoch 5/100


                                                                         

Epoch 5, Train Loss: 0.1167, Train Accuracy: 96.58%
Epoch 5, Validation Loss: 0.4365, Validation Accuracy: 91.36%

Fold 2: Save 4 epoch result. Loss = 0.4365
Epoch 6/100


                                                                         

Epoch 6, Train Loss: 0.0835, Train Accuracy: 97.67%
Epoch 6, Validation Loss: 0.4088, Validation Accuracy: 92.30%

Fold 2: Save 5 epoch result. Loss = 0.4088
Epoch 7/100


                                                                         

Epoch 7, Train Loss: 0.0596, Train Accuracy: 98.42%
Epoch 7, Validation Loss: 0.4014, Validation Accuracy: 92.60%

Fold 2: Save 6 epoch result. Loss = 0.4014
Epoch 8/100


                                                                          

Epoch 8, Train Loss: 0.0386, Train Accuracy: 98.78%
Epoch 8, Validation Loss: 0.3944, Validation Accuracy: 92.83%

Fold 2: Save 7 epoch result. Loss = 0.3944
Epoch 9/100


                                                                          

Epoch 9, Train Loss: 0.0395, Train Accuracy: 98.81%
Epoch 9, Validation Loss: 0.3966, Validation Accuracy: 92.93%

Epoch 10/100


                                                                          

Epoch 10, Train Loss: 0.0339, Train Accuracy: 99.01%
Epoch 10, Validation Loss: 0.3961, Validation Accuracy: 93.00%

Epoch 11/100


                                                                         

Epoch 11, Train Loss: nan, Train Accuracy: 96.65%
Epoch 11, Validation Loss: 0.5556, Validation Accuracy: 90.46%

Early stopping triggered. Stopping training...
Fold 3
Epoch 1/100


                                                                       

Epoch 1, Train Loss: nan, Train Accuracy: 77.12%
Epoch 1, Validation Loss: 0.8115, Validation Accuracy: 86.00%

Fold 3: Save 0 epoch result. Loss = 0.8115
Epoch 2/100


                                                                        

Epoch 2, Train Loss: 0.4159, Train Accuracy: 90.96%
Epoch 2, Validation Loss: 0.6416, Validation Accuracy: 88.43%

Fold 3: Save 1 epoch result. Loss = 0.6416
Epoch 3/100


                                                                        

Epoch 3, Train Loss: 0.2271, Train Accuracy: 94.21%
Epoch 3, Validation Loss: 0.5462, Validation Accuracy: 89.86%

Fold 3: Save 2 epoch result. Loss = 0.5462
Epoch 4/100


                                                                        

Epoch 4, Train Loss: 0.1584, Train Accuracy: 95.56%
Epoch 4, Validation Loss: 0.5811, Validation Accuracy: 89.56%

Epoch 5/100


                                                                         

Epoch 5, Train Loss: 0.1177, Train Accuracy: 96.68%
Epoch 5, Validation Loss: 0.5174, Validation Accuracy: 90.56%

Fold 3: Save 4 epoch result. Loss = 0.5174
Epoch 6/100


                                                                         

Epoch 6, Train Loss: 0.0714, Train Accuracy: 97.84%
Epoch 6, Validation Loss: 0.5099, Validation Accuracy: 90.70%

Fold 3: Save 5 epoch result. Loss = 0.5099
Epoch 7/100


                                                                         

Epoch 7, Train Loss: 0.0607, Train Accuracy: 98.35%
Epoch 7, Validation Loss: 0.4934, Validation Accuracy: 91.10%

Fold 3: Save 6 epoch result. Loss = 0.4934
Epoch 8/100


                                                                          

Epoch 8, Train Loss: 0.0449, Train Accuracy: 98.69%
Epoch 8, Validation Loss: 0.4955, Validation Accuracy: 91.46%

Epoch 9/100


                                                                          

Epoch 9, Train Loss: 0.0291, Train Accuracy: 99.12%
Epoch 9, Validation Loss: 0.4898, Validation Accuracy: 91.23%

Fold 3: Save 8 epoch result. Loss = 0.4898
Epoch 10/100


                                                                          

Epoch 10, Train Loss: 0.0285, Train Accuracy: 99.03%
Epoch 10, Validation Loss: 0.4909, Validation Accuracy: 91.33%

Epoch 11/100


                                                                         

Epoch 11, Train Loss: nan, Train Accuracy: 96.24%
Epoch 11, Validation Loss: 0.7025, Validation Accuracy: 88.96%

Epoch 12/100


                                                                         

Epoch 12, Train Loss: 0.2235, Train Accuracy: 95.12%
Epoch 12, Validation Loss: 0.6145, Validation Accuracy: 90.00%

Early stopping triggered. Stopping training...
Fold 4
Epoch 1/100


                                                                        

Epoch 1, Train Loss: 2.0596, Train Accuracy: 77.66%
Epoch 1, Validation Loss: 0.7965, Validation Accuracy: 86.42%

Fold 4: Save 0 epoch result. Loss = 0.7965
Epoch 2/100


                                                                        

Epoch 2, Train Loss: 0.3957, Train Accuracy: 91.02%
Epoch 2, Validation Loss: 0.5528, Validation Accuracy: 88.59%

Fold 4: Save 1 epoch result. Loss = 0.5528
Epoch 3/100


                                                                        

Epoch 3, Train Loss: 0.2529, Train Accuracy: 93.92%
Epoch 3, Validation Loss: 0.5714, Validation Accuracy: 89.43%

Epoch 4/100


                                                                        

Epoch 4, Train Loss: 0.1593, Train Accuracy: 95.81%
Epoch 4, Validation Loss: 0.5291, Validation Accuracy: 90.46%

Fold 4: Save 3 epoch result. Loss = 0.5291
Epoch 5/100


                                                                         

Epoch 5, Train Loss: 0.1187, Train Accuracy: 96.82%
Epoch 5, Validation Loss: 0.4854, Validation Accuracy: 91.26%

Fold 4: Save 4 epoch result. Loss = 0.4854
Epoch 6/100


                                                                         

Epoch 6, Train Loss: 0.0703, Train Accuracy: 97.86%
Epoch 6, Validation Loss: 0.5200, Validation Accuracy: 91.09%

Epoch 7/100


                                                                         

Epoch 7, Train Loss: 0.0688, Train Accuracy: 98.12%
Epoch 7, Validation Loss: 0.4827, Validation Accuracy: 91.43%

Fold 4: Save 6 epoch result. Loss = 0.4827
Epoch 8/100


                                                                          

Epoch 8, Train Loss: 0.0442, Train Accuracy: 98.61%
Epoch 8, Validation Loss: nan, Validation Accuracy: 91.33%

Epoch 9/100


                                                                          

Epoch 9, Train Loss: 0.0405, Train Accuracy: 98.90%
Epoch 9, Validation Loss: 0.4781, Validation Accuracy: 91.79%

Fold 4: Save 8 epoch result. Loss = 0.4781
Epoch 10/100


                                                                          

Epoch 10, Train Loss: 0.0279, Train Accuracy: 99.15%
Epoch 10, Validation Loss: nan, Validation Accuracy: 91.63%

Epoch 11/100


                                                                        

Epoch 11, Train Loss: 0.1579, Train Accuracy: 96.43%
Epoch 11, Validation Loss: 0.6213, Validation Accuracy: 90.06%

Epoch 12/100


                                                                        

Epoch 12, Train Loss: 0.1894, Train Accuracy: 95.39%
Epoch 12, Validation Loss: nan, Validation Accuracy: 89.56%

Early stopping triggered. Stopping training...
Fold 5
Epoch 1/100


                                                                       

Epoch 1, Train Loss: 2.0788, Train Accuracy: 78.04%
Epoch 1, Validation Loss: 0.7386, Validation Accuracy: 87.42%

Fold 5: Save 0 epoch result. Loss = 0.7386
Epoch 2/100


                                                                        

Epoch 2, Train Loss: 0.4159, Train Accuracy: 90.85%
Epoch 2, Validation Loss: 0.5871, Validation Accuracy: 88.86%

Fold 5: Save 1 epoch result. Loss = 0.5871
Epoch 3/100


                                                                        

Epoch 3, Train Loss: 0.2344, Train Accuracy: 94.26%
Epoch 3, Validation Loss: 0.5167, Validation Accuracy: 91.06%

Fold 5: Save 2 epoch result. Loss = 0.5167
Epoch 4/100


                                                                         

Epoch 4, Train Loss: 0.1603, Train Accuracy: 95.66%
Epoch 4, Validation Loss: 0.5259, Validation Accuracy: 90.93%

Epoch 5/100


                                                                         

Epoch 5, Train Loss: 0.1085, Train Accuracy: 97.17%
Epoch 5, Validation Loss: 0.4754, Validation Accuracy: 90.99%

Fold 5: Save 4 epoch result. Loss = 0.4754
Epoch 6/100


                                                                         

Epoch 6, Train Loss: 0.0796, Train Accuracy: 97.57%
Epoch 6, Validation Loss: 0.4894, Validation Accuracy: 91.46%

Epoch 7/100


                                                                         

Epoch 7, Train Loss: 0.0613, Train Accuracy: 98.21%
Epoch 7, Validation Loss: 0.4595, Validation Accuracy: 91.96%

Fold 5: Save 6 epoch result. Loss = 0.4595
Epoch 8/100


                                                                          

Epoch 8, Train Loss: 0.0463, Train Accuracy: 98.74%
Epoch 8, Validation Loss: 0.4616, Validation Accuracy: 92.09%

Epoch 9/100


                                                                          

Epoch 9, Train Loss: 0.0266, Train Accuracy: 99.03%
Epoch 9, Validation Loss: 0.4681, Validation Accuracy: 92.33%

Epoch 10/100


                                                                          

Epoch 10, Train Loss: 0.0373, Train Accuracy: 99.07%
Epoch 10, Validation Loss: 0.4687, Validation Accuracy: 92.49%

Early stopping triggered. Stopping training...


# Inference

In [35]:
def ensemble_inference_tta(
    models: List[nn.Module], 
    device: torch.device, 
    test_loader: DataLoader,
    base_transform: Callable,
    tta_transform: Callable,
    tta_steps: int = 3  # 메모리 부담을 줄이기 위해 TTA 단계 줄이기
):
    """
    Test Time Augmentation (TTA)을 적용한 추론 함수.
    
    Args:
        models (List[nn.Module]): 앙상블할 학습된 모델 리스트.
        device (torch.device): 연산을 수행할 디바이스.
        test_loader (DataLoader): 테스트 데이터 로더.
        base_transform (Callable): 기본 변환 함수.
        tta_transform (Callable): TTA를 위한 변환 함수.
        tta_steps (int): TTA 증강 횟수.
    
    Returns:
        List[int]: 예측된 클래스 인덱스 리스트.
    """
    if not models:
        raise ValueError("The models list is empty. Please load at least one model before inference.")
    
    for model_idx, model in enumerate(models, 1):
        model.to(device)
        model.eval()
        print(f"Model {model_idx} loaded and set to eval mode.")
    
    predictions = []
    with torch.no_grad():
        for batch_idx, images in enumerate(tqdm(test_loader, desc="TTA Inference")):
            # images는 PIL 이미지 리스트
            B = len(images)
            
            # 변수 초기화
            base_images = None
            sum_probs = None
            logits = None
            probs = None
            avg_probs = None
            tta_transformed = None
            
            try:
                # 기본 변환 적용
                base_images = torch.stack([base_transform(image) for image in images]).to(device)
                # 확률을 누적할 텐서 초기화
                sum_probs = torch.zeros(B, num_classes).to(device)
            except Exception as e:
                print(f"Error in base_transform at batch {batch_idx}: {e}")
                continue  # 다음 배치로 넘어갑니다.
            
            for model_idx, model in enumerate(models, 1):
                try:
                    # 기본 이미지 예측
                    logits = model(base_images)
                    probs = F.softmax(logits, dim=1)
                    sum_probs += probs
                except Exception as e:
                    print(f"Error in model {model_idx} during base image inference: {e}")
                    continue  # 다음 모델로 넘어갑니다.
                
                for step in range(tta_steps):
                    try:
                        # TTA 변환 적용
                        tta_transformed = torch.stack([tta_transform(image) for image in images]).to(device)
                        logits = model(tta_transformed)
                        probs = F.softmax(logits, dim=1)
                        sum_probs += probs / tta_steps  # TTA 확률을 평균으로 반영
                        # 메모리 해제
                        del tta_transformed, logits, probs
                    except Exception as e:
                        print(f"Error in model {model_idx} during TTA step {step+1}: {e}")
                        continue  # 다음 TTA 단계로 넘어갑니다.
            
            try:
                # 모델 수로 나누어 평균 확률 계산
                avg_probs = sum_probs / len(models)
                
                # 예측 클래스 결정 (Top-1)
                preds = avg_probs.argmax(dim=1)
                predictions.extend(preds.cpu().numpy())
            except Exception as e:
                print(f"Error during probability averaging or prediction at batch {batch_idx}: {e}")
                continue  # 다음 배치로 넘어갑니다.
            
            # 메모리 해제
            variables_to_del = ['base_images', 'sum_probs', 'logits', 'probs', 'avg_probs']
            for var in variables_to_del:
                try:
                    del globals()[var]
                except KeyError:
                    try:
                        del locals()[var]
                    except KeyError:
                        pass  # 변수가 존재하지 않으면 무시합니다.
            torch.cuda.empty_cache()  # 캐시 비우기
    
    return predictions


In [36]:
# 추론 데이터의 경로와 정보를 가진 파일의 경로를 설정.
testdata_dir = "/data/ephemeral/home/dog_remove/data/test"
testdata_info_file = "/data/ephemeral/home/dog_remove/data/test.csv"
save_result_path = "/data/ephemeral/home/youngtae/model2/Asymmmetric_result"

In [37]:
# 추론 데이터의 class, image path, target에 대한 정보가 들어있는 csv파일을 읽기.
test_info = pd.read_csv(testdata_info_file)

# 총 class 수.
num_classes = len(train_info['target'].unique())  # 일관성 유지

In [38]:
# 추론에 사용할 Dataset을 선언.
test_dataset = CustomDataset(
    root_dir=testdata_dir,
    info_df=test_info,
    transform=None,  # 변환은 직접 적용
    is_inference=True
)
# DataLoader에서 worker_init_fn을 설정하여 각 워커의 시드를 고정
def worker_init_fn(worker_id):
    np.random.seed(seed + worker_id)
    random.seed(seed + worker_id)

# 추론에 사용할 DataLoader를 선언합니다.
# 배치 크기와 TTA 단계를 줄여 메모리 사용량 감소
batch_size = 16  # 배치 크기 감소
tta_steps = 3    # TTA 단계 감소

test_loader = DataLoader(
    test_dataset, 
    batch_size=batch_size, 
    shuffle=False,
    drop_last=False,
    num_workers=4,
    pin_memory=True,
    worker_init_fn=worker_init_fn,
    collate_fn=lambda x: x  # PIL 이미지 리스트 반환
)



In [39]:
# 앙상블할 모델들을 저장할 리스트
models = []

# 각 폴드의 모델을 불러오기
for fold_idx in range(1, n_splits + 1):
    # 모델 초기화
    model_selector = ModelSelector(
        model_type='timm', 
        num_classes=num_classes,
        model_name='eva02_large_patch14_448.mim_m38m_ft_in22k_in1k', 
        pretrained=False
    )
    model = model_selector.get_model()

    # 모델의 모든 파라미터를 동결 (학습되지 않도록 설정)
    for name, param in model.named_parameters():
        param.requires_grad = False

    # 마지막 레이어와 그 전 레이어만 학습되도록 설정
    for name, param in model.named_parameters():
        if 'blocks.23' in name or 'blocks.22' in name or 'head' in name:
            param.requires_grad = True

    # 모델 가중치 로드
    fold_result_path = os.path.join(save_result_path, f'fold_{fold_idx}')
    model_path = os.path.join(fold_result_path, f'fold_{fold_idx}_best_model.pt')
    model.load_state_dict(
        torch.load(
            model_path,
            map_location=device
        )
    )

    # 모델을 리스트에 추가
    models.append(model)

In [40]:
# TTA를 포함한 앙상블 추론 함수 호출
# 여기서 AutoAugment는 tta_transform에 포함되어 TTA 과정에서 적용됩니다.
# base_transform과 tta_transform은 학습 시 사용한 것과 동일한 방식으로 정의해야 합니다.
transform_selector = TransformSelector(transform_type="torchvision")
base_transform = transform_selector.get_transform(is_train=False)
tta_transform = transform_selector.get_transform(is_train=True)  # TTA는 훈련 시 변환과 동일하게 설정


In [41]:
predictions = ensemble_inference_tta(
    models=models, 
    device=device, 
    test_loader=test_loader,
    base_transform=base_transform,
    tta_transform=tta_transform,  # TTA 변환에 AutoAugment가 포함됨
    tta_steps=tta_steps  # TTA 증강 횟수
)


Model 1 loaded and set to eval mode.
Model 2 loaded and set to eval mode.
Model 3 loaded and set to eval mode.
Model 4 loaded and set to eval mode.
Model 5 loaded and set to eval mode.


TTA Inference: 100%|██████████| 626/626 [3:42:50<00:00, 21.36s/it]  


In [42]:
# 예측 결과를 CSV 형식에 맞게 저장
test_info['target'] = predictions
test_info = test_info.reset_index().rename(columns={"index": "ID"})

In [43]:
# 결과 저장
test_info.to_csv("Asymmetric.csv", index=False)