# 1. Setup & Imports

## 1.1 Setup

In [1]:
import os
import wandb
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment.
load_dotenv()
# None when WANDB_API_KEY is not set in the environment.
wandb_api_key = os.getenv("WANDB_API_KEY")

# Authenticate against Weights & Biases; left as the last expression so the cell displays the result.
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mparaise[0m ([33mparaise-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## 1.2 Imports

In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='pydantic')

import gc
import math
from scipy.optimize import minimize
import shutil
import glob
import random
from types import SimpleNamespace
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import cv2
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.swa_utils import update_bn
from torch.utils.data import Dataset, DataLoader

import lightning.pytorch as pl
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import TQDMProgressBar

from torchmetrics import ConfusionMatrix, AUROC
import timm
import ttach as tta
import albumentations as A
from albumentations.pytorch import ToTensorV2
from transformers import get_cosine_schedule_with_warmup

import seaborn as sns
import matplotlib.pyplot as plt

# Turn off OpenCV's own thread pool (presumably to avoid oversubscription with DataLoader workers — confirm).
cv2.setNumThreads(0)
# Show the CPU count of this machine as a reference when choosing CFG.num_workers.
print(os.cpu_count())
# torch.set_float32_matmul_precision('medium') # L4

8


## 1.3 Configuration

In [3]:
class CFG:
    """Experiment-wide hyperparameters and settings, read as plain class attributes."""
    model_arch = 'convnext_small.fb_in22k_ft_in1k'  # architecture name handed to timm.create_model
    is_bn = False  # True -> re-estimate BatchNorm statistics after checkpoint averaging
    seed = 855  # seed for set_seed() and for the DataLoader generator
    lr = 0.0001  # AdamW learning rate
    weight_decay = 0.05  # AdamW weight decay
    alpha = 0.5  # weight of the hard-label loss; (1 - alpha) weights the soft-label loss
    T = 2.0  # distillation temperature; only the commented-out training_step variant reads it
    drop_path_rate = 0.2  # drop_path_rate passed to timm.create_model
    top_k = 3  # number of best checkpoints kept per fold (and later averaged)
    n_folds = 5  # number of cross-validation folds iterated by ExperimentRunner
    epochs = 25  # max epochs per fold; also sets the scheduler's total step count
    warmup_multiplier = 2  # warmup length = steps_per_epoch * warmup_multiplier (i.e. warmup epochs)
    batch_size = 32  # training batch size; validation/test loaders use 4x this value
    accum_iter = 1  # gradient accumulation factor (Trainer accumulate_grad_batches)
    num_workers = 4  # DataLoader worker processes
    persistent_workers=True  # intended DataLoader persistent_workers setting
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # GPU when available, else CPU
    project_name = 'PlantPathology2020'  # W&B project name
    exp_name = 's10_convnext_small_T_edit'  # W&B group name and model output sub-directory

In [4]:
def set_seed(seed, deterministic=False):
    """Seed every random number generator used by the notebook.

    Args:
        seed: Integer seed applied to Python hashing, `random`, NumPy and
            PyTorch (CPU, current GPU and all GPUs).
        deterministic: When truthy, request deterministic cuDNN/cuBLAS
            behaviour and disable cuDNN autotuning; otherwise enable the
            autotuner and allow non-deterministic kernels.
    """
    strict = bool(deterministic)

    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,           # CPU
        torch.cuda.manual_seed,      # current GPU
        torch.cuda.manual_seed_all,  # every GPU
    )
    for apply_seed in seeders:
        apply_seed(seed)

    if strict:
        # Set alongside deterministic algorithms; cuBLAS reads this workspace setting.
        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    torch.backends.cudnn.deterministic = strict
    torch.backends.cudnn.benchmark = not strict
    torch.use_deterministic_algorithms(strict)

# Seed all RNGs in the default (non-deterministic, cuDNN-autotuned) mode.
set_seed(CFG.seed)
# Module-level alias of the configured device, used by later cells.
device = CFG.device
print(device)

cuda


In [5]:
def seed_worker(worker_id):
    """DataLoader `worker_init_fn`: seed NumPy and `random` from torch's seed.

    Args:
        worker_id: Worker index supplied by the DataLoader (unused).
    """
    # Keep the low 32 bits of torch's seed so it is valid for np.random.seed.
    derived_seed = torch.initial_seed() & 0xFFFFFFFF
    random.seed(derived_seed)
    np.random.seed(derived_seed)

# Shared torch.Generator passed to every DataLoader in this notebook.
g = torch.Generator()
g.manual_seed(CFG.seed)

<torch._C.Generator at 0x7fcb478aca50>

# 2. Data Pipeline

## 2.1 Data Loading

In [6]:
class DataModule:
    """Load the competition CSVs and attach blended teacher (OOF) soft labels.

    Args:
        data_dir: Root data directory. A trailing separator is optional
            because paths are built with `os.path.join`.
    """

    LABEL_COLS = ['healthy', 'multiple_diseases', 'rust', 'scab']

    def __init__(self, data_dir):
        self.data_dir = data_dir
        # Trailing '' keeps the directory-style trailing separator of the original value.
        self.img_dir = os.path.join(self.data_dir, 'images', '')

    def _read_csv(self, *parts):
        """Read a CSV located at `data_dir/parts...`."""
        return pd.read_csv(os.path.join(self.data_dir, *parts))

    def prepare_data(self):
        """Read all tables and merge the averaged OOF predictions into train.

        Returns:
            Tuple `(train_df, test_df, submission)`; `train_df` gains the
            columns `<label>_pred`, each the 50/50 mean of the two OOF files.

        Raises:
            ValueError: If either OOF file has a different number of rows than
                the training table. The OOF rows are matched to training rows
                by position, so a mismatch would otherwise yield silently
                misaligned or NaN soft labels.
        """
        self.train_df = self._read_csv('datasets', 'train_reborn_02.csv').reset_index(drop=True)
        self.test_df = self._read_csv('test.csv')
        self.submission = self._read_csv('sample_submission.csv')

        oof_df_01 = self._read_csv('datasets', 'oof_preds_Student5_EfficientNetB6_reborn.csv')
        oof_df_02 = self._read_csv('oof_preds_Student8_ResNest101e.csv')

        n_train = len(self.train_df)
        for label, frame in (('oof_df_01', oof_df_01), ('oof_df_02', oof_df_02)):
            if len(frame) != n_train:
                raise ValueError(
                    f'{label} has {len(frame)} rows but the training table has {n_train}; '
                    'OOF predictions are matched to training rows by position.'
                )

        hard_cols = self.LABEL_COLS
        # Equal-weight blend of the two teachers, row by row.
        blended = oof_df_01[hard_cols].to_numpy() * 0.5 + oof_df_02[hard_cols].to_numpy() * 0.5

        self.oof_df = pd.DataFrame(blended, columns=[f'{col}_pred' for col in hard_cols])
        self.oof_df.insert(0, 'image_id', self.train_df['image_id'].to_numpy())

        self.train_df = self.train_df.merge(self.oof_df, on='image_id', how='left')
        return self.train_df, self.test_df, self.submission

    
data_tables = DataModule('../data/').prepare_data()
train_df, test_df, submission = data_tables

# One-hot ground-truth columns and the matching teacher-probability columns.
hard_cols = ['healthy', 'multiple_diseases', 'rust', 'scab']
soft_cols = [f'{label}_pred' for label in hard_cols]

train_df.head()

Unnamed: 0,image_id,fold,group_id,label_idx,healthy,multiple_diseases,rust,scab,healthy_pred,multiple_diseases_pred,rust_pred,scab_pred
0,Train_0,2,Train_0,3,0.0,0.0,0.0,1.0,0.006739,0.007093,0.002089,0.984131
1,Train_1,3,Train_1,1,0.0,1.0,0.0,0.0,0.00507,0.906738,0.072174,0.016166
2,Train_2,1,Train_2,0,1.0,0.0,0.0,0.0,0.970703,0.006515,0.011703,0.011107
3,Train_3,2,Train_3,2,0.0,0.0,1.0,0.0,0.000161,0.000684,0.999023,5.2e-05
4,Train_4,0,Train_4,0,1.0,0.0,0.0,0.0,0.990967,0.002657,0.003892,0.002475


## 2.2 Load Images

In [7]:
# In-memory cache: image_id -> read-only RGB array resized to 650x450 (width x height).
all_images = {}
all_img_ids = np.concatenate([train_df['image_id'].tolist(), test_df['image_id'].tolist()])

print("Loading all images into RAM once...")
for img_id in tqdm(all_img_ids, desc='Loading Images...' ,leave=False):
    img_path = '../data/images/' + img_id + '.jpg'
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None rather than raising.
        raise FileNotFoundError(f'Could not read image: {img_path}')
    img = cv2.resize(img, (650, 450))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Freeze the cached array; datasets copy it before augmenting.
    img.setflags(write=False)
    all_images[img_id] = img

print("All Images on Ram")

Loading all images into RAM once...


Loading Images...:   0%|          | 0/3642 [00:00<?, ?it/s]

All Images on Ram


## 2.3 Custom Dataset

In [8]:
class ImageDataset(Dataset):
    """Dataset serving cached images plus soft (teacher) and hard labels.

    Args:
        df: Table with one row per image. The image id is taken from the
            `image_id` column when present, otherwise from the first column.
        hard_cols: Names of the one-hot ground-truth columns.
        soft_cols: Names of the teacher-probability columns.
        transform: Optional albumentations-style callable invoked as
            `transform(image=...)['image']`.
        is_test: When True, items are images only and no label columns are read.
    """
    def __init__(self, df, hard_cols=hard_cols, soft_cols=soft_cols, transform=None, is_test=False):
        super().__init__()
        self.df = df
        self.transform = transform
        self.is_test = is_test
        self.hard_cols = hard_cols
        self.soft_cols = soft_cols

        # Resolve the id column by name so extra leading columns cannot break the lookup.
        id_col = 'image_id' if 'image_id' in df.columns else df.columns[0]
        self.image_ids = df[id_col].tolist()

        # Convert labels once up front instead of building an object-dtype row per item.
        if is_test:
            self.soft_labels = None
            self.hard_labels = None
        else:
            self.soft_labels = df[self.soft_cols].to_numpy(dtype=np.float32)
            self.hard_labels = df[self.hard_cols].to_numpy(dtype=np.float32)

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return `image` (test) or `(image, soft_labels, hard_labels)` (train/valid)."""
        img_id = self.image_ids[idx]
        # The cached arrays are read-only; work on a private copy.
        image = all_images[img_id].copy()

        if self.transform is not None:
            image = self.transform(image=image)['image']

        if self.is_test:
            return image

        return image, torch.tensor(self.soft_labels[idx]), torch.tensor(self.hard_labels[idx])

## 2.4 DataModule

In [9]:
class PlantDataModule(pl.LightningDataModule):
    """Builds the datasets and DataLoaders for one cross-validation fold.

    Rows whose `fold` column equals `fold_idx` form the validation set and all
    other rows form the training set. `setup` only builds the datasets needed
    for the requested stage. The train/test augmentation pipelines are defined
    here so the data-to-model interface lives in one place.

    Args:
        train_df: Training table containing a `fold` column.
        test_df: Test table.
        cfg: Config object providing `batch_size` and `num_workers`, and
            optionally `persistent_workers` (defaults to True when absent).
        fold_idx: Fold used for validation.
        inference_mode: When True the validation loader does not keep its
            workers alive between iterations.
    """
    # ImageNet normalisation statistics shared by both pipelines.
    _NORM_MEAN = (0.485, 0.456, 0.406)
    _NORM_STD = (0.229, 0.224, 0.225)

    def __init__(self, train_df, test_df, cfg, fold_idx, inference_mode=False):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.cfg = cfg
        self.fold_idx = fold_idx
        self.inference_mode = inference_mode

        self.transform_train = A.Compose([
            # Jitter either brightness or contrast, never both at once.
            A.OneOf([
                A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0, p=1.0),
                A.RandomBrightnessContrast(brightness_limit=0, contrast_limit=0.2, p=1.0)
            ], p=1.0),

            A.OneOf([
                A.MotionBlur(blur_limit=3, p=1.0),
                A.MedianBlur(blur_limit=3, p=1.0),
                A.GaussianBlur(blur_limit=3, p=1.0),
            ], p=0.5),

            A.VerticalFlip(p=0.5),
            A.HorizontalFlip(p=0.5),

            A.Affine(
                scale=(0.8, 1.2),
                translate_percent=0.2,
                rotate=20,
                interpolation=cv2.INTER_CUBIC,  # interpolation method
                border_mode=cv2.BORDER_REFLECT_101,  # fill borders by reflection
                p=1.0
            ),

            *self._to_model_input(),
        ])

        self.transform_test = A.Compose(self._to_model_input())

    def _to_model_input(self):
        """Return fresh normalise + to-tensor steps that end every pipeline."""
        return [
            A.Normalize(mean=self._NORM_MEAN, std=self._NORM_STD),
            ToTensorV2(),
        ]

    def _make_loader(self, dataset, batch_size, shuffle, persistent):
        """Create a DataLoader with the notebook's shared seeding settings."""
        n_workers = self.cfg.num_workers
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            worker_init_fn=seed_worker,
            generator=g,
            num_workers=n_workers,
            # DataLoader rejects persistent_workers=True when num_workers == 0.
            persistent_workers=bool(persistent) and n_workers > 0,
            pin_memory=True,
        )

    def _valid_frame(self):
        """Rows of the held-out fold (original index kept for OOF bookkeeping)."""
        return self.train_df[self.train_df['fold'] == self.fold_idx].copy()

    def setup(self, stage=None):
        """Build the datasets for `stage` ('fit'/None, 'test' or 'predict')."""
        if stage == 'fit' or stage is None:
            self.train = self.train_df[self.train_df['fold'] != self.fold_idx].reset_index(drop=True).copy()
            self.valid = self._valid_frame()
            self.dataset_train = ImageDataset(self.train, transform=self.transform_train)
            self.dataset_valid = ImageDataset(self.valid, transform=self.transform_test)
            print(f'[Fit] Train: {len(self.train)}, Valid: {len(self.valid)}')

        elif stage == 'test':
            self.valid = self._valid_frame()
            self.dataset_valid = ImageDataset(self.valid, transform=self.transform_test)
            self.dataset_test = ImageDataset(self.test_df, transform=self.transform_test, is_test=True)
            print(f'[Test] Valid(OOF): {len(self.valid)}, Test: {len(self.test_df)}')

        elif stage == 'predict':
            self.dataset_test = ImageDataset(self.test_df, transform=self.transform_test, is_test=True)

    def train_dataloader(self):
        """Shuffled training loader."""
        return self._make_loader(
            self.dataset_train,
            batch_size=self.cfg.batch_size,
            shuffle=True,
            persistent=getattr(self.cfg, 'persistent_workers', True),
        )

    def val_dataloader(self):
        """Validation loader with a 4x batch size (no gradients are stored)."""
        keep_alive = (not self.inference_mode) and getattr(self.cfg, 'persistent_workers', True)
        return self._make_loader(
            self.dataset_valid,
            batch_size=self.cfg.batch_size * 4,
            shuffle=False,
            persistent=keep_alive,
        )

    def predict_dataloader(self):
        """Test-set loader with a 4x batch size; workers are not kept alive."""
        return self._make_loader(
            self.dataset_test,
            batch_size=self.cfg.batch_size * 4,
            shuffle=False,
            persistent=False,
        )

    def test_dataloader(self):
        """Alias of `predict_dataloader`."""
        return self.predict_dataloader()

# 3. Model Architecture

In [10]:
class PlantDiseaseModule(pl.LightningModule):
    """LightningModule wrapping the timm classifier, its losses and metrics.

    Training combines a label-smoothed cross-entropy on the hard labels with a
    cross-entropy against teacher soft labels (knowledge-distillation style),
    weighted by `alpha`. Prediction wraps the network in flip-based test-time
    augmentation and returns the averaged logits.

    Args:
        config: Either a config class (its non-dunder attributes are copied
            into a dict) or a mapping of hyperparameters.
        steps_per_epoch: Optimizer steps per epoch, used to size the LR
            schedule. When None, the schedule length is taken from the
            trainer at `configure_optimizers` time.
    """
    def __init__(self, config, steps_per_epoch=None):
        super().__init__()
        if isinstance(config, type):
            config = {k: v for k, v in config.__dict__.items() if not k.startswith('__')}
        self.save_hyperparameters(config)
        self.steps_per_epoch = steps_per_epoch
        self.model = timm.create_model(
            self.hparams.model_arch,
            pretrained=True,
            drop_path_rate=self.hparams.drop_path_rate,
            num_classes=4
            )
        self.criterion = nn.CrossEntropyLoss()
        self.criterion_ls = nn.CrossEntropyLoss(label_smoothing=0.1)

        # TTA
        self.tta_transforms = tta.Compose([
                tta.HorizontalFlip(),
                tta.VerticalFlip(),
            ])

        # metrics
        self.valid_auc = AUROC(task='multiclass', num_classes=4)
        self.valid_cm = ConfusionMatrix(task='multiclass', num_classes=4)
        self.best_score = 0.0

        self.top_k_scores = []  # list of (score, epoch) tuples, best first
        self.top_k = self.hparams.top_k

    def configure_optimizers(self):
        """AdamW with a per-step cosine schedule and linear warmup."""
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)

        if self.steps_per_epoch is not None:
            total_steps = self.steps_per_epoch * self.hparams.epochs
            warmup_steps = self.steps_per_epoch * self.hparams.warmup_multiplier
        else:
            # No explicit step count was given: ask the trainer for the total
            # number of optimizer steps and keep the same warmup fraction.
            total_steps = int(self.trainer.estimated_stepping_batches)
            warmup_steps = int(total_steps * self.hparams.warmup_multiplier / self.hparams.epochs)

        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

        scheduler_config = {
            'scheduler' : scheduler,
            'interval' : 'step',
            'frequency' : 1
        }

        return [optimizer], [scheduler_config]

    def forward(self, x):
        return self.model(x)

    # Earlier temperature-scaled variant, kept for reference (the active
    # training_step below does not use hparams.T).
    # def training_step(self, batch, batch_idx):
    #     image, soft_labels, hard_labels = batch

    #     T = self.hparams.T
    #     if T > 1.0:
    #         epsilon = 1e-6
    #         logits_from_oof = torch.log(soft_labels + epsilon)
    #         soft_labels = torch.softmax(logits_from_oof / T, dim=1)

    #     label = self.hparams.alpha * hard_labels + (1 - self.hparams.alpha) * soft_labels
    #     outputs = self.model(image)
    #     logits = outputs / T if T > 1 else outputs
    #     loss = self.criterion(logits, label)

    #     if T > 1.0:
    #         loss = loss * (T ** 2)

    #     self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
    #     return loss

    def training_step(self, batch, batch_idx):
        """Return `alpha * hard_loss + (1 - alpha) * soft_loss` for one batch."""
        image, soft_labels, hard_labels = batch
        outputs = self.model(image)

        loss_hard = self.criterion_ls(outputs, hard_labels)
        loss_soft = self.criterion(outputs, soft_labels)
        loss = self.hparams.alpha * loss_hard + (1 - self.hparams.alpha) * loss_soft

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log('hard_loss', loss_hard, on_step=False, on_epoch=True, logger=True)
        self.log('soft_loss', loss_soft, on_step=False, on_epoch=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log hard-label loss and AUROC; accumulate the confusion matrix."""
        image, _, hard_labels = batch
        outputs = self.model(image)
        loss = self.criterion(outputs, hard_labels)

        probs = torch.softmax(outputs, dim=1)
        preds = torch.argmax(outputs, dim=1)
        targets = torch.argmax(hard_labels, dim=1)

        self.valid_cm(preds, targets)
        self.valid_auc(probs, targets)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('val_roc_auc', self.valid_auc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def on_validation_epoch_end(self):
        """Print the epoch summary, track the top-k scores and log the confusion matrix."""
        if self.trainer.sanity_checking:
            # validation_step also ran during the sanity check; drop those
            # counts so they do not leak into the first real epoch's matrix.
            self.valid_cm.reset()
            return

        score = self.trainer.callback_metrics.get('val_roc_auc')
        train_loss = self.trainer.callback_metrics.get('train_loss')
        val_loss = self.trainer.callback_metrics.get('val_loss')

        current_epoch = self.current_epoch
        t_loss_str = f"{train_loss:.4f}" if train_loss is not None else "N/A"
        v_loss_str = f"{val_loss:.4f}" if val_loss is not None else "N/A"
        roc_str = f"{score:.4f}" if score is not None else "N/A"
        self.print(f"\n(Epoch {current_epoch}) Train Loss: {t_loss_str} | Val Loss: {v_loss_str} | ROC AUC: {roc_str}")

        if score is not None:
            current_score = score.item()
            entry = (current_score, current_epoch)
            self.top_k_scores.append(entry)
            self.top_k_scores.sort(key=lambda x: x[0], reverse=True)
            self.top_k_scores = self.top_k_scores[:self.top_k]
            is_in_top_k = entry in self.top_k_scores

            if is_in_top_k and isinstance(self.logger, WandbLogger):
                rank = self.top_k_scores.index(entry) + 1
                self.print(f'New Top-K Score! (Rank {rank})')
                top_k_str = ", ".join([f"(Ep {e}: {s:.4f})" for s, e in self.top_k_scores])
                self.print(f"Current Top-{self.top_k}: {top_k_str}")

                plt.figure(figsize=(10, 8))
                cm = self.valid_cm.compute().cpu().numpy()
                columns = ['Healthy', 'Multiple', 'Rust', 'Scab']

                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                            xticklabels=columns, yticklabels=columns,
                            annot_kws={"size": 12})  # larger annotation font

                plt.ylabel('True Label', fontsize=12)
                plt.xlabel('Predicted Label', fontsize=12)
                plt.title(f'Confusion Matrix (Epoch {current_epoch})', fontsize=14)

                log_key = f"Confusion_Matrix_Ep{current_epoch}"
                self.logger.experiment.log({
                    log_key: wandb.Image(plt),
                    "global_step": self.global_step
                })
                plt.close()
                self.print(f"Confusion Matrix saved to WandB key: {log_key}")
        self.valid_cm.reset()

    def on_predict_start(self):
        """Wrap the network so predictions are averaged over the flip TTA set."""
        self.tta_model = tta.ClassificationTTAWrapper(
            self.model,
            self.tta_transforms,
            merge_mode='mean'
        )

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return TTA-averaged outputs; accepts bare images or (image, ...) batches."""
        x = batch[0] if isinstance(batch, (list, tuple)) else batch
        return self.tta_model(x)

In [11]:
# Collect unreachable Python objects first so any CUDA tensors they hold are
# freed, then hand the now-unused cached blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

160

# 4. Training Components

## 4.1 Metrics & Utils

In [12]:
class MetricHandler:
    """Accumulates predictions and targets, then scores them with ROC AUC."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard everything accumulated so far."""
        self.preds_list, self.actual_list = [], []

    def update(self, preds, actual):
        """Append each element (row) of `preds` and `actual` to the buffers."""
        self.preds_list += list(preds)
        self.actual_list += list(actual)

    def compute_roc_auc(self):
        """Return `roc_auc_score(targets, predictions)` over the buffers."""
        y_true, y_score = self.actual_list, self.preds_list
        return roc_auc_score(y_true, y_score)
    
    
class BackupHandler:
    """Saves result files locally and optionally mirrors them to a backup dir.

    Args:
        local_dir: Directory where files are written.
        backup_dir: Directory files are copied to; None disables backups.
        active: Master switch; backups only happen when this is True and
            `backup_dir` is not None.
    """
    def __init__(self, local_dir, backup_dir=None, active=True):
        self.local_dir = local_dir
        self.backup_dir = backup_dir
        self.active = active and (backup_dir is not None)

        if self.active:
            os.makedirs(self.backup_dir, exist_ok=True)
            print(f'Backup Active : {self.local_dir} -> {self.backup_dir}')

    def backup(self, filename):
        """Copy `local_dir/filename` to `backup_dir/filename` if it exists."""
        if not self.active or self.backup_dir is None:
            return

        src_path = os.path.join(self.local_dir, filename)
        dst_path = os.path.join(self.backup_dir, filename)

        if os.path.exists(src_path):
            shutil.copy(src_path, dst_path)

    def save_file(self, data, filename, logit=False):
        """Write `data` to `local_dir/filename`, then back it up.

        Args:
            data: Array-like saved with `np.save` when `logit` is True,
                otherwise an object with `to_csv` (e.g. a DataFrame).
            filename: Target file name. For logits a missing `.npy` suffix is
                added so the reported and backed-up path match the file that
                `np.save` actually writes.
            logit: Selects the `.npy` (True) or CSV (False) format.
        """
        if self.local_dir:
            os.makedirs(self.local_dir, exist_ok=True)

        if logit and not filename.endswith('.npy'):
            # np.save appends '.npy' on its own; do it here so every path agrees.
            filename = filename + '.npy'

        local_path = os.path.join(self.local_dir, filename)

        if logit:
            np.save(local_path, data)
            print(f'Logit saved at {local_path}')

        else:
            data.to_csv(local_path, index=False)
            print(f'CSV saved at {local_path}')

        self.backup(filename)

## 4.2 Experiment Orchestrator

In [13]:
class ExperimentRunner:
    """Orchestrates K-fold training and the final OOF / test inference.

    Resolves output paths for the detected environment (Kaggle, Colab or
    local), then drives W&B logging, checkpointing, the per-fold training
    loop and inference with averaged top-k checkpoints. Memory is released
    explicitly (GC, CUDA cache) after every fold.

    Args:
        config: Config object (e.g. the `CFG` class) read via attributes.
        train_df: Training table with `fold` and label columns.
        test_df: Test table.
    """
    LABEL_COLS = ['healthy', 'multiple_diseases', 'rust', 'scab']

    def __init__(self, config, train_df, test_df):
        super().__init__()
        self.config = config
        self.train_df = train_df
        self.test_df = test_df
        self.paths = self._setup_env()
        self.backup_handler = BackupHandler(local_dir=self.paths.local_path , backup_dir=self.paths.drive_path, active=False)

    def _setup_env(self):
        """Detect the runtime and return a namespace with `local_path` / `drive_path`."""
        exp_name = self.config.exp_name
        is_kaggle = os.path.exists('/kaggle/')
        # Same spelling as the drive path used below; the check is case-sensitive.
        is_colab = os.path.exists('/content/drive/MyDrive') and not is_kaggle

        if is_kaggle:
            print("Environment: Kaggle")
            drive_path = None
            local_path = '/kaggle/working/'
        elif is_colab:
            print("Environment: Google Colab")
            drive_path = f'/content/drive/MyDrive/Kaggle_Save/{exp_name}/'
            local_path = '/content/models/'
        else:
            print("Environment: Local")
            drive_path = None
            local_path = f'../data/models/{exp_name}/'

        print(f"Save Path: {local_path}")
        return SimpleNamespace(local_path=local_path, drive_path=drive_path)

    def run(self):
        """Train one model per fold, keeping the top-k checkpoints of each."""
        for fold in range(self.config.n_folds):
            print('='*30, f'FOLD {fold+1}', '='*30)

            wandb_logger = WandbLogger(
                project=self.config.project_name,
                group=self.config.exp_name,
                name=f"Fold_{fold+1}",
                job_type="train",
                save_code=True,
                config={k: v for k, v in self.config.__dict__.items() if not k.startswith('__')}
            )

            train_len = len(self.train_df[self.train_df['fold'] != fold])
            steps_per_epoch = math.ceil(train_len / self.config.batch_size / self.config.accum_iter)

            datamodule = PlantDataModule(train_df=self.train_df, test_df=self.test_df, cfg=self.config, fold_idx=fold)
            model = PlantDiseaseModule(self.config, steps_per_epoch=steps_per_epoch)

            ckpt_callback = pl.callbacks.ModelCheckpoint(
                monitor='val_roc_auc',
                mode='max',
                save_top_k=self.config.top_k,
                save_weights_only=True,
                save_last=False,
                dirpath=self.paths.local_path,
                filename=f'Fold{fold+1}-Ep{{epoch:02d}}-{{val_roc_auc:.4f}}',
                auto_insert_metric_name=False,
            )
            progress_bar = TQDMProgressBar(refresh_rate=1)

            trainer = pl.Trainer(
                max_epochs=self.config.epochs,
                accelerator='auto',
                precision='16-mixed',
                accumulate_grad_batches=self.config.accum_iter,
                callbacks=[ckpt_callback, progress_bar],
                logger=wandb_logger,
                log_every_n_steps=10
            )

            try:
                trainer.fit(model, datamodule=datamodule)

                print(f'\n Top-{ckpt_callback.save_top_k} Models in this Fold:')
                for path in ckpt_callback.best_k_models:
                    model_name = os.path.basename(path)
                    print(f'> {model_name}')
            finally:
                # Close the W&B run even when training fails so the next
                # fold (or a re-run) does not log into a stale run.
                wandb.finish()

                # Free memory: drop references, collect, then empty the CUDA cache.
                del datamodule, trainer, model
                gc.collect()
                torch.cuda.empty_cache()

    def _average_checkpoints(self, fold):
        """Average the floating-point weights of this fold's saved checkpoints.

        Returns:
            Tuple `(avg_state_dict, checkpoint_paths)`. Non-floating entries
            (e.g. integer counters) are taken from the first checkpoint.

        Raises:
            FileNotFoundError: If no checkpoint matches the fold's pattern.
        """
        score_pattern = os.path.join(self.paths.local_path, f'Fold{fold+1}-Ep*.ckpt')
        score_files = sorted(glob.glob(score_pattern))
        print(f'Found {len(score_files)} score models : {[os.path.basename(f) for f in score_files]}')

        if not score_files:
            raise FileNotFoundError(
                f'No checkpoints matching {score_pattern}; train fold {fold+1} before inference.'
            )

        avg_state_dict = None
        for path in score_files:
            state_dict = torch.load(path, map_location='cpu')['state_dict']
            if avg_state_dict is None:
                # Accumulate in float32 on private copies of the first checkpoint.
                avg_state_dict = {
                    key: (value.float().clone() if value.is_floating_point() else value.clone())
                    for key, value in state_dict.items()
                }
                continue
            for key, acc in avg_state_dict.items():
                # Integer tensors cannot be float-accumulated in place; skip them.
                if acc.is_floating_point():
                    acc += state_dict[key].float()

        if len(score_files) > 1:
            for key, acc in avg_state_dict.items():
                if acc.is_floating_point():
                    avg_state_dict[key] = acc / len(score_files)

        return avg_state_dict, score_files

    def _load_averaged_model(self, fold):
        """Return the fold's averaged model, building and caching it if needed.

        If `best_score_model_{fold+1}.pth` exists it is loaded directly.
        Otherwise the fold's checkpoints are averaged, the result is saved,
        and only then are the source checkpoints deleted. When
        `config.is_bn` is set, BatchNorm statistics are re-estimated on the
        fold's training rows and the model is saved again.
        """
        save_path = os.path.join(self.paths.local_path, f'best_score_model_{fold+1}.pth')
        model = PlantDiseaseModule(self.config)

        if os.path.exists(save_path):
            print(f'Found existing averaged model for Fold {fold+1}. Loading directly...')
            state_dict = torch.load(save_path, map_location=self.config.device)
            model.load_state_dict(state_dict)
        else:
            print(f'Merging Top-K Models for Fold {fold+1} ...')
            avg_state_dict, score_files = self._average_checkpoints(fold)
            model.load_state_dict(avg_state_dict)

            # Persist the merged weights before removing their sources so a
            # failed save cannot lose the trained checkpoints.
            torch.save(model.state_dict(), save_path)
            print('Save Avg Model : ', save_path)

            for remove_path in score_files:
                if os.path.exists(remove_path):
                    os.remove(remove_path)

        if self.config.is_bn:
            print('Update BN stats ... ')
            model = model.to(self.config.device)
            model.train()

            train_subset = self.train_df[self.train_df['fold'] != fold].reset_index(drop=True)
            transform_test = A.Compose([
                A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
                ToTensorV2()
            ])
            dataset_bn = ImageDataset(train_subset, transform=transform_test)
            loader_bn = DataLoader(
                dataset_bn,
                batch_size=self.config.batch_size,
                shuffle=True,
                num_workers=self.config.num_workers,
                pin_memory=True
            )

            update_bn(loader_bn, model, device=self.config.device)
            model.eval()
            torch.save(model.state_dict(), save_path)
        else:
            print('Skipping BN update for LayerNorm')
            model = model.to(self.config.device)
        return model

    def find_optimal_temperature(self, logits, labels):
        """Grid-search the temperature T that minimises NLL of `logits / T`.

        Args:
            logits: Array of shape (n_samples, n_classes).
            labels: Class indices, or one-hot rows (converted with argmax).

        Returns:
            The best T among 0.5, 0.6, ..., 2.5.
        """
        # One-hot -> class index
        if labels.ndim > 1:
            labels = np.argmax(labels, axis=1)

        logits_tensor = torch.tensor(logits, dtype=torch.float32)
        labels_tensor = torch.tensor(labels, dtype=torch.long)

        # NLL loss
        nll = nn.CrossEntropyLoss()
        t_candidates = np.arange(0.5, 2.6, 0.1)
        best_t = min(t_candidates, key=lambda t: nll(logits_tensor / t, labels_tensor).item())
        print(f"    > Best T: {best_t:.1f}")
        return best_t

    # weight average
    def run_inference(self):
        """Predict OOF and test probabilities with each fold's averaged model.

        Returns:
            Tuple `(oof_preds, oof_preds_logit, final_preds)`: calibrated OOF
            probabilities, raw OOF logits, and test probabilities averaged
            over folds.
        """
        oof_preds = np.zeros((len(self.train_df), 4))
        oof_preds_logit = np.zeros((len(self.train_df), 4))
        final_preds = np.zeros((len(self.test_df), 4))

        for fold in range(self.config.n_folds):
            print(f'=== Inference Fold {fold+1} ===')
            # Weight-averaged model
            avg_model = self._load_averaged_model(fold)

            # Data module configured for inference
            infer_module = PlantDataModule(self.train_df, self.test_df, self.config, fold_idx=fold, inference_mode=True)
            infer_module.setup(stage='test')

            # Trainer
            progress_bar = TQDMProgressBar(refresh_rate=1)
            infer_trainer = pl.Trainer(
                accelerator='auto',
                precision='16-mixed',
                logger=False,
                enable_checkpointing=False,
                callbacks=[progress_bar]
            )

            # Validation-row indices and their ground-truth labels
            valid_indices = infer_module.valid.index.values
            valid_labels = self.train_df.iloc[valid_indices][self.LABEL_COLS].values

            # OOF inference
            oof_list = infer_trainer.predict(avg_model, dataloaders=infer_module.val_dataloader())
            current_oof_logits = torch.cat(oof_list).cpu().numpy()
            print(f"Max: {current_oof_logits.max()}, Min: {current_oof_logits.min()}")

            # Find the calibration temperature
            optimal_t = self.find_optimal_temperature(current_oof_logits, valid_labels)

            # Calibrate OOF logits and convert to probabilities
            calibrated_oof_probs = torch.softmax(torch.tensor(current_oof_logits) / optimal_t, dim=1).numpy()
            oof_preds[valid_indices] = calibrated_oof_probs
            oof_preds_logit[valid_indices] = current_oof_logits

            # Test inference and calibration
            sub_list = infer_trainer.predict(avg_model, dataloaders=infer_module.predict_dataloader())
            current_test_logits = torch.cat(sub_list).cpu().numpy()
            calibrated_test_probs = torch.softmax(torch.tensor(current_test_logits) / optimal_t, dim=1).numpy()
            final_preds += calibrated_test_probs

            # Free memory: drop references, collect, then empty the CUDA cache.
            del avg_model, infer_trainer, infer_module
            gc.collect()
            torch.cuda.empty_cache()

        final_preds /= self.config.n_folds

        # Final metric
        metric_handler = MetricHandler()
        metric_handler.update(oof_preds, self.train_df[self.LABEL_COLS].values)
        oof_roc = metric_handler.compute_roc_auc()
        print(f'\n>>> Final OOF ROC AUC : {oof_roc:.5f}')

        return oof_preds, oof_preds_logit, final_preds

In [14]:
# Collect unreachable Python objects first so any CUDA tensors they hold are
# freed, then hand the now-unused cached blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

0

# 5. Training Execution

In [15]:
# Build the orchestrator; construction resolves and prints the output paths.
runner = ExperimentRunner(config=CFG, train_df=train_df, test_df=test_df)

Environment: Local
Save Path: ../data/models/s9_convnext_small_delete_T/


In [16]:
%%time
# Train every fold; the cell magic above reports the cell's total run time.
runner.run()



Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[Fit] Train: 1453, Valid: 368


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 0) Train Loss: 0.5679 | Val Loss: 0.4424 | ROC AUC: 0.9352
New Top-K Score! (Rank 1)
Current Top-3: (Ep 0: 0.9352)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep0


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 1) Train Loss: 0.1010 | Val Loss: 0.1283 | ROC AUC: 0.9920
New Top-K Score! (Rank 1)
Current Top-3: (Ep 1: 0.9920), (Ep 0: 0.9352)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep1


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 2) Train Loss: 0.0838 | Val Loss: 0.1028 | ROC AUC: 0.9917
New Top-K Score! (Rank 2)
Current Top-3: (Ep 1: 0.9920), (Ep 2: 0.9917), (Ep 0: 0.9352)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep2


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 3) Train Loss: 0.2952 | Val Loss: 0.0972 | ROC AUC: 0.9901
New Top-K Score! (Rank 3)
Current Top-3: (Ep 1: 0.9920), (Ep 2: 0.9917), (Ep 3: 0.9901)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep3


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 4) Train Loss: 0.2085 | Val Loss: 0.0976 | ROC AUC: 0.9933
New Top-K Score! (Rank 1)
Current Top-3: (Ep 4: 0.9933), (Ep 1: 0.9920), (Ep 2: 0.9917)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep4


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 5) Train Loss: 0.2048 | Val Loss: 0.1366 | ROC AUC: 0.9953
New Top-K Score! (Rank 1)
Current Top-3: (Ep 5: 0.9953), (Ep 4: 0.9933), (Ep 1: 0.9920)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep5


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 6) Train Loss: 0.1705 | Val Loss: 0.0833 | ROC AUC: 0.9982
New Top-K Score! (Rank 1)
Current Top-3: (Ep 6: 0.9982), (Ep 5: 0.9953), (Ep 4: 0.9933)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep6


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 7) Train Loss: 0.2746 | Val Loss: 0.0925 | ROC AUC: 0.9972
New Top-K Score! (Rank 2)
Current Top-3: (Ep 6: 0.9982), (Ep 7: 0.9972), (Ep 5: 0.9953)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep7


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 8) Train Loss: 0.5015 | Val Loss: 0.1054 | ROC AUC: 0.9929


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 9) Train Loss: 0.1225 | Val Loss: 0.0905 | ROC AUC: 0.9789


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 10) Train Loss: 0.1203 | Val Loss: 0.0932 | ROC AUC: 0.9964
New Top-K Score! (Rank 3)
Current Top-3: (Ep 6: 0.9982), (Ep 7: 0.9972), (Ep 10: 0.9964)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep10


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 11) Train Loss: 0.1559 | Val Loss: 0.0712 | ROC AUC: 0.9968
New Top-K Score! (Rank 3)
Current Top-3: (Ep 6: 0.9982), (Ep 7: 0.9972), (Ep 11: 0.9968)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep11


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 12) Train Loss: 0.1002 | Val Loss: 0.0704 | ROC AUC: 0.9967


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 13) Train Loss: 0.1064 | Val Loss: 0.0632 | ROC AUC: 0.9974
New Top-K Score! (Rank 2)
Current Top-3: (Ep 6: 0.9982), (Ep 13: 0.9974), (Ep 7: 0.9972)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep13


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 14) Train Loss: 0.0863 | Val Loss: 0.0719 | ROC AUC: 0.9966


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 15) Train Loss: 0.0522 | Val Loss: 0.0778 | ROC AUC: 0.9974
New Top-K Score! (Rank 2)
Current Top-3: (Ep 6: 0.9982), (Ep 15: 0.9974), (Ep 13: 0.9974)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep15


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 16) Train Loss: 0.1331 | Val Loss: 0.0679 | ROC AUC: 0.9970


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 17) Train Loss: 0.2121 | Val Loss: 0.0728 | ROC AUC: 0.9979
New Top-K Score! (Rank 2)
Current Top-3: (Ep 6: 0.9982), (Ep 17: 0.9979), (Ep 15: 0.9974)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep17


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 18) Train Loss: 0.1289 | Val Loss: 0.0634 | ROC AUC: 0.9986
New Top-K Score! (Rank 1)
Current Top-3: (Ep 18: 0.9986), (Ep 6: 0.9982), (Ep 17: 0.9979)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep18


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 19) Train Loss: 0.2315 | Val Loss: 0.0656 | ROC AUC: 0.9987
New Top-K Score! (Rank 1)
Current Top-3: (Ep 19: 0.9987), (Ep 18: 0.9986), (Ep 6: 0.9982)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep19


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 20) Train Loss: 0.1525 | Val Loss: 0.0637 | ROC AUC: 0.9984
New Top-K Score! (Rank 3)
Current Top-3: (Ep 19: 0.9987), (Ep 18: 0.9986), (Ep 20: 0.9984)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep20


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 21) Train Loss: 0.0943 | Val Loss: 0.0634 | ROC AUC: 0.9983


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 22) Train Loss: 0.0949 | Val Loss: 0.0647 | ROC AUC: 0.9983


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 23) Train Loss: 0.1646 | Val Loss: 0.0641 | ROC AUC: 0.9982


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=25` reached.



(Epoch 24) Train Loss: 0.1841 | Val Loss: 0.0641 | ROC AUC: 0.9982

 Top-3 Models in this Fold:
> Fold1-Ep18-0.9986.ckpt
> Fold1-Ep19-0.9987.ckpt
> Fold1-Ep20-0.9984.ckpt


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇█
global_step,▁▁▂▂▂▃▃▃▅▅▆▆▇▇██
hard_loss_epoch,█▃▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
hard_loss_step,█▇▇▅▃▂▂▂▂▂▁▃▂▃▁▁▃▁▁▂▁▁▁▁▁▂▁▂▁▁▁▁▂▁▁▁▁▂▂▂
loss_loss_epoch,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss_loss_step,█▇▇▄▂▃▂▂▁▂▃▁▁▂▂▁▂▂▂▁▁▁▂▂▂▁▂▁▂▂▂▂▂▂▂▂▁▁▁▂
train_loss_epoch,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▇▃▆▂▅▃▃▄▂▄▃▂▂▂▅▃▄▂▁▂▂▁▁▂▃▄▃▁▂▂▂▁▂▃▂▂▁▂
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇█████
val_loss,█▂▂▂▂▂▁▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,24
global_step,966
hard_loss_epoch,0.06318
hard_loss_step,0.05804
loss_loss_epoch,0.2102
loss_loss_step,0.31017
train_loss_epoch,0.13669
train_loss_step,0.1841
trainer/global_step,1149
val_loss,0.06405




Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:881: Checkpoint directory /teamspace/studios/this_studio/data/models/s9_convnext_small_delete_T exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[Fit] Train: 1461, Valid: 360


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 0) Train Loss: 0.8892 | Val Loss: 0.6440 | ROC AUC: 0.8912
New Top-K Score! (Rank 1)
Current Top-3: (Ep 0: 0.8912)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep0


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 1) Train Loss: 0.3296 | Val Loss: 0.2390 | ROC AUC: 0.9595
New Top-K Score! (Rank 1)
Current Top-3: (Ep 1: 0.9595), (Ep 0: 0.8912)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep1


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 2) Train Loss: 0.1487 | Val Loss: 0.2041 | ROC AUC: 0.9659
New Top-K Score! (Rank 1)
Current Top-3: (Ep 2: 0.9659), (Ep 1: 0.9595), (Ep 0: 0.8912)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep2


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 3) Train Loss: 0.2547 | Val Loss: 0.2378 | ROC AUC: 0.9525
New Top-K Score! (Rank 3)
Current Top-3: (Ep 2: 0.9659), (Ep 1: 0.9595), (Ep 3: 0.9525)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep3


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 4) Train Loss: 0.2227 | Val Loss: 0.2357 | ROC AUC: 0.9613
New Top-K Score! (Rank 2)
Current Top-3: (Ep 2: 0.9659), (Ep 4: 0.9613), (Ep 1: 0.9595)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep4


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 5) Train Loss: 0.0878 | Val Loss: 0.1953 | ROC AUC: 0.9747
New Top-K Score! (Rank 1)
Current Top-3: (Ep 5: 0.9747), (Ep 2: 0.9659), (Ep 4: 0.9613)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep5


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 6) Train Loss: 0.1604 | Val Loss: 0.2313 | ROC AUC: 0.9687
New Top-K Score! (Rank 2)
Current Top-3: (Ep 5: 0.9747), (Ep 6: 0.9687), (Ep 2: 0.9659)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep6


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 7) Train Loss: 0.1582 | Val Loss: 0.1756 | ROC AUC: 0.9779
New Top-K Score! (Rank 1)
Current Top-3: (Ep 7: 0.9779), (Ep 5: 0.9747), (Ep 6: 0.9687)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep7


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 8) Train Loss: 0.0726 | Val Loss: 0.2017 | ROC AUC: 0.9743
New Top-K Score! (Rank 3)
Current Top-3: (Ep 7: 0.9779), (Ep 5: 0.9747), (Ep 8: 0.9743)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep8


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 9) Train Loss: 0.2820 | Val Loss: 0.1570 | ROC AUC: 0.9792
New Top-K Score! (Rank 1)
Current Top-3: (Ep 9: 0.9792), (Ep 7: 0.9779), (Ep 5: 0.9747)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep9


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 10) Train Loss: 0.1137 | Val Loss: 0.1584 | ROC AUC: 0.9817
New Top-K Score! (Rank 1)
Current Top-3: (Ep 10: 0.9817), (Ep 9: 0.9792), (Ep 7: 0.9779)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep10


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 11) Train Loss: 0.1884 | Val Loss: 0.1618 | ROC AUC: 0.9796
New Top-K Score! (Rank 2)
Current Top-3: (Ep 10: 0.9817), (Ep 11: 0.9796), (Ep 9: 0.9792)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep11


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 12) Train Loss: 0.0867 | Val Loss: 0.1736 | ROC AUC: 0.9778


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 13) Train Loss: 0.1833 | Val Loss: 0.1756 | ROC AUC: 0.9775


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 14) Train Loss: 0.1270 | Val Loss: 0.1677 | ROC AUC: 0.9805
New Top-K Score! (Rank 2)
Current Top-3: (Ep 10: 0.9817), (Ep 14: 0.9805), (Ep 11: 0.9796)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep14


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 15) Train Loss: 0.1691 | Val Loss: 0.1572 | ROC AUC: 0.9802
New Top-K Score! (Rank 3)
Current Top-3: (Ep 10: 0.9817), (Ep 14: 0.9805), (Ep 15: 0.9802)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep15


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 16) Train Loss: 0.1502 | Val Loss: 0.1588 | ROC AUC: 0.9788


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 17) Train Loss: 0.0513 | Val Loss: 0.1528 | ROC AUC: 0.9802
New Top-K Score! (Rank 3)
Current Top-3: (Ep 10: 0.9817), (Ep 14: 0.9805), (Ep 17: 0.9802)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep17


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 18) Train Loss: 0.0692 | Val Loss: 0.1601 | ROC AUC: 0.9797


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 19) Train Loss: 0.0673 | Val Loss: 0.1443 | ROC AUC: 0.9814
New Top-K Score! (Rank 2)
Current Top-3: (Ep 10: 0.9817), (Ep 19: 0.9814), (Ep 14: 0.9805)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep19


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 20) Train Loss: 0.0590 | Val Loss: 0.1500 | ROC AUC: 0.9798


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 21) Train Loss: 0.1030 | Val Loss: 0.1516 | ROC AUC: 0.9795


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 22) Train Loss: 0.0803 | Val Loss: 0.1529 | ROC AUC: 0.9792


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 23) Train Loss: 0.1175 | Val Loss: 0.1549 | ROC AUC: 0.9788


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=25` reached.



(Epoch 24) Train Loss: 0.1130 | Val Loss: 0.1547 | ROC AUC: 0.9789

 Top-3 Models in this Fold:
> Fold2-Ep10-0.9817.ckpt
> Fold2-Ep14-0.9805.ckpt
> Fold2-Ep19-0.9814.ckpt


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
global_step,▁▁▂▂▂▃▃▄▄▄▅▅▆▇▇█
hard_loss_epoch,█▄▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
hard_loss_step,██▂▂▂▃▁▃▃▁▂▂▃▁▁▁▁▁▁▂▁▂▂▁▁▁▁▁▂▁▂▂▁▁▂▁▁▁▁▁
loss_loss_epoch,█▃▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss_loss_step,█▇▅▃▂▁▂▂▃▂▁▂▁▂▁▁▂▁▁▂▁▁▁▁▁▁▂▂▁▁▁▂▂▂▁▁▂▁▁▂
train_loss_epoch,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▃▂▂▂▁▁▂▂▂▁▁▁▁▁▁▁▁▂▂▁▁▁▂▂▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇█
val_loss,█▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,24
global_step,920
hard_loss_epoch,0.05267
hard_loss_step,0.07063
loss_loss_epoch,0.19187
loss_loss_step,0.15537
train_loss_epoch,0.12227
train_loss_step,0.113
trainer/global_step,1149
val_loss,0.15472




Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:881: Checkpoint directory /teamspace/studios/this_studio/data/models/s9_convnext_small_delete_T exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[Fit] Train: 1456, Valid: 365


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 0) Train Loss: 0.6287 | Val Loss: 0.5374 | ROC AUC: 0.9121
New Top-K Score! (Rank 1)
Current Top-3: (Ep 0: 0.9121)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep0


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 1) Train Loss: 0.0857 | Val Loss: 0.1695 | ROC AUC: 0.9811
New Top-K Score! (Rank 1)
Current Top-3: (Ep 1: 0.9811), (Ep 0: 0.9121)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep1


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 2) Train Loss: 0.0878 | Val Loss: 0.1227 | ROC AUC: 0.9887
New Top-K Score! (Rank 1)
Current Top-3: (Ep 2: 0.9887), (Ep 1: 0.9811), (Ep 0: 0.9121)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep2


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 3) Train Loss: 0.3262 | Val Loss: 0.1065 | ROC AUC: 0.9896
New Top-K Score! (Rank 1)
Current Top-3: (Ep 3: 0.9896), (Ep 2: 0.9887), (Ep 1: 0.9811)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep3


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 4) Train Loss: 0.1361 | Val Loss: 0.0952 | ROC AUC: 0.9929
New Top-K Score! (Rank 1)
Current Top-3: (Ep 4: 0.9929), (Ep 3: 0.9896), (Ep 2: 0.9887)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep4


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 5) Train Loss: 0.0528 | Val Loss: 0.0893 | ROC AUC: 0.9933
New Top-K Score! (Rank 1)
Current Top-3: (Ep 5: 0.9933), (Ep 4: 0.9929), (Ep 3: 0.9896)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep5


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 6) Train Loss: 0.2152 | Val Loss: 0.1035 | ROC AUC: 0.9919
New Top-K Score! (Rank 3)
Current Top-3: (Ep 5: 0.9933), (Ep 4: 0.9929), (Ep 6: 0.9919)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep6


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 7) Train Loss: 0.0983 | Val Loss: 0.0966 | ROC AUC: 0.9968
New Top-K Score! (Rank 1)
Current Top-3: (Ep 7: 0.9968), (Ep 5: 0.9933), (Ep 4: 0.9929)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep7


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 8) Train Loss: 0.2274 | Val Loss: 0.0866 | ROC AUC: 0.9936
New Top-K Score! (Rank 2)
Current Top-3: (Ep 7: 0.9968), (Ep 8: 0.9936), (Ep 5: 0.9933)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep8


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 9) Train Loss: 0.1470 | Val Loss: 0.0872 | ROC AUC: 0.9932


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 10) Train Loss: 0.1211 | Val Loss: 0.0948 | ROC AUC: 0.9932


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 11) Train Loss: 0.1469 | Val Loss: 0.0825 | ROC AUC: 0.9970
New Top-K Score! (Rank 1)
Current Top-3: (Ep 11: 0.9970), (Ep 7: 0.9968), (Ep 8: 0.9936)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep11


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 12) Train Loss: 0.1125 | Val Loss: 0.0782 | ROC AUC: 0.9962
New Top-K Score! (Rank 3)
Current Top-3: (Ep 11: 0.9970), (Ep 7: 0.9968), (Ep 12: 0.9962)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep12


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 13) Train Loss: 0.2400 | Val Loss: 0.0757 | ROC AUC: 0.9971
New Top-K Score! (Rank 1)
Current Top-3: (Ep 13: 0.9971), (Ep 11: 0.9970), (Ep 7: 0.9968)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep13


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 14) Train Loss: 0.1009 | Val Loss: 0.0701 | ROC AUC: 0.9974
New Top-K Score! (Rank 1)
Current Top-3: (Ep 14: 0.9974), (Ep 13: 0.9971), (Ep 11: 0.9970)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep14


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 15) Train Loss: 0.2639 | Val Loss: 0.0772 | ROC AUC: 0.9973
New Top-K Score! (Rank 2)
Current Top-3: (Ep 14: 0.9974), (Ep 15: 0.9973), (Ep 13: 0.9971)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep15


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 16) Train Loss: 0.0746 | Val Loss: 0.0784 | ROC AUC: 0.9970


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 17) Train Loss: 0.1876 | Val Loss: 0.0728 | ROC AUC: 0.9978
New Top-K Score! (Rank 1)
Current Top-3: (Ep 17: 0.9978), (Ep 14: 0.9974), (Ep 15: 0.9973)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep17


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 18) Train Loss: 0.1220 | Val Loss: 0.0793 | ROC AUC: 0.9976
New Top-K Score! (Rank 2)
Current Top-3: (Ep 17: 0.9978), (Ep 18: 0.9976), (Ep 14: 0.9974)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep18


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 19) Train Loss: 0.0933 | Val Loss: 0.0730 | ROC AUC: 0.9976
New Top-K Score! (Rank 2)
Current Top-3: (Ep 17: 0.9978), (Ep 19: 0.9976), (Ep 18: 0.9976)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep19


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 20) Train Loss: 0.0529 | Val Loss: 0.0764 | ROC AUC: 0.9976


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 21) Train Loss: 0.1335 | Val Loss: 0.0776 | ROC AUC: 0.9976


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 22) Train Loss: 0.0888 | Val Loss: 0.0740 | ROC AUC: 0.9977
New Top-K Score! (Rank 2)
Current Top-3: (Ep 17: 0.9978), (Ep 22: 0.9977), (Ep 19: 0.9976)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep22


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 23) Train Loss: 0.0860 | Val Loss: 0.0740 | ROC AUC: 0.9976


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=25` reached.



(Epoch 24) Train Loss: 0.1220 | Val Loss: 0.0740 | ROC AUC: 0.9976

 Top-3 Models in this Fold:
> Fold3-Ep17-0.9978.ckpt
> Fold3-Ep19-0.9976.ckpt
> Fold3-Ep22-0.9977.ckpt


0,1
epoch,▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
global_step,▁▁▂▂▂▃▃▃▄▅▅▅▅▆▆▇▇█
hard_loss_epoch,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
hard_loss_step,██▄▄▃▁▂▂▁▁▂▁▂▂▁▁▁▂▁▁▂▁▁▂▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁
loss_loss_epoch,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss_loss_step,█▇▅▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▂▁▂▁▁▂▂▁▁▁
train_loss_epoch,█▃▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▅▅▃▂▃▃▃▃▁▁▃▁▁▂▃▂▂▂▁▂▁▂▁▁▂▂▂▁▁▂▁▂▂▁▂▂▂▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇█
val_loss,█▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,24
global_step,1058
hard_loss_epoch,0.06013
hard_loss_step,0.05989
loss_loss_epoch,0.20132
loss_loss_step,0.18403
train_loss_epoch,0.13072
train_loss_step,0.12196
trainer/global_step,1149
val_loss,0.07395




Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:881: Checkpoint directory /teamspace/studios/this_studio/data/models/s9_convnext_small_delete_T exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[Fit] Train: 1458, Valid: 363


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 0) Train Loss: 0.6250 | Val Loss: 0.5211 | ROC AUC: 0.9150
New Top-K Score! (Rank 1)
Current Top-3: (Ep 0: 0.9150)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep0


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 1) Train Loss: 0.4843 | Val Loss: 0.2140 | ROC AUC: 0.9809
New Top-K Score! (Rank 1)
Current Top-3: (Ep 1: 0.9809), (Ep 0: 0.9150)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep1


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 2) Train Loss: 0.2214 | Val Loss: 0.1844 | ROC AUC: 0.9769
New Top-K Score! (Rank 2)
Current Top-3: (Ep 1: 0.9809), (Ep 2: 0.9769), (Ep 0: 0.9150)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep2


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 3) Train Loss: 0.3648 | Val Loss: 0.1750 | ROC AUC: 0.9851
New Top-K Score! (Rank 1)
Current Top-3: (Ep 3: 0.9851), (Ep 1: 0.9809), (Ep 2: 0.9769)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep3


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 4) Train Loss: 0.0814 | Val Loss: 0.1500 | ROC AUC: 0.9863
New Top-K Score! (Rank 1)
Current Top-3: (Ep 4: 0.9863), (Ep 3: 0.9851), (Ep 1: 0.9809)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep4


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 5) Train Loss: 0.1437 | Val Loss: 0.1465 | ROC AUC: 0.9896
New Top-K Score! (Rank 1)
Current Top-3: (Ep 5: 0.9896), (Ep 4: 0.9863), (Ep 3: 0.9851)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep5


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 6) Train Loss: 0.1118 | Val Loss: 0.1420 | ROC AUC: 0.9872
New Top-K Score! (Rank 2)
Current Top-3: (Ep 5: 0.9896), (Ep 6: 0.9872), (Ep 4: 0.9863)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep6


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 7) Train Loss: 0.1093 | Val Loss: 0.1452 | ROC AUC: 0.9904
New Top-K Score! (Rank 1)
Current Top-3: (Ep 7: 0.9904), (Ep 5: 0.9896), (Ep 6: 0.9872)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep7


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 8) Train Loss: 0.0614 | Val Loss: 0.1483 | ROC AUC: 0.9881
New Top-K Score! (Rank 3)
Current Top-3: (Ep 7: 0.9904), (Ep 5: 0.9896), (Ep 8: 0.9881)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep8


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 9) Train Loss: 0.1401 | Val Loss: 0.1392 | ROC AUC: 0.9899
New Top-K Score! (Rank 2)
Current Top-3: (Ep 7: 0.9904), (Ep 9: 0.9899), (Ep 5: 0.9896)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep9


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 10) Train Loss: 0.1096 | Val Loss: 0.1290 | ROC AUC: 0.9921
New Top-K Score! (Rank 1)
Current Top-3: (Ep 10: 0.9921), (Ep 7: 0.9904), (Ep 9: 0.9899)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep10


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 11) Train Loss: 0.1602 | Val Loss: 0.1474 | ROC AUC: 0.9906
New Top-K Score! (Rank 2)
Current Top-3: (Ep 10: 0.9921), (Ep 11: 0.9906), (Ep 7: 0.9904)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep11


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 12) Train Loss: 0.0781 | Val Loss: 0.1293 | ROC AUC: 0.9909
New Top-K Score! (Rank 2)
Current Top-3: (Ep 10: 0.9921), (Ep 12: 0.9909), (Ep 11: 0.9906)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep12


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 13) Train Loss: 0.1232 | Val Loss: 0.1315 | ROC AUC: 0.9910
New Top-K Score! (Rank 2)
Current Top-3: (Ep 10: 0.9921), (Ep 13: 0.9910), (Ep 12: 0.9909)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep13


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 14) Train Loss: 0.0822 | Val Loss: 0.1348 | ROC AUC: 0.9907


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 15) Train Loss: 0.1280 | Val Loss: 0.1379 | ROC AUC: 0.9907


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 16) Train Loss: 0.1230 | Val Loss: 0.1337 | ROC AUC: 0.9916
New Top-K Score! (Rank 2)
Current Top-3: (Ep 10: 0.9921), (Ep 16: 0.9916), (Ep 13: 0.9910)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep16


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 17) Train Loss: 0.0548 | Val Loss: 0.1327 | ROC AUC: 0.9917
New Top-K Score! (Rank 2)
Current Top-3: (Ep 10: 0.9921), (Ep 17: 0.9917), (Ep 16: 0.9916)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep17


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 18) Train Loss: 0.1090 | Val Loss: 0.1350 | ROC AUC: 0.9915


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 19) Train Loss: 0.1249 | Val Loss: 0.1351 | ROC AUC: 0.9918
New Top-K Score! (Rank 2)
Current Top-3: (Ep 10: 0.9921), (Ep 19: 0.9918), (Ep 17: 0.9917)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep19


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 20) Train Loss: 0.0545 | Val Loss: 0.1335 | ROC AUC: 0.9923
New Top-K Score! (Rank 1)
Current Top-3: (Ep 20: 0.9923), (Ep 10: 0.9921), (Ep 19: 0.9918)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep20


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 21) Train Loss: 0.1405 | Val Loss: 0.1345 | ROC AUC: 0.9921
New Top-K Score! (Rank 3)
Current Top-3: (Ep 20: 0.9923), (Ep 10: 0.9921), (Ep 21: 0.9921)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep21


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 22) Train Loss: 0.1266 | Val Loss: 0.1310 | ROC AUC: 0.9923
New Top-K Score! (Rank 1)
Current Top-3: (Ep 22: 0.9923), (Ep 20: 0.9923), (Ep 10: 0.9921)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep22


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 23) Train Loss: 0.3741 | Val Loss: 0.1310 | ROC AUC: 0.9924
New Top-K Score! (Rank 1)
Current Top-3: (Ep 23: 0.9924), (Ep 22: 0.9923), (Ep 20: 0.9923)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep23


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 24) Train Loss: 0.1086 | Val Loss: 0.1309 | ROC AUC: 0.9923
New Top-K Score! (Rank 2)
Current Top-3: (Ep 23: 0.9924), (Ep 24: 0.9923), (Ep 22: 0.9923)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep24


`Trainer.fit` stopped: `max_epochs=25` reached.



 Top-3 Models in this Fold:
> Fold4-Ep22-0.9923.ckpt
> Fold4-Ep23-0.9924.ckpt
> Fold4-Ep24-0.9923.ckpt


0,1
epoch,▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇████
global_step,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▆▆▇▇▇▇██
hard_loss_epoch,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
hard_loss_step,█▇▆▄▆▆▆▂▂▁▁▂▃▄▂▂▁▂▄▁▂▁▂▁▃▂▁▂▁▁▁▁▂▁▁▁▁▁▁▂
loss_loss_epoch,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss_loss_step,█▄▃▁▂▁▁▁▁▃▂▃▂▂▂▂▂▂▂▂▂▂▁▂▂▁▂▂▂▂▁▂▂▂▂▂▂▁▂▁
train_loss_epoch,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▅▃▁▂▂▂▁▃▁▁▁▁▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
val_loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,24
global_step,1150
hard_loss_epoch,0.05388
hard_loss_step,0.01655
loss_loss_epoch,0.19175
loss_loss_step,0.20059
train_loss_epoch,0.12282
train_loss_step,0.10857
trainer/global_step,1149
val_loss,0.13087




Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:881: Checkpoint directory /teamspace/studios/this_studio/data/models/s9_convnext_small_delete_T exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[Fit] Train: 1456, Valid: 365


/home/zeus/miniconda3/envs/cloudspace/lib/python3.12/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:242: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 0) Train Loss: 1.0700 | Val Loss: 0.6078 | ROC AUC: 0.8846
New Top-K Score! (Rank 1)
Current Top-3: (Ep 0: 0.8846)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep0


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 1) Train Loss: 0.3096 | Val Loss: 0.1565 | ROC AUC: 0.9612
New Top-K Score! (Rank 1)
Current Top-3: (Ep 1: 0.9612), (Ep 0: 0.8846)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep1


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 2) Train Loss: 0.1939 | Val Loss: 0.1301 | ROC AUC: 0.9750
New Top-K Score! (Rank 1)
Current Top-3: (Ep 2: 0.9750), (Ep 1: 0.9612), (Ep 0: 0.8846)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep2


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 3) Train Loss: 0.3858 | Val Loss: 0.1290 | ROC AUC: 0.9807
New Top-K Score! (Rank 1)
Current Top-3: (Ep 3: 0.9807), (Ep 2: 0.9750), (Ep 1: 0.9612)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep3


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 4) Train Loss: 0.1404 | Val Loss: 0.1067 | ROC AUC: 0.9834
New Top-K Score! (Rank 1)
Current Top-3: (Ep 4: 0.9834), (Ep 3: 0.9807), (Ep 2: 0.9750)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep4


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 5) Train Loss: 0.1716 | Val Loss: 0.1273 | ROC AUC: 0.9691


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 6) Train Loss: 0.0492 | Val Loss: 0.1036 | ROC AUC: 0.9744


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 7) Train Loss: 0.1106 | Val Loss: 0.0949 | ROC AUC: 0.9763
New Top-K Score! (Rank 3)
Current Top-3: (Ep 4: 0.9834), (Ep 3: 0.9807), (Ep 7: 0.9763)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep7


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 8) Train Loss: 0.1092 | Val Loss: 0.1018 | ROC AUC: 0.9721


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 9) Train Loss: 0.1387 | Val Loss: 0.0866 | ROC AUC: 0.9808
New Top-K Score! (Rank 2)
Current Top-3: (Ep 4: 0.9834), (Ep 9: 0.9808), (Ep 3: 0.9807)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep9


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 10) Train Loss: 0.2212 | Val Loss: 0.0963 | ROC AUC: 0.9783


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 11) Train Loss: 0.1549 | Val Loss: 0.0946 | ROC AUC: 0.9758


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 12) Train Loss: 0.0994 | Val Loss: 0.0941 | ROC AUC: 0.9768


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 13) Train Loss: 0.2141 | Val Loss: 0.0908 | ROC AUC: 0.9805


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 14) Train Loss: 0.0536 | Val Loss: 0.0899 | ROC AUC: 0.9825
New Top-K Score! (Rank 2)
Current Top-3: (Ep 4: 0.9834), (Ep 14: 0.9825), (Ep 9: 0.9808)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep14


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 15) Train Loss: 0.1769 | Val Loss: 0.0805 | ROC AUC: 0.9842
New Top-K Score! (Rank 1)
Current Top-3: (Ep 15: 0.9842), (Ep 4: 0.9834), (Ep 14: 0.9825)
Confusion Matrix saved to WandB key: Confusion_Matrix_Ep15


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 16) Train Loss: 0.1668 | Val Loss: 0.0843 | ROC AUC: 0.9806


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 17) Train Loss: 0.0735 | Val Loss: 0.0850 | ROC AUC: 0.9812


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 18) Train Loss: 0.1092 | Val Loss: 0.0825 | ROC AUC: 0.9813


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 19) Train Loss: 0.1069 | Val Loss: 0.0804 | ROC AUC: 0.9814


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 20) Train Loss: 0.1041 | Val Loss: 0.0789 | ROC AUC: 0.9812


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 21) Train Loss: 0.0779 | Val Loss: 0.0772 | ROC AUC: 0.9811


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 22) Train Loss: 0.1132 | Val Loss: 0.0797 | ROC AUC: 0.9808


Validation: |          | 0/? [00:00<?, ?it/s]


(Epoch 23) Train Loss: 0.1656 | Val Loss: 0.0793 | ROC AUC: 0.9808


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=25` reached.



(Epoch 24) Train Loss: 0.2362 | Val Loss: 0.0793 | ROC AUC: 0.9807

 Top-3 Models in this Fold:
> Fold5-Ep04-0.9834.ckpt
> Fold5-Ep14-0.9825.ckpt
> Fold5-Ep15-0.9842.ckpt


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
global_step,▁▁▂▂▃▄▅██
hard_loss_epoch,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
hard_loss_step,██▅▄▄▂▂▂▁▂▂▁▁▃▂▂▂▁▁▁▂▁▁▁▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁
loss_loss_epoch,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss_loss_step,██▇▃▃▂▂▂▃▂▁▂▂▃▂▁▁▂▁▃▂▂▂▂▂▁▁▂▁▂▁▂▂▂▂▁▂▁▂▂
train_loss_epoch,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▃▂▁▂▂▂▁▁▂▁▁▁▁▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▇▇▇▇█████
val_loss,█▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,24
global_step,736
hard_loss_epoch,0.05955
hard_loss_step,0.17253
loss_loss_epoch,0.20431
loss_loss_step,0.29995
train_loss_epoch,0.13193
train_loss_step,0.23624
trainer/global_step,1149
val_loss,0.07933


CPU times: user 1h 25min 52s, sys: 15min 36s, total: 1h 41min 29s
Wall time: 1h 41min 10s


# 6. Inference & Save

In [17]:
%%time
# Run inference across all folds and collect three results: OOF predictions,
# OOF logits, and the final test-set predictions.
# NOTE(review): judging from the logged output, each fold merges its top-k
# checkpoints into one averaged model, predicts on its validation split and
# the test set, and searches a temperature ("Best T") - confirm against
# the run_inference implementation.
oof_preds, oof_preds_logit, final_preds = runner.run_inference()

=== Inference Fold 1 ===
Merging Top-K Models for Fold 1 ...
Found 3 score models : ['Fold1-Ep19-0.9987.ckpt', 'Fold1-Ep18-0.9986.ckpt', 'Fold1-Ep20-0.9984.ckpt']


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Save Avg Model :  ../data/models/s9_convnext_small_delete_T/best_score_model_1.pth
Skipping BN update for LayerNorm
[Test] Valid(OOF): 368, Test: 1821


Predicting: |          | 0/? [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Max: 6.1875, Min: -3.990234375
    > Best T: 0.7


Predicting: |          | 0/? [00:00<?, ?it/s]

=== Inference Fold 2 ===
Merging Top-K Models for Fold 2 ...
Found 3 score models : ['Fold2-Ep14-0.9805.ckpt', 'Fold2-Ep10-0.9817.ckpt', 'Fold2-Ep19-0.9814.ckpt']


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Save Avg Model :  ../data/models/s9_convnext_small_delete_T/best_score_model_2.pth
Skipping BN update for LayerNorm
[Test] Valid(OOF): 360, Test: 1821


Predicting: |          | 0/? [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Max: 6.46484375, Min: -4.0234375
    > Best T: 1.0


Predicting: |          | 0/? [00:00<?, ?it/s]

=== Inference Fold 3 ===
Merging Top-K Models for Fold 3 ...
Found 3 score models : ['Fold3-Ep22-0.9977.ckpt', 'Fold3-Ep19-0.9976.ckpt', 'Fold3-Ep17-0.9978.ckpt']


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Save Avg Model :  ../data/models/s9_convnext_small_delete_T/best_score_model_3.pth
Skipping BN update for LayerNorm
[Test] Valid(OOF): 365, Test: 1821


Predicting: |          | 0/? [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Max: 6.359375, Min: -4.25
    > Best T: 0.8


Predicting: |          | 0/? [00:00<?, ?it/s]

=== Inference Fold 4 ===
Merging Top-K Models for Fold 4 ...
Found 3 score models : ['Fold4-Ep24-0.9923.ckpt', 'Fold4-Ep23-0.9924.ckpt', 'Fold4-Ep22-0.9923.ckpt']


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Save Avg Model :  ../data/models/s9_convnext_small_delete_T/best_score_model_4.pth
Skipping BN update for LayerNorm
[Test] Valid(OOF): 363, Test: 1821


Predicting: |          | 0/? [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Max: 6.26953125, Min: -4.0625
    > Best T: 0.9


Predicting: |          | 0/? [00:00<?, ?it/s]

=== Inference Fold 5 ===
Merging Top-K Models for Fold 5 ...
Found 3 score models : ['Fold5-Ep14-0.9825.ckpt', 'Fold5-Ep15-0.9842.ckpt', 'Fold5-Ep04-0.9834.ckpt']


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Save Avg Model :  ../data/models/s9_convnext_small_delete_T/best_score_model_5.pth
Skipping BN update for LayerNorm
[Test] Valid(OOF): 365, Test: 1821


Predicting: |          | 0/? [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Max: 6.58984375, Min: -4.60546875
    > Best T: 0.8


Predicting: |          | 0/? [00:00<?, ?it/s]


>>> Final OOF ROC AUC : 0.99328
CPU times: user 5min 43s, sys: 1min 57s, total: 7min 40s
Wall time: 7min 39s


In [19]:
# Persist this experiment's predictions through the runner's backup handler.
# NOTE(review): oof_preds / final_preds are assumed to carry one column per
# entry of hard_cols (same order) with rows aligned to train_df / submission
# - confirm against run_inference.

# Out-of-fold predictions, keyed by image_id.
result_oof = train_df[['image_id']].copy()
result_oof[hard_cols] = oof_preds
runner.backup_handler.save_file(result_oof, f'oof_preds_{CFG.exp_name}.csv')

# OOF logits. The file name used to read 'oof_ogit_' (typo for "logit");
# any consumer that still loads the old name must be pointed at 'oof_logit_'.
runner.backup_handler.save_file(oof_preds_logit, f'oof_logit_{CFG.exp_name}.npy', logit=True)

# Test-set predictions, keyed by image_id, saved as the submission CSV.
result_sub = submission[['image_id']].copy()
result_sub[hard_cols] = final_preds
runner.backup_handler.save_file(result_sub, f'submission_{CFG.exp_name}.csv')

CSV saved at ../data/models/s9_convnext_small_delete_T/oof_preds_s9_convnext_small_delete_T.csv
Logit saved at ../data/models/s9_convnext_small_delete_T/oof_ogit_s9_convnext_small_delete_T.npy
CSV saved at ../data/models/s9_convnext_small_delete_T/submission_s9_convnext_small_delete_T.csv


In [20]:
# Sanity check: render the first rows of the OOF frame, then of the submission frame.
display(result_oof.head(), result_sub.head())

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Train_0,0.000551,0.001286,4.1e-05,0.998047
1,Train_1,0.007912,0.881836,0.096497,0.013985
2,Train_2,0.994141,0.002453,0.000934,0.002367
3,Train_3,1.4e-05,0.000261,0.999512,1.7e-05
4,Train_4,1.0,7.6e-05,0.000113,4.6e-05


Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Test_0,7.3e-05,0.003409,0.996387,5.9e-05
1,Test_1,9.5e-05,0.012188,0.987695,8.1e-05
2,Test_2,0.000705,0.000884,5.8e-05,0.998242
3,Test_3,0.99873,7.9e-05,0.000814,0.00038
4,Test_4,4.3e-05,0.002671,0.997168,5.7e-05


In [21]:
# Resolve where the submission CSV was written so the upload cell can reference it.
submission_name = f'submission_{CFG.exp_name}.csv'
sub_file = os.path.join(runner.backup_handler.local_dir, submission_name)
print(sub_file)

../data/models/s9_convnext_small_delete_T/submission_s9_convnext_small_delete_T.csv


## 6.1 Submission

In [22]:
os.environ['KAGGLE_CONFIG_DIR'] = "/teamspace/studios/this_studio/"
!kaggle competitions submit -c plant-pathology-2020-fgvc7 -f {sub_file} -m "{CFG.exp_name}"

100%|█████████████████████████████████████████| 153k/153k [00:00<00:00, 843kB/s]
Successfully submitted to Plant Pathology 2020 - FGVC7