In [1]:
#!/usr/bin/env python3
"""
Improved FER Training Pipeline with:
 - Enhanced regularization (higher dropout, weight decay)
 - Advanced data augmentation (rotation, color jitter, mixup)
 - Focal loss for class imbalance
 - Gradual unfreezing strategy
 - Better learning rate schedule
 - Test-Time Augmentation support
"""

import os
import random
from pathlib import Path
from collections import Counter
from typing import Tuple, List

import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

from torchvision import transforms, models
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm


# ============================================================
#                  1. Focal Loss for Imbalanced Data
# ============================================================

class FocalLoss(nn.Module):
    """
    Focal Loss for handling class imbalance.
    Focuses training on hard examples.
    """
    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.alpha)
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma * ce_loss)
        
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss


# ============================================================
#                  2. Mixup Data Augmentation
# ============================================================

def mixup_data(x, y, alpha=0.2, device='cuda'):
    """
    Mixup augmentation: interpolates between two samples.
    Helps improve generalization.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


# ============================================================
#                  3. Data Collection
# ============================================================

def gather_image_paths_and_labels(root_dir: str) -> Tuple[List[str], List[int], List[str]]:
    """
    Collect all images from emotion class subdirectories.
    Handles both structures:
      - root/emotion/*.jpg
      - root/train/emotion/*.jpg and root/test/emotion/*.jpg
    
    Returns:
        image_paths: List of file paths
        labels: List of integer labels
        class_names: List of class names (sorted)
    """
    root = Path(root_dir)
    if not root.exists():
        raise FileNotFoundError(f"Dataset root not found: {root_dir}")

    # Collect all images with their parent directory (emotion class)
    image_data = []
    exts = {".jpg", ".jpeg", ".png", ".bmp"}
    
    for img_path in root.rglob("*"):
        if img_path.suffix.lower() in exts:
            # Get emotion class (parent directory name)
            emotion_class = img_path.parent.name
            # Skip if parent is 'train' or 'test' - go up one more level
            if emotion_class in ['train', 'test']:
                emotion_class = img_path.parent.parent.name
            
            image_data.append((str(img_path), emotion_class))
    
    if not image_data:
        raise ValueError(f"No images found under {root_dir}")
    
    # Create class mapping
    unique_classes = sorted(set(emotion for _, emotion in image_data))
    class_to_idx = {name: idx for idx, name in enumerate(unique_classes)}
    
    # Convert to lists
    image_paths = [path for path, _ in image_data]
    labels = [class_to_idx[emotion] for _, emotion in image_data]
    
    return image_paths, labels, unique_classes


# ============================================================
#                  4. Pre-cached Dataset
# ============================================================

class PreCachedImageDataset(Dataset):
    """
    Ultra-fast dataset that stores pre-processed tensors in memory.
    For training: applies augmentation on cached tensors.
    For validation: returns tensors directly (no augmentation).
    """
    def __init__(self, image_paths: List[str], labels: List[int], 
                 transform=None, cache_images: bool = True, img_size: int = 224,
                 is_train: bool = True):
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.transform = transform
        self.is_train = is_train
        self.cached_tensors = []
        
        if cache_images:
            print(f"Pre-loading and processing {len(image_paths)} images...")
            
            # Pre-process transform (applied once during caching)
            imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
            imagenet_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
            
            resize_transform = transforms.Compose([
                transforms.Resize(img_size),
                transforms.ToTensor(),
            ])
            
            for path in tqdm(image_paths, desc="Caching"):
                try:
                    img = Image.open(path).convert("RGB")
                    img_tensor = resize_transform(img)
                    
                    # For validation, pre-normalize to avoid doing it on every access
                    if not is_train:
                        img_tensor = (img_tensor - imagenet_mean) / imagenet_std
                    
                    self.cached_tensors.append(img_tensor)
                except Exception as e:
                    print(f"Warning: Failed to load {path}: {e}")
                    blank = torch.zeros(3, img_size, img_size)
                    if not is_train:
                        blank = (blank - imagenet_mean) / imagenet_std
                    self.cached_tensors.append(blank)
        else:
            self.cached_tensors = None
            self.image_paths = image_paths

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if self.cached_tensors is not None:
            img = self.cached_tensors[idx]
            
            # For training, apply augmentation
            if self.is_train and self.transform:
                img = self.transform(img)
            
        else:
            img = Image.open(self.image_paths[idx]).convert("RGB")
            if self.transform:
                img = self.transform(img)
        
        return img, self.labels[idx]


# ============================================================
#                  5. Enhanced Transforms
# ============================================================

def get_augmentation_transforms() -> Tuple[transforms.Compose, transforms.Compose]:
    """
    Returns enhanced augmentation transforms for better generalization.
    """
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]
    
    # Train: aggressive augmentation
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(15),  # Add rotation
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # Color variation
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),  # Translation & scale
        transforms.Normalize(imagenet_mean, imagenet_std),
        transforms.RandomErasing(p=0.2, scale=(0.02, 0.15)),  # Random erasing
    ])
    
    # Val: just normalize
    val_transform = transforms.Compose([
        transforms.Normalize(imagenet_mean, imagenet_std),
    ])
    
    return train_transform, val_transform


# ============================================================
#                  6. Improved Model Builder
# ============================================================

def build_model(model_name: str = 'resnet34', num_classes: int = 7, 
                dropout_rate: float = 0.6, freeze_backbone: bool = False) -> nn.Module:
    """
    Build ResNet-based classifier with enhanced regularization.
    """
    if model_name == 'resnet18':
        weights = models.ResNet18_Weights.IMAGENET1K_V1
        model = models.resnet18(weights=weights)
    elif model_name == 'resnet34':
        weights = models.ResNet34_Weights.IMAGENET1K_V1
        model = models.resnet34(weights=weights)
    elif model_name == 'resnet50':
        weights = models.ResNet50_Weights.IMAGENET1K_V1
        model = models.resnet50(weights=weights)
    else:
        raise ValueError(f"Unsupported model: {model_name}")

    # Replace classifier head with higher dropout
    in_features = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Linear(in_features, 512),
        nn.BatchNorm1d(512),
        nn.ReLU(inplace=True),
        nn.Dropout(dropout_rate),  # Increased dropout
        nn.Linear(512, 256),
        nn.BatchNorm1d(256),
        nn.ReLU(inplace=True),
        nn.Dropout(dropout_rate),  # Increased dropout
        nn.Linear(256, num_classes)
    )

    # Freeze backbone if requested
    if freeze_backbone:
        for name, param in model.named_parameters():
            if "fc" not in name:
                param.requires_grad = False

    return model


# ============================================================
#                  7. Training Functions with Mixup
# ============================================================

def train_one_epoch(model: nn.Module, loader: DataLoader, criterion: nn.Module,
                   optimizer: optim.Optimizer, device: torch.device, epoch: int,
                   use_mixup: bool = True, mixup_alpha: float = 0.2) -> Tuple[float, float]:
    """Train for one epoch with optional mixup augmentation"""
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    pbar = tqdm(loader, desc=f"Epoch {epoch} [Train]", leave=False)

    for images, labels in pbar:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Apply mixup augmentation
        if use_mixup and random.random() < 0.5:  # Apply mixup 50% of the time
            mixed_images, labels_a, labels_b, lam = mixup_data(images, labels, mixup_alpha, device)
            
            optimizer.zero_grad(set_to_none=True)
            outputs = model(mixed_images)
            loss = lam * criterion(outputs, labels_a) + (1 - lam) * criterion(outputs, labels_b)
            loss.backward()
            optimizer.step()
            
            # For accuracy, use original labels
            running_loss += loss.item() * images.size(0)
            preds = outputs.argmax(dim=1)
            correct += (lam * (preds == labels_a).sum().item() + (1 - lam) * (preds == labels_b).sum().item())
            total += images.size(0)
        else:
            optimizer.zero_grad(set_to_none=True)
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * images.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += images.size(0)

        # Update progress bar
        if total % (images.size(0) * 5) == 0:
            avg_loss = running_loss / total
            avg_acc = 100.0 * correct / total
            pbar.set_postfix({"loss": f"{avg_loss:.4f}", "acc": f"{avg_acc:.2f}%"})

    avg_loss = running_loss / total
    avg_acc = 100.0 * correct / total
    return avg_loss, avg_acc


def evaluate(model: nn.Module, loader: DataLoader, criterion: nn.Module,
            device: torch.device, epoch: int, use_tta: bool = False) -> Tuple[float, float]:
    """Evaluate on validation set with optional Test-Time Augmentation"""
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    pbar = tqdm(loader, desc=f"Epoch {epoch} [Val]", leave=False)

    with torch.no_grad():
        for images, labels in pbar:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            if use_tta:
                # Test-Time Augmentation: average predictions over augmented versions
                outputs_list = []
                outputs_list.append(model(images))
                
                # Horizontal flip
                outputs_list.append(model(torch.flip(images, dims=[3])))
                
                outputs = torch.stack(outputs_list).mean(dim=0)
            else:
                outputs = model(images)
            
            loss = criterion(outputs, labels)

            running_loss += loss.item() * images.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += images.size(0)

            # Update progress bar
            avg_loss = running_loss / total
            avg_acc = 100.0 * correct / total
            pbar.set_postfix({"loss": f"{avg_loss:.4f}", "acc": f"{avg_acc:.2f}%"})

    return avg_loss, avg_acc


# ============================================================
#                  8. Gradual Unfreezing Training
# ============================================================

def train_with_gradual_unfreezing(model, train_loader, val_loader, criterion, 
                                  device, CONFIG, class_names):
    """
    Three-phase training:
    1. Train only classifier head
    2. Unfreeze last layer of backbone
    3. Fine-tune entire model
    """
    best_val_acc = 0.0
    best_epoch = 0
    patience_counter = 0
    current_epoch = 0
    
    print("\n" + "="*60)
    print("PHASE 1: Training classifier head only (5 epochs)")
    print("="*60)
    
    # Phase 1: Freeze backbone, train classifier
    for param in model.parameters():
        param.requires_grad = False
    for param in model.fc.parameters():
        param.requires_grad = True
    
    optimizer = optim.AdamW(
        model.fc.parameters(),
        lr=CONFIG['lr'],
        weight_decay=CONFIG['weight_decay']
    )
    scheduler = CosineAnnealingLR(optimizer, T_max=5, eta_min=1e-6)
    
    for epoch in range(1, 6):
        current_epoch += 1
        train_loss, train_acc = train_one_epoch(
            model, train_loader, criterion, optimizer, device, current_epoch,
            use_mixup=CONFIG['use_mixup'], mixup_alpha=CONFIG['mixup_alpha']
        )
        val_loss, val_acc = evaluate(model, val_loader, criterion, device, current_epoch)
        
        print(f"\n[Epoch {current_epoch}]")
        print(f"  Train -> Loss: {train_loss:.4f}  Acc: {train_acc:.2f}%")
        print(f"  Val   -> Loss: {val_loss:.4f}  Acc: {val_acc:.2f}%")
        
        scheduler.step()
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_epoch = current_epoch
            save_checkpoint(model, optimizer, current_epoch, val_acc, val_loss, 
                          class_names, CONFIG)
    
    print("\n" + "="*60)
    print("PHASE 2: Unfreezing last backbone layer (10 epochs)")
    print("="*60)
    
    # Phase 2: Unfreeze layer4
    for param in model.layer4.parameters():
        param.requires_grad = True
    
    optimizer = optim.AdamW([
        {'params': model.layer4.parameters(), 'lr': CONFIG['lr'] / 10},
        {'params': model.fc.parameters(), 'lr': CONFIG['lr'] / 2}
    ], weight_decay=CONFIG['weight_decay'])
    scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)
    
    for epoch in range(1, 11):
        current_epoch += 1
        train_loss, train_acc = train_one_epoch(
            model, train_loader, criterion, optimizer, device, current_epoch,
            use_mixup=CONFIG['use_mixup'], mixup_alpha=CONFIG['mixup_alpha']
        )
        val_loss, val_acc = evaluate(model, val_loader, criterion, device, current_epoch)
        
        print(f"\n[Epoch {current_epoch}]")
        print(f"  Train -> Loss: {train_loss:.4f}  Acc: {train_acc:.2f}%")
        print(f"  Val   -> Loss: {val_loss:.4f}  Acc: {val_acc:.2f}%")
        
        scheduler.step()
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_epoch = current_epoch
            patience_counter = 0
            save_checkpoint(model, optimizer, current_epoch, val_acc, val_loss,
                          class_names, CONFIG)
            print(f"  ✓ New best model saved! (Val Acc: {val_acc:.2f}%)")
        else:
            patience_counter += 1
    
    print("\n" + "="*60)
    print("PHASE 3: Fine-tuning entire model")
    print("="*60)
    
    # Phase 3: Unfreeze everything
    for param in model.parameters():
        param.requires_grad = True
    
    optimizer = optim.AdamW(
        model.parameters(),
        lr=CONFIG['lr'] / 20,
        weight_decay=CONFIG['weight_decay']
    )
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
    
    remaining_epochs = CONFIG['epochs'] - current_epoch
    for epoch in range(1, remaining_epochs + 1):
        current_epoch += 1
        train_loss, train_acc = train_one_epoch(
            model, train_loader, criterion, optimizer, device, current_epoch,
            use_mixup=CONFIG['use_mixup'], mixup_alpha=CONFIG['mixup_alpha']
        )
        val_loss, val_acc = evaluate(
            model, val_loader, criterion, device, current_epoch,
            use_tta=CONFIG['use_tta']
        )
        
        print(f"\n[Epoch {current_epoch}/{CONFIG['epochs']}]")
        print(f"  Train -> Loss: {train_loss:.4f}  Acc: {train_acc:.2f}%")
        print(f"  Val   -> Loss: {val_loss:.4f}  Acc: {val_acc:.2f}%")
        
        scheduler.step(val_loss)
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_epoch = current_epoch
            patience_counter = 0
            save_checkpoint(model, optimizer, current_epoch, val_acc, val_loss,
                          class_names, CONFIG)
            print(f"  ✓ New best model saved! (Val Acc: {val_acc:.2f}%)")
        else:
            patience_counter += 1
            print(f"  No improvement ({patience_counter}/{CONFIG['patience']})")
            
            if patience_counter >= CONFIG['patience']:
                print(f"\nEarly stopping triggered")
                break
    
    return best_val_acc, best_epoch


def save_checkpoint(model, optimizer, epoch, val_acc, val_loss, class_names, CONFIG):
    """Save model checkpoint"""
    best_path = os.path.join(CONFIG['save_dir'], "best_model.pth")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'val_acc': val_acc,
        'val_loss': val_loss,
        'class_names': class_names,
        'config': CONFIG
    }, best_path)


# ============================================================
#                  9. Main Training Pipeline
# ============================================================

def main():
    # ============= Configuration =============
    CONFIG = {
        'data_root': 'C:/adam/AMIT_Diploma/grad_project/archive (1)',
        'model_name': 'resnet34',  # Changed to resnet34 for more capacity
        'img_size': 224,
        'batch_size': 256,
        'epochs': 50,
        'lr': 3e-4,  # Reduced initial LR
        'weight_decay': 5e-4,  # Increased weight decay
        'dropout': 0.6,  # Increased dropout
        'freeze_backbone': False,
        'val_size': 0.15,
        'random_seed': 42,
        'num_workers': 0,
        'patience': 10,  # Increased patience
        'use_class_weights': True,
        'use_focal_loss': True,  # Use focal loss
        'focal_gamma': 2.0,
        'use_mixup': True,  # Use mixup augmentation
        'mixup_alpha': 0.2,
        'use_tta': True,  # Use test-time augmentation for validation
        'use_gradual_unfreezing': True,  # Use gradual unfreezing strategy
        'save_dir': './checkpoints',
        'cache_images': True,
    }

    # ============= Setup =============
    random.seed(CONFIG['random_seed'])
    np.random.seed(CONFIG['random_seed'])
    torch.manual_seed(CONFIG['random_seed'])
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(CONFIG['random_seed'])
        torch.backends.cudnn.benchmark = True

    os.makedirs(CONFIG['save_dir'], exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # ============= Load Data =============
    print("\n" + "="*60)
    print("Loading dataset...")
    print("="*60)
    
    image_paths, labels, class_names = gather_image_paths_and_labels(CONFIG['data_root'])
    num_classes = len(class_names)
    
    print(f"\nDataset Statistics:")
    print(f"  Total images: {len(image_paths)}")
    print(f"  Classes ({num_classes}): {class_names}")
    
    # Class distribution
    label_counts = Counter(labels)
    print(f"\n  Class distribution:")
    for cls_idx, cls_name in enumerate(class_names):
        print(f"    {cls_name}: {label_counts[cls_idx]}")

    # ============= Stratified Split =============
    print(f"\nCreating stratified split (val_size={CONFIG['val_size']})...")
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=CONFIG['val_size'],
                                random_state=CONFIG['random_seed'])
    indices = np.arange(len(labels))
    train_idx, val_idx = next(sss.split(indices, labels))

    train_paths = [image_paths[i] for i in train_idx]
    train_labels_list = [labels[i] for i in train_idx]
    val_paths = [image_paths[i] for i in val_idx]
    val_labels_list = [labels[i] for i in val_idx]

    print(f"  Train samples: {len(train_paths)}")
    print(f"  Val samples: {len(val_paths)}")

    # ============= Create Datasets =============
    train_tf, val_tf = get_augmentation_transforms()
    
    train_dataset = PreCachedImageDataset(
        train_paths, train_labels_list, 
        transform=train_tf, 
        cache_images=CONFIG['cache_images'],
        img_size=CONFIG['img_size'],
        is_train=True
    )
    val_dataset = PreCachedImageDataset(
        val_paths, val_labels_list,
        transform=None,
        cache_images=CONFIG['cache_images'],
        img_size=CONFIG['img_size'],
        is_train=False
    )

    train_loader = DataLoader(
        train_dataset, 
        batch_size=CONFIG['batch_size'],
        shuffle=True,
        num_workers=0,
        pin_memory=True if torch.cuda.is_available() else False,
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        num_workers=0,
        pin_memory=True if torch.cuda.is_available() else False,
    )

    # ============= Build Model =============
    print(f"\n{'='*60}")
    print("Building model...")
    print("="*60)
    
    model = build_model(
        model_name=CONFIG['model_name'],
        num_classes=num_classes,
        dropout_rate=CONFIG['dropout'],
        freeze_backbone=CONFIG['freeze_backbone']
    ).to(device)
    
    print(f"  Model: {CONFIG['model_name']}")
    print(f"  Freeze backbone: {CONFIG['freeze_backbone']}")
    print(f"  Dropout: {CONFIG['dropout']}")
    print(f"  Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # ============= Loss & Optimizer =============
    if CONFIG['use_class_weights']:
        train_counts = np.bincount(train_labels_list, minlength=num_classes)
        class_weights = 1.0 / (train_counts + 1e-6)
        class_weights = class_weights * (len(train_labels_list) / class_weights.sum())
        class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
        print(f"\n  Using class weights: {class_weights.cpu().numpy()}")
    else:
        class_weights = None
    
    if CONFIG['use_focal_loss']:
        criterion = FocalLoss(alpha=class_weights, gamma=CONFIG['focal_gamma'])
        print(f"  Using Focal Loss (gamma={CONFIG['focal_gamma']})")
    else:
        criterion = nn.CrossEntropyLoss(weight=class_weights)
        print(f"  Using Cross Entropy Loss")

    # ============= Training =============
    print(f"\n{'='*60}")
    print("Starting training...")
    print("="*60)
    
    if CONFIG['use_gradual_unfreezing']:
        best_val_acc, best_epoch = train_with_gradual_unfreezing(
            model, train_loader, val_loader, criterion, device, CONFIG, class_names
        )
    else:
        # Standard training loop would go here
        pass

    # ============= Training Complete =============
    print(f"\n{'='*60}")
    print("Training Complete!")
    print("="*60)
    print(f"  Best Val Acc: {best_val_acc:.2f}% (Epoch {best_epoch})")
    print(f"  Model saved to: {CONFIG['save_dir']}/best_model.pth")


if __name__ == "__main__":
    main()

Using device: cpu

Loading dataset...

Dataset Statistics:
  Total images: 35887
  Classes (7): ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']

  Class distribution:
    angry: 4953
    disgusted: 547
    fearful: 5121
    happy: 8989
    neutral: 6198
    sad: 6077
    surprised: 4002

Creating stratified split (val_size=0.15)...
  Train samples: 30503
  Val samples: 5384
Pre-loading and processing 30503 images...


Caching: 100%|██████████| 30503/30503 [04:12<00:00, 120.59it/s]


Pre-loading and processing 5384 images...


Caching: 100%|██████████| 5384/5384 [00:52<00:00, 103.11it/s]



Building model...
  Model: resnet34
  Freeze backbone: False
  Dropout: 0.6
  Trainable params: 21,681,991

  Using class weights: [ 2114.7783 19146.703   2045.306   1165.3425  1690.0564  1723.7594
  2617.0537]
  Using Focal Loss (gamma=2.0)

Starting training...

PHASE 1: Training classifier head only (5 epochs)


                                                                                              


[Epoch 1]
  Train -> Loss: 4002.4163  Acc: 19.48%
  Val   -> Loss: 3621.9444  Acc: 32.91%


                                                                                              


[Epoch 2]
  Train -> Loss: 3762.4457  Acc: 25.27%
  Val   -> Loss: 3524.4703  Acc: 34.60%


                                                                                              


[Epoch 3]
  Train -> Loss: 3683.6070  Acc: 27.02%
  Val   -> Loss: 3476.5845  Acc: 35.46%


                                                                                              


[Epoch 4]
  Train -> Loss: 3625.6778  Acc: 28.73%
  Val   -> Loss: 3473.3585  Acc: 35.22%


                                                                                              


[Epoch 5]
  Train -> Loss: 3627.0427  Acc: 28.13%
  Val   -> Loss: 3466.8112  Acc: 35.46%

PHASE 2: Unfreezing last backbone layer (10 epochs)


                                                                                              


[Epoch 6]
  Train -> Loss: 3502.7510  Acc: 31.80%
  Val   -> Loss: 3143.8893  Acc: 39.86%
  ✓ New best model saved! (Val Acc: 39.86%)


                                                                                              


[Epoch 7]
  Train -> Loss: 3295.5076  Acc: 36.81%
  Val   -> Loss: 2988.2536  Acc: 43.09%
  ✓ New best model saved! (Val Acc: 43.09%)


                                                                                              


[Epoch 8]
  Train -> Loss: 3104.2231  Acc: 41.42%
  Val   -> Loss: 2835.7547  Acc: 44.76%
  ✓ New best model saved! (Val Acc: 44.76%)


                                                                                              


[Epoch 9]
  Train -> Loss: 3033.1628  Acc: 42.60%
  Val   -> Loss: 2737.3676  Acc: 47.49%
  ✓ New best model saved! (Val Acc: 47.49%)


                                                                                               


[Epoch 10]
  Train -> Loss: 2973.5622  Acc: 44.08%
  Val   -> Loss: 2679.6961  Acc: 48.12%
  ✓ New best model saved! (Val Acc: 48.12%)


                                                                                               


[Epoch 11]
  Train -> Loss: 2932.4476  Acc: 44.48%
  Val   -> Loss: 2634.4183  Acc: 49.55%
  ✓ New best model saved! (Val Acc: 49.55%)


                                                                                               


[Epoch 12]
  Train -> Loss: 2895.4007  Acc: 45.35%
  Val   -> Loss: 2589.8231  Acc: 50.15%
  ✓ New best model saved! (Val Acc: 50.15%)


                                                                                               


[Epoch 13]
  Train -> Loss: 2887.1766  Acc: 45.96%
  Val   -> Loss: 2563.7633  Acc: 50.69%
  ✓ New best model saved! (Val Acc: 50.69%)


                                                                                               


[Epoch 14]
  Train -> Loss: 2788.6740  Acc: 47.48%
  Val   -> Loss: 2558.8763  Acc: 51.30%
  ✓ New best model saved! (Val Acc: 51.30%)


                                                                                               


[Epoch 15]
  Train -> Loss: 2815.8385  Acc: 46.70%
  Val   -> Loss: 2552.6190  Acc: 51.71%
  ✓ New best model saved! (Val Acc: 51.71%)

PHASE 3: Fine-tuning entire model


                                                                                               


[Epoch 16/50]
  Train -> Loss: 2794.2076  Acc: 47.94%
  Val   -> Loss: 2407.2987  Acc: 54.81%
  ✓ New best model saved! (Val Acc: 54.81%)


                                                                                               


[Epoch 17/50]
  Train -> Loss: 2683.9378  Acc: 49.92%
  Val   -> Loss: 2319.0945  Acc: 55.98%
  ✓ New best model saved! (Val Acc: 55.98%)


                                                                                               


[Epoch 18/50]
  Train -> Loss: 2565.8418  Acc: 52.10%
  Val   -> Loss: 2270.4415  Acc: 57.00%
  ✓ New best model saved! (Val Acc: 57.00%)


                                                                                               


[Epoch 19/50]
  Train -> Loss: 2543.6347  Acc: 52.86%
  Val   -> Loss: 2215.6637  Acc: 59.14%
  ✓ New best model saved! (Val Acc: 59.14%)


                                                                                               


[Epoch 20/50]
  Train -> Loss: 2448.9852  Acc: 54.55%
  Val   -> Loss: 2186.3733  Acc: 60.55%
  ✓ New best model saved! (Val Acc: 60.55%)


                                                                                               


[Epoch 21/50]
  Train -> Loss: 2447.2880  Acc: 54.96%
  Val   -> Loss: 2148.1071  Acc: 60.42%
  No improvement (1/10)


                                                                                               


[Epoch 22/50]
  Train -> Loss: 2390.7283  Acc: 56.22%
  Val   -> Loss: 2111.4360  Acc: 61.81%
  ✓ New best model saved! (Val Acc: 61.81%)


                                                                                               


[Epoch 23/50]
  Train -> Loss: 2268.4016  Acc: 58.33%
  Val   -> Loss: 2094.2182  Acc: 62.80%
  ✓ New best model saved! (Val Acc: 62.80%)


                                                                                               


[Epoch 24/50]
  Train -> Loss: 2276.3571  Acc: 57.94%
  Val   -> Loss: 2052.1774  Acc: 62.04%
  No improvement (1/10)


                                                                                               


[Epoch 25/50]
  Train -> Loss: 2265.5541  Acc: 58.53%
  Val   -> Loss: 2024.9540  Acc: 63.22%
  ✓ New best model saved! (Val Acc: 63.22%)


                                                                                               


[Epoch 26/50]
  Train -> Loss: 2288.4706  Acc: 58.41%
  Val   -> Loss: 2024.9293  Acc: 63.19%
  No improvement (1/10)


                                                                                               


[Epoch 27/50]
  Train -> Loss: 2242.7141  Acc: 59.18%
  Val   -> Loss: 1973.0883  Acc: 63.87%
  ✓ New best model saved! (Val Acc: 63.87%)


                                                                                               

KeyboardInterrupt: 