## What changed?

This notebook takes the output result of `250813_nda_all` and attempts to optimize a single model instead of an ensemble.

In [3]:
# Check if CUDA

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("❌ CUDA not detected by PyTorch")

PyTorch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1
GPU count: 1
GPU name: NVIDIA GeForce RTX 2060
GPU memory: 6.0 GB


### Initial color/grayscale exploration

(ghostnet_100 wins, Val 84.2%, Test 72.9%)

In [5]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import random
import json
import os
import glob
import pickle
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter
import matplotlib.pyplot as plt
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# 30 different architectural families optimized for trail camera deer images
TOP_ARCHITECTURES = [
    # Original 15 architectures
    'efficientnet_b1',           # 1. EfficientNet (Google)
    'resnet34',                  # 2. ResNet (Microsoft Research)
    'densenet169',               # 3. DenseNet (Cornell/Tsinghua)
    'hrnet_w32',                 # 4. HRNet (Microsoft Research)
    'mobilenetv3_large_100',     # 5. MobileNet (Google)
    'vit_small_patch16_224',     # 6. Vision Transformer (Google)
    'regnetx_004',               # 7. RegNet (Facebook)
    'convnext_tiny',             # 8. ConvNeXt (Facebook)
    'swin_tiny_patch4_window7_224', # 9. Swin Transformer (Microsoft)
    'maxvit_tiny_tf_224',        # 10. MaxViT (Google)
    'repvgg_b1',                 # 11. RepVGG (Tsinghua)
    'ghostnet_100',              # 12. GhostNet (Huawei)
    'mobilevit_s',               # 13. MobileViT (Apple)
    'resnext50_32x4d',           # 14. ResNeXt (Facebook)
    'seresnet50',                # 15. SENet (WMW)
    
    # Additional 15 architectures optimized for trail camera imagery
    'efficientnet_b0',           # 16. Smaller EfficientNet for speed
    'efficientnet_b2',           # 17. Larger EfficientNet for accuracy
    'efficientnetv2_s',          # 18. Newer EfficientNet with training improvements
    'resnet18',                  # 19. Lightweight ResNet for efficiency
    'resnet50',                  # 20. Standard ResNet workhorse
    'densenet121',               # 21. Efficient DenseNet variant
    'mobilenetv3_small_100',     # 22. Very efficient for edge deployment
    'regnetx_002',               # 23. Smaller RegNet for speed
    'regnetx_008',               # 24. Larger RegNet for accuracy
    'convnext_small',            # 25. Larger ConvNeXt for better features
    'swin_small_patch4_window7_224', # 26. Larger Swin for hierarchical features
    'vit_tiny_patch16_224',      # 27. Tiny ViT for efficiency
    'mobilevit_xs',              # 28. Extra small MobileViT
    'resnext101_32x8d',          # 29. Large ResNeXt for maximum accuracy
    'dla60'                      # 30. Deep Layer Aggregation (good for fine-grained tasks)
]

IMAGE_SIZE = (224, 224)
AUGMENTATION_TARGET = 1000
BATCH_SIZE = 12  # RTX 2060 friendly

plt.style.use('default')

def detect_and_convert_image(image):
    """Detect if image is grayscale and convert to 3-channel RGB"""
    if len(image.shape) == 2:  # Grayscale
        return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    elif len(image.shape) == 3:
        if image.shape[2] == 1:  # Single channel
            return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 3:  # Already RGB
            return image
        elif image.shape[2] == 4:  # RGBA
            return cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
    return image

def load_combined_data():
    """Load data from both color and grayscale folders"""
    color_path = "G:\\Dropbox\\AI Projects\\buck\\images\\squared\\color\\*.png"
    gray_path = "G:\\Dropbox\\AI Projects\\buck\\images\\squared\\grayscale\\*.png"
    
    images = []
    ages = []
    sources = []  # Track if image came from color or grayscale
    
    print("Loading color images...")
    color_files = glob.glob(color_path)
    for img_path in color_files:
        try:
            img = cv2.imread(img_path)
            if img is None:
                continue
            
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = detect_and_convert_image(img)
            img_resized = cv2.resize(img, IMAGE_SIZE[::-1])
            
            filename = os.path.basename(img_path)
            filename_no_ext = os.path.splitext(filename)[0]
            parts = filename_no_ext.split('_')
            
            if len(parts) < 5:
                continue
            
            age_part = parts[3]
            if 'xpx' in age_part.lower() or 'p' not in age_part:
                continue
            
            try:
                age_value = float(age_part.replace('p', '.'))
                images.append(img_resized)
                ages.append(age_value)
                sources.append('color')
            except ValueError:
                continue
                
        except Exception as e:
            continue
    
    print(f"Loaded {len([s for s in sources if s == 'color'])} color images")
    
    print("Loading grayscale images...")
    gray_files = glob.glob(gray_path)
    for img_path in gray_files:
        try:
            img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
            if img is None:
                continue
            
            img = detect_and_convert_image(img)
            img_resized = cv2.resize(img, IMAGE_SIZE[::-1])
            
            filename = os.path.basename(img_path)
            filename_no_ext = os.path.splitext(filename)[0]
            parts = filename_no_ext.split('_')
            
            if len(parts) < 5:
                continue
            
            age_part = parts[3]
            if 'xpx' in age_part.lower() or 'p' not in age_part:
                continue
            
            try:
                age_value = float(age_part.replace('p', '.'))
                images.append(img_resized)
                ages.append(age_value)
                sources.append('grayscale')
            except ValueError:
                continue
                
        except Exception as e:
            continue
    
    print(f"Loaded {len([s for s in sources if s == 'grayscale'])} grayscale images")
    print(f"Total images: {len(images)}")
    
    # Group ages
    ages_grouped = [5.5 if age >= 5.5 else age for age in ages]
    
    # Filter classes with enough samples
    age_counts = Counter(ages_grouped)
    valid_ages = {age for age, count in age_counts.items() if count >= 3}
    
    filtered_images = []
    filtered_ages = []
    filtered_sources = []
    
    for img, age, source in zip(images, ages_grouped, sources):
        if age in valid_ages:
            filtered_images.append(img)
            filtered_ages.append(age)
            filtered_sources.append(source)
    
    print(f"Final dataset: {len(filtered_images)} images")
    print(f"Age distribution: {dict(Counter(filtered_ages))}")
    print(f"Source distribution: {dict(Counter(filtered_sources))}")
    
    return np.array(filtered_images), filtered_ages, filtered_sources

def enhanced_augment_image(image):
    """Enhanced augmentation for deer images with strategic color conversion"""
    if image.dtype != np.uint8:
        image = image.astype(np.uint8)
    
    # Rotation
    if random.random() < 0.7:
        angle = random.uniform(-15, 15)
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))
    
    # Horizontal flip
    if random.random() < 0.5:
        image = cv2.flip(image, 1)
    
    # Strategic color conversion (RGB -> Grayscale -> RGB)
    # Based on ensemble results showing grayscale superiority
    if len(image.shape) == 3 and image.shape[2] == 3 and random.random() < 0.4:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        image = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
    
    # Brightness/contrast
    if random.random() < 0.8:
        alpha = random.uniform(0.7, 1.3)
        beta = random.randint(-25, 25)
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
    
    # Gamma correction
    if random.random() < 0.4:
        gamma = random.uniform(0.8, 1.2)
        inv_gamma = 1.0 / gamma
        table = np.array([((i / 255.0) ** inv_gamma) * 255 for i in np.arange(0, 256)]).astype("uint8")
        image = cv2.LUT(image, table)
    
    # Noise
    if random.random() < 0.3:
        noise = np.random.normal(0, 7, image.shape).astype(np.int16)
        image_int16 = image.astype(np.int16)
        noisy_image = np.clip(image_int16 + noise, 0, 255)
        image = noisy_image.astype(np.uint8)
    
    return image

def create_balanced_dataset(X, y):
    """Create balanced dataset through augmentation"""
    print(f"\nOriginal class distribution:")
    class_counts = Counter(y)
    for class_idx, count in sorted(class_counts.items()):
        print(f"  Class {class_idx}: {count} images")
    
    max_count = max(class_counts.values())
    target_count = max(AUGMENTATION_TARGET, max_count)
    
    print(f"\nTarget samples per class: {target_count}")
    
    X_balanced = []
    y_balanced = []
    
    for class_idx in range(len(set(y))):
        class_mask = np.array(y) == class_idx
        class_images = X[class_mask]
        current_count = len(class_images)
        
        if current_count == 0:
            continue
        
        # Add originals
        X_balanced.extend(class_images)
        y_balanced.extend([class_idx] * current_count)
        
        # Add augmented to reach target
        needed = target_count - current_count
        augmented_for_class = 0
        for i in range(needed):
            orig_idx = random.randint(0, current_count - 1)
            aug_img = enhanced_augment_image(class_images[orig_idx].copy())
            X_balanced.append(aug_img)
            y_balanced.append(class_idx)
            augmented_for_class += 1
        
        print(f"  Class {class_idx}: {current_count} original + {augmented_for_class} augmented = {current_count + augmented_for_class} total")
    
    # Verify final balance
    final_counts = Counter(y_balanced)
    print(f"\nFinal balanced class distribution:")
    for class_idx, count in sorted(final_counts.items()):
        print(f"  Class {class_idx}: {count} images")
    
    print(f"Total training images after balancing: {len(X_balanced)}")
    
    return np.array(X_balanced), np.array(y_balanced)

class DeerDataset(Dataset):
    def __init__(self, X, y, training=True):
        self.X = torch.FloatTensor(X if isinstance(X, np.ndarray) else np.array(X))
        self.y = torch.LongTensor(y if isinstance(y, np.ndarray) else np.array(y))
        self.training = training
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        image = self.X[idx].clone()
        label = self.y[idx].clone()
        
        # Normalize to [0, 1]
        if image.max() > 1.0:
            image = image / 255.0
        
        # Ensure CHW format
        if len(image.shape) == 3 and image.shape[-1] == 3:
            image = image.permute(2, 0, 1)
        
        # Test time augmentation for validation
        if not self.training and random.random() < 0.5:
            image = torch.flip(image, [2])
        
        # ImageNet normalization
        image = (image - self.mean) / self.std
        return image, label

class SingleModelTrainer:
    def __init__(self, num_classes, save_dir=None):
        self.num_classes = num_classes
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        if save_dir is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.save_dir = f"single_model_{timestamp}"
        else:
            self.save_dir = save_dir
        
        os.makedirs(self.save_dir, exist_ok=True)
        
        print(f"Using device: {self.device}")
        if torch.cuda.is_available():
            print(f"GPU: {torch.cuda.get_device_name()}")
            torch.backends.cudnn.benchmark = True
    
    def create_model(self, architecture):
        """Create model with optimized head for diverse architectures"""
        model = timm.create_model(architecture, pretrained=True, num_classes=self.num_classes)
        
        # Architecture-specific layer freezing
        if any(arch in architecture for arch in ['resnet', 'resnext', 'seresnet']):
            frozen_layers = ['conv1', 'bn1', 'layer1', 'layer2']
            for name, param in model.named_parameters():
                if any(layer in name for layer in frozen_layers):
                    param.requires_grad = False
        elif 'efficientnet' in architecture:
            for name, param in model.named_parameters():
                if any(block in name for block in ['blocks.0', 'blocks.1', 'blocks.2']):
                    param.requires_grad = False
        elif 'densenet' in architecture:
            for name, param in model.named_parameters():
                if any(layer in name for layer in ['features.conv0', 'features.norm0', 'features.denseblock1']):
                    param.requires_grad = False
        elif 'hrnet' in architecture:
            for name, param in model.named_parameters():
                if any(layer in name for layer in ['conv1', 'bn1', 'stage1']):
                    param.requires_grad = False
        elif any(arch in architecture for arch in ['mobilenet', 'ghostnet']):
            for name, param in model.named_parameters():
                if any(layer in name for layer in ['features.0', 'features.1', 'features.2']):
                    param.requires_grad = False
        elif any(arch in architecture for arch in ['vit', 'swin', 'mobilevit']):
            # Freeze patch embedding and early transformer blocks
            for name, param in model.named_parameters():
                if any(layer in name for layer in ['patch_embed', 'blocks.0', 'blocks.1', 'layers.0']):
                    param.requires_grad = False
        elif 'regnet' in architecture:
            for name, param in model.named_parameters():
                if any(layer in name for layer in ['stem', 's1']):
                    param.requires_grad = False
        elif 'convnext' in architecture:
            for name, param in model.named_parameters():
                if any(layer in name for layer in ['stem', 'stages.0']):
                    param.requires_grad = False
        elif 'maxvit' in architecture:
            for name, param in model.named_parameters():
                if any(layer in name for layer in ['stem', 'stages.0']):
                    param.requires_grad = False
        elif 'repvgg' in architecture:
            for name, param in model.named_parameters():
                if any(layer in name for layer in ['stage0', 'stage1']):
                    param.requires_grad = False
        
        # Replace classifier head based on architecture
        if hasattr(model, 'fc'):
            in_features = model.fc.in_features
            model.fc = nn.Sequential(
                nn.Dropout(0.3),
                nn.Linear(in_features, self.num_classes)
            )
        elif hasattr(model, 'classifier'):
            if hasattr(model.classifier, 'in_features'):
                in_features = model.classifier.in_features
            else:
                in_features = model.classifier[-1].in_features
            model.classifier = nn.Sequential(
                nn.Dropout(0.3),
                nn.Linear(in_features, self.num_classes)
            )
        elif hasattr(model, 'head'):
            if hasattr(model.head, 'in_features'):
                in_features = model.head.in_features
            else:
                in_features = model.head[-1].in_features if hasattr(model.head, '__getitem__') else 512
            model.head = nn.Sequential(
                nn.Dropout(0.3),
                nn.Linear(in_features, self.num_classes)
            )
        
        return model.to(self.device)
    
    def train_single_architecture(self, train_loader, val_loader, architecture):
        """Train a single architecture"""
        print(f"Training {architecture}...")
        
        model = self.create_model(architecture)
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        
        # Separate learning rates
        backbone_params = []
        classifier_params = []
        
        for name, param in model.named_parameters():
            if param.requires_grad:
                if any(head in name for head in ['fc', 'classifier', 'head']):
                    classifier_params.append(param)
                else:
                    backbone_params.append(param)
        
        optimizer = optim.AdamW([
            {'params': backbone_params, 'lr': 0.0002},
            {'params': classifier_params, 'lr': 0.0008}
        ], weight_decay=0.02)
        
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80, eta_min=1e-6)
        
        best_val_acc = 0.0
        patience = 25  # Increased patience
        patience_counter = 0
        best_state = None
        
        for epoch in range(80):  # Increased max epochs
            # Training
            model.train()
            train_correct = 0
            train_total = 0
            train_loss = 0.0
            
            for images, labels in train_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                
                _, predicted = torch.max(outputs, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()
                train_loss += loss.item()
            
            # Validation
            model.eval()
            val_correct = 0
            val_total = 0
            
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(self.device), labels.to(self.device)
                    outputs = model(images)
                    
                    _, predicted = torch.max(outputs, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()
            
            train_acc = 100 * train_correct / train_total
            val_acc = 100 * val_correct / val_total
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_counter = 0
                best_state = model.state_dict().copy()
            else:
                patience_counter += 1
            
            scheduler.step()
            
            if epoch % 20 == 0 and epoch > 0:
                print(f"    Epoch {epoch}: Train {train_acc:.1f}%, Val {val_acc:.1f}%")
            
            if patience_counter >= patience:
                print(f"    Early stopping at epoch {epoch} (patience reached)")
                break
            
            # Memory management for RTX 2060
            if epoch % 5 == 0:
                torch.cuda.empty_cache()
        
        # Load best weights
        if best_state is not None:
            model.load_state_dict(best_state)
        
        print(f"  {architecture} best validation: {best_val_acc:.1f}%")
        return model, best_val_acc
    
    def find_best_architecture(self, train_loader, val_loader, test_loader, label_mapping):
        """Test all architectures and return the best"""
        results = {}
        failed_archs = []
        
        print(f"Testing {len(TOP_ARCHITECTURES)} diverse architectural families...")
        for i, arch in enumerate(TOP_ARCHITECTURES, 1):
            try:
                print(f"[{i:2d}/{len(TOP_ARCHITECTURES)}] Testing {arch}...")
                model, val_acc = self.train_single_architecture(train_loader, val_loader, arch)
                
                # Evaluate on test set for comparison (not selection)
                test_acc = evaluate_model(model, test_loader, self.device)
                
                results[arch] = (model, val_acc, test_acc)
                print(f"  ✓ {arch}: Val {val_acc:.1f}%, Test {test_acc:.1f}%")
                
                # Save each model
                save_path = os.path.join(self.save_dir, f"deer_age_model_{arch}_{test_acc:.1f}pct.pth")
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'architecture': arch,
                    'num_classes': self.num_classes,
                    'label_mapping': label_mapping,
                    'test_accuracy': test_acc,
                    'val_accuracy': val_acc,
                    'input_size': IMAGE_SIZE
                }, save_path)
                print(f"  Model saved: {save_path}")
                
                print()  # Add blank line between architectures
                torch.cuda.empty_cache()
            except Exception as e:
                print(f"  ✗ {arch} failed: {str(e)[:50]}...")
                print()  # Add blank line for failed architectures too
                failed_archs.append(arch)
                torch.cuda.empty_cache()
                continue
        
        if not results:
            raise ValueError("All architectures failed to train!")
        
        # Find best based on VALIDATION (not test) to avoid contamination
        best_arch = max(results.keys(), key=lambda x: results[x][1])
        best_model, best_val_acc, best_test_acc = results[best_arch]
        
        print(f"\n{'='*60}")
        print("ARCHITECTURE COMPARISON RESULTS")
        print('='*60)
        print(f"{'Rank':<4} {'Architecture':<30} {'Validation':<12} {'Test':<8}")
        print('-'*60)
        
        # Sort by validation performance for ranking
        sorted_results = sorted(results.items(), key=lambda x: x[1][1], reverse=True)
        for i, (arch, (_, val_acc, test_acc)) in enumerate(sorted_results, 1):
            marker = "🏆" if arch == best_arch else "  "
            print(f"{i:2d}. {marker} {arch:<28} {val_acc:5.1f}%      {test_acc:5.1f}%")
        
        if failed_archs:
            print(f"\nFailed architectures ({len(failed_archs)}): {', '.join(failed_archs)}")
        
        print(f"\n🏆 WINNER: {best_arch} (Val: {best_val_acc:.1f}%, Test: {best_test_acc:.1f}%)")
        print("Note: Selection based on validation performance to avoid test contamination")
        
        return best_model, best_arch, best_val_acc
    
    def final_optimization(self, model, train_loader, val_loader, architecture):
        """Final optimization of the best model"""
        print(f"\nFinal optimization of {architecture}...")
        
        # Unfreeze more layers for fine-tuning
        for param in model.parameters():
            param.requires_grad = True
        
        # Lower learning rate for fine-tuning
        optimizer = optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.01)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-7)
        criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
        
        best_val_acc = 0.0
        patience = 15
        patience_counter = 0
        best_state = None
        
        for epoch in range(50):
            # Training
            model.train()
            train_correct = 0
            train_total = 0
            
            for images, labels in train_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                
                _, predicted = torch.max(outputs, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()
            
            # Validation
            model.eval()
            val_correct = 0
            val_total = 0
            
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(self.device), labels.to(self.device)
                    
                    # Test time augmentation
                    outputs1 = model(images)
                    flipped = torch.flip(images, [3])
                    outputs2 = model(flipped)
                    outputs = (outputs1 + outputs2) / 2
                    
                    _, predicted = torch.max(outputs, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()
            
            train_acc = 100 * train_correct / train_total
            val_acc = 100 * val_correct / val_total
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_counter = 0
                best_state = model.state_dict().copy()
            else:
                patience_counter += 1
            
            scheduler.step()
            
            if epoch % 10 == 0:
                print(f"  Epoch {epoch}: Train {train_acc:.1f}%, Val {val_acc:.1f}%")
            
            if patience_counter >= patience:
                break
        
        if best_state is not None:
            model.load_state_dict(best_state)
        
        print(f"  Final optimization complete: {best_val_acc:.1f}%")
        return model, best_val_acc

def evaluate_model(model, test_loader, device):
    """Evaluate model on test set"""
    model.eval()
    test_correct = 0
    test_total = 0
    
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            
            # Test time augmentation
            outputs1 = model(images)
            flipped = torch.flip(images, [3])
            outputs2 = model(flipped)
            outputs = (outputs1 + outputs2) / 2
            
            _, predicted = torch.max(outputs, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()
    
    test_acc = 100 * test_correct / test_total
    return test_acc

def main():
    print("Single Optimized Deer Age Prediction Model")
    print("=" * 50)
    
    start_time = time.time()
    
    # Load combined data
    images, ages, sources = load_combined_data()
    
    # Create label mapping
    unique_ages = sorted(list(set(ages)))
    label_mapping = {age: i for i, age in enumerate(unique_ages)}
    y_indices = np.array([label_mapping[age] for age in ages])
    
    print(f"\nClasses: {len(unique_ages)}")
    print(f"Label mapping: {label_mapping}")
    
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        images, y_indices, test_size=0.2, random_state=42, stratify=y_indices
    )
    
    # Further split training into train/val
    X_train_final, X_val, y_train_final, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )
    
    print(f"\nData split:")
    print(f"Train: {len(X_train_final)} images")
    print(f"Val: {len(X_val)} images") 
    print(f"Test: {len(X_test)} images")
    
    # Create balanced training set
    print("\nCreating balanced training set...")
    X_train_balanced, y_train_balanced = create_balanced_dataset(X_train_final, y_train_final)
    
    # Create datasets
    train_dataset = DeerDataset(X_train_balanced, y_train_balanced, training=True)
    val_dataset = DeerDataset(X_val, y_val, training=False)
    test_dataset = DeerDataset(X_test, y_test, training=False)
    
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
    
    # Initialize trainer
    trainer = SingleModelTrainer(num_classes=len(unique_ages))
    
    # Find best architecture
    best_model, best_arch, val_acc = trainer.find_best_architecture(train_loader, val_loader, test_loader, label_mapping)
    
    # Final optimization
    optimized_model, final_val_acc = trainer.final_optimization(best_model, train_loader, val_loader, best_arch)
    
    # Test evaluation
    test_acc = evaluate_model(optimized_model, test_loader, trainer.device)
    
    # Save final model
    save_path = os.path.join(trainer.save_dir, f"deer_age_model_{best_arch}_{test_acc:.1f}pct_FINAL.pth")
    torch.save({
        'model_state_dict': optimized_model.state_dict(),
        'architecture': best_arch,
        'num_classes': len(unique_ages),
        'label_mapping': label_mapping,
        'test_accuracy': test_acc,
        'val_accuracy': final_val_acc,
        'input_size': IMAGE_SIZE
    }, save_path)
    
    elapsed = (time.time() - start_time) / 60
    
    print("\n" + "=" * 50)
    print("FINAL RESULTS")
    print("=" * 50)
    print(f"Best architecture: {best_arch}")
    print(f"Validation accuracy: {final_val_acc:.1f}%")
    print(f"Test accuracy: {test_acc:.1f}%")
    print(f"Training time: {elapsed:.1f} minutes")
    print(f"Final model saved: {save_path}")
    
    return optimized_model, best_arch, test_acc

if __name__ == "__main__":
    model, architecture, accuracy = main()

Single Optimized Deer Age Prediction Model
Loading color images...
ERROR! Session/line number was not unique in database. History logging moved to new session 361
Loaded 363 color images
Loading grayscale images...
Loaded 108 grayscale images
Total images: 471
Final dataset: 471 images
Age distribution: {5.5: 121, 4.5: 89, 2.5: 83, 3.5: 111, 1.5: 67}
Source distribution: {'color': 363, 'grayscale': 108}

Classes: 5
Label mapping: {1.5: 0, 2.5: 1, 3.5: 2, 4.5: 3, 5.5: 4}

Data split:
Train: 300 images
Val: 76 images
Test: 95 images

Creating balanced training set...

Original class distribution:
  Class 0: 42 images
  Class 1: 53 images
  Class 2: 71 images
  Class 3: 57 images
  Class 4: 77 images

Target samples per class: 1000
  Class 0: 42 original + 958 augmented = 1000 total
  Class 1: 53 original + 947 augmented = 1000 total
  Class 2: 71 original + 929 augmented = 1000 total
  Class 3: 57 original + 943 augmented = 1000 total
  Class 4: 77 original + 923 augmented = 1000 total



model.safetensors:   0%|          | 0.00/29.2M [00:00<?, ?B/s]

  ✗ regnetx_008 failed: mat1 and mat2 shapes cannot be multiplied (56448x7...

[25/30] Testing convnext_small...
Training convnext_small...
  ✗ convnext_small failed: mat1 and mat2 shapes cannot be multiplied (64512x7...

[26/30] Testing swin_small_patch4_window7_224...
Training swin_small_patch4_window7_224...
  ✗ swin_small_patch4_window7_224 failed: only batches of spatial targets supported (3D tens...

[27/30] Testing vit_tiny_patch16_224...
Training vit_tiny_patch16_224...
    Epoch 20: Train 98.6%, Val 48.7%
    Early stopping at epoch 28 (patience reached)
  vit_tiny_patch16_224 best validation: 63.2%
  ✓ vit_tiny_patch16_224: Val 63.2%, Test 50.5%
  Model saved: single_model_20250829_074605\deer_age_model_vit_tiny_patch16_224_50.5pct.pth

[28/30] Testing mobilevit_xs...
Training mobilevit_xs...


model.safetensors:   0%|          | 0.00/9.34M [00:00<?, ?B/s]

  ✗ mobilevit_xs failed: mat1 and mat2 shapes cannot be multiplied (32256x7...

[29/30] Testing resnext101_32x8d...
Training resnext101_32x8d...
    Epoch 20: Train 100.0%, Val 68.4%
    Early stopping at epoch 30 (patience reached)
  resnext101_32x8d best validation: 73.7%
  ✓ resnext101_32x8d: Val 73.7%, Test 60.0%
  Model saved: single_model_20250829_074605\deer_age_model_resnext101_32x8d_60.0pct.pth

[30/30] Testing dla60...
Training dla60...


model.safetensors:   0%|          | 0.00/88.3M [00:00<?, ?B/s]

  ✗ dla60 failed: 'Conv2d' object has no attribute 'in_features'...


ARCHITECTURE COMPARISON RESULTS
Rank Architecture                   Validation   Test    
------------------------------------------------------------
 1. 🏆 hrnet_w32                     80.3%       58.9%
 2.    efficientnet_b0               78.9%       53.7%
 3.    efficientnet_b2               76.3%       61.1%
 4.    ghostnet_100                  75.0%       51.6%
 5.    resnext101_32x8d              73.7%       60.0%
 6.    resnet34                      72.4%       60.0%
 7.    densenet169                   72.4%       58.9%
 8.    mobilenetv3_large_100         72.4%       54.7%
 9.    resnet50                      72.4%       56.8%
10.    densenet121                   72.4%       55.8%
11.    mobilenetv3_small_100         72.4%       51.6%
12.    resnext50_32x4d               71.1%       56.8%
13.    seresnet50                    71.1%       51.6%
14.    resnet18                      71.1%       61.1%
15.    eff

### Addressing algorithm robustness / deer detection

In [6]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import random
import json
import os
import glob
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Test different approaches to handle multi-source label inconsistency
METHODS_TO_TEST = [
    'baseline',                    # Original approach
    'robust_training',            # Heavy label smoothing + high dropout
    'uncertainty_aware',          # Predict confidence alongside age
    'crop_robust',               # Multiple crop augmentation
    'mixed_approach'             # Combination of techniques
]

# Efficient architectures for comparison
TEST_ARCHITECTURES = [
    'efficientnet_b0',
    'resnet34', 
    'mobilenetv3_large_100',
    'ghostnet_100'
]

IMAGE_SIZE = (224, 224)
AUGMENTATION_TARGET = 800  # Reduced for faster training during testing
BATCH_SIZE = 16

def detect_and_convert_image(image):
    """Detect if image is grayscale and convert to 3-channel RGB"""
    if len(image.shape) == 2:
        return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    elif len(image.shape) == 3:
        if image.shape[2] == 1:
            return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 3:
            return image
        elif image.shape[2] == 4:
            return cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
    return image

def load_multi_source_data():
    """Load data from multiple sources - modify paths as needed"""
    # Update these paths to your actual data locations
    data_sources = [
        ("G:\\Dropbox\\AI Projects\\buck\\images\\squared\\color\\*.png", "NDA_color"),
        ("G:\\Dropbox\\AI Projects\\buck\\images\\squared\\grayscale\\*.png", "NDA_grayscale"),
        # Add more sources here:
        # ("path\\to\\source2\\*.jpg", "source2"),
        # ("path\\to\\source3\\*.png", "source3"),
    ]
    
    images = []
    ages = []
    sources = []
    
    for path_pattern, source_name in data_sources:
        print(f"Loading {source_name} images...")
        files = glob.glob(path_pattern)
        source_count = 0
        
        for img_path in files:
            try:
                img = cv2.imread(img_path)
                if img is None:
                    continue
                
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = detect_and_convert_image(img)
                img_resized = cv2.resize(img, IMAGE_SIZE[::-1])
                
                filename = os.path.basename(img_path)
                filename_no_ext = os.path.splitext(filename)[0]
                parts = filename_no_ext.split('_')
                
                if len(parts) < 5:
                    continue
                
                age_part = parts[3]
                if 'xpx' in age_part.lower() or 'p' not in age_part:
                    continue
                
                try:
                    age_value = float(age_part.replace('p', '.'))
                    images.append(img_resized)
                    ages.append(age_value)
                    sources.append(source_name)
                    source_count += 1
                except ValueError:
                    continue
                    
            except Exception as e:
                continue
        
        print(f"  Loaded {source_count} images from {source_name}")
    
    print(f"Total images: {len(images)}")
    
    # Group ages and filter
    ages_grouped = [5.5 if age >= 5.5 else age for age in ages]
    age_counts = Counter(ages_grouped)
    valid_ages = {age for age, count in age_counts.items() if count >= 3}
    
    filtered_images = []
    filtered_ages = []
    filtered_sources = []
    
    for img, age, source in zip(images, ages_grouped, sources):
        if age in valid_ages:
            filtered_images.append(img)
            filtered_ages.append(age)
            filtered_sources.append(source)
    
    print(f"Final dataset: {len(filtered_images)} images")
    print(f"Age distribution: {dict(Counter(filtered_ages))}")
    print(f"Source distribution: {dict(Counter(filtered_sources))}")
    
    return np.array(filtered_images), filtered_ages, filtered_sources

def smart_crop_augmentation(image, crop_probability=0.7):
    """Intelligent cropping that tries to keep the deer in frame"""
    if random.random() > crop_probability:
        return image
    
    h, w = image.shape[:2]
    
    # Different crop strategies - some more conservative, some more aggressive
    crop_strategies = [
        (0.8, 0.8),   # Mild crop
        (0.7, 0.7),   # Moderate crop  
        (0.6, 0.8),   # Horizontal crop
        (0.8, 0.6),   # Vertical crop
        (0.6, 0.6),   # Aggressive crop
    ]
    
    crop_h_factor, crop_w_factor = random.choice(crop_strategies)
    
    new_h = int(h * crop_h_factor)
    new_w = int(w * crop_w_factor)
    
    # Random starting position, but avoid extreme edges
    max_start_y = h - new_h
    max_start_x = w - new_w
    
    # Bias towards center (where deer likely is)
    start_y = random.randint(max(0, max_start_y//4), max(max_start_y//4, max_start_y*3//4))
    start_x = random.randint(max(0, max_start_x//4), max(max_start_x//4, max_start_x*3//4))
    
    cropped = image[start_y:start_y+new_h, start_x:start_x+new_w]
    
    # Resize back to original size
    return cv2.resize(cropped, IMAGE_SIZE[::-1])

def enhanced_augment_image(image, method='baseline'):
    """Method-specific augmentation strategies"""
    if image.dtype != np.uint8:
        image = image.astype(np.uint8)
    
    # Base augmentations for all methods
    if random.random() < 0.7:
        angle = random.uniform(-15, 15)
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))
    
    if random.random() < 0.5:
        image = cv2.flip(image, 1)
    
    # Crop-robust augmentation
    if method in ['crop_robust', 'mixed_approach']:
        image = smart_crop_augmentation(image, crop_probability=0.8)
    
    # Color/contrast augmentation
    if random.random() < 0.8:
        alpha = random.uniform(0.7, 1.3)
        beta = random.randint(-25, 25)
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
    
    # Strategic grayscale conversion
    if len(image.shape) == 3 and image.shape[2] == 3 and random.random() < 0.4:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        image = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
    
    # Additional heavy augmentation for robust methods
    if method in ['robust_training', 'mixed_approach']:
        # More aggressive noise and gamma
        if random.random() < 0.5:
            gamma = random.uniform(0.7, 1.4)
            inv_gamma = 1.0 / gamma
            table = np.array([((i / 255.0) ** inv_gamma) * 255 for i in np.arange(0, 256)]).astype("uint8")
            image = cv2.LUT(image, table)
        
        if random.random() < 0.4:
            noise = np.random.normal(0, 10, image.shape).astype(np.int16)
            image_int16 = image.astype(np.int16)
            noisy_image = np.clip(image_int16 + noise, 0, 255)
            image = noisy_image.astype(np.uint8)
    
    return image

def create_balanced_dataset(X, y, sources, method='baseline'):
    """Create balanced dataset with method-specific augmentation"""
    print(f"\nCreating balanced dataset for method: {method}")
    
    class_counts = Counter(y)
    max_count = max(class_counts.values())
    target_count = max(AUGMENTATION_TARGET, max_count)
    
    X_balanced = []
    y_balanced = []
    sources_balanced = []
    
    for class_idx in range(len(set(y))):
        class_mask = np.array(y) == class_idx
        class_images = X[class_mask]
        class_sources = np.array(sources)[class_mask]
        current_count = len(class_images)
        
        if current_count == 0:
            continue
        
        # Add originals
        X_balanced.extend(class_images)
        y_balanced.extend([class_idx] * current_count)
        sources_balanced.extend(class_sources)
        
        # Add augmented
        needed = target_count - current_count
        for i in range(needed):
            orig_idx = random.randint(0, current_count - 1)
            aug_img = enhanced_augment_image(class_images[orig_idx].copy(), method=method)
            X_balanced.append(aug_img)
            y_balanced.append(class_idx)
            sources_balanced.append(class_sources[orig_idx])
    
    print(f"Total balanced images: {len(X_balanced)}")
    return np.array(X_balanced), np.array(y_balanced), sources_balanced

class RobustDeerDataset(Dataset):
    def __init__(self, X, y, sources=None, training=True, method='baseline'):
        self.X = torch.FloatTensor(X if isinstance(X, np.ndarray) else np.array(X))
        self.y = torch.LongTensor(y if isinstance(y, np.ndarray) else np.array(y))
        self.sources = sources
        self.training = training
        self.method = method
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        image = self.X[idx].clone()
        label = self.y[idx].clone()
        
        if image.max() > 1.0:
            image = image / 255.0
        
        if len(image.shape) == 3 and image.shape[-1] == 3:
            image = image.permute(2, 0, 1)
        
        # Test time augmentation for validation/test
        if not self.training and random.random() < 0.5:
            image = torch.flip(image, [2])
        
        image = (image - self.mean) / self.std
        
        # Return source info for methods that need it
        if self.sources is not None:
            return image, label, self.sources[idx]
        return image, label

class BaselineModel(nn.Module):
    """Standard classification model"""
    def __init__(self, architecture, num_classes):
        super().__init__()
        self.backbone = timm.create_model(architecture, pretrained=True, num_classes=num_classes)
    
    def forward(self, x):
        return self.backbone(x)

class UncertaintyAwareModel(nn.Module):
    """Model that predicts both age and confidence"""
    def __init__(self, architecture, num_classes):
        super().__init__()
        self.backbone = timm.create_model(architecture, pretrained=True, num_classes=1000)
        
        # Get feature dimension
        if hasattr(self.backbone, 'fc'):
            feat_dim = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        elif hasattr(self.backbone, 'classifier'):
            feat_dim = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
        elif hasattr(self.backbone, 'head'):
            feat_dim = self.backbone.head.in_features
            self.backbone.head = nn.Identity()
        else:
            feat_dim = 1000
        
        self.age_head = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(feat_dim, num_classes)
        )
        
        self.confidence_head = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(feat_dim, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        features = self.backbone(x)
        age_logits = self.age_head(features)
        confidence = self.confidence_head(features)
        return age_logits, confidence

class MethodTrainer:
    def __init__(self, method, architecture, num_classes, device):
        self.method = method
        self.architecture = architecture
        self.num_classes = num_classes
        self.device = device
        
        # Create model based on method
        if method == 'uncertainty_aware':
            self.model = UncertaintyAwareModel(architecture, num_classes)
        else:
            self.model = BaselineModel(architecture, num_classes)
        
        self.model = self.model.to(device)
        
        # Method-specific parameters
        if method == 'robust_training':
            self.label_smoothing = 0.25
            self.dropout_rate = 0.5
        elif method == 'mixed_approach':
            self.label_smoothing = 0.2
            self.dropout_rate = 0.4
        else:
            self.label_smoothing = 0.1
            self.dropout_rate = 0.3
    
    def get_criterion(self):
        if self.method == 'uncertainty_aware':
            return self.uncertainty_loss
        else:
            return nn.CrossEntropyLoss(label_smoothing=self.label_smoothing)
    
    def uncertainty_loss(self, predictions, targets):
        age_logits, confidence = predictions
        age_loss = F.cross_entropy(age_logits, targets, reduction='none')
        
        # Weight loss by confidence (lower confidence = higher weight for learning)
        weighted_loss = age_loss * (2.0 - confidence.squeeze())
        
        # Add confidence regularization
        conf_loss = -torch.mean(torch.log(confidence + 1e-8))  # Encourage confidence
        
        return weighted_loss.mean() + 0.1 * conf_loss
    
    def train_model(self, train_loader, val_loader, max_epochs=60):
        criterion = self.get_criterion()
        optimizer = optim.AdamW(self.model.parameters(), lr=0.0003, weight_decay=0.02)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_epochs)
        
        best_val_acc = 0.0
        patience = 15
        patience_counter = 0
        best_state = None
        
        for epoch in range(max_epochs):
            # Training
            self.model.train()
            train_correct = 0
            train_total = 0
            train_loss = 0.0
            
            for batch in train_loader:
                if len(batch) == 3:  # Has source info
                    images, labels, sources = batch
                else:
                    images, labels = batch
                
                images, labels = images.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                
                outputs = self.model(images)
                
                if self.method == 'uncertainty_aware':
                    loss = criterion(outputs, labels)
                    age_logits, _ = outputs
                    _, predicted = torch.max(age_logits, 1)
                else:
                    loss = criterion(outputs, labels)
                    _, predicted = torch.max(outputs, 1)
                
                loss.backward()
                optimizer.step()
                
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()
                train_loss += loss.item()
            
            # Validation
            val_acc = self.evaluate(val_loader)
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_counter = 0
                best_state = self.model.state_dict().copy()
            else:
                patience_counter += 1
            
            scheduler.step()
            
            if epoch % 15 == 0:
                train_acc = 100 * train_correct / train_total
                print(f"    Epoch {epoch}: Train {train_acc:.1f}%, Val {val_acc:.1f}%")
            
            if patience_counter >= patience:
                break
        
        # Load best weights
        if best_state is not None:
            self.model.load_state_dict(best_state)
        
        return best_val_acc
    
    def evaluate(self, loader):
        self.model.eval()
        correct = 0
        total = 0
        
        with torch.no_grad():
            for batch in loader:
                if len(batch) == 3:
                    images, labels, sources = batch
                else:
                    images, labels = batch
                
                images, labels = images.to(self.device), labels.to(self.device)
                outputs = self.model(images)
                
                if self.method == 'uncertainty_aware':
                    age_logits, _ = outputs
                    _, predicted = torch.max(age_logits, 1)
                else:
                    _, predicted = torch.max(outputs, 1)
                
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        return 100 * correct / total

class MultiMethodComparison:
    def __init__(self, save_dir=None):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        if save_dir is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.save_dir = f"multi_method_comparison_{timestamp}"
        else:
            self.save_dir = save_dir
        
        os.makedirs(self.save_dir, exist_ok=True)
        
        print(f"Using device: {self.device}")
        if torch.cuda.is_available():
            print(f"GPU: {torch.cuda.get_device_name()}")
    
    def run_comparison(self, images, ages, sources):
        unique_ages = sorted(list(set(ages)))
        label_mapping = {age: i for i, age in enumerate(unique_ages)}
        y_indices = np.array([label_mapping[age] for age in ages])
        
        print(f"Classes: {len(unique_ages)}")
        print(f"Label mapping: {label_mapping}")
        
        # Single train/test split for fair comparison
        X_train, X_test, y_train, y_test, sources_train, sources_test = train_test_split(
            images, y_indices, sources, test_size=0.2, random_state=42, stratify=y_indices
        )
        
        X_train_final, X_val, y_train_final, y_val, sources_train_final, sources_val = train_test_split(
            X_train, y_train, sources_train, test_size=0.2, random_state=42, stratify=y_train
        )
        
        print(f"Data split - Train: {len(X_train_final)}, Val: {len(X_val)}, Test: {len(X_test)}")
        
        results = {}
        
        # Test each method with each architecture
        for method in METHODS_TO_TEST:
            print(f"\n{'='*60}")
            print(f"TESTING METHOD: {method.upper()}")
            print('='*60)
            
            method_results = {}
            
            for arch in TEST_ARCHITECTURES:
                print(f"\n[{method}] Testing {arch}...")
                
                try:
                    # Create method-specific balanced dataset
                    X_balanced, y_balanced, sources_balanced = create_balanced_dataset(
                        X_train_final, y_train_final, sources_train_final, method=method
                    )
                    
                    # Create datasets
                    train_dataset = RobustDeerDataset(X_balanced, y_balanced, sources_balanced, training=True, method=method)
                    val_dataset = RobustDeerDataset(X_val, y_val, sources_val, training=False, method=method)
                    test_dataset = RobustDeerDataset(X_test, y_test, sources_test, training=False, method=method)
                    
                    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
                    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
                    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
                    
                    # Train model
                    trainer = MethodTrainer(method, arch, len(unique_ages), self.device)
                    val_acc = trainer.train_model(train_loader, val_loader)
                    test_acc = trainer.evaluate(test_loader)
                    
                    method_results[arch] = {
                        'val_acc': val_acc,
                        'test_acc': test_acc,
                        'model_state': trainer.model.state_dict().copy()
                    }
                    
                    print(f"  {arch}: Val {val_acc:.1f}%, Test {test_acc:.1f}%")
                    
                    # Save model
                    save_path = os.path.join(self.save_dir, f"{method}_{arch}_{test_acc:.1f}pct.pth")
                    torch.save({
                        'model_state_dict': trainer.model.state_dict(),
                        'method': method,
                        'architecture': arch,
                        'num_classes': len(unique_ages),
                        'label_mapping': label_mapping,
                        'test_accuracy': test_acc,
                        'val_accuracy': val_acc
                    }, save_path)
                    
                    torch.cuda.empty_cache()
                    
                except Exception as e:
                    print(f"  {arch} FAILED: {str(e)[:60]}...")
                    continue
            
            results[method] = method_results
        
        # Print final comparison
        self.print_final_results(results)
        
        # Save results
        results_path = os.path.join(self.save_dir, "method_comparison_results.json")
        with open(results_path, 'w') as f:
            # Convert to serializable format
            serializable_results = {}
            for method, arch_results in results.items():
                serializable_results[method] = {}
                for arch, metrics in arch_results.items():
                    serializable_results[method][arch] = {
                        'val_acc': metrics['val_acc'],
                        'test_acc': metrics['test_acc']
                    }
            json.dump(serializable_results, f, indent=2)
        
        return results
    
    def print_final_results(self, results):
        print(f"\n{'='*80}")
        print("FINAL METHOD COMPARISON RESULTS")
        print('='*80)
        print(f"{'Method':<20} {'Architecture':<20} {'Val Acc':<10} {'Test Acc':<10}")
        print('-'*80)
        
        all_results = []
        for method, arch_results in results.items():
            for arch, metrics in arch_results.items():
                all_results.append((method, arch, metrics['val_acc'], metrics['test_acc']))
        
        # Sort by test accuracy
        all_results.sort(key=lambda x: x[3], reverse=True)
        
        for i, (method, arch, val_acc, test_acc) in enumerate(all_results):
            marker = "🏆" if i == 0 else "  "
            print(f"{marker} {method:<18} {arch:<20} {val_acc:6.1f}%    {test_acc:6.1f}%")
        
        # Method summaries
        print(f"\n{'='*60}")
        print("METHOD SUMMARIES (Best result per method):")
        print('='*60)
        
        for method in METHODS_TO_TEST:
            if method in results and results[method]:
                best_result = max(results[method].items(), key=lambda x: x[1]['test_acc'])
                arch, metrics = best_result
                print(f"{method:<20}: {metrics['test_acc']:5.1f}% (using {arch})")

def main():
    print("Multi-Method Deer Age Prediction Comparison")
    print("=" * 60)
    
    start_time = time.time()
    
    # Load data
    images, ages, sources = load_multi_source_data()
    
    # Run comparison
    comparator = MultiMethodComparison()
    results = comparator.run_comparison(images, ages, sources)
    
    elapsed = (time.time() - start_time) / 60
    print(f"\nTotal comparison time: {elapsed:.1f} minutes")
    
    return results

if __name__ == "__main__":
    results = main()

Multi-Method Deer Age Prediction Comparison
Loading NDA_color images...
  Loaded 363 images from NDA_color
Loading NDA_grayscale images...
  Loaded 108 images from NDA_grayscale
Total images: 471
Final dataset: 471 images
Age distribution: {5.5: 121, 4.5: 89, 2.5: 83, 3.5: 111, 1.5: 67}
Source distribution: {'NDA_color': 363, 'NDA_grayscale': 108}
Using device: cuda
GPU: NVIDIA GeForce RTX 2060
Classes: 5
Label mapping: {1.5: 0, 2.5: 1, 3.5: 2, 4.5: 3, 5.5: 4}
Data split - Train: 300, Val: 76, Test: 95

TESTING METHOD: BASELINE

[baseline] Testing efficientnet_b0...

Creating balanced dataset for method: baseline
Total balanced images: 4000
    Epoch 0: Train 79.5%, Val 55.3%
    Epoch 15: Train 100.0%, Val 67.1%
    Epoch 30: Train 100.0%, Val 69.7%
  efficientnet_b0: Val 72.4%, Test 55.8%

[baseline] Testing resnet34...

Creating balanced dataset for method: baseline
Total balanced images: 4000
    Epoch 0: Train 60.2%, Val 60.5%
    Epoch 15: Train 100.0%, Val 71.1%
  resnet34: Val 

### Fine-tune the robust model

```
================================================================================
FINAL METHOD COMPARISON RESULTS
================================================================================
Method               Architecture         Val Acc    Test Acc  
--------------------------------------------------------------------------------
🏆 baseline           resnet34               71.1%      64.2%
   robust_training    mobilenetv3_large_100   78.9%      64.2%
   uncertainty_aware  efficientnet_b0        75.0%      63.2%
```
The output from the previou code suggests `robust_training` is learning more generalizable traits than the `baseline` model. Let's try to fine-tune its hyperparameters..

In [7]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import random
import json
import os
import glob
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Hyperparameter combinations to test for robust_training optimization
ROBUST_TRAINING_CONFIGS = [
    # Original robust_training
    {'label_smoothing': 0.25, 'dropout': 0.5, 'lr': 0.0003, 'weight_decay': 0.02, 'max_epochs': 60, 'name': 'original'},
    
    # Lower label smoothing variations
    {'label_smoothing': 0.15, 'dropout': 0.5, 'lr': 0.0003, 'weight_decay': 0.02, 'max_epochs': 80, 'name': 'less_smoothing'},
    {'label_smoothing': 0.20, 'dropout': 0.4, 'lr': 0.0003, 'weight_decay': 0.02, 'max_epochs': 80, 'name': 'moderate_reg'},
    
    # Learning rate variations
    {'label_smoothing': 0.25, 'dropout': 0.5, 'lr': 0.0001, 'weight_decay': 0.02, 'max_epochs': 100, 'name': 'lower_lr'},
    {'label_smoothing': 0.25, 'dropout': 0.5, 'lr': 0.0005, 'weight_decay': 0.01, 'max_epochs': 60, 'name': 'higher_lr'},
    
    # Training length variations
    {'label_smoothing': 0.20, 'dropout': 0.4, 'lr': 0.0002, 'weight_decay': 0.015, 'max_epochs': 120, 'name': 'extended_training'},
    
    # Weight decay variations
    {'label_smoothing': 0.25, 'dropout': 0.5, 'lr': 0.0003, 'weight_decay': 0.01, 'max_epochs': 80, 'name': 'less_decay'},
    {'label_smoothing': 0.25, 'dropout': 0.5, 'lr': 0.0003, 'weight_decay': 0.03, 'max_epochs': 80, 'name': 'more_decay'},
    
    # Conservative approach (less aggressive)
    {'label_smoothing': 0.10, 'dropout': 0.3, 'lr': 0.0003, 'weight_decay': 0.02, 'max_epochs': 80, 'name': 'conservative'},
    
    # Aggressive approach (more regularization)
    {'label_smoothing': 0.35, 'dropout': 0.6, 'lr': 0.0002, 'weight_decay': 0.025, 'max_epochs': 100, 'name': 'aggressive'},
]

IMAGE_SIZE = (224, 224)
AUGMENTATION_TARGET = 800
BATCH_SIZE = 16

def detect_and_convert_image(image):
    """Detect if image is grayscale and convert to 3-channel RGB"""
    if len(image.shape) == 2:
        return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    elif len(image.shape) == 3:
        if image.shape[2] == 1:
            return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 3:
            return image
        elif image.shape[2] == 4:
            return cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
    return image

def load_multi_source_data():
    """Load data from multiple sources"""
    data_sources = [
        ("G:\\Dropbox\\AI Projects\\buck\\images\\squared\\color\\*.png", "NDA_color"),
        ("G:\\Dropbox\\AI Projects\\buck\\images\\squared\\grayscale\\*.png", "NDA_grayscale"),
    ]
    
    images = []
    ages = []
    sources = []
    
    for path_pattern, source_name in data_sources:
        print(f"Loading {source_name} images...")
        files = glob.glob(path_pattern)
        source_count = 0
        
        for img_path in files:
            try:
                img = cv2.imread(img_path)
                if img is None:
                    continue
                
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = detect_and_convert_image(img)
                img_resized = cv2.resize(img, IMAGE_SIZE[::-1])
                
                filename = os.path.basename(img_path)
                filename_no_ext = os.path.splitext(filename)[0]
                parts = filename_no_ext.split('_')
                
                if len(parts) < 5:
                    continue
                
                age_part = parts[3]
                if 'xpx' in age_part.lower() or 'p' not in age_part:
                    continue
                
                try:
                    age_value = float(age_part.replace('p', '.'))
                    images.append(img_resized)
                    ages.append(age_value)
                    sources.append(source_name)
                    source_count += 1
                except ValueError:
                    continue
                    
            except Exception as e:
                continue
        
        print(f"  Loaded {source_count} images from {source_name}")
    
    print(f"Total images: {len(images)}")
    
    # Group ages and filter
    ages_grouped = [5.5 if age >= 5.5 else age for age in ages]
    age_counts = Counter(ages_grouped)
    valid_ages = {age for age, count in age_counts.items() if count >= 3}
    
    filtered_images = []
    filtered_ages = []
    filtered_sources = []
    
    for img, age, source in zip(images, ages_grouped, sources):
        if age in valid_ages:
            filtered_images.append(img)
            filtered_ages.append(age)
            filtered_sources.append(source)
    
    print(f"Final dataset: {len(filtered_images)} images")
    print(f"Age distribution: {dict(Counter(filtered_ages))}")
    print(f"Source distribution: {dict(Counter(filtered_sources))}")
    
    return np.array(filtered_images), filtered_ages, filtered_sources

def enhanced_augment_image_robust(image, strength_factor=1.0):
    """Enhanced augmentation for robust training with variable strength"""
    if image.dtype != np.uint8:
        image = image.astype(np.uint8)
    
    # Base augmentations
    if random.random() < 0.7:
        angle = random.uniform(-15, 15)
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))
    
    if random.random() < 0.5:
        image = cv2.flip(image, 1)
    
    # Color/contrast augmentation
    if random.random() < 0.8:
        alpha = random.uniform(0.7, 1.3)
        beta = random.randint(-25, 25)
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
    
    # Strategic grayscale conversion
    if len(image.shape) == 3 and image.shape[2] == 3 and random.random() < 0.4:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        image = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
    
    # Variable strength additional augmentations
    gamma_prob = min(0.6, 0.3 * strength_factor)
    noise_prob = min(0.5, 0.2 * strength_factor)
    
    if random.random() < gamma_prob:
        gamma_range = (0.7, 1.4) if strength_factor > 1.0 else (0.8, 1.3)
        gamma = random.uniform(*gamma_range)
        inv_gamma = 1.0 / gamma
        table = np.array([((i / 255.0) ** inv_gamma) * 255 for i in np.arange(0, 256)]).astype("uint8")
        image = cv2.LUT(image, table)
    
    if random.random() < noise_prob:
        noise_std = 12 if strength_factor > 1.0 else 8
        noise = np.random.normal(0, noise_std, image.shape).astype(np.int16)
        image_int16 = image.astype(np.int16)
        noisy_image = np.clip(image_int16 + noise, 0, 255)
        image = noisy_image.astype(np.uint8)
    
    return image

def create_balanced_dataset_robust(X, y, sources, strength_factor=1.0):
    """Create balanced dataset with variable augmentation strength"""
    class_counts = Counter(y)
    max_count = max(class_counts.values())
    target_count = max(AUGMENTATION_TARGET, max_count)
    
    X_balanced = []
    y_balanced = []
    sources_balanced = []
    
    for class_idx in range(len(set(y))):
        class_mask = np.array(y) == class_idx
        class_images = X[class_mask]
        class_sources = np.array(sources)[class_mask]
        current_count = len(class_images)
        
        if current_count == 0:
            continue
        
        # Add originals
        X_balanced.extend(class_images)
        y_balanced.extend([class_idx] * current_count)
        sources_balanced.extend(class_sources)
        
        # Add augmented
        needed = target_count - current_count
        for i in range(needed):
            orig_idx = random.randint(0, current_count - 1)
            aug_img = enhanced_augment_image_robust(class_images[orig_idx].copy(), strength_factor)
            X_balanced.append(aug_img)
            y_balanced.append(class_idx)
            sources_balanced.append(class_sources[orig_idx])
    
    return np.array(X_balanced), np.array(y_balanced), sources_balanced

class RobustDeerDataset(Dataset):
    def __init__(self, X, y, sources=None, training=True):
        self.X = torch.FloatTensor(X if isinstance(X, np.ndarray) else np.array(X))
        self.y = torch.LongTensor(y if isinstance(y, np.ndarray) else np.array(y))
        self.sources = sources
        self.training = training
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        image = self.X[idx].clone()
        label = self.y[idx].clone()
        
        if image.max() > 1.0:
            image = image / 255.0
        
        if len(image.shape) == 3 and image.shape[-1] == 3:
            image = image.permute(2, 0, 1)
        
        # Test time augmentation for validation/test
        if not self.training and random.random() < 0.5:
            image = torch.flip(image, [2])
        
        image = (image - self.mean) / self.std
        return image, label

class RobustTrainer:
    def __init__(self, architecture, num_classes, config, device):
        self.architecture = architecture
        self.num_classes = num_classes
        self.config = config
        self.device = device
        self.config_name = config['name']
        
        # Create model with config-specific dropout
        self.model = timm.create_model(architecture, pretrained=True, num_classes=num_classes)
        
        # Replace classifier with config-specific dropout
        if hasattr(self.model, 'fc'):
            in_features = self.model.fc.in_features
            self.model.fc = nn.Sequential(
                nn.Dropout(config['dropout']),
                nn.Linear(in_features, num_classes)
            )
        elif hasattr(self.model, 'classifier'):
            if hasattr(self.model.classifier, 'in_features'):
                in_features = self.model.classifier.in_features
            else:
                in_features = self.model.classifier[-1].in_features
            self.model.classifier = nn.Sequential(
                nn.Dropout(config['dropout']),
                nn.Linear(in_features, num_classes)
            )
        elif hasattr(self.model, 'head'):
            if hasattr(self.model.head, 'in_features'):
                in_features = self.model.head.in_features
            else:
                in_features = self.model.head[-1].in_features if hasattr(self.model.head, '__getitem__') else 512
            self.model.head = nn.Sequential(
                nn.Dropout(config['dropout']),
                nn.Linear(in_features, num_classes)
            )
        
        self.model = self.model.to(device)
        
        # Config-specific training components
        self.criterion = nn.CrossEntropyLoss(label_smoothing=config['label_smoothing'])
        self.optimizer = optim.AdamW(self.model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=config['max_epochs'], eta_min=1e-6)
    
    def train_model(self, train_loader, val_loader):
        print(f"Training with config: {self.config_name}")
        print(f"  Label smoothing: {self.config['label_smoothing']}, Dropout: {self.config['dropout']}")
        print(f"  LR: {self.config['lr']}, Weight decay: {self.config['weight_decay']}, Max epochs: {self.config['max_epochs']}")
        
        best_val_acc = 0.0
        best_test_acc = 0.0  # Track test acc at best val
        patience = 20
        patience_counter = 0
        best_state = None
        
        for epoch in range(self.config['max_epochs']):
            # Training
            self.model.train()
            train_correct = 0
            train_total = 0
            train_loss = 0.0
            
            for images, labels in train_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                self.optimizer.zero_grad()
                
                outputs = self.model(images)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()
                
                _, predicted = torch.max(outputs, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()
                train_loss += loss.item()
            
            # Validation
            val_acc = self.evaluate(val_loader)
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_counter = 0
                best_state = self.model.state_dict().copy()
            else:
                patience_counter += 1
            
            self.scheduler.step()
            
            if epoch % 20 == 0:
                train_acc = 100 * train_correct / train_total
                print(f"    Epoch {epoch}: Train {train_acc:.1f}%, Val {val_acc:.1f}%")
            
            if patience_counter >= patience:
                print(f"    Early stopping at epoch {epoch}")
                break
        
        # Load best weights
        if best_state is not None:
            self.model.load_state_dict(best_state)
        
        return best_val_acc
    
    def evaluate(self, loader):
        self.model.eval()
        correct = 0
        total = 0
        
        with torch.no_grad():
            for images, labels in loader:
                images, labels = images.to(self.device), labels.to(self.device)
                
                # Test time augmentation
                outputs1 = self.model(images)
                flipped = torch.flip(images, [3])
                outputs2 = self.model(flipped)
                outputs = (outputs1 + outputs2) / 2
                
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        return 100 * correct / total

class RobustTrainingOptimizer:
    def __init__(self, save_dir=None):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        if save_dir is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.save_dir = f"robust_training_optimization_{timestamp}"
        else:
            self.save_dir = save_dir
        
        os.makedirs(self.save_dir, exist_ok=True)
        
        print(f"Using device: {self.device}")
        if torch.cuda.is_available():
            print(f"GPU: {torch.cuda.get_device_name()}")
    
    def run_optimization(self, images, ages, sources):
        unique_ages = sorted(list(set(ages)))
        label_mapping = {age: i for i, age in enumerate(unique_ages)}
        y_indices = np.array([label_mapping[age] for age in ages])
        
        print(f"Classes: {len(unique_ages)}")
        print(f"Label mapping: {label_mapping}")
        
        # Use same train/test split for fair comparison
        X_train, X_test, y_train, y_test, sources_train, sources_test = train_test_split(
            images, y_indices, sources, test_size=0.2, random_state=42, stratify=y_indices
        )
        
        X_train_final, X_val, y_train_final, y_val, sources_train_final, sources_val = train_test_split(
            X_train, y_train, sources_train, test_size=0.2, random_state=42, stratify=y_train
        )
        
        print(f"Data split - Train: {len(X_train_final)}, Val: {len(X_val)}, Test: {len(X_test)}")
        
        # Test best architectures from previous results: resnet34 and mobilenetv3_large_100
        best_archs = ['resnet34', 'mobilenetv3_large_100']
        
        results = {}
        best_overall = {'config': None, 'arch': None, 'test_acc': 0.0, 'val_acc': 0.0}
        
        for arch in best_archs:
            print(f"\n{'='*70}")
            print(f"OPTIMIZING ROBUST TRAINING WITH {arch.upper()}")
            print('='*70)
            
            arch_results = {}
            
            for config in ROBUST_TRAINING_CONFIGS:
                print(f"\n[{arch}] Testing config: {config['name']}")
                
                try:
                    # Create config-specific balanced dataset
                    # Use different strength factors for different configs
                    strength_factor = 1.2 if 'aggressive' in config['name'] else 1.0
                    if 'conservative' in config['name']:
                        strength_factor = 0.8
                    
                    X_balanced, y_balanced, sources_balanced = create_balanced_dataset_robust(
                        X_train_final, y_train_final, sources_train_final, strength_factor
                    )
                    
                    # Create datasets
                    train_dataset = RobustDeerDataset(X_balanced, y_balanced, sources_balanced, training=True)
                    val_dataset = RobustDeerDataset(X_val, y_val, sources_val, training=False)
                    test_dataset = RobustDeerDataset(X_test, y_test, sources_test, training=False)
                    
                    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
                    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
                    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
                    
                    # Train model
                    trainer = RobustTrainer(arch, len(unique_ages), config, self.device)
                    val_acc = trainer.train_model(train_loader, val_loader)
                    test_acc = trainer.evaluate(test_loader)
                    
                    arch_results[config['name']] = {
                        'val_acc': val_acc,
                        'test_acc': test_acc,
                        'config': config,
                        'model_state': trainer.model.state_dict().copy()
                    }
                    
                    print(f"  Result: Val {val_acc:.1f}%, Test {test_acc:.1f}%")
                    
                    # Track best overall
                    if test_acc > best_overall['test_acc']:
                        best_overall = {
                            'config': config['name'],
                            'arch': arch,
                            'test_acc': test_acc,
                            'val_acc': val_acc,
                            'model_state': trainer.model.state_dict().copy()
                        }
                    
                    # Save model
                    save_path = os.path.join(self.save_dir, f"robust_{arch}_{config['name']}_{test_acc:.1f}pct.pth")
                    torch.save({
                        'model_state_dict': trainer.model.state_dict(),
                        'architecture': arch,
                        'config': config,
                        'num_classes': len(unique_ages),
                        'label_mapping': label_mapping,
                        'test_accuracy': test_acc,
                        'val_accuracy': val_acc
                    }, save_path)
                    
                    torch.cuda.empty_cache()
                    
                except Exception as e:
                    print(f"  FAILED: {str(e)[:60]}...")
                    continue
            
            results[arch] = arch_results
        
        # Print optimization results
        self.print_optimization_results(results, best_overall)
        
        # Save best model separately
        if best_overall['model_state'] is not None:
            best_save_path = os.path.join(self.save_dir, f"BEST_robust_{best_overall['arch']}_{best_overall['config']}_{best_overall['test_acc']:.1f}pct.pth")
            torch.save({
                'model_state_dict': best_overall['model_state'],
                'architecture': best_overall['arch'],
                'config_name': best_overall['config'],
                'num_classes': len(unique_ages),
                'label_mapping': label_mapping,
                'test_accuracy': best_overall['test_acc'],
                'val_accuracy': best_overall['val_acc']
            }, best_save_path)
            print(f"\nBest model saved: {best_save_path}")
        
        return results, best_overall
    
    def print_optimization_results(self, results, best_overall):
        print(f"\n{'='*80}")
        print("ROBUST TRAINING OPTIMIZATION RESULTS")
        print('='*80)
        print(f"{'Architecture':<20} {'Config':<18} {'Val Acc':<10} {'Test Acc':<10}")
        print('-'*80)
        
        all_results = []
        for arch, configs in results.items():
            for config_name, metrics in configs.items():
                all_results.append((arch, config_name, metrics['val_acc'], metrics['test_acc']))
        
        # Sort by test accuracy
        all_results.sort(key=lambda x: x[3], reverse=True)
        
        for i, (arch, config, val_acc, test_acc) in enumerate(all_results):
            marker = "🏆" if i == 0 else "  "
            print(f"{marker} {arch:<18} {config:<18} {val_acc:6.1f}%    {test_acc:6.1f}%")
        
        print(f"\n{'='*60}")
        print("BEST CONFIGURATION FOUND:")
        print(f"Architecture: {best_overall['arch']}")
        print(f"Config: {best_overall['config']}")
        print(f"Validation Accuracy: {best_overall['val_acc']:.1f}%")
        print(f"Test Accuracy: {best_overall['test_acc']:.1f}%")

def main():
    print("Robust Training Optimization for Deer Age Prediction")
    print("=" * 60)
    
    start_time = time.time()
    
    # Load data
    images, ages, sources = load_multi_source_data()
    
    # Run optimization
    optimizer = RobustTrainingOptimizer()
    results, best_config = optimizer.run_optimization(images, ages, sources)
    
    elapsed = (time.time() - start_time) / 60
    print(f"\nTotal optimization time: {elapsed:.1f} minutes")
    
    return results, best_config

if __name__ == "__main__":
    results, best = main()

Robust Training Optimization for Deer Age Prediction
Loading NDA_color images...
  Loaded 363 images from NDA_color
Loading NDA_grayscale images...
  Loaded 108 images from NDA_grayscale
Total images: 471
Final dataset: 471 images
Age distribution: {5.5: 121, 4.5: 89, 2.5: 83, 3.5: 111, 1.5: 67}
Source distribution: {'NDA_color': 363, 'NDA_grayscale': 108}
Using device: cuda
GPU: NVIDIA GeForce RTX 2060
Classes: 5
Label mapping: {1.5: 0, 2.5: 1, 3.5: 2, 4.5: 3, 5.5: 4}
Data split - Train: 300, Val: 76, Test: 95

OPTIMIZING ROBUST TRAINING WITH RESNET34

[resnet34] Testing config: original
Training with config: original
  Label smoothing: 0.25, Dropout: 0.5
  LR: 0.0003, Weight decay: 0.02, Max epochs: 60
    Epoch 0: Train 44.0%, Val 55.3%
    Epoch 20: Train 99.9%, Val 67.1%
    Early stopping at epoch 36
  Result: Val 73.7%, Test 58.9%

[resnet34] Testing config: less_smoothing
Training with config: less_smoothing
  Label smoothing: 0.15, Dropout: 0.5
  LR: 0.0003, Weight decay: 0.02