In [2]:
# Check whether PyTorch recognizes CUDA

import torch
# Report the installed PyTorch build and what it can see of the GPU.
cuda_ready = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {torch.cuda.device_count()}")

if not cuda_ready:
    print("❌ CUDA not detected by PyTorch")
else:
    # Device 0 is the only GPU queried; total_memory is in bytes.
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU memory: {total_bytes / 1024**3:.1f} GB")

PyTorch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1
GPU count: 1
GPU name: NVIDIA GeForce RTX 2060
GPU memory: 6.0 GB


In [3]:
# Broad model family search

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import os
import gc
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Mixed precision imports (from your reference code)
try:
    from torch.cuda.amp import autocast, GradScaler
except ImportError:
    # AMP cannot be imported from this PyTorch build: record that and supply a
    # no-op stand-in so `with autocast():` stays valid at every call site.
    MIXED_PRECISION_AVAILABLE = False

    class autocast:
        """Do-nothing context manager used when torch.cuda.amp is missing."""

        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            # Returning None lets any exception propagate unchanged.
            return None
else:
    MIXED_PRECISION_AVAILABLE = True

# GPU Configuration (matching your reference code)
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ready else 'cpu')
print(f"Using device: {device}")

# Mixed precision is used only when a GPU is present AND the AMP import worked.
use_amp = cuda_ready and MIXED_PRECISION_AVAILABLE

if not cuda_ready:
    print("WARNING: GPU not available")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    torch.backends.cudnn.benchmark = True

    if use_amp:
        # Module-level gradient scaler, read by the training function below.
        scaler = GradScaler()
        print("Mixed Precision: Enabled")
    else:
        print("Mixed Precision: Disabled")

# Data paths
color_path = r"G:\Dropbox\AI Projects\buck\images\squared\color"
grayscale_path = r"G:\Dropbox\AI Projects\buck\images\squared\grayscale"

def parse_filename(filename):
    """Extract the age encoded in the fourth underscore-separated field.

    The age is written with ``p`` as the decimal point (``2p5`` -> 2.5) and is
    capped at 5.5.

    Args:
        filename: File name (not a full path), e.g. ``id_a_b_3p5_x.jpg``.

    Returns:
        The age as a float (at most 5.5), or None when the name has fewer than
        four fields or the age field is not numeric (e.g. ``xpx``).
    """
    # Drop a trailing image extension first. Otherwise a name whose age is the
    # LAST field ("..._3p5.jpg") yields the token "3p5.jpg", and the p -> "."
    # replacement corrupts the extension, so the file is silently skipped.
    stem = filename
    lowered = filename.lower()
    for ext in ('.jpeg', '.jpg', '.png'):
        if lowered.endswith(ext):
            stem = filename[:-len(ext)]
            break

    parts = stem.split('_')
    if len(parts) < 4:
        return None

    try:
        age = float(parts[3].replace('p', '.'))
    except ValueError:
        # Skip files with non-numeric age (e.g., "xpx")
        return None

    # Cap ages over 5.5 to 5.5
    if age > 5.5:
        age = 5.5
    return age

def age_to_class(age):
    """Map an age (1.5, 2.5, 3.5, 4.5 or 5.5) to its class index 0-4.

    Any other value yields None.
    """
    class_ages = (1.5, 2.5, 3.5, 4.5, 5.5)
    lookup = {class_age: idx for idx, class_age in enumerate(class_ages)}
    return lookup.get(age)

def load_images(color_path, grayscale_path, img_size=(224, 224)):
    """Load every labelled image from both folders as 3-channel grayscale.

    Files are kept only if they end in .jpg/.jpeg/.png, ``parse_filename``
    yields an age, and ``age_to_class`` maps that age to a class. Color images
    are converted to grayscale; every image is resized and then replicated to
    three channels for pretrained models.

    Args:
        color_path: Folder of color images (skipped if it does not exist).
        grayscale_path: Folder of grayscale images (skipped if missing).
        img_size: Target size passed to ``cv2.resize`` as (width, height).

    Returns:
        (images, ages): ``images`` stacks the resized images in load order;
        ``ages`` holds the matching class indices.

    Raises:
        ValueError: If no usable image was found in either folder.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    img_size = tuple(img_size)
    # cv2.resize takes (width, height) but the array shape is (height, width),
    # so the expected shape is the reverse of img_size. Comparing against
    # img_size directly only worked for square targets.
    expected_hw = (img_size[1], img_size[0])

    images = []
    ages = []

    def _load_folder(folder, already_gray):
        """Append every usable image in ``folder`` to images/ages."""
        if not os.path.exists(folder):
            return
        # Sort so load order (and therefore the seeded train/test split) does
        # not depend on the filesystem's listing order.
        for filename in sorted(os.listdir(folder)):
            if not filename.lower().endswith(image_exts):
                continue
            age = parse_filename(filename)
            if age is None:
                continue
            class_idx = age_to_class(age)
            if class_idx is None:
                continue

            img_path = os.path.join(folder, filename)
            if already_gray:
                img_gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            else:
                img = cv2.imread(img_path)
                img_gray = None if img is None else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            if img_gray is None:
                # cv2.imread returns None for unreadable files; skip them.
                continue

            img_resized = cv2.resize(img_gray, img_size)
            assert img_resized.shape == expected_hw, f"Image {filename} not resized correctly: {img_resized.shape}"
            # Convert to 3-channel for pretrained models
            images.append(cv2.cvtColor(img_resized, cv2.COLOR_GRAY2RGB))
            ages.append(class_idx)

    # Process color images (convert to grayscale), then grayscale images
    _load_folder(color_path, already_gray=False)
    _load_folder(grayscale_path, already_gray=True)

    if not images:
        raise ValueError(
            f"No usable images found in {color_path!r} or {grayscale_path!r}"
        )

    images = np.array(images)
    ages = np.array(ages)

    # Verify final dimensions
    assert images.shape[1:3] == expected_hw, f"Final image dimensions incorrect: {images.shape}"
    print(f"Images loaded with shape: {images.shape}")
    print(f"Classes: {np.unique(ages)} (0=1.5yr, 1=2.5yr, 2=3.5yr, 3=4.5yr, 4=5.5yr)")
    print(f"Class distribution: {Counter(ages)}")

    return images, ages

class DeerDataset(Dataset):
    """In-memory image dataset yielding ImageNet-normalized CHW tensors.

    Args:
        X: Array-like of images, either 0-255 pixel values or already in [0, 1].
        y: Array-like of integer class labels, one per image.
    """

    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
        # Decide the pixel scale ONCE for the whole dataset. A per-image
        # `max() > 1.0` test leaves a near-black 0-255 image (max <= 1)
        # unscaled, so its value 1 is treated as full intensity.
        has_byte_range = self.X.numel() > 0 and bool(self.X.max() > 1.0)
        self.pixel_scale = 255.0 if has_byte_range else 1.0

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return (image, label); image is float, CHW, ImageNet-normalized."""
        # Division creates a new tensor, so the stored data is never modified.
        image = self.X[idx] / self.pixel_scale
        label = self.y[idx].clone()

        # Convert to CHW format
        if image.dim() == 3 and image.shape[-1] == 3:
            image = image.permute(2, 0, 1)

        # Normalize with ImageNet stats
        image = (image - self.mean) / self.std

        return image, label

def create_model(model_name, num_classes=5):
    """Build a pretrained timm model for ``model_name`` and move it to ``device``.

    Raises ValueError when ``model_name`` is not one of the supported names.
    """
    # Display name -> timm architecture identifier.
    timm_names = {
        'ResNet50': 'resnet50',
        'EfficientNetB0': 'efficientnet_b0',
        'VGG16': 'vgg16',
        'MobileNetV2': 'mobilenetv2_100',
        'InceptionV3': 'inception_v3',
        'DenseNet121': 'densenet121',
        'ResNet101': 'resnet101',
        'ResNet152': 'resnet152',
        'EfficientNetB1': 'efficientnet_b1',
        'EfficientNetB2': 'efficientnet_b2',
    }

    architecture = timm_names.get(model_name)
    if architecture is None:
        raise ValueError(f"Unknown model: {model_name}")

    network = timm.create_model(architecture, pretrained=True, num_classes=num_classes)
    return network.to(device)

def train_model(model, train_loader, test_loader, model_name, epochs=50):
    """Train ``model`` and return it restored to its best-scoring epoch.

    Uses AdamW with label-smoothed cross-entropy, halves the learning rate
    when accuracy on ``test_loader`` plateaus, and stops early after 10 epochs
    without improvement. Reads the module-level ``device``, ``use_amp`` and,
    when AMP is on, ``scaler``.

    Args:
        model: Network already placed on ``device``.
        train_loader: Batches of (images, labels) used for optimization.
        test_loader: Batches of (images, labels) used for scoring each epoch.
        model_name: Label used in progress messages only.
        epochs: Maximum number of epochs.

    Returns:
        (model, best_acc): the model with its best weights loaded, and the
        best accuracy seen on ``test_loader``, in percent.

    NOTE(review): ``test_loader`` drives both early stopping and the reported
    score, so ``best_acc`` is an optimistic estimate; a separate validation
    split would be needed for an unbiased number.
    """
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
    # The scheduler is stepped with accuracy (higher is better), so it must be
    # in 'max' mode. The default 'min' mode counts rising accuracy as "no
    # improvement" and keeps cutting the learning rate.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', patience=5, factor=0.5, min_lr=1e-6
    )

    best_acc = 0.0
    patience = 10
    patience_counter = 0
    best_state = None

    print(f"Training {model_name} on {device}")

    for epoch in range(epochs):
        # Training
        model.train()
        train_correct = 0
        train_total = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            if use_amp:
                with autocast():
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

            # Memory management (from your reference code)
            if batch_idx % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        # max(..., 1) avoids ZeroDivisionError when a loader yields no batches.
        train_acc = 100 * train_correct / max(train_total, 1)

        # Validation
        model.eval()
        test_correct = 0
        test_total = 0

        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)

                if use_amp:
                    with autocast():
                        outputs = model(images)
                else:
                    outputs = model(images)

                _, predicted = torch.max(outputs, 1)
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()

        test_acc = 100 * test_correct / max(test_total, 1)
        scheduler.step(test_acc)

        # Early stopping
        if test_acc > best_acc:
            best_acc = test_acc
            patience_counter = 0
            # state_dict().copy() is shallow: its tensors alias the live
            # parameters, so the "best" snapshot would silently follow later
            # training. Take real copies, on the CPU to spare GPU memory.
            best_state = {
                key: value.detach().to('cpu', copy=True)
                for key, value in model.state_dict().items()
            }
        else:
            patience_counter += 1

        if epoch % 10 == 0 or patience_counter >= patience:
            print(f"  Epoch {epoch:2d}: Train {train_acc:.1f}%, Test {test_acc:.1f}%")

        if patience_counter >= patience:
            print(f"  Early stopping at epoch {epoch}")
            break

        # Memory cleanup
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Restore best model
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_acc

# Create timestamped output folder
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"deer_age_models_{timestamp}"
os.makedirs(output_dir, exist_ok=True)
print(f"Models will be saved to: {output_dir}")

# Load data
print("Loading images...")
X, y = load_images(color_path, grayscale_path)
# y holds class indices (0-4), not ages, so report it as such.
print(f"Loaded {len(X)} images, class index range: {y.min()}-{y.max()}")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# Create datasets
train_dataset = DeerDataset(X_train, y_train)
test_dataset = DeerDataset(X_test, y_test)

# Create dataloaders (using your batch size from reference code)
batch_size = 16  # From your reference code for RTX 2060
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

# Model families to test
model_families = [
    'ResNet50', 'EfficientNetB0', 'VGG16',
    'MobileNetV2', 'InceptionV3', 'DenseNet121'
]

# Larger variants tried only if their base family wins the first round.
family_variations = {
    'ResNet50': ['ResNet101', 'ResNet152'],
    'EfficientNetB0': ['EfficientNetB1', 'EfficientNetB2'],
}

results = []


def _run_family_trial(model_name):
    """Train one architecture, save its checkpoint and record it in ``results``.

    Returns the best accuracy (percent), or None if the trial failed.
    Model references are dropped and the CUDA cache is cleared in ``finally``,
    so a failed trial (e.g. out of GPU memory) does not keep its model on the
    GPU while the remaining trials run.
    """
    model = trained_model = None
    try:
        model = create_model(model_name)
        trained_model, accuracy = train_model(
            model, train_loader, test_loader, model_name
        )

        # Save model with accuracy in filename
        acc_str = f"{accuracy:.3f}".replace('.', 'p')
        model_filename = f"{model_name}_{acc_str}.pth"
        model_path = os.path.join(output_dir, model_filename)

        torch.save({
            'model_state_dict': trained_model.state_dict(),
            'model_name': model_name,
            'accuracy': accuracy,
            'num_classes': 5
        }, model_path)

        results.append({
            'model': model_name,
            'accuracy': accuracy,
            'filename': model_filename,
            'full_path': model_path
        })

        print(f"{model_name}: Accuracy={accuracy:.3f}, Saved: {model_filename}")
        return accuracy
    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"Error with {model_name}: {e}")
        return None
    finally:
        # Cleanup (from your reference code), now also run on failure.
        model = trained_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


best_accuracy = 0
best_family = None

print(f"\nTesting {len(model_families)} model families...")

for model_name in model_families:
    print(f"\nTesting {model_name}...")
    accuracy = _run_family_trial(model_name)
    if accuracy is not None and accuracy > best_accuracy:
        best_accuracy = accuracy
        best_family = model_name

# Find best performing family
print(f"\nBest family: {best_family} (accuracy: {best_accuracy:.3f})")

# Test variations within best family
if best_family:
    print(f"\nTesting variations of {best_family}...")

    for var_name in family_variations.get(best_family, []):
        print(f"\nTesting {var_name}...")
        _run_family_trial(var_name)

# Final results
print(f"\n{'='*50}")
print("FINAL RESULTS")
print(f"{'='*50}")

results.sort(key=lambda x: x['accuracy'], reverse=True)

for i, result in enumerate(results, 1):
    print(f"{i:2d}. {result['model']:15s} - Accuracy: {result['accuracy']:.3f}")

if results:
    best_result = results[0]
    print(f"\nBest model: {best_result['model']} with {best_result['accuracy']:.3f} accuracy")
    print(f"Saved as: {best_result['filename']}")

print(f"\nTotal models tested: {len(results)}")
print(f"All models saved in folder: {output_dir}")
print("All models saved with accuracy in filename.")

Using device: cuda
GPU: NVIDIA GeForce RTX 2060
GPU Memory: 6.0 GB
Mixed Precision: Enabled
Models will be saved to: deer_age_models_20250731_072915
Loading images...
Images loaded with shape: (466, 224, 224, 3)
Classes: [0 1 2 3 4] (0=1.5yr, 1=2.5yr, 2=3.5yr, 3=4.5yr, 4=5.5yr)
Class distribution: Counter({np.int64(4): 118, np.int64(2): 111, np.int64(3): 89, np.int64(1): 82, np.int64(0): 66})
Loaded 466 images, age range: 0.0-4.0
Train: 372, Test: 94

Testing 6 model families...

Testing ResNet50...
Training ResNet50 on cuda
  Epoch  0: Train 22.6%, Test 25.5%
  Epoch 10: Train 100.0%, Test 47.9%
  Epoch 19: Train 100.0%, Test 50.0%
  Early stopping at epoch 19
ResNet50: Accuracy=51.064, Saved: ResNet50_51p064.pth

Testing EfficientNetB0...
Training EfficientNetB0 on cuda
  Epoch  0: Train 29.6%, Test 35.1%
  Epoch 10: Train 100.0%, Test 50.0%
  Epoch 18: Train 100.0%, Test 55.3%
  Early stopping at epoch 18
EfficientNetB0: Accuracy=55.319, Saved: EfficientNetB0_55p319.pth

Testing VGG

In [5]:
# Deeper family search.

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import os
import gc
import random
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

try:
    from torch.cuda.amp import autocast, GradScaler
except ImportError:
    # AMP cannot be imported from this PyTorch build: record that and supply a
    # no-op stand-in so `with autocast():` stays valid at every call site.
    MIXED_PRECISION_AVAILABLE = False

    class autocast:
        """Do-nothing context manager used when torch.cuda.amp is missing."""

        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            # Returning None lets any exception propagate unchanged.
            return None
else:
    MIXED_PRECISION_AVAILABLE = True

# GPU Configuration
cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ready else 'cpu')
print(f"Using device: {device}")

# Mixed precision is used only when a GPU is present AND the AMP import worked.
use_amp = cuda_ready and MIXED_PRECISION_AVAILABLE

if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    torch.backends.cudnn.benchmark = True

    if use_amp:
        # Module-level gradient scaler, read by the training function below.
        scaler = GradScaler()
        print("Mixed Precision: Enabled")

# Data paths
color_path = r"G:\Dropbox\AI Projects\buck\images\squared\color"
grayscale_path = r"G:\Dropbox\AI Projects\buck\images\squared\grayscale"

def parse_filename(filename):
    """Extract the age encoded in the fourth underscore-separated field.

    The age is written with ``p`` as the decimal point (``2p5`` -> 2.5) and is
    capped at 5.5.

    Args:
        filename: File name (not a full path), e.g. ``id_a_b_3p5_x.jpg``.

    Returns:
        The age as a float (at most 5.5), or None when the name has fewer than
        four fields or the age field is not numeric.
    """
    # Drop a trailing image extension first. Otherwise a name whose age is the
    # LAST field ("..._3p5.jpg") yields the token "3p5.jpg", and the p -> "."
    # replacement corrupts the extension, so the file is silently skipped.
    stem = filename
    lowered = filename.lower()
    for ext in ('.jpeg', '.jpg', '.png'):
        if lowered.endswith(ext):
            stem = filename[:-len(ext)]
            break

    parts = stem.split('_')
    if len(parts) < 4:
        return None

    try:
        age = float(parts[3].replace('p', '.'))
    except ValueError:
        return None

    # Ages above 5.5 are folded into the oldest class.
    if age > 5.5:
        age = 5.5
    return age

def age_to_class(age):
    """Map an age (1.5, 2.5, 3.5, 4.5 or 5.5) to its class index 0-4.

    Any other value yields None.
    """
    class_ages = (1.5, 2.5, 3.5, 4.5, 5.5)
    lookup = {class_age: idx for idx, class_age in enumerate(class_ages)}
    return lookup.get(age)

def augment_image(image):
    """Randomly augment one image to reduce overfitting.

    Applied independently, in this order:
      * horizontal flip (p=0.5)
      * rotation by -15..15 degrees about the image centre (p=0.7)
      * contrast/brightness change: pixel * alpha + beta, with alpha in
        0.7..1.3 and beta in -25..25 (p=0.8)
      * additive Gaussian noise with sigma 8 (p=0.3)

    Args:
        image: Image array; assumes 0-255 pixel values (the callers in this
            file pass uint8) -- TODO confirm for other callers.

    Returns:
        The augmented image. The input object itself is returned when no
        augmentation fires.
    """
    if random.random() < 0.5:
        image = cv2.flip(image, 1)

    if random.random() < 0.7:
        angle = random.uniform(-15, 15)
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))

    if random.random() < 0.8:
        alpha = random.uniform(0.7, 1.3)
        beta = random.randint(-25, 25)
        # cv2.convertScaleAbs takes the ABSOLUTE value before saturating, so a
        # dark pixel pushed below zero (e.g. 0 * alpha - 25) came back as 25
        # instead of 0. Clamp to 0..255 instead; rounding stays to-nearest.
        adjusted = image.astype(np.float32) * alpha + beta
        image = np.clip(np.rint(adjusted), 0, 255).astype(np.uint8)

    if random.random() < 0.3:
        # Add the noise in int16 so it can go negative, then clamp to uint8.
        noise = np.random.normal(0, 8, image.shape).astype(np.int16)
        image_int16 = image.astype(np.int16)
        noisy_image = np.clip(image_int16 + noise, 0, 255)
        image = noisy_image.astype(np.uint8)

    return image

def load_images(color_path, grayscale_path, img_size=(224, 224)):
    """Load every labelled image from both folders as 3-channel grayscale.

    Files are kept only if they end in .jpg/.jpeg/.png, ``parse_filename``
    yields an age, and ``age_to_class`` maps that age to a class. Color images
    are converted to grayscale; every image is resized and then replicated to
    three channels.

    Args:
        color_path: Folder of color images (skipped if it does not exist).
        grayscale_path: Folder of grayscale images (skipped if missing).
        img_size: Target size passed to ``cv2.resize`` as (width, height).

    Returns:
        (images, ages): ``images`` stacks the resized images in load order;
        ``ages`` holds the matching class indices.

    Raises:
        ValueError: If no usable image was found in either folder.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    images = []
    ages = []

    def _load_folder(folder, already_gray):
        """Append every usable image in ``folder`` to images/ages."""
        if not os.path.exists(folder):
            return
        # Sort so load order (and therefore the seeded train/test split) does
        # not depend on the filesystem's listing order.
        for filename in sorted(os.listdir(folder)):
            if not filename.lower().endswith(image_exts):
                continue
            age = parse_filename(filename)
            if age is None:
                continue
            class_idx = age_to_class(age)
            if class_idx is None:
                continue

            img_path = os.path.join(folder, filename)
            if already_gray:
                img_gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            else:
                img = cv2.imread(img_path)
                img_gray = None if img is None else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            if img_gray is None:
                # cv2.imread returns None for unreadable files; skip them.
                continue

            img_resized = cv2.resize(img_gray, img_size)
            images.append(cv2.cvtColor(img_resized, cv2.COLOR_GRAY2RGB))
            ages.append(class_idx)

    _load_folder(color_path, already_gray=False)
    _load_folder(grayscale_path, already_gray=True)

    if not images:
        # Fail here with a clear message rather than later, on an empty array.
        raise ValueError(
            f"No usable images found in {color_path!r} or {grayscale_path!r}"
        )

    images = np.array(images)
    ages = np.array(ages)

    print(f"Images loaded with shape: {images.shape}")
    print(f"Class distribution: {Counter(ages)}")

    return images, ages

class AugmentedDeerDataset(Dataset):
    """Image dataset with optional on-the-fly augmentation.

    Args:
        X: Indexable collection of HWC image arrays (each must support
            ``.copy()``).
        y: Array-like of integer class labels, one per image.
        augment: When True, every fetched image goes through ``augment_image``.
    """

    def __init__(self, X, y, augment=False):
        self.X = X
        self.y = torch.LongTensor(y)
        self.augment = augment
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return (image, label); image is float, CHW, ImageNet-normalized."""
        # Work on a copy so augmentation never modifies the stored image.
        image = self.X[idx].copy()
        label = self.y[idx].clone()

        # Apply augmentation during training
        if self.augment:
            image = augment_image(image)

        # Integer pixel data is always on the 0-255 scale. Deciding from
        # `max() > 1.0` alone leaves a near-black image (max <= 1) unscaled,
        # so its value 1 is treated as full intensity.
        is_integer_pixels = np.issubdtype(np.asarray(image).dtype, np.integer)

        # Convert to tensor and normalize
        image = torch.FloatTensor(image)
        if is_integer_pixels or image.max() > 1.0:
            image = image / 255.0

        if len(image.shape) == 3 and image.shape[-1] == 3:
            image = image.permute(2, 0, 1)

        image = (image - self.mean) / self.std

        return image, label

def create_model_with_regularization(model_name, num_classes=5, dropout_rate=0.5):
    """Create a pretrained timm model with dropout and a mostly frozen backbone.

    For known families only the last stage and the classifier head stay
    trainable; any other architecture is left fully trainable.

    Args:
        model_name: timm architecture identifier (e.g. ``'densenet121'``).
        num_classes: Size of the classification head.
        dropout_rate: Passed to timm as ``drop_rate``.

    Returns:
        The model, moved to the module-level ``device``.
    """
    model = timm.create_model(model_name, pretrained=True, num_classes=num_classes, drop_rate=dropout_rate)

    def _freeze_except(is_trainable):
        """Turn off gradients for every parameter whose name fails the test."""
        for name, param in model.named_parameters():
            if not is_trainable(name):
                param.requires_grad = False

    # Freeze more layers to reduce overfitting
    if 'resnet' in model_name or 'resnext' in model_name:
        # 'resnext...' does not contain the substring 'resnet', so ResNeXt
        # used to fall through and train unfrozen.
        # NOTE(review): assumes timm's ResNeXt uses the same layer4/fc naming
        # as ResNet -- confirm with model.named_parameters().
        _freeze_except(lambda name: 'layer4' in name or 'fc' in name)
    elif 'efficientnet' in model_name:
        _freeze_except(
            lambda name: 'blocks.6' in name or 'blocks.7' in name or 'classifier' in name
        )
    elif 'densenet' in model_name:
        _freeze_except(lambda name: 'denseblock4' in name or 'classifier' in name)
    elif 'mobilenet' in model_name:
        # The previous patterns ('features.18', 'features.19') are torchvision
        # names; timm's MobileNet models have no 'features' module, so nothing
        # but the classifier matched. Keep the head conv/norm and classifier
        # trainable instead. Match on the top-level module name, because a
        # substring test for 'bn2' would also hit every block's inner bn2.
        # NOTE(review): module names assumed from timm's MobileNetV2/V3
        # implementations -- confirm against the installed timm version.
        head_modules = ('conv_head', 'bn2', 'norm_head', 'classifier')
        _freeze_except(lambda name: name.split('.', 1)[0] in head_modules)

    return model.to(device)

def train_model_improved(model, train_loader, test_loader, model_name, epochs=60):
    """Train ``model`` with stronger regularization; return it at its best epoch.

    Uses AdamW (lr 5e-4, weight decay 0.05), label smoothing 0.15 and a cosine
    learning-rate schedule over ``epochs``; stops early after 15 epochs without
    improvement on ``test_loader``. Reads the module-level ``device``,
    ``use_amp`` and, when AMP is on, ``scaler``.

    Args:
        model: Network already placed on ``device``.
        train_loader: Batches of (images, labels) used for optimization.
        test_loader: Batches of (images, labels) used for scoring each epoch.
        model_name: Label used in progress messages only.
        epochs: Maximum number of epochs (also the cosine period).

    Returns:
        (model, best_acc): the model with its best weights loaded, and the
        best accuracy seen on ``test_loader``, in percent.

    NOTE(review): ``test_loader`` drives both early stopping and the reported
    score, so ``best_acc`` is an optimistic estimate.
    """
    criterion = nn.CrossEntropyLoss(label_smoothing=0.15)

    # Lower learning rate and higher weight decay
    optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.05)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-6)

    best_acc = 0.0
    patience = 15
    patience_counter = 0
    best_state = None

    print(f"Training {model_name} with improved regularization")

    for epoch in range(epochs):
        # Training
        model.train()
        train_correct = 0
        train_total = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            if use_amp:
                with autocast():
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

            if batch_idx % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        # max(..., 1) avoids ZeroDivisionError when a loader yields no batches.
        train_acc = 100 * train_correct / max(train_total, 1)
        scheduler.step()

        # Validation
        model.eval()
        test_correct = 0
        test_total = 0

        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)

                if use_amp:
                    with autocast():
                        outputs = model(images)
                else:
                    outputs = model(images)

                _, predicted = torch.max(outputs, 1)
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()

        test_acc = 100 * test_correct / max(test_total, 1)

        # Early stopping
        if test_acc > best_acc:
            best_acc = test_acc
            patience_counter = 0
            # state_dict().copy() is shallow: its tensors alias the live
            # parameters, so the "best" snapshot would silently follow later
            # training. Take real copies, on the CPU to spare GPU memory.
            best_state = {
                key: value.detach().to('cpu', copy=True)
                for key, value in model.state_dict().items()
            }
        else:
            patience_counter += 1

        if epoch % 10 == 0 or patience_counter >= patience:
            print(f"  Epoch {epoch:2d}: Train {train_acc:.1f}%, Test {test_acc:.1f}%")

        if patience_counter >= patience:
            print(f"  Early stopping at epoch {epoch}")
            break

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_acc

# Create timestamped output folder
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"deer_age_deep_survey_{timestamp}"
os.makedirs(output_dir, exist_ok=True)
print(f"Models will be saved to: {output_dir}")

# Load data
print("Loading images...")
X, y = load_images(color_path, grayscale_path)
print(f"Loaded {len(X)} images")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# Create datasets with augmentation (training set only)
train_dataset = AugmentedDeerDataset(X_train, y_train, augment=True)
test_dataset = AugmentedDeerDataset(X_test, y_test, augment=False)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

# Deep exploration of top 3 families: (timm identifier, display name)
model_configs = [
    # DenseNet family (won previous round)
    ('densenet121', 'DenseNet121'),
    ('densenet169', 'DenseNet169'),
    ('densenet201', 'DenseNet201'),

    # EfficientNet family (2nd place)
    ('efficientnet_b0', 'EfficientNetB0'),
    ('efficientnet_b1', 'EfficientNetB1'),
    ('efficientnet_b2', 'EfficientNetB2'),
    ('efficientnet_b3', 'EfficientNetB3'),

    # MobileNet family (3rd place)
    ('mobilenetv2_100', 'MobileNetV2'),
    ('mobilenetv3_small_100', 'MobileNetV3Small'),
    ('mobilenetv3_large_100', 'MobileNetV3Large'),

    # Additional high-performers to test
    ('resnet50', 'ResNet50_Regularized'),
    ('resnext50_32x4d', 'ResNeXt50'),
]

results = []


def _run_survey_trial(model_timm_name, display_name):
    """Train one architecture, save its checkpoint and record it in ``results``.

    Model references are dropped and the CUDA cache is cleared in ``finally``,
    so a failed trial (e.g. out of GPU memory) does not keep its model on the
    GPU while the remaining trials run.
    """
    model = trained_model = None
    try:
        model = create_model_with_regularization(model_timm_name, dropout_rate=0.5)
        trained_model, accuracy = train_model_improved(
            model, train_loader, test_loader, display_name
        )

        # Save model
        acc_str = f"{accuracy:.3f}".replace('.', 'p')
        model_filename = f"{display_name}_{acc_str}.pth"
        model_path = os.path.join(output_dir, model_filename)

        torch.save({
            'model_state_dict': trained_model.state_dict(),
            'model_name': display_name,
            'timm_name': model_timm_name,
            'accuracy': accuracy,
            'num_classes': 5
        }, model_path)

        results.append({
            'model': display_name,
            'timm_name': model_timm_name,
            'accuracy': accuracy,
            'filename': model_filename,
            'full_path': model_path
        })

        print(f"{display_name}: {accuracy:.3f}% - Saved: {model_filename}")
    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"Error with {display_name}: {e}")
    finally:
        # Cleanup, now also run on failure.
        model = trained_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


print(f"\nDeep survey: Testing {len(model_configs)} models with improved regularization...")

for model_timm_name, display_name in model_configs:
    print(f"\nTesting {display_name}...")
    _run_survey_trial(model_timm_name, display_name)

# Final results
print(f"\n{'='*60}")
print("DEEP SURVEY RESULTS - TOP 3 FAMILIES + EXTRAS")
print(f"{'='*60}")

results.sort(key=lambda x: x['accuracy'], reverse=True)

# (section heading, substring that identifies the family in the display name)
family_sections = [
    ("DENSENET FAMILY:", 'DenseNet'),
    ("\nEFFICIENTNET FAMILY:", 'EfficientNet'),
    ("\nMOBILENET FAMILY:", 'MobileNet'),
]
family_keys = [key for _, key in family_sections]

for heading, family_key in family_sections:
    print(heading)
    for result in results:
        if family_key in result['model']:
            print(f"  {result['model']:20s} - {result['accuracy']:.3f}%")

print("\nOTHER MODELS:")
for result in results:
    if not any(family in result['model'] for family in family_keys):
        print(f"  {result['model']:20s} - {result['accuracy']:.3f}%")

print(f"\n{'='*60}")
print("OVERALL RANKING:")
for i, result in enumerate(results, 1):
    print(f"{i:2d}. {result['model']:20s} - {result['accuracy']:.3f}%")

if results:
    best_result = results[0]
    print(f"\nBEST MODEL: {best_result['model']} - {best_result['accuracy']:.3f}%")
    print(f"Saved as: {best_result['filename']}")

print(f"\nTotal models tested: {len(results)}")
print(f"All models saved in: {output_dir}")
print("Note: Improved regularization should reduce train/test accuracy gap")

Using device: cuda
GPU: NVIDIA GeForce RTX 2060
GPU Memory: 6.0 GB
Mixed Precision: Enabled
Models will be saved to: deer_age_deep_survey_20250731_075454
Loading images...
Images loaded with shape: (466, 224, 224, 3)
Class distribution: Counter({np.int64(4): 118, np.int64(2): 111, np.int64(3): 89, np.int64(1): 82, np.int64(0): 66})
Loaded 466 images
Train: 372, Test: 94

Deep survey: Testing 12 models with improved regularization...

Testing DenseNet121...
Training DenseNet121 with improved regularization
  Epoch  0: Train 25.5%, Test 30.9%
  Epoch 10: Train 85.8%, Test 56.4%
  Epoch 20: Train 91.1%, Test 46.8%
  Epoch 25: Train 95.7%, Test 47.9%
  Early stopping at epoch 25
DenseNet121: 56.383% - Saved: DenseNet121_56p383.pth

Testing DenseNet169...
Training DenseNet169 with improved regularization
  Epoch  0: Train 27.7%, Test 34.0%
  Epoch 10: Train 90.3%, Test 52.1%
  Epoch 20: Train 95.7%, Test 60.6%
  Epoch 30: Train 98.9%, Test 60.6%
  Epoch 40: Train 99.7%, Test 63.8%
  Epoch 4

In [7]:
# Second attempt at model families 

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import os
import gc
import random
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

try:
    from torch.cuda.amp import autocast, GradScaler
    MIXED_PRECISION_AVAILABLE = True
except ImportError:
    MIXED_PRECISION_AVAILABLE = False
    class autocast:
        def __enter__(self):
            return self
        def __exit__(self, *args):
            pass

# Pick the compute device once; models and batches are moved onto it later.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Mixed precision is enabled only when a CUDA device is present AND the
# torch.cuda.amp import above succeeded.
cuda_ready = torch.cuda.is_available()
use_amp = cuda_ready and MIXED_PRECISION_AVAILABLE

if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    torch.backends.cudnn.benchmark = True

if use_amp:
    # `scaler` is only defined on this path; code must check `use_amp` before touching it.
    scaler = GradScaler()
    print("Mixed Precision: Enabled")

# Source folders for the squared deer images (color and grayscale variants).
color_path = r"G:\Dropbox\AI Projects\buck\images\squared\color"
grayscale_path = r"G:\Dropbox\AI Projects\buck\images\squared\grayscale"

def parse_filename(filename):
    """Read the age label stored in the fourth '_'-separated field of a filename.

    The field uses 'p' as the decimal separator (e.g. '3p5' -> 3.5). Ages above
    5.5 are clamped to 5.5.

    Returns:
        The age as a float, or None when the name has fewer than four fields
        or the fourth field is not a number after the 'p' -> '.' substitution.
    """
    fields = filename.split('_')
    if len(fields) < 4:
        return None

    try:
        age = float(fields[3].replace('p', '.'))
    except ValueError:
        return None

    # Everything older than 5.5 is pooled into the 5.5 bucket.
    return 5.5 if age > 5.5 else age

def age_to_class(age):
    """Map an age of 1.5, 2.5, 3.5, 4.5 or 5.5 to class index 0-4; any other value gives None."""
    class_by_age = dict(zip((1.5, 2.5, 3.5, 4.5, 5.5), range(5)))
    if age in class_by_age:
        return class_by_age[age]
    return None

def mixup_data(x, y, alpha=0.4):
    """Mixup augmentation: blend each sample with another sample of the same batch.

    Args:
        x: batch tensor; the first dimension is the batch dimension.
        y: labels, indexable by a permutation of the batch.
        alpha: Beta(alpha, alpha) parameter for the mixing weight. When
            alpha <= 0 no mixing happens (lam = 1).

    Returns:
        (mixed_x, y_a, y_b, lam) where mixed_x = lam * x + (1 - lam) * x[perm],
        y_a is the original labels, y_b the permuted labels and lam the weight
        of the original sample.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    # Draw the permutation on the CPU (same RNG stream as before) but place it on
    # the batch's own device instead of the global `device`, so the function also
    # works for batches that live elsewhere (e.g. CPU tensors on a CUDA machine).
    index = torch.randperm(x.size(0)).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]

    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Loss for a mixup batch: the two targets' losses blended with weights lam and 1 - lam."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

def _load_labeled_folder(folder, img_size, is_color):
    """Load every labelled .jpg/.jpeg/.png in `folder` as a 3-channel grayscale image.

    Files whose name yields no age (parse_filename) or an age outside the class
    table (age_to_class), and files OpenCV cannot read, are skipped. A missing
    folder yields two empty lists.
    """
    images, labels = [], []
    if not os.path.exists(folder):
        return images, labels

    # Sorted so the sample order - and therefore the later seeded train/test
    # split - does not depend on the filesystem's directory listing order.
    for filename in sorted(os.listdir(folder)):
        if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        age = parse_filename(filename)
        if age is None:
            continue
        class_idx = age_to_class(age)
        if class_idx is None:
            continue

        img_path = os.path.join(folder, filename)
        if is_color:
            img = cv2.imread(img_path)
        else:
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            continue
        if is_color:
            # Color sources are reduced to grayscale so both folders look alike.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        img_resized = cv2.resize(img, img_size)
        images.append(cv2.cvtColor(img_resized, cv2.COLOR_GRAY2RGB))
        labels.append(class_idx)

    return images, labels


def load_images(color_path, grayscale_path, img_size=(224, 224)):
    """Load the labelled deer images from both folders.

    Color images are converted to grayscale; every image is resized to
    `img_size` and expanded back to three channels. Labels are the class
    indices derived from the age encoded in each filename.

    NOTE(review): if the grayscale folder holds converted copies of the color
    images, the same photo is loaded twice and can end up on both sides of the
    train/test split - confirm the two folders are disjoint.

    Returns:
        (images, ages): NumPy arrays of the images and of their class indices,
        color-folder samples first.
    """
    images, ages = [], []
    for folder, is_color in ((color_path, True), (grayscale_path, False)):
        folder_images, folder_labels = _load_labeled_folder(folder, img_size, is_color)
        images.extend(folder_images)
        ages.extend(folder_labels)

    images = np.array(images)
    ages = np.array(ages)

    print(f"Total images: {len(images)}")
    print(f"Class distribution: {Counter(ages)}")

    return images, ages

def conservative_augment(image):
    """Apply a light random augmentation: optional flip, small rotation, mild brightness/contrast.

    Each of the three steps is drawn independently; when none fires the input
    is returned as-is.
    """
    # Left-right mirror, half of the time.
    apply_flip = random.random() < 0.5
    if apply_flip:
        image = cv2.flip(image, 1)

    # Rotation of at most 8 degrees about the image centre, 30% of the time.
    apply_rotation = random.random() < 0.3
    if apply_rotation:
        degrees = random.uniform(-8, 8)
        height, width = image.shape[:2]
        centre = (width // 2, height // 2)
        rotation = cv2.getRotationMatrix2D(centre, degrees, 1.0)
        image = cv2.warpAffine(image, rotation, (width, height))

    # Contrast gain in [0.9, 1.1] and brightness offset in [-10, 10], 40% of the time.
    apply_jitter = random.random() < 0.4
    if apply_jitter:
        gain = random.uniform(0.9, 1.1)
        offset = random.randint(-10, 10)
        image = cv2.convertScaleAbs(image, alpha=gain, beta=offset)

    return image

class MultiScaleDataset(Dataset):
    """Dataset that serves normalized image tensors with optional scale jitter + light augmentation.

    With `augment=True` each image is resized to a randomly scaled side length
    and back to `scale_size`, then passed through `conservative_augment`.
    Every image is finally scaled to [0, 1] when its maximum exceeds 1, moved
    to channel-first layout and normalized with the stored mean/std.
    """

    def __init__(self, X, y, augment=False, scale_size=224):
        self.X = X
        self.y = torch.LongTensor(y)
        self.augment = augment
        self.scale_size = scale_size
        # Per-channel normalization statistics, shaped to broadcast over (3, H, W).
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        pixels = self.X[idx].copy()
        target = self.y[idx].clone()

        if self.augment:
            pixels = conservative_augment(self._rescale_round_trip(pixels))

        return self._to_normalized_tensor(pixels), target

    def _rescale_round_trip(self, pixels):
        """Resize to a randomly scaled square and back to `scale_size` (multi-scale jitter)."""
        factor = random.choice([0.8, 0.9, 1.0, 1.1, 1.2])
        side = int(self.scale_size * factor)
        rescaled = cv2.resize(pixels, (side, side))
        return cv2.resize(rescaled, (self.scale_size, self.scale_size))

    def _to_normalized_tensor(self, pixels):
        """Convert an image array to a normalized float tensor."""
        tensor = torch.FloatTensor(pixels)
        # Values above 1 are taken to be on a 0-255 scale.
        if tensor.max() > 1.0:
            tensor = tensor / 255.0

        # Channel-last 3-channel images become channel-first.
        if tensor.dim() == 3 and tensor.shape[-1] == 3:
            tensor = tensor.permute(2, 0, 1)

        return (tensor - self.mean) / self.std

def create_conservative_model(model_name, num_classes=5):
    """Build a pretrained timm model and freeze all but its last stage and head.

    For names containing 'densenet' only parameters whose name contains
    'denseblock4' or 'classifier' stay trainable; for names containing
    'resnext' or 'resnet' only those containing 'layer4' or 'fc'. Any other
    architecture is left fully trainable. The model is returned on `device`.
    """
    model = timm.create_model(model_name, pretrained=True, num_classes=num_classes, drop_rate=0.4)

    # Substrings that mark the parameters left trainable (like the 63.8% model).
    if 'densenet' in model_name:
        trainable_markers = ('denseblock4', 'classifier')
    elif 'resnext' in model_name or 'resnet' in model_name:
        trainable_markers = ('layer4', 'fc')
    else:
        trainable_markers = None

    if trainable_markers is not None:
        for name, param in model.named_parameters():
            keep_trainable = any(marker in name for marker in trainable_markers)
            if not keep_trainable:
                param.requires_grad = False

    return model.to(device)

def train_with_mixup_and_multiscale(model, train_loader, test_loader, model_name, epochs=120):
    """Train `model` with mixup on about half of the batches and early stopping on TTA accuracy.

    Uses AdamW with cosine annealing and label-smoothed cross entropy. After
    every epoch the model is scored with `evaluate_with_simple_tta`; training
    stops once that score has not improved for 30 consecutive epochs. The
    weights of the best-scoring epoch are restored before returning.

    Args:
        model: network already placed on `device`.
        train_loader: loader yielding (images, labels) training batches.
        test_loader: loader used for the per-epoch TTA evaluation.
        model_name: display name used in the progress messages.
        epochs: maximum number of epochs (also the cosine schedule length).

    Returns:
        (model, best_acc): the model carrying its best weights and the best
        TTA accuracy in percent.
    """
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Conservative optimizer (back to what worked)
    optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.05)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-6)

    best_acc = 0.0
    patience = 30
    patience_counter = 0
    best_state = None

    def _forward_and_loss(inputs, labels, mix):
        """Run the model; `mix` is None for a plain batch or (y_a, y_b, lam) for a mixup batch."""
        outputs = model(inputs)
        if mix is None:
            return outputs, criterion(outputs, labels)
        y_a, y_b, lam = mix
        return outputs, mixup_criterion(criterion, outputs, y_a, y_b, lam)

    print(f"Training {model_name} with Mixup + Multi-scale")

    for epoch in range(epochs):
        model.train()
        train_correct = 0
        train_total = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)

            # 50% chance of mixup; the remaining batches are trained on unchanged.
            if random.random() < 0.5:
                inputs, y_a, y_b, lam = mixup_data(images, labels, alpha=0.4)
                mix = (y_a, y_b, lam)
            else:
                inputs, mix = images, None

            optimizer.zero_grad()

            if use_amp:
                with autocast():
                    outputs, loss = _forward_and_loss(inputs, labels, mix)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs, loss = _forward_and_loss(inputs, labels, mix)
                loss.backward()
                optimizer.step()

            # Accuracy is counted against the original labels. For mixup batches
            # the inputs are blends, so this figure is only indicative.
            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

            if batch_idx % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        train_acc = 100 * train_correct / train_total
        scheduler.step()

        # Simple TTA evaluation (not too heavy)
        test_acc = evaluate_with_simple_tta(model, test_loader)

        if test_acc > best_acc:
            best_acc = test_acc
            patience_counter = 0
            # state_dict() hands back the live parameter tensors, and dict.copy()
            # is shallow, so a plain .copy() keeps tracking later optimizer steps.
            # Snapshot real copies (kept on the CPU to spare GPU memory) so the
            # weights restored below are the ones that produced best_acc.
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1

        if epoch % 20 == 0 or patience_counter >= patience:
            print(f"  Epoch {epoch:3d}: Train {train_acc:.1f}%, Test+TTA {test_acc:.1f}%")

        if patience_counter >= patience:
            print(f"  Early stopping at epoch {epoch}")
            break

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_acc

def evaluate_with_simple_tta(model, test_loader):
    """Accuracy (percent) of `model` with a three-view test-time augmentation.

    For every batch the softmax outputs of three views are averaged: the
    original images, a horizontal flip, and a copy resampled to 95% size and
    back to the original resolution (a slight softening, not a crop). The
    model is put in eval mode and left there.

    Returns:
        Accuracy in percent, or 0.0 when `test_loader` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0

    def _predict(batch):
        """Forward pass, under autocast when mixed precision is enabled."""
        if use_amp:
            with autocast():
                return model(batch)
        return model(batch)

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            # Horizontal flip (last dimension is flipped).
            flipped = torch.flip(images, [3])

            # Down-sample to 95% and resample to the batch's own spatial size,
            # rather than a hard-coded 224, so other input sizes work too.
            resampled = F.interpolate(images, scale_factor=0.95, mode='bilinear', align_corners=False)
            resampled = F.interpolate(
                resampled, size=tuple(images.shape[-2:]), mode='bilinear', align_corners=False
            )

            views = (images, flipped, resampled)
            avg_outputs = sum(F.softmax(_predict(view), dim=1) for view in views) / len(views)
            _, predicted = torch.max(avg_outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        return 0.0
    return 100 * correct / total

def ensemble_predict(models, test_loader):
    """Accuracy (percent) of a soft-voting ensemble with flip test-time augmentation.

    Each model contributes the mean of its softmax outputs on the original and
    the horizontally flipped batch; the contributions are summed and the
    arg-max class is the ensemble prediction. All models are put in eval mode.

    Args:
        models: non-empty sequence of models already on `device`.
        test_loader: loader yielding (images, labels) batches.

    Returns:
        Accuracy in percent, or 0.0 when `test_loader` yields no samples.

    Raises:
        ValueError: if `models` is empty.
    """
    if len(models) == 0:
        raise ValueError("ensemble_predict needs at least one model")

    for model in models:
        model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            flipped = torch.flip(images, [3])

            # Accumulated in float32; its width follows the models' outputs
            # instead of a hard-coded class count.
            ensemble_output = None

            for model in models:
                # Simple TTA for each model
                if use_amp:
                    with autocast():
                        outputs1 = model(images)
                        outputs2 = model(flipped)
                else:
                    outputs1 = model(images)
                    outputs2 = model(flipped)

                avg_model_output = ((F.softmax(outputs1, dim=1) + F.softmax(outputs2, dim=1)) / 2).float()
                if ensemble_output is None:
                    ensemble_output = avg_model_output
                else:
                    ensemble_output = ensemble_output + avg_model_output

            # Final ensemble prediction
            _, predicted = torch.max(ensemble_output, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        return 0.0
    return 100 * correct / total

# Main execution
# Seed the Python, NumPy and PyTorch RNGs so augmentation, mixup and weight
# initialisation start from a known state. GPU kernels may still be
# non-deterministic (cudnn.benchmark is enabled above).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"deer_age_ensemble_{timestamp}"
os.makedirs(output_dir, exist_ok=True)
print(f"Ensemble models saved to: {output_dir}")

print("Loading images...")
X, y = load_images(color_path, grayscale_path)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# Create datasets
train_dataset = MultiScaleDataset(X_train, y_train, augment=True)
test_dataset = MultiScaleDataset(X_test, y_test, augment=False)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0)

print("\n" + "="*60)
print("SMALL DATA STRATEGY: MIXUP + MULTI-SCALE + ENSEMBLE")
print("="*60)
print("Approach: Conservative training + Mixup synthetic data")

# Train multiple models for ensemble: (timm model name, display name)
model_configs = [
    ('densenet169', 'DenseNet169'),
    ('resnext50_32x4d', 'ResNeXt50'),
    ('densenet201', 'DenseNet201'),
]

# The three lists grow together, one entry per model that trained successfully.
trained_models = []
trained_names = []
individual_scores = []

for model_timm_name, display_name in model_configs:
    print(f"\n{'='*40}")
    print(f"Training {display_name}")
    print(f"{'='*40}")

    try:
        model = create_conservative_model(model_timm_name)
        trained_model, accuracy = train_with_mixup_and_multiscale(
            model, train_loader, test_loader, display_name
        )

        # Save individual model
        acc_str = f"{accuracy:.1f}".replace('.', 'p')
        model_filename = f"{display_name}_{acc_str}pct.pth"
        model_path = os.path.join(output_dir, model_filename)

        torch.save({
            'model_state_dict': trained_model.state_dict(),
            'model_name': display_name,
            'timm_name': model_timm_name,
            'accuracy': accuracy,
            'num_classes': 5
        }, model_path)

        trained_models.append(trained_model)
        trained_names.append(display_name)
        individual_scores.append(accuracy)

        print(f"{display_name}: {accuracy:.1f}% - Saved")

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    except Exception as e:
        # Best effort: a failing architecture is reported and skipped.
        print(f"Error with {display_name}: {e}")
        continue

# Ensemble evaluation
if len(trained_models) > 1:
    print(f"\n{'='*40}")
    print("ENSEMBLE EVALUATION")
    print(f"{'='*40}")

    ensemble_accuracy = ensemble_predict(trained_models, test_loader)

    print("INDIVIDUAL MODEL RESULTS:")
    # Pair scores with the names recorded at training time. Zipping against
    # model_configs would mislabel scores whenever a model above was skipped.
    for name, score in zip(trained_names, individual_scores):
        print(f"  {name}: {score:.1f}%")

    print(f"\nENSEMBLE RESULT: {ensemble_accuracy:.1f}%")

    if ensemble_accuracy >= 75.0:
        print("SUCCESS: 75% target achieved!")
    else:
        gap = 75.0 - ensemble_accuracy
        print(f"Gap to 75%: {gap:.1f}%")

        if ensemble_accuracy > max(individual_scores):
            improvement = ensemble_accuracy - max(individual_scores)
            print(f"Ensemble improvement: +{improvement:.1f}%")

print(f"\nAll models saved in: {output_dir}")
print("="*60)

Using device: cuda
GPU: NVIDIA GeForce RTX 2060
Mixed Precision: Enabled
Ensemble models saved to: deer_age_ensemble_20250731_220115
Loading images...
Total images: 466
Class distribution: Counter({np.int64(4): 118, np.int64(2): 111, np.int64(3): 89, np.int64(1): 82, np.int64(0): 66})
Train: 372, Test: 94

SMALL DATA STRATEGY: MIXUP + MULTI-SCALE + ENSEMBLE
Approach: Conservative training + Mixup synthetic data

Training DenseNet169
Training DenseNet169 with Mixup + Multi-scale
  Epoch   0: Train 29.8%, Test+TTA 37.2%
  Epoch  20: Train 64.2%, Test+TTA 54.3%
  Epoch  40: Train 80.1%, Test+TTA 56.4%
  Epoch  60: Train 78.5%, Test+TTA 62.8%
  Epoch  64: Train 81.5%, Test+TTA 57.4%
  Early stopping at epoch 64
DenseNet169: 64.9% - Saved

Training ResNeXt50
Training ResNeXt50 with Mixup + Multi-scale
  Epoch   0: Train 23.1%, Test+TTA 24.5%
  Epoch  20: Train 84.9%, Test+TTA 53.2%
  Epoch  40: Train 84.7%, Test+TTA 52.1%
  Epoch  58: Train 84.4%, Test+TTA 57.4%
  Early stopping at epoch 58

In [9]:
# Third attempt at model families

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import os
import gc
import random
import math
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

try:
    from torch.cuda.amp import autocast, GradScaler
    MIXED_PRECISION_AVAILABLE = True
except ImportError:
    MIXED_PRECISION_AVAILABLE = False
    class autocast:
        def __enter__(self):
            return self
        def __exit__(self, *args):
            pass

# Pick the compute device once; models and batches are moved onto it later.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Mixed precision is enabled only when a CUDA device is present AND the
# torch.cuda.amp import above succeeded.
cuda_ready = torch.cuda.is_available()
use_amp = cuda_ready and MIXED_PRECISION_AVAILABLE

if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    torch.backends.cudnn.benchmark = True

if use_amp:
    # `scaler` is only defined on this path; code must check `use_amp` before touching it.
    scaler = GradScaler()
    print("Mixed Precision: Enabled")

# Source folders for the squared deer images (color and grayscale variants).
color_path = r"G:\Dropbox\AI Projects\buck\images\squared\color"
grayscale_path = r"G:\Dropbox\AI Projects\buck\images\squared\grayscale"

def parse_filename(filename):
    """Read the age label stored in the fourth '_'-separated field of a filename.

    The field uses 'p' as the decimal separator (e.g. '3p5' -> 3.5). Ages above
    5.5 are clamped to 5.5.

    Returns:
        The age as a float, or None when the name has fewer than four fields
        or the fourth field is not a number after the 'p' -> '.' substitution.
    """
    fields = filename.split('_')
    if len(fields) < 4:
        return None

    try:
        age = float(fields[3].replace('p', '.'))
    except ValueError:
        return None

    # Everything older than 5.5 is pooled into the 5.5 bucket.
    return 5.5 if age > 5.5 else age

def age_to_class(age):
    """Map an age of 1.5, 2.5, 3.5, 4.5 or 5.5 to class index 0-4; any other value gives None."""
    class_by_age = dict(zip((1.5, 2.5, 3.5, 4.5, 5.5), range(5)))
    if age in class_by_age:
        return class_by_age[age]
    return None

def age_to_ordinal(age):
    """Encode an age as a cumulative 5-element 0/1 list for ordinal regression.

    Position i is 1 when i <= class index, so class 2 gives [1, 1, 1, 0, 0]
    and class 3 gives [1, 1, 1, 1, 0]; the first element is always 1.
    Returns None when the age has no class.
    """
    class_idx = age_to_class(age)
    if class_idx is None:
        return None
    return [int(position <= class_idx) for position in range(5)]

def cutmix_data(x, y, alpha=1.0):
    """CutMix augmentation: paste a random box from a shuffled copy of the batch.

    The input batch is left untouched; the mixed batch is a new tensor.

    Args:
        x: 4-D batch tensor; dimensions 2 and 3 are treated as the two spatial axes.
        y: labels, indexable by a permutation of the batch.
        alpha: Beta(alpha, alpha) parameter controlling the box size.

    Returns:
        (mixed_x, y_a, y_b, lam): the mixed batch, the original labels, the
        labels of the pasted samples, and lam = fraction of each image that
        still comes from the original sample (a Python float, recomputed from
        the clipped box).
    """
    lam = np.random.beta(alpha, alpha)
    batch_size = x.size(0)
    # Permutation drawn on the CPU, then placed on the batch's own device
    # rather than the global `device`.
    index = torch.randperm(batch_size).to(x.device)

    size_a, size_b = x.size(2), x.size(3)
    cut_rat = np.sqrt(1. - lam)
    cut_a = int(size_a * cut_rat)
    cut_b = int(size_b * cut_rat)

    # Box centre, one coordinate per spatial axis.
    centre_a = np.random.randint(size_a)
    centre_b = np.random.randint(size_b)

    # Box corners, clipped to the image.
    a1 = np.clip(centre_a - cut_a // 2, 0, size_a)
    b1 = np.clip(centre_b - cut_b // 2, 0, size_b)
    a2 = np.clip(centre_a + cut_a // 2, 0, size_a)
    b2 = np.clip(centre_b + cut_b // 2, 0, size_b)

    # Work on a copy so the caller's batch is not overwritten.
    mixed_x = x.clone()
    mixed_x[:, :, a1:a2, b1:b2] = x[index, :, a1:a2, b1:b2]

    # Clipping may have shrunk the box, so lam is recomputed from its real area.
    lam = float(1 - ((a2 - a1) * (b2 - b1) / (size_a * size_b)))

    return mixed_x, y, y[index], lam

def _load_labeled_folder(folder, img_size, is_color):
    """Load every labelled .jpg/.jpeg/.png in `folder` as a 3-channel grayscale image.

    Files whose name yields no age (parse_filename) or an age outside the class
    table (age_to_class), and files OpenCV cannot read, are skipped. A missing
    folder yields two empty lists.
    """
    images, labels = [], []
    if not os.path.exists(folder):
        return images, labels

    # Sorted so the sample order - and therefore the later seeded train/test
    # split - does not depend on the filesystem's directory listing order.
    for filename in sorted(os.listdir(folder)):
        if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        age = parse_filename(filename)
        if age is None:
            continue
        class_idx = age_to_class(age)
        if class_idx is None:
            continue

        img_path = os.path.join(folder, filename)
        if is_color:
            img = cv2.imread(img_path)
        else:
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            continue
        if is_color:
            # Color sources are reduced to grayscale so both folders look alike.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        img_resized = cv2.resize(img, img_size)
        images.append(cv2.cvtColor(img_resized, cv2.COLOR_GRAY2RGB))
        labels.append(class_idx)

    return images, labels


def load_images(color_path, grayscale_path, img_size=(224, 224)):
    """Load the labelled deer images from both folders.

    Color images are converted to grayscale; every image is resized to
    `img_size` and expanded back to three channels. Labels are the class
    indices derived from the age encoded in each filename.

    NOTE(review): if the grayscale folder holds converted copies of the color
    images, the same photo is loaded twice and can end up on both sides of the
    train/test split - confirm the two folders are disjoint.

    Returns:
        (images, ages): NumPy arrays of the images and of their class indices,
        color-folder samples first.
    """
    images, ages = [], []
    for folder, is_color in ((color_path, True), (grayscale_path, False)):
        folder_images, folder_labels = _load_labeled_folder(folder, img_size, is_color)
        images.extend(folder_images)
        ages.extend(folder_labels)

    images = np.array(images)
    ages = np.array(ages)

    print(f"Total images: {len(images)}")
    print(f"Class distribution: {Counter(ages)}")

    return images, ages

def advanced_augment(image):
    """Apply a random chain of augmentations and return the augmented image.

    Steps, each drawn independently: horizontal flip (60%), rotation of up to
    20 degrees with 0.9-1.1 scaling (80%), contrast/brightness jitter (90%),
    additive Gaussian noise (40%) and random erasing of a rectangle covering
    2-10% of the image (30%). The caller's array is never modified; when no
    step fires the input itself is returned.
    """
    # Random horizontal flip
    if random.random() < 0.6:
        image = cv2.flip(image, 1)

    # Random rotation with scaling
    if random.random() < 0.8:
        angle = random.uniform(-20, 20)
        scale = random.uniform(0.9, 1.1)
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, scale)
        image = cv2.warpAffine(image, M, (w, h))

    # Color jittering
    if random.random() < 0.9:
        alpha = random.uniform(0.8, 1.2)
        beta = random.randint(-20, 20)
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)

    # Gaussian noise (added in int16 so the sum cannot wrap around before clipping)
    if random.random() < 0.4:
        noise = np.random.normal(0, random.uniform(3, 8), image.shape).astype(np.int16)
        image_int16 = image.astype(np.int16)
        noisy_image = np.clip(image_int16 + noise, 0, 255)
        image = noisy_image.astype(np.uint8)

    # Random erasing
    if random.random() < 0.3:
        h, w = image.shape[:2]
        area = h * w
        target_area = random.uniform(0.02, 0.1) * area
        aspect_ratio = random.uniform(0.3, 3.3)

        h_erase = int(round(math.sqrt(target_area * aspect_ratio)))
        w_erase = int(round(math.sqrt(target_area / aspect_ratio)))

        if h_erase < h and w_erase < w:
            top = random.randint(0, h - h_erase)
            left = random.randint(0, w - w_erase)
            # If none of the steps above fired, `image` is still the caller's
            # array; copy before writing so the source data is not corrupted.
            image = image.copy()
            image[top:top+h_erase, left:left+w_erase] = random.randint(0, 255)

    return image

class AdvancedDataset(Dataset):
    """Dataset serving normalized image tensors with optional resizing, augmentation and ordinal targets.

    Args:
        X: indexable collection of images.
        y: integer labels, stored as a LongTensor.
        augment: when True, each image goes through `advanced_augment`.
        progressive_size: when set to anything other than 224, images are
            resized to that side length and back to 224 before augmentation.
        ordinal: when True, the label is returned as a 4-element cumulative
            0/1 float tensor instead of a class index.
    """

    def __init__(self, X, y, augment=False, progressive_size=None, ordinal=False):
        self.X = X
        self.y = torch.LongTensor(y)
        self.augment = augment
        self.progressive_size = progressive_size or 224
        self.ordinal = ordinal
        # Per-channel normalization statistics, shaped to broadcast over (3, H, W).
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        pixels = self.X[idx].copy()
        target = self.y[idx].clone()

        # Progressive resizing: round trip through the reduced resolution.
        if self.progressive_size != 224:
            side = self.progressive_size
            pixels = cv2.resize(cv2.resize(pixels, (side, side)), (224, 224))

        if self.augment:
            pixels = advanced_augment(pixels)

        tensor = torch.FloatTensor(pixels)
        # Values above 1 are taken to be on a 0-255 scale.
        if tensor.max() > 1.0:
            tensor = tensor / 255.0

        # Channel-last 3-channel images become channel-first.
        if tensor.dim() == 3 and tensor.shape[-1] == 3:
            tensor = tensor.permute(2, 0, 1)

        tensor = (tensor - self.mean) / self.std

        if not self.ordinal:
            return tensor, target

        # NOTE(review): with four positions and `i <= label`, labels 3 and 4 both
        # encode as [1, 1, 1, 1] and the first element is always 1. The usual
        # K-1 threshold encoding would be `label > i`. Left unchanged because
        # the code that decodes these targets is not visible here.
        class_idx = target.item()
        ordinal_targets = torch.FloatTensor([1 if i <= class_idx else 0 for i in range(4)])
        return tensor, ordinal_targets

class OrdinalLoss(nn.Module):
    """Ordinal regression loss: binary cross entropy on logits against cumulative 0/1 targets."""

    def __init__(self):
        super().__init__()

    def forward(self, predictions, targets):
        # `predictions` are raw logits; the sigmoid is applied inside the loss.
        loss = F.binary_cross_entropy_with_logits(predictions, targets)
        return loss

class SelfSupervisedPretrainer:
    """Self-supervised pretraining on the deer images via rotation prediction.

    Constructing the pretrainer replaces the model's `classifier` (or, failing
    that, `fc`) layer with a 4-way linear head, one output per rotation.
    NOTE(review): the head still has 4 outputs after `pretrain`; the caller
    presumably swaps in an age head before fine-tuning - confirm.
    """

    def __init__(self, model):
        self.model = model
        # Replace final layer for rotation prediction (4 classes: 0, 90, 180, 270)
        if hasattr(model, 'classifier'):
            model.classifier = nn.Linear(model.classifier.in_features, 4)
        elif hasattr(model, 'fc'):
            model.fc = nn.Linear(model.fc.in_features, 4)

    def create_rotation_dataset(self, images):
        """Return every image in its four 90-degree rotations plus the rotation index as label.

        For each input image the copies for k = 0, 1, 2, 3 quarter turns are
        emitted in that order, so the result is four times as long as `images`.
        """
        rotation_images = []
        rotation_labels = []

        for img in images:
            for rotation in range(4):
                rotated = np.rot90(img, k=rotation, axes=(0, 1))
                rotation_images.append(rotated)
                rotation_labels.append(rotation)

        return np.array(rotation_images), np.array(rotation_labels)

    def pretrain(self, images, epochs=50):
        """Train the model to predict which of the four rotations an image shows.

        Args:
            images: array of images; each is expanded into four rotated copies.
            epochs: number of passes over the rotated set.

        Returns:
            The pretrained model (on `device`, in train mode).
        """
        print("Self-supervised pretraining...")

        rot_images, rot_labels = self.create_rotation_dataset(images)
        # No augmentation here: advanced_augment mirrors images horizontally,
        # and a mirrored 90-degree rotation looks like a 270-degree rotation of
        # a mirrored scene, which would turn those two labels into noise.
        dataset = AdvancedDataset(rot_images, rot_labels, augment=False)
        loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=0)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(self.model.parameters(), lr=0.001, weight_decay=0.01)

        # Ensure model is on correct device
        self.model = self.model.to(device)
        self.model.train()

        for epoch in range(epochs):
            correct = 0
            total = 0

            for images_batch, labels_batch in loader:
                # Ensure tensors are on correct device
                images_batch = images_batch.to(device)
                labels_batch = labels_batch.to(device)

                optimizer.zero_grad()

                if use_amp:
                    with autocast():
                        outputs = self.model(images_batch)
                        loss = criterion(outputs, labels_batch)
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    outputs = self.model(images_batch)
                    loss = criterion(outputs, labels_batch)
                    loss.backward()
                    optimizer.step()

                _, predicted = torch.max(outputs, 1)
                total += labels_batch.size(0)
                correct += (predicted == labels_batch).sum().item()

                # Memory cleanup
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            # `total` is zero only when `images` was empty; skip the report then.
            if epoch % 10 == 0 and total > 0:
                acc = 100 * correct / total
                print(f"  Pretraining epoch {epoch}: {acc:.1f}%")

        print("Pretraining complete")
        return self.model

def create_diverse_models():
    """Create a dict of pretrained 5-class timm models from several architecture families.

    Four optional architectures are attempted first; one that cannot be
    created is reported and skipped. The three DenseNet/ResNeXt fallbacks are
    required: an error while creating any of them propagates.

    Returns:
        dict mapping a short key to the created model, optional models first.
    """
    models = {}

    # (dict key, timm model name, message printed on success)
    # NOTE(review): the 'regnet_y_800mf' key is paired with the timm name
    # 'regnetx_800mf' (X vs Y variant) - confirm which was intended and that
    # the name exists in the installed timm version.
    optional_specs = [
        ('vit_base', 'vit_base_patch16_224', "Added Vision Transformer"),
        ('convnext_tiny', 'convnext_tiny', "Added ConvNeXt"),
        ('efficientnet_v2_s', 'tf_efficientnetv2_s', "Added EfficientNetV2"),
        ('regnet_y_800mf', 'regnetx_800mf', "Added RegNet"),
    ]
    for key, timm_name, success_message in optional_specs:
        try:
            models[key] = timm.create_model(timm_name, pretrained=True, num_classes=5)
            print(success_message)
        except Exception as exc:
            # Still best effort, but say why the model is missing. Catching
            # Exception (not a bare except) lets Ctrl-C interrupt the cell.
            print(f"Skipping {timm_name}: {exc}")

    # Fallback to proven architectures
    required_specs = [
        ('densenet169', 'densenet169'),
        ('resnext50', 'resnext50_32x4d'),
        ('densenet201', 'densenet201'),
    ]
    for key, timm_name in required_specs:
        models[key] = timm.create_model(timm_name, pretrained=True, num_classes=5)

    return models

def train_with_all_techniques(model, train_loader, test_loader, model_name, epochs=150):
    """Train ``model`` with label smoothing, random mixup/CutMix and TTA early stopping.

    For every batch one of 'mixup', 'cutmix' or 'normal' is drawn uniformly; a
    drawn mixing technique is then applied with probability 0.4, otherwise the
    batch is trained without mixing. After each epoch the model is scored with
    ``evaluate_with_advanced_tta`` on ``test_loader``; the best-scoring weights
    are snapshotted and loaded back into ``model`` before returning.

    Uses the notebook-level names ``device``, ``use_amp``, ``scaler``,
    ``autocast`` and ``cutmix_data``.

    Args:
        model: network to train (already on ``device``).
        train_loader: iterable of ``(images, labels)`` training batches.
        test_loader: iterable of ``(images, labels)`` batches used for model selection.
        model_name: used to pick optimizer hyper-parameters ('vit' / 'convnext' / other)
            and in log messages.
        epochs: maximum number of epochs; training stops earlier after 40
            epochs without improvement.

    Returns:
        ``(model, best_acc)`` where ``best_acc`` is the best TTA accuracy in percent.
    """
    criterion = nn.CrossEntropyLoss(label_smoothing=0.15)

    # Different optimizers for different models
    name_lc = model_name.lower()
    if 'vit' in name_lc:
        optimizer = optim.AdamW(model.parameters(), lr=0.0003, weight_decay=0.3)
    elif 'convnext' in name_lc:
        optimizer = optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.05)
    else:
        optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.05)

    # Advanced scheduler with warm restarts
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=30, T_mult=2, eta_min=1e-7
    )

    def _optimize(inputs, y_a, y_b=None, lam=1.0):
        """Run forward/backward/step on one batch and return the logits.

        With ``y_b`` given the loss is the lam-weighted mix of both targets.
        """
        def _loss(logits):
            if y_b is None:
                return criterion(logits, y_a)
            return lam * criterion(logits, y_a) + (1 - lam) * criterion(logits, y_b)

        if use_amp:
            with autocast():
                logits = model(inputs)
                loss = _loss(logits)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(inputs)
            loss = _loss(logits)
            loss.backward()
            optimizer.step()
        return logits

    best_acc = 0.0
    patience = 40
    patience_counter = 0
    best_state = None

    print(f"Training {model_name} with ALL techniques")

    for epoch in range(epochs):
        model.train()
        train_correct = 0
        train_total = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            # Random choice of augmentation technique; random.random() is only
            # consulted when a mixing technique was drawn.
            aug_choice = random.choice(['mixup', 'cutmix', 'normal'])
            apply_mix = aug_choice != 'normal' and random.random() < 0.4

            if apply_mix and aug_choice == 'mixup':
                lam = np.random.beta(0.4, 0.4)
                index = torch.randperm(images.size(0)).to(device)
                mixed_images = lam * images + (1 - lam) * images[index, :]
                acc_targets = labels
                outputs = _optimize(mixed_images, labels, labels[index], lam)
            elif apply_mix:
                mixed_images, y_a, y_b, lam = cutmix_data(images, labels, alpha=1.0)
                acc_targets = y_a
                outputs = _optimize(mixed_images, y_a, y_b, lam)
            else:
                acc_targets = labels
                outputs = _optimize(images, labels)

            # For mixed batches the accuracy is measured against the first
            # target only, so the logged train accuracy is approximate.
            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == acc_targets).sum().item()

            if batch_idx % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Guard against an empty train_loader instead of dividing by zero.
        train_acc = 100 * train_correct / train_total if train_total else 0.0
        scheduler.step()

        # Advanced TTA evaluation
        test_acc = evaluate_with_advanced_tta(model, test_loader)

        if test_acc > best_acc:
            best_acc = test_acc
            patience_counter = 0
            # state_dict().copy() is shallow: its tensors alias the live
            # parameters and keep changing as training continues. Take a real
            # copy (on CPU, to avoid holding a second model in GPU memory).
            best_state = {
                key: value.detach().to('cpu', copy=True)
                for key, value in model.state_dict().items()
            }
        else:
            patience_counter += 1

        if epoch % 25 == 0 or patience_counter >= patience:
            print(f"  Epoch {epoch:3d}: Train {train_acc:.1f}%, Test+TTA {test_acc:.1f}%")

        if patience_counter >= patience:
            print(f"  Early stopping at epoch {epoch}")
            break

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_acc

def _tta_views(images):
    """Yield the test-time-augmentation views of a batch, all at the input resolution.

    Order: original, horizontal flip, two rescale round-trips (0.9x, 1.1x),
    two centre crops (85%, 95%) resized back to the input size.
    """
    height, width = images.shape[-2:]

    yield images
    yield torch.flip(images, [3])

    for scale in (0.9, 1.1):
        scaled = F.interpolate(images, scale_factor=scale, mode='bilinear', align_corners=False)
        yield F.interpolate(scaled, size=(height, width), mode='bilinear', align_corners=False)

    for crop_factor in (0.85, 0.95):
        crop_h, crop_w = int(height * crop_factor), int(width * crop_factor)
        top, left = (height - crop_h) // 2, (width - crop_w) // 2
        cropped = images[:, :, top:top + crop_h, left:left + crop_w]
        yield F.interpolate(cropped, size=(height, width), mode='bilinear', align_corners=False)


def evaluate_with_advanced_tta(model, test_loader, num_tta=6):
    """Accuracy (in percent) of ``model`` on ``test_loader`` using averaged TTA predictions.

    Softmax probabilities of up to ``num_tta`` augmented views per batch are
    averaged before taking the argmax. Views are resized back to the batch's
    own spatial size, so inputs do not have to be 224x224.

    Uses the notebook-level names ``device``, ``use_amp`` and ``autocast``.

    Args:
        model: classifier to evaluate; it is switched to eval mode.
        test_loader: iterable of ``(images, labels)`` batches; images are NCHW.
        num_tta: maximum number of views to average. Six views exist, so the
            default uses all of them; values below 1 are treated as 1.

    Returns:
        Accuracy as a float in [0, 100]; 0.0 when the loader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    max_views = max(1, num_tta)

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)

            tta_outputs = []
            for view_idx, view in enumerate(_tta_views(images)):
                if view_idx >= max_views:
                    break
                if use_amp:
                    with autocast():
                        outputs = model(view)
                else:
                    outputs = model(view)
                # Softmax in float32 so half-precision logits do not lose precision.
                tta_outputs.append(F.softmax(outputs.float(), dim=1))

            # Average all predictions
            avg_output = torch.stack(tta_outputs).mean(0)
            _, predicted = torch.max(avg_output, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        return 0.0
    return 100 * correct / total

def extract_features_for_ml(model, images):
    """Extract one pooled, pre-classifier feature vector per image for classical ML models.

    Preferred path is timm's uniform API,
    ``forward_head(forward_features(x), pre_logits=True)``, which returns the
    pooled features that feed the classifier for CNNs and transformers alike.
    Models without that API fall back to ``model.features`` or
    ``model.forward_features`` followed by explicit pooling.

    Uses the notebook-level names ``device`` and ``AdvancedDataset``.

    Args:
        model: trained network; it is switched to eval mode.
        images: array of input images, passed to ``AdvancedDataset`` with
            placeholder labels and ``augment=False``.

    Returns:
        2-D ``np.ndarray`` of shape ``(len(images), feature_dim)``.

    Raises:
        TypeError: if the model exposes none of the supported feature hooks.
    """
    model.eval()
    features = []

    dataset = AdvancedDataset(images, np.zeros(len(images)), augment=False)
    loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)

    has_forward_features = hasattr(model, 'forward_features')
    has_timm_head = has_forward_features and hasattr(model, 'forward_head')

    with torch.no_grad():
        for imgs, _ in loader:
            imgs = imgs.to(device)

            if has_timm_head:
                # forward_head expects the feature map, not raw images.
                feat = model.forward_head(model.forward_features(imgs), pre_logits=True)
            elif has_forward_features or hasattr(model, 'features'):
                feat = model.forward_features(imgs) if has_forward_features else model.features(imgs)
                if feat.dim() == 4:
                    # (B, C, H, W) feature map: pool over the spatial dims.
                    feat = F.adaptive_avg_pool2d(feat, (1, 1))
                elif feat.dim() == 3:
                    # (B, tokens, C) sequence: average over tokens.
                    feat = feat.mean(dim=1)
            else:
                raise TypeError(
                    f"{type(model).__name__} exposes neither forward_features nor features"
                )

            # Always hand back (batch, feature_dim) so np.vstack yields a 2-D matrix.
            feat = feat.flatten(1)
            features.append(feat.float().cpu().numpy())

    return np.vstack(features)

def progressive_training(model, X_train, y_train, X_test, y_test, model_name):
    """Two-stage curriculum: binary warm-up (classes 0-2 vs 3-4), then 5-class fine-tuning.

    Stage 1 swaps in a 2-output head and trains for 30 epochs on the binary
    labels. Stage 2 swaps in a fresh 5-output head and fine-tunes for 50
    epochs at a lower learning rate, scoring every epoch with
    ``evaluate_with_advanced_tta``. The weights of the best-scoring epoch are
    loaded back into the model before returning, so the returned model matches
    the returned accuracy.

    Uses the notebook-level names ``device``, ``use_amp``, ``scaler``,
    ``autocast`` and ``AdvancedDataset``.

    Args:
        model: network with a ``classifier`` or ``fc`` head (or a timm-style
            ``reset_classifier`` method).
        X_train, y_train: training images and class indices (0-4).
        X_test, y_test: held-out images and class indices used for model selection.
        model_name: used in log messages only.

    Returns:
        ``(model, best_acc)`` where ``best_acc`` is the best 5-class TTA accuracy in percent.
    """
    print(f"Progressive training for {model_name}")

    criterion = nn.CrossEntropyLoss()

    def _replace_head(num_outputs):
        """Swap the classification head for a fresh one and move the model to ``device``."""
        if hasattr(model, 'classifier'):
            model.classifier = nn.Linear(model.classifier.in_features, num_outputs)
        elif hasattr(model, 'fc'):
            model.fc = nn.Linear(model.fc.in_features, num_outputs)
        elif hasattr(model, 'reset_classifier'):
            # timm models whose head has another attribute name.
            model.reset_classifier(num_outputs)
        model.to(device)

    def _train_one_epoch(loader, optimizer):
        """One pass over ``loader`` with optional mixed precision."""
        model.train()
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            if use_amp:
                with autocast():
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

    # Phase 1: Binary young vs old (classes 0,1,2 vs 3,4)
    y_binary_train = np.array([0 if y <= 2 else 1 for y in y_train])
    _replace_head(2)

    binary_dataset_train = AdvancedDataset(X_train, y_binary_train, augment=True)
    binary_loader_train = DataLoader(binary_dataset_train, batch_size=16, shuffle=True, num_workers=0)

    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
    for _ in range(30):
        _train_one_epoch(binary_loader_train, optimizer)

    # Phase 2: Modify for 5-class and fine-tune
    _replace_head(5)

    full_dataset_train = AdvancedDataset(X_train, y_train, augment=True)
    full_dataset_test = AdvancedDataset(X_test, y_test, augment=False)
    full_loader_train = DataLoader(full_dataset_train, batch_size=16, shuffle=True, num_workers=0)
    full_loader_test = DataLoader(full_dataset_test, batch_size=16, shuffle=False, num_workers=0)

    # Lower learning rate for fine-tuning
    optimizer = optim.AdamW(model.parameters(), lr=0.0002, weight_decay=0.01)

    best_acc = 0.0
    best_state = None
    for epoch in range(50):
        _train_one_epoch(full_loader_train, optimizer)

        # Evaluate
        test_acc = evaluate_with_advanced_tta(model, full_loader_test)
        if test_acc > best_acc:
            best_acc = test_acc
            # Real copy of the weights (on CPU) so later epochs cannot overwrite it.
            best_state = {
                key: value.detach().to('cpu', copy=True)
                for key, value in model.state_dict().items()
            }

        if epoch % 10 == 0:
            print(f"  Progressive epoch {epoch}: {test_acc:.1f}%")

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_acc

# Main comprehensive pipeline
# Each run writes into its own timestamped folder so earlier checkpoints are kept.
run_started = datetime.now()
timestamp = run_started.strftime("%Y%m%d_%H%M%S")
output_dir = f"deer_age_comprehensive_{timestamp}"
os.makedirs(output_dir, exist_ok=True)
print(f"Comprehensive results saved to: {output_dir}")

print("Loading images...")
X, y = load_images(color_path, grayscale_path)

# Stratified 80/20 hold-out split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# PHASE 1: Advanced Architecture Survey
# For every architecture: self-supervised pretraining, fresh 5-class head,
# full training run, checkpoint to output_dir. A failure in one architecture
# is reported and the survey continues with the next one.
print("\n" + "="*80)
print("PHASE 1: ADVANCED ARCHITECTURE SURVEY")
print("="*80)

diverse_models = create_diverse_models()
phase1_results = []

for model_name, model_template in diverse_models.items():
    print(f"\n--- Testing {model_name} ---")

    try:
        # Self-supervised pretraining
        pretrainer = SelfSupervisedPretrainer(model_template.to(device))
        pretrained_model = pretrainer.pretrain(X_train, epochs=30)

        # Reset for main task
        if hasattr(pretrained_model, 'classifier'):
            pretrained_model.classifier = nn.Linear(pretrained_model.classifier.in_features, 5)
        elif hasattr(pretrained_model, 'fc'):
            pretrained_model.fc = nn.Linear(pretrained_model.fc.in_features, 5)

        pretrained_model = pretrained_model.to(device)

        # Create datasets
        train_dataset = AdvancedDataset(X_train, y_train, augment=True)
        test_dataset = AdvancedDataset(X_test, y_test, augment=False)
        train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True, num_workers=0)
        test_loader = DataLoader(test_dataset, batch_size=12, shuffle=False, num_workers=0)

        # Train with all techniques
        trained_model, accuracy = train_with_all_techniques(
            pretrained_model, train_loader, test_loader, model_name, epochs=120
        )

        # Save model
        acc_str = f"{accuracy:.1f}".replace('.', 'p')
        model_filename = f"{model_name}_pretrained_{acc_str}pct.pth"
        model_path = os.path.join(output_dir, model_filename)

        torch.save({
            'model_state_dict': trained_model.state_dict(),
            'model_name': model_name,
            'accuracy': accuracy,
            'num_classes': 5,
            'pretrained': True
        }, model_path)

        phase1_results.append({
            'model': model_name,
            'accuracy': accuracy,
            'filename': model_filename,
            'trained_model': trained_model
        })

        print(f"{model_name} with pretraining: {accuracy:.1f}%")

    except Exception as e:
        # Best-effort survey: report the failure and go on to the next model.
        print(f"Error with {model_name}: {e}")

    finally:
        # Runs after failures too (e.g. CUDA out-of-memory), so the next
        # architecture does not start with the failed run's cached blocks.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# PHASE 2: Progressive Training
# Fresh pretrained backbones are trained with progressive_training and
# checkpointed to output_dir. A failure in one model is reported and the
# loop continues.
print("\n" + "="*80)
print("PHASE 2: PROGRESSIVE TRAINING")
print("="*80)

phase2_results = []
for model_name in ['densenet169', 'resnext50_32x4d']:
    print(f"\n--- Progressive training {model_name} ---")

    try:
        model = timm.create_model(model_name, pretrained=True, num_classes=5)
        trained_model, accuracy = progressive_training(
            model, X_train, y_train, X_test, y_test, model_name
        )

        acc_str = f"{accuracy:.1f}".replace('.', 'p')
        model_filename = f"{model_name}_progressive_{acc_str}pct.pth"
        model_path = os.path.join(output_dir, model_filename)

        torch.save({
            'model_state_dict': trained_model.state_dict(),
            'model_name': f"{model_name}_progressive",
            'accuracy': accuracy,
            'num_classes': 5
        }, model_path)

        phase2_results.append({
            'model': f"{model_name}_progressive",
            'accuracy': accuracy,
            'filename': model_filename,
            'trained_model': trained_model
        })

        print(f"{model_name} progressive: {accuracy:.1f}%")

    except Exception as e:
        # Best-effort: report the failure and go on to the next model.
        print(f"Error with progressive {model_name}: {e}")

    finally:
        # Runs after failures too (e.g. CUDA out-of-memory), so the next
        # model does not start with the failed run's cached blocks.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# PHASE 3: CNN Feature Extraction + Traditional ML
# The best Phase 1 network acts as a fixed feature extractor; each classical
# model is fitted on the train features and scored on the test features.
print("\n" + "="*80)
print("PHASE 3: HYBRID CNN + TRADITIONAL ML")
print("="*80)

phase3_results = []
if phase1_results:
    # Use best CNN model for feature extraction
    best_cnn = max(phase1_results, key=lambda entry: entry['accuracy'])
    print(f"Using {best_cnn['model']} for feature extraction...")

    # Extract features (train first, then test)
    feature_model = best_cnn['trained_model']
    train_features = extract_features_for_ml(feature_model, X_train)
    test_features = extract_features_for_ml(feature_model, X_test)

    # Traditional ML models
    ml_models = dict(
        RandomForest=RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
        XGBoost=XGBClassifier(n_estimators=200, max_depth=6, random_state=42),
        SVM=SVC(kernel='rbf', C=10, gamma='scale', random_state=42),
    )

    for ml_name in ml_models:
        ml_model = ml_models[ml_name]
        try:
            print(f"Training {ml_name} on CNN features...")
            ml_model.fit(train_features, y_train)
            ml_predictions = ml_model.predict(test_features)
            ml_accuracy = accuracy_score(y_test, ml_predictions) * 100

            phase3_results.append({
                'model': f"CNN+{ml_name}",
                'accuracy': ml_accuracy,
                'type': 'hybrid'
            })

            print(f"CNN + {ml_name}: {ml_accuracy:.1f}%")

        except Exception as e:
            # One failing estimator must not stop the remaining ones.
            print(f"Error with {ml_name}: {e}")

# PHASE 4: Ordinal Regression
# A 4-output DenseNet is trained with OrdinalLoss; a prediction is decoded as
# the number of sigmoid outputs above 0.5, giving a class index in 0..4.
print("\n" + "="*80)
print("PHASE 4: ORDINAL REGRESSION")
print("="*80)

phase4_results = []
try:
    print("Training ordinal regression model...")

    # Create ordinal targets.
    # NOTE(review): assumes class index k corresponds to age k + 1.5
    # (1.5, 2.5, ..., 5.5). The previous k*0.5 + 1.5 produced ages 1.5..3.5,
    # which cannot map back onto classes 0..4 -- confirm against
    # age_to_ordinal / the loader's age mapping.
    # Kept as plain lists: np.array() over a mix of None and vectors fails on
    # recent NumPy versions.
    y_train_ordinal = [age_to_ordinal(y_train[i] + 1.5) for i in range(len(y_train))]
    y_test_ordinal = [age_to_ordinal(y_test[i] + 1.5) for i in range(len(y_test))]

    # Remove None values
    valid_train = [i for i, val in enumerate(y_train_ordinal) if val is not None]
    valid_test = [i for i, val in enumerate(y_test_ordinal) if val is not None]

    if valid_train and valid_test:
        X_train_ord = X_train[valid_train]
        y_train_ord = np.array([y_train_ordinal[i] for i in valid_train], dtype=np.float32)
        X_test_ord = X_test[valid_test]
        y_test_ord = np.array([y_test_ordinal[i] for i in valid_test], dtype=np.float32)

        # Lookup tables indexed by position in X_train_ord / X_test_ord.
        train_targets = torch.as_tensor(y_train_ord, dtype=torch.float32)
        test_classes = torch.as_tensor(np.asarray(y_test)[valid_test], dtype=torch.long)

        # Create ordinal model. num_classes=4 already yields a 4-output head;
        # replacing the classifier again after .to(device) would leave the new
        # layer on the CPU while the rest of the model sits on the GPU.
        ordinal_model = timm.create_model('densenet169', pretrained=True, num_classes=4)
        ordinal_model = ordinal_model.to(device)

        class _IndexedDataset(Dataset):
            """Wrap a dataset so every item is ``(image, sample_index)``.

            The index lets the loops below look up the true target of each
            sample even when the DataLoader shuffles.
            """

            def __init__(self, base):
                self.base = base

            def __len__(self):
                return len(self.base)

            def __getitem__(self, idx):
                image, _ = self.base[idx]
                return image, idx

        # Create datasets (labels passed to AdvancedDataset are placeholders)
        train_ord_dataset = _IndexedDataset(
            AdvancedDataset(X_train_ord, np.zeros(len(X_train_ord)), augment=True, ordinal=True)
        )
        test_ord_dataset = _IndexedDataset(
            AdvancedDataset(X_test_ord, np.zeros(len(X_test_ord)), augment=False, ordinal=True)
        )
        train_ord_loader = DataLoader(train_ord_dataset, batch_size=16, shuffle=True, num_workers=0)
        test_ord_loader = DataLoader(test_ord_dataset, batch_size=16, shuffle=False, num_workers=0)

        # Train ordinal model
        ordinal_criterion = OrdinalLoss()
        ordinal_optimizer = optim.AdamW(ordinal_model.parameters(), lr=0.001, weight_decay=0.01)

        best_ordinal_acc = 0.0
        for epoch in range(50):
            ordinal_model.train()
            for images, sample_idx in train_ord_loader:
                images = images.to(device)
                # Targets of exactly the samples in this (shuffled) batch.
                batch_targets = train_targets[sample_idx].to(device)

                ordinal_optimizer.zero_grad()

                if use_amp:
                    with autocast():
                        outputs = ordinal_model(images)
                        loss = ordinal_criterion(outputs, batch_targets)
                    scaler.scale(loss).backward()
                    scaler.step(ordinal_optimizer)
                    scaler.update()
                else:
                    outputs = ordinal_model(images)
                    loss = ordinal_criterion(outputs, batch_targets)
                    loss.backward()
                    ordinal_optimizer.step()

            # Evaluate ordinal model (convert back to class predictions)
            ordinal_model.eval()
            correct = 0
            total = 0

            with torch.no_grad():
                for images, sample_idx in test_ord_loader:
                    images = images.to(device)
                    outputs = torch.sigmoid(ordinal_model(images))
                    # Convert ordinal predictions back to classes
                    predicted_classes = (outputs > 0.5).sum(dim=1)

                    batch_true_classes = test_classes[sample_idx].to(device)

                    total += len(images)
                    correct += (predicted_classes == batch_true_classes).sum().item()

            ordinal_acc = 100 * correct / total
            if ordinal_acc > best_ordinal_acc:
                best_ordinal_acc = ordinal_acc

            if epoch % 10 == 0:
                print(f"  Ordinal epoch {epoch}: {ordinal_acc:.1f}%")

        phase4_results.append({
            'model': 'Ordinal_Regression',
            'accuracy': best_ordinal_acc,
            'type': 'ordinal'
        })

        print(f"Ordinal regression: {best_ordinal_acc:.1f}%")

except Exception as e:
    print(f"Error with ordinal regression: {e}")

# FINAL RESULTS COMPILATION
# Prints the per-phase accuracies, then a ranking of the ten best results
# against the accuracy target.
print("\n" + "="*80)
print("COMPREHENSIVE RESULTS SUMMARY")
print("="*80)

# Accuracy (in percent) the experiments are trying to reach.
TARGET_ACCURACY = 75.0

# (heading, results) per phase, in the order they are reported.
phase_sections = [
    ("PHASE 1 - ADVANCED ARCHITECTURES + PRETRAINING:", phase1_results),
    ("\nPHASE 2 - PROGRESSIVE TRAINING:", phase2_results),
    ("\nPHASE 3 - HYBRID CNN + TRADITIONAL ML:", phase3_results),
    ("\nPHASE 4 - ORDINAL REGRESSION:", phase4_results),
]

all_results = []
for _, section_results in phase_sections:
    all_results.extend(section_results)

all_results.sort(key=lambda x: x['accuracy'], reverse=True)

for heading, section_results in phase_sections:
    print(heading)
    for result in section_results:
        print(f"  {result['model']:25s} - {result['accuracy']:.1f}%")

print(f"\n{'='*80}")
print("OVERALL BEST RESULTS:")
for i, result in enumerate(all_results[:10], 1):
    if result['accuracy'] >= TARGET_ACCURACY:
        status = "TARGET ACHIEVED"
    else:
        status = f"{TARGET_ACCURACY-result['accuracy']:.1f}% to go"
    print(f"{i:2d}. {result['model']:25s} - {result['accuracy']:.1f}% - {status}")

if all_results:
    best_overall = all_results[0]
    print(f"\nBEST OVERALL: {best_overall['model']} - {best_overall['accuracy']:.1f}%")

    if best_overall['accuracy'] >= TARGET_ACCURACY:
        print(f"SUCCESS: {TARGET_ACCURACY:.0f}% TARGET ACHIEVED!")
    else:
        gap = TARGET_ACCURACY - best_overall['accuracy']
        # Report the size of the dataset actually loaded instead of a fixed number.
        print(f"Best effort with {len(X)} images: {gap:.1f}% short of {TARGET_ACCURACY:.0f}% target")
        print("Consider multi-fold ensemble if this is insufficient")

print(f"\nAll models and results saved in: {output_dir}")
print("="*80)

Using device: cuda
GPU: NVIDIA GeForce RTX 2060
Mixed Precision: Enabled
ERROR! Session/line number was not unique in database. History logging moved to new session 287
Comprehensive results saved to: deer_age_comprehensive_20250801_075228
Loading images...
Total images: 466
Class distribution: Counter({np.int64(4): 118, np.int64(2): 111, np.int64(3): 89, np.int64(1): 82, np.int64(0): 66})
Train: 372, Test: 94

PHASE 1: ADVANCED ARCHITECTURE SURVEY
Added Vision Transformer
Added ConvNeXt
Added EfficientNetV2

--- Testing vit_base ---
Self-supervised pretraining...
  Pretraining epoch 0: 23.3%
  Pretraining epoch 10: 23.1%
  Pretraining epoch 20: 24.1%
Pretraining complete
Training vit_base with ALL techniques
  Epoch   0: Train 23.7%, Test+TTA 19.1%
  Epoch  25: Train 25.0%, Test+TTA 27.7%
  Epoch  50: Train 23.7%, Test+TTA 25.5%
  Epoch  61: Train 24.7%, Test+TTA 24.5%
  Early stopping at epoch 61
vit_base with pretraining: 27.7%

--- Testing convnext_tiny ---
Self-supervised pretrain

In [13]:
# Multi-family survey, but ONLY color images

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import os
import gc
import random
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Mixed-precision support: use torch's AMP helpers when importable, otherwise
# provide a do-nothing autocast so `with autocast():` still works.
try:
    from torch.cuda.amp import autocast, GradScaler
except ImportError:
    MIXED_PRECISION_AVAILABLE = False

    class autocast:
        """No-op stand-in for torch.cuda.amp.autocast."""

        def __enter__(self):
            return self

        def __exit__(self, *args):
            pass
else:
    MIXED_PRECISION_AVAILABLE = True

cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if cuda_ready else 'cpu')
print(f"Using device: {device}")

# AMP is used only when a GPU is present and the AMP import succeeded.
use_amp = cuda_ready and MIXED_PRECISION_AVAILABLE

if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    torch.backends.cudnn.benchmark = True

if use_amp:
    scaler = GradScaler()
    print("Mixed Precision: Enabled")

# ONLY COLOR IMAGES PATH
color_path = r"G:\Dropbox\AI Projects\buck\images\squared\color"

def parse_filename(filename):
    """Read the age from the 4th underscore-separated field of ``filename``.

    In that field 'p' stands for the decimal point (e.g. '2p5' -> 2.5).
    Ages above 5.5 are capped at 5.5.

    Returns:
        The age as a float, or None when the name has fewer than four fields
        or the field is not a number.
    """
    fields = filename.split('_')
    if len(fields) < 4:
        return None

    try:
        age = float(fields[3].replace('p', '.'))
    except ValueError:
        return None

    return 5.5 if age > 5.5 else age

def age_to_class(age):
    """Map an age of 1.5, 2.5, 3.5, 4.5 or 5.5 to class index 0-4; any other age gives None."""
    class_by_age = dict(zip((1.5, 2.5, 3.5, 4.5, 5.5), range(5)))
    if age in class_by_age:
        return class_by_age[age]
    return None

def load_color_images_only(color_path, img_size=(224, 224)):
    """Load ONLY color images and preserve RGB color information.

    Every .jpg/.jpeg/.png file in ``color_path`` whose name yields an age
    (via ``parse_filename``) that maps to a class (via ``age_to_class``) is
    read, converted from BGR to RGB and resized. Files are visited in sorted
    name order so the sample order, and any seeded split derived from it, does
    not depend on the filesystem's listing order.

    Args:
        color_path: directory containing the color images.
        img_size: target size passed to ``cv2.resize``, i.e. ``(width, height)``.

    Returns:
        ``(images, ages)``: ``images`` is an array of RGB images and ``ages``
        the matching class indices. Both are empty when the directory is
        missing or holds no usable image.
    """
    images = []
    ages = []

    print(f"Loading ONLY color images from: {color_path}")

    if os.path.isdir(color_path):
        for filename in sorted(os.listdir(color_path)):
            if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue

            age = parse_filename(filename)
            if age is None:
                continue

            class_idx = age_to_class(age)
            if class_idx is None:
                continue

            img = cv2.imread(os.path.join(color_path, filename))
            if img is None:
                # Unreadable or corrupt file: skip it.
                continue

            # PRESERVE COLOR: BGR -> RGB (no grayscale conversion)
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img_resized = cv2.resize(img_rgb, img_size)
            # cv2.resize takes (width, height) while ndarray.shape is
            # (height, width, channels), so the expected shape is swapped.
            expected_shape = (img_size[1], img_size[0], 3)
            assert img_resized.shape == expected_shape, f"Image {filename} not resized correctly: {img_resized.shape}"
            images.append(img_resized)
            ages.append(class_idx)
    else:
        print(f"WARNING: color image directory not found: {color_path}")

    images = np.array(images)
    ages = np.array(ages)

    print(f"Color images loaded: {len(images)}")
    print(f"Image shape: {images.shape}")
    print(f"Class distribution: {Counter(ages)}")

    return images, ages

def smart_augment(image):
    """Randomly augment one RGB image while keeping its color information.

    Each step is applied independently with its own probability: horizontal
    flip (0.6), rotation within +/-15 degrees (0.7), brightness/contrast
    jitter (0.8), hue/saturation shift (0.5) and Gaussian noise (0.3).

    Args:
        image: H x W x 3 RGB array; it is cast to uint8 first.

    Returns:
        The augmented image as a uint8 array of the same height and width.
    """
    # Ensure image is uint8
    image = image.astype(np.uint8)

    # Horizontal flip (deer can face either direction)
    if random.random() < 0.6:
        image = cv2.flip(image, 1)

    # Rotation (not too much to preserve antler features)
    if random.random() < 0.7:
        angle = random.uniform(-15, 15)
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))

    # Color jittering (vary lighting conditions)
    if random.random() < 0.8:
        # Brightness and contrast - use cv2.convertScaleAbs to ensure valid range
        alpha = random.uniform(0.8, 1.2)
        beta = random.randint(-15, 15)
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)

    # Hue/Saturation shifts (seasonal color changes)
    if random.random() < 0.5:
        try:
            hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
            # Hue is circular in OpenCV's 8-bit HSV (0..179): wrap around
            # rather than clip, so reds near 0/179 shift like any other hue.
            h_shift = random.randint(-10, 10)
            hsv[:, :, 0] = ((hsv[:, :, 0].astype(np.int16) + h_shift) % 180).astype(np.uint8)
            # Saturation shift
            s_factor = random.uniform(0.8, 1.2)
            hsv[:, :, 1] = np.clip(hsv[:, :, 1].astype(np.float32) * s_factor, 0, 255).astype(np.uint8)
            image = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
        except cv2.error:
            # Skip the HSV step when OpenCV rejects the conversion (e.g. the
            # input is not a 3-channel image); other errors are not hidden.
            pass

    # Gaussian noise
    if random.random() < 0.3:
        noise = np.random.normal(0, 3, image.shape).astype(np.int16)
        image_int16 = image.astype(np.int16)
        noisy_image = np.clip(image_int16 + noise, 0, 255)
        image = noisy_image.astype(np.uint8)

    return image

class ColorDeerDataset(Dataset):
    """Dataset of RGB images and class labels yielding ImageNet-normalized CHW tensors.

    Args:
        X: indexable collection of H x W x 3 image arrays.
        y: class indices, stored as a LongTensor.
        augment: when True, ``smart_augment`` is applied to each image on access.
    """

    def __init__(self, X, y, augment=False):
        self.X = X
        self.y = torch.LongTensor(y)
        self.augment = augment
        # ImageNet channel statistics, shaped to broadcast over (3, H, W).
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``; the stored data is not modified."""
        pixels = self.X[idx].copy()
        target = self.y[idx].clone()

        if self.augment:
            pixels = smart_augment(pixels)

        tensor = torch.as_tensor(pixels, dtype=torch.float32)

        # Values above 1 are taken to be on the 0-255 scale.
        if tensor.max() > 1.0:
            tensor = tensor / 255.0

        # HWC -> CHW when the last axis holds the 3 color channels.
        if tensor.dim() == 3 and tensor.size(-1) == 3:
            tensor = tensor.permute(2, 0, 1)

        # Normalize with ImageNet statistics
        normalized = (tensor - self.mean) / self.std

        return normalized, target

def create_model(model_name, num_classes=5):
    """Create a pretrained timm model (drop_rate=0.4) with part of the backbone frozen.

    Freezing depends on the first architecture family found in ``model_name``:
    DenseNet, ResNeXt/ResNet, EfficientNet and ConvNeXt keep only their last
    stage and the head trainable; ViT freezes transformer blocks 0-2; any
    other architecture stays fully trainable. The model is moved to ``device``.
    """
    model = timm.create_model(model_name, pretrained=True, num_classes=num_classes, drop_rate=0.4)

    # (family substrings, parameter-name fragments that stay trainable);
    # the first matching family wins.
    trainable_by_family = (
        (('densenet',), ('denseblock4', 'classifier')),
        (('resnext', 'resnet'), ('layer4', 'fc')),
        (('efficientnet',), ('blocks.6', 'blocks.7', 'classifier')),
        (('convnext',), ('stages.3', 'head')),
    )

    for family_tags, keep_fragments in trainable_by_family:
        if any(tag in model_name for tag in family_tags):
            for param_name, param in model.named_parameters():
                if not any(fragment in param_name for fragment in keep_fragments):
                    param.requires_grad = False
            break
    else:
        if 'vit' in model_name:
            # Freeze early transformer blocks
            frozen_fragments = ('blocks.0.', 'blocks.1.', 'blocks.2.')
            for param_name, param in model.named_parameters():
                if any(fragment in param_name for fragment in frozen_fragments):
                    param.requires_grad = False

    return model.to(device)

def train_model_optimized(model, train_loader, test_loader, model_name, epochs=100):
    """Fine-tune ``model`` with AdamW, cosine annealing, occasional mixup and early stopping.

    Relies on the notebook-level globals ``device``, ``use_amp``, ``scaler``,
    ``autocast`` and ``evaluate_with_tta``.

    Args:
        model: Network to train; it is modified in place and also returned.
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Loader handed to ``evaluate_with_tta`` after every epoch.
            NOTE(review): the same split drives early stopping and checkpoint
            selection, so the returned accuracy is optimistically biased.
        model_name: Used for logging and, lower-cased, to choose the learning
            rate and weight decay.
        epochs: Maximum number of epochs; also the cosine annealing period.

    Returns:
        ``(model, best_acc)`` where ``model`` carries the weights of the epoch with
        the highest TTA accuracy and ``best_acc`` is that accuracy in percent.
    """
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Architecture-specific optimizer settings
    name_lower = model_name.lower()
    if 'vit' in name_lower:
        lr, weight_decay = 0.0005, 0.05
    elif 'convnext' in name_lower:
        lr, weight_decay = 0.0008, 0.05
    elif 'efficientnet' in name_lower:
        lr, weight_decay = 0.0006, 0.04
    else:
        lr, weight_decay = 0.0007, 0.05
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-7)

    best_acc = 0.0
    patience = 25
    patience_counter = 0
    best_state = None

    print(f"Training {model_name} on COLOR images")

    for epoch in range(epochs):
        # Training
        model.train()
        train_correct = 0
        train_total = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            # Occasional mixup (about 30% of batches)
            use_mixup = random.random() < 0.3
            if use_mixup:
                lam = np.random.beta(0.4, 0.4)
                index = torch.randperm(images.size(0)).to(device)
                inputs = lam * images + (1 - lam) * images[index, :]
                y_a, y_b = labels, labels[index]
            else:
                inputs = images
                y_a = labels

            if use_amp:
                with autocast():
                    outputs = model(inputs)
                    if use_mixup:
                        loss = lam * criterion(outputs, y_a) + (1 - lam) * criterion(outputs, y_b)
                    else:
                        loss = criterion(outputs, y_a)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(inputs)
                if use_mixup:
                    loss = lam * criterion(outputs, y_a) + (1 - lam) * criterion(outputs, y_b)
                else:
                    loss = criterion(outputs, y_a)
                loss.backward()
                optimizer.step()

            # Training accuracy is measured against the un-permuted labels,
            # so it is only approximate on mixup batches.
            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == y_a).sum().item()

            if batch_idx % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        train_acc = 100 * train_correct / train_total if train_total else 0.0
        scheduler.step()

        # Test with TTA
        test_acc = evaluate_with_tta(model, test_loader)

        if test_acc > best_acc:
            best_acc = test_acc
            patience_counter = 0
            # state_dict() hands back references to the live tensors and dict.copy()
            # is shallow, so each tensor must be cloned to keep a real snapshot;
            # otherwise later optimizer steps overwrite the "best" weights.
            best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
        else:
            patience_counter += 1

        if epoch % 20 == 0 or patience_counter >= patience:
            print(f"  Epoch {epoch:3d}: Train {train_acc:.1f}%, Test+TTA {test_acc:.1f}%")

        if patience_counter >= patience:
            print(f"  Early stopping at epoch {epoch}")
            break

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_acc

def evaluate_with_tta(model, test_loader):
    """Accuracy (percent) with four-view test-time augmentation.

    The softmax outputs of the original batch, its horizontal flip, and two
    copies that are rescaled by 1.1 / 0.9 and then resized back to the input
    resolution are averaged before taking the arg-max.

    Relies on the notebook-level globals ``device``, ``use_amp`` and ``autocast``.

    Args:
        model: Network to evaluate; switched to eval mode.
        test_loader: Iterable of ``(images, labels)`` batches with NCHW images.

    Returns:
        Accuracy in percent, or 0.0 when the loader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0

    def forward(batch):
        if use_amp:
            with autocast():
                return model(batch)
        return model(batch)

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            # Resize the rescaled views back to the batch's own resolution
            # instead of assuming 224x224.
            target_size = tuple(images.shape[-2:])

            # Original
            prob_sum = F.softmax(forward(images), dim=1)

            # Horizontal flip
            prob_sum = prob_sum + F.softmax(forward(torch.flip(images, [3])), dim=1)

            # Slight scale variations
            for scale in (1.1, 0.9):
                rescaled = F.interpolate(images, scale_factor=scale, mode='bilinear', align_corners=False)
                rescaled = F.interpolate(rescaled, size=target_size, mode='bilinear', align_corners=False)
                prob_sum = prob_sum + F.softmax(forward(rescaled), dim=1)

            # Average all TTA predictions
            avg_outputs = prob_sum / 4
            _, predicted = torch.max(avg_outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct / total if total else 0.0

# Main execution
# Every run writes its checkpoints into a fresh, timestamped directory.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"deer_age_color_only_{timestamp}"
os.makedirs(output_dir, exist_ok=True)
print(f"COLOR-ONLY models saved to: {output_dir}")

# Load ONLY color images with preserved color information
# NOTE(review): `color_path` and `load_color_images_only` are not defined in this
# part of the cell; presumably they are defined earlier in it - confirm.
print("Loading COLOR images only...")
X, y = load_color_images_only(color_path)

# Train-test split: stratified 80/20 with a fixed seed.
# NOTE(review): there is no separate validation split; train_model_optimized uses
# this "test" split for early stopping and best-checkpoint selection, so the
# accuracies reported below are optimistic estimates.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# Create datasets (augmentation is applied to the training split only)
train_dataset = ColorDeerDataset(X_train, y_train, augment=True)
test_dataset = ColorDeerDataset(X_test, y_test, augment=False)

# num_workers=0 keeps data loading in the main process.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

print(f"\n{'='*80}")
print("COLOR-ONLY WIDE FAMILY SURVEY")
print(f"{'='*80}")
print("Hypothesis: Color information will boost accuracy to 75%+")
print("Strategy: Preserve RGB, test many architectures")

# Comprehensive model families
# Each entry is (timm model id, display name). The timm id is passed to
# create_model(); the display name is used for logging, checkpoint filenames,
# the per-family report below, and the optimizer choice in train_model_optimized.
model_families = [
    # Proven performers (from previous tests)
    ('resnext50_32x4d', 'ResNeXt50'),
    ('densenet169', 'DenseNet169'),
    ('densenet201', 'DenseNet201'),
    ('tf_efficientnetv2_s', 'EfficientNetV2_S'),
    
    # EfficientNet family
    ('efficientnet_b0', 'EfficientNetB0'),
    ('efficientnet_b1', 'EfficientNetB1'),
    ('efficientnet_b2', 'EfficientNetB2'),
    ('efficientnet_b3', 'EfficientNetB3'),
    
    # ResNet family
    ('resnet50', 'ResNet50'),
    ('resnet101', 'ResNet101'),
    ('resnet152', 'ResNet152'),
    
    # DenseNet family
    ('densenet121', 'DenseNet121'),
    
    # Modern architectures
    ('convnext_tiny', 'ConvNeXt_Tiny'),
    ('convnext_small', 'ConvNeXt_Small'),
    
    # Vision Transformers
    ('vit_base_patch16_224', 'ViT_Base'),
    ('vit_small_patch16_224', 'ViT_Small'),
    
    # RegNet family
    # NOTE(review): the recorded run printed "Unknown model" for both ids below, so
    # neither was actually trained. timm may name them regnetx_008 / regnety_008 -
    # confirm with timm.list_models('regnet*') before relying on this entry.
    ('regnetx_800mf', 'RegNetX_800MF'),
    ('regnety_800mf', 'RegNetY_800MF'),
    
    # Additional strong performers
    ('wide_resnet50_2', 'WideResNet50'),
    ('mobilenetv3_large_100', 'MobileNetV3_Large'),
]

# One dict per successfully trained model; sorted and reported further below.
results = []
# NOTE(review): best_accuracy / best_family are updated in the loop but not read
# again in the visible part of this cell (the report uses results[0] instead).
best_accuracy = 0
best_family = None

print(f"Testing {len(model_families)} model families on COLOR images...")

# Train every candidate in turn, saving a checkpoint and a result row for each.
for model_timm_name, display_name in model_families:
    print(f"\n{'='*50}")
    print(f"Testing {display_name}")
    print(f"{'='*50}")
    
    try:
        model = create_model(model_timm_name)
        # The display name (not the timm id) is what train_model_optimized
        # lower-cases to pick its learning rate and weight decay.
        trained_model, accuracy = train_model_optimized(
            model, train_loader, test_loader, display_name
        )
        
        # Save model; the accuracy is embedded in the filename with 'p' as the
        # decimal point (e.g. 68.5 -> "68p5pct").
        acc_str = f"{accuracy:.1f}".replace('.', 'p')
        model_filename = f"{display_name}_color_{acc_str}pct.pth"
        model_path = os.path.join(output_dir, model_filename)
        
        torch.save({
            'model_state_dict': trained_model.state_dict(),
            'model_name': display_name,
            'timm_name': model_timm_name,
            'accuracy': accuracy,
            'num_classes': 5,
            'color_only': True
        }, model_path)
        
        results.append({
            'model': display_name,
            'timm_name': model_timm_name,
            'accuracy': accuracy,
            'filename': model_filename
        })
        
        # 75% is the accuracy target stated in the hypothesis above.
        status = "TARGET ACHIEVED!" if accuracy >= 75.0 else f"{75.0-accuracy:.1f}% to go"
        print(f"{display_name}: {accuracy:.1f}% - {status}")
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_family = display_name
        
        # Cleanup: drop the references and release cached GPU memory before the
        # next architecture is created.
        del model, trained_model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        
    except Exception as e:
        # Best-effort survey: a failing architecture (e.g. an unknown timm id) is
        # logged and skipped. The cleanup above is not executed on this path, so
        # `model` stays referenced until the next iteration rebinds it.
        print(f"Error with {display_name}: {e}")
        continue

# Results by family
print(f"\n{'='*80}")
print("COLOR-ONLY RESULTS BY FAMILY")
print(f"{'='*80}")

# Best model first; the per-family lists below inherit this ordering.
results.sort(key=lambda x: x['accuracy'], reverse=True)

# Family name -> display names (as used in model_families) that belong to it.
families = {
    'ResNet': ['ResNet50', 'ResNet101', 'ResNet152', 'ResNeXt50', 'WideResNet50'],
    'DenseNet': ['DenseNet121', 'DenseNet169', 'DenseNet201'],
    'EfficientNet': ['EfficientNetB0', 'EfficientNetB1', 'EfficientNetB2', 'EfficientNetB3', 'EfficientNetV2_S'],
    'ConvNeXt': ['ConvNeXt_Tiny', 'ConvNeXt_Small'],
    'ViT': ['ViT_Base', 'ViT_Small'],
    'RegNet': ['RegNetX_800MF', 'RegNetY_800MF'],
    'Mobile': ['MobileNetV3_Large']
}

# Families in which no model finished training are omitted from the report.
for family_name, family_models in families.items():
    family_results = [r for r in results if r['model'] in family_models]
    if family_results:
        print(f"\n{family_name} FAMILY:")
        for result in family_results:
            status = "SUCCESS!" if result['accuracy'] >= 75.0 else f"{75.0-result['accuracy']:.1f}% to go"
            print(f"  {result['model']:20s} - {result['accuracy']:.1f}% - {status}")

print(f"\n{'='*80}")
print("OVERALL TOP PERFORMERS (COLOR-ONLY)")
print(f"{'='*80}")

# Overall ranking, limited to the 15 best models.
for i, result in enumerate(results[:15], 1):
    status = "TARGET ACHIEVED!" if result['accuracy'] >= 75.0 else f"{75.0-result['accuracy']:.1f}% to go"
    print(f"{i:2d}. {result['model']:20s} - {result['accuracy']:.1f}% - {status}")

# Verdict on the hypothesis, based on the single best model (if any trained).
if results:
    best_result = results[0]
    print(f"\nBEST COLOR-ONLY MODEL: {best_result['model']} - {best_result['accuracy']:.1f}%")
    
    if best_result['accuracy'] >= 75.0:
        print("SUCCESS: COLOR INFORMATION ACHIEVED 75% TARGET!")
        print("The hypothesis was correct - color data was crucial!")
    else:
        gap = 75.0 - best_result['accuracy']
        print(f"COLOR-ONLY RESULT: {gap:.1f}% short of 75% target")
        
        # Compare to previous best (66% with mixed data)
        # The 66.0 baseline is hard-coded from that earlier experiment.
        improvement = best_result['accuracy'] - 66.0
        if improvement > 0:
            print(f"IMPROVEMENT: +{improvement:.1f}% vs mixed color/grayscale data")
        else:
            print("No improvement over mixed data - hypothesis incorrect")

print(f"\nTotal models tested: {len(results)}")
print(f"All COLOR-ONLY models saved in: {output_dir}")
print("="*80)

Using device: cuda
GPU: NVIDIA GeForce RTX 2060
Mixed Precision: Enabled
COLOR-ONLY models saved to: deer_age_color_only_20250802_082228
Loading COLOR images only...
Loading ONLY color images from: G:\Dropbox\AI Projects\buck\images\squared\color
Color images loaded: 361
Image shape: (361, 224, 224, 3)
Class distribution: Counter({np.int64(4): 88, np.int64(3): 75, np.int64(2): 75, np.int64(1): 66, np.int64(0): 57})
Train: 288, Test: 73

COLOR-ONLY WIDE FAMILY SURVEY
Hypothesis: Color information will boost accuracy to 75%+
Strategy: Preserve RGB, test many architectures
Testing 20 model families on COLOR images...

Testing ResNeXt50
Training ResNeXt50 on COLOR images
  Epoch   0: Train 22.9%, Test+TTA 27.4%
  Epoch  20: Train 74.0%, Test+TTA 60.3%
  Epoch  40: Train 82.6%, Test+TTA 63.0%
  Epoch  60: Train 86.1%, Test+TTA 63.0%
  Epoch  61: Train 96.5%, Test+TTA 63.0%
  Early stopping at epoch 61
ResNeXt50: 68.5% - 6.5% to go

Testing DenseNet169
Training DenseNet169 on COLOR images
  

model.safetensors:   0%|          | 0.00/88.2M [00:00<?, ?B/s]

Training ViT_Small on COLOR images
  Epoch   0: Train 20.5%, Test+TTA 23.3%
  Epoch  20: Train 51.0%, Test+TTA 39.7%
  Epoch  40: Train 69.1%, Test+TTA 64.4%
  Epoch  60: Train 80.6%, Test+TTA 63.0%
  Epoch  67: Train 83.3%, Test+TTA 58.9%
  Early stopping at epoch 67
ViT_Small: 67.1% - 7.9% to go

Testing RegNetX_800MF
Error with RegNetX_800MF: Unknown model (regnetx_800mf)

Testing RegNetY_800MF
Error with RegNetY_800MF: Unknown model (regnety_800mf)

Testing WideResNet50
Training WideResNet50 on COLOR images
  Epoch   0: Train 25.0%, Test+TTA 37.0%
  Epoch  20: Train 84.0%, Test+TTA 65.8%
  Epoch  39: Train 77.1%, Test+TTA 64.4%
  Early stopping at epoch 39
WideResNet50: 68.5% - 6.5% to go

Testing MobileNetV3_Large
Training MobileNetV3_Large on COLOR images
  Epoch   0: Train 25.7%, Test+TTA 31.5%
  Epoch  20: Train 84.7%, Test+TTA 53.4%
  Epoch  40: Train 78.1%, Test+TTA 56.2%
  Epoch  51: Train 77.4%, Test+TTA 53.4%
  Early stopping at epoch 51
MobileNetV3_Large: 64.4% - 10.6% to

In [14]:
# Hyperparameter optimization for the EfficientNet-B0 model

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import os
import gc
import random
import json
import itertools
from datetime import datetime, timedelta
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Mixed-precision support: use torch.cuda.amp when it can be imported, otherwise
# fall back to a no-op `autocast` so `with autocast():` blocks still run.
try:
    from torch.cuda.amp import autocast, GradScaler
    MIXED_PRECISION_AVAILABLE = True
except ImportError:
    MIXED_PRECISION_AVAILABLE = False
    # Stand-in context manager that does nothing on enter or exit.
    class autocast:
        def __enter__(self):
            return self
        def __exit__(self, *args):
            pass

# Train on the GPU when one is visible, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# `use_amp` is defined on every path; `scaler` exists only when AMP is enabled,
# and the training code only touches it under `if use_amp:`.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    torch.backends.cudnn.benchmark = True
    if MIXED_PRECISION_AVAILABLE:
        scaler = GradScaler()
        use_amp = True
        print("Mixed Precision: Enabled")
    else:
        use_amp = False
else:
    use_amp = False

# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
color_path = r"G:\Dropbox\AI Projects\buck\images\squared\color"

def parse_filename(filename):
    """Extract the age encoded in an underscore-delimited image filename.

    The age is the fourth ``_``-separated token, written with ``p`` in place of
    the decimal point (``4p5`` -> 4.5). Values above 5.5 are capped at 5.5.

    When the age token is the last component of the name it still carries the
    file extension (``..._4p5.jpg``); in that case a trailing ``.jpg`` / ``.jpeg``
    / ``.png`` is stripped and the parse is retried, rather than silently
    discarding the file.

    Args:
        filename: Base name of the image file.

    Returns:
        The age as a float, or ``None`` when the name has fewer than four tokens
        or the token is not numeric.
    """
    parts = filename.split('_')
    if len(parts) < 4:
        return None

    age_token = parts[3]
    candidates = [age_token]
    stem, extension = os.path.splitext(age_token)
    if extension.lower() in ('.jpg', '.jpeg', '.png'):
        # Fallback only: the raw token is always tried first, so every name that
        # parsed before still yields the same value.
        candidates.append(stem)

    for candidate in candidates:
        try:
            age = float(candidate.replace('p', '.'))
        except ValueError:
            continue
        if age > 5.5:
            age = 5.5
        return age
    return None

def age_to_class(age):
    """Map an age of 1.5, 2.5, 3.5, 4.5 or 5.5 to class index 0-4; any other age gives ``None``."""
    class_by_age = dict(zip((1.5, 2.5, 3.5, 4.5, 5.5), range(5)))
    try:
        return class_by_age[age]
    except KeyError:
        return None

def load_color_images_only(color_path, img_size=(224, 224)):
    """Load every labelled image in ``color_path`` as a resized RGB array.

    A file is used when it has a ``.jpg`` / ``.jpeg`` / ``.png`` extension,
    ``parse_filename`` yields an age, ``age_to_class`` maps that age to a class,
    and OpenCV can decode it. Files are visited in sorted name order so that the
    result - and any seeded split made from it - is reproducible across
    filesystems (``os.listdir`` order is arbitrary).

    Args:
        color_path: Directory containing the images.
        img_size: Target size passed to ``cv2.resize``.

    Returns:
        ``(images, ages)`` as numpy arrays: the RGB images and their class
        indices. Both are empty when the directory is missing or nothing matches.
    """
    images = []
    ages = []

    if os.path.exists(color_path):
        for filename in sorted(os.listdir(color_path)):
            if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue

            age = parse_filename(filename)
            if age is None:
                continue

            class_idx = age_to_class(age)
            if class_idx is None:
                continue

            img = cv2.imread(os.path.join(color_path, filename))
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                continue

            # OpenCV decodes to BGR; the rest of the pipeline expects RGB.
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            images.append(cv2.resize(img_rgb, img_size))
            ages.append(class_idx)
    else:
        # Make the otherwise silent empty result visible to the user.
        print(f"Warning: image directory not found: {color_path}")

    images = np.array(images)
    ages = np.array(ages)

    return images, ages

# Augmentation parameters by intensity. Built once at definition time instead of
# on every call.
_AUGMENTATION_PRESETS = {
    'light': {
        'flip_prob': 0.3, 'rot_prob': 0.2, 'rot_range': 5,
        'color_prob': 0.4, 'alpha_range': (0.95, 1.05), 'beta_range': 5,
        'hsv_prob': 0.2, 'hue_range': 5, 'sat_range': (0.95, 1.05),
        'noise_prob': 0.1, 'noise_std': 2
    },
    'medium': {
        'flip_prob': 0.5, 'rot_prob': 0.4, 'rot_range': 10,
        'color_prob': 0.6, 'alpha_range': (0.9, 1.1), 'beta_range': 10,
        'hsv_prob': 0.3, 'hue_range': 8, 'sat_range': (0.9, 1.1),
        'noise_prob': 0.2, 'noise_std': 3
    },
    'heavy': {
        'flip_prob': 0.7, 'rot_prob': 0.6, 'rot_range': 15,
        'color_prob': 0.8, 'alpha_range': (0.8, 1.2), 'beta_range': 15,
        'hsv_prob': 0.5, 'hue_range': 12, 'sat_range': (0.8, 1.2),
        'noise_prob': 0.3, 'noise_std': 5
    }
}


def create_augmented_image(image, aug_intensity='medium'):
    """Randomly augment one RGB image with a preset intensity.

    Each step is applied independently with the preset's probability, in this
    order: horizontal flip, rotation about the centre, brightness/contrast
    jitter, hue/saturation shift, additive Gaussian noise.

    Args:
        image: Array-like image; it is cast to ``uint8`` first.
        aug_intensity: ``'light'``, ``'medium'`` or ``'heavy'``.

    Returns:
        The augmented ``uint8`` image.

    Raises:
        KeyError: If ``aug_intensity`` is not one of the presets.
    """
    image = image.astype(np.uint8)
    params = _AUGMENTATION_PRESETS[aug_intensity]

    # Horizontal flip
    if random.random() < params['flip_prob']:
        image = cv2.flip(image, 1)

    # Rotation
    if random.random() < params['rot_prob']:
        angle = random.uniform(-params['rot_range'], params['rot_range'])
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))

    # Color jittering (alpha scales contrast, beta shifts brightness)
    if random.random() < params['color_prob']:
        alpha = random.uniform(*params['alpha_range'])
        beta = random.randint(-params['beta_range'], params['beta_range'])
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)

    # HSV shifts
    if random.random() < params['hsv_prob']:
        try:
            hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
            h_shift = random.randint(-params['hue_range'], params['hue_range'])
            # Hue is an angle stored as 0..179 for 8-bit images, so it must wrap
            # around; clipping would pile every out-of-range hue onto 0 or 179.
            shifted_hue = (hsv[:, :, 0].astype(np.int16) + h_shift) % 180
            hsv[:, :, 0] = shifted_hue.astype(np.uint8)
            s_factor = random.uniform(*params['sat_range'])
            hsv[:, :, 1] = np.clip(hsv[:, :, 1].astype(np.float32) * s_factor, 0, 255).astype(np.uint8)
            image = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
        except cv2.error:
            # Best effort: skip the HSV step if OpenCV cannot convert this image
            # (e.g. it is not 3-channel), but let unrelated errors surface.
            pass

    # Gaussian noise (added in int16 to avoid uint8 wrap-around, then clipped)
    if random.random() < params['noise_prob']:
        noise = np.random.normal(0, params['noise_std'], image.shape).astype(np.int16)
        noisy_image = np.clip(image.astype(np.int16) + noise, 0, 255)
        image = noisy_image.astype(np.uint8)

    return image

class OptimizedDeerDataset(Dataset):
    """Dataset that yields ImageNet-normalized CHW tensors with their class labels.

    Images are resized to ``img_size`` x ``img_size`` when needed and, if
    ``augment`` is set, passed through ``create_augmented_image`` with the
    configured ``aug_intensity``.
    """

    def __init__(self, X, y, augment=False, aug_intensity='medium', img_size=224):
        self.X = X
        self.y = torch.LongTensor(y)
        self.augment = augment
        self.aug_intensity = aug_intensity
        self.img_size = img_size
        # ImageNet channel statistics, shaped (3, 1, 1) to broadcast over CHW images.
        imagenet_mean = [0.485, 0.456, 0.406]
        imagenet_std = [0.229, 0.224, 0.225]
        self.mean = torch.tensor(imagenet_mean).view(3, 1, 1)
        self.std = torch.tensor(imagenet_std).view(3, 1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        label = self.y[idx].clone()
        pixels = self.X[idx].copy()

        # Resize if needed
        target_hw = (self.img_size, self.img_size)
        if pixels.shape[:2] != target_hw:
            pixels = cv2.resize(pixels, target_hw)

        if self.augment:
            pixels = create_augmented_image(pixels, self.aug_intensity)

        tensor = torch.FloatTensor(pixels)
        # Heuristic: anything above 1.0 is treated as 0-255 intensities.
        if tensor.max() > 1.0:
            tensor = tensor / 255.0

        # HWC -> CHW
        if tensor.dim() == 3 and tensor.shape[-1] == 3:
            tensor = tensor.permute(2, 0, 1)

        normalized = (tensor - self.mean) / self.std
        return normalized, label

def create_efficientnet_b0(num_classes=5, drop_rate=0.4, freezing_strategy='standard'):
    """Build a pretrained EfficientNet-B0 and freeze parameters by name.

    Freezing strategies (matched on substrings of the parameter names):
        ``'none'``     - nothing is frozen.
        ``'light'``    - parameters under ``blocks.0.`` / ``blocks.1.`` are frozen.
        ``'standard'`` - everything is frozen except ``blocks.6.``, ``blocks.7.``
                         and ``classifier``.
        ``'heavy'``    - everything is frozen except ``blocks.7.`` and ``classifier``.
    Any other value leaves every parameter trainable.

    NOTE(review): timm's efficientnet_b0 appears to expose only blocks.0-blocks.6;
    if so, 'blocks.7.' never matches and 'heavy' trains the classifier alone -
    confirm against model.named_parameters().

    Returns:
        The model moved to the notebook-level ``device``.
    """
    net = timm.create_model('efficientnet_b0', pretrained=True, num_classes=num_classes, drop_rate=drop_rate)

    # (strategy, name fragments, freeze_on_match): with freeze_on_match=True the
    # matching parameters are frozen, with False everything else is frozen.
    rules = (
        ('light', ('blocks.0.', 'blocks.1.'), True),
        ('standard', ('blocks.6.', 'blocks.7.', 'classifier'), False),
        ('heavy', ('blocks.7.', 'classifier'), False),
    )

    for strategy, fragments, freeze_on_match in rules:
        if freezing_strategy != strategy:
            continue
        for param_name, param in net.named_parameters():
            matched = any(fragment in param_name for fragment in fragments)
            if matched == freeze_on_match:
                param.requires_grad = False
        break

    return net.to(device)

def _build_trial_criterion(config):
    """Create the loss selected by ``config['loss_function']``."""
    loss_name = config['loss_function']
    if loss_name == 'crossentropy':
        return nn.CrossEntropyLoss(label_smoothing=config['label_smoothing'])
    if loss_name == 'focal':
        # Simple focal loss: cross-entropy down-weighted on well-classified samples.
        class FocalLoss(nn.Module):
            def __init__(self, alpha=1, gamma=2):
                super().__init__()
                self.alpha = alpha
                self.gamma = gamma

            def forward(self, inputs, targets):
                ce_loss = F.cross_entropy(inputs, targets, reduction='none')
                pt = torch.exp(-ce_loss)
                focal_loss = self.alpha * (1-pt)**self.gamma * ce_loss
                return focal_loss.mean()

        return FocalLoss(gamma=config['focal_gamma'])
    raise ValueError(f"Unknown loss_function: {loss_name!r}")


def _build_trial_optimizer(config, parameters):
    """Create the optimizer selected by ``config['optimizer']``."""
    optimizer_name = config['optimizer']
    if optimizer_name == 'adamw':
        return optim.AdamW(
            parameters,
            lr=config['learning_rate'],
            weight_decay=config['weight_decay'],
            betas=(config['beta1'], config['beta2'])
        )
    if optimizer_name == 'sgd':
        return optim.SGD(
            parameters,
            lr=config['learning_rate'],
            weight_decay=config['weight_decay'],
            momentum=config['momentum'],
            nesterov=config['nesterov']
        )
    if optimizer_name == 'rmsprop':
        return optim.RMSprop(
            parameters,
            lr=config['learning_rate'],
            weight_decay=config['weight_decay'],
            alpha=config['rmsprop_alpha']
        )
    raise ValueError(f"Unknown optimizer: {optimizer_name!r}")


def _build_trial_scheduler(config, optimizer):
    """Create the learning-rate scheduler selected by ``config['scheduler']``."""
    scheduler_name = config['scheduler']
    if scheduler_name == 'cosine':
        return optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config['epochs'],
            eta_min=config['min_lr']
        )
    if scheduler_name == 'step':
        return optim.lr_scheduler.StepLR(
            optimizer,
            step_size=config['step_size'],
            gamma=config['step_gamma']
        )
    if scheduler_name == 'reduce':
        # The scheduler is stepped with an accuracy (higher is better), so it has
        # to run in 'max' mode; the default 'min' mode would cut the learning
        # rate whenever accuracy keeps improving.
        return optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            patience=config['reduce_patience'],
            factor=config['reduce_factor'],
            min_lr=config['min_lr']
        )
    if scheduler_name == 'warmup_cosine':
        # Linear warmup followed by cosine decay. 'min_lr' is not used here.
        warmup_epochs = config['warmup_epochs']
        decay_epochs = max(1, config['epochs'] - warmup_epochs)

        def lr_lambda(epoch):
            if epoch < warmup_epochs:
                # epoch + 1 so that the first epoch does not run with LR == 0.
                return (epoch + 1) / warmup_epochs
            return 0.5 * (1 + np.cos(np.pi * (epoch - warmup_epochs) / decay_epochs))

        return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
    raise ValueError(f"Unknown scheduler: {scheduler_name!r}")


def train_with_config(config, X_train, y_train, X_test, y_test, trial_num):
    """Train one EfficientNet-B0 trial with the given hyperparameters.

    Relies on the notebook-level globals ``device``, ``use_amp``, ``scaler`` and
    ``autocast`` and on ``OptimizedDeerDataset``, ``create_efficientnet_b0`` and
    ``evaluate_with_advanced_tta``.

    Args:
        config: Hyperparameter dict. Always read: ``aug_intensity``, ``img_size``,
            ``batch_size``, ``dropout_rate``, ``freezing``, ``loss_function``,
            ``optimizer``, ``scheduler``, ``learning_rate``, ``weight_decay``,
            ``epochs``, ``patience``, ``eval_every``, ``mixup_prob``,
            ``mixup_alpha``, ``tta_transforms``; plus the keys specific to the
            chosen loss / optimizer / scheduler.
        X_train, y_train: Training images and class indices.
        X_test, y_test: Evaluation images and class indices.
            NOTE(review): this split also selects the best checkpoint, so the
            returned accuracy is optimistically biased.
        trial_num: Trial index, used for logging only.

    Returns:
        TTA accuracy (percent) of the best checkpoint on the evaluation split.

    Raises:
        ValueError: If the loss, optimizer or scheduler name is unknown.
    """
    # Create datasets with config parameters
    train_dataset = OptimizedDeerDataset(
        X_train, y_train,
        augment=True,
        aug_intensity=config['aug_intensity'],
        img_size=config['img_size']
    )
    test_dataset = OptimizedDeerDataset(
        X_test, y_test,
        augment=False,
        img_size=config['img_size']
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=0
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=config['batch_size'],
        shuffle=False,
        num_workers=0
    )

    # Create model
    model = create_efficientnet_b0(
        drop_rate=config['dropout_rate'],
        freezing_strategy=config['freezing']
    )

    criterion = _build_trial_criterion(config)
    optimizer = _build_trial_optimizer(config, model.parameters())
    scheduler = _build_trial_scheduler(config, optimizer)

    best_acc = 0.0
    # Patience counts evaluations without improvement (one every 'eval_every'
    # epochs), not epochs.
    patience = config['patience']
    patience_counter = 0
    best_state = None

    print(f"Trial {trial_num}: LR={config['learning_rate']:.6f}, BS={config['batch_size']}, "
          f"Drop={config['dropout_rate']:.3f}, Aug={config['aug_intensity']}")

    for epoch in range(config['epochs']):
        # Training
        model.train()
        train_correct = 0
        train_total = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            # Mixup
            use_mixup = random.random() < config['mixup_prob']
            if use_mixup:
                lam = np.random.beta(config['mixup_alpha'], config['mixup_alpha'])
                index = torch.randperm(images.size(0)).to(device)
                inputs = lam * images + (1 - lam) * images[index, :]
                y_a, y_b = labels, labels[index]
            else:
                inputs = images
                y_a = labels

            if use_amp:
                with autocast():
                    outputs = model(inputs)
                    if use_mixup:
                        loss = lam * criterion(outputs, y_a) + (1 - lam) * criterion(outputs, y_b)
                    else:
                        loss = criterion(outputs, y_a)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(inputs)
                if use_mixup:
                    loss = lam * criterion(outputs, y_a) + (1 - lam) * criterion(outputs, y_b)
                else:
                    loss = criterion(outputs, y_a)
                loss.backward()
                optimizer.step()

            # Accuracy is measured against the un-permuted labels, so it is only
            # approximate on mixup batches.
            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == y_a).sum().item()

            if batch_idx % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        train_acc = 100 * train_correct / train_total if train_total else 0.0

        # Update scheduler (ReduceLROnPlateau needs the monitored metric)
        if config['scheduler'] == 'reduce':
            scheduler.step(train_acc)
        else:
            scheduler.step()

        # Evaluation with TTA
        if epoch % config['eval_every'] == 0 or epoch == config['epochs'] - 1:
            test_acc = evaluate_with_advanced_tta(model, test_loader, config['tta_transforms'])

            if test_acc > best_acc:
                best_acc = test_acc
                patience_counter = 0
                # state_dict() returns references to the live tensors and
                # dict.copy() is shallow; clone each tensor so that further
                # training cannot overwrite the snapshot.
                best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
            else:
                patience_counter += 1

            if epoch % (config['eval_every'] * 3) == 0:
                print(f"    Epoch {epoch:3d}: Train {train_acc:.1f}%, Test {test_acc:.1f}%")

            if patience_counter >= patience:
                break

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if best_state is not None:
        model.load_state_dict(best_state)

    # Final evaluation of the restored best checkpoint
    final_acc = evaluate_with_advanced_tta(model, test_loader, config['tta_transforms'])

    # Cleanup
    del model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    return final_acc

def evaluate_with_advanced_tta(model, test_loader, tta_transforms=5):
    """Accuracy (percent) with a configurable amount of test-time augmentation.

    Views whose softmax outputs are averaged, by ``tta_transforms``:
        always  - the original batch
        ``>= 2`` - plus the horizontal flip
        ``>= 3`` - plus copies rescaled by 0.9 and 1.1, resized back
        ``>= 5`` - plus centre crops of 85% and 95%, resized back
    Rescaled and cropped views are resized to the batch's own resolution and the
    crops are centred on it, so the function works for any input size.

    Relies on the notebook-level globals ``device``, ``use_amp`` and ``autocast``.

    Args:
        model: Network to evaluate; switched to eval mode.
        test_loader: Iterable of ``(images, labels)`` batches with NCHW images.
        tta_transforms: TTA level as described above.

    Returns:
        Accuracy in percent, or 0.0 when the loader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0

    def predict(batch):
        if use_amp:
            with autocast():
                logits = model(batch)
        else:
            logits = model(batch)
        return F.softmax(logits, dim=1)

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            height, width = images.shape[-2:]

            # Original
            predictions = [predict(images)]

            if tta_transforms >= 2:
                # Horizontal flip
                predictions.append(predict(torch.flip(images, [3])))

            if tta_transforms >= 3:
                # Scale variations
                for scale in [0.9, 1.1]:
                    scaled = F.interpolate(images, scale_factor=scale, mode='bilinear', align_corners=False)
                    scaled = F.interpolate(scaled, size=(height, width), mode='bilinear', align_corners=False)
                    predictions.append(predict(scaled))

            if tta_transforms >= 5:
                # Crop variations (centre crops of the actual image)
                for crop_factor in [0.85, 0.95]:
                    crop_h = int(height * crop_factor)
                    crop_w = int(width * crop_factor)
                    top = (height - crop_h) // 2
                    left = (width - crop_w) // 2
                    cropped = images[:, :, top:top+crop_h, left:left+crop_w]
                    cropped = F.interpolate(cropped, size=(height, width), mode='bilinear', align_corners=False)
                    predictions.append(predict(cropped))

            # Average all predictions
            avg_output = torch.stack(predictions).mean(0)
            _, predicted = torch.max(avg_output, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct / total if total else 0.0

def generate_hyperparameter_configs(n_trials=200, seed=None):
    """Randomly sample training configurations from the hyperparameter search space.

    Every configuration starts from the fixed settings (``epochs``,
    ``patience``, ``eval_every``), draws one value for each entry of the base
    grid, and then adds the parameters that only apply to the optimizer,
    scheduler and loss function that were drawn.

    Args:
        n_trials: Number of configurations to sample. Defaults to 200, the
            value that used to be hard-coded.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the same list is produced on every
            call and the global ``random`` state is left untouched. When
            ``None`` the global ``random`` module is used.

    Returns:
        list[dict]: ``n_trials`` configuration dictionaries.
    """

    # Base search space: one value is drawn for every key.
    param_grid = {
        'learning_rate': [0.0001, 0.0003, 0.0005, 0.0007, 0.001, 0.002],
        'weight_decay': [0.01, 0.03, 0.05, 0.07, 0.1],
        'dropout_rate': [0.2, 0.3, 0.4, 0.5, 0.6],
        'batch_size': [12, 16, 20, 24],
        'optimizer': ['adamw', 'sgd'],
        'scheduler': ['cosine', 'warmup_cosine', 'reduce'],
        'aug_intensity': ['light', 'medium', 'heavy'],
        'mixup_alpha': [0.2, 0.4, 0.6],
        'mixup_prob': [0.2, 0.3, 0.5],
        'label_smoothing': [0.0, 0.1, 0.15, 0.2],
        'freezing': ['light', 'standard', 'heavy'],
        'img_size': [224, 256],
        'tta_transforms': [2, 3, 5],
        'loss_function': ['crossentropy', 'focal']
    }

    # Optimizer-specific parameters. 'rmsprop' is kept for completeness but is
    # not reachable while it is absent from param_grid['optimizer'].
    optimizer_params = {
        'adamw': {
            'beta1': [0.9, 0.95],
            'beta2': [0.999, 0.9999]
        },
        'sgd': {
            'momentum': [0.9, 0.95],
            'nesterov': [True, False]
        },
        'rmsprop': {
            'rmsprop_alpha': [0.9, 0.99]
        }
    }

    # Scheduler-specific parameters. 'step' is likewise not reachable while it
    # is absent from param_grid['scheduler'].
    scheduler_params = {
        'cosine': {'min_lr': [1e-7, 1e-6]},
        'warmup_cosine': {'warmup_epochs': [5, 10], 'min_lr': [1e-7, 1e-6]},
        'step': {'step_size': [20, 30], 'step_gamma': [0.5, 0.7]},
        'reduce': {'reduce_patience': [5, 8], 'reduce_factor': [0.5, 0.7], 'min_lr': [1e-7, 1e-6]}
    }

    # Loss-specific parameters
    loss_params = {
        'focal': {'focal_gamma': [1.5, 2.0, 2.5]}
    }

    # Fixed parameters shared by every configuration
    fixed_params = {
        'epochs': 80,
        'patience': 20,
        'eval_every': 5
    }

    # (key already present in the config, table of extra parameters for its value).
    # The order matters: it fixes the order in which random draws are consumed.
    conditional_spaces = (
        ('optimizer', optimizer_params),
        ('scheduler', scheduler_params),
        ('loss_function', loss_params),
    )

    # A dedicated generator keeps seeded runs reproducible without touching
    # the global random state; without a seed the global module is used.
    rng = random if seed is None else random.Random(seed)

    configs = []

    # Random search: the full grid is far too large to enumerate.
    for _ in range(n_trials):
        config = fixed_params.copy()

        for param, values in param_grid.items():
            config[param] = rng.choice(values)

        for selector, table in conditional_spaces:
            for param, values in table.get(config[selector], {}).items():
                config[param] = rng.choice(values)

        configs.append(config)

    return configs

# Main hyperparameter optimization
def main():
    """Run the EfficientNet-B0 random hyperparameter search and save the results.

    Loads the color images, holds out a stratified 20% test split, trains one
    model per sampled configuration through ``train_with_config`` and tracks
    the best accuracy it returns. Results are written as JSON into a
    timestamped output directory:

    * ``best_result.json`` whenever a new best accuracy is found,
    * ``results_checkpoint_<n>.json`` every ``save_every`` trials,
    * ``final_optimization_results.json`` at the end.

    A failing trial does not abort the search: the error is printed, the
    trial is recorded under ``failed_trials`` in the saved JSON, and the loop
    moves on to the next configuration.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"efficientnet_b0_optimization_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)

    start_time = datetime.now()

    print("="*80)
    print("EFFICIENTNET-B0 COMPREHENSIVE HYPERPARAMETER OPTIMIZATION")
    print("="*80)
    print(f"Start time: {start_time}")
    print(f"Results directory: {output_dir}")
    print("Goal: Push beyond 72.6% to maximum possible accuracy")
    print("Strategy: Systematic exploration of hyperparameter space")

    # Load data
    print("\nLoading color images...")
    X, y = load_color_images_only(color_path)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Train: {len(X_train)}, Test: {len(X_test)}")

    # Generate hyperparameter configurations
    print("\nGenerating hyperparameter configurations...")
    configs = generate_hyperparameter_configs()
    print(f"Total configurations to test: {len(configs)}")
    print(f"Estimated runtime: {len(configs) * 15} minutes = {len(configs) * 15 / 60:.1f} hours")

    results = []
    failed_trials = []
    best_accuracy = 0
    best_config = None

    # Write a checkpoint file every `save_every` trials
    save_every = 10

    for trial_num, config in enumerate(configs, 1):
        trial_start = datetime.now()

        try:
            accuracy = train_with_config(config, X_train, y_train, X_test, y_test, trial_num)

            results.append({
                'trial': trial_num,
                'accuracy': accuracy,
                'config': config,
                'timestamp': trial_start.isoformat()
            })

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_config = config.copy()

                # Persist the new best immediately so it survives a crash
                best_result = {
                    'best_accuracy': best_accuracy,
                    'best_config': best_config,
                    'trial': trial_num,
                    'timestamp': trial_start.isoformat()
                }

                with open(os.path.join(output_dir, 'best_result.json'), 'w') as f:
                    json.dump(best_result, f, indent=2)

                status = "NEW BEST!"
                if accuracy >= 75.0:
                    status += " TARGET ACHIEVED!"
            else:
                status = f"{best_accuracy - accuracy:.1f}% below best"

            trial_duration = (datetime.now() - trial_start).total_seconds() / 60
            elapsed_total = (datetime.now() - start_time).total_seconds() / 60
            # Extrapolate from the mean time per trial so far; a single trial
            # is a poor predictor because trial durations are not uniform.
            estimated_remaining = (len(configs) - trial_num) * (elapsed_total / trial_num)

            print(f"Trial {trial_num:3d}/{len(configs)}: {accuracy:.1f}% - {status}")
            print(f"    Time: {trial_duration:.1f}min | Total: {elapsed_total:.1f}min | Est. remaining: {estimated_remaining:.1f}min")
            print(f"    Best so far: {best_accuracy:.1f}%")

        except Exception as e:
            # Best effort: one bad configuration must not end a long search,
            # but keep a record so the failure shows up in the saved results.
            print(f"Trial {trial_num} failed: {e}")
            failed_trials.append({
                'trial': trial_num,
                'error': str(e),
                'config': config,
                'timestamp': trial_start.isoformat()
            })

        # Checkpoint outside the trial's try-block so that a failure on trial
        # 10, 20, ... does not also skip the periodic save.
        if trial_num % save_every == 0:
            try:
                with open(os.path.join(output_dir, f'results_checkpoint_{trial_num}.json'), 'w') as f:
                    json.dump({
                        'results': results,
                        'failed_trials': failed_trials,
                        'best_accuracy': best_accuracy,
                        'best_config': best_config,
                        'trials_completed': trial_num,
                        'start_time': start_time.isoformat(),
                        'checkpoint_time': datetime.now().isoformat()
                    }, f, indent=2)

                print(f"    Checkpoint saved at trial {trial_num}")
            except Exception as e:
                print(f"    Checkpoint at trial {trial_num} failed: {e}")

        print()

    # Final results
    end_time = datetime.now()
    total_duration = end_time - start_time

    print("="*80)
    print("OPTIMIZATION COMPLETE")
    print("="*80)
    print(f"Total time: {total_duration}")
    print(f"Trials completed: {len(results)}")
    print(f"Trials failed: {len(failed_trials)}")
    print(f"Best accuracy achieved: {best_accuracy:.1f}%")

    if best_accuracy >= 75.0:
        print("SUCCESS: 75% TARGET ACHIEVED!")
    else:
        print(f"Gap to 75%: {75.0 - best_accuracy:.1f}%")

    # Sort results by accuracy, best first
    results.sort(key=lambda x: x['accuracy'], reverse=True)

    print(f"\nTop 10 configurations:")
    for i, result in enumerate(results[:10], 1):
        print(f"{i:2d}. {result['accuracy']:.1f}% - Trial {result['trial']}")

    # Save final results
    final_results = {
        'optimization_summary': {
            'start_time': start_time.isoformat(),
            'end_time': end_time.isoformat(),
            'total_duration_hours': total_duration.total_seconds() / 3600,
            'trials_completed': len(results),
            'trials_failed': len(failed_trials),
            'best_accuracy': best_accuracy,
            'target_achieved': best_accuracy >= 75.0
        },
        'best_configuration': best_config,
        'all_results': results,
        'failed_trials': failed_trials
    }

    with open(os.path.join(output_dir, 'final_optimization_results.json'), 'w') as f:
        json.dump(final_results, f, indent=2)

    print(f"\nFinal results saved to: {output_dir}")
    print("="*80)

# Entry point: start the hyperparameter search when this code is executed
# directly rather than imported as a module.
if __name__ == "__main__":
    main()

Using device: cuda
GPU: NVIDIA GeForce RTX 2060
Mixed Precision: Enabled
EFFICIENTNET-B0 COMPREHENSIVE HYPERPARAMETER OPTIMIZATION
Start time: 2025-08-02 09:49:32.963024
Results directory: efficientnet_b0_optimization_20250802_094932
Goal: Push beyond 72.6% to maximum possible accuracy
Strategy: Systematic exploration of hyperparameter space

Loading color images...
Train: 288, Test: 73

Generating hyperparameter configurations...
Total configurations to test: 200
Estimated runtime: 3000 minutes = 50.0 hours
Trial 1: LR=0.000700, BS=20, Drop=0.500, Aug=heavy
    Epoch   0: Train 18.4%, Test 15.1%
    Epoch  15: Train 67.4%, Test 61.6%
    Epoch  30: Train 61.1%, Test 58.9%
    Epoch  45: Train 81.2%, Test 60.3%
    Epoch  60: Train 80.6%, Test 60.3%
    Epoch  75: Train 69.4%, Test 63.0%
Trial   1/200: 63.0% - NEW BEST!
    Time: 1.9min | Total: 2.0min | Est. remaining: 373.1min
    Best so far: 63.0%

Trial 2: LR=0.000100, BS=12, Drop=0.600, Aug=heavy
    Epoch   0: Train 21.9%, Test 

In [16]:
# Multi-fold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import os
import gc
import random
import json
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold, train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Automatic mixed precision is optional: when torch.cuda.amp cannot be
# imported, a do-nothing context manager takes the place of `autocast` so
# that `with autocast():` blocks elsewhere stay valid.
try:
    from torch.cuda.amp import autocast, GradScaler
except ImportError:
    MIXED_PRECISION_AVAILABLE = False

    class autocast:
        # No-op stand-in for torch.cuda.amp.autocast
        def __enter__(self):
            return self

        def __exit__(self, *args):
            pass
else:
    MIXED_PRECISION_AVAILABLE = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Mixed precision is enabled only when a CUDA device is present AND the AMP
# import above succeeded; `scaler` is created only in that case.
use_amp = False
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    torch.backends.cudnn.benchmark = True
    if MIXED_PRECISION_AVAILABLE:
        scaler = GradScaler()
        use_amp = True
        print("Mixed Precision: Enabled")

# Directory that holds the color images. The original machine-specific
# location remains the default; set the DEER_COLOR_PATH environment variable
# to run the notebook against a different directory without editing the code.
color_path = os.environ.get("DEER_COLOR_PATH", r"G:\Dropbox\AI Projects\buck\images\squared\color")

def parse_filename(filename):
    """Read the age encoded in the fourth underscore-separated field of a filename.

    The field uses ``p`` as the decimal point (``2p5`` means 2.5). Ages
    above 5.5 are capped at 5.5.

    Returns:
        float | None: The (capped) age, or ``None`` when the name has fewer
        than four fields or the fourth field is not a number.
    """
    fields = filename.split('_')
    if len(fields) < 4:
        return None

    try:
        age = float(fields[3].replace('p', '.'))
    except ValueError:
        return None

    # Everything older than 5.5 is folded into the 5.5 bucket
    return 5.5 if age > 5.5 else age

def age_to_class(age):
    """Map an age of 1.5, 2.5, 3.5, 4.5 or 5.5 to class index 0-4.

    Returns ``None`` for any other age.
    """
    age_buckets = (1.5, 2.5, 3.5, 4.5, 5.5)
    class_of = {bucket: idx for idx, bucket in enumerate(age_buckets)}
    return class_of.get(age)

def load_color_images_only(color_path, img_size=(224, 224)):
    """Load the labelled images found directly inside ``color_path``.

    Only ``.jpg`` / ``.jpeg`` / ``.png`` files are considered. A file is kept
    when ``parse_filename`` yields an age, ``age_to_class`` maps that age to a
    class index, and OpenCV can read the file. Kept images are converted from
    BGR to RGB and resized to ``img_size``.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, so without sorting the sample order (and therefore any
    seeded train/test split computed from it) could differ between machines.

    Args:
        color_path: Directory to scan.
        img_size: Target size handed to ``cv2.resize``.

    Returns:
        tuple[np.ndarray, np.ndarray]: The images and their class indices.
        Both arrays are empty when the directory is missing or nothing
        qualifies.
    """
    images = []
    ages = []

    if not os.path.exists(color_path):
        # Keep the lenient "return empty arrays" behaviour, but say why.
        print(f"Warning: image directory not found: {color_path}")
        return np.array(images), np.array(ages)

    for filename in sorted(os.listdir(color_path)):
        if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        age = parse_filename(filename)
        if age is None:
            continue

        class_idx = age_to_class(age)
        if class_idx is None:
            continue

        img = cv2.imread(os.path.join(color_path, filename))
        if img is None:
            # cv2.imread signals an unreadable file by returning None
            continue

        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        images.append(cv2.resize(img_rgb, img_size))
        ages.append(class_idx)

    return np.array(images), np.array(ages)

def optimal_augment(image):
    """Apply light random augmentation to a single image.

    The input is cast to ``uint8`` and each step below is applied
    independently with the given probability:

    * horizontal flip (0.4)
    * rotation by -8..8 degrees about the image centre (0.3)
    * contrast (0.9..1.1) and brightness (-8..8) change (0.5)
    * hue shift (-5..5) and saturation scaling (0.95..1.05) in HSV (0.25)
    * additive Gaussian noise, standard deviation 2 (0.15)

    Args:
        image: Image array; the color conversions treat it as RGB.

    Returns:
        np.ndarray: The augmented ``uint8`` image.
    """
    image = image.astype(np.uint8)

    # Horizontal flip
    if random.random() < 0.4:
        image = cv2.flip(image, 1)

    # Small rotation; the output keeps the input's width and height
    if random.random() < 0.3:
        angle = random.uniform(-8, 8)
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))

    # Contrast (alpha) and brightness (beta)
    if random.random() < 0.5:
        alpha = random.uniform(0.9, 1.1)
        beta = random.randint(-8, 8)
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)

    # Hue / saturation jitter
    if random.random() < 0.25:
        try:
            hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
            h_shift = random.randint(-5, 5)
            # Hue is an angle: for 8-bit images OpenCV stores it in [0, 179],
            # so a shift must wrap around rather than saturate at the ends.
            hsv[:, :, 0] = ((hsv[:, :, 0].astype(np.int16) + h_shift) % 180).astype(np.uint8)
            s_factor = random.uniform(0.95, 1.05)
            hsv[:, :, 1] = np.clip(hsv[:, :, 1].astype(np.float32) * s_factor, 0, 255).astype(np.uint8)
            image = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
        except cv2.error:
            # Best effort: if OpenCV cannot convert this image, leave the
            # colors untouched. Only OpenCV errors are suppressed, so other
            # problems (and KeyboardInterrupt) still surface.
            pass

    # Additive Gaussian noise, computed in int16 to avoid uint8 wrap-around
    if random.random() < 0.15:
        noise = np.random.normal(0, 2, image.shape).astype(np.int16)
        image_int16 = image.astype(np.int16)
        noisy_image = np.clip(image_int16 + noise, 0, 255)
        image = noisy_image.astype(np.uint8)

    return image

class OptimalDeerDataset(Dataset):
    """In-memory image dataset that yields normalized, channels-first tensors.

    Args:
        X: Indexable collection of images.
        y: Integer class labels, one per image.
        augment: When true, each image goes through ``optimal_augment``
            before being converted to a tensor.
    """

    def __init__(self, X, y, augment=False):
        self.augment = augment
        self.X = X
        self.y = torch.LongTensor(y)
        # Per-channel normalization constants, shaped to broadcast over
        # (3, H, W). These are the values commonly used for ImageNet-pretrained models.
        channel_mean = [0.485, 0.456, 0.406]
        channel_std = [0.229, 0.224, 0.225]
        self.mean = torch.tensor(channel_mean).view(3, 1, 1)
        self.std = torch.tensor(channel_std).view(3, 1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(image_tensor, label)`` for sample ``idx``."""
        pixels = self.X[idx].copy()
        target = self.y[idx].clone()

        if self.augment:
            pixels = optimal_augment(pixels)

        tensor = torch.FloatTensor(pixels)

        # Values above 1.0 are taken to mean a 0-255 range and are rescaled to 0-1
        if tensor.max() > 1.0:
            tensor = tensor / 255.0

        # (H, W, 3) -> (3, H, W)
        channels_last = tensor.dim() == 3 and tensor.shape[-1] == 3
        if channels_last:
            tensor = tensor.permute(2, 0, 1)

        return (tensor - self.mean) / self.std, target

def create_optimized_efficientnet_b0(config_name="best"):
    """Build a pretrained 5-class EfficientNet-B0 and freeze part of it.

    ``config_name`` selects a dropout rate and a freezing mode:

    * ``"best"``: drop rate 0.3, "light" freezing. Parameters whose name
      contains ``blocks.0.`` or ``blocks.1.`` are frozen.
    * ``"alternative"``: drop rate 0.5, "standard" freezing. Everything is
      frozen except parameters whose name contains ``blocks.6.``,
      ``blocks.7.`` or ``classifier``.
    * ``"conservative"``: drop rate 0.4, "heavy" freezing. Everything is
      frozen except parameters whose name contains ``blocks.7.`` or
      ``classifier``.

    Returns:
        The model, moved to the module-level ``device``.

    Raises:
        KeyError: If ``config_name`` is not one of the names above.
    """
    presets = {
        "best": {"drop_rate": 0.3, "freezing": "light"},
        "alternative": {"drop_rate": 0.5, "freezing": "standard"},
        "conservative": {"drop_rate": 0.4, "freezing": "heavy"}
    }
    preset = presets[config_name]

    net = timm.create_model(
        'efficientnet_b0',
        pretrained=True,
        num_classes=5,
        drop_rate=preset["drop_rate"],
    )

    # For these modes, only parameters matching one of the markers stay trainable
    trainable_markers = {
        "standard": ('blocks.6.', 'blocks.7.', 'classifier'),
        "heavy": ('blocks.7.', 'classifier'),
    }

    mode = preset["freezing"]
    if mode == "light":
        frozen_markers = ('blocks.0.', 'blocks.1.')
        for pname, weight in net.named_parameters():
            if any(marker in pname for marker in frozen_markers):
                weight.requires_grad = False
    elif mode in trainable_markers:
        keep = trainable_markers[mode]
        for pname, weight in net.named_parameters():
            if not any(marker in pname for marker in keep):
                weight.requires_grad = False

    return net.to(device)

def train_fold_model(X_fold_train, y_fold_train, X_fold_val, y_fold_val, fold_num, config_name="best"):
    """Train one EfficientNet-B0 on a single cross-validation fold.

    Uses AdamW, cosine annealing of the learning rate down to 1e-7,
    cross-entropy with label smoothing 0.1, and mixup on a batch with
    probability 0.2 (mixing weight drawn from Beta(0.4, 0.4)). Training stops
    early once validation accuracy has not improved for ``patience`` epochs.
    The weights from the best validation epoch are restored before returning.

    Relies on the module-level ``device``, ``use_amp``, ``autocast`` and (when
    ``use_amp`` is true) ``scaler``.

    Args:
        X_fold_train, y_fold_train: Training images and labels for this fold.
        X_fold_val, y_fold_val: Validation images and labels for this fold.
        fold_num: Fold number, used only in progress messages.
        config_name: Passed to ``create_optimized_efficientnet_b0`` to build
            the model. It also selects the training hyperparameters; names
            without an entry here fall back to the "best" hyperparameters.

    Returns:
        tuple: ``(model, best_val_acc)`` where ``best_val_acc`` is the best
        validation accuracy in percent.
    """

    # Conservative hyperparameters for smaller fold datasets
    optimal_params = {
        "best": {
            "lr": 0.0005, "weight_decay": 0.05, "batch_size": 16,
            "epochs": 120, "patience": 30
        },
        "alternative": {
            "lr": 0.0003, "weight_decay": 0.07, "batch_size": 12,
            "epochs": 120, "patience": 30
        }
    }

    params = optimal_params.get(config_name, optimal_params["best"])

    # Datasets and loaders: augmentation on the training side only
    train_dataset = OptimalDeerDataset(X_fold_train, y_fold_train, augment=True)
    val_dataset = OptimalDeerDataset(X_fold_val, y_fold_val, augment=False)

    train_loader = DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=False, num_workers=0)

    model = create_optimized_efficientnet_b0(config_name)

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=params["lr"], weight_decay=params["weight_decay"])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=params["epochs"], eta_min=1e-7)

    def batch_loss(outputs, labels, mix):
        # `mix` is None for a plain batch, or (lam, shuffled_labels) for mixup
        if mix is None:
            return criterion(outputs, labels)
        lam, shuffled_labels = mix
        return lam * criterion(outputs, labels) + (1 - lam) * criterion(outputs, shuffled_labels)

    best_val_acc = 0.0
    patience_counter = 0
    best_state = None

    print(f"    Training {config_name} model for fold {fold_num}")

    for epoch in range(params["epochs"]):
        # ---- Training ----
        model.train()
        train_correct = 0
        train_total = 0

        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            # Light mixup: blend each image with another one from the same batch
            if random.random() < 0.2:
                lam = np.random.beta(0.4, 0.4)
                index = torch.randperm(images.size(0)).to(device)
                inputs = lam * images + (1 - lam) * images[index, :]
                mix = (lam, labels[index])
            else:
                inputs = images
                mix = None

            if use_amp:
                with autocast():
                    outputs = model(inputs)
                    loss = batch_loss(outputs, labels, mix)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(inputs)
                loss = batch_loss(outputs, labels, mix)
                loss.backward()
                optimizer.step()

            # For mixup batches, accuracy is measured against the un-shuffled labels
            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

            if batch_idx % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        train_acc = 100 * train_correct / train_total
        scheduler.step()

        # ---- Validation ----
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)

                if use_amp:
                    with autocast():
                        outputs = model(images)
                else:
                    outputs = model(images)

                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_acc = 100 * val_correct / val_total

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            # state_dict() returns references to the live parameter tensors,
            # and dict.copy() is shallow, so the tensors must be cloned.
            # Otherwise the "best" snapshot keeps changing as training goes on
            # and restoring it at the end is a no-op.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        else:
            patience_counter += 1

        if epoch % 20 == 0 or patience_counter >= params["patience"]:
            print(f"      Epoch {epoch:2d}: Train {train_acc:.1f}%, Val {val_acc:.1f}%")

        if patience_counter >= params["patience"]:
            print(f"      Early stopping at epoch {epoch}")
            break

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Roll back to the best validation epoch (if any epoch scored above 0%)
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_val_acc

def evaluate_with_comprehensive_tta(model, test_loader):
    """Compute accuracy with test-time augmentation (TTA).

    For every batch the softmax outputs of six views are averaged before the
    arg-max is taken: the original images, a horizontal flip, two rescaled
    versions (factors 0.9 and 1.1, resized back to the input resolution) and
    two centre crops (factors 0.85 and 0.95, resized back likewise).

    The working resolution is read from each batch instead of being fixed at
    224x224, so every view has the same spatial size as the input. For
    224x224 inputs the views are the same as with the fixed size.

    Relies on the module-level ``device``, ``use_amp`` and ``autocast``.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        test_loader: Iterable of ``(images, labels)`` batches.

    Returns:
        float: Accuracy in percent.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0

    def predict_proba(batch):
        # One forward pass (under autocast when AMP is on) -> class probabilities
        if use_amp:
            with autocast():
                logits = model(batch)
        else:
            logits = model(batch)
        return F.softmax(logits, dim=1)

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            height, width = images.shape[-2:]

            # Original
            predictions = [predict_proba(images)]

            # Horizontal flip (dim 3 is the last, i.e. width, axis)
            predictions.append(predict_proba(torch.flip(images, [3])))

            # Multi-scale: shrink/enlarge, then return to the input resolution
            for scale in [0.9, 1.1]:
                scaled = F.interpolate(images, scale_factor=scale, mode='bilinear', align_corners=False)
                scaled = F.interpolate(scaled, size=(height, width), mode='bilinear', align_corners=False)
                predictions.append(predict_proba(scaled))

            # Centre crops, resized back to the input resolution
            for crop_factor in [0.85, 0.95]:
                crop_h = int(height * crop_factor)
                crop_w = int(width * crop_factor)
                top = (height - crop_h) // 2
                left = (width - crop_w) // 2
                cropped = images[:, :, top:top+crop_h, left:left+crop_w]
                cropped = F.interpolate(cropped, size=(height, width), mode='bilinear', align_corners=False)
                predictions.append(predict_proba(cropped))

            # Average all TTA predictions
            avg_output = torch.stack(predictions).mean(0)
            _, predicted = torch.max(avg_output, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct / total

def evaluate_ensemble_with_tta(all_fold_models, test_loader):
    """Compute the accuracy of a model ensemble with test-time augmentation.

    For every model, the softmax outputs of four views are averaged: the
    original images, a horizontal flip, and two rescaled versions (factors
    0.9 and 1.1, resized back to the input resolution). These per-model
    averages are then averaged over all models before the arg-max is taken.

    The working resolution is read from each batch instead of being fixed at
    224x224; for 224x224 inputs the views are the same as with the fixed size.

    Relies on the module-level ``device``, ``use_amp`` and ``autocast``. The
    accumulator is sized for 5 classes.

    Args:
        all_fold_models: List of per-fold lists of models; every model is
            switched to eval mode.
        test_loader: Iterable of ``(images, labels)`` batches.

    Returns:
        float: Ensemble accuracy in percent.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no samples.
    """
    for fold_models in all_fold_models:
        for model in fold_models:
            model.eval()

    correct = 0
    total = 0

    def predict_proba(net, batch):
        # One forward pass (under autocast when AMP is on) -> class probabilities
        if use_amp:
            with autocast():
                logits = net(batch)
        else:
            logits = net(batch)
        return F.softmax(logits, dim=1)

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            height, width = images.shape[-2:]

            ensemble_output = torch.zeros(images.size(0), 5).to(device)
            model_count = 0

            # Collect predictions from all models across all folds
            for fold_models in all_fold_models:
                for model in fold_models:
                    # Original
                    tta_predictions = [predict_proba(model, images)]

                    # Horizontal flip (dim 3 is the last, i.e. width, axis)
                    tta_predictions.append(predict_proba(model, torch.flip(images, [3])))

                    # Multi-scale: shrink/enlarge, then return to the input resolution
                    for scale in [0.9, 1.1]:
                        scaled = F.interpolate(images, scale_factor=scale, mode='bilinear', align_corners=False)
                        scaled = F.interpolate(scaled, size=(height, width), mode='bilinear', align_corners=False)
                        tta_predictions.append(predict_proba(model, scaled))

                    # Average TTA for this model, then add it to the ensemble sum
                    model_avg = torch.stack(tta_predictions).mean(0)
                    ensemble_output += model_avg
                    model_count += 1

            # Final ensemble prediction
            ensemble_output /= model_count
            _, predicted = torch.max(ensemble_output, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct / total

# Main Multi-Fold Ensemble Pipeline
def main():
    """Train a multi-fold EfficientNet-B0 ensemble and evaluate it on a held-out split.

    Pipeline:
      1. Load the color images and reserve a stratified 20% split as the final
         test set.
      2. Run stratified 5-fold cross-validation on the remaining pool, training
         one model per entry of ``model_configs`` in every fold and saving each
         model's weights into a timestamped output directory.
      3. Report cross-validation statistics. Folds in which every configuration
         failed are reported as failed and excluded from the mean/std instead
         of being counted as 0%.
      4. Evaluate all trained models together on the held-out split through
         ``evaluate_ensemble_with_tta``, then save the ensemble checkpoint and
         a JSON summary.

    Relies on names defined elsewhere in the notebook: ``color_path``,
    ``load_color_images_only``, ``train_fold_model``, ``OptimalDeerDataset``
    and ``evaluate_ensemble_with_tta``.

    Raises:
        RuntimeError: if no model could be trained in any fold, since there
            would be nothing to ensemble or evaluate.
    """
    # Local imports: neither name is among the imports at the top of the
    # notebook, so importing here keeps this cell runnable on a fresh kernel.
    import json
    from sklearn.model_selection import StratifiedKFold

    baseline_acc = 72.6  # single-model accuracy this experiment is compared against
    target_acc = 75.0    # accuracy the experiment is trying to reach

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"efficientnet_b0_multifold_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)

    print("="*80)
    print("EFFICIENTNET-B0 MULTI-FOLD ENSEMBLE ON COLOR IMAGES")
    print("="*80)
    print(f"Goal: Push beyond {baseline_acc:.1f}% using multi-fold ensemble")
    print("Strategy: 5-fold CV with multiple optimal configurations")

    # Load color images only
    print("\nLoading color images...")
    X, y = load_color_images_only(color_path)

    # Reserve final test set; it is never seen during cross-validation
    X_train_all, X_test_final, y_train_all, y_test_final = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"Training pool: {len(X_train_all)}, Final test: {len(X_test_final)}")
    print(f"Training distribution: {Counter(y_train_all)}")
    print(f"Test distribution: {Counter(y_test_final)}")

    # 5-fold cross-validation
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

    # Model configurations to train per fold
    model_configs = ["best", "alternative"]

    print(f"\nTraining {n_folds} folds × {len(model_configs)} configs = {n_folds * len(model_configs)} models")
    print("Estimated time: 2-3 hours")

    all_fold_models = []  # one list of trained models per fold (may be empty)
    fold_results = []     # mean validation accuracy per fold (0.0 if the fold failed)

    for fold_num, (train_idx, val_idx) in enumerate(skf.split(X_train_all, y_train_all), start=1):
        print(f"\n{'='*60}")
        print(f"FOLD {fold_num}/{n_folds}")
        print(f"{'='*60}")

        X_fold_train = X_train_all[train_idx]
        y_fold_train = y_train_all[train_idx]
        X_fold_val = X_train_all[val_idx]
        y_fold_val = y_train_all[val_idx]

        print(f"Fold {fold_num}: Train {len(X_fold_train)}, Val {len(X_fold_val)}")

        fold_models = []
        fold_scores = []

        for config_name in model_configs:
            print(f"\n  Training {config_name} configuration...")

            try:
                model, val_acc = train_fold_model(
                    X_fold_train, y_fold_train, X_fold_val, y_fold_val,
                    fold_num, config_name
                )
                # Plain float keeps the score safe for formatting and JSON export
                val_acc = float(val_acc)

                fold_models.append(model)
                fold_scores.append(val_acc)

                print(f"    {config_name}: {val_acc:.1f}%")

                # Save individual model
                model_filename = f"fold_{fold_num}_{config_name}_{val_acc:.1f}pct.pth"
                model_path = os.path.join(output_dir, model_filename)
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'fold': fold_num,
                    'config': config_name,
                    'accuracy': val_acc
                }, model_path)

            except Exception as e:
                # Best effort: one failed configuration must not abort the run
                print(f"    Error with {config_name}: {e}")
            finally:
                # Release cached GPU memory after failures too, so the next
                # configuration does not start with a fragmented cache
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        all_fold_models.append(fold_models)
        fold_avg = float(np.mean(fold_scores)) if fold_scores else 0.0
        fold_results.append(fold_avg)

        print(f"\nFold {fold_num} average: {fold_avg:.1f}%")
        print(f"Individual scores: {[f'{score:.1f}%' for score in fold_scores]}")

    total_models = sum(len(models) for models in all_fold_models)
    if total_models == 0:
        # Nothing to ensemble: fail loudly instead of evaluating an empty ensemble
        raise RuntimeError("No models were trained successfully; cannot build an ensemble")

    # Cross-validation results
    print(f"\n{'='*80}")
    print("CROSS-VALIDATION RESULTS")
    print(f"{'='*80}")

    # Only folds that produced at least one model contribute to the statistics;
    # a failed fold's placeholder 0.0 would otherwise drag the mean down.
    successful_scores = [
        score for score, models in zip(fold_results, all_fold_models) if models
    ]
    cv_mean = float(np.mean(successful_scores))
    cv_std = float(np.std(successful_scores))

    print("Fold averages:")
    for i, (score, models) in enumerate(zip(fold_results, all_fold_models), 1):
        if models:
            print(f"  Fold {i}: {score:.1f}%")
        else:
            print(f"  Fold {i}: failed (excluded from CV statistics)")

    print(f"\nCV Mean: {cv_mean:.1f}% ± {cv_std:.1f}%")

    # Final ensemble evaluation on held-out test set
    print(f"\n{'='*80}")
    print("FINAL ENSEMBLE EVALUATION")
    print(f"{'='*80}")

    test_dataset = OptimalDeerDataset(X_test_final, y_test_final, augment=False)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0)

    print("Evaluating ensemble with comprehensive TTA...")
    final_ensemble_accuracy = float(evaluate_ensemble_with_tta(all_fold_models, test_loader))

    # Save ensemble
    ensemble_save_path = os.path.join(output_dir, f"efficientnet_b0_ensemble_{final_ensemble_accuracy:.1f}pct.pth")
    torch.save({
        'all_models_state_dicts': [
            [model.state_dict() for model in fold_models]
            for fold_models in all_fold_models
        ],
        'model_configs': model_configs,
        'fold_scores': fold_results,
        'cv_mean': cv_mean,
        'cv_std': cv_std,
        'final_test_accuracy': final_ensemble_accuracy,
        'n_folds': n_folds,
        'total_models': total_models,
        'color_only': True,
        'architecture': 'efficientnet_b0'
    }, ensemble_save_path)

    # Final results
    print(f"\n{'='*80}")
    print("FINAL RESULTS")
    print(f"{'='*80}")

    print(f"Cross-validation: {cv_mean:.1f}% ± {cv_std:.1f}%")
    print(f"Final ensemble test accuracy: {final_ensemble_accuracy:.1f}%")
    print(f"Total models in ensemble: {total_models}")

    # Compare to previous results
    print("\nCOMPARISON:")
    print(f"  Single EfficientNetB0 (original): {baseline_acc:.1f}%")
    print("  Hyperparameter optimization: 69.9%")
    print(f"  Multi-fold ensemble: {final_ensemble_accuracy:.1f}%")

    if final_ensemble_accuracy >= target_acc:
        print(f"\nSUCCESS: {target_acc:.0f}% TARGET ACHIEVED!")
        print("Multi-fold ensemble approach worked!")
    elif final_ensemble_accuracy > baseline_acc:
        improvement = final_ensemble_accuracy - baseline_acc
        print(f"\nIMPROVEMENT: +{improvement:.1f}% over single model")
        gap = target_acc - final_ensemble_accuracy
        print(f"Gap to {target_acc:.0f}%: {gap:.1f}%")
    else:
        print(f"\nNo improvement over single model ({baseline_acc:.1f}%)")
        print("Dataset size may be the fundamental limiting factor")

    print(f"\nEnsemble saved: {ensemble_save_path}")

    # Save detailed results; every value is a built-in type so json can encode it
    results_summary = {
        'timestamp': timestamp,
        'total_images': len(X),
        'train_images': len(X_train_all),
        'test_images': len(X_test_final),
        'n_folds': n_folds,
        'model_configs': model_configs,
        'fold_results': fold_results,
        'cv_mean': cv_mean,
        'cv_std': cv_std,
        'final_test_accuracy': final_ensemble_accuracy,
        'target_achieved': bool(final_ensemble_accuracy >= target_acc),
        'total_models': total_models,
        'architecture': 'efficientnet_b0',
        'color_only': True
    }

    with open(os.path.join(output_dir, 'ensemble_results.json'), 'w') as f:
        json.dump(results_summary, f, indent=2)

    print(f"Detailed results saved to: {output_dir}")
    print("="*80)

# Entry point: start the full training/evaluation pipeline when this code is
# executed directly. A notebook cell also runs with __name__ == "__main__",
# so executing the cell kicks off training immediately.
if __name__ == "__main__":
    main()

Using device: cuda
GPU: NVIDIA GeForce RTX 2060
Mixed Precision: Enabled
EFFICIENTNET-B0 MULTI-FOLD ENSEMBLE ON COLOR IMAGES
Goal: Push beyond 72.6% using multi-fold ensemble
Strategy: 5-fold CV with multiple optimal configurations

Loading color images...
ERROR! Session/line number was not unique in database. History logging moved to new session 289
Training pool: 288, Final test: 73
Training distribution: Counter({np.int64(4): 70, np.int64(3): 60, np.int64(2): 60, np.int64(1): 53, np.int64(0): 45})
Test distribution: Counter({np.int64(4): 18, np.int64(3): 15, np.int64(2): 15, np.int64(1): 13, np.int64(0): 12})

Training 5 folds × 2 configs = 10 models
Estimated time: 2-3 hours

FOLD 1/5
Fold 1: Train 230, Val 58

  Training best configuration...
    Training best model for fold 1
      Epoch  0: Train 24.3%, Val 39.7%
      Epoch 20: Train 91.3%, Val 53.4%
      Epoch 40: Train 88.3%, Val 53.4%
      Epoch 60: Train 90.0%, Val 50.0%
      Epoch 66: Train 90.0%, Val 58.6%
      Early 