In [None]:
# CUDA Diagnosis Script
import torch
import subprocess
import sys

def diagnose_cuda():
    """Print a CUDA/PyTorch diagnostic report and say whether CUDA is usable.

    Reports the PyTorch version and ``torch.cuda.is_available()``. When CUDA
    is usable, the CUDA version and every visible GPU are listed. Otherwise
    the report states whether this PyTorch build was compiled with CUDA and
    probes for an NVIDIA driver by running ``nvidia-smi``.

    Returns:
        bool: True when ``torch.cuda.is_available()`` is true, else False.
    """
    print("CUDA DIAGNOSIS")
    print("="*50)

    # Check PyTorch version
    print(f"PyTorch Version: {torch.__version__}")

    # Query availability once so the report and the return value cannot disagree
    cuda_available = torch.cuda.is_available()
    print(f"CUDA Available: {cuda_available}")

    if cuda_available:
        print(f"CUDA Version: {torch.version.cuda}")
        print(f"GPU Count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print("✅ CUDA is working!")
        return True

    print("❌ CUDA not available")

    # torch.version.cuda is None on builds without CUDA support.
    # torch.backends.cudnn.enabled is only a user toggle (True by default),
    # so it cannot tell a CPU-only wheel from a CUDA one.
    cuda_build = torch.version.cuda is not None
    print(f"CUDA compiled into PyTorch: {cuda_build}")

    # Try to detect NVIDIA GPU
    try:
        # The timeout keeps a wedged driver from hanging the diagnosis
        result = subprocess.run(
            ['nvidia-smi'], capture_output=True, text=True, timeout=30
        )
    except FileNotFoundError:
        print("❌ nvidia-smi not found - CUDA drivers may not be installed")
    except subprocess.TimeoutExpired:
        print("❌ nvidia-smi did not respond - NVIDIA driver may be unhealthy")
    else:
        if result.returncode == 0:
            print("✅ NVIDIA GPU detected via nvidia-smi")
            if cuda_build:
                # The wheel already has CUDA, so reinstalling PyTorch will not help
                print(f"❌ But PyTorch can't access it - check that the NVIDIA driver supports CUDA {torch.version.cuda}")
            else:
                print("❌ But PyTorch can't access it - need CUDA-enabled PyTorch")
        else:
            print("❌ No NVIDIA GPU detected")

    return False

def get_fix_commands():
    """Print the manual steps for swapping in a CUDA-enabled PyTorch build.

    The output is a fixed four-step checklist (uninstall, reinstall from the
    cu121 or cu118 wheel index, verify, driver troubleshooting). Nothing is
    executed and nothing is returned.
    """
    checklist = (
        "\nFIX COMMANDS",
        "=" * 50,
        # Step 1: remove the existing install
        "1. UNINSTALL CURRENT PYTORCH:",
        "   pip uninstall torch torchvision torchaudio -y",
        "",
        # Step 2: reinstall from the wheel index matching the GPU generation
        "2. INSTALL CUDA-ENABLED PYTORCH:",
        "   For RTX 40-series, RTX 30-series, or newer:",
        "   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121",
        "",
        "   For older GPUs (GTX 10-series, RTX 20-series):",
        "   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118",
        "",
        # Step 3: one-liner the user can paste into a shell
        "3. VERIFY INSTALLATION:",
        "   python -c \"import torch; print(f'CUDA: {torch.cuda.is_available()}')\"",
        "",
        # Step 4: driver-level fallbacks
        "4. IF STILL ISSUES:",
        "   - Check NVIDIA drivers: nvidia-smi",
        "   - Update drivers from NVIDIA website",
        "   - Restart after driver update",
    )
    for entry in checklist:
        print(entry)

def quick_test():
    """Run a small end-to-end smoke test on the GPU.

    Two checks are made when CUDA is available: a 100x100 matrix multiply on
    the device, then a forward pass of a timm ResNet-18 on a random
    1x3x224x224 input. The model is created with random weights because only
    the ability to run on the GPU is being tested, so no download is needed.

    Returns:
        bool: True when both checks pass. False when CUDA is unavailable,
        ``timm`` cannot be imported, or either check raises.
    """
    if not torch.cuda.is_available():
        print("❌ Cannot test - CUDA not available")
        return False

    try:
        # Test basic CUDA operations
        x = torch.randn(100, 100).cuda()
        y = torch.randn(100, 100).cuda()
        torch.mm(x, y)
        # CUDA kernels are launched asynchronously; synchronizing makes a
        # failed kernel raise here rather than at some later call.
        torch.cuda.synchronize()
        print("✅ CUDA tensor operations working!")
    except Exception as e:
        print(f"❌ CUDA test failed: {e}")
        return False

    try:
        import timm
    except ImportError as e:
        # A missing package is not a CUDA problem; report it as such
        print(f"❌ Model test skipped - timm is not installed: {e}")
        return False

    try:
        # Test model creation (pretrained=False: weights are irrelevant here
        # and fetching them would make the check depend on the network)
        model = timm.create_model('resnet18', pretrained=False, num_classes=5)
        model = model.cuda()
        model.eval()

        # Test forward pass
        test_input = torch.randn(1, 3, 224, 224).cuda()
        with torch.no_grad():
            model(test_input)
        torch.cuda.synchronize()

        print("✅ CUDA model inference working!")
        print("🚀 Ready for GPU training!")
        return True

    except Exception as e:
        print(f"❌ CUDA test failed: {e}")
        return False

if __name__ == "__main__":
    # Diagnose first; the outcome decides between a GPU smoke test and
    # printing the reinstall instructions.
    cuda_works = diagnose_cuda()

    if cuda_works:
        quick_test()
        print("\n🎯 CUDA is ready - your training should be using GPU!")
        print("If your script still shows CPU, restart your Python kernel.")
    else:
        get_fix_commands()
        for follow_up in (
            "\n" + "="*50,
            "IMPORTANT: After installing CUDA PyTorch:",
            "1. Restart your Python kernel/notebook",
            "2. Re-run your training script",
            "3. Should see 'Device: cuda' instead of 'Device: cpu'",
            "4. Training will be 5-10x faster!",
        ):
            print(follow_up)

In [None]:
# 95% accuracy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import random
import time
import json
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold, train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Mixed precision imports
try:
    from torch.cuda.amp import autocast, GradScaler
except ImportError:
    # torch.cuda.amp is missing: disable mixed precision and provide a
    # do-nothing context manager so `with autocast():` still works.
    MIXED_PRECISION_AVAILABLE = False

    class autocast:
        """No-op stand-in for ``torch.cuda.amp.autocast``."""

        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            # Returning None lets any exception propagate unchanged
            return None
else:
    MIXED_PRECISION_AVAILABLE = True

def load_original_data(fpath=None, max_age=5.5):
    """Load the original images and their age labels.

    Args:
        fpath: Glob pattern passed to ``buck.analysis.basics.ingest_images``.
            ``None`` (the default) selects the original hard-coded dataset
            location, so existing zero-argument calls behave as before.
        max_age: Ages at or above this value are merged into a single class
            labelled ``max_age``.

    Returns:
        tuple: ``(images, ages_grouped)`` - the images exactly as returned by
        ``ingest_images`` and a list of ages with the cap applied.

    Raises:
        FileNotFoundError: If ``ingest_images`` returns no images.
        Exception: Any other loading error is printed and re-raised.
    """
    if fpath is None:
        fpath = "G:\\Dropbox\\AI Projects\\buck\\images\\squared\\color\\*_NDA.png"

    try:
        from buck.analysis.basics import ingest_images
        images, ages = ingest_images(fpath)
        if len(images) == 0:
            # Fail here with a clear message instead of deep inside training
            raise FileNotFoundError(f"No images matched {fpath!r}")
        ages_grouped = [max_age if age >= max_age else age for age in ages]
        print(f"Loaded {len(images)} images")
        print(f"Distribution: {dict(Counter(ages_grouped))}")
        return images, ages_grouped
    except Exception as e:
        print(f"ERROR: {e}")
        raise

def enhanced_augment_image(image):
    """Return a randomly augmented ``uint8`` version of ``image``.

    Each step is applied independently with its own probability: rotation by
    up to +/-12 degrees (p=0.7), horizontal flip (p=0.5), contrast/brightness
    scaling (p=0.8), gamma correction (p=0.4) and additive Gaussian noise
    with sigma 6 (p=0.3).

    Args:
        image: NumPy image array whose first two axes are height and width.
            Non-``uint8`` input is converted first: floating arrays whose
            maximum is <= 1.0 are treated as normalised and rescaled to
            0-255 (the same threshold ``OptimizedDeerDataset`` uses), and all
            values are clipped to 0-255 before the cast.

    Returns:
        np.ndarray: The augmented image with dtype ``uint8``.
    """
    if image.dtype != np.uint8:
        if np.issubdtype(image.dtype, np.floating) and image.size and image.max() <= 1.0:
            # A bare uint8 cast would truncate a [0, 1] image to all black
            image = image * 255.0
        # Clip so out-of-range values saturate instead of wrapping around
        image = np.clip(image, 0, 255).astype(np.uint8)

    # Core augmentations that preserve antler features
    if random.random() < 0.7:
        angle = random.uniform(-12, 12)  # Reduced rotation to preserve antler orientation
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))

    if random.random() < 0.5:
        image = cv2.flip(image, 1)

    # Enhanced lighting (important for outdoor deer photos)
    if random.random() < 0.8:
        alpha = random.uniform(0.75, 1.25)
        beta = random.randint(-20, 20)
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)

    # Gamma correction for lighting conditions
    if random.random() < 0.4:
        gamma = random.uniform(0.8, 1.2)
        inv_gamma = 1.0 / gamma
        # 256-entry lookup table built in one vectorised expression
        table = (((np.arange(256) / 255.0) ** inv_gamma) * 255).astype("uint8")
        image = cv2.LUT(image, table)

    # Realistic noise
    if random.random() < 0.3:
        noise = np.random.normal(0, 6, image.shape).astype(np.int16)
        # Add in int16 so the sum cannot overflow before clipping
        noisy_image = np.clip(image.astype(np.int16) + noise, 0, 255)
        image = noisy_image.astype(np.uint8)

    return image

def create_optimized_augmented_data(X_train, y_train, multiplier=40):
    """Build a class-balanced training set from originals plus augmented copies.

    Every class present in ``y_train`` is brought to the same size,
    ``multiplier`` times the size of the largest class. Each class contributes
    its original images up to four times; the rest are produced by
    ``enhanced_augment_image`` from randomly chosen originals of that class.

    Args:
        X_train: Images, indexable along the first axis (array-like).
        y_train: Class label per image (array-like). Labels need not be
            contiguous or start at zero.
        multiplier: Target size per class as a multiple of the largest class.
            Values below 4 reduce the number of verbatim original copies so
            the target is never exceeded.

    Returns:
        tuple: ``(X_aug, y_aug)`` as NumPy arrays, grouped by class in
        ascending label order.
    """
    print(f"OPTIMIZED AUGMENTATION ({multiplier}x)")
    print("="*40)

    # Boolean masking below needs real arrays, not lists
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    if y_train.size == 0:
        print("   Total: 0 samples")
        return np.array([]), np.array([])

    class_counts = Counter(y_train.tolist())
    max_count = max(class_counts.values())
    target_count = max_count * multiplier

    # Add originals 4 times to preserve signal, but never more often than
    # the multiplier allows, or the largest class would overshoot the target.
    original_copies = max(0, min(4, multiplier))

    X_aug = []
    y_aug = []

    # Iterate the labels that actually occur: range(n_classes) would visit a
    # label with no samples (and skip a real one) when labels have gaps.
    for class_idx in sorted(class_counts):
        class_images = X_train[y_train == class_idx]
        current_count = len(class_images)

        print(f"   Class {class_idx}: {current_count} -> {target_count}")

        for _ in range(original_copies):
            X_aug.extend(class_images)
            y_aug.extend([class_idx] * current_count)

        # Generate remaining augmented samples
        needed = target_count - (current_count * original_copies)
        for _ in range(needed):
            orig_idx = random.randint(0, current_count - 1)
            aug_img = enhanced_augment_image(class_images[orig_idx].copy())
            X_aug.append(aug_img)
            y_aug.append(class_idx)

    print(f"   Total: {len(X_aug)} samples")
    return np.array(X_aug), np.array(y_aug)

class OptimizedDeerDataset(Dataset):
    """In-memory image dataset yielding normalised 224x224 channel-first tensors.

    All images and labels are converted to tensors up front. Each access
    rescales to [0, 1] when needed, moves a trailing 3-channel axis to the
    front, resizes to 224x224, optionally flips, and standardises with the
    stored per-channel mean and std.
    """

    def __init__(self, X, y, test_time_aug=False):
        """Store the data as tensors.

        Args:
            X: Images as an ndarray or anything ``np.array`` accepts.
            y: Integer labels as an ndarray or anything ``np.array`` accepts.
            test_time_aug: When True, each access flips the image
                horizontally with probability 0.5.
        """
        image_array = X if isinstance(X, np.ndarray) else np.array(X)
        label_array = y if isinstance(y, np.ndarray) else np.array(y)
        self.X = torch.FloatTensor(image_array)
        self.y = torch.LongTensor(label_array)
        self.test_time_aug = test_time_aug
        # Standard ImageNet channel statistics, shaped to broadcast over (3, H, W)
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for ``idx`` after preprocessing."""
        sample = self.X[idx].clone()
        target = self.y[idx].clone()

        # Anything brighter than 1.0 is taken to be on a 0-255 scale
        if sample.max() > 1.0:
            sample = sample / 255.0

        # A trailing axis of size 3 is treated as channels: HWC -> CHW
        is_channels_last = sample.dim() == 3 and sample.shape[-1] == 3
        if is_channels_last:
            sample = sample.permute(2, 0, 1)

        if tuple(sample.shape[-2:]) != (224, 224):
            # interpolate() needs a batch axis, added here and removed after
            batched = sample.unsqueeze(0)
            resized = F.interpolate(batched, size=(224, 224), mode='bilinear', align_corners=False)
            sample = resized.squeeze(0)

        # Random horizontal flip (axis 2 is width in CHW); drawn per access
        if self.test_time_aug and random.random() < 0.5:
            sample = torch.flip(sample, [2])

        normalised = (sample - self.mean) / self.std
        return normalised, target

class CrossValidationTrainer:
    """Cross-validation trainer for reliable 70% test accuracy"""
    
    def __init__(self, num_classes=5):
        self.num_classes = num_classes
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        print(f"CROSS-VALIDATION TRAINER - TARGET 70%")
        print(f"   Device: {self.device}")
        
        if torch.cuda.is_available():
            print(f"   GPU: {torch.cuda.get_device_name(0)}")
            torch.backends.cudnn.benchmark = True
            if MIXED_PRECISION_AVAILABLE:
                self.scaler = GradScaler()
                self.use_amp = True
                print(f"   Mixed Precision: Enabled")
            else:
                self.use_amp = False
        else:
            self.use_amp = False
    
    def create_optimal_model(self):
        """Build a pretrained timm ResNet-18 with its early stages frozen.

        Parameters whose names start with ``conv1``, ``bn1``, ``layer1``,
        ``layer2`` or ``layer3`` have ``requires_grad`` switched off; all
        other parameters stay trainable.

        Returns:
            The model, moved to ``self.device``.
        """
        # str.startswith accepts a tuple, so one call covers every prefix
        frozen_prefixes = ('conv1', 'bn1', 'layer1', 'layer2', 'layer3')

        net = timm.create_model('resnet18', pretrained=True, num_classes=self.num_classes)
        for param_name, weights in net.named_parameters():
            if param_name.startswith(frozen_prefixes):
                weights.requires_grad = False

        return net.to(self.device)
    
    def train_single_model(self, train_loader, val_loader, model_name="model"):
        """Train a single model with optimal hyperparameters"""
        model = self.create_optimal_model()
        
        # Optimized hyperparameters based on your best results
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        
        # Differential learning rates
        backbone_params = []
        classifier_params = []
        
        for name, param in model.named_parameters():
            if param.requires_grad:
                if 'fc' in name:
                    classifier_params.append(param)
                else:
                    backbone_params.append(param)
        
        optimizer = optim.AdamW([
            {'params': backbone_params, 'lr': 0.0003},
            {'params': classifier_params, 'lr': 0.001}
        ], weight_decay=0.015)
        
        # Cosine annealing
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=70, eta_min=1e-6)
        
        max_epochs = 70
        patience = 20
        best_val_acc = 0.0
        patience_counter = 0
        
        print(f"   Training {model_name}: {max_epochs} epochs, patience={patience}")
        
        for epoch in range(max_epochs):
            # Training
            model.train()
            train_correct = 0
            train_total = 0
            
            for images, labels in train_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                
                if self.use_amp:
                    with autocast():
                        outputs = model(images)
                        loss = criterion(outputs, labels)
                    self.scaler.scale(loss).backward()
                    self.scaler.step(optimizer)
                    self.scaler.update()
                else:
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()
                
                _, predicted = torch.max(outputs, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()
            
            train_acc = 100 * train_correct / train_total
            
            # Validation
            model.eval()
            val_correct = 0
            val_total = 0
            
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(self.device), labels.to(self.device)
                    
                    if self.use_amp:
                        with autocast():
                            outputs = model(images)
                    else:
                        outputs = model(images)
                    
                    _, predicted = torch.max(outputs, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()
            
            val_acc = 100 * val_correct / val_total
            scheduler.step()
            
            # Track best
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_counter = 0
                best_state = model.state_dict().copy()
                status = "BEST"
            else:
                patience_counter += 1
                status = ""
            
            # Progress (less frequent printing)
            if epoch % 10 == 0 or epoch < 3 or status or epoch > max_epochs - 3:
                print(f"     Epoch {epoch:2d}: Train {train_acc:.1f}%, Val {val_acc:.1f}% {status}")
            
            if patience_counter >= patience:
                print(f"     Early stop at epoch {epoch}")
                break
        
        # Restore best model
        if 'best_state' in locals():
            model.load_state_dict(best_state)
        
        print(f"   {model_name} complete: Best val {best_val_acc:.1f}%")
        return model, best_val_acc
    
    def evaluate_with_tta(self, model, test_loader):
        """Evaluate model with test-time augmentation"""
        model.eval()
        test_correct = 0
        test_total = 0
        
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                
                # Original prediction
                if self.use_amp:
                    with autocast():
                        outputs1 = model(images)
                else:
                    outputs1 = model(images)
                
                # Flipped prediction
                flipped = torch.flip(images, [3])
                if self.use_amp:
                    with autocast():
                        outputs2 = model(flipped)
                else:
                    outputs2 = model(flipped)
                
                # Average (TTA)
                avg_outputs = (outputs1 + outputs2) / 2
                _, predicted = torch.max(avg_outputs, 1)
                
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()
        
        return 100 * test_correct / test_total
    
    def run_cross_validation(self, images, ages, n_splits=5):
        """Train one model per stratified fold and collect validation scores.

        Args:
            images: Images as an ndarray or a sequence ``np.array`` accepts.
            ages: Age label per image; distinct values become class indices
                in ascending order.
            n_splits: Number of stratified folds.

        Returns:
            tuple: ``(cv_scores, best_models, label_mapping)`` - best
            validation accuracy per fold, the trained model per fold, and
            the age-to-class-index mapping.
        """
        print(f"\n{n_splits}-FOLD CROSS-VALIDATION")
        print("="*50)

        images = images if isinstance(images, np.ndarray) else np.array(images)
        ages = ages if isinstance(ages, np.ndarray) else np.array(ages)

        # Create label mapping: sorted distinct ages -> 0, 1, 2, ...
        label_mapping = {age: idx for idx, age in enumerate(sorted(set(ages)))}
        y_indices = np.array([label_mapping[age] for age in ages])

        # Stratified K-Fold with a fixed seed so folds are reproducible
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        cv_scores, best_models = [], []

        for fold_no, (train_idx, val_idx) in enumerate(splitter.split(images, y_indices), start=1):
            print(f"\nFOLD {fold_no}/{n_splits}")
            print("-" * 30)

            # Split data
            X_train_fold, y_train_fold = images[train_idx], y_indices[train_idx]
            X_val_fold, y_val_fold = images[val_idx], y_indices[val_idx]

            print(f"   Train: {len(X_train_fold)}, Val: {len(X_val_fold)}")

            # Only the training part is augmented
            X_train_aug, y_train_aug = create_optimized_augmented_data(X_train_fold, y_train_fold, multiplier=40)

            # Create datasets and loaders; the validation set flips at random per access
            train_loader = DataLoader(
                OptimizedDeerDataset(X_train_aug, y_train_aug),
                batch_size=32, shuffle=True, num_workers=0,
            )
            val_loader = DataLoader(
                OptimizedDeerDataset(X_val_fold, y_val_fold, test_time_aug=True),
                batch_size=32, shuffle=False, num_workers=0,
            )

            # Train model
            model, val_acc = self.train_single_model(train_loader, val_loader, f"fold-{fold_no}")

            cv_scores.append(val_acc)
            best_models.append(model)

            torch.cuda.empty_cache()

        return cv_scores, best_models, label_mapping
    
    def final_test_evaluation(self, images, ages, trained_models, label_mapping):
        """Score each model and their ensemble on a 20% stratified test split.

        The test split is derived here from ``images``/``ages`` with a fixed
        seed (``random_state=42``); the 80% remainder is not used.

        Args:
            images: Images as an ndarray or a sequence ``np.array`` accepts.
            ages: Age label per image; every value must be a key of
                ``label_mapping``.
            trained_models: Models to evaluate.
            label_mapping: Age-to-class-index mapping.

        Returns:
            tuple: ``(individual_scores, ensemble_acc)`` in percent.
        """
        print("\nFINAL TEST EVALUATION")
        print("="*30)

        images = images if isinstance(images, np.ndarray) else np.array(images)
        ages = ages if isinstance(ages, np.ndarray) else np.array(ages)

        # Convert to indices
        y_indices = np.array([label_mapping[age] for age in ages])

        # Create train/test split (80/20); only the test portion is kept
        _, X_test, _, y_test = train_test_split(
            images, y_indices, test_size=0.2, random_state=42, stratify=y_indices
        )

        print(f"   Final test set: {len(X_test)} samples")
        print(f"   Test distribution: {Counter(y_test)}")

        # Create test dataset
        test_loader = DataLoader(
            OptimizedDeerDataset(X_test, y_test, test_time_aug=True),
            batch_size=32, shuffle=False, num_workers=0,
        )

        # Evaluate each model
        individual_scores = []
        for model_no, model in enumerate(trained_models, start=1):
            test_acc = self.evaluate_with_tta(model, test_loader)
            individual_scores.append(test_acc)
            print(f"   Model {model_no}: {test_acc:.1f}%")

        # Ensemble evaluation
        print("\n   ENSEMBLE EVALUATION:")
        ensemble_acc = self.evaluate_ensemble_with_tta(trained_models, test_loader)
        print(f"   Ensemble + TTA: {ensemble_acc:.1f}%")

        return individual_scores, ensemble_acc
    
    def evaluate_ensemble_with_tta(self, models, test_loader):
        """Ensemble evaluation with test-time augmentation"""
        for model in models:
            model.eval()
        
        test_correct = 0
        test_total = 0
        
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                
                ensemble_outputs = torch.zeros(images.size(0), self.num_classes).to(self.device)
                
                for model in models:
                    # Original
                    if self.use_amp:
                        with autocast():
                            outputs1 = model(images)
                    else:
                        outputs1 = model(images)
                    
                    # Flipped
                    flipped = torch.flip(images, [3])
                    if self.use_amp:
                        with autocast():
                            outputs2 = model(flipped)
                    else:
                        outputs2 = model(flipped)
                    
                    # Average and add to ensemble
                    avg_outputs = (outputs1 + outputs2) / 2
                    ensemble_outputs += F.softmax(avg_outputs, dim=1)
                
                ensemble_outputs /= len(models)
                _, predicted = torch.max(ensemble_outputs, 1)
                
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()
        
        return 100 * test_correct / test_total

def cross_validation_pipeline():
    """Run cross-validation plus a held-out test evaluation and save a JSON summary.

    Steps: load the data, set aside the test split, run 5-fold
    cross-validation on the remaining samples, evaluate every fold model and
    their ensemble on the test split, print a report and write the results
    to ``cross_validation_results_<timestamp>.json``.

    Returns:
        dict | None: The results dictionary that was written to disk, or
        ``None`` when any step raised (the error and traceback are printed).
    """
    print("CROSS-VALIDATION PIPELINE - RELIABLE 70%")
    print("="*60)
    print("Strategy: K-fold CV + Best model selection + TTA")
    print("="*60)

    start_time = time.time()

    try:
        # Load data
        images, ages = load_original_data()
        num_classes = len(set(ages))
        images = np.asarray(images)
        ages = np.asarray(ages)

        # Initialize trainer
        trainer = CrossValidationTrainer(num_classes=num_classes)

        # final_test_evaluation() derives its test set from the full data via
        # train_test_split(test_size=0.2, random_state=42, stratify=<class
        # indices>). Reproduce that exact split here and cross-validate only
        # on the other 80%, so no fold model is trained on a sample it is
        # later tested on. Stratifying on sorted-unique class codes yields
        # the same partition as stratifying on the mapped label indices.
        _, class_codes = np.unique(ages, return_inverse=True)
        dev_idx, _ = train_test_split(
            np.arange(len(ages)), test_size=0.2, random_state=42, stratify=class_codes
        )

        # Run cross-validation on the development portion only
        cv_scores, trained_models, label_mapping = trainer.run_cross_validation(
            images[dev_idx], ages[dev_idx], n_splits=5
        )

        # The mapping is built from the development data; the test evaluation
        # needs it to cover every age in the full data set.
        missing = set(ages.tolist()) - set(label_mapping)
        if missing:
            raise ValueError(f"Classes missing from the training portion: {sorted(missing)}")

        # Final test evaluation (recreates the same 20% split internally)
        individual_scores, ensemble_score = trainer.final_test_evaluation(images, ages, trained_models, label_mapping)

        # Results
        elapsed = time.time() - start_time

        print(f"\nCROSS-VALIDATION RESULTS")
        print("="*40)
        print(f"CV Scores: {[f'{score:.1f}%' for score in cv_scores]}")
        print(f"CV Mean: {np.mean(cv_scores):.1f}% ± {np.std(cv_scores):.1f}%")
        print(f"CV Best: {max(cv_scores):.1f}%")

        print(f"\nFINAL TEST RESULTS")
        print("="*40)
        for i, score in enumerate(individual_scores):
            print(f"Model {i+1}: {score:.1f}%")

        best_individual = max(individual_scores)
        print(f"\nBest Individual: {best_individual:.1f}%")
        print(f"Ensemble + TTA:  {ensemble_score:.1f}%")
        print(f"Time: {elapsed/60:.1f} minutes")

        # Goal assessment
        final_score = max(best_individual, ensemble_score)

        if final_score >= 70:
            print(f"\n🎉 SUCCESS: REACHED 70% GOAL! ({final_score:.1f}%)")
        elif final_score >= 68:
            print(f"\n🔥 SO CLOSE: {70 - final_score:.1f}% away from 70%")
            print("Recommendation: Try ensemble of different architectures")
        elif final_score >= 65:
            print(f"\n📈 VERY GOOD: {70 - final_score:.1f}% away from 70%")
            print("Recommendation: Hyperparameter optimization or more data")
        else:
            print(f"\n💪 GOOD PROGRESS: {70 - final_score:.1f}% away from 70%")

        # Save results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results = {
            'cv_scores': cv_scores,
            'cv_mean': float(np.mean(cv_scores)),
            'cv_std': float(np.std(cv_scores)),
            'individual_test_scores': individual_scores,
            'ensemble_test_score': ensemble_score,
            'best_score': final_score,
            'goal_reached': bool(final_score >= 70),
            'gap_to_goal': max(0, 70 - final_score),
            'elapsed_minutes': elapsed/60,
            'timestamp': datetime.now().isoformat()
        }

        with open(f'cross_validation_results_{timestamp}.json', 'w') as f:
            json.dump(results, f, indent=2)

        print(f"\nResults saved: cross_validation_results_{timestamp}.json")
        return results

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the notebook cell
        print(f"\nERROR: {e}")
        import traceback
        traceback.print_exc()
        return None

# Run the full pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    cross_validation_pipeline()

In [None]:
# Full model with save

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import timm
import numpy as np
import cv2
import random
import time
import json
import pickle
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold, train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Mixed precision imports
try:
    from torch.cuda.amp import autocast, GradScaler
except ImportError:
    # torch.cuda.amp is missing: disable mixed precision and provide a
    # do-nothing context manager so `with autocast():` still works.
    MIXED_PRECISION_AVAILABLE = False

    class autocast:
        """No-op stand-in for ``torch.cuda.amp.autocast``."""

        def __enter__(self):
            return self

        def __exit__(self, *exc_info):
            # Returning None lets any exception propagate unchanged
            return None
else:
    MIXED_PRECISION_AVAILABLE = True

def load_original_data(fpath=None, max_age=5.5):
    """Load the original images and their age labels.

    Args:
        fpath: Glob pattern passed to ``buck.analysis.basics.ingest_images``.
            ``None`` (the default) selects the original hard-coded dataset
            location, so existing zero-argument calls behave as before.
        max_age: Ages at or above this value are merged into a single class
            labelled ``max_age``.

    Returns:
        tuple: ``(images, ages_grouped)`` - the images exactly as returned by
        ``ingest_images`` and a list of ages with the cap applied.

    Raises:
        FileNotFoundError: If ``ingest_images`` returns no images.
        Exception: Any other loading error is printed and re-raised.
    """
    if fpath is None:
        fpath = "G:\\Dropbox\\AI Projects\\buck\\images\\squared\\color\\*_NDA.png"

    try:
        from buck.analysis.basics import ingest_images
        images, ages = ingest_images(fpath)
        if len(images) == 0:
            # Fail here with a clear message instead of deep inside training
            raise FileNotFoundError(f"No images matched {fpath!r}")
        ages_grouped = [max_age if age >= max_age else age for age in ages]
        print(f"Loaded {len(images)} images")
        print(f"Distribution: {dict(Counter(ages_grouped))}")
        return images, ages_grouped
    except Exception as e:
        print(f"ERROR: {e}")
        raise

def enhanced_augment_image(image):
    """Return a randomly augmented ``uint8`` version of ``image``.

    Each step is applied independently with its own probability: rotation by
    up to +/-12 degrees (p=0.7), horizontal flip (p=0.5), contrast/brightness
    scaling (p=0.8), gamma correction (p=0.4) and additive Gaussian noise
    with sigma 6 (p=0.3).

    Args:
        image: NumPy image array whose first two axes are height and width.
            Non-``uint8`` input is converted first: floating arrays whose
            maximum is <= 1.0 are treated as normalised and rescaled to
            0-255 (the same threshold ``OptimizedDeerDataset`` uses), and all
            values are clipped to 0-255 before the cast.

    Returns:
        np.ndarray: The augmented image with dtype ``uint8``.
    """
    if image.dtype != np.uint8:
        if np.issubdtype(image.dtype, np.floating) and image.size and image.max() <= 1.0:
            # A bare uint8 cast would truncate a [0, 1] image to all black
            image = image * 255.0
        # Clip so out-of-range values saturate instead of wrapping around
        image = np.clip(image, 0, 255).astype(np.uint8)

    # Core augmentations that preserve antler features
    if random.random() < 0.7:
        angle = random.uniform(-12, 12)  # Reduced rotation to preserve antler orientation
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))

    if random.random() < 0.5:
        image = cv2.flip(image, 1)

    # Enhanced lighting (important for outdoor deer photos)
    if random.random() < 0.8:
        alpha = random.uniform(0.75, 1.25)
        beta = random.randint(-20, 20)
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)

    # Gamma correction for lighting conditions
    if random.random() < 0.4:
        gamma = random.uniform(0.8, 1.2)
        inv_gamma = 1.0 / gamma
        # 256-entry lookup table built in one vectorised expression
        table = (((np.arange(256) / 255.0) ** inv_gamma) * 255).astype("uint8")
        image = cv2.LUT(image, table)

    # Realistic noise
    if random.random() < 0.3:
        noise = np.random.normal(0, 6, image.shape).astype(np.int16)
        # Add in int16 so the sum cannot overflow before clipping
        noisy_image = np.clip(image.astype(np.int16) + noise, 0, 255)
        image = noisy_image.astype(np.uint8)

    return image

def create_optimized_augmented_data(X_train, y_train, multiplier=40):
    """Build a class-balanced training set from originals plus augmented copies.

    Every class present in ``y_train`` is brought to the same size,
    ``multiplier`` times the size of the largest class. Each class contributes
    its original images up to four times; the rest are produced by
    ``enhanced_augment_image`` from randomly chosen originals of that class.

    Args:
        X_train: Images, indexable along the first axis (array-like).
        y_train: Class label per image (array-like). Labels need not be
            contiguous or start at zero.
        multiplier: Target size per class as a multiple of the largest class.
            Values below 4 reduce the number of verbatim original copies so
            the target is never exceeded.

    Returns:
        tuple: ``(X_aug, y_aug)`` as NumPy arrays, grouped by class in
        ascending label order.
    """
    print(f"OPTIMIZED AUGMENTATION ({multiplier}x)")
    print("="*40)

    # Boolean masking below needs real arrays, not lists
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    if y_train.size == 0:
        print("   Total: 0 samples")
        return np.array([]), np.array([])

    class_counts = Counter(y_train.tolist())
    max_count = max(class_counts.values())
    target_count = max_count * multiplier

    # Add originals 4 times to preserve signal, but never more often than
    # the multiplier allows, or the largest class would overshoot the target.
    original_copies = max(0, min(4, multiplier))

    X_aug = []
    y_aug = []

    # Iterate the labels that actually occur: range(n_classes) would visit a
    # label with no samples (and skip a real one) when labels have gaps.
    for class_idx in sorted(class_counts):
        class_images = X_train[y_train == class_idx]
        current_count = len(class_images)

        print(f"   Class {class_idx}: {current_count} -> {target_count}")

        for _ in range(original_copies):
            X_aug.extend(class_images)
            y_aug.extend([class_idx] * current_count)

        # Generate remaining augmented samples
        needed = target_count - (current_count * original_copies)
        for _ in range(needed):
            orig_idx = random.randint(0, current_count - 1)
            aug_img = enhanced_augment_image(class_images[orig_idx].copy())
            X_aug.append(aug_img)
            y_aug.append(class_idx)

    print(f"   Total: {len(X_aug)} samples")
    return np.array(X_aug), np.array(y_aug)

class OptimizedDeerDataset(Dataset):
    """In-memory image dataset yielding normalised 224x224 channel-first tensors.

    All images and labels are converted to tensors up front. Each access
    rescales to [0, 1] when needed, moves a trailing 3-channel axis to the
    front, resizes to 224x224, optionally flips, and standardises with the
    stored per-channel mean and std.
    """

    def __init__(self, X, y, test_time_aug=False):
        """Store the data as tensors.

        Args:
            X: Images as an ndarray or anything ``np.array`` accepts.
            y: Integer labels as an ndarray or anything ``np.array`` accepts.
            test_time_aug: When True, each access flips the image
                horizontally with probability 0.5.
        """
        image_array = X if isinstance(X, np.ndarray) else np.array(X)
        label_array = y if isinstance(y, np.ndarray) else np.array(y)
        self.X = torch.FloatTensor(image_array)
        self.y = torch.LongTensor(label_array)
        self.test_time_aug = test_time_aug
        # Standard ImageNet channel statistics, shaped to broadcast over (3, H, W)
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for ``idx`` after preprocessing."""
        sample = self.X[idx].clone()
        target = self.y[idx].clone()

        # Anything brighter than 1.0 is taken to be on a 0-255 scale
        if sample.max() > 1.0:
            sample = sample / 255.0

        # A trailing axis of size 3 is treated as channels: HWC -> CHW
        is_channels_last = sample.dim() == 3 and sample.shape[-1] == 3
        if is_channels_last:
            sample = sample.permute(2, 0, 1)

        if tuple(sample.shape[-2:]) != (224, 224):
            # interpolate() needs a batch axis, added here and removed after
            batched = sample.unsqueeze(0)
            resized = F.interpolate(batched, size=(224, 224), mode='bilinear', align_corners=False)
            sample = resized.squeeze(0)

        # Random horizontal flip (axis 2 is width in CHW); drawn per access
        if self.test_time_aug and random.random() < 0.5:
            sample = torch.flip(sample, [2])

        normalised = (sample - self.mean) / self.std
        return normalised, target

class CrossValidationTrainerWithSaving:
    """Enhanced trainer that saves models and comprehensive training data"""
    
    def __init__(self, num_classes=5, save_dir="saved_models"):
        self.num_classes = num_classes
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.save_dir = save_dir
        
        # Create save directory
        import os
        os.makedirs(save_dir, exist_ok=True)
        
        print(f"ENHANCED CROSS-VALIDATION TRAINER WITH MODEL SAVING")
        print(f"   Device: {self.device}")
        print(f"   Save directory: {save_dir}")
        
        if torch.cuda.is_available():
            print(f"   GPU: {torch.cuda.get_device_name(0)}")
            torch.backends.cudnn.benchmark = True
            if MIXED_PRECISION_AVAILABLE:
                self.scaler = GradScaler()
                self.use_amp = True
                print(f"   Mixed Precision: Enabled")
            else:
                self.use_amp = False
        else:
            self.use_amp = False
    
    def create_optimal_model(self):
        """Build a pretrained ResNet-18 with its early layers frozen.

        Parameters whose names start with ``conv1``, ``bn1``, ``layer1``,
        ``layer2`` or ``layer3`` are excluded from gradient updates; every
        other parameter stays trainable.

        Returns:
            The model, moved to ``self.device``.
        """
        frozen_prefixes = ('conv1', 'bn1', 'layer1', 'layer2', 'layer3')

        net = timm.create_model('resnet18', pretrained=True, num_classes=self.num_classes)

        # str.startswith accepts a tuple, so one test covers every prefix.
        for param_name, tensor in net.named_parameters():
            if param_name.startswith(frozen_prefixes):
                tensor.requires_grad = False

        return net.to(self.device)
    
    def train_single_model(self, train_loader, val_loader, fold_num):
        """Train a single model with comprehensive tracking"""
        model = self.create_optimal_model()
        
        # Optimized hyperparameters based on your best results
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        
        # Differential learning rates
        backbone_params = []
        classifier_params = []
        
        for name, param in model.named_parameters():
            if param.requires_grad:
                if 'fc' in name:
                    classifier_params.append(param)
                else:
                    backbone_params.append(param)
        
        optimizer = optim.AdamW([
            {'params': backbone_params, 'lr': 0.0003},
            {'params': classifier_params, 'lr': 0.001}
        ], weight_decay=0.015)
        
        # Cosine annealing
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=70, eta_min=1e-6)
        
        max_epochs = 70
        patience = 20
        best_val_acc = 0.0
        patience_counter = 0
        
        # Track training history
        training_history = {
            'train_accs': [],
            'val_accs': [],
            'train_losses': [],
            'val_losses': [],
            'learning_rates': []
        }
        
        print(f"   Training fold-{fold_num}: {max_epochs} epochs, patience={patience}")
        
        for epoch in range(max_epochs):
            # Training
            model.train()
            train_correct = 0
            train_total = 0
            train_loss_total = 0.0
            
            for images, labels in train_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                
                if self.use_amp:
                    with autocast():
                        outputs = model(images)
                        loss = criterion(outputs, labels)
                    self.scaler.scale(loss).backward()
                    self.scaler.step(optimizer)
                    self.scaler.update()
                else:
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()
                
                train_loss_total += loss.item()
                _, predicted = torch.max(outputs, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()
            
            train_acc = 100 * train_correct / train_total
            train_loss = train_loss_total / len(train_loader)
            
            # Validation
            model.eval()
            val_correct = 0
            val_total = 0
            val_loss_total = 0.0
            
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(self.device), labels.to(self.device)
                    
                    if self.use_amp:
                        with autocast():
                            outputs = model(images)
                            loss = criterion(outputs, labels)
                    else:
                        outputs = model(images)
                        loss = criterion(outputs, labels)
                    
                    val_loss_total += loss.item()
                    _, predicted = torch.max(outputs, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()
            
            val_acc = 100 * val_correct / val_total
            val_loss = val_loss_total / len(val_loader)
            
            # Record history
            training_history['train_accs'].append(train_acc)
            training_history['val_accs'].append(val_acc)
            training_history['train_losses'].append(train_loss)
            training_history['val_losses'].append(val_loss)
            training_history['learning_rates'].append(optimizer.param_groups[0]['lr'])
            
            scheduler.step()
            
            # Track best
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_counter = 0
                best_state = model.state_dict().copy()
                status = "BEST"
            else:
                patience_counter += 1
                status = ""
            
            # Progress (less frequent printing)
            if epoch % 10 == 0 or epoch < 3 or status or epoch > max_epochs - 3:
                print(f"     Epoch {epoch:2d}: Train {train_acc:.1f}%, Val {val_acc:.1f}% {status}")
            
            if patience_counter >= patience:
                print(f"     Early stop at epoch {epoch}")
                break
        
        # Restore best model
        if 'best_state' in locals():
            model.load_state_dict(best_state)
        
        print(f"   fold-{fold_num} complete: Best val {best_val_acc:.1f}%")
        
        # Save model and training history
        model_save_path = f"{self.save_dir}/model_fold_{fold_num}.pth"
        history_save_path = f"{self.save_dir}/history_fold_{fold_num}.pkl"
        
        torch.save({
            'model_state_dict': model.state_dict(),
            'model_config': {
                'architecture': 'resnet18',
                'num_classes': self.num_classes,
                'frozen_layers': ['conv1', 'bn1', 'layer1', 'layer2', 'layer3']
            },
            'best_val_acc': best_val_acc,
            'fold_num': fold_num
        }, model_save_path)
        
        with open(history_save_path, 'wb') as f:
            pickle.dump(training_history, f)
        
        print(f"   ✓ Saved: {model_save_path}")
        
        return model, best_val_acc, training_history
    
    def evaluate_with_tta(self, model, test_loader):
        """Evaluate model with test-time augmentation"""
        model.eval()
        test_correct = 0
        test_total = 0
        all_predictions = []
        all_probabilities = []
        all_labels = []
        
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                
                # Original prediction
                if self.use_amp:
                    with autocast():
                        outputs1 = model(images)
                else:
                    outputs1 = model(images)
                
                # Flipped prediction
                flipped = torch.flip(images, [3])
                if self.use_amp:
                    with autocast():
                        outputs2 = model(flipped)
                else:
                    outputs2 = model(flipped)
                
                # Average (TTA)
                avg_outputs = (outputs1 + outputs2) / 2
                probs = F.softmax(avg_outputs, dim=1)
                _, predicted = torch.max(avg_outputs, 1)
                
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()
                
                # Store for detailed analysis
                all_predictions.extend(predicted.cpu().numpy())
                all_probabilities.extend(probs.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        accuracy = 100 * test_correct / test_total
        return accuracy, all_predictions, all_probabilities, all_labels
    
    def run_cross_validation_with_saving(self, images, ages, n_splits=5):
        """Hold out a test set, then train one model per stratified fold.

        Ages are mapped to contiguous class indices, 20% of the data is set
        aside as a stratified test split (``random_state=42``), and the
        remainder is split into ``n_splits`` stratified folds. Each fold's
        training part is augmented before training.

        Args:
            images: Image collection (converted to an ndarray if needed).
            ages: Age label per image (converted to an ndarray if needed).
            n_splits: Number of cross-validation folds.

        Returns:
            Tuple ``(cv_scores, all_training_histories, data_splits)``.

        Side effects:
            Pickles the split dictionary to ``{save_dir}/data_splits.pkl``;
            ``train_single_model`` writes per-fold files.
        """
        print(f"\n{n_splits}-FOLD CROSS-VALIDATION WITH MODEL SAVING")
        print("="*60)

        images = images if isinstance(images, np.ndarray) else np.array(images)
        ages = ages if isinstance(ages, np.ndarray) else np.array(ages)

        # Age value -> class index, in ascending age order.
        unique_ages = sorted(set(ages))
        label_mapping = dict(zip(unique_ages, range(len(unique_ages))))
        y_indices = np.array([label_mapping[age] for age in ages])

        # Stratified hold-out split; the test part is never used for training.
        X_train_all, X_test, y_train_all, y_test = train_test_split(
            images, y_indices, test_size=0.2, random_state=42, stratify=y_indices
        )

        data_splits = {
            'X_train_all': X_train_all,
            'X_test': X_test,
            'y_train_all': y_train_all,
            'y_test': y_test,
            'label_mapping': label_mapping,
            'unique_ages': unique_ages
        }

        splits_path = f"{self.save_dir}/data_splits.pkl"
        with open(splits_path, 'wb') as handle:
            pickle.dump(data_splits, handle)

        print(f"✓ Saved data splits to {splits_path}")

        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        cv_scores = []
        all_training_histories = []

        fold_indices = splitter.split(X_train_all, y_train_all)
        for fold_number, (train_idx, val_idx) in enumerate(fold_indices, start=1):
            print(f"\nFOLD {fold_number}/{n_splits}")
            print("-" * 30)

            X_train_fold, y_train_fold = X_train_all[train_idx], y_train_all[train_idx]
            X_val_fold, y_val_fold = X_train_all[val_idx], y_train_all[val_idx]

            print(f"   Train: {len(X_train_fold)}, Val: {len(X_val_fold)}")

            # Only the training part of the fold is augmented.
            X_train_aug, y_train_aug = create_optimized_augmented_data(X_train_fold, y_train_fold, multiplier=40)

            train_loader = DataLoader(
                OptimizedDeerDataset(X_train_aug, y_train_aug),
                batch_size=32, shuffle=True, num_workers=0
            )
            val_loader = DataLoader(
                OptimizedDeerDataset(X_val_fold, y_val_fold, test_time_aug=True),
                batch_size=32, shuffle=False, num_workers=0
            )

            _, val_acc, training_history = self.train_single_model(train_loader, val_loader, fold_number)

            cv_scores.append(val_acc)
            all_training_histories.append(training_history)

            torch.cuda.empty_cache()

        return cv_scores, all_training_histories, data_splits
    
    def final_test_evaluation_with_saving(self, data_splits, n_models=None):
        """Evaluate every saved fold model and their ensemble on the test split.

        Args:
            data_splits: Mapping with at least ``'X_test'`` and ``'y_test'``.
            n_models: Number of fold checkpoints to evaluate. When ``None``,
                consecutive ``model_fold_{k}.pth`` files in ``self.save_dir``
                are counted starting at ``k = 1``.

        Returns:
            Tuple ``(individual_scores, ensemble_acc, test_results)``.
            ``ensemble_acc`` is a plain Python float (percent).

        Raises:
            FileNotFoundError: If no fold checkpoint is found.

        Side effects:
            Pickles ``test_results`` to ``{save_dir}/test_results.pkl``.
        """
        import os

        print(f"\nFINAL TEST EVALUATION WITH COMPREHENSIVE SAVING")
        print("="*50)

        X_test = data_splits['X_test']
        y_test = data_splits['y_test']

        print(f"   Test set: {len(X_test)} samples")
        print(f"   Test distribution: {Counter(y_test)}")

        # The dataset's random flip and the flip inside evaluate_with_tta act on
        # the same image axis, and evaluate_with_tta averages both orientations,
        # so the random flip does not change the averaged logits.
        test_dataset = OptimizedDeerDataset(X_test, y_test, test_time_aug=True)
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

        # Discover how many folds were trained instead of assuming five, so this
        # stays consistent with run_cross_validation_with_saving(n_splits=...).
        if n_models is None:
            n_models = 0
            while os.path.exists(f"{self.save_dir}/model_fold_{n_models + 1}.pth"):
                n_models += 1
        if n_models < 1:
            raise FileNotFoundError(
                f"No model_fold_*.pth checkpoints found in {self.save_dir}"
            )

        individual_scores = []
        all_individual_predictions = []
        all_individual_probabilities = []
        true_labels = None

        for fold in range(1, n_models + 1):
            model_path = f"{self.save_dir}/model_fold_{fold}.pth"
            # map_location lets checkpoints written on a GPU load on CPU too.
            checkpoint = torch.load(model_path, map_location=self.device)

            # NOTE: create_optimal_model builds with pretrained=True; those
            # weights are replaced by the checkpoint on the next line.
            model = self.create_optimal_model()
            model.load_state_dict(checkpoint['model_state_dict'])

            test_acc, preds, probs, labels = self.evaluate_with_tta(model, test_loader)
            individual_scores.append(test_acc)
            all_individual_predictions.append(preds)
            all_individual_probabilities.append(probs)
            # The loader is not shuffled, so every fold yields the same labels.
            true_labels = labels

            print(f"   Model {fold}: {test_acc:.1f}%")

        # Ensemble: average the per-model probabilities, then take the argmax.
        print(f"\n   ENSEMBLE EVALUATION:")
        ensemble_probs = np.mean(all_individual_probabilities, axis=0)
        ensemble_preds = np.argmax(ensemble_probs, axis=1)
        # float() keeps NumPy scalars out of the returned results so callers
        # can serialise them (e.g. to JSON) without surprises.
        ensemble_acc = float(np.mean(ensemble_preds == np.asarray(true_labels)) * 100)

        print(f"   Ensemble + TTA: {ensemble_acc:.1f}%")

        test_results = {
            'individual_scores': individual_scores,
            'ensemble_score': ensemble_acc,
            'individual_predictions': all_individual_predictions,
            'individual_probabilities': all_individual_probabilities,
            'ensemble_predictions': ensemble_preds.tolist(),
            'ensemble_probabilities': ensemble_probs.tolist(),
            'true_labels': true_labels,
            'test_distribution': dict(Counter(y_test))
        }

        with open(f"{self.save_dir}/test_results.pkl", 'wb') as f:
            pickle.dump(test_results, f)

        print(f"✓ Saved comprehensive test results to {self.save_dir}/test_results.pkl")

        return individual_scores, ensemble_acc, test_results

def enhanced_cross_validation_pipeline():
    """Run data loading, 5-fold training, test evaluation and result export.

    All artefacts go into a fresh ``saved_models_<timestamp>`` directory.

    Returns:
        ``(final_results, save_dir)`` on success. If any step raises, the
        error and traceback are printed and ``None`` is returned.
    """
    goal = 70  # target accuracy in percent

    print("ENHANCED CROSS-VALIDATION PIPELINE WITH MODEL SAVING")
    print("="*70)
    print("Strategy: K-fold CV + Model saving + Comprehensive data tracking")
    print("="*70)

    start_time = time.time()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    save_dir = f"saved_models_{timestamp}"

    try:
        # Load data
        images, ages = load_original_data()

        # Initialize trainer with saving
        trainer = CrossValidationTrainerWithSaving(
            num_classes=len(set(ages)),
            save_dir=save_dir
        )

        # Run cross-validation with saving
        cv_scores, training_histories, data_splits = trainer.run_cross_validation_with_saving(images, ages, n_splits=5)

        # Final test evaluation with saving
        individual_scores, ensemble_score, test_results = trainer.final_test_evaluation_with_saving(data_splits)

        # The ensemble score is computed with NumPy. Left as a NumPy scalar it
        # turns `best >= goal` into numpy.bool_, which json.dump rejects, so
        # every score is converted to a built-in float up front.
        cv_scores = [float(score) for score in cv_scores]
        individual_scores = [float(score) for score in individual_scores]
        ensemble_score = float(ensemble_score)

        # Results
        elapsed = time.time() - start_time

        print(f"\nCROSS-VALIDATION RESULTS")
        print("="*40)
        print(f"CV Scores: {[f'{score:.1f}%' for score in cv_scores]}")
        print(f"CV Mean: {np.mean(cv_scores):.1f}% ± {np.std(cv_scores):.1f}%")
        print(f"CV Best: {max(cv_scores):.1f}%")

        print(f"\nFINAL TEST RESULTS")
        print("="*40)
        for i, score in enumerate(individual_scores):
            print(f"Model {i+1}: {score:.1f}%")

        best_individual = max(individual_scores)
        print(f"\nBest Individual: {best_individual:.1f}%")
        print(f"Ensemble + TTA:  {ensemble_score:.1f}%")
        print(f"Time: {elapsed/60:.1f} minutes")

        best_score = max(best_individual, ensemble_score)

        # Save comprehensive results (JSON-safe built-in types only).
        final_results = {
            'cv_scores': cv_scores,
            'cv_mean': float(np.mean(cv_scores)),
            'cv_std': float(np.std(cv_scores)),
            'individual_test_scores': individual_scores,
            'ensemble_test_score': ensemble_score,
            'best_score': best_score,
            'goal_reached': bool(best_score >= goal),
            'gap_to_goal': max(0, goal - best_score),
            'elapsed_minutes': elapsed/60,
            'timestamp': datetime.now().isoformat(),
            'save_directory': save_dir
        }

        with open(f'{save_dir}/comprehensive_results.json', 'w') as f:
            json.dump(final_results, f, indent=2)

        # Save training histories
        with open(f'{save_dir}/all_training_histories.pkl', 'wb') as f:
            pickle.dump(training_histories, f)

        print(f"\n" + "="*70)
        print(f"🎉 TRAINING COMPLETE WITH COMPREHENSIVE SAVING!")
        print(f"🎉 All models and data saved to: {save_dir}/")
        print("="*70)
        print("\nSaved files:")
        print(f"• 5 trained models: model_fold_1.pth through model_fold_5.pth")
        print(f"• Training histories: history_fold_1.pkl through history_fold_5.pkl")
        print(f"• Data splits: data_splits.pkl")
        print(f"• Test results: test_results.pkl")
        print(f"• Final results: comprehensive_results.json")
        print(f"• All training histories: all_training_histories.pkl")
        print(f"\nNow you can run analysis scripts without any training!")

        return final_results, save_dir

    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it.
        print(f"\nERROR: {e}")
        import traceback
        traceback.print_exc()
        return None

# Entry point: run the full training/evaluation pipeline when this code is
# executed directly. The pipeline's return value is discarded here.
if __name__ == "__main__":
    enhanced_cross_validation_pipeline()

ENHANCED CROSS-VALIDATION PIPELINE WITH MODEL SAVING
Strategy: K-fold CV + Model saving + Comprehensive data tracking


In [None]:
# Analysis script

import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
import numpy as np
import json
import pickle
import random
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Set style for better plots
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a recent
# Matplotlib release - confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

class OptimizedDeerDataset(Dataset):
    """Image dataset for the analysis step, with on-access preprocessing.

    A fetched image is rescaled to [0, 1] if any value exceeds 1.0, put in
    channels-first layout when its last axis has size 3, resized to 224x224,
    optionally flipped, and standardised with fixed per-channel mean/std.
    """

    def __init__(self, X, y, test_time_aug=False):
        """Convert ``X`` and ``y`` to tensors and remember the flip option."""
        images_np = X if isinstance(X, np.ndarray) else np.array(X)
        labels_np = y if isinstance(y, np.ndarray) else np.array(y)
        self.X = torch.FloatTensor(images_np)
        self.y = torch.LongTensor(labels_np)
        self.test_time_aug = test_time_aug
        # Shaped (3, 1, 1) so they broadcast across height and width.
        self.mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        self.std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

    def __len__(self):
        """Return how many samples are stored."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the preprocessed ``(image, label)`` pair at ``idx``."""
        tensor_img = self.X[idx].clone()
        tensor_lbl = self.y[idx].clone()

        # Heuristic range check: anything above 1.0 is treated as 0-255 data.
        needs_rescale = tensor_img.max() > 1.0
        if needs_rescale:
            tensor_img = tensor_img / 255.0

        # (H, W, 3) -> (3, H, W)
        if tensor_img.dim() == 3 and tensor_img.shape[-1] == 3:
            tensor_img = tensor_img.permute(2, 0, 1)

        # Resize via a temporary batch axis, which F.interpolate requires.
        if tensor_img.shape[-2:] != (224, 224):
            resized = F.interpolate(
                tensor_img.unsqueeze(0), size=(224, 224), mode='bilinear', align_corners=False
            )
            tensor_img = resized.squeeze(0)

        # Random flip, drawn only when the option is enabled.
        if self.test_time_aug and random.random() < 0.5:
            tensor_img = torch.flip(tensor_img, [2])

        standardized = (tensor_img - self.mean) / self.std
        return standardized, tensor_lbl

class PureModelAnalyzer:
    """Pure analysis class - loads saved models, NO TRAINING"""
    
    def __init__(self, save_dir):
        self.save_dir = save_dir
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        print(f"PURE MODEL ANALYZER - NO TRAINING")
        print("="*50)
        print(f"Loading from: {save_dir}")
        print(f"Device: {self.device}")
        
        # Verify directory exists
        if not Path(save_dir).exists():
            raise FileNotFoundError(f"Save directory not found: {save_dir}")
        
        # Load all saved data
        self.load_saved_data()
        
        print("[PASS] All data loaded successfully")
        print("[PASS] Ready for analysis")
    
    def load_saved_data(self):
        """Load all saved models and data"""
        print("\nLoading saved data...")
        
        # Load comprehensive results
        with open(f"{self.save_dir}/comprehensive_results.json", 'r') as f:
            self.results = json.load(f)
        print("[PASS] Loaded comprehensive results")
        
        # Load data splits
        with open(f"{self.save_dir}/data_splits.pkl", 'rb') as f:
            self.data_splits = pickle.load(f)
        print("[PASS] Loaded data splits")
        
        # Load test results
        with open(f"{self.save_dir}/test_results.pkl", 'rb') as f:
            self.test_results = pickle.load(f)
        print("[PASS] Loaded test results")
        
        # Load training histories
        with open(f"{self.save_dir}/all_training_histories.pkl", 'rb') as f:
            self.training_histories = pickle.load(f)
        print("[PASS] Loaded training histories")
        
        # Model configuration
        self.num_classes = len(self.data_splits['unique_ages'])
        
        print(f"[PASS] Configuration: {self.num_classes} classes, {len(self.data_splits['X_test'])} test samples")
    
    def create_model_architecture(self):
        """Build an untrained ResNet-18 shell to receive saved weights.

        The same name prefixes as at training time are marked as not requiring
        gradients; this has no effect on inference results.

        Returns:
            The model, moved to ``self.device``.
        """
        frozen_prefixes = ('conv1', 'bn1', 'layer1', 'layer2', 'layer3')

        net = timm.create_model('resnet18', pretrained=False, num_classes=self.num_classes)

        # A tuple argument lets startswith test all prefixes in one call.
        for param_name, tensor in net.named_parameters():
            if param_name.startswith(frozen_prefixes):
                tensor.requires_grad = False

        return net.to(self.device)
    
    def load_trained_models(self):
        """Rebuild the five fold models from their checkpoints.

        Returns:
            List of models in fold order (1 to 5), each in evaluation mode.
        """
        def restore(fold):
            # Load one checkpoint onto the analysis device and wrap it in a model.
            checkpoint = torch.load(
                f"{self.save_dir}/model_fold_{fold}.pth", map_location=self.device
            )
            net = self.create_model_architecture()
            net.load_state_dict(checkpoint['model_state_dict'])
            net.eval()
            print(f"[PASS] Loaded model fold {fold} (Val acc: {checkpoint['best_val_acc']:.1f}%)")
            return net

        print("\nLoading trained models...")
        return [restore(fold) for fold in range(1, 6)]
    
    def calculate_all_metrics(self):
        """Compute accuracy/F1/precision/recall for each model and the ensemble.

        All inputs come from ``self.test_results``; no model is run.

        Returns:
            Dict with one entry per model (``'model_1'``, ...), an
            ``'ensemble'`` entry, the ensemble's ``'classification_report'``
            (as a dict keyed by the numeric class labels), and
            ``'class_names'`` for plotting. Scalar metrics are percentages.
        """
        print("\nCalculating comprehensive metrics...")

        true_labels = np.array(self.test_results['true_labels'])
        ensemble_preds = np.array(self.test_results['ensemble_predictions'])

        def summarize(preds):
            # One metric bundle, shared by the per-model and ensemble entries.
            return {
                'accuracy': np.mean(preds == true_labels) * 100,
                'f1_macro': f1_score(true_labels, preds, average='macro') * 100,
                'f1_weighted': f1_score(true_labels, preds, average='weighted') * 100,
                'precision': precision_score(true_labels, preds, average='macro') * 100,
                'recall': recall_score(true_labels, preds, average='macro') * 100
            }

        # Individual model metrics, keyed model_1 .. model_N in fold order.
        metrics = {
            f'model_{index}': summarize(np.array(preds))
            for index, preds in enumerate(self.test_results['individual_predictions'], start=1)
        }

        # Ensemble metrics
        metrics['ensemble'] = summarize(ensemble_preds)

        # Class-wise metrics (target_names is omitted on purpose so the report
        # keeps numeric class keys).
        class_names = [f'Age {age}' for age in self.data_splits['unique_ages']]
        metrics['classification_report'] = classification_report(
            true_labels, ensemble_preds,
            output_dict=True
        )

        # Debug: Print classification report keys
        print(f"Classification report keys: {list(metrics['classification_report'].keys())}")
        print(f"Number of classes: {len(class_names)}")
        print(f"Class names: {class_names}")

        # Also store the class names for plotting
        metrics['class_names'] = class_names

        print("[PASS] All metrics calculated")
        return metrics
    
    def create_all_plots(self, metrics):
        """Render every analysis figure, in a fixed order.

        Args:
            metrics: Metrics dict forwarded to the model-comparison and
                class-performance plots.
        """
        print("\nCreating comprehensive plots...")

        # Make sure the output directory exists before any plot is drawn.
        Path("analysis_plots").mkdir(exist_ok=True)

        # Thunks keep each method lookup lazy, so the steps run strictly in
        # the order listed here.
        plot_steps = (
            lambda: self.plot_cv_and_test_scores(),            # 1. CV and test scores
            lambda: self.plot_training_curves_from_history(),  # 2. training curves
            lambda: self.plot_confusion_matrices(),            # 3. confusion matrices
            lambda: self.plot_model_comparison(metrics),       # 4. model comparison
            lambda: self.plot_class_performance(metrics),      # 5. class-wise performance
            lambda: self.plot_roc_curves(),                    # 6. ROC curves
            lambda: self.plot_loss_curves(),                   # 7. loss curves
        )
        for draw in plot_steps:
            draw()

        print("[PASS] All plots saved to 'analysis_plots/' directory")
    
    def plot_cv_and_test_scores(self):
        """Plot per-fold validation accuracy next to per-model test accuracy.

        Reads ``cv_scores``, ``cv_mean``, ``individual_test_scores`` and
        ``ensemble_test_score`` from ``self.results``, saves the figure to
        ``analysis_plots/cv_and_test_scores.png`` and shows it.
        """
        target = 70  # accuracy goal (%) drawn as a reference line on both panels

        def label_bars(ax, bars, values, offset):
            # Write each bar's value just above its top edge.
            for bar, value in zip(bars, values):
                ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + offset,
                        f'{value:.1f}%', ha='center', va='bottom', fontweight='bold')

        def upper_limit(values):
            # 10% headroom above the tallest element. The target line counts
            # too, otherwise it is clipped whenever every score is below it.
            return max(max(values), target) * 1.1

        # Allow calling this method on its own, without create_all_plots.
        Path("analysis_plots").mkdir(exist_ok=True)

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # ---- Left panel: cross-validation scores ----
        cv_scores = self.results['cv_scores']
        folds = range(1, len(cv_scores) + 1)

        bars1 = ax1.bar(folds, cv_scores, alpha=0.8, color='skyblue', edgecolor='navy', linewidth=2)
        ax1.axhline(y=self.results['cv_mean'], color='red', linestyle='--', linewidth=2,
                   label=f"Mean: {self.results['cv_mean']:.1f}%")
        ax1.axhline(y=target, color='green', linestyle='--', alpha=0.7, linewidth=2, label="Target: 70%")
        label_bars(ax1, bars1, cv_scores, 0.5)

        ax1.set_xlabel('Fold', fontsize=12)
        ax1.set_ylabel('Validation Accuracy (%)', fontsize=12)
        ax1.set_title('Cross-Validation Scores', fontsize=14, fontweight='bold')
        ax1.legend(fontsize=11)
        ax1.grid(True, alpha=0.3)
        ax1.set_ylim(0, upper_limit(cv_scores))

        # ---- Right panel: test scores, individual models then the ensemble ----
        individual = self.results['individual_test_scores']
        test_scores = individual + [self.results['ensemble_test_score']]
        model_names = [f'Model {i+1}' for i in range(len(individual))] + ['Ensemble']
        colors = ['lightcoral', 'lightgreen', 'lightsalmon', 'lightblue', 'plum', 'gold']

        bars2 = ax2.bar(model_names, test_scores, alpha=0.8, color=colors, edgecolor='black', linewidth=2)
        ax2.axhline(y=target, color='red', linestyle='--', alpha=0.7, linewidth=2, label="Target: 70%")
        label_bars(ax2, bars2, test_scores, 1)

        ax2.set_ylabel('Test Accuracy (%)', fontsize=12)
        ax2.set_title('Final Test Performance', fontsize=14, fontweight='bold')
        ax2.legend(fontsize=11)
        ax2.grid(True, alpha=0.3)
        ax2.tick_params(axis='x', rotation=45)
        ax2.set_ylim(0, upper_limit(test_scores))

        plt.tight_layout()
        plt.savefig('analysis_plots/cv_and_test_scores.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_training_curves_from_history(self):
        """Plot accuracy curves for every fold plus a summary bar chart.

        One panel per entry of ``self.training_histories`` shows training and
        validation accuracy per epoch with the best validation epoch marked.
        A final panel compares final and best accuracies across folds. The
        figure is saved to ``analysis_plots/training_curves.png`` and shown.
        """
        # Allow calling this method on its own, without create_all_plots.
        Path("analysis_plots").mkdir(exist_ok=True)

        histories = self.training_histories
        n_folds = len(histories)

        # One panel per fold plus one summary panel, three per row. For five
        # folds this yields a 2x3 grid at 20x12 inches.
        n_cols = 3
        n_rows = -(-(n_folds + 1) // n_cols)
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), squeeze=False)
        axes = axes.flatten()

        for fold, history in enumerate(histories):
            ax = axes[fold]
            epochs = range(1, len(history['train_accs']) + 1)

            # Plot accuracy curves
            ax.plot(epochs, history['train_accs'], 'b-', label='Training Accuracy', linewidth=2, marker='o', markersize=3)
            ax.plot(epochs, history['val_accs'], 'r-', label='Validation Accuracy', linewidth=2, marker='s', markersize=3)

            # Mark the best validation epoch (1-based, first occurrence).
            best_epoch = int(np.argmax(history['val_accs'])) + 1
            best_val_acc = max(history['val_accs'])
            ax.axvline(x=best_epoch, color='green', linestyle='--', alpha=0.7, label=f'Best: Epoch {best_epoch}')

            ax.set_xlabel('Epoch', fontsize=11)
            ax.set_ylabel('Accuracy (%)', fontsize=11)
            ax.set_title(f'Fold {fold + 1} - Training Curves', fontsize=12, fontweight='bold')
            # The legend is built from the artists present when it is called,
            # so it must come after axvline for the best-epoch entry to show.
            ax.legend(fontsize=10)
            ax.grid(True, alpha=0.3)

            # Add final accuracy text
            final_train = history['train_accs'][-1]
            final_val = history['val_accs'][-1]
            ax.text(0.02, 0.98, f'Final: Train {final_train:.1f}%, Val {final_val:.1f}%\nBest Val: {best_val_acc:.1f}%',
                   transform=ax.transAxes, bbox=dict(boxstyle="round", facecolor='wheat', alpha=0.8),
                   verticalalignment='top', fontsize=9)

        # Summary plot goes into the first panel after the per-fold ones.
        ax_summary = axes[n_folds]
        all_train_accs = [history['train_accs'][-1] for history in histories]
        all_val_accs = [history['val_accs'][-1] for history in histories]
        all_best_val = [max(history['val_accs']) for history in histories]
        folds = range(1, n_folds + 1)

        x = np.arange(n_folds)
        width = 0.25

        ax_summary.bar(x - width, all_train_accs, width, label='Final Training', alpha=0.8, color='lightblue')
        ax_summary.bar(x, all_val_accs, width, label='Final Validation', alpha=0.8, color='lightcoral')
        ax_summary.bar(x + width, all_best_val, width, label='Best Validation', alpha=0.8, color='lightgreen')

        ax_summary.set_xlabel('Fold', fontsize=11)
        ax_summary.set_ylabel('Accuracy (%)', fontsize=11)
        ax_summary.set_title('Summary: Final vs Best Accuracies', fontsize=12, fontweight='bold')
        ax_summary.set_xticks(x)
        ax_summary.set_xticklabels([f'Fold {i}' for i in folds])
        ax_summary.legend(fontsize=10)
        ax_summary.grid(True, alpha=0.3)

        # Hide any grid cells left over when the panel count does not fill it.
        for spare in axes[n_folds + 1:]:
            spare.set_visible(False)

        plt.tight_layout()
        plt.savefig('analysis_plots/training_curves.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_confusion_matrices(self):
        """Plot raw-count and row-normalized confusion matrices side by side.

        Reads ``true_labels`` and ``ensemble_predictions`` from
        ``self.test_results`` and builds the tick labels from
        ``self.data_splits['unique_ages']``. The figure is saved to
        ``analysis_plots/confusion_matrices.png`` and then shown.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        true_labels = np.array(self.test_results['true_labels'])
        ensemble_preds = np.array(self.test_results['ensemble_predictions'])
        class_names = [f'Age {age}' for age in self.data_splits['unique_ages']]

        # Pin the label set so the matrix always has one row/column per class
        # name. Without it, a class missing from both the truth and the
        # predictions shrinks the matrix and the tick labels no longer line up.
        # Labels are taken to be class indices 0..n-1, the same encoding
        # plot_roc_curves relies on via label_binarize(classes=range(n)).
        label_ids = list(range(len(class_names)))

        # (axis, normalize mode, annotation format, colorbar label, title)
        panels = (
            (ax1, None, 'd', 'Count', 'Confusion Matrix (Raw Counts)'),
            (ax2, 'true', '.2f', 'Proportion', 'Confusion Matrix (Normalized)'),
        )
        for ax, normalize, fmt, cbar_label, title in panels:
            cm = confusion_matrix(true_labels, ensemble_preds,
                                  labels=label_ids, normalize=normalize)
            sns.heatmap(cm, annot=True, fmt=fmt, cmap='Blues', ax=ax,
                        xticklabels=class_names, yticklabels=class_names,
                        cbar_kws={'label': cbar_label})
            ax.set_title(title, fontsize=14, fontweight='bold')
            ax.set_xlabel('Predicted', fontsize=12)
            ax.set_ylabel('True', fontsize=12)

        plt.tight_layout()
        plt.savefig('analysis_plots/confusion_matrices.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_model_comparison(self, metrics):
        """Draw a 2x2 grid of bar charts comparing the five models and the ensemble.

        ``metrics`` is indexed as ``metrics['model_1'] .. metrics['model_5']``
        and ``metrics['ensemble']``; each entry supplies ``'accuracy'``,
        ``'f1_macro'``, ``'precision'`` and ``'recall'``. The figure is saved to
        ``analysis_plots/model_comparison.png`` and then shown.
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        model_names = [f'Model {i+1}' for i in range(5)] + ['Ensemble']
        colors = ['lightcoral', 'lightgreen', 'lightsalmon', 'lightblue', 'plum', 'gold']

        # One row per panel: grid position, metric key, title, y-axis label
        panel_specs = (
            ((0, 0), 'accuracy', 'Model Accuracy Comparison', 'Accuracy (%)'),
            ((0, 1), 'f1_macro', 'F1 Score (Macro) Comparison', 'F1 Score (%)'),
            ((1, 0), 'precision', 'Precision Comparison', 'Precision (%)'),
            ((1, 1), 'recall', 'Recall Comparison', 'Recall (%)'),
        )

        for (row, col), metric_key, title, ylabel in panel_specs:
            panel = axes[row, col]
            # Five individual models first, ensemble last (matches model_names)
            scores = [metrics[f'model_{i+1}'][metric_key] for i in range(5)]
            scores = scores + [metrics['ensemble'][metric_key]]

            rects = panel.bar(model_names, scores, alpha=0.8, color=colors, edgecolor='black')
            panel.set_title(title, fontsize=14, fontweight='bold')
            panel.set_ylabel(ylabel, fontsize=12)
            panel.tick_params(axis='x', rotation=45)
            panel.grid(True, alpha=0.3)
            self.add_value_labels(panel, rects, scores)

        plt.tight_layout()
        plt.savefig('analysis_plots/model_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    def add_value_labels(self, ax, bars, values):
        """Write each value as a one-decimal percentage just above its bar.

        ``bars`` and ``values`` are paired positionally; each label is centred
        horizontally on its bar and placed 1% of the bar height above the top.
        """
        label_style = dict(ha='center', va='bottom', fontweight='bold', fontsize=10)
        for rect, shown_value in zip(bars, values):
            bar_top = rect.get_height()
            x_center = rect.get_x() + rect.get_width()/2.
            ax.text(x_center, bar_top + bar_top*0.01, f'{shown_value:.1f}%', **label_style)
    
    def plot_class_performance(self, metrics):
        """Plot per-class F1/precision/recall and the per-class test support.

        Reads ``metrics['class_names']`` and ``metrics['classification_report']``.
        The report is looked up with the stringified class index ("0", "1", ...).
        If such a key is missing, diagnostics are printed and the report's
        'macro avg' values are repeated for every class (with a support of 1).
        If 'macro avg' is absent too, the KeyError propagates. The figure is
        saved to ``analysis_plots/class_performance.png`` and then shown.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        class_names = metrics['class_names']
        report = metrics['classification_report']
        n_classes = len(class_names)
        class_keys = [str(idx) for idx in range(n_classes)]

        try:
            # Per-class values, stored as fractions and shown as percentages
            f1_scores = [report[key]['f1-score'] * 100 for key in class_keys]
            precisions = [report[key]['precision'] * 100 for key in class_keys]
            recalls = [report[key]['recall'] * 100 for key in class_keys]
            supports = [report[key]['support'] for key in class_keys]
        except KeyError as e:
            print(f"KeyError in class performance: {e}")
            print(f"Available keys in classification report: {list(report.keys())}")
            print(f"Looking for keys: {class_keys}")
            if 'macro avg' not in report:
                raise
            # Fallback: every class shows the same macro-averaged numbers
            print("Using macro averages as fallback...")
            macro_avg = report['macro avg']
            f1_scores = [macro_avg['f1-score'] * 100] * n_classes
            precisions = [macro_avg['precision'] * 100] * n_classes
            recalls = [macro_avg['recall'] * 100] * n_classes
            supports = [1] * n_classes  # placeholder counts, not real supports

        # Left panel: grouped bars, three metrics per class
        positions = np.arange(n_classes)
        bar_width = 0.25

        ax1.bar(positions - bar_width, f1_scores, bar_width, label='F1-Score', alpha=0.8, color='lightcoral')
        ax1.bar(positions, precisions, bar_width, label='Precision', alpha=0.8, color='lightgreen')
        ax1.bar(positions + bar_width, recalls, bar_width, label='Recall', alpha=0.8, color='lightblue')

        ax1.set_xlabel('Age Class', fontsize=12)
        ax1.set_ylabel('Score (%)', fontsize=12)
        ax1.set_title('Class-wise Performance Metrics', fontsize=14, fontweight='bold')
        ax1.set_xticks(positions)
        ax1.set_xticklabels(class_names)
        ax1.legend(fontsize=11)
        ax1.grid(True, alpha=0.3)

        # Right panel: number of test samples per class
        support_bars = ax2.bar(class_names, supports, alpha=0.8, color='mediumpurple', edgecolor='indigo')
        ax2.set_xlabel('Age Class', fontsize=12)
        ax2.set_ylabel('Number of Test Samples', fontsize=12)
        ax2.set_title('Test Set Distribution', fontsize=14, fontweight='bold')
        ax2.grid(True, alpha=0.3)

        # Print each count just above its bar
        for rect, count in zip(support_bars, supports):
            bar_top = rect.get_height()
            x_center = rect.get_x() + rect.get_width()/2.
            ax2.text(x_center, bar_top + 0.1, str(count),
                     ha='center', va='bottom', fontweight='bold', fontsize=11)

        plt.tight_layout()
        plt.savefig('analysis_plots/class_performance.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_roc_curves(self):
        """Plot one-vs-rest ROC curves for the ensemble probabilities.

        Reads ``true_labels`` and ``ensemble_probabilities`` from
        ``self.test_results``; column ``i`` of the probabilities is used as the
        score for class index ``i``. With more than two classes one curve is
        drawn per class. With exactly two classes a single curve is drawn for
        class index 1. The figure is saved to
        ``analysis_plots/roc_curves.png`` and then shown.
        """
        fig, ax = plt.subplots(figsize=(10, 8))

        true_labels = np.array(self.test_results['true_labels'])
        ensemble_probs = np.array(self.test_results['ensemble_probabilities'])
        class_names = [f'Age {age}' for age in self.data_splits['unique_ages']]
        n_classes = len(class_names)

        # Binarize the output
        y_test_bin = label_binarize(true_labels, classes=range(n_classes))

        colors = plt.cm.Set1(np.linspace(0, 1, n_classes))

        if n_classes == 2:
            # For two classes label_binarize returns a single column in which
            # 1 marks the second class, so the curve below (scored with
            # probability column 1) belongs to class_names[1], not
            # class_names[0] as it was labelled before.
            y_test_bin = y_test_bin.ravel()
            fpr, tpr, _ = roc_curve(y_test_bin, ensemble_probs[:, 1])
            roc_auc = auc(fpr, tpr)
            ax.plot(fpr, tpr, color=colors[0], lw=3,
                    label=f'{class_names[1]} (AUC = {roc_auc:.3f})')
        else:
            # One-vs-rest curve for every class
            for i, (class_name, color) in enumerate(zip(class_names, colors)):
                fpr, tpr, _ = roc_curve(y_test_bin[:, i], ensemble_probs[:, i])
                roc_auc = auc(fpr, tpr)
                ax.plot(fpr, tpr, color=color, lw=3,
                        label=f'{class_name} (AUC = {roc_auc:.3f})')

        # Plot diagonal line
        ax.plot([0, 1], [0, 1], 'k--', lw=2, alpha=0.5, label='Random (AUC = 0.500)')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate', fontsize=12)
        ax.set_ylabel('True Positive Rate', fontsize=12)
        ax.set_title('ROC Curves - Multi-class Classification', fontsize=14, fontweight='bold')
        ax.legend(loc="lower right", fontsize=11)
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('analysis_plots/roc_curves.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_loss_curves(self):
        """Plot per-fold loss curves and the learning-rate schedule of all folds.

        Each entry of ``self.training_histories`` supplies ``'train_losses'``,
        ``'val_losses'`` and ``'learning_rates'``. Fold ``k`` is drawn on panel
        ``k`` of a 2x3 grid; the sixth panel shows every fold's learning rate on
        a log scale. The figure is saved to ``analysis_plots/loss_curves.png``
        and then shown.
        """
        fig, axes = plt.subplots(2, 3, figsize=(20, 12))
        axes = axes.flatten()
        note_box = dict(boxstyle="round", facecolor='wheat', alpha=0.8)

        for fold_idx, record in enumerate(self.training_histories):
            panel = axes[fold_idx]
            epoch_axis = range(1, len(record['train_losses']) + 1)

            # Training vs validation loss
            panel.plot(epoch_axis, record['train_losses'], 'b-', label='Training Loss',
                       linewidth=2, marker='o', markersize=3)
            panel.plot(epoch_axis, record['val_losses'], 'r-', label='Validation Loss',
                       linewidth=2, marker='s', markersize=3)

            panel.set_xlabel('Epoch', fontsize=11)
            panel.set_ylabel('Loss', fontsize=11)
            panel.set_title(f'Fold {fold_idx + 1} - Loss Curves', fontsize=12, fontweight='bold')
            panel.legend(fontsize=10)
            panel.grid(True, alpha=0.3)

            # Vertical marker at the epoch (1-based) with the lowest validation loss
            lowest_epoch = np.argmin(record['val_losses']) + 1
            lowest_val = min(record['val_losses'])
            panel.axvline(x=lowest_epoch, color='green', linestyle='--', alpha=0.7)

            # Text box with the last-epoch losses and the minimum validation loss
            last_train = record['train_losses'][-1]
            last_val = record['val_losses'][-1]
            caption = f'Final: Train {last_train:.3f}, Val {last_val:.3f}\nMin Val: {lowest_val:.3f}'
            panel.text(0.02, 0.98, caption, transform=panel.transAxes, bbox=note_box,
                       verticalalignment='top', fontsize=9)

        # Sixth panel: learning-rate schedule, one line per fold
        lr_panel = axes[5]
        for fold_idx, record in enumerate(self.training_histories):
            rates = record['learning_rates']
            lr_panel.plot(range(1, len(rates) + 1), rates, label=f'Fold {fold_idx+1}', linewidth=2)

        lr_panel.set_xlabel('Epoch', fontsize=11)
        lr_panel.set_ylabel('Learning Rate', fontsize=11)
        lr_panel.set_title('Learning Rate Schedule (All Folds)', fontsize=12, fontweight='bold')
        lr_panel.legend(fontsize=10)
        lr_panel.grid(True, alpha=0.3)
        lr_panel.set_yscale('log')

        plt.tight_layout()
        plt.savefig('analysis_plots/loss_curves.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    def generate_comprehensive_report(self, metrics):
        """Build the plain-text analysis report, save it, and print it.

        The report is assembled from ``self.results``, ``self.data_splits``,
        ``self.training_histories``, ``self.num_classes`` and ``self.save_dir``
        together with the ``metrics`` argument. It is written (UTF-8) to
        ``analysis_plots/academic_report.txt``; the directory is created when
        it does not exist yet, so the method also works when called on its own.

        Args:
            metrics: dict providing ``metrics['ensemble']`` (keys 'accuracy',
                'f1_macro', 'f1_weighted', 'precision', 'recall') and
                ``metrics['classification_report']``, which is looked up with
                the stringified class index ("0", "1", ...).

        Returns:
            None.
        """
        import os  # local import: only needed to ensure the output directory exists

        print("\nGenerating comprehensive report...")

        results = self.results
        splits = self.data_splits
        histories = self.training_histories
        section_rule = "-"*40

        report = []

        def start_section(title):
            # Every section opens with its title followed by a dashed rule.
            report.extend([title, section_rule])

        # Header
        report.extend([
            "="*80,
            "COMPREHENSIVE MODEL ANALYSIS REPORT",
            "="*80,
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Results from: {results['timestamp']}",
            f"Models loaded from: {self.save_dir}",
            "",
        ])

        # Dataset overview
        n_train = len(splits['X_train_all'])
        n_test = len(splits['X_test'])
        age_list = ', '.join(str(age) for age in splits['unique_ages'])
        start_section("DATASET OVERVIEW")
        report.extend([
            f"Total samples: {n_train + n_test}",
            f"Training samples: {n_train}",
            f"Test samples: {n_test}",
            f"Classes: {self.num_classes} age groups ({age_list})",
            "Train/Test split: 80/20",
            "Cross-validation: 5-fold stratified",
            "",
        ])

        # Model architecture (fixed descriptive text)
        start_section("MODEL ARCHITECTURE")
        report.extend([
            "Base model: ResNet-18 (pretrained on ImageNet)",
            "Frozen layers: conv1, bn1, layer1, layer2, layer3 (75% frozen)",
            "Trainable layers: layer4, fc",
            "Input size: 224x224 RGB images",
            "Data augmentation: 40x multiplier with rotation, flip, lighting",
            "Test-time augmentation: Horizontal flip averaging",
            "",
        ])

        # Training details (fixed descriptive text plus the recorded run time)
        start_section("TRAINING CONFIGURATION")
        report.extend([
            "Optimizer: AdamW with differential learning rates",
            "  - Backbone: 0.0003",
            "  - Classifier: 0.001",
            "Weight decay: 0.015",
            "Loss function: CrossEntropyLoss with label smoothing (0.1)",
            "Scheduler: CosineAnnealingLR (T_max=70, eta_min=1e-6)",
            "Max epochs: 70, Early stopping patience: 20",
            "Mixed precision training: Enabled",
            f"Total training time: {results['elapsed_minutes']:.1f} minutes",
            "",
        ])

        # Cross-validation results
        start_section("CROSS-VALIDATION RESULTS")
        for i, score in enumerate(results['cv_scores']):
            report.append(f"Fold {i+1}: {score:.1f}%")
        report.extend([
            f"Mean CV accuracy: {results['cv_mean']:.1f} ± {results['cv_std']:.1f}%",
            f"Best CV accuracy: {max(results['cv_scores']):.1f}%",
            "",
        ])

        # Test performance
        start_section("TEST SET PERFORMANCE")
        for i, score in enumerate(results['individual_test_scores']):
            report.append(f"Model {i+1}: {score:.1f}%")
        report.extend([
            f"Ensemble (5 models + TTA): {results['ensemble_test_score']:.1f}%",
            "",
        ])

        # Detailed ensemble metrics
        ensemble_metrics = metrics['ensemble']
        start_section("DETAILED ENSEMBLE METRICS")
        report.extend([
            f"Accuracy: {ensemble_metrics['accuracy']:.2f}%",
            f"F1-Score (Macro): {ensemble_metrics['f1_macro']:.2f}%",
            f"F1-Score (Weighted): {ensemble_metrics['f1_weighted']:.2f}%",
            f"Precision (Macro): {ensemble_metrics['precision']:.2f}%",
            f"Recall (Macro): {ensemble_metrics['recall']:.2f}%",
            "",
        ])

        # Class-wise detailed performance
        start_section("CLASS-WISE PERFORMANCE (ENSEMBLE)")
        class_report = metrics['classification_report']

        try:
            # Lines are appended class by class, so classes listed before a
            # missing key stay in the report ahead of the error note.
            for i, age in enumerate(splits['unique_ages']):
                class_metrics = class_report[str(i)]  # Use numeric string key
                report.extend([
                    f"Age {age}:",
                    f"  Precision: {class_metrics['precision']*100:.2f}%",
                    f"  Recall: {class_metrics['recall']*100:.2f}%",
                    f"  F1-Score: {class_metrics['f1-score']*100:.2f}%",
                    f"  Support: {class_metrics['support']} samples",
                    "",
                ])
        except KeyError as e:
            report.append(f"Error accessing class-wise metrics: {e}")
            report.append(f"Available keys: {list(class_report.keys())}")
            if 'macro avg' in class_report:
                macro_avg = class_report['macro avg']
                report.extend([
                    "Macro averages:",
                    f"  Precision: {macro_avg['precision']*100:.2f}%",
                    f"  Recall: {macro_avg['recall']*100:.2f}%",
                    f"  F1-Score: {macro_avg['f1-score']*100:.2f}%",
                ])
            report.append("")

        # Statistical significance
        # NOTE(review): np.std defaults to the population estimate (ddof=0) and
        # 1.96 is the normal-approximation quantile; with only a handful of
        # folds this interval is optimistic. Consider ddof=1 and a t quantile.
        cv_scores = results['cv_scores']
        cv_mean = np.mean(cv_scores)
        cv_std = np.std(cv_scores)
        cv_sem = cv_std / np.sqrt(len(cv_scores))  # Standard error of mean
        confidence_95 = 1.96 * cv_sem  # half-width of the 95% interval

        start_section("STATISTICAL ANALYSIS")
        report.extend([
            f"Cross-validation mean: {cv_mean:.2f}%",
            f"Cross-validation std: {cv_std:.2f}%",
            f"Standard error of mean: {cv_sem:.2f}%",
            f"95% Confidence interval: [{cv_mean-confidence_95:.2f}%, {cv_mean+confidence_95:.2f}%]",
            "",
        ])

        # Model generalization assessment
        start_section("GENERALIZATION ASSESSMENT")

        # Check for overfitting by comparing train vs val
        mean_train = np.mean([h['train_accs'][-1] for h in histories])
        mean_val = np.mean([h['val_accs'][-1] for h in histories])
        mean_best_val = np.mean([max(h['val_accs']) for h in histories])

        overfitting_gap = mean_train - mean_val
        generalization_gap = mean_best_val - results['ensemble_test_score']

        report.extend([
            f"Average final training accuracy: {mean_train:.2f}%",
            f"Average final validation accuracy: {mean_val:.2f}%",
            f"Average best validation accuracy: {mean_best_val:.2f}%",
            f"Final test accuracy (ensemble): {results['ensemble_test_score']:.2f}%",
            f"Overfitting gap (train - val): {overfitting_gap:.2f}%",
            f"Generalization gap (val - test): {generalization_gap:.2f}%",
        ])

        # Each status glyph is swapped for a bracketed text tag before the
        # line goes into the report.
        if overfitting_gap < 5:
            report.append("✓ No significant overfitting detected".replace("✓", "[PASS]"))
        elif overfitting_gap < 10:
            report.append("⚠ Mild overfitting detected".replace("⚠", "[WARN]"))
        else:
            report.append("❌ Significant overfitting detected".replace("❌", "[FAIL]"))

        if abs(generalization_gap) < 5:
            report.append("✓ Good generalization to test set".replace("✓", "[PASS]"))
        else:
            report.append("⚠ Some generalization gap observed".replace("⚠", "[WARN]"))

        report.append("")

        # Goal achievement
        start_section("GOAL ACHIEVEMENT")
        target = 70
        best_score = results['ensemble_test_score']

        if best_score >= target:
            report.extend([
                "[SUCCESS] Target achieved!",
                f"   Target: {target}%",
                f"   Achieved: {best_score:.1f}%",
                f"   Margin: +{best_score - target:.1f}%",
            ])
        else:
            report.extend([
                "[MISS] Target not reached",
                f"   Target: {target}%",
                f"   Achieved: {best_score:.1f}%",
                f"   Gap: -{target - best_score:.1f}%",
            ])
        report.append("")

        # Technical notes (fixed descriptive text)
        start_section("TECHNICAL IMPLEMENTATION")
        report.extend([
            "- Framework: PyTorch with timm (transformers)",
            "- Hardware: NVIDIA RTX 2060 with CUDA",
            "- Mixed precision training for efficiency",
            "- Reproducible results (fixed random seeds)",
            "- Ensemble method: Simple averaging of softmax outputs",
            "- Cross-validation: Stratified to maintain class balance",
            "- Data augmentation: Domain-specific for deer images",
            "- Early stopping to prevent overfitting",
            "",
        ])

        # Files generated
        start_section("ANALYSIS OUTPUTS")
        report.extend([
            "Generated plots:",
            "- cv_and_test_scores.png - Cross-validation and test performance",
            "- training_curves.png - Training/validation accuracy curves",
            "- confusion_matrices.png - Prediction confusion analysis",
            "- model_comparison.png - Individual vs ensemble metrics",
            "- class_performance.png - Per-class performance breakdown",
            "- roc_curves.png - ROC curves for each class",
            "- loss_curves.png - Training/validation loss progression",
            "",
            "Data files:",
            f"- Models: {self.save_dir}/model_fold_*.pth",
            f"- Results: {self.save_dir}/comprehensive_results.json",
            f"- Test predictions: {self.save_dir}/test_results.pkl",
            "",
        ])

        report_text = "\n".join(report)

        # Make sure the output directory exists before writing; otherwise a
        # standalone call fails with FileNotFoundError.
        os.makedirs('analysis_plots', exist_ok=True)

        # Save report with UTF-8 encoding
        with open('analysis_plots/academic_report.txt', 'w', encoding='utf-8') as f:
            f.write(report_text)

        print(report_text)
        print("\n[PASS] Academic report saved to 'analysis_plots/academic_report.txt'")
    
    def run_complete_analysis(self):
        """Run metrics, plots and report generation from saved results only.

        Calls ``calculate_all_metrics``, ``create_all_plots`` and
        ``generate_comprehensive_report`` in that order, then prints a summary
        of the key numbers from ``self.results``.

        Returns:
            The metrics object on success. Any exception raised along the way
            is printed together with its traceback and ``None`` is returned.
        """
        banner_rule = "="*60
        print("STARTING PURE ANALYSIS (NO TRAINING)")
        print(banner_rule)

        try:
            # Pipeline: metrics -> plots -> written report
            metrics = self.calculate_all_metrics()
            self.create_all_plots(metrics)
            self.generate_comprehensive_report(metrics)

            print("\n" + banner_rule)
            print("SUCCESS! PURE ANALYSIS COMPLETE!")
            print(banner_rule)

            # The check-mark glyph is swapped for a text tag before printing
            completed_steps = (
                "✓ NO TRAINING WAS PERFORMED",
                "✓ All metrics calculated from saved models",
                "✓ All plots generated and saved",
                "✓ Academic report created",
            )
            for step in completed_steps:
                print(step.replace("✓", "[PASS]"))

            print("")
            print("Check 'analysis_plots/' folder for:")
            print("   - All visualization plots")
            print("   - Academic report (academic_report.txt)")
            print("")
            print(f"Key Results:")

            summary = self.results
            print(f"   - CV Mean: {summary['cv_mean']:.1f}% ± {summary['cv_std']:.1f}%")
            print(f"   - Best Individual: {max(summary['individual_test_scores']):.1f}%")
            print(f"   - Ensemble: {summary['ensemble_test_score']:.1f}%")
            if summary['ensemble_test_score'] >= 70:
                success_status = "ACHIEVED"
            else:
                success_status = "NOT REACHED"
            print(f"   - Goal (70%): {success_status}")

            return metrics

        except Exception as e:
            # Best-effort boundary: report the failure instead of propagating
            print(f"Error in analysis: {e}")
            import traceback
            traceback.print_exc()
            return None

# Main execution function
def analyze_saved_models(save_dir):
    """Build a PureModelAnalyzer for ``save_dir`` and run its full analysis.

    Returns:
        A ``(analyzer, metrics)`` tuple, where ``metrics`` is whatever
        ``analyzer.run_complete_analysis()`` returned.
    """
    for banner_line in ("🔍 PURE MODEL ANALYSIS - ZERO TRAINING", "="*50):
        print(banner_line)

    # Construct the analyzer, then hand back both it and the analysis result
    analyzer = PureModelAnalyzer(save_dir)
    return analyzer, analyzer.run_complete_analysis()

# Example usage
if __name__ == "__main__":
    # Replace with your actual save directory
    save_dir = "saved_models_20250619_171443"  # Updated to match your training output

    # If you don't know the exact directory name, uncomment this:
    # import glob
    # save_dirs = sorted(glob.glob("saved_models_*"))  # glob order is arbitrary; timestamped names sort chronologically
    # if save_dirs:
    #     save_dir = save_dirs[-1]  # Use the most recent
    #     print(f"Found save directory: {save_dir}")
    # else:
    #     print("No saved model directories found!")
    #     print("Please run the enhanced training script first.")
    #     exit()

    try:
        analyzer, metrics = analyze_saved_models(save_dir)
    except FileNotFoundError:
        print(f"\nSave directory not found: {save_dir}")
        print("Please:")
        print("1. Run the enhanced training script first, OR")
        print("2. Update the save_dir variable to point to your saved models")
    except Exception as e:
        print(f"\nError: {e}")
        print("Please check that all required files are in the save directory")
    else:
        # run_complete_analysis() catches its own exceptions and returns None
        # on failure, so only claim success when metrics actually came back.
        if metrics is None:
            print("\nAnalysis did not complete - see the error output above.")
        else:
            print("\nSUCCESS! All analysis complete with NO TRAINING!")