### Check RTX5090 running CUDA

In [None]:
import torch
import torchvision.models as models

# Check if CUDA
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("❌ CUDA not detected by PyTorch")

# Test ResNet50 specifically
model = models.resnet50(pretrained=True).cuda()
test_batch = torch.randn(2, 3, 224, 224).cuda()
try:
    output = model(test_batch)
    print("ResNet50 works!")
except Exception as e:
    print(f"ResNet50 failed: {e}")

# Test EfficientNet
try:
    model_eff = models.efficientnet_b0(pretrained=True).cuda()
    output_eff = model_eff(test_batch)
    print("EfficientNet works!")
except Exception as e:
    print(f"EfficientNet failed: {e}")

### Process deer data

In [2]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.models as models
import numpy as np
import cv2
import random
import json
import os
import glob
import itertools
import gc
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# RTX 5090 Calculated Configuration - Targeting ~27GB VRAM usage
FIXED_HYPERPARAMS = {
    'backbone_lr': 0.0001,
    'classifier_lr': 0.0005,
    'batch_size': 384,  # Calculated to use ~27GB total VRAM
    'optimizer': 'adamw',
    'dropout': 0.4,
    'freeze_layers': 3
}

# Missing hyperparameters to test
MISSING_HYPERPARAMS = {
    'weight_decay': [0.05, 0.08],
    'scheduler': ['cosine', 'plateau'],
    'label_smoothing': [0.15, 0.2],
    'augmentation_strength': ['medium', 'heavy']
}

# Calculated RTX 5090 settings
IMAGE_SIZE = (576, 576)  # Calculated to fit with batch_size=384 in ~27GB total
AUGMENTATION_TARGET = 2000
NUM_FOLDS = 25
NUM_WORKERS = 0
MIXED_PRECISION = True
COMPILE_MODEL = False

def detect_and_convert_image(image):
    if len(image.shape) == 2:
        return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    elif len(image.shape) == 3:
        if image.shape[2] == 1:
            return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 3:
            return image
        elif image.shape[2] == 4:
            return cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
    return image

def load_combined_data():
    color_path = "D:\\Dropbox\\AI Projects\\buck\\images\\squared\\color\\*_NDA.png"
    gray_path = "D:\\Dropbox\\AI Projects\\buck\\images\\squared\\grayscale\\*_NDA.png"
    
    images = []
    ages = []
    sources = []
    
    print("Loading color images...")
    color_files = glob.glob(color_path)
    for img_path in color_files:
        try:
            img = cv2.imread(img_path)
            if img is None:
                continue
            
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = detect_and_convert_image(img)
            img_resized = cv2.resize(img, IMAGE_SIZE[::-1])
            
            filename = os.path.basename(img_path)
            filename_no_ext = os.path.splitext(filename)[0]
            parts = filename_no_ext.split('_')
            
            if len(parts) < 5:
                continue
            
            age_part = parts[3]
            if 'xpx' in age_part.lower() or 'p' not in age_part:
                continue
            
            try:
                age_value = float(age_part.replace('p', '.'))
                images.append(img_resized)
                ages.append(age_value)
                sources.append('color')
            except ValueError:
                continue
                
        except Exception as e:
            continue
    
    print(f"Loaded {len([s for s in sources if s == 'color'])} color images")
    
    print("Loading grayscale images...")
    gray_files = glob.glob(gray_path)
    for img_path in gray_files:
        try:
            img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
            if img is None:
                continue
            
            img = detect_and_convert_image(img)
            img_resized = cv2.resize(img, IMAGE_SIZE[::-1])
            
            filename = os.path.basename(img_path)
            filename_no_ext = os.path.splitext(filename)[0]
            parts = filename_no_ext.split('_')
            
            if len(parts) < 5:
                continue
            
            age_part = parts[3]
            if 'xpx' in age_part.lower() or 'p' not in age_part:
                continue
            
            try:
                age_value = float(age_part.replace('p', '.'))
                images.append(img_resized)
                ages.append(age_value)
                sources.append('grayscale')
            except ValueError:
                continue
                
        except Exception as e:
            continue
    
    print(f"Loaded {len([s for s in sources if s == 'grayscale'])} grayscale images")
    print(f"Total images: {len(images)}")
    
    ages_grouped = [5.5 if age >= 5.5 else age for age in ages]
    
    age_counts = Counter(ages_grouped)
    valid_ages = {age for age, count in age_counts.items() if count >= 3}
    
    filtered_images = []
    filtered_ages = []
    filtered_sources = []
    
    for img, age, source in zip(images, ages_grouped, sources):
        if age in valid_ages:
            filtered_images.append(img)
            filtered_ages.append(age)
            filtered_sources.append(source)
    
    print(f"Final dataset: {len(filtered_images)} images")
    print(f"Age distribution: {dict(Counter(filtered_ages))}")
    
    return np.array(filtered_images, dtype=np.uint8), filtered_ages, filtered_sources

def enhanced_augment_image(image, strength='medium'):
    if image.dtype != np.uint8:
        image = image.astype(np.uint8)
    
    if strength == 'light':
        rot_prob, flip_prob, bright_prob, gamma_prob, noise_prob = 0.5, 0.3, 0.6, 0.2, 0.1
        rot_range, bright_range = 10, (0.8, 1.2)
    elif strength == 'medium':
        rot_prob, flip_prob, bright_prob, gamma_prob, noise_prob = 0.7, 0.5, 0.8, 0.4, 0.3
        rot_range, bright_range = 15, (0.7, 1.3)
    else:  # heavy
        rot_prob, flip_prob, bright_prob, gamma_prob, noise_prob = 0.8, 0.6, 0.9, 0.5, 0.4
        rot_range, bright_range = 20, (0.6, 1.4)
    
    if random.random() < rot_prob:
        angle = random.uniform(-rot_range, rot_range)
        h, w = image.shape[:2]
        M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
        image = cv2.warpAffine(image, M, (w, h))
    
    if random.random() < flip_prob:
        image = cv2.flip(image, 1)
    
    if len(image.shape) == 3 and image.shape[2] == 3 and random.random() < 0.4:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        image = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
    
    if random.random() < bright_prob:
        alpha = random.uniform(*bright_range)
        beta = random.randint(-25, 25)
        image = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
    
    if random.random() < gamma_prob:
        gamma = random.uniform(0.8, 1.2)
        inv_gamma = 1.0 / gamma
        table = np.array([((i / 255.0) ** inv_gamma) * 255 for i in np.arange(0, 256)]).astype("uint8")
        image = cv2.LUT(image, table)
    
    if random.random() < noise_prob:
        noise = np.random.normal(0, 7, image.shape).astype(np.int16)
        image_int16 = image.astype(np.int16)
        noisy_image = np.clip(image_int16 + noise, 0, 255)
        image = noisy_image.astype(np.uint8)
    
    return image

class MemoryEfficientDataset(Dataset):
    def __init__(self, base_images, labels, aug_strength='medium', target_per_class=2000, training=True):
        self.base_images = base_images
        self.labels = np.array(labels)
        self.aug_strength = aug_strength
        self.training = training
        self.target_per_class = target_per_class
        
        unique_classes = np.unique(labels)
        self.class_to_indices = {}
        for cls in unique_classes:
            self.class_to_indices[cls] = np.where(self.labels == cls)[0]
        
        self.num_classes = len(unique_classes)
        self.class_list = sorted(unique_classes)
        self.length = self.num_classes * self.target_per_class
        
        self.mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(3, 1, 1)
        self.std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(3, 1, 1)
        
        print(f"Memory-efficient dataset: {self.length} samples from {len(base_images)} base images")
    
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        class_idx = idx // self.target_per_class
        within_class_idx = idx % self.target_per_class
        
        target_class = self.class_list[class_idx]
        available_indices = self.class_to_indices[target_class]
        
        base_idx = available_indices[within_class_idx % len(available_indices)]
        image = self.base_images[base_idx].copy()
        
        if self.training and within_class_idx >= len(available_indices):
            image = enhanced_augment_image(image, self.aug_strength)
        
        image = image.astype(np.float32) / 255.0
        if len(image.shape) == 3:
            image = image.transpose(2, 0, 1)
        
        if not self.training and random.random() < 0.5:
            image = np.flip(image, axis=2).copy()
        
        image = (image - self.mean) / self.std
        
        return torch.from_numpy(image.astype(np.float32)), target_class

class RTX5090OptimizedModel:
    def __init__(self, num_classes, save_dir=None):
        self.num_classes = num_classes
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        if save_dir is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.save_dir = f"rtx5090_optimized_{timestamp}"
        else:
            self.save_dir = save_dir
        
        os.makedirs(self.save_dir, exist_ok=True)
        self.best_multiplicative_score = 0.0
        
        print(f"Using device: {self.device}")
        if torch.cuda.is_available():
            print(f"GPU: {torch.cuda.get_device_name()}")
            print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
            
            torch.backends.cudnn.benchmark = True
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cuda.matmul.allow_tf32 = True
            
            if MIXED_PRECISION:
                print("Mixed precision enabled")
                self.scaler = torch.amp.GradScaler('cuda')
            else:
                self.scaler = None
    
    def create_model(self, dropout=0.3, freeze_layers=3):
        print("Loading EfficientNet-B4 (optimized for RTX 5090)")
        model = models.efficientnet_b4(weights='DEFAULT')
        
        layers_to_freeze = list(model.features.children())[:freeze_layers]
        for layer in layers_to_freeze:
            for param in layer.parameters():
                param.requires_grad = False
        
        frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Model parameters: {trainable_params:,} trainable, {frozen_params:,} frozen")
        
        original_features = model.classifier[1].in_features
        model.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(original_features, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout * 0.5),
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout * 0.25),
            nn.Linear(512, self.num_classes)
        )
        
        return model.to(self.device)
    
    def get_optimizer(self, model, opt_type, backbone_lr, classifier_lr, weight_decay):
        backbone_params = []
        classifier_params = []
        
        for name, param in model.named_parameters():
            if param.requires_grad:
                if 'classifier' in name:
                    classifier_params.append(param)
                else:
                    backbone_params.append(param)
        
        param_groups = [
            {'params': backbone_params, 'lr': backbone_lr},
            {'params': classifier_params, 'lr': classifier_lr}
        ]
        
        print(f"Optimizer groups: {len(backbone_params)} backbone, {len(classifier_params)} classifier params")
        
        if opt_type == 'adamw':
            return optim.AdamW(param_groups, weight_decay=weight_decay, fused=True)
        elif opt_type == 'sgd':
            return optim.SGD(param_groups, weight_decay=weight_decay, momentum=0.9, fused=True)
        else:
            raise ValueError(f"Unknown optimizer: {opt_type}")
    
    def get_scheduler(self, optimizer, scheduler_type, max_epochs):
        if scheduler_type == 'cosine':
            return optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_epochs, eta_min=1e-6)
        elif scheduler_type == 'plateau':
            return optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=12, factor=0.5, verbose=False)
        else:
            raise ValueError(f"Unknown scheduler: {scheduler_type}")
    
    def train_with_hyperparams(self, train_loader, val_loader, test_loader, hyperparams, fold_num):
        print(f"Creating model for fold {fold_num}...")
        model = self.create_model(
            dropout=hyperparams['dropout'], 
            freeze_layers=hyperparams['freeze_layers']
        )
        
        optimizer = self.get_optimizer(
            model, hyperparams['optimizer'], 
            hyperparams['backbone_lr'], hyperparams['classifier_lr'], 
            hyperparams['weight_decay']
        )
        
        scheduler = self.get_scheduler(optimizer, hyperparams['scheduler'], 100)
        criterion = nn.CrossEntropyLoss(label_smoothing=hyperparams['label_smoothing'])
        
        start_memory = torch.cuda.memory_allocated() / 1e9
        print(f"Starting VRAM usage: {start_memory:.1f} GB")
        
        best_val_acc = 0.0
        patience = 35
        patience_counter = 0
        best_state = None
        
        for epoch in range(100):
            model.train()
            train_correct = 0
            train_total = 0
            epoch_start_time = time.time()
            
            for batch_idx, (images, labels) in enumerate(train_loader):
                images, labels = images.to(self.device), labels.to(self.device)
                
                optimizer.zero_grad()
                
                if MIXED_PRECISION and self.scaler:
                    with torch.amp.autocast('cuda'):
                        outputs = model(images)
                        loss = criterion(outputs, labels)
                    
                    self.scaler.scale(loss).backward()
                    self.scaler.step(optimizer)
                    self.scaler.update()
                else:
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()
                
                _, predicted = torch.max(outputs, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()
                
                if batch_idx == 0:
                    current_memory = torch.cuda.memory_allocated() / 1e9
                    print(f"    Training VRAM: {current_memory:.1f} GB", end="")
            
            model.eval()
            val_correct = 0
            val_total = 0
            
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(self.device), labels.to(self.device)
                    
                    if MIXED_PRECISION:
                        with torch.amp.autocast('cuda'):
                            outputs = model(images)
                    else:
                        outputs = model(images)
                    
                    _, predicted = torch.max(outputs, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()
            
            train_acc = 100 * train_correct / train_total
            val_acc = 100 * val_correct / val_total
            epoch_time = time.time() - epoch_start_time
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_counter = 0
                best_state = model.state_dict().copy()
            else:
                patience_counter += 1
            
            if hyperparams['scheduler'] == 'plateau':
                scheduler.step(val_acc)
            else:
                scheduler.step()
            
            if epoch % 25 == 0 and epoch > 0:
                print(f" - Epoch {epoch}: Train {train_acc:.1f}%, Val {val_acc:.1f}% ({epoch_time:.1f}s)")
            
            if patience_counter >= patience:
                print(f"    Early stopping at epoch {epoch}")
                break
            
            if epoch % 10 == 0:
                torch.cuda.empty_cache()
        
        if best_state is not None:
            model.load_state_dict(best_state)
        
        model.eval()
        test_correct = 0
        test_total = 0
        
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                
                if MIXED_PRECISION:
                    with torch.amp.autocast('cuda'):
                        outputs1 = model(images)
                        outputs2 = model(torch.flip(images, [3]))
                        outputs = (outputs1 + outputs2) / 2
                else:
                    outputs1 = model(images)
                    outputs2 = model(torch.flip(images, [3]))
                    outputs = (outputs1 + outputs2) / 2
                
                _, predicted = torch.max(outputs, 1)
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()
        
        test_acc = 100 * test_correct / test_total
        multiplicative_score = (best_val_acc / 100) * (test_acc / 100) * 10000
        
        if multiplicative_score > self.best_multiplicative_score:
            self.best_multiplicative_score = multiplicative_score
            
            save_path = os.path.join(self.save_dir, f"best_rtx5090_mult_{multiplicative_score:.1f}_val_{best_val_acc:.1f}_test_{test_acc:.1f}_fold_{fold_num}.pth")
            torch.save({
                'model_state_dict': model.state_dict(),
                'hyperparams': hyperparams,
                'val_accuracy': best_val_acc,
                'test_accuracy': test_acc,
                'train_accuracy': train_acc,
                'multiplicative_score': multiplicative_score,
                'fold_number': fold_num,
                'model_info': 'EfficientNet-B4_576x576_batch384'
            }, save_path)
            print(f"    NEW GLOBAL BEST! Mult: {multiplicative_score:.1f}, Val: {best_val_acc:.1f}%, Test: {test_acc:.1f}% (Fold {fold_num})")
            return True
            
        return False
    
    def run_fold_search(self, images, ages, sources):
        print(f"RTX 5090 Optimized Training - {NUM_FOLDS} folds")
        print(f"Configuration: EfficientNet-B4, {IMAGE_SIZE[0]}x{IMAGE_SIZE[1]}, batch_size={FIXED_HYPERPARAMS['batch_size']}")
        print(f"Target VRAM usage: ~27GB")
        
        unique_ages = sorted(list(set(ages)))
        label_mapping = {age: i for i, age in enumerate(unique_ages)}
        y_indices = np.array([label_mapping[age] for age in ages])
        
        print(f"Classes: {len(unique_ages)}")
        print(f"Label mapping: {label_mapping}")
        
        missing_keys = list(MISSING_HYPERPARAMS.keys())
        missing_values = list(MISSING_HYPERPARAMS.values())
        missing_combos = list(itertools.product(*missing_values))
        
        best_found = False
        
        for combo_idx, missing_combo in enumerate(missing_combos, 1):
            hyperparams = FIXED_HYPERPARAMS.copy()
            for key, value in zip(missing_keys, missing_combo):
                hyperparams[key] = value
            
            print(f"\n[Config {combo_idx}/{len(missing_combos)}] Testing:")
            print(f"  EfficientNet-B4, batch={hyperparams['batch_size']}, lr={hyperparams['backbone_lr']}/{hyperparams['classifier_lr']}")
            print(f"  wd={hyperparams['weight_decay']}, sched={hyperparams['scheduler']}, smooth={hyperparams['label_smoothing']}, aug={hyperparams['augmentation_strength']}")
            
            for fold in range(1, NUM_FOLDS + 1):
                random.seed(fold * 42)
                np.random.seed(fold * 42)
                torch.manual_seed(fold * 42)
                
                print(f"\n  [Fold {fold:2d}/{NUM_FOLDS}]", end=" ")
                
                try:
                    X_train, X_test, y_train, y_test = train_test_split(
                        images, y_indices, test_size=0.2, random_state=fold * 42, stratify=y_indices
                    )
                    
                    X_train_final, X_val, y_train_final, y_val = train_test_split(
                        X_train, y_train, test_size=0.2, random_state=fold * 42 + 1, stratify=y_train
                    )
                    
                    train_dataset = MemoryEfficientDataset(X_train_final, y_train_final, 
                                                         hyperparams['augmentation_strength'], AUGMENTATION_TARGET, True)
                    val_dataset = MemoryEfficientDataset(X_val, y_val, 'medium', 100, False)
                    test_dataset = MemoryEfficientDataset(X_test, y_test, 'medium', 100, False)
                    
                    train_loader = DataLoader(train_dataset, batch_size=hyperparams['batch_size'], 
                                            shuffle=True, num_workers=0)
                    val_loader = DataLoader(val_dataset, batch_size=hyperparams['batch_size'], 
                                          shuffle=False, num_workers=0)
                    test_loader = DataLoader(test_dataset, batch_size=hyperparams['batch_size'], 
                                           shuffle=False, num_workers=0)
                    
                    was_best = self.train_with_hyperparams(
                        train_loader, val_loader, test_loader, hyperparams, fold
                    )
                    
                    if was_best:
                        best_found = True
                    
                    # Aggressive cleanup
                    del train_dataset, val_dataset, test_dataset, train_loader, val_loader, test_loader, model
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
                    gc.collect()
                    
                    allocated = torch.cuda.memory_allocated() / 1e9
                    if allocated > 5.0:
                        print(f"    Warning: {allocated:.1f}GB still allocated")
                        torch.cuda.empty_cache()
                        torch.cuda.synchronize()
                        gc.collect()
                    
                except Exception as e:
                    print(f"FAILED: {str(e)}")
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
                    gc.collect()
                    continue
        
        print(f"\n{'='*80}")
        print(f"RTX 5090 OPTIMIZED TRAINING COMPLETE")
        print(f"{'='*80}")
        print(f"Best Multiplicative Score: {self.best_multiplicative_score:.1f}")
        print(f"Configuration: EfficientNet-B4, {IMAGE_SIZE[0]}x{IMAGE_SIZE[1]}, batch_size={FIXED_HYPERPARAMS['batch_size']}")
        print(f"Results saved to: {self.save_dir}")
        
        return best_found

def main():
    import gc
    import time
    import torch
    # Force clean start
    #torch.cuda.empty_cache()
    #torch.cuda.reset_peak_memory_stats()
    #torch.cuda.synchronize()
    
    #print(f"After cleanup - Allocated: {torch.cuda.memory_allocated() / 1e9:.3f} GB")
    #print(f"After cleanup - Reserved: {torch.cuda.memory_reserved() / 1e9:.3f} GB")
    
    print("Minimal Test Mode for RTX 5090")
    print("=" * 60)
    
     #gc.collect()
    
    #torch.cuda.empty_cache()    
    
    start_time = time.time()
    
    images, ages, sources = load_combined_data()
    
    trainer = MinimalTestModel(num_classes=len(set(ages)))
    trainer.run_test(images, ages, sources)
    
    elapsed = (time.time() - start_time) / 60
    print(f"\nTest completed in: {elapsed:.1f} minutes")

if __name__ == "__main__":
    main()

Minimal Test Mode for RTX 5090
Loading color images...
Loaded 201 color images
Loading grayscale images...
Loaded 40 grayscale images
Total images: 241
Final dataset: 241 images
Age distribution: {2.5: 41, 3.5: 50, 4.5: 57, 5.5: 60, 1.5: 33}


NameError: name 'MinimalTestModel' is not defined

In [4]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.models as models
import numpy as np
import cv2
import random
import json
import os
import glob
import itertools
import gc
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Check PyTorch and CUDA setup first
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"Initial allocated: {torch.cuda.memory_allocated() / 1e9:.3f} GB")
    print(f"Initial reserved: {torch.cuda.memory_reserved() / 1e9:.3f} GB")

# MINIMAL CONFIGURATION - Testing if basic functionality works
FIXED_HYPERPARAMS = {
    'backbone_lr': 0.0001,
    'classifier_lr': 0.0005,
    'batch_size': 16,  # Very small to test basic functionality
    'optimizer': 'adamw',
    'dropout': 0.3,
    'freeze_layers': 4
}

# Minimal hyperparameter testing
MISSING_HYPERPARAMS = {
    'weight_decay': [0.05],
    'scheduler': ['cosine'],
    'label_smoothing': [0.1],
    'augmentation_strength': ['light']
}

# Conservative settings for testing
IMAGE_SIZE = (512, 512)  # Standard ImageNet size
AUGMENTATION_TARGET = 100  # Very small for testing
NUM_FOLDS = 2  # Just test 2 folds
NUM_WORKERS = 0
MIXED_PRECISION = False  # Disable for testing
COMPILE_MODEL = False

def detect_and_convert_image(image):
    if len(image.shape) == 2:
        return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    elif len(image.shape) == 3:
        if image.shape[2] == 1:
            return cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 3:
            return image
        elif image.shape[2] == 4:
            return cv2.cvtColor(image, cv2.COLOR_BGRA2RGB)
    return image

def load_combined_data():
    color_path = "D:\\Dropbox\\AI Projects\\buck\\images\\squared\\color\\*_NDA.png"
    gray_path = "D:\\Dropbox\\AI Projects\\buck\\images\\squared\\grayscale\\*_NDA.png"
    
    images = []
    ages = []
    sources = []
    
    print("Loading color images...")
    color_files = glob.glob(color_path)
    for img_path in color_files:
        try:
            img = cv2.imread(img_path)
            if img is None:
                continue
            
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = detect_and_convert_image(img)
            img_resized = cv2.resize(img, IMAGE_SIZE[::-1])
            
            filename = os.path.basename(img_path)
            filename_no_ext = os.path.splitext(filename)[0]
            parts = filename_no_ext.split('_')
            
            if len(parts) < 5:
                continue
            
            age_part = parts[3]
            if 'xpx' in age_part.lower() or 'p' not in age_part:
                continue
            
            try:
                age_value = float(age_part.replace('p', '.'))
                images.append(img_resized)
                ages.append(age_value)
                sources.append('color')
            except ValueError:
                continue
                
        except Exception as e:
            continue
    
    print(f"Loaded {len([s for s in sources if s == 'color'])} color images")
    
    print("Loading grayscale images...")
    gray_files = glob.glob(gray_path)
    for img_path in gray_files:
        try:
            img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
            if img is None:
                continue
            
            img = detect_and_convert_image(img)
            img_resized = cv2.resize(img, IMAGE_SIZE[::-1])
            
            filename = os.path.basename(img_path)
            filename_no_ext = os.path.splitext(filename)[0]
            parts = filename_no_ext.split('_')
            
            if len(parts) < 5:
                continue
            
            age_part = parts[3]
            if 'xpx' in age_part.lower() or 'p' not in age_part:
                continue
            
            try:
                age_value = float(age_part.replace('p', '.'))
                images.append(img_resized)
                ages.append(age_value)
                sources.append('grayscale')
            except ValueError:
                continue
                
        except Exception as e:
            continue
    
    print(f"Loaded {len([s for s in sources if s == 'grayscale'])} grayscale images")
    print(f"Total images: {len(images)}")
    
    ages_grouped = [5.5 if age >= 5.5 else age for age in ages]
    
    age_counts = Counter(ages_grouped)
    valid_ages = {age for age, count in age_counts.items() if count >= 3}
    
    filtered_images = []
    filtered_ages = []
    filtered_sources = []
    
    for img, age, source in zip(images, ages_grouped, sources):
        if age in valid_ages:
            filtered_images.append(img)
            filtered_ages.append(age)
            filtered_sources.append(source)
    
    print(f"Final dataset: {len(filtered_images)} images")
    print(f"Age distribution: {dict(Counter(filtered_ages))}")
    
    return np.array(filtered_images, dtype=np.uint8), filtered_ages, filtered_sources

def enhanced_augment_image(image, strength='light'):
    if image.dtype != np.uint8:
        image = image.astype(np.uint8)
    
    # Very light augmentation for testing
    if strength == 'light':
        if random.random() < 0.3:
            image = cv2.flip(image, 1)
        if random.random() < 0.2:
            angle = random.uniform(-5, 5)
            h, w = image.shape[:2]
            M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
            image = cv2.warpAffine(image, M, (w, h))
    
    return image

class SimpleDataset(Dataset):
    def __init__(self, base_images, labels, aug_strength='light', target_per_class=100, training=True):
        self.base_images = base_images
        self.labels = np.array(labels)
        self.aug_strength = aug_strength
        self.training = training
        self.target_per_class = target_per_class
        
        unique_classes = np.unique(labels)
        self.class_to_indices = {}
        for cls in unique_classes:
            self.class_to_indices[cls] = np.where(self.labels == cls)[0]
        
        self.num_classes = len(unique_classes)
        self.class_list = sorted(unique_classes)
        self.length = self.num_classes * self.target_per_class
        
        self.mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(3, 1, 1)
        self.std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(3, 1, 1)
        
        print(f"Simple dataset: {self.length} samples from {len(base_images)} base images")
    
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        class_idx = idx // self.target_per_class
        within_class_idx = idx % self.target_per_class
        
        target_class = self.class_list[class_idx]
        available_indices = self.class_to_indices[target_class]
        
        base_idx = available_indices[within_class_idx % len(available_indices)]
        image = self.base_images[base_idx].copy()
        
        if self.training and within_class_idx >= len(available_indices):
            image = enhanced_augment_image(image, self.aug_strength)
        
        image = image.astype(np.float32) / 255.0
        if len(image.shape) == 3:
            image = image.transpose(2, 0, 1)
        
        image = (image - self.mean) / self.std
        
        return torch.from_numpy(image.astype(np.float32)), target_class

class MinimalTestModel:
    def __init__(self, num_classes, save_dir=None):
        self.num_classes = num_classes
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        if save_dir is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.save_dir = f"minimal_test_{timestamp}"
        else:
            self.save_dir = save_dir
        
        os.makedirs(self.save_dir, exist_ok=True)
        self.best_multiplicative_score = 0.0
        
        print(f"Using device: {self.device}")
        if torch.cuda.is_available():
            print(f"GPU: {torch.cuda.get_device_name()}")
            print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    
    def create_model(self, dropout=0.3, freeze_layers=4):
        print("Loading ResNet18 (minimal model for testing)")
        model = models.resnet18(weights='DEFAULT')
        
        # Freeze early layers
        layers_to_freeze = [model.conv1, model.bn1, model.layer1]
        if freeze_layers >= 2:
            layers_to_freeze.append(model.layer2)
        
        for layer in layers_to_freeze:
            for param in layer.parameters():
                param.requires_grad = False
        
        frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Model parameters: {trainable_params:,} trainable, {frozen_params:,} frozen")
        
        # Simple classifier
        model.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(model.fc.in_features, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout * 0.5),
            nn.Linear(128, self.num_classes)
        )
        
        return model.to(self.device)
    
    def get_optimizer(self, model, opt_type, backbone_lr, classifier_lr, weight_decay):
        backbone_params = []
        classifier_params = []
        
        for name, param in model.named_parameters():
            if param.requires_grad:
                if 'fc' in name:
                    classifier_params.append(param)
                else:
                    backbone_params.append(param)
        
        param_groups = [
            {'params': backbone_params, 'lr': backbone_lr},
            {'params': classifier_params, 'lr': classifier_lr}
        ]
        
        print(f"Optimizer groups: {len(backbone_params)} backbone, {len(classifier_params)} classifier params")
        
        if opt_type == 'adamw':
            return optim.AdamW(param_groups, weight_decay=weight_decay)
        else:
            raise ValueError(f"Unknown optimizer: {opt_type}")
    
    def get_scheduler(self, optimizer, scheduler_type, max_epochs):
        if scheduler_type == 'cosine':
            return optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_epochs, eta_min=1e-6)
        else:
            raise ValueError(f"Unknown scheduler: {scheduler_type}")
    
    def train_with_hyperparams(self, train_loader, val_loader, test_loader, hyperparams, fold_num):
        print(f"Creating model for fold {fold_num}...")
        
        # Memory check before model creation
        allocated_before = torch.cuda.memory_allocated() / 1e9
        reserved_before = torch.cuda.memory_reserved() / 1e9
        print(f"Before model creation - Allocated: {allocated_before:.3f} GB, Reserved: {reserved_before:.3f} GB")
        
        model = self.create_model(
            dropout=hyperparams['dropout'], 
            freeze_layers=hyperparams['freeze_layers']
        )
        
        allocated_after = torch.cuda.memory_allocated() / 1e9
        reserved_after = torch.cuda.memory_reserved() / 1e9
        print(f"After model creation - Allocated: {allocated_after:.3f} GB, Reserved: {reserved_after:.3f} GB")
        
        optimizer = self.get_optimizer(
            model, hyperparams['optimizer'], 
            hyperparams['backbone_lr'], hyperparams['classifier_lr'], 
            hyperparams['weight_decay']
        )
        
        scheduler = self.get_scheduler(optimizer, hyperparams['scheduler'], 20)
        criterion = nn.CrossEntropyLoss(label_smoothing=hyperparams['label_smoothing'])
        
        allocated_final = torch.cuda.memory_allocated() / 1e9
        print(f"Ready to train - Allocated: {allocated_final:.3f} GB")
        
        best_val_acc = 0.0
        patience = 10
        patience_counter = 0
        best_state = None
        
        for epoch in range(20):  # Short training for testing
            model.train()
            train_correct = 0
            train_total = 0
            
            for batch_idx, (images, labels) in enumerate(train_loader):
                images, labels = images.to(self.device), labels.to(self.device)
                
                if batch_idx == 0:
                    batch_memory = torch.cuda.memory_allocated() / 1e9
                    print(f"    First batch loaded - VRAM: {batch_memory:.3f} GB")
                
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                
                _, predicted = torch.max(outputs, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()
            
            # Validation
            model.eval()
            val_correct = 0
            val_total = 0
            
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(self.device), labels.to(self.device)
                    outputs = model(images)
                    
                    _, predicted = torch.max(outputs, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()
            
            train_acc = 100 * train_correct / train_total
            val_acc = 100 * val_correct / val_total
            
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_counter = 0
                best_state = model.state_dict().copy()
            else:
                patience_counter += 1
            
            scheduler.step()
            
            if epoch % 5 == 0:
                print(f"    Epoch {epoch}: Train {train_acc:.1f}%, Val {val_acc:.1f}%")
            
            if patience_counter >= patience:
                print(f"    Early stopping at epoch {epoch}")
                break
        
        print(f"    Fold {fold_num} completed - Best val acc: {best_val_acc:.1f}%")
        return False  # Don't save models in test mode
    
    def run_test(self, images, ages, sources):
        print(f"MINIMAL TEST MODE - {NUM_FOLDS} folds only")
        print(f"Configuration: ResNet18, {IMAGE_SIZE[0]}x{IMAGE_SIZE[1]}, batch_size={FIXED_HYPERPARAMS['batch_size']}")
        
        unique_ages = sorted(list(set(ages)))
        label_mapping = {age: i for i, age in enumerate(unique_ages)}
        y_indices = np.array([label_mapping[age] for age in ages])
        
        print(f"Classes: {len(unique_ages)}")
        
        hyperparams = FIXED_HYPERPARAMS.copy()
        for key, value in zip(list(MISSING_HYPERPARAMS.keys()), [v[0] for v in MISSING_HYPERPARAMS.values()]):
            hyperparams[key] = value
        
        print(f"Testing hyperparams: {hyperparams}")
        
        for fold in range(1, NUM_FOLDS + 1):
            print(f"\n[Fold {fold}/{NUM_FOLDS}]")
            
            try:
                X_train, X_test, y_train, y_test = train_test_split(
                    images, y_indices, test_size=0.2, random_state=fold * 42, stratify=y_indices
                )
                
                X_train_final, X_val, y_train_final, y_val = train_test_split(
                    X_train, y_train, test_size=0.2, random_state=fold * 42 + 1, stratify=y_train
                )
                
                train_dataset = SimpleDataset(X_train_final, y_train_final, 
                                           hyperparams['augmentation_strength'], AUGMENTATION_TARGET, True)
                val_dataset = SimpleDataset(X_val, y_val, 'light', 50, False)
                test_dataset = SimpleDataset(X_test, y_test, 'light', 50, False)
                
                train_loader = DataLoader(train_dataset, batch_size=hyperparams['batch_size'], 
                                        shuffle=True, num_workers=0)
                val_loader = DataLoader(val_dataset, batch_size=hyperparams['batch_size'], 
                                      shuffle=False, num_workers=0)
                test_loader = DataLoader(test_dataset, batch_size=hyperparams['batch_size'], 
                                       shuffle=False, num_workers=0)
                
                self.train_with_hyperparams(train_loader, val_loader, test_loader, hyperparams, fold)
                
                # Cleanup
                del train_dataset, val_dataset, test_dataset, train_loader, val_loader, test_loader
                torch.cuda.empty_cache()
                gc.collect()
                
                final_memory = torch.cuda.memory_allocated() / 1e9
                print(f"    After cleanup: {final_memory:.3f} GB allocated")
                
            except Exception as e:
                print(f"FAILED: {str(e)}")
                torch.cuda.empty_cache()
                gc.collect()
                continue
        
        print(f"\nMINIMAL TEST COMPLETE")

def main():
    import gc
    import time
    import torch
    # Force clean start
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

    gc.collect()
  
    images, ages, sources = load_combined_data()
    
    trainer = MinimalTestModel(num_classes=len(set(ages)))
    trainer.run_test(images, ages, sources)
    
    elapsed = (time.time() - start_time) / 60
    print(f"\nTest completed in: {elapsed:.1f} minutes")

if __name__ == "__main__":
    main()

PyTorch version: 2.10.0.dev20250910+cu128
CUDA available: True
CUDA version: 12.8
GPU count: 1
GPU name: NVIDIA GeForce RTX 5090
GPU memory: 34.2 GB
Initial allocated: 0.018 GB
Initial reserved: 0.107 GB
Loading color images...
Loaded 201 color images
Loading grayscale images...
Loaded 40 grayscale images
Total images: 241
Final dataset: 241 images
Age distribution: {2.5: 41, 3.5: 50, 4.5: 57, 5.5: 60, 1.5: 33}
Using device: cuda
GPU: NVIDIA GeForce RTX 5090
GPU Memory: 34.2 GB
MINIMAL TEST MODE - 2 folds only
Configuration: ResNet18, 512x512, batch_size=16
Classes: 5
Testing hyperparams: {'backbone_lr': 0.0001, 'classifier_lr': 0.0005, 'batch_size': 16, 'optimizer': 'adamw', 'dropout': 0.3, 'freeze_layers': 4, 'weight_decay': 0.05, 'scheduler': 'cosine', 'label_smoothing': 0.1, 'augmentation_strength': 'light'}

[Fold 1/2]
Simple dataset: 500 samples from 153 base images
Simple dataset: 250 samples from 39 base images
Simple dataset: 250 samples from 49 base images
Creating model for 

NameError: name 'start_time' is not defined

In [None]:
import torch
print("Simple CUDA test:")
x = torch.randn(10, 10).cuda()
print(f"Created small tensor, allocated: {torch.cuda.memory_allocated() / 1e9:.3f} GB")