In [1]:
# workspace/notebooks/gpu/real_world_test.ipynb

import os
os.environ['MIOPEN_DISABLE_CACHE'] = '1'
os.environ['MIOPEN_DEBUG_DISABLE_FIND_DB'] = '1'

# Now run your code
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import time

print("=== Real-World Computer Vision Test (with MIOpen fixes) ===")

if torch.cuda.is_available():
    device = torch.device('cuda:0')
    
    # Load CIFAR-10 with REASONABLE batch size
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    
    # REDUCE batch size - 2048 is too aggressive for BN layers
    batch_size = 512  # Still large but more reasonable
    
    trainset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform
    )
    trainloader = DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=2  # Reduce workers
    )
    
    print(f"Dataset: CIFAR-10")
    print(f"Batch size: {batch_size} (adjusted for ROCm)")
    print(f"Training samples: {len(trainset)}")
    
    # Create ResNet model WITH gradient checkpointing
    from torchvision.models import resnet50
    
    # Disable cudnn benchmark to avoid compilation issues
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False  # Try disabling cudnn
    
    model = resnet50(weights=None, num_classes=10).to(device)
    
    # Set model to eval mode temporarily to avoid BN training issues
    model.train()
    
    print(f"\nModel: ResNet50")
    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Test training speed
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    
    print("\nTraining for 3 batches...")
    
    data_iter = iter(trainloader)
    batch_times = []
    
    for i in range(3):  # Fewer batches
        try:
            inputs, labels = next(data_iter)
        except StopIteration:
            data_iter = iter(trainloader)
            inputs, labels = next(data_iter)
        
        inputs, labels = inputs.to(device), labels.to(device)
        
        start = time.time()
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        torch.cuda.synchronize()
        batch_time = time.time() - start
        
        batch_times.append(batch_time)
        print(f"Batch {i+1}: loss={loss.item():.4f}, time={batch_time:.3f}s")
    
    avg_time = sum(batch_times) / len(batch_times)
    samples_per_sec = batch_size / avg_time
    
    print(f"\n✓ Performance Summary:")
    print(f"  Average batch time: {avg_time:.3f} seconds")
    print(f"  Throughput: {samples_per_sec:.0f} samples/second")
    print(f"  Memory used: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    
else:
    print("No GPU available!")

print("\n" + "="*60)
print("REAL-WORLD TEST COMPLETE!")
print("="*60)

=== Real-World Computer Vision Test (with MIOpen fixes) ===


  entry = pickle.load(f, encoding="latin1")


Dataset: CIFAR-10
Batch size: 512 (adjusted for ROCm)
Training samples: 50000

Model: ResNet50
Parameters: 23,528,522

Training for 3 batches...
Batch 1: loss=2.7111, time=2.349s
Batch 2: loss=2.7185, time=1.870s
Batch 3: loss=3.0227, time=1.871s

✓ Performance Summary:
  Average batch time: 2.030 seconds
  Throughput: 252 samples/second
  Memory used: 0.45 GB

REAL-WORLD TEST COMPLETE!
