## Part 1: Setup and Imports

In [None]:
import os
import subprocess
import time
import csv
import torch
import numpy as np
from datetime import datetime
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.datasets import ImageFolder

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")


## Part 2: Configuration

**Edit this cell to change the rclone mount options and DataLoader settings before each run.**

In [None]:
# Output files: one benchmark row per run, and the GPU utilization samples.
RESULTS_CSV = 'training_benchmark_results.csv'
GPU_CSV = 'gpu_utilization.csv'

# Label stored with every result row; change it for each configuration tested.
RUN_ID = "test_001"

# Object-store source (rclone remote + container) and the local mount point.
RCLONE_REMOTE = "rclone_s3"  # or the swift configuration in the rclone config
RCLONE_CONTAINER = "object-persist-YOURNETID"  # Your container name
MOUNT_POINT = "/mnt/object"

# rclone mount tuning knobs, keyed by the option names build_mount_command maps to flags.
RCLONE_OPTIONS = dict(
    # --- VFS cache ---
    vfs_cache_mode='full',             # off, minimal, writes, full
    vfs_cache_max_size='20G',          # e.g., 5G, 10G, 20G, 50G
    vfs_cache_max_age='1h',            # e.g., 1h, 24h

    # --- Read performance ---
    vfs_read_chunk_size='64M',         # e.g., 16M, 64M, 128M, 256M
    vfs_read_chunk_size_limit='512M',  # e.g., 256M, 512M, off (unlimited)
    vfs_read_ahead='256M',             # e.g., 128M, 256M, 512M, 1G
    buffer_size='128M',                # e.g., 16M, 64M, 128M, 256M

    # --- Parallelism ---
    transfers='16',                    # e.g., 4, 8, 16, 32
    checkers='8',                      # e.g., 4, 8, 16

    # --- Directory caching ---
    dir_cache_time='30m',              # e.g., 5m, 30m, 1h
    attr_timeout='30s',                # e.g., 1s, 10s, 30s, 1m

    # --- Stability / retries ---
    low_level_retries='10',
    retries='3',
    contimeout='30s',
    timeout='120s',
)

# Settings forwarded to the PyTorch DataLoaders used by the benchmark.
DATALOADER_OPTIONS = dict(
    batch_size=64,
    num_workers=8,
    prefetch_factor=4,
    pin_memory=True,
)

# Echo the key settings so each run's output records what was tested.
print("Configuration loaded:")
for label, value in (
    ("Run ID", RUN_ID),
    ("Remote", f"{RCLONE_REMOTE}:{RCLONE_CONTAINER}"),
    ("Cache Mode", RCLONE_OPTIONS['vfs_cache_mode']),
    ("Batch Size", DATALOADER_OPTIONS['batch_size']),
    ("Num Workers", DATALOADER_OPTIONS['num_workers']),
):
    print(f"  {label}: {value}")

## Part 3: Mount Object Store

Run this cell to mount the object store with the configured options.

In [None]:
def build_mount_command(remote, container, mount_point, options):
    """Build the shell command that mounts an rclone remote read-only as a daemon.

    Args:
        remote: Name of the rclone remote.
        container: Container on the remote to mount.
        mount_point: Local directory to mount onto.
        options: Mapping of option keys (e.g. ``'vfs_cache_mode'``) to values.
            Keys not listed in the flag table below are ignored. A key whose
            value is ``None``, ``False`` or an empty string is skipped; any
            other value (including the integer ``0``) is emitted.

    Returns:
        A single command string meant to be executed through a shell. All
        caller-supplied tokens are shell-quoted, so paths or values containing
        spaces or shell metacharacters are passed to rclone as one argument.
    """
    import shlex

    # Option key -> rclone command-line flag; flags are emitted in this order.
    option_map = {
        'vfs_cache_mode': '--vfs-cache-mode',
        'vfs_cache_max_size': '--vfs-cache-max-size',
        'vfs_cache_max_age': '--vfs-cache-max-age',
        'vfs_read_chunk_size': '--vfs-read-chunk-size',
        'vfs_read_chunk_size_limit': '--vfs-read-chunk-size-limit',
        'vfs_read_ahead': '--vfs-read-ahead',
        'buffer_size': '--buffer-size',
        'transfers': '--transfers',
        'checkers': '--checkers',
        'dir_cache_time': '--dir-cache-time',
        'attr_timeout': '--attr-timeout',
        'low_level_retries': '--low-level-retries',
        'retries': '--retries',
        'contimeout': '--contimeout',
        'timeout': '--timeout',
    }

    parts = [
        "rclone", "mount",
        shlex.quote(f"{remote}:{container}"),
        shlex.quote(str(mount_point)),
        "--read-only", "--allow-other",
    ]

    for key, flag in option_map.items():
        value = options.get(key)
        # Skip "unset" markers only; a plain truthiness test would also drop
        # a legitimate integer 0 (e.g. retries=0).
        if value is None or value is False or value == "":
            continue
        parts.extend([flag, shlex.quote(str(value))])

    parts.append("--daemon")
    return " ".join(parts)

# Assemble the command for the current configuration and show it before it is run.
mount_cmd = build_mount_command(
    remote=RCLONE_REMOTE,
    container=RCLONE_CONTAINER,
    mount_point=MOUNT_POINT,
    options=RCLONE_OPTIONS,
)
print("Mount command:", mount_cmd, sep="\n")

In [None]:
# Drop any mount left over from a previous run. A non-zero exit is tolerated
# here because it also occurs when nothing was mounted.
print("Checking for existing mount...")
unmount_result = subprocess.run(f"fusermount -u {MOUNT_POINT}", shell=True, capture_output=True)
time.sleep(2)

unmount_message = (
    f"Unmounted existing mount at {MOUNT_POINT}"
    if unmount_result.returncode == 0
    else f"No existing mount at {MOUNT_POINT} or unmount failed, proceeding..."
)
print(unmount_message)

# Start the rclone daemon with the command built from the current configuration.
print(f"Mounting {RCLONE_REMOTE}:{RCLONE_CONTAINER} to {MOUNT_POINT}...")
result = subprocess.run(mount_cmd, shell=True, capture_output=True, text=True)

mount_started = result.returncode == 0
if not mount_started:
    print("ERROR: Mount failed!")
    print(f"stderr: {result.stderr}")
else:
    # Give the daemon a moment before inspecting the mount point.
    time.sleep(3)

    # A non-empty listing is taken as evidence that the mount is live.
    mount_contents = os.listdir(MOUNT_POINT) if os.path.exists(MOUNT_POINT) else []
    if mount_contents:
        print("SUCCESS: Mounted successfully!")
        print(f"Contents: {mount_contents}")
    else:
        print("WARNING: Mount point exists but appears empty")

## Part 4: Run Training Benchmark

This section trains the model with per-epoch validation and a final test-set evaluation, measuring both data loading performance and model accuracy.

In [None]:
# The dataset root is the mounted container itself.
DATA_DIR = MOUNT_POINT
print(f"Data directory: {DATA_DIR}")
mounted_entries = os.listdir(DATA_DIR)
print(f"Contents: {mounted_entries}")

In [None]:
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import gc

def _make_benchmark_loader(dataset, batch_size, shuffle, num_workers, prefetch_factor, pin_memory):
    """Build a DataLoader, passing worker-only options only when workers are used."""
    loader_kwargs = {
        'batch_size': batch_size,
        'shuffle': shuffle,
        'num_workers': num_workers,
        'pin_memory': pin_memory,
    }
    if num_workers > 0:
        # prefetch_factor / persistent_workers only apply to worker processes;
        # DataLoader raises ValueError if prefetch_factor is given with
        # num_workers == 0, so they are omitted in the single-process case.
        loader_kwargs['prefetch_factor'] = prefetch_factor
        loader_kwargs['persistent_workers'] = True
    return DataLoader(dataset, **loader_kwargs)


def _evaluate_model(model, loader, criterion, device, collect_outputs=False):
    """Run one gradient-free pass over ``loader`` with the model in eval mode.

    Returns:
        ``(mean_batch_loss, accuracy_percent, predictions, targets)``. The two
        lists hold per-sample class indices and are filled only when
        ``collect_outputs`` is true.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    predictions = []
    targets = []

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            if collect_outputs:
                predictions.extend(predicted.cpu().tolist())
                targets.extend(labels.cpu().tolist())

    mean_loss = running_loss / len(loader)
    accuracy = 100. * correct / total
    return mean_loss, accuracy, predictions, targets


def run_training_benchmark(data_dir, batch_size, num_workers, epochs=3, 
                          prefetch_factor=2, pin_memory=True, use_gpu=False):
    """
    Train a MobileNetV2 classifier head and measure data-loading vs. compute time.

    Reads three ImageFolder datasets from ``data_dir`` (``training``,
    ``validation``, ``evaluation``), trains only the classifier head (backbone
    frozen) for ``epochs`` epochs with validation after each epoch, then
    evaluates once on the ``evaluation`` split.

    Args:
        data_dir: Directory containing the three split sub-directories.
        batch_size: Batch size for all three DataLoaders.
        num_workers: DataLoader worker processes (0 = load in the main process).
        epochs: Number of training epochs; must be at least 1.
        prefetch_factor: Batches prefetched per worker; ignored when
            ``num_workers`` is 0.
        pin_memory: Forwarded to the DataLoaders.
        use_gpu: Request CUDA; falls back to CPU when CUDA is unavailable.

    Returns:
        dict of training, test, timing and configuration metrics. Timing notes:
        ``avg_data_loading_time_sec`` is the per-epoch time spent waiting for
        batches from the loader plus moving them to the device;
        ``total_training_time_sec`` (and therefore ``samples_per_sec_training``)
        also includes the per-epoch validation passes.

    Raises:
        ValueError: if ``epochs`` is less than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    print(f"\nSetting up training benchmark...")
    print(f"  Batch size: {batch_size}")
    print(f"  Num workers: {num_workers}")
    print(f"  Prefetch factor: {prefetch_factor}")
    print(f"  Pin memory: {pin_memory}")
    print(f"  Epochs: {epochs}")
    
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
    # Gate CUDA-only calls on the resolved device, not on the request flag, so
    # use_gpu=True on a CPU-only host does not call torch.cuda.synchronize().
    on_cuda = device.type == "cuda"
    print(f"  Device: {device}")
    
    # Define transforms - aligned with train.py
    imagenet_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    train_transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(15),
        transforms.ColorJitter(
            brightness=0.2,
            contrast=0.2,
            saturation=0.2,
            hue=0.1
        ),
        transforms.ToTensor(),
        imagenet_normalize,
    ])
    
    val_test_transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        imagenet_normalize,
    ])
    
    # Create datasets
    train_dataset = ImageFolder(os.path.join(data_dir, 'training'), transform=train_transform)
    val_dataset = ImageFolder(os.path.join(data_dir, 'validation'), transform=val_test_transform)
    eval_dataset = ImageFolder(os.path.join(data_dir, 'evaluation'), transform=val_test_transform)
    
    print(f"\nDataset sizes:")
    print(f"  Training: {len(train_dataset)} images")
    print(f"  Validation: {len(val_dataset)} images")
    print(f"  Evaluation (test): {len(eval_dataset)} images")
    print(f"  Classes: {len(train_dataset.classes)}")
    
    # Create data loaders (only the training loader shuffles)
    train_loader = _make_benchmark_loader(
        train_dataset, batch_size, True, num_workers, prefetch_factor, pin_memory)
    val_loader = _make_benchmark_loader(
        val_dataset, batch_size, False, num_workers, prefetch_factor, pin_memory)
    eval_loader = _make_benchmark_loader(
        eval_dataset, batch_size, False, num_workers, prefetch_factor, pin_memory)
    
    # Create MobileNetV2 model - aligned with train.py architecture
    model = models.mobilenet_v2(weights='MobileNet_V2_Weights.DEFAULT')
    
    # Size the classifier from the dataset instead of hard-coding the class
    # count (train.py uses 11), so the output layer always matches the labels.
    num_classes = len(train_dataset.classes)
    num_ftrs = model.last_channel
    model.classifier = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(num_ftrs, num_classes)
    )
    model = model.to(device)
    
    # For benchmark, freeze backbone features (similar to train.py approach)
    for param in model.features.parameters():
        param.requires_grad = False
    
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"  Trainable parameters: {trainable_params:,}")
    
    # Training setup - aligned with train.py
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.classifier.parameters(), lr=0.0001)  # 1e-4 as in train.py
    
    # Training metrics tracking
    training_start_time = time.perf_counter()
    epoch_times = []
    data_loading_times = []
    batch_processing_times = []
    training_losses = []
    validation_accuracies = []
    
    best_val_loss = float('inf')
    
    print(f"\nStarting training...")
    
    # Training loop
    for epoch in range(epochs):
        epoch_start = time.perf_counter()
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        epoch_data_time = 0.0
        epoch_compute_time = 0.0
        
        print(f"\nEpoch {epoch + 1}/{epochs}")
        print("-" * 50)
        
        # The wait clock starts BEFORE the loader is asked for a batch, so the
        # time blocked on the DataLoader (i.e. on storage) is actually measured.
        # Starting it inside the loop body would only time the device copy.
        batch_wait_start = time.perf_counter()
        for i, (images, labels) in enumerate(train_loader):
            # Move to device
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            
            if on_cuda:
                torch.cuda.synchronize()
            
            data_time = time.perf_counter() - batch_wait_start
            epoch_data_time += data_time
            
            # Training step
            compute_start = time.perf_counter()
            
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            if on_cuda:
                # Wait for queued kernels so compute time is not under-counted.
                torch.cuda.synchronize()
            
            compute_time = time.perf_counter() - compute_start
            epoch_compute_time += compute_time
            
            # Statistics - following train.py pattern
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            
            # Progress update every 50 batches
            if (i + 1) % 50 == 0:
                avg_loss = running_loss / (i + 1)
                train_acc = 100. * correct / total
                data_throughput = total / (epoch_data_time + 1e-6)
                print(f"  Batch {i + 1:4d}: Loss={avg_loss:.4f}, "
                      f"Train Acc={train_acc:.2f}%, "
                      f"Data throughput={data_throughput:.1f} samples/sec")
            
            # Restart the wait clock last so the bookkeeping above is not
            # billed to data loading.
            batch_wait_start = time.perf_counter()
        
        epoch_time = time.perf_counter() - epoch_start
        epoch_times.append(epoch_time)
        data_loading_times.append(epoch_data_time)
        batch_processing_times.append(epoch_compute_time)
        
        # Calculate epoch metrics
        train_loss = running_loss / len(train_loader)
        train_acc = 100. * correct / total
        training_losses.append(train_loss)
        
        # Validation - following train.py validation function pattern
        val_loss, val_accuracy, _, _ = _evaluate_model(model, val_loader, criterion, device)
        validation_accuracies.append(val_accuracy)
        
        print(f"  Epoch {epoch + 1} Results:")
        print(f"    Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.2f}%")
        print(f"    Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")
        print(f"    Epoch Time: {epoch_time:.2f}s")
        print(f"    Data Loading Time: {epoch_data_time:.2f}s ({100*epoch_data_time/epoch_time:.1f}%)")
        print(f"    Compute Time: {epoch_compute_time:.2f}s ({100*epoch_compute_time/epoch_time:.1f}%)")
        
        # Track the best validation loss (the benchmark writes no checkpoint).
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print("    Validation loss improved. Best model updated.")
    
    total_training_time = time.perf_counter() - training_start_time
    
    # Final evaluation on test set
    print(f"\nFinal evaluation on test set...")
    test_start_time = time.perf_counter()
    test_loss, test_accuracy, all_predictions, all_labels = _evaluate_model(
        model, eval_loader, criterion, device, collect_outputs=True)
    test_time = time.perf_counter() - test_start_time
    
    # Additional metrics using sklearn
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
    
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
    
    # Compile results
    results = {
        # Training metrics
        'total_training_time_sec': total_training_time,
        'epochs': epochs,
        'final_training_loss': training_losses[-1],
        'best_val_accuracy': max(validation_accuracies),
        'final_val_accuracy': validation_accuracies[-1],
        'best_val_loss': best_val_loss,
        
        # Test metrics
        'test_loss': test_loss,
        'test_accuracy': test_accuracy,
        'test_precision': precision * 100,
        'test_recall': recall * 100,
        'test_f1': f1 * 100,
        'test_inference_time_sec': test_time,
        
        # Data loading performance
        'avg_epoch_time_sec': np.mean(epoch_times),
        'avg_data_loading_time_sec': np.mean(data_loading_times),
        'avg_compute_time_sec': np.mean(batch_processing_times),
        'data_loading_ratio': np.mean(data_loading_times) / np.mean(epoch_times),
        'samples_per_sec_training': len(train_dataset) * epochs / total_training_time,
        'total_samples_processed': len(train_dataset) * epochs,
        
        # System metrics
        'batch_size': batch_size,
        'num_workers': num_workers,
        'total_train_samples': len(train_dataset),
        'total_val_samples': len(val_dataset),
        'total_test_samples': len(eval_dataset),
        'trainable_parameters': trainable_params,
    }
    
    # Cleanup: drop the model, loaders and datasets, then release cached GPU memory.
    del model, train_loader, val_loader, eval_loader
    del train_dataset, val_dataset, eval_dataset
    if on_cuda:
        torch.cuda.empty_cache()
    gc.collect()
    
    return results

In [None]:
# Execute the benchmark with the DataLoader settings from the configuration cell.
results = run_training_benchmark(
    data_dir=DATA_DIR,
    batch_size=DATALOADER_OPTIONS['batch_size'],
    num_workers=DATALOADER_OPTIONS['num_workers'],
    epochs=3,  # Short benchmark run
    prefetch_factor=DATALOADER_OPTIONS['prefetch_factor'],
    pin_memory=DATALOADER_OPTIONS['pin_memory'],
    use_gpu=(device == 'cuda'),
)

summary_rule = '=' * 60
print(f"\n{summary_rule}")
print("BENCHMARK RESULTS SUMMARY")
print(summary_rule)

# Timing and throughput.
print("\nTraining Performance:")
print(f"  Total training time: {results['total_training_time_sec']:.2f}s")
print(f"  Average epoch time: {results['avg_epoch_time_sec']:.2f}s")
print(f"  Data loading ratio: {results['data_loading_ratio']:.1%}")
print(f"  Training throughput: {results['samples_per_sec_training']:.1f} samples/sec")

# Accuracy metrics.
print("\nModel Performance:")
print(f"  Best validation accuracy: {results['best_val_accuracy']:.2f}%")
print(f"  Final test accuracy: {results['test_accuracy']:.2f}%")
print(f"  Test F1 score: {results['test_f1']:.2f}%")

# Settings the run was executed with.
print("\nSystem Configuration:")
print(f"  Batch size: {results['batch_size']}")
print(f"  Number of workers: {results['num_workers']}")
print(f"  Trainable parameters: {results['trainable_parameters']:,}")

# Tag the in-memory results with run metadata; the CSV row is written in the next cell.
results.update(
    timestamp=datetime.now().isoformat(),
    run_id=RUN_ID,
    rclone_config=RCLONE_OPTIONS,
)

print(f"\nResults saved with Run ID: {RUN_ID}")

In [None]:
# Append this run as one row of the results CSV (header written only for a new file).
results_file = RESULTS_CSV
# Must be checked before opening: append mode creates the file.
file_exists = os.path.exists(results_file)

# Columns copied straight from the results dict, in output order.
result_columns = [
    'timestamp', 'run_id', 'total_training_time_sec', 'epochs',
    'final_training_loss', 'best_val_accuracy', 'final_val_accuracy', 'best_val_loss',
    'test_loss', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_inference_time_sec',
    'avg_epoch_time_sec', 'avg_data_loading_time_sec', 'avg_compute_time_sec',
    'data_loading_ratio', 'samples_per_sec_training', 'total_samples_processed',
    'batch_size', 'num_workers', 'total_train_samples', 'total_val_samples', 'total_test_samples',
    'trainable_parameters',
]
# rclone settings recorded alongside the metrics.
rclone_columns = [
    'vfs_cache_mode', 'vfs_cache_max_size', 'vfs_read_chunk_size',
    'buffer_size', 'transfers',
]

with open(results_file, 'a', newline='') as csvfile:
    fieldnames = result_columns + rclone_columns
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    if not file_exists:
        writer.writeheader()

    # Assemble the row from the two sources instead of spelling out every key.
    row_data = {column: results[column] for column in result_columns}
    for column in rclone_columns:
        row_data[column] = RCLONE_OPTIONS[column]

    writer.writerow(row_data)

print(f"Results appended to {results_file}")

## Part 6: Cleanup

Run this cell to unmount and prepare for the next configuration test.

In [None]:
# Detach the rclone mount so the next configuration starts from a clean state.
print(f"Unmounting {MOUNT_POINT}...")
result = subprocess.run(f"fusermount -u {MOUNT_POINT}", shell=True, capture_output=True, text=True)

unmounted = result.returncode == 0
print("SUCCESS: Unmounted successfully!" if unmounted else f"Warning: {result.stderr}")

# Clear rclone cache (optional - uncomment if you want to clear cache between runs)
# subprocess.run("rm -rf ~/.cache/rclone/*", shell=True)
# print("Cleared rclone cache")

closing_rule = "=" * 60
print("\n" + closing_rule)
print("Ready for next configuration!")
print("Go back to Part 2 and change the configuration, then re-run.")
print(closing_rule)

## Data Processing and Merge

This section reads the benchmark CSV and GPU CSV files, cleans the data by removing zero GPU utilization entries, calculates GPU metrics per run, and creates a final merged CSV file.

In [None]:
import pandas as pd
import os

# Output path for the merged dataset.
FINAL_CSV = 'final_benchmark_results.csv'

section_rule = "=" * 60
print(section_rule)
print("DATA PROCESSING AND MERGE")
print(section_rule)

# Benchmark results: required input; fall back to an empty frame if absent.
if not os.path.exists(RESULTS_CSV):
    print(f"ERROR: {RESULTS_CSV} not found!")
    df_benchmark = pd.DataFrame()
else:
    df_benchmark = pd.read_csv(RESULTS_CSV)
    print(f"Loaded benchmark data: {len(df_benchmark)} runs")
    print(f"Columns: {list(df_benchmark.columns)}")

# GPU utilization samples: optional input; an empty frame means "none available".
if not os.path.exists(GPU_CSV):
    print(f"WARNING: {GPU_CSV} not found - will proceed with benchmark data only")
    df_gpu = pd.DataFrame()
else:
    df_gpu = pd.read_csv(GPU_CSV)
    print(f"Loaded GPU data: {len(df_gpu)} samples")
    print(f"Columns: {list(df_gpu.columns)}")

In [None]:
# Drop GPU samples that report 0% utilization; the result is df_gpu_clean.
if df_gpu.empty:
    print("No GPU data to clean")
    df_gpu_clean = pd.DataFrame()
else:
    print(f"\nCleaning GPU data...")
    original_count = len(df_gpu)
    print(f"Original GPU samples: {original_count}")

    if 'gpu_util_percent' not in df_gpu.columns:
        # Nothing to filter on, so the samples are passed through unchanged.
        print("WARNING: No 'gpu_util_percent' column found in GPU data")
        df_gpu_clean = df_gpu.copy()
    else:
        busy_mask = df_gpu['gpu_util_percent'] > 0
        df_gpu_clean = df_gpu[busy_mask].copy()
        print(f"After removing 0% utilization: {len(df_gpu_clean)} samples")
        removed = original_count - len(df_gpu_clean)
        print(f"Removed {removed} zero-utilization samples ({removed/original_count*100:.1f}%)")

        if len(df_gpu_clean) == 0:
            print("WARNING: No valid GPU data after cleaning!")
            # Replace with a column-less frame so later cells see "no GPU data".
            df_gpu_clean = pd.DataFrame()

In [None]:
# Calculate GPU metrics per run_id
# Aggregations to compute for each column, if that column is present.
# Every value is a list so the aggregated frame always has two-level
# (column, function) headers, which the flattening step below relies on.
gpu_agg_candidates = {
    'gpu_util_percent': ['mean', 'max', 'std', 'min'],
    'mem_used_mb': ['mean', 'max'],
    'mem_total_mb': ['first'],  # Should be constant
    'temperature_c': ['mean', 'max'],
    'timestamp': ['first', 'last', 'count'],  # For tracking duration and sample count
}

if not df_gpu_clean.empty and 'run_id' in df_gpu_clean.columns:
    # Aggregate only columns that exist: asking agg() for a missing column
    # (e.g. no 'temperature_c' in the log) raises KeyError.
    gpu_agg_spec = {col: funcs for col, funcs in gpu_agg_candidates.items()
                    if col in df_gpu_clean.columns}
else:
    gpu_agg_spec = {}

if gpu_agg_spec:
    print(f"\nCalculating GPU metrics per run...")

    skipped_gpu_columns = [col for col in gpu_agg_candidates if col not in gpu_agg_spec]
    if skipped_gpu_columns:
        print(f"WARNING: GPU columns not found, skipped: {skipped_gpu_columns}")
    
    # Group by run_id and calculate aggregated metrics
    gpu_metrics = df_gpu_clean.groupby('run_id').agg(gpu_agg_spec).round(2)
    
    # Flatten column names: ('mem_used_mb', 'mean') -> 'gpu_mem_used_mb_mean';
    # the 'first' aggregate drops its suffix ('gpu_mem_total_mb', 'gpu_timestamp').
    gpu_metrics.columns = [f"gpu_{col[0]}_{col[1]}" if col[1] != 'first' else f"gpu_{col[0]}" 
                          for col in gpu_metrics.columns]
    
    # Rename columns for clarity (names absent from the frame are ignored by rename)
    gpu_metrics = gpu_metrics.rename(columns={
        'gpu_gpu_util_percent_mean': 'avg_gpu_utilization',
        'gpu_gpu_util_percent_max': 'max_gpu_utilization', 
        'gpu_gpu_util_percent_std': 'gpu_utilization_std',
        'gpu_gpu_util_percent_min': 'min_gpu_utilization',
        'gpu_mem_used_mb_mean': 'avg_memory_used_mb',
        'gpu_mem_used_mb_max': 'max_memory_used_mb',
        'gpu_mem_total_mb': 'total_memory_mb',
        'gpu_timestamp_count': 'gpu_sample_count'
    })
    
    # Calculate memory usage percentage (needs both used and total memory columns)
    if {'avg_memory_used_mb', 'max_memory_used_mb', 'total_memory_mb'} <= set(gpu_metrics.columns):
        gpu_metrics['avg_memory_usage_percent'] = (gpu_metrics['avg_memory_used_mb'] / gpu_metrics['total_memory_mb'] * 100).round(2)
        gpu_metrics['max_memory_usage_percent'] = (gpu_metrics['max_memory_used_mb'] / gpu_metrics['total_memory_mb'] * 100).round(2)
    
    # Reset index to make run_id a column
    gpu_metrics = gpu_metrics.reset_index()
    
    print(f"Calculated GPU metrics for {len(gpu_metrics)} runs")
    print(f"GPU metrics columns: {list(gpu_metrics.columns)}")
    
else:
    print("No GPU data available for metrics calculation")
    gpu_metrics = pd.DataFrame()

In [None]:
# Attach per-run GPU metrics to the benchmark rows; the result is df_final.
if df_benchmark.empty:
    print("ERROR: No benchmark data available!")
    df_final = pd.DataFrame()
else:
    if gpu_metrics.empty:
        print("No GPU metrics to merge - using benchmark data only")
        df_final = df_benchmark.copy()
    else:
        print(f"\nMerging benchmark data with GPU metrics...")
        # Left join keeps every benchmark run; runs without GPU samples get NaN.
        df_final = df_benchmark.merge(gpu_metrics, on='run_id', how='left')
        print(f"Merged data: {len(df_final)} runs")

        # Count runs with no GPU data by inspecting the first GPU column.
        gpu_columns = [col for col in gpu_metrics.columns if col != 'run_id']
        first_gpu_column = gpu_columns[0]
        missing_gpu_runs = df_final[first_gpu_column].isna().sum()
        if missing_gpu_runs > 0:
            print(f"Warning: {missing_gpu_runs} runs have no GPU data")

    print(f"Final dataset columns: {len(df_final.columns)}")
    print(f"Final dataset shape: {df_final.shape}")

In [None]:
# Write the merged dataset to disk and print a short preview.
if df_final.empty:
    print("ERROR: No data to save!")
else:
    row_count = len(df_final)
    print(f"\nSaving final dataset to {FINAL_CSV}...")
    df_final.to_csv(FINAL_CSV, index=False)
    print(f"SUCCESS: Saved {row_count} runs to {FINAL_CSV}")

    # Size of what was written.
    print(f"\nFinal dataset summary:")
    print(f"  Rows: {row_count}")
    print(f"  Columns: {len(df_final.columns)}")

    # Preview: core columns, plus GPU columns when the merge produced them.
    print(f"\nFirst 3 rows of final dataset:")
    display_cols = ['run_id', 'timestamp', 'total_training_time_sec', 'test_accuracy']
    if 'avg_gpu_utilization' in df_final.columns:
        display_cols += ['avg_gpu_utilization', 'max_gpu_utilization']
    if 'avg_memory_usage_percent' in df_final.columns:
        display_cols += ['avg_memory_usage_percent']

    preview = df_final[display_cols].head(3)
    print(preview.to_string(index=False))

    print(f"\nData processing completed successfully!")
    print(f"Final file: {FINAL_CSV}")

In [None]:
# Print the final dataset's columns, split into benchmark and GPU groups.
if not df_final.empty:
    # A column counts as GPU-derived when its name starts with one of these prefixes.
    gpu_prefixes = ('avg_gpu', 'max_gpu', 'min_gpu', 'gpu_', 'total_memory', 'avg_memory', 'max_memory')
    header_rule = "=" * 60

    print(header_rule)
    print("FINAL DATASET COLUMN REFERENCE")
    print(header_rule)

    print("\nBenchmark columns:")
    benchmark_cols = [name for name in df_final.columns if not name.startswith(gpu_prefixes)]
    for name in benchmark_cols:
        print(f"  {name}")

    # The GPU group is listed only when the merge added GPU metrics.
    if 'avg_gpu_utilization' in df_final.columns:
        print("\nGPU metrics columns:")
        gpu_cols = [name for name in df_final.columns if name.startswith(gpu_prefixes)]
        for name in gpu_cols:
            print(f"  {name}")

    print(f"\nTotal columns: {len(df_final.columns)}")
    print(header_rule)