In [None]:
# Report the installed PyTorch build and, when CUDA is usable, the first GPU's name and memory
import torch

print(f"🚀 PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"🔥 CUDA available: {cuda_ready}")
if cuda_ready:
    # Device index 0 is queried for both the name and the memory size
    gpu_name = torch.cuda.get_device_name(0)
    print(f"📱 GPU: {gpu_name}")
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"💾 GPU Memory: {gpu_memory_gb:.1f} GB")


In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Navigate to project directory and clone/update repo
import os
project_path = '/content/drive/MyDrive/Colab Notebooks/Physics Informed DL Project'

# Clone or update repository
if not os.path.exists(project_path):
    print("🔄 Cloning repository...")
    !git clone https://github.com/assafch/physics_informed_dl_project.git "$project_path"
else:
    print("🔄 Updating repository...")
    os.chdir(project_path)
    !git pull origin main

os.chdir(project_path)
print(f"📁 Working directory: {os.getcwd()}")

# Verify project structure
key_dirs = ['src', 'configs', 'data', 'experiments']
for dir_name in key_dirs:
    if os.path.exists(dir_name):
        print(f"✅ {dir_name}/ found")
    else:
        print(f"❌ {dir_name}/ missing")
        
print("\n📂 Repository ready!")


In [None]:
# Install/update requirements
# %pip (rather than !pip) installs into the environment of the running kernel.
# requirements.txt is looked up relative to the current working directory.
# NOTE(review): mlflow is installed without a version pin — confirm whether it should be
# pinned or moved into requirements.txt for reproducibility.
%pip install -q -r requirements.txt
%pip install -q mlflow

# NOTE(review): this message is printed unconditionally; nothing here checks whether
# the installs above succeeded — confirm that is acceptable.
print("✅ Dependencies installed")


In [None]:
# Check if T=250 dataset exists; when it does not, list the .h5 files that are in data/
from pathlib import Path

dataset_path = "data/wave_dataset_T250.h5"
dataset_file = Path(dataset_path)
bytes_per_gb = 1024**3  # divisor used for every size read-out below

if not dataset_file.exists():
    print(f"❌ Dataset not found: {dataset_path}")
    print("\n🔍 Available datasets:")
    for file in Path("data").glob("*.h5"):
        size = file.stat().st_size / bytes_per_gb
        print(f"   {file.name} ({size:.1f} GB)")
else:
    file_size = dataset_file.stat().st_size / bytes_per_gb  # GB
    print(f"✅ Dataset found: {dataset_path}")
    print(f"📦 File size: {file_size:.1f} GB")


In [None]:
import sys
import time
from datetime import datetime

# Make the current directory importable so the `src` and `configs` packages resolve
sys.path.append('.')

from src.training.trainer import WaveTrainer
from configs.training_config import TrainingConfig

# Banner announcing the validation run
banner_rule = "=" * 60
for banner_line in (
    "🧪 T=250 Dataset Hyperparameter Validation",
    banner_rule,
    f"🕐 Start time: {datetime.now().strftime('%H:%M:%S')}",
    "⏱️ Expected duration: ~2 hours",
    "🎯 Goal: Validate hyperparameters before full 5-fold CV",
    banner_rule,
):
    print(banner_line)


In [None]:
# Create validation configuration using winning hyperparameters.
# The keyword arguments are grouped by concern and passed to TrainingConfig in the
# same order as before.
run_timestamp = datetime.now().strftime('%Y%m%d_%H%M')

# Winning hyperparameters from T=500 grid search
hyperparameter_settings = dict(
    learning_rate=0.001,
    batch_size=32,
    optimizer="adam",
    weight_decay=0.01,
)

# Dataset configuration
dataset_settings = dict(
    dataset_path="data/wave_dataset_T250.h5",
    train_split=0.8,
    val_split=0.2,
)

# Training configuration - validation run (quick validation)
schedule_settings = dict(
    num_epochs=50,
    early_stopping_patience=15,
)

# Model configuration
model_settings = dict(
    model_name="WaveSourceMiniResNet",
    grid_size=128,
)

# Training settings
runtime_settings = dict(
    device="cuda",
    num_workers=2,
    pin_memory=True,
)

# Scheduler
scheduler_settings = dict(
    scheduler_type="plateau",
    scheduler_patience=5,
)

# Logging and saving
logging_settings = dict(
    experiment_name="t250_hyperparams_validation",
    run_name=f"validation_lr001_bs32_adam_50epochs_{run_timestamp}",
    save_model_every_n_epochs=25,
)

config = TrainingConfig(
    **hyperparameter_settings,
    **dataset_settings,
    **schedule_settings,
    **model_settings,
    **runtime_settings,
    **scheduler_settings,
    **logging_settings,
    # Random seed for reproducibility
    random_seed=42,
)

# Echo the key settings back from the constructed config object
for config_line in (
    "🔧 Validation Configuration:",
    f"   Dataset: T=250 ({config.dataset_path})",
    f"   Hyperparameters: lr={config.learning_rate}, bs={config.batch_size}, opt={config.optimizer}",
    f"   Epochs: {config.num_epochs}",
    f"   Expected time: ~2 hours",
    f"   Device: {config.device}",
    f"   Run name: {config.run_name}",
):
    print(config_line)


In [None]:
# Build the trainer from the validation config and run it, timing the whole call
import traceback

trainer = WaveTrainer(config)

print("\n🚀 Starting validation training...")
start_time = time.time()

try:
    # training_history and training_duration are only assigned when train() returns
    training_history = trainer.train()

    end_time = time.time()
    elapsed_seconds = end_time - start_time
    training_duration = elapsed_seconds / 60  # minutes

    print(f"\n⏱️ Training completed in {training_duration:.1f} minutes")

except Exception as exc:
    # Report the failure with a full traceback; the cell itself does not re-raise
    print(f"\n❌ Training failed: {exc}")
    traceback.print_exc()


In [None]:
# Extract and analyze results (skipped entirely when no training_history exists)
if 'training_history' in locals():
    # The last entry of each metric series is reported as the "final" value
    final_train_loss, final_val_loss, final_val_distance_error = (
        training_history[metric_key][-1]
        for metric_key in ('train_loss', 'val_loss', 'val_distance_error')
    )
    # Falls back to the number of recorded validation epochs when no best_epoch is stored
    epochs_recorded = len(training_history['val_loss'])
    best_epoch = training_history.get('best_epoch', epochs_recorded)

    section_rule = "=" * 60
    for summary_line in (
        section_rule,
        "🎉 VALIDATION TRAINING COMPLETE!",
        section_rule,
        f"⏱️ Training Time: {training_duration:.1f} minutes",
        f"🏆 Best Epoch: {best_epoch}",
        f"📊 Final Results:",
        f"   Training Loss: {final_train_loss:.4f}",
        f"   Validation Loss: {final_val_loss:.4f}",
        f"   Distance Error: {final_val_distance_error:.3f} px",
    ):
        print(summary_line)


In [None]:
    # Performance assessment (continuing from previous cell)
    if 'final_val_distance_error' in locals():
        print(f"\n📈 Performance Assessment:")
        if final_val_distance_error <= 2.0:
            print(f"   ✅ EXCELLENT: {final_val_distance_error:.3f} px (≤ 2.0 px)")
            recommendation = "✅ RECOMMENDED: Proceed with full 5-fold CV training"
        elif final_val_distance_error <= 3.0:
            print(f"   ✅ GOOD: {final_val_distance_error:.3f} px (≤ 3.0 px)")
            recommendation = "✅ RECOMMENDED: Proceed with full 5-fold CV training"
        elif final_val_distance_error <= 4.0:
            print(f"   ⚠️ ACCEPTABLE: {final_val_distance_error:.3f} px (≤ 4.0 px)")
            recommendation = "⚠️ CONSIDER: Maybe adjust hyperparameters or proceed with caution"
        else:
            print(f"   ❌ CONCERNING: {final_val_distance_error:.3f} px (> 4.0 px)")
            recommendation = "❌ RECOMMEND: Consider hyperparameter tuning before full CV"
        
        # Comparison with T=500 results
        print(f"\n🔄 Comparison with T=500 Results:")
        print(f"   T=500 Grid Search Best: 2.37 px")
        print(f"   T=500 CV Average: 2.078 px")
        print(f"   T=250 Validation: {final_val_distance_error:.3f} px")
        
        if final_val_distance_error < 2.5:
            print("   ✅ T=250 performance is competitive with T=500!")
        elif final_val_distance_error < 3.5:
            print("   ✅ T=250 performance is reasonable compared to T=500")
        else:
            print("   ⚠️ T=250 performance is worse than T=500 - consider investigation")
        
        print(f"\n🎯 RECOMMENDATION:")
        print(f"   {recommendation}")
        
        if "RECOMMENDED" in recommendation:
            print(f"\n🚀 Next Steps:")
            print(f"   1. Run full 5-fold CV training on T=250 dataset")
            print(f"   2. Use same hyperparameters: lr=0.001, bs=32, adam")
            print(f"   3. Expected full CV time: ~10 hours")
            print(f"   4. Expected CV performance: ~{final_val_distance_error:.1f} ± 0.3 px")
    
    print("\n" + "=" * 60)
    print("🎉 Validation complete! Check results above for next steps.")
else:
    print("❌ No training history available - training may have failed")


In [None]:
# Three-panel figure: loss curves, validation distance error, and a text summary
if 'training_history' in locals():
    import matplotlib.pyplot as plt
    import numpy as np

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    loss_ax, error_ax, summary_ax = axes

    # One x value per recorded training epoch, starting at 1
    epochs = range(1, len(training_history['train_loss']) + 1)

    # Loss curves
    loss_ax.plot(epochs, training_history['train_loss'], 'b-', label='Training Loss', alpha=0.8)
    loss_ax.plot(epochs, training_history['val_loss'], 'r-', label='Validation Loss', alpha=0.8)
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('T=250 Validation: Loss Curves')

    # Distance error, with dashed reference lines for the target and the T=500 CV average
    error_ax.plot(epochs, training_history['val_distance_error'], 'g-', label='Distance Error', alpha=0.8)
    for ref_level, ref_color, ref_label in (
        (2.0, 'orange', 'Target (2.0 px)'),
        (2.078, 'purple', 'T=500 CV Avg'),
    ):
        error_ax.axhline(y=ref_level, color=ref_color, linestyle='--', alpha=0.7, label=ref_label)
    error_ax.set_xlabel('Epoch')
    error_ax.set_ylabel('Distance Error (px)')
    error_ax.set_title('T=250 Validation: Distance Error')

    # Both curve panels get a legend and a light grid
    for curve_ax in (loss_ax, error_ax):
        curve_ax.legend()
        curve_ax.grid(True, alpha=0.3)

    # Summary: centred text lines placed in axes coordinates, no ticks
    summary_entries = (
        (0.7, f'Final Results', {'fontsize': 14, 'fontweight': 'bold'}),
        (0.5, f'Distance Error: {final_val_distance_error:.3f} px', {'fontsize': 12}),
        (0.3, f'Training Time: {training_duration:.1f} min', {'fontsize': 12}),
    )
    for y_pos, text_line, font_opts in summary_entries:
        summary_ax.text(0.5, y_pos, text_line, ha='center', va='center',
                        transform=summary_ax.transAxes, **font_opts)
    summary_ax.set_xticks([])
    summary_ax.set_yticks([])
    summary_ax.set_title('Summary')

    plt.tight_layout()
    plt.show()

    print(f"📊 Training completed successfully!")
    print(f"🎯 Final distance error: {final_val_distance_error:.3f} px")
else:
    print("📊 No training history to plot")


In [None]:
# Persist the validation run: JSON summary, training-curve plot, MLflow backup and a
# Markdown report, all under experiments/t250_validation_<timestamp>/.
if 'training_history' in locals():
    import json
    import shutil
    from datetime import datetime
    from pathlib import Path

    # `recommendation` is assigned by the performance-assessment cell. Fall back to a
    # placeholder so the results can still be saved when that cell has not been run,
    # instead of failing with a NameError halfway through.
    recommendation_text = locals().get(
        'recommendation',
        "Not assessed - run the performance assessment cell to generate a recommendation",
    )

    # Create timestamp for this validation run (shared by every file name below)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")

    print("💾 Saving validation results to Drive...")

    # 1. Create validation results directory
    validation_dir = Path(f"experiments/t250_validation_{timestamp}")
    validation_dir.mkdir(parents=True, exist_ok=True)

    # 2. Save validation summary
    validation_results = {
        'timestamp': datetime.now().isoformat(),
        'experiment_type': 'T250_hyperparameter_validation',
        'dataset': 'wave_dataset_T250.h5',
        'hyperparameters': {
            'learning_rate': config.learning_rate,
            'batch_size': config.batch_size,
            'optimizer': config.optimizer,
            'weight_decay': config.weight_decay
        },
        'training_config': {
            'epochs': config.num_epochs,
            'early_stopping_patience': config.early_stopping_patience,
            'model_name': config.model_name
        },
        'results': {
            # Cast to built-in types so json.dump receives plain floats/ints
            'final_train_loss': float(final_train_loss),
            'final_val_loss': float(final_val_loss),
            'final_distance_error': float(final_val_distance_error),
            'best_epoch': int(best_epoch),
            'training_time_minutes': float(training_duration)
        },
        'recommendation': recommendation_text,
        'comparison': {
            't500_grid_search_best': 2.37,
            't500_cv_average': 2.078,
            't250_validation': float(final_val_distance_error)
        },
        'mlflow_experiment_name': config.experiment_name,
        'mlflow_run_name': config.run_name
    }

    # Save summary JSON
    summary_file = validation_dir / f"validation_summary_{timestamp}.json"
    with open(summary_file, 'w') as f:
        json.dump(validation_results, f, indent=2)

    print(f"✅ Summary saved: {summary_file}")

    # 3. Save training curves plot (only if the plotting cell created `fig`)
    if 'fig' in locals():
        plot_file = validation_dir / f"training_curves_{timestamp}.png"
        fig.savefig(plot_file, dpi=300, bbox_inches='tight')
        print(f"✅ Plots saved: {plot_file}")

    # 4. Copy MLflow experiment data
    mlflow_backup_dir = validation_dir / "mlflow_backup"
    mlflow_backup_dir.mkdir(exist_ok=True)

    # Every sub-directory of mlruns/ except "0" is copied
    mlflow_dir = Path("mlruns")
    if mlflow_dir.exists():
        for exp_dir in mlflow_dir.iterdir():
            if exp_dir.is_dir() and exp_dir.name != "0":
                # Replace any previous copy of this experiment folder
                backup_exp_dir = mlflow_backup_dir / exp_dir.name
                if backup_exp_dir.exists():
                    shutil.rmtree(backup_exp_dir)
                shutil.copytree(exp_dir, backup_exp_dir)
        print(f"✅ MLflow data backed up to: {mlflow_backup_dir}")

    # 5. Create final report
    # The line breaks are real "\n" escapes; the earlier "\\n" wrote a literal
    # backslash-n and collapsed the whole report onto a single line.
    report_file = validation_dir / f"VALIDATION_REPORT_{timestamp}.md"
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(f"# T=250 Hyperparameter Validation Report\n\n")
        f.write(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write(f"## 🎯 Objective\n")
        f.write(f"Validate winning hyperparameters on T=250 dataset before full 5-fold CV.\n\n")
        f.write(f"## 📊 Results\n")
        f.write(f"- **Final Distance Error**: {final_val_distance_error:.3f} px\n")
        f.write(f"- **Training Time**: {training_duration:.1f} minutes\n")
        f.write(f"- **Best Epoch**: {best_epoch}\n")
        f.write(f"- **Final Train Loss**: {final_train_loss:.4f}\n")
        f.write(f"- **Final Val Loss**: {final_val_loss:.4f}\n\n")
        f.write(f"## 🔄 Comparison with T=500\n")
        f.write(f"- **T=500 Grid Search Best**: 2.37 px\n")
        f.write(f"- **T=500 CV Average**: 2.078 px\n")
        f.write(f"- **T=250 Validation**: {final_val_distance_error:.3f} px\n\n")
        f.write(f"## 🎯 Recommendation\n")
        f.write(f"{recommendation_text}\n\n")
        f.write(f"## 📁 Files in this validation\n")
        f.write(f"- `validation_summary_{timestamp}.json`: Complete results data\n")
        f.write(f"- `training_curves_{timestamp}.png`: Training visualizations\n")
        f.write(f"- `mlflow_backup/`: MLflow experiment data\n")
        f.write(f"- `VALIDATION_REPORT_{timestamp}.md`: This report\n")

    print(f"✅ Report saved: {report_file}")

    print(f"\n" + "=" * 60)
    print(f"🎉 ALL RESULTS SAVED TO DRIVE!")
    print(f"📁 Location: {validation_dir}")
    print(f"💾 Total files saved:")
    # Only regular files are listed and counted, so the "more files" figure is accurate
    saved_files = sorted(p for p in validation_dir.rglob("*") if p.is_file())
    for file in saved_files[:10]:  # Show first 10 files
        print(f"   📄 {file.relative_to(validation_dir)}")
    if len(saved_files) > 10:
        print(f"   ... and {len(saved_files) - 10} more files")

    print(f"\n🔄 Results are automatically synced to Google Drive!")
    print(f"🔗 You can access them anytime from Drive.")

else:
    print("❌ No training results to save - training may have failed")


In [None]:
# Runtime management and auto-save protection
import signal
import atexit

def emergency_save():
    """Best-effort save of a checkpoint and the run configuration.

    Looks up ``trainer`` and ``config`` in the module (notebook) namespace:

    * if ``trainer`` exists and has a ``save_checkpoint`` method, it is called with
      the path ``emergency_checkpoint_<timestamp>.pt``;
    * if ``config`` exists, a few of its fields are written to
      ``emergency_backup_<timestamp>/emergency_info.json``.

    Both paths are relative to the current working directory. Any exception is
    caught and reported with a message rather than propagated. Returns None.
    """
    try:
        # Imported here so the handler does not depend on which other cells have
        # run; previously a missing `json` import made every save attempt fail.
        import json
        from datetime import datetime
        from pathlib import Path

        namespace = globals()
        # One timestamp so the checkpoint and the backup folder share a suffix
        stamp = datetime.now().strftime('%Y%m%d_%H%M')

        active_trainer = namespace.get('trainer')
        if active_trainer is not None and hasattr(active_trainer, 'save_checkpoint'):
            checkpoint_path = f"emergency_checkpoint_{stamp}.pt"
            active_trainer.save_checkpoint(checkpoint_path)
            print(f"🚨 Emergency checkpoint saved: {checkpoint_path}")

        # Save any partial results
        run_config = namespace.get('config')
        if run_config is not None:
            emergency_dir = Path(f"emergency_backup_{stamp}")
            emergency_dir.mkdir(parents=True, exist_ok=True)

            emergency_info = {
                'timestamp': datetime.now().isoformat(),
                'status': 'interrupted',
                'config': {
                    'learning_rate': run_config.learning_rate,
                    'batch_size': run_config.batch_size,
                    'optimizer': run_config.optimizer,
                    'dataset_path': run_config.dataset_path,
                    'experiment_name': run_config.experiment_name,
                    'run_name': run_config.run_name
                }
            }

            with open(emergency_dir / "emergency_info.json", 'w') as f:
                json.dump(emergency_info, f, indent=2)

            print(f"🚨 Emergency info saved: {emergency_dir}")

    except Exception as e:
        # Deliberately swallowed: this is a last-resort handler and must not raise
        print(f"❌ Emergency save failed: {e}")

# Register emergency save
# emergency_save is handed to atexit so that it is called when the interpreter exits.
# NOTE(review): atexit handlers may not run if the runtime is killed abruptly
# (e.g. a forced disconnect) — confirm this gives the protection that is expected.
# NOTE(review): each re-run of this cell appears to register one more handler — confirm
# that repeated emergency saves at exit are acceptable.
atexit.register(emergency_save)

# Keep runtime alive function
def keep_alive():
    """Print monitoring tips and the expected training milestones.

    Despite its name, this function does not interact with the runtime in any
    way; it only prints guidance for the person watching the notebook.
    Returns None.
    """
    print("🔄 Runtime keep-alive activated")
    print("💡 Tip: Keep this tab active and check back periodically")

    # Show progress indicators.
    # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
    print("\n📊 Training Progress Indicators:")
    print("✅ Watch for MLflow logging messages")
    print("✅ Monitor GPU memory usage")
    print("✅ Check validation loss improvements")
    print("\n⏰ Expected milestones:")
    print("   10 min: Initial convergence")
    print("   30 min: Stable training")
    print("   60 min: Best model selection")
    print("   120 min: Training completion")

# Print the monitoring guidance defined by keep_alive()
keep_alive()

# "\n" (not "\\n") so a blank line precedes the banner instead of a literal backslash-n
print("\n🛡️ Runtime protection activated!")
print("📱 The notebook will auto-save progress and handle interruptions gracefully.")
print("🚀 Ready to start validation training!")


In [None]:
# Extract and analyze results: final metrics, a threshold-based rating, a comparison
# with the T=500 numbers and a recommendation for the full cross-validation run
if 'training_history' not in locals():
    print("❌ No training history available - training may have failed")
else:
    section_rule = "=" * 60

    # Extract final results (last entry of each metric series)
    final_train_loss = training_history['train_loss'][-1]
    final_val_loss = training_history['val_loss'][-1]
    final_val_distance_error = training_history['val_distance_error'][-1]
    # Falls back to the number of recorded validation epochs when no best_epoch is stored
    epochs_recorded = len(training_history['val_loss'])
    best_epoch = training_history.get('best_epoch', epochs_recorded)

    for summary_line in (
        section_rule,
        "🎉 VALIDATION TRAINING COMPLETE!",
        section_rule,
        f"⏱️ Training Time: {training_duration:.1f} minutes",
        f"🏆 Best Epoch: {best_epoch}",
        f"📊 Final Results:",
        f"   Training Loss: {final_train_loss:.4f}",
        f"   Validation Loss: {final_val_loss:.4f}",
        f"   Distance Error: {final_val_distance_error:.3f} px",
    ):
        print(summary_line)

    # Performance assessment: pick a rating line and a recommendation from the error
    print(f"\n📈 Performance Assessment:")
    if final_val_distance_error <= 2.0:
        rating_line = f"   ✅ EXCELLENT: {final_val_distance_error:.3f} px (≤ 2.0 px)"
        recommendation = "✅ RECOMMENDED: Proceed with full 5-fold CV training"
    elif final_val_distance_error <= 3.0:
        rating_line = f"   ✅ GOOD: {final_val_distance_error:.3f} px (≤ 3.0 px)"
        recommendation = "✅ RECOMMENDED: Proceed with full 5-fold CV training"
    elif final_val_distance_error <= 4.0:
        rating_line = f"   ⚠️ ACCEPTABLE: {final_val_distance_error:.3f} px (≤ 4.0 px)"
        recommendation = "⚠️ CONSIDER: Maybe adjust hyperparameters or proceed with caution"
    else:
        rating_line = f"   ❌ CONCERNING: {final_val_distance_error:.3f} px (> 4.0 px)"
        recommendation = "❌ RECOMMEND: Consider hyperparameter tuning before full CV"
    print(rating_line)

    # Comparison with T=500 results
    for comparison_line in (
        f"\n🔄 Comparison with T=500 Results:",
        f"   T=500 Grid Search Best: 2.37 px",
        f"   T=500 CV Average: 2.078 px",
        f"   T=250 Validation: {final_val_distance_error:.3f} px",
    ):
        print(comparison_line)

    if final_val_distance_error < 2.5:
        comparison_verdict = "   ✅ T=250 performance is competitive with T=500!"
    elif final_val_distance_error < 3.5:
        comparison_verdict = "   ✅ T=250 performance is reasonable compared to T=500"
    else:
        comparison_verdict = "   ⚠️ T=250 performance is worse than T=500 - consider investigation"
    print(comparison_verdict)

    print(f"\n🎯 RECOMMENDATION:")
    print(f"   {recommendation}")

    # Only the two "RECOMMENDED" verdicts contain this exact word
    proceed_with_cv = "RECOMMENDED" in recommendation
    if proceed_with_cv:
        for step_line in (
            f"\n🚀 Next Steps:",
            f"   1. Run full 5-fold CV training on T=250 dataset",
            f"   2. Use same hyperparameters: lr=0.001, bs=32, adam",
            f"   3. Expected full CV time: ~10 hours",
            f"   4. Expected CV performance: ~{final_val_distance_error:.1f} ± 0.3 px",
        ):
            print(step_line)

    print("\n" + section_rule)
    print("🎉 Validation complete! Check results above for next steps.")
