In [45]:
from pathlib import Path
import os
# Project root on shared storage; the kernel is moved into it below.
workdir = Path("/ibstorage/anthony/NYS_Wetlands_GHG/")
print(workdir)
# Change the kernel's working directory to the project root.
os.chdir(workdir)
# Read the working directory back from the OS and echo it as confirmation.
current_working_dir = Path.cwd()
print(f"Current working directory is now: {current_working_dir}")

/ibstorage/anthony/NYS_Wetlands_GHG
Current working directory is now: /ibstorage/anthony/NYS_Wetlands_GHG


In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from pathlib import Path
import os
import sys

# === SET WORKING DIRECTORY ===
# Re-declares the same project root as the earlier cell and changes into it again.
workdir = Path("/ibstorage/anthony/NYS_Wetlands_GHG/")
os.chdir(workdir)

# === ADD SCRIPT DIRECTORY TO PYTHON PATH ===
# Inserting at index 0 makes this directory the first entry searched on import.
# NOTE(review): every re-run of this cell inserts another copy of the same
# entry into sys.path.
script_dir = Path("/ibstorage/anthony/NYS_Wetlands_GHG/Python_Code_Analysis/DL_Learning")
sys.path.insert(0, str(script_dir))

# Project-local modules; presumably _04_dataset.py and _05_unet_model.py live
# in script_dir, which is why the path is extended first -- confirm.
from _04_dataset import get_dataloaders
from _05_unet_model import UNet

In [47]:
def train_one_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    The model is switched to training mode. For every ``(X, y)`` batch the
    tensors are moved to ``device``, gradients are cleared, the loss is
    computed and back-propagated, and the optimizer takes one step. A progress
    line is printed after every 10th batch.

    Returns:
        The sum of the per-batch loss values divided by ``len(train_loader)``
        (an unweighted mean over batches).
    """
    model.train()
    loss_total = 0.0

    # Count batches from 1 so the counter can be used directly in the log line.
    for step, (inputs, targets) in enumerate(train_loader, start=1):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Clear gradients left over from the previous step, then forward.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)

        # Back-propagate and update the parameters.
        batch_loss.backward()
        optimizer.step()

        batch_loss_value = batch_loss.item()
        loss_total += batch_loss_value

        # Progress update every 10 batches
        if step % 10 == 0:
            print(f"    Batch {step}/{len(train_loader)}, Loss: {batch_loss_value:.4f}")

    return loss_total / len(train_loader)


def validate(model, val_loader, criterion, device, num_classes=5, class_names=None):
    """Evaluate ``model`` on ``val_loader`` and report loss and accuracy.

    The model is put in eval mode and run under ``torch.no_grad()``.
    Predictions are the argmax of the model output along dim 1. Label values
    outside ``range(num_classes)`` contribute to the loss (subject to the
    criterion) but are not counted in any accuracy figure.

    Args:
        model: Network to evaluate.
        val_loader: Iterable of ``(X, y)`` batches that also supports ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, y)``.
        device: Device the batches are moved to before the forward pass.
        num_classes: Number of class indices to score (``0 .. num_classes-1``).
        class_names: Optional display names, one per class index. Defaults to
            the five wetland labels; any index without a name is reported as
            ``"Class <index>"``.

    Returns:
        ``(avg_loss, overall_acc, class_acc)`` where ``avg_loss`` is the mean
        of the per-batch losses, ``overall_acc`` is a float (``nan`` when no
        label fell inside ``range(num_classes)``), and ``class_acc`` maps each
        class name to a plain Python float (``0.0`` for classes that never
        occur in the labels).

    Raises:
        ZeroDivisionError: If ``val_loader`` yields no batches.
    """
    default_names = ['Background', 'EMW', 'FSW', 'SSW', 'OWW']
    provided = default_names if class_names is None else list(class_names)
    # Fall back to a generic label rather than raising IndexError when
    # num_classes exceeds the number of known names.
    names = [provided[c] if c < len(provided) else f"Class {c}"
             for c in range(num_classes)]

    model.eval()
    running_loss = 0.0

    # Exact integer counters: float32 tensors cannot represent every integer
    # above 2**24, which element-level counts can exceed.
    correct_per_class = [0] * num_classes
    total_per_class = [0] * num_classes

    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)

            outputs = model(X)
            loss = criterion(outputs, y)
            running_loss += loss.item()

            # Predicted class index = position of the largest score along dim 1.
            preds = torch.argmax(outputs, dim=1)

            for c in range(num_classes):
                mask = (y == c)
                total_per_class[c] += int(mask.sum().item())
                correct_per_class[c] += int(((preds == c) & mask).sum().item())

    avg_loss = running_loss / len(val_loader)

    # Plain floats for every class, so callers get one consistent value type.
    class_acc = {}
    for name, correct, total in zip(names, correct_per_class, total_per_class):
        class_acc[name] = correct / total if total > 0 else 0.0

    # Overall accuracy across all counted elements; nan when nothing was counted.
    total_seen = sum(total_per_class)
    if total_seen > 0:
        overall_acc = sum(correct_per_class) / total_seen
    else:
        overall_acc = float('nan')

    return avg_loss, overall_acc, class_acc

In [48]:
def main():
    """Train the U-Net on the patch dataset and write checkpoints to ``Models``.

    Steps: load the train/validation ``.npy`` arrays from ``Data/Patches_v2``
    through ``get_dataloaders``, build a ``UNet`` (8 input channels, 5
    classes), then train for ``num_epochs`` epochs with Adam and a
    class-weighted cross-entropy loss. After each epoch the validation metrics
    are printed and, whenever the validation loss improves,
    ``best_model.pth`` is overwritten. At the end ``final_model.pth`` and
    ``training_history.npy`` are written.

    All paths are relative, so the working directory must already be the
    project root.
    """
    # Imported here because no visible notebook cell imports ``time``; without
    # it the timing calls below raise NameError on a fresh kernel.
    import time

    # === CONFIGURATION ===
    data_dir = Path("Data/Patches_v2")
    output_dir = Path("Models")
    output_dir.mkdir(exist_ok=True)

    num_epochs = 10
    batch_size = 16
    learning_rate = 0.001

    # Per-class loss weights, indexed by class id (5 entries, one per class).
    class_weights = torch.tensor([1.0, 22.6, 12.88, 13.04, 55.52], dtype=torch.float32)

    # Use CUDA when it is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # === LOAD DATA ===
    print("\nLoading data...")
    train_loader, val_loader = get_dataloaders(
        data_dir / "X_train.npy",
        data_dir / "y_train.npy",
        data_dir / "X_val.npy",
        data_dir / "y_val.npy",
        batch_size=batch_size
    )
    print(f"Training batches: {len(train_loader)}")
    print(f"Validation batches: {len(val_loader)}")

    # === CREATE MODEL ===
    print("\nInitializing model...")
    model = UNet(in_channels=8, num_classes=5, base_filters=32)
    model = model.to(device)

    # === LOSS AND OPTIMIZER ===
    # The weight tensor is moved to the same device as the model before use.
    class_weights = class_weights.to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # === TRAINING LOOP ===
    print("\nStarting training...")
    print("=" * 60)

    best_val_loss = float('inf')
    history = {'train_loss': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        # perf_counter is monotonic, so the elapsed time cannot be distorted
        # by wall-clock adjustments during a multi-minute epoch.
        epoch_start = time.perf_counter()
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        print("-" * 40)

        # Train
        train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)

        # Validate
        val_loss, val_acc, class_acc = validate(model, val_loader, criterion, device)

        epoch_time = time.perf_counter() - epoch_start

        # Log results
        print(f"\n  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss:   {val_loss:.4f}")
        print(f"  Val Acc:    {val_acc:.4f}")
        print(f"  Time:       {epoch_time:.1f}s")
        print("  Per-class accuracy:")
        for name, acc in class_acc.items():
            print(f"    {name}: {acc:.4f}")

        # Save history
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Save best model (lowest validation loss so far).
        # NOTE: 'epoch' here is the zero-based loop index.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
                'val_acc': val_acc
            }, output_dir / "best_model.pth")
            print("  [Saved new best model]")

    # === SAVE FINAL MODEL AND HISTORY ===
    # NOTE: 'epoch' here is the number of completed epochs (num_epochs), not a
    # zero-based index as in best_model.pth.
    torch.save({
        'epoch': num_epochs,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'val_loss': val_loss,
        'val_acc': val_acc
    }, output_dir / "final_model.pth")

    # history is a dict, so np.save stores it as a pickled object array;
    # reading it back needs np.load(..., allow_pickle=True).item().
    np.save(output_dir / "training_history.npy", history)

    print("\n" + "=" * 60)
    print("Training complete!")
    print(f"Best validation loss: {best_val_loss:.4f}")
    print(f"Models saved to: {output_dir}")

# Start training when this cell (or the exported script) is executed directly;
# the log printed beneath the cell is the output of this call.
if __name__ == "__main__":
    main()

Using device: cpu

Loading data...
Training batches: 18
Validation batches: 5

Initializing model...

Starting training...

Epoch 1/10
----------------------------------------
    Batch 10/18, Loss: 1.5347

  Train Loss: 1.5155
  Val Loss:   2.8257
  Val Acc:    0.2667
  Time:       216.7s
  Per-class accuracy:
    Background: 0.2966
    EMW: 0.1187
    FSW: 0.0430
    SSW: 0.1952
    OWW: 0.7488
  [Saved new best model]

Epoch 2/10
----------------------------------------
    Batch 10/18, Loss: 1.4641

  Train Loss: 1.4412
  Val Loss:   1.4123
  Val Acc:    0.4894
  Time:       216.2s
  Per-class accuracy:
    Background: 0.5203
    EMW: 0.0218
    FSW: 0.5022
    SSW: 0.3804
    OWW: 0.3781
  [Saved new best model]

Epoch 3/10
----------------------------------------
    Batch 10/18, Loss: 1.2970

  Train Loss: 1.4254
  Val Loss:   1.2978
  Val Acc:    0.4648
  Time:       215.9s
  Per-class accuracy:
    Background: 0.4592
    EMW: 0.1676
    FSW: 0.7650
    SSW: 0.1927
    OWW: 0.6