In [11]:
from pathlib import Path
import os
# Project root. Defaults to the original shared-storage location; set the
# NYS_WETLANDS_WORKDIR environment variable to run the notebook on another
# machine without editing this cell.
workdir = Path(os.environ.get("NYS_WETLANDS_WORKDIR", "/ibstorage/anthony/NYS_Wetlands_GHG/"))
print(workdir)
# The data paths in the configuration cell are relative, so they resolve
# against this directory once it becomes the working directory.
os.chdir(workdir)
current_working_dir = Path.cwd()
print(f"Current working directory is now: {current_working_dir}")

/ibstorage/anthony/NYS_Wetlands_GHG
Current working directory is now: /ibstorage/anthony/NYS_Wetlands_GHG


In [36]:
import rasterio
import geopandas as gpd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

In [40]:
# === CONFIGURATION ===
# Identifier of the study unit. Every per-HUC input file below is named after
# it, so switching to another HUC only requires changing this one value.
huc_id = "cluster_208_huc_041402011002"

# Input rasters and vector layer (paths are relative to the working directory).
naip_path = f"Data/NAIP/HUC_NAIP_Processed/{huc_id}_NAIP_metrics.tif"
dem_path = f"Data/TerrainProcessed/HUC_DEMs/{huc_id}.tif"
slp_path = f"Data/TerrainProcessed/HUC_TerrainMetrics/{huc_id}_terrain_slp_5m.tif"
labels_path = f"Data/Training_Data/{huc_id}_labels.tif"
wetlands_path = f"Data/Training_Data/HUC_Extracted_Training_Data/{huc_id}_NWI.gpkg"

# Directory that receives the X/y train and validation .npy files.
output_dir = Path("Data/Patches_v2")

patch_size = 256  # Patch edge length in pixels
max_offset = 32  # Random offset from centroid (pixels) to add variety
background_patches = 100  # Number of random background patches to include
val_split = 0.2  # Fraction of patches held out for validation
random_seed = 42  # Seed for NumPy sampling and the train/val split

In [41]:
# === CREATE OUTPUT DIRECTORY ===
output_dir.mkdir(parents=True, exist_ok=True)

# === LOAD ALL BANDS ===
print("Loading rasters...")

# Each read() without a band index returns a (bands, H, W) array.
with rasterio.open(naip_path) as src:
    naip = src.read()

with rasterio.open(dem_path) as src:
    dem = src.read()

with rasterio.open(slp_path) as src:
    slp = src.read()

with rasterio.open(labels_path) as src:
    labels = src.read(1)  # Single band -> (H, W)
    # Affine pixel<->map transform; later cells invert it to turn polygon
    # centroids into pixel indices.
    transform = src.transform

# Raster dimensions in pixels, used by the patch-extraction cells.
height, width = labels.shape

# Patches are cut from `inputs` and `labels` with the same pixel indices, so
# every raster has to share the label grid.
# NOTE(review): only the array shapes are compared here; it is assumed the
# rasters also share extent and CRS — confirm.
for raster_name, raster in (("NAIP", naip), ("DEM", dem), ("slope", slp)):
    if raster.shape[1:] != labels.shape:
        raise ValueError(
            f"{raster_name} raster shape {raster.shape[1:]} does not match "
            f"labels shape {labels.shape}"
        )

# Stack all input bands along the band axis: (total_bands, H, W)
inputs = np.vstack([naip, dem, slp])
print(f"Input stack shape: {inputs.shape}")
print(f"Labels shape: {labels.shape}")

Loading rasters...
Input stack shape: (11, 12000, 18000)
Labels shape: (12000, 18000)


In [34]:
# === LOAD WETLAND POLYGONS ===
# Read the vector layer at `wetlands_path` and report how many features it holds.
print("\nLoading wetland polygons...")
wetlands = gpd.read_file(wetlands_path)
print("Number of wetland polygons:", len(wetlands))

# === HELPER FUNCTION: EXTRACT PATCH ===
def extract_patch(center_row, center_col, patch_size, inputs, labels):
    """
    Extract a square patch centered at (center_row, center_col).

    Parameters
    ----------
    center_row, center_col : int
        Pixel indices of the patch centre.
    patch_size : int
        Edge length of the patch in pixels. The window starts
        ``patch_size // 2`` pixels before the centre and always spans exactly
        ``patch_size`` pixels, including for odd sizes.
    inputs : np.ndarray
        Band stack indexed as (band, row, col).
    labels : np.ndarray
        Label raster indexed as (row, col). Its shape defines the valid
        bounds; `inputs` is assumed to be on the same grid.

    Returns
    -------
    (X_patch, y_patch)
        Slices of `inputs` and `labels`, or ``(None, None)`` if the window
        would fall outside the raster or the input patch contains NaN.
    """
    half = patch_size // 2

    # Calculate bounds (end is derived from start so the size is exact)
    row_start = center_row - half
    col_start = center_col - half
    row_end = row_start + patch_size
    col_end = col_start + patch_size

    # Check bounds against the arrays actually passed in, rather than
    # module-level height/width names that may not exist.
    n_rows, n_cols = labels.shape[-2:]
    if row_start < 0 or col_start < 0 or row_end > n_rows or col_end > n_cols:
        return None, None

    # Extract patches
    X_patch = inputs[:, row_start:row_end, col_start:col_end]
    y_patch = labels[row_start:row_end, col_start:col_end]

    # Reject patches with missing input data.
    # NOTE(review): this only catches NaN; a numeric nodata value in the
    # rasters would pass through — confirm how nodata is encoded.
    if np.isnan(X_patch).any():
        return None, None

    return X_patch, y_patch

# === EXTRACT WETLAND-CENTERED PATCHES ===
print("\nExtracting wetland-centered patches...")
np.random.seed(random_seed)

# Read the affine transform from the label raster so this cell does not rely
# on a `transform` name left over from an earlier kernel session.
with rasterio.open(labels_path) as src:
    transform = src.transform
# Raster dimensions in pixels, kept as module-level names for bounds checks.
height, width = labels.shape

wetland_patches_X = []
wetland_patches_y = []
skipped_count = 0
invalid_geometry_count = 0

# NOTE(review): centroids are used as-is, which assumes `wetlands` is in the
# same CRS as the rasters — confirm, or reproject the layer first.
for geom in wetlands.geometry:
    # Null or empty geometries have no usable centroid.
    if geom is None or geom.is_empty:
        invalid_geometry_count += 1
        continue

    # Get centroid coordinates
    centroid = geom.centroid

    # Convert map coordinates to pixel coordinates.
    # Inverse transform: (x, y) -> (col, row) as floats.
    col_f, row_f = ~transform * (centroid.x, centroid.y)
    # Floor rather than truncate: int() rounds toward zero, which would map
    # points just outside the top/left edge (coordinates in (-1, 0)) onto pixel 0.
    col, row_px = int(np.floor(col_f)), int(np.floor(row_f))

    # Add random offset for variety (row offset drawn first, then column)
    offset_row = np.random.randint(-max_offset, max_offset + 1)
    offset_col = np.random.randint(-max_offset, max_offset + 1)
    center_row = row_px + offset_row
    center_col = col + offset_col

    # Extract patch
    X_patch, y_patch = extract_patch(center_row, center_col, patch_size, inputs, labels)

    if X_patch is not None:
        wetland_patches_X.append(X_patch)
        wetland_patches_y.append(y_patch)
    else:
        skipped_count += 1

print(f"Wetland-centered patches extracted: {len(wetland_patches_X)}")
print(f"Skipped (out of bounds or NaN): {skipped_count}")
if invalid_geometry_count:
    print(f"Skipped (null or empty geometry): {invalid_geometry_count}")

# === EXTRACT RANDOM BACKGROUND PATCHES ===
print(f"\nExtracting {background_patches} random background patches...")

# Take the raster dimensions from the label array so this cell does not rely
# on `height`/`width` names left over from an earlier kernel session.
height, width = labels.shape
half_patch = patch_size // 2

# Fail with a clear message if no patch centre can exist at all
# (np.random.randint would otherwise raise an opaque "low >= high").
if height - half_patch <= half_patch or width - half_patch <= half_patch:
    raise ValueError(
        f"Raster of shape {(height, width)} is too small for patches of size {patch_size}"
    )

background_patches_X = []
background_patches_y = []
attempts = 0
max_attempts = background_patches * 10  # Limit attempts to avoid infinite loop

while len(background_patches_X) < background_patches and attempts < max_attempts:
    attempts += 1

    # Random center point, far enough from the edges for a full patch
    center_row = np.random.randint(half_patch, height - half_patch)
    center_col = np.random.randint(half_patch, width - half_patch)

    # Extract patch
    X_patch, y_patch = extract_patch(center_row, center_col, patch_size, inputs, labels)

    if X_patch is None:
        continue

    # Only keep if it's purely background (no pixel with a label above 0)
    if not np.any(y_patch > 0):
        background_patches_X.append(X_patch)
        background_patches_y.append(y_patch)

print(f"Background patches extracted: {len(background_patches_X)}")
# Surface a shortfall instead of silently continuing with fewer patches.
if len(background_patches_X) < background_patches:
    print(
        f"Warning: only {len(background_patches_X)} of {background_patches} "
        f"background patches found after {attempts} attempts"
    )

# === COMBINE AND SPLIT ===
print("\nCombining and splitting data...")

# Wetland-centred patches first, background patches after; X and y share the order.
all_X = [*wetland_patches_X, *background_patches_X]
all_y = [*wetland_patches_y, *background_patches_y]

# Turn the patch lists into dense arrays: float32 inputs, uint8 labels.
X_array = np.asarray(all_X, dtype=np.float32)
y_array = np.asarray(all_y, dtype=np.uint8)

print(f"Total patches: {len(X_array)}")
print(f"X shape: {X_array.shape}")  # (N, bands, patch_size, patch_size)
print(f"y shape: {y_array.shape}")  # (N, patch_size, patch_size)

# Train/val split: hold out `val_split` of the patches, seeded for repeatability.
split_options = dict(test_size=val_split, random_state=random_seed)
X_train, X_val, y_train, y_val = train_test_split(X_array, y_array, **split_options)

print(f"\nTrain patches: {len(X_train)}")
print(f"Validation patches: {len(X_val)}")

# === SAVE PATCHES ===
# One .npy file per array, all written into `output_dir`.
for filename, array in (
    ("X_train.npy", X_train),
    ("y_train.npy", y_train),
    ("X_val.npy", X_val),
    ("y_val.npy", y_val),
):
    np.save(output_dir / filename, array)

print(f"\nSaved patches to {output_dir}")

# === CLASS DISTRIBUTION IN TRAINING SET ===
print("\nClass distribution in training patches (pixel counts):")
# Label value -> display name; values not listed are reported as 'Unknown'.
class_names = {0: 'Background', 1: 'EMW', 2: 'FSW', 3: 'SSW', 4: 'OWW'}
unique, counts = np.unique(y_train, return_counts=True)
total = y_train.size
for label_value, n_pixels in zip(unique, counts):
    label_name = class_names.get(label_value, 'Unknown')
    share = (n_pixels / total) * 100
    print(f"  {label_name}: {n_pixels:,} pixels ({share:.2f}%)")


Extracting patches...
Total possible patches: 3220
Patches with wetlands: 398
Background-only patches: 438


In [33]:
# === SAMPLE BACKGROUND PATCHES ===
# Fraction of the background-only patches to keep. This name is not defined in
# any other visible cell; 0.1 is inferred from the recorded output of this cell
# (43 kept out of 438 background-only patches) — TODO confirm, and consider
# moving it to the configuration cell.
background_sample_ratio = 0.1

np.random.seed(random_seed)
n_background_available = len(background_patches_X)
# Never request more patches than exist (sampling is without replacement).
n_background_keep = min(
    n_background_available,
    int(n_background_available * background_sample_ratio),
)

if n_background_keep > 0:
    background_indices = np.random.choice(
        n_background_available,
        size=n_background_keep,
        replace=False
    )
else:
    # Nothing to sample: empty pool, or the ratio rounds down to zero.
    background_indices = np.array([], dtype=int)

sampled_bg_X = [background_patches_X[i] for i in background_indices]
sampled_bg_y = [background_patches_y[i] for i in background_indices]

print(f"Sampled background patches: {len(sampled_bg_X)}")

Sampled background patches: 43


In [17]:
# === COMBINE AND SPLIT ===
# Wetland-centred patches followed by the sub-sampled background patches.
all_X = [*wetland_patches_X, *sampled_bg_X]
all_y = [*wetland_patches_y, *sampled_bg_y]

# Dense arrays: float32 inputs, uint8 labels.
X_array = np.asarray(all_X, dtype=np.float32)
y_array = np.asarray(all_y, dtype=np.uint8)

print(f"\nTotal patches: {len(X_array)}")
print(f"X shape: {X_array.shape}")  # (N, bands, patch_size, patch_size)
print(f"y shape: {y_array.shape}")  # (N, patch_size, patch_size)

# Train/val split: hold out `val_split` of the patches, seeded for repeatability.
split_options = {"test_size": val_split, "random_state": random_seed}
X_train, X_val, y_train, y_val = train_test_split(X_array, y_array, **split_options)

print(f"\nTrain patches: {len(X_train)}")
print(f"Validation patches: {len(X_val)}")


Total patches: 441
X shape: (441, 7, 256, 256)
y shape: (441, 256, 256)

Train patches: 352
Validation patches: 89


In [18]:
# === SAVE PATCHES ===
# Write each split to its own .npy file inside `output_dir`.
arrays_to_save = {
    "X_train.npy": X_train,
    "y_train.npy": y_train,
    "X_val.npy": X_val,
    "y_val.npy": y_val,
}
for filename, array in arrays_to_save.items():
    np.save(output_dir / filename, array)

print(f"\nSaved patches to {output_dir}")

# === CLASS DISTRIBUTION IN TRAINING SET ===
print("\nClass distribution in training patches (pixel counts):")
# Label value -> display name; anything else is printed as 'Unknown'.
label_lookup = {0: 'Background', 1: 'EMW', 2: 'FSW', 3: 'SSW', 4: 'OWW'}
unique, counts = np.unique(y_train, return_counts=True)
total = y_train.size
for position in range(len(unique)):
    val = unique[position]
    count = counts[position]
    class_name = label_lookup.get(val, 'Unknown')
    pct = (count / total) * 100
    print(f"  {class_name}: {count:,} pixels ({pct:.2f}%)")


Saved patches to Data/Patches

Class distribution in training patches (pixel counts):
  Background: 19,837,679 pixels (85.99%)
  EMW: 527,896 pixels (2.29%)
  FSW: 1,357,839 pixels (5.89%)
  SSW: 1,157,860 pixels (5.02%)
  OWW: 187,398 pixels (0.81%)


In [19]:
import numpy as np

# Load the training inputs from the configured output directory, so the
# statistics describe the patches this notebook writes rather than a
# hard-coded "Data/Patches" folder left from an earlier configuration.
X_train = np.load(output_dir / "X_train.npy")

band_names = ['R', 'G', 'B', 'NIR', 'NDWI', 'NDVI', 'DEM']

# The names only make sense if they line up one-to-one with the channel axis.
# If the stack has a different number of bands, fall back to positional names
# rather than printing possibly mislabeled (or silently truncated) statistics.
n_bands = X_train.shape[1]
if n_bands != len(band_names):
    print(
        f"Warning: X_train has {n_bands} bands but {len(band_names)} band names "
        f"are listed; using positional names instead"
    )
    band_names = [f"band_{i}" for i in range(n_bands)]

# Per-band summary over all patches and pixels
for i, name in enumerate(band_names):
    band = X_train[:, i, :, :]
    print(f"{name}: min={band.min():.2f}, max={band.max():.2f}, mean={band.mean():.2f}")

R: min=0.00, max=255.00, mean=94.74
G: min=19.00, max=255.00, mean=127.76
B: min=55.00, max=255.00, mean=126.93
NIR: min=12.00, max=255.00, mean=184.27
NDWI: min=-0.79, max=1.00, mean=0.33
NDVI: min=-0.70, max=0.82, mean=-0.18
DEM: min=311.67, max=495.94, mean=363.50
