# Generate 2D Plane Sequence Data

This notebook reads raw 2D plane data from FEFLOW simulations and generates temporal sequences for training the GFNO model.

**Output**: Saves sequence data organized by planes to disk for use in model training.

In [2]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

## Configuration

In [3]:
# --- Processing parameters ---------------------------------------------------
N_planes = 32     # number of planes handled by every per-plane loop below
alpha = 16        # window length: alpha input steps followed by alpha output steps
skip_factor = 2   # temporal subsampling: keep every skip_factor-th timestep

# --- Input locations ---------------------------------------------------------
# Folder holding one 'timestep_*' sub-directory per simulation timestep.
data_path = '/Users/arpitkapoor/data/GW/2d_plane_data'
# Header-less CSV; its first column is read as the simulation time.
sea_level_csv = '/Users/arpitkapoor/Library/CloudStorage/OneDrive-UNSW/Shared/Data/FEFLOW/simulation_files/SeaLevelDataPeaksHL.csv'

# --- Output location ---------------------------------------------------------
# Root folder that receives one 'plane_XX' directory per plane plus config.json.
output_data_dir = '/Users/arpitkapoor/data/GW/2d_plane_sequences'

## Load Timestep Data

In [4]:
# Load the simulation times. The CSV has no header row, so its two columns are
# named explicitly; only 'time' is kept, subsampled with stride skip_factor.
times = pd.read_csv(sea_level_csv, header=None, names=['time', 'sealevel'])['time'].values[::skip_factor]
print(f"Loaded {len(times)} timesteps")

# Collect the 'timestep_*' directories in name order and subsample them with the
# same stride, so that sorted_data_dirs[i] is paired with times[i] downstream.
# NOTE(review): the ordering is lexicographic, which matches chronological order
# only if the numeric suffix is zero-padded -- confirm against the data on disk.
sorted_data_dirs = sorted(d for d in os.listdir(data_path) if d.startswith('timestep_'))
sorted_data_dirs = sorted_data_dirs[::skip_factor]
print(f"Found {len(sorted_data_dirs)} timestep directories")

# The loading cell looks up times[t] for every directory index t, so fail here
# with a clear message instead of an IndexError deep inside the loading loop.
if len(sorted_data_dirs) > len(times):
    raise ValueError(
        f"Found {len(sorted_data_dirs)} timestep directories but only {len(times)} time values; "
        "cannot assign a time to every directory"
    )
if len(sorted_data_dirs) < len(times):
    # Not fatal (the extra trailing times are simply unused), but worth surfacing
    # because it may indicate missing timestep directories.
    print(f"Warning: {len(times) - len(sorted_data_dirs)} time values have no matching timestep directory")

Loaded 955 timesteps
Found 955 timestep directories


## Load and Process Plane Data

In [5]:
# Per-plane accumulators: one list entry per successfully loaded timestep.
plane_data_all_timesteps = {p: [] for p in range(N_planes)}
bc_data_all_timesteps = {p: [] for p in range(N_planes)}
# Per plane: BC node ids present at every loaded timestep (running intersection).
bc_nodes_for_plane = {}

print("Loading plane data across all timesteps...")
for t, d in enumerate(tqdm(sorted_data_dirs)):
    ts_dir = os.path.join(data_path, d)
    # Keep only sub-directories, in name order. Stray files (e.g. .DS_Store)
    # would otherwise shift the plane index p or make np.load fail.
    sorted_planes_dir = [
        os.path.join(ts_dir, f)
        for f in sorted(os.listdir(ts_dir))
        if os.path.isdir(os.path.join(ts_dir, f))
    ]

    for p, f in enumerate(sorted_planes_dir):
        # Load plane data; the context manager closes the .npz handle promptly.
        with np.load(os.path.join(f, 'plane_data.npz')) as plane_data:
            S = plane_data['S']
            Z = plane_data['Z']
            X = plane_data['X']
            Y = plane_data['Y']
            head_m = plane_data['head_m']
            mass_conc_mg_l = plane_data['mass_conc_mg_l']
        # Constant time channel with the same shape as the coordinate grids.
        T = np.ones_like(X) * times[t]

        # Load boundary condition data
        with np.load(os.path.join(f, 'bc_data.npz')) as bc_data:
            bc_node_idx = bc_data['node_idx']
            bc_S = bc_data['S']
            bc_Z = bc_data['Z']
            bc_head = bc_data['head']
            bc_mass_conc = bc_data['mass_conc']
        bc_T = np.ones_like(bc_node_idx) * times[t]

        # Validate BC data consistency BEFORE storing anything for this
        # (plane, timestep): skipping only the BC entry would leave the plane
        # and BC lists with different lengths and shift them against each other
        # in time. node_idx is included because it is stacked with the others.
        if not (len(bc_node_idx) == len(bc_S) == len(bc_Z) == len(bc_head) == len(bc_mass_conc)):
            print(f"Warning: Plane {p} at timestep {t} has mismatched BC array lengths")
            continue

        # Stack along a new last axis. Channel order: T, S, Z, X, Y, head, mass_conc
        stacked_plane = np.stack([T, S, Z, X, Y, head_m, mass_conc_mg_l], axis=-1)
        plane_data_all_timesteps[p].append(stacked_plane)

        # Track BC nodes common to all timesteps. Initialise on the first valid
        # timestep of this plane (not strictly t == 0, which may have been skipped).
        node_ids = bc_node_idx.astype(np.int32)
        if p not in bc_nodes_for_plane:
            bc_nodes_for_plane[p] = node_ids
        else:
            bc_nodes_for_plane[p] = np.intersect1d(bc_nodes_for_plane[p], node_ids)

        # Stack BC data. Column order: node_idx, T, S, Z, head, mass_conc
        stacked_bc = np.stack([bc_node_idx, bc_T, bc_S, bc_Z, bc_head, bc_mass_conc], axis=-1)
        bc_data_all_timesteps[p].append(stacked_bc)

print("\nData loading complete!")

Loading plane data across all timesteps...


100%|██████████| 955/955 [00:18<00:00, 52.02it/s]


Data loading complete!





## Stack Data Across Timesteps

In [6]:
# Turn each plane's list of per-timestep arrays into a single array whose
# leading axis is time (np.stack inserts the new axis at position 0 by default).
print("Stacking plane data...")
for p in tqdm(range(N_planes)):
    plane_data_all_timesteps[p] = np.stack(plane_data_all_timesteps[p])

example_plane_shape = plane_data_all_timesteps[0].shape
print(f"Example plane data shape: {example_plane_shape}")

Stacking plane data...


100%|██████████| 32/32 [00:00<00:00, 76.73it/s]

Example plane data shape: (955, 32, 32, 7)





In [7]:
# For every plane, drop BC rows whose node is not present at all timesteps,
# then stack the filtered per-timestep tables along a new leading time axis.
print("\nCleaning and stacking BC data...")
for p in tqdm(range(N_planes)):
    shared_nodes = bc_nodes_for_plane[p]
    per_timestep_rows = bc_data_all_timesteps[p]

    # Column 0 of each BC table holds the node index; rows keep their stored order.
    # NOTE(review): np.stack needs the same number of surviving rows at every
    # timestep, and rows only line up across time if the node ordering is the
    # same in every file -- confirm against the exporter.
    bc_data_all_timesteps[p] = np.stack(
        [rows[np.isin(rows[:, 0], shared_nodes)] for rows in per_timestep_rows],
        axis=0,
    )

example_bc_shape = bc_data_all_timesteps[0].shape
print(f"Example BC data shape: {example_bc_shape}")


Cleaning and stacking BC data...


100%|██████████| 32/32 [00:00<00:00, 69.88it/s]

Example BC data shape: (955, 326, 6)





## Generate Temporal Sequences

In [8]:
# Per-plane containers for the generated windows (filled by the next cell).
input_sequences = {p: {'input_geom': [], 'input_data': [], 'latent_geom': [], 'latent_features': []} for p in range(N_planes)}
output_sequences = {p: {'latent_geom': [], 'latent_features': []} for p in range(N_planes)}

# Each window is (t_start, t_mid, t_end): input covers [t_start, t_mid) and
# output covers [t_mid, t_end), both alpha steps long. Consecutive windows are
# shifted by half a window; the stride is floored at 1 because alpha // 2 == 0
# for alpha == 1 would make range() raise.
n_timesteps = len(sorted_data_dirs)
stride = max(1, alpha // 2)
ts = [(i, i + alpha, i + 2 * alpha) for i in range(0, n_timesteps - 2 * alpha + 1, stride)]

print(f"Generated {len(ts)} time sequences with alpha={alpha}")
if ts:
    # t_end is an exclusive bound and may equal len(times); clamp the indices
    # used for this preview so the lookup cannot run past the end of `times`.
    last_valid = len(times) - 1
    for label, window in (("First", ts[0]), ("Last", ts[-1])):
        preview_idx = [min(i, last_valid) for i in window]
        print(f"{label} sequence: {window} -> times: {times[preview_idx]}")
else:
    print(f"Warning: {n_timesteps} timesteps are too few for a window of 2 * alpha = {2 * alpha} steps")

Generated 116 time sequences with alpha=16
First sequence: (0, 16, 32) -> times: [ 0.         13.91666667 26.91666667]
Last sequence: (920, 936, 952) -> times: [875.5833333 891.4583333 906.3333333]


In [9]:
# Generate sequences for all planes.
# Windows are accumulated in fresh local lists and written back as arrays, so
# this cell can be re-run on its own (appending to input_sequences[p][key]
# directly breaks on a second run, once those entries have become ndarrays).
print("\nGenerating sequences for all planes...")
for p in tqdm(range(N_planes)):
    bc_data = bc_data_all_timesteps[p]        # (n_timesteps, n_bc_nodes, 6): node_idx, T, S, Z, head, mass_conc
    plane_data = plane_data_all_timesteps[p]  # (n_timesteps, ..., 7): T, S, Z, X, Y, head, mass_conc

    plane_inputs = {'input_geom': [], 'input_data': [], 'latent_geom': [], 'latent_features': []}
    plane_outputs = {'latent_geom': [], 'latent_features': []}

    for t_start, t_mid, t_end in ts:
        # Input window [t_start:t_mid]: BC samples are flattened over (time, node).
        plane_inputs['input_geom'].append(bc_data[t_start:t_mid, ..., 1:4].reshape(-1, 3))  # T, S, Z
        plane_inputs['input_data'].append(bc_data[t_start:t_mid, ..., 4:].reshape(-1, 2))   # head, mass_conc
        plane_inputs['latent_geom'].append(plane_data[t_start:t_mid, ..., :3])              # T, S, Z
        plane_inputs['latent_features'].append(plane_data[t_start:t_mid, ..., :])           # all 7 channels

        # Output window [t_mid:t_end]: geometry plus the two target channels.
        plane_outputs['latent_geom'].append(plane_data[t_mid:t_end, ..., :3])               # T, S, Z
        plane_outputs['latent_features'].append(plane_data[t_mid:t_end, ..., -2:])          # head, mass_conc

    # Convert to numpy arrays with a leading "sequence" axis.
    input_sequences[p] = {key: np.array(seqs) for key, seqs in plane_inputs.items()}
    output_sequences[p] = {key: np.array(seqs) for key, seqs in plane_outputs.items()}

print(f"\nSequence generation complete!")
print(f"Total sequences per plane: {len(ts)}")
print(f"Total planes: {N_planes}")
print(f"\nExample shapes for plane 0:")
print(f"  input_geom: {input_sequences[0]['input_geom'].shape}")
print(f"  input_data: {input_sequences[0]['input_data'].shape}")
print(f"  latent_geom: {input_sequences[0]['latent_geom'].shape}")
print(f"  latent_features: {input_sequences[0]['latent_features'].shape}")
print(f"  output_latent_geom: {output_sequences[0]['latent_geom'].shape}")
print(f"  output_latent_features: {output_sequences[0]['latent_features'].shape}")


Generating sequences for all planes...


100%|██████████| 32/32 [00:00<00:00, 3648.41it/s]



Sequence generation complete!
Total sequences per plane: 116
Total planes: 32

Example shapes for plane 0:
  input_geom: (116, 5216, 3)
  input_data: (116, 5216, 2)
  latent_geom: (116, 16, 32, 32, 3)
  latent_features: (116, 16, 32, 32, 7)
  output_latent_geom: (116, 16, 32, 32, 3)
  output_latent_features: (116, 16, 32, 32, 2)


## Save Sequences to Disk

In [10]:
import json

# Make sure the output root exists before writing anything into it.
os.makedirs(output_data_dir, exist_ok=True)

print(f"Saving plane sequence data to {output_data_dir}...")

for p in tqdm(range(N_planes)):
    # One zero-padded folder per plane, e.g. plane_00, plane_01, ...
    plane_dir = os.path.join(output_data_dir, f'plane_{p:02d}')
    os.makedirs(plane_dir, exist_ok=True)

    # (file name, array) pairs; output arrays get an 'output_' file-name prefix.
    arrays_to_save = (
        ('input_geom.npy', input_sequences[p]['input_geom']),
        ('input_data.npy', input_sequences[p]['input_data']),
        ('latent_geom.npy', input_sequences[p]['latent_geom']),
        ('latent_features.npy', input_sequences[p]['latent_features']),
        ('output_latent_geom.npy', output_sequences[p]['latent_geom']),
        ('output_latent_features.npy', output_sequences[p]['latent_features']),
    )
    for file_name, array in arrays_to_save:
        np.save(os.path.join(plane_dir, file_name), array)

# Record the settings this dataset was generated with next to the data.
config = dict(
    n_planes=N_planes,
    n_sequences_per_plane=len(ts),
    sequence_length=alpha,
    skip_factor=skip_factor,
    n_timesteps=n_timesteps,
    data_path=data_path,
    sea_level_csv=sea_level_csv,
)

config_path = os.path.join(output_data_dir, 'config.json')
with open(config_path, 'w') as config_file:
    json.dump(config, config_file, indent=4)

print(f"\nData saved successfully!")
print(f"Output directory: {output_data_dir}")
print(f"Configuration saved to: {config_path}")
print(f"\nYou can now use this data with GWPlaneDatasetFromFiles in the model training notebook.")

Saving plane sequence data to /Users/arpitkapoor/data/GW/2d_plane_sequences...


100%|██████████| 32/32 [00:08<00:00,  3.95it/s]


Data saved successfully!
Output directory: /Users/arpitkapoor/data/GW/2d_plane_sequences
Configuration saved to: /Users/arpitkapoor/data/GW/2d_plane_sequences/config.json

You can now use this data with GWPlaneDatasetFromFiles in the model training notebook.



