In this notebook:
1. We load the finite element result files.
1. We split the data into train/validation/test sets.
1. We compute normalization statistics (from the training set only).
1. We save everything in one master file.

### General Libraries

In [1]:
import numpy as np
import h5py
from pathlib import Path

In [2]:

# ========================= CONFIG =========================
# Input: directory holding the per-RVE finite element result files.
DATA_DIR = Path("./data/Run1")

# Output: folder and file name of the consolidated (master) dataset.
MASTER_FOLDER = Path("master_data")
MASTER_FILE = MASTER_FOLDER.joinpath("rve_run1.h5")

# Create the output folder up front; this is a no-op when it already exists.
MASTER_FOLDER.mkdir(exist_ok=True, parents=True)

In [3]:

# Collect the .h5 result files of the data directory.
# Sorting makes the order (and therefore "the first file") deterministic.
h5_files = sorted(DATA_DIR.glob("*.h5"))
if not h5_files:
    # Fail here with an actionable message instead of an IndexError below.
    raise FileNotFoundError(f"No .h5 files found in {DATA_DIR.resolve()}")

N_RVE = len(h5_files)            # Total number of RVE files available

# Take dimensions and number of steps from the first file.
# NOTE(review): this assumes every file shares the same H_gp / W_gp / n_steps
# attributes; the other files are not checked here.
first_file = h5_files[0]
with h5py.File(first_file, 'r') as f:
    # Read metadata attributes for dimensions and steps
    H = int(f.attrs['H_gp'])
    W = int(f.attrs['W_gp'])
    N_STEPS_PER_RVE = int(f.attrs['n_steps'])  # Time steps to use from each RVE

print(f'Total RVE files found: {N_RVE}')
print(f'Image dimensions (H x W): {H} x {W}')
print(f'Time steps per RVE: {N_STEPS_PER_RVE}')

Total RVE files found: 1000
Image dimensions (H x W): 96 x 96
Time steps per RVE: 100


In [4]:

from sklearn.model_selection import train_test_split

# ====================== TRAIN / VALIDATION / TEST ======================
# The split is made over RVE identifiers, not over individual time steps.

rnd_seed = 42

# Target fractions of the full RVE population
val_size = 0.20
test_size = 0.20
train_size = 1 - (val_size + test_size)

# RVE identifiers are 1-based: 1 .. N_RVE
rve_indexs = list(range(1, N_RVE + 1))

# Stage 1: hold out the test RVEs
train_val_rves, test_rves = train_test_split(
    rve_indexs,
    test_size=test_size,
    random_state=rnd_seed,
)

# Stage 2: carve the validation RVEs out of the remainder. The fraction is
# rescaled because it is now relative to train+val, not to the whole set.
val_share_of_remainder = val_size / (train_size + val_size)
train_rves, val_rves = train_test_split(
    train_val_rves,
    test_size=val_share_of_remainder,
    random_state=rnd_seed,
)

# Subset sizes, counted in RVEs
ntrain, nval, ntest = len(train_rves), len(val_rves), len(test_rves)

print(f"Train RVEs: {ntrain} (f={train_size:.2f}) | Val: {nval}  (f={val_size:.2f}) | Test: {ntest}  (f={test_size:.2f})")
print(f"Total samples: {N_RVE * N_STEPS_PER_RVE}")


Train RVEs: 600 (f=0.60) | Val: 200  (f=0.20) | Test: 200  (f=0.20)
Total samples: 100000


In [5]:
import concurrent.futures
from tqdm import tqdm

# ====================== CREATE MASTER HDF5 ======================

"""
Create Master HDF5 dataset for Dual-Encoder FNO
- x_local  : phase field (0=soft, 1=hard)
- x_global : [exx, eyy, gxy, E_soft, nu_soft, E_hard, nu_hard]
- y_local  : [Sxx, Syy, Sxy]
"""

# Helper: read both arrays of a single RVE result file into memory
def load_rve_data(file_path):
    """Read the ``fields`` and ``macro`` datasets of one RVE file.

    Parameters
    ----------
    file_path : path-like
        HDF5 file to open in read-only mode.

    Returns
    -------
    tuple
        ``(fields, macro)``, both fully read into memory. The file is
        closed when the function returns.
    """
    with h5py.File(file_path, 'r') as rve_file:
        return rve_file['fields'][:], rve_file['macro'][:]

workers = 8

# Cap on RVE files that are loaded (or being loaded) but not yet written.
# executor.map() submits every file of a split at once, so whenever reading
# outpaces the gzip-compressed writes the loaded arrays pile up in memory.
max_prefetch = 2 * workers


def _iter_prefetched(executor, fn, items, max_pending):
    """Yield ``fn(item)`` for each item, in input order, with bounded look-ahead.

    Never more than ``max_pending`` calls are submitted to ``executor``
    without their result having been handed to the consumer.
    """
    pending = []
    for item in items:
        pending.append(executor.submit(fn, item))
        if len(pending) >= max_pending:
            yield pending.pop(0).result()
    while pending:
        yield pending.pop(0).result()


with h5py.File(MASTER_FILE, 'w') as f:
    for split_name, rve_list in [("train", train_rves),
                                 ("val",   val_rves),
                                 ("test",  test_rves)]:

        N_split = len(rve_list) * N_STEPS_PER_RVE
        g = f.create_group(split_name)

        # maxshape equal to the initial shape: nothing changes when every file
        # is complete, but the datasets can be shrunk afterwards if needed.
        x_local_shape = (N_split, H, W, 1)
        x_global_shape = (N_split, 7)
        y_local_shape = (N_split, H, W, 3)
        x_local_ds = g.create_dataset('x_local', shape=x_local_shape, maxshape=x_local_shape,
                                      dtype=np.float32, compression='gzip', chunks=True)
        x_global_ds = g.create_dataset('x_global', shape=x_global_shape, maxshape=x_global_shape,
                                       dtype=np.float32, compression='gzip', chunks=True)
        y_local_ds = g.create_dataset('y_local', shape=y_local_shape, maxshape=y_local_shape,
                                      dtype=np.float32, compression='gzip', chunks=True)

        idx = 0  # next free row in the split's datasets
        print(f"\nProcessing {split_name} split ({N_split} samples)...")

        # Pre-compute paths
        file_paths = [DATA_DIR / f"rve_{rve_id:04d}.h5" for rve_id in rve_list]

        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:

            results = _iter_prefetched(executor, load_rve_data, file_paths, max_prefetch)

            for fields, macro in tqdm(results, total=len(file_paths)):
                # A file may hold fewer steps than expected; take what is there.
                n_steps = min(N_STEPS_PER_RVE, len(fields))

                start_idx = idx
                end_idx = idx + n_steps

                # Channel 0 of `fields` is the local input, channels 1..3 the target.
                x_local_ds[start_idx:end_idx] = fields[:n_steps, :, :, 0:1]
                x_global_ds[start_idx:end_idx] = macro[:n_steps]
                y_local_ds[start_idx:end_idx] = fields[:n_steps, :, :, 1:4]

                idx = end_idx

        # Short files leave unwritten (zero-filled) rows at the end of the
        # pre-allocated datasets; drop them so they cannot leak into the
        # statistics or the training samples.
        if idx < N_split:
            for ds in (x_local_ds, x_global_ds, y_local_ds):
                ds.resize(idx, axis=0)
            print(f"   {split_name}: some files had fewer than {N_STEPS_PER_RVE} steps; "
                  f"kept {idx} of {N_split} samples")

print("\n✅ Master HDF5 created successfully:")
print(f"   → {MASTER_FILE}")


Processing train split (60000 samples)...


100%|██████████| 600/600 [38:40<00:00,  3.87s/it] 



Processing val split (20000 samples)...


100%|██████████| 200/200 [12:45<00:00,  3.83s/it]



Processing test split (20000 samples)...


100%|██████████| 200/200 [13:46<00:00,  4.13s/it]


✅ Master HDF5 created successfully:
   → master_data\rve_run1.h5





In [6]:
# ====================== COMPUTE AND SAVE STATS ======================

def _streaming_mean_std(dataset, batch_size=512):
    """Per-channel mean and population std of ``dataset``, read in batches.

    Every axis except the last one is reduced. The data is read
    ``batch_size`` rows (first axis) at a time and accumulated in float64,
    so the whole dataset never has to fit in memory and the reduction does
    not lose precision the way a float32 accumulator can. Batch results are
    merged with the pairwise mean / sum-of-squared-deviations update.

    Returns
    -------
    (mean, std) : tuple of float64 arrays
        Shaped like the original ``keepdims=True`` reduction with its first
        axis squeezed: ``(1, 1, C)`` for 4-D input, ``(C,)`` for 2-D input.
    """
    n_rows = dataset.shape[0]
    if n_rows == 0:
        raise ValueError("Cannot compute statistics of an empty dataset")

    n_channels = dataset.shape[-1]
    count = 0
    mean = np.zeros(n_channels, dtype=np.float64)
    m2 = np.zeros(n_channels, dtype=np.float64)  # sum of squared deviations

    for start in range(0, n_rows, batch_size):
        batch = np.asarray(dataset[start:start + batch_size], dtype=np.float64)
        batch = batch.reshape(-1, n_channels)

        b_count = batch.shape[0]
        b_mean = batch.mean(axis=0)
        b_m2 = ((batch - b_mean) ** 2).sum(axis=0)

        # Merge the batch into the running statistics.
        delta = b_mean - mean
        total = count + b_count
        mean = mean + delta * (b_count / total)
        m2 = m2 + b_m2 + delta ** 2 * (count * b_count / total)
        count = total

    std = np.sqrt(m2 / count)  # population std (ddof=0), as np.std defaults to

    out_shape = (1,) * (len(dataset.shape) - 2) + (n_channels,)
    return mean.reshape(out_shape), std.reshape(out_shape)


print("\nComputing normalization statistics (from train only)...")

with h5py.File(MASTER_FILE, 'a') as f:
    train = f['train']

    # Compute everything first, so that a failure leaves existing stats intact.
    computed = [(name, _streaming_mean_std(train[name]))
                for name in ('x_local', 'x_global', 'y_local')]

    # Replace statistics left by a previous run; create_group() alone would
    # raise when the cell is executed a second time.
    if 'stats' in f:
        del f['stats']
    stats = f.create_group('stats')

    for name, (mean, std) in computed:
        stats.create_dataset(f'mean_{name}', data=mean.astype(np.float32))
        # The small epsilon keeps the std usable as a divisor for constant channels.
        stats.create_dataset(f'std_{name}',  data=(std + 1e-8).astype(np.float32))

print("   Statistics saved in group '/stats'")
print("\n✅ Master dataset is ready for training!")


Computing normalization statistics (from train only)...
   Statistics saved in group '/stats'

✅ Master dataset is ready for training!
