In [4]:
# Install required dependencies
import sys
import subprocess

def install_package(package, version=None):
    """Pip-install ``package`` into the interpreter that runs this kernel.

    Args:
        package: Name of the distribution to install.
        version: Optional version; when truthy the requirement is pinned as
            ``package==version``, otherwise the bare name is used.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    requirement = f"{package}=={version}" if version else package
    pip_command = [sys.executable, "-m", "pip", "install", requirement]

    print(f"Installing {requirement}...")
    # No dependency restrictions are passed, so pip also installs whatever
    # the requested package itself depends on.
    subprocess.check_call(pip_command)
    print(f"Successfully installed {requirement}")

# Only third-party packages are installed; standard-library modules need nothing
for required_package in (
    "boto3",       # S3/MinIO operations (pulls in botocore)
    "h5py",        # HDF5 file operations
    "matplotlib",  # plotting
    "psutil",      # memory tracking
    "numpy",       # numerical operations
):
    install_package(required_package)

print("Dependencies installed successfully.")

Installing boto3...
Successfully installed boto3
Installing h5py...
Successfully installed h5py
Installing matplotlib...
Successfully installed matplotlib
Installing psutil...
Successfully installed psutil
Installing numpy...
Successfully installed numpy
Dependencies installed successfully.


In [None]:
# 2_rom_modeling.ipynb
#
# This notebook preprocesses the Cylinder Flow Dataset with memory optimizations:
# - Loads the data from MinIO
# - Extracts velocity fields
# - Creates the snapshot matrix
# - Performs mean subtraction
# - Normalizes the data if needed
# - Uploads processed data back to MinIO

import os

# Thread control to limit memory usage.
# These caps are exported BEFORE NumPy / h5py are imported: native threading
# runtimes generally read such variables when they initialise, so assigning
# them after the numerical libraries are loaded may have no effect.
os.environ["OMP_NUM_THREADS"] = "2"
os.environ["MKL_NUM_THREADS"] = "2"
os.environ["NUMEXPR_MAX_THREADS"] = "2"

import numpy as np
import h5py
import matplotlib.pyplot as plt
import json
from datetime import datetime
import boto3
from botocore.client import Config
import tempfile
import io
import gc  # For garbage collection
import psutil  # For memory tracking

# Memory management flags
MEMORY_EFFICIENT = True        # Build the snapshot matrix chunk by chunk instead of loading the whole dataset
CHUNK_SIZE = 10                # Process this many snapshots at a time
ENABLE_MEMORY_TRACKING = True  # Gate for print_memory_usage() output
DOWNSAMPLE = False             # Recorded in params["preprocessing_options"]
DOWNSAMPLE_FACTOR = 2          # Only use every Nth point in grid (recorded only when DOWNSAMPLE is True)

# Function to track memory usage
def print_memory_usage(label=""):
    """Print the resident set size of the current process in MB.

    Args:
        label: Free-text tag inserted into the message to identify the
            point in the notebook at which the measurement was taken.

    Nothing is printed when ``ENABLE_MEMORY_TRACKING`` is false.
    """
    if not ENABLE_MEMORY_TRACKING:
        return

    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    memory_mb = rss_bytes / 1024 / 1024  # bytes -> KiB -> MiB
    print(f"Memory usage {label}: {memory_mb:.1f} MB")

print_memory_usage("at start")

# Function to convert NumPy types to Python native types for JSON serialization
def json_serialize(obj):
    """Recursively convert ``obj`` into built-in types that ``json`` can encode.

    Conversions performed:
      * NumPy booleans, integers and floats -> ``bool`` / ``int`` / ``float``
      * ``np.ndarray`` -> nested lists (via ``tolist()``)
      * named tuples -> ``dict`` of their fields
      * plain tuples and lists -> ``list``
      * dicts -> ``dict`` with the same keys

    Containers are walked recursively so NumPy values nested at any depth
    are converted as well.  Anything else is returned unchanged.

    Args:
        obj: Arbitrary value, possibly containing NumPy scalars or arrays.

    Returns:
        The converted value.
    """
    # np.bool_ is not a subclass of np.integer, so it needs its own branch.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: json_serialize(value) for key, value in obj.items()}
    if isinstance(obj, tuple) and hasattr(obj, '_asdict'):
        # Named tuples keep their field names by becoming a dict
        return {key: json_serialize(value) for key, value in obj._asdict().items()}
    if isinstance(obj, (list, tuple)):
        return [json_serialize(item) for item in obj]
    return obj

def save_dict_to_json(data_dict, filepath):
    """Write ``data_dict`` to ``filepath`` as indented JSON, converting NumPy values.

    Top-level values are passed through ``json_serialize``.  Values nested
    deeper that ``json`` cannot encode on its own (e.g. a NumPy scalar inside
    a nested dict) are converted through the encoder's ``default`` hook.

    Args:
        data_dict: Mapping to serialise.
        filepath: Destination path; the file is overwritten.

    Raises:
        TypeError: if a value cannot be converted to a JSON type.
    """
    def _convert_nested(value):
        # json invokes this hook only for objects it cannot encode itself.
        converted = json_serialize(value)
        if converted is value:
            # json_serialize hands unknown objects back untouched; fail with
            # the usual TypeError instead of letting the encoder loop on it.
            raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")
        return converted

    # Convert data to JSON-serializable types
    serializable_dict = {key: json_serialize(value) for key, value in data_dict.items()}

    # Encode fully before touching the file so a serialization error cannot
    # leave a truncated JSON file behind.
    payload = json.dumps(serializable_dict, indent=2, default=_convert_nested)

    with open(filepath, 'w') as f:
        f.write(payload)

# Scratch directory on local disk for downloads and intermediate files
temp_dir = tempfile.mkdtemp()
print(f"Using temporary directory: {temp_dir}")

# Connect to MinIO
print("Connecting to MinIO...")
s3_endpoint = os.environ.get('S3_ENDPOINT', 'http://minio:9000' )

# A non-empty endpoint given without a scheme gets http:// prepended
endpoint_lacks_scheme = bool(s3_endpoint) and not s3_endpoint.startswith(('http://', 'https://' ))
if endpoint_lacks_scheme:
    s3_endpoint = f"http://{s3_endpoint}"
    print(f"Adding http:// prefix to endpoint: {s3_endpoint}" )

# Credentials come from the environment, with fallback values when unset
s3_access_key = os.environ.get('AWS_ACCESS_KEY_ID', 'minioadmin')
s3_secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', 'minioadmin')

s3_client_options = {
    "endpoint_url": s3_endpoint,
    "aws_access_key_id": s3_access_key,
    "aws_secret_access_key": s3_secret_key,
    "config": Config(signature_version='s3v4'),
}
s3 = boto3.client('s3', **s3_client_options)

# Load parameters from previous step
MINIO_BUCKET = 'rom-data'
MINIO_OUTPUT_PREFIX = 'rom-pipeline/outputs'

# Fetch the parameter file from MinIO; any failure falls back to defaults
params_key = f"{MINIO_OUTPUT_PREFIX}/params.json"
params_path = os.path.join(temp_dir, 'params.json')
try:
    s3.download_file(MINIO_BUCKET, params_key, params_path)
    with open(params_path, 'r') as params_file:
        params = json.load(params_file)
except Exception as e:
    print(f"Error loading parameters, using defaults: {str(e)}")
    params = dict(
        dataset_name="cylinder",
        minio_bucket=MINIO_BUCKET,
        minio_output_prefix=MINIO_OUTPUT_PREFIX,
    )
else:
    print(f"Loaded parameters successfully")

# Record when this step ran and which options it used
params.update(
    preprocessing_timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    preprocessing_options={
        "mean_subtraction": True,
        "normalization": True,
        "memory_efficient": MEMORY_EFFICIENT,
        "chunk_size": CHUNK_SIZE,
        "downsample": DOWNSAMPLE,
        "downsample_factor": DOWNSAMPLE_FACTOR if DOWNSAMPLE else None,
    },
)

# Check for the previous step's completion marker; its absence is only a warning
marker_key = f"{MINIO_OUTPUT_PREFIX}/data_download_completed.txt"
try:
    s3.head_object(Bucket=MINIO_BUCKET, Key=marker_key)
except Exception as e:
    print(f"Warning: Previous step completion marker not found: {str(e)}")
    print("Continuing anyway...")
else:
    print("Previous step (data fetching) completed successfully")

# List objects in the data directory to check what's available
print("\nListing files in MinIO data directory:")
data_prefix = f"{MINIO_OUTPUT_PREFIX}/data/"

# A single list_objects_v2 call returns at most 1000 keys, so walk every page
# to avoid silently missing files in a large data directory.
data_objects = []
listing_pages = s3.get_paginator('list_objects_v2').paginate(Bucket=MINIO_BUCKET, Prefix=data_prefix)
for page in listing_pages:
    data_objects.extend(page.get('Contents', []))

if not data_objects:
    print(f"Warning: No data files found in {MINIO_BUCKET}/{data_prefix}")
    print("Please ensure that data files have been uploaded.")
    raise FileNotFoundError(f"No data files found in {MINIO_BUCKET}/{data_prefix}")

# Download data files from MinIO (only HDF5 files are fetched)
print("\nDownloading data files from MinIO:")
for obj in data_objects:
    if obj['Key'].endswith(('.h5', '.hdf5')):
        filename = os.path.basename(obj['Key'])
        local_path = os.path.join(temp_dir, filename)

        print(f"  Downloading {obj['Key']} to {local_path}")
        s3.download_file(MINIO_BUCKET, obj['Key'], local_path)

# Find HDF5 files.  os.listdir returns entries in arbitrary order, so sort to
# make the choice of the "first" file reproducible between runs.
h5_files = sorted(
    os.path.join(temp_dir, entry)
    for entry in os.listdir(temp_dir)
    if entry.endswith(('.h5', '.hdf5'))
)

if not h5_files:
    raise FileNotFoundError(f"No HDF5 files found in downloaded data")

print(f"Found {len(h5_files)} HDF5 files: {[os.path.basename(f) for f in h5_files]}")

# Load velocity data from the first file
h5_file = h5_files[0]
print(f"Loading data from {os.path.basename(h5_file)}")

# Load the data or just get its shape
data = None  # Stays None in memory-efficient mode; the data is then read in chunks later
shape = None
velocity_key = None

try:
    with h5py.File(h5_file, 'r') as f:
        # First, explore the HDF5 file structure to find the velocity field
        print("Exploring HDF5 file structure:")
        print("Top-level groups/datasets:")
        for key in f.keys():
            if isinstance(f[key], h5py.Group):
                print(f"  Group: {key} (contains: {list(f[key].keys())})")
            elif isinstance(f[key], h5py.Dataset):
                print(f"  Dataset: {key} (shape: {f[key].shape}, dtype: {f[key].dtype})")

        # Define possible velocity field names to check
        # NOTE(review): the single-letter terms 'u' and 'v' match any name
        # containing that letter in the substring searches below - confirm
        # this is intended for the datasets in use.
        velocity_keys = ['velocity', 'u', 'v', 'vel', 'flow', 'flowfield', 'vector']

        # Search for velocity data.  Only datasets are accepted in steps 1-3:
        # a group with a matching name has no shape and cannot be sliced, so
        # it is left for step 3 to look inside.
        velocity_key = None

        # Step 1: Check direct keys in root level
        for key in velocity_keys:
            if key in f and isinstance(f[key], h5py.Dataset):
                velocity_key = key
                print(f"Found direct velocity key: {key}")
                break

        # Step 2: Look for keys containing velocity terms
        if velocity_key is None:
            for key in f.keys():
                if isinstance(f[key], h5py.Dataset) and any(vk in key.lower() for vk in velocity_keys):
                    velocity_key = key
                    print(f"Found key containing velocity term: {key}")
                    break

        # Step 3: Search in groups, stopping at the first match
        if velocity_key is None:
            for key in f.keys():
                if not isinstance(f[key], h5py.Group):
                    continue
                for subkey in f[key].keys():
                    if not isinstance(f[key][subkey], h5py.Dataset):
                        continue
                    if any(vk in subkey.lower() for vk in velocity_keys):
                        velocity_key = f"{key}/{subkey}"
                        print(f"Found velocity data in group: {velocity_key}")
                        break
                # Leave the outer loop too, otherwise a later group would
                # overwrite the first match.
                if velocity_key is not None:
                    break

        # Step 4: Look for large datasets (likely to be the flow field)
        if velocity_key is None:
            largest_dataset = None
            largest_size = 0

            # Collect sizes in a dict instead of mutating outer variables via
            # `global`, which only works while this code runs at module level.
            dataset_sizes = {}

            def collect_dataset_size(name, obj):
                if isinstance(obj, h5py.Dataset):
                    dataset_sizes[name] = obj.size

            # Traverse the entire file
            f.visititems(collect_dataset_size)

            for name, size in dataset_sizes.items():
                # Skip small datasets (likely metadata); first strictly-largest wins
                if size > 1000 and size > largest_size:
                    largest_size = size
                    largest_dataset = name

            if largest_dataset:
                velocity_key = largest_dataset
                print(f"Selected largest dataset as velocity data: {velocity_key} (size: {largest_size})")

        if velocity_key is None:
            raise KeyError("Could not find velocity data in the file")

        print(f"Loading velocity data from '{velocity_key}'")

        # Just get the shape if memory efficient
        if MEMORY_EFFICIENT:
            shape = f[velocity_key].shape
            print(f"Data shape: {shape} (memory efficient mode - not loading full data)")
        else:
            data = f[velocity_key][:]
            shape = data.shape
            print(f"Data shape: {shape}")

        # Load any coordinates or metadata if available
        x_coords = None
        y_coords = None
        time_data = None

        # Look for common coordinate names
        for coord_name in ['x', 'y', 'z', 'X', 'Y', 'Z', 'coord', 'coords', 'coordinates', 'grid', 'time', 't']:
            if coord_name not in f:
                continue
            coord_node = f[coord_name]
            if not isinstance(coord_node, h5py.Dataset):
                # Groups have no shape and cannot be sliced; this metadata is optional
                continue

            coord_kind = coord_name.lower()
            if coord_kind == 'x':
                x_coords = coord_node[:]
                print(f"Found x coordinates: shape {x_coords.shape}")
            elif coord_kind == 'y':
                y_coords = coord_node[:]
                print(f"Found y coordinates: shape {y_coords.shape}")
            elif coord_kind in ('time', 't'):
                time_data = coord_node[:]
                print(f"Found time data: shape {time_data.shape}")
            else:
                print(f"Found other coordinate data '{coord_name}': shape {coord_node.shape}")

except Exception as e:
    print(f"Error loading data: {str(e)}")
    raise

print_memory_usage("after data exploration")

# Determine data dimensions
# These are heuristics based purely on the relative sizes of the axes.
n_axes = len(shape)

if n_axes == 2:
    # (time, points) or (points, time): the shorter axis is taken as time
    if shape[0] < shape[1]:
        n_snapshots, n_points = shape[0], shape[1]
    else:
        n_snapshots, n_points = shape[1], shape[0]
    n_dims = 1
    structured_grid = False
    print(f"Detected unstructured data: {n_snapshots} snapshots, {n_points} points, {n_dims} dimensions")

elif n_axes == 3:
    # (time, points, dims) or (time, x, y); axis 0 is time either way
    n_snapshots = shape[0]
    if shape[1] > shape[2]:
        # Likely (time, points, dims)
        n_points = shape[1]
        n_dims = shape[2]
        structured_grid = False
        print(f"Detected (time, points, dimensions) format: {n_snapshots} snapshots, {n_points} points, {n_dims} dimensions")
    else:
        # Likely (time, x, y) for a scalar field
        grid_shape = (shape[1], shape[2])
        n_points = grid_shape[0] * grid_shape[1]
        n_dims = 1
        structured_grid = True
        print(f"Detected structured scalar data: {n_snapshots} snapshots, grid shape {grid_shape}")

elif n_axes == 4:
    # Likely (time, x, y, components); accepted only when axis 0 is shorter
    # than both grid axes, which is typical for a time series
    if not (shape[0] < shape[1] and shape[0] < shape[2]):
        raise ValueError(f"Could not determine data structure from shape {shape}")
    n_snapshots = shape[0]
    grid_shape = (shape[1], shape[2])
    n_points = np.prod(grid_shape)
    n_dims = shape[3]
    structured_grid = True
    print(f"Detected (time, x, y, components) format: {n_snapshots} snapshots, grid shape {grid_shape}, {n_dims} dimensions")

else:
    raise ValueError(f"Unsupported data shape: {shape}")

# Create snapshot matrix and process mean/fluctuations
# For POD, we need a matrix of shape (n_points*n_dims, n_snapshots): one
# flattened snapshot per column.
n_features = n_points * n_dims

# Mirror the layout decision taken when the dimensions were determined: only a
# 2-D array whose first axis is not the shorter one is stored as
# (points, time); every other supported layout has time on axis 0.
time_axis_first = not (len(shape) == 2 and shape[0] >= shape[1])


def _snapshots_as_columns(block, time_first, n_rows):
    """Return ``block`` as a 2-D (n_rows, n_snapshots_in_block) array.

    When ``time_first`` is true, axis 0 of ``block`` indexes snapshots and each
    snapshot is flattened in C order into one column.  Otherwise ``block`` is
    already laid out as (points, time) and is returned as is.
    """
    block = np.asarray(block)
    if time_first:
        return block.reshape(block.shape[0], n_rows).T
    return block


def _normalization_divisor(frob_norm):
    """Return the value to divide the snapshot matrix by.

    A zero Frobenius norm would turn every entry into NaN, so in that case a
    warning is printed and 1.0 is returned, leaving the data unscaled.
    """
    if frob_norm == 0:
        print("Warning: Frobenius norm is zero; data left unscaled (normalization factor set to 1.0)")
        return 1.0
    return float(frob_norm)


if MEMORY_EFFICIENT and data is None:
    print(f"Processing data in chunks of {CHUNK_SIZE} snapshots to save memory")
    # Initialize arrays for the final results
    snapshot_matrix = np.zeros((n_features, n_snapshots))
    mean_flow = np.zeros((n_features, 1))

    # First pass: load data in chunks, fill the snapshot matrix and accumulate
    # the per-row sum from which the mean flow is obtained.
    with h5py.File(h5_file, 'r') as f:
        velocity_dataset = f[velocity_key]

        for chunk_start in range(0, n_snapshots, CHUNK_SIZE):
            chunk_end = min(chunk_start + CHUNK_SIZE, n_snapshots)
            print(f"Processing chunk {chunk_start+1}-{chunk_end} of {n_snapshots}")

            # Load chunk, slicing along whichever axis holds the snapshots.
            # Slicing axis 0 of a (points, time) dataset would return a block
            # of points instead of a block of snapshots.
            if time_axis_first:
                chunk_data = velocity_dataset[chunk_start:chunk_end]
            else:
                chunk_data = velocity_dataset[:, chunk_start:chunk_end]

            chunk_reshaped = _snapshots_as_columns(chunk_data, time_axis_first, n_features)

            # Store in snapshot matrix
            snapshot_matrix[:, chunk_start:chunk_end] = chunk_reshaped

            # Accumulate in float64 regardless of the dtype stored in the file
            mean_flow += chunk_reshaped.sum(axis=1, keepdims=True, dtype=np.float64)

            # Free memory
            del chunk_data, chunk_reshaped
            gc.collect()
            print_memory_usage(f"after processing chunk {chunk_start+1}-{chunk_end}")

    if n_snapshots > 0:
        mean_flow /= n_snapshots

    print(f"Computed mean flow with shape {mean_flow.shape}")

    # Second pass: subtract mean and normalize if needed
    if params["preprocessing_options"]["mean_subtraction"]:
        print("Applying mean subtraction...")
        for chunk_start in range(0, n_snapshots, CHUNK_SIZE):
            chunk_end = min(chunk_start + CHUNK_SIZE, n_snapshots)
            print(f"Subtracting mean from chunk {chunk_start+1}-{chunk_end}")

            # Subtract mean
            snapshot_matrix[:, chunk_start:chunk_end] -= mean_flow

            print_memory_usage(f"after mean subtraction for chunk {chunk_start+1}-{chunk_end}")

        print("Mean subtraction completed")
    else:
        print("Mean subtraction skipped")

    # Normalize if needed
    if params["preprocessing_options"]["normalization"]:
        print("Applying normalization...")
        # Compute Frobenius norm in chunks so the squared temporary stays small
        frob_norm_squared = 0
        for chunk_start in range(0, n_snapshots, CHUNK_SIZE):
            chunk_end = min(chunk_start + CHUNK_SIZE, n_snapshots)
            chunk = snapshot_matrix[:, chunk_start:chunk_end]
            frob_norm_squared += np.sum(chunk**2)

        frob_norm = np.sqrt(frob_norm_squared)
        print(f"Computed Frobenius norm: {frob_norm:.4f}")
        normalization_divisor = _normalization_divisor(frob_norm)

        # Normalize
        for chunk_start in range(0, n_snapshots, CHUNK_SIZE):
            chunk_end = min(chunk_start + CHUNK_SIZE, n_snapshots)
            print(f"Normalizing chunk {chunk_start+1}-{chunk_end}")
            snapshot_matrix[:, chunk_start:chunk_end] /= normalization_divisor

        # Save the normalization factor for later use
        params["preprocessing_options"]["normalization_factor"] = normalization_divisor
        print("Normalization completed")
    else:
        print("Normalization skipped")
else:
    # Process the full data at once (non-memory-efficient mode)
    print("Processing all data at once")

    # Copy into a fresh float64 matrix so the in-place operations below
    # neither modify `data` through a shared view nor fail on integer data.
    snapshot_matrix = np.array(
        _snapshots_as_columns(data, time_axis_first, n_features),
        dtype=np.float64,
    )

    print(f"Created snapshot matrix with shape: {snapshot_matrix.shape}")

    # Compute mean flow
    mean_flow = np.mean(snapshot_matrix, axis=1, keepdims=True)
    print(f"Mean flow shape: {mean_flow.shape}")

    # Subtract mean (centering the data)
    if params["preprocessing_options"]["mean_subtraction"]:
        snapshot_matrix -= mean_flow
        print("Mean subtraction applied")
    else:
        print("Mean subtraction skipped")

    # Normalize the data if needed
    if params["preprocessing_options"]["normalization"]:
        # Compute the Frobenius norm of the fluctuation matrix
        frob_norm = np.linalg.norm(snapshot_matrix)
        normalization_divisor = _normalization_divisor(frob_norm)
        # Normalize
        snapshot_matrix /= normalization_divisor
        print(f"Normalization applied (Frobenius norm: {frob_norm:.4f})")

        # Save the normalization factor for later use
        params["preprocessing_options"]["normalization_factor"] = normalization_divisor
    else:
        print("Normalization skipped")
print_memory_usage("after preprocessing")

# Describe the preprocessed data set
metadata = dict(
    n_snapshots=n_snapshots,
    n_points=n_points,
    n_dims=n_dims,
    structured_grid=structured_grid,
)
# grid_shape is only recorded for structured grids
if structured_grid:
    metadata["grid_shape"] = grid_shape

# Fold the metadata into the pipeline parameters
params.update(metadata)

# Local destinations inside the temporary directory
metadata_path = os.path.join(temp_dir, 'metadata.json')
mean_flow_path = os.path.join(temp_dir, 'mean_flow.npy')
snapshot_matrix_path = os.path.join(temp_dir, 'snapshot_matrix.npy')

# Write metadata, mean flow and snapshot matrix to disk, in that order
save_dict_to_json(metadata, metadata_path)
np.save(mean_flow_path, mean_flow)
np.save(snapshot_matrix_path, snapshot_matrix)

# Upload files to MinIO
print("\nUploading preprocessed data to MinIO:")
preprocessed_prefix = f"{MINIO_OUTPUT_PREFIX}/preprocessed/"

# Object keys for the three preprocessed artefacts
metadata_key = f"{preprocessed_prefix}metadata.json"
mean_flow_key = f"{preprocessed_prefix}mean_flow.npy"
snapshot_matrix_key = f"{preprocessed_prefix}snapshot_matrix.npy"

# (message, local file, object key), uploaded in this order
pending_uploads = (
    (f"Uploading metadata to {metadata_key}", metadata_path, metadata_key),
    (f"Uploading mean flow to {mean_flow_key}", mean_flow_path, mean_flow_key),
    (f"Uploading snapshot matrix to {snapshot_matrix_key}", snapshot_matrix_path, snapshot_matrix_key),
)
for announcement, artifact_path, artifact_key in pending_uploads:
    print(announcement)
    s3.upload_file(artifact_path, MINIO_BUCKET, artifact_key)

# Write the updated parameters and upload them over the shared params.json key
params_path = os.path.join(temp_dir, 'params.json')
save_dict_to_json(params, params_path)
params_key = f"{MINIO_OUTPUT_PREFIX}/params.json"
print(f"Uploading updated parameters to {params_key}")
s3.upload_file(params_path, MINIO_BUCKET, params_key)

# Drop a timestamped marker object to indicate this step is complete
marker_path = os.path.join(temp_dir, 'preprocessing_completed.txt')
with open(marker_path, 'w') as marker_file:
    marker_file.write(f"Preprocessing completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
marker_key = f"{MINIO_OUTPUT_PREFIX}/preprocessing_completed.txt"
s3.upload_file(marker_path, MINIO_BUCKET, marker_key)

print("\nPreprocessing completed successfully!")
print(f"Processed {n_snapshots} snapshots with {n_points} spatial points and {n_dims} dimensions")
print(f"All results uploaded to MinIO bucket: {MINIO_BUCKET}, prefix: {preprocessed_prefix}")