In [3]:
# Install required dependencies with specific versions to avoid conflicts
import sys
import subprocess
import importlib

def install_package(package, version=None, with_deps=None):
    """Install a package with pip into the interpreter running this notebook.

    Args:
        package: Distribution name passed to pip (e.g. "boto3").
        version: Optional exact version; when given, pip is asked for
            ``package==version``.
        with_deps: Whether pip may also install the package's dependencies.
            ``None`` (the default) keeps the historical behaviour: only
            "boto3" is installed with its dependencies, every other package
            is installed with ``--no-deps``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    requirement = f"{package}=={version}" if version else package

    if with_deps is None:
        # Legacy default: boto3 needs its dependencies (botocore etc.).
        with_deps = package in ("boto3",)

    # Use "sys.executable -m pip" so the install targets this kernel's environment.
    command = [sys.executable, "-m", "pip", "install", requirement]
    if not with_deps:
        command.append("--no-deps")

    print(f"Installing {requirement}...")
    subprocess.check_call(command)
    print(f"Successfully installed {requirement}")

# Install specific versions to resolve conflicts
install_package("MarkupSafe", "1.1.1")  # Version compatible with cookiecutter
install_package("Jinja2", "2.11.3")     # Version compatible with cookiecutter

# Install boto3 WITH its dependencies (including botocore)
install_package("boto3")                # For S3/MinIO operations
install_package("h5py")                # For HDF5 file operations
install_package("matplotlib")          # For plotting

# pip ran in a child process, so the import finders of this interpreter may
# still hold cached directory listings from before the install.
importlib.invalidate_caches()

# Force reload the modules if they were already imported.
# NOTE(review): importlib.reload() re-executes only the named top-level
# module; a kernel restart is the reliable way to pick up a changed version.
for module_name in ("markupsafe", "jinja2"):
    if module_name in sys.modules:
        importlib.reload(sys.modules[module_name])

print("Dependencies installed successfully.")


Installing MarkupSafe==1.1.1...
Successfully installed MarkupSafe==1.1.1
Installing Jinja2==2.11.3...
Successfully installed Jinja2==2.11.3
Installing boto3...
Successfully installed boto3
Installing h5py...
Successfully installed h5py
Installing matplotlib...
Successfully installed matplotlib
Dependencies installed successfully.


In [None]:
# 1_preprocess_data.ipynb
#
# This notebook preprocesses the Cylinder Flow Dataset:
# - Loads the data from MinIO
# - Extracts velocity fields
# - Creates the snapshot matrix
# - Performs mean subtraction
# - Normalizes the data if needed
# - Uploads processed data back to MinIO

import os
import numpy as np
import h5py
import matplotlib.pyplot as plt
import json
from datetime import datetime
import boto3
from botocore.client import Config
import tempfile
import io

# Scratch directory on local disk for downloads and generated artifacts.
# tempfile.mkdtemp() leaves cleanup to the caller; nothing in this cell
# removes the directory.
temp_dir = tempfile.mkdtemp()
print(f"Using temporary directory: {temp_dir}")

# --- MinIO connection ------------------------------------------------------
print("Connecting to MinIO...")
s3_endpoint = os.environ.get('S3_ENDPOINT', 'http://minio.minio-system.svc.cluster.local:9000')

# A non-empty endpoint without a scheme gets a plain-HTTP prefix.
endpoint_has_scheme = s3_endpoint.startswith('http://') or s3_endpoint.startswith('https://')
if s3_endpoint and not endpoint_has_scheme:
    s3_endpoint = f"http://{s3_endpoint}"
    print(f"Adding http:// prefix to endpoint: {s3_endpoint}")

# Credentials come from the environment.
# NOTE(review): the fallback values below are hard-coded defaults; confirm
# they are acceptable outside a development cluster.
s3_access_key = os.environ.get('AWS_ACCESS_KEY_ID', 'minio')
s3_secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', 'minio123')

s3_client_options = dict(
    endpoint_url=s3_endpoint,
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
    config=Config(signature_version='s3v4'),
)
s3 = boto3.client('s3', **s3_client_options)

# Bucket and key prefix shared with the other pipeline steps
MINIO_BUCKET = 'rom-data'
MINIO_OUTPUT_PREFIX = 'rom-pipeline/outputs'

# Fetch params.json written by the previous step; if anything goes wrong
# (download or parsing), fall back to a minimal default parameter set.
params_key = f"{MINIO_OUTPUT_PREFIX}/params.json"
params_path = os.path.join(temp_dir, 'params.json')
try:
    s3.download_file(MINIO_BUCKET, params_key, params_path)

    with open(params_path, 'r') as f:
        params = json.load(f)

    print(f"Loaded parameters: {params}")
except Exception as e:
    print(f"Error loading parameters, using defaults: {str(e)}")
    params = dict(
        dataset_name="cylinder",
        minio_bucket=MINIO_BUCKET,
        minio_output_prefix=MINIO_OUTPUT_PREFIX,
    )

# Record when this step ran and which preprocessing options it applies
params.update(
    preprocessing_timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    preprocessing_options={
        "mean_subtraction": True,
        "normalization": True,
    },
)

# Check for the completion marker of the data-fetching step. A missing marker
# (or any error while probing it) only produces a warning; processing goes on.
marker_key = f"{MINIO_OUTPUT_PREFIX}/data_download_completed.txt"
try:
    s3.head_object(Bucket=MINIO_BUCKET, Key=marker_key)
except Exception as e:
    print(f"Warning: Previous step completion marker not found: {str(e)}")
    print("Continuing anyway...")
else:
    print("Previous step (data fetching) completed successfully")

# List objects in the data directory to check what's available
print("\nListing files in MinIO data directory:")
data_prefix = f"{MINIO_OUTPUT_PREFIX}/data/"

# A single list_objects_v2 call returns only one page of results, so walk all
# pages with the client's paginator instead of trusting the first response.
data_objects = []
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=MINIO_BUCKET, Prefix=data_prefix):
    data_objects.extend(page.get('Contents', []))

if not data_objects:
    print(f"Warning: No data files found in {MINIO_BUCKET}/{data_prefix}")
    print("Please ensure that data files have been uploaded.")
    raise FileNotFoundError(f"No data files found in {MINIO_BUCKET}/{data_prefix}")

# Download data files from MinIO (only HDF5 files are fetched)
print("\nDownloading data files from MinIO:")
for obj in data_objects:
    if obj['Key'].endswith(('.h5', '.hdf5')):
        # Files are stored flat in temp_dir under their base name, so two
        # keys with the same base name would overwrite each other.
        filename = os.path.basename(obj['Key'])
        local_path = os.path.join(temp_dir, filename)

        print(f"  Downloading {obj['Key']} to {local_path}")
        s3.download_file(MINIO_BUCKET, obj['Key'], local_path)

# Find HDF5 files. os.listdir() returns names in arbitrary order, so sort
# them to make the choice of "first file" below deterministic.
h5_files = sorted(
    os.path.join(temp_dir, name)
    for name in os.listdir(temp_dir)
    if name.endswith(('.h5', '.hdf5'))
)

if not h5_files:
    raise FileNotFoundError("No HDF5 files found in downloaded data")

print(f"Found {len(h5_files)} HDF5 files: {[os.path.basename(f) for f in h5_files]}")

# Load velocity data from the first file (alphabetically)
h5_file = h5_files[0]
print(f"Loading data from {os.path.basename(h5_file)}")

# Open the selected HDF5 file, locate the velocity dataset with a sequence of
# increasingly loose heuristics, and read it (plus any coordinate/time arrays
# stored under well-known root-level names) into memory. Any error is printed
# and re-raised.
try:
    with h5py.File(h5_file, 'r') as f:
        # First, explore the HDF5 file structure to find the velocity field
        print("Exploring HDF5 file structure:")
        print("Top-level groups/datasets:")
        for key in f.keys():
            if isinstance(f[key], h5py.Group):
                print(f"  Group: {key} (contains: {list(f[key].keys())})")
            elif isinstance(f[key], h5py.Dataset):
                print(f"  Dataset: {key} (shape: {f[key].shape}, dtype: {f[key].dtype})")

        # Define possible velocity field names to check.
        # NOTE(review): the single-letter terms 'u' and 'v' are also used for
        # substring matching in steps 2 and 3, so they match any name that
        # merely contains those letters - confirm this is intended.
        velocity_keys = ['velocity', 'u', 'v', 'vel', 'flow', 'flowfield', 'vector']

        # Search for velocity data. Only datasets qualify: a group cannot be
        # read with [:] further down.
        velocity_key = None

        # Step 1: Check direct keys in root level
        for key in velocity_keys:
            if key in f and isinstance(f[key], h5py.Dataset):
                velocity_key = key
                print(f"Found direct velocity key: {key}")
                break

        # Step 2: Look for root-level datasets whose name contains a velocity term
        if velocity_key is None:
            for key in f.keys():
                if (any(vk in key.lower() for vk in velocity_keys)
                        and isinstance(f[key], h5py.Dataset)):
                    velocity_key = key
                    print(f"Found key containing velocity term: {key}")
                    break

        # Step 3: Search one level down, inside root-level groups
        if velocity_key is None:
            for key in f.keys():
                group = f[key]
                if not isinstance(group, h5py.Group):
                    continue
                for subkey in group.keys():
                    if (any(vk in subkey.lower() for vk in velocity_keys)
                            and isinstance(group[subkey], h5py.Dataset)):
                        velocity_key = f"{key}/{subkey}"
                        print(f"Found velocity data in group: {velocity_key}")
                        break
                # Stop at the first match so a later group cannot overwrite it
                if velocity_key is not None:
                    break

        # Step 4: Look for large datasets (likely to be the flow field)
        if velocity_key is None:
            largest_dataset = None
            largest_size = 0
            dataset_sizes = {}

            def collect_dataset_size(name, obj):
                # Record every dataset above the metadata-size threshold.
                # Writing into a dict avoids `global`, which only works while
                # this code sits at module (cell) top level.
                if isinstance(obj, h5py.Dataset) and obj.size > 1000:
                    dataset_sizes[name] = obj.size

            # Traverse the entire file
            f.visititems(collect_dataset_size)

            if dataset_sizes:
                # max() keeps the first-visited dataset on ties
                largest_dataset = max(dataset_sizes, key=dataset_sizes.get)
                largest_size = dataset_sizes[largest_dataset]
                velocity_key = largest_dataset
                print(f"Selected largest dataset as velocity data: {velocity_key} (size: {largest_size})")

        if velocity_key is None:
            raise KeyError("Could not find velocity data in the file")

        print(f"Loading velocity data from '{velocity_key}'")
        velocity = f[velocity_key][:]

        # Load any coordinates or metadata if available
        x_coords = None
        y_coords = None
        time_data = None

        # Look for common coordinate names at root level
        for coord_name in ['x', 'y', 'z', 'X', 'Y', 'Z', 'coord', 'coords', 'coordinates', 'grid', 'time', 't']:
            if coord_name not in f:
                continue
            coord_obj = f[coord_name]
            if not isinstance(coord_obj, h5py.Dataset):
                # Groups have neither [:] nor .shape; skip them
                continue

            lowered = coord_name.lower()
            if lowered == 'x':
                x_coords = coord_obj[:]
                print(f"Found x coordinates: shape {x_coords.shape}")
            elif lowered == 'y':
                y_coords = coord_obj[:]
                print(f"Found y coordinates: shape {y_coords.shape}")
            elif lowered in ('time', 't'):
                time_data = coord_obj[:]
                print(f"Found time data: shape {time_data.shape}")
            else:
                print(f"Found other coordinate data '{coord_name}': shape {coord_obj.shape}")

        print(f"Velocity data shape: {velocity.shape}")

except Exception as e:
    print(f"Error loading data: {str(e)}")
    raise

# Infer the data layout from the rank and relative axis sizes of `velocity`.
# These are heuristics: the smaller axis of a 2-D array is taken as time, and
# a 3-D array is treated as (time, points, dims) only when axis 1 is larger
# than axis 2, otherwise as a scalar field on a (time, x, y) grid.
vel_shape = velocity.shape
vel_rank = len(vel_shape)

if vel_rank == 2:
    # (time, points) or (points, time); the shorter axis is assumed to be time
    structured_grid = False
    n_dims = 1
    if vel_shape[0] < vel_shape[1]:
        n_snapshots, n_points = vel_shape[0], vel_shape[1]
    else:
        n_points, n_snapshots = vel_shape[0], vel_shape[1]
    print(f"Detected unstructured data: {n_snapshots} snapshots, {n_points} points, {n_dims} dimensions")

elif vel_rank == 3:
    n_snapshots = vel_shape[0]
    if vel_shape[1] > vel_shape[2]:
        # Likely (time, points, dims)
        n_points = vel_shape[1]
        n_dims = vel_shape[2]
        structured_grid = False
        print(f"Detected unstructured data: {n_snapshots} snapshots, {n_points} points, {n_dims} dimensions")
    else:
        # Likely (time, x, y) for a scalar field
        grid_shape = tuple(vel_shape[1:3])
        n_points = grid_shape[0] * grid_shape[1]
        structured_grid = True
        n_dims = 1
        print(f"Detected structured scalar data: {n_snapshots} snapshots, grid shape {grid_shape}")

elif vel_rank == 4:
    # (time, x, y, components)
    n_snapshots = vel_shape[0]
    grid_shape = tuple(vel_shape[1:3])
    n_points = grid_shape[0] * grid_shape[1]
    n_dims = vel_shape[3]
    structured_grid = True
    print(f"Detected structured vector data: {n_snapshots} snapshots, grid shape {grid_shape}, {n_dims} components")

else:
    raise ValueError(f"Unexpected velocity data shape: {velocity.shape}")

# Create snapshot matrix - reshape data into a 2D matrix
# For POD, we need a matrix of shape (n_points*n_dims, n_snapshots)
if not structured_grid and n_dims == 1:
    # 2-D input is already a snapshot matrix; only its orientation may differ
    if velocity.shape[0] < velocity.shape[1]:
        snapshot_matrix = velocity.T  # Transpose to get (n_points, n_snapshots)
    else:
        snapshot_matrix = velocity  # Already (n_points, n_snapshots)
else:
    # Time is axis 0 in every remaining layout. Flatten each snapshot in C
    # order (for vector data the components of a point stay adjacent) and put
    # snapshots in columns. This single reshape replaces the per-snapshot copy
    # loop; the result is a C-contiguous float64 array, as before.
    snapshot_matrix = np.ascontiguousarray(
        velocity.reshape(n_snapshots, -1).T, dtype=np.float64
    )

print(f"Created snapshot matrix with shape: {snapshot_matrix.shape}")

# Compute mean flow: average over snapshots (columns), kept as a column vector
mean_flow = np.mean(snapshot_matrix, axis=1, keepdims=True)
print(f"Mean flow shape: {mean_flow.shape}")

# Subtract mean (centering the data)
if params["preprocessing_options"]["mean_subtraction"]:
    fluctuations = snapshot_matrix - mean_flow
    print("Mean subtraction applied")
else:
    fluctuations = snapshot_matrix
    print("Mean subtraction skipped")

# Normalize the data if needed
if params["preprocessing_options"]["normalization"]:
    # Compute the Frobenius norm of the fluctuation matrix
    frob_norm = np.linalg.norm(fluctuations)

    if frob_norm == 0:
        # All fluctuations are zero (e.g. a single snapshot after mean
        # subtraction). Dividing would fill the matrix with NaN, so leave the
        # data untouched and record a neutral factor for later rescaling.
        print("Warning: Frobenius norm is zero; normalization skipped to avoid division by zero")
        params["preprocessing_options"]["normalization_factor"] = 1.0
    else:
        # Normalize
        fluctuations = fluctuations / frob_norm
        print(f"Normalization applied (Frobenius norm: {frob_norm:.4f})")

        # Save the normalization factor for later use
        params["preprocessing_options"]["normalization_factor"] = float(frob_norm)
else:
    print("Normalization skipped")

# Plot the mean flow (structured grids only), save it as a PNG in temp_dir
# and upload it to the visualizations folder in MinIO.
if structured_grid:
    plt.figure(figsize=(10, 6))

    # Scalar fields are shown directly; vector fields by their magnitude
    if n_dims == 1:
        mean_image = mean_flow.reshape(grid_shape)
        mean_label = 'Mean Velocity'
    else:
        mean_vectors = mean_flow.reshape(grid_shape[0], grid_shape[1], n_dims)
        mean_image = np.sqrt(np.sum(mean_vectors**2, axis=2))
        mean_label = 'Mean Velocity Magnitude'

    plt.imshow(mean_image, cmap='viridis')
    plt.colorbar(label=mean_label)

    plt.title('Mean Flow')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.tight_layout()

    # Write the figure to a local file first ...
    mean_flow_img = os.path.join(temp_dir, 'mean_flow.png')
    plt.savefig(mean_flow_img)

    # ... then push that file to MinIO
    vis_key = f"{MINIO_OUTPUT_PREFIX}/visualizations/mean_flow.png"
    print(f"Uploading mean flow visualization to {MINIO_BUCKET}/{vis_key}")
    s3.upload_file(mean_flow_img, MINIO_BUCKET, vis_key)

# For up to five evenly spaced snapshots (structured grids only), draw the
# original field next to its fluctuation, save the figure locally and upload
# it to MinIO.
if structured_grid and n_snapshots > 0:
    # Local folder for the per-snapshot images
    vis_dir = os.path.join(temp_dir, 'snapshots')
    os.makedirs(vis_dir, exist_ok=True)

    # Evenly spaced snapshot indices, first and last included
    num_vis = min(5, n_snapshots)
    indices = np.linspace(0, n_snapshots-1, num_vis, dtype=int)

    for idx in indices:
        plt.figure(figsize=(15, 6))

        # One entry per subplot:
        # (column data, colormap, scalar label, magnitude label, title)
        panels = (
            (snapshot_matrix[:, idx], 'viridis',
             'Velocity', 'Velocity Magnitude', f'Original Snapshot {idx}'),
            (fluctuations[:, idx], 'coolwarm',
             'Fluctuation', 'Fluctuation Magnitude', f'Fluctuation Snapshot {idx}'),
        )

        for position, (column, cmap_name, scalar_label, magnitude_label, panel_title) in enumerate(panels, start=1):
            plt.subplot(1, 2, position)

            if n_dims == 1:
                # Scalar field: show the values on the grid
                plt.imshow(column.reshape(grid_shape), cmap=cmap_name)
                plt.colorbar(label=scalar_label)
            else:
                # Vector field: show the per-point magnitude
                vectors = column.reshape(grid_shape[0], grid_shape[1], n_dims)
                magnitude = np.sqrt(np.sum(vectors**2, axis=2))
                plt.imshow(magnitude, cmap=cmap_name)
                plt.colorbar(label=magnitude_label)

            plt.title(panel_title)
            plt.xlabel('X')
            plt.ylabel('Y')

        plt.tight_layout()

        # Save to disk, then release the figure
        snapshot_img = os.path.join(vis_dir, f'snapshot_{idx}.png')
        plt.savefig(snapshot_img)
        plt.close()

        # Upload the saved image to MinIO
        vis_key = f"{MINIO_OUTPUT_PREFIX}/visualizations/snapshot_{idx}.png"
        print(f"Uploading snapshot visualization to {MINIO_BUCKET}/{vis_key}")
        s3.upload_file(snapshot_img, MINIO_BUCKET, vis_key)

# Collect everything later steps need into one mapping; each entry becomes a
# named array in the NPZ archive.
processed_data = dict(
    snapshot_matrix=snapshot_matrix,
    mean_flow=mean_flow,
    fluctuations=fluctuations,
    n_snapshots=n_snapshots,
    n_points=n_points,
    n_dims=n_dims,
    structured_grid=structured_grid,
)

# grid_shape only exists for structured data
if structured_grid:
    processed_data['grid_shape'] = grid_shape

# Coordinate/time arrays are stored only when they were found in the input
optional_arrays = (
    ('x_coords', x_coords),
    ('y_coords', y_coords),
    ('time_data', time_data),
)
for array_name, array_values in optional_arrays:
    if array_values is not None:
        processed_data[array_name] = array_values

# Write the compressed NPZ archive into the temporary directory
processed_data_file = os.path.join(temp_dir, 'processed_data.npz')
print(f"Saving processed data to {processed_data_file}")
np.savez_compressed(processed_data_file, **processed_data)

# Push the archive to MinIO
processed_data_key = f"{MINIO_OUTPUT_PREFIX}/processed_data.npz"
print(f"Uploading processed data to {MINIO_BUCKET}/{processed_data_key}")
s3.upload_file(processed_data_file, MINIO_BUCKET, processed_data_key)

# Persist the updated parameters locally, then replace params.json in MinIO
params_file = os.path.join(temp_dir, 'params.json')
with open(params_file, 'w') as params_out:
    json.dump(params, params_out, indent=2)

params_key = f"{MINIO_OUTPUT_PREFIX}/params.json"
print(f"Uploading parameters to {MINIO_BUCKET}/{params_key}")
s3.upload_file(params_file, MINIO_BUCKET, params_key)

# Write a small text file that marks this step as completed
completion_marker = os.path.join(temp_dir, 'preprocessing_completed.txt')
with open(completion_marker, 'w') as marker_out:
    marker_out.writelines([
        f"Preprocessing completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"Processed {n_snapshots} snapshots with {n_points} spatial points and {n_dims} dimensions\n",
    ])

# Upload the marker to MinIO
marker_key = f"{MINIO_OUTPUT_PREFIX}/preprocessing_completed.txt"
print(f"Uploading completion marker to {MINIO_BUCKET}/{marker_key}")
s3.upload_file(completion_marker, MINIO_BUCKET, marker_key)

# Final summary for the notebook output
print("\nPreprocessing completed successfully!")
print(f"Processed {n_snapshots} snapshots with {n_points} spatial points and {n_dims} dimensions")
print(f"Data saved to {MINIO_BUCKET}/{processed_data_key}")
