In [2]:
# Install required dependencies
import sys
import subprocess

def install_package(package, import_name=None):
    """Install a package with pip, skipping the call when it is not needed.

    A bare requirement name (no version specifier, extra or marker) is
    skipped when the corresponding module can already be imported. Any
    requirement that carries a constraint is always handed to pip, which
    decides on its own whether the installed version satisfies it.

    Args:
        package: Requirement string passed to ``pip install`` (for example
            ``"boto3"`` or ``"boto3==1.38.21"``).
        import_name: Module name used for the "already importable" check.
            Defaults to ``package``; pass it explicitly when the import name
            differs from the distribution name.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    import importlib
    import importlib.util
    import re

    requirement = package.strip()
    # Text before the first specifier/extra/marker character is the bare name.
    bare_name = re.split(r"[<>=!~\[;\s@]", requirement, maxsplit=1)[0]
    is_unconstrained = bare_name == requirement

    if is_unconstrained:
        try:
            already_importable = importlib.util.find_spec(import_name or bare_name) is not None
        except (ImportError, ValueError):
            # find_spec raises for empty names or when a dotted parent is missing.
            already_importable = False

        if already_importable:
            print(f"{package} is already installed")
            return

    print(f"Installing {package}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    # Let the running interpreter see modules that were just written to disk.
    importlib.invalidate_caches()
    print(f"Successfully installed {package}")

# boto3 supplies the S3 client used for every MinIO operation in this notebook
for required_package in ("boto3",):
    install_package(required_package)

print("Dependencies installed successfully.")


Installing boto3...
Collecting boto3
  Downloading boto3-1.38.21-py3-none-any.whl (139 kB)
Collecting botocore<1.39.0,>=1.38.21
  Downloading botocore-1.38.21-py3-none-any.whl (13.6 MB)
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.13.0,>=0.12.0
  Downloading s3transfer-0.12.0-py3-none-any.whl (84 kB)
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.38.21 botocore-1.38.21 jmespath-1.0.1 s3transfer-0.12.0
Successfully installed boto3
Dependencies installed successfully.


In [3]:
# 0_fetch_data.ipynb
# 
# This notebook fetches the Cylinder Flow Dataset from the raw-data prefix in
# MinIO and copies it to the pipeline output prefix in MinIO for further processing.

import io
import json
import os
import tempfile
from datetime import datetime

import boto3
import h5py
import matplotlib.pyplot as plt
import numpy as np
from botocore.client import Config
from botocore.exceptions import ClientError

# MinIO locations: where the raw dataset is read from and where results are written
MINIO_BUCKET = 'rom-data'
MINIO_RAW_PREFIX = 'examples/cylinder'
MINIO_OUTPUT_PREFIX = 'rom-pipeline/outputs'

# Local scratch directory used for downloads and generated artifacts
temp_dir = tempfile.mkdtemp()
print(f"Using temporary directory: {temp_dir}")

# Run parameters, recorded so the outputs can be traced back to this run
params = dict(
    dataset_name="cylinder",
    timestamp=format(datetime.now(), "%Y-%m-%d %H:%M:%S"),
    minio_bucket=MINIO_BUCKET,
    minio_input_prefix=MINIO_RAW_PREFIX,
    minio_output_prefix=MINIO_OUTPUT_PREFIX,
)

# Connect to MinIO
print("Connecting to MinIO...")
DEFAULT_S3_ENDPOINT = 'http://minio.minio-system.svc.cluster.local:9000'

# Treat an S3_ENDPOINT that is set but empty/blank the same as an unset one;
# otherwise the empty string would be handed to boto3 as the endpoint URL.
s3_endpoint = os.environ.get('S3_ENDPOINT', '').strip() or DEFAULT_S3_ENDPOINT

# Fix the endpoint URL if the protocol is missing
if not s3_endpoint.startswith(('http://', 'https://')):
    s3_endpoint = f"http://{s3_endpoint}"
    print(f"Adding http:// prefix to endpoint: {s3_endpoint}")

# NOTE(review): the fallback credentials below are hard-coded. They are kept so
# existing runs keep working, but the fallback is now reported instead of silent.
if not (os.environ.get('AWS_ACCESS_KEY_ID') and os.environ.get('AWS_SECRET_ACCESS_KEY')):
    print("WARNING: AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY not fully set; "
          "falling back to built-in default credentials")

s3_access_key = os.environ.get('AWS_ACCESS_KEY_ID', 'minio')
s3_secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', 'minio123')

# Signature v4 client pointed at the MinIO endpoint instead of AWS
s3 = boto3.client('s3',
                  endpoint_url=s3_endpoint,
                  aws_access_key_id=s3_access_key,
                  aws_secret_access_key=s3_secret_key,
                  config=Config(signature_version='s3v4'))

# Check that MinIO is reachable and that the bucket exists, creating the bucket
# when it is missing. A single head_bucket call serves both purposes: a missing
# bucket surfaces as a ClientError with a "not found" code, while any other
# failure (connectivity, permissions, ...) stops execution.
try:
    print(f"Testing MinIO connection to {s3_endpoint}")
    s3.head_bucket(Bucket=MINIO_BUCKET)
    print(f"Successfully connected to bucket {MINIO_BUCKET}")
    print(f"Bucket {MINIO_BUCKET} exists and is accessible")
except ClientError as e:
    error_code = str(e.response.get('Error', {}).get('Code', ''))
    if error_code not in ('404', 'NoSuchBucket', 'NotFound'):
        print(f"Error connecting to MinIO: {str(e)}")
        raise  # Re-raise to stop execution
    # The server answered but the bucket is missing: try to create it
    try:
        s3.create_bucket(Bucket=MINIO_BUCKET)
        print(f"Created bucket {MINIO_BUCKET}")
    except Exception as e2:
        print(f"Error accessing or creating bucket: {str(e2)}")
        raise
except Exception as e:
    print(f"Error connecting to MinIO: {str(e)}")
    raise  # Re-raise to stop execution

# Look for all HDF5 files in the directory
# NOTE(review): the listing is a plain key-prefix match on MINIO_RAW_PREFIX
# (no trailing slash), so keys of sibling prefixes that merely start with the
# same text are also returned - confirm whether that is intended.
print(f"\nLooking for HDF5 files in {MINIO_BUCKET}/{MINIO_RAW_PREFIX}/")
have_data = False
downloaded_files = []

try:
    # A single list_objects_v2 call returns a limited page of keys, so walk
    # every page instead of looking at the first response only.
    paginator = s3.get_paginator('list_objects_v2')
    raw_objects = [
        obj
        for page in paginator.paginate(Bucket=MINIO_BUCKET, Prefix=MINIO_RAW_PREFIX)
        for obj in page.get('Contents', [])
    ]

    if raw_objects:
        print(f"Found {len(raw_objects)} objects in the directory")
        hdf5_files = [obj for obj in raw_objects
                      if obj['Key'].endswith(('.h5', '.hdf5'))]

        if hdf5_files:
            print(f"Found {len(hdf5_files)} HDF5 files:")
            for obj in hdf5_files:
                file_key = obj['Key']
                filename = os.path.basename(file_key)
                local_path = os.path.join(temp_dir, filename)

                print(f"  Downloading {file_key} ({obj['Size']} bytes) to {local_path}")
                s3.download_file(MINIO_BUCKET, file_key, local_path)

                # Only files that h5py can open count as downloaded
                try:
                    with h5py.File(local_path, 'r') as f:
                        print(f"  ✓ Valid HDF5 file! Contents: {list(f.keys())}")
                    downloaded_files.append(local_path)
                except Exception as hdf_error:
                    print(f"  ✗ Error reading HDF5 file: {str(hdf_error)}")
                    # Drop the unreadable file so it neither occupies scratch
                    # space nor gets picked up from temp_dir later on.
                    if os.path.exists(local_path):
                        os.remove(local_path)

            if downloaded_files:
                have_data = True
                print(f"Successfully downloaded {len(downloaded_files)} HDF5 files")
        else:
            print("No HDF5 files found in the directory")
    else:
        print("Directory is empty or does not exist")

except Exception as e:
    # Best effort: a failed listing/download is reported and execution continues
    # with whatever have_data / downloaded_files hold at this point.
    print(f"Error listing or downloading files: {str(e)}")

# If we still don't have data, create a sample dataset
if not have_data:
    print("\nNo suitable data found, creating sample dataset")

    # Create a simple 2D cylinder flow simulation for testing (very simplified)
    n_snapshots = 50
    n_x, n_y = 100, 50

    # Seeded generator so the synthetic dataset is identical on every run
    sample_seed = 42
    rng = np.random.default_rng(sample_seed)

    # Velocity fields, indexed as (snapshot, x, y, component)
    velocity = np.zeros((n_snapshots, n_x, n_y, 2))

    # Grid covering x in [0, 10] and y in [-2.5, 2.5]
    x = np.linspace(0, 10, n_x)
    y = np.linspace(-2.5, 2.5, n_y)
    X, Y = np.meshgrid(x, y, indexing='ij')

    # Geometry does not change over time, so compute it once up front:
    # a cylinder of radius 0.5 centred at (2, 0) and the wake region behind it.
    cylinder_x, cylinder_y = 2, 0
    cylinder_radius = 0.5
    distance = np.sqrt((X - cylinder_x)**2 + (Y - cylinder_y)**2)
    mask = distance < cylinder_radius
    wake_mask = (X > cylinder_x) & (distance > cylinder_radius)
    wake_offset = 0.5 * (X[wake_mask] - cylinder_x)
    wake_decay = np.exp(-(Y[wake_mask]**2) / 0.5)

    # Generate time-varying flow
    for t in range(n_snapshots):
        # Base flow from left to right, blocked inside the cylinder
        u = np.ones_like(X) * 1.0
        v = np.zeros_like(Y)
        u[mask] = 0

        # Oscillating wake: the phase advances with the snapshot index
        phase = 0.2 * t
        v[wake_mask] = 0.3 * np.sin(wake_offset + phase) * wake_decay

        # Add some random noise to both components
        u += 0.02 * rng.standard_normal(u.shape)
        v += 0.02 * rng.standard_normal(v.shape)

        # Store u and v components
        velocity[t, :, :, 0] = u
        velocity[t, :, :, 1] = v

    # Save to HDF5 file
    sample_file = os.path.join(temp_dir, 'cylinder_flow.h5')
    with h5py.File(sample_file, 'w') as f:
        f.create_dataset('velocity', data=velocity)
        f.create_dataset('x', data=x)
        f.create_dataset('y', data=y)
        f.create_dataset('time', data=np.linspace(0, 10, n_snapshots))

    print(f"Created sample dataset at {sample_file}")

    # Verify file was created
    print(f"Verifying sample file exists: {os.path.exists(sample_file)}")
    print(f"Sample file size: {os.path.getsize(sample_file)} bytes")

    # Check file content with h5py
    with h5py.File(sample_file, 'r') as f:
        print(f"HDF5 file contents: {list(f.keys())}")
        print(f"Velocity shape: {f['velocity'].shape}")

    downloaded_files.append(sample_file)

# CRITICAL: Ensure the data is copied to the output location
# This section must always run, regardless of whether you found or created the data
try:
    # Create the output directory structure if needed (best effort)
    output_data_key = f"{MINIO_OUTPUT_PREFIX}/data/"
    try:
        s3.put_object(Bucket=MINIO_BUCKET, Key=output_data_key)
        print(f"Created directory {output_data_key}")
    except Exception as e:
        print(f"Note: {str(e)}")

    # Upload exactly the files that were validated (or generated) earlier,
    # rather than every HDF5-looking file that happens to sit in temp_dir:
    # a download that failed validation must not reach the pipeline output.
    if not downloaded_files:
        raise FileNotFoundError("No HDF5 files found to upload!")

    for source_path in downloaded_files:
        filename = os.path.basename(source_path)
        target_key = f"{MINIO_OUTPUT_PREFIX}/data/{filename}"

        print(f"Uploading {source_path} ({os.path.getsize(source_path)} bytes) to {target_key}")
        s3.upload_file(source_path, MINIO_BUCKET, target_key)
        print(f"✓ Upload successful")

    # Kept as a name in case later cells read it; every listed file was uploaded
    uploaded_files = len(downloaded_files)

    # Upload parameters file to MinIO
    params_path = os.path.join(temp_dir, 'params.json')
    with open(params_path, 'w') as f:
        json.dump(params, f)

    s3.upload_file(
        params_path,
        MINIO_BUCKET,
        f"{MINIO_OUTPUT_PREFIX}/params.json"
    )

    # Write the success marker last, so its presence implies that the data
    # files and params.json are already in place.
    marker_path = os.path.join(temp_dir, 'data_download_completed.txt')
    with open(marker_path, 'w') as f:
        f.write("Data download completed successfully")

    s3.upload_file(
        marker_path,
        MINIO_BUCKET,
        f"{MINIO_OUTPUT_PREFIX}/data_download_completed.txt"
    )

except Exception as e:
    print(f"ERROR during data upload: {str(e)}")
    raise  # Re-raise to stop execution

# Inspect and visualize HDF5 files
def print_group(name, obj):
    """Print one line describing an HDF5 dataset or group (visititems callback)."""
    if isinstance(obj, h5py.Dataset):
        print(f"  Dataset: {name}, Shape: {obj.shape}, Type: {obj.dtype}")
    elif isinstance(obj, h5py.Group):
        print(f"  Group: {name}")


if downloaded_files:
    print("\nInspecting and visualizing HDF5 files:")

    for file_path in downloaded_files:
        h5_file = os.path.basename(file_path)
        print(f"\nInspecting {h5_file}:")

        try:
            with h5py.File(file_path, 'r') as f:
                # Traverse the file structure
                f.visititems(print_group)

                # If the file contains velocity data, plot a sample.
                # Only the two handled layouts are plotted:
                # (Time, X, Y) and (Time, X, Y, Components).
                if 'velocity' in f and len(f['velocity'].shape) in (3, 4):
                    velocity_ds = f['velocity']
                    first_snapshot = velocity_ds[0]  # read only one snapshot from disk

                    if len(velocity_ds.shape) == 3:
                        field = first_snapshot
                        plot_title = 'First velocity snapshot (scalar)'
                    else:
                        # Plot magnitude of velocity from the first two components
                        field = np.sqrt(first_snapshot[:, :, 0]**2 + first_snapshot[:, :, 1]**2)
                        plot_title = 'First velocity snapshot (magnitude)'

                    img_path = os.path.join(temp_dir, 'velocity_sample.png')
                    fig = plt.figure(figsize=(10, 6))
                    try:
                        # The array is indexed (X, Y) while imshow draws the
                        # first axis vertically; transpose so X runs along the
                        # horizontal axis, matching the axis labels below.
                        plt.imshow(field.T, origin='lower', cmap='viridis')
                        plt.title(plot_title)
                        plt.colorbar(label='Velocity')
                        plt.xlabel('X')
                        plt.ylabel('Y')
                        plt.tight_layout()

                        # Save the plot to a file
                        plt.savefig(img_path)
                        plt.show()
                    finally:
                        # Release the figure so several files do not pile up open figures
                        plt.close(fig)

                    # Upload the visualization to MinIO
                    s3.upload_file(
                        img_path,
                        MINIO_BUCKET,
                        f"{MINIO_OUTPUT_PREFIX}/visualizations/velocity_sample.png"
                    )

                    # Save the first few velocity snapshots locally for inspection
                    npy_path = os.path.join(temp_dir, 'velocity_samples.npy')
                    np.save(npy_path, velocity_ds[:5] if len(velocity_ds) >= 5 else velocity_ds[:])

                    print(f"  Sample visualization uploaded to {MINIO_BUCKET}/{MINIO_OUTPUT_PREFIX}/visualizations/velocity_sample.png")

        except Exception as e:
            # Inspection is best effort: report the problem and move on to the next file
            print(f"  Error inspecting {h5_file}: {str(e)}")

# Final summary of the run
print("\nData fetching and inspection completed!")
print(f"All data uploaded to {MINIO_BUCKET}/{MINIO_OUTPUT_PREFIX}/")

# Read back the output prefix to confirm what actually landed in the bucket
print("\nVerifying uploaded files:")
response = s3.list_objects_v2(Bucket=MINIO_BUCKET, Prefix=f"{MINIO_OUTPUT_PREFIX}/")
if 'Contents' not in response:
    print("No objects found in output location")
else:
    for entry in response['Contents']:
        print(f"  - {entry['Key']} ({entry['Size']} bytes)")

Using temporary directory: /tmp/tmpg46ifiuc
Connecting to MinIO...
Testing MinIO connection to http://minio.minio-system.svc.cluster.local:9000
Successfully connected to bucket rom-data
Bucket rom-data exists and is accessible

Looking for HDF5 files in rom-data/examples/cylinder/
Found 1 objects in the directory
Found 1 HDF5 files:
  Downloading examples/cylinder/cylinder_data.hdf5 (9057238656 bytes) to /tmp/tmpg46ifiuc/cylinder_data.hdf5
  ✓ Valid HDF5 file! Contents: ['coordinate_x', 'coordinate_y', 'density', 'momentum_x', 'momentum_y', 'pressure', 'time']
Successfully downloaded 1 HDF5 files
Created directory rom-pipeline/outputs/data/
Uploading /tmp/tmpg46ifiuc/cylinder_data.hdf5 (9057238656 bytes) to rom-pipeline/outputs/data/cylinder_data.hdf5
✓ Upload successful

Inspecting and visualizing HDF5 files:

Inspecting cylinder_data.hdf5:
  Dataset: coordinate_x, Shape: (421, 600), Type: float64
  Dataset: coordinate_y, Shape: (421, 600), Type: float64
  Dataset: density, Shape: (42