In [None]:
# Install HDF5 and related packages
!pip install h5py numpy matplotlib pandas tables

# HDF5 Tutorial - Hierarchical Data Format

**HDF5** is the gold standard for scientific data storage:
- **Hierarchical**: Groups organize data like filesystem directories
- **Self-describing**: Rich metadata and attributes
- **Cross-platform**: Works everywhere, language-agnostic
- **Efficient**: Chunking, compression, parallel I/O

Perfect for: experimental data, simulations, imaging, time series archives.

In [None]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import os

# Report the HDF5 library and h5py binding versions in use
print(
    f"HDF5 version: {h5py.version.hdf5_version}",
    f"h5py version: {h5py.version.version}",
    sep="\n",
)

## 🏗️ Basic HDF5 Structure

In [None]:
# Create sample experimental data

# Dimensions of the synthetic experiment, named once so every array below
# stays consistent if they are changed.
N_POINTS = 1000        # samples along the time axis
N_WAVELENGTHS = 256    # spectral channels per spectrum

np.random.seed(42)  # fixed seed -> identical data on every run

# Simulate multi-instrument experiment.
# NOTE: the order of the np.random calls below determines the generated values.
time_points = np.linspace(0, 100, N_POINTS)
temperature = 25 + 5 * np.sin(time_points * 0.1) + np.random.normal(0, 0.5, N_POINTS)
pressure = 1013 + 10 * np.cos(time_points * 0.05) + np.random.normal(0, 2, N_POINTS)
spectra = np.random.exponential(1, (N_POINTS, N_WAVELENGTHS))  # one spectrum per time point
wavelengths = np.linspace(400, 800, N_WAVELENGTHS)  # nm

# Sample metadata (stored later as HDF5 attributes)
experiment_info = {
    'title': 'High-Temperature Spectroscopy Experiment',
    'researcher': 'Dr. Smith',
    'date': '2024-01-15',
    'instrument': 'SpectraMax 3000',
    'sample_id': 'HTS-001'
}

# In-memory footprint of the three measurement arrays (coordinate arrays excluded)
total_bytes = spectra.nbytes + temperature.nbytes + pressure.nbytes

print("Generated experimental data:")
print(f"  Time series: {len(time_points)} points")
print(f"  Spectra: {spectra.shape} (time × wavelength)")
print(f"  Total size: {total_bytes/1024**2:.1f} MB")

In [None]:
# Create hierarchical HDF5 structure
from scipy.ndimage import gaussian_filter1d

filename = 'experiment_data.h5'

# SZIP is an optional HDF5 filter and many h5py builds ship without its
# encoder; requesting it there raises ValueError. Fall back to gzip so this
# cell runs on any installation.
spectra_compression = 'szip' if 'szip' in h5py.filters.encode else 'gzip'

# Processed data (example: smoothed temperature), computed up front so the
# block below only performs file I/O.
temp_smooth = gaussian_filter1d(temperature, sigma=2)

with h5py.File(filename, 'w') as f:
    # Root-level metadata
    f.attrs['title'] = experiment_info['title']
    f.attrs['created'] = datetime.now().isoformat()
    f.attrs['hdf5_version'] = h5py.version.hdf5_version

    # Create groups (like directories)
    experiment = f.create_group('experiment')
    raw_data = f.create_group('raw_data')
    processed = f.create_group('processed')
    calibration = f.create_group('calibration')

    # Experiment metadata: one attribute per entry of experiment_info
    for key, value in experiment_info.items():
        experiment.attrs[key] = value

    # Raw data with compression and chunking
    temp_dset = raw_data.create_dataset('temperature', data=temperature,
                                        compression='gzip', compression_opts=9,
                                        chunks=True, shuffle=True)
    temp_dset.attrs['units'] = '°C'
    temp_dset.attrs['instrument'] = 'Thermocouple K-type'
    temp_dset.attrs['calibration_date'] = '2023-12-01'

    pres_dset = raw_data.create_dataset('pressure', data=pressure,
                                        compression='lzf', chunks=True)
    pres_dset.attrs['units'] = 'hPa'
    pres_dset.attrs['instrument'] = 'Digital Barometer'

    # Explicit chunk shape: 100 complete spectra per chunk
    spec_dset = raw_data.create_dataset('spectra', data=spectra,
                                        compression=spectra_compression,
                                        chunks=(100, 256))
    spec_dset.attrs['units'] = 'counts'
    spec_dset.attrs['exposure_time'] = '100ms'

    # Coordinate arrays
    time_dset = raw_data.create_dataset('time', data=time_points, compression='gzip')
    time_dset.attrs['units'] = 'seconds'

    wave_dset = calibration.create_dataset('wavelengths', data=wavelengths)
    wave_dset.attrs['units'] = 'nm'
    wave_dset.attrs['calibration_polynomial'] = [399.8, 1.56, -0.0001]  # Example

    # Store the smoothed temperature computed above
    processed.create_dataset('temperature_smoothed', data=temp_smooth,
                             compression='gzip')

    # Hard link: 'processed/time' becomes a second name for the same dataset
    # as 'raw_data/time' (the data is not copied).
    processed['time'] = raw_data['time']

print(f"HDF5 file created: {filename}")
print(f"File size: {os.path.getsize(filename)/1024:.1f} KB")

## 🔍 Exploring HDF5 Structure

In [None]:
def print_hdf5_structure(name, obj, indent=0):
    """Print a one-line summary of a single HDF5 object, then its attributes.

    The function does not recurse by itself; it is meant to be used as the
    callback of ``Group.visititems``, which calls it once per object.

    Args:
        name: Path of the object relative to the visited group
            (e.g. ``'raw_data/temperature'``).
        obj: The ``h5py.Group`` or ``h5py.Dataset`` to describe. Any other
            object type is ignored.
        indent: Base indentation level (two spaces per level). The nesting
            depth of ``name`` (its number of ``'/'`` separators) is added on
            top, so children are printed further right than their parents.

    Returns:
        None, which lets ``visititems`` continue with the next object.
    """
    if isinstance(obj, h5py.Group):
        header = f"📁 {name}/"
    elif isinstance(obj, h5py.Dataset):
        compression = obj.compression if obj.compression else 'none'
        header = f"📄 {name}: {obj.shape} {obj.dtype} [{compression}]"
    else:
        return

    spaces = '  ' * (indent + name.count('/'))
    print(f"{spaces}{header}")
    # Attributes are printed the same way for groups and datasets
    for attr_name, attr_value in obj.attrs.items():
        print(f"{spaces}   @{attr_name}: {attr_value}")

# Explore the file structure: header, root attributes, then every object
print("HDF5 File Structure:", "=" * 40, sep="\n")

with h5py.File(filename, 'r') as f:
    print(f"📄 {filename}")

    # Attributes attached to the file root
    root_attrs = f.attrs
    for attr_name in root_attrs:
        attr_value = root_attrs[attr_name]
        print(f"   @{attr_name}: {attr_value}")
    print()

    # visititems calls the callback once for each group/dataset below the
    # root; every entry is printed with a base indent of 1.
    def _describe(path, node):
        return print_hdf5_structure(path, node, 1)

    f.visititems(_describe)

## 📖 Reading and Querying HDF5

In [None]:
# Reading data efficiently
with h5py.File(filename, 'r') as f:
    print("Dataset Information:")

    # Dataset handles: nothing is read from disk until they are sliced
    temp_data = f['raw_data/temperature']
    spec_data = f['raw_data/spectra']

    print(f"Temperature: {temp_data.shape}, compression: {temp_data.compression}")
    print(f"Spectra: {spec_data.shape}, chunks: {spec_data.chunks}")

    # Partial reading (efficient for large datasets)
    first_100_temps = temp_data[:100]  # Read only first 100 points
    spectrum_subset = spec_data[100:200, 50:150]  # Subset of spectra

    print("\nPartial reads:")
    print(f"First 100 temperatures: {first_100_temps.shape}")
    print(f"Spectrum subset: {spectrum_subset.shape}")

    # Access metadata
    experiment_title = f['experiment'].attrs['title']
    temp_units = temp_data.attrs['units']

    print("\nMetadata:")
    print(f"Experiment: {experiment_title}")
    print(f"Temperature units: {temp_units}")

    # Query capabilities.
    # Each `temp_data[:]` reads and decompresses the whole dataset, so load it
    # once and reuse the array for both min and max.
    temp_values = temp_data[:]
    print("\nDataset properties:")
    print(f"Temperature range: {temp_values.min():.1f} to {temp_values.max():.1f} {temp_units}")
    print(f"Spectra shape: {spec_data.shape[0]} time points × {spec_data.shape[1]} wavelengths")

In [None]:
# Visualize the hierarchical data

# Load everything needed for plotting first, so the file is closed before any
# drawing happens. Distinct variable names keep the arrays generated earlier in
# the notebook untouched and avoid binding the name `time`.
with h5py.File(filename, 'r') as f:
    time_s = f['raw_data/time'][:]
    temp_raw = f['raw_data/temperature'][:]
    temp_filtered = f['processed/temperature_smoothed'][:]
    pressure_hpa = f['raw_data/pressure'][:]
    spectra_counts = f['raw_data/spectra'][:]
    wavelengths_nm = f['calibration/wavelengths'][:]

n_shown = 200  # number of leading spectra drawn in the heatmap

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Temperature time series
axes[0,0].plot(time_s, temp_raw, 'b-', alpha=0.7, linewidth=0.8, label='Raw')
axes[0,0].plot(time_s, temp_filtered, 'r-', linewidth=2, label='Smoothed')
axes[0,0].set_xlabel('Time (s)')
axes[0,0].set_ylabel('Temperature (°C)')
axes[0,0].set_title('Temperature vs Time')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# Pressure time series
axes[0,1].plot(time_s, pressure_hpa, 'g-', linewidth=1)
axes[0,1].set_xlabel('Time (s)')
axes[0,1].set_ylabel('Pressure (hPa)')
axes[0,1].set_title('Pressure vs Time')
axes[0,1].grid(True, alpha=0.3)

# Spectral heatmap.
# origin='lower' puts row 0 (the shortest wavelength) at the bottom, matching
# the ascending wavelength range given in `extent`; with the default
# origin='upper' the image would be flipped relative to the axis labels.
im = axes[1,0].imshow(spectra_counts[:n_shown, :].T, aspect='auto', cmap='plasma',
                      origin='lower',
                      extent=[time_s[0], time_s[n_shown - 1],
                              wavelengths_nm[0], wavelengths_nm[-1]])
axes[1,0].set_xlabel('Time (s)')
axes[1,0].set_ylabel('Wavelength (nm)')
axes[1,0].set_title(f'Spectral Evolution (first {n_shown} points)')
plt.colorbar(im, ax=axes[1,0], label='Intensity')

# Average spectrum over all time points
avg_spectrum = spectra_counts.mean(axis=0)
axes[1,1].plot(wavelengths_nm, avg_spectrum, 'purple', linewidth=2)
axes[1,1].set_xlabel('Wavelength (nm)')
axes[1,1].set_ylabel('Average Intensity')
axes[1,1].set_title('Time-Averaged Spectrum')
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 🚀 Advanced HDF5 Features

In [None]:
# Advanced features: compound data types, variable-length strings,
# extensible datasets and object references
advanced_file = 'advanced_hdf5.h5'

with h5py.File(advanced_file, 'w') as f:
    study_group = f.create_group('study')

    # 1. Compound data types (like struct): each record has named, typed fields
    participant_dtype = np.dtype([
        ('id', 'i4'), ('name', 'S20'), ('age', 'i4'), ('score', 'f8'),
    ])
    participants = np.array(
        [
            (1, b'Alice', 25, 85.5),
            (2, b'Bob', 30, 92.1),
            (3, b'Charlie', 28, 78.9),
        ],
        dtype=participant_dtype,
    )
    study_group.create_dataset('participants', data=participants)

    # 2. Variable-length strings
    comments = ["Great experiment!", "Need more data points", "Interesting results"]
    string_dtype = h5py.special_dtype(vlen=str)
    study_group.create_dataset('comments', data=comments, dtype=string_dtype)

    # 3. Extensible dataset: starts with zero rows, first axis may grow without limit
    unlimited_data = f.create_dataset(
        'streaming_data',
        shape=(0, 3),
        maxshape=(None, 3),
        chunks=True,
        compression='gzip',
    )

    # Append five batches of ten rows each (simulating streaming)
    for i in range(5):
        new_data = np.random.randn(10, 3)
        start = unlimited_data.shape[0]
        unlimited_data.resize(start + 10, axis=0)
        unlimited_data[start:, :] = new_data

    # 4. Object reference: an attribute that points at the 'study' group
    refs_group = f.create_group('references')
    refs_group.attrs['study_reference'] = study_group.ref

    print("Advanced HDF5 features demonstrated:")
    print(f"  Compound data: {participants.shape} records")
    print(f"  Variable strings: {len(comments)} comments")
    print(f"  Extensible dataset: {unlimited_data.shape}")
    print("  References: Created")

print(f"Advanced file size: {os.path.getsize(advanced_file)/1024:.1f} KB")

In [None]:
# Reading advanced features
with h5py.File(advanced_file, 'r') as f:
    print("Reading Advanced Features:")

    # Read compound data: each record exposes its fields by name
    participants = f['study/participants'][:]
    print("\nParticipants:")
    for p in participants:
        # 'name' is a fixed-length byte string ('S20'), hence the decode()
        print(f"  ID: {p['id']}, Name: {p['name'].decode()}, Age: {p['age']}, Score: {p['score']}")

    # Read variable-length strings.
    # h5py >= 3 returns them as `bytes`, older versions as `str`; decode when
    # needed so the output is plain text instead of b'...'.
    comments = [
        c.decode('utf-8') if isinstance(c, bytes) else c
        for c in f['study/comments'][:]
    ]
    print("\nComments:")
    for i, comment in enumerate(comments):
        print(f"  {i+1}: {comment}")

    # Show extensible dataset
    streaming = f['streaming_data']
    print(f"\nStreaming data shape: {streaming.shape}")
    print(f"Last 3 rows:\n{streaming[-3:, :]}")

    # Follow reference: indexing the file with a reference returns its target
    study_ref = f['references'].attrs['study_reference']
    referenced_group = f[study_ref]
    print(f"\nReferenced group: {referenced_group.name}")

## 🔧 HDF5 for compression

Here we demonstrate some of the compression methods that are easy to use with HDF5.

Note that each compression method (or combination of filters) behaves differently depending on the data type, layout, and content, so the numbers below should not be read as "`gzip` is always the best".

In [None]:
# Clock for the timings below; imported in this cell so the benchmark does not
# depend on the name `time` being bound to the stdlib module elsewhere.
from time import perf_counter

# 1) Make a compressible array (smooth signal + small noise)
nx, ny = 4000, 4000
x = np.linspace(0, 50, nx)[:, None]
y = np.linspace(0, 50, ny)[None, :]
signal = (np.sin(x) + np.cos(y)) * 1000
test_data = (signal + np.random.randn(nx, ny)*2).astype(np.float32)  # still mostly smooth

# 2) Filter settings to compare; each dict is passed straight to create_dataset
compressions = {
    'none': dict(compression=None, shuffle=False, scaleoffset=None),
    'gzip': dict(compression='gzip', shuffle=True, scaleoffset=None),
    'lzf':  dict(compression='lzf',  shuffle=True, scaleoffset=None),
    # LOSSY: for floating-point data, scaleoffset=4 keeps 4 digits after the
    # decimal point and discards the rest -> large size drop, small error
    'gzip+scaleoffset4': dict(compression='gzip', shuffle=True, scaleoffset=4),
}

results = {}
chunk = (256, 256)  # 2D chunks compress well for images/grids

for name, kw in compressions.items():
    fn = f'compression_{name}.h5'
    if os.path.exists(fn):
        os.remove(fn)

    # WRITE: timing includes closing the file, i.e. flushing the data to disk
    t0 = perf_counter()
    with h5py.File(fn, 'w') as f:
        f.create_dataset('data', data=test_data, chunks=chunk, **kw)
    write_time = perf_counter() - t0

    # READ (cold-ish): reopen the file and read the full dataset
    t0 = perf_counter()
    with h5py.File(fn, 'r') as f:
        _ = f['data'][:]
    read_time = perf_counter() - t0

    size = os.path.getsize(fn)
    ratio = test_data.nbytes / size  # in-memory bytes / on-disk bytes
    results[name] = (write_time, read_time, size/1024**2, ratio)

print("HDF5 Compression Performance (structured float32, chunks=256x256)")
print(f"{'Method':18s} {'Write(s)':>8s} {'Read(s)':>8s} {'Size(MB)':>9s} {'Ratio':>6s}")
print("-"*60)
for k,(wt,rt,mb,ra) in results.items():
    print(f"{k:18s} {wt:8.3f} {rt:8.3f} {mb:9.1f} {ra:6.1f}x")

## 📋 HDF5 Quick Reference

In [None]:
from IPython.display import HTML, display
import html

# Self-contained HTML/CSS "quick reference" card summarising common h5py calls.
# The Copy button copies the card's visible text via the browser clipboard API.
html_block = """
<style>
.h5-card {
  font-family: system-ui, -apple-system, Segoe UI, Roboto, Arial, sans-serif;
  max-width: 880px; 
  border:1px solid #ddd; 
  border-radius:12px; 
  padding:20px; 
  background: #fff; 
  box-shadow: 0 2px 10px rgba(0,0,0,.05);
}
.h5-card h2 {
  margin-top:0;
  font-size:22px;
  display:flex;
  align-items:center;
  gap:8px;
}
.h5-card section {
  margin-top:18px;
}
.h5-card h3 {
  margin:0 0 6px;
  font-size:16px;
  color:#006699;
  border-bottom:1px solid #eee;
  padding-bottom:3px;
}
.h5-card pre {
  background:#111; 
  color:#f2f2f2;
  border-radius:6px; 
  padding:10px 12px; 
  overflow:auto;
  line-height:1.35;
  font-size:13px;
}
.copy-btn {
  float:right;
  margin-left:10px;
  cursor:pointer;
  border:1px solid #ccc;
  background:white;
  padding:4px 8px;
  border-radius:6px;
  font-size:12px;
}
</style>

<div class="h5-card" id="h5ref">
  <h2>📚 HDF5 Quick Reference 
    <button class="copy-btn" onclick="navigator.clipboard.writeText(document.querySelector('#h5ref').innerText)">Copy</button>
  </h2>

  <section>
    <h3>Basic Operations</h3>
    <pre>import h5py

with h5py.File('data.h5', 'w') as f:
    f.create_dataset('array', data=numpy_array)
    group = f.create_group('experiment')
    f.attrs['metadata'] = 'value'</pre>
  </section>

  <section>
    <h3>Reading</h3>
    <pre>with h5py.File('data.h5', 'r') as f:
    data = f['array'][:]           # Read all
    subset = f['array'][0:100]     # Partial read
    metadata = f.attrs['metadata'] # Read attribute</pre>
  </section>

  <section>
    <h3>Compression &amp; Performance</h3>
    <pre>f.create_dataset('data', data=array,
                 compression='gzip',     # gzip, lzf, szip
                 compression_opts=9,     # Compression level
                 chunks=True,            # Auto chunking
                 shuffle=True)           # Byte shuffle filter</pre>
  </section>

  <section>
    <h3>Groups &amp; Organization</h3>
    <pre>experiment = f.create_group('experiment')
experiment.create_dataset('data', data=array)
experiment.attrs['researcher'] = 'Dr. Smith'</pre>
  </section>

  <section>
    <h3>Advanced Features</h3>
    <pre># Unlimited dimensions
f.create_dataset('stream', shape=(0,), maxshape=(None,))

# Compound data types
dt = np.dtype([('name', 'S20'), ('value', 'f8')])
f.create_dataset('records', data=structured_array, dtype=dt)</pre>
  </section>

  <section>
    <h3>Best Practices</h3>
    <ul style="margin:0; padding-left:18px; color:gray">
      <li>Use groups to organize related datasets</li>
      <li>Add comprehensive metadata with attributes</li>
      <li>Enable compression for most datasets</li>
      <li>Use chunking for large arrays with partial access</li>
      <li>Close files properly (use 'with' statements)</li>
    </ul>
  </section>

  <div style="margin-top:15px;color:#444;font-size:13px;">
    🏗️ HDF5: The universal scientific data format! 📊
  </div>
</div>
"""

# Render the card as rich HTML output of this cell
display(HTML(html_block))
