# Save state

We use .h5 file to save the simulation states. The idea is that multiple time steps will be saved in a single file, and that we should be able to save the data progressively during the simulation run. 

## 0 Packages

In [2]:
import numpy as np
import h5py
from pathlib import Path
import time

## 1 Save data

Assume we have data t and y at each step

```
t : float
    time stamp
y : array_like[float]
    simulation state 
```

We are going to save them in two datasets "t" and "y".

### 1.1 Create file and save one step

In [2]:
t = 0
y = np.random.rand(100)

In [3]:
with h5py.File("_test.h5", "w") as f:
    t_data = f.create_dataset("t", (1,), maxshape=(None,), dtype=float)
    y_data = f.create_dataset("y", (1,*y.shape), maxshape=(None,*y.shape), dtype=float)
    t_data[0] = t
    y_data[0] = np.random.rand(100)

In [5]:
# inspect the saved data
with h5py.File("_test.h5", "r") as f:
    print(f"t shape = {f["t"].size}\ny shape = {f["y"].size}")

t shape = 1
y shape = 100


### 1.2 Save multiple steps

In [6]:
with h5py.File("_test.h5", "w") as f:
    t_data = f.create_dataset("t", (0,), maxshape=(None,), dtype=float)
    y_data = f.create_dataset("y", (0,*y.shape), maxshape=(None,*y.shape), dtype=float)
    for t in np.linspace(0, 10):
        t_data.resize(t_data.shape[0]+1, axis=0)
        t_data[-1] = t
        y_data.resize(y_data.shape[0]+1, axis=0)
        y_data[-1] = np.random.rand(100)

In [9]:
# inspect the saved data
with h5py.File("_test.h5", "r") as f:
    print(f"t shape = {f["t"].shape}\ny shape = {f["y"].shape}")

t shape = (50,)
y shape = (50, 100)


### 1.3 Append data to an existing .h5 file

In a simulation, we need to save data during the run, and possibily inspect the data while the simulation is still actively writing to the file. This means that we cannot keep the .h5 data file open throughout the run, but rather repeatedly open the file, append data, and close the file. This section test this workflow. 

Intuitively, this repeated file IO process can cause significant overhead and lower the performance of the code. In this section, we will also assess how much overhead is caused by the file operations. 

#### 1.3.1 Append data

In [10]:
# append data
with h5py.File("_test.h5", "r+") as f:
    t_data = f["t"]
    y_data = f["y"]
    t_data.resize(t_data.shape[0]+1, axis=0)
    y_data.resize(y_data.shape[0]+1, axis=0)
    t_data[-1] = t + 1
    y_data[-1] = np.random.rand(*y_data.shape[1:])

In [11]:
# inspect the appended data
with h5py.File("_test.h5", "r") as f:
    print(f"t shape = {f["t"].shape}\ny shape = {f["y"].shape}")

t shape = (51,)
y shape = (51, 100)


#### 1.3.2 File operation overhead

In [None]:
N = int(2e5)

In [25]:
# method 1: open file once
t0 = time.time()
with h5py.File("_test.h5", "w") as f:
    t_data = f.create_dataset("t", (0,), maxshape=(None,), dtype=float)
    y_data = f.create_dataset("y", (0,*y.shape), maxshape=(None,*y.shape), dtype=float)
    for t in np.linspace(0, 10, N):
        t_data.resize(t_data.shape[0]+1, axis=0)
        t_data[-1] = t
        y_data.resize(y_data.shape[0]+1, axis=0)
        y_data[-1] = np.random.rand(100)
t = time.time() - t0
print(f"Open once run time: {t:.1f}")

Open once run time: 7.0


In [8]:
# method 1: open file at each iteration
t0 = time.time()

# create .h5 file
with h5py.File("_test.h5", "w") as f:
    t_data = f.create_dataset("t", (0,), maxshape=(None,), dtype=float)
    y_data = f.create_dataset("y", (0,*y.shape), maxshape=(None,*y.shape), dtype=float)

# repeatedly append data
for t in np.linspace(0, 10, N):
    y = np.random.rand(100)
    with h5py.File("_test.h5", "r+") as f:
        t_data = f["t"]
        y_data = f["y"]
        t_data.resize(t_data.shape[0]+1, axis=0)
        y_data.resize(y_data.shape[0]+1, axis=0)
        t_data[-1] = t
        y_data[-1] = y
t = time.time() - t0
print(f"Open once run time: {t:.1f}")

Open once run time: 55.3


The file operation overhead is rather obvious: the run time of repeatedly appending is 4 times of the method where the file is only opened once. Therefore, while it is desired that we append data progressively, it is also necessary to minimize the number of file opening and data appending. 

#### 1.3.4 Reduce file operation by mode="a" and flushing

This is the recommended approach by Gemini, which brings together simplicity and performance. The idea is to use "a" mode and flush the cached data at a longer interval. 

We need to test if this process actually blocks the access of other programs. 

In [40]:
h5file = Path("_test.h5")
if h5file.exists():
    h5file.unlink()

In [41]:
t0 = time.time()
flush_interval = 0.01
last_flush = 0
T = 10
N = int(2e5)
print(f"Flush {int(T/flush_interval)} times.")
with h5py.File("_test.h5", "a") as f:
    t_data = f.create_dataset("t", (0,), maxshape=(None,), dtype=float)
    y_data = f.create_dataset("y", (0,*y.shape), maxshape=(None,*y.shape), dtype=float)
    
    for t in np.linspace(0, T, N):
        
        t_data.resize(t_data.shape[0]+1, axis=0)
        t_data[-1] = t
        y_data.resize(y_data.shape[0]+1, axis=0)
        y_data[-1] = np.random.rand(100)

        if t - last_flush > flush_interval:
            f.flush()

t = time.time() - t0
print(f"Caching run time: {t:.1f}")

Flush 1000 times.
Caching run time: 87.3


No, the file is locked while the code is running. 

### 1.4 Caching and batch-saving

Cache some data in memory and save them in batches every flush_time.

In [70]:
class DataBuffer(list):
    def __init__(self, 
                 h5file: str | Path, 
                 dset_name: str, 
                 data_shape: tuple = (),
                 dtype : type = float):
        """Initiate a DataBuffer object. The main purpose is to implement a `flush()` method, so that a list of data can be flushed to a designated file on the disk.
        
        Parameters
        ----------
        h5file : str | Path
            path to the .h5 file, where the data are dumped
        dset_name : str
            dataset name
        data_shape : tuple
            data shape at each step as a tuple, e.g. () for scalar
        dtype : type
            date type

        Examples
        --------
        >>> with h5py.File(h5file, "w") as f: pass
        >>> t_buffer = DataBuffer(h5file, "t")
        >>> for t in range(100):
                t_buffer.append(t)
                if len(t_buffer) > thres:
                    t_buffer.flush()
        """
        super().__init__()
        self.dset_name = dset_name
        self.h5file = Path(h5file)
        try:
            with h5py.File(self.h5file, "r+") as f:
                f.create_dataset(self.dset_name, (0, *data_shape), maxshape=(None, *data_shape), dtype=dtype)
        except FileNotFoundError as e:
            raise FileNotFoundError(f"{h5file} not found!")
        
    def flush(self):
        """Flush the buffered data to h5file."""
        n_add = len(self)
        dset_name = self.dset_name

        with h5py.File(self.h5file, "r+") as f:
            dset = f[dset_name]
            dset.resize(dset.shape[0]+n_add, axis=0)
            dset[-n_add:] = self
        self.clear()

In [76]:
flush_interval = 0.01
last_flush = 0
T = 10
N = int(1e6)

h5file = Path("_test.h5")
y_shape = (100, )

with h5py.File(h5file, "w") as f:
    pass # create an empty .h5 file 

t_buffer = DataBuffer(h5file, "t")
y_buffer = DataBuffer(h5file, "y", y_shape)

t0 = time.time()

for t in np.linspace(0, T, N):
    y = np.random.rand(*y_shape)
    t_buffer.append(t)
    y_buffer.append(y)
    if t - last_flush > flush_interval:
        last_flush = t
        print(f"t={t:.2f}, {len(t_buffer)} steps dumped.", end="\r")
        t_buffer.flush()
        y_buffer.flush()

t = time.time() - t0
print(f"\nCaching run time: {t:.1f}")

t=9.99, 1000 steps dumped.
Caching run time: 2.6


In [74]:
with h5py.File("_test.h5", "r") as f:
    print(f["t"][:].shape)

(99901,)


In [66]:
h5file = "__test.h5"
thres = 10
with h5py.File(h5file, "w") as f: pass
t_buffer = DataBuffer(h5file, "t")
for t in range(100):
    t_buffer.append(t)
    if len(t_buffer) > thres:
        t_buffer.flush()
t_buffer.flush()

In [67]:
with h5py.File(h5file, "r") as f:
    t = f["t"][:]