In [1]:
import os
import h5py

from dotenv import load_dotenv


# Load environment variables through python-dotenv before reading them below.
load_dotenv()

# Repository root, taken from the REPO_ROOT environment variable (KeyError if unset).
REPO_ROOT = os.environ["REPO_ROOT"]

# Notebooks need an absolute path: join the DATA_ROOT value onto REPO_ROOT,
# then normalise the result.
_data_root_joined = os.path.join(REPO_ROOT, os.environ["DATA_ROOT"])
DATA_ROOT = os.path.abspath(_data_root_joined)


# Working with the h5 format: On the fly vs preloaded

Now let us think about **how to handle the data**.

- **Option 1: Save it as NumPy `.npy` files**

In [None]:

import os
import h5py
import numpy as np

import tempfile  # stdlib; only used for the throw-away .npy sample below

# Purpose: estimate the disk space needed if every .h5 file under
# `data_path_h5` were stored as a float32 .npy file. One sample file is
# converted, its .npy/.h5 size ratio is measured, and that ratio is applied
# to the combined size of all .h5 files. The result is an extrapolation from
# a single file, not an exact figure.

# -------- CONFIG ---------
# NOTE(review): RAW_DATA is not defined in the cells visible here (the setup
# cell defines DATA_ROOT) — confirm it is assigned before this cell runs.
data_path_h5 = RAW_DATA
sample_h5 = None
# -------------------------


# 1) Collect every .h5 file in a single directory walk (case-insensitive
#    extension match). The first file found serves as the sample.
h5_paths = [
    os.path.join(root, name)
    for root, _, files in os.walk(data_path_h5)
    for name in files
    if name.lower().endswith(".h5")
]

if not h5_paths:
    raise RuntimeError("No .h5 file found in the folder!")

sample_h5 = h5_paths[0]

print("Sample:", sample_h5)


# 2) Convert sample to .npy (temp file)
with h5py.File(sample_h5, "r") as f:
    # Read the whole "array" dataset into memory.
    arr = f["array"][()]

# The measured expansion therefore reflects float32 storage.
arr = arr.astype(np.float32)

# Write the sample into a temporary directory instead of next to the raw data:
# the data folder stays untouched and the file is deleted even if a later
# statement raises.
with tempfile.TemporaryDirectory() as tmp_dir:
    sample_npy = os.path.join(tmp_dir, os.path.basename(sample_h5) + ".temp.npy")
    np.save(sample_npy, arr)

    # 3) Measure sizes (bytes -> GB, decimal)
    size_npy = os.path.getsize(sample_npy) / 1e9

size_h5 = os.path.getsize(sample_h5) / 1e9

print(f"Sample .h5 size : {size_h5:.3f} GB")
print(f"Sample .npy size: {size_npy:.3f} GB")

multiplier = size_npy / size_h5
print(f"Expansion factor: {multiplier:.2f}x")

# 4) Compute total size of all .h5 (reuses the list collected in step 1)
num_files = len(h5_paths)
total_h5 = sum(os.path.getsize(path) for path in h5_paths)

total_h5_gb = total_h5 / 1e9

print(f"\nTotal .h5 files: {num_files}")
print(f"Total .h5 size : {total_h5_gb:.3f} GB")

# 5) Estimated required space for all .npy
estimated_npy = total_h5_gb * multiplier
print(f"\nEstimated total .npy size: {estimated_npy:.2f} GB")

# The temporary directory (and the sample .npy inside it) was already removed
# when the `with` block above exited.
print("\nTemporary sample .npy removed.")


Sample: C:\Users\user\UPM\Imperial-4año\IoT\Github\BERLIN_reduced\data\2019-01-02_BERLIN_8ch_reduced.h5
Sample .h5 size : 0.001 GB
Sample .npy size: 0.021 GB
Expansion factor: 21.04x

Total .h5 files: 180
Total .h5 size : 0.187 GB

Estimated total .npy size: 3.94 GB

Temporary sample .npy removed.


Too much space: roughly 3.94 GB of `.npy` files versus 0.187 GB of `.h5` files (about a 21x expansion).

- **Option 2: Transform on the fly**

  This is the best option for saving space.

We will use this.