In [11]:
import os
import sys
import h5py

import torch
import numpy as np
from pathlib import Path
from matplotlib import pyplot as plt

# READ FILES

In [4]:
def load_h5(path: Path):
    """Load the array stored under the 'data' key of an .h5 file.

    Args:
        path: Location of the HDF5 file to read.

    Returns:
        A float32 array with its axes reversed (transposed) when the stored
        data can be converted to float32; otherwise the data as a string
        array, left untransposed.
    """
    # Pull the whole dataset into memory, then release the file handle
    with h5py.File(path, "r") as h5_file:
        raw = h5_file["data"][:]

    try:
        # Numeric payload: cast to float32 and reverse the axis order
        return np.transpose(np.array(raw, dtype="float32"))
    except ValueError:
        # Non-numeric payload: fall back to a string array
        return np.array(raw, dtype=str)

In [17]:
# Directory holding one batch of .h5 files (machine-specific absolute path; adjust locally)
data_dir = r"C:\Users\cesar\Desktop\Projects\FoundationModels\Data\data-snr_01_05_1_5\data-snr_01_05_1_5\linear\dim-1\2"
batchdata_dir_path = Path(data_dir)
# os.listdir returns entries in arbitrary order and includes every file in the
# directory; keep only the .h5 files and sort them so the listing is
# deterministic and each listed file carries the suffix the loader cell strips.
file_names = sorted(name for name in os.listdir(data_dir) if name.endswith(".h5"))
file_names

['drift_functions_at_hypercube.h5',
 'f_strs.h5',
 'g_strs.h5',
 'hypercube_locations.h5',
 'init_condition_distr_parameters.h5',
 'obs_times.h5',
 'obs_values.h5',
 'scaled_diffusion_functions_at_hypercube.h5']

In [18]:
# Read every listed file into a dict keyed by its file name without the ".h5" suffix
loaded_data = dict()
for file_name in file_names:
    file_path = batchdata_dir_path.joinpath(file_name)
    file_name_ = file_path.name.removesuffix(".h5")
    loaded_data[file_name_] = load_h5(file_path)

In [19]:
# Show which entries were loaded (one key per file, named after the file without ".h5")
loaded_data.keys()

dict_keys(['drift_functions_at_hypercube', 'f_strs', 'g_strs', 'hypercube_locations', 'init_condition_distr_parameters', 'obs_times', 'obs_values', 'scaled_diffusion_functions_at_hypercube'])

In [20]:
# Bind the loaded entries that are inspected below to their own names
obs_times, obs_values, f_strs = (
    loaded_data[key] for key in ('obs_times', 'obs_values', 'f_strs')
)

In [21]:
# Inspect the dimensions of the loaded observation values
obs_values.shape

(4474, 300, 128, 1, 2)

## Chop data

In [10]:
import h5py
import numpy as np
import os

# Define the directories
data_dir = r'C:\Users\cesar\Desktop\Projects\FoundationModels\Data\data-snr_01_05_1_5\data-snr_01_05_1_5\linear\dim-1\1'  # Your source directory
output_dir = r'C:\Users\cesar\Desktop\Projects\FoundationModels\Data\data-snr_01_05_1_5\data-snr_01_05_1_5\linear\dim-1\taquito2'  # Your destination directory

# List of .h5 files
h5_files = [
    'drift_functions_at_hypercube.h5',
    'f_strs.h5',
    'g_strs.h5',
    'hypercube_locations.h5',
    'init_condition_distr_parameters.h5',
    'obs_times.h5',
    'obs_values.h5',
    'scaled_diffusion_functions_at_hypercube.h5'
]

# Number of leading entries to keep from every dataset
N_ENTRIES = 100

# Axis (of the on-disk layout) along which entries are counted.
# NOTE(review): load_h5 transposes every numeric array on read, and obs_values
# loads as (4474, 300, 128, 1, 2). That suggests the entry axis is the LAST
# axis on disk; slicing axis 0 there (length 2) would never truncate the
# numeric files while still truncating the 1-D string files. TODO confirm for
# this source directory; set CHOP_AXIS = 0 to restore the previous behaviour.
CHOP_AXIS = -1


def chop_h5_file(src_path, dst_path, n_entries=N_ENTRIES, axis=CHOP_AXIS):
    """Copy an HDF5 file, keeping at most ``n_entries`` entries of each dataset.

    Args:
        src_path: Path of the HDF5 file to read.
        dst_path: Path of the HDF5 file to create (overwritten if it exists).
        n_entries: Maximum number of entries kept along ``axis``.
        axis: Axis of the on-disk dataset that is truncated.

    Every top-level dataset of the source is written to the destination under
    the same name and with the same dtype. Datasets shorter than ``n_entries``
    along ``axis`` are copied whole, as are scalar (0-d) datasets.
    """
    with h5py.File(src_path, 'r') as src_file, h5py.File(dst_path, 'w') as dst_file:
        for dataset_name, dataset in src_file.items():
            if dataset.ndim == 0:
                # Scalar datasets have no axis to slice
                chopped = dataset[()]
            else:
                # Slice the h5py dataset itself so that only the kept entries
                # are read from disk, instead of loading the full array first
                selection = [slice(None)] * dataset.ndim
                selection[axis] = slice(0, n_entries)
                chopped = dataset[tuple(selection)]

            # Passing the source dtype keeps the stored type (e.g. strings) unchanged
            dst_file.create_dataset(dataset_name, data=chopped, dtype=dataset.dtype)


# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Process each file
for file_name in h5_files:
    file_path = os.path.join(data_dir, file_name)
    output_path = os.path.join(output_dir, file_name)
    chop_h5_file(file_path, output_path)

print(f"First {N_ENTRIES} entries from each file have been saved.")


First 100 entries from each file have been saved.
