In [1]:
import h5py
import numpy as np
import time

In [2]:
def print_dataset_sizes(h5_path):
    """Print the shape of ``comp_env_interp_1`` for every top-level group.

    Parameters
    ----------
    h5_path : str
        Path of the HDF5 file; it is opened read-only.

    Notes
    -----
    A group that lacks the dataset is reported with a "not found" line
    rather than raising, so the listing always covers every group.
    """
    with h5py.File(h5_path, 'r') as file:
        for group in file.keys():
            dataset_name = f'{group}/comp_env_interp_1'
            if dataset_name in file:
                # The shape is available on the dataset handle itself, so the
                # array contents never have to be read into memory.
                shape = file[dataset_name].shape
                print(f'Size of dataset {dataset_name}: {shape}')
            else:
                print(f'Dataset {dataset_name} not found')

def print_first_group_datasets(h5_path):
    """Print the name and shape of every dataset in the file's first group.

    Parameters
    ----------
    h5_path : str
        Path of the HDF5 file; it is opened read-only.

    Raises
    ------
    IndexError
        If the file has no top-level entries at all.

    Notes
    -----
    Assumes every member of the first group is a dataset (i.e. exposes
    ``.shape``); nested sub-groups are not handled.
    """
    with h5py.File(h5_path, 'r') as file:
        group_names = list(file.keys())
        if not group_names:
            # Same exception type as indexing an empty list, but with a
            # message that points at the actual problem.
            raise IndexError(f"No groups found in {h5_path}")

        first_group = group_names[0]
        print(f"Datasets in the first group ({first_group}):")

        members = file[first_group]
        for dataset in members.keys():
            # Shape comes from the handle; the data itself is not loaded.
            print(f" - {dataset}: {members[dataset].shape}")

In [3]:
# Location of the HDF5 file that the following cells read from.
h5_path = '../data/dataoncosalud/res_valid/comp_env_data.h5'
# Optional, disabled: print the comp_env_interp_1 shape of every group.
# print_dataset_sizes(h5_path)
# List the datasets (name and shape) stored in the first group of the file.
print_first_group_datasets(h5_path)

Datasets in the first group (file_0001):
 - R_matrix: (188, 200)
 - S_matrix: (188, 200)
 - a_0: (1, 1)
 - b_0: (1, 1)
 - beta_matrix: (188, 200)
 - comp_env_interp_1: (244, 256)
 - env_rf_interp: (244, 256)
 - k_matrix: (188, 200)
 - n: (1, 1)
 - validRS: (188, 200)


In [4]:
def calculate_total_windows(h5_path, n):
    """Count every n-by-n sliding window over all ``comp_env_interp_1`` datasets.

    Parameters
    ----------
    h5_path : str
        Path of the HDF5 file; it is opened read-only.
    n : int
        Side length of the square window.

    Returns
    -------
    int
        Sum over all groups of ``(rows - n + 1) * (cols - n + 1)``, where a
        dataset smaller than the window along either axis contributes 0.

    Raises
    ------
    ValueError
        If a group has no ``comp_env_interp_1`` dataset, or the dataset is
        not 2-dimensional.
    """
    total_windows = 0
    with h5py.File(h5_path, 'r') as file:
        for group in file.keys():
            dataset_name = f'{group}/comp_env_interp_1'
            if dataset_name not in file:
                raise ValueError(f"Dataset {dataset_name} not found")

            # Only the shape is needed, so the array is never loaded.
            shape = file[dataset_name].shape
            if len(shape) != 2:
                raise ValueError(f"Dataset {dataset_name} is not 2-dimensional")

            # Clamp each axis at zero: without this, a dataset smaller than
            # the window on BOTH axes yields two negative factors whose
            # product is a positive count of windows that do not exist.
            positions_down = max(shape[0] - n + 1, 0)
            positions_across = max(shape[1] - n + 1, 0)
            total_windows += positions_down * positions_across
    return total_windows

def get_window_xy(h5_path, n, window_idx):
    """Return the n-by-n window and its ``validRS`` label for a global index.

    Windows are numbered group by group (in ``file.keys()`` order) and, inside
    a group, row-major over the top-left corner positions of
    ``comp_env_interp_1``.

    Parameters
    ----------
    h5_path : str
        Path of the HDF5 file; it is opened read-only.
    n : int
        Side length of the square window.
    window_idx : int
        Zero-based global window index.

    Returns
    -------
    tuple
        ``(comp_env_window, validRS_value)``: the ``n x n`` patch whose
        top-left corner is ``(row_idx, col_idx)`` and the entry of the
        group's ``validRS`` dataset at that same ``(row_idx, col_idx)``.

    Raises
    ------
    IndexError
        If ``window_idx`` is negative or not smaller than the number of
        windows available.
    ValueError
        If a group visited before the target window is found lacks one of
        the two datasets, or its ``comp_env_interp_1`` is not 2-dimensional.

    Notes
    -----
    The file is scanned once and only dataset shapes are inspected until the
    owning group is found; then just the requested patch and the single label
    are read. Groups located after the target are therefore not validated.

    Assumes ``validRS`` has one entry per window position, i.e. shape
    ``(rows - n + 1, cols - n + 1)``; this is not checked here.
    """
    window_idx = int(window_idx)
    if window_idx < 0:
        # A negative index would otherwise wrap around inside the first group
        # and return an unrelated (possibly truncated) patch.
        raise IndexError("Window index out of range")

    current_window = 0
    with h5py.File(h5_path, 'r') as file:
        for group in file.keys():
            dataset_name = f'{group}/comp_env_interp_1'
            validRS_name = f'{group}/validRS'
            if dataset_name not in file or validRS_name not in file:
                raise ValueError(f"Dataset {dataset_name} or {validRS_name} not found")

            dataset = file[dataset_name]
            shape = dataset.shape
            if len(shape) != 2:
                raise ValueError(f"Dataset {dataset_name} is not 2-dimensional")

            # Clamp at zero so a dataset smaller than the window contributes
            # no windows instead of a bogus positive product of negatives.
            positions_down = max(shape[0] - n + 1, 0)
            positions_across = max(shape[1] - n + 1, 0)
            num_windows = positions_down * positions_across

            if current_window + num_windows > window_idx:
                local_idx = window_idx - current_window
                row_idx, col_idx = divmod(local_idx, positions_across)

                # Slice the dataset handle directly: only the patch is read.
                comp_env_window = dataset[row_idx:row_idx + n, col_idx:col_idx + n]
                validRS_value = file[validRS_name][row_idx, col_idx]
                return comp_env_window, validRS_value

            current_window += num_windows
    raise IndexError("Window index out of range")

# Example usage
# Side length of the square window; later cells reuse this notebook-level `n`.
n=57
# Total number of n x n windows across every group in the file.
total_windows = calculate_total_windows(h5_path, n)
print(f'Total number of {n}x{n} windows: {total_windows}')

# Smoke test: fetch a single window and its validRS label by global index.
window_idx = 10
window,y = get_window_xy(h5_path, n, window_idx)
print(window)
print(y)

Total number of 57x57 windows: 30098952
[[228.00782598 246.61446106 227.65132666 ... 228.97241303 247.42983931
  229.74282524]
 [240.29754907 249.00858207 240.31045496 ... 240.24563808 248.73629615
  240.24688461]
 [230.13299947 238.50901996 212.78459219 ... 230.34037456 240.20746346
  231.8715201 ]
 ...
 [190.55447766 199.11347373 202.26921848 ... 189.29886836 182.00335906
  156.5278377 ]
 [192.09987828 212.80662926 211.47231394 ... 190.68785246 188.70828037
  182.91547659]
 [190.83935153 207.86406274 210.41110231 ... 159.51572129 175.97945064
  190.18697278]]
0.0


In [5]:
def create_validRS_dataset(h5_path):
    """Collect every ``validRS`` entry of every group into one flat array.

    Groups are visited in ``file.keys()`` order and each 2-D ``validRS``
    dataset is flattened row by row, so position ``k`` of the result is the
    k-th entry in that group-major, row-major ordering.

    Parameters
    ----------
    h5_path : str
        Path of the HDF5 file; it is opened read-only.

    Returns
    -------
    numpy.ndarray
        1-D array of all ``validRS`` values (empty if the file has no groups).

    Raises
    ------
    ValueError
        If a group has no ``validRS`` dataset, or it is not 2-dimensional.

    Notes
    -----
    Prints one progress line per group in the form ``<group>/<group count>``.
    """
    flattened_maps = []
    with h5py.File(h5_path, 'r') as file:
        group_names = list(file.keys())
        group_count = len(group_names)  # computed once instead of per iteration
        for group in group_names:
            print(f"{group}/{group_count}")
            validRS_name = f'{group}/validRS'
            if validRS_name not in file:
                raise ValueError(f"Dataset {validRS_name} not found")

            validRS = np.array(file[validRS_name])
            if validRS.ndim != 2:
                raise ValueError(f"Dataset {validRS_name} is not 2-dimensional")

            # ravel() walks the array in row-major order, i.e. exactly the
            # order of a nested "for row / for col" loop, without appending
            # millions of scalars one by one.
            flattened_maps.append(validRS.ravel())

    if not flattened_maps:
        # np.concatenate rejects an empty list; keep returning an empty array.
        return np.array([])
    return np.concatenate(flattened_maps)

# Flatten the validRS map of every group into one label vector; later cells
# index into it with global window indices.
validRS_values=create_validRS_dataset(h5_path)

file_0001/604
file_0002/604
file_0003/604
file_0004/604
file_0005/604
file_0006/604
file_0007/604
file_0008/604
file_0009/604
file_0010/604
file_0011/604
file_0012/604
file_0013/604
file_0014/604
file_0015/604
file_0016/604
file_0017/604
file_0018/604
file_0019/604
file_0020/604
file_0021/604
file_0022/604
file_0023/604
file_0024/604
file_0025/604
file_0026/604
file_0027/604
file_0028/604
file_0029/604
file_0030/604
file_0031/604
file_0032/604
file_0033/604
file_0034/604
file_0035/604
file_0036/604
file_0037/604
file_0038/604
file_0039/604
file_0040/604
file_0041/604
file_0042/604
file_0043/604
file_0044/604
file_0045/604
file_0046/604
file_0047/604
file_0048/604
file_0049/604
file_0050/604
file_0051/604
file_0052/604
file_0053/604
file_0054/604
file_0055/604
file_0056/604
file_0057/604
file_0058/604
file_0059/604
file_0060/604
file_0061/604
file_0062/604
file_0063/604
file_0064/604
file_0065/604
file_0066/604
file_0067/604
file_0068/604
file_0069/604
file_0070/604
file_0071/604
file_0

In [6]:
# Number of collected labels. In the recorded run this equals the total
# number of 57x57 windows printed earlier (30098952).
validRS_values.shape

(30098952,)

In [7]:
from sklearn.model_selection import train_test_split
import pickle

def _split_by_label(indices, labels):
    """Return ``(zeros, ones)``: the entries of `indices` labelled 0 and 1."""
    picked = labels[indices]
    return indices[picked == 0], indices[picked == 1]


def _truncate_to_balance(zeros, ones):
    """Cut the longer of the two arrays so both have the same length."""
    keep = min(len(zeros), len(ones))
    return zeros[:keep], ones[:keep]


def _interleave(zeros, ones):
    """Alternate two equal-length arrays as zero, one, zero, one, ..."""
    mixed = np.empty((zeros.size + ones.size,), dtype=zeros.dtype)
    mixed[0::2] = zeros
    mixed[1::2] = ones
    return mixed


# Positions (global window indices) of the 0 and 1 labels.
zero_indices = np.where(validRS_values == 0)[0]
one_indices = np.where(validRS_values == 1)[0]

# Keep every 1 and draw the same number of 0s.
# NOTE: train_test_split is used here only as a seeded sampler; it requires
# strictly fewer 1s than 0s, otherwise it raises.
num_ones = len(one_indices)
num_zeros = num_ones
selected_zero_indices, _ = train_test_split(zero_indices, train_size=num_zeros, random_state=42)

# Pool the sampled 0s with all 1s, then split 80/20 into train/validation.
selected_indices = np.concatenate((selected_zero_indices, one_indices))
train_indices, val_indices = train_test_split(selected_indices, test_size=0.2, random_state=42)

# The split above is not stratified, so each side is re-balanced by dropping
# the surplus of whichever class is larger.
train_zeros, train_ones = _truncate_to_balance(*_split_by_label(train_indices, validRS_values))
val_zeros, val_ones = _truncate_to_balance(*_split_by_label(val_indices, validRS_values))

# Final index arrays alternate 0-labelled and 1-labelled windows.
train_indices = _interleave(train_zeros, train_ones)
val_indices = _interleave(val_zeros, val_ones)

# Save the splits into a pickle file. The keys say "files" but the values are
# global window indices; the key names are kept for existing readers.
split_data = {
    'train_files': train_indices,
    'val_files': val_indices
}

with open('data_splits_CNN.pkl', 'wb') as f:
    pickle.dump(split_data, f)

print(f'Training files: {len(train_indices)}, Validation files: {len(val_indices)}')

Training files: 838308, Validation files: 209280
Training files: 838308, Validation files: 209280


In [8]:
# Class counts per split after balancing: train 0s, train 1s, val 0s, val 1s.
print(len(train_zeros), len(train_ones), len(val_zeros), len(val_ones))
# Sum of labels over each split; with 0/1 labels this is the number of 1s.
print(validRS_values[train_indices].sum(), validRS_values[val_indices].sum())
# Same sum restricted to the first half of each split, to check that the
# classes alternate rather than being stored as two contiguous runs.
print(validRS_values[train_indices[0:len(train_indices)//2]].sum(), validRS_values[val_indices[0:len(val_indices)//2]].sum())

# Reference values from the recorded run, kept for comparison:
# print('419154 419154 104640 104640')
# print('419154.0 104640.0')
# print('209577.0 52320.0')

419154 419154 104640 104640
419154.0 104640.0
209577.0 52320.0


In [None]:
from tqdm import tqdm

def create_windows_array(h5_path, indices, n):
    """Load the windows and labels for a sequence of global window indices.

    Parameters
    ----------
    h5_path : str
        Path of the HDF5 file, forwarded to ``get_window_xy``.
    indices : sequence of int
        Global window indices to load, in output order.
    n : int
        Side length of the square window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, labels)`` with shapes ``(len(indices), n, n)`` and
        ``(len(indices),)``; row ``i`` corresponds to ``indices[i]``.
    """
    count = len(indices)
    # Pre-allocated outputs, filled slot by slot below.
    patches = np.zeros((count, n, n))
    labels = np.zeros(count)

    progress = tqdm(indices, desc="Loading windows")
    for slot, global_idx in enumerate(progress):
        patches[slot], labels[slot] = get_window_xy(h5_path, n, global_idx)

    return patches, labels

# Create arrays for training and validation sets
# Only the first 3200 training and 800 validation indices are loaded here,
# not the full splits.
train_comp_env_windows, train_validRS_values = create_windows_array(h5_path, train_indices[:3200], n)
val_comp_env_windows, val_validRS_values = create_windows_array(h5_path, val_indices[:800], n)

# Report the shapes of the loaded arrays.
print(f'Train comp_env_windows shape: {train_comp_env_windows.shape}')
print(f'Train validRS_values shape: {train_validRS_values.shape}')
print(f'Validation comp_env_windows shape: {val_comp_env_windows.shape}')
print(f'Validation validRS_values shape: {val_validRS_values.shape}')

# Save the arrays into a pickle file
# The dictionary is built, but the dump below is currently disabled.
data_arrays = {
    'train_comp_env_windows': train_comp_env_windows,
    'train_validRS_values': train_validRS_values,
    'val_comp_env_windows': val_comp_env_windows,
    'val_validRS_values': val_validRS_values
}

# with open('data_arrays_CNN.pkl', 'wb') as f:
#     pickle.dump(data_arrays, f)

Loading windows: 100%|██████████| 800/800 [04:22<00:00,  3.05it/s]


Train comp_env_windows shape: (3200, 57, 57)
Train validRS_values shape: (3200,)
Validation comp_env_windows shape: (800, 57, 57)
Validation validRS_values shape: (800,)


In [90]:
def load_h5_dataset(h5_path, group, dataset):
    """Read one dataset of one group fully into memory.

    Parameters
    ----------
    h5_path : str
        Path of the HDF5 file; it is opened read-only.
    group : str
        Name of the group that holds the dataset.
    dataset : str
        Name of the dataset inside that group.

    Returns
    -------
    numpy.ndarray
        A copy of the dataset contents, usable after the file is closed.
    """
    with h5py.File(h5_path, 'r') as h5_file:
        node = h5_file[group]
        # Materialise while the file is still open.
        return np.array(node[dataset])

In [109]:
h5_path = '../data/dataoncosalud/res_valid/comp_env_data.h5'

# Group names printed earlier in this notebook are zero-padded to four digits
# (file_0001, file_0002, ...), so the numeric id must be padded the same way;
# an unpadded 'file_310' would not match that naming.
group=310
group_name = f'file_{group:04d}'

# Shape of the envelope image for the chosen group.
data = load_h5_dataset(h5_path, group_name, 'comp_env_interp_1')
print(data.shape)
# `data` is rebound here: from this point on it holds the validRS dataset.
data = load_h5_dataset(h5_path, group_name, 'validRS')
print(data.shape)

(337, 256)
(337, 256)


In [92]:
# Value range of whatever `data` currently holds.
# NOTE(review): the cell above leaves `data` bound to validRS, while the
# recorded output (about 73 to 251) looks like comp_env_interp_1 values.
# Confirm which dataset is intended here.
np.min(data), np.max(data)

(73.34969801677707, 250.6374750850312)