In [10]:
import torchaudio
import torch

from transforms import (
    crop_spectrogram,
    load_path_data,
    load_channel_data,
    fft_256,
    custom_fft,
    standardize,
)

# Example parameters
n_fft = 256
win_length = 256
hop_length = 64

# Create an example signal
signal = torch.randn(16000)  # Example size, simulate 1 second at 16kHz

# Initialize the Spectrogram transform
spectrogram = torchaudio.transforms.Spectrogram(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    normalized=True
)
crop = crop_spectrogram(target_size=[64,64])
std = standardize()

# Apply the transform
spg = spectrogram(signal)
print(spg.shape)
spg = crop(spg)
print(spg.shape)
spg = std(spg)
print(spg.shape)

torch.Size([129, 251])
torch.Size([64, 64])
torch.Size([64, 64])


In [4]:
import json

def print_ie(index_element):
    for k, v in index_element.items():
        print(f"{k}:", v)

index_path = '/itet-stor/maxihuber/deepeye_storage/index_files/full_tueg_index.json'
with open(index_path, 'r') as file:
    index = json.load(file)

short_index = index[:10]

print(len(short_index))

short_index_path = '/itet-stor/maxihuber/deepeye_storage/index_files/short_tueg_index.json'
with open(short_index_path, 'w') as file:
    json.dump(short_index, file, indent=4)
    print("Dumped file!")

10
Dumped file!


In [12]:
import os, sys
import glob
import json
from socket import gethostname

TMPDIR = '/itet-stor/maxihuber/net_scratch/runs/943432/tmp'

file_paths = sorted(glob.glob(
    os.path.join(TMPDIR, f"index_path_{gethostname()}_*.txt")
))
for file_path in file_paths:
    print(file_path)

paths = {}
num_datapoints = 0

for file_path in file_paths:

    with open(file_path, "r") as pointer_file:
        path_to_data_index = pointer_file.read()

        with open(path_to_data_index, "r") as index_file:
            chunks_index = json.load(index_file)

            for _, chunk_path in chunks_index.items():

                paths[num_datapoints] = chunk_path
                num_datapoints += 1

print(len(paths))

/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_0.txt
/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_1.txt
/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_2.txt
/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_3.txt
/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_4.txt
/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_5.txt
/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_6.txt
/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_7.txt
/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_8.txt
/itet-stor/maxihuber/net_scratch/runs/943432/tmp/index_path_arton06_9.txt
2440011


In [3]:
# try teardown
import shutil, os

STORDIR = '/dev/shm/mae'
if os.path.exists(STORDIR):
    print(f"Removing all files and subdirectories in: {STORDIR}")
    shutil.rmtree(STORDIR)
    print(f"Removed all files and subdirectories in {STORDIR}")

Removing all files and subdirectories in: /dev/shm/mae
Removed all files and subdirectories in /dev/shm/mae


# Data Management System

In [9]:
import os
import json

index_paths = ["/itet-stor/maxihuber/deepeye_storage/index_files/full_tueg_index.json", 
               "/itet-stor/maxihuber/deepeye_storage/index_files/full_pkl_index.json"]
path_prefix = '/itet-stor/maxihuber/deepeye_storage/foundation/tueg/edf'

index = []

total_size = 0

with open(index_paths[0], 'r') as file:
    tueg_index = json.load(file)

print(len(tueg_index))


tmp = """
for index_path in index_paths:
    with open(index_path, 'r') as file:
        temp_index = json.load(file)
        for i, index_element in enumerate(temp_index):
            if index_element["path"].endswith(".edf"):
                    index_element["path"] = path_prefix + index_element["path"]
            index.append(index_element)
            total_size += os.path.getsize(index_element["path"]) / (1024 ** 3)
            if i % 10_000 == 0:
                print(f"Processed {i} files. Currently at {round(total_size,2)}GB of data.")
"""
print(len(index))
print(total_size)

Processed 0 files. Currently at 0.02GB of data.
Processed 10000 files. Currently at 243.42GB of data.
Processed 20000 files. Currently at 477.77GB of data.
Processed 30000 files. Currently at 742.3GB of data.
Processed 40000 files. Currently at 967.9GB of data.
Processed 50000 files. Currently at 1168.2GB of data.
Processed 60000 files. Currently at 1386.76GB of data.
Processed 0 files. Currently at 1638.91GB of data.
Processed 10000 files. Currently at 1656.32GB of data.
Processed 20000 files. Currently at 1674.38GB of data.
Processed 30000 files. Currently at 1686.26GB of data.
Processed 40000 files. Currently at 1696.85GB of data.
Processed 50000 files. Currently at 1707.02GB of data.
Processed 60000 files. Currently at 1717.68GB of data.
Processed 70000 files. Currently at 1727.96GB of data.
Processed 80000 files. Currently at 1738.5GB of data.
Processed 90000 files. Currently at 1748.84GB of data.
Processed 100000 files. Currently at 1759.4GB of data.
Processed 110000 files. Curre

# Get size of index contents

In [21]:
def get_index_sizes(index_paths: list):
    index_sizes = [0] * len(index_paths)
    for i, index_path in enumerate(index_paths):
        with open(index_path, 'r') as file:
            temp_index = json.load(file)
            temp_index = temp_index[:len(temp_index) // 5]
            for i, index_element in enumerate(temp_index):
                if index_element["path"].endswith(".edf"):
                        index_element["path"] = path_prefix + index_element["path"]
                total_size += os.path.getsize(index_element["path"]) / (1024 ** 3)
                if i % 10_000 == 0:
                    print(f"Processed {i} files. Currently at {round(total_size,2)}GB of data.")
            index_sizes[i] = total_size

def get_index_size(index):
    total_size = 0
    for i, index_element in enumerate(index):
        total_size += os.path.getsize(index_element["path"]) / (1024 ** 3)
    return total_size

In [12]:
index_paths = ["/itet-stor/maxihuber/deepeye_storage/index_files/full_tueg_index.json", 
               "/itet-stor/maxihuber/deepeye_storage/index_files/full_pkl_index.json"]
path_prefix = '/itet-stor/maxihuber/deepeye_storage/foundation/tueg/edf'

index = []
total_size = 0

for index_path in index_paths:
    with open(index_path, 'r') as file:
        temp_index = json.load(file)
        temp_index = temp_index[:len(temp_index) // 5]
        for i, index_element in enumerate(temp_index):
            if index_element["path"].endswith(".edf"):
                    index_element["path"] = path_prefix + index_element["path"]
            index.append(index_element)
            total_size += os.path.getsize(index_element["path"]) / (1024 ** 3)
            if i % 10_000 == 0:
                print(f"Processed {i} files. Currently at {round(total_size,2)}GB of data.")

print(len(index))
print(round(total_size,2))

Processed 0 files. Currently at 0.02GB of data.
Processed 10000 files. Currently at 243.42GB of data.
13930
311.0793815199286


In [18]:
with open(index_paths[0], 'r') as file:
    temp_index = json.load(file)
    temp_index = temp_index[:len(temp_index) // 5]
    print(len(temp_index))

assert False, "break before dumping"

stor_path = '/itet-stor/maxihuber/deepeye_storage/index_files/311G_tueg_index.json'
with open(stor_path, 'w') as file:
    json.dump(temp_index, file)
    print("Dumped!")

13930
Dumped!


# Implement proportional-to-size splitting of index list

In [22]:
import os
import yaml
from math import ceil
import sys
# Add the directory containing your module to sys.path
sys.path.append('/home/maxihuber/eeg-foundation/preloading')
# Now you can import normally
from preload_chunk import LocalLoader, filter_index, get_index_sizes

def main(data_config, num_chunks, idx):
    # num_chunks: number of workers available in the system
    # idx: the id of the worker this program is called on (each worker executes this main method)

    #TMPDIR = f"{data_config['runs_dir']}/{os.environ['SLURM_ARRAY_JOB_ID']}/tmp"
    #os.makedirs(TMPDIR, exist_ok=True)

    local_loader = LocalLoader(
        base_stor_dir=data_config["STORDIR"],
    )

    index, index_lens, index_sizes = filter_index(
        index_paths=data_config["data_dir"],
        path_prefix=data_config["path_prefix"],
        min_duration=data_config["min_duration"],
        max_duration=data_config["max_duration"],
        select_sr=data_config["select_sr"],
        select_ref=data_config["select_ref"],
    )

    # Calculate total size for normalization
    total_size = sum(index_sizes)
    
    # Calculate number of chunks each root_directory should get based on size
    # E.g. 6 workers for tueg data, 4 workers for pkl data (data is distributed to num_chunks workers in total)
    num_chunks_per_directory = [ceil((size / total_size) * num_chunks) for size in index_sizes]

    # Adjust chunk numbers to sum up to the total number of chunks available
    while sum(num_chunks_per_directory) > num_chunks:
        max_index = num_chunks_per_directory.index(max(num_chunks_per_directory))
        num_chunks_per_directory[max_index] -= 1
        
    # Create global index chunks based on calculated per-directory chunks
    start_idx = 0
    global_index_chunks = []
    for dir_idx, chunks in enumerate(num_chunks_per_directory):
        # Compute the size of a chunk for this directory
        # (e.g. 1'000 files per tueg worker, 20'000 files per pkl worker)
        chunk_size = ceil(index_lens[dir_idx] / chunks)
        for chunk_idx in range(chunks):
            chunk_start = start_idx + chunk_idx * chunk_size
            chunk_end = min(chunk_start + chunk_size, start_idx + index_lens[dir_idx])
            global_index_chunks.append(index[chunk_start:chunk_end])
        start_idx += index_lens[dir_idx]

    # Assign chunk to current worker
    len_index_chunks = [len(index_chunk) for index_chunk in global_index_chunks]
    print(len_index_chunks, file=sys.stderr)

    # Print information about global chunk distribution
    processed = 0
    for idx in range(num_chunks):
        if idx < len(global_index_chunks):
            index_chunk = global_index_chunks[idx]
            print(
                f"Worker {idx} processes {len(index_chunk)} files ({round(get_index_size(index_chunk),1)} GB) from index {processed} to {processed + len(index_chunk)}",
                file=sys.stderr
            )
        else:
            index_chunk = []
            print(f"Worker {idx} has no files to process.", file=sys.stderr)
        processed += len(index_chunk)

    # Process the index_chunk for this idx...
    index_chunk = global_index_chunks[idx] if idx < len(global_index_chunks) else []
    
    # (Add your processing logic here)

num_chunks = 5
idx = 0

# Load main config file
main_config_file = "/home/maxihuber/eeg-foundation/configs/experiment/maxim.yaml"
with open(main_config_file, "r") as file:
    config = yaml.safe_load(file)
    data_config = config["data"]

main(data_config, num_chunks, idx)

Filtering data dir
361390 files found in total
[111858, 13213] files selected per index
[327.54912449698895, 311.0599145554006] GB of data selected per index
125071 files selected in total
[55929, 55929, 4405, 4405, 4403]


[3, 3]
[2, 3]


Worker 0 processes 55929 files (156.3 GB) from index 0 to 55929
Worker 1 processes 55929 files (171.2 GB) from index 55929 to 111858
Worker 2 processes 4405 files (127.4 GB) from index 111858 to 116263
Worker 3 processes 4405 files (105.2 GB) from index 116263 to 120668
Worker 4 processes 4403 files (78.4 GB) from index 120668 to 125071
