# mat_modifier (parameterized)

This version removes hardcoded paths and reads everything from `configs/default.yaml`.
It produces per-subject reduced `.mat` files containing PPG, ABP, and a small metadata block,
and writes a `manifest.json` with a summary of what was created.

**Run order:** Run all cells top-to-bottom. Make sure `configs/default.yaml` exists.


In [16]:
# Environment report (optional but helps reproducibility)
import sys, importlib
print("python:", sys.version.split()[0])
for pkg in ["yaml", "scipy", "numpy", "mat73"]:
    try:
        m = importlib.import_module(pkg)
        v = getattr(m, "__version__", "n/a")
        print(f"{pkg}:", v)
    except Exception as e:
        print(f"{pkg}: not installed ({e})")


python: 3.11.9
yaml: 6.0.1
scipy: 1.13.1
numpy: 2.0.2
mat73: n/a


In [17]:

# --- Config & imports ---
from pathlib import Path
import yaml, json
# Prefer mat73 for MATLAB v7.3 files; fallback to scipy for older versions
try:
    import mat73
    def load_mat(path): return mat73.loadmat(str(path))
except Exception as e:
    from scipy.io import loadmat as scipy_loadmat
    def load_mat(path): return scipy_loadmat(path, simplify_cells=True)

# You can override this via environment variable in notebooks if needed:
# %env CFG=configs/alt.yaml
import os
CFG_PATH = os.environ.get("CFG", "configs/default.yaml")

with open(CFG_PATH, "r") as f:
    cfg = yaml.safe_load(f)

RAW_DIR   = Path(cfg["paths"]["raw_waveforms"])
OUT_DIR   = Path(cfg["paths"]["reduced_waveforms"])
GLOB_PAT  = cfg["mat_modifier"]["glob"]

GROUP_NAME = cfg["mat_modifier"].get("group_name", "Subj_Wins")
KEYS_TO_INCLUDE = cfg["mat_modifier"]["keys_to_include"]
TF = cfg["mat_modifier"].get("time_fields", {})
print("Config loaded from:", CFG_PATH)
print("RAW_DIR:", RAW_DIR)
print("OUT_DIR:", OUT_DIR)
print("GLOB_PAT:", GLOB_PAT)
print("GROUP_NAME:", GROUP_NAME)
print("KEYS_TO_INCLUDE:", KEYS_TO_INCLUDE)


Config loaded from: configs/default.yaml
RAW_DIR: data\mimic_waveforms
OUT_DIR: data\reduced_waveforms
GLOB_PAT: *.mat
GROUP_NAME: Subj_Wins
KEYS_TO_INCLUDE: ['Age', 'CaseID', 'Gender', 'PPG_Raw', 'SegmentID', 'SubjectID', 'T']


In [None]:
# --- Core logic copied from user's working approach (parameterized) ---
import numpy as np, pandas as pd, os
from pathlib import Path
from tqdm import tqdm
import h5py

def process_data(data, output_group, keys_to_include):
    for key, value in data.items():
        if key not in keys_to_include:
            continue
        if isinstance(value, dict):
            sub_group = output_group.create_group(key)
            process_data(value, sub_group, keys_to_include)
        else:
            try:
                if isinstance(value, list):
                    value = np.array(value)
                if isinstance(value, np.ndarray) and value.dtype == 'object':
                    uniform_array = np.array([np.asarray(x) for x in value])
                    output_group.create_dataset(key, data=uniform_array)
                elif isinstance(value, np.ndarray) and value.size == 1:
                    scalar_value = value.item()
                    output_group.create_dataset(key, data=scalar_value)
                elif isinstance(value, np.ndarray) and np.issubdtype(value.dtype, np.str_):
                    output_group.create_dataset(key, data=value.astype('S'))
                else:
                    output_group.create_dataset(key, data=value)
            except Exception as e:
                print(f"Error processing key {key}: {e}")

def process_mat_file(input_file: str, output_file: str):
    group_name = cfg["mat_modifier"].get("group_name", "Subj_Wins")
    keys_to_include = cfg["mat_modifier"]["keys_to_include"]
    case_fmt = cfg["mat_modifier"].get("time_fields", {}).get("case_id_format", "%Y-%m-%d-%H-%M")
    seconds_per_segment = int(cfg["mat_modifier"].get("time_fields", {}).get("seconds_per_segment", 10))
    output_time_key = cfg["mat_modifier"].get("time_fields", {}).get("output_key", "Segment_Time")
    output_time_fmt = cfg["mat_modifier"].get("time_fields", {}).get("output_format", "%Y-%m-%d-%H-%M-%S")

    # Load the .mat file
    mat_data = load_mat(input_file)

    subj = mat_data.get(group_name)
    if subj is None:
        for k, v in mat_data.items():
            if isinstance(k, str) and k.lower() == group_name.lower():
                subj = v; break
    assert isinstance(subj, dict), f"{group_name} not found or not a dict"

    case_ids_list = subj.get('CaseID')
    segment_ids_list = subj.get('SegmentID')

    if type(case_ids_list) == list and type(segment_ids_list) == list:
        case_ids = [case_id[0] for case_id in case_ids_list]
        segment_ids = [segment_id[0] for segment_id in segment_ids_list]
        df = pd.DataFrame({'CaseID': case_ids, 'SegmentID': segment_ids})
        df['CaseID_datetime'] = pd.to_datetime(df['CaseID'], format=case_fmt)
        df['SegmentTime'] = df['SegmentID'] * seconds_per_segment
        df['FinalTime'] = df['CaseID_datetime'] + pd.to_timedelta(df['SegmentTime'], unit='s')
        df['FinalTime_str'] = df['FinalTime'].dt.strftime(output_time_fmt)
        Segment_Time = df['FinalTime_str'].values.reshape(-1, 1)
        subj[output_time_key] = Segment_Time

    with h5py.File(output_file, 'w') as out_f:
        g = out_f.create_group(group_name)
        process_data(subj, g, keys_to_include)
        if output_time_key in subj:
            g.create_dataset(output_time_key, data=subj[output_time_key].astype('S'))

def reduce_mat_file(in_path: Path):
    out_path = OUT_DIR / in_path.name
    process_mat_file(str(in_path), str(out_path))
    present = {
        "PPG": "PPG_Raw" in load_mat(in_path).get(cfg["mat_modifier"]["group_name"], {}),
    }
    sid = Path(in_path).stem
    return {"input": str(in_path), "output": str(out_path), "subject_id": sid, "present": present}

In [19]:
# --- Run over all input files and write manifest.json ---
files = sorted(RAW_DIR.glob(GLOB_PAT))
assert files, f"No files matched '{GLOB_PAT}' in {RAW_DIR}"

manifest = []
for i, f in enumerate(files, 1):
    try:
        entry = reduce_mat_file(f)
        manifest.append(entry)
        if i % 50 == 0 or i == len(files):
            print(f"[{i}/{len(files)}] {f.name} → {entry['output']}")
    except Exception as e:
        print("ERROR:", f, e)

manifest_path = OUT_DIR / "manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))
print(f"Done. Wrote {len(manifest)} reduced files and manifest → {manifest_path}")


[1/1] p000160.mat → data\reduced_waveforms\p000160.mat
Done. Wrote 1 reduced files and manifest → data\reduced_waveforms\manifest.json


In [22]:
from scipy.io import loadmat as scipy_loadmat
from mat73 import loadmat

def load_h5_file(file_path):
    def recursively_load_data(h5_obj):
        if isinstance(h5_obj, h5py.Dataset):
            data = h5_obj[()]
            if isinstance(data, bytes):  # Decode byte strings
                return data.decode()
            elif isinstance(data, np.ndarray) and data.dtype.type is np.bytes_:
                return data.astype(str)  # Decode byte strings in numpy arrays
            return data
        elif isinstance(h5_obj, h5py.Group):
            data = {}
            for key, item in h5_obj.items():
                data[key] = recursively_load_data(item)
            return data
        else:
            raise TypeError(f"Unsupported type: {type(h5_obj)}")

    with h5py.File(file_path, 'r') as f:
        return recursively_load_data(f)

input_file = 'D:/Repo files/data/reduced_waveforms/p000160.mat'
data = load_h5_file(input_file)
data['Subj_Wins']['ABP_Raw'][0][0]

KeyError: 'ABP_Raw'