In [None]:
! pip install --user ~/ml

import os
import h5py
import socket
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Union, Dict

import seaborn as sns
from matplotlib import pyplot as plt

from ml4cvd.TensorMap import TensorMap, update_tmaps
from ml4cvd.definitions import TENSOR_EXT, STS_PREOP_ECG_CSV

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

## Get list of STS MRNs (strings)

In [None]:
# Load the STS pre-op ECG table
df = pd.read_csv(STS_PREOP_ECG_CSV)

# Pull the 'medrecn' column out as a plain list and cast every MRN to str
sts_mrns = list(map(str, df['medrecn'].to_list()))
print(f"Extracted {len(sts_mrns)} MRNs from {STS_PREOP_ECG_CSV}")

## Get list of paths to HD5 files

In [None]:
def get_fpaths(dirpath: str, ext: str = TENSOR_EXT) -> list:
    """Recursively collect the paths of all files under a directory that end with `ext`.

    Args:
        dirpath: Root directory to walk; subdirectories are searched as well.
        ext: Filename suffix a file must end with to be kept. Defaults to TENSOR_EXT.

    Returns:
        List of file paths (walk root joined with the filename), in the order
        os.walk yields them. Empty if no file matches.
    """
    fpaths = [
        os.path.join(root, fname)
        for root, _, files in os.walk(dirpath)
        for fname in files
        # Filter on the caller-supplied extension rather than always TENSOR_EXT
        if fname.endswith(ext)
    ]
    print(f"Found {len(fpaths)} {ext} files at {dirpath}")
    return fpaths

def get_path_to_ecgs() -> str:
    """Return the ECG data directory for the machine this code is running on.

    The directory is selected from the machine's hostname.

    Returns:
        Path prefix under which the ECG HD5 files live on this host.

    Raises:
        RuntimeError: If the hostname matches none of the known machines. Failing
            here gives a clear message instead of implicitly returning None and
            breaking later when the result is joined into a path.
    """
    # Look the hostname up once rather than once per branch
    hostname = socket.gethostname()
    if hostname == "mithril":
        return "/data/ecg"
    if hostname == "anduril":
        return "/media/4tb1/ecg"
    if "stultzlab" in hostname:
        return "/storage/shared/ecg_deidentified"
    raise RuntimeError(
        f"No ECG data path is configured for host '{hostname}'; "
        "add it to get_path_to_ecgs()"
    )
    
# Collect the tensor files under the "mgh" subdirectory of this host's ECG directory
mgh_dirpath = os.path.join(get_path_to_ecgs(), "mgh")
fpaths = get_fpaths(dirpath=mgh_dirpath)

## Isolate MRNs from HD5 paths and convert to set

In [None]:
# Strip the directory and the tensor extension from every path; what is left
# is the MRN. Accumulate straight into a set so membership checks are fast.
mrns_in_fpaths = set()
for fpath in tqdm(fpaths):
    fname = os.path.basename(fpath)
    mrns_in_fpaths.add(fname.replace(TENSOR_EXT, ""))

print(f"Isolated MRNs from {len(mrns_in_fpaths)} list items and converted to set")

## Use set of MRNs and list of STS MRNs to efficiently find paths to HD5s with matching STS MRNs

In [None]:
# Map each MRN to the full path of its HD5 file. The paths come from the walk
# results themselves instead of being rebuilt from a single shared directory,
# so files found in subdirectories resolve correctly and this cell does not
# depend on a loop variable left behind by an earlier cell.
mrn_to_fpath = {
    os.path.basename(hd5_path).replace(TENSOR_EXT, ""): hd5_path
    for hd5_path in fpaths
}

# Directory of the last HD5 file found (None when no files were found)
path_prefix = os.path.dirname(fpaths[-1]) if fpaths else None

# Iterate through the STS MRNs and keep the HD5 path of every MRN that has one
fpaths_matches = []
for mrn in tqdm(sts_mrns):
    fpath_match = mrn_to_fpath.get(mrn)
    if fpath_match is not None:
        fpaths_matches.append(fpath_match)

print(f"Found {len(fpaths_matches)} STS MRN matches in ECG HD5 paths")

## Define a list of several STS TMaps and build them
### Print each TMap's name and normalizer

In [None]:
# needed_tmaps = ["bypass_time", "classnyh"]
needed_tensor_maps = [
    "age_sts_newest",
    "age_scaled_sts_newest",
    "ecg_age_sts_newest",
]

# Build the TMaps one name at a time, feeding the dict returned by
# update_tmaps back in on each iteration, and report each one as it is created.
tmaps = {}
for tmap_name in needed_tensor_maps:
    tmaps = update_tmaps(tmap_name=tmap_name, tmaps=tmaps)
    tmap = tmaps[tmap_name]
    print(f"Successfully created tensor map {tmap.name} with normalizer = {tmap.normalization}")

## Initialize dict of empty lists in which to store tensors returned by TMaps

In [None]:
# One independent, initially empty list per requested TMap name
tensors = {tmap_key: [] for tmap_key in needed_tensor_maps}

## Iterate through fpaths, use TMap to get tensors, and append to dict

In [None]:
# Only the first 10 matched files are read here
for fpath in tqdm(fpaths_matches[0:10]):
    with h5py.File(fpath, "r") as hf:
        for tm in needed_tensor_maps:
            tmap = tmaps[tm]
            tensor = tmap.tensor_from_file(tm=tmap, hd5=hf)
            # Apply the TMap's normalizer when it has one
            if tmap.normalization is not None:
                tensor = tmap.normalization.normalize(tensor)
            # Store under the same key the dict was initialized and is later read with
            tensors[tm].append(tensor)

## Print arrays of some tensors to confirm we got them!

In [None]:
import pprint

# Pretty-printer configured with 4-space indentation
pp = pprint.PrettyPrinter(indent=4)

# For each TMap, show its name and shape followed by the tensors collected for it
for tm in needed_tensor_maps:
    tmap = tmaps[tm]
    # The original message ended with an unmatched ")"; it is removed here
    print(f"tm.name: {tmap.name}, shape: {tmap.shape}")
    pp.pprint(tensors[tm])
    print('\n')