# Assess the distribution of STS pre-op features across bootstraps

In [None]:
import os
import h5py
import socket
import numpy as np
import pandas as pd

# ! pip install --user ~/ml
# from ml4cvd.arguments import _get_tmap
# from ml4cvd.TensorMap import TensorMap
# from ml4cvd.definitions import TENSOR_EXT
# from ml4cvd.tensor_maps_ecg import TMAPS, build_ecg_time_series_tensor_maps

In [None]:
# Column names of the continuous features that get a per-split median / IQR
# summary for every bootstrap.
CONTINUOUS_FEATURES = {
    "age", "creatlst", "hct", "hdef", "heightcm",
    "platelets", "wbc", "weightkg", "perfustm", "xclamptm",
}

In [None]:
# Read the full STS features-and-labels table from the user's dropbox folder
# into the parent dataframe `df`; the bare `df` expression at the end renders
# the table as the notebook cell's output.
fpath_sts_data = os.path.expanduser("~/dropbox/sts_data/mgh-all-features-labels.csv")
df = pd.read_csv(filepath_or_buffer=fpath_sts_data)
df

In [None]:
# Load bootstraps
path_bootstraps = os.path.expanduser("~/dropbox/sts_data/bootstraps")


def _first_row_per_patient(frame):
    """Return a new dataframe with a single row per `medrecn`.

    Rows are ordered by (`medrecn`, `surgdt`) and only the first row of each
    `medrecn` is kept.
    NOTE(review): this keeps the earliest surgery only if `surgdt`, as loaded
    from the CSV, sorts chronologically -- confirm the column's format.
    """
    ordered = frame.sort_values(by=["medrecn", "surgdt"])
    return ordered.drop_duplicates(subset=["medrecn"])


# For each bootstrap, print a table with one column per split ("train",
# "valid", "test", and their union "all") and, per continuous feature, one
# "<feature>_median" row and one "<feature>_iqr" row.
# Bootstraps are read from <path_bootstraps>/<0..9>/<split>.csv.
for bootstrap in range(10):

    # Per-split summary statistics: {split: {stat_name: value}}
    stats = dict()

    # De-duplicated dataframes of the individual splits, pooled for "all"
    dfs = []

    # "all" must come last: it is built from the splits collected before it
    for split in ["train", "valid", "test", "all"]:

        # Compare by value; `is not` on a string literal tests object
        # identity, which only works by accident of string interning.
        if split != "all":

            # Get CSV of MRNs
            fpath = os.path.join(path_bootstraps, str(bootstrap), split + ".csv")
            df_ = pd.read_csv(fpath)

            # Inner join: keep only parent-DF rows whose MRN is in this split,
            # then reduce to one row per patient
            df_merged = _first_row_per_patient(
                df.merge(right=df_, left_on="medrecn", right_on="mrn")
            )

            print(f"Parsing bootstrap {bootstrap}, split {split}: {df_merged.shape}")

            # Keep this split's data so it can be pooled into "all"
            dfs.append(df_merged)

        # If not working on a split, we have all data
        else:

            # Pool every split; de-duplicate again in case an MRN appears in
            # more than one split
            df_merged = _first_row_per_patient(pd.concat(dfs))

        stats[split] = dict()

        # Iterate in sorted order: a set of strings has no stable iteration
        # order across interpreter sessions, which would shuffle the rows of
        # the printed table from run to run.
        for feature in sorted(CONTINUOUS_FEATURES):
            values = df_merged[feature]
            stats[split][f"{feature}_median"] = values.median()
            stats[split][f"{feature}_iqr"] = values.quantile(0.75) - values.quantile(0.25)

    # Convert dicts into dataframe (columns = splits, rows = statistics)
    df_stats = pd.DataFrame(stats)
    print(df_stats)
    print('\n')