MODS Phenotypes: Step 1. Extract Data for Emory
===

## Imports

In [1]:
import pickle
from pathlib import Path
from tqdm import tqdm
import sys
import warnings
from random import sample
warnings.simplefilter(action="ignore", category=FutureWarning)
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from functools import reduce
import numpy as np

In [2]:
import sys
sys.path.insert(0, "/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/mods/")
from src.config import *
from src.utils import *

In [3]:
# Site whose section of `project_config` is read by the cells below; it is also the
# last component of `output_path`.
site_name = 'emory'

In [4]:
# Root folder for everything this notebook writes. Each extractor below adds its own
# "<table>_df/<parent folder>" sub-folder underneath it.
# NOTE(review): `project_path` and `run_id` are not defined in this notebook; presumably
# they come from the star imports above - confirm.
output_path = (
    project_path / "data" / str(run_id) / "extraction" / site_name
)
output_path.mkdir(parents=True, exist_ok=True)

In [20]:
# Column names configured for this site: patient identifier, service (encounter)
# identifier and record timestamp.
patient_id = project_config[site_name]["keys"]["patient_key"]
service_id = project_config[site_name]["keys"]["service_key"]
record_dt = project_config[site_name]["keys"]["record_dt"]

# TODO: this shouldn't be needed; make it go away.
# Per-site key lists read by the extraction functions and the loading cells below.
scores_keys = project_config[site_name]["scores"]
static_keys = project_config[site_name]["static"]
dynamic_keys = project_config[site_name]["dynamic"]
times_keys = project_config[site_name]["times"]
datetimes_keys = project_config[site_name]["datetimes"]

## Functions

### Extraction

In [23]:
def extraction(pickle_path):
    """Run every extraction step for the encounter pickle at ``pickle_path``.

    The pickle is loaded once through ``load_encounter_pickle`` and the loaded
    object is handed to the dynamic, static and per-CSN extractors, in that order.
    """
    loaded = load_encounter_pickle(pickle_path)
    encounter_pickle, filename_without_ext = loaded

    # The dynamic and static extractors share a signature; the per-CSN one does
    # not take the extension-less filename.
    for writer in (extract_dynamic_df, extract_static_df):
        writer(encounter_pickle, pickle_path, filename_without_ext)
    extract_perCSN_dfs(encounter_pickle, pickle_path)

### `extract_dynamic_df`

In [24]:
def extract_dynamic_df(encounter_pickle, pickle_path, filename_without_ext):
    """Merge the SuperTable with the score tables and write the result to parquet.

    Parameters
    ----------
    encounter_pickle : mapping
        Loaded encounter, passed through to ``get_super_df`` and ``get_scores_df``.
    pickle_path : path-like
        Source pickle; its stem names the output file and its parent folder's stem
        names the output sub-folder.
    filename_without_ext : str
        Passed to ``get_super_df`` as the fallback encounter identifier.

    Returns
    -------
    None
        The merged frame is written to
        ``output_path / "dynamic_df" / <parent stem> / <pickle stem>.parquet``.
        Nothing is written when ``get_super_df`` could not build a SuperTable.
    """
    super_df = get_super_df(encounter_pickle, pickle_path, filename_without_ext)
    if super_df is None:
        # get_super_df returns None (after printing the reason) when the pickle has
        # no SuperTable or no patient id. Merging None would raise inside pd.merge,
        # so skip this encounter instead.
        print(f"(extract_dynamic_df) Skipping {str(pickle_path.stem)}: no SuperTable available")
        return

    scores_df = get_scores_df(encounter_pickle, pickle_path)

    # Left join keeps every SuperTable row; scores are attached by record timestamp.
    dynamic_df = pd.merge(left=super_df, right=scores_df,
                          how='left',
                          left_on=[record_dt],
                          right_on=[record_dt],
                          suffixes=('_super', '_dynamic'))

    dynamic_table = pa.Table.from_pandas(dynamic_df, preserve_index=False)

    output_folder = output_path / "dynamic_df" / str(pickle_path.parent.stem)
    output_folder.mkdir(parents=True, exist_ok=True)

    pq.write_table(dynamic_table,
                   output_folder / f"{pickle_path.stem}.parquet",
                   # TODO: Get from config
                   version='2.6', compression='snappy')

### `get_super_df` (SuperTable)

In [25]:
def get_super_df(encounter_pickle, pickle_path, filename_without_ext, cols_to_drop=None):
    """Return the encounter's SuperTable conformed to ``pandas_schema['dynamic']['super_table']``.

    The frame stored under ``encounter_pickle['super_table']`` is modified in place
    (patient and service columns added, optional columns dropped, index moved into a
    column) before it is re-ordered and cast according to the schema.

    Parameters
    ----------
    encounter_pickle : mapping
        Loaded encounter; must provide 'super_table' and either 'pt_id' or 'pat_id'.
    pickle_path : path-like
        Source pickle; only its stem is used, in diagnostic messages.
    filename_without_ext : str
        Encounter identifier used when ``encounter_pickle`` has no ``service_id`` entry.
    cols_to_drop : iterable of str, optional
        Columns to remove from the SuperTable; missing ones are reported and skipped.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` (after printing a message) when the pickle has no 'super_table'
        entry or neither patient-id entry.
    """
    stem = str(pickle_path.stem)
    schema = pandas_schema['dynamic']['super_table']

    # Get SuperTable
    try:
        super_df = encounter_pickle['super_table']
    except KeyError:
        print(f"(get_super_df) KeyError: 'super_table' not in {stem}")
        return None

    # The patient id is stored under 'pt_id' in some pickles and 'pat_id' in others.
    try:
        super_df[patient_id] = str(encounter_pickle['pt_id'])
    except KeyError:
        try:
            super_df[patient_id] = str(encounter_pickle['pat_id'])
        except KeyError:
            print(f"(get_super_df) KeyError: neither 'pt_id' or 'pat_id' was found in {stem}")
            return None

    # Drop columns
    if cols_to_drop:
        for col in cols_to_drop:
            try:
                super_df.drop(labels=col, axis=1, inplace=True)
            except KeyError:
                # may not be in all encounters
                print(f"(get_super_df) KeyError in {stem} when dropping '{col}' column")

    # Assign CSN to SuperTable, falling back to the filename when the pickle does not
    # carry the identifier. The resolved value is kept so later messages never have to
    # index the pickle again.
    try:
        encounter_id = str(encounter_pickle[service_id])
    except KeyError:
        encounter_id = filename_without_ext
    super_df[service_id] = encounter_id

    super_df.reset_index(inplace=True, drop=False)
    super_df.rename(columns={"index": record_dt}, inplace=True)

    # Add every schema column the encounter lacks, as an all-missing column of the
    # schema's dtype.
    for key in (set(schema.keys()) - set(super_df.columns)):
        super_df[key] = None
        super_df[key] = super_df[key].astype(schema[key])

    # Keep only the schema columns, in schema order.
    try:
        super_df = super_df[schema.keys()]
    except KeyError as e:
        # Uses the already-resolved id: looking up encounter_pickle[service_id] here
        # would raise a second KeyError for pickles without that entry.
        print(f"(get_super_df) KeyError in {encounter_id} when using keys from pandas_schema: {e}")

    super_schema = {}
    for col in super_df.columns:
        try:
            super_schema[col] = schema[col]
        except KeyError:
            print(f"(get_super_df) KeyError with {col} in {stem} when building super_schema")

    try:
        super_df = super_df.astype(super_schema)
    except TypeError as e:
        print(f"TypeError in {stem}: {e}")
        # Identify and print problematic column and its unique values
        for col, dtype in super_schema.items():
            try:
                super_df[col].astype(dtype)
            except Exception as inner_e:
                print(f"Column {col} with values {super_df[col].unique()} caused error: {inner_e}")

    return super_df

### `get_scores_df`

In [26]:
def get_scores_df(encounter_pickle, pickle_path):
    """Build one frame holding the encounter's SOFA and (when present) SIRS scores.

    The SOFA and SIRS frames stored in ``encounter_pickle`` are renamed in place so
    their shared column names get a ``SOFA_`` / ``SIRS_`` prefix, then outer-joined on
    their index. The index is finally moved into a column, and a column named
    'index' is renamed to ``record_dt``.

    ``pickle_path`` is only used (via its stem) in cast-failure messages.
    """
    # SOFA scores
    sofa_df = encounter_pickle['sofa_scores']
    sofa_df.rename(
        columns={
            'hourly_total': 'SOFA_hourly_total',
            'delta_24h': 'SOFA_delta_24h',
            'hourly_total_mod': 'SOFA_hourly_total_mod',
            'delta_24h_mod': 'SOFA_delta_24h_mod'
        },
        inplace=True,
    )

    # SIRS scores are optional; without them an empty frame on the SOFA index
    # stands in so the join below still works.
    if 'sirs_scores' not in encounter_pickle:
        sirs_df = pd.DataFrame(index=sofa_df.index)
    else:
        sirs_df = encounter_pickle['sirs_scores']
        sirs_df.rename(
            columns={
                'hourly_total': 'SIRS_hourly_total',
                'delta_24h': 'SIRS_delta_24h'
            },
            inplace=True,
        )

    scores_df = sofa_df.merge(sirs_df, how='outer', left_index=True, right_index=True)

    # Collect the dtype mapping of every configured score key that is a column here.
    # NOTE(review): the loading cells treat each entry of `scores_keys` as a group of
    # columns; if that also holds for pandas_schema, this membership test would never
    # match a column name and no cast would happen - confirm.
    scores_schema = {}
    for key in scores_keys:
        if key not in scores_df.columns:
            continue
        scores_schema = scores_schema | pandas_schema['dynamic']['scores'][key]

    # Cast column by column so one bad column is reported without blocking the rest.
    for col, dtype in scores_schema.items():
        if col not in scores_df.columns:
            continue
        try:
            scores_df[col] = scores_df[col].astype(dtype)
        except Exception as e:
            print(f"Error in {str(pickle_path.stem)} for column {col}: {e}")

    return scores_df.reset_index(drop=False).rename(columns={"index": record_dt})

### `extract_static_df`

In [27]:
def extract_static_df(encounter_pickle, pickle_path, filename_without_ext):
    """Assemble the one-row static table for an encounter and write it to parquet.

    Parameters
    ----------
    encounter_pickle : mapping
        Loaded encounter, passed through to ``get_static_df`` and ``get_times_df``.
    pickle_path : path-like
        Source pickle; its stem names the output file and appears in messages, and
        its parent folder's stem names the output sub-folder.
    filename_without_ext : str
        Not used by this function; kept for signature compatibility with callers.

    Returns
    -------
    None
        The table is written to
        ``output_path / "static_df" / <parent stem> / <pickle stem>.parquet``.
    """
    static_df = pd.concat(
        [get_static_df(encounter_pickle), get_times_df(encounter_pickle)],
        axis=1,
    )

    # Normalise `ed_wait_time` to float minutes BEFORE the schema cast: a Timedelta
    # cannot be cast to a float dtype, and one failing column makes the whole
    # `.astype` call fail, leaving every column uncast.
    # NOTE(review): presumably this is what produces some of the logged
    # "cannot be converted to a FloatingDtype" errors - confirm on a failing encounter.
    wait = static_df.loc[0, 'ed_wait_time']
    if isinstance(wait, pd.Timedelta):
        # total_seconds() keeps whole days; `.seconds` is only the sub-day remainder
        # and silently drops them.
        minutes = float(wait.total_seconds() / 60)
    elif pd.isnull(wait):
        minutes = float('nan')
    else:
        minutes = None  # already a plain value; leave untouched
    if minutes is not None:
        # Go through object dtype so the float can be stored whatever dtype pandas
        # inferred for the column.
        static_df['ed_wait_time'] = static_df['ed_wait_time'].astype(object)
        static_df.loc[0, 'ed_wait_time'] = minutes

    static_schema = {}
    for key in static_keys:
        static_schema = static_schema | pandas_schema['static'][key]
    for key in times_keys:
        static_schema = static_schema | pandas_schema['static']['times'][key]

    # Filter the schema to only include columns present in the dataframe
    filtered_schema = {col: dtype for col, dtype in static_schema.items() if col in static_df.columns}
    try:
        static_df = static_df.astype(filtered_schema)
    except Exception as e:  # Catching a broader exception in case there are other issues beyond KeyError
        print(f"(extract_static_df) Error for {str(pickle_path.stem)} when doing .astype(filtered_schema): {e}")

    static_df.rename(columns={
        't_suspicion': 'times_suspicion_sepsis3',
        't_SOFA': 'times_SOFA',
        't_sepsis3': 'times_sepsis3',
        't_abx': 'times_abx_order',
        't_clt': 'times_culture'
        },
                     inplace=True)

    static_table = pa.Table.from_pandas(static_df, preserve_index=False)

    output_folder = output_path / "static_df" / str(pickle_path.parent.stem)
    output_folder.mkdir(parents=True, exist_ok=True)

    pq.write_table(static_table,
                   output_folder / f"{pickle_path.stem}.parquet",
                   # TODO: Get from config
                   version='2.6', compression='snappy')

### `get_static_df`

In [28]:
def get_static_df(encounter_pickle):
    """Return a one-row frame with the static fields of an encounter.

    The mappings stored in ``encounter_pickle`` under each of ``static_keys`` are
    merged left to right (later keys win on clashes). An ``ed_wait_time`` entry is
    added as NaN when none of them provides it.
    """
    parts = [encounter_pickle[key] for key in static_keys]
    static_dict = reduce(lambda merged, part: {**merged, **part}, parts)

    # Dealing with ed_wait_time issues: downstream code expects the column to exist.
    if 'ed_wait_time' not in static_dict:
        static_dict['ed_wait_time'] = float('nan')

    # Series -> single-column frame -> transpose gives one row, one column per field.
    return pd.Series(static_dict).to_frame().T

### `get_times_df`

In [29]:
def get_times_df(encounter_pickle):
    """Return a one-row frame with the encounter's sepsis-related time values.

    Each cell holds the result of ``denoise_times`` applied to one attribute of
    ``encounter_pickle['sep3_time']`` or ``encounter_pickle['t_suspicion']``. The
    columns are 't_suspicion', 't_clt', 't_abx', 't_SOFA' and 't_sepsis3'. A cell
    whose value reports ``size == 0`` is replaced by ``[pd.NaT]``.
    """
    sep3_times = encounter_pickle['sep3_time']
    suspicion_times = encounter_pickle['t_suspicion']

    times_data = [
        denoise_times(sep3_times.t_suspicion),
        denoise_times(suspicion_times.t_clt),
        denoise_times(suspicion_times.t_abx),
        denoise_times(sep3_times.t_SOFA),
        denoise_times(sep3_times.t_sepsis3)
    ]

    times_df = pd.DataFrame([times_data], columns=[
        't_suspicion',
        't_clt',
        't_abx',
        't_SOFA',
        't_sepsis3'
        ])

    for n in times_df.columns:
        try:
            if times_df[n][0].size == 0:
                # NOTE(review): chained assignment; it does not write through when
                # pandas copy-on-write is enabled - confirm against the pandas
                # version in use before changing it.
                times_df[n][0] = [pd.NaT]
        except Exception:
            # Best effort, as before: values without a usable `.size` are left as
            # they are. Narrowed from a bare `except` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            pass

    return times_df

### `extract_perCSN_dfs`

In [30]:
def extract_perCSN_dfs(encounter_pickle, pickle_path):
    """Write every per-CSN table of the encounter: beds, diagnosis, procedures, cultures."""
    per_csn_writers = (
        extract_beds_df,
        extract_diagnosis_df,
        extract_procedures_df,
        extract_cultures_df,
    )
    for write_table_for in per_csn_writers:
        write_table_for(encounter_pickle, pickle_path)

In [31]:
def extract_beds_df(encounter_pickle, pickle_path):
    """Write the encounter's ``beds_PerCSN`` frame to a snappy-compressed parquet file.

    The frame held in ``encounter_pickle`` is altered in place: its index is moved
    into a column, and a column named 'index' is renamed to 'csn'. The file goes to
    ``output_path / "beds_df" / <parent folder stem> / <pickle stem>.parquet``.
    """
    frame = encounter_pickle["beds_PerCSN"]
    frame.reset_index(inplace=True)
    frame.rename(columns={'index': 'csn'}, inplace=True)
    table = pa.Table.from_pandas(frame, preserve_index=False)

    target_dir = output_path / "beds_df" / str(pickle_path.parent.stem)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_file = target_dir / f"{pickle_path.stem}.parquet"

    pq.write_table(table, target_file, version='2.6', compression='snappy')

In [32]:
def extract_diagnosis_df(encounter_pickle, pickle_path):
    """Write the encounter's ``diagnosis_PerCSN`` frame to a snappy-compressed parquet file.

    The frame held in ``encounter_pickle`` is altered in place: its index is moved
    into a column, and a column named 'index' is renamed to 'csn'. The file goes to
    ``output_path / "diagnosis_df" / <parent folder stem> / <pickle stem>.parquet``.
    """
    frame = encounter_pickle["diagnosis_PerCSN"]
    frame.reset_index(inplace=True)
    frame.rename(columns={'index': 'csn'}, inplace=True)
    table = pa.Table.from_pandas(frame, preserve_index=False)

    target_dir = output_path / "diagnosis_df" / str(pickle_path.parent.stem)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_file = target_dir / f"{pickle_path.stem}.parquet"

    pq.write_table(table, target_file, version='2.6', compression='snappy')

In [33]:
def extract_procedures_df(encounter_pickle, pickle_path):
    """Write the encounter's ``procedures_PerCSN`` frame to a snappy-compressed parquet file.

    The frame held in ``encounter_pickle`` is altered in place: its index is moved
    into a column, and a column named 'index' is renamed to 'csn'. The file goes to
    ``output_path / "procedures_df" / <parent folder stem> / <pickle stem>.parquet``.
    """
    frame = encounter_pickle["procedures_PerCSN"]
    frame.reset_index(inplace=True)
    frame.rename(columns={'index': 'csn'}, inplace=True)
    table = pa.Table.from_pandas(frame, preserve_index=False)

    target_dir = output_path / "procedures_df" / str(pickle_path.parent.stem)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_file = target_dir / f"{pickle_path.stem}.parquet"

    pq.write_table(table, target_file, version='2.6', compression='snappy')

In [34]:
def extract_cultures_df(encounter_pickle, pickle_path):
    """Write the encounter's ``cultures_PerCSN`` frame to a snappy-compressed parquet file.

    The frame held in ``encounter_pickle`` is altered in place: its index is moved
    into a column, and a column named 'index' is renamed to 'csn'. The file goes to
    ``output_path / "cultures_df" / <parent folder stem> / <pickle stem>.parquet``.
    """
    frame = encounter_pickle["cultures_PerCSN"]
    frame.reset_index(inplace=True)
    frame.rename(columns={'index': 'csn'}, inplace=True)
    table = pa.Table.from_pandas(frame, preserve_index=False)

    target_dir = output_path / "cultures_df" / str(pickle_path.parent.stem)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_file = target_dir / f"{pickle_path.stem}.parquet"

    pq.write_table(table, target_file, version='2.6', compression='snappy')

## `main()`

In [None]:
def main(config):
    """Extract every selected encounter pickle of the site in parallel.

    Reads the pickle location and years from ``config[site_name]`` and the sampling
    rate and worker count from ``config['parameters']``, asks ``find_pickle_paths``
    for the files to process, and maps ``extraction`` over them with a process pool
    while a progress bar counts completed files.
    """
    site_config = config[site_name]
    parameters = config['parameters']

    file_path = site_config['filepaths']['encounter_pickles']
    years = site_config['years']
    sample_rate = parameters['sample_rate']
    num_cpus = parameters['num_cpus']

    pickle_paths = find_pickle_paths(file_path=file_path,
                                     years=years,
                                     sample_rate=sample_rate)

    # NOTE(review): `Pool` is not imported in the visible cells; presumably it is
    # provided by the star imports - confirm.
    with Pool(processes=num_cpus) as pool:
        finished = pool.imap_unordered(func=extraction, iterable=pickle_paths)
        # Results are discarded; iterating only drives the pool and the progress bar.
        for _ in tqdm(finished, total=len(pickle_paths)):
            pass

main(project_config)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/779373 [00:00<?, ?it/s]

(extract_static_df) Error for 17373465110 when doing .astype(filtered_schema): datetime64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 54472218018 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 46838348052 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 19654853365 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 20340007365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 17067897365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 19703377363 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 54865797365 when doing .astype(filtered_schema): object cannot be converted to

**TODO:** Resolve the `.astype(filtered_schema)` errors raised in `extract_static_df` (full log below).
```text
(extract_static_df) Error for 17373465110 when doing .astype(filtered_schema): datetime64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 54472218018 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 46838348052 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 19654853365 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 20340007365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 17067897365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 19703377363 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 54865797365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 32283227365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 2248567365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 55148007365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 1678267365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 13743938047 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 42379867365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 17964967364 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 17016197365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 45416024034 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 15169743365 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 9800677365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 41709113365 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 43431347362 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 19054844023 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 36444178032 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 34136287364 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 12465303365 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 6149557364 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 8768277365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 4342384002 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 57386227365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 11322093365 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 8154853365 when doing .astype(filtered_schema): timedelta64[ns] cannot be converted to a FloatingDtype
(extract_static_df) Error for 47769887365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 34809097365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 13564047365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 41550738016 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 16953857365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
(extract_static_df) Error for 46322397365 when doing .astype(filtered_schema): object cannot be converted to a FloatingDtype
```

---
---
---

## Load results

### Load `dynamic_df`

In [49]:
# Encounter whose extracted parquet files are read back in the cells below.
example_csn = '10000548080'

In [50]:
%%time
# Column -> type mapping for the dynamic table: the SuperTable columns plus the columns
# of every score group listed in `scores_keys`.
dynamic_schema = (
    arrow_schema['dynamic']['super_table'] |
    reduce(lambda a, b: {**a, **b}, [arrow_schema['dynamic']['scores'][k] for k in scores_keys])
    )

arrow_dynamic_schema = make_arrow_schema(dynamic_schema)

# Read the example encounter back with that schema (the '2018' folder is hard-coded).
dynamic_table = pq.read_table(
    str(output_path/'dynamic_df'/'2018'/f"{example_csn}.parquet"),
    use_pandas_metadata=True,
    schema=arrow_dynamic_schema
)
dynamic_df = dynamic_table.to_pandas()
display(dynamic_df.head())

Unnamed: 0,pat_id,csn,charttime,temperature,daily_weight_kg,height_cm,sbp_line,dbp_line,map_line,sbp_cuff,...,SOFA_hourly_total,SOFA_delta_24h,SOFA_hourly_total_mod,SOFA_delta_24h_mod,SIRS_resp,SIRS_cardio,SIRS_temp,SIRS_wbc,SIRS_hourly_total,SIRS_delta_24h
0,20610789,10000548080,2018-04-17 05:14:00,36.5,65.0,173.0,,,,152.5,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,20610789,10000548080,2018-04-17 06:14:00,36.5,65.0,173.0,,,,152.5,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,20610789,10000548080,2018-04-17 07:14:00,36.5,65.0,173.0,,,,152.5,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,20610789,10000548080,2018-04-17 08:14:00,36.5,65.0,173.0,,,,152.5,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
4,20610789,10000548080,2018-04-17 09:14:00,36.5,65.0,173.0,,,,152.5,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


CPU times: user 38.7 ms, sys: 24.4 ms, total: 63.1 ms
Wall time: 35.3 ms


### Load `static_df`

In [52]:
%%time
# Column -> type mapping for the static table: every group listed in `static_keys`
# plus every group of time columns listed in `times_keys`.
static_schema = (
    reduce(lambda a, b: {**a, **b}, [arrow_schema['static'][k] for k in static_keys])
    |
    reduce(lambda a, b: {**a, **b}, [arrow_schema['static']['times'][k] for k in times_keys])
    )

# TODO: I shouldn't need to set these manually here; figure out how to avoid it.
# These are the column names `extract_static_df` renames the time columns to.
static_schema['times_abx_order'] = 'LIST(TIMESTAMP[NS])'
static_schema['times_culture'] = 'LIST(TIMESTAMP[NS])'
static_schema['times_suspicion_sepsis3'] = 'LIST(TIMESTAMP[NS])'
static_schema['times_SOFA'] = 'LIST(TIMESTAMP[NS])'
static_schema['times_sepsis3'] = 'LIST(TIMESTAMP[NS])'

arrow_static_schema = make_arrow_schema(static_schema)

# Read the example encounter back with that schema (the '2018' folder is hard-coded).
static_table = pq.read_table(
    str(output_path/'static_df'/'2018'/f"{example_csn}.parquet"),
    use_pandas_metadata=True,
    schema=arrow_static_schema
)

static_df = static_table.to_pandas()

display(static_df.head())

Unnamed: 0,csn,pt_id,y_vent_rows,y_vent_start_time,y_vent_end_time,vent_start_time,ed_wait_time,worst_pf_pa,worst_pf_pa_time,worst_pf_sp,...,hospital_admission_date_time,hospital_discharge_date_time,start_index,first_icu_start,first_icu_end,times_culture,times_abx_order,times_sepsis3,times_suspicion_sepsis3,times_SOFA
0,10000548080,20610789,0.0,0.0,0.0,NaT,,,NaT,,...,2018-04-17 05:14:00,2018-04-20 16:00:00,2018-04-17 05:14:00,2018-04-17 13:14:00,2018-04-18 15:14:00,[None],[None],[None],[None],[None]


CPU times: user 38.8 ms, sys: 0 ns, total: 38.8 ms
Wall time: 29.9 ms


### Load `beds_PerCSN`

In [53]:
%%time
# Schema for the beds table, taken unchanged from the configured per-CSN schemas.
arrow_beds_schema = make_arrow_schema(arrow_schema['perCSN']['beds_PerCSN'])

# Read the example encounter back with that schema (the '2018' folder is hard-coded).
beds_table = pq.read_table(
    str(output_path/'beds_df'/'2018'/f"{example_csn}.parquet"),
    use_pandas_metadata=True,
    schema=arrow_beds_schema
)

beds_df = beds_table.to_pandas()

display(beds_df.head())

Unnamed: 0,csn,pat_id,bed_location_start,bed_location_end,bed_unit,bed_room,bed_id,bed_label,hospital_service,accomodation_code,accomodation_description,icu,imc,ed,procedure
0,10000548080,20610789,2018-04-09 11:51:57,2018-04-17 05:13:56,Not Recorded,Not Recorded,Not Recorded,Not Recorded,THORACIC SURGERY,--,Not Recorded,0.0,0.0,0.0,0.0
1,10000548080,20610789,2018-04-17 05:13:56,2018-04-17 06:04:42,Main Registration ARR SJH,Not Recorded,Not Recorded,Not Recorded,THORACIC SURGERY,--,Not Recorded,0.0,0.0,0.0,0.0
2,10000548080,20610789,2018-04-17 06:04:42,2018-04-17 13:09:59,SURG 1FL SJH,Not Recorded,Not Recorded,Not Recorded,THORACIC SURGERY,--,Not Recorded,0.0,0.0,0.0,0.0
3,10000548080,20610789,2018-04-17 13:09:59,2018-04-18 15:51:36,2SW ICU SJH,295,01,INTENSIVE CARE,THORACIC SURGERY,ICU,INTENSIVE CARE,1.0,0.0,0.0,0.0
4,10000548080,20610789,2018-04-18 15:51:36,2018-04-20 16:00:00,3W SJH,349,01,SPECIAL CARE,THORACIC SURGERY,R,ROUTINE,0.0,0.0,0.0,0.0


CPU times: user 23.9 ms, sys: 0 ns, total: 23.9 ms
Wall time: 18.9 ms


### Load `cultures_PerCSN`

In [54]:
%%time

# Schema for the cultures table, taken unchanged from the configured per-CSN schemas.
arrow_cultures_schema = make_arrow_schema(arrow_schema['perCSN']['cultures_PerCSN'])

# Read the example encounter back with that schema (the '2018' folder is hard-coded).
cultures_table = pq.read_table(
    str(output_path/'cultures_df'/'2018'/f"{example_csn}.parquet"),
    use_pandas_metadata=True,
    schema=arrow_cultures_schema
)

cultures_df = cultures_table.to_pandas()

display(cultures_df.head())

Unnamed: 0,csn,pat_id,proc_code,proc_desc,component_id,component,loinc_code,specimen_collect_time,order_time,order_id,result_id,lab_result_time,result_status,lab_result


CPU times: user 14.4 ms, sys: 1.59 ms, total: 16 ms
Wall time: 12 ms


### Load `procedures_PerCSN`

In [56]:
%%time
# Read the procedures table with 'csn' and 'pat_id' forced to STRING. The override is
# applied to a copy of the configured schema so the shared `arrow_schema` is not
# mutated and re-running this cell has no hidden side effects.
arrow_procedures_schema = make_arrow_schema({
    **arrow_schema['perCSN']['procedures_PerCSN'],
    'csn': 'STRING',
    'pat_id': 'STRING',
})

# Read the example encounter back with that schema (the '2018' folder is hard-coded).
procedures_table = pq.read_table(
    str(output_path/'procedures_df'/'2018'/f"{example_csn}.parquet"),
    use_pandas_metadata=True,
    schema=arrow_procedures_schema
)

procedures_df = procedures_table.to_pandas()

display(procedures_df.head())

Unnamed: 0,csn,pat_id,surgery_date,in_or_dttm,procedure_start_dttm,procedure_comp_dttm,out_or_dttm,or_procedure_id,primary_procedure_nm,cpt_code,service_nm,primary_physician_nm


CPU times: user 14.8 ms, sys: 0 ns, total: 14.8 ms
Wall time: 12.1 ms


### Load `diagnosis_PerCSN`

In [57]:
%%time
# Read the diagnosis table with 'csn' and 'pat_id' forced to STRING. The override is
# applied to a copy of the configured schema so the shared `arrow_schema` is not
# mutated and re-running this cell has no hidden side effects.
arrow_diagnosis_schema = make_arrow_schema({
    **arrow_schema['perCSN']['diagnosis_PerCSN'],
    'csn': 'STRING',
    'pat_id': 'STRING',
})

# Read the example encounter back with that schema (the '2018' folder is hard-coded).
diagnosis_table = pq.read_table(
    str(output_path/'diagnosis_df'/'2018'/f"{example_csn}.parquet"),
    use_pandas_metadata=True,
    schema=arrow_diagnosis_schema
)

diagnosis_df = diagnosis_table.to_pandas()

display(diagnosis_df.head())

Unnamed: 0,csn,pat_id,dx_line,dx_icd_scope,dx_code_icd9,dx_code_icd10,dx_source,dx_time_date,dx_code,dx_name
0,10000548080,20610789,Secondary,Billing Diagnosis,401.9,I10,Medical Records Coding System,2018-04-24,401.9,Unspecified essential hypertension
1,10000548080,20610789,Not Recorded,Admitting Diagnosis,424.0,I340,Saint Joseph's Hospital (HQ),2018-03-21,424.0,Mitral valve disorders
2,10000548080,20610789,Primary,Billing Diagnosis,424.0,I340,The Emory Clinic Registration (IDX),2018-04-19,424.0,Mitral valve disorders
3,10000548080,20610789,Secondary,Billing Diagnosis,--,Z98890,The Emory Clinic Registration (IDX),2018-04-18,--,Not Recorded
4,10000548080,20610789,Not Recorded,Admitting Diagnosis,786.09,R0600,Medical Records Coding System,2018-04-24,786.09,Other dyspnea and respiratory abnormality


CPU times: user 16.4 ms, sys: 3.04 ms, total: 19.4 ms
Wall time: 14.7 ms
