# MODS Phenotypes: Step 2. Coalesce Grady Data to Parquet

## `import`

In [3]:
import pickle
from pathlib import Path
from tqdm.auto import tqdm
import os
import sys
import warnings
from random import sample
warnings.simplefilter(action="ignore", category=FutureWarning)
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from functools import reduce

sys.path.insert(0, "/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/mods/")
from src.config import *
from src.utils import *
# Site handled by this notebook.
site_name = 'grady'

# TODO: these two values appear to duplicate settings from the config module;
# read them from there and drop the hard-coded copies below.
# Root folder of the extraction output for this site.
input_path = Path('/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/grady/')
# Years to process, as strings because they are used as sub-directory names.
years = [str(calendar_year) for calendar_year in range(2014, 2021)]

## Coalesce `static_dfs`

In [2]:
def _merge_schema_pair(left, right):
    """Return a new dict holding the entries of *left* overlaid with *right*."""
    return {**left, **right}


# Flatten the per-key static schema fragments into a single mapping ...
_static_fields = reduce(
    _merge_schema_pair,
    [arrow_schema['static'][k] for k in static_keys],
)
# ... do the same for the fragments stored under 'times' ...
_static_time_fields = reduce(
    _merge_schema_pair,
    [arrow_schema['static']['times'][k] for k in times_keys],
)
# ... and combine both; on a name clash the 'times' entry wins.
static_schema = _static_fields | _static_time_fields

# Columns whose schema entry is overridden to a list-of-timestamps type string.
time_columns = [
    'times_abx_order',
    'times_culture',
    'times_suspicion_sepsis3',
    'times_SOFA',
    'times_sepsis3',
]
static_schema.update(dict.fromkeys(time_columns, 'LIST(TIMESTAMP[NS])'))

# Convert the name -> type-string mapping with the project helper.
arrow_static_schema = make_arrow_schema(static_schema)

In [None]:
%%time
# For every year, load the parquet files under static_df/<year> through the
# project helper and write the resulting table to one yearly parquet file.
static_yearly_dir = input_path / 'static_df_YEARLY'
static_yearly_dir.mkdir(parents=True, exist_ok=True)

for year in tqdm(years):
    static_source_dir = input_path / 'static_df' / year
    static_target_file = static_yearly_dir / f"static_df_{year}.parquet"
    static_table = read_parquet_files_in_parallel(
        directory=static_source_dir,
        schema=arrow_static_schema,
        max_workers=num_cpus,
    )
    pq.write_table(static_table, static_target_file)

## Coalesce `dynamic_dfs`

In [4]:
def _merge_score_schemas(left, right):
    """Return a new dict holding the entries of *left* overlaid with *right*."""
    return {**left, **right}


# Start from the 'super_table' fields, then overlay the flattened per-score
# schema fragments; on a name clash the score entry wins.
_super_table_fields = arrow_schema['dynamic']['super_table']
_score_fields = reduce(
    _merge_score_schemas,
    [arrow_schema['dynamic']['scores'][k] for k in scores_keys],
)
dynamic_schema = _super_table_fields | _score_fields

# Convert the name -> type-string mapping with the project helper.
arrow_dynamic_schema = make_arrow_schema(dynamic_schema)

In [None]:
%%time
# For every year, load the parquet files under dynamic_df/<year> through the
# project helper and write the resulting table to one yearly parquet file.
dynamic_yearly_dir = input_path / 'dynamic_df_YEARLY'
dynamic_yearly_dir.mkdir(parents=True, exist_ok=True)

for year in tqdm(years):
    dynamic_source_dir = input_path / 'dynamic_df' / year
    dynamic_target_file = dynamic_yearly_dir / f"dynamic_df_{year}.parquet"
    dynamic_table = read_parquet_files_in_parallel(
        directory=dynamic_source_dir,
        schema=arrow_dynamic_schema,
        max_workers=num_cpus,
    )
    pq.write_table(dynamic_table, dynamic_target_file)

Reading files: 100%|██████████| 22818/22818 [01:03<00:00, 357.79it/s]
Reading files: 100%|██████████| 24464/24464 [01:08<00:00, 358.99it/s]
Reading files: 100%|██████████| 26196/26196 [01:26<00:00, 302.91it/s] 


## Re-Load `dfs` to confirm

### Load `static_df`

In [None]:
%%time
# Sanity check: read back one coalesced yearly file with the expected schema.
# The coalesce step writes `static_df_YEARLY/static_df_<year>.parquet`, so the
# file must be addressed by that name (a bare `<year>` path does not exist).
__static_check_year = '2018'
__static_table = pq.read_table(
    input_path / 'static_df_YEARLY' / f"static_df_{__static_check_year}.parquet",
    schema=arrow_static_schema
)
# Preview the first five rows; `slice` is the pyarrow.Table way to take a
# leading window of rows.
__static_table.slice(0, 5)

### Load `dynamic_df`

In [None]:
%%time
# Sanity check: read back one coalesced yearly file with the expected schema.
# The coalesce step writes `dynamic_df_YEARLY/dynamic_df_<year>.parquet`, so
# the file must be addressed by that name (a bare `<year>` path does not exist).
__dynamic_check_year = '2018'
__dynamic_table = pq.read_table(
    input_path / 'dynamic_df_YEARLY' / f"dynamic_df_{__dynamic_check_year}.parquet",
    schema=arrow_dynamic_schema
)
# Preview the first five rows; `slice` is the pyarrow.Table way to take a
# leading window of rows.
__dynamic_table.slice(0, 5)