# MODS Phenotypes: Step 2. Coalesce Emory Data to Parquet

## Setup

### `import`

In [1]:
# Standard library
import os
import pickle
import sys
import warnings
from functools import partial, reduce
from pathlib import Path
from random import sample

# Installed before the third-party imports, as in the original cell order.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Third-party
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm.auto import tqdm

# Refresh progress bars at most once per second.
# Calling `tqdm(total=100, mininterval=1.0)` on its own does not configure
# anything: it only creates (and displays) an unused 0/100 bar. Binding the
# keyword with `partial` applies it to every `tqdm(...)` call made from this
# notebook instead.
tqdm = partial(tqdm, mininterval=1.0)

  0%|          | 0/100 [00:00<?, ?it/s]

In [2]:
# Site whose entry in `project_config` is used throughout this notebook.
site_name = 'emory'

# The project checkout must be on sys.path *before* the `src` imports below,
# so this statement has to stay ahead of them.
mods_root = "/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/mods/"
sys.path.insert(0, mods_root)

# NOTE(review): names used later without a local definition (e.g.
# `project_config`, `arrow_schema`, `make_arrow_schema`,
# `read_parquet_files_in_parallel`) presumably come from these star imports;
# confirm which module provides each before making the imports explicit.
from src.config import *
from src.utils import *

In [3]:
from src.log import setup_logger

# Folder handed to `setup_logger` as `root_folder`.
logs_path = Path("/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/mods/logs/")

# `parents=True` matches the other `mkdir` calls in this notebook and avoids a
# FileNotFoundError when an intermediate directory does not exist yet.
logs_path.mkdir(parents=True, exist_ok=True)

logger = setup_logger("2_emory_to_parquet", root_folder=logs_path)

### `config`

In [4]:
# TODO: these values are probably already in the config files; read them from
# there and delete the hard-coded copies.
input_path = Path('/opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/')
# Alternative multi-year setting, currently disabled:
# years=['2014','2016','2017','2018','2019','2020']
years = ['2020']

# Look the site section up once instead of repeating the indexing.
site_config = project_config[site_name]
site_keys = site_config["keys"]

# TODO: these key names should not be needed here; remove when possible.
patient_id = site_keys["patient_key"]
service_id = site_keys["service_key"]
record_dt = site_keys["record_dt"]

# Key lists that select which schema groups are merged further down.
scores_keys = site_config["scores"]
static_keys = site_config["static"]
dynamic_keys = site_config["dynamic"]
times_keys = site_config["times"]
datetimes_keys = site_config["datetimes"]

# Resource settings shared across sites.
run_parameters = project_config['parameters']
num_cpus = run_parameters['num_cpus']
num_gpus = run_parameters['num_gpus']

## Run

### Coalesce `static_dfs`

In [5]:
# Build one flat {column: type} mapping for the static tables.
# Groups are merged in order, so a later group wins on a duplicate column name:
# first the groups named in `static_keys`, then the `times` groups named in
# `times_keys`. Starting from an empty dict (instead of `reduce` without an
# initial value) keeps this working when either key list is empty.
static_schema = {}
for static_key in static_keys:
    static_schema.update(arrow_schema['static'][static_key])
for times_key in times_keys:
    static_schema.update(arrow_schema['static']['times'][times_key])

# Columns whose type is forced to a list of nanosecond timestamps, overriding
# (or adding to) whatever the merged schema above specified for them.
time_columns = [
    'times_abx_order',
    'times_culture',
    'times_suspicion_sepsis3',
    'times_SOFA',
    'times_sepsis3',
]

for col in time_columns:
    static_schema[col] = 'LIST(TIMESTAMP[NS])'

# Convert the string-typed mapping into the schema object passed to the
# parquet readers below.
arrow_static_schema = make_arrow_schema(static_schema)

In [6]:
%%time
(input_path / 'static_df_YEARLY').mkdir(exist_ok=True, parents=True)
for year in tqdm(years):
    static_table = read_parquet_files_in_parallel(directory=input_path / 'static_df' / year,
                                                  schema=arrow_static_schema,
                                                  max_workers=num_cpus)
    pq.write_table(static_table, input_path / 'static_df_YEARLY' / f"static_df_{year}.parquet")

  0%|          | 0/1 [00:00<?, ?it/s]

Reading files:   0%|          | 0/207279 [00:00<?, ?it/s]

KeyboardInterrupt: 

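**TODO: fix these errors.** Some files contain the literal string `'(null)'` in a column typed as double, so reading them fails: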
```text
"2023-07-30 14:33:36,380 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/54472218018.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:38,001 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/46838348052.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:38,209 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/20340007365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:40,713 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/17067897365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:41,591 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/19703377363.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:43,182 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/54865797365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:43,627 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/32283227365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:45,617 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/2248567365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:47,246 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/55148007365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:50,188 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/1678267365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:51,799 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/13743938047.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:52,614 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/42379867365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:52,714 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/17964967364.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:54,933 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/17016197365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:55,817 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/9800677365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:57,573 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/43431347362.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:33:58,570 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/36444178032.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:01,495 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/34136287364.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
 Output of this cell has been trimmed on the initial display.
Displaying the first 50 top and last bottom outputs.
Click on this message to get the complete output.
 "2023-07-30 14:34:34,542 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/57220337365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:36,301 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/47606468066.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:37,491 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/57045737362.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:39,187 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/50703747365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:39,323 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/43634017365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:39,745 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/6870918147.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:42,436 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/20488677361.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:43,485 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/54107107363.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:51,055 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/56793048079.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:52,444 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/11315517365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:52,461 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/9073788018.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:53,119 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/57386377365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:34:57,242 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/18442958159.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:04,286 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/20677177365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:05,141 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/10272517365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:10,465 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/47965688037.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:13,675 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/37810567365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:14,435 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/41826817365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:20,002 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/57348257365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:21,193 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/17781797365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:23,881 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/43173137365.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:29,177 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/54834618030.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
"2023-07-30 14:35:34,706 - ERROR - Error reading file: /opt/scratchspace/KLAB_SAIL/MODSPhenotypes/data/2023_07_29/extraction/emory/static_df/2018/57182007364.parquet. Error: Failed to parse string: '(null)' as a scalar of type double"
```

### Coalesce `dynamic_dfs`

In [7]:
# Build one flat {column: type} mapping for the dynamic tables.
# Start from a copy of the `super_table` columns, then layer on the score
# groups named in `scores_keys`; a later group wins on a duplicate column name.
# Merging with `update` (instead of `reduce` without an initial value) keeps
# this working when `scores_keys` is empty.
dynamic_schema = dict(arrow_schema['dynamic']['super_table'])
for scores_key in scores_keys:
    dynamic_schema.update(arrow_schema['dynamic']['scores'][scores_key])

# Convert the string-typed mapping into the schema object passed to the
# parquet readers below.
arrow_dynamic_schema = make_arrow_schema(dynamic_schema)

In [8]:
%%time
(input_path / 'dynamic_df_YEARLY').mkdir(exist_ok=True, parents=True)
for year in tqdm(years):
    dynamic_table = read_parquet_files_in_parallel(directory=input_path / 'dynamic_df' / year,
                                                   schema=arrow_dynamic_schema,
                                                   max_workers=num_cpus)
    pq.write_table(dynamic_table, input_path / 'dynamic_df_YEARLY' / f"dynamic_df_{year}.parquet")

  0%|          | 0/1 [00:00<?, ?it/s]

Reading files:   0%|          | 0/207279 [00:00<?, ?it/s]

CPU times: user 1h 19min 52s, sys: 16min 23s, total: 1h 36min 16s
Wall time: 13min 49s


---

## Quality Check

### Load `static_df`

In [None]:
%%time
__static_table = pq.read_table(
    input_path / 'static_df_YEARLY' / '2018',
    schema=arrow_static_schema
)
__static_table.head()

### Load `dynamic_df`

In [None]:
%%time
__dynamic_table = pq.read_table(
    input_path / 'dynamic_df_YEARLY' / '2018',
    schema=arrow_dynamic_schema
)
__dynamic_table.head()