## This notebook exports 03-feature-generation data to S3 and imports it back from S3

In [None]:
import sys
from pathlib import Path

# Put '/src' at the front of the module search path before the `shared.*`
# imports below run (presumably the project source root -- confirm).
sys.path.insert(0, '/src')
import pandas as pd
from shared.utils import get_client_class, get_memory_usage
from eliot import start_action, start_task, to_file, log_message
# Send eliot log output to stdout so it appears in the notebook cell output.
to_file(sys.stdout)

## Load config

In [None]:
from shared.constants import LOCAL_TRAINING_CONFIG_PATH
from shared.utils import load_config

# Load the configuration found at LOCAL_TRAINING_CONFIG_PATH and keep its
# `training_config` attribute in its own variable for the cells that follow.
config = load_config(LOCAL_TRAINING_CONFIG_PATH)
training_config = config.training_config

In [None]:
# Constants
# Client identifier: the organization ids of all configured orgs joined by '+'.
CLIENT = "+".join(
    org_config.organization_id
    for org_config in training_config.ml_model_org_configs
)

# Experiment window boundaries, taken from the training metadata.
experiment_dates = training_config.training_metadata.experiment_dates
TRAIN_START_DATE = experiment_dates.train_start_date
TEST_END_DATE = experiment_dates.test_end_date
date_range = f'{TRAIN_START_DATE}-TO-{TEST_END_DATE}'

# Local directory holding the parquet files; created here if it is missing.
processed_path = Path('/data/processed')
processed_path.mkdir(parents=True, exist_ok=True)

# S3 prefix for this client and date range.
s3_path = f's3://saiva-dev-data-bucket/training_data/saiva-3-day-hosp-v5/{CLIENT}/{date_range}'

# Parquet files transferred by the export/import loops below.
feature_list = [
    'census_df.parquet',
    'demo_df.parquet',
    'vitals_df.parquet',
    'orders_df.parquet',
    'meds_df.parquet',
    # 'alerts_df.parquet',  (currently excluded from the transfer)
    'labs_df.parquet',
    'rehosp_df.parquet',
    'admissions_df.parquet',
    'diagnosis_df.parquet',
    'notes_df.parquet',
]

# Echo the resolved values so the target S3 location can be sanity-checked.
print(TRAIN_START_DATE, TEST_END_DATE)
print(CLIENT)

# ================ Export to S3 ==================

In [None]:
# Upload every locally available feature file to S3 (best effort: a problem
# with one file is reported and the loop moves on to the next file).
for name in feature_list:
    local_file = processed_path / name

    # A missing local file is an expected case -- report it and skip.
    if not local_file.exists():
        print(f'{name} --------------- does not exist')
        continue

    try:
        df = pd.read_parquet(local_file)
        df.to_parquet(f'{s3_path}/{name}')
    except Exception as exc:
        # Anything else (unreadable parquet, S3 credentials/network errors)
        # is a real failure and must not be reported as "does not exist".
        # `Exception` rather than a bare `except` keeps Ctrl-C working.
        print(f'{name} --------------- export FAILED: {exc!r}')
    else:
        print(f'{name} --------------- written to S3')

# ================ Import from S3 ==================

In [None]:
# Download every feature file from S3 and store a local copy in
# processed_path. No error handling: the first failing file stops the loop.
for name in feature_list:
    remote_file = f'{s3_path}/{name}'
    local_file = processed_path / name

    df = pd.read_parquet(remote_file)
    df.to_parquet(local_file)

    print(f'{name} --------------- read from S3')