In [None]:
import boto3
import os
from pathlib import Path
import pandas as pd
from saiva.model.shared.utils import get_client_class, get_memory_usage

## Load config

In [None]:
from saiva.model.shared.constants import saiva_api, LOCAL_TRAINING_CONFIG_PATH
from saiva.training.utils import load_config

# Load the training configuration from the local config path and keep a
# handle on its training_config section, which the cells below read from.
config = load_config(LOCAL_TRAINING_CONFIG_PATH)
training_config = config.training_config

### These datacards need to be run for every client we're interested in. See the output of the next cell for the available client and datasource_id options.

In [None]:
# Show the (organization_id, datasource id) pair of every organization in the
# training config.
[
    (org_config.organization_id, org_config.datasource.id)
    for org_config in training_config.organization_configs
]

In [None]:
# Default to the first organization in the training config. Point this at a
# different entry when another client / datasource_id pair is needed.
selected_organization_config = training_config.organization_configs[0]
CLIENT = selected_organization_config.organization_id
datasource_id = selected_organization_config.datasource.id

In [None]:
# Date window of the experiment, taken from the training metadata.
experiment_dates = training_config.training_metadata.experiment_dates
TRAIN_START_DATE = experiment_dates.train_start_date
TEST_END_DATE = experiment_dates.test_end_date

# Label for the date window; it is embedded in the S3 key prefix below.
date_range = f'{TRAIN_START_DATE}-TO-{TEST_END_DATE}'

# Destination bucket and key prefix for the datacard input files.
bucket = 'saiva-dev-data-bucket'
s3_file_folder = f"training_data/saiva-3-day-hosp-v6/{CLIENT}/{date_range}/datacard_data/"

In [None]:
# Build the S3 URI from the same bucket and key prefix that the upload loop
# uses, so the location handed to the datacard commands cannot drift from the
# location the files are actually uploaded to.
s3_folder_path = f's3://{bucket}/{s3_file_folder}'

s3_folder_path

In [None]:
# Local directory that the dataset parquet files are read from.
folder_path = "/data/raw/"

In [None]:
# Keep every configured dataset that is not flagged as missing in the
# training metadata.
missing_datasets = training_config.training_metadata.missing_datasets
files = [
    dataset
    for dataset in training_config.all_datasets
    if dataset not in missing_datasets
]

In [None]:
def upload_to_s3(file_path, bucket, s3_file_path, s3_client=None):
    """Upload a local file to an S3 bucket.

    Args:
        file_path: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        s3_file_path: Object key to store the file under in ``bucket``.
        s3_client: Optional S3 client to reuse across calls. When omitted, a
            new client is created with ``boto3.client('s3')``, which matches
            the previous behaviour.
    """
    if s3_client is None:
        # Fall back to a fresh client so existing three-argument callers keep working.
        s3_client = boto3.client('s3')
    s3_client.upload_file(file_path, bucket, s3_file_path)

In [None]:
# Upload every available dataset parquet file from the local folder to S3.
for file in files:
    file_name = f"{file}.parquet"
    file_path = os.path.join(folder_path, file_name)
    # S3 object keys always use "/" as the separator, so build the key
    # explicitly instead of with os.path.join, which is platform dependent.
    s3_key = f"{s3_file_folder.rstrip('/')}/{file_name}"
    upload_to_s3(file_path, bucket, s3_key)

In [None]:
# Command to run in the saiva-datacards repository.
# This cell only renders the command string as its output; it does not execute it.

# Facility discovery datacard
f"""python run_datacard.py facility_discovery run --client={CLIENT} --datasource_id={datasource_id} --start-date={TRAIN_START_DATE} --end-date={TEST_END_DATE} --outfile=s3://saiva-datacards/datacards/dev/{CLIENT}/{date_range}/ --s3-folder-path={s3_folder_path} --skip-client-name-in-files"""

In [None]:
# Load the patient census parquet file from the local data folder.
patient_census_path = os.path.join(folder_path, 'patient_census.parquet')
df = pd.read_parquet(patient_census_path)

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Comma-separated string of the distinct values in the 'facilityid' column.
unique_facility_ids = df['facilityid'].unique()
facility_ids = ",".join(str(facility_id) for facility_id in unique_facility_ids)

In [None]:
# Display the comma-separated facility ids.
facility_ids

In [None]:
# Command to run in the saiva-datacards repository.
# This cell only renders the command string as its output; it does not execute it.

# Data availability datacard
f"""python run_datacard.py data_availability run --client={CLIENT} --datasource_id={datasource_id} --facility-ids={facility_ids} --start-date={TRAIN_START_DATE} --end-date={TEST_END_DATE} --outfile=s3://saiva-datacards/datacards/dev/{CLIENT}/{date_range}/ --s3-folder-path={s3_folder_path} --skip-client-name-in-files"""