In [None]:
# Notebook setup: third-party/stdlib imports, project imports, and log routing.
import pandas as pd
import gc
import sys
from pathlib import Path
# Put '/src' at the front of the module search path. This has to run before
# the `shared.*` imports below.
# NOTE(review): presumably '/src' is where the `shared` package lives — confirm.
sys.path.insert(0, '/src')
from shared.generate_base_features import base_feature_processing
from shared.generate_lab_features import get_lab_features
from shared.load_raw_data import fetch_training_cache_data
from shared.utils import get_client_class, get_memory_usage
from shared.constants import CLIENT
from eliot import start_action, start_task, to_file, log_message
# Register stdout as an eliot log destination so log messages appear in the
# cell output.
to_file(sys.stdout)

In [None]:
# Configure output locations, resolve the training window for CLIENT, and
# load the cached training inputs.

S3_BUCKET = 'saiva-dev-data-bucket'

# Directory the processed output of this notebook is written to; created
# (with parents) if it does not exist yet.
processed_path = Path('/data/processed')
processed_path.mkdir(parents=True, exist_ok=True)

# The client class is looked up by name; an instance of it supplies the
# (start, end) dates of the training window.
clientClass = get_client_class(client=CLIENT)
TRAIN_START_DATE, TRAIN_END_DATE = clientClass().get_training_dates()

result_dict = fetch_training_cache_data(client=CLIENT, generic=True)

# Print the shape of every loaded entry as a quick sanity check.
for key, value in result_dict.items():
    print(f'{key} : {value.shape}')

In [None]:
%%time

# Build the base feature frame for the training window.
#
# `base_feature_processing` receives the loaded inputs plus the training
# start/end dates and returns a pair: the base feature frame and a
# `result_dict`, which rebinds (replaces) the name loaded in the previous cell.
# Nothing is written to the local directory in this cell itself.
# NOTE(review): `s3_bucket` is passed through to the helper; what it reads or
# writes there is not visible from this notebook — confirm.

base, result_dict = base_feature_processing(
    result_dict=result_dict, 
    train_start_date=TRAIN_START_DATE, 
    prediction_date=TRAIN_END_DATE, 
    s3_bucket=S3_BUCKET,
    training=True
)

In [None]:
%%time

# Merge lab-result features into the base frame when lab data is available.
#
# The lab results are looked up once. A missing key, an explicit None value,
# and an empty frame are all treated as "no lab data", in which case the base
# frame is used unchanged. (The previous form called `.empty` on whatever
# `.get()` returned and so raised AttributeError when the key held None.)
patient_lab_results = result_dict.get('patient_lab_results')

if patient_lab_results is not None and not patient_lab_results.empty:
    combined = get_lab_features(
        base=base,
        patient_lab_results=patient_lab_results,
        training=True
    )
else:
    combined = base

# Drop the `base` name so the intermediate frame can be reclaimed once nothing
# else references it. Note: this makes the cell non-rerunnable without first
# re-running the cell that creates `base`.
del base

In [None]:
# Persist the combined feature frame as a parquet file under `processed_path`.
combined.to_parquet(processed_path.joinpath('02-result.parquet'))

In [None]:
# Print whatever `get_memory_usage` reports for the final combined frame.
# NOTE(review): the unit/format of the returned value is defined in
# shared.utils and is not visible here — confirm before relying on it.
print(get_memory_usage(combined))