In [None]:
import sys
from pathlib import Path

from omegaconf import OmegaConf

sys.path.insert(0, '/src')
from shared.demographics import DemographicFeatures
from shared.labs import LabFeatures
from shared.meds import MedFeatures
from shared.orders import OrderFeatures
from shared.vitals import VitalFeatures
from shared.alerts import AlertFeatures
from shared.rehosp import RehospFeatures
from shared.notes import NoteFeatures
from shared.diagnosis import DiagnosisFeatures
from shared.patient_census import PatientCensus
from shared.admissions import AdmissionFeatures
import pandas as pd

from shared.load_raw_data import fetch_training_cache_data
from shared.utils import get_client_class, get_memory_usage
from eliot import start_action, start_task, to_file, log_message
# Register sys.stdout as an eliot log destination so eliot messages are written to stdout.
to_file(sys.stdout)

## Load config

In [None]:
from shared.constants import saiva_api, LOCAL_TRAINING_CONFIG_PATH
from shared.utils import load_config

# Load the configuration found at LOCAL_TRAINING_CONFIG_PATH and keep a shortcut to its
# `training_config` section, which the cells below read from.
config = load_config(LOCAL_TRAINING_CONFIG_PATH)
training_config = config.training_config

In [None]:
# Lift every pandas display limit so that the `.head()` previews below are never truncated.
# `None` means "unlimited" for each of these options.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# `None` (not -1) is the supported way to disable column-width truncation: negative values
# were deprecated in pandas 1.0 and are rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [None]:
# Load the data from local directory cache

# Output directory for the feature parquet files written by the cells below;
# created up front (including parents) if it does not exist yet.
processed_path = Path('/data/processed')
processed_path.mkdir(parents=True, exist_ok=True)

# Replace this if necessary
# (defaults to the organization of the first entry in ml_model_org_configs)
CLIENT = training_config.ml_model_org_configs[0].organization_id

# Fetch the cached tables and report the shape of each one.
result_dict = fetch_training_cache_data(client=CLIENT, generic=True)
for table_name, table in result_dict.items():
    print(f'{table_name} : {table.shape}')

In [None]:
# Flag forwarded to every feature class below via its `training` argument.
training = True

# Date bounds of the experiment, taken from the training metadata.
experiment_dates = training_config.training_metadata.experiment_dates
TRAIN_START_DATE = experiment_dates.train_start_date
TEST_END_DATE = experiment_dates.test_end_date

# Look up the model-type version that matches the configured type and version.
model_version = saiva_api.model_types.get_by_model_type_id(
    model_type_id=training_config.model_type,
    version=training_config.model_version,
)

# Echo the resolved settings, one per line.
print(TRAIN_START_DATE, TEST_END_DATE, training, sep='\n')
print(model_version.model_type_id, model_version.id)

In [None]:
# Record the resolved model-type version id on the training metadata.
# `training_metadata` is the same object as training_config.training_metadata,
# so the config is updated in place.
training_metadata = training_config.training_metadata
training_metadata['model_type_version_id'] = model_version.id

print(training_metadata)

# Persist the metadata as YAML under the `generated/` folder of the local config path.
# NOTE(review): the path is built by plain string concatenation, so this assumes
# LOCAL_TRAINING_CONFIG_PATH ends with a path separator and that `generated/` exists — confirm.
metadata_yaml_path = f'{LOCAL_TRAINING_CONFIG_PATH}generated/training_metadata.yaml'
metadata_payload = {'training_config': {'training_metadata': training_metadata}}
conf = OmegaConf.create(metadata_payload)
OmegaConf.save(conf, metadata_yaml_path)

In [None]:
# read from parquet file
# census_df = pd.read_parquet(processed_path/'census_df.parquet')

In [None]:
%%time

print(TRAIN_START_DATE)

# Keep only the rehospitalisation rows whose transfer date is after the training start date.
# NOTE(review): this fails with an AttributeError if 'patient_rehosps' is missing from the cache.
rehosp_df = (
    result_dict.get('patient_rehosps')
    .query('dateoftransfer > @TRAIN_START_DATE')
    .copy()
)

# Build the census frame that every feature cell below receives a copy of.
patient_census = PatientCensus(
    census_df=result_dict.get('patient_census'),
    train_start_date=TRAIN_START_DATE,
    test_end_date=TEST_END_DATE,
    rehosp_df=rehosp_df,
)
census_df = patient_census.generate_features()

# Write to new parquet file
census_parquet_path = processed_path / 'census_df.parquet'
census_df.to_parquet(census_parquet_path)

print(census_df.shape)
census_df.head(3)

In [None]:
%%time

# Build demographic features; the feature class receives its own copy of the census frame.
demo = DemographicFeatures(
    census_df=census_df.copy(),
    demo_df=result_dict.get('patient_demographics'),
    training=training,
)
demo_df = demo.generate_features()

# Write to new parquet file
demo_parquet_path = processed_path / 'demo_df.parquet'
demo_df.to_parquet(demo_parquet_path)

print(demo_df.shape)
demo_df.head(3)

In [None]:
%%time

# Build vitals features; the feature class receives its own copy of the census frame.
vitals = VitalFeatures(
    census_df=census_df.copy(),
    vitals=result_dict.get('patient_vitals'),
    training=training,
)
vitals_df = vitals.generate_features()

# Write to new parquet file
vitals_parquet_path = processed_path / 'vitals_df.parquet'
vitals_df.to_parquet(vitals_parquet_path)

print(vitals_df.shape)
vitals_df.head(3)

In [None]:
# %%time

# if not result_dict.get('patient_orders', pd.DataFrame()).empty:
#     orders = OrderFeatures(
#                 census_df=census_df.copy(),
#                 orders=result_dict.get('patient_orders', None),
#                 training=training)
#     orders_df = orders.generate_features()

#     # Write to new parquet file
#     orders_df.to_parquet(processed_path/'orders_df.parquet')

#     print(orders_df.shape)
#     orders_df.head(3)

In [None]:
# %%time

# Medication features are only built when the cache holds a non-empty 'patient_meds' table.
if not result_dict.get('patient_meds', pd.DataFrame()).empty:
    meds = MedFeatures(
        census_df=census_df.copy(),
        meds=result_dict.get('patient_meds', None),
        training=training,
    )
    # generate_features() returns two values here: the feature frame and a frame that
    # replaces the cached 'patient_meds' table.
    # NOTE(review): because the cache entry is overwritten, re-running this cell feeds the
    # returned frame back into MedFeatures — confirm that this is safe.
    meds_df, result_dict['patient_meds'] = meds.generate_features()

    # Write to new parquet file
    meds_df.to_parquet(processed_path / 'meds_df.parquet')

    print(meds_df.shape)
    # Jupyter only auto-renders a bare expression at the top level of a cell; inside this
    # `if` the preview has to be displayed explicitly or it is silently discarded.
    display(meds_df.head(3))

In [None]:
# %%time

# alerts = AlertFeatures(
#             census_df=census_df.copy(),
#             alerts=result_dict.get('patient_alerts', None),
#             training=training)
# alerts_df = alerts.generate_features()

# # Write to new parquet file
# alerts_df.to_parquet(processed_path/'alerts_df.parquet')

# print(alerts_df.shape)
# alerts_df.head(3)

In [None]:
# %%time

# Lab features are only built when the cache holds a non-empty 'patient_lab_results' table.
if not result_dict.get('patient_lab_results', pd.DataFrame()).empty:
    labs = LabFeatures(
        census_df=census_df.copy(),
        labs=result_dict.get('patient_lab_results', None),
        training=training,
    )
    labs_df = labs.generate_features()

    # Write to new parquet file
    labs_df.to_parquet(processed_path / 'labs_df.parquet')

    print(labs_df.shape)
    # Jupyter only auto-renders a bare expression at the top level of a cell; inside this
    # `if` the preview has to be displayed explicitly or it is silently discarded.
    display(labs_df.head(3))

In [None]:
%%time

# Rehospitalisation features are only built when the cache holds a non-empty
# 'patient_rehosps' table.
if not result_dict.get('patient_rehosps', pd.DataFrame()).empty:
    # The feature class is given the cached 'patient_rehosps' table as-is, not the
    # date-filtered `rehosp_df` that was passed to PatientCensus earlier.
    rehosp = RehospFeatures(
        census_df=census_df.copy(),
        rehosps=result_dict.get('patient_rehosps', None),
        training=training,
    )
    # From here on `rehosp_df` holds the generated features and no longer the filtered
    # raw rows it was bound to in the census cell.
    rehosp_df = rehosp.generate_features()

    # Write to new parquet file
    rehosp_df.to_parquet(processed_path / 'rehosp_df.parquet')

    print(rehosp_df.shape)
    # Jupyter only auto-renders a bare expression at the top level of a cell; inside this
    # `if` the preview has to be displayed explicitly or it is silently discarded.
    display(rehosp_df.head(3))

In [None]:
%%time

# Admission features are only built when the cache holds a non-empty
# 'patient_admissions' table.
if not result_dict.get('patient_admissions', pd.DataFrame()).empty:
    admissions = AdmissionFeatures(
        census_df=census_df.copy(),
        admissions=result_dict.get('patient_admissions', None),
        training=training,
    )
    admissions_df = admissions.generate_features()

    # Write to new parquet file
    admissions_df.to_parquet(processed_path / 'admissions_df.parquet')

    print(admissions_df.shape)
    # Jupyter only auto-renders a bare expression at the top level of a cell; inside this
    # `if` the preview has to be displayed explicitly or it is silently discarded.
    display(admissions_df.head(3))

In [None]:
%%time

# Diagnosis features are only built when the cache holds a non-empty
# 'patient_diagnosis' table.
if not result_dict.get('patient_diagnosis', pd.DataFrame()).empty:
    diagnosis = DiagnosisFeatures(
        census_df=census_df.copy(),
        diagnosis=result_dict.get('patient_diagnosis', None),
        # CCS lookup location comes from the model-type version resolved earlier.
        diagnosis_lookup_ccs_s3_file_path=model_version.diagnosis_lookup_ccs_s3_uri,
        training=training,
    )
    # generate_features() returns two values here: the feature frame and a frame that
    # replaces the cached 'patient_diagnosis' table.
    # NOTE(review): because the cache entry is overwritten, re-running this cell feeds the
    # returned frame back into DiagnosisFeatures — confirm that this is safe.
    diagnosis_df, result_dict['patient_diagnosis'] = diagnosis.generate_features()

    # Write to new parquet file
    diagnosis_df.to_parquet(processed_path / 'diagnosis_df.parquet')

    print(diagnosis_df.shape)
    # Jupyter only auto-renders a bare expression at the top level of a cell; inside this
    # `if` the preview has to be displayed explicitly or it is silently discarded.
    display(diagnosis_df.head(3))

In [None]:
%%time

# Progress-note features are only built when the cache holds a non-empty
# 'patient_progress_notes' table; otherwise `notes_df` is never defined.
if not result_dict.get('patient_progress_notes', pd.DataFrame()).empty:
    notes = NoteFeatures(
        census_df=census_df.copy(),
        notes=result_dict.get('patient_progress_notes', None),
        client=CLIENT,
        training=training,
        # Vector model is taken from the training metadata.
        vector_model=training_metadata.vector_model,
    )
    notes_df = notes.generate_features()

    # Write to new parquet file
    notes_df.to_parquet(processed_path / 'notes_df.parquet')

    print(notes_df.shape)
    # Jupyter only auto-renders a bare expression at the top level of a cell; inside this
    # `if` the preview has to be displayed explicitly or it is silently discarded.
    display(notes_df.head(3))

## +++++++++++++++++++++END+++++++++++++++++++

In [None]:
# `notes_df` is only created when the progress-notes table is non-empty, so guard the
# lookup: on a fresh kernel without notes the bare `notes_df.shape` raised a NameError.
notes_shape = notes_df.shape if 'notes_df' in globals() else None
notes_shape