In [None]:
import json
import sys
import pandas as pd
from pathlib import Path

from eliot import start_action, start_task, to_file, log_message
from saiva.model.shared.utils import get_client_class, get_memory_usage
# Register stdout as the destination for eliot log messages.
to_file(sys.stdout)

In [None]:
# Directory holding one pre-processed parquet file per feature group.
processed_path = Path('/data/processed')

# Feature group name -> parquet file name inside `processed_path`.
# Insertion order is significant: the merge loop further down reads the
# first entry as the base frame and left-joins every later entry onto it.
processed_file_names = dict(
    Demographics='demo_df.parquet',
    Vitals='vitals_df.parquet',
    Orders='orders_df.parquet',
    Alerts='alerts_df.parquet',
    Medications='meds_df.parquet',
    Transfers='rehosp_df.parquet',
    Admissions='admissions_df.parquet',
    Diagnoses='diagnosis_df.parquet',
    Labs='labs_df.parquet',
    ProgressNotes='notes_df.parquet',
    Immunizations='immuns_df.parquet',
    Risks='risks_df.parquet',
    Assessments='assessments_df.parquet',
    Adt='adt_df.parquet',
    Mds='mds_df.parquet',
)

# Feature group name -> full path of its parquet file.
processed_file_paths = {
    group: processed_path.joinpath(file_name)
    for group, file_name in processed_file_names.items()
}

In [None]:
%%time
# Build `final_df` by left-joining every feature-group parquet file onto the
# first group listed in `processed_file_names`, and record which columns each
# group contributed in `feature_groups` (group name -> list of column names).
#
# The first file is mandatory (reading it raises if it is missing); every later
# file is optional and is skipped, with a message, when it does not exist.
#
# NOTE(review): the left join only preserves the base row count if each later
# file has at most one row per (masterpatientid, facilityid, censusdate) --
# confirm upstream, or pass `validate='many_to_one'` to the merge.
feature_groups = dict()
for i, (group, file_name) in enumerate(processed_file_names.items()):
    file_path = processed_path/file_name

    if i == 0:
        # Base frame: every other group is joined onto these rows.
        print('merging', file_name)
        final_df = pd.read_parquet(file_path)
        feature_groups[group] = list(final_df.columns)
        continue

    if not file_path.exists():
        # Previously this case still printed 'merging ...' and then silently
        # did nothing, which made the log claim merges that never happened.
        print('skipping', file_name, '(file not found)')
        continue

    print('merging', file_name)
    df = pd.read_parquet(file_path)
    # Sort the incoming columns by name so the merged column order does not
    # depend on the order in which the file happened to be written.
    reordered_cols = list(df.columns.sort_values())
    df = df[reordered_cols]
    # No explicit suffixes: non-key columns that collide with existing ones
    # receive pandas' default '_x' / '_y' suffixes.
    final_df = final_df.merge(
        df,
        how='left',
        on=['masterpatientid', 'facilityid', 'censusdate']
    )
    feature_groups[group] = list(df.columns)
    # Release the per-group frame before loading the next one.
    del df

In [None]:
%%time
# drop unwanted columns
# A column is removed when its name
#   * contains '_masterpatientid' or '_facilityid',
#   * ends in '_x' or '_y' (pandas' default merge suffixes, so BOTH sides of a
#     name collision between feature groups are discarded), or
#   * starts with 'patientid', or
#   * repeats an earlier column name.
# Selection is done by position with a boolean mask rather than by label:
# `Index.duplicated()` flags only the second and later occurrences, but
# `DataFrame.drop` by label would remove every column carrying that label,
# including the first occurrence that was meant to be kept.
drop_mask = (
    (final_df.columns.str.contains('_masterpatientid|_facilityid|_x$|_y$|^patientid'))
    | (final_df.columns.duplicated())
)
# Kept under its original name for any later cell that inspects it.
columns_to_drop = final_df.columns[drop_mask].tolist()
if len(columns_to_drop) > 0:
    final_df = final_df.loc[:, ~drop_mask]

print('Number of columns in the dataframe:', final_df.shape[1])

In [None]:
%%time
# Persist the merged frame as 'final_df.parquet' alongside the per-group files.
final_df.to_parquet(processed_path.joinpath('final_df.parquet'))

In [None]:
# Columns that are never reported as features, even when present in final_df.
exclude_columns = ['masterpatientid', 'facilityid', 'censusdate', 'client', 'date_of_transfer', 'na_indictator_date_of_transfer']

# Restrict each group's column list to columns that are still present in
# final_df and are not in `exclude_columns`; original order is preserved.
filtered_groups = {}
for group_name, candidates in feature_groups.items():
    kept = []
    for feat in candidates:
        if feat in exclude_columns:
            continue
        if feat in final_df.columns:
            kept.append(feat)
    filtered_groups[group_name] = kept
feature_groups = filtered_groups

# Save the group -> feature-name mapping as JSON in the working directory.
with open('./feature_groups.json', 'w') as outfile:
    json.dump(feature_groups, outfile)

In [None]:
# Print whatever the project helper get_memory_usage reports for the merged
# frame (its return format is not visible in this notebook).
print(get_memory_usage(final_df))

In [None]:
# Display the (rows, columns) shape of the merged frame as the cell output.
final_df.shape

In [None]:
# Collect the names of all columns holding at least one missing value.
# Columns are scanned one at a time so that no frame-sized boolean mask is
# materialised.
nan_cols = []
for column_name in final_df.columns:
    has_missing = final_df[column_name].isna().any()
    if has_missing:
        nan_cols.append(column_name)

# Cell output: how many columns contain missing values.
len(nan_cols)