In [36]:
import sys
sys.path.insert(0, '/src')
import pickle
from data_models import BaseModel
from shared.load_raw_data import join_tables
from explanations import DataProcessor, get_config_value
from pathlib import Path
import boto3
from urllib.parse import urlparse
import pandas as pd

import shap
import re
from shared.explanations_config import exp_dictionary
from shared.explanations_config import FEATURE_GROUP_MAPPING, FEATURE_TYPE_MAPPING
import numpy as np



In [30]:
# Run parameters for this notebook.
client = 'avante'
prediction_date = '2021-02-02'
facilityid = '1'
modelid = 'd0c497c8b9b04f4d9e1e1e0c9297cc1f'

# S3 prefix holding the parquet inputs for this client / prediction date.
s3_path = f's3://saiva-dev-data-bucket/unit_test_data/explanations/{client}/{prediction_date}'

# Tables read from s3_path, one "<table>.parquet" file each.
table_list = [
    'master_patient_lookup',
    'patient_census',
    'patient_rehosps',
    'patient_room_details',
    'patient_progress_notes',
    'patient_diagnosis',
    'patient_vitals',
    'patient_lab_results',
    'patient_meds',
    'patient_orders',
    'patient_alerts',
    'patient_demographics',
]

In [31]:
# Read every table named in table_list from S3 into a dict keyed by table name.
raw_data_dict = {}
for table_name in table_list:
    print(f"reading {table_name}")
    table_path = f"{s3_path}/{table_name}.parquet"
    raw_data_dict[table_name] = pd.read_parquet(table_path)

# Load the pickled model object (its `.model` attribute is handed to SHAP later).
# NOTE: pickle.load can execute arbitrary code; only load artifacts from a trusted source.
model_path = f"/data/models/{modelid}/artifacts/{modelid}.pickle"
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)

# Feature matrix and the identifier frame that is read row-for-row alongside it
# (masterpatientid / facilityid / censusdate).
final_x = pd.read_parquet(f"{s3_path}/pd_final_df.parquet")
final_idens = pd.read_parquet(f"{s3_path}/pd_final_idens.parquet")

reading master_patient_lookup
reading patient_census
reading patient_rehosps
reading patient_room_details
reading patient_progress_notes
reading patient_diagnosis
reading patient_vitals
reading patient_lab_results
reading patient_meds
reading patient_orders
reading patient_alerts
reading patient_demographics


In [33]:
# Compute SHAP attributions for every row of final_x and reshape them into a long
# table: one output row per (input row, feature).
explainer = shap.TreeExplainer(model.model)
shap_values = explainer.shap_values(final_x)

# NOTE(review): indexing [1] assumes shap returns one attribution array per class
# and that index 1 is the class of interest -- confirm against the installed shap version.
class_attributions = shap_values[1]

shap_results = []

# Iterate by POSITION. The attribution array and the .iloc lookups below are all
# positional, so using index labels (as iterrows() yields) is only correct when
# final_x happens to carry a default 0..n-1 index.
for row_pos in range(len(final_x)):
    idens_row = final_idens.iloc[row_pos]

    # "feature_value" is a Series indexed by column name, so the resulting frame
    # is indexed by feature name as well.
    shaps = pd.DataFrame(
        {
            "feature": final_x.columns,
            "attribution_score": class_attributions[row_pos],
            "feature_value": final_x.iloc[row_pos],
        }
    )

    shaps["masterpatientid"] = idens_row.masterpatientid
    shaps["facilityid"] = idens_row.facilityid
    shaps["censusdate"] = idens_row.censusdate
    # Placeholders that the mapping steps fill in later.
    shaps['human_readable_name'] = ''
    shaps['mapping_status'] = 'NOT_MAPPED'

    shap_results.append(shaps)


results = pd.concat(shap_results)

In [34]:

# Translate raw feature names into a feature group and a feature type using the
# regex replacement tables from the shared explanations config.
feature_names = results['feature']
results['mapped_feature'] = feature_names.replace(FEATURE_GROUP_MAPPING, regex=True)
results['feature_type'] = feature_names.replace(FEATURE_TYPE_MAPPING, regex=True)

# Window length taken from a "_<N>_day" fragment of the name; names without one get 100.
results['day_count'] = feature_names.str.extract(r'_(\d+)_day')
results['day_count'] = results['day_count'].fillna("100").astype(int)
# 'all' when the name contains "_all_", otherwise missing.
results['all_time'] = feature_names.str.extract(r'_(all)_')

# A cumulative-sum feature whose value is 0 is marked as not relevant.
# NOTE(review): the original tuple listed 'cumsum_order_' twice; the duplicate is
# dropped here (no behavior change) -- check whether another prefix was intended.
cumsum_prefixes = ('cumsum_alert_', 'cumsum_med_', 'cumsum_labs_', 'cumsum_order_', 'cumsum_dx_')
is_cumsum_feature = results.mapped_feature.str.startswith(cumsum_prefixes)
condition = is_cumsum_feature & (results.feature_value == 0)
results.loc[condition, 'mapping_status'] = 'NOT_RELEVANT'

# Average attribution_score per (patient, facility, mapped feature), attached to every row.
# NOTE(review): the column is called 'sum_attribution_score' but holds a mean -- confirm
# which of the two is intended.
group_keys = ['masterpatientid', 'facilityid', 'mapped_feature']
_df = (
    results
    .groupby(group_keys)['attribution_score']
    .mean()
    .reset_index()
    .rename(columns={'attribution_score': 'sum_attribution_score'})
)
results = results.merge(_df, how='left', on=group_keys)

# Remove duplicate feature columns.
# ie. cumsum columns have 7, 14, 30 & ALL day variants.
# Keep the variant with the smallest day_count: sort ascending, keep the first row per group.
results = (
    results
    .sort_values(by=['day_count'], ascending=True)
    .drop_duplicates(subset=group_keys, keep='first')
)

results['censusdate'] = pd.to_datetime(results.censusdate)

In [32]:
# Wrap the raw tables in the project's DataProcessor and fetch its processed output.
# The (currently disabled) diagnosis mapping step reads raw_data['patient_diagnosis'].
# NOTE(review): this cell's execution count (32) precedes the cells shown above it;
# confirm the notebook still works top-to-bottom on a fresh kernel.
dp = DataProcessor(raw_data_dict)
raw_data = dp.fetch_processed_data()

This pattern has match groups. To actually get the groups, use str.extract.


In [40]:
def dx_mapper(dx_attribution, diag_data, mpid_to_use, date_to_use, diagnosis_explanation_config):
    """Build a human-readable explanation for one diagnosis (``cumsum_dx_*``) attribution row.

    Args:
        dx_attribution: Row-like object providing ``'mapped_feature'``, ``'day_count'``
            and ``'all_time'``.
        diag_data: Diagnosis DataFrame with at least ``masterpatientid``, ``ccs_label``,
            ``onsetdate``, ``initialadmissiondate``, ``diagnosiscode`` and ``diagnosisdesc``.
        mpid_to_use: Patient id used to filter ``diag_data``.
        date_to_use: Reference date forwarded to ``filter_date_range``.
        diagnosis_explanation_config: Mapping whose keys are day counts and, optionally,
            ``'all'``; each value is a collection of lower-case diagnosis labels/terms.
            Missing keys (including an empty mapping) are treated as "no entries".

    Returns:
        ``pd.Series([human_readable_name, mapping_status], index=['a', 'b'])`` where
        ``mapping_status`` is ``'MAPPED'``, ``'DATA FOUND'`` or ``'DATA_NOT_FOUND'``.
    """
    diag_label = dx_attribution['mapped_feature'].replace('cumsum_dx_', '')
    day_count = dx_attribution['day_count']

    diag_matches = diag_data[(diag_data.masterpatientid == mpid_to_use) &
                             (diag_data.ccs_label == diag_label)]
    # filter_date_range is defined outside this cell; it narrows the matches using
    # the day-count window, the onset dates and the reference date.
    diag_reason = filter_date_range(
        day_count,
        diag_matches,
        diag_matches.onsetdate,
        date_to_use
    )

    if len(diag_reason) == 0:
        return pd.Series(['', 'DATA_NOT_FOUND'], index=['a', 'b'])

    first_match = diag_reason.iloc[0]
    human_readable_name = (
        f"Diagnosis of {first_match['diagnosiscode']} : {first_match['diagnosisdesc']}"
    )
    # do not show onsetdate for patients whose initialadmissiondate equals onsetdate.
    # patients set diagnosis onsetdate as initialadmissiondate when they're unaware of the real onsetdate.
    if first_match['onsetdate'] != first_match['initialadmissiondate']:
        human_readable_name += f" on {first_match['onsetdate']:%m/%d/%Y}"

    label_lower = diag_label.lower()
    # .get() keeps a config without an 'all' entry (e.g. the {} default) from raising KeyError.
    all_time_terms = diagnosis_explanation_config.get('all', ())

    if day_count in diagnosis_explanation_config and \
            label_lower in diagnosis_explanation_config[day_count]:
        mapping_status = 'MAPPED'
    elif (dx_attribution['all_time'] == 'all') and \
            any(term in label_lower for term in all_time_terms):
        mapping_status = 'MAPPED'
    else:
        # NOTE(review): spelled with a space, unlike 'DATA_NOT_FOUND'; kept as-is
        # because downstream consumers may match on this exact string.
        mapping_status = 'DATA FOUND'

    return pd.Series([human_readable_name, mapping_status], index=['a', 'b'])

In [41]:
def _mapper(dx_attribution, diag_data, mpid_to_use, date_to_use, diagnosis_explanation_config):
    human_readable_name = ''
    mapping_status = 'NOT_mapped'
    return pd.Series([human_readable_name, mapping_status], index=['a', 'b'])

In [43]:
# Diagnosis explanation settings for this client ({} is passed as the default value).
diagnosis_explanation_config = get_config_value(
    exp_dictionary,
    client=client,
    key='Patient_Diagnosis',
    default_value={},
)

# Diagnosis cumsum features that still carry the initial 'NOT_MAPPED' status.
condition = (
    results.mapped_feature.str.startswith('cumsum_dx_')
    & (results.mapping_status == 'NOT_MAPPED')
)

# TODO: the row-wise mapping is disabled (and wired to the `_mapper` stub);
# re-enable it once the mapper to use here is settled.
# results.loc[condition, ['human_readable_name', 'mapping_status']] = (
#     results
#         .loc[condition]
#         .apply(lambda x: _mapper(x,
#                                         raw_data['patient_diagnosis'],
#                                         x.masterpatientid,
#                                         x.censusdate,
#                                         diagnosis_explanation_config),
#                axis=1)
#         .values
# )

# Show the rows the mapping would be applied to.
results.loc[condition, ['human_readable_name', 'mapping_status']]

+++++++++++++++++++++=1
+++++++++++++++++++++=2


Unnamed: 0,human_readable_name,mapping_status


In [51]:
# Is there any diagnosis cumsum feature left whose value is exactly 1?
is_dx_feature = results.mapped_feature.str.startswith('cumsum_dx_')
has_value_one = results.feature_value == 1
condition = is_dx_feature & has_value_one
results.loc[condition].shape[0] > 0

False