In [1]:
import boto3
import pandas as pd
import numpy as np
from pathlib import Path
import json
import re

from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

from sklearn.ensemble import forest, gradient_boosting
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_selection import SelectFromModel, SelectKBest

import mlflow
from mlflow import log_metric, log_param, log_artifact
from mlflow.sklearn import log_model
from sklearn.model_selection import ParameterGrid
#import shap

In [None]:
shap.initjs()

In [2]:
train_start_date = '2017-01-01'
train_end_date = '2019-02-28'
valid_end_date = ''

In [None]:
secret_name = "cust_db_credentials"
region_name = "us-east-1"

# Create a Secrets Manager client
session = boto3.session.Session()
client = session.client(service_name="secretsmanager", region_name=region_name)

get_secret_value_response = client.get_secret_value(SecretId=secret_name)

db_info = json.loads(get_secret_value_response["SecretString"])

avan_connect_url = URL(
    drivername="mssql+pyodbc",
    username=db_info["username"],
    password=db_info["password"],
    host=db_info["host"],
    database="AVAN",
    query={'driver': 'ODBC Driver 17 for SQL Server'}
)

avan_engine = create_engine(avan_connect_url)

In [None]:
# Avante Data

In [None]:
avan_engine.execute('select 1').fetchall()

In [None]:
data_path = Path('/code/data/raw')
data_path.mkdir(parents=True, exist_ok=True)

In [None]:
query = f'''
select patientid, facilityid, masterpatientid
from view_ods_facility_patient
where facilityid in (select facilityid from view_ods_facility where lineofbusiness = 'SNF')
'''

master_patient_lookup = pd.read_sql(query, con=avan_engine)
master_patient_lookup.to_parquet(data_path/'master_patient_lookup.parquet')

In [None]:
query = f'''
select patientid, facilityid, dateoftransfer, purposeofstay, transferredto,
orderedbyid, transferreason, otherreasonfortransfer, planned,
hospitaldischargedate, primaryphysicianid 
from view_ods_hospital_transfers_transfer_log_v2
where dateoftransfer between '{train_start_date}' and '{train_end_date}'
and facilityid in (select facilityid from view_ods_facility where lineofbusiness = 'SNF')
'''

patient_rehosps = pd.read_sql(query, con=avan_engine)
patient_rehosps = patient_rehosps.merge(master_patient_lookup, on=['patientid', 'facilityid'])
patient_rehosps.to_parquet(data_path/'patient_rehosps.parquet')

In [None]:
query = f'''
select masterpatientid, gender, dateofbirth, education, citizenship, race, religion, state, primarylanguage
from view_ods_master_patient
'''

patient_demographics = pd.read_sql(query, con=avan_engine)
patient_demographics.to_parquet(data_path/'patient_demographics.parquet')

In [None]:
query = f'''
select clientid as patientid, censusdate, facilityid, bedid, beddescription, roomratetypedescription, payercode, carelevelcode
from view_ods_daily_census_v2
where censusdate between '{train_start_date}' and '{train_end_date}'
and facilityid in (select facilityid from view_ods_facility where lineofbusiness = 'SNF')
'''

patient_census = pd.read_sql(query, con=avan_engine)
patient_census = patient_census.merge(master_patient_lookup, on=['patientid','facilityid'])
patient_census.to_parquet(data_path/'patient_census.parquet')

In [None]:
query = f'''
select patientid, onsetdate, facilityid, diagnosiscode, diagnosisdesc, classification, rank
from view_ods_patient_diagnosis
where onsetdate between '{train_start_date}' and '{train_end_date}'
and facilityid in (select facilityid from view_ods_facility where lineofbusiness = 'SNF')
'''

patient_diagnosis = pd.read_sql(query, con=avan_engine)
patient_diagnosis = patient_diagnosis.merge(master_patient_lookup, on=['patientid','facilityid'])
patient_diagnosis.to_parquet(data_path/'patient_diagnosis.parquet')

In [None]:
query = f'''
select clientid as patientid, facilityid, date, bmi, vitalsdescription, value, diastolicvalue, warnings
from view_ods_Patient_weights_vitals
where date between '{train_start_date}' and '{train_end_date}'
and facilityid in (select facilityid from view_ods_facility where lineofbusiness = 'SNF')
and clientid in (select distinct clientid from view_ods_daily_census_v2 where censusdate between '{train_start_date}' and '{train_end_date}')
'''

patient_vitals = pd.read_sql(query, con=avan_engine)
patient_vitals = patient_vitals.merge(master_patient_lookup, on=['patientid', 'facilityid'])
patient_vitals.to_parquet(data_path/'patient_vitals.parquet')

In [None]:
query = f'''
select distinct patientid, facilityid, orderdate, gpiclassdescription, gpisubclassdescription
from view_ods_physician_order_list_v2 a
inner join view_ods_physician_order_list_med b
on a.PhysicianOrderID = b.PhysiciansOrderID 
where orderdate between '{train_start_date}' and '{train_end_date}';
'''

patient_meds = pd.read_sql(query, con=avan_engine)
patient_meds = patient_meds.merge(master_patient_lookup, on=['patientid', 'facilityid'])
patient_meds.to_parquet(data_path/'patient_meds.parquet')

In [None]:
query = f'''
select distinct patientid, facilityid, orderdate, ordercategory, ordertype, orderdescription, pharmacymedicationname, diettype, diettexture, dietsupplement
from view_ods_physician_order_list_v2
where orderdate between '{train_start_date}' and '{train_end_date}'
and ordercategory in ('Diagnostic', 'Enteral - Feeding', 'Dietary - Diet', 'Dietary - Supplements')
'''

patient_orders = pd.read_sql(query, con=avan_engine)
patient_orders = patient_orders.merge(master_patient_lookup, on=['patientid','facilityid'])
patient_orders.to_parquet(data_path/'patient_orders.parquet')

In [None]:
query = f'''
select distinct patientid, facilityid, orderdate, ordercategory, ordertype, orderdescription, pharmacymedicationname, diettype, diettexture, dietsupplement
from view_ods_physician_order_list_v2
where orderdate between '{train_start_date}' and '{train_end_date}'
and ordercategory in ('Pharmacy', 'Diagnostic', 'Enteral - Feeding', 'Dietary - Diet', 'Dietary - Supplements')
'''

patient_orders = pd.read_sql(query, con=avan_engine)
patient_orders = patient_orders.merge(master_patient_lookup, on=['patientid','facilityid'])
patient_orders.to_parquet(data_path/'patient_orders.parquet')

In [None]:
# detailed results seem to only start on 2019-03-03 and onwards - commenting out for now, revisit when we have more data

#query = f'''
#select c.patientid, a.resultdate, a.profiledescription, a.referencerange, a.result, a.abnormalityid, e.abnormalitydescription, b.reportdesciption, b.severityid, d.severitydescription from view_ods_result_lab_report_detail a
#left join view_ods_result_lab_report b on a.LabReportID = b.LabReportID
#left join view_ods_result_order_source c on b.ResultOrderSourceID = c.ResultOrderSourceID
#left join view_ods_result_lab_report_severity d on b.SeverityID = d.SeverityID
#left join view_ods_result_lab_test_abnormality e on a.AbnormalityID = e.AbnormalityID
#'''

#patient_lab_results = pd.read_sql(query, con=avan_engine)
#patient_lab_results.to_parquet(data_path/'patient_detailed_lab_results.parquet')

In [None]:
query = f'''
select patientid, facilityid, createddate, stdalertid, alertdescription, a.triggereditemtype, description
from [view_ods_cr_alert] a left join view_ods_cr_alert_triggered_item_type b
on a.triggereditemtype = b.triggereditemtype
where createddate between '{train_start_date}' and '{train_end_date}' and 
((triggereditemid is not null) or (a.triggereditemtype is not null))
'''

patient_alerts = pd.read_sql(query, con=avan_engine)
patient_alerts = patient_alerts.merge(master_patient_lookup, on=['patientid','facilityid'])
patient_alerts.to_parquet(data_path/'patient_alerts.parquet')

In [None]:
patient_demographics = pd.read_parquet(data_path/'patient_demographics.parquet')
patient_orders = pd.read_parquet(data_path/'patient_orders.parquet')
patient_vitals = pd.read_parquet(data_path/'patient_vitals.parquet')
patient_rehosps = pd.read_parquet(data_path/'patient_rehosps.parquet')
patient_orders = pd.read_parquet(data_path/'patient_orders.parquet')
patient_census = pd.read_parquet(data_path/'patient_census.parquet')
patient_diagnosis = pd.read_parquet(data_path/'patient_diagnosis.parquet')
patient_alerts = pd.read_parquet(data_path/'patient_alerts.parquet')
patient_meds = pd.read_parquet(data_path/'patient_meds.parquet')

In [None]:
def sorter(df, sort_keys=[]):
    return df.sort_values(by=sort_keys)

def deduper(df, unique_keys=[]):
    df = df.drop_duplicates(subset=unique_keys, keep='last')
    assert df.duplicated(subset=unique_keys).sum() == 0, f'''Still have dupes!'''
    
    return df

In [None]:
patient_census = sorter(patient_census, sort_keys=['masterpatientid', 'censusdate'])
patient_vitals = sorter(patient_vitals, sort_keys=['masterpatientid', 'date'])
patient_orders = sorter(patient_orders, sort_keys=['masterpatientid', 'orderdate'])
patient_rehosps = sorter(patient_rehosps, sort_keys=['masterpatientid', 'dateoftransfer'])
patient_alerts = sorter(patient_alerts, sort_keys=['masterpatientid', 'createddate'])
patient_meds = sorter(patient_meds, sort_keys=['masterpatientid', 'orderdate'])

In [None]:
patient_census = deduper(patient_census, unique_keys=['masterpatientid', 'censusdate'])
patient_demographics = deduper(patient_demographics, unique_keys=['masterpatientid'])
patient_vitals = deduper(patient_vitals, unique_keys=['masterpatientid', 'date', 'vitalsdescription'])
patient_orders = deduper(patient_orders, unique_keys=['masterpatientid', 'orderdate', 'orderdescription'])
patient_rehosps = deduper(patient_rehosps, unique_keys=['masterpatientid', 'dateoftransfer'])
patient_alerts = deduper(patient_alerts, unique_keys=['masterpatientid', 'createddate', 'alertdescription'])
patient_meds = deduper(patient_meds, unique_keys=['masterpatientid', 'orderdate', 'gpisubclassdescription'])

In [None]:
base = pd.DataFrame({'censusdate': pd.date_range(start='2017-01-01', end='2019-02-28')})

In [None]:
patient_rehosps['dateoftransfer'] = pd.to_datetime(patient_rehosps.dateoftransfer.dt.date)

In [None]:
base2 = base.merge(patient_census, how='left', on=['censusdate'])
base3 = base2.merge(patient_demographics, how='left', on=['masterpatientid'])

del base2;

In [None]:
vitals = patient_vitals.set_index(keys=['masterpatientid','facilityid','date']).drop(columns='patientid')
vitals['warnings'] = vitals.warnings.notna()

diastolic = vitals.pop('diastolicvalue')
diastolic = diastolic.dropna()

warnings = vitals.pop('warnings')
bmi = vitals.pop('bmi')

vitals = vitals.reset_index()
diastolic = diastolic.reset_index()
warnings = warnings.reset_index()
bmi = bmi.reset_index()

bmi['bmi'] = bmi.bmi.replace({'Height required': None, 'Height and weight required':None, 'Weight required':None}).astype(float)

vitals['date'] = vitals.pop('date').dt.date
diastolic['date'] = diastolic.pop('date').dt.date
warnings['date'] = warnings.pop('date').dt.date
bmi['date'] = bmi.pop('date').dt.date

In [None]:
aggs=['median','std', 'max', 'min']
vitals_pivoted = vitals.pivot_table(index=['masterpatientid','facilityid', 'date'], values='value', columns='vitalsdescription', aggfunc=aggs).reset_index()
diastolic_pivoted = diastolic.pivot_table(index=['masterpatientid','facilityid', 'date'], values='diastolicvalue', aggfunc=aggs).reset_index()
warnings_pivoted = warnings.pivot_table(index=['masterpatientid', 'facilityid', 'date'], values='warnings', aggfunc=sum).reset_index()
bmi_pivoted = bmi.pivot_table(index=['masterpatientid', 'facilityid', 'date'], values='bmi', aggfunc=max).reset_index()

In [None]:
def clean_multi_columns(cols):
    new_cols = []
    
    for col in cols:
        if col[1] == '':
            new_cols.append(col[0])
        else:
            new_cols.append('_'.join(col))
            
    return new_cols

In [None]:
vitals_pivoted.columns = clean_multi_columns(vitals_pivoted.columns)
diastolic_pivoted.columns = clean_multi_columns(diastolic_pivoted.columns)

In [None]:
warnings_pivoted['date'] = pd.to_datetime(warnings_pivoted['date'])
bmi_pivoted['date'] = pd.to_datetime(bmi_pivoted['date'])

In [None]:
vitals_pivoted = vitals_pivoted.drop_duplicates(subset=['masterpatientid','date'], keep='last')
diastolic_pivoted = diastolic_pivoted.drop_duplicates(subset=['masterpatientid','date'], keep='last')
warnings_pivoted = warnings_pivoted.drop_duplicates(subset=['masterpatientid','date'], keep='last')
bmi_pivoted = bmi_pivoted.drop_duplicates(subset=['masterpatientid','date'], keep='last')

In [None]:
vitals_base = vitals_pivoted.merge(diastolic_pivoted, how='left', on=['masterpatientid', 'facilityid', 'date'])
vitals_base2 = vitals_base.merge(warnings_pivoted, how='left', on=['masterpatientid', 'facilityid', 'date'])
vitals_final = vitals_base2.merge(bmi_pivoted, how='left', on=['masterpatientid', 'facilityid', 'date'])
vitals_final.columns = 'vtl_' + vitals_final.columns

In [None]:
base4 = base3.merge(vitals_final, how='left', left_on=['masterpatientid','facilityid','censusdate'], right_on=['vtl_masterpatientid', 'vtl_facilityid','vtl_date'])

In [None]:
del vitals; del vitals_pivoted; del diastolic_pivoted; del warnings_pivoted; del bmi_pivoted; del vitals_base; del vitals_base2; del vitals_final; del base3;

In [None]:
lookup_ccs = pd.read_csv('/code/data/lookup_tables/ccs_dx_icd10cm_2019_1.csv')
lookup_ccs.columns = lookup_ccs.columns.str.replace("'","")
lookup_ccs = lookup_ccs.apply(lambda x: x.str.replace("'",""))

In [None]:
patient_diagnosis['indicator'] = 1
patient_diagnosis['diagnosiscode'] = patient_diagnosis.diagnosiscode.str.replace('.','')
patient_diagnosis['onsetdate'] = patient_diagnosis.onsetdate.dt.date

In [None]:
patient_diagnosis = patient_diagnosis.merge(lookup_ccs, how='left', left_on=['diagnosiscode'], right_on=['ICD-10-CM CODE'])
patient_diagnosis['ccs_label'] = patient_diagnosis['MULTI CCS LVL 1 LABEL'] + ' - ' + patient_diagnosis['MULTI CCS LVL 2 LABEL']

In [None]:
diagnosis_pivoted = patient_diagnosis.loc[:,['masterpatientid', 'onsetdate', 'ccs_label', 'indicator']].pivot_table(index=['masterpatientid', 'onsetdate'], columns=['ccs_label'], values='indicator', fill_value=0).reset_index()
diagnosis_pivoted['onsetdate'] = pd.to_datetime(diagnosis_pivoted.onsetdate)
diagnosis_pivoted.columns = 'dx_' + diagnosis_pivoted.columns

In [None]:
base5 = base4.merge(diagnosis_pivoted, how='left', left_on=['masterpatientid','censusdate'], right_on=['dx_masterpatientid','dx_onsetdate'])

In [None]:
del base4; del diagnosis_pivoted;

In [None]:
patient_meds.loc[patient_meds.gpisubclassdescription.isna(), 'gpisubclassdescription'] = patient_meds.loc[patient_meds.gpisubclassdescription.isna(), 'gpiclassdescription']
patient_meds['orderdate'] = patient_meds.orderdate.dt.date
patient_meds['indicator'] = 1
meds_pivoted = patient_meds.loc[:,['masterpatientid', 'orderdate', 'gpisubclassdescription', 'indicator']].pivot_table(index=['masterpatientid', 'orderdate'], columns=['gpisubclassdescription'], values='indicator', fill_value=0).reset_index()

In [None]:
meds_pivoted.columns = 'med_' + meds_pivoted.columns

In [None]:
meds_pivoted = meds_pivoted.drop_duplicates(subset=['med_masterpatientid','med_orderdate'])

In [None]:
meds_pivoted['med_orderdate'] = pd.to_datetime(meds_pivoted.med_orderdate)

In [None]:
base6 = base5.merge(meds_pivoted, how='left', left_on=['masterpatientid', 'censusdate'], right_on=['med_masterpatientid', 'med_orderdate'])

In [None]:
del base5; del meds_pivoted;

In [None]:
diagnostic_orders = patient_orders.loc[patient_orders.ordercategory == 'Diagnostic']
diagnostic_orders['orderdate'] = diagnostic_orders.orderdate.dt.date
diagnostic_orders['count_indicator_diagnostic_orders'] = 1

In [None]:
diagnostic_pivoted = diagnostic_orders.drop(columns=['patientid', 'ordercategory', 'ordertype', 'orderdescription', 'pharmacymedicationname', 'diettype', 'diettexture', 'dietsupplement']).pivot_table(index=['masterpatientid', 'facilityid', 'orderdate'], values=['count_indicator_diagnostic_orders'], aggfunc=sum).reset_index()

In [None]:
diagnostic_pivoted['orderdate'] = pd.to_datetime(diagnostic_pivoted.orderdate)
diagnostic_pivoted.columns = 'order_' + diagnostic_pivoted.columns

In [None]:
base7 = base6.merge(diagnostic_pivoted, how='left', left_on=['masterpatientid','facilityid','censusdate'], right_on=['order_masterpatientid','order_facilityid','order_orderdate'])

In [None]:
del base6; del diagnostic_pivoted; del diagnostic_orders;

In [None]:
diet_orders = patient_orders[patient_orders.ordercategory == 'Dietary - Diet']
diet_orders['orderdate'] = diet_orders.orderdate.dt.date
diet_orders['indicator'] = 1
diet_orders = diet_orders.drop_duplicates(subset=['masterpatientid', 'orderdate', 'diettype', 'diettexture'])

diet_type_pivoted = diet_orders.loc[:,['masterpatientid', 'orderdate', 'diettype', 'indicator']].pivot_table(index=['masterpatientid', 'orderdate'], columns=['diettype'], values='indicator', aggfunc=min).reset_index()
#diet_type_pivoted.columns = clean_multi_columns(diet_type_pivoted)
diet_type_pivoted.head()
diet_type_pivoted['orderdate'] = pd.to_datetime(diet_type_pivoted.orderdate)
diet_type_pivoted.columns = 'order_' + diet_type_pivoted.columns

diet_texture_pivoted = diet_orders.loc[:,['masterpatientid', 'orderdate', 'diettexture', 'indicator']].pivot_table(index=['masterpatientid', 'orderdate'], columns=['diettexture'], values='indicator', aggfunc=min).reset_index()
#diet_texture_pivoted.columns = clean_multi_columns(diet_texture_pivoted)
diet_texture_pivoted['orderdate'] = pd.to_datetime(diet_texture_pivoted.orderdate)
diet_texture_pivoted.columns = 'order_' + diet_texture_pivoted.columns

In [None]:
base8 = base7.merge(diet_type_pivoted, how='left', left_on=['masterpatientid','censusdate'], right_on=['order_masterpatientid','order_orderdate'])
base8 = base7.merge(diet_texture_pivoted, how='left', left_on=['masterpatientid','censusdate'], right_on=['order_masterpatientid','order_orderdate'])

In [None]:
del base7;

In [None]:
diet_supplements = patient_orders[patient_orders.ordercategory == 'Dietary - Supplements']
diet_supplements['orderdate'] = diet_supplements.orderdate.dt.date
diet_supplements['indicator'] = 1
diet_supplements = diet_supplements.drop_duplicates(subset=['masterpatientid', 'orderdate', 'dietsupplement'])
                                                    
diet_supplements_pivoted = diet_supplements.loc[:,['masterpatientid', 'orderdate', 'dietsupplement', 'indicator']].pivot_table(index=['masterpatientid', 'orderdate'], columns='dietsupplement', values='indicator', aggfunc=min).reset_index()
diet_supplements_pivoted['orderdate'] = pd.to_datetime(diet_supplements_pivoted.orderdate)

In [None]:
diet_supplements_counts = diet_supplements.groupby(['masterpatientid', 'facilityid', 'orderdate']).dietsupplement.count().reset_index().rename(columns={'dietsupplement':'count_indicator_dietsupplement'})
diet_supplements_counts['orderdate'] = pd.to_datetime(diet_supplements_counts.orderdate)

In [None]:
diet_supplements_pivoted.columns = 'order_' + diet_supplements_pivoted.columns
diet_supplements_counts.columns = 'order_' + diet_supplements_counts.columns

In [None]:
base9 = base8.merge(diet_supplements_pivoted, how='left', left_on=['masterpatientid','censusdate'], right_on=['order_masterpatientid','order_orderdate'])

In [None]:
base9 = base9.merge(diet_supplements_counts, how='left', left_on=['masterpatientid','censusdate'], right_on=['order_masterpatientid','order_orderdate'])

In [None]:
del base8;

In [None]:
patient_alerts_system = patient_alerts.loc[patient_alerts.triggereditemtype.notna()]

In [None]:
patient_alerts_therapy = patient_alerts_system.loc[patient_alerts_system.triggereditemtype == 'T']
patient_alerts_therapy['createddate'] = patient_alerts_therapy.createddate.dt.date
patient_alerts_therapy['alertdescription'] = patient_alerts_therapy.alertdescription.str.split(':').str[0]

patient_alerts_therapy['indicator'] = 1
patient_alerts_therapy_pivoted = patient_alerts_therapy.loc[:,['masterpatientid', 'createddate', 'alertdescription', 'indicator']].pivot_table(index=['masterpatientid','createddate'], columns='alertdescription', values='indicator', aggfunc=sum).reset_index()
patient_alerts_therapy_pivoted['createddate'] = pd.to_datetime(patient_alerts_therapy_pivoted.createddate)

In [None]:
allergy_alerts = patient_alerts_system[patient_alerts_system.triggereditemtype == 'A']
allergy_alerts['createddate'] = allergy_alerts.createddate.dt.date

In [None]:
allergy_alert_counts = allergy_alerts.groupby(['masterpatientid', 'createddate']).alertdescription.count().reset_index().rename({'alertdescription':'count_indicator_allergy'}, axis=1)
allergy_alert_counts['createddate'] = pd.to_datetime(allergy_alert_counts.createddate)

In [None]:
dispense_alerts = patient_alerts_system[patient_alerts_system.triggereditemtype == 'D']
dispense_alerts['createddate'] = dispense_alerts.createddate.dt.date
dispense_alert_counts = dispense_alerts.groupby(['masterpatientid', 'createddate']).alertdescription.count().reset_index().rename(columns={'alertdescription':'count_indicator_dispense'})
dispense_alert_counts['createddate'] = pd.to_datetime(allergy_alert_counts.createddate)

In [None]:
order_alerts = patient_alerts_system[patient_alerts_system.triggereditemtype == 'O']
order_alerts['createddate'] = order_alerts.createddate.dt.date
order_alert_counts = order_alerts.groupby(['masterpatientid', 'createddate']).alertdescription.count().reset_index().rename(columns={'alertdescription':'count_indicator_order'})
order_alert_counts['createddate'] = pd.to_datetime(order_alert_counts.createddate)

In [None]:
patient_alerts_therapy_pivoted.columns = 'alert_' + patient_alerts_therapy_pivoted.columns
allergy_alert_counts.columns = 'alert_' + allergy_alert_counts.columns
dispense_alert_counts.columns = 'alert_' + dispense_alert_counts.columns
order_alert_counts.columns = 'alert_' + order_alert_counts.columns

In [None]:
base10 = base9.merge(patient_alerts_therapy_pivoted, how='left', left_on=['masterpatientid', 'censusdate'], right_on=['alert_masterpatientid', 'alert_createddate'])

In [None]:
base10 = base10.merge(allergy_alert_counts, how='left', left_on=['masterpatientid', 'censusdate'], right_on=['alert_masterpatientid', 'alert_createddate'])
base10 = base10.merge(dispense_alert_counts, how='left', left_on=['masterpatientid', 'censusdate'], right_on=['alert_masterpatientid', 'alert_createddate'])
base10 = base10.merge(order_alert_counts, how='left', left_on=['masterpatientid', 'censusdate'], right_on=['alert_masterpatientid', 'alert_createddate'])

In [None]:
del base9;

In [None]:
nonsystem_alerts = patient_alerts.loc[patient_alerts.triggereditemtype.isna()]
nonsystem_alerts['createddate'] = nonsystem_alerts.createddate.dt.date
nonsystem_alerts['indicator'] = 1
nonsystem_alerts = nonsystem_alerts.loc[nonsystem_alerts.alertdescription != '-1']
alerts_pivoted = nonsystem_alerts.loc[:,['masterpatientid', 'createddate', 'alertdescription', 'indicator']].pivot_table(index=['masterpatientid', 'createddate'], columns=['alertdescription'], values=['indicator'], aggfunc=sum).reset_index()

In [None]:
alerts_pivoted.columns = clean_multi_columns(alerts_pivoted.columns)
alerts_pivoted['createddate'] = pd.to_datetime(alerts_pivoted.createddate)
alerts_pivoted.columns = 'alert_' + alerts_pivoted.columns

In [None]:
base11 = base10.merge(alerts_pivoted, how='left', left_on=['masterpatientid', 'censusdate'], right_on=['alert_masterpatientid', 'alert_createddate'])

In [None]:
del base10;

In [None]:
rehosp = patient_rehosps.merge(patient_census, on=['masterpatientid'])

In [None]:
last_hosp = rehosp[rehosp.dateoftransfer < rehosp.censusdate]
last_hosp['count_prior_hosp'] = last_hosp.groupby(['masterpatientid', 'censusdate']).dateoftransfer.cumcount() + 1
last_hosp = last_hosp.groupby(['masterpatientid','censusdate']).tail(n=1).loc[:,['masterpatientid', 'censusdate', 'dateoftransfer', 'count_prior_hosp']].rename(columns={'dateoftransfer': 'last_hosp_date'})
last_hosp['days_since_last_hosp'] = (last_hosp.censusdate - last_hosp.last_hosp_date).dt.days

In [None]:
next_hosp = rehosp[rehosp.dateoftransfer > rehosp.censusdate].groupby(['masterpatientid','censusdate']).head(n=1).loc[:,['masterpatientid', 'censusdate', 'dateoftransfer']].rename(columns={'dateoftransfer': 'next_hosp_date'})

In [None]:
next_hosp['target_3_day_hosp'] = (next_hosp.next_hosp_date - next_hosp.censusdate) <= pd.to_timedelta('4 days')
next_hosp['target_7_day_hosp'] = (next_hosp.next_hosp_date - next_hosp.censusdate) <= pd.to_timedelta('8 days')

In [None]:
last_hosp.columns = 'hosp_' + last_hosp.columns
next_hosp.columns = 'hosp_' + next_hosp.columns

In [None]:
base12 = base11.merge(last_hosp, how='left', left_on=['masterpatientid','censusdate'], right_on=['hosp_masterpatientid', 'hosp_censusdate'])

In [None]:
base12 = base12.merge(next_hosp, how='left', left_on=['masterpatientid','censusdate'], right_on=['hosp_masterpatientid', 'hosp_censusdate'])

In [None]:
base12 = base12.loc[:,~base12.columns.duplicated()]

In [None]:
processed_path = Path('/code/data/processed')
processed_path.mkdir(parents=True, exist_ok=True)

In [None]:
base12 = base12.loc[:, base12.columns[~base12.columns.str.contains('_masterpatientid|_facilityid|vtl_date|onsetdate|orderdate|createddate|_x$|_y$')].tolist()]
base12 = base12.drop_duplicates(subset=['masterpatientid', 'censusdate'], keep='last')
base12.to_parquet(processed_path/'combined.parquet')

In [None]:
combined = pd.read_parquet(processed_path/'combined.parquet')

In [None]:
combined = combined.drop(columns=['hosp_last_hosp_date', 'hosp_next_hosp_date'])

In [None]:
vtl_cols = [col for col in combined.columns if col.startswith('vtl')]
dx_cols = [col for col in combined.columns if col.startswith('dx')]
med_cols = [col for col in combined.columns if col.startswith('med')]
order_cols = [col for col in combined.columns if col.startswith('order')]
alert_cols = [col for col in combined.columns if col.startswith('alert')]
hosp_cols = [col for col in combined.columns if col.startswith('hosp')]
ignore_cols = [col for col in combined.columns if 'target' in col] + ['masterpatientid', 'patientid','censusdate', 'facilityid', 'bedid']

In [None]:
def add_datepart(df, fldname, drop=True, time=False, errors="raise"):
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True, errors=errors)
    attr = ['Year','Month', 'Week', 'Day', 'Dayofweek',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[fldname + '_' + n] = getattr(fld.dt, n.lower())
    if drop: df.drop(fldname, axis=1, inplace=True)
        
def add_na_indicators(df, ignore_cols):
    missings = df.drop(columns=ignore_cols).isna()
    missings.columns = 'na_indictator_' + missings.columns
    missings_sums = missings.sum()
    
    return pd.concat([df, missings.loc[:, (missings_sums > 0)]], axis=1)

        
def proc_vitals(df, vtl_cols):
    ffilled = df.groupby('masterpatientid')[vtl_cols].fillna(method='ffill').reset_index()
    ffilled['masterpatientid'] = df.masterpatientid
    
    diff_1_day = ffilled.groupby('masterpatientid')[vtl_cols].diff()
    diff_1_day.columns = 'diff_1_day_' + diff_1_day.columns
    
    diff_7_day = ffilled.groupby('masterpatientid')[vtl_cols].diff(periods=7)
    diff_7_day.columns = 'diff_7_day_' + diff_7_day.columns
    
    rolling_avg_7_day = ffilled.groupby('masterpatientid')[vtl_cols].rolling(7, min_periods=1).mean().reset_index(0, drop=True)
    rolling_avg_7_day.columns = 'rol_avg_7_day_' + rolling_avg_7_day.columns
    
    rolling_avg_14_day = ffilled.groupby('masterpatientid')[vtl_cols].rolling(14, min_periods=1).mean().reset_index(0, drop=True)
    rolling_avg_14_day.columns = 'rol_avg_14_day_' + rolling_avg_14_day.columns
    
    rolling_std_7_day = ffilled.groupby('masterpatientid')[vtl_cols].rolling(7, min_periods=1).std().reset_index(0, drop=True)
    rolling_std_7_day.columns = 'rol_std_7_day_' + rolling_std_7_day.columns
    
    rolling_std_14_day = ffilled.groupby('masterpatientid')[vtl_cols].rolling(14, min_periods=1).std().reset_index(0, drop=True)
    rolling_std_14_day.columns = 'rol_std_14_day_' + rolling_std_14_day.columns
    
    df.loc[:,vtl_cols] = ffilled.loc[:, vtl_cols]
    
    df = pd.concat([df, diff_1_day, diff_7_day], axis=1) # diffs all indexed the same as original in the same order
    
    rollings = pd.concat([rolling_avg_7_day, rolling_avg_14_day, rolling_std_7_day, rolling_std_14_day], axis=1)
    df = df.merge(rollings, how='left', left_index=True, right_index=True) # rollings were sorted so we explictly join via index
    
    return df
    
def proc_dx_meds_alerts_orders(df, dx_cols, med_cols, alert_cols, order_cols):
    cols = dx_cols + med_cols + alert_cols + order_cols
    filled = df.groupby('masterpatientid')[cols].fillna(0).reset_index()
    filled['masterpatientid'] = df.masterpatientid
    
    cumsum_all_time = filled.groupby('masterpatientid')[cols].cumsum()
    cumsum_all_time.columns = 'cumsum_all_' + cumsum_all_time.columns
    
    cumsum_7_day = filled.groupby('masterpatientid')[cols].rolling(7, min_periods=1).sum().reset_index(0, drop=True)
    cumsum_7_day.columns = 'cumsum_7_day_' + cumsum_7_day.columns
    
    cumsum_15_day = filled.groupby('masterpatientid')[cols].rolling(15, min_periods=1).sum().reset_index(0, drop=True)
    cumsum_15_day.columns = 'cumsum_15_day_' + cumsum_15_day.columns
    
    #cumsum_30_day = filled.groupby('masterpatientid')[cols].rolling(30, min_periods=1).sum().reset_index(0, drop=True)
    #cumsum_30_day.columns = 'cumsum_30_day_' + cumsum_30_day.columns
    
    df = df.drop(columns=cols)
    df = pd.concat([df, cumsum_all_time], axis=1) # cumsum is indexed the same as original in the same order
    
    rollings = pd.concat([cumsum_7_day, cumsum_15_day], axis=1)
    df = df.merge(rollings, how='left', left_index=True, right_index=True) # rollings were sorted so we explictly join via index
    
    return df

def proc_demo(df):
    df['demo_gender'] = df.gender == 'M'
    df['demo_age_in_days'] = (df.censusdate - df.dateofbirth).dt.days
    df = pd.concat([df.drop(columns='primarylanguage'), pd.get_dummies(df.primarylanguage, prefix='demo_primarylanguage')], axis=1)
    df = pd.concat([df.drop(columns='carelevelcode'), pd.get_dummies(df.carelevelcode, prefix='demo_carelevel')], axis=1)
    df = pd.concat([df.drop(columns='race'), pd.get_dummies(df.race, prefix='demo_race')], axis=1)
    df = pd.concat([df.drop(columns='education'), pd.get_dummies(df.education, prefix='demo_education')], axis=1)
    df = pd.concat([df.drop(columns='religion'), pd.get_dummies(df.religion, prefix='demo_religion')], axis=1)
    df = pd.concat([df.drop(columns='facilityid'), pd.get_dummies(df.facilityid, prefix='demo_facility')], axis=1)
    
    return df
    
    

In [None]:
combined = add_na_indicators(combined, ignore_cols)

In [None]:
add_datepart(combined, 'censusdate', drop=False)

In [None]:
add_datepart(combined, 'dateofbirth', drop=False)

In [None]:
combined = proc_vitals(combined, vtl_cols)

In [None]:
combined = proc_dx_meds_alerts_orders(combined, dx_cols, med_cols, alert_cols, order_cols)

In [None]:
combined.to_parquet(processed_path/'after_vtl_and_cumsums.parquet')

In [None]:
combined = pd.read_parquet(processed_path/'after_vtl_and_cumsums.parquet')

In [None]:
combined = proc_demo(combined)

In [None]:
combined = combined.drop_duplicates(subset=['masterpatientid', 'censusdate'])

In [None]:
combined.to_parquet(processed_path/'final_processed.parquet')

In [None]:
processed_path = Path('/code/data/processed')
final = pd.read_parquet(processed_path/'final_processed.parquet')

In [None]:
drop_cols = ['bedid', 'beddescription', 'roomratetypedescription', 'payercode', 'patientid', 
             'gender', 'dateofbirth', 'citizenship', 'state', 'hosp_target_3_day_hosp']

In [None]:
final = final.drop(columns=drop_cols)

In [None]:
final = final.reset_index(drop=True)

In [None]:
# manual check to make sure we're not including any columns that could leak data
with open('/code/columns.txt','w') as f:
    for col in final.columns:
        f.write(col + '\n')

In [None]:
split_day = final.loc[:,'censusdate'].iloc[round(final.shape[0] * (1-.2))]

In [None]:
train = final.loc[final.censusdate <= split_day]
valid = final.loc[final.censusdate > split_day]

train.to_pickle(processed_path/'train.pickle')
valid.to_pickle(processed_path/'valid.pickle')

In [18]:
processed_path = Path('/code/data/processed')
train = pd.read_pickle(processed_path/'train.pickle')
valid = pd.read_pickle(processed_path/'valid.pickle')

In [None]:
#train_mask = train['cumsum_all_dx_Diseases of the respiratory system - Chronic obstructive pulmonary disease and bronchiectasis [127.]'] > 0
#valid_mask = valid['cumsum_all_dx_Diseases of the respiratory system - Chronic obstructive pulmonary disease and bronchiectasis [127.]'] > 0

In [None]:
#print(f'{len(train[train_mask].masterpatientid.unique())} patients out of {len(train.masterpatientid.unique())} have COPD associated with them')

In [None]:
#train_patients = train[train_mask].masterpatientid.unique()
#valid_patients = valid[valid_mask].masterpatientid.unique()

In [None]:
#train = train[train.masterpatientid.isin(train_patients)]
#valid = valid[valid.masterpatientid.isin(valid_patients)]

In [20]:
def fill_na_train(df):
    has_na = df.isna().sum() > 0
    d = df.loc[:, has_na].median()
    df = df.fillna(d)
    
    return df, d

def fill_na_valid(df, na_filler):
    return df.fillna(na_filler)

In [21]:
# fill in any remaining na's - now that we're not forwardfilling past info it's not correct to use a global imputation
# hence we impute on the train and apply to the valid
train, na_filler = fill_na_train(train)
valid = fill_na_valid(valid, na_filler)

In [22]:
train.to_pickle(processed_path/'train_filled.pickle')
valid.to_pickle(processed_path/'valid_filled.pickle')

In [2]:
processed_path = Path('/code/data/processed')
train = pd.read_pickle(processed_path/'train_filled.pickle')
valid = pd.read_pickle(processed_path/'valid_filled.pickle')

In [23]:
train.censusdate.min(), train.censusdate.max(), train.hosp_target_3_day_hosp.mean()

(Timestamp('2017-01-01 00:00:00'),
 Timestamp('2018-06-17 00:00:00'),
 0.01831044447422798)

In [24]:
valid.censusdate.min(), valid.censusdate.max(), valid.hosp_target_3_day_hosp.mean()

(Timestamp('2018-06-18 00:00:00'),
 Timestamp('2019-02-28 00:00:00'),
 0.017582460000386005)

In [25]:
def prep(df):
    drop_cols = ['censusdate', 'masterpatientid']
    drop_cols = drop_cols + [col for col in df.columns if 'target' in col]

    y = df.hosp_target_3_day_hosp.astype('float32').values
    x = df.drop(columns=drop_cols).reset_index(drop=True).astype('float32')
    idens = df.loc[:,['masterpatientid','censusdate']]
    
    return x, y, idens

In [26]:
train_x, train_y, train_idens = prep(train)
valid_x, valid_y, valid_idens = prep(valid)

In [2]:
import pickle;
with open('/code/data/processed/train_x.pickle','wb') as f: pickle.dump(train_x, f, protocol=4)
with open('/code/data/processed/train_y.pickle','wb') as f: pickle.dump(train_y, f, protocol=4)
with open('/code/data/processed/train_idens.pickle','wb') as f: pickle.dump(train_idens, f, protocol=4)
with open('/code/data/processed/valid_x.pickle','wb') as f: pickle.dump(valid_x, f, protocol=4)
with open('/code/data/processed/valid_y.pickle','wb') as f: pickle.dump(valid_y, f, protocol=4)
with open('/code/data/processed/valid_idens.pickle','wb') as f: pickle.dump(valid_idens, f, protocol=4)

NameError: name 'train_x' is not defined

In [4]:
import pickle;
with open('/code/data/processed/copd_train_x.pickle','rb') as f: train_x = pickle.load(f)
with open('/code/data/processed/copd_train_y.pickle','rb') as f: train_y = pickle.load(f)
with open('/code/data/processed/copd_train_idens.pickle','rb') as f: train_idens = pickle.load(f)
with open('/code/data/processed/copd_valid_x.pickle','rb') as f: valid_x = pickle.load(f)
with open('/code/data/processed/copd_valid_y.pickle','rb') as f: valid_y =pickle.load(f)
with open('/code/data/processed/copd_valid_idens.pickle','rb') as f: valid_idens =pickle.load(f)

In [5]:
from sklearn.neural_network import MLPClassifier

In [30]:
param_grid = {
    'alpha':[0.1, 0.2]
}

In [6]:
mlflow.set_experiment('target_hosp_3_day')

In [15]:
grid = ParameterGrid(param_grid)

In [24]:
config = {
    'hidden_layer_sizes': (1000, 500, 300, 300),
    'learning_rate': 'adaptive',
    'early_stopping': True,
    'batch_size': 64,
    'learning_rate_init': 0.0001
}

In [15]:
from sklearn.preprocessing import StandardScaler

In [18]:
scaler = StandardScaler()

In [19]:
scaled_train_x = scaler.fit_transform(train_x)

In [20]:
scaled_valid_x = scaler.transform(valid_x)

In [25]:
with mlflow.start_run():
    clf = MLPClassifier(verbose=True, **config)
    clf.fit(scaled_train_x, train_y)

    train_preds = clf.predict(scaled_train_x)
    valid_preds = clf.predict(scaled_valid_x)

    log_param('model_type', 'MLPClassifier')
    for param in config:
        log_param(param, config[param])

#     log_metric('valid_aucroc', roc_auc_score(valid_y, valid_preds))
#     log_metric('valid_ap', average_precision_score(valid_y, valid_preds))

    print(sum(valid_preds))

    log_metric('valid_aucroc', roc_auc_score(valid_y, valid_preds))
    log_metric('valid_ap', average_precision_score(valid_y, valid_preds))

    log_model(clf, 'model')

    input_features = pd.DataFrame(train_x.columns, columns=['feature'])
    input_features.to_csv('./input_features.csv', index=False)
    log_artifact('./input_features.csv')

Iteration 1, loss = 0.14320186
Validation score: 0.964580
Iteration 2, loss = 0.10194296
Validation score: 0.965830
Iteration 3, loss = 0.07324998
Validation score: 0.966914
Iteration 4, loss = 0.05669433
Validation score: 0.969081
Iteration 5, loss = 0.04762364
Validation score: 0.970664
Iteration 6, loss = 0.04123001
Validation score: 0.971331
Iteration 7, loss = 0.03625496
Validation score: 0.971914
Iteration 8, loss = 0.03201126
Validation score: 0.972748
Iteration 9, loss = 0.02860583
Validation score: 0.974165
Iteration 10, loss = 0.02556555
Validation score: 0.974498
Iteration 11, loss = 0.02425493
Validation score: 0.973498
Iteration 12, loss = 0.02244578
Validation score: 0.974081
Iteration 13, loss = 0.02058664
Validation score: 0.975831
Iteration 14, loss = 0.01971628
Validation score: 0.976748
Iteration 15, loss = 0.01764790
Validation score: 0.976498
Iteration 16, loss = 0.01747438
Validation score: 0.973081
Iteration 17, loss = 0.01603371
Validation score: 0.975331
Iterat

In [27]:
roc_auc_score(valid_y, valid_preds)

0.507684983183347

In [31]:
for config in ParameterGrid(param_grid):
    print(f'Trying hyperparamters: {config}')
    
    with mlflow.start_run():
        reg = linear_model.Lasso(max_iter=1000, fit_intercept=True, tol=0.001, **config)
        reg.fit(train_x, train_y)
        
        train_preds = reg.predict(train_x)
        valid_preds = reg.predict(valid_x)
        
        log_param('model_type', 'Lasso')
        for param in config:
            log_param(param, config[param])
            
        log_metric('valid_aucroc', roc_auc_score(valid_y, valid_preds))
        log_metric('valid_ap', average_precision_score(valid_y, valid_preds))
        
        log_metric('valid_aucroc', roc_auc_score(valid_y, [pred[1] for pred in valid_preds]))
        log_metric('valid_ap', average_precision_score(valid_y, [pred[1] for pred in valid_preds]))
        
        log_model(reg, 'model')
        
        input_features = pd.DataFrame(train_x.columns, columns=['feature'])
        input_features.to_csv('./input_features.csv', index=False)
        log_artifact('./input_features.csv')
        
#         with open('./na_filler.pickle','wb') as f: pickle.dump(na_filler, f, protocol=4)
#         log_artifact('./na_filler.pickle')

    

Trying hyperparamters: {'alpha': 0.1}
Trying hyperparamters: {'alpha': 0.2}


In [None]:
reg = linear_model.Lasso(alpha=0.1)
reg.fit([[0, 0], [1, 1]], [0, 1])  
Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)



In [35]:
param_grid = {
    'n_estimators':[100],
    'feat_select_threshold': ['64*median'],
    'max_features': ['auto'], 
    'min_samples_leaf': [200], 
    'class_weight': [None],
}

In [36]:
mlflow.set_experiment('target_hosp_3_day')

In [37]:
for config in ParameterGrid(param_grid):
    print(f'Trying hyperparamters: {config}')
    
    with mlflow.start_run():
        feat_est = forest.RandomForestClassifier(
            n_estimators=config['n_estimators'],
            max_features=config['max_features'],
            min_samples_leaf=config['min_samples_leaf'],
            class_weight=config['class_weight'],
            n_jobs=-1,
            verbose=3
        )

        feat_selector = SelectFromModel(feat_est, threshold=config['feat_select_threshold'])
        train_x_new = feat_selector.fit_transform(train_x, train_y)

        clf = forest.RandomForestClassifier(
            n_estimators=config['n_estimators'],
            max_features=config['max_features'],
            min_samples_leaf=config['min_samples_leaf'],
            class_weight=config['class_weight'],
            n_jobs=-1,
            verbose=3
        )

        clf.fit(train_x_new, train_y)

        valid_x_new = feat_selector.transform(valid_x)

        train_preds = clf.predict_proba(train_x_new)
        valid_preds = clf.predict_proba(valid_x_new)

        for param in config:
            log_param(param, config[param])

        #log_metric('train_aucroc', roc_auc_score(train_y, [pred[1] for pred in train_preds]))
        #log_metric('train_ap', average_precision_score(train_y, [pred[1] for pred in train_preds]))
        log_metric('valid_aucroc', roc_auc_score(valid_y, [pred[1] for pred in valid_preds]))
        log_metric('valid_ap', average_precision_score(valid_y, [pred[1] for pred in valid_preds]))

        log_model(feat_selector, 'feat_selector')
        log_model(clf, "model")

        feature_selected_features = pd.DataFrame(zip(train_x.columns[feat_selector.get_support()], clf.feature_importances_), columns=['feature', 'rf_importance']).sort_values('rf_importance', ascending=False)
        feature_selected_features.to_csv('./feature_selected_features.csv', index=False)
        log_artifact('./feature_selected_features.csv')

        input_features = pd.DataFrame(train_x.columns, columns=['feature'])
        input_features.to_csv('./input_features.csv', index=False)
        log_artifact('./input_features.csv')
        
        with open('./na_filler.pickle','wb') as f: pickle.dump(na_filler, f, protocol=4)
        log_artifact('./na_filler.pickle')
        
        

Trying hyperparamters: {'class_weight': None, 'feat_select_threshold': '64*median', 'max_features': 'auto', 'min_samples_leaf': 200, 'n_estimators': 100}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100

building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100building tree 12 of 100

building tree 13 of 100
building tree 14 of 100building tree 15 of 100
building tree 16 of 100

building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.9min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100building tree 12 of 100

building tree 13 of 100building tree 14 of 100

building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
b

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.7min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   10.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    2.4s finished


In [None]:
feat_est = forest.RandomForestClassifier(
    n_estimators=1000,
    max_features='auto',
    min_samples_leaf=200,
    class_weight=None,
    n_jobs=-1
)

feat_selector = SelectFromModel(feat_est, threshold='32*median')
train_x_new = feat_selector.fit_transform(train_x, train_y)

clf = forest.RandomForestClassifier(
    n_estimators=1000,
    max_features='auto',
    min_samples_leaf=200,
    class_weight=None,
    n_jobs=-1
)

clf.fit(train_x_new, train_y)

valid_x_new = feat_selector.transform(valid_x)
valid_preds = clf.predict_proba(valid_x_new)

In [None]:
average_precision_score(valid_y, [pred[1] for pred in valid_preds])

In [None]:
valid_x_shap = pd.concat([valid_x.loc[:, feat_selector.get_support()].reset_index(drop=True), valid_idens.reset_index(drop=True)], axis=1)
valid_x_shap['preds'] = [pred[1] for pred in valid_preds]
valid_x_shap['target'] = valid_y

In [None]:
valid_x_shap = valid_x_shap[valid_x_shap.censusdate == pd.to_datetime('2019-02-20')]

In [None]:
sorted_valid = valid_x_shap.sort_values(by='preds', ascending=False)
sorted_valid_dr = sorted_valid.drop(columns=['preds', 'masterpatientid', 'censusdate'])
idens = sorted_valid.loc[:, ['masterpatientid', 'censusdate', 'preds', 'target']]

In [None]:
tt = sorted_valid_dr.head(10)

In [None]:
explainer = shap.TreeExplainer(clf)

In [None]:
shap_values = explainer.shap_values(tt)

In [None]:
shap.force_plot(explainer.expected_value[1], shap_values[1][0], tt.iloc[0])

In [None]:
out = []

for i in range(0,10):
    shaps = pd.DataFrame({'feature_name':tt.columns, 'shap_value': shap_values[1][i], 'feature_value': tt.iloc[i]}).sort_values(by='shap_value', ascending=False)
    shaps = shaps.head(n=10)
    shaps['masterpatientid'] = hash(str(idens.iloc[i].masterpatientid))
    shaps['censusdate'] = idens.iloc[i].censusdate
    shaps['prediction'] = idens.iloc[i].preds
    shaps['rehosped'] = idens.iloc[i].target
    out.append(shaps)
    

In [None]:
pd.concat(out).to_csv('/code/data/copd_model_2019-02-20.csv', index=False)

In [None]:
pd.DataFrame({'feature_name':tt.columns, 'shap_value': shap_values[1][0], 'feature_value': tt.iloc[0]}).sort_values(by='shap_value', ascending=False)