# SnorkelMED - Identifying Opioid-Induced Respiratory Depression  

The purpose of this analysis is to probabilistically identify which patient visits included an opioid-induced respiratory depression (OIRD) event. 

In [None]:
%matplotlib inline
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter, LabelModel
from snorkel.analysis import get_label_buckets

# Two aggregators for the labeling-function votes; both are scored on the
# dev set further down so they can be compared.
majority_model = MajorityLabelVoter()
# cardinality=2: the two non-abstain labels are CONTROL and CASE
label_model = LabelModel(cardinality=2, verbose=True)

import helper as hlp
import importlib
importlib.reload(hlp)

# Label values returned by every labeling function in this notebook.
ABSTAIN = -1   # the LF casts no vote for this visit
CONTROL = 0    # vote: no OIRD event
CASE = 1       # vote: OIRD event

In [None]:
# load raw & aggregated (grouped) data
# (presumably `df` is the raw frame and `dfg` the grouped one -- confirm in helper.load_data)
df, dfg = hlp.load_data()

# add numeric data
dfg = hlp.add_numeric_data(dfg)

# train/dev/valid/test split
df_train, df_dev, df_valid, df_test = hlp.make_splits(dfg)

In [None]:
# Import the labels assigned during manual chart review.
df_dev_labeled = pd.read_csv('./dev_set_labeled.csv')

# Attach the chart-review label to each dev visit (left join: every dev row
# is kept, visits without a reviewed label get NaN).
# Any 'label' column left over from a previous run of this cell is dropped
# first; without that, re-running the cell would produce 'label_x'/'label_y'
# columns and the later df_dev['label'] lookup would fail.
df_dev = (
    df_dev
    .drop(columns='label', errors='ignore')
    .merge(df_dev_labeled[['visit_occurrence_id', 'label']],
           on='visit_occurrence_id', how='left')
)

In [None]:
# Gold labels for the dev set as an integer array, for ease of evaluation.
# 'case' maps to CASE (1); every other value -- including a missing label --
# maps to CONTROL (0).
Y_dev = np.where(df_dev['label'].values == 'case', CASE, CONTROL)

#Y_valid = df_valid['label'].values

# Round 1 - Attempt to Expand Dev & Valid Sets

Even though I attempted to oversample cases when creating the development & validation sets, manual review found only 2 actual positive cases in the development set. Therefore, let's start with what I learned from reviewing those 50 encounters: apply labeling functions (LFs) to the training set and use the most likely encounters to enrich the development & validation sets.

In [None]:
# Registry of LF name -> column position in the label matrix, so a rule's
# votes can be looked up by name later on.
lfd = {}

@labeling_function()
def LF_naloxone_admin(x):
    """Vote on a visit from its naloxone-administration probability.

    Returns CASE when ``naloxone_admin_prob`` is at least 0.75, CONTROL when
    it is below 0.75, and ABSTAIN when neither comparison holds (the value
    is missing).
    """
    prob = x['naloxone_admin_prob']
    if prob >= 0.75:
        return CASE
    if prob < 0.75:
        return CONTROL
    # reached only when both comparisons are False, i.e. the value is missing
    return ABSTAIN
lfd['LF_naloxone_admin'] = 0
    
@labeling_function()
def LF_respiratory_failure_any(x):
    """Vote CASE when the ``respiratory_failure_any`` text contains '1', else CONTROL.

    This rule never abstains.
    """
    flag_text = x['respiratory_failure_any'].lower()
    return CASE if '1' in flag_text else CONTROL
lfd['LF_respiratory_failure_any'] = max(lfd.values()) + 1

@labeling_function()
def LF_eligible_vent(x):
    """Vote CASE when ``eligible_vent`` contains 'yes' (case-insensitive); otherwise abstain."""
    vent_text = x['eligible_vent'].lower()
    return CASE if 'yes' in vent_text else ABSTAIN
lfd['LF_eligible_vent'] = max(lfd.values()) + 1

@labeling_function()
def LF_counts_naloxone(x):
    """Vote CASE when ``counts_naloxone`` is positive; otherwise abstain."""
    naloxone_count = x['counts_naloxone']
    return CASE if naloxone_count > 0 else ABSTAIN
lfd['LF_counts_naloxone'] = max(lfd.values()) + 1

@labeling_function()
def LF_counts_resp_care_notes(x):
    """Vote CONTROL when ``counts_resp_care_notes`` equals zero; otherwise abstain."""
    note_count = x['counts_resp_care_notes']
    return CONTROL if note_count == 0 else ABSTAIN
lfd['LF_counts_resp_care_notes'] = max(lfd.values()) + 1

@labeling_function()
def LF_counts_rapid_response(x):
    """Vote CASE when ``counts_rapid_response`` is positive, else CONTROL.

    This rule never abstains.
    """
    rapid_response_count = x['counts_rapid_response']
    return CASE if rapid_response_count > 0 else CONTROL
lfd['LF_counts_rapid_response'] = max(lfd.values()) + 1

In [None]:
# combine all relevant LFs
# NOTE: the order here must match the positions recorded in `lfd`, because
# `lfd` is used below to pick columns out of the label matrices by LF name.
lfs = [LF_naloxone_admin,
      LF_respiratory_failure_any,
      LF_eligible_vent,
      LF_counts_naloxone,
      LF_counts_resp_care_notes,
      LF_counts_rapid_response]

# apply LFs
# one label matrix per split: a row per visit, a column per LF in `lfs`
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_dev = applier.apply(df=df_dev)

In [None]:
# per-LF summary on the training split (no gold labels are passed here)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

In [None]:
# per-LF summary on the dev split, evaluated against the chart-review labels in Y_dev
LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

### Further Exploration

In [None]:
# training visits on which LF_counts_rapid_response voted CASE
# (boolean mask over label-matrix rows, applied positionally to df_train)
df_train.iloc[L_train[:, lfd['LF_counts_rapid_response']] == CASE]

In [None]:
# dev visits on which LF_naloxone_admin voted CASE
df_dev.iloc[L_dev[:, lfd['LF_naloxone_admin']] == CASE]

### Conflicts

In [None]:
# Dev visits where the two rules disagree: respiratory-failure voted CASE
# while rapid-response voted CONTROL.
resp_failure_votes = L_dev[:, lfd['LF_respiratory_failure_any']]
rapid_response_votes = L_dev[:, lfd['LF_counts_rapid_response']]
buckets = get_label_buckets(resp_failure_votes, rapid_response_votes)
df_dev.iloc[buckets[(CASE, CONTROL)]]

At this point, we're only getting rid of `eligible_vent` because it was wrong in every instance. We might add it back in later. 

In [None]:
# Rebuild the list without LF_eligible_vent rather than calling lfs.remove():
# remove() raises ValueError when this cell is run a second time, whereas the
# filter below gives the same result no matter how often it runs.
# NOTE: the column positions stored in `lfd` describe the label matrices built
# with the full LF list; they do not line up with matrices built from this
# shortened list.
lfs = [lf for lf in lfs if lf is not LF_eligible_vent]
lfs

In [None]:
# re-apply the (now shortened) LF list so the label matrices match `lfs` again
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_dev = applier.apply(df=df_dev)

## Voting

In [None]:
# Smoke test: make sure the label model trains end to end with one set of
# hyperparameters before tuning. (l2 = 0.01 and prec_init = 0.7 were tried
# and are currently disabled.)
label_model.fit(
    L_train=L_train,
    Y_dev=Y_dev,
    n_epochs=4000,
    lr=0.004,
    optimizer='adamax',
    lr_scheduler='step',
    log_freq=100,
    seed=987,
)

# LF summary including the learned weights; its index is reused by the tuning cell
initial_weights = label_model.get_weights()
analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary(est_weights=initial_weights)

### Hyper-Parameter Tuning

In [None]:
# Column names for the tuning results: the three hyperparameters followed by
# one column per entry in the LF summary index (the learned weight per rule).
df_cols = ['n_epochs', 'lr', 'lr_scheduler', *analysis.index]

# candidate hyperparameter values
n_epochs = [2000, 4000]
lr = [0.001, 0.005, 0.01]
lr_scheduler = ['step', 'exponential', 'linear']

# run the tuning helper over the candidate values
df_tune, df_tune_long = hlp.label_model_tuning(
    lfs, df_cols,
    L_train, L_dev, Y_dev,
    n_epochs, lr, lr_scheduler,
)

In [None]:
# hyperparameter combinations that reach the highest dev-set accuracy
best_accuracy = df_tune['accuracy'].max()
df_tune[df_tune['accuracy'] == best_accuracy]

In [None]:
# One facet grid per scheduler: a panel per learning rate, a colour per LF,
# learned weight plotted against number of epochs.
for scheduler in lr_scheduler:
    scheduler_rows = df_tune_long[df_tune_long['lr_scheduler'] == scheduler]
    g = sns.FacetGrid(scheduler_rows, col='lr', hue='learning_function',
                      col_wrap=3, height=4)
    g.map(plt.scatter, 'n_epochs', 'learned_weight')
    g.add_legend()
    # y > 1 places the title above the top row of panels
    g.fig.suptitle('Learned Weights Using ' + str(scheduler) + ' Scheduler',
                   y=1.05, fontsize=16)

Evaluation: `counts_resp_care_notes` and `counts_rapid_response` were fairly accurate & had good coverage. `naloxone_admin` was also pretty accurate & makes sense as being important. 

In [None]:
# refit the label model with the hyperparameters chosen after tuning
label_model.fit(
    L_train=L_train,
    Y_dev=Y_dev,
    n_epochs=2000,
    lr=0.01,
    optimizer='adamax',
    lr_scheduler='step',
    log_freq=100,
    seed=987,
)

In [None]:
# LF summary with the refit model's weights, heaviest rule first
(
    LFAnalysis(L=L_train, lfs=lfs)
    .lf_summary(est_weights=label_model.get_weights())
    .sort_values(by='Learned Weight', ascending=False)
)

In [None]:
# dev-set accuracy of each aggregator against the chart-review labels
majority_metrics = majority_model.score(L=L_dev, Y=Y_dev)
majority_acc = majority_metrics["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_metrics = label_model.score(L=L_dev, Y=Y_dev)
label_model_acc = label_model_metrics["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

In [None]:
# assign probabilities from either majority vote or label model
# (the majority-vote alternative is kept below, disabled, for easy switching)
#gen_probs_train = majority_model.predict_proba(L=L_train)
#gen_probs_dev = majority_model.predict_proba(L=L_dev)

# active choice: the label model; column CASE of each result is read downstream
gen_probs_train = label_model.predict_proba(L=L_train)
gen_probs_dev = label_model.predict_proba(L=L_dev)

In [None]:
# histogram of the CASE-column probabilities for the training split
hlp.plot_probabilities_histogram(gen_probs_train[:, CASE])

In [None]:
# histogram of the CASE-column probabilities for the dev split
hlp.plot_probabilities_histogram(gen_probs_dev[:, CASE])

In [None]:
# Copy of the training frame with the label model's CASE probability attached;
# df_train itself is left untouched.
train_with_probs = df_train.assign(label_model_prob=gen_probs_train[:, CASE])

In [None]:
# Move the training visits the label model considers most likely to be cases
# into the dev and valid sets, so that chart review of those sets sees more
# positives.
N_TOP_VISITS = 20    # how many highest-probability training visits to promote
DEV_FRACTION = 0.5   # share of the promoted visits sent to dev (rest go to valid)
SPLIT_SEED = 123     # fixed seed so the dev/valid assignment is reproducible

# extract the highest-probability visits from the train set
top_probs = train_with_probs.nlargest(n=N_TOP_VISITS, columns='label_model_prob')
top_visit_ids = top_probs['visit_occurrence_id']

# send a random share to the dev set & the remainder to the valid set
visits_for_dev = top_visit_ids.sample(frac=DEV_FRACTION, random_state=SPLIT_SEED)
visits_for_valid = top_visit_ids[~top_visit_ids.isin(visits_for_dev)]

# concatenate the respective training set rows to dev & valid sets
# (sort=True orders the combined columns; df_dev may carry columns df_train lacks)
df_dev2 = pd.concat([df_dev, df_train[df_train['visit_occurrence_id'].isin(visits_for_dev)]], sort=True)
df_valid2 = pd.concat([df_valid, df_train[df_train['visit_occurrence_id'].isin(visits_for_valid)]], sort=True)

# remove the rows from the training set
df_train2 = df_train.drop(top_probs.index)

# Sanity checks. Row counts are compared with the actual number of ids sent to
# each set rather than 0.5 * n, so the checks stay exact integers and remain
# valid for an odd number of promoted visits or a different DEV_FRACTION.
assert len(visits_for_dev) + len(visits_for_valid) == top_probs.shape[0]
assert not visits_for_dev.isin(visits_for_valid).any()
assert df_dev2.shape[0] == df_dev.shape[0] + len(visits_for_dev)
assert df_valid2.shape[0] == df_valid.shape[0] + len(visits_for_valid)
assert df_train2.shape[0] == df_train.shape[0] - top_probs.shape[0]
assert not np.isin(df_train2['visit_occurrence_id'], df_dev2['visit_occurrence_id']).any()
assert not np.isin(df_train2['visit_occurrence_id'], df_valid2['visit_occurrence_id']).any()

In [None]:
# for manual chart review, attach new cases to the dev (and/or valid set)

#df_dev2_labeled = pd.concat([df_dev_labeled, 
#                             df_train[df_train['visit_occurrence_id'].isin(visits_for_dev)]], 
#                            sort=True)

#df_valid2_labeled = pd.concat([df_valid_labeled, 
#                               df_train[df_train['visit_occurrence_id'].isin(visits_for_valid)]], 
#                            sort=True)

# export
#df_train2.to_csv('./train_set2.csv', index=False)
#df_dev2_labeled.to_csv('./dev_set2_labeled.csv', index=False)
#df_valid2_labeled.to_csv('./valid_set2_labeled.csv', index=False)