*Purpose:* Questions surfaced about how a discriminative model could perform better than the generative model. Therefore, I'm exploring whether some noise awareness in our models of choice is responsible. This includes testing:  
- Purely Generative labels without any information from Development Set adjudication  
- A non-weighted random forest classifier so that noise isn't accounted for  

This notebook is a duplicate of the original discriminative model notebook with many cells deleted & only a few added.  

In [None]:
%matplotlib inline
import os
import re
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, cross_validate
from sklearn import model_selection, metrics
import matplotlib.pyplot as plt
import seaborn as sns

# models
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# snorkel
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis, filter_unlabeled_dataframe
from snorkel.labeling.model import MajorityLabelVoter, LabelModel
from snorkel.analysis import get_label_buckets, metric_score
from snorkel.utils import probs_to_preds

# placeholder (untrained) Snorkel models; both names are re-bound to the
#    pickled models when the saved analysis data is loaded further down
majority_model = MajorityLabelVoter()
label_model = LabelModel(cardinality=2, verbose=True)

import helper as hlp
import importlib
importlib.reload(hlp)

# global variables
# label values; CASE doubles as the column index into predict_proba output
ABSTAIN = -1
CONTROL = 0
CASE = 1

# passed as random_state to every random forest in this notebook
SEED = 987

In [None]:
# load the saved analysis objects; they were pickled as one tuple in this order:
#   (df_train_dev, df_valid, df_test, Y_dev, Y_valid,
#    L_train, L_dev, L_train_dev, L_valid,
#    label_model, majority_model, L_test)
# NOTE: unpickling can execute arbitrary code - only load files created by this project
with open('./data_for_analysis.pkl', 'rb') as f:
    data = pickle.load(f)

# unpack positionally (this replaces the default models created in the setup cell)
(df_train_dev, df_valid, df_test,
 Y_dev, Y_valid,
 L_train, L_dev, L_train_dev, L_valid,
 label_model, majority_model,
 L_test) = data[:12]

# Prepare Model Features

## Outcome  
Although we're primarily depending on the Generative model for labels, we can still leverage our manually adjudicated labels wherever they exist - something is better than nothing, right? 

In [None]:
# probability of being a case, according to the generative label model
label_model_probs = label_model.predict_proba(L_train_dev)[:, CASE]

# offsets used in place of hard 0/1 for manually-adjudicated labels, chosen to be
#    more extreme than anything the label model produced:
#    - lower_limit sits 5% below the smallest observed probability
#    - upper_limit closes 95% of the gap between the largest observed probability and 1
min_prob = np.min(label_model_probs)
max_prob = np.max(label_model_probs)
lower_limit = 0.95 * min_prob
upper_limit = 0.95 * (1 - max_prob) + max_prob

In [None]:
# keep the raw generative-model probability next to the final outcome
df_train_dev['outcome_generative_model'] = label_model_probs

# rows without a manual label fall back to the generative model; adjudicated rows
#    get the offset limits (instead of exactly 0/1) for regression-based models
is_unlabeled = pd.isnull(df_train_dev['label'])
adjudicated_outcome = np.where(df_train_dev['label'] == 'case', upper_limit, lower_limit)
df_train_dev['outcome'] = np.where(is_unlabeled,
                                   df_train_dev['outcome_generative_model'],
                                   adjudicated_outcome)

# training targets: continuous outcome and its 0.5-thresholded binary version
y_train_probs = np.array(df_train_dev['outcome'])
y_train_preds = np.where(df_train_dev['outcome'] >= 0.5, 1, 0)

# generative-model output on the validation set (only used as FYI); the full
#    probability matrix is kept because probs_to_preds is called on it
y_valid_probs = label_model.predict_proba(L_valid)
y_valid_preds = probs_to_preds(y_valid_probs)

### FYI: Generative Model Performance on Validation Set

In [None]:
# set the figure size BEFORE plotting; setting it afterwards only affects later figures
sns.set(rc={'figure.figsize': (15, 5)})

# one point per validation record: x = rounded generative-model probability,
#    y = running count within that probability (stacks ties vertically)
# (named eval_df so that the builtin `eval` is not shadowed)
eval_df = pd.DataFrame({'predicted': np.round(y_valid_probs[:, CASE], 2),
                        'actual': np.where(Y_valid == 0, 'Control', 'Case')})
eval_df = eval_df.sort_values(by=['predicted', 'actual'])
eval_df = eval_df.assign(counts=eval_df.groupby(['predicted']).cumcount())

fig = sns.scatterplot(data=eval_df, x="predicted", y="counts",
                      hue=eval_df["actual"].tolist(), palette="colorblind", s=100)
plt.ylabel('Counts')
plt.xlabel('Predicted Value')
plt.legend(loc='upper center')

In [None]:
# precision / recall / F1 of the generative model's hard labels against Y_valid
print(metrics.classification_report(Y_valid, y_valid_preds, digits=3))

## Predictors  

For the discriminative model, we're keeping a generalizable set of features. We could depend on the previously-developed labeling functions, but one drawback is the amount of feature engineering that goes into them. Alternatively, we can start with the raw features, e.g., age, regular expression counts, etc. It might also be unwise to use the `naloxone_admin_prob` value because it was created with a previous Snorkel model. 

In [None]:
# pull in original naloxone administration info & only count "received" if "epic ip admin" or "hed" present
naloxone = pd.read_csv('../sd_structured/meds/naloxone/naloxone_exposure_pre.csv', sep='\t')
naloxone.columns = naloxone.columns.str.lower()

# collapse all visit day onto a single row
SEP = ';;'


def join_as_strings(values):
    """Join all values of a group into one string separated by ``SEP``."""
    return SEP.join(map(str, values))


# NOTE: the columns must be selected with a list (double brackets); selecting
#    several columns with a bare tuple is no longer supported by pandas
naloxone = (
    naloxone
    .groupby(['visit_occurrence_id', 'grid', 'label'])
    [['x_frequency', 'drug_source_value', 'x_doc_type', 'x_doc_stype']]
    .agg(join_as_strings)
    .reset_index()
)

# create binary indicator of whether naloxone received based on simple rule
naloxone['binary_naloxone_admin'] = np.where((naloxone['x_doc_type'].str.contains('HED')) |
                                             (naloxone['x_doc_type'].str.contains('EPIC IP ADMIN')),
                                             1, 0)

# attach to train/dev and validation sets
# validate='m:1' fails loudly if a visit matches several naloxone rows; such a match would
#    duplicate rows and misalign the features with the already-created outcome arrays
df_train_dev = df_train_dev.merge(naloxone[['visit_occurrence_id', 'binary_naloxone_admin']],
                                  how='left', on=['visit_occurrence_id'], validate='m:1')
df_valid = df_valid.merge(naloxone[['visit_occurrence_id', 'binary_naloxone_admin']],
                          how='left', on=['visit_occurrence_id'], validate='m:1')

In [None]:
def _add_binary_features(df):
    """Add the numeric ``binary_*`` model features to ``df`` and return the result.

    The first three columns are written onto ``df`` itself; the returned frame is
    the output of ``fillna`` and carries the remaining columns, so callers must use
    the return value.

    NOTE: the Test Set features must be encoded with the same rules.
    """
    # create numeric columns from string-based columns
    # na=False: a missing string would otherwise give NaN, which np.where treats as
    #    truthy and would silently code as 1
    df['binary_respiratory_failure_any'] = \
        np.where(df['respiratory_failure_any'].str.contains('1', na=False), 1, 0)
    df['binary_eligible_vent'] = \
        np.where(df['eligible_vent'].str.contains('Yes', na=False), 1, 0)

    # coerce only categorical column into binary
    df['binary_gender_female'] = np.where(df['gender'] == 'FEMALE', 1, 0)

    # replace missing values from naloxone join with "0"
    df = df.fillna(value={'binary_naloxone_admin': 0})

    # ICD conditions: anything other than 1 (including NaN) becomes 0
    for cond in ('cond_resp_failure', 'cond_sepsis', 'cond_cva',
                 'cond_resp_disease', 'cond_cv_disease'):
        df['binary_' + cond] = np.where(df[cond] == 1, 1, 0)

    return df


df_train_dev = _add_binary_features(df_train_dev)
df_valid = _add_binary_features(df_valid)

## Create Data Matrices  

In [None]:
# model features: age plus every column whose name contains "binary_" or "counts_"
#    ('naloxone_admin_prob' is deliberately left out)
all_columns = df_train_dev.columns
cols_binary = all_columns[all_columns.str.contains('binary_')]
cols_counts = all_columns[all_columns.str.contains('counts_')]

cols = ['age_on_admission', *cols_binary, *cols_counts]

In [None]:
# subset columns; some of the "counts" variables didn't have any results because those
#    patients didn't have charts, so missing values are imputed with 0
# (fillna is chained rather than applied in place, so no SettingWithCopyWarning is
#    raised for modifying a subset of the original dataframe)
X_train = df_train_dev[cols].fillna(0)
X_valid = df_valid[cols].fillna(0)

In [None]:
# scale data: the scaler is fit on the train/dev features only; the same fitted
#    scaler is then applied to the validation features (and re-used further down)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_valid = sc.transform(X_valid)

# Build Discriminative Model(s)

In [None]:
# sample weights that down-weight uncertain labels: the distance of the generative
#    probability from 0.5 (0 = completely uncertain, 0.5 = fully confident)
weights = (df_train_dev['outcome_generative_model'] - 0.5).abs()

In [None]:
# original best model (unchanged): Dev-Set-informed labels, fit with sample weights
best_rfcw = RandomForestClassifier(
    n_estimators=1000,
    max_depth=50,
    max_features=None,
    class_weight={0: 0.99, 1: 0.01},
    random_state=SEED,
)

best_rfcw.fit(X_train, y_train_preds, sample_weight=weights)

In [None]:
# sensitivity 1 - same labels & hyperparameters as the best model, but fit without
#    sample weights (class_weight is left unchanged)
rfc_unweighted = RandomForestClassifier(
    n_estimators=1000,
    max_depth=50,
    max_features=None,
    class_weight={0: 0.99, 1: 0.01},
    random_state=SEED,
)

rfc_unweighted.fit(X_train, y_train_preds)

In [None]:
# don't pull any information from manually-adjudicated Dev/Valid Sets:
#    binary labels come from thresholding the generative probability at 0.5 only
y_train_preds_gen = np.where(df_train_dev['outcome_generative_model'] >= 0.5, 1, 0)

In [None]:
# sensitivity 2 - Generative labels only (i.e., not Dev Set informed) & fit without sample weights
fully_generative_unweighted = RandomForestClassifier(
    n_estimators=1000,
    max_depth=50,
    max_features=None,
    class_weight={0: 0.99, 1: 0.01},
    random_state=SEED,
)

fully_generative_unweighted.fit(X_train, y_train_preds_gen)

In [None]:
# sensitivity 3 - Generative labels only (i.e., not Dev Set informed), fit with sample weights
fully_generative_weighted = RandomForestClassifier(
    n_estimators=1000,
    max_depth=50,
    max_features=None,
    class_weight={0: 0.99, 1: 0.01},
    random_state=SEED,
)

# distance of the generative probability from 0.5 (uncertain labels count less)
weights = (df_train_dev['outcome_generative_model'] - 0.5).abs()
fully_generative_weighted.fit(X_train, y_train_preds_gen, sample_weight=weights)

## Performance of Best Models

### Validation Set Performance  

FYI, the Generative model had an F1 score of 0.737 in the Validation Set.  

In [None]:
# classifiers: report validation performance for the best model and each sensitivity model
for model in [best_rfcw, rfc_unweighted, fully_generative_unweighted, fully_generative_weighted]:
    y_pred = model.predict(X_valid)
    # ROC AUC is a ranking metric, so it needs the predicted probability of being a case;
    #    hard 0/1 predictions would collapse the ROC curve to a single operating point
    y_score = model.predict_proba(X_valid)[:, CASE]
    print(model)
    print(metrics.classification_report(Y_valid, y_pred, digits=3))
    print(metrics.roc_auc_score(Y_valid, y_score))

From looking at the metrics here in the Validation Set, it was the sample weighting during model fitting that influenced the metrics & not the inclusion of manual adjudication information.  

In [None]:
# baseline accuracy in the Validation Set when every record is predicted to be a Control
#    (i.e., the share of records with Y_valid == 0, assuming Y_valid only holds 0/1)
1 - np.mean(Y_valid)

In [None]:
# probability distribution from weighted random forest classifier
y_pred_proba = best_rfcw.predict_proba(X_valid)[:, CASE]

# one point per validation record, stacked within each rounded probability
# (named eval_df so that the builtin `eval` is not shadowed)
eval_df = pd.DataFrame({'predicted': np.round(y_pred_proba, 2), 'actual': Y_valid})
eval_df = eval_df.sort_values(by=['predicted', 'actual'])
eval_df = eval_df.assign(counts=eval_df.groupby(['predicted']).cumcount())

fig = sns.scatterplot(data=eval_df, x="predicted", y="counts",
                      hue=eval_df["actual"].tolist(), palette="colorblind", s=100)
plt.ylabel('Counts')
plt.xlabel('Predicted Value')

In [None]:
# probability distribution from UNweighted random forest classifier
y_pred_proba = rfc_unweighted.predict_proba(X_valid)[:, CASE]

# one point per validation record, stacked within each rounded probability
# (named eval_df so that the builtin `eval` is not shadowed)
eval_df = pd.DataFrame({'predicted': np.round(y_pred_proba, 2), 'actual': Y_valid})
eval_df = eval_df.sort_values(by=['predicted', 'actual'])
eval_df = eval_df.assign(counts=eval_df.groupby(['predicted']).cumcount())

fig = sns.scatterplot(data=eval_df, x="predicted", y="counts",
                      hue=eval_df["actual"].tolist(), palette="colorblind", s=100)
plt.ylabel('Counts')
plt.xlabel('Predicted Value')

## Re-Fit "Best" Model on All Data (except Test Set)

In [None]:
# the train/dev set has an "outcome_generative_model" column that is used for creating
#   weights in the weighted RF model - replicating that in the validation set before merging
df_valid_temp = df_valid.copy()
df_valid_temp['outcome_generative_model'] = y_valid_probs[:, CASE]

# stack train/dev and validation sets (original row indices are kept);
#   pd.concat is used because DataFrame.append was removed in pandas 2.0
df_final = pd.concat([df_train_dev, df_valid_temp], sort=False)

In [None]:
# rows that have no 'outcome' yet (it was only created on the train/dev set above) get it
#    from their manual 'label', using the same offsets as before; existing outcomes are kept
needs_outcome = pd.isnull(df_final['outcome'])
outcome_from_label = np.where(df_final['label'] == 'case', upper_limit, lower_limit)
df_final['outcome'] = np.where(needs_outcome, outcome_from_label, df_final['outcome'])

In [None]:
# create y variables for the combined train/dev + validation data:
#    the continuous outcome and its 0.5-thresholded binary version
y_final_probs = np.array(df_final['outcome'])
y_final_preds = np.where(df_final['outcome'] >= 0.5, 1, 0)

In [None]:
# prepare features - code taken from above

# subset columns; some of the "counts" variables didn't have any results because those
#    patients didn't have charts, so missing values are imputed with 0
# (fillna is chained rather than applied in place, so no SettingWithCopyWarning is
#    raised for modifying a subset of the original dataframe)
X_final = df_final[cols].fillna(0)

# scale data with the scaler that was already fit on the train/dev features
X_final = sc.transform(X_final)

In [None]:
# sample weights for the final fit: distance of the generative probability from 0.5
weights_final = (df_final['outcome_generative_model'] - 0.5).abs()

In [None]:
# final model: same specification as the best model, re-fit on train/dev + validation data
model_final = RandomForestClassifier(
    n_estimators=1000,
    max_depth=50,
    max_features=None,
    class_weight={0: 0.99, 1: 0.01},
    random_state=SEED,
)
model_final.fit(X_final, y_final_preds, sample_weight=weights_final)

In [None]:
# performance in training set (should be highly fit)
y_pred = model_final.predict(X_final)
y_pred_proba_final = model_final.predict_proba(X_final)[:, CASE]
print(model_final)
print(metrics.classification_report(y_final_preds, y_pred, digits=3))
# ROC AUC is computed from the predicted probabilities; hard 0/1 predictions would
#    reduce the ROC curve to a single operating point
print(metrics.roc_auc_score(y_final_preds, y_pred_proba_final))

In [None]:
# re-fit the models from the sensitivity analysis

In [None]:
# sensitivity 1 - re-fit on all data without sample weights (class_weight is left unchanged)
rfc_unweighted = RandomForestClassifier(
    n_estimators=1000,
    max_depth=50,
    max_features=None,
    class_weight={0: 0.99, 1: 0.01},
    random_state=SEED,
)
rfc_unweighted.fit(X_final, y_final_preds)

In [None]:
# don't pull any information from manually-adjudicated Dev/Valid Sets:
#    binary labels come from thresholding the generative probability at 0.5 only
y_final_preds_gen = np.where(df_final['outcome_generative_model'] >= 0.5, 1, 0)

In [None]:
# sensitivity 2 - Generative labels only (i.e., not Dev Set informed) & fit without sample weights
fully_generative_unweighted = RandomForestClassifier(
    n_estimators=1000,
    max_depth=50,
    max_features=None,
    class_weight={0: 0.99, 1: 0.01},
    random_state=SEED,
)
fully_generative_unweighted.fit(X_final, y_final_preds_gen)

In [None]:
# sensitivity 3 - Generative labels only (i.e., not Dev Set informed), fit with sample weights
fully_generative_weighted = RandomForestClassifier(
    n_estimators=1000,
    max_depth=50,
    max_features=None,
    class_weight={0: 0.99, 1: 0.01},
    random_state=SEED,
)
# distance of the generative probability from 0.5 (uncertain labels count less)
weights = (df_final['outcome_generative_model'] - 0.5).abs()
fully_generative_weighted.fit(X_final, y_final_preds_gen, sample_weight=weights)

In [None]:
# store predictions on data set & export for prediction model development
df_final['snorkel_deterministic_model_prob'] = y_pred_proba_final

# predicted case probability from each re-fit sensitivity model, one column per model
sensitivity_models = {
    'sens_rfc_unweighted_prob': rfc_unweighted,
    'sens_fully_generative_unweighted_prob': fully_generative_unweighted,
    'sens_fully_generative_weighted_prob': fully_generative_weighted,
}
for column_name, fitted_model in sensitivity_models.items():
    df_final[column_name] = fitted_model.predict_proba(X_final)[:, CASE]

In [None]:
# display the combined data with the predicted-probability columns for a visual check
df_final

In [None]:
# export train/dev + validation data, including all predicted probabilities (row index is dropped)
df_final.to_csv('./train_dev_valid_set_with_predicted_labels.csv', index=False)

# Apply Predictions from Final Discriminative Model to Test Set

In [None]:
# repeating code from above on train/dev and valid sets
# NOTE(review): df_train_dev is passed twice and only the 4th return value is kept -
#    confirm against the signature of hlp.reattach_numeric_data
_, _, _, df_test = hlp.reattach_numeric_data(df_train_dev, df_train_dev, df_valid, df_test)

# validate='m:1' fails loudly if a visit matches several naloxone rows (which would duplicate test rows)
df_test = df_test.merge(naloxone[['visit_occurrence_id', 'binary_naloxone_admin']],
                        how='left', on=['visit_occurrence_id'], validate='m:1')

# string-based columns -> binary; na=False so that a missing string is coded 0 rather than
#    NaN, which np.where treats as truthy and would silently code as 1
# NOTE: the train/dev and validation features must be encoded with the same rules
df_test['binary_respiratory_failure_any'] = \
    np.where(df_test['respiratory_failure_any'].str.contains('1', na=False), 1, 0)
df_test['binary_eligible_vent'] = \
    np.where(df_test['eligible_vent'].str.contains('Yes', na=False), 1, 0)
df_test['binary_gender_female'] = np.where(df_test['gender'] == 'FEMALE', 1, 0)

# visits without a naloxone record count as "not received"
df_test = df_test.fillna(value={'binary_naloxone_admin': 0})

# ICD conditions: anything other than 1 (including NaN) becomes 0
df_test['binary_cond_resp_failure'] = np.where(df_test['cond_resp_failure'] == 1, 1, 0)
df_test['binary_cond_sepsis'] = np.where(df_test['cond_sepsis'] == 1, 1, 0)
df_test['binary_cond_cva'] = np.where(df_test['cond_cva'] == 1, 1, 0)
df_test['binary_cond_resp_disease'] = np.where(df_test['cond_resp_disease'] == 1, 1, 0)
df_test['binary_cond_cv_disease'] = np.where(df_test['cond_cv_disease'] == 1, 1, 0)

# same feature columns, 0-imputation and fitted scaler as for the training data
# (fillna is chained rather than applied in place, so no SettingWithCopyWarning is raised)
X_test = df_test[cols].fillna(0)
X_test = sc.transform(X_test)

In [None]:
# final model applied to the Test Set: hard labels and predicted case probabilities
y_pred_binar_test = model_final.predict(X_test)
y_pred_proba_test = model_final.predict_proba(X_test)[:,CASE]

In [None]:
# number of Test Set records predicted as cases (predictions are 0/1, so the sum is a count)
sum(y_pred_binar_test)

In [None]:
# share of Test Set records predicted as cases
sum(y_pred_binar_test)/len(y_pred_binar_test)

In [None]:
# distribution of predicted case probabilities in the Test Set
plt.hist(y_pred_proba_test, bins=100);

In [None]:
# same histogram, restricted to probabilities above 0.01 so the near-zero
#    predictions are left out of the picture
above_threshold = y_pred_proba_test > 0.01
non_small = y_pred_proba_test[above_threshold]
plt.hist(non_small, bins=100);

In [None]:
# keep generative model probs for comparison (case probability from the label model
#    applied to the Test Set label matrix)
df_test['snorkel_generative_model_prob'] = label_model.predict_proba(L_test)[:, CASE]

In [None]:
# export: predicted case probability from the final model
df_test['snorkel_deterministic_model_prob'] = y_pred_proba_test

# sensitivity analyses: predicted case probability from each re-fit sensitivity model
sensitivity_models = {
    'sens_rfc_unweighted_prob': rfc_unweighted,
    'sens_fully_generative_unweighted_prob': fully_generative_unweighted,
    'sens_fully_generative_weighted_prob': fully_generative_weighted,
}
for column_name, fitted_model in sensitivity_models.items():
    df_test[column_name] = fitted_model.predict_proba(X_test)[:, CASE]

In [None]:
# display the Test Set with the predicted-probability columns for a visual check
df_test

In [None]:
# export the Test Set, including all predicted probabilities (row index is dropped)
df_test.to_csv('./test_set_with_predicted_labels.csv', index=False)