In [None]:
%matplotlib inline
import os
import pickle
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics
import matplotlib.pyplot as plt
import seaborn as sns

import helper as hlp 
import importlib
importlib.reload(hlp)

# notebook-wide constants: the three label codes and the random seed
ABSTAIN, CONTROL, CASE = -1, 0, 1
SEED = 987

In [None]:
# Test-set visits with Snorkel's predictions. The majority-vote column is
# renamed to follow the 'snorkel_<model>_model_prob' pattern so it can be
# looped over together with the other models.
df_test = (
    pd.read_csv('./test_set_with_predicted_labels.csv')
    .rename(columns={"majority_model_label": 'snorkel_majority_model_prob'})
)

# crowdsourcing determinations
crowdsourced = pd.read_csv('./test_set_adjudicated.csv')

# left join: keep every adjudicated visit and attach its predictions
merged = pd.merge(crowdsourced, df_test,
                  how='left', on=['visit_occurrence_id', 'grid'])

merged.shape

In [None]:
# report how many visits survive each stage of review
print('The original test set size included ', df_test.shape[0], ' visits.', sep='')
print('The Crowdsourcing Core reviewed ', crowdsourced.shape[0],
      ' visits for respiratory depression.', sep='')
print(df_test.shape[0] - crowdsourced.shape[0],
      ' visits were excluded in the 1st task of determing non-emergent nature of surgery.',
      sep='')

In [None]:
# rows with no adjudicated outcome (disagreement with too few reviewers)
merged[merged['outcome'].isna()]

In [None]:
# GRIDs of the visits the discriminative model scores at 0.5 or above
merged.loc[merged['snorkel_deterministic_model_prob'] >= 0.5, 'grid'].values

In the original hold-out test set (n=764 visits), 25 visits had Snorkel discriminative model probabilities >= 0.5. Of those 25 visits, only 19 were reviewed for OIRD by the crowdsourcing core (i.e., the other 6 visits were determined not to be elective/non-emergent surgeries).  

Of the 19 reviewed (see GRIDs above), 4 were identified as CASEs by the crowdsourcing core. Alvin reviewed the remaining 15 to see if any could be re-classified; unfortunately, all were controls except for 3, of which 1 was a case & 2 were indeterminate. Furthermore, that single CASE had been reviewed by only 2 external reviewers, who disagreed on the determination, and there was no 3rd reviewer to break the tie.  

For purposes of this project, instead of removing the high-probability visit due to insufficient reviews, we will classify that visit as a CASE.  

In [None]:
# The single tied visit is recorded as a CASE (outcome 1.0) and explicitly
# marked as a disagreement (agreement 0.0).
tied_visit = merged['visit_occurrence_id'] == 41820280
merged.loc[tied_visit, 'outcome'] = 1.0
merged.loc[tied_visit, 'agreement'] = 0.0

In [None]:
# discard visits that still have no outcome
merged = merged.dropna(subset=['outcome'])
merged.shape

In [None]:
# number of distinct patients (GRIDs) among the remaining visits
len(merged['grid'].unique())

In [None]:
# visits adjudicated as cases
merged.query('outcome == 1.0')

In [None]:
# Each of these GRIDs must appear on exactly one row of `merged`, so none of
# them can have visits in both the case and control groups.
for case_grid in ('R286378592', 'R285802495', 'R284169073',
                  'R262848683', 'R206264220'):
    assert (merged['grid'] == case_grid).sum() == 1

In [None]:
# Frames for the genetic analysis (nothing is written to disk in this cell).

# GRID of every visit that was evaluated for OIRD
df = merged['grid'].reset_index(drop=True)
grids_crowdsourced = df.to_frame()
len(np.unique(grids_crowdsourced))

# phenotype frame: GRIDs of the cases, with the GRID repeated in a second column
df = merged.loc[merged['outcome'] == 1, 'grid'].reset_index(drop=True)
pheno_crowdsourced = df.to_frame()
pheno_crowdsourced['grid_repeated'] = df

# GRIDs that occur more than once among the evaluated visits must not overlap
# with the case GRIDs (checked in both directions)
repeated_grids = grids_crowdsourced.loc[grids_crowdsourced.duplicated(), 'grid']
assert not pheno_crowdsourced['grid'].isin(repeated_grids).any()
assert not repeated_grids.isin(pheno_crowdsourced['grid']).any()

In [None]:
# plot similar to previous evaluation plots

# rcParams only apply to figures created *after* they are set, so the seaborn
# theme / figure size must be configured before the first plt.figure() call.
sns.set(rc={'figure.figsize': (15, 5)})

for m in ['deterministic', 'generative', 'majority']:
    plt.figure()
    # one row per visit: rounded prediction and the adjudicated label
    # (named plot_df rather than `eval` so the builtin is not shadowed)
    plot_df = pd.DataFrame({'predicted': np.round(merged['snorkel_' + m + '_model_prob'], 2), 
                            'actual': np.where(merged['outcome']==0, 'Control', 'Case')})
    plot_df = plot_df.sort_values(by=['predicted', 'actual'])
    # stack the points: running index within each predicted value
    plot_df = plot_df.assign(counts=plot_df.groupby(['predicted']).cumcount())

    sns.scatterplot(data=plot_df, x="predicted", y="counts", 
                    hue=plot_df["actual"].tolist(), palette="colorblind", s=100)
    plt.ylabel('Counts')
    plt.xlabel('Predicted Value')
    plt.legend(loc='upper center')
    plt.title(m + ' model')
    plt.show()

In [None]:
# because the sample size is larger, it's difficult to see some of the nuances
# sub-setting to the non-0 group

# rcParams only apply to figures created *after* they are set, so configure
# the seaborn theme / figure size before any figure is opened.
sns.set(rc={'figure.figsize': (15, 5)})

for m in ['deterministic', 'generative', 'majority']:
    plt.figure()
    # keep only visits with a strictly positive predicted probability
    non_zero = merged[merged['snorkel_' + m + '_model_prob'] > 0.]
    # (named plot_df rather than `eval` so the builtin is not shadowed)
    plot_df = pd.DataFrame({'predicted': np.round(non_zero['snorkel_' + m + '_model_prob'], 2), 
                            'actual': np.where(non_zero['outcome']==0, 'Control', 'Case')})
    plot_df = plot_df.sort_values(by=['predicted', 'actual'])
    # stack the points: running index within each predicted value
    plot_df = plot_df.assign(counts=plot_df.groupby(['predicted']).cumcount())

    sns.scatterplot(data=plot_df, x="predicted", y="counts", 
                    hue=plot_df["actual"].tolist(), palette="colorblind", s=100)
    plt.ylabel('Counts')
    plt.xlabel('Predicted Value')
    plt.legend(loc='upper center')
    plt.title(m + ' model with non-zero values')
    plt.show()

In [None]:
# per model: number of adjudicated cases that received a prediction of exactly 0
for m in ['deterministic', 'generative', 'majority']:
    scored_zero = merged['snorkel_' + m + '_model_prob'] == 0.
    is_case = merged['outcome'] == 1.0
    print(len(merged[scored_zero & is_case]))

In [None]:
# outcome and model scores for the visits where reviewers were not unanimous
disagreement_cols = ['outcome', 'snorkel_deterministic_model_prob',
                     'snorkel_generative_model_prob', 'snorkel_majority_model_prob']
merged.loc[merged['agreement'] < 1, disagreement_cols]

In [None]:
# creating another plot for when there was disagreement

# rcParams only apply to figures created *after* they are set, so configure
# the seaborn theme / figure size before any figure is opened.
sns.set(rc={'figure.figsize': (15, 5)})

for m in ['deterministic', 'generative', 'majority']:
    plt.figure()
    # NOTE(review): only agreement == 0 is flagged here, while another cell
    # uses agreement < 1 — confirm that agreement is strictly 0/1.
    # (named plot_df rather than `eval` so the builtin is not shadowed)
    plot_df = pd.DataFrame({'predicted': np.round(merged['snorkel_' + m + '_model_prob'], 2), 
                            'actual': np.where(merged['agreement']==0, 'Disagreement Present', 'Concensus')})
    plot_df = plot_df.sort_values(by=['predicted', 'actual'])
    # stack the points: running index within each predicted value
    plot_df = plot_df.assign(counts=plot_df.groupby(['predicted']).cumcount())

    sns.scatterplot(data=plot_df, x="predicted", y="counts", 
                    hue=plot_df["actual"].tolist(), palette="colorblind", s=100)
    plt.ylabel('Counts')
    plt.xlabel('Predicted Value')
    plt.legend(loc='upper center')
    plt.title(m + ' model with disagreements')
    plt.show()

In [None]:
# generative vs. deterministic scores, one point per visit
# (orange = adjudicated case, blue = everything else)
colors = ['orange' if is_case else 'blue' for is_case in merged['outcome'] == 1.0]

plt.scatter(x=merged['snorkel_generative_model_prob'],
            y=merged['snorkel_deterministic_model_prob'],
            color=colors)
plt.title('Comparison of Predicted Probabilities between Generative & Deterministic Models')
plt.xlabel('Generative Model')
plt.ylabel('Deterministic Model')
plt.show()

In [None]:
# Copy of `merged` in which the outcome becomes a descriptive category, so the
# plots can separate crowdsourced cases/controls from the visits Alvin marked
# as indeterminate after the fact.

# copy, keeping the original numeric outcome alongside
reclassified = merged.assign(outcome_orig=merged['outcome'])

# categorical outcome
reclassified['outcome'] = np.where(reclassified['outcome'] == 1.0,
                                   'case-crowdsourced', 'control-crowdsourced')
indeterminate_visits = reclassified['visit_occurrence_id'].isin([30427405, 46362145])
reclassified.loc[indeterminate_visits, 'outcome'] = 'indeterminate-per-alvin'

# colour for each category
col_map = {
    'case-crowdsourced': 'orange',
    'control-crowdsourced': 'blue',
    'indeterminate-per-alvin': 'red',
}
reclassified['color'] = reclassified['outcome'].map(col_map)

In [None]:
# Partition the re-classified visits by reviewer agreement and determination.
unanimous = reclassified['agreement'] == 1.0
contested = reclassified['agreement'] == 0.0
is_case = reclassified['outcome'] == 'case-crowdsourced'
is_control = reclassified['outcome'] == 'control-crowdsourced'

rc1a = reclassified[unanimous & is_case]       # full agreement, case
rc1b = reclassified[unanimous & is_control]    # full agreement, control
rc2a = reclassified[contested & is_case]       # some disagreement, case
rc2b = reclassified[contested & is_control]    # some disagreement, control
rc3 = reclassified[reclassified['outcome'] == 'indeterminate-per-alvin']  # review of top-scoring

plt.figure(figsize=(10, 10), dpi=150)

# one scatter layer per group, drawn in this order
layers = [
    (rc1a, dict(color='red', marker='^', label='Full Agreement for CASE')),
    (rc1b, dict(color='blue', marker='.', label='Full Agreement CONTROL')),
    (rc2a, dict(color='red', marker='^', facecolors='none',
                label='Some Disagreement but determined CASE')),
    (rc2b, dict(color='blue', marker='v', facecolors='none',
                label='Some Disagreement but determined CONTROL')),
    (rc3, dict(color='orange', marker='>', facecolors='none', label='Indeterminate')),
]
for subset, style in layers:
    plt.scatter(subset['snorkel_generative_model_prob'],
                subset['snorkel_deterministic_model_prob'], **style)

plt.title('Comparison of Predicted Probabilities between Generative & Discriminative Models')
plt.xlabel('Probability(OIRD) - Generative Model')
plt.ylabel('Probability(OIRD) - Discriminative Model')
plt.legend(loc='upper left')
plt.show();

In [None]:
# Same comparison with the full-agreement controls (rc1b) left out.
plt.figure(figsize=(10, 10), dpi=150)

layers = [
    (rc1a, dict(color='red', marker='^', label='Full Agreement for CASE')),
    (rc2a, dict(color='red', marker='^', facecolors='none',
                label='Some Disagreement but determined CASE')),
    (rc2b, dict(color='blue', marker='v', facecolors='none',
                label='Some Disagreement but determined CONTROL')),
    (rc3, dict(color='orange', marker='>', facecolors='none', label='Indeterminate')),
]
for subset, style in layers:
    plt.scatter(subset['snorkel_generative_model_prob'],
                subset['snorkel_deterministic_model_prob'], **style)

plt.title('Comparison of Predicted Probabilities between Generative & Discriminative Models' +
          ' (Full Agreement Controls Removed)')
plt.xlabel('Probability(OIRD) - Generative Model')
plt.ylabel('Probability(OIRD) - Discriminative Model')
plt.legend(loc='upper left')
plt.show();

In [None]:
# distribution of each model's predicted probability, split by adjudicated outcome
for m in ['deterministic', 'generative', 'majority']:
    plt.figure()
    # x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally (only `data` may be positional), and keywords work on
    # older releases as well.
    sns.stripplot(x='outcome', y='snorkel_' + m + '_model_prob', data=merged, jitter=0.2)
    plt.title(m + ' model')
    plt.show()

In [None]:
# ROC curve for each model against the crowdsourced outcome
fs = 15
for m in ['deterministic', 'generative', 'majority']:
    y_true = merged['outcome'].tolist()
    y_score = merged['snorkel_' + m + '_model_prob'].tolist()
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)

    plt.figure()
    plt.plot(fpr, tpr, label='Crowdsourced')
    plt.title(m + ' model AUC')
    plt.xlabel("False Positive Rate", fontsize=fs)
    plt.ylabel("True Positive Rate", fontsize=fs)
    plt.tick_params(labelsize=fs-3)
    plt.legend(loc='upper left', fontsize=fs-2, title='Test Set Performance')
    plt.show()

In [None]:
# precision-recall curve for each model against the crowdsourced outcome
fs = 15
for m in ['deterministic', 'generative', 'majority']:
    y_true = merged['outcome'].tolist()
    y_score = merged['snorkel_' + m + '_model_prob'].tolist()
    ppv, sens, thresh = metrics.precision_recall_curve(y_true, y_score)

    plt.figure()
    plt.plot(sens, ppv, label='Crowdsourced')
    plt.title(m + ' model F1')
    plt.xlabel("Recall", fontsize=fs)
    plt.ylabel("Precision", fontsize=fs)
    plt.xlim(0, 1.1)
    plt.ylim(0, 1.1)
    plt.tick_params(labelsize=fs-3)
    plt.legend(loc='upper right', fontsize=fs-2, title='Test Set Performance')
    plt.show()

In [None]:
# thesis figure: precision-recall curve of the generative model
# NOTE(review): the F1 value in the title is hard-coded, not computed here.
fs = 15
ppv, sens, thresh = metrics.precision_recall_curve(
    merged['outcome'].tolist(),
    merged['snorkel_generative_model_prob'].tolist())

plt.figure()
plt.plot(sens, ppv, label='Crowdsourced')
plt.title('Recall-Precision Curve for Generative Model in Test Set (F1=0.417)',
          fontsize=fs)
plt.xlabel("Recall", fontsize=fs)
plt.ylabel("Precision", fontsize=fs)
plt.xlim(0, 1.1)
plt.ylim(0, 1.1)
plt.tick_params(labelsize=fs-3)
plt.show()

In [None]:
# thesis figure: precision-recall curve of the discriminative model
# NOTE(review): the F1 value in the title is hard-coded and identical to the
# generative model's figure — confirm it is not a copy-paste leftover.
fs = 15
ppv, sens, thresh = metrics.precision_recall_curve(
    merged['outcome'].tolist(),
    merged['snorkel_deterministic_model_prob'].tolist())

plt.figure()
plt.plot(sens, ppv, label='Crowdsourced')
plt.title('Recall-Precision Curve for Discriminative Model in Test Set (F1=0.417)',
          fontsize=fs)
plt.xlabel("Recall", fontsize=fs)
plt.ylabel("Precision", fontsize=fs)
plt.xlim(0, 1.1)
plt.ylim(0, 1.1)
plt.tick_params(labelsize=fs-3)
plt.show()

In [None]:
# thesis figure: precision-recall curve of the majority-vote model
# NOTE(review): the F1 value in the title is hard-coded, not computed here.
plt.figure()
ppv, sens, thresh = metrics.precision_recall_curve(merged['outcome'].tolist(), 
                                                   merged['snorkel_majority_model_prob'].tolist())
plt.plot(sens, ppv, label = 'Crowdsourced') 
fs=15
plt.xlabel("Recall", fontsize=fs)
plt.ylabel("Precision", fontsize=fs)
plt.xlim(0, 1.1)
plt.ylim(0, 1.1)
plt.tick_params(labelsize=fs-3)
# The curve is built from snorkel_majority_model_prob, so the title names the
# majority-vote model (it previously said "Discriminative Model").
plt.title('Recall-Precision Curve for Majority Vote Model in Test Set (F1=0.333)',
         fontsize=fs)
plt.show()

In [None]:
# accuracy of each model at several decision thresholds
for t in [0.5, 0.7, 0.8, 0.9]:
    for m in ['deterministic', 'generative', 'majority']:
        predicted = (merged['snorkel_' + m + '_model_prob'] >= t).astype(int)
        p = metrics.accuracy_score(merged['outcome'], predicted)
        print(m, ' Accuracy with threshold of ', t, ' = ', p, sep='')

In [None]:
# AUC
# ROC AUC is a threshold-free ranking metric, so it has to be computed from
# the continuous model scores. Report that value once per model.
# (roc_auc_score rejects NaN scores — assumes the probability columns are
# complete; TODO confirm.)
for m in ['deterministic', 'generative', 'majority']:
    p = metrics.roc_auc_score(merged['outcome'],
                              merged['snorkel_' + m + '_model_prob'])
    print(m + ' AUC = ' + str(p))

# Passing hard 0/1 predictions to roc_auc_score gives a one-point ROC curve
# whose area equals (sensitivity + specificity) / 2, i.e. the balanced
# accuracy at that threshold. These per-threshold figures are kept so earlier
# results stay comparable, but they are not the model's ROC AUC.
for t in [0.5, 0.7, 0.8, 0.9]:
    for m in ['deterministic', 'generative', 'majority']:
        p = metrics.roc_auc_score(merged['outcome'], 
                             np.where(merged['snorkel_' + m + '_model_prob'] >= t, 1, 0))
        print(m + ' AUC with threshold of ' + str(t) + ' = ' + str(p))

In [None]:
# F1 score of each model at several decision thresholds
for t in [0.5, 0.7, 0.8, 0.9]:
    for m in ['deterministic', 'generative', 'majority']:
        predicted = (merged['snorkel_' + m + '_model_prob'] >= t).astype(int)
        p = metrics.f1_score(merged['outcome'], predicted)
        print(m, ' F1 Score with threshold of ', t, ' = ', p, sep='')

In [None]:
# F1 when a visit is called a case only if BOTH models are confident:
# deterministic >= 0.7 and generative >= 0.8 (thresholds read off the scatter plot)
det_high = merged['snorkel_deterministic_model_prob'] >= 0.7
gen_high = merged['snorkel_generative_model_prob'] >= 0.8
metrics.f1_score(merged['outcome'], (det_high & gen_high).astype(int))

If the goal is massive labeling, an F1 score of 0.625 isn't too bad. 

## Sensitivity Analysis  

In [None]:
# distribution of each sensitivity-analysis model's probability, split by outcome
for m in ['snorkel_deterministic_model_prob', 'sens_rfc_unweighted_prob',
       'sens_fully_generative_unweighted_prob',
       'sens_fully_generative_weighted_prob']:
    plt.figure()
    # x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally (only `data` may be positional), and keywords work on
    # older releases as well.
    sns.stripplot(x='outcome', y=m, data=merged, jitter=0.2)
    plt.title(m + ' model')
    plt.show()

In [None]:
# display the full-agreement cases
rc1a

In [None]:
# display the cases on which reviewers disagreed
rc2a

In [None]:
# display the controls on which reviewers disagreed
rc2b

In [None]:
# display the visits marked 'indeterminate-per-alvin'
rc3

In [None]:
# Baseline for the sensitivity analysis: generative vs. the original
# discriminative model, full-agreement controls (rc1b) left out.
plt.figure(figsize=(10, 10))

layers = [
    (rc1a, dict(color='red', marker='^', label='Full Agreement for CASE')),
    (rc2a, dict(color='red', marker='^', facecolors='none',
                label='Some Disagreement but determined CASE')),
    (rc2b, dict(color='blue', marker='v', facecolors='none',
                label='Some Disagreement but determined CONTROL')),
    (rc3, dict(color='orange', marker='>', facecolors='none', label='Indeterminate')),
]
for subset, style in layers:
    plt.scatter(subset['snorkel_generative_model_prob'],
                subset['snorkel_deterministic_model_prob'], **style)

plt.title('Comparison of Predicted Probabilities between Generative & Discriminative Models' +
          ' (Full Agreement Controls Removed)')
plt.xlabel('Generative Model')
plt.ylabel('Discriminative Model')
plt.legend(loc='upper left')
plt.show();

In [None]:
# Generative model vs. the unweighted random forest classifier;
# full-agreement controls (rc1b) left out.
comparator = 'sens_rfc_unweighted_prob'

plt.figure(figsize=(10, 10))

layers = [
    (rc1a, dict(color='red', marker='^', label='Full Agreement for CASE')),
    (rc2a, dict(color='red', marker='^', facecolors='none',
                label='Some Disagreement but determined CASE')),
    (rc2b, dict(color='blue', marker='v', facecolors='none',
                label='Some Disagreement but determined CONTROL')),
    (rc3, dict(color='orange', marker='>', facecolors='none', label='Indeterminate')),
]
for subset, style in layers:
    plt.scatter(subset['snorkel_generative_model_prob'], subset[comparator], **style)

plt.title('Comparison of Predicted Probabilities between Generative & Discriminative Models' +
          ' (Full Agreement Controls Removed)')
plt.xlabel('Generative Model')
plt.ylabel('Discriminative Model - Unweighted Random Forest')
plt.legend(loc='upper left')
plt.show();

In [None]:
# Generative model vs. the unweighted random forest built on fully generative
# labels; full-agreement controls (rc1b) left out.
comparator = 'sens_fully_generative_unweighted_prob'

plt.figure(figsize=(10, 10))

layers = [
    (rc1a, dict(color='red', marker='^', label='Full Agreement for CASE')),
    (rc2a, dict(color='red', marker='^', facecolors='none',
                label='Some Disagreement but determined CASE')),
    (rc2b, dict(color='blue', marker='v', facecolors='none',
                label='Some Disagreement but determined CONTROL')),
    (rc3, dict(color='orange', marker='>', facecolors='none', label='Indeterminate')),
]
for subset, style in layers:
    plt.scatter(subset['snorkel_generative_model_prob'], subset[comparator], **style)

plt.title('Comparison of Predicted Probabilities between Generative & Discriminative Models' +
          ' (Full Agreement Controls Removed)')
plt.xlabel('Generative Model')
plt.ylabel('Discriminative Model - Unweighted RF with Fully Generative Labels')
plt.legend(loc='upper left')
plt.show();

In [None]:
# Generative model vs. the weighted random forest built on fully generative
# labels; full-agreement controls (rc1b) left out.
comparator = 'sens_fully_generative_weighted_prob'

plt.figure(figsize=(10, 10))

layers = [
    (rc1a, dict(color='red', marker='^', label='Full Agreement for CASE')),
    (rc2a, dict(color='red', marker='^', facecolors='none',
                label='Some Disagreement but determined CASE')),
    (rc2b, dict(color='blue', marker='v', facecolors='none',
                label='Some Disagreement but determined CONTROL')),
    (rc3, dict(color='orange', marker='>', facecolors='none', label='Indeterminate')),
]
for subset, style in layers:
    plt.scatter(subset['snorkel_generative_model_prob'], subset[comparator], **style)

plt.title('Comparison of Predicted Probabilities between Generative & Discriminative Models' +
          ' (Full Agreement Controls Removed)')
plt.xlabel('Generative Model')
plt.ylabel('Discriminative Model - Weighted RF with Fully Generative Labels')
plt.legend(loc='upper left')
plt.show();

In [None]:
# original discriminative model vs. the unweighted random forest, all visits
plt.scatter(x=merged['snorkel_deterministic_model_prob'],
            y=merged['sens_rfc_unweighted_prob'])

In [None]:
# probability columns used in the sensitivity analysis
# (bare expression: the list is displayed but not stored in a variable)
['snorkel_deterministic_model_prob', 'sens_rfc_unweighted_prob',
       'sens_fully_generative_unweighted_prob',
       'sens_fully_generative_weighted_prob']

In [None]:
# load the train/dev/validation visits with their predicted labels
df_train = pd.read_csv(filepath_or_buffer='./train_dev_valid_set_with_predicted_labels.csv')

In [None]:
# Visits where the discriminative and generative outputs differ by more than 0.2.
discrepancy_cols = ['visit_occurrence_id', 'label', 'outcome_generative_model',
                    'snorkel_deterministic_model_prob', 'sens_rfc_unweighted_prob',
                    'sens_fully_generative_unweighted_prob',
                    'sens_fully_generative_weighted_prob']
discrepancies = df_train[discrepancy_cols].copy()

model_gap = (discrepancies['snorkel_deterministic_model_prob']
             - discrepancies['outcome_generative_model']).abs()
discrepancies = discrepancies[model_gap > 0.2]
discrepancies.sort_values('outcome_generative_model')

When looking at the actual values of the various models, visits with very high generative-model probabilities that were lowered by the discriminative model occurred when the model was trained with manually adjudicated labels incorporated, rather than with fully generative labels. I had expected the weighting to be responsible, but it appears that the cause was actually having some manually adjudicated data in the mix. 

In [None]:
# Pull the validation set out of the analysis pickle to check the hypothesis.
# NOTE: pickle.load can execute arbitrary code — only open files produced by
# this project.
with open('./data_for_analysis.pkl', mode='rb') as pkl_file:
    data = pickle.load(pkl_file)
# presumably element 1 is the validation split — confirm against the code
# that wrote the pickle
df_valid = data[1]

In [None]:
# discrepant visits that belong to the validation set
in_validation = discrepancies['visit_occurrence_id'].isin(df_valid['visit_occurrence_id'])
discrepancies[in_validation].sort_values('outcome_generative_model')

Yes, it looks like there was plenty of information in these hand labels for the validation set. 

## AHRQ PSI-11 Performance  

How well did the original criteria we used perform compared to the SNORKEL model? 

In [None]:
# number of visits flagged as 'case' by the AHRQ criteria
len(merged[merged['study_group'] == 'case'])

In [None]:
# of the AHRQ cases, how many were adjudicated as cases (outcome == 1.0)
len(merged[(merged['outcome'] == 1.0) & (merged['study_group'] == 'case')])

In [None]:
# number of visits flagged as 'control' by the AHRQ criteria
len(merged[merged['study_group'] == 'control'])

In [None]:
# of the AHRQ controls, how many were adjudicated as cases (outcome == 1.0)
len(merged[(merged['outcome'] == 1.0) & (merged['study_group'] == 'control')])

# Model Performances for Written Thesis

In [None]:
# sanity check: the mean probability should differ from model to model
for m in ['snorkel_generative_model_prob', 'snorkel_deterministic_model_prob',
          'sens_rfc_unweighted_prob', 'sens_fully_generative_weighted_prob',
          'sens_fully_generative_unweighted_prob']:
    print(merged[m].mean())

In [None]:
# accuracy of each model at the 0.5 threshold
for m in ['snorkel_generative_model_prob', 'snorkel_deterministic_model_prob',
          'sens_rfc_unweighted_prob', 'sens_fully_generative_weighted_prob',
          'sens_fully_generative_unweighted_prob']:
    predicted = (merged[m] >= 0.5).astype(int)
    p = metrics.accuracy_score(merged['outcome'], predicted)
    print(m, ' Accuracy ', ' = ', p, sep='')

In [None]:
# AUC
# ROC AUC is a threshold-free ranking metric: y_score must be the continuous
# probability, not a thresholded 0/1 label. The value computed from hard
# labels equals (sensitivity + specificity) / 2 (balanced accuracy), so it is
# still printed below but labelled as such.
# (roc_auc_score rejects NaN scores — assumes the probability columns are
# complete; TODO confirm.)
for m in ['snorkel_generative_model_prob', 'snorkel_deterministic_model_prob', 
          'sens_rfc_unweighted_prob', 'sens_fully_generative_weighted_prob', 
          'sens_fully_generative_unweighted_prob']:
    p = metrics.roc_auc_score(merged['outcome'], merged[m])
    print(m + ' AUC ' + ' = ' + str(p))
    p_thresholded = metrics.roc_auc_score(merged['outcome'], 
                                          np.where(merged[m] >= 0.5, 1, 0))
    print(m + ' AUC at 0.5 threshold (balanced accuracy) ' + ' = ' + str(p_thresholded))

In [None]:
# F1 score of each model at the 0.5 threshold
for m in ['snorkel_generative_model_prob', 'snorkel_deterministic_model_prob',
          'sens_rfc_unweighted_prob', 'sens_fully_generative_weighted_prob',
          'sens_fully_generative_unweighted_prob']:
    predicted = (merged[m] >= 0.5).astype(int)
    p = metrics.f1_score(merged['outcome'], predicted)
    print(m, ' F1 Score ', ' = ', p, sep='')

In [None]:
# positive predictive value (precision) of each model at the 0.5 threshold
for m in ['snorkel_generative_model_prob', 'snorkel_deterministic_model_prob',
          'sens_rfc_unweighted_prob', 'sens_fully_generative_weighted_prob',
          'sens_fully_generative_unweighted_prob']:
    predicted = (merged[m] >= 0.5).astype(int)
    p = metrics.precision_score(merged['outcome'], predicted)
    print(m, ' PPV ', ' = ', p, sep='')

In [None]:
# sensitivity (recall of the positive class) of each model at the 0.5 threshold
for m in ['snorkel_generative_model_prob', 'snorkel_deterministic_model_prob',
          'sens_rfc_unweighted_prob', 'sens_fully_generative_weighted_prob',
          'sens_fully_generative_unweighted_prob']:
    predicted = (merged[m] >= 0.5).astype(int)
    p = metrics.recall_score(merged['outcome'], predicted)
    print(m, ' Sensitivity ', ' = ', p, sep='')

In [None]:
# specificity: read it from the full classification report, where it is the
# recall of the negative class
for m in ['snorkel_generative_model_prob', 'snorkel_deterministic_model_prob',
          'sens_rfc_unweighted_prob', 'sens_fully_generative_weighted_prob',
          'sens_fully_generative_unweighted_prob']:
    predicted = (merged[m] >= 0.5).astype(int)
    p = metrics.classification_report(merged['outcome'], predicted)
    print(m, ' Specificity (recall of negative class) ', ' = \n', p, sep='')

In [None]:
# cross-check the false positive count: generative model
len(merged[(merged['outcome'] == 0.0) &
           (merged['snorkel_generative_model_prob'] >= 0.5)])

In [None]:
# false positive count: discriminative model
len(merged[(merged['outcome'] == 0.0) &
           (merged['snorkel_deterministic_model_prob'] >= 0.5)])

In [None]:
# false positive count: unweighted random forest
len(merged[(merged['outcome'] == 0.0) &
           (merged['sens_rfc_unweighted_prob'] >= 0.5)])

In [None]:
# false positive count: weighted model on fully generative labels
len(merged[(merged['outcome'] == 0.0) &
           (merged['sens_fully_generative_weighted_prob'] >= 0.5)])

In [None]:
# false positive count: unweighted model on fully generative labels
len(merged[(merged['outcome'] == 0.0) &
           (merged['sens_fully_generative_unweighted_prob'] >= 0.5)])

In [None]:
# AHRQ performance: accuracy, treating study_group == 'case' as a positive prediction
metrics.accuracy_score(y_true=merged['outcome'],
                       y_pred=(merged['study_group'] == 'case').astype(int))

In [None]:
# AHRQ: ROC AUC of the binary case/control flag
metrics.roc_auc_score(y_true=merged['outcome'],
                      y_score=(merged['study_group'] == 'case').astype(int))

In [None]:
# AHRQ: F1 score
metrics.f1_score(y_true=merged['outcome'],
                 y_pred=(merged['study_group'] == 'case').astype(int))

In [None]:
# AHRQ: precision (PPV)
metrics.precision_score(y_true=merged['outcome'],
                        y_pred=(merged['study_group'] == 'case').astype(int))

In [None]:
# AHRQ: recall (sensitivity)
metrics.recall_score(y_true=merged['outcome'],
                     y_pred=(merged['study_group'] == 'case').astype(int))

In [None]:
# AHRQ: full per-class report
ahrq_report = metrics.classification_report(
    y_true=merged['outcome'],
    y_pred=(merged['study_group'] == 'case').astype(int))
print(ahrq_report)

In [None]:
# majority vote model performance: accuracy at the 0.5 threshold
metrics.accuracy_score(y_true=merged['outcome'],
                       y_pred=(merged['snorkel_majority_model_prob'] >= 0.5).astype(int))

In [None]:
# majority vote: ROC AUC of the 0.5-thresholded prediction
metrics.roc_auc_score(y_true=merged['outcome'],
                      y_score=(merged['snorkel_majority_model_prob'] >= 0.5).astype(int))

In [None]:
# majority vote: F1 score at the 0.5 threshold
metrics.f1_score(y_true=merged['outcome'],
                 y_pred=(merged['snorkel_majority_model_prob'] >= 0.5).astype(int))

In [None]:
# majority vote: precision (PPV) at the 0.5 threshold
metrics.precision_score(y_true=merged['outcome'],
                        y_pred=(merged['snorkel_majority_model_prob'] >= 0.5).astype(int))

In [None]:
# majority vote: recall (sensitivity) at the 0.5 threshold
metrics.recall_score(y_true=merged['outcome'],
                     y_pred=(merged['snorkel_majority_model_prob'] >= 0.5).astype(int))

In [None]:
# majority vote: full per-class report at the 0.5 threshold
majority_report = metrics.classification_report(
    y_true=merged['outcome'],
    y_pred=(merged['snorkel_majority_model_prob'] >= 0.5).astype(int))
print(majority_report)