In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import cohen_kappa_score, roc_auc_score, recall_score, confusion_matrix, precision_score, accuracy_score
from sklearn.utils import resample
import pickle

# Compare EHR and ENDO diagnoses - Diversity Assessment

* Metrics include: Cohen's kappa, AUC, and percent agreement (accuracy), each summarized as a bootstrap median with a 95% percentile interval

In [None]:
combined = pd.read_pickle('../data/combined_data.pkl')

# Signed number of whole months from the ENDO study date to the EHR diagnosis date:
# pos = dx with endo in EHR after study, neg = dx with endo in EHR prior to study.
#
# The divisor is an explicit average month (365.2425 / 12 days = 2,629,746 s).
# Dividing by np.timedelta64(1, 'M') is rejected by recent pandas releases because
# 'M' is not an unambiguous duration; the fixed-length month keeps the same scale.
# Rows where either date is missing produce NaN and are filled with 0, and
# astype(int) truncates toward zero (e.g. 1.9 -> 1, -0.9 -> 0).
average_month = pd.Timedelta(seconds=2629746)
combined['months_between'] = (
    (combined['EHR_Dx_Date'] - combined['ENDO_study_date']) / average_month
).fillna(0).astype(int)

In [None]:
# Load the study-ID lookup used below to split the data by race and ethnicity.
# NOTE(review): pickle.load can execute arbitrary code; only open a file
# produced by this project.
with open('diversity_ids.pkl', mode='rb') as id_file:
    diversity_ids = pickle.load(id_file)

In [None]:
# Split `combined` into one frame per demographic group by matching
# StudyID_final against the ID collections stored in `diversity_ids`.
study_ids = combined['StudyID_final']

# Race groups
is_white = study_ids.isin(diversity_ids['white'])
is_poc = study_ids.isin(diversity_ids['poc'])
white = combined[is_white].reset_index(drop=True)
poc = combined[is_poc].reset_index(drop=True)

# Ethnicity groups
is_hispanic = study_ids.isin(diversity_ids['hispanic'])
is_nonhispanic = study_ids.isin(diversity_ids['nonhispanic'])
hispanic = combined[is_hispanic].reset_index(drop=True)
nonhispanic = combined[is_nonhispanic].reset_index(drop=True)

In [None]:
def get_diagnosis_dataframe(original, threshold, cohort):
    """Collapse diagnosis rows to one row per (Masked_PersonID, EndoID) for one cohort.

    Parameters
    ----------
    original : pandas.DataFrame
        Row-level data. Must contain 'Masked_PersonID', 'EndoID', 'Cohort_final'
        and, when `threshold` is given, 'months_between' plus the EHR flag
        columns listed in the body. The input frame is not modified.
    threshold : number or None
        When not None, every EHR flag on rows whose 'months_between' exceeds
        this value is set to 0 before collapsing. None skips that step.
    cohort : scalar
        Only collapsed rows whose 'Cohort_final' equals this value are returned.

    Returns
    -------
    pandas.DataFrame
        Column-wise maximum within each (Masked_PersonID, EndoID) group,
        restricted to the requested cohort.
    """
    ehr_flags = ('endo_dx_EHR', 'SE_EHR', 'OE_EHR', 'DE_EHR', 'other_EHR')

    frame = original
    if threshold is not None:
        # Work on a copy so the caller's frame keeps its original flags.
        frame = original.copy()
        too_late = frame['months_between'] > threshold
        for flag in ehr_flags:
            frame.loc[too_late, flag] = 0

    # max() keeps a flag at 1 if any remaining row in the group has it set.
    collapsed = frame.groupby(['Masked_PersonID', 'EndoID']).max().reset_index()
    return collapsed.loc[collapsed['Cohort_final'] == cohort]

def bootstrap_performance_metrics(prediction_data, random_state=None):
    """Score EHR-vs-ENDO agreement on a single bootstrap replicate.

    One replicate is drawn from `prediction_data` with sklearn's `resample`
    (using its default sampling settings), stratified on 'endo_dx_ENDO'.
    'endo_dx_ENDO' is treated as the reference label and 'endo_dx_EHR' as the
    prediction.

    Parameters
    ----------
    prediction_data : pandas.DataFrame
        Must contain the columns 'endo_dx_ENDO' and 'endo_dx_EHR'.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to `resample`. The default (None) keeps the previous
        behaviour of drawing from NumPy's global random state; pass an int
        to make an individual replicate reproducible.

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'AGREEMENT' (accuracy), 'AUC' and 'KAPPA'
        (Cohen's kappa).

    Raises
    ------
    ValueError
        Propagated from `roc_auc_score` when the replicate's reference labels
        contain only one class.
    """
    reference_col = 'endo_dx_ENDO'
    predicted_col = 'endo_dx_EHR'

    # Bootstrap the data
    boot_data = resample(prediction_data,
                         stratify=prediction_data[reference_col],
                         random_state=random_state)
    y_true = boot_data[reference_col]
    y_pred = boot_data[predicted_col]

    # Performance metrics
    # NOTE(review): 'endo_dx_EHR' is passed to roc_auc_score as the score;
    # presumably it is a hard 0/1 label rather than a probability - confirm.
    acc = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)

    # Collect metrics in dataframe
    return pd.DataFrame({'AGREEMENT': [acc],
                         'AUC': [auc],
                         'KAPPA': [kappa]})

def summarize_bootstrap_results(bootstrap_results):
    """Reduce bootstrap replicates to a median and 95% percentile interval per metric.

    Parameters
    ----------
    bootstrap_results : pandas.DataFrame
        One column per metric, one row per bootstrap replicate.

    Returns
    -------
    pandas.DataFrame
        One row per input column with columns 'METRIC' (the column name),
        'MEDIAN' (50th percentile), 'CI_LOW' (2.5th percentile) and
        'CI_HIGH' (97.5th percentile).
    """
    confidence = 95
    tail = (100 - confidence) / 2  # percent left in each tail of the interval

    names = list(bootstrap_results.columns)
    replicates = [bootstrap_results[name] for name in names]

    return pd.DataFrame({
        'METRIC': names,
        'MEDIAN': [np.percentile(values, 50) for values in replicates],
        'CI_LOW': [np.percentile(values, tail) for values in replicates],
        'CI_HIGH': [np.percentile(values, 100 - tail) for values in replicates],
    })

In [None]:
n = 1000       # bootstrap replicates drawn for each demographic group
threshold = 1  # EHR dx more than this many months after the study date are voided

# Seed NumPy's global random state so the bootstrap intervals are reproducible
# under Restart Kernel -> Run All. sklearn's resample draws from this global
# state whenever no explicit random_state is supplied.
seed = 42
np.random.seed(seed)

In [None]:
# EHR dx more than 1 month after study date are voided
keep_dx = get_diagnosis_dataframe(white, threshold, 1)
white_bootstrap_results = pd.DataFrame()
for i in range(n):
    white_bootstrap_results = pd.concat([white_bootstrap_results, 
                                         bootstrap_performance_metrics(keep_dx)])
white_metrics = summarize_bootstrap_results(white_bootstrap_results)

In [None]:
# EHR dx more than 1 month after study date are voided
keep_dx = get_diagnosis_dataframe(poc, threshold, 1)
poc_bootstrap_results = pd.DataFrame()
for i in range(n):
    poc_bootstrap_results = pd.concat([poc_bootstrap_results, 
                                       bootstrap_performance_metrics(keep_dx)])
poc_metrics = summarize_bootstrap_results(poc_bootstrap_results)

In [None]:
# EHR dx more than 1 month after study date are voided
keep_dx = get_diagnosis_dataframe(hispanic, threshold, 1)
hispanic_bootstrap_results = pd.DataFrame()
for i in range(n):
    hispanic_bootstrap_results = pd.concat([hispanic_bootstrap_results, 
                                            bootstrap_performance_metrics(keep_dx)])
hispanic_metrics = summarize_bootstrap_results(hispanic_bootstrap_results)

In [None]:
# EHR dx more than 1 month after study date are voided
keep_dx = get_diagnosis_dataframe(nonhispanic, threshold, 1)
nonhispanic_bootstrap_results = pd.DataFrame()
for i in range(n):
    nonhispanic_bootstrap_results = pd.concat([nonhispanic_bootstrap_results, 
                                               bootstrap_performance_metrics(keep_dx)])
nonhispanic_metrics = summarize_bootstrap_results(nonhispanic_bootstrap_results)

In [None]:
# Write each group's summary table to its own CSV, without the row index.
metrics_by_path = {
    '../results/diagnosis/white_metrics.csv': white_metrics,
    '../results/diagnosis/poc_metrics.csv': poc_metrics,
    '../results/diagnosis/hispanic_metrics.csv': hispanic_metrics,
    '../results/diagnosis/nonhispanic_metrics.csv': nonhispanic_metrics,
}
for csv_path, metrics_table in metrics_by_path.items():
    metrics_table.to_csv(csv_path, index=False)