In [None]:
import pandas as pd
import numpy as np

# Error Analysis

In [None]:
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
combined = pd.read_pickle('../data/combined_datasets.pkl')

# Signed whole months from the ENDO study date to the EHR diagnosis date:
# pos = dx with endo in EHR after study, neg = dx with endo in EHR prior to study.
# Newer pandas versions no longer accept the calendar-dependent 'M' timedelta unit, so the
# difference is divided by the fixed span NumPy uses for one month
# (365.2425 / 12 days = 30.436875 days = 2,629,746 seconds), which yields the same values.
# Rows with a missing date difference get 0; fractional months truncate toward zero.
combined['months_between'] = ((combined['EHR_Dx_Date'] - combined['ENDO_study_date']) /
                              np.timedelta64(2629746, 's')).fillna(0).astype(int)

In [None]:
def get_diagnosis_dataframe(original, threshold, cohort):
    """Build one row per (Masked_PersonID, EndoID) pair for a single cohort.

    Parameters
    ----------
    original : pandas.DataFrame
        Input rows; must contain 'Masked_PersonID', 'EndoID', 'Cohort_final' and,
        when a threshold is given, 'months_between' plus the EHR flag columns.
        The frame itself is never modified.
    threshold : int or None
        When not None, every EHR flag is set to 0 on rows whose
        'months_between' is strictly greater than this value before the
        rows are collapsed. None skips that step.
    cohort : scalar
        Only groups whose collapsed 'Cohort_final' equals this value are returned.

    Returns
    -------
    pandas.DataFrame
        Column-wise maximum per (Masked_PersonID, EndoID), filtered to `cohort`.
    """
    ehr_flag_columns = ('endo_dx_EHR', 'SE_EHR', 'OE_EHR', 'DE_EHR', 'tubal_EHR', 'scar_EHR')

    records = original
    if threshold is not None:
        # Work on a copy so the caller's frame keeps its original flags.
        records = original.copy()
        beyond_threshold = records['months_between'] > threshold
        for flag in ehr_flag_columns:
            records.loc[beyond_threshold, flag] = 0

    # Taking the max per group means a flag is 1 if any remaining row has it set.
    collapsed = records.groupby(['Masked_PersonID', 'EndoID']).max().reset_index()
    return collapsed.loc[collapsed['Cohort_final'] == cohort]

In [None]:
# Cohort 1 only; EHR flags on rows more than 1 month past the study date are zeroed first.
diagnosis_data = get_diagnosis_dataframe(original=combined, threshold=1, cohort=1)

In [None]:
# True positives: diagnosis present in both the EHR and the ENDO study.
tp = diagnosis_data.loc[
    diagnosis_data['endo_dx_EHR'].eq(1) & diagnosis_data['endo_dx_ENDO'].eq(1)
]

In [None]:
# True negatives: diagnosis absent from both the EHR and the ENDO study.
tn = diagnosis_data.loc[
    diagnosis_data['endo_dx_EHR'].eq(0) & diagnosis_data['endo_dx_ENDO'].eq(0)
]

In [None]:
# False negatives: ENDO study records the diagnosis, EHR does not.
fn = diagnosis_data.loc[
    diagnosis_data['endo_dx_ENDO'].eq(1) & diagnosis_data['endo_dx_EHR'].eq(0)
]

In [None]:
# False positives: EHR records the diagnosis, ENDO study does not.
fp = diagnosis_data.loc[
    diagnosis_data['endo_dx_ENDO'].eq(0) & diagnosis_data['endo_dx_EHR'].eq(1)
]

## Get full data for investigation

### EHR

In [None]:
# Read the pipe-delimited EHR diagnosis export. Codes are forced to text so that
# type inference can never turn a value such as '617' into a number, which would
# silently fail every string comparison below.
ehr_dx = pd.read_csv('../data/ehr_data.txt', sep='|', header=0, dtype={'code': str})

# Parse the diagnosis date. pd.to_datetime replaces astype('datetime64'), whose
# unit-less dtype is rejected by pandas 2.x.
ehr_dx['EHR_Dx_Date'] = pd.to_datetime(ehr_dx['Dx_Date'])

# Diagnosis codes (617.x and N80.x families) that set each 0/1 subtype flag.
# NOTE(review): SE/OE/DE presumably stand for superficial / ovarian / deep
# endometriosis -- confirm against the study protocol.
ehr_codes_by_flag = {
    'SE_EHR': ['617', '617.0', '617.00', '617.2', '617.3', 'N80.0', 'N80.2', 'N80.3'],
    'OE_EHR': ['617.1', 'N80.1'],
    'DE_EHR': ['617.4', '617.49', '617.5', 'N80.4', 'N80.5'],
    'other_EHR': ['617.6', '617.8', '617.9', '617.95', 'N80.6', 'N80.8', 'N80.9'],
}

# A row gets 1 in a flag column when its code is in that flag's list, else 0
# (missing codes never match).
for flag_column, icd_codes in ehr_codes_by_flag.items():
    ehr_dx[flag_column] = np.where(ehr_dx['code'].isin(icd_codes), 1, 0)

In [None]:
# All EHR diagnosis rows for one participant, in chronological order.
# Sort on the parsed 'EHR_Dx_Date' rather than the raw 'Dx_Date' text, because
# text sorting is not chronological for formats such as MM/DD/YYYY.
ehr_dx.loc[ehr_dx['Masked_PersonID'] == 'a123'].sort_values(['EHR_Dx_Date', 'code'])

### ENDO Study

In [None]:
# Both ENDO study exports are tab-separated, '\n'-terminated and ISO-8859-1 encoded.
typology = pd.read_csv(
    '../data/ENDOStudyData/typology.txt',
    sep='\t',
    lineterminator='\n',
    encoding='ISO-8859-1',
)
operative = pd.read_csv(
    '../data/ENDOStudyData/operative.txt',
    sep='\t',
    lineterminator='\n',
    encoding='ISO-8859-1',
)

In [None]:
# Surgical indication and surgeon for one participant
operative.loc[operative['StudyID_op'].eq('a123'), ['Q4_op', 'Q1_op']]

In [None]:
# ENDO study diagnosis fields for the same participant
operative.loc[operative['StudyID_op'].eq('a123'), ['Q6_op', 'Q7b_op', 'Q5_op']]

In [None]:
# Every typology row recorded for the same participant
typology.loc[typology['StudyID_typo'].eq('a123')]