In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sys 
import os

from os.path import join as oj
from tqdm import tqdm
import re

# TODO: make more pythonic
# this line walks up file directory so rule-vetting is cwd
# Currently this breaks if chunk is run multiple times
os.chdir(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))))
print(os.getcwd())

/accounts/grad/wtorous/rule-vetting


In [2]:
import rulevetting
import rulevetting.api.util
from rulevetting.projects.csi_pecarn import helper

In [3]:
from rulevetting.projects.csi_pecarn.dataset import Dataset

In [4]:
import rulevetting.api.viz as viz

In [5]:
def output_processed_df(df, input_name,output_name,preprocess_description="Altered"):
    covar_info = df.loc[df['original_name']==input_name].copy() # copy data of covar to be transformed
    covar_info['filename'] = "derived"
    covar_info['original_name'] = output_name
    covar_info['final_name'] = output_name
    covar_info['pre_processing'] = preprocess_description
    covar_df = pd.DataFrame(data=covar_info,columns=covar_col_names)
    
    return covar_df

In [6]:
raw_data_path = oj(rulevetting.DATA_PATH, 'csi_pecarn', 'raw')
print(raw_data_path)
os.makedirs(raw_data_path, exist_ok=True)
# all the fnames to be loaded and searched over        
fnames = sorted([fname for fname in os.listdir(raw_data_path) if 'csv' in fname])
# read through each fname and save into the r dictionary
r = {}
print('read all the csvs...\n', fnames)
if len(fnames) == 0:
    print('no csvs found in path', rulevetting.DATA_PATH)

# replace studysubjectid cases with id
for fname in tqdm(fnames):
    df = pd.read_csv(oj(raw_data_path, fname), encoding="ISO-8859-1")
    df.rename(columns={'StudySubjectID': 'id'}, inplace=True)
    df.rename(columns={'studysubjectid': 'id'}, inplace=True)
    df.columns = [re.sub('SITE','site',x) for x in df.columns]
    df.columns = [re.sub('CaseID','case_id',x) for x in df.columns]
    df.columns = [re.sub('caseid','case_id',x) for x in df.columns]
    df.columns = [re.sub('ControlType','control_type',x,flags=re.IGNORECASE) for x in df.columns]
    
    #df.columns = [re.sub('CSpine','CervicalSpine',x) for x in df.columns]

    '''
    if fname == "clinicalpresentationfield.csv": 
        print("There")
        df.iloc[:,4:] = df.add_suffix('_ems')
        print(df.iloc[:,4:].head())
    if fname == "clinicalpresentationoutside.csv":
        df.iloc[:,4:] = df.add_suffix('_outside')
    if fname == "clinicalpresentationsite.csv":
        df.iloc[:,4:] = df.add_suffix('_site')
    '''
    assert ('id' in df.keys())   
    
    r[fname] = df

100%|██████████| 12/12 [00:00<00:00, 68.96it/s]

/accounts/grad/wtorous/rule-vetting/data/csi_pecarn/raw
read all the csvs...
 ['analysisvariables.csv', 'clinicalpresentationfield.csv', 'clinicalpresentationoutside.csv', 'clinicalpresentationsite.csv', 'demographics.csv', 'injuryclassification.csv', 'injurymechanism.csv', 'kappa.csv', 'medicalhistory.csv', 'radiologyoutside.csv', 'radiologyreview.csv', 'radiologysite.csv']





We do not include data from `kappa` for now because it is only used as a withheld robustness check.  Data from `radiologyreview` is too detailed because we only need information about the types of radiographs ordered.

In [7]:
fnames_small = [fname for fname in fnames
                        if not 'kappa' in fname
                        and not 'radiologyreview' in fname]

In [8]:
# Will output final summary of cleaning
covar_col_names = ['filename','final_name','original_name','type','frac_missing','pre_processing']
covariate_summary = pd.DataFrame(columns=covar_col_names)

In [9]:
index_variables = ['id','case_id','site','control_type']
for file_name in fnames_small:
    file_df = r[file_name]
    file_na = file_df.isnull().mean()
    for covar_index, covar_name in enumerate(file_df.columns):
        if covar_name in index_variables: continue
        covar_final_name = np.nan
        covar_file = file_name.split('.')[0]
        
        # convert to boolean if possible
        if np.isin(file_df[covar_name].dropna().unique(), [0, 1]).all():
            file_df[covar_name] = file_df[covar_name].astype(bool)
            
        covar_type = file_df.dtypes[covar_index] 
        covar_frac_missing = file_na[covar_index]
        covar_summary = [[covar_file,covar_final_name,covar_name,covar_type,covar_frac_missing,""]]
        covar_df = pd.DataFrame(data=covar_summary,columns=covar_col_names)
        covariate_summary = covariate_summary.append(covar_df,ignore_index=True)

We start with 666 covariates from 8 files.

In [10]:
print(covariate_summary.shape[0])

664


We keep all of the binary features derived by Leonard et al. (2011). Observational units are multi-indexed by `id`, `case_id`,`control_type`, and `site`. We include a new variable `NonAmbulatory` which negates `ambulatory` because Leonard et al. use that transformation. By using `NonAmbulatory` over `ambulatory`, all covariates from this file are encoded as `1` if they increase the risk for CSI injuries and `0` otherwise.

In [11]:
df_features = r[fnames[0]].copy()
post_hoc_features = [] # keep track of post-hoc features to remove before prediction
# create multi-index
df_features = df_features.set_index(['id','case_id','site','control_type'])

We add a binary outcome indicator `csi_injury` which is True if a subject has `control_type=="case"`.

In [12]:
df_features.loc[:,'csi_injury'] = df_features.index.get_level_values('control_type').map(helper.assign_binary_outcome)
outcome_df = output_processed_df(covariate_summary,'ambulatory','csi_injury',preprocess_description="Derived from control_type")
covariate_summary = covariate_summary.append(outcome_df,ignore_index=True)

In [13]:
# change binary variable label so that 1 is negative result
df_features.loc[:,'NonAmbulatory'] = df_features.loc[:,'ambulatory'].replace([True,False],[False,True])
df_features.drop(['ambulatory'], axis=1, inplace=True)

In [14]:
# Update Information
nonamb_df = output_processed_df(covariate_summary,"ambulatory","NonAmbulatory",preprocess_description="Derived from ambulatory")
covariate_summary = covariate_summary.append(nonamb_df,ignore_index=True)

# Mark rest of analysis variables as included and unchanged except of ambulatory
covariate_summary.loc[covariate_summary['filename']=='analysisvariables','final_name'] = covariate_summary.loc[covariate_summary['filename']=='analysisvariables','original_name']
covariate_summary.loc[covariate_summary['filename']=='analysisvariables','pre_processing'] = 'Unchanged'

covariate_summary.loc[covariate_summary['original_name']=='ambulatory','final_name'] = 'ambulatory'
covariate_summary.loc[covariate_summary['original_name']=='ambulatory','pre_processing'] = 'Removed; Information in NonAmbulatory'

The `injuryclassification.csv` data contains the specific diagnosis of the CSI injuries for 540 of the 541 subjects with a CSI. Since our problem is framed as a binary decision task for any CSI injury, these covariates are only useful for post-hoc analysis of the classifiers. For this reason we keep only the aggregate covariates `CSpineSignalChange`, `CSFractures`, `Ligamentoptions`.

In [15]:
injuryclassification_df = r['injuryclassification.csv'].copy()
injuryclassification_df = injuryclassification_df.set_index(['id','case_id','site','control_type'])
ic_summary_df = injuryclassification_df[["CSFractures","Ligamentoptions","CSpineSignalChange"]]
post_hoc_features.extend(list(ic_summary_df.columns))

In [16]:
# merge in aggregated injuryclassification data
df_features = pd.merge(df_features,ic_summary_df,how="left",left_index=True,right_index=True)

In [17]:
# Update Information
for injury_class_covar in injuryclassification_df.columns:
    if injury_class_covar in ic_summary_df.columns:
        covariate_summary.loc[(covariate_summary['filename']=='injuryclassification') & (covariate_summary['original_name']==injury_class_covar),'final_name'] = \
            covariate_summary.loc[(covariate_summary['filename']=='injuryclassification') & (covariate_summary['original_name']==injury_class_covar),'original_name']
        covariate_summary.loc[(covariate_summary['filename']=='injuryclassification') & (covariate_summary['original_name']==injury_class_covar),'pre_processing'] = \
            'Unchanged'
    else:
        covariate_summary.loc[(covariate_summary['filename']=='injuryclassification') & (covariate_summary['original_name']==injury_class_covar),'pre_processing'] = \
            'Removed; Use aggregate instead'

We include information about radiographs ordered at previous hospitals a patient had been evaluated at, if applicable. This comes from the file `radiologyoutside.csv`. We make a **judgement call** to only include indicators if an X-Ray, CT Scan, or MRI was performed.

In [18]:
radiologyoutside_df = r['radiologyoutside.csv'].copy()
radiologyoutside_df = radiologyoutside_df.set_index(['id','case_id','site','control_type'])
ro_summary_df = radiologyoutside_df[["Xrays","CTPerformed","MRIPerformed"]]

# Update Information
for radiology_outside_covar in radiologyoutside_df.columns:
    if radiology_outside_covar in ro_summary_df.columns:
        covariate_summary.loc[(covariate_summary['filename']=='radiologyoutside') & (covariate_summary['original_name']==radiology_outside_covar),'final_name'] = \
        covariate_summary.loc[(covariate_summary['filename']=='radiologyoutside') & (covariate_summary['original_name']==radiology_outside_covar),'original_name'] + "_outside"
        covariate_summary.loc[(covariate_summary['filename']=='radiologyoutside') & (covariate_summary['original_name']==radiology_outside_covar),'pre_processing'] = 'Unchanged'
    else:
        covariate_summary.loc[(covariate_summary['filename']=='radiologyoutside') & (covariate_summary['original_name']==radiology_outside_covar),'pre_processing'] = 'Removed; Use aggregate instead'

ro_summary_df.loc[:,["Xrays","CTPerformed","MRIPerformed"]] = np.where(ro_summary_df.copy() == "Y", True, False)
ro_summary_df.columns += '_outside'
ro_summary_df = ro_summary_df.astype(bool)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [19]:
# merge in aggregated radiologyoutside data
df_features = pd.merge(df_features,ro_summary_df,how="left",left_index=True,right_index=True)

The file `radiologysite.csv` contains information about the radiographs ordered at the study side. This is post-hoc information which we use to evaluate our predictions. For the same reasons as injury classification data, we only include the aggregated outcomes of whether an X-Ray, CT Scan, or MRI was performed.

In [20]:
radiologysite_df= r['radiologysite.csv'].copy()
radiologysite_df = radiologysite_df.set_index(['id','case_id','site','control_type'])
rs_summary_df = radiologysite_df[["Xrays","CTPerformed","MRIPerformed"]]

# Update Information
for radiology_site_covar in radiologysite_df.columns:
    if radiology_site_covar in rs_summary_df.columns:
        covariate_summary.loc[(covariate_summary['filename']=='radiologysite') & (covariate_summary['original_name']==radiology_site_covar),'final_name'] = \
        covariate_summary.loc[(covariate_summary['filename']=='radiologysite') & (covariate_summary['original_name']==radiology_site_covar),'original_name'] + "_site"
        covariate_summary.loc[(covariate_summary['filename']=='radiologysite') & (covariate_summary['original_name']==radiology_site_covar),'pre_processing'] = 'Unchanged'
    else:
        covariate_summary.loc[(covariate_summary['filename']=='radiologysite') & (covariate_summary['original_name']==radiology_site_covar),'pre_processing'] = 'Removed; Use aggregate instead'

rs_summary_df.loc[:,["Xrays","CTPerformed","MRIPerformed"]] = np.where(rs_summary_df.copy() == "Y", True, False)
rs_summary_df.columns += '_site'
rs_summary_df = rs_summary_df.astype(bool)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [21]:
# merge in aggregated radiologysite data
df_features = pd.merge(df_features,rs_summary_df,how="left",left_index=True,right_index=True)
post_hoc_features.extend(list(rs_summary_df.columns))

Data from `demographics.csv` is mainly used in justify this data as a representative sample. The only demographic information we include for prediction is age and gender. It is a **judgement call** to incorporate information about race and insurace type (ethnicity is rarely reported) only as post-hoc variables. 

Gender information is included in the binary covariate `male` while age is transformed into binary indicators. We derive `infant` if `age < 2` (this is a cutoff used in Leonard et al. 2011), non-verbal if `age < 5` (a cutoff suggested by Dr. Devlin), and teenage if `age >= 13`. The `AgeInYear` covariate is never missing.

In [22]:
demogrpahic_df= r['demographics.csv'].copy()
demogrpahic_df = demogrpahic_df.set_index(['id','case_id','site','control_type'])
demogrpahic_df.loc[:,'infant'] = np.where(demogrpahic_df['AgeInYears']< 2, True, False)
demogrpahic_df.loc[:,'non_verbal'] = np.where(demogrpahic_df['AgeInYears']< 5, True, False)
demogrpahic_df.loc[:,'teenage'] = np.where(demogrpahic_df['AgeInYears'] >= 13, True, False)
# TODO: 1 unit has no gender
demogrpahic_df.loc[:,'male'] = np.where(demogrpahic_df['Gender'] == "M", True, False)

dg_summary_df = demogrpahic_df[["infant","non_verbal","teenage","male","Race","PayorType"]]

In [23]:
# merge in aggregated demographic data
df_features = pd.merge(df_features,dg_summary_df,how="left",left_index=True,right_index=True)
post_hoc_features.extend(["Race","PayorType"])

In [24]:
# Update covariate information
for demographic_covar in demogrpahic_df.columns:
    if demographic_covar in post_hoc_features:
        covariate_summary.loc[(covariate_summary['filename']=='demographics') & (covariate_summary['original_name']==demographic_covar),'final_name'] = \
        covariate_summary.loc[(covariate_summary['filename']=='demographics') & (covariate_summary['original_name']==demographic_covar),'original_name']
        covariate_summary.loc[(covariate_summary['filename']=='demographics') & (covariate_summary['original_name']==demographic_covar),'pre_processing'] = 'Unchanged'
    else:
        covariate_summary.loc[(covariate_summary['filename']=='demographics') & (covariate_summary['original_name']==demographic_covar),'pre_processing'] = 'Removed'

for demographic_covar in dg_summary_df.columns:
    if demographic_covar not in post_hoc_features:
        outcome_df = output_processed_df(covariate_summary,"AgeInYears",demographic_covar,preprocess_description="Dervied from AgeInYears")
        outcome_df['type'] = dg_summary_df[demographic_covar].dtypes
        covariate_summary = covariate_summary.append(outcome_df,ignore_index=True)
        
        

Information from `medicalhistory.csv` can be used to determine if a patient has pre-existing condtions which put them at higher risk for CSIs. The variable `Predisposed` from `analysisvariables` summarizes much of the infomration in this datatable. We therefore make a **judgement call** to not consider these as separate covariates. 


We note that 712 units do not have any medical history. We mark their information as `NaN` in all relavent columns (not `False`). We need to make a **judgement call** about what to do with these units.

In [25]:
medicalhistory_df= r['medicalhistory.csv'].copy()
medicalhistory_df = medicalhistory_df.set_index(['id','case_id','site','control_type'])

useful_mh_covar = ['HEENT','Cardiovascular','Respiratory','Gastrointestinal','Musculoskeletal','Neurological','Medications']
for column in useful_mh_covar:
        char_column = medicalhistory_df[column] # select column
        unique_values = pd.unique(char_column) # get unique entries
        if (('Y' in unique_values)|('A' in unique_values)) & ('N' in unique_values):
            conditions  = [char_column == 'A',char_column == 'Y',char_column == 'N']
            encodings = [True, True, False]
            binary_encoded = np.select(conditions, encodings, default=np.nan)
            col_name = column+"_binary"
            medicalhistory_df.loc[:,col_name] = binary_encoded.copy()  
extracted_mh = medicalhistory_df.loc[:,medicalhistory_df.columns.str.endswith('binary')]

In [26]:
df_features = pd.merge(df_features,extracted_mh,how="left",left_index=True,right_index=True)

Much of the information recorded in `injurymechanism.csv` is summarized by variables in `analysisvariables.csv`. We make many **judgement calls** about which additional features to include from this dataset.

Information about injury date and time has high rates of missingness and does not seem particularly informative [verify this numerically]. 

We do not include ICD-9 codes from injury mechansim and location because the annotating physicicans created additional covariates which caputre informative patterns.

For motor vehicle crashes the `analysisvariables.csv` covariate `HighRiskMVC` summarizes information from a number of covariates. One it does not is whether a seatbelt was worn. Only 2 units have an other motor vehicle speed equal to 3, this is the decision boundary for motor vehicle speed in `HighRiskMVC`. Even assuming crashes in this are more dangerous because this category include motorcycles and ATVs, 14 units have speed greater than 2.

Booleans for assualt, child abuse, and helment wearing are included from this dataset. The is a duplicate of the covariate `Clotheslining` from `analysisvariables.csv`. We do not include covariates about shaken baby syndrome observed in only 6 subject. We make a **judgement call** `FallDownStairs` greater than 2 is high risk. Axial load analysis variable covariates have the most important information about head impacts, so we do not include the categorical `HeadFirstRegion`.

In [27]:
injurymechanism_df= r['injurymechanism.csv'].copy()
injurymechanism_df = injurymechanism_df.set_index(['id','case_id','site','control_type'])

helmet_column = injurymechanism_df["helmet"] # select column
unique_values = pd.unique(helmet_column) # get unique entries
print(unique_values)
if (('Y' in unique_values)) & (('N' in unique_values) |('ND' in unique_values)):
    conditions  = [helmet_column == 'Y',helmet_column == 'ND',helmet_column == 'N']
    encodings = [True, False, False]
    binary_encoded = np.select(conditions, encodings, default=False)
    col_name = "helmet_binary"
    injurymechanism_df[col_name] = binary_encoded 

injurymechanism_df.loc[:,"FallDownStairs_binary"] = np.where(injurymechanism_df['FallDownStairs'] >= 13, True, False)
useful_im_covar = ['PassRestraint','Assault','ChildAbuse','helmet_binary','FallDownStairs_binary']
injurymechanism_df = injurymechanism_df.loc[:,useful_im_covar]



[nan 'N' 'Y' 'ND']


In [28]:
df_features = pd.merge(df_features,injurymechanism_df,how="left",left_index=True,right_index=True)

Many of the covariates collected at the study site, at an outside hospital, or by EMS are the same. Covariates measured at the study site are clearly the most important. Analysis variables capture much of this information and the robust ones ending with `2` include criteria met from EMS and outside hospital data. 

We start with the study site. We again do not include information about arrival date and time. `LocalEvalPhysician` is highly skewed towards missing and the categories do not have labels, so we do not include it. Booleans about the type of cervical spine interventions used at other hospitals is included. Any information about medications recieved at another study site as well as intubaiton are included.

For mapping the categorical GCS and AVPU scores into a binary variable, we make a domain-informed **judgement call** to use `AlteredMentalStatus`.

Many variables record information about whether a patient complained about localized pain and whether they had tenderness. The `analysisvariables` summarize the neck pain information well. Should we also include face or head tenderness?

The descriptive `FocalNeurologicalFindings` covariate combines key indicators such as loss of sensation and extremity weakness. The rest of the covariates contain information about medications given, interventions recieved, and long-term outcomes. We already include three injury classification categories for post-hoc analysis. 

Is useing `MedsGiven` looking ahead? Is one `MedsRecd` category enough?

The covariates in `ClinicalPresentationOutside.csv` and `ClinicalPresentationField.csv` contain information already included such as `MedsRecd`. Following the strategy of Leonard et al. (2011), we make some robust covariates indicated by `2` which indicate a positive observation at any location, such as `PtCompPainNeckMove2`.

One unsure covariate is `EMSEstimateTime`.

In [29]:
site_df= r['clinicalpresentationsite.csv'].copy()
site_df = site_df.set_index(['id','case_id','site','control_type'])

useful_site_covariates = ['ReceivedInTransfer','DxCspineInjury','PtCompPainNeckMove']

for covar in useful_site_covariates:
        char_column = site_df[covar] # select column
        unique_values = pd.unique(char_column) # get unique entries
        print(unique_values)
        if (('Y' in unique_values)) & ('N' in unique_values):
            conditions  = [char_column == 'Y',char_column == 'N',char_column == 'ND']
            encodings = [True, False, False]
            binary_encoded = np.select(conditions, encodings, default=np.nan)
            col_name = covar+"_binary"
            site_df.loc[:,col_name] = binary_encoded.copy().astype(bool) # TODO: converts to NAN
            
useful_site_binary_covariates = ['ReceivedInTransfer_binary','DxCspineInjury_binary','PtCompPainNeckMove_binary',\
                                 'CervicalSpineIntervCC','CervicalSpineIntervRLB','CervicalSpineIntervOther']
sitebinary_df = site_df.loc[:,useful_site_binary_covariates]
df_features = pd.merge(df_features,sitebinary_df,how="left",left_index=True,right_index=True)

['N' 'Y']
['N' 'Y']
[nan 'ND' 'Y' 'N']


In [30]:
#TODO: Make robust versions

In [31]:
# TODO: standardize names

In [32]:
df_features

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,AlteredMentalStatus,LOC,FocalNeuroFindings,PainNeck,PosMidNeckTenderness,TenderNeck,Torticollis,SubInj_Head,SubInj_Face,SubInj_Ext,...,Assault,ChildAbuse,helmet_binary,FallDownStairs_binary,ReceivedInTransfer_binary,DxCspineInjury_binary,PtCompPainNeckMove_binary,CervicalSpineIntervCC,CervicalSpineIntervRLB,CervicalSpineIntervOther
id,case_id,site,control_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
110001,110001,1,case,True,True,False,False,False,False,False,True,False,False,...,True,True,False,False,False,False,True,True,True,False
110002,110002,1,case,False,False,False,True,True,True,False,False,False,False,...,True,True,False,False,False,False,False,True,True,True
110003,110003,1,case,False,False,False,True,True,True,True,False,False,False,...,True,True,False,False,False,False,False,False,False,False
110004,110004,1,case,False,False,False,True,True,True,False,False,False,False,...,True,True,True,False,True,True,False,True,False,False
110005,110005,1,case,True,True,False,True,True,True,True,False,False,False,...,True,True,False,False,False,False,False,False,False,False
110006,110006,1,case,False,True,False,True,True,True,False,False,False,False,...,True,True,False,False,False,False,False,True,True,False
110007,110007,1,case,False,False,True,True,True,True,False,False,False,False,...,True,True,False,False,False,False,False,True,True,False
110008,110008,1,case,False,False,False,True,True,True,False,False,False,False,...,True,True,False,False,False,False,True,False,False,False
110009,110009,1,case,False,True,False,True,True,True,False,False,False,False,...,True,True,False,False,False,False,True,False,False,False
110010,110010,1,case,False,False,False,True,False,False,False,False,False,False,...,True,True,False,False,False,False,False,True,True,False
