# Imports

In [1]:
# Only uncomment if you haven't had these installed before
# !pip install numpy pandas

In [2]:
import numpy as np
import pandas as pd

# Import Data

You should have 4 spreadsheets in the `Data` folder with the following columns:

**Data/flowsheets.csv**
* HSH_ADMSN_ID
* FLO_MEAS_NAME
* NORMALIZED_NAME
* NFLO_MEAS_ID
* GROUPED_FLO_ID
* MEAS_VALUE
* UNITS
* RECODED_TIME


**Data/Labs.csv**
* HSH_ADMSN_ID
* NORMALIZED_LAB_NAME
* COMPONENT_ID
* COMPONENT_NAME
* ORD_VALUE
* ORD_NUM_VALUE_CORRECTED
* RESULT_TIME


**Data/flowsheetspressures.csv**
* HSH_ADMSN_ID
* FLO_MEAS_NAME
* NORMALIZED_NAME
* NFLO_MEAS_ID
* GROUPED_FLO_ID
* MEAS_VALUE
* UNITS
* RECODED_TIME
* SYSTOLIC
* DIASTOLIC
* MAP


**Data/main.csv**
* HSH_ADMSN_ID
* ADMSN_MINUTES
* ICU_MINS
* DISCHARGE_MINUTES
* IN_ICU_TIME
* FIRST_LOCATION
* PROC_NAME
* EXAM_BEGIN_MINUTES
* VERAPAMIL_TAKEN_TIME
* CPT_COIL
* INPT_DEATH_YN
* BMI
* AGE_LT90
* SEX
* ETHNICITY
* RACE
* ADMITTING_SERVICE
* MEANS_OF_ARRIVAL

In [3]:
flowsheets = pd.read_csv('Data/flowsheets.csv')
labs = pd.read_csv('Data/Labs.csv')
flowsheets_pressures = pd.read_csv('Data/flowsheetspressures.csv')
pts = pd.read_csv('Data/main.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# standardize names of the tests in the flowsheets file
mapping_dict = {
        'O2 FLOW':'Supplemental O2', 
        'FIO2':'Supplemental FIO2',
        'ETCO2': 'Supplemental ETCO2',
        'AIRWAY VIEW GRADE ': 'Mechanical Ventilation',
        'AIRWAY DIFFICULTY': 'Mechanical Ventilation',
        'ETT NUMBER OF ATTEMPTS': 'Mechanical Ventilation',
        'ETT EQUIPMENT': 'Mechanical Ventilation',
        'O2 FACE MASK': 'Supplemental O2',
        'TUBE OUTPUT': 'TUBE FEEDING',
        'MAINTENANCE IV VOLUME': 'IV Fluids',
        'O2 NASAL': 'Supplemental O2',
        'BLOOD ADMINISTRATION VOLUME': 'Blood Admin',
        'BLOOD OUTPUT': 'Blood Loss',
        'AIRWAY COMMENTS': 'Mechanical Ventilation',
        'GASTRIC TUBE': 'TUBE FEEDING'
    }

flowsheets.NORMALIZED_NAME.replace(
    to_replace=mapping_dict, 
inplace=True)


In [5]:
pts_filtered = pts[['HSH_ADMSN_ID', 'DISCHARGE_MINUTES', 'VERAPAMIL_TAKEN_TIME', 'IN_ICU_TIME', 'ICU_MINS', 'BMI', 'AGE_LT90', 'SEX', 'ETHNICITY', 'RACE']]

In [6]:
pts_filtered.loc[~pts_filtered['VERAPAMIL_TAKEN_TIME'].isna(), 'VERAPAMIL_TAKEN'] = True
pts_filtered.loc[pts_filtered['VERAPAMIL_TAKEN_TIME'].isna(), 'VERAPAMIL_TAKEN'] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [7]:
pts_filtered['VERAPAMIL_TAKEN_TIME'] = pts_filtered['VERAPAMIL_TAKEN_TIME'].fillna(pts_filtered['ICU_MINS'] + pts_filtered['IN_ICU_TIME'])
pts_filtered['VERAPAMIL_TAKEN_TIME'] = pd.to_numeric(pts_filtered['VERAPAMIL_TAKEN_TIME'])

cutpts = [240, 1440, 2880, 4320, 5760, 7200, 10080, 14400, 20160]

for t in cutpts:
    pts_filtered[f"{t}_cutpoint"] = pts_filtered['IN_ICU_TIME'] + t


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [8]:
flowsheets = flowsheets[['HSH_ADMSN_ID', 'NORMALIZED_NAME', 'MEAS_VALUE', 'RECODED_TIME']]
flowsheets_pressures = flowsheets_pressures[['HSH_ADMSN_ID', 'NORMALIZED_NAME', 'SYSTOLIC', 'DIASTOLIC', 'MAP', 'RECODED_TIME']]
countlabs = labs.groupby('HSH_ADMSN_ID').agg('count')
lab_filtered = countlabs[countlabs['NORMALIZED_LAB_NAME'] > 0].reset_index().rename(
    columns={'NORMALIZED_LAB_NAME': 'lab_count'})[['HSH_ADMSN_ID', 'lab_count']]

labs_clean = labs.merge(lab_filtered, on='HSH_ADMSN_ID', how='right')[['HSH_ADMSN_ID','NORMALIZED_LAB_NAME', 'ORD_NUM_VALUE_CORRECTED', 'RESULT_TIME']]

cutpts_nms = [f"{x}_cutpoint" for x in cutpts]
cutpoints = pts_filtered[['HSH_ADMSN_ID'] + cutpts_nms]

flowsheets_w_cutpoints = flowsheets.merge(cutpoints, on='HSH_ADMSN_ID', how='left')
flowsheets_pressures_w_cutpoints = flowsheets_pressures.merge(cutpoints, on='HSH_ADMSN_ID', how='left')
labs_clean_w_cutpoints = labs_clean.merge(cutpoints, on='HSH_ADMSN_ID', how='left')

In [9]:
# Helpers
def rename(newname):
    def decorator(f):
        f.__name__ = newname
        return f
    return decorator

def q_at(y):
    @rename(f'q{y:0.2f}')
    def q(x):
        return x.quantile(y)
    return q

In [10]:
def output_full_sheet(T):    

    print("process lab data")
    l = labs_clean_w_cutpoints.copy()

    l_time = l[l['RESULT_TIME'] < l[f'{T}_cutpoint']] #only get labs before cutpoint time

    f = {'ORD_NUM_VALUE_CORRECTED': ['min', 'max', 'mean', 'count']} # setup for aggregation of labs. didn't do 21 feat vector...too long

    lab_quantiles = l_time.groupby(['HSH_ADMSN_ID', 'NORMALIZED_LAB_NAME']).agg(f) # gets min, max, mean, and count for each lab

    lab_quantiles.columns = lab_quantiles.columns.to_flat_index() # flatten to prep for pivot

    lab_quantiles_flat = lab_quantiles.reset_index() # reset index to prep for pivot

    # pivot labs to have table structure with 
    #       LAB1 min, LAB1 max, LAB1 mean, LAB1 count, LAB2 min ...
    # ptID  ...

    wide_labs = lab_quantiles_flat.pivot(index='HSH_ADMSN_ID', columns='NORMALIZED_LAB_NAME') 

    wide_labs.columns = wide_labs.columns.to_flat_index()

    print("process ICP data")

    icp = flowsheets_w_cutpoints.copy()

    icp_time = icp[icp['RECODED_TIME'] < icp[f'{T}_cutpoint']] # only get icp before cutpoint time

    icp = icp[icp['NORMALIZED_NAME'] == 'ICP'] # only care about ICP data in this 
 
    icp['ICP_MEAS_VALUE'] = pd.to_numeric(icp['MEAS_VALUE']) # some non numbers, just coerce to 0

    f = {'ICP_MEAS_VALUE': [q_at(x) for x in np.linspace(0.05,1,20)]+['count']} # grouping stuff to get the 20 percentiles

    icp_quantiles = icp.groupby(['HSH_ADMSN_ID']).agg(f) 

    icp_quantiles.columns = icp_quantiles.columns.to_flat_index() 

    icp_quantiles_no_lt100 = icp_quantiles 

    print("process BP data")

    bp = flowsheets_pressures_w_cutpoints.copy() 

    bp = bp[bp['RECODED_TIME'] < bp[f'{T}_cutpoint']] # only stuff before the cutpoint

    f = {'MAP': [q_at(x) for x in np.linspace(0.05,1,20)]+['count']} # same 20 percentile thing thing

    bp_quantiles = bp.groupby(['HSH_ADMSN_ID','NORMALIZED_NAME']).agg(f)

    bp_quantiles.columns = bp_quantiles.columns.to_flat_index()

    bp_quantiles_flat = bp_quantiles.reset_index()

    wide_bp = bp_quantiles_flat.pivot(index='HSH_ADMSN_ID', columns='NORMALIZED_NAME')

    wide_bp.columns = wide_bp.columns.to_flat_index()

    print("get the counts for the variables etc.")

    icp_analysis_one = icp_time.groupby(['HSH_ADMSN_ID', 'NORMALIZED_NAME']).count()[['MEAS_VALUE']]

    icp_analysis_one_raw = icp_analysis_one.reset_index().pivot(index='HSH_ADMSN_ID', columns='NORMALIZED_NAME')

    icp_analysis_one_raw.columns = icp_analysis_one_raw.columns.to_flat_index()

    icp_analysis_bool = icp_analysis_one_raw.notnull()

    icp_analysis_count = icp_analysis_one_raw.fillna(0)

    demos = pts_filtered[['HSH_ADMSN_ID', 'VERAPAMIL_TAKEN_TIME', 'VERAPAMIL_TAKEN', f'{T}_cutpoint', 'BMI', 'AGE_LT90', 'SEX', 'ETHNICITY', 'RACE']]

    demos_labeled = demos[['HSH_ADMSN_ID', 'VERAPAMIL_TAKEN', f'{T}_cutpoint', 'VERAPAMIL_TAKEN_TIME', 'BMI', 'AGE_LT90', 'SEX', 'ETHNICITY', 'RACE']]
    """
    variables:
    1. mixed vars binary = icp_analysis_bool
    2. mixed vars counts = icp_analysis_count
    3. ICP actual values = icp_quantiles_no_lt100
    4. BP = wide_bp
    5. lab = wide_labs (use this as the main index probs)
    6. demographics = demos_labeled
    """
    print("merging..")
    final_df = wide_labs.reset_index().merge(demos_labeled, how='left', on='HSH_ADMSN_ID')\
        .merge(wide_bp, how='left', on='HSH_ADMSN_ID')\
        .merge(icp_quantiles_no_lt100, how='left', on='HSH_ADMSN_ID')\
        .merge(icp_analysis_count, how='left', on='HSH_ADMSN_ID')\
        .merge(icp_analysis_bool, how='left', on='HSH_ADMSN_ID')
    
    final_df = final_df[final_df['VERAPAMIL_TAKEN_TIME'] > final_df[f'{T}_cutpoint']]
    def relabel_verap(row):
        if row['VERAPAMIL_TAKEN'] == False:
            return "False"
        else:
            # return "True"
            res = int((row['VERAPAMIL_TAKEN_TIME']-row[f'{T}_cutpoint'])/1440)
            if res < 3:
                return f"Will need verapamil in [0-3) days"
            else:
                return f"Will need verapamil in 3+ days"
    final_df['VERAPAMIL_TAKEN'] = final_df.apply(relabel_verap, axis=1)
    print(final_df['VERAPAMIL_TAKEN'].value_counts())
    final_df = final_df.drop(columns=[f'{T}_cutpoint', 'VERAPAMIL_TAKEN_TIME'])
    print(f"saving to Cleaned-timeto/{T}.csv")
    final_df.to_csv(f'Cleaned-timeto/{T}.csv')
    return final_df

In [11]:
df_tmpts = []
for t in [240, 1440, 2880, 4320, 7200, 10080, 14400]:
    print("Making for t=", t)
    df_tmpt = output_full_sheet(t)
    df_tmpts.append(df_tmpt)
    print("==Value Counts==")
    print(df_tmpt['VERAPAMIL_TAKEN'].value_counts())
    print("====")

Making for t= 240
process lab data
process ICP data
process BP data
get the counts for the variables etc.
take care of demographics and finally code if someone got varap or not
merging..
1901
False                                1762
Will need verapamil in 3+ days        100
Will need verapamil in [0-3) days      24
Name: VERAPAMIL_TAKEN, dtype: int64
saving to Cleaned-timeto/240.csv
==Value Counts==
False                                1762
Will need verapamil in 3+ days        100
Will need verapamil in [0-3) days      24
Name: VERAPAMIL_TAKEN, dtype: int64
====
Making for t= 1440
process lab data
process ICP data
process BP data
get the counts for the variables etc.
take care of demographics and finally code if someone got varap or not
merging..
1947
False                                1555
Will need verapamil in 3+ days         99
Will need verapamil in [0-3) days      16
Name: VERAPAMIL_TAKEN, dtype: int64
saving to Cleaned-timeto/1440.csv
==Value Counts==
False                  

In [13]:
for t in [240, 1440, 2880, 4320, 7200, 10080, 14400]:
    predictors = pd.read_csv('annotated_predictors.csv')
    inc_columns = predictors[predictors['physiologic'] == 1.0]['HSH_ADMSN_ID'].to_numpy()
    min_columns = filter(lambda x: x.find('count') == -1 and x.find('_x') == -1, inc_columns)
    csv_orig = pd.read_csv(f'Cleaned-timeto/{t}.csv')
    csv_orig[csv_orig.columns & (['HSH_ADMSN_ID', 'VERAPAMIL_TAKEN'] + list(min_columns))].to_csv(f'Cleaned-timeto/{t}_nocounts.csv')

  
