### Script for function to extract patient info data

This function loads chart events (specified by user-entered ID) into a dataframe, filters the events to ventilated patients, and places the different chart events into different rows of the dataframe.

#### import useful libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import psutil
import os

#### print virtual memory available

In [None]:
# Snapshot the system memory statistics and report the `available` figure (in bytes).
svmem = psutil.virtual_memory()
available_bytes = svmem.available
print(available_bytes)

#### print size of database we're pulling from

In [None]:
# Size on disk (in bytes) of the chart events file that is read below.
chartevents_path = './chartevents.csv'
os.path.getsize(chartevents_path)

#### figure out chunk size for pandas dataframe reading

In [None]:
# Derive a chunk size for the chunked CSV read from the in-memory footprint
# of a 10-row sample of the file.
df_sample = pd.read_csv('./chartevents.csv', nrows=10)
df_sample_size = df_sample.memory_usage(index=True).sum()  # bytes used by the 10-row sample

# NOTE(review): 2000000000 looks like a ~2 GB memory budget; the extra
# division by 10 further shrinks the chunk -- confirm the intended sizing.
memory_budget = 2000000000
scaled_budget = memory_budget / df_sample_size / 10
my_chunk = int(scaled_budget // 1)  # keep only the integer part
print(my_chunk)

4) create dataframe structure and set chunksize for iterating data into dataframe

In [None]:
# Explicit column types for the chart events file.
chartevents_dtypes = {
    'subject_id': int,
    'hadm_id': int,
    'stay_id': int,
    'charttime': str,
    'storetime': str,
    'itemid': int,
    'value': str,
    'valuenum': float,
    'valueuom': str,
    'warning': int,
}

# Lazy reader: yields the file `my_chunk` rows at a time instead of loading it whole.
iter_csv = pd.read_csv(
    'chartevents.csv',
    dtype=chartevents_dtypes,
    chunksize=my_chunk,
    iterator=True,
)

5) get chart events data

In [None]:
# itemids of the chart events to keep. Later cells use 226531 (weight),
# 226730 (height in cm) and 226707 (height).
# NOTE(review): the meaning of the remaining ids is not shown here -- confirm.
item_ids = [
    220003, 226228, 226545,
    226515, 226724, 227088,
    224639, 226531, 226707,
    226730,
]

# Filter every chunk down to the wanted itemids, then stitch the pieces together.
# `iter_csv` is consumed by this loop, so recreate the reader before re-running this cell.
matching_chunks = [chunk[chunk['itemid'].isin(item_ids)] for chunk in iter_csv]
df_result = pd.concat(matching_chunks)

In [None]:
# Display the chart events kept by the itemid filter (bare last expression renders the frame).
df_result

#### get ids of patients that were ventilated and select these patients

In [1]:
# Load the ventilation table and keep only chart events whose admission
# (hadm_id) appears in it.
svents = pd.read_csv('sample_vents.csv')
ids = svents['hadm_id']
in_vent_cohort = df_result['hadm_id'].isin(ids)
df_result = df_result[in_vent_cohort]

NameError: name 'pd' is not defined

#### get weight of patients (where duplicate values exist, take average)

In [None]:
# Weight rows are the chart events with itemid 226531.
is_weight = df_result['itemid'] == 226531
weight = df_result[is_weight]

# One value per admission: repeated measurements are averaged.
avg_dupValues = weight.groupby('hadm_id')['valuenum'].mean()

# Back to a two-column frame: hadm_id, weight.
meanWeight = avg_dupValues.reset_index(name='weight')
meanWeight

#### get height of patients and convert height in cm to inches (where duplicate values exist, take average)

In [None]:
# Height rows: itemid 226730 (recorded in cm) and itemid 226707.
# Take an explicit copy so the column assignment below writes to an independent
# frame rather than a slice of df_result (avoids SettingWithCopyWarning).
height = df_result[(df_result['itemid']==226730)|(df_result['itemid']==226707)].copy()

# convert height in cm to inches - divide by 2.54 (only the 226730 rows are converted)
is_cm = height['itemid'] == 226730
height['valuenum'] = np.where(is_cm, height['valuenum'] / 2.54, height['valuenum'])

# One value per admission: repeated measurements are averaged.
avg_dupValues = height.groupby('hadm_id')['valuenum'].mean()
meanHeight = avg_dupValues.reset_index()
meanHeight.columns = (['hadm_id','height'])
meanHeight

#### load feature table and add height and weight to it

make sure feature table only includes correct hadm_ids

In [None]:
# Load the feature table and restrict it to the admissions listed in `ids`.
feature_table = pd.read_csv('feature_table.csv')
in_cohort = feature_table['hadm_id'].isin(ids)
feature_table = feature_table[in_cohort]

# Left-join the per-admission weight, then height, so every feature row is kept
# even when no measurement exists for that admission.
new_df = feature_table
for measurements in (meanWeight, meanHeight):
    new_df = pd.merge(left=new_df, right=measurements, how='left', on='hadm_id')

#### From the admissions data for patients that were ventilated, select features that might be useful in the model or later

**NB:** I keep getting warning messages about setting data values on a slice, and this leads to unexpected behavior: if you re-run the np.where commands after re-defining svents_keep, svents_keep is not established as a brand-new dataframe; the calculations just accumulate from whatever values were last in there. I am also struggling to rename the 'value' column. This should be changed to use df.where together with df.copy: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#the-where-method-and-masking

1) For 'value' (i.e. length of time patient was on ventilator), convert all readings into hours

2) rename 'value' column as 'time_on_vent'

In [None]:
# Columns of the ventilation table carried forward into the feature table.
keep_columns = ['hadm_id', 'endtime', 'value', 'valueuom', 're_intub_class',
       'subject_id', 'admittime', 'deathtime', 'admission_type',
       'admission_location','marital_status', 'ethnicity']

# Explicit copy: svents_keep is modified afterwards, and without .copy() those
# writes target a slice of `svents` (SettingWithCopyWarning, unpredictable results).
svents_keep = svents[keep_columns].copy()

In [None]:
# Convert every ventilation duration to hours, then rename the column.
# Work on an independent copy so the assignments never write into a slice of `svents`.
# Re-run the cell that defines svents_keep before re-running this one:
# 'valueuom' is dropped here, so a second run would not find it.
vent_hours = svents_keep.copy()
uom = vent_hours['valueuom']

# minutes -> hours
vent_hours['value'] = np.where(uom == 'min', vent_hours['value'] / 60, vent_hours['value'])
# days -> hours
vent_hours['value'] = np.where(uom == 'day', vent_hours['value'] * 24, vent_hours['value'])

# rename() needs columns= (a bare mapping targets index labels) and returns a new
# frame, so the result must be assigned back for the rename to take effect.
svents_keep = (
    vent_hours
    .drop('valueuom', axis=1)
    .rename(columns={"value": "time_on_vent"})
)
svents_keep.head()

#### add admission table data to feature table and save csv

new_df =pd.merge(left = new_df, right = svents_keep, how = 'left', left_on='hadm_id', right_on='hadm_id')

new_df.to_csv('all_feature_table.csv',index= False)