# bios8366_fp_pw_120518.ipynb

# 12.05.18

In [2]:
%matplotlib inline
import numpy as np
import pymc3 as pm
import pylab as plt
import pandas as pd
import seaborn as sns
# Apply seaborn's "whitegrid" style to plots created after this point
sns.set(style="whitegrid")

# Set seed
# (seeds NumPy's global random number generator with a fixed value)
np.random.seed(10011)

## Helper functions

In [4]:
# create function looking at top X number of labs
def top_labs(labs_df,num_labs=10):
    """Find the most frequently recorded labs and the rows that belong to them.

        :param labs_df = lab dataframe with columns = {'RUID','Lab_name','Lab_date','Lab_value'}
        :param num_labs = how many of the most frequent labs to keep, default = top 10

        :return =
            top_num_labs: pandas dataframe with columns {'Lab_name','counts'}, one row per kept lab, ordered by descending count
            top_pt_labs: the rows of labs_df (all of its columns) whose 'Lab_name' is one of the kept labs
    """
    # count how many rows each lab name has
    lab_counts = labs_df[['Lab_name']].groupby(['Lab_name']).size().reset_index(name='counts')

    # most frequent labs first, then keep only the leading num_labs rows
    ranked_counts = lab_counts.sort_values(by='counts', ascending=False)
    top_num_labs = ranked_counts.head(int(num_labs))

    # keep only the lab rows whose name is among the kept labs
    keep_mask = labs_df['Lab_name'].isin(top_num_labs['Lab_name'])
    top_pt_labs = labs_df[keep_mask]

    return top_num_labs, top_pt_labs

In [10]:
# write function to clean labs, to remove non-numeric values, etc.

def cleanLabs(labsdf):
    """Return a new labs dataframe that only contains rows with a numeric 'lab_value'.

        :param labsdf = lab dataframe that has a 'lab_value' column

        :return = dataframe sorted by the original 'lab_value' (descending), with 'lab_value'
                  converted to numbers and every row whose value could not be converted
                  (or was already missing) removed; labsdf itself is not modified
    """
    # order by the raw values; sort_values returns a new frame, so the input stays untouched
    ordered = labsdf.sort_values(by='lab_value', ascending=False)

    # anything that cannot be parsed as a number becomes NaN
    numeric = ordered.assign(lab_value=pd.to_numeric(ordered['lab_value'], errors='coerce'))

    # keep only the rows that still carry a value
    return numeric[numeric['lab_value'].notna()]

In [20]:
# write function to clean labs, to remove non-numeric values, etc.
# note that this differs from the cleanLabs() function, as rows with NaN are NOT removed

def cleanLabs1(labsdf):
    """Return a new labs dataframe with 'lab_value' converted to numbers, keeping NaN rows.

        :param labsdf = lab dataframe that has a 'lab_value' column

        :return = dataframe sorted by the original 'lab_value' (descending) in which values that
                  cannot be parsed as numbers are NaN; no rows are removed and labsdf itself
                  is not modified
    """
    # order by the raw values; sort_values returns a new frame, so the input stays untouched
    ordered = labsdf.sort_values(by='lab_value', ascending=False)

    # anything that cannot be parsed as a number becomes NaN (rows are kept)
    return ordered.assign(lab_value=pd.to_numeric(ordered['lab_value'], errors='coerce'))

In [45]:
# write function that returns merged table for labs with base_table
def stat_values_labs(labs_df, base_table):
    """Left-join lab rows onto the base table by patient id and day.

        :param labs_df = lab dataframe with (at least) the columns 'ruid' and 'lab_date'
        :param base_table = dataframe with (at least) the columns 'ruid' and 'hospital_day'

        :return = merged dataframe; every base_table row is kept, and rows without a lab on
                  the same 'ruid' and day get NaN in the lab columns
    """
    # a lab belongs to a base-table row when the patient matches and lab_date equals hospital_day
    base_keys = ['ruid','hospital_day']
    lab_keys = ['ruid','lab_date']

    return pd.merge(left=base_table, right=labs_df, how='left', left_on=base_keys, right_on=lab_keys)

## Aim #1: process labs to merge with base_table that has rows organized by pid-visit_id combination

In [58]:
# import labs
# RUID is read as a string rather than being type-inferred.
# The builtin `str` is used for the dtype: the `np.str` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
# quoting=3 is csv.QUOTE_NONE, i.e. quote characters get no special treatment.
df_LAB = pd.read_csv('./Data/FONNESBECK_LAB_20151202.csv',sep=',',engine='python',quoting=3,dtype={'RUID': str},infer_datetime_format=True, parse_dates=['Lab_date'])
df_LAB.head()

Unnamed: 0,RUID,Lab_name,Lab_date,Lab_value
0,50135262,%SAT,04/13/2008,17
1,50135262,ABO,02/08/2007,A
2,50135262,ABO,02/24/2008,a
3,50135262,AN-GAP,02/08/2007,9
4,50135262,AN-GAP,02/11/2011,8


In [59]:
# top 20 labs
# top_num_labs: per-lab row counts for the 20 most frequent labs
# top_pt_labs: the rows of df_LAB that belong to those labs
top_num_labs, top_pt_labs = top_labs(df_LAB,20)

In [60]:
top_num_labs

Unnamed: 0,Lab_name,counts
1762,GluBed,252113
2218,K,238492
1154,Creat,233493
2609,Na,230471
2776,PCV,229964
592,BUN,226556
1765,Gluc,226554
926,CO2,226256
1070,Cl,226059
2920,Plt-Ct,215131


Based on these results, we will use all labs from `GluBed` through `Ca`, since there is a large drop-off in counts between `Ca` and `RDWSD`.

In [61]:
# get all labs from GluBed to Ca, i.e. filter RDWSD
# (keeping the 19 most frequent labs instead of 20 leaves out the last one of the top-20 list)
top_num_labs1, top_pt_labs1 = top_labs(df_LAB,19)

In [62]:
top_pt_labs1['Lab_name'].value_counts()

GluBed    252113
K         238492
Creat     233493
Na        230471
PCV       229964
BUN       226556
Gluc      226554
CO2       226256
Cl        226059
Plt-Ct    215131
WBC       213850
Hgb       213328
RBC       210475
MCHC      209793
MCH       209463
MCV       207226
RDW       207028
AN-GAP    205919
Ca        198994
Name: Lab_name, dtype: int64

In [63]:
# Rename the lab columns to lower case (the base table uses 'ruid' as its key name).
# An explicit old -> new mapping is used instead of assigning `.columns` positionally, so a
# change in column order cannot silently mislabel the data; rename() returns a new frame
# rather than modifying the frame returned by top_labs() in place.
top_pt_labs1 = top_pt_labs1.rename(columns={'RUID': 'ruid', 'Lab_name': 'lab_name', 'Lab_date': 'lab_date', 'Lab_value': 'lab_value'})
top_pt_labs1.head()

Unnamed: 0,ruid,lab_name,lab_date,lab_value
3,50135262,AN-GAP,02/08/2007,9
4,50135262,AN-GAP,02/11/2011,8
5,50135262,AN-GAP,02/12/2011,6
6,50135262,AN-GAP,02/13/2007,9
7,50135262,AN-GAP,02/13/2011,8


In [64]:
# clean labs, i.e. lab_value  to NaN if not numeric
# (cleanLabs1 is used rather than cleanLabs, so rows with a NaN lab_value are kept)
top_pt_labs2 = cleanLabs1(top_pt_labs1)
top_pt_labs2.head()

Unnamed: 0,ruid,lab_name,lab_date,lab_value
5503436,53732927,Plt-Ct,07/25/2013,
6301228,53733765,Na,10/23/1999,
2683342,53730260,CO2,02/10/2000,
2684017,53730260,K,02/10/2000,
2239788,53729811,Creat,05/26/2000,


In [65]:
top_pt_labs2.isna().sum()

ruid             0
lab_name         0
lab_date         0
lab_value    18659
dtype: int64

In [66]:
top_pt_labs2.shape

(4181165, 4)

In [67]:
# read in base table that I will use for merging
# ruid is read as a string rather than being type-inferred.
# The builtin `str` is used for the dtype: the `np.str` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
base_table = pd.read_csv('./Data/base_table_112018.csv',sep=',',dtype={'ruid': str},infer_datetime_format=True, parse_dates=['admit_date','discharge_date','hospital_day'])

In [68]:
base_table.head()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,hospital_day,stay_length,n_transfers,readmit_time,readmit_30d
0,50135262,0,2007-02-08,2007-02-12,2007-02-08,4 days 00:00:00.000000000,2,,0
1,50135262,0,2007-02-08,2007-02-12,2007-02-09,4 days 00:00:00.000000000,2,,0
2,50135262,0,2007-02-08,2007-02-12,2007-02-10,4 days 00:00:00.000000000,2,,0
3,50135262,0,2007-02-08,2007-02-12,2007-02-11,4 days 00:00:00.000000000,2,,0
4,50135262,0,2007-02-08,2007-02-12,2007-02-12,4 days 00:00:00.000000000,2,,0


In [69]:
# keep only the merge keys (ruid, hospital_day) plus visit_id
# NOTE: this rebinds base_table, so the other columns are only recoverable by re-reading the csv
base_table = base_table[['ruid','visit_id','hospital_day']]
base_table.head()

Unnamed: 0,ruid,visit_id,hospital_day
0,50135262,0,2007-02-08
1,50135262,0,2007-02-09
2,50135262,0,2007-02-10
3,50135262,0,2007-02-11
4,50135262,0,2007-02-12


In [70]:
top_pt_labs2.head()

Unnamed: 0,ruid,lab_name,lab_date,lab_value
5503436,53732927,Plt-Ct,07/25/2013,
6301228,53733765,Na,10/23/1999,
2683342,53730260,CO2,02/10/2000,
2684017,53730260,K,02/10/2000,
2239788,53729811,Creat,05/26/2000,


In [75]:
base_table.dtypes

ruid                    object
visit_id                 int64
hospital_day    datetime64[ns]
dtype: object

In [76]:
# build a separate frame (top_pt_labs2 is left unchanged) whose lab_date column is converted
# with pd.to_datetime, so it can be matched against base_table's datetime64 hospital_day
top_pt_labs3 = top_pt_labs2.assign(
    lab_date=pd.to_datetime(top_pt_labs2['lab_date'], infer_datetime_format=True)
)

In [79]:
top_pt_labs3.dtypes

ruid                 object
lab_name             object
lab_date     datetime64[ns]
lab_value           float64
dtype: object

In [80]:
base_table.dtypes

ruid                    object
visit_id                 int64
hospital_day    datetime64[ns]
dtype: object

In [81]:
# left-join the cleaned labs onto the base table on (ruid, hospital_day == lab_date);
# base-table rows without a matching lab are kept with NaN in the lab columns
labs_merged = stat_values_labs(top_pt_labs3, base_table)

In [82]:
labs_merged.head()

Unnamed: 0,ruid,visit_id,hospital_day,lab_name,lab_date,lab_value
0,50135262,0,2007-02-08,MCV,2007-02-08,91.0
1,50135262,0,2007-02-08,AN-GAP,2007-02-08,9.0
2,50135262,0,2007-02-08,Ca,2007-02-08,8.9
3,50135262,0,2007-02-08,PCV,2007-02-08,40.0
4,50135262,0,2007-02-08,RBC,2007-02-08,4.41


In [83]:
labs_merged.isna().sum()

ruid                0
visit_id            0
hospital_day        0
lab_name        26871
lab_date        26871
lab_value       33654
dtype: int64

In [85]:
labs_merged.shape

(1924507, 6)

In [84]:
# export labs_merged to csv to upload to drive
# index=False: the dataframe's row index is not written to the file
labs_merged.to_csv('labs_merged_120518.csv',index=False)