## Loading data

In [1]:
%ls ../data

FONNESBECK_ADT_20151202.csv        [1m[31mFONNESBECK_ICD9_20151202.csv[m[m*
[1m[31mFONNESBECK_BMI_20151202.csv[m[m*       [1m[31mFONNESBECK_LAB_20151202.csv[m[m*
[1m[31mFONNESBECK_BP_20151202.csv[m[m*        [1m[31mFONNESBECK_MED_20151202.csv[m[m*
[1m[31mFONNESBECK_CPT_20151202.csv[m[m*       [1m[31mFONNESBECK_phenotype_20151202.csv[m[m*
[1m[31mFONNESBECK_EGFR_20151202.csv[m[m*      Fonnesbeck_DD_2014102014.xlsx


In [27]:
import pandas as pd
import datetime
import numpy as np

In [2]:
adt = pd.read_table('../data/FONNESBECK_ADT_20151202.csv', encoding='latin1', sep = ',', infer_datetime_format=True, parse_dates=['Admission_date','Event_Date','DISCHARGE_DATE'])
pheno = pd.read_table('../data/FONNESBECK_phenotype_20151202.csv', encoding='latin1', sep = ',', infer_datetime_format=True, parse_dates=['DOB','DOD'])
cpt = pd.read_table('../data/FONNESBECK_CPT_20151202.csv', encoding='latin1', sep = ',', infer_datetime_format=True, parse_dates=['Event_date'])

In [3]:
svc = pd.read_excel('../data/FONNESBECK_DD_2014102014.xlsx',sheet_name='Service code', sep = ',')
svc.rename(columns = {"Service Code":"SVC", "Service Code Desc":"Desc"}, inplace = True)

In [4]:
adt.Event = pd.Categorical(adt.Event,categories = ['Admit','Transfer','Discharge'])
adt = adt.sort_values(by = ['RUID','Admission_date','Event','Event_Date']).reset_index(drop = True)
adt.head()

Unnamed: 0,RUID,Event,Admission_date,Event_Date,SRV_CODE,CHIEF_COMPLAINT,DISCHARGE_DATE
0,50135262,Admit,2007-02-08,2007-02-08,ORT,R FEMUR FX,2007-02-12
1,50135262,Transfer,2007-02-08,2007-02-08,ORT,R FEMUR FX,2007-02-12
2,50135262,Transfer,2007-02-08,2007-02-09,ORT,R FEMUR FX,2007-02-12
3,50135262,Discharge,2007-02-08,2007-02-12,ORT,R FEMUR FX,2007-02-12
4,50135262,Admit,2007-08-03,2007-08-03,CAR,CP,2007-08-06


In [5]:
adt.describe(include='all')

Unnamed: 0,RUID,Event,Admission_date,Event_Date,SRV_CODE,CHIEF_COMPLAINT,DISCHARGE_DATE
count,121530.0,121530,119969,121530,121530,120603.0,119472
unique,,3,4192,4279,73,13118.0,4195
top,,Transfer,2013-03-14 00:00:00,2013-12-28 00:00:00,GMD,296.9,2010-12-23 00:00:00
freq,,61636,111,69,13062,2394.0,111
first,,,2004-01-28 00:00:00,2004-01-28 00:00:00,,,2004-02-11 00:00:00
last,,,2015-11-26 00:00:00,2015-11-26 00:00:00,,,2015-11-23 00:00:00
mean,53668610.0,,,,,,
std,462820.6,,,,,,
min,50135260.0,,,,,,
25%,53729800.0,,,,,,


## Looking at missingness

In [6]:
adt.isnull().sum()/adt.shape[0]

RUID               0.000000
Event              0.000000
Admission_date     0.012845
Event_Date         0.000000
SRV_CODE           0.000000
CHIEF_COMPLAINT    0.007628
DISCHARGE_DATE     0.016934
dtype: float64

In [7]:
adt[adt.Admission_date.isnull() & adt.DISCHARGE_DATE.isnull()]

Unnamed: 0,RUID,Event,Admission_date,Event_Date,SRV_CODE,CHIEF_COMPLAINT,DISCHARGE_DATE
93,50135361,Transfer,NaT,2009-05-24,GMD,DEHYDRATION W/WEAKNESS,NaT
94,50135361,Transfer,NaT,2009-07-11,GMD,PNEUMONIA,NaT
95,50135361,Transfer,NaT,2010-04-29,GMD,PYELONEPHRITIS,NaT
96,50135361,Transfer,NaT,2010-10-03,GER,FALL,NaT
97,50135361,Transfer,NaT,2011-10-17,CAR,CHEST PAIN; ELEVATED TROPONIN,NaT
151,50135369,Transfer,NaT,2009-07-07,GMD,CHEST AND ABDOMINAL PAIN,NaT
152,50135369,Transfer,NaT,2010-03-30,PUL,COPD ACUTE HIP PAIN,NaT
153,50135369,Transfer,NaT,2010-12-21,CAR,UNSTABLE ANGINA,NaT
410,50135437,Transfer,NaT,2010-06-11,GMD,WOUND INFECTION,NaT
466,50135624,Transfer,NaT,2008-02-08,CAR,HEART TRANSPLANT,NaT


In [9]:
adt[adt.Admission_date.isnull()].Event.value_counts()

Transfer     1559
Discharge       2
Admit           0
Name: Event, dtype: int64

In [10]:
adt[adt.DISCHARGE_DATE.isnull()].Event.value_counts()

Transfer     1860
Admit         198
Discharge       0
Name: Event, dtype: int64

In [11]:
adt[(adt.Admission_date.isnull()) & (adt.Event == 'Discharge')]

Unnamed: 0,RUID,Event,Admission_date,Event_Date,SRV_CODE,CHIEF_COMPLAINT,DISCHARGE_DATE
76409,53733158,Discharge,NaT,2007-08-06,TRA,STAT,2007-08-06
76578,53733172,Discharge,NaT,2013-02-17,PED,SEPSIS,2013-02-17


In [12]:
adt[76400:76420]

Unnamed: 0,RUID,Event,Admission_date,Event_Date,SRV_CODE,CHIEF_COMPLAINT,DISCHARGE_DATE
76400,53733157,Admit,2012-02-14,2012-02-14,PUL,HEMOPTYSIS,2012-02-15
76401,53733157,Transfer,2012-02-14,2012-02-14,PUL,HEMOPTYSIS,2012-02-15
76402,53733157,Discharge,2012-02-14,2012-02-15,PUL,HEMOPTYSIS,2012-02-15
76403,53733157,Admit,2012-04-14,2012-04-14,ONC,FAILURE TO THRIVE; DEHYDRATION; KIDNEY CA,2012-04-15
76404,53733157,Transfer,2012-04-14,2012-04-14,ONC,FAILURE TO THRIVE; DEHYDRATION; KIDNEY CA,2012-04-15
76405,53733157,Discharge,2012-04-14,2012-04-15,ONC,FAILURE TO THRIVE; DEHYDRATION; KIDNEY CA,2012-04-15
76406,53733157,Transfer,NaT,2011-11-21,ONC,HEMOPTYSIS,NaT
76407,53733157,Transfer,NaT,2011-11-21,HEM,HEMOPTYSIS,NaT
76408,53733158,Transfer,NaT,2007-08-05,TRA,STAT,2007-08-06
76409,53733158,Discharge,NaT,2007-08-06,TRA,STAT,2007-08-06


## Adding age data & removing pediatric patients

In [14]:
adt_age = pd.merge(adt,pheno)
# admits = adt_age.Admission_date.dt
events = adt_age.Event_Date.dt
birthdays = adt_age.DOB.dt

adt_age['age'] = events.year - birthdays.year + ((events.month < birthdays.month) & (events.day < birthdays.day))
# above from https://stackoverflow.com/questions/2217488/age-from-birthdate-in-python/9754466#9754466

In [15]:
# getting rid of peds & psychiatric patients...
# we're removing these because they aren't part of the CMS criteria so 30-day readmits for them don't lose the hospital money
ped_svc = '|'.join(svc.SVC[svc.Desc.str.contains("CHILD|PED")])
psych_svc = '|'.join(svc.SVC[svc.Desc.str.contains("PSYCH")])

# ped_filter = ((adt_age.age < 18) | (adt_age.SRV_CODE.str.contains(ped_svc)) & ~((adt_age.age > 35) & (adt_age.SRV_CODE.str.contains(ped_svc))))
# the ~ condition here contains a handful of rows that I think are coding errors -- very old patients admitted to pediatric services
# the cutoff is 35 because some pediatric cancer/cardiac/etc patients will continue with pediatric services for their original condition into adulthood

ped_filter = (adt_age.age < 18)
psych_filter = (adt_age.SRV_CODE.str.contains(psych_svc))

In [16]:
adt_cms = adt[~(ped_filter | psych_filter)]

## Filtering to admits & eliminating missing discharges

In [186]:
adt_cms_admits = adt_cms[(adt_cms.Event == 'Admit') & ~(adt_cms.DISCHARGE_DATE.isnull())].copy() # removing missing discharge dates because I can't fix them right now
# adt_cms[(adt_cms.Event == 'Admit') & (adt_cms.DISCHARGE_DATE.isnull())]
adt_cms_admits = adt_cms_admits[adt_cms_admits.Admission_date == adt_cms_admits.Event_Date].reset_index(drop = True) # this removes admits that aren't the same day as the admit date
# i'm not sure what these actually are--they might be miscoded transfers or admissions to another department

## Constructing variables

In [187]:
adt_cms_admits['Stay_length'] = adt_cms_admits.DISCHARGE_DATE - adt_cms_admits.Admission_date
adt_cms_admits['Readmit_time'] = adt_cms_admits.Admission_date - adt_cms_admits.DISCHARGE_DATE.shift()

didx = ~(adt_cms_admits.RUID.shift() == adt_cms_admits.RUID)

adt_cms_admits['Readmit_time'] = adt_cms_admits['Readmit_time'].mask(didx)

adt_cms_admits['30d_readmit'] = np.where(adt_cms_admits.Readmit_time <= datetime.timedelta(days=30),1,0)


In [184]:
adt_cms_admits[adt_cms_admits.Readmit_time < datetime.timedelta(days=0)] # this appears to happen when we've got two admit events for the same date with the same discharge
# in some cases they have different event dates, in other cases they're identical but have different srv_codes
# the first type might be miscoded transfers; the second type probably needs to be squished together

Unnamed: 0,RUID,Event,Admission_date,Event_Date,SRV_CODE,CHIEF_COMPLAINT,DISCHARGE_DATE,Stay_length,Readmit_time,30_readmit
762,53727911,Admit,2014-05-21,2014-05-21,CAR,ACS,2014-05-29,8 days,-8 days,1
2014,53728400,Admit,2010-01-13,2010-01-13,GMD,GJ TUBE DISLODGEMENT,2010-01-19,6 days,-6 days,1
12240,53732639,Admit,2014-05-21,2014-05-21,EMR,PARALYSIS,2014-05-29,8 days,-8 days,1
12790,53732854,Admit,2009-04-03,2009-04-03,NEP,585.5,2009-04-03,0 days,-1 days,1
14286,53733529,Admit,2015-11-08,2015-11-08,ONC,FLU LIKE SYMPTOMS,2015-11-10,2 days,-2 days,1
15712,53734226,Admit,2014-03-13,2014-03-13,EMR,39 WKS,2014-03-13,0 days,-3 days,1
16075,53734366,Admit,2014-05-16,2014-05-16,CAR,AMS; HYPOGLYCEMIA,2014-05-17,1 days,-1 days,1
18952,53735533,Admit,2014-07-10,2014-07-10,EMR,DESMOPLASTIC SMALL ROUND CELL TUMOR; INTRACTAB...,2014-07-13,3 days,-3 days,1
19692,53735914,Admit,2015-06-14,2015-06-14,GMD,ASTHMA,2015-06-16,2 days,-2 days,1
20488,53736197,Admit,2014-11-26,2014-11-26,CAR,FEVER,2014-12-01,5 days,-5 days,1


In [191]:
adt_cms.groupby(by=['RUID','Admission_date']).Event.value_counts() # now pull the number of transfers and we're good

RUID      Admission_date  Event    
50135262  2007-02-08      Transfer     2
                          Admit        1
                          Discharge    1
          2007-08-03      Transfer     3
                          Admit        1
                          Discharge    1
          2007-08-28      Admit        1
                          Discharge    1
                          Transfer     1
          2008-02-24      Transfer     2
                          Admit        1
                          Discharge    1
          2008-04-12      Admit        1
                          Discharge    1
                          Transfer     1
          2010-10-28      Admit        1
                          Discharge    1
                          Transfer     1
          2011-02-11      Transfer     4
                          Admit        1
                          Discharge    1
          2012-05-23      Admit        1
                          Discharge    1
                     

In [149]:
adt_cms_admits.Admission_date[1] - adt_cms_admits.DISCHARGE_DATE[0] 

Timedelta('172 days 00:00:00')

In [151]:
adt_cms_admits.Admission_date.head() - adt_cms_admits.DISCHARGE_DATE.shift().head()

0        NaT
1   172 days
2    22 days
3   179 days
4    44 days
dtype: timedelta64[ns]

In [140]:
adt_cms_admits.DISCHARGE_DATE.head()

0   2007-02-12
1   2007-08-06
2   2007-08-29
3   2008-02-28
4   2008-04-13
Name: DISCHARGE_DATE, dtype: datetime64[ns]

## (Attempting to) Impute missing discharge dates from CPT hospitalization & discharge codes

In [19]:
hosp_ed_cpts = ["99217", "99218", "99219", "99220", "99221", "99222", "99223", "99224", "99225", "99226", "99231", "99232", "99233", "99234", "99235", "99236", "99238", "99239", "99251", "99252", "99253", "99254", "99255", "99289","99290", "99291", "99292", "99293", "99294", "99295","99296", "99297", "99356", "99357", "99358", "99359", "99433", "99435", "99460", "99461", "99462", "99463", "99466", "99467", "99468", "99469","99471", "99472", "99475", "99476", "99477", "99478", "99479", "99480", "99485", "99486", "99281", "99282", "99283", "99284", "99285"]
cpt_pat = "|".join(hosp_ed_cpts)
disch_pat = "|".join(["99217", "99238", "99239"])

In [20]:
cpt_hosp = cpt[cpt.CPT_Code.str.match(cpt_pat)].sort_values(by=['RUID','Event_date','CPT_Code'])

In [58]:
missing_discharge = adt_cms[(adt_cms.Event == "Admit") & (adt_cms.DISCHARGE_DATE.isnull())].copy().reset_index()
missing_discharge.head()

Unnamed: 0,index,RUID,Event,Admission_date,Event_Date,SRV_CODE,CHIEF_COMPLAINT,DISCHARGE_DATE
0,1757,50141473,Admit,2015-06-26,2015-06-26,GMD,DETOX,NaT
1,2000,51319626,Admit,2013-11-19,2013-11-19,ORT,716.17/996.78,NaT
2,2257,52765702,Admit,2014-12-05,2014-12-05,GMD,BACK PAIN,NaT
3,2507,53719335,Admit,2014-09-15,2014-09-15,EMR,INTRACTABLE VOMITING AND DEHYDRATION,NaT
4,2841,53725969,Admit,2011-09-27,2011-09-27,EMR,SOB,NaT


In [22]:
for idx, row in missing_discharge.iterrows():
    cpt_sub = cpt_hosp[(cpt_hosp.RUID == row.RUID) & (cpt_hosp.Event_date > row.Admission_date)]
    cpt_disch = cpt_sub[cpt_sub.CPT_Code.str.match(disch_pat)]
    orig_idx = row.index
    
    if cpt_disch.shape[0]:
        missing_discharge.DISCHARGE_DATE[idx] = cpt_disch.iloc[0,2]
        # this will modify the original df
        # but we should probable be careful about that
        # so i'm commenting it out
        # adt_cms.DISCHARGE_DATE[orig_idx] = cpt_disch.iloc[0,2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [23]:
missing_discharge
# this isn't a reliable way of doing this -- sometimes there aren't any discharge codes for a given admit so it picks one several months later

Unnamed: 0,index,RUID,Event,Admission_date,Event_Date,SRV_CODE,CHIEF_COMPLAINT,DISCHARGE_DATE
0,1757,50141473,Admit,2015-06-26,2015-06-26,GMD,DETOX,2015-07-18
1,2000,51319626,Admit,2013-11-19,2013-11-19,ORT,716.17/996.78,NaT
2,2257,52765702,Admit,2014-12-05,2014-12-05,GMD,BACK PAIN,2014-12-08
3,2507,53719335,Admit,2014-09-15,2014-09-15,EMR,INTRACTABLE VOMITING AND DEHYDRATION,2015-01-10
4,2841,53725969,Admit,2011-09-27,2011-09-27,EMR,SOB,NaT
5,2960,53727824,Admit,2013-11-13,2013-11-13,VAS,453.40 ACUTE VENOUS EMBOLISM AND THROMBOSIS,NaT
6,4022,53727917,Admit,2012-06-17,2012-06-17,ORT,813.23 RADIUS WITH ULNA; FRACTURE OF AND ULNA ...,NaT
7,5575,53728030,Admit,2015-08-12,2015-08-12,EMR,AIDS ENTEROPATHY; FAILURE TO THRIVE; DEHYDRATION,NaT
8,6714,53728118,Admit,2015-02-03,2015-02-03,CAR,HYPERTENSIVE EMERGENCY; CHEST PAIN,NaT
9,7533,53728161,Admit,2014-10-29,2014-10-29,CAR,VOLUME OVERLOAD HEART FAILURE,2015-04-02
