In [97]:
# Import necessary
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mt
import seaborn as sns
import numpy as np
import datetime as dt
import math

In [98]:
# Shows what each ICD9 code stands for 
D_ICD_Diagnoses = pd.read_csv('D_ICD_DIAGNOSES.csv')

# Shows all ICD9 codes for each patient marked by subject ID 
Diagnoses_ICD = pd.read_csv('DIAGNOSES_ICD.csv')

# List of all patients and their information
Patients = pd.read_csv('PATIENTS.csv')

# Additional information for each patient 
Admissions = pd.read_csv('ADMISSIONS.csv')

In [99]:
# Calculating parameters for each patient

# Add Patient DOB in usable format and Merge with patient information
Patients_DOB = Patients['DOB']
Patients_DOB2 = []
for n in range(len(Patients_DOB)):
    prelim = Patients_DOB[n]
    Patients_DOB2.append(dt.datetime.strptime(prelim[0:10],'%Y-%m-%d').date())
Birth_Date_Series = pd.Series(Patients_DOB2)
Patients['DOB-2'] = Birth_Date_Series

# Add Data for Patient's Date of Death
Patients_DODeath = Patients['DOD']
Patients_DOD = []
for n in range(len(Patients_DODeath)):
    if pd.isnull(Patients_DODeath[n]) == True:  
        Patients_DOD.append(np.nan)
    else:
        prelim = Patients_DODeath[n]
        Patients_DOD.append(dt.datetime.strptime(prelim[0:10],'%Y-%m-%d').date())

# Add Data for Patient's Date of Admission
Patients_Admit = Admissions['ADMITTIME']
Patients_Admissions = []
for n in range(len(Patients_Admit)):
    if pd.isnull(Patients_Admit[n]) == True:  
        Patients_Admissions.append(np.nan)
    else:
        prelim = Patients_Admit[n]
        Patients_Admissions.append(dt.datetime.strptime(prelim[0:10],'%Y-%m-%d').date())

# Add Data for Patient's Date of Release
Patients_leave = Admissions['DISCHTIME']
Patients_Release = []
for n in range(len(Patients_leave)):
    if pd.isnull(Patients_leave[n]) == True:  
        Patients_Release.append(np.nan)
    else:
        prelim = Patients_leave[n]
        Patients_Release.append(dt.datetime.strptime(prelim[0:10],'%Y-%m-%d').date())
        
# Add Data to Series       
Birth_Date_Series = pd.Series(Patients_DOB2)
Death_Series = pd.Series(Patients_DOD)

Admit_Series = pd.Series(Patients_Admissions)
Release_Series = pd.Series(Patients_Release)

# Find Age at Death
Age_Death = (Death_Series - Birth_Date_Series)
death_age = []

for entry in range(len(Age_Death)):
    if type(Age_Death[entry]) == float:
        death_age.append(float('NAN'))
    else:
        death_age.append(Age_Death[entry].days)

Age_Death = pd.Series(death_age)

# Find Total Admission Times
Admissions_Time = (Release_Series - Admit_Series)

Admit_Time = []

for entry in range(len(Admissions_Time)):
    if type(Admissions_Time[entry]) == float:
        Admit_Time.append(float('NAN'))
    else:
        Admit_Time.append(Admissions_Time[entry].days)

Admissions_Time = pd.Series(Admit_Time)
        
# Add to Pandas Dataframe
Patients['DOB-2'] = Birth_Date_Series
Patients['DOD-2'] = Death_Series
Patients['AOD'] = Age_Death

Admissions['Admit'] = Admit_Series
Admissions['Release'] = Release_Series
Admissions['Total Admission Time'] = Admissions_Time 

Admissions_culled = Admissions.drop_duplicates('SUBJECT_ID', keep = 'last')
Admissions_culled_Admits = Admissions_culled['ADMITTIME']

Admit_Ages = (Admit_Series - Birth_Date_Series)

# Combine admissions information and patient's information
Admissions_culled = Admissions.copy(deep = False)
Admissions_culled = Admissions_culled.drop_duplicates('SUBJECT_ID', keep = 'last')

Admissions_long = Admissions_culled.merge(Patients, on='SUBJECT_ID')
Admissions_long.head()
Admit_Series = pd.Series(Patients_DOD)
Admissions_culled_Admits = Admissions_culled['ADMITTIME']

Admit_times2 = pd.Series(Admissions_long['ADMITTIME'])
Atimes = []
for n in range(len(Admit_times2)):
    if pd.isnull(Admit_times2[n]) == True:  
        Atimes.append(np.nan)
    else:
        prelim = Admit_times2[n]
        Atimes.append(dt.datetime.strptime(prelim[0:10],'%Y-%m-%d').date())

DOB_Admit_times = pd.Series(Admissions_long['DOB'])
DOBAtimes = []
for n in range(len(DOB_Admit_times)):
    if pd.isnull(DOB_Admit_times[n]) == True:  
        DOBAtimes.append(np.nan)
    else:
        prelim = DOB_Admit_times[n]
        DOBAtimes.append(dt.datetime.strptime(prelim[0:10],'%Y-%m-%d').date())

Atimes = pd.Series(Atimes)
DOBAtimes = pd.Series(DOBAtimes)
        
admit_births = (Atimes - DOBAtimes)

Admit_Timet = []
for entry in range(len(admit_births)):
    if type(admit_births[entry]) == float:
        Admit_Timet.append(float('NAN'))
    else:
        Admit_Timet.append(admit_births[entry].days)

Admit_Timet = pd.Series(Admit_Timet)

# Find total number of visits for each subject ID
visit_count = pd.DataFrame(Admissions.SUBJECT_ID.value_counts())
visit_count = pd.DataFrame(Admissions.SUBJECT_ID.value_counts())
visit_count.reset_index(level=0, inplace=True)
visit_count.columns = ['SUBJECT_ID', 'ADMISSIONS']

Admissions_long = Admissions_long.merge(visit_count, on='SUBJECT_ID')

In [100]:
# Find causes of death for patients based on diagnosis
cdeath = []
subjectid = []
ldeath = []
ideath = []
edeath = []
tdeath = []
hamdiddeath = []
admitt = []

for entry in range(len(Admissions)):
    if Admissions['HOSPITAL_EXPIRE_FLAG'][entry] == 0:
        continue
    elif Admissions['HOSPITAL_EXPIRE_FLAG'][entry] == 1:
        subjectid.append(Admissions['SUBJECT_ID'][entry])
        cdeath.append(Admissions['DIAGNOSIS'][entry])
        ldeath.append(Admissions['ADMISSION_LOCATION'][entry])
        tdeath.append(Admissions['ADMISSION_TYPE'][entry])
        ideath.append(Admissions['INSURANCE'][entry])
        edeath.append(Admissions['ETHNICITY'][entry])
        hamdiddeath.append(Admissions['HADM_ID'][entry])
        admitt.append(Admissions['Total Admission Time'][entry])
        
cdeath = pd.Series(cdeath)
subjectid = pd.Series(subjectid)
ldeath = pd.Series(ldeath)
ideath = pd.Series(ideath)
edeath = pd.Series(edeath)
tdeath = pd.Series(tdeath)
hamdiddeath = pd.Series(hamdiddeath)
admitt = pd.Series(admitt)

Death = pd.DataFrame()
Death['SUBJECT_ID'] = subjectid

Death['CAUSE'] = cdeath

Death.head()

Unnamed: 0,SUBJECT_ID,CAUSE
0,31,STATUS EPILEPTICUS
1,56,HEAD BLEED
2,61,NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA
3,67,SUBARACHNOID HEMORRHAGE
4,84,"GLIOBLASTOMA,NAUSEA"


In [101]:
kidney_failure_desc = Death[Death['CAUSE'].str.contains("SEPSIS|MYOCARDIAL|CARDIAC|HEART", na=False)==True]
sepsis = Death[Death['CAUSE'].str.contains("SEPSIS", na=False)==True]
del sepsis['CAUSE']
sepsis['SEPSIS_FLAG'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [103]:
# Merge admissions information and patients information for a more comprehensive demographics
demographics = pd.DataFrame()
demographics['SUBJECT_ID'] = Admissions_long['SUBJECT_ID']
demographics['GENDER'] = Admissions_long['GENDER']
demographics['DOB'] = Admissions_long['DOB-2']
demographics['DOD'] = Admissions_long['DOD-2']
demographics['DOA'] = Admissions_long['ADMITTIME']
demographics['ADMIT_AGE'] = Admit_Timet
demographics['ETHNICITY'] = Admissions_long['ETHNICITY']
demographics['MARITAL_STATUS'] = Admissions_long['MARITAL_STATUS']
demographics['LANGUAGE'] = Admissions_long['LANGUAGE']
demographics['RELIGION'] = Admissions_long['RELIGION']
demographics['INSURANCE'] = Admissions_long['INSURANCE']
demographics['ADMISSION_LOCATION'] = Admissions_long['ADMISSION_LOCATION']
demographics['#ADMISSIONS'] = Admissions_long['ADMISSIONS']
demographics.head()

Unnamed: 0,SUBJECT_ID,GENDER,DOB,DOD,DOA,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS
0,22,F,2131-05-07,,2196-04-09 12:26:00,23714,WHITE,MARRIED,,UNOBTAINABLE,Private,EMERGENCY ROOM ADMIT,1
1,23,M,2082-07-17,,2157-10-18 19:34:00,27486,WHITE,MARRIED,ENGL,CATHOLIC,Medicare,TRANSFER FROM HOSP/EXTRAM,2
2,24,M,2100-05-31,,2139-06-06 16:14:00,14250,WHITE,SINGLE,,PROTESTANT QUAKER,Private,TRANSFER FROM HOSP/EXTRAM,1
3,25,M,2101-11-21,,2160-11-02 02:06:00,21531,WHITE,MARRIED,,UNOBTAINABLE,Private,EMERGENCY ROOM ADMIT,1
4,26,M,2054-05-04,2128-02-25,2126-05-06 15:16:00,26299,UNKNOWN/NOT SPECIFIED,SINGLE,,CATHOLIC,Medicare,TRANSFER FROM HOSP/EXTRAM,1


In [104]:
# Merge Death and demographics on subject ID to add relevant data about patient deaths
demographics = pd.merge(Death, demographics, on= 'SUBJECT_ID', how = 'outer')

In [105]:
demographics['DOA'] = pd.to_datetime(demographics['DOA'])
demographics['DOD'] = pd.to_datetime(demographics['DOD'])

In [106]:
# Calculate time it takes for patients to die since admission
demographics['delta'] = demographics['DOD'] - demographics['DOA']
demographics['delta'] = demographics['delta'] / np.timedelta64(1,'h')
                                                    

In [107]:
demographics.head()

Unnamed: 0,SUBJECT_ID,CAUSE,GENDER,DOB,DOD,DOA,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS,delta
0,31,STATUS EPILEPTICUS,M,2036-05-17,2108-08-30,2108-08-22 23:27:00,26394,WHITE,MARRIED,,CATHOLIC,Medicare,TRANSFER FROM HOSP/EXTRAM,1,168.55
1,56,HEAD BLEED,F,1804-01-02,2104-01-08,2104-01-02 02:01:00,109573,WHITE,,,NOT SPECIFIED,Medicare,EMERGENCY ROOM ADMIT,1,141.983333
2,61,NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA,M,2063-10-21,2119-02-03,2119-01-04 18:12:00,20163,WHITE,MARRIED,,CATHOLIC,Private,CLINIC REFERRAL/PREMATURE,2,701.8
3,67,SUBARACHNOID HEMORRHAGE,M,2084-06-05,2157-12-02,2157-12-02 00:45:00,26842,WHITE,SINGLE,,JEWISH,Medicare,EMERGENCY ROOM ADMIT,2,-0.75
4,84,"GLIOBLASTOMA,NAUSEA",F,2151-10-21,2196-04-17,2196-04-14 04:02:00,16247,WHITE,MARRIED,,OTHER,Private,EMERGENCY ROOM ADMIT,2,67.966667


In [108]:
# Create a flag for patients outside of hospital where we don't have information on
death_cause = []
outside_death_cause = []
for entry in range(len(demographics)):
    if pd.isnull(demographics['DOD'][entry]) == False and pd.isnull(demographics['CAUSE'][entry]) == True:
        death_cause.append('Death Outside of Hospital')
        outside_death_cause.append(1)
    else: 
        death_cause.append(demographics['CAUSE'][entry])
        outside_death_cause.append(0)
        
death_cause = pd.Series(death_cause)
outside_death_flag = pd.Series(outside_death_cause)

demographics['CAUSE'] = death_cause
demographics['OUTSIDE_DEATH_FLAG'] = outside_death_flag
demographics.head()


# Add death flag within 1 year
death_flag = []
for entry in range(len(demographics)):
    if demographics['delta'][entry] <= 8760:
        death_flag.append(1)
    else: 
        death_flag.append(0)


death_flag = pd.Series(death_flag)
demographics['DEATH_FLAG'] = death_flag

demographics['DEATH_FLAG'] = demographics['DEATH_FLAG'].fillna(0)

# Add too old flag
old_flag = []
for entry in range(len(demographics)):
    if demographics['ADMIT_AGE'][entry] > 32850:
        old_flag.append(1)
    else:
        old_flag.append(0)

old_flag = pd.Series(old_flag)
demographics['OLD_FLAG'] = old_flag

In [109]:
# no letter list
list_values = Diagnoses_ICD['ICD9_CODE'].values.tolist()
no_letter_list = []
for i in range(len(list_values)):
    Value = str(list_values[i])
    Letter_stripped_value = Value.lstrip('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    no_letter_list.append(Letter_stripped_value)

# no number list
list_values = Diagnoses_ICD['ICD9_CODE'].values.tolist()
no_number_list= []
for i in range(len(list_values)):
    Value = str(list_values[i])
    number_stripped_value = Value.lstrip('1234567890')
    if number_stripped_value:
        final_number = number_stripped_value[0]
        no_number_list.append(final_number)
    else:
        no_number_list.append('')      
        
# Correct three digit codes for V codes where decimal is only in first two spots
three_number_list = []
for i in range(len(list_values)):
    if no_number_list[i] == 'V':
        Value = no_letter_list[i] 
        three_digits = Value[0:2]
        three_number_list.append(float(three_digits))
    else:
        Value = no_letter_list[i]
        three_digits = Value[0:3]
        three_number_list.append(float(three_digits))

# Add three number list to diagnoses codes
three_number_series = pd.Series(three_number_list)
Diagnoses_ICD['Three Numbers'] = three_number_series # Pull only first three numbers 


In [110]:
# Leave null for patients too old
age = []
for entry in range(len(demographics)):
     if demographics['OLD_FLAG'][entry] == 0:
            age.append(demographics['ADMIT_AGE'][entry])
     else: 
        age.append(np.nan)
age = pd.Series(age)
demographics['ADMIT_AGE'] = age
demographics.head()

Unnamed: 0,SUBJECT_ID,CAUSE,GENDER,DOB,DOD,DOA,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG
0,31,STATUS EPILEPTICUS,M,2036-05-17,2108-08-30,2108-08-22 23:27:00,26394.0,WHITE,MARRIED,,CATHOLIC,Medicare,TRANSFER FROM HOSP/EXTRAM,1,168.55,0,1,0
1,56,HEAD BLEED,F,1804-01-02,2104-01-08,2104-01-02 02:01:00,,WHITE,,,NOT SPECIFIED,Medicare,EMERGENCY ROOM ADMIT,1,141.983333,0,1,1
2,61,NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA,M,2063-10-21,2119-02-03,2119-01-04 18:12:00,20163.0,WHITE,MARRIED,,CATHOLIC,Private,CLINIC REFERRAL/PREMATURE,2,701.8,0,1,0
3,67,SUBARACHNOID HEMORRHAGE,M,2084-06-05,2157-12-02,2157-12-02 00:45:00,26842.0,WHITE,SINGLE,,JEWISH,Medicare,EMERGENCY ROOM ADMIT,2,-0.75,0,1,0
4,84,"GLIOBLASTOMA,NAUSEA",F,2151-10-21,2196-04-17,2196-04-14 04:02:00,16247.0,WHITE,MARRIED,,OTHER,Private,EMERGENCY ROOM ADMIT,2,67.966667,0,1,0


In [111]:
# Test whether patients who died actually lived less than or equal to a year since admission
demographics[demographics['DEATH_FLAG'] == 1].head()

Unnamed: 0,SUBJECT_ID,CAUSE,GENDER,DOB,DOD,DOA,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG
0,31,STATUS EPILEPTICUS,M,2036-05-17,2108-08-30,2108-08-22 23:27:00,26394.0,WHITE,MARRIED,,CATHOLIC,Medicare,TRANSFER FROM HOSP/EXTRAM,1,168.55,0,1,0
1,56,HEAD BLEED,F,1804-01-02,2104-01-08,2104-01-02 02:01:00,,WHITE,,,NOT SPECIFIED,Medicare,EMERGENCY ROOM ADMIT,1,141.983333,0,1,1
2,61,NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA,M,2063-10-21,2119-02-03,2119-01-04 18:12:00,20163.0,WHITE,MARRIED,,CATHOLIC,Private,CLINIC REFERRAL/PREMATURE,2,701.8,0,1,0
3,67,SUBARACHNOID HEMORRHAGE,M,2084-06-05,2157-12-02,2157-12-02 00:45:00,26842.0,WHITE,SINGLE,,JEWISH,Medicare,EMERGENCY ROOM ADMIT,2,-0.75,0,1,0
4,84,"GLIOBLASTOMA,NAUSEA",F,2151-10-21,2196-04-17,2196-04-14 04:02:00,16247.0,WHITE,MARRIED,,OTHER,Private,EMERGENCY ROOM ADMIT,2,67.966667,0,1,0


In [112]:
# Add sepsis flag to data
demographics = pd.merge(demographics, sepsis, on= 'SUBJECT_ID', how = 'outer')
demographics.head()

Unnamed: 0,SUBJECT_ID,CAUSE,GENDER,DOB,DOD,DOA,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG
0,31,STATUS EPILEPTICUS,M,2036-05-17,2108-08-30,2108-08-22 23:27:00,26394.0,WHITE,MARRIED,,CATHOLIC,Medicare,TRANSFER FROM HOSP/EXTRAM,1,168.55,0,1,0,
1,56,HEAD BLEED,F,1804-01-02,2104-01-08,2104-01-02 02:01:00,,WHITE,,,NOT SPECIFIED,Medicare,EMERGENCY ROOM ADMIT,1,141.983333,0,1,1,
2,61,NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA,M,2063-10-21,2119-02-03,2119-01-04 18:12:00,20163.0,WHITE,MARRIED,,CATHOLIC,Private,CLINIC REFERRAL/PREMATURE,2,701.8,0,1,0,
3,67,SUBARACHNOID HEMORRHAGE,M,2084-06-05,2157-12-02,2157-12-02 00:45:00,26842.0,WHITE,SINGLE,,JEWISH,Medicare,EMERGENCY ROOM ADMIT,2,-0.75,0,1,0,
4,84,"GLIOBLASTOMA,NAUSEA",F,2151-10-21,2196-04-17,2196-04-14 04:02:00,16247.0,WHITE,MARRIED,,OTHER,Private,EMERGENCY ROOM ADMIT,2,67.966667,0,1,0,


In [113]:
sepsis_flag = []

for entry in range(len(demographics)):
    if pd.isnull(demographics['SEPSIS_FLAG'][entry]) == True:
        sepsis_flag.append(0)
    else:
        sepsis_flag.append(1)

sepsis_flag = pd.Series(sepsis_flag)
demographics['SEPSIS_FLAG'] = sepsis_flag


In [114]:
# Extracting all AKI patients
codes = ['5845', '5846', '5847', '5848', '5949', '66930', '66932', '66934']

aki_Diagnoses_ICD = Diagnoses_ICD.loc[Diagnoses_ICD['ICD9_CODE'].str.match('|'.join(codes), na=False)] # All AKI patients
aki_Diagnoses = pd.DataFrame(aki_Diagnoses_ICD.SUBJECT_ID)
aki_Diagnoses = aki_Diagnoses.drop_duplicates()
aki_Diagnoses['AKI_DIAGNOSIS_FLAG'] = 1


# Merge with demographics file:
demographics = pd.merge(demographics, aki_Diagnoses, on= 'SUBJECT_ID', how = 'outer')

In [115]:
# Add AKI diagnosis flags to demographic data
aki_flag = []
for entry in range(len(demographics)):
    if demographics['AKI_DIAGNOSIS_FLAG'][entry] == 1:
        aki_flag.append(1)
    else:
        aki_flag.append(0)
aki_flag = pd.Series(aki_flag)
demographics['AKI_DIAGNOSIS_FLAG'] = aki_flag

In [116]:
# Add kidney failure cause of death flag to demographic data
demographics = pd.merge(demographics, kidney_failure_desc, on= 'SUBJECT_ID', how = 'outer')
demographics.head()

Unnamed: 0,SUBJECT_ID,CAUSE_x,GENDER,DOB,DOD,DOA,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,...,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,CAUSE_y
0,31,STATUS EPILEPTICUS,M,2036-05-17,2108-08-30,2108-08-22 23:27:00,26394.0,WHITE,MARRIED,,...,Medicare,TRANSFER FROM HOSP/EXTRAM,1,168.55,0,1,0,0,0,
1,56,HEAD BLEED,F,1804-01-02,2104-01-08,2104-01-02 02:01:00,,WHITE,,,...,Medicare,EMERGENCY ROOM ADMIT,1,141.983333,0,1,1,0,0,
2,61,NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA,M,2063-10-21,2119-02-03,2119-01-04 18:12:00,20163.0,WHITE,MARRIED,,...,Private,CLINIC REFERRAL/PREMATURE,2,701.8,0,1,0,0,0,
3,67,SUBARACHNOID HEMORRHAGE,M,2084-06-05,2157-12-02,2157-12-02 00:45:00,26842.0,WHITE,SINGLE,,...,Medicare,EMERGENCY ROOM ADMIT,2,-0.75,0,1,0,0,0,
4,84,"GLIOBLASTOMA,NAUSEA",F,2151-10-21,2196-04-17,2196-04-14 04:02:00,16247.0,WHITE,MARRIED,,...,Private,EMERGENCY ROOM ADMIT,2,67.966667,0,1,0,0,0,


In [117]:
kidney_failure_flag = []

for entry in range(len(demographics)):
    if pd.isnull(demographics['CAUSE_y'][entry]) == True:
        kidney_failure_flag.append(0)
    else:
        kidney_failure_flag.append(1)

kidney_failure_flag = pd.Series(kidney_failure_flag)
demographics['KIDNEY_FAILURE_FLAG'] = aki_flag

del demographics['CAUSE_y']

demographics['CAUSE'] = demographics['CAUSE_x']
del demographics['CAUSE_x']
demographics['KIDNEY_FAILURE_FLAG'] = kidney_failure_flag

In [118]:
# Add age in years to simplify interpretation
demographics['ADMIT_AGE'] = demographics['ADMIT_AGE']/365

In [119]:
# Test: Find all demographic information on patients who died from kidney failure
demographics[demographics['KIDNEY_FAILURE_FLAG'] == 1].head()


Unnamed: 0,SUBJECT_ID,GENDER,DOB,DOD,DOA,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,...,ADMISSION_LOCATION,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,KIDNEY_FAILURE_FLAG,CAUSE
8,106,M,2164-04-01,2192-08-15,2192-08-09 18:34:00,28.375342,UNKNOWN/NOT SPECIFIED,,,UNOBTAINABLE,...,EMERGENCY ROOM ADMIT,1,125.433333,0,1,0,0,0,1,CARDIAC ARREST
17,21,M,2047-04-04,2135-02-08,2135-01-30 20:50:00,87.882192,WHITE,MARRIED,,JEWISH,...,EMERGENCY ROOM ADMIT,2,195.166667,0,1,0,1,0,1,SEPSIS
19,305,F,2052-10-24,2129-09-07,2129-08-20 18:03:00,76.871233,WHITE,MARRIED,PORT,CATHOLIC,...,TRANSFER FROM HOSP/EXTRAM,4,413.95,0,1,0,0,0,1,HEART FAILURE
26,466,M,1873-01-31,2173-02-09,2173-01-31 08:54:00,,WHITE,MARRIED,,JEWISH,...,EMERGENCY ROOM ADMIT,1,207.1,0,1,1,1,0,1,PNEUMONIA/SEPSIS/RAPID AFIB
31,491,M,2090-04-25,2170-04-05,2170-04-02 21:41:00,79.989041,WHITE,MARRIED,,JEWISH,...,EMERGENCY ROOM ADMIT,1,50.316667,0,1,0,0,1,1,CHEST PAIN;RULE-OUT MYOCARDIAL INFARCTION


In [120]:
# Split List into those whose age we know, those still alive, and those too old to have age listed
# Combine those still alive with those who died at a known age
Patients_old = demographics[demographics['OLD_FLAG'] == 1]
Patients_young = demographics[demographics['OLD_FLAG'] == 0]
Patients_alive = demographics[demographics['DEATH_FLAG'] == 0]
Patients_dead = demographics[demographics['DEATH_FLAG'] == 1]
print('# of total patients is %d' % len(demographics,))
print('# of old patients is %d' % len(Patients_old))
print('# of young patients is %d' % len(Patients_young))
print('# of living patients is %d' % len(Patients_alive))
print('# of dead patients is %d' % len(Patients_dead))

# of total patients is 46561
# of old patients is 1991
# of young patients is 44570
# of living patients is 35300
# of dead patients is 11261


In [121]:
# Fill na values with "uknown" outside of age
demographics = demographics.fillna({'ETHNICITY':'UKNOWN','MARITAL_STATUS':'UKNOWN', 'RELIGION': 'UKNOWN', 'LANGUAGE':'UKNOWN', 'INSURANCE':'UKNOWN', 'ADMISSION_LOCATION':'UKNOWN'})

In [122]:
demographics.groupby('GENDER').mean()

Unnamed: 0_level_0,SUBJECT_ID,ADMIT_AGE,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,KIDNEY_FAILURE_FLAG
GENDER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
F,34395.473517,51.872883,1.276251,9628.151767,0.222696,0.254741,0.063403,0.008722,0.045764,0.020187
M,34423.208359,51.317735,1.262427,10029.531938,0.206562,0.231799,0.026652,0.008374,0.050206,0.01954


In [123]:
demographics.groupby('INSURANCE').mean()

Unnamed: 0_level_0,SUBJECT_ID,ADMIT_AGE,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,KIDNEY_FAILURE_FLAG
INSURANCE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Government,34431.435829,33.601264,1.157086,9051.422027,0.074198,0.097594,0.000668,0.006016,0.032086,0.00869
Medicaid,34405.221409,35.041305,1.297347,8283.222903,0.126944,0.157823,0.003888,0.007319,0.046661,0.013266
Medicare,38173.082174,73.243174,1.383479,10144.273615,0.354606,0.380468,0.091352,0.013863,0.066208,0.033032
Private,30335.024623,35.562743,1.1519,9520.133143,0.095627,0.123061,0.003019,0.003227,0.030765,0.007913
Self Pay,33697.480496,38.750768,1.046099,3629.492754,0.046099,0.180851,0.007092,0.007092,0.033688,0.015957


In [124]:
demographics.groupby('#ADMISSIONS').mean()

Unnamed: 0_level_0,SUBJECT_ID,ADMIT_AGE,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,KIDNEY_FAILURE_FLAG
#ADMISSIONS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,34638.924685,49.433908,10947.551282,0.192443,0.208655,0.040941,0.006233,0.036144,0.016315
2,34187.033282,61.811679,7270.184413,0.30454,0.375914,0.050789,0.015968,0.085995,0.031166
3,34061.49256,65.186714,6090.164025,0.372768,0.46503,0.061756,0.022321,0.136161,0.049107
4,29136.781925,64.307946,4778.079698,0.337917,0.502947,0.053045,0.039293,0.200393,0.053045
5,25733.471545,63.771361,3994.280844,0.382114,0.544715,0.04065,0.03252,0.207317,0.04878
6,25840.283186,63.594674,2535.07338,0.336283,0.59292,0.053097,0.044248,0.168142,0.079646
7,28175.392157,60.421096,2658.317857,0.294118,0.509804,0.019608,0.058824,0.098039,0.078431
8,28306.451613,60.123023,1405.428704,0.387097,0.580645,0.0,0.032258,0.258065,0.064516
9,20859.153846,62.096628,1682.958333,0.307692,0.653846,0.0,0.0,0.346154,0.0
10,41016.428571,54.822831,2826.605,0.357143,0.642857,0.142857,0.0,0.285714,0.071429


In [125]:
demographics[demographics['AKI_DIAGNOSIS_FLAG'] == 1].groupby('#ADMISSIONS').size()

#ADMISSIONS
1     1409
2      447
3      183
4      102
5       51
6       19
7        5
8        8
9        9
10       4
11       3
12       1
13       2
14       3
16       1
dtype: int64

In [126]:
demographics.groupby('AKI_DIAGNOSIS_FLAG').mean()

Unnamed: 0_level_0,SUBJECT_ID,ADMIT_AGE,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,KIDNEY_FAILURE_FLAG
AKI_DIAGNOSIS_FLAG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,34102.691813,50.89291,1.243038,10332.863761,0.211536,0.226723,0.042944,0.007108,0.017105
1,40492.336004,64.577996,1.77036,4828.473386,0.255007,0.540276,0.039163,0.036493,0.073431


In [127]:
demographics.groupby('MARITAL_STATUS').mean()

Unnamed: 0_level_0,SUBJECT_ID,ADMIT_AGE,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,KIDNEY_FAILURE_FLAG
MARITAL_STATUS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DIVORCED,40006.459691,62.276258,1.376072,10766.268988,0.272727,0.278731,0.017153,0.007719,0.070326,0.015866
LIFE PARTNER,63181.076923,55.179136,1.461538,20689.975,0.153846,0.076923,0.0,0.0,0.230769,0.0
MARRIED,37915.76128,64.887383,1.306695,10270.386962,0.242935,0.273356,0.027449,0.009456,0.057168,0.022694
SEPARATED,36035.367188,58.376208,1.513021,7806.029348,0.21875,0.263021,0.010417,0.013021,0.070312,0.023438
SINGLE,39679.142068,49.963485,1.361358,8850.548977,0.193413,0.220722,0.024343,0.00941,0.05697,0.017797
UKNOWN,19791.282675,13.055875,1.036817,8521.711636,0.055737,0.096134,0.013397,0.002864,0.010738,0.008386
UNKNOWN (DEFAULT),32384.394137,65.562132,1.117264,10362.02381,0.335505,0.381107,0.068404,0.006515,0.042345,0.022801
WIDOWED,36820.030752,77.706194,1.332418,10103.76052,0.400879,0.409116,0.192019,0.014095,0.058576,0.035512


In [128]:
demographics.groupby('ETHNICITY').mean()

Unnamed: 0_level_0,SUBJECT_ID,ADMIT_AGE,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,KIDNEY_FAILURE_FLAG
ETHNICITY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AMERICAN INDIAN/ALASKA NATIVE,22393.636364,22.282939,1.136364,13932.022917,0.113636,0.090909,0.0,0.0,0.022727,0.0
AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE,48575.0,52.306849,1.5,4971.6,0.5,0.5,0.0,0.0,0.0,0.0
ASIAN,23682.85505,26.569184,1.158057,6215.750903,0.104086,0.154202,0.020046,0.004626,0.018504,0.009252
ASIAN - ASIAN INDIAN,54308.413793,52.082094,1.568966,1451.42381,0.068966,0.12069,0.034483,0.0,0.086207,0.0
ASIAN - CAMBODIAN,61233.909091,54.512827,1.636364,646.002778,0.181818,0.545455,0.0,0.0,0.272727,0.090909
ASIAN - CHINESE,49858.337662,52.959503,1.233766,4542.416049,0.116883,0.190476,0.030303,0.017316,0.051948,0.021645
ASIAN - FILIPINO,56435.933333,53.490411,1.666667,3413.8375,0.133333,0.266667,0.066667,0.0,0.2,0.066667
ASIAN - JAPANESE,58800.428571,56.310502,1.0,90.408333,0.0,0.285714,0.142857,0.0,0.0,0.0
ASIAN - KOREAN,56058.166667,53.418721,1.166667,408.908333,0.083333,0.166667,0.0,0.0,0.083333,0.0
ASIAN - OTHER,47251.733333,39.567306,1.133333,67.916667,0.0,0.133333,0.0,0.066667,0.066667,0.066667


In [129]:
demographics.groupby('ADMISSION_LOCATION').mean()

Unnamed: 0_level_0,SUBJECT_ID,ADMIT_AGE,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,KIDNEY_FAILURE_FLAG
ADMISSION_LOCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
** INFO NOT AVAILABLE **,21137.763819,0.834994,1.0,5471.673333,0.01005,0.020101,0.005025,0.0,0.0,0.005025
CLINIC REFERRAL/PREMATURE,50512.374061,50.014977,1.385879,4799.879529,0.145018,0.223435,0.049775,0.00661,0.058788,0.016525
EMERGENCY ROOM ADMIT,30047.63271,61.286446,1.329435,9128.245901,0.308199,0.35775,0.070848,0.015942,0.049243,0.029792
HMO REFERRAL/SICK,15715.18,0.611178,1.0,407.745833,0.01,0.04,0.0,0.0,0.0,0.0
PHYS REFERRAL/NORMAL DELI,27891.320531,35.336562,1.152381,16376.730026,0.112724,0.078298,0.006635,0.000234,0.023185,0.002732
TRANSFER FROM HOSP/EXTRAM,34278.477194,64.160154,1.166236,12182.643196,0.275961,0.294464,0.029834,0.00918,0.075875,0.032559
TRANSFER FROM OTHER HEALT,36169.98,64.818712,2.24,4394.273232,0.28,0.6,0.06,0.0,0.22,0.02
TRANSFER FROM SKILLED NUR,28537.748718,74.313027,1.610256,6959.658122,0.548718,0.635897,0.235897,0.025641,0.117949,0.051282
TRSF WITHIN THIS FACILITY,66104.0,49.872603,2.0,,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
lang = demographics.groupby('LANGUAGE').mean()
language = pd.DataFrame()
language['English'] = lang.loc['ENGL']
lang = lang.drop('ENGL')
language['Others'] = lang.mean()
language

Unnamed: 0,English,Others
SUBJECT_ID,53097.312735,44496.716392
ADMIT_AGE,61.641422,68.752789
#ADMISSIONS,1.383941,1.408095
delta,6569.459249,6940.107464
OUTSIDE_DEATH_FLAG,0.192131,0.230451
DEATH_FLAG,0.234919,0.358727
OLD_FLAG,0.049766,0.11561
SEPSIS_FLAG,0.006519,0.028815
AKI_DIAGNOSIS_FLAG,0.059866,0.070909
KIDNEY_FAILURE_FLAG,0.016436,0.06972


In [131]:
# Export data set
demographics.to_csv('Demographics5.csv')