In [239]:
import pandas as pd

In [240]:
demographics = pd.read_csv('Demographics5.csv')

In [241]:
ml = demographics.loc[:,'GENDER': 'KIDNEY_FAILURE_FLAG']
del ml['DOB']
del ml['DOD']
del ml['DOA']
ml.head()

Unnamed: 0,GENDER,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,KIDNEY_FAILURE_FLAG
0,M,72.312329,WHITE,MARRIED,UKNOWN,CATHOLIC,Medicare,TRANSFER FROM HOSP/EXTRAM,1,168.55,0,1,0,0,0,0
1,F,,WHITE,UKNOWN,UKNOWN,NOT SPECIFIED,Medicare,EMERGENCY ROOM ADMIT,1,141.983333,0,1,1,0,0,0
2,M,55.241096,WHITE,MARRIED,UKNOWN,CATHOLIC,Private,CLINIC REFERRAL/PREMATURE,2,701.8,0,1,0,0,0,0
3,M,73.539726,WHITE,SINGLE,UKNOWN,JEWISH,Medicare,EMERGENCY ROOM ADMIT,2,-0.75,0,1,0,0,0,0
4,F,44.512329,WHITE,MARRIED,UKNOWN,OTHER,Private,EMERGENCY ROOM ADMIT,2,67.966667,0,1,0,0,0,0


In [242]:
# Get numerical data
ml_data = pd.get_dummies(ml, columns=['GENDER','ETHNICITY','MARITAL_STATUS', 'LANGUAGE', 'RELIGION', 'INSURANCE', 'ADMISSION_LOCATION'])
ml_data = ml_data[ml_data['OLD_FLAG']==0]
ml_data.head()

Unnamed: 0,ADMIT_AGE,#ADMISSIONS,delta,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,SEPSIS_FLAG,AKI_DIAGNOSIS_FLAG,KIDNEY_FAILURE_FLAG,GENDER_F,...,INSURANCE_Self Pay,ADMISSION_LOCATION_** INFO NOT AVAILABLE **,ADMISSION_LOCATION_CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_HMO REFERRAL/SICK,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,ADMISSION_LOCATION_TRANSFER FROM HOSP/EXTRAM,ADMISSION_LOCATION_TRANSFER FROM OTHER HEALT,ADMISSION_LOCATION_TRANSFER FROM SKILLED NUR,ADMISSION_LOCATION_TRSF WITHIN THIS FACILITY
0,72.312329,1,168.55,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,55.241096,2,701.8,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,73.539726,2,-0.75,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,44.512329,2,67.966667,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
5,81.627397,1,407.866667,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [243]:
# Reduce population to only those with ages
ml_data = ml_data[ml_data['OLD_FLAG']==0]

In [244]:
# Produce output data sets to create models
sepsis = ml_data['SEPSIS_FLAG']
aki_diagnosis = ml_data['AKI_DIAGNOSIS_FLAG']
deaths = ml_data['DEATH_FLAG']
kidney_deaths = ml_data['KIDNEY_FAILURE_FLAG']

In [245]:
# Predict just deaths on non-diagnostic data
del ml_data['SEPSIS_FLAG']
del ml_data['AKI_DIAGNOSIS_FLAG']
del ml_data['DEATH_FLAG']
del ml_data['KIDNEY_FAILURE_FLAG']
del ml_data['OLD_FLAG']
del ml_data['OUTSIDE_DEATH_FLAG']
del ml_data['delta']

ml_data.head()

Unnamed: 0,ADMIT_AGE,#ADMISSIONS,GENDER_F,GENDER_M,ETHNICITY_AMERICAN INDIAN/ALASKA NATIVE,ETHNICITY_AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE,ETHNICITY_ASIAN,ETHNICITY_ASIAN - ASIAN INDIAN,ETHNICITY_ASIAN - CAMBODIAN,ETHNICITY_ASIAN - CHINESE,...,INSURANCE_Self Pay,ADMISSION_LOCATION_** INFO NOT AVAILABLE **,ADMISSION_LOCATION_CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_HMO REFERRAL/SICK,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,ADMISSION_LOCATION_TRANSFER FROM HOSP/EXTRAM,ADMISSION_LOCATION_TRANSFER FROM OTHER HEALT,ADMISSION_LOCATION_TRANSFER FROM SKILLED NUR,ADMISSION_LOCATION_TRSF WITHIN THIS FACILITY
0,72.312329,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,55.241096,2,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,73.539726,2,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,44.512329,2,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,81.627397,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [246]:
# Create randomly undersampled data set
from imblearn.under_sampling import RandomUnderSampler 
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_sample(ml_data, deaths)

In [247]:
# Check sampling numbers
y_resampled = pd.Series(y_resampled)
print(y_resampled.value_counts())
print(deaths.value_counts())

1    10136
0    10136
Name: DEATH_FLAG, dtype: int64
0    34434
1    10136
Name: DEATH_FLAG, dtype: int64


In [248]:
# Test an XGBoost with 10 fold cross validation for predicting just deaths on this model 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

base_model_XG = GradientBoostingClassifier()
scores = cross_val_score(base_model_XG, X_resampled, y_resampled, cv=10)
scores.mean()

0.6982519517091259

In [249]:
# Same process for predicting sepsis
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_sample(ml_data, sepsis)

In [250]:
# Check sampling numbers
y_resampled = pd.Series(y_resampled)
print(y_resampled.value_counts())
print(deaths.value_counts())

1    347
0    347
Name: SEPSIS_FLAG, dtype: int64
0    34434
1    10136
Name: DEATH_FLAG, dtype: int64


In [251]:
# Not large enough samples for sepsis alone, instead of undersampling, let's use SMOTE to oversample
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_sample(ml_data, sepsis)

In [252]:
y_resampled = pd.Series(y_resampled)
print(y_resampled.value_counts())

1    44223
0    44223
Name: SEPSIS_FLAG, dtype: int64


In [253]:
# High accuracy with SMOTE, probably overfitting due to oversampling
base_model_XG = GradientBoostingClassifier()
scores = cross_val_score(base_model_XG, X_resampled, y_resampled, cv=10)
scores.mean()

0.9465674722640459

In [254]:
# First, define AKI diagnoses from non-AKI diagnoses
aki_pre = demographics[demographics['OLD_FLAG']==0]
aki_pos = aki_pre[aki_pre['AKI_DIAGNOSIS_FLAG']== 1]
aki_neg = aki_pre[aki_pre['AKI_DIAGNOSIS_FLAG']==0]

# Clean data sets
del aki_neg['CAUSE']
del aki_pos['CAUSE']

del aki_neg['AKI_DIAGNOSIS_FLAG']
del aki_pos['AKI_DIAGNOSIS_FLAG']

del aki_neg['OLD_FLAG']
del aki_pos['OLD_FLAG']

del aki_neg['OUTSIDE_DEATH_FLAG']
del aki_pos['OUTSIDE_DEATH_FLAG']

del aki_neg['SUBJECT_ID']
del aki_pos['SUBJECT_ID']

del aki_neg['DOB']
del aki_pos['DOB']

del aki_neg['DOD']
del aki_pos['DOD']

In [255]:
aki_pos['DOA']
del aki_pos['DOA']
del aki_neg['DOA']

aki_neg['SEPSIS_FLAG']
del aki_neg['SEPSIS_FLAG']
del aki_pos['SEPSIS_FLAG']
del aki_pos['delta']

del aki_pos['Unnamed: 0']

In [256]:
len(aki_pos)

2159

In [257]:
aki_pos.head()

Unnamed: 0,GENDER,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS,DEATH_FLAG,KIDNEY_FAILURE_FLAG
10,M,80.564384,OTHER,SINGLE,UKNOWN,OTHER,Medicare,EMERGENCY ROOM ADMIT,1,1,0
31,M,79.989041,WHITE,MARRIED,UKNOWN,JEWISH,Medicare,EMERGENCY ROOM ADMIT,1,1,1
32,M,76.687671,WHITE,SINGLE,UKNOWN,EPISCOPALIAN,Medicare,CLINIC REFERRAL/PREMATURE,1,1,1
34,M,50.520548,WHITE,MARRIED,ENGL,PROTESTANT QUAKER,Private,CLINIC REFERRAL/PREMATURE,2,1,0
53,F,66.019178,HISPANIC OR LATINO,SEPARATED,UKNOWN,CATHOLIC,Medicare,EMERGENCY ROOM ADMIT,1,1,0


In [258]:
# Create outcome data sets
aki_sepsis = pd.Series(aki_pos['KIDNEY_FAILURE_FLAG'])
aki_death = pd.Series(aki_pos['DEATH_FLAG'])

In [259]:
del aki_pos['KIDNEY_FAILURE_FLAG']
del aki_pos['DEATH_FLAG']


In [260]:
# Get dummies
aki_pos = pd.get_dummies(aki_pos, columns=['GENDER','ETHNICITY','MARITAL_STATUS', 'LANGUAGE', 'RELIGION', 'INSURANCE', 'ADMISSION_LOCATION'])

In [261]:
# Check outcome numbers
print(aki_sepsis.value_counts())
print(aki_death.value_counts())

0    1999
1     160
Name: KIDNEY_FAILURE_FLAG, dtype: int64
1    1153
0    1006
Name: DEATH_FLAG, dtype: int64


In [262]:
aki_pos.head()

Unnamed: 0,ADMIT_AGE,#ADMISSIONS,GENDER_F,GENDER_M,ETHNICITY_AMERICAN INDIAN/ALASKA NATIVE,ETHNICITY_ASIAN,ETHNICITY_ASIAN - ASIAN INDIAN,ETHNICITY_ASIAN - CAMBODIAN,ETHNICITY_ASIAN - CHINESE,ETHNICITY_ASIAN - FILIPINO,...,INSURANCE_Medicaid,INSURANCE_Medicare,INSURANCE_Private,INSURANCE_Self Pay,ADMISSION_LOCATION_CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,ADMISSION_LOCATION_TRANSFER FROM HOSP/EXTRAM,ADMISSION_LOCATION_TRANSFER FROM OTHER HEALT,ADMISSION_LOCATION_TRANSFER FROM SKILLED NUR
10,80.564384,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
31,79.989041,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
32,76.687671,1,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
34,50.520548,2,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
53,66.019178,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [263]:
# We got a 0.636 10-fold cv accuracy score for AKI patients within 1 year mortality
base_model_XG = GradientBoostingClassifier()
scores = cross_val_score(base_model_XG, aki_pos, aki_death, cv=10)
scores.mean()

0.6363888888888889

In [222]:
aki_pos.head()

Unnamed: 0,ADMIT_AGE,#ADMISSIONS,GENDER_F,GENDER_M,ETHNICITY_AMERICAN INDIAN/ALASKA NATIVE,ETHNICITY_ASIAN,ETHNICITY_ASIAN - ASIAN INDIAN,ETHNICITY_ASIAN - CAMBODIAN,ETHNICITY_ASIAN - CHINESE,ETHNICITY_ASIAN - FILIPINO,...,INSURANCE_Medicaid,INSURANCE_Medicare,INSURANCE_Private,INSURANCE_Self Pay,ADMISSION_LOCATION_CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,ADMISSION_LOCATION_TRANSFER FROM HOSP/EXTRAM,ADMISSION_LOCATION_TRANSFER FROM OTHER HEALT,ADMISSION_LOCATION_TRANSFER FROM SKILLED NUR
10,80.564384,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
31,79.989041,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
32,76.687671,1,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
34,50.520548,2,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
53,66.019178,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
