In [1]:
# import libraries needed
import IPython
import pandas as pd
import numpy as np
from statistics import mode
import matplotlib.pyplot as plt  
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display, HTML
from pprint import pprint
import xgboost as xgb
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv('./data/diabetic_data.csv').sort_values(by=['patient_nbr'])
len(df)

101766

In [3]:
# drop rows

mask = df['gender'] != 'Unknown/Invalid'
mask &= df['diag_1'] != '?'  # TODO: only consider primary diag
mask &= df['medical_specialty'] != '?'
df = df[mask]
print(len(df))

# consider only the first encounter for each patient
df = df.drop_duplicates(subset='patient_nbr', keep='first')

# remove all encounters that resulted in either discharge to a hospice or patient death
# to avoid biasing our analysis
df = df[~df['discharge_disposition_id'].isin([11, 13, 14, 19, 20, 21])]

print(len(df))

51803
37792


# TODO
2. use diag 2, 3
3. use 24 features
4. log transform
6. check merge id

In [4]:
# # drop columns

# drop unrelated columns
df = df.drop(columns=["encounter_id", "patient_nbr"])  

# drop columns with too many missing values
df = df.drop(columns=["payer_code", "weight"]) #, "medical_specialty"]) 

# drop columns having same value in each row
df = df.drop(columns=["citoglipton", "examide"])

# drop diag_2, diag_3
# df = df.drop(columns=["diag_2", "diag_3"])
df = df.drop(columns=["metformin",
    "repaglinide",
    "nateglinide",
    "chlorpropamide",
    "glimepiride",
    "acetohexamide",
    "glipizide",
    "glyburide",
    "tolbutamide",
    "pioglitazone",
    "rosiglitazone",
    "acarbose",
    "miglitol",
    "troglitazone",
    "tolazamide",
    "insulin",
    "glyburide-metformin",
    "glipizide-metformin",
    "glimepiride-pioglitazone",
    "metformin-rosiglitazone",
    "metformin-pioglitazone"
])

In [5]:
# merge ids with same meaning
def merge(df, col, same_ids):
    for ids in same_ids:
        for k in ids[1:]:
            df[col] = df[col].replace(k, ids[0])
    return df

df = merge(df, 'admission_type_id', [
    [1, 2, 7],  # emergence
    [5, 6, 8],  # not avaliable
])
df = merge(df, 'discharge_disposition_id', [
    [18, 25, 26],  # not avaliable
    [1, 6, 8],  # to home
    [2, 3, 4, 5],  # discharge to another hospital
    [10, 12, 15, 16, 17],  # discharge to outpatient
])
df = merge(df, 'admission_source_id', [
    [1, 2, 3], # Referral
    [4, 5, 6, 10, 22, 25], # from another hospital
    [9, 15, 17, 20, 21]  # not avaliable
])

In [6]:
# convert diagnosis information: reference from original paper
def in_range(x, bounds=None, sets=None):
    if bounds is None:
        bounds = []
    elif not isinstance(bounds[0], tuple):
        bounds = [bounds]
    
    if sets is None:
        sets = []
    elif not isinstance(sets, tuple):
        sets = [sets]
        
    for (l, r) in bounds:
        if x >= l and x < r:
            return True
    return x in sets

def convert_diag_func(x):
    ranges = {
        ((390, 460), 785),
        ((460, 520), 786),
        ((520, 580), 787),
        (None, 250),
        ((800, 1000), None),
        ((710, 740), None),
        ((580, 630), 788),
        ((140, 240), None),
    }
    if x == '?':
        return -1
    elif isinstance(x, str) and (x.startswith('V') or x.startswith('E')):
        return 0

    for i, (bounds, sets) in enumerate(ranges):
        if in_range(int(float(x)), bounds, sets):
            return i+1
    else:
        return 0
        
df['diag_1'] = df['diag_1'].apply(convert_diag_func)
df['diag_2'] = df['diag_2'].apply(convert_diag_func)
df['diag_3'] = df['diag_3'].apply(convert_diag_func)

In [7]:
def show_crosstab(col):
    grouped = df.groupby([col, 'readmitted']).size().to_frame().reset_index()
    for c, n in df.groupby([col]).size().iteritems():
        grouped.loc[grouped[col] == c, 0] /= n
    grouped[0] = grouped[0].apply(lambda x: "{:.2%}".format(x))
    crosstab = pd.crosstab(grouped[col], grouped['readmitted'], values=grouped[0], aggfunc=lambda x: x)
    display(crosstab)

show_crosstab('max_glu_serum')
show_crosstab('A1Cresult')
# show_crosstab('medical_specialty')

readmitted,<30,>30,NO
max_glu_serum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
>200,8.69%,34.41%,56.90%
>300,8.49%,35.34%,56.16%
,7.49%,26.40%,66.11%
Norm,8.80%,30.31%,60.89%


readmitted,<30,>30,NO
A1Cresult,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
>7,7.04%,25.48%,67.48%
>8,5.93%,24.73%,69.33%
,7.77%,27.22%,65.01%
Norm,7.47%,23.26%,69.27%


In [8]:
def encode(df, col, d):
    df[col] = df[col].apply(lambda x: d[x])
    return df

def encode_with_none(df, col, d):
    df[col+'_is_none'] = df[col] == 'None'
    d['None'] = -1
    return encode(df, col, d)

df['age'] = df['age'].apply(lambda x: int(x[1]) * 10 + 5)  # encode [0-10) - [90, 100] to 0 - 9
df = encode(df, 'readmitted', {"NO": 0, ">30": 1, "<30": 2})
df = encode_with_none(df, 'max_glu_serum', {">300": 2, ">200": 1, "Norm": 0})
df = encode_with_none(df, 'A1Cresult', {">8": 2, ">7": 1, "Norm": 0})

df = pd.get_dummies(df, drop_first = False, columns=[
    'race', 
    'gender', 
    'admission_type_id', 
    'discharge_disposition_id',
    'admission_source_id', 
    'diag_1', 'diag_2', 'diag_3',
    'change', 
    'diabetesMed',
    'medical_specialty'])

In [9]:
df.head(18).T

Unnamed: 0,4267,5827,67608,17494,2270,14180,18234,15848,2279,7866,25911,1083,2001,11049,2484,17342,15980,4407
age,55,55,85,85,35,65,65,45,75,65,75,65,75,85,65,45,85,65
time_in_hospital,8,2,4,3,5,10,9,2,12,8,1,2,7,7,4,1,1,4
num_lab_procedures,77,49,68,46,49,54,52,50,47,57,31,15,27,77,47,35,36,2
num_procedures,6,1,2,0,0,2,1,5,2,6,1,0,3,0,4,5,0,2
num_medications,33,11,23,20,5,19,16,13,18,31,9,14,16,12,16,13,8,20
number_outpatient,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
number_emergency,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
number_inpatient,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
number_diagnoses,8,3,9,9,3,9,9,9,9,9,7,9,9,9,7,8,5,9
max_glu_serum,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


# Train and test

In [34]:
y = df['readmitted'].values # .astype(int)
X = df.drop(columns='readmitted').values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train, y_train = SMOTE(random_state=20).fit_sample(X_train, y_train)

weight_per_class = np.sqrt(y.shape[0] / (df['readmitted'].value_counts() * 3))
w_train = pd.Series(y_train).map(weight_per_class).values

print(pd.Series(y_train).value_counts())
print(pd.Series(y_test).value_counts())

xg_train = xgb.DMatrix(X_train, label=y_train, weight=w_train)
# xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)

def f1_macro(preds, dtrain):
    labels = dtrain.get_label()
    return "f1_macro", f1_score(labels, preds, average='macro')

param = {
    'booster': 'dart',
    "objective": 'multi:softmax',
    'tree_method': 'hist',
    "num_class": 3,
    "silent": 1,
    
    'lambda': 1,
    'alpha': 0,
    
    "eta": 1,
    "max_depth": 6,
}

num_round = 6

watchlist = [(xg_train, 'train'), (xg_test, 'test')]
bst = xgb.train(param, xg_train, num_round, watchlist, feval=f1_macro)
# test
y_pred = bst.predict(xg_test)

print(classification_report(y_pred, y_test))
display(pd.crosstab(pd.Series(y_pred, name='y_pred'), 
                  pd.Series(y_test, name='y_gt'), 
                  margins=True))
# xgb.cv(param, xg_train, num_round, nfold=3, feval=f1_macro)

2    19847
1    19847
0    19847
dtype: int64
0    4991
1    1989
2     579
dtype: int64
[19:42:40] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-merror:0.372516	test-merror:0.347268	train-f1_macro:0.597717	test-f1_macro:0.407511
[1]	train-merror:0.344032	test-merror:0.338405	train-f1_macro:0.628157	test-f1_macro:0.396412
[2]	train-merror:0.320519	test-merror:0.334039	train-f1_macro:0.658751	test-f1_macro:0.377849
[3]	train-merror:0.308644	test-merror:0.332981	train-f1_macro:0.67201	test-f1_macro:0.376019
[4]	train-merror:0.300062	test-merror:0.334303	train-f1_macro:0.682521	test-f1_macro:0.375559
[5]	train-merror:0.293848	test-merror:0.332451	train-f1_macro:0.689953	test-f1_macro:0.382965
              precision    recall  f1-score   support

         0.0       0.92      0.70      0.80      6585
         1.0       0.21      0.46      0.28       898
         2.0       0.04      0.29      0.07        76

   micro avg       0.67      0.6

y_gt,0,1,2,All
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,4613,1554,418,6585
1.0,348,411,139,898
2.0,30,24,22,76
All,4991,1989,579,7559


In [33]:
weight_per_class

0    0.712166
1    1.116808
2    2.100934
Name: readmitted, dtype: float64