In [1]:
# import libraries needed
import IPython
import pandas as pd
import numpy as np
from statistics import mode
import matplotlib.pyplot as plt  
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display, HTML
from pprint import pprint
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv('./data/diabetic_data.csv').sort_values(by=['patient_nbr'])
len(df)

101766

In [3]:
# drop rows

mask = (df['gender'] != 'Unknown/Invalid') & (df['diag_1'] != '?')  # TODO: only consider primary diag
df = df[mask]

# consider only the first encounter for each patient
df = df.drop_duplicates(subset='patient_nbr', keep='first')

# remove all encounters that resulted in either discharge to a hospice or patient death
# to avoid biasing our analysis
df = df[~df['discharge_disposition_id'].isin([11, 13, 14, 19, 20, 21])]

len(df)

69650

In [4]:
# # drop columns

# drop unrelated columns
df = df.drop(columns=["encounter_id", "patient_nbr"])  

# drop columns with too many missing values
df = df.drop(columns=["payer_code", "weight", "medical_specialty"]) 

# drop columns having same value in each row
df = df.drop(columns=["citoglipton", "examide"])

# drop diag_2, diag_3
df = df.drop(columns=["diag_2", "diag_3"])
df = df.drop(columns=["metformin",
    "repaglinide",
    "nateglinide",
    "chlorpropamide",
    "glimepiride",
    "acetohexamide",
    "glipizide",
    "glyburide",
    "tolbutamide",
    "pioglitazone",
    "rosiglitazone",
    "acarbose",
    "miglitol",
    "troglitazone",
    "tolazamide",
    "insulin",
    "glyburide-metformin",
    "glipizide-metformin",
    "glimepiride-pioglitazone",
    "metformin-rosiglitazone",
    "metformin-pioglitazone"
])

In [5]:
# merge ids with same meaning
def merge(df, col, same_ids):
    for ids in same_ids:
        for k in ids[1:]:
            df[col] = df[col].replace(k, ids[0])
    return df

df = merge(df, 'admission_type_id', [
    [1, 2, 7],  # emergence
    [5, 6, 8],  # not avaliable
])
df = merge(df, 'discharge_disposition_id', [
    [18, 25, 26],  # not avaliable
    [1, 6, 8],  # to home
    [2, 3, 4, 5],  # discharge to another hospital
    [10, 12, 15, 16, 17],  # discharge to outpatient
])
df = merge(df, 'admission_source_id', [
    [1, 2, 3], # Referral
    [4, 5, 6, 10, 22, 25], # from another hospital
    [9, 15, 17, 20, 21]  # not avaliable
])

In [6]:
# convert diagnosis information: reference from original paper
def in_range(x, bounds=None, sets=None):
    if bounds is None:
        bounds = []
    elif not isinstance(bounds[0], tuple):
        bounds = [bounds]
    
    if sets is None:
        sets = []
    elif not isinstance(sets, tuple):
        sets = [sets]
        
    for (l, r) in bounds:
        if x >= l and x < r:
            return True
    return x in sets

def convert_diag_level_2_func(x):
    ranges = {
        ((390, 460), 785),
        ((460, 520), 786),
        ((520, 580), 787),
        (None, 250),
        ((800, 1000), None),
        ((710, 740), None),
        ((580, 630), 788),
        ((140, 240), None),
        ((790, 799), (780, 781, 784)),
        (((240, 250), (251, 280)), None),
        ((680, 710), 782),
        ((1, 140), None),
        ((290, 320), None),
        ((280, 290), None),
        ((320, 360), None),
        ((630, 680), None),
        ((360, 390), None),
        ((740, 760), None),
    }
    if x == '?':
        return -1
    elif isinstance(x, str) and (x.startswith('V') or x.startswith('E')):
        return 0

    for i, (bounds, sets) in enumerate(ranges):
        if in_range(int(float(x)), bounds, sets):
            return i+1
    else:
        return len(ranges) + 2
    
def convert_diag_func(x):
    ranges = {
        ((390, 460), 785),
        ((460, 520), 786),
        ((520, 580), 787),
        (None, 250),
        ((800, 1000), None),
        ((710, 740), None),
        ((580, 630), 788),
        ((140, 240), None),
    }
    if x == '?':
        return -1
    elif isinstance(x, str) and (x.startswith('V') or x.startswith('E')):
        return 0

    for i, (bounds, sets) in enumerate(ranges):
        if in_range(int(float(x)), bounds, sets):
            return i+1
    else:
        return 0
        
df['diag_1'] = df['diag_1'].apply(convert_diag_func)

In [7]:
def encode(df, col, d):
    df[col] = df[col].apply(lambda x: d[x])
    return df

df['age'] = df['age'].apply(lambda x: int(x[1]) * 10 + 5)  # encode [0-10) - [90, 100] to 0 - 9
df = encode(df, 'readmitted', {"NO": 0, ">30": 1, "<30": 2})
df_pd = pd.get_dummies(df, columns=['race', 'gender', 'admission_type_id', 'discharge_disposition_id',
                                    'admission_source_id', 'max_glu_serum', 'A1Cresult', 'diag_1',
                                    'change', 'diabetesMed'], 
                       drop_first = False)

In [8]:
df_pd.head(18).T

Unnamed: 0,4267,5827,67608,17494,2270,14180,18234,15848,61382,2279,7866,25911,1083,2001,11049,2484,17342,23541
age,55,55,85,85,35,65,65,45,75,75,65,75,65,75,85,65,45,85
time_in_hospital,8,2,4,3,5,10,9,2,14,12,8,1,2,7,7,4,1,7
num_lab_procedures,77,49,68,46,49,54,52,50,21,47,57,31,15,27,77,47,35,51
num_procedures,6,1,2,0,0,2,1,5,0,2,6,1,0,3,0,4,5,0
num_medications,33,11,23,20,5,19,16,13,15,18,31,9,14,16,12,16,13,13
number_outpatient,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0
number_emergency,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
number_inpatient,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,1
number_diagnoses,8,3,9,9,3,9,9,9,7,9,9,7,9,9,9,7,8,9
readmitted,2,0,0,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0


In [9]:
# # encoding non-numeric values manully

# def encode(df, col, d):
#     df[col] = df[col].apply(lambda x: d[x])
#     return df

# df = encode(df, 'gender', {"Male": 1, "Female": 0})
# df = encode(df, 'readmitted', {"NO": 0, ">30": 1, "<30": 2})
# df = encode(df, 'change', {'No': 0, 'Ch': 1})
# df = encode(df, 'diabetesMed', {'No': 0, 'Yes': 1})
# df = encode(df, 'max_glu_serum', {'>300': 2, ">200": 1, 'Norm': 0, 'None': -10})
# df = encode(df, 'A1Cresult', {'>8': 2, ">7": 1, 'Norm': 0, 'None': -10})
# df['age'] = df['age'].apply(lambda x: int(x[1]))  # encode [0-10) - [90, 100] to 0 - 9

# # # encode race
# df = encode(df, "race", {k: i for i, k in enumerate(df['race'].unique())})

# # encode automaticlly
# # for col in ["gender", "readmitted", "change", "diabetesMed", "max_glu_serum", "A1Cresult", "race", "age"]:
# #     d = {k: i for i, k in enumerate(df[col].unique())}
# #     df[col] = df[col].apply(lambda x: d[x])

# Train and test

In [16]:
X = df_pd.values
y = df['readmitted'].values.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [17]:
# X = df.values[:, :-1]
# y = df.values[:, -1].astype(int)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [33]:
# X_train, y_train = SMOTE(random_state=20).fit_sample(X_train, y_train)

xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)

param = {
    "objective": 'multi:softmax',
    "num_class": 3,
    "silent": 1,
    
    "eta": 0.0001,
    "max_depth": 6,
}

def f1_macro(preds, dtrain):
    labels = dtrain.get_label()
    return "f1_macro", f1_score(labels, preds, average='macro')

num_round = 10

xgb.cv(param, xg_train, num_round, nfold=5, feval=f1_macro)

Unnamed: 0,train-f1_macro-mean,train-f1_macro-std,train-merror-mean,train-merror-std,test-f1_macro-mean,test-f1_macro-std,test-merror-mean,test-merror-std
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [34]:
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
bst = xgb.train(param, xg_train, 6, watchlist)
# test
y_pred = bst.predict(xg_test)

print("Precision is {0:.2f}".format(precision_score(y_test, y_pred, average='macro')))
print("Recall is {0:.2f}".format(recall_score(y_test, y_pred, average='macro')))
print("F1-marco is {0:.2f}".format(f1_score(y_test, y_pred, average='macro')))

[0]	train-merror:0	test-merror:0
[1]	train-merror:0	test-merror:0
[2]	train-merror:0	test-merror:0
[3]	train-merror:0	test-merror:0
[4]	train-merror:0	test-merror:0
[5]	train-merror:0	test-merror:0
Precision is 1.00
Recall is 1.00
F1-marco is 1.00
