In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None
#warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

from library.sb_utils import save_file
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve


from numpy import arange
from numpy import argmax

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('../raw_data/DM_df.csv')

In [3]:
df.shape

(68629, 17)

In [4]:
id_types = df[['admission_type_id',
       'discharge_disposition_id', 'admission_source_id']].astype(str) #to object type

non_id = df[['race', 'gender', 'age','time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_diagnoses', 'change', 'diabetesMed', 'ndiag_1', 'ndiag_2',
        'readmit']]
df = pd.concat([id_types,non_id], axis=1)

In [5]:
to_standard = df[['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
               'number_diagnoses']]

normalized = (to_standard-to_standard.min())/(to_standard.max()-to_standard.min())

standardized_arr = preprocessing.StandardScaler().fit(to_standard).transform(to_standard)
standardized_df = pd.DataFrame(standardized_arr,columns =['time_in_hospital', 'num_lab_procedures', 
                                                           'num_procedures', 'num_medications','number_diagnoses'])


In [6]:
one_hot = df[['admission_type_id', 'discharge_disposition_id', 'admission_source_id','race', 'gender', 'age',
              'change','diabetesMed', 'ndiag_1', 'ndiag_2']]
non_one_hot = df[[ 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
               'number_diagnoses']]

one_hotted = pd.get_dummies(one_hot, drop_first= True)

df_DM_ns = pd.concat([one_hotted, non_one_hot], axis = 1) #non-standardized/normalized numerical features

df_DM_s = pd.concat([one_hotted, standardized_df], axis = 1) # standardized numerical features

df_DM_n = pd.concat([one_hotted, normalized], axis = 1) #normalized numerical features

# lg_features_s = df_DM_ns[[ 'admission_type_id_6', 'admission_type_id_7',
#         'discharge_disposition_id_11',
#        'discharge_disposition_id_13', 'discharge_disposition_id_14',
#        'discharge_disposition_id_15', 'discharge_disposition_id_2',
#        'discharge_disposition_id_22', 'discharge_disposition_id_24',
#        'discharge_disposition_id_28', 'discharge_disposition_id_3',
#        'discharge_disposition_id_4', 'discharge_disposition_id_5',
#        'discharge_disposition_id_6', 'discharge_disposition_id_7',
#        'discharge_disposition_id_8', 'admission_source_id_20',
#        'admission_source_id_3', 'admission_source_id_4',
#        'admission_source_id_6', 'admission_source_id_7',
#        'admission_source_id_9', 'race_Asian', 
#        'ndiag_1_Neoplasms']]

# rf_features_s = df_DM_s[['admission_type_id', 'discharge_disposition_id_1',
#        'discharge_disposition_id_3', 'discharge_disposition_id_6',
#        'admission_source_id_7', 'race_AfricanAmerican', 'race_Caucasian',
#        'gender_Female', 'gender_Male', 'age_40-50', 'age_50-60', 'age_60-70',
#        'age_70-80', 'age_80-90', 'change_Ch', 'change_No',
#        'ndiag_1_Circulatory', 'ndiag_1_Digestive', 'ndiag_1_Genitourinary',
#        'ndiag_1_Respiratory', 'ndiag_1_ill-defined', 'ndiag_2_Circulatory',
#        'ndiag_2_DiabetesMellitus(DM)', 'ndiag_2_Endocrine(no DM)',
#        'ndiag_2_Genitourinary', 'ndiag_2_Respiratory', 'time_in_hospital',
#        'num_lab_procedures', 'num_procedures', 'num_medications',
#        'number_diagnoses']]

# xb_features = df_DM_s[['admission_type_id_6', 'discharge_disposition_id_11',
#        'discharge_disposition_id_13', 'discharge_disposition_id_14',
#        'admission_source_id_4', 'diabetesMed_Yes']]

In [7]:
df_DM_s = df_DM_s.astype('float64')
df_DM_ns = df_DM_ns.astype('float64')
df_DM_n = df_DM_n.astype('float64')

one_hotted = one_hotted.astype('float64')

# rf_features_s = rf_features.astype('float64')
# lg_features_s = lg_features.astype('float64')
# xb_features = xb_features.astype('float64')

In [8]:
df_DM_ns.head(5)

Unnamed: 0,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,discharge_disposition_id_10,discharge_disposition_id_11,discharge_disposition_id_12,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_15,discharge_disposition_id_16,discharge_disposition_id_17,discharge_disposition_id_18,discharge_disposition_id_19,discharge_disposition_id_2,discharge_disposition_id_20,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_24,discharge_disposition_id_25,discharge_disposition_id_27,discharge_disposition_id_28,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_8,discharge_disposition_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_2,admission_source_id_20,admission_source_id_22,admission_source_id_25,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Male,age_10-20,age_20-30,age_30-40,age_40-50,age_50-60,age_60-70,age_70-80,age_80-90,age_90-100,change_No,diabetesMed_Yes,ndiag_1_DiabetesMellitus(DM),ndiag_1_Digestive,ndiag_1_Endocrine(no DM),ndiag_1_Genitourinary,ndiag_1_Infection/Parasite,ndiag_1_Injury/Poisoning,ndiag_1_MS/Connective,ndiag_1_Mental,ndiag_1_Neoplasms,ndiag_1_Other,ndiag_1_Respiratory,ndiag_1_Skin/Subcutaneous,ndiag_1_ill-defined,ndiag_2_DiabetesMellitus(DM),ndiag_2_Digestive,ndiag_2_Endocrine(no DM),ndiag_2_Genitourinary,ndiag_2_Infection/Parasite,ndiag_2_Injury/Poisoning,ndiag_2_MS/Connective,ndiag_2_Mental,ndiag_2_Neoplasms,ndiag_2_Other,ndiag_2_Respiratory,ndiag_2_Skin/Subcutaneous,ndiag_2_ill-defined,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_diagnoses
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,59.0,0.0,18.0,9.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,11.0,5.0,13.0,6.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,44.0,1.0,16.0,7.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,51.0,0.0,8.0,5.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,31.0,6.0,16.0,9.0


In [9]:
df_DM_ns.shape

(68629, 95)

In [10]:
X = df_DM_ns
#preprocessing.StandardScaler().fit(lg_features_s).transform(lg_features_s)
y = df['readmit']
y.shape

(68629,)

In [11]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42)

In [None]:
sel = SelectFromModel(RandomForestClassifier(n_estimators = 900))
sel.fit(X_train, y_train)
#sel.get_support()
selected_feat= X_train.columns[(sel.get_support())]
print(selected_feat)

In [None]:
# sel = SelectFromModel(LogisticRegression(l1_ratio = 0.1,C = 4, penalty= 'elasticnet', solver= 'saga',  
#                                          n_jobs=-1,max_iter=200))
# sel.fit(X_train, y_train)
# sel.get_support()
# selected_feat= X_train.columns[(sel.get_support())]
# print(selected_feat)

In [None]:
sel = SelectFromModel(XGBClassifier(booster='gbtree', colsample_bytree=0.4, gamma=0.2,              
              learning_rate=0.1, max_depth=6,min_child_weight=7, n_jobs=-1  ))
sel.fit(X_train, y_train)
#sel.get_support()
selected_feat= X_train.columns[(sel.get_support())]
print(selected_feat)

In [None]:
# pd.Series(sel.estimator_.feature_importances_.ravel()).hist()

In [None]:
# params = {
# 'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
# 'max_depth' : [ 3, 4, 5, 6, 8, 10, 12, 15],
# 'min_child_weight' : [ 1, 3, 5, 7 ],
# 'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
# 'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7 ]
# }

# xboost = XGBClassifier()

# rs_model=RandomizedSearchCV(xboost,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5)
# rs_model.fit(X_train,y_train)


# rs_model.best_estimator_

In [None]:
xb = XGBClassifier(booster='gbtree', colsample_bytree=0.4, gamma=0.2,              
              learning_rate=0.1, max_depth=6,min_child_weight=7, n_jobs=-1  )

xb.fit(X_train,y_train)
y_pred = xb.predict(X_test)

print('XGBClassifier training score: ', xb.score(X_train,y_train))
print('XGBClassifier test score: ', xb.score(X_test,y_test))
print('mse:', mean_squared_error(y_test, y_pred))

In [None]:
parameters ={'C':[4,5,6,7,15],'penalty':['elasticnet', 'l1', 'l2'], 'solver':['saga'],
            'l1_ratio': [0.1,0.2,0.4,0.6]}

lr=LogisticRegression()
logreg_cv = RandomizedSearchCV(lr, parameters , cv = 10, n_jobs=-1 )
logreg_cv.fit(X_train, y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [None]:
logreg= LogisticRegression(C = 4, penalty= 'l2', solver= 'saga',  n_jobs=-1,max_iter= 500 )
#,class_weight={0:1, 1:2}
#l1_ratio = 0.1,
logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_test)

print('LogisticRegression training score: ', logreg.score(X_train,y_train))
print('LogisticRegression test score: ', logreg.score(X_test,y_test))
print('mse:', mean_squared_error(y_test, y_pred))

In [None]:
lr_probs = logreg.predict_proba(X_test)
yhat = (lr_probs[:, 1] >= 0.001).astype('int')

accuracy = accuracy_score(y_test, yhat)
accuracy

In [None]:
thresholds = arange(0, 1, 0.001)

def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

scores = [f1_score(y_test, to_labels(yhat, t)) for t in thresholds]

ix = argmax(scores)
print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix], scores[ix]))

In [None]:
KNN = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)

KNN.fit(X_train,y_train)
y_pred = KNN.predict(X_test)

print('KNN training score: ', KNN.score(X_train,y_train))
print('KNN test score: ', KNN.score(X_test,y_test))
print('mse:', mean_squared_error(y_test, y_pred))

In [None]:
# test_scores = []
# train_scores = []

# for i in range(10,20):

#     knn = KNeighborsClassifier(i)
#     knn.fit(X_train,y_train)
    
#     train_scores.append(knn.score(X_train,y_train))
#     test_scores.append(knn.score(X_test,y_test))

In [13]:
estimator_KNN = KNeighborsClassifier(algorithm='auto')
parameters_KNN = {
    'n_neighbors': (10,20, 1),
    'leaf_size': (20,40,1),
    'p': (1,2),
    'weights': ('uniform', 'distance'),
    'metric': ('minkowski', 'chebyshev')}
                   

grid_search_KNN = RandomizedSearchCV(
    estimator_KNN,
    parameters_KNN,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 5
)


grid_search_KNN.fit(X_train, y_train)

print("tuned hpyerparameters :(best parameters) ",grid_search_KNN.best_params_)
print("accuracy :",grid_search_KNN.best_score_)

tuned hpyerparameters :(best parameters)  {'weights': 'uniform', 'p': 1, 'n_neighbors': 20, 'metric': 'minkowski', 'leaf_size': 1}
accuracy : 0.5690882597835139


In [None]:
param_grid = {'C': [0.1, 1, 10,0.001],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']}

grid = RandomizedSearchCV(SVC(), param_grid)
 

grid.fit(X_train, y_train)

print("tuned hpyerparameters :(best parameters) ",grid.best_params_)
print("accuracy :",grid.best_score_)

In [None]:
params = {
'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
'max_depth' : [ 3, 4, 5, 6, 8, 10, 12, 15],
'min_child_weight' : [ 1, 3, 5, 7 ],
'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7 ]
}

xboost = XGBClassifier()

rs_model=RandomizedSearchCV(xboost,params,scoring='roc_auc',n_jobs=-1,cv=5)
rs_model.fit(X_train,y_train)


rs_model.best_estimator_

In [None]:
grid_rf = {'n_estimators': [150, 200, 250],
           'criterion': ['gini'], 
           'max_depth': [450,500,550],
           'max_features': ['auto'],  
           'min_samples_split': [4,5,6],
           'min_samples_leaf': [8,9,10]}

rf = RandomForestClassifier()
gs_rndf = RandomizedSearchCV(rf, grid_rf, cv= 5,n_jobs=-1,n_iter=20)
gs_rndf.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",gs_rndf.best_params_)
print("accuracy :",gs_rndf.best_score_)


In [None]:
plt.figure(figsize=(12,5))
p = sns.lineplot(range(1,10),train_scores,marker='*',label='Train Score')
p = sns.lineplot(range(1,10),test_scores,marker='o',label='Test Score')

In [None]:
for i in range(1,10):
    print("neighbor(s): ",i,"trainscore is:",round(train_scores[i-1],4),"testscore is:",round(test_scores[i-1],4))

In [None]:
SVM = SVC()

SVM.fit(X_train,y_train)
y_pred = SVM.predict(X_test)

print('SVM training score: ', SVM.score(X_train,y_train))
print('SVM test score: ', SVM.score(X_test,y_test))
print('mse:', mean_squared_error(y_test, y_pred))

In [None]:
lr_probs = KNN.predict_proba(X_test)
yhat = lr_probs[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, yhat)

plt.plot([0,1], [0,1], linestyle='--', label='Base')
plt.plot(fpr, tpr, marker='.', label='Logistic')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate');

In [None]:
thresholds = arange(0, 1, 0.001)

def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

scores = [f1_score(y_test, to_labels(yhat, t)) for t in thresholds]

ix = argmax(scores)
print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix], scores[ix]))

In [None]:
plot_confusion_matrix(KNN, X_test, y_test) 
plt.grid(False)
plt.show()

In [None]:
y_pred = KNN.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
grid_rf = {'n_estimators': [100, 200, 3 00,500,700,900],
           'criterion': ['entropy', 'gini'], 
           'max_depth': [200,300,400,500,1000,2000],
           'max_features': ['auto', 'sqrt', 'log2'],  
           'min_samples_split': range(2, 10),
           'min_samples_leaf': range(2,10)}

rf = RandomForestClassifier()
gs_rndf = RandomizedSearchCV(estimator = rf, param_distributions = grid_rf, cv= 5,n_jobs=-1,n_iter=100)
gs_rndf.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",gs_rndf.best_params_)
print("accuracy :",gs_rndf.best_score_)

In [None]:
rndf = RandomForestClassifier(criterion ='gini', max_depth=500, max_features='auto', n_estimators= 200,
                              min_samples_split= 5, min_samples_leaf = 9, n_jobs=-1 )

rndf.fit(X_train,y_train)

y_pred = rndf.predict(X_test)
    
print('training score: ', rndf.score(X_train,y_train))
print('test score: ', rndf.score(X_test,y_test))
print('mse', mean_squared_error(y_test, y_pred))

In [None]:
plot_confusion_matrix(rndf, X_test, y_test) 
plt.grid(False)
plt.show()

In [None]:
y_pred = rndf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# def plot_confusion_matrix(y,y_predict):
    
#     from sklearn.metrics import confusion_matrix

#     cm = confusion_matrix(y, y_predict)
#     ax= plt.subplot()
#     sns.heatmap(cm, annot=True, ax = ax); #annot=True to annotate cells
#     ax.set_xlabel('Predicted labels')
#     ax.set_ylabel('True labels')
#     ax.set_title('Confusion Matrix'); 
#     ax.xaxis.set_ticklabels(['not admit', 'admit']); ax.yaxis.set_ticklabels(['not admitted', 'admitted'])

# plot_confusion_matrix(y_test, y_pred)