'''

Predictive Patient Re-admission Model

Date: 14/02/18

Author: Annye Braca

Email: annyebraca@gmail.com

'''

In [170]:
# Importing Libraries
from __future__ import absolute_import, division, print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%pylab inline
from scipy.stats import spearmanr
from pylab import rcParams
import seaborn as sn 
from sklearn.preprocessing import scale 
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn import feature_extraction
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import model_selection

Populating the interactive namespace from numpy and matplotlib


In [186]:
# Read in data
# `data` holds the encounter records; `discharge` and `admission` are lookup
# sheets whose 'description' columns are mapped onto the id columns of `data`
# further down.
data = pd.read_csv('diabetic_data.csv')
# read_excel parses the sheet in one call, so no intermediate ExcelFile object
# is kept around and each name only ever refers to a DataFrame.
# The sheet name is passed positionally because its keyword was renamed
# (sheetname -> sheet_name) between pandas releases.
discharge = pd.read_excel('dataset_diabetes/discharge.xlsx', 'Data')
admission = pd.read_excel('dataset_diabetes/admission.xlsx', 'Data')

In [187]:
# Treat '?' as the missing-value marker (convert it to NaN), then discard the
# columns that are not carried forward into the feature matrix.
unused_columns = ['weight', 'medical_specialty', 'payer_code', 'diag_2',
                  'diag_3', 'encounter_id', 'patient_nbr']
data = data.replace('?', np.nan).drop(unused_columns, axis=1)

In [189]:
# Map discharge and admission id descriptions
# Build id -> description lookups from the two reference sheets, then attach
# the descriptions as new columns (ids missing from a lookup map to NaN).
discharge_map = {
    disposition_id: description
    for disposition_id, description in zip(discharge['discharge_disposition_id'],
                                           discharge['description'])
}
admission_map = {
    source_id: description
    for source_id, description in zip(admission['admission_source_id'],
                                      admission['description'])
}
data['discharge_description'] = data['discharge_disposition_id'].map(discharge_map)
data['admission_description'] = data['admission_source_id'].map(admission_map)
# TODO: bring in admission type id map



In [191]:
# Keep only rows without any missing value; surviving rows keep their
# original index labels, so the index is no longer contiguous afterwards.
data = data.dropna()

In [193]:
def one_hot_dataframe(data, cols, replace=False):
    """One-hot encode the selected columns of a DataFrame.

    Each row's values for ``cols`` are turned into a dict and fed to a
    ``DictVectorizer``; the resulting dense matrix is wrapped in a DataFrame
    that shares ``data``'s index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to encode. The caller's frame is not
        modified in place.
    cols : list of str
        Names of the columns to encode.
    replace : bool, default False
        When True, the returned frame has ``cols`` dropped and the encoded
        columns joined on instead.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data, vecData)``: ``data`` is the input frame (or the replaced
        copy when ``replace`` is True) and ``vecData`` holds only the
        encoded columns.
    """
    vec = feature_extraction.DictVectorizer()
    # One dict per row, built in a single call rather than a Python-level
    # apply over every row.
    records = data[cols].to_dict('records')
    vecData = pd.DataFrame(vec.fit_transform(records).toarray(), index=data.index)
    # Newer scikit-learn releases expose get_feature_names_out() and no longer
    # provide get_feature_names(); use whichever this installation has.
    if hasattr(vec, 'get_feature_names_out'):
        vecData.columns = vec.get_feature_names_out()
    else:
        vecData.columns = vec.get_feature_names()
    if replace:
        data = data.drop(cols, axis=1).join(vecData)
    return (data, vecData)

In [194]:
# Columns to one-hot encode, grouped for readability: the mapped
# admission/discharge descriptions and demographics, the medication columns,
# and the remaining test-result / treatment flags.
categorical_columns = (
    ["admission_description", "discharge_description", "race", "gender"]
    + ["metformin", "repaglinide", "nateglinide", "chlorpropamide",
       "glimepiride", "acetohexamide", "glipizide", "glyburide",
       "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose",
       "miglitol", "troglitazone", "tolazamide", "examide", "citoglipton",
       "insulin", "glyburide-metformin", "glipizide-metformin",
       "glimepiride-pioglitazone", "metformin-rosiglitazone",
       "metformin-pioglitazone"]
    + ['A1Cresult', 'max_glu_serum', 'change', 'diabetesMed']
)
# `data` becomes the frame with the encoded columns swapped in; `med` is the
# encoded block on its own.
data, med = one_hot_dataframe(data, categorical_columns, replace=True)


In [195]:
# Sanity check: display any rows that still contain a missing value.
# `inds` holds integer *positions*, so the rows are fetched with .iloc. The
# deprecated .ix treats integers as index labels on an integer index, and the
# labels stop matching positions once rows have been dropped.
null_row_mask = data.isnull().any(axis=1)
inds = np.flatnonzero(null_row_mask.values)
data.iloc[inds]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


Unnamed: 0,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,...,rosiglitazone=Down,rosiglitazone=No,rosiglitazone=Steady,rosiglitazone=Up,tolazamide=No,tolazamide=Steady,tolbutamide=No,tolbutamide=Steady,troglitazone=No,troglitazone=Steady


In [196]:
# Mapping Age
# Encode every age bracket by its rank when the brackets are ordered by their
# lower bound. The order is derived from the labels themselves because
# list(set(...)) has no guaranteed order, so zipping it against a hand-written
# list of ordinals is not reproducible between kernels.
# NOTE(review): assumes labels are formatted like '[40-50)' - confirm against
# data['age'].unique().
age_ranges = sorted(set(data['age']),
                    key=lambda label: int(label.strip('[)').split('-')[0]))
age_ordinal = list(range(len(age_ranges)))
age_map = dict(zip(age_ranges, age_ordinal))
data['age'] = data['age'].map(age_map)

In [197]:
#mapping readmitted
# Explicit label -> class-code table. The labels used to be taken from
# list(set(...)), whose order is arbitrary, so the codes could change between
# kernels. The codes below match the class sizes in this notebook's recorded
# confusion matrix (class 0 is the largest, class 1 the smallest).
# NOTE(review): label spellings assumed from the UCI "Diabetes 130-US
# hospitals" data dictionary - confirm against data['readmitted'].unique().
readmitted_ranges = ['NO', '<30', '>30']
readmitted_ordinal = [0, 1, 2]
readmitted_map = dict(zip(readmitted_ranges, readmitted_ordinal))
# Fail loudly instead of silently turning an unknown label into NaN.
unexpected_labels = set(data['readmitted']) - set(readmitted_map)
if unexpected_labels:
    raise ValueError('Unmapped readmitted labels: {}'.format(
        ', '.join(sorted(str(label) for label in unexpected_labels))))
data['readmitted'] = data['readmitted'].map(readmitted_map)

In [198]:
# Split the frame into the target vector and the feature matrix.
# drop() returns a new frame, so `data` keeps its 'readmitted' column and this
# cell can be re-run. Previously X was an alias of `data` and the in-place
# drop removed the column from both, so a second run raised KeyError.
y = data['readmitted']
X = data.drop('readmitted', axis=1)

In [199]:
# Separate training and test sets
# A quarter of the rows is held out for testing; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33
)

In [201]:
# Logistic regression on the single train/test split.
# C is the inverse regularisation strength, so C=1e5 leaves the model almost
# unpenalised. Note that the estimator repr recorded below this cell shows
# multi_class='ovr' with the liblinear solver, i.e. a one-vs-rest fit.
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [204]:
# 10-fold cross-validation of a default LogisticRegression on the training split.
# The folds stay unshuffled, exactly as before: random_state only takes effect
# together with shuffle=True, and newer scikit-learn releases raise an error
# when it is passed without shuffling, so it is omitted here.
kfold = model_selection.KFold(n_splits=10)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

10-fold cross validation average accuracy: 0.577


In [205]:
# Confusion matrix of the held-out predictions (rows: true class, columns:
# predicted class). The result gets its own name so the imported
# confusion_matrix function is not overwritten and the cell can be re-run.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[10380    28  1442]
 [ 1612    28   852]
 [ 5608    33  2295]]


In [119]:

# Write the feature matrix X to 'loo.xlsx' using XlsxWriter as the engine.
# The context manager saves and closes the workbook when the block exits,
# including when to_excel raises, and avoids writer.save(), which newer
# pandas releases no longer provide.
with pd.ExcelWriter('loo.xlsx', engine='xlsxwriter') as writer:
    X.to_excel(writer, sheet_name='Sheet1')

AttributeError: 'function' object has no attribute 'to_excel'

# Decision Trees

In [35]:
from sklearn import tree
# Shallow decision tree: entropy split criterion, at most three levels deep,
# and at least five samples required in every leaf. fit() returns the
# estimator itself, so construction and fitting are chained.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=5
).fit(X_train, y_train)


In [39]:
import pydotplus
from IPython.display import Image
# With out_file=None, export_graphviz returns the DOT source as a string, so
# the Python-2-only StringIO module is no longer needed as a buffer.
dot_data = tree.export_graphviz(clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
# write_png relies on the GraphViz executables being installed and on PATH;
# without them it raises InvocationException (see the recorded output).
graph.write_png('medicines.png')
Image(filename='medicines.png')

InvocationException: GraphViz's executables not found

In [None]:
# Re-predict the held-out rows with the current `logreg` and report its
# accuracy on the test split.
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of a default LogisticRegression on the training split.
# The folds stay unshuffled, exactly as before: random_state only takes effect
# together with shuffle=True, and newer scikit-learn releases raise an error
# when it is passed without shuffling, so it is omitted here.
kfold = model_selection.KFold(n_splits=10)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the held-out predictions (rows: true class, columns:
# predicted class). The result gets its own name so the imported
# confusion_matrix function stays callable for any later cell.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

In [None]:
from sklearn.metrics import classification_report
# Text summary of the per-class metrics for the held-out predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# Refit `logreg` with LogisticRegression's default settings on the training
# split. This rebinds the name that the earlier C=1e5 model was stored under.
logreg = LogisticRegression()
# fit() is the last expression of the cell, so the fitted estimator is what
# the cell displays as its output.
logreg.fit(X_train, y_train)

In [None]:
# Dimensions (rows, columns) of `med`, the frame of encoded columns returned
# by one_hot_dataframe above.
med.shape