In [1]:
#Imports
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
#Load the CSV into a DataFrame and preview the first five rows
dataset = pd.read_csv(filepath_or_buffer='diabetic_data.csv')
dataset.head(n=5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [3]:
#Use LabelEncoder to transform the 'diag_n' features into integer codes in an attempt to
#preserve some of their natural order (the encoder numbers the sorted distinct values).
#This runs before '?' is replaced with np.nan because LabelEncoder requires uniform string input.
from sklearn.preprocessing import LabelEncoder

diag_list = ['diag_1', 'diag_2', 'diag_3']

for item in diag_list:
    #Remember which rows hold the '?' placeholder before the strings are overwritten
    missing_mask = dataset[item] == '?'
    labelencoder = LabelEncoder()
    #'?' still takes part in the fit, so every other value keeps the code it had before
    codes = labelencoder.fit_transform(dataset[item])
    #Assign the result back to the frame instead of calling replace(..., inplace=True) on a
    #column selection, which is not guaranteed to write through to `dataset`.
    #Series.where turns the '?' rows into NaN so they can be imputed later; a column
    #without any '?' is simply left as integer codes instead of raising.
    dataset[item] = pd.Series(codes, index=dataset.index).where(~missing_mask)

In [4]:
#Turn every remaining '?' placeholder into np.nan so it can be imputed later
dataset = dataset.replace(to_replace='?', value=np.nan)

In [5]:
#Collapse weight into an indicator: 1 when a weight is recorded, 0 when it is missing (NaN).
#notna() avoids building a lookup dict keyed on np.nan (NaN never compares equal to itself,
#so popping it only works by object identity) and the result is assigned back to the frame
#rather than relying on an inplace replace on a column selection.
#Must run after '?' has been converted to np.nan, and only once per fresh load.
dataset['weight'] = dataset['weight'].notna().astype('int64')

In [6]:
#Numeric replacement for gender; 'Unknown/Invalid' becomes np.nan so it is imputed later
gender_replacements = {
    'Female': 0,
    'Male': 1,
    'Unknown/Invalid': np.nan
}
#Assign back to the frame: an inplace replace on dataset['gender'] acts on a column
#selection and does not update `dataset` when pandas Copy-on-Write is active.
dataset['gender'] = dataset['gender'].replace(gender_replacements)

In [7]:
#Numeric replacement for race (values not listed here, including NaN, are left untouched)
race_replacements = {
    "Caucasian": 0,
    "AfricanAmerican": 1,
    "Hispanic": 2,
    "Other": 3,
    "Asian": 4
}
#Assign back to the frame: an inplace replace on dataset['race'] acts on a column
#selection and does not update `dataset` when pandas Copy-on-Write is active.
dataset['race'] = dataset['race'].replace(race_replacements)

In [8]:
# Bucket the ten-year age bands into five coarser groups (0-4) as an experimental feature.
# A plain per-decade encoding (10, 20, 30, ...) was also tried and gave similar results.
age_replacements = {
    '[0-10)': 0,
    '[10-20)': 0,
    '[20-30)': 0,
    '[30-40)': 1,
    '[40-50)': 1,
    '[50-60)': 2,
    '[60-70)': 2,
    '[70-80)': 3,
    '[80-90)': 3,
    '[90-100)': 4
}
# Assign back to the frame: an inplace replace on dataset['age'] acts on a column
# selection and does not update `dataset` when pandas Copy-on-Write is active.
dataset['age'] = dataset['age'].replace(age_replacements)

In [9]:
#Numerical replacements for Y: only "<30" is the positive class (1); ">30" and "NO" are 0.
#Presumably "<30" means readmission within 30 days - confirm against the data dictionary.
readmitted_replacements = {
  ">30": 0,
  "<30": 1,
  "NO": 0
}
#Assign back to the frame: an inplace replace on dataset['readmitted'] acts on a column
#selection and does not update `dataset` when pandas Copy-on-Write is active.
dataset['readmitted'] = dataset['readmitted'].replace(readmitted_replacements)

In [10]:
#Numerical replacements for the columns at positions 24-48 (the medicine-related columns)
medicine_replacements = {
    'No': 0,
    'Yes': 1,
    'Ch': 1,
    'Steady': 1,
    'Down': 2,
    'Up': 3
}
medicine_cols = dataset.iloc[:, 24:49]
for col in medicine_cols.columns:
    #Assign back to the frame: an inplace replace on dataset[col] acts on a column
    #selection and does not update `dataset` when pandas Copy-on-Write is active.
    dataset[col] = dataset[col].replace(medicine_replacements)

In [11]:
#Numerical replacements for the two lab-result columns (max_glu_serum and A1Cresult)
#NOTE(review): recent pandas releases may parse the literal string 'None' as missing in
#read_csv by default; if so those cells arrive here as NaN and get imputed instead of
#being mapped to 0 - confirm against the pandas version in use.
misc_replacements = {
    'None': 0,
    '>300': 1,
    '>7': 1,
    'Norm': 2,
    '>200': 3,
    '>8': 3
}

for col in ['max_glu_serum', 'A1Cresult']:
    #Assign back to the frame: an inplace replace on dataset[col] acts on a column
    #selection and does not update `dataset` when pandas Copy-on-Write is active.
    dataset[col] = dataset[col].replace(misc_replacements)

In [12]:
#Preview the first five rows again to check the numeric replacements
dataset.head(n=5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,0.0,0.0,0,0,6,25,1,1,...,0,0,0,0,0,0,0,0,0,0
1,149190,55629189,0.0,0.0,0,0,1,1,7,3,...,0,3,0,0,0,0,0,1,1,0
2,64410,86047875,1.0,0.0,0,0,1,1,7,2,...,0,0,0,0,0,0,0,0,1,0
3,500364,82442376,0.0,1.0,1,0,1,1,7,2,...,0,3,0,0,0,0,0,1,1,0
4,16680,42519267,0.0,1.0,1,0,1,1,7,1,...,0,1,0,0,0,0,0,1,1,0


In [13]:
#Split the frame into labels (Y) and features (X) by column position.
#Features are positions 3-5, 9 and 12-48; the label is position 49. All other columns,
#including the ID columns, are dropped.
#NOTE(review): if `race` sits at position 2 (as the head() preview suggests) it is left out
#here even though it was encoded earlier - confirm that this is intended.
Y = dataset.iloc[:, 49].values
column_choices = [*range(3, 6), 9, *range(12, 49)]
X = dataset.iloc[:, column_choices]

In [14]:
#Impute missing values with each column's median
#NOTE(review): the imputer is fitted on the full feature matrix, i.e. before the
#train/test split, so the test rows contribute to the medians.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
imputer.fit(X)
X = imputer.transform(X)

In [15]:
#Hold out 10% of the rows as a test set; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=0,
)

In [16]:
#Normalize and scale the features: min-max scaling first, then standardization.
#Each scaler is fitted on the training split only and then applied to the test split.
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

for scaler in (min_max_scaler, standard_scaler):
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

In [17]:
#Define common functions for calculating specificity and sensitivity
def calculate_specificity(true_negative, false_positive):
    """Return the specificity (true-negative rate): TN / (TN + FP)."""
    actual_negatives = true_negative + false_positive
    return true_negative / actual_negatives

def calculate_sensitivity(true_positive, false_positive):
    """Return true_positive / (true_positive + second argument).

    NOTE(review): this ratio is sensitivity (recall, TP / (TP + FN)) only when the
    second argument is the false-negative count. Fed with false positives, as the
    parameter name suggests, it yields precision instead. The parameter name is kept
    unchanged so existing keyword callers keep working.
    """
    denominator = true_positive + false_positive
    return true_positive / denominator

In [18]:
#Fit the LogisticRegression algorithm to the training set
from sklearn.linear_model import LogisticRegression
regression_classifier = LogisticRegression(random_state = 0)
regression_classifier.fit(X_train, Y_train)
#Predict the test set results
Y_pred = regression_classifier.predict(X_test)

#Calculate result values (for binary labels the raveled confusion matrix is tn, fp, fn, tp)
regression_tn, regression_fp, regression_fn, regression_tp = confusion_matrix(Y_test, Y_pred).ravel()
regression_accuracy = round(accuracy_score(Y_test, Y_pred), 3)
regression_f1 = round(f1_score(Y_test, Y_pred), 3)
regression_specificity = round(calculate_specificity(regression_tn, regression_fp), 3)
#Sensitivity (recall) is TP / (TP + FN), so the false-negative count goes in the second
#slot; passing the false positives there reports precision instead.
regression_sensitivity = round(calculate_sensitivity(regression_tp, regression_fn), 3)

regression_values = [regression_accuracy, regression_sensitivity, regression_specificity, regression_f1]

print(regression_accuracy)

0.887


In [19]:
#Fit the GaussianNB algorithm to the training set
from sklearn.naive_bayes import GaussianNB
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, Y_train)

#Predict the test set results
Y_pred = gnb_classifier.predict(X_test)

#Calculate result values (for binary labels the raveled confusion matrix is tn, fp, fn, tp)
gnb_tn, gnb_fp, gnb_fn, gnb_tp = confusion_matrix(Y_test, Y_pred).ravel()
gnb_accuracy = round(accuracy_score(Y_test, Y_pred), 3)
gnb_f1 = round(f1_score(Y_test, Y_pred), 3)
gnb_specificity = round(calculate_specificity(gnb_tn, gnb_fp), 3)
#Sensitivity (recall) is TP / (TP + FN), so the false-negative count goes in the second
#slot; passing the false positives there reports precision instead.
gnb_sensitivity = round(calculate_sensitivity(gnb_tp, gnb_fn), 3)

gnb_values = [gnb_accuracy, gnb_sensitivity, gnb_specificity, gnb_f1]

print(gnb_accuracy)

0.113


In [None]:
#Fit the KNeighborsClassifier algorithm to the training set
from sklearn.neighbors import KNeighborsClassifier
neighbor_classifier = KNeighborsClassifier(n_neighbors=5)
neighbor_classifier.fit(X_train, Y_train)

#Predict the test set results
Y_pred = neighbor_classifier.predict(X_test)

#Calculate result values (for binary labels the raveled confusion matrix is tn, fp, fn, tp)
knn_tn, knn_fp, knn_fn, knn_tp = confusion_matrix(Y_test, Y_pred).ravel()
knn_accuracy = round(accuracy_score(Y_test, Y_pred), 3)
knn_f1 = round(f1_score(Y_test, Y_pred), 3)
knn_specificity = round(calculate_specificity(knn_tn, knn_fp), 3)
#Sensitivity (recall) is TP / (TP + FN), so the false-negative count goes in the second
#slot; passing the false positives there reports precision instead.
knn_sensitivity = round(calculate_sensitivity(knn_tp, knn_fn), 3)

knn_values = [knn_accuracy, knn_sensitivity, knn_specificity, knn_f1]

print(knn_accuracy)

In [None]:
#Fit the DecisionTreeClassifier algorithm to the training set
from sklearn import tree
tree_classifier = tree.DecisionTreeClassifier()
tree_classifier.fit(X_train, Y_train)

#Predict the test set results
Y_pred = tree_classifier.predict(X_test)

#Calculate result values (for binary labels the raveled confusion matrix is tn, fp, fn, tp)
tree_tn, tree_fp, tree_fn, tree_tp = confusion_matrix(Y_test, Y_pred).ravel()
tree_accuracy = round(accuracy_score(Y_test, Y_pred), 3)
tree_f1 = round(f1_score(Y_test, Y_pred), 3)
tree_specificity = round(calculate_specificity(tree_tn, tree_fp), 3)
#Sensitivity (recall) is TP / (TP + FN), so the false-negative count goes in the second
#slot; passing the false positives there reports precision instead.
tree_sensitivity = round(calculate_sensitivity(tree_tp, tree_fn), 3)

tree_values = [tree_accuracy, tree_sensitivity, tree_specificity, tree_f1]

print(tree_accuracy)

In [None]:
#Fit the RandomForestClassifier algorithm to the training set
from sklearn.ensemble import RandomForestClassifier
forest_classifer = RandomForestClassifier(max_depth=2, random_state=0)
forest_classifer.fit(X_train, Y_train)

#Predict the test set results
Y_pred = forest_classifer.predict(X_test)

#Calculate result values (for binary labels the raveled confusion matrix is tn, fp, fn, tp)
rf_tn, rf_fp, rf_fn, rf_tp = confusion_matrix(Y_test, Y_pred).ravel()
rf_accuracy = round(accuracy_score(Y_test, Y_pred), 3)
rf_f1 = round(f1_score(Y_test, Y_pred), 3)
rf_specificity = round(calculate_specificity(rf_tn, rf_fp), 3)
#Sensitivity (recall) is TP / (TP + FN), so the false-negative count goes in the second
#slot; passing the false positives there reports precision instead.
rf_sensitivity = round(calculate_sensitivity(rf_tp, rf_fn), 3)

rf_values = [rf_accuracy, rf_sensitivity, rf_specificity, rf_f1]

print(rf_accuracy)

In [None]:
import plotly.graph_objects as go

#Build a comparison table: the first column holds the metric names and each following
#column holds one model's [accuracy, sensitivity, specificity, f1] values.
headers = ['', 'Logistic Regression', 'Naive Bayes', 'K-near Neighbors', 'Decision Tree', 'Random Forest']
value_labels = ['ACCURACY', 'SENSITIVITY', 'SPECIFICITY', 'F1']
body_values = [value_labels, regression_values, gnb_values, knn_values, tree_values, rf_values]

results_table = go.Table(
    header={'values': headers},
    cells={'values': body_values},
)
fig = go.Figure(data=[results_table])
fig.show()